def tran_feat_composition(
    df,
    var_formula="FORMULA",
    preset_name="magpie",
    append=True,
    ignore_errors=True,
    **kwargs,
):
    r"""Featurize a dataset using matminer

    Featurize chemical composition using matminer package.

    Args:
        df (DataFrame): Data to featurize
        var_formula (string): Column in df with chemical formula; formula
            given as string
        preset_name (string): Matminer featurization preset
        append (bool): Append results to original columns?
        ignore_errors (bool): Do not throw an error while parsing formulae;
            set to True to return NaN's for invalid formulae.

    Kwargs:
        Any additional keyword arguments are forwarded unchanged to the
        featurizer's featurize_dataframe() call.

    Returns:
        DataFrame: The computed feature columns; if append is True, these
        are concatenated column-wise to the right of the original df.

    Raises:
        KeyError: If var_formula is not a column of df.

    Notes:
        - A pre-processor and wrapper for matminer.featurizers.composition

    References:
        Ward, L., Dunn, A., Faghaninia, A., Zimmermann, N. E. R., Bajaj, S., Wang, Q., Montoya, J. H., Chen, J., Bystrom, K., Dylla, M., Chard, K., Asta, M., Persson, K., Snyder, G. J., Foster, I., Jain, A., Matminer: An open source toolkit for materials data mining. Comput. Mater. Sci. 152, 60-69 (2018).

    Examples:
        >>> import grama as gr
        >>> from grama.tran import tf_feat_composition
        >>> (
        >>>     gr.df_make(FORMULA=["C6H12O6"])
        >>>     >> gr.tf_feat_composition()
        >>> )

    """
    ## Check invariants
    # Fail early, before any featurizer is constructed, with a message that
    # names the missing column. KeyError matches what the column lookup
    # below would have raised, so existing handlers keep working.
    if var_formula not in df.columns:
        raise KeyError(
            "var_formula = '{}' is not a column of df; available columns: {}".format(
                var_formula, list(df.columns)
            )
        )

    ## Featurize
    featurizer = ElementProperty.from_preset(preset_name=preset_name)
    # Only the formula column is passed along, so that after the two helper
    # columns are dropped below, df_res holds nothing but computed features.
    df_res = StrToComposition().featurize_dataframe(
        df[[var_formula]],
        var_formula,
        ignore_errors=ignore_errors,
    )
    df_res = featurizer.featurize_dataframe(
        df_res,
        col_id="composition",
        ignore_errors=ignore_errors,
        **kwargs,
    )
    # Rebind rather than drop in place, so no frame is mutated behind the
    # caller's back.
    df_res = df_res.drop(columns=[var_formula, "composition"])

    ## Concatenate as necessary
    if append:
        df_res = concat((df, df_res), axis=1)

    return df_res
data_3, "composition")  # NOTE(review): tail of a call that begins above the visible chunk

# Featurize the oxidation-state-decorated composition column.
os_feat = OxidationStates()
data_3 = os_feat.featurize_dataframe(data_3, "composition_oxid")

from matminer.featurizers.structure import DensityFeatures

# Featurize the "structure" column with DensityFeatures.
df_feat = DensityFeatures()
data_3 = df_feat.featurize_dataframe(data_3, "structure")

# Columns removed before modelling.
unwanted_columns = [
    "elasticity", "material_id", "nsites", "compliance_tensor",
    "elastic_tensor", "elastic_tensor_original", "K_Voigt", "G_Voigt",
    "K_Reuss", "G_Reuss", "warnings"
]
data_4 = data_3.drop(unwanted_columns, axis=1)

# In[ ]:

# Additional data cleaning after some trial runs

# Target: the K_VRH column as a NumPy array.
y = data_4['K_VRH'].values

# Columns excluded from the feature matrix X (the target itself included).
excluded = [
    "G_VRH", "K_VRH", "elastic_anisotropy", "pretty_formula", "poisson_ratio",
    "structure", "composition", "composition_oxid",
    "G_Voigt_Reuss_Hill", "K_Voigt_Reuss_Hill", "homogeneous_poisson", "universal_anisotropy"
]
X = data_4.drop(excluded, axis=1)
# The final row of the excluded label list (G_Voigt_Reuss_Hill,
# K_Voigt_Reuss_Hill, homogeneous_poisson, universal_anisotropy) is dropped to
# minimize training interference: without dropping these columns, an
# exceptionally good, near-perfect linear fit was first obtained, which is
# abnormal.
''' Block 3 - Loading and making predictions '''
from sklearn.preprocessing import MinMaxScaler

# Output dataframe (odf): keep the identifying columns aside before they are
# removed from the feature table.
odf = fdf[['Id', 'Reduced Formula']].copy()

# Identifier, matminer-object and non-numeric orbital columns that must not
# reach the model as features.
excluded = [
    'Id',
    'Reduced Formula',
    'composition',
    'composition_oxid',
    'HOMO_character',
    'HOMO_element',
    'LUMO_character',
    'LUMO_element',
]

# Drop the excluded columns, then turn +/-inf into NaN and every NaN into 0.
fdf = (
    fdf.drop(columns=excluded)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# There is no target variable here, so the whole table is the feature matrix.
X = fdf

# Rescale every feature to [0, 1], keeping the original labels.
# NOTE(review): the scaler is fitted on this data itself; confirm that this
# matches the scaling used when the model was trained.
scaler = MinMaxScaler(feature_range=(0, 1))
X = pd.DataFrame(
    scaler.fit_transform(fdf),
    index=fdf.index,
    columns=fdf.columns,
)

# Report the dimensionality of the feature matrix.
print('There are {} possible descriptors:\n{}'.format(X.shape[1], X.columns.values))
import numpy as np
from matminer.featurizers.composition import OxidationStates
from matminer.featurizers.structure import DensityFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score

# Add oxidation-state-decorated compositions, then featurize them.
# NOTE(review): CompositionToOxidComposition is expected to be imported above
# the visible chunk.
df = CompositionToOxidComposition().featurize_dataframe(df, "composition")
os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, "composition_oxid")

# Featurize the "structure" column with DensityFeatures.
df_feat = DensityFeatures()
df = df_feat.featurize_dataframe(df, col_id='structure')

# Target is K_VRH; every non-feature column (the target included) is removed
# from the feature matrix.
y = df['K_VRH'].values
excluded = ["G_VRH", "K_VRH", "elastic_anisotropy", "formula", "material_id",
            "poisson_ratio", "structure", "composition", "composition_oxid"]
X = df.drop(excluded, axis=1)
print("There are {} possible descriptors:\n\n{}".format(X.shape[1], X.columns.values))

# Baseline: ordinary least squares fitted and scored on the same data, so the
# two numbers printed below are training-set figures (score, then RMSE).
lr = LinearRegression()
lr.fit(X, y)
print(lr.score(X, y))
print(np.sqrt(mean_squared_error(y_true=y, y_pred=lr.predict(X))))

# 10-fold splitter for cross-validation.
# The original passed shuffle=False together with random_state=1: the seed is
# ignored without shuffling, and scikit-learn >= 0.24 rejects that combination
# with a ValueError. Shuffling is enabled so that the seed takes effect.
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)
import pandas as pd # pd.set_option('display.height', 1000) pd.set_option("display.max_rows", 500) pd.set_option("display.max_columns", 500) pd.set_option("display.width", 1000) df = load_dataset("glass_ternary_landolt") df = df.rename(columns={"formula": "composition"}) df = df[["composition", "gfa"]] df = StrToComposition(target_col_id="composition_obj").featurize_dataframe( df, "composition") df["composition"] = [c.reduced_formula for c in df["composition_obj"]] df = df.drop(columns=["composition_obj"]) # print("Ground truth") # print(df[df["composition"]=="ZrTi9"]) # should be False in final dataframe also!! # print(df[df["composition"]=="ZrVCo8"]) # should be True in final dataframe also! # print(df["gfa"].value_counts()) # proportion is about 5000 GFA 2054 no GFA # raise ValueError unique = df["composition"].unique() print(len(df)) print(len(unique)) problem_compositions = [] new_df_dict = {"composition": [], "gfa": []} for c in tqdm(unique): df_per_comp_gfa = df[df["composition"] == c]
''' df = load_dataframe_from_json('data/Batteries_raw.json') # Select the working ion among {Li, Al, Zr, Mg} select = 'Li' # Initial filter based on the selected element from matminer.featurizers.conversions import StrToComposition fdf = StrToComposition().featurize_dataframe(df, 'Ion') select_at = fdf["composition"].apply(lambda x: x.get_atomic_fraction(select)) fdf = fdf[select_at == 1] # Debug print("Remaining samples: {}".format(fdf.describe)) fdf = fdf.drop(['composition'], axis=1) ## Initial conversion to matminer objects from matminer.featurizers.conversions import StrToComposition fdf = StrToComposition().featurize_dataframe(fdf, 'Reduced Formula') from matminer.featurizers.conversions import CompositionToOxidComposition fdf = CompositionToOxidComposition().featurize_dataframe(fdf, 'composition') print("The initial dataset has {}".format(fdf.shape)) # fdf.to_csv(r"Batteries_feat.csv", index = None, header = True) ''' Block 2 - Featurization ''' # # -- start F1