def __init__(self, exclude=None):
    """Build the three groups of composition featurizers held by this object.

    Args:
        exclude: Passed through unchanged to the parent class initializer.
    """
    super(CompositionFeaturizers, self).__init__(exclude=exclude)

    # Group stored as "fast" featurizers.
    self._fast_featurizers = [
        cf.AtomicOrbitals(),
        cf.ElementProperty.from_preset("matminer"),
        cf.ElementProperty.from_preset("magpie"),
        cf.ElementFraction(),
        cf.Stoichiometry(),
        cf.TMetalFraction(),
        cf.BandCenter(),
        cf.ValenceOrbital(),
    ]

    # Group stored as "slow" featurizers.
    self._slow_featurizers = [
        cf.Miedema(),
        cf.AtomicPackingEfficiency(),  # slower than the rest
        cf.CohesiveEnergy(),  # requires mpid present
    ]

    # NOTE(review): judging by the attribute name these presumably need
    # oxidation-state information on the input -- confirm against callers.
    self._need_oxi_featurizers = [
        cf.CationProperty.from_preset(preset_name="deml"),
        cf.OxidationStates.from_preset(preset_name="deml"),
        cf.ElectronAffinity(),
        cf.ElectronegativityDiff(),
        cf.YangSolidSolution(),
        cf.IonProperty(),
    ]
def __init__(self):
    """Create the composition featurizer bundle and the string converter.

    Sets ``self.feature_calculators`` to a ``MultipleFeaturizer`` wrapping
    six composition featurizers, and ``self.str2composition`` to a fresh
    ``StrToComposition`` instance.
    """
    composition_featurizers = [
        cf.ElementProperty.from_preset(preset_name="magpie"),
        cf.Stoichiometry(),
        cf.ValenceOrbital(props=["frac"]),
        cf.IonProperty(fast=True),
        cf.BandCenter(),
        cf.ElementFraction(),
    ]
    self.feature_calculators = MultipleFeaturizer(composition_featurizers)
    self.str2composition = StrToComposition()
def all(self):
    """Return the full set of composition featurizers.

    The complete candidate list is built here and handed to
    ``self._get_featurizers``, whose return value is returned as-is.
    NOTE(review): ``_get_featurizers`` is defined elsewhere; presumably it
    filters the candidates -- confirm.
    """
    candidates = [
        # Element-property and stoichiometry based featurizers.
        cf.AtomicOrbitals(),
        cf.ElementProperty.from_preset("matminer"),
        cf.ElementProperty.from_preset("magpie"),
        cf.ElementProperty.from_preset("matscholar_el"),
        cf.ElementProperty.from_preset("deml"),
        cf.ElementFraction(),
        cf.Stoichiometry(),
        cf.TMetalFraction(),
        cf.BandCenter(),
        cf.ValenceOrbital(),
        cf.YangSolidSolution(),
        # Ion / oxidation-state related featurizers.
        cf.CationProperty.from_preset(preset_name="deml"),
        cf.OxidationStates.from_preset(preset_name="deml"),
        cf.ElectronAffinity(),
        cf.ElectronegativityDiff(),
        cf.IonProperty(),
        # Remaining featurizers.
        cf.Miedema(),
        cf.AtomicPackingEfficiency(),  # slower than the rest
        cf.CohesiveEnergy(),  # requires mpid present
    ]
    return self._get_featurizers(candidates)
# Instantiate the enabled composition featurizers.
ep_feat = composition.ElementProperty.from_preset(preset_name="magpie")  # element property
ao_feat = composition.AtomicOrbitals()  # atomic orbitals
bc_feat = composition.BandCenter()  # band center
m_feat = composition.Miedema()  # miedema
s_feat = composition.Stoichiometry()  # stoichiometry
tmf_feat = composition.TMetalFraction()  # t metal fraction

# Disabled featurizers, kept for reference:
# vo_feat = composition.ValenceOrbital()            # valence orbital
# yss_feat = composition.YangSolidSolution()        # yang solid solution
# ape_feat = composition.AtomicPackingEfficiency()  # atomic packing efficiency

# Apply the featurizers one after another, in the order listed above.
# Each call takes the "composition" column as input and its result
# replaces df_ft before the next featurizer runs.
for _feat in (ep_feat, ao_feat, bc_feat, m_feat, s_feat, tmf_feat):
    df_ft = _feat.featurize_dataframe(
        df_ft, col_id="composition", ignore_errors=True)

df_ft.shape
data.columns.tolist()) # Get only the minimum energy structure at each composition data['composition'] = data['structure'].apply(lambda x: x.composition) data['integer_formula'] = data['composition'].apply( lambda x: x.get_integer_formula_and_factor()[0]) data.sort_values('e_above_hull', ascending=True, inplace=True) data.drop_duplicates('integer_formula', keep='first', inplace=True) print('Reduced dataset to {} unique compositions.'.format(len(data))) data.reset_index(inplace=True, drop=True) # Create the featurizer, which will take the composition as input featurizer = MultipleFeaturizer([ cf.Stoichiometry(), cf.ElementProperty.from_preset('magpie'), cf.ValenceOrbital(props=['frac']), cf.IonProperty(fast=True) ]) # Compute the features featurizer.set_n_jobs(1) X = featurizer.featurize_many(data['composition']) # Make the model model = Pipeline([('imputer', Imputer()), ('model', RandomForestRegressor())]) model.fit(X, data['formation_energy_per_atom']) print('Trained a RandomForest model') # Save the model, featurizer, and data using pickle
def load_data_zT():
    """Load the thermoelectric feature/response arrays, caching them as CSV.

    The function first tries to read the cached feature and response CSV
    files from the results directory. If either file is missing, it queries
    Citrination dataset 178480, featurizes the chemical formulas, keeps only
    rows where all three responses are present, squares the Seebeck column,
    and writes both arrays to the cache files before returning.

    Relies on names defined outside this function: ``setResDir``,
    ``file_features``, ``file_responses``, ``prefix``, ``pifs2df`` and
    ``get_compostion``.

    Returns:
        tuple: ``(X_all, Y_all, sign, keys_response, prefix)`` where
            ``X_all`` is the feature array, ``Y_all`` the response array
            whose first column is the squared Seebeck coefficient, ``sign``
            is ``np.array([+1, -1, -1])`` aligned with ``keys_response``
            (presumably the desired optimization direction per response --
            confirm with callers), ``keys_response`` is the list of response
            column names, and ``prefix`` is the module-level value returned
            unchanged.

    Raises:
        KeyError: If the cache is missing and the ``CITRINATION_API_KEY``
            environment variable is not set.
    """
    results_dir = setResDir()

    ## Metadata
    keys_response = [
        'Seebeck coefficient; squared',
        'Electrical resistivity',
        'Thermal conductivity'
    ]
    sign = np.array([
        +1,  # Seebeck
        -1,  # Electric resistivity
        -1   # Thermal conductivity
    ])

    ## Load data, if possible
    # --------------------------------------------------
    try:
        # The first CSV column is the index written by to_csv when the
        # cache was created, so it is dropped from both tables.
        df_X_all = pd.read_csv(results_dir + file_features)
        X_all = df_X_all.drop(df_X_all.columns[0], axis=1).values

        df_Y_all = pd.read_csv(results_dir + file_responses)
        Y_all = df_Y_all.drop(df_Y_all.columns[0], axis=1).values
        print("Cached data loaded.")

    except FileNotFoundError:
        ## Data Import
        # --------------------------------------------------
        # Initialize client
        print("Accessing data from Citrination...")
        site = 'https://citrination.com'  # Citrination
        client = CitrinationClient(api_key=os.environ['CITRINATION_API_KEY'],
                                   site=site)
        search_client = client.search

        # Thermoelectric dataset
        dataset_id = 178480  # ucsb_te_roomtemp_seebeck
        # NOTE(review): the query requests at most 1000 results; the total
        # hit count is printed below so truncation can be spotted.
        system_query = PifSystemReturningQuery(
            size=1000,
            query=DataQuery(dataset=DatasetQuery(id=Filter(
                equal=str(dataset_id)))))
        query_result = search_client.pif_search(system_query)
        print(" Found {} PIFs in dataset {}.".format(
            query_result.total_num_hits, dataset_id))

        ## Wrangle
        # --------------------------------------------------
        pifs = [x.system for x in query_result.hits]

        # Raw response columns, before the Seebeck column is squared.
        keys_original = [
            'Seebeck coefficient',
            'Electrical resistivity',
            'Thermal conductivity'
        ]

        # Utility function will tabularize PIFs
        df_response = pifs2df(pifs)
        # Down-select columns to play well with to_numeric
        df_response = df_response[keys_original]
        df_response = df_response.apply(pd.to_numeric)

        # Parse chemical compositions
        formulas = [pif.chemical_formula for pif in pifs]
        df_comp = pd.DataFrame(columns=['chemical_formula'], data=formulas)

        # Join
        df_data = pd.concat([df_comp, df_response], axis=1)
        print(" Accessed data.")

        # Featurize
        print("Featurizing data...")
        df_data['composition'] = df_data['chemical_formula'].apply(
            get_compostion)

        f = MultipleFeaturizer([
            cf.Stoichiometry(),
            cf.ElementProperty.from_preset("magpie"),
            cf.ValenceOrbital(props=['avg']),
            cf.IonProperty(fast=True)
        ])

        X = np.array(f.featurize_many(df_data['composition']))

        # Keep only rows where every response is present. A positional
        # boolean mask is used so that the rows of X and df_data stay
        # aligned without relying on the index labels being 0..n-1.
        valid_rows = df_data[keys_original].notna().all(axis=1).values
        X_all = X[valid_rows, :]
        Y_all = df_data.loc[valid_rows, keys_original].values

        # Manipulate columns for proper objective values
        Y_all[:, 0] = Y_all[:, 0]**2  # Squared seebeck
        print(" Data prepared; {0:} valid observations.".format(
            X_all.shape[0]))

        # Cache data
        pd.DataFrame(data=X_all).to_csv(results_dir + file_features)
        pd.DataFrame(data=Y_all,
                     columns=keys_response).to_csv(results_dir +
                                                   file_responses)
        print("Data cached in results directory.")

    return X_all, Y_all, sign, keys_response, prefix