def get_structure_properties(structure: Structure, mode: str = 'all') -> dict:
    """Compute matminer structure descriptors and return them as a dict.

    Args:
        structure: Structure that is handed to the featurizers.
        mode: ``'all'`` runs the complete featurizer list; any other value
            runs only the subset that does not need a Voronoi tessellation.

    Returns:
        dict: feature label -> feature value, plus a ``'volume'`` entry
        holding ``structure.volume``.
    """

    def _composition_part():
        # Composition-derived featurizers shared by both modes (same order).
        return [
            StructureComposition(Stoichiometry()),
            StructureComposition(ElementProperty.from_preset('magpie')),
            StructureComposition(ValenceOrbital(props=['frac'])),
        ]

    if mode != 'all':
        # Calculate only those which do not need a Voronoi tessellation
        components = [DensityFeatures(), *_composition_part()]
    else:
        components = [
            SiteStatsFingerprint.from_preset(
                'CoordinationNumber_ward-prb-2017'),
            StructuralHeterogeneity(),
            ChemicalOrdering(),
            DensityFeatures(),
            MaximumPackingEfficiency(),
            SiteStatsFingerprint.from_preset(
                'LocalPropertyDifference_ward-prb-2017'),
            *_composition_part(),
        ]

    featurizer = MultipleFeaturizer(components)
    values = featurizer.featurize(structure)
    labels = featurizer.feature_labels()

    properties = dict(zip(labels, values))
    properties['volume'] = structure.volume
    return properties
def test_density_features(self):
    """Check the three DensityFeatures values for the diamond and NaCl fixtures."""
    featurizer = DensityFeatures()

    # (fixture structure, expected feature values at indices 0, 1, 2)
    reference = (
        (self.diamond, (3.49, 5.71, 0.25)),
        (self.nacl, (2.105, 23.046, 0.620)),
    )
    for structure, expected in reference:
        features = featurizer.featurize(structure)
        for position, value in enumerate(expected):
            # Compared to 2 decimal places, as in the original assertions.
            self.assertAlmostEqual(features[position], value, 2)
def test_density_features(self):
    """Check DensityFeatures values and its precheck on a disordered structure."""
    featurizer = DensityFeatures()

    # (fixture structure, expected feature values at indices 0, 1, 2)
    reference = (
        (self.diamond, (3.49, 5.71, 0.25)),
        (self.nacl, (2.105, 23.046, 0.620)),
    )
    for structure, expected in reference:
        features = featurizer.featurize(structure)
        for position, value in enumerate(expected):
            # Compared to 2 decimal places, as in the original assertions.
            self.assertAlmostEqual(features[position], value, 2)

    # A partially occupied Cl site must be rejected by precheck().
    disordered = copy.deepcopy(self.nacl)
    disordered.replace_species({"Cl1-": "Cl0.99H0.01"})
    self.assertFalse(featurizer.precheck(disordered))

    # Two of the three structures pass, so the dataframe precheck gives 2/3.
    frame = pd.DataFrame(
        {"structure": [self.diamond, self.nacl, disordered]})
    passing_fraction = featurizer.precheck_dataframe(frame, "structure")
    self.assertAlmostEqual(passing_fraction, 2 / 3)
def featurize_structure(df: pd.DataFrame) -> pd.DataFrame:
    """ Decorate input `pandas.DataFrame` of structures with structural
    features from matminer.

    Currently applies the set of all matminer structure features.

    The radial distribution function column is expanded into one scalar
    column per distance bin (first 50 bins), the crystal system is encoded
    as an integer and the centrosymmetry flag as 0/1.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects. The input is
            copied, not modified.

    Returns:
        pandas.DataFrame: the decorated DataFrame, passed through
        `clean_df`.
    """
    logging.info("Applying structure featurizers...")

    df = df.copy()

    structure_features = [
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        BagofBonds()
    ]

    featurizer = MultipleFeaturizer([feature.fit(df["structure"]) for feature in structure_features])

    df = featurizer.featurize_dataframe(df, "structure", multiindex=True, ignore_errors=True)
    # Flatten the (featurizer, label) MultiIndex into "featurizer|label".
    df.columns = df.columns.map('|'.join).str.strip('|')

    rdf_column = "RadialDistributionFunction|radial distribution function"
    # Positional access: the first *row*, whatever the index labels are.
    # `df[col][0]` was a label lookup and failed on non-default indexes.
    dist = df[rdf_column].iloc[0]['distances'][:50]
    for i, d in enumerate(dist):
        _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(d)
        df[_rdf_key] = df[rdf_column].apply(lambda x, i=i: x['distribution'][i])

    df = df.drop(rdf_column, axis=1)

    _crystal_system = {
        "cubic": 1, "tetragonal": 2,
        # pymatgen spells this crystal system "orthorhombic"; the misspelled
        # key is kept so that any caller relying on it still works.
        "orthorhombic": 3, "orthorombic": 3,
        "hexagonal": 4, "trigonal": 5, "monoclinic": 6, "triclinic": 7
    }

    def _flag_to_int(flag):
        # Rows that failed to featurize (ignore_errors=True) hold NaN, and
        # int(NaN) raises; encode them as 0.
        return 0 if pd.isna(flag) else int(flag)

    df["GlobalSymmetryFeatures|crystal_system"] = df["GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
    df["GlobalSymmetryFeatures|is_centrosymmetric"] = df["GlobalSymmetryFeatures|is_centrosymmetric"].map(_flag_to_int)

    return clean_df(df)
def generate_data():
    """Featurize the elastic-tensor dataset step by step, saving CSV snapshots.

    After each stage the current dataframe is printed (head) and written to
    a CSV file in the working directory. Nothing is returned.
    """
    data = load_elastic_tensor()
    data.to_csv('原始elastic数据.csv')
    print(data.columns)

    unwanted_columns = [
        'volume', 'nsites', 'compliance_tensor', 'elastic_tensor',
        'elastic_tensor_original', 'K_Voigt', 'G_Voigt', 'K_Reuss', 'G_Reuss'
    ]
    data = data.drop(unwanted_columns, axis=1)
    print(data.head())
    data.to_csv('扔掉不需要的部分.csv')

    # First use describe() to get an overall picture of the data.
    print(data.describe())
    data.describe().to_csv('general_look.csv')
    # Inspecting the data shows nothing unusual.

    data = StrToComposition().featurize_dataframe(data, 'formula')
    print(data.head())
    data.to_csv('引入composition.csv')

    # Next, use one of the featurizers to add a series of descriptors;
    # the "composition" column is the featurizer input.
    element_property = ElementProperty.from_preset(preset_name='magpie')
    data = element_property.featurize_dataframe(data, col_id='composition')
    print(data.head())
    print(element_property.citations())
    data.to_csv('将composition特征化后.csv')

    # Start introducing further featurizers: oxidation-state related features.
    data = CompositionToOxidComposition().featurize_dataframe(
        data, 'composition')
    oxidation_states = OxidationStates()
    data = oxidation_states.featurize_dataframe(
        data, col_id='composition_oxid')
    print(data.head())
    data.to_csv('引入氧化态之后.csv')

    # Besides composition-based features there are many others,
    # for example structure-based ones.
    density_features = DensityFeatures()
    data = density_features.featurize_dataframe(data, 'structure')
    print(data.head())
    data.to_csv('引入结构中的密度.csv')
    print(density_features.feature_labels())
def plot_mean_elastic_tensors():
    """
    Heatmap example (heatmap_df) on real data held in a dataframe.

    Shows how the mean of the elastic constant tensor varies with density
    and crystal system. Note that density is not a categorical variable in
    the final dataframe.

    Returns:
        plotly plot in "offline" mode popped in the default browser.
    """
    data = load_elastic_tensor()

    # data preparation: one scalar per tensor, then the two descriptors
    data['Mean Elastic Constant'] = data['elastic_tensor'].apply(
        lambda tensor: np.mean(tensor))

    symmetry = GlobalSymmetryFeatures(desired_features=['crystal_system'])
    data = symmetry.featurize_dataframe(data, col_id='structure')

    density = DensityFeatures(desired_features=['density'])
    data = density.featurize_dataframe(data, col_id='structure')

    # actual plotting
    plot_columns = ['crystal_system', 'density', 'Mean Elastic Constant']
    figure = PlotlyFig(fontscale=0.75,
                       filename='static_elastic_constants',
                       colorscale='RdBu')
    figure.heatmap_df(data[plot_columns])
def add_cs_features(df,rdf_flag=False):
    """Add composition and structure features to ``df``.

    The ``composition``, ``composition_oxid`` and ``structure`` columns are
    written onto the passed dataframe itself before the featurizers run.

    Args:
        df: dataframe with ``pretty_formula`` and ``structure`` columns.
        rdf_flag: when true, also apply ``RadialDistributionFunction``
            (cutoff 15.0, bin size 0.2) to the structures.

    Returns:
        The dataframe returned by the last featurizer.
    """
    df["composition"] = str_to_composition(df["pretty_formula"])
    df["composition_oxid"] = composition_to_oxidcomposition(df["composition"])
    df["structure"] = dict_to_object(df["structure"])

    # (featurizer, input column) pairs, applied in this order
    stages = [
        (ValenceOrbital(), "composition"),
        (OxidationStates(), "composition_oxid"),
        # structure features
        (DensityFeatures(), "structure"),
    ]
    if rdf_flag:
        stages.append(
            (RadialDistributionFunction(cutoff=15.0,bin_size=0.2), "structure"))

    for stage, column in stages:
        df = stage.featurize_dataframe(df, column)
    return df
def plot_mean_elastic_tensors():
    """
    Heatmap example (heatmap_df) on real data held in a dataframe.

    Shows how the mean of the elastic constant tensor varies with density
    and crystal system. Note that density is not a categorical variable in
    the final dataframe.

    Returns:
        plotly plot in "offline" mode popped in the default browser.
    """
    data = load_dataset("elastic_tensor_2015")

    # data preparation: one scalar per tensor, then the two descriptors
    data['Mean Elastic Constant'] = data['elastic_tensor'].apply(
        lambda tensor: np.mean(tensor))

    symmetry = GlobalSymmetryFeatures(desired_features=['crystal_system'])
    data = symmetry.featurize_dataframe(data, col_id='structure')

    density = DensityFeatures(desired_features=['density'])
    data = density.featurize_dataframe(data, col_id='structure')

    # actual plotting
    plot_columns = ['crystal_system', 'density', 'Mean Elastic Constant']
    figure = PlotlyFig(fontscale=0.75,
                       filename='static_elastic_constants',
                       colorscale='RdBu')
    figure.heatmap_df(data[plot_columns])
def test_featurizers_by_users(self):
    """AutoFeaturizer must use exactly the featurizers the user passes in."""
    target = "K_VRH"
    subset = copy.copy(self.test_df.iloc[:self.limit])

    density = DensityFeatures()
    symmetry = GlobalSymmetryFeatures()
    auto = AutoFeaturizer(featurizers={"structure": [density, symmetry]})
    featurized = auto.fit_transform(subset, target)

    # Ensure that the featurizers are not set automatically, metaselection
    # is not used, exclude is None and featurizers not passed by the users
    # are not used.
    self.assertFalse(auto.auto_featurizer)
    self.assertTrue(auto.exclude == [])
    for user_featurizer in (density, symmetry):
        self.assertIn(user_featurizer, auto.featurizers["structure"])

    # None of the ElementProperty labels may show up in the output columns.
    unwanted_labels = ElementProperty.from_preset("matminer").feature_labels()
    self.assertFalse(
        any(label in featurized.columns for label in unwanted_labels))
# Script fragment: `df` is expected to exist already, with "composition" and
# "structure" columns plus the elastic-property columns listed in `excluded`
# below (NOTE(review): `df` is built outside this fragment — confirm).

# Composition descriptors from the "magpie" ElementProperty preset.
from matminer.featurizers.composition import ElementProperty
ep_feat = ElementProperty.from_preset(preset_name="magpie")
df = ep_feat.featurize_dataframe(df, col_id='composition')

# Oxidation-state descriptors, computed on the "composition_oxid" column that
# the conversion featurizer is applied for.
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates
df = CompositionToOxidComposition().featurize_dataframe(df, "composition")
os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, "composition_oxid")

# Structure-based density descriptors.
from matminer.featurizers.structure import DensityFeatures
df_feat = DensityFeatures()
df = df_feat.featurize_dataframe(df, col_id='structure')

# Target is K_VRH; every column not listed in `excluded` becomes a descriptor.
y = df['K_VRH'].values
excluded = ["G_VRH", "K_VRH", "elastic_anisotropy", "formula", "material_id", "poisson_ratio", "structure", "composition", "composition_oxid"]
X = df.drop(excluded, axis=1)
print("There are {} possible descriptors:\n\n{}".format(X.shape[1], X.columns.values))

# Fit a linear model and print its score on the same data it was fitted on
# (no train/test split in this fragment).
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
lr = LinearRegression()
lr.fit(X, y)
print(lr.score(X, y))
class FUTURE_PROSPECTS_2021(featurizer.extendedMODFeaturizer):
    """Featurizer preset built on matminer featurizers.

    Declares tuples of composition, oxidation-state composition, structure,
    site, DOS and band-structure featurizers, and post-processes the
    dataframes produced by the parent class so that categorical outputs
    become numeric before `clean_df` is applied.
    """

    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        CohesiveEnergy,
        ElectronAffinity,
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        BagofBonds,
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )
    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )
    from matminer.featurizers.dos import (
        DOSFeaturizer,
        SiteDOS,
        Hybridization,
        DosAsymmetry,
    )
    from matminer.featurizers.bandstructure import (
        BandFeaturizer,
        BranchPointEnergy
    )

    # Featurizers applied to plain compositions.
    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    # Featurizers applied to oxidation-state decorated compositions.
    oxid_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    # Featurizers applied to structures.
    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        #PartialRadialDistributionFunction(), #Introduces a large amount of features
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
    )

    # Featurizers applied per site.
    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    # Featurizers applied to the density of states.
    dos_featurizers = (
        DOSFeaturizer(),
        SiteDOS(),
        Hybridization()
    )

    # Featurizers applied to the band structure.
    band_featurizers = (
        BandFeaturizer(),
        BranchPointEnergy()
    )

    def __init__(self, n_jobs=None):
        """Store the requested number of parallel jobs on the instance."""
        self._n_jobs = n_jobs

    def featurize_composition(self, df):
        """Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        HOMO/LUMO orbital characters are encoded as s=1, p=2, d=3, f=4 and
        HOMO/LUMO elements as their atomic number (-1 when the value is not
        a string).
        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map(
            _orbitals
        )
        df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map(
            _orbitals
        )

        df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )
        df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )

        return clean_df(df)

    def featurize_structure(self, df):
        """Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The radial distribution function column is expanded into one scalar
        column per distance bin (first 50 bins), the crystal system is
        encoded as an integer and the centrosymmetry flag as 0/1 (missing
        values become 0).
        """
        df = super().featurize_structure(df)

        rdf_column = "RadialDistributionFunction|radial distribution function"
        dist = df[rdf_column].iloc[0]["distances"][:50]
        for i, d in enumerate(dist):
            _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(
                d
            )
            df[_rdf_key] = df[rdf_column].apply(
                lambda x, i=i: x["distribution"][i]
            )

        df = df.drop(rdf_column, axis=1)

        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            # pymatgen spells this crystal system "orthorhombic"; the
            # misspelled key is kept so existing callers are unaffected.
            "orthorhombic": 3,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7,
        }

        def _int_map(x):
            # `x == np.nan` is always False and NaN is truthy, so the former
            # check mapped missing values to 1. Test for NaN explicitly.
            if pd.isna(x):
                return 0
            return 1 if x else 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"
        ].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"
        ].map(_int_map)

        return clean_df(df)

    def featurize_dos(self, df):
        """Applies the preset DOS featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The VBM/CBM species columns are one-hot encoded, the VBM/CBM orbital
        characters are encoded as s=1, p=2, d=3, f=4, and the
        ``number;number;number`` location columns are split into three float
        columns each. The raw ``dos`` column is dropped.
        """
        df = super().featurize_dos(df)

        hotencodeColumns = ["DOSFeaturizer|vbm_specie_1", "DOSFeaturizer|cbm_specie_1"]

        one_hot = pd.get_dummies(df[hotencodeColumns])
        df = df.drop(hotencodeColumns, axis=1).join(one_hot)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}

        df["DOSFeaturizer|vbm_character_1"] = df[
            "DOSFeaturizer|vbm_character_1"
        ].map(_orbitals)
        df["DOSFeaturizer|cbm_character_1"] = df[
            "DOSFeaturizer|cbm_character_1"
        ].map(_orbitals)

        # Splitting one feature into several floating features
        # e.g. number;number;number into three columns
        splitColumns = ["DOSFeaturizer|cbm_location_1", "DOSFeaturizer|vbm_location_1"]

        for column in splitColumns:
            try:
                newColumns = df[column].str.split(";", n=2, expand=True)
                for i in range(0, 3):
                    # Builtin `float`: the `np.float` alias no longer exists
                    # in current NumPy, and the resulting AttributeError was
                    # silently swallowed, so no column was ever created.
                    df[column + "_" + str(i)] = np.array(newColumns[i]).astype(float)
            except (AttributeError, KeyError, TypeError, ValueError):
                # Best effort: skip a column that is missing, not
                # string-valued, or not made of three numeric parts.
                continue
        df = df.drop(splitColumns, axis=1)
        df = df.drop(["dos"], axis=1)
        return clean_df(df)

    def featurize_bandstructure(self, df):
        """Applies the preset band structure featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        ``is_gap_direct`` is encoded as 0/1; the raw ``bandstructure``
        column is dropped.
        """
        df = super().featurize_bandstructure(df)

        def _int_map(x):
            # Compared via str() so bool and string inputs are both handled;
            # anything else maps to None.
            if str(x) == "False":
                return 0
            elif str(x) == "True":
                return 1
            return None

        df["BandFeaturizer|is_gap_direct"] = df[
            "BandFeaturizer|is_gap_direct"
        ].map(_int_map)

        df = df.drop(["bandstructure"], axis=1)

        return clean_df(df)

    def featurize_site(self, df):
        """Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Columns whose values are all zero are removed.
        """
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)
        # Keep only columns with at least one non-zero entry.
        df = df.loc[:, (df != 0).any(axis=0)]

        return clean_df(df)
def _extract_features(self, df_input):
    """
    Extract features using Matminer from the 'structure' column in df_input

    Note:
        ``df_input`` is modified in place: the 'theoretical' and
        'structure' columns are dropped from it and the feature columns
        are added to it.

    Args:
        df_input (DataFrame): Pandas DataFrame which contains features
            from Materials Project Database of the input samples. Must
            hold the 'theoretical', 'structure',
            'formation_energy_per_atom', 'PU_label' and 'icsd_ids'
            columns.

    Returns:
        df_extracted (DataFrame): Pandas DataFrame which contains
            features of input samples extracted using Matminer. Rows with
            ``cohesive_energy <= 0`` are removed, 'icsd_ids' is dropped
            and 'PU_label' is the last column.
    """
    # Dropping the 'theoretical' column
    df_input.drop(columns=["theoretical"], inplace=True)

    # Extracting the features
    dfeat = DensityFeatures()
    symmfeat = GlobalSymmetryFeatures()
    mfeat = Meredig()
    cefeat = CohesiveEnergy()

    # Featurize each structure once and pick the components afterwards,
    # instead of re-running the featurizer for every output column.
    density_rows = [dfeat.featurize(s) for s in df_input.structure]
    df_input["density"] = [row[0] for row in density_rows]
    df_input["vpa"] = [row[1] for row in density_rows]
    df_input["packing fraction"] = [row[2] for row in density_rows]

    df_input["spacegroup_num"] = df_input.structure.apply(
        lambda x: symmfeat.featurize(x)[0])
    df_input["cohesive_energy"] = df_input.apply(
        lambda x: cefeat.featurize(
            x.structure.composition,
            formation_energy_per_atom=x.formation_energy_per_atom,
        )[0],
        axis=1,
    )

    # Same idea for the Meredig descriptors: one call per composition.
    meredig_rows = [mfeat.featurize(s.composition) for s in df_input.structure]
    df_input["mean AtomicWeight"] = [row[-17] for row in meredig_rows]
    df_input["range AtomicRadius"] = [row[-12] for row in meredig_rows]
    df_input["mean AtomicRadius"] = [row[-11] for row in meredig_rows]
    df_input["range Electronegativity"] = [row[-10] for row in meredig_rows]
    df_input["mean Electronegativity"] = [row[-9] for row in meredig_rows]

    # Drop 'structure' column
    df_input.drop(columns=["structure"], inplace=True)

    # ignore compounds that failed to featurize
    # numeric_only=True: recent pandas raises on non-numeric columns when
    # computing the column means.
    df_extracted = df_input.fillna(
        df_input.mean(numeric_only=True)).query("cohesive_energy > 0.0")

    # Re-arranging the 'PU Label' column
    pu_label = df_extracted["PU_label"]
    df_extracted = df_extracted.drop(["PU_label"], axis=1)
    df_extracted["PU_label"] = pu_label

    # Drop the icsd_ids column
    df_extracted.drop(columns=["icsd_ids"], inplace=True)

    return df_extracted
def AddFeatures(df):
    """Add matminer composition and structure features to ``df``.

    ``df`` needs a "formula" column (converted to "composition") and a
    "structure" column. Featurizers from the electron-affinity step onward
    run with ``ignore_errors=True``.

    Returns the featurized dataframe.
    """
    # Add features by Matminer
    from matminer.featurizers.conversions import (
        CompositionToOxidComposition,
        StrToComposition,
    )
    from matminer.featurizers.composition import (
        BandCenter,
        CohesiveEnergy,
        ElectronAffinity,
        ElementProperty,
        Miedema,
        OxidationStates,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        ChemicalOrdering,
        DensityFeatures,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        MinimumRelativeDistances,
        SiteStatsFingerprint,
        StructuralComplexity,
        StructuralHeterogeneity,
    )
    from matminer.featurizers.site import (
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        CoordinationNumber,
    )
    from pymatgen.analysis.local_env import CrystalNN

    df = StrToComposition().featurize_dataframe(df, "formula")

    # input the "composition" column to the featurizer
    magpie = ElementProperty.from_preset(preset_name="magpie")
    df = magpie.featurize_dataframe(df, col_id="composition")

    df = CompositionToOxidComposition().featurize_dataframe(df, "composition")
    df = OxidationStates().featurize_dataframe(df, "composition_oxid")

    # Each factory is called right before use, one featurizer at a time.
    oxid_factories = (
        ElectronAffinity,
        BandCenter,
        CohesiveEnergy,
        Miedema,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    for build in oxid_factories:
        df = build().featurize_dataframe(
            df, "composition_oxid", ignore_errors=True)

    # This is the border between compositional features and structural
    # features. Comment out the following featurizers to use only
    # compositional features.
    structure_factories = (
        GlobalSymmetryFeatures,
        StructuralComplexity,
        ChemicalOrdering,
        MaximumPackingEfficiency,
        MinimumRelativeDistances,
        StructuralHeterogeneity,
        lambda: SiteStatsFingerprint(
            AverageBondLength(CrystalNN(search_cutoff=20))),
        lambda: SiteStatsFingerprint(
            AverageBondAngle(CrystalNN(search_cutoff=20))),
        lambda: SiteStatsFingerprint(BondOrientationalParameter()),
        lambda: SiteStatsFingerprint(CoordinationNumber()),
        DensityFeatures,
    )
    for build in structure_factories:
        df = build().featurize_dataframe(df, "structure", ignore_errors=True)

    return df
ep_feat = ElementProperty.from_preset(preset_name="magpie") data_3 = ep_feat.featurize_dataframe(data_3, col_id="composition") from matminer.featurizers.conversions import CompositionToOxidComposition from matminer.featurizers.composition import OxidationStates data_3 = CompositionToOxidComposition().featurize_dataframe( data_3, "composition") os_feat = OxidationStates() data_3 = os_feat.featurize_dataframe(data_3, "composition_oxid") from matminer.featurizers.structure import DensityFeatures df_feat = DensityFeatures() data_3 = df_feat.featurize_dataframe(data_3, "structure") unwanted_columns = [ "elasticity", "material_id", "nsites", "compliance_tensor", "elastic_tensor", "elastic_tensor_original", "K_Voigt", "G_Voigt", "K_Reuss", "G_Reuss", "warnings" ] data_4 = data_3.drop(unwanted_columns, axis=1) # In[ ]: # Additional data cleaning after some trial runs y = data_4['K_VRH'].values excluded = [ "G_VRH", "K_VRH", "elastic_anisotropy", "pretty_formula", "poisson_ratio",
def predict_log10_eps(
    target: Union[Structure, Composition],
    dielectric_type: str,
    model_type: str,
) -> float:
    """
    Build the descriptor vector for ``target`` and return the prediction of
    the stored regression model for it.

    The model and scaler are loaded from
    ``{dielectric_type}_{model_type}.joblib`` and
    ``{dielectric_type}_{model_type}_scaler.joblib`` located next to this
    file. The passed ``target`` is not modified.

    :param target: structure or composition to predict dielectric constants
    :param dielectric_type: "el" or "ion"
    :param model_type: "comp" or "comp_st"
    :return: first element of the model prediction for the scaled descriptor
    :raises ValueError: for an unknown ``dielectric_type`` / ``model_type``,
        or when ``model_type`` is "comp_st" and ``target`` is a Composition
    """
    if dielectric_type not in ["el", "ion"]:
        raise ValueError(
            f'Specify dielectric type "el" or "ion"\nInput: {dielectric_type}')
    if model_type not in ["comp", "comp_st"]:
        raise ValueError(
            f'Specify regression_type "comp" or "comp_st"\nInput: {model_type}'
        )

    if model_type == "comp":
        if isinstance(target, Structure):
            comp = target.composition
        else:
            comp = target
        comp_oxi = comp.add_charges_from_oxi_state_guesses()
        if dielectric_type == "el":
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"),
                                  comp)
            valence = ScalarFeaturizer(ValenceOrbital(), comp)
            ion_prop = ScalarFeaturizer(IonProperty(), comp)
            en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi)
            oxi_state = ScalarFeaturizer(OxidationStates.from_preset("deml"),
                                         comp_oxi)
            atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp)
            descriptor = [
                ep.get_from_label("PymatgenData minimum X"),
                ep.get_from_label("PymatgenData range X"),
                ep.get_from_label("PymatgenData std_dev X"),
                ep.get_from_label("PymatgenData mean row"),
                ep.get_from_label("PymatgenData std_dev row"),
                ep.get_from_label("PymatgenData mean group"),
                ep.get_from_label("PymatgenData mean block"),
                ep.get_from_label("PymatgenData std_dev block"),
                ep.get_from_label("PymatgenData mean atomic_mass"),
                ep.get_from_label("PymatgenData std_dev atomic_mass"),
                ep.get_from_label("PymatgenData std_dev atomic_radius"),
                ep.get_from_label("PymatgenData minimum mendeleev_no"),
                ep.get_from_label("PymatgenData range mendeleev_no"),
                ep.get_from_label("PymatgenData std_dev mendeleev_no"),
                ep.get_from_label("PymatgenData mean thermal_conductivity"),
                ep.get_from_label("PymatgenData std_dev thermal_conductivity"),
                ep.get_from_label("PymatgenData mean melting_point"),
                ep.get_from_label("PymatgenData std_dev melting_point"),
                valence.get_from_label("avg s valence electrons"),
                valence.get_from_label("avg d valence electrons"),
                valence.get_from_label("frac s valence electrons"),
                valence.get_from_label("frac p valence electrons"),
                valence.get_from_label("frac d valence electrons"),
                ion_prop.get_from_label("avg ionic char"),
                TMetalFraction().featurize(comp)[0],
                en_diff.get_from_label("maximum EN difference"),
                en_diff.get_from_label("range EN difference"),
                en_diff.get_from_label("mean EN difference"),
                en_diff.get_from_label("std_dev EN difference"),
                BandCenter().featurize(comp)[0],
                oxi_state.get_from_label("std_dev oxidation state"),
                atomic_orbital.get_from_label("HOMO_energy"),
                atomic_orbital.get_from_label("LUMO_energy"),
                atomic_orbital.get_from_label("gap_AO"),
            ]
        elif dielectric_type == "ion":
            stoich = ScalarFeaturizer(Stoichiometry(), comp)
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"),
                                  comp)
            valence = ScalarFeaturizer(ValenceOrbital(), comp)
            ion_prop = ScalarFeaturizer(IonProperty(), comp)
            en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi)
            oxi_state = ScalarFeaturizer(OxidationStates.from_preset("deml"),
                                         comp_oxi)
            atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp)
            at_pack_eff = ScalarFeaturizer(AtomicPackingEfficiency(), comp)
            descriptor = [
                stoich.get_from_label("3-norm"),
                stoich.get_from_label("5-norm"),
                ep.get_from_label("PymatgenData mean X"),
                ep.get_from_label("PymatgenData mean row"),
                ep.get_from_label("PymatgenData std_dev row"),
                ep.get_from_label("PymatgenData std_dev group"),
                ep.get_from_label("PymatgenData mean block"),
                ep.get_from_label("PymatgenData std_dev block"),
                ep.get_from_label("PymatgenData maximum atomic_mass"),
                ep.get_from_label("PymatgenData range atomic_mass"),
                ep.get_from_label("PymatgenData mean atomic_mass"),
                ep.get_from_label("PymatgenData std_dev atomic_mass"),
                ep.get_from_label("PymatgenData maximum atomic_radius"),
                ep.get_from_label("PymatgenData range atomic_radius"),
                ep.get_from_label("PymatgenData mean atomic_radius"),
                ep.get_from_label("PymatgenData std_dev atomic_radius"),
                ep.get_from_label("PymatgenData minimum mendeleev_no"),
                ep.get_from_label("PymatgenData mean mendeleev_no"),
                ep.get_from_label("PymatgenData std_dev mendeleev_no"),
                ep.get_from_label("PymatgenData mean thermal_conductivity"),
                ep.get_from_label("PymatgenData std_dev thermal_conductivity"),
                ep.get_from_label("PymatgenData mean melting_point"),
                ep.get_from_label("PymatgenData std_dev melting_point"),
                valence.get_from_label("avg s valence electrons"),
                valence.get_from_label("frac s valence electrons"),
                valence.get_from_label("frac p valence electrons"),
                valence.get_from_label("frac d valence electrons"),
                ion_prop.get_from_label("avg ionic char"),
                TMetalFraction().featurize(comp)[0],
                en_diff.get_from_label("minimum EN difference"),
                en_diff.get_from_label("range EN difference"),
                en_diff.get_from_label("mean EN difference"),
                en_diff.get_from_label("std_dev EN difference"),
                oxi_state.get_from_label("range oxidation state"),
                oxi_state.get_from_label("std_dev oxidation state"),
                atomic_orbital.get_from_label("LUMO_energy"),
                atomic_orbital.get_from_label("gap_AO"),
                at_pack_eff.get_from_label("mean simul. packing efficiency"),
                at_pack_eff.get_from_label(
                    "mean abs simul. packing efficiency"),
                at_pack_eff.get_from_label(
                    "dist from 1 clusters |APE| < 0.010"),
                at_pack_eff.get_from_label(
                    "dist from 3 clusters |APE| < 0.010"),
                at_pack_eff.get_from_label(
                    "dist from 5 clusters |APE| < 0.010"),
            ]
    elif model_type == "comp_st":
        if isinstance(target, Composition):
            raise ValueError(
                'comp_st (Using compositional and structural descriptor) is specified, '
                'but target is composition')
        comp: Composition = target.composition
        comp_oxi = comp.add_charges_from_oxi_state_guesses()
        # Keep the caller's structure untouched: the oxidation-state
        # decoration is applied to a private copy. Decorating the argument
        # itself leaked into the caller and changed `target.composition`
        # on any later call with the same object.
        target_orig = target
        target = deepcopy(target_orig)
        target.add_oxidation_state_by_guess()
        # One featurizer instance for all sites instead of a new one per site.
        ewald_site_energy = EwaldSiteEnergy(accuracy=4)
        if dielectric_type == "el":
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"),
                                  comp)
            valence = ScalarFeaturizer(ValenceOrbital(), comp)
            en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi)
            atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp)
            density = ScalarFeaturizer(DensityFeatures(), target)
            # Computed on the undecorated structure.
            dist_btw_nn = MinimumRelativeDistances().featurize(target_orig)
            opsf = SiteFeaturizer(OPSiteFingerprint(), target)
            voro_fp = SiteFeaturizer(VoronoiFingerprint(use_symm_weights=True),
                                     target)
            gsf = SiteFeaturizer(GaussianSymmFunc(), target)
            lpd = SiteFeaturizer(
                LocalPropertyDifference.from_preset("ward-prb-2017"), target)
            descriptor = [
                ep.get_from_label("PymatgenData std_dev X"),
                ep.get_from_label("PymatgenData mean block"),
                ep.get_from_label("PymatgenData std_dev atomic_mass"),
                valence.get_from_label("frac d valence electrons"),
                TMetalFraction().featurize(comp)[0],
                en_diff.get_from_label("maximum EN difference"),
                en_diff.get_from_label("mean EN difference"),
                atomic_orbital.get_from_label("HOMO_energy"),
                atomic_orbital.get_from_label("LUMO_energy"),
                density.get_from_label("density"),
                np.mean(dist_btw_nn),
                np.std(dist_btw_nn),
                opsf.get_from_label_func("tetrahedral CN_4", np.max),
                opsf.get_from_label_func("rectangular see-saw-like CN_4",
                                         np.max),
                np.max([
                    ewald_site_energy.featurize(target, i)
                    for i in range(target.num_sites)
                ]),
                voro_fp.get_from_label_func("Voro_area_std_dev", np.max),
                voro_fp.get_from_label_func("Voro_area_std_dev", np.mean),
                voro_fp.get_from_label_func("Voro_dist_minimum", np.min),
                voro_fp.get_from_label_func("Voro_dist_minimum", np.std),
                gsf.get_from_label_func("G2_20.0", np.std),
                gsf.get_from_label_func("G2_80.0", np.max),
                gsf.get_from_label_func("G4_0.005_4.0_-1.0", np.mean),
                lpd.get_from_label_func("local difference in NdValence",
                                        np.mean),
                lpd.get_from_label_func("local difference in NValence",
                                        np.min),
                lpd.get_from_label_func("local difference in NValence",
                                        np.std),
                lpd.get_from_label_func("local difference in NdUnfilled",
                                        np.mean),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.min),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.mean),
                lpd.get_from_label_func("local difference in GSmagmom",
                                        np.mean)
            ]
        elif dielectric_type == "ion":
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"),
                                  comp)
            atomic_orbitals = ScalarFeaturizer(AtomicOrbitals(), comp)
            density = ScalarFeaturizer(DensityFeatures(), target)
            str_het = ScalarFeaturizer(StructuralHeterogeneity(), target)
            opsf = SiteFeaturizer(OPSiteFingerprint(), target)
            voro_fp = SiteFeaturizer(VoronoiFingerprint(use_symm_weights=True),
                                     target)
            gsf = SiteFeaturizer(GaussianSymmFunc(), target)
            lpd = SiteFeaturizer(
                LocalPropertyDifference.from_preset("ward-prb-2017"), target)
            descriptor = [
                ep.get_from_label("PymatgenData std_dev row"),
                ep.get_from_label("PymatgenData mean thermal_conductivity"),
                ep.get_from_label("PymatgenData std_dev melting_point"),
                TMetalFraction().featurize(comp)[0],
                atomic_orbitals.get_from_label("gap_AO"),
                density.get_from_label("density"),
                density.get_from_label("packing fraction"),
                str_het.get_from_label("mean neighbor distance variation"),
                str_het.get_from_label("avg_dev neighbor distance variation"),
                opsf.get_from_label_func("sgl_bd CN_1", np.mean),
                opsf.get_from_label_func("bent 150 degrees CN_2", np.mean),
                opsf.get_from_label_func("linear CN_2", np.mean),
                opsf.get_from_label_func("trigonal planar CN_3", np.mean),
                opsf.get_from_label_func("pentagonal planar CN_5", np.std),
                opsf.get_from_label_func("octahedral CN_6", np.max),
                opsf.get_from_label_func("octahedral CN_6", np.std),
                opsf.get_from_label_func("q6 CN_12", np.mean),
                np.max([
                    ewald_site_energy.featurize(target, i)
                    for i in range(target.num_sites)
                ]),
                voro_fp.get_from_label_func("Symmetry_weighted_index_4",
                                            np.std),
                voro_fp.get_from_label_func("Voro_vol_maximum", np.mean),
                voro_fp.get_from_label_func("Voro_area_std_dev", np.mean),
                voro_fp.get_from_label_func("Voro_area_minimum", np.std),
                voro_fp.get_from_label_func("Voro_area_maximum", np.min),
                voro_fp.get_from_label_func("Voro_dist_std_dev", np.mean),
                gsf.get_from_label_func("G2_80.0", np.min),
                gsf.get_from_label_func("G4_0.005_4.0_1.0", np.std),
                lpd.get_from_label_func("local difference in Number", np.max),
                lpd.get_from_label_func("local difference in MendeleevNumber",
                                        np.max),
                lpd.get_from_label_func("local difference in MendeleevNumber",
                                        np.min),
                lpd.get_from_label_func("local difference in AtomicWeight",
                                        np.max),
                lpd.get_from_label_func("local difference in AtomicWeight",
                                        np.mean),
                lpd.get_from_label_func("local difference in MeltingT",
                                        np.mean),
                lpd.get_from_label_func("local difference in Row", np.max),
                lpd.get_from_label_func(
                    "local difference in Electronegativity", np.min),
                lpd.get_from_label_func("local difference in NValence",
                                        np.std),
                lpd.get_from_label_func("local difference in NsUnfilled",
                                        np.mean),
                lpd.get_from_label_func("local difference in NdUnfilled",
                                        np.max),
                lpd.get_from_label_func("local difference in NdUnfilled",
                                        np.std),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.max),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.min),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.mean),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.std),
                lpd.get_from_label_func("local difference in GSvolume_pa",
                                        np.max),
                lpd.get_from_label_func("local difference in GSvolume_pa",
                                        np.min),
                lpd.get_from_label_func("local difference in SpaceGroupNumber",
                                        np.max),
            ]

    # NOTE(review): joblib.load unpickles the files; they are read from this
    # package's own directory, so they must stay trusted artifacts.
    with open(
            f"{os.path.dirname(__file__)}/{dielectric_type}_{model_type}.joblib",
            "rb") as fr:
        model: RandomForestRegressor = joblib.load(fr)
    with open(
            f"{os.path.dirname(__file__)}/{dielectric_type}_{model_type}_scaler.joblib",
            "rb") as fr:
        scaler: StandardScaler = joblib.load(fr)

    # The scaler expects a 2-D input, hence the single-row wrapper.
    descriptor = scaler.transform([descriptor])
    return model.predict(descriptor)[0]
class DeBreuck2020Featurizer(modnet.featurizers.MODFeaturizer):
    """Featurizer presets from De Breuck, Hautier & Rignanese (2020).

    Featurizer presets used for the paper 'Machine learning materials
    properties for small datasets' by Pierre-Paul De Breuck, Geoffroy
    Hautier & Gian-Marco Rignanese, arXiv:2004.14766 (2020).

    Uses most of the featurizers implemented by matminer at the time of
    writing with their default hyperparameters and presets.

    The class exposes four tuples of instantiated matminer featurizers
    (``composition_featurizers``, ``oxide_composition_featurizers``,
    ``structure_featurizers`` and ``site_featurizers``) and overrides the
    three ``featurize_*`` hooks of the parent class to post-process the
    resulting dataframes (encoding of categorical columns, expansion of the
    radial distribution function, cleaning).
    """

    # NOTE: the imports live in the class body, so every imported name is
    # also reachable as a class attribute.
    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        # CohesiveEnergy, - This descriptor was not used in the paper preset
        # ElectronAffinity, - This descriptor was not used in the paper preset
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        # BagofBonds, - This descriptor was not used in the paper preset
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        # PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )
    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )

    # Featurizers applied to compositions.
    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    # Composition featurizers kept in a separate tuple; presumably they are
    # only applied when oxidation states are available - confirm against the
    # parent class.
    oxide_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    # Featurizers applied to whole structures.
    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        # PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        # BagofBonds(),
    )

    # Featurizers applied per site.
    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    def featurize_composition(self, df):
        """Apply the preset composition featurizers and encode the output.

        Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Args:
            df: the dataframe handed to the parent class'
                ``featurize_composition``.

        Returns:
            The featurized dataframe in which the ``AtomicOrbitals``
            HOMO/LUMO character columns are mapped to integers (s=1, p=2,
            d=3, f=4), the HOMO/LUMO element columns are replaced by the
            atomic number (``-1`` for non-string entries), infinities and
            NaN are replaced by 0, and the result is passed through
            ``modnet.featurizers.clean_df``.
        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}

        def _atomic_number(symbol):
            # Non-string entries (e.g. NaN from a failed featurization) have
            # no element to look up and are encoded as -1.
            return Element(symbol).Z if isinstance(symbol, str) else -1

        for level in ("HOMO", "LUMO"):
            character_col = f"AtomicOrbitals|{level}_character"
            element_col = f"AtomicOrbitals|{level}_element"
            df[character_col] = df[character_col].map(_orbitals)
            df[element_col] = df[element_col].apply(_atomic_number)

        df = df.replace([np.inf, -np.inf, np.nan], 0)
        return modnet.featurizers.clean_df(df)

    def featurize_structure(self, df):
        """Apply the preset structure featurizers and encode the output.

        Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Args:
            df: the dataframe handed to the parent class'
                ``featurize_structure``.

        Returns:
            The featurized dataframe in which the radial distribution
            function column is expanded into one scalar column per distance
            bin (first 50 bins of the first row), the crystal system is
            mapped to an integer, ``is_centrosymmetric`` is mapped to 0/1
            (missing values become 0), and the result is passed through
            ``modnet.featurizers.clean_df``.
        """
        df = super().featurize_structure(df)

        rdf_col = "RadialDistributionFunction|radial distribution function"

        # The distance grid is read from the first row only; every row is
        # assumed to share it (same featurizer settings for all structures).
        dist = df[rdf_col].iloc[0]['distances'][:50]
        for i, d in enumerate(dist):
            _rdf_key = f"{rdf_col}|d_{d:.2f}"
            # Bind ``i`` as a default so the lambda does not depend on the
            # loop variable through late binding.
            df[_rdf_key] = df[rdf_col].apply(
                lambda x, i=i: x['distribution'][i])
        df = df.drop(rdf_col, axis=1)

        # NOTE(review): "orthorombic" looks like a misspelling of
        # "orthorhombic"; if the featurizer emits the latter, those rows map
        # to NaN here. Left untouched because changing it would alter the
        # features seen by previously trained models - confirm before fixing.
        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7,
        }

        def _int_map(x):
            # ``x == np.nan`` is always False, and NaN is truthy, so NaN has
            # to be detected explicitly to be encoded as 0 rather than 1.
            if isinstance(x, (float, np.floating)) and np.isnan(x):
                return 0
            return 1 if x else 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"].map(_int_map)

        return modnet.featurizers.clean_df(df)

    def featurize_site(self, df):
        """Apply the preset site featurizers and clean the output.

        Applies the preset site featurizers to the input dataframe, renames
        some fields and cleans the output dataframe.

        Args:
            df: the dataframe handed to the parent class'
                ``featurize_site``.

        Returns:
            The featurized dataframe without the columns that are zero in
            every row, passed through ``modnet.featurizers.clean_df``.
        """
        # rename some features for backwards compatibility with pretrained
        # models
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)

        # Keep only the columns holding at least one non-zero entry.
        df = df.loc[:, (df != 0).any(axis=0)]

        return modnet.featurizers.clean_df(df)
structlist.append([Structure.from_file(directoryname + i) ]) #Converts CIF to pymatgen structure object namelist.append(os.path.splitext(i)[0]) #Collects all the structure names structs.append(Structure.from_file(directoryname + i)) #Creates Pandas dataframe with data being a list of structures and the row name being the structure name dftest = pd.DataFrame(data=structlist, index=namelist, columns=namecolumns) p = PartialRadialDistributionFunction() p.fit(np.asarray(structs)) c = CoulombMatrix() c.fit(np.asarray(structs)) erdf = ElectronicRadialDistributionFunction() erdf.cutoff = 10 #longest diagonal of lattice...I picked a number semi-arbitrarily #Featurizes the structures featurizer = MultipleFeaturizer([ ElementProperty.from_preset('magpie'), OxidationStates(), AtomicOrbitals(), BandCenter(), ElectronegativityDiff(), DensityFeatures(), RadialDistributionFunction(), p, c, erdf ]) r = (featurizer.featurize_many(dftest, ['structure']) ) #Featurizes entire Pandas Dataframe #Yay it runs!