def get_structure_properties(structure: Structure, mode: str = 'all') -> dict: if mode == 'all': featurizer = MultipleFeaturizer([ SiteStatsFingerprint.from_preset( 'CoordinationNumber_ward-prb-2017'), StructuralHeterogeneity(), ChemicalOrdering(), DensityFeatures(), MaximumPackingEfficiency(), SiteStatsFingerprint.from_preset( 'LocalPropertyDifference_ward-prb-2017'), StructureComposition(Stoichiometry()), StructureComposition(ElementProperty.from_preset('magpie')), StructureComposition(ValenceOrbital(props=['frac'])), ]) else: # Calculate only those which do not need a Voronoi tesselation featurizer = MultipleFeaturizer([ DensityFeatures(), StructureComposition(Stoichiometry()), StructureComposition(ElementProperty.from_preset('magpie')), StructureComposition(ValenceOrbital(props=['frac'])), ]) X = featurizer.featurize(structure) matminer_dict = dict(list(zip(featurizer.feature_labels(), X))) matminer_dict['volume'] = structure.volume return matminer_dict
def featurize_structures(self, featurizer=None, **kwargs): """ Featurizes the hypothetical structures available from hypo_structures method. Hypothetical structures for which featurization fails are removed and valid structures are made available as valid_structures Args: featurizer (Featurizer): A MatMiner Featurizer. Defaults to MultipleFeaturizer with PRB Ward Voronoi descriptors. **kwargs (dict): kwargs passed to featurize_many method of featurizer. Returns: (pandas.DataFrame): features """ # Note the redundancy here is for pandas to work if self.hypo_structures is None: warnings.warn("No structures available. Generating structures.") self.get_structures() print("Generating features") featurizer = featurizer if featurizer else MultipleFeaturizer([ SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"), StructuralHeterogeneity(), ChemicalOrdering(), MaximumPackingEfficiency(), SiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017"), StructureComposition(Stoichiometry()), StructureComposition(ElementProperty.from_preset("magpie")), StructureComposition(ValenceOrbital(props=['frac'])), StructureComposition(IonProperty(fast=True)) ]) features = featurizer.featurize_many( self.hypo_structures['structure'], ignore_errors=True, **kwargs) n_species, formula = [], [] for s in self.hypo_structures['structure']: n_species.append(len(s.composition.elements)) formula.append(s.composition.formula) self._features_df = pd.DataFrame.from_records( features, columns=featurizer.feature_labels()) self._features_df.index = self.hypo_structures.index self._features_df['N_species'] = n_species self._features_df['Composition'] = formula self._features_df['structure'] = self.hypo_structures['structure'] self.features = self._features_df.dropna(axis=0, how='any') self.features = self.features.reindex(sorted(self.features.columns), axis=1) self._valid_structure_labels = list(self.features.index) self.valid_structures = self.hypo_structures.loc[self._valid_structure_labels] print("{} out of {} structures were successfully featurized.".format( self.features.shape[0], self._features_df.shape[0])) return self.features
def featurize_composition(df: pd.DataFrame) -> pd.DataFrame: """ Decorate input `pandas.DataFrame` of structures with composition features from matminer. Currently applies the set of all matminer composition features. Args: df (pandas.DataFrame): the input dataframe with `"structure"` column containing `pymatgen.Structure` objects. Returns: pandas.DataFrame: the decorated DataFrame. """ logging.info("Applying composition featurizers...") df = df.copy() df['composition'] = df['structure'].apply(lambda s: s.composition) featurizer = MultipleFeaturizer([ElementProperty.from_preset("magpie"), AtomicOrbitals(), BandCenter(), # ElectronAffinity(), - This descriptor was not used in the paper preset Stoichiometry(), ValenceOrbital(), IonProperty(), ElementFraction(), TMetalFraction(), # CohesiveEnergy(), - This descriptor was not used in the paper preset Miedema(), YangSolidSolution(), AtomicPackingEfficiency(), ]) df = featurizer.featurize_dataframe(df, "composition", multiindex=True, ignore_errors=True) df.columns = df.columns.map('|'.join).str.strip('|') ox_featurizer = MultipleFeaturizer([OxidationStates(), ElectronegativityDiff() ]) df = CompositionToOxidComposition().featurize_dataframe(df, "Input Data|composition") df = ox_featurizer.featurize_dataframe(df, "composition_oxid", multiindex=True, ignore_errors=True) df = df.rename(columns={'Input Data': ''}) df.columns = df.columns.map('|'.join).str.strip('|') _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} df['AtomicOrbitals|HOMO_character'] = df['AtomicOrbitals|HOMO_character'].map(_orbitals) df['AtomicOrbitals|LUMO_character'] = df['AtomicOrbitals|LUMO_character'].map(_orbitals) df['AtomicOrbitals|HOMO_element'] = df['AtomicOrbitals|HOMO_element'].apply( lambda x: -1 if not isinstance(x, str) else Element(x).Z ) df['AtomicOrbitals|LUMO_element'] = df['AtomicOrbitals|LUMO_element'].apply( lambda x: -1 if not isinstance(x, str) else Element(x).Z ) df = df.replace([np.inf, -np.inf, np.nan], 0) return clean_df(df)
def similarity(_parents, target): featurizer = MultipleFeaturizer([ SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"), StructuralHeterogeneity(), ChemicalOrdering(), MaximumPackingEfficiency(), SiteStatsFingerprint.from_preset( "LocalPropertyDifference_ward-prb-2017"), StructureComposition(Stoichiometry()), StructureComposition(ElementProperty.from_preset("magpie")), StructureComposition(ValenceOrbital(props=["frac"])), StructureComposition(IonProperty(fast=True)), ]) # HACK celery doesn't work with multiprocessing (used by matminer) try: from celery import current_task if current_task: featurizer.set_n_jobs(1) except ImportError: pass x_target = pd.DataFrame.from_records([featurizer.featurize(target)], columns=featurizer.feature_labels()) x_parent = pd.DataFrame.from_records( featurizer.featurize_many(_parents, ignore_errors=True, pbar=False), columns=featurizer.feature_labels(), ) nulls = x_parent[x_parent.isnull().any(axis=1)].index.values x_parent.fillna(100000, inplace=True) x_target = x_target.reindex(sorted(x_target.columns), axis=1) x_parent = x_parent.reindex(sorted(x_parent.columns), axis=1) with open(os.path.join(settings.rxn_files, "scaler2.pickle"), "rb") as f: scaler = pickle.load(f) with open(os.path.join(settings.rxn_files, "quantiles.pickle"), "rb") as f: quantiles = pickle.load(f) X = scaler.transform(x_parent.append(x_target)) D = [pairwise_distances(np.array([row, X[-1]]))[0, 1] for row in X[:-1]] _res = [] for d in D: _res.append(np.linspace(0, 1, 101)[np.abs(quantiles - d).argmin()]) _res = np.array(_res) _res[nulls] = -1 return _res
def test_stoich(self): featurizer = Stoichiometry(num_atoms=True) df_stoich = Stoichiometry(num_atoms=True).featurize_dataframe(self.df, col_id="composition") self.assertAlmostEqual(df_stoich["num atoms"][0], 5) self.assertAlmostEqual(df_stoich["0-norm"][0], 2) self.assertAlmostEqual(df_stoich["7-norm"][0], 0.604895199) # Test whether the number of formula units affects result original_value = featurizer.featurize(Composition("FeO")) self.assertArrayAlmostEqual(featurizer.featurize(Composition("Fe0.5O0.5")), original_value) self.assertArrayAlmostEqual(featurizer.featurize(Composition("Fe2O2")), original_value)
class DeBreuck2020Featurizer(modnet.featurizers.MODFeaturizer): """ Featurizer presets used for the paper 'Machine learning materials properties for small datasets' by Pierre-Paul De Breuck, Geoffroy Hautier & Gian-Marco Rignanese, arXiv:2004.14766 (2020). Uses most of the featurizers implemented by matminer at the time of writing with their default hyperparameters and presets. """ from matminer.featurizers.composition import ( AtomicOrbitals, AtomicPackingEfficiency, BandCenter, # CohesiveEnergy, - This descriptor was not used in the paper preset # ElectronAffinity, - This descriptor was not used in the paper preset ElectronegativityDiff, ElementFraction, ElementProperty, IonProperty, Miedema, OxidationStates, Stoichiometry, TMetalFraction, ValenceOrbital, YangSolidSolution, ) from matminer.featurizers.structure import ( # BagofBonds, - This descriptor was not used in the paper preset BondFractions, ChemicalOrdering, CoulombMatrix, DensityFeatures, EwaldEnergy, GlobalSymmetryFeatures, MaximumPackingEfficiency, # PartialRadialDistributionFunction, RadialDistributionFunction, SineCoulombMatrix, StructuralHeterogeneity, XRDPowderPattern, ) from matminer.featurizers.site import ( AGNIFingerprints, AverageBondAngle, AverageBondLength, BondOrientationalParameter, ChemEnvSiteFingerprint, CoordinationNumber, CrystalNNFingerprint, GaussianSymmFunc, GeneralizedRadialDistributionFunction, LocalPropertyDifference, OPSiteFingerprint, VoronoiFingerprint, ) composition_featurizers = ( AtomicOrbitals(), AtomicPackingEfficiency(), BandCenter(), ElementFraction(), ElementProperty.from_preset("magpie"), IonProperty(), Miedema(), Stoichiometry(), TMetalFraction(), ValenceOrbital(), YangSolidSolution(), ) oxide_composition_featurizers = ( ElectronegativityDiff(), OxidationStates(), ) structure_featurizers = ( DensityFeatures(), GlobalSymmetryFeatures(), RadialDistributionFunction(), CoulombMatrix(), # PartialRadialDistributionFunction(), SineCoulombMatrix(), EwaldEnergy(), BondFractions(), StructuralHeterogeneity(), MaximumPackingEfficiency(), ChemicalOrdering(), XRDPowderPattern(), # BagofBonds(), ) site_featurizers = ( AGNIFingerprints(), AverageBondAngle(VoronoiNN()), AverageBondLength(VoronoiNN()), BondOrientationalParameter(), ChemEnvSiteFingerprint.from_preset("simple"), CoordinationNumber(), CrystalNNFingerprint.from_preset("ops"), GaussianSymmFunc(), GeneralizedRadialDistributionFunction.from_preset("gaussian"), LocalPropertyDifference(), OPSiteFingerprint(), VoronoiFingerprint(), ) def featurize_composition(self, df): """ Applies the preset composition featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ df = super().featurize_composition(df) _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} df['AtomicOrbitals|HOMO_character'] = df[ 'AtomicOrbitals|HOMO_character'].map(_orbitals) df['AtomicOrbitals|LUMO_character'] = df[ 'AtomicOrbitals|LUMO_character'].map(_orbitals) df['AtomicOrbitals|HOMO_element'] = df[ 'AtomicOrbitals|HOMO_element'].apply( lambda x: -1 if not isinstance(x, str) else Element(x).Z) df['AtomicOrbitals|LUMO_element'] = df[ 'AtomicOrbitals|LUMO_element'].apply( lambda x: -1 if not isinstance(x, str) else Element(x).Z) df = df.replace([np.inf, -np.inf, np.nan], 0) return modnet.featurizers.clean_df(df) def featurize_structure(self, df): """ Applies the preset structural featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ df = super().featurize_structure(df) dist = df[ "RadialDistributionFunction|radial distribution function"].iloc[0][ 'distances'][:50] for i, d in enumerate(dist): _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format( d) df[_rdf_key] = df[ "RadialDistributionFunction|radial distribution function"].apply( lambda x: x['distribution'][i]) df = df.drop("RadialDistributionFunction|radial distribution function", axis=1) _crystal_system = { "cubic": 1, "tetragonal": 2, "orthorombic": 3, "hexagonal": 4, "trigonal": 5, "monoclinic": 6, "triclinic": 7 } def _int_map(x): if x == np.nan: return 0 elif x: return 1 else: return 0 df["GlobalSymmetryFeatures|crystal_system"] = df[ "GlobalSymmetryFeatures|crystal_system"].map(_crystal_system) df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[ "GlobalSymmetryFeatures|is_centrosymmetric"].map(_int_map) return modnet.featurizers.clean_df(df) def featurize_site(self, df): """ Applies the preset site featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ # rename some features for backwards compatibility with pretrained models aliases = { "GeneralizedRadialDistributionFunction": "GeneralizedRDF", "AGNIFingerprints": "AGNIFingerPrint", "BondOrientationalParameter": "BondOrientationParameter", "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc", } df = super().featurize_site(df, aliases=aliases) df = df.loc[:, (df != 0).any(axis=0)] return modnet.featurizers.clean_df(df)
class FUTURE_PROSPECTS_2021(featurizer.extendedMODFeaturizer): from matminer.featurizers.composition import ( AtomicOrbitals, AtomicPackingEfficiency, BandCenter, CohesiveEnergy, ElectronAffinity, ElectronegativityDiff, ElementFraction, ElementProperty, IonProperty, Miedema, OxidationStates, Stoichiometry, TMetalFraction, ValenceOrbital, YangSolidSolution, ) from matminer.featurizers.structure import ( BagofBonds, BondFractions, ChemicalOrdering, CoulombMatrix, DensityFeatures, EwaldEnergy, GlobalSymmetryFeatures, MaximumPackingEfficiency, PartialRadialDistributionFunction, RadialDistributionFunction, SineCoulombMatrix, StructuralHeterogeneity, XRDPowderPattern, ) from matminer.featurizers.site import ( AGNIFingerprints, AverageBondAngle, AverageBondLength, BondOrientationalParameter, ChemEnvSiteFingerprint, CoordinationNumber, CrystalNNFingerprint, GaussianSymmFunc, GeneralizedRadialDistributionFunction, LocalPropertyDifference, OPSiteFingerprint, VoronoiFingerprint, ) from matminer.featurizers.dos import ( DOSFeaturizer, SiteDOS, Hybridization, DosAsymmetry, ) from matminer.featurizers.bandstructure import ( BandFeaturizer, BranchPointEnergy ) composition_featurizers = ( AtomicOrbitals(), AtomicPackingEfficiency(), BandCenter(), ElementFraction(), ElementProperty.from_preset("magpie"), IonProperty(), Miedema(), Stoichiometry(), TMetalFraction(), ValenceOrbital(), YangSolidSolution(), ) oxid_composition_featurizers = ( ElectronegativityDiff(), OxidationStates(), ) structure_featurizers = ( DensityFeatures(), GlobalSymmetryFeatures(), RadialDistributionFunction(), CoulombMatrix(), #PartialRadialDistributionFunction(), #Introduces a large amount of features SineCoulombMatrix(), EwaldEnergy(), BondFractions(), StructuralHeterogeneity(), MaximumPackingEfficiency(), ChemicalOrdering(), XRDPowderPattern(), ) site_featurizers = ( AGNIFingerprints(), AverageBondAngle(VoronoiNN()), AverageBondLength(VoronoiNN()), BondOrientationalParameter(), ChemEnvSiteFingerprint.from_preset("simple"), CoordinationNumber(), CrystalNNFingerprint.from_preset("ops"), GaussianSymmFunc(), GeneralizedRadialDistributionFunction.from_preset("gaussian"), LocalPropertyDifference(), OPSiteFingerprint(), VoronoiFingerprint(), ) dos_featurizers = ( DOSFeaturizer(), SiteDOS(), Hybridization() ) band_featurizers = ( BandFeaturizer(), BranchPointEnergy() ) def __init__(self, n_jobs=None): self._n_jobs = n_jobs def featurize_composition(self, df): """Applies the preset composition featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ df = super().featurize_composition(df) _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map( _orbitals ) df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map( _orbitals ) df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply( lambda x: -1 if not isinstance(x, str) else Element(x).Z ) df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply( lambda x: -1 if not isinstance(x, str) else Element(x).Z ) return clean_df(df) def featurize_structure(self, df): """Applies the preset structural featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ df = super().featurize_structure(df) dist = df["RadialDistributionFunction|radial distribution function"].iloc[0][ "distances" ][:50] for i, d in enumerate(dist): _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format( d ) df[_rdf_key] = df[ "RadialDistributionFunction|radial distribution function" ].apply(lambda x: x["distribution"][i]) df = df.drop("RadialDistributionFunction|radial distribution function", axis=1) _crystal_system = { "cubic": 1, "tetragonal": 2, "orthorombic": 3, "hexagonal": 4, "trigonal": 5, "monoclinic": 6, "triclinic": 7, } def _int_map(x): if x == np.nan: return 0 elif x: return 1 else: return 0 df["GlobalSymmetryFeatures|crystal_system"] = df[ "GlobalSymmetryFeatures|crystal_system" ].map(_crystal_system) df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[ "GlobalSymmetryFeatures|is_centrosymmetric" ].map(_int_map) return clean_df(df) def featurize_dos(self, df): """Applies the presetdos featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ df = super().featurize_dos(df) hotencodeColumns = ["DOSFeaturizer|vbm_specie_1","DOSFeaturizer|cbm_specie_1"] one_hot = pd.get_dummies(df[hotencodeColumns]) df = df.drop(hotencodeColumns, axis = 1).join(one_hot) _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} df["DOSFeaturizer|vbm_character_1"] = df[ "DOSFeaturizer|vbm_character_1" ].map(_orbitals) df["DOSFeaturizer|cbm_character_1"] = df[ "DOSFeaturizer|cbm_character_1" ].map(_orbitals) # Splitting one feature into several floating features # e.g. number;number;number into three columns splitColumns = ["DOSFeaturizer|cbm_location_1", "DOSFeaturizer|vbm_location_1"] for column in splitColumns: try: newColumns = df[column].str.split(";", n = 2, expand = True) for i in range(0,3): df[column + "_" + str(i)] = np.array(newColumns[i]).astype(np.float) except: continue df = df.drop(splitColumns, axis=1) df = df.drop(["dos"], axis=1) return clean_df(df) def featurize_bandstructure(self, df): """Applies the preset band structure featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ df = super().featurize_bandstructure(df) def _int_map(x): if str(x) == "False": return 0 elif str(x) == "True": return 1 df["BandFeaturizer|is_gap_direct"] = df[ "BandFeaturizer|is_gap_direct" ].map(_int_map) df = df.drop(["bandstructure"], axis=1) return clean_df(df) def featurize_site(self, df): """Applies the preset site featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ aliases = { "GeneralizedRadialDistributionFunction": "GeneralizedRDF", "AGNIFingerprints": "AGNIFingerPrint", "BondOrientationalParameter": "BondOrientationParameter", "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc", } df = super().featurize_site(df, aliases=aliases) df = df.loc[:, (df != 0).any(axis=0)] return clean_df(df)
def predict_log10_eps( target: Union[Structure, Composition], dielectric_type: str, model_type: str, ) -> float: """ :param target: structure or composition to predict dielectric constants :param dielectric_type: "el" or "ion" :param model_type: "comp" or "comp_st" :return: Descriptor vector """ if dielectric_type not in ["el", "ion"]: raise ValueError( f'Specify dielectric type "el" or "ion"\nInput: {dielectric_type}') if model_type not in ["comp", "comp_st"]: raise ValueError( f'Specify regression_type "comp" or "comp_st"\nInput: {model_type}' ) if model_type == "comp": if isinstance(target, Structure): comp = target.composition else: comp = target comp_oxi = comp.add_charges_from_oxi_state_guesses() if dielectric_type == "el": ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"), comp) valence = ScalarFeaturizer(ValenceOrbital(), comp) ion_prop = ScalarFeaturizer(IonProperty(), comp) en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi) oxi_state = ScalarFeaturizer(OxidationStates.from_preset("deml"), comp_oxi) atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp) descriptor = [ ep.get_from_label("PymatgenData minimum X"), ep.get_from_label("PymatgenData range X"), ep.get_from_label("PymatgenData std_dev X"), ep.get_from_label("PymatgenData mean row"), ep.get_from_label("PymatgenData std_dev row"), ep.get_from_label("PymatgenData mean group"), ep.get_from_label("PymatgenData mean block"), ep.get_from_label("PymatgenData std_dev block"), ep.get_from_label("PymatgenData mean atomic_mass"), ep.get_from_label("PymatgenData std_dev atomic_mass"), ep.get_from_label("PymatgenData std_dev atomic_radius"), ep.get_from_label("PymatgenData minimum mendeleev_no"), ep.get_from_label("PymatgenData range mendeleev_no"), ep.get_from_label("PymatgenData std_dev mendeleev_no"), ep.get_from_label("PymatgenData mean thermal_conductivity"), ep.get_from_label("PymatgenData std_dev thermal_conductivity"), ep.get_from_label("PymatgenData mean melting_point"), ep.get_from_label("PymatgenData std_dev melting_point"), valence.get_from_label("avg s valence electrons"), valence.get_from_label("avg d valence electrons"), valence.get_from_label("frac s valence electrons"), valence.get_from_label("frac p valence electrons"), valence.get_from_label("frac d valence electrons"), ion_prop.get_from_label("avg ionic char"), TMetalFraction().featurize(comp)[0], en_diff.get_from_label("maximum EN difference"), en_diff.get_from_label("range EN difference"), en_diff.get_from_label("mean EN difference"), en_diff.get_from_label("std_dev EN difference"), BandCenter().featurize(comp)[0], oxi_state.get_from_label("std_dev oxidation state"), atomic_orbital.get_from_label("HOMO_energy"), atomic_orbital.get_from_label("LUMO_energy"), atomic_orbital.get_from_label("gap_AO"), ] elif dielectric_type == "ion": stoich = ScalarFeaturizer(Stoichiometry(), comp) ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"), comp) valence = ScalarFeaturizer(ValenceOrbital(), comp) ion_prop = ScalarFeaturizer(IonProperty(), comp) en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi) oxi_state = ScalarFeaturizer(OxidationStates.from_preset("deml"), comp_oxi) atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp) at_pack_eff = ScalarFeaturizer(AtomicPackingEfficiency(), comp) descriptor = [ stoich.get_from_label("3-norm"), stoich.get_from_label("5-norm"), ep.get_from_label("PymatgenData mean X"), ep.get_from_label("PymatgenData mean row"), ep.get_from_label("PymatgenData std_dev row"), ep.get_from_label("PymatgenData std_dev group"), ep.get_from_label("PymatgenData mean block"), ep.get_from_label("PymatgenData std_dev block"), ep.get_from_label("PymatgenData maximum atomic_mass"), ep.get_from_label("PymatgenData range atomic_mass"), ep.get_from_label("PymatgenData mean atomic_mass"), ep.get_from_label("PymatgenData std_dev atomic_mass"), ep.get_from_label("PymatgenData maximum atomic_radius"), ep.get_from_label("PymatgenData range atomic_radius"), ep.get_from_label("PymatgenData mean atomic_radius"), ep.get_from_label("PymatgenData std_dev atomic_radius"), ep.get_from_label("PymatgenData minimum mendeleev_no"), ep.get_from_label("PymatgenData mean mendeleev_no"), ep.get_from_label("PymatgenData std_dev mendeleev_no"), ep.get_from_label("PymatgenData mean thermal_conductivity"), ep.get_from_label("PymatgenData std_dev thermal_conductivity"), ep.get_from_label("PymatgenData mean melting_point"), ep.get_from_label("PymatgenData std_dev melting_point"), valence.get_from_label("avg s valence electrons"), valence.get_from_label("frac s valence electrons"), valence.get_from_label("frac p valence electrons"), valence.get_from_label("frac d valence electrons"), ion_prop.get_from_label("avg ionic char"), TMetalFraction().featurize(comp)[0], en_diff.get_from_label("minimum EN difference"), en_diff.get_from_label("range EN difference"), en_diff.get_from_label("mean EN difference"), en_diff.get_from_label("std_dev EN difference"), oxi_state.get_from_label("range oxidation state"), oxi_state.get_from_label("std_dev oxidation state"), atomic_orbital.get_from_label("LUMO_energy"), atomic_orbital.get_from_label("gap_AO"), at_pack_eff.get_from_label("mean simul. packing efficiency"), at_pack_eff.get_from_label( "mean abs simul. packing efficiency"), at_pack_eff.get_from_label( "dist from 1 clusters |APE| < 0.010"), at_pack_eff.get_from_label( "dist from 3 clusters |APE| < 0.010"), at_pack_eff.get_from_label( "dist from 5 clusters |APE| < 0.010"), ] elif model_type == "comp_st": if isinstance(target, Composition): raise ValueError( 'comp_st (Using compositional and structural descriptor) is specified, ' 'but target is composition') comp: Composition = target.composition comp_oxi = comp.add_charges_from_oxi_state_guesses() target_orig = deepcopy(target) target.add_oxidation_state_by_guess() if dielectric_type == "el": ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"), comp) valence = ScalarFeaturizer(ValenceOrbital(), comp) en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi) atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp) density = ScalarFeaturizer(DensityFeatures(), target) dist_btw_nn = MinimumRelativeDistances().featurize(target_orig) opsf = SiteFeaturizer(OPSiteFingerprint(), target) voro_fp = SiteFeaturizer(VoronoiFingerprint(use_symm_weights=True), target) gsf = SiteFeaturizer(GaussianSymmFunc(), target) lpd = SiteFeaturizer( LocalPropertyDifference.from_preset("ward-prb-2017"), target) descriptor = [ ep.get_from_label("PymatgenData std_dev X"), ep.get_from_label("PymatgenData mean block"), ep.get_from_label("PymatgenData std_dev atomic_mass"), valence.get_from_label("frac d valence electrons"), TMetalFraction().featurize(comp)[0], en_diff.get_from_label("maximum EN difference"), en_diff.get_from_label("mean EN difference"), atomic_orbital.get_from_label("HOMO_energy"), atomic_orbital.get_from_label("LUMO_energy"), density.get_from_label("density"), np.mean(dist_btw_nn), np.std(dist_btw_nn), opsf.get_from_label_func("tetrahedral CN_4", np.max), opsf.get_from_label_func("rectangular see-saw-like CN_4", np.max), np.max([ EwaldSiteEnergy(accuracy=4).featurize(target, i) for i in range(target.num_sites) ]), voro_fp.get_from_label_func("Voro_area_std_dev", np.max), voro_fp.get_from_label_func("Voro_area_std_dev", np.mean), voro_fp.get_from_label_func("Voro_dist_minimum", np.min), voro_fp.get_from_label_func("Voro_dist_minimum", np.std), gsf.get_from_label_func("G2_20.0", np.std), gsf.get_from_label_func("G2_80.0", np.max), gsf.get_from_label_func("G4_0.005_4.0_-1.0", np.mean), lpd.get_from_label_func("local difference in NdValence", np.mean), lpd.get_from_label_func("local difference in NValence", np.min), lpd.get_from_label_func("local difference in NValence", np.std), lpd.get_from_label_func("local difference in NdUnfilled", np.mean), lpd.get_from_label_func("local difference in NUnfilled", np.min), lpd.get_from_label_func("local difference in NUnfilled", np.mean), lpd.get_from_label_func("local difference in GSmagmom", np.mean) ] elif dielectric_type == "ion": ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"), comp) atomic_orbitals = ScalarFeaturizer(AtomicOrbitals(), comp) density = ScalarFeaturizer(DensityFeatures(), target) str_het = ScalarFeaturizer(StructuralHeterogeneity(), target) opsf = SiteFeaturizer(OPSiteFingerprint(), target) voro_fp = SiteFeaturizer(VoronoiFingerprint(use_symm_weights=True), target) gsf = SiteFeaturizer(GaussianSymmFunc(), target) lpd = SiteFeaturizer( LocalPropertyDifference.from_preset("ward-prb-2017"), target) descriptor = [ ep.get_from_label("PymatgenData std_dev row"), ep.get_from_label("PymatgenData mean thermal_conductivity"), ep.get_from_label("PymatgenData std_dev melting_point"), TMetalFraction().featurize(comp)[0], atomic_orbitals.get_from_label("gap_AO"), density.get_from_label("density"), density.get_from_label("packing fraction"), str_het.get_from_label("mean neighbor distance variation"), str_het.get_from_label("avg_dev neighbor distance variation"), opsf.get_from_label_func("sgl_bd CN_1", np.mean), opsf.get_from_label_func("bent 150 degrees CN_2", np.mean), opsf.get_from_label_func("linear CN_2", np.mean), opsf.get_from_label_func("trigonal planar CN_3", np.mean), opsf.get_from_label_func("pentagonal planar CN_5", np.std), opsf.get_from_label_func("octahedral CN_6", np.max), opsf.get_from_label_func("octahedral CN_6", np.std), opsf.get_from_label_func("q6 CN_12", np.mean), np.max([ EwaldSiteEnergy(accuracy=4).featurize(target, i) for i in range(target.num_sites) ]), voro_fp.get_from_label_func("Symmetry_weighted_index_4", np.std), voro_fp.get_from_label_func("Voro_vol_maximum", np.mean), voro_fp.get_from_label_func("Voro_area_std_dev", np.mean), voro_fp.get_from_label_func("Voro_area_minimum", np.std), voro_fp.get_from_label_func("Voro_area_maximum", np.min), voro_fp.get_from_label_func("Voro_dist_std_dev", np.mean), gsf.get_from_label_func("G2_80.0", np.min), gsf.get_from_label_func("G4_0.005_4.0_1.0", np.std), lpd.get_from_label_func("local difference in Number", np.max), lpd.get_from_label_func("local difference in MendeleevNumber", np.max), lpd.get_from_label_func("local difference in MendeleevNumber", np.min), lpd.get_from_label_func("local difference in AtomicWeight", np.max), lpd.get_from_label_func("local difference in AtomicWeight", np.mean), lpd.get_from_label_func("local difference in MeltingT", np.mean), lpd.get_from_label_func("local difference in Row", np.max), lpd.get_from_label_func( "local difference in Electronegativity", np.min), lpd.get_from_label_func("local difference in NValence", np.std), lpd.get_from_label_func("local difference in NsUnfilled", np.mean), lpd.get_from_label_func("local difference in NdUnfilled", np.max), lpd.get_from_label_func("local difference in NdUnfilled", np.std), lpd.get_from_label_func("local difference in NUnfilled", np.max), lpd.get_from_label_func("local difference in NUnfilled", np.min), lpd.get_from_label_func("local difference in NUnfilled", np.mean), lpd.get_from_label_func("local difference in NUnfilled", np.std), lpd.get_from_label_func("local difference in GSvolume_pa", np.max), lpd.get_from_label_func("local difference in GSvolume_pa", np.min), lpd.get_from_label_func("local difference in SpaceGroupNumber", np.max), ] with open( f"{os.path.dirname(__file__)}/{dielectric_type}_{model_type}.joblib", "rb") as fr: model: RandomForestRegressor = joblib.load(fr) with open( f"{os.path.dirname(__file__)}/{dielectric_type}_{model_type}_scaler.joblib", "rb") as fr: scaler: StandardScaler = joblib.load(fr) descriptor = scaler.transform([descriptor]) return model.predict(descriptor)[0]
def test_stoich(self): df_stoich = Stoichiometry(num_atoms=True).featurize_dataframe( self.df, col_id="composition") self.assertAlmostEqual(df_stoich["num atoms"][0], 5) self.assertAlmostEqual(df_stoich["0-norm"][0], 2) self.assertAlmostEqual(df_stoich["7-norm"][0], 0.604895199)
'MeltingT', 'NsValence', 'NpValence', 'NdValence', 'NfValence', 'NValence', 'NsUnfilled', 'NpUnfilled', 'NdUnfilled', 'NfUnfilled', 'NUnfilled', 'GSvolume_pa', 'SpaceGroupNumber', 'GSbandgap', 'GSmagmom') #The following features will be created by using matminer package. featurizer = MultipleFeaturizer([ SiteStatsFingerprint(CoordinationNumber().from_preset('VoronoiNN'), stats=('mean', 'std_dev', 'minimum', 'maximum')), StructuralHeterogeneity(), ChemicalOrdering(), MaximumPackingEfficiency(), SiteStatsFingerprint( LocalPropertyDifference(properties=element_properties), stats=('mean', 'std_dev', 'minimum', 'maximum', 'range')), StructureComposition(Stoichiometry()), StructureComposition(ElementProperty.from_preset("magpie")), StructureComposition(ValenceOrbital(props=['frac'])), StructureComposition(IonProperty(fast=True)) ]) #Generate VT based features from the material's crystal lat_params. feature_data = featurizer.featurize_dataframe(df, col_id=['structure'], ignore_errors=True) #"lat_params","compound possible" and "material_id" are not resonable physical features, so we drop these three columns feature_data = feature_data.drop( ["structure", "compound possible", "material_id"], axis=1) #write the data into a csv file for later use feature_data.to_csv("data_delta_e_data.csv", index=False) from sklearn.model_selection import KFold, cross_val_score