def get_tet_bcc_motif(structure, idx):
    """
    Classify a single site as bcc-like or tetrahedral from its
    order-parameter site fingerprint.

    Convenience function from Nils Zimmermann, used to distinguish
    coordination environments in half-Heuslers.

    Args:
        structure (pymatgen Structure): the target structure to evaluate
        idx (index): the site index in the structure

    Returns:
        (str) that describes the site coordination environment:
            'bcc' if the 'bcc CN_8' order parameter is above 0.5,
            'tet' if (otherwise) the 'tet CN_4' order parameter is above 0.5,
            'unrecognized' in every other case
    """
    fingerprinter = OPSiteFingerprint()
    values = fingerprinter.featurize(structure, idx)
    names = fingerprinter.feature_labels()

    # Both label positions are resolved before any comparison, so a
    # fingerprint lacking either label fails before a branch is taken.
    tet_pos = names.index('tet CN_4')
    bcc_pos = names.index('bcc CN_8')

    # bcc is tested first and therefore wins when both exceed the threshold.
    if values[bcc_pos] > 0.5:
        return 'bcc'
    if values[tet_pos] > 0.5:
        return 'tet'
    return 'unrecognized'
def get_fps(structure, cutoff=10.0, processes=8):
    """
    Compute the descriptors of every site in a structure using a process pool.

    The structure-level fingerprints (Voronoi and Gaussian symmetry
    functions) are computed once up front; the per-site work is delegated to
    ``get_all_site_descrs``, which receives one argument list per site.

    Args:
        structure: the structure whose sites are featurized; it is iterated
            with ``enumerate`` to obtain ``(index, site)`` pairs.
        cutoff (float): cutoff forwarded to ``VoronoiFingerprintModified``.
        processes (int): number of worker processes in the pool.

    Returns:
        A numpy array built from the per-site results of
        ``get_all_site_descrs``, or an empty list if an ``AttributeError`` or
        ``IndexError`` was raised while preparing or computing them.
    """
    all_descrs = []

    try:
        coordination_number_ = CoordinationNumber.from_preset('VoronoiNN')
        voronoi_fps_ = VoronoiFingerprintModified(
            cutoff=cutoff).featurize_structure(structure)
        crystal_nn_fingerprint_ = CrystalNNFingerprint.from_preset('cn')
        op_site_fingerprint_ = OPSiteFingerprint()
        agni_fingerprints_ = AGNIFingerprints()
        gaussian_symm_func_fps_ = GaussianSymmFuncModified(
        ).featurize_structure(structure)
        pymatgen_data_ = PymatgenData()
        magpie_data_ = MagpieData()

        # One argument bundle per site; the element order is the contract
        # with get_all_site_descrs and must not be changed here.
        data_list = [[
            structure, i, site, coordination_number_, voronoi_fps_,
            crystal_nn_fingerprint_, op_site_fingerprint_,
            agni_fingerprints_, gaussian_symm_func_fps_, pymatgen_data_,
            magpie_data_
        ] for i, site in enumerate(structure)]

        pool = multiprocessing.Pool(processes=processes)
        try:
            all_descrs = np.array(pool.map(get_all_site_descrs, data_list))
        finally:
            # Always release the worker processes, even if map() raised;
            # otherwise every call leaves `processes` workers behind.
            pool.close()
            pool.join()

    except (AttributeError, IndexError):
        # Deliberately best-effort: featurization failures of these kinds
        # yield the empty default instead of propagating.
        pass

    return all_descrs
def __init__(self, materials, site_descriptors, query=None, **kwargs):
    """
    Set up the calculation of site descriptors for materials.

    Args:
        materials (Store): Store of materials documents
        site_descriptors (Store): Store of site-descriptors data such
            as tetrahedral order parameter or percentage of 8-fold
            coordination
        query (dict): dictionary to limit materials to be analyzed
        **kwargs: forwarded unchanged to the parent constructor
    """
    self.materials = materials
    self.site_descriptors = site_descriptors
    self.query = query or {}

    # Register two coordination-number featurizers (unweighted and weighted)
    # for each direct NearNeighbors subclass.
    self.sds = {}
    for nn_cls in NearNeighbors.__subclasses__():
        # The class is looked up again by name in pymatgen.analysis.local_env.
        finder_cls = getattr(pymatgen.analysis.local_env, nn_cls.__name__)

        # Use the abbreviation when one is registered, else the class name.
        if nn_cls.__name__ in cls_to_abbrev:
            tag = cls_to_abbrev[nn_cls.__name__]
        else:
            tag = nn_cls.__name__

        self.sds['cn_{}'.format(tag)] = CoordinationNumber(
            finder_cls(), use_weights=False)
        self.sds['cn_wt_{}'.format(tag)] = CoordinationNumber(
            finder_cls(), use_weights=True)

    self.sds['opsf'] = OPSiteFingerprint()
    #self.sds['csf'] = CrystalSiteFingerprint.from_preset('ops')

    super().__init__(sources=[materials],
                     targets=[site_descriptors],
                     **kwargs)
def __init__(self, op_site_fp=None,
             stats=('mean', 'std_dev', 'minimum', 'maximum'),
             min_oxi=None, max_oxi=None):
    """
    Store the site fingerprint and the statistics to compute over it.

    Args:
        op_site_fp: site featurizer exposing ``feature_labels()``; a default
            ``OPSiteFingerprint()`` is created when None.
        stats (str or sequence of str): statistic name(s). A single string
            is wrapped into a one-element tuple. Names containing '_mode'
            are expected to start with a digit giving the mode number
            (only that first character is parsed).
        min_oxi: stored unchanged as ``self.min_oxi``.
        max_oxi: stored unchanged as ``self.max_oxi``.
    """
    self.op_site_fp = OPSiteFingerprint() if op_site_fp is None \
        else op_site_fp
    self._labels = self.op_site_fp.feature_labels()

    # isinstance (not an exact type comparison) so that str subclasses are
    # also treated as a single statistic name rather than iterated by char.
    self.stats = (stats,) if isinstance(stats, str) else stats

    # Highest mode number requested through '<digit>_mode' statistics.
    # Always defined (0 when no mode statistic is requested) so that the
    # attribute exists regardless of the chosen statistics.
    nmodes = 0
    if self.stats:
        for stat in self.stats:
            if '_mode' in stat:
                nmodes = max(nmodes, int(stat[0]))
    self.nmodes = nmodes

    self.min_oxi = min_oxi
    self.max_oxi = max_oxi
def get_op_stats_vector_diff(s1, s2, max_dr=0.2, ddr=0.01, ddist=0.01):
    """
    Determine the difference vector between two order parameter-statistics
    feature vectors resulting from two input structures.

    The neighbor-finding parameter dr is scanned in steps of ddr. For every
    value the two structures are featurized and the norm of the difference
    of their feature vectors is recorded. The distances are histogrammed
    with bin width ddist, and the smallest dr whose distance lies within
    ddist of the most populated bin's center is selected. If no distance
    qualifies, the last scanned dr is used.

    Args:
        s1 (Structure): first input structure.
        s2 (Structure): second input structure.
        max_dr (float): maximum neighbor-finding parameter to be tested.
        ddr (float): step size for increasing neighbor-finding parameter.
        ddist (float): bin size for histogramming distances of varying dr.

    Returns:
        (float, [float]) optimal neighbor-finding parameter and
            difference vector between order parameter-statistics feature
            vectors obtained from the two input structures (s1 - s2).
    """
    # Compute OP stats vector distances for varying neigh-find paras.
    dr = []
    dist = []
    delta = []
    nbins = int(max_dr/ddr) + 1
    for i in range(nbins):
        dr.append(float(i+1)*ddr)
        opsf = OPStructureFingerprint(op_site_fp=OPSiteFingerprint(dr=dr[i]))
        delta.append(np.array(
            opsf.featurize(s1)) - np.array(opsf.featurize(s2)))
        dist.append(np.linalg.norm(delta[i]))

    # Compute distance histogram, determine peak, and location
    # of smallest dr with peak value.
    nbins = int(max(dist) / ddist) + 1
    # nbins bins need nbins + 1 edges. With only nbins edges the largest
    # distances fall outside the histogram, and identical structures (all
    # distances zero) would yield a histogram without any bin.
    # The `normed` keyword is not passed: it was removed from np.histogram.
    hist, bin_edges = np.histogram(
        dist, bins=[float(i)*ddist for i in range(nbins + 1)])
    idx = int(np.argmax(hist))  # first bin holding the maximum count
    dist_peak = 0.5 * (bin_edges[idx] + bin_edges[idx+1])

    # Fall back to the last scanned dr (index -1) if nothing matches.
    idx = -1
    for i, d in enumerate(dist):
        if fabs(d - dist_peak) <= ddist:
            idx = i
            break

    return dr[idx], delta[idx]
def __init__(self, materials, site_descriptors, mat_query=None, **kwargs):
    """
    Set up the calculation of site-based descriptors (e.g., coordination
    numbers with different near-neighbor finding approaches) for materials
    together with a statistics analysis on selected descriptor types
    (order parameter-based site fingerprints). The latter is useful as a
    definition of a structure fingerprint on the basis of local
    coordination information.

    Args:
        materials (Store): Store of materials documents.
        site_descriptors (Store): Store of site-descriptors data such
            as tetrahedral order parameter or fraction of being 8-fold
            coordinated.
        mat_query (dict): dictionary to limit materials to be analyzed.
        **kwargs: forwarded unchanged to the parent constructor.
    """
    self.materials = materials
    self.site_descriptors = site_descriptors
    self.mat_query = mat_query or {}

    # Register two coordination-number featurizers (weights 'none' and
    # 'sum') for each direct NearNeighbors subclass.
    self.sds = {}
    for nn_cls in NearNeighbors.__subclasses__():
        # The class is looked up again by name in pymatgen.analysis.local_env.
        finder_cls = getattr(pymatgen.analysis.local_env, nn_cls.__name__)
        tag = nn_cls.__name__
        self.sds['cn_{}'.format(tag)] = CoordinationNumber(
            finder_cls(), use_weights='none')
        self.sds['cn_wt_{}'.format(tag)] = CoordinationNumber(
            finder_cls(), use_weights='sum')

    # Snapshot the coordination-number keys before the fingerprint entries
    # are added; those are listed separately under 'statistics'.
    self.all_output_pieces = {'site_descriptors': list(self.sds)}

    self.sds['opsf'] = OPSiteFingerprint()
    self.sds['csf'] = CrystalSiteFingerprint.from_preset('ops')
    self.all_output_pieces['statistics'] = ['opsf', 'csf']

    super().__init__(sources=[materials],
                     targets=[site_descriptors],
                     **kwargs)
def test_op_site_fingerprint(self):
    # Checks the label names and order of OPSiteFingerprint, selected
    # order-parameter values for the self.sc and self.cscl fixtures, and
    # that building a CrystalNNFingerprint does not leak 'wt' labels into
    # a previously created OPSiteFingerprint.
    fingerprint = OPSiteFingerprint()
    labels = fingerprint.feature_labels()
    expected = [
        'sgl_bd CN_1',
        'L-shaped CN_2',
        'water-like CN_2',
        'bent 120 degrees CN_2',
        'bent 150 degrees CN_2',
        'linear CN_2',
        'trigonal planar CN_3',
        'trigonal non-coplanar CN_3',
        'T-shaped CN_3',
        'square co-planar CN_4',
        'tetrahedral CN_4',
        'rectangular see-saw-like CN_4',
        'see-saw-like CN_4',
        'trigonal pyramidal CN_4',
        'pentagonal planar CN_5',
        'square pyramidal CN_5',
        'trigonal bipyramidal CN_5',
        'hexagonal planar CN_6',
        'octahedral CN_6',
        'pentagonal pyramidal CN_6',
        'hexagonal pyramidal CN_7',
        'pentagonal bipyramidal CN_7',
        'body-centered cubic CN_8',
        'hexagonal bipyramidal CN_8',
        'q2 CN_9',
        'q4 CN_9',
        'q6 CN_9',
        'q2 CN_10',
        'q4 CN_10',
        'q6 CN_10',
        'q2 CN_11',
        'q4 CN_11',
        'q6 CN_11',
        'cuboctahedral CN_12',
        'q2 CN_12',
        'q4 CN_12',
        'q6 CN_12',
    ]
    # Every produced label must match the expected one at its position.
    for pos, label in enumerate(labels):
        self.assertEqual(label, expected[pos])

    # First site of the self.sc fixture.
    values = fingerprint.featurize(self.sc, 0)
    self.assertEqual(len(values), 37)
    octa_pos = fingerprint.feature_labels().index('octahedral CN_6')
    self.assertAlmostEqual(values[octa_pos], 0.9995, places=7)

    # First site of the self.cscl fixture.
    values = fingerprint.featurize(self.cscl, 0)
    bcc_pos = fingerprint.feature_labels().index('body-centered cubic CN_8')
    self.assertAlmostEqual(values[bcc_pos], 0.8955, places=7)

    # Same site with dist_exp=0.
    fingerprint = OPSiteFingerprint(dist_exp=0)
    values = fingerprint.featurize(self.cscl, 0)
    bcc_pos = fingerprint.feature_labels().index('body-centered cubic CN_8')
    self.assertAlmostEqual(values[bcc_pos], 0.9555, places=7)

    # The following test aims at ensuring the copying of the OP dictionaries work.
    # The CrystalNNFingerprint is created only for its side effects; its
    # value is not inspected.
    opsfp = OPSiteFingerprint()
    cnnfp = CrystalNNFingerprint.from_preset('ops')
    wt_count = sum(
        1 for name in opsfp.feature_labels() if name.split()[0] == 'wt')
    self.assertEqual(wt_count, 0)
class DeBreuck2020Featurizer(modnet.featurizers.MODFeaturizer):
    """ Featurizer presets used for the paper 'Machine learning materials
    properties for small datasets' by Pierre-Paul De Breuck, Geoffroy Hautier
    & Gian-Marco Rignanese, arXiv:2004.14766 (2020).

    Uses most of the featurizers implemented by matminer at the time of
    writing with their default hyperparameters and presets.

    The matminer imports are done in the class body, so the imported
    featurizer classes are only needed once this class is defined.

    """
    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        # CohesiveEnergy, - This descriptor was not used in the paper preset
        # ElectronAffinity, - This descriptor was not used in the paper preset
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        # BagofBonds, - This descriptor was not used in the paper preset
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        # PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )
    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    oxide_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        # PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        # BagofBonds(),
    )

    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    def featurize_composition(self, df):
        """ Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The HOMO/LUMO orbital characters are encoded as integers (s=1, p=2,
        d=3, f=4) and the HOMO/LUMO elements as atomic numbers (-1 when the
        value is not a string). Infinite and NaN entries are replaced by 0.

        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df['AtomicOrbitals|HOMO_character'] = df[
            'AtomicOrbitals|HOMO_character'].map(_orbitals)
        df['AtomicOrbitals|LUMO_character'] = df[
            'AtomicOrbitals|LUMO_character'].map(_orbitals)

        df['AtomicOrbitals|HOMO_element'] = df[
            'AtomicOrbitals|HOMO_element'].apply(
                lambda x: -1 if not isinstance(x, str) else Element(x).Z)
        df['AtomicOrbitals|LUMO_element'] = df[
            'AtomicOrbitals|LUMO_element'].apply(
                lambda x: -1 if not isinstance(x, str) else Element(x).Z)

        df = df.replace([np.inf, -np.inf, np.nan], 0)

        return modnet.featurizers.clean_df(df)

    def featurize_structure(self, df):
        """ Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The radial distribution function is expanded into one column per
        distance (first 50 distances of the first row), the crystal system
        is encoded as an integer and the centrosymmetry flag as 0/1.

        """
        df = super().featurize_structure(df)

        # The distance grid is taken from the first row and reused as the
        # column naming scheme for every row.
        dist = df[
            "RadialDistributionFunction|radial distribution function"].iloc[0][
                'distances'][:50]
        for i, d in enumerate(dist):
            _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(
                d)
            df[_rdf_key] = df[
                "RadialDistributionFunction|radial distribution function"].apply(
                    lambda x: x['distribution'][i])

        df = df.drop("RadialDistributionFunction|radial distribution function",
                     axis=1)

        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            # NOTE: the misspelled key is kept for backwards compatibility;
            # the correctly spelled key is the one that matches the
            # "orthorhombic" label, which previously mapped to NaN.
            "orthorombic": 3,
            "orthorhombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7
        }

        def _int_map(x):
            # NaN never compares equal to anything (including np.nan), so it
            # has to be detected explicitly; NaN is truthy and would
            # otherwise be encoded as 1.
            if isinstance(x, (float, np.floating)) and np.isnan(x):
                return 0
            elif x:
                return 1
            else:
                return 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"].map(_int_map)

        return modnet.featurizers.clean_df(df)

    def featurize_site(self, df):
        """ Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Columns containing only zeros are dropped.

        """

        # rename some features for backwards compatibility with pretrained models
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)
        df = df.loc[:, (df != 0).any(axis=0)]

        return modnet.featurizers.clean_df(df)
def test_op_site_fingerprint(self):
    # Verifies the OPSiteFingerprint label list, a few order-parameter
    # values on the self.sc / self.cscl fixtures, and that instantiating a
    # CrystalNNFingerprint leaves the OPSiteFingerprint labels untouched.

    def op_value(featurizer, values, label):
        # Value of the order parameter called `label` in `values`.
        return values[featurizer.feature_labels().index(label)]

    site_fp = OPSiteFingerprint()
    observed = site_fp.feature_labels()
    reference = [
        'sgl_bd CN_1', 'L-shaped CN_2', 'water-like CN_2',
        'bent 120 degrees CN_2', 'bent 150 degrees CN_2',
        'linear CN_2', 'trigonal planar CN_3',
        'trigonal non-coplanar CN_3', 'T-shaped CN_3',
        'square co-planar CN_4', 'tetrahedral CN_4',
        'rectangular see-saw-like CN_4', 'see-saw-like CN_4',
        'trigonal pyramidal CN_4', 'pentagonal planar CN_5',
        'square pyramidal CN_5', 'trigonal bipyramidal CN_5',
        'hexagonal planar CN_6', 'octahedral CN_6',
        'pentagonal pyramidal CN_6', 'hexagonal pyramidal CN_7',
        'pentagonal bipyramidal CN_7', 'body-centered cubic CN_8',
        'hexagonal bipyramidal CN_8', 'q2 CN_9', 'q4 CN_9', 'q6 CN_9',
        'q2 CN_10', 'q4 CN_10', 'q6 CN_10',
        'q2 CN_11', 'q4 CN_11', 'q6 CN_11',
        'cuboctahedral CN_12', 'q2 CN_12', 'q4 CN_12', 'q6 CN_12',
    ]
    # Position-by-position comparison over the labels actually produced.
    for position in range(len(observed)):
        self.assertEqual(observed[position], reference[position])

    result = site_fp.featurize(self.sc, 0)
    self.assertEqual(len(result), 37)
    self.assertAlmostEqual(
        op_value(site_fp, result, 'octahedral CN_6'), 0.9995, places=7)

    result = site_fp.featurize(self.cscl, 0)
    self.assertAlmostEqual(
        op_value(site_fp, result, 'body-centered cubic CN_8'),
        0.8955, places=7)

    # Repeat the CsCl check with dist_exp=0.
    site_fp = OPSiteFingerprint(dist_exp=0)
    result = site_fp.featurize(self.cscl, 0)
    self.assertAlmostEqual(
        op_value(site_fp, result, 'body-centered cubic CN_8'),
        0.9555, places=7)

    # The following test aims at ensuring the copying of the OP dictionaries work.
    # The CrystalNNFingerprint instance itself is not inspected; only the
    # effect of creating it matters here.
    opsfp = OPSiteFingerprint()
    cnnfp = CrystalNNFingerprint.from_preset('ops')
    weighted = [
        name for name in opsfp.feature_labels() if name.split()[0] == 'wt'
    ]
    self.assertEqual(len(weighted), 0)
class FUTURE_PROSPECTS_2021(featurizer.extendedMODFeaturizer):
    """Featurizer preset bundling matminer composition, structure, site,
    DOS and band structure featurizers.

    The matminer imports are done in the class body, so the imported
    featurizer classes are only needed once this class is defined.
    """

    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        CohesiveEnergy,
        ElectronAffinity,
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        BagofBonds,
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )
    from matminer.featurizers.dos import (
        DOSFeaturizer,
        SiteDOS,
        Hybridization,
        DosAsymmetry,
    )
    from matminer.featurizers.bandstructure import (
        BandFeaturizer,
        BranchPointEnergy
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    oxid_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        #PartialRadialDistributionFunction(), #Introduces a large amount of features
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
    )

    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    dos_featurizers = (
        DOSFeaturizer(),
        SiteDOS(),
        Hybridization()
    )

    band_featurizers = (
        BandFeaturizer(),
        BranchPointEnergy()
    )

    def __init__(self, n_jobs=None):
        """Store the requested number of jobs as ``self._n_jobs``."""
        self._n_jobs = n_jobs

    def featurize_composition(self, df):
        """Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The HOMO/LUMO orbital characters are encoded as integers (s=1, p=2,
        d=3, f=4) and the HOMO/LUMO elements as atomic numbers (-1 when the
        value is not a string).
        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map(
            _orbitals
        )
        df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map(
            _orbitals
        )

        df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )
        df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )

        return clean_df(df)

    def featurize_structure(self, df):
        """Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The radial distribution function is expanded into one column per
        distance (first 50 distances of the first row), the crystal system
        is encoded as an integer and the centrosymmetry flag as 0/1.
        """
        df = super().featurize_structure(df)

        # The distance grid is taken from the first row and reused as the
        # column naming scheme for every row.
        dist = df["RadialDistributionFunction|radial distribution function"].iloc[0][
            "distances"
        ][:50]
        for i, d in enumerate(dist):
            _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(
                d
            )
            df[_rdf_key] = df[
                "RadialDistributionFunction|radial distribution function"
            ].apply(lambda x: x["distribution"][i])

        df = df.drop("RadialDistributionFunction|radial distribution function", axis=1)

        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            # NOTE: the misspelled key is kept for backwards compatibility;
            # the correctly spelled key is the one that matches the
            # "orthorhombic" label, which previously mapped to NaN.
            "orthorombic": 3,
            "orthorhombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7,
        }

        def _int_map(x):
            # NaN never compares equal to anything (including np.nan), so it
            # has to be detected explicitly; NaN is truthy and would
            # otherwise be encoded as 1.
            if isinstance(x, (float, np.floating)) and np.isnan(x):
                return 0
            elif x:
                return 1
            else:
                return 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"
        ].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"
        ].map(_int_map)

        return clean_df(df)

    def featurize_dos(self, df):
        """Applies the preset dos featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The VBM/CBM species columns are one-hot encoded, the VBM/CBM orbital
        characters are encoded as integers, and the ';'-separated VBM/CBM
        location strings are split into three float columns each.
        """
        df = super().featurize_dos(df)

        hotencodeColumns = ["DOSFeaturizer|vbm_specie_1", "DOSFeaturizer|cbm_specie_1"]

        one_hot = pd.get_dummies(df[hotencodeColumns])
        df = df.drop(hotencodeColumns, axis=1).join(one_hot)
        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}

        df["DOSFeaturizer|vbm_character_1"] = df[
            "DOSFeaturizer|vbm_character_1"
        ].map(_orbitals)
        df["DOSFeaturizer|cbm_character_1"] = df[
            "DOSFeaturizer|cbm_character_1"
        ].map(_orbitals)

        # Splitting one feature into several floating features
        # e.g. number;number;number into three columns
        splitColumns = ["DOSFeaturizer|cbm_location_1", "DOSFeaturizer|vbm_location_1"]

        for column in splitColumns:
            try:
                newColumns = df[column].str.split(";", n=2, expand=True)
                for i in range(0, 3):
                    # Builtin float instead of the removed `np.float` alias;
                    # with the alias the resulting AttributeError was
                    # swallowed below and no split column was ever created.
                    df[column + "_" + str(i)] = np.array(newColumns[i]).astype(float)
            except (AttributeError, KeyError, ValueError, TypeError):
                # Best-effort: a column that is missing, not string-valued,
                # has fewer than three parts or is not numeric is skipped.
                continue
        df = df.drop(splitColumns, axis=1)
        df = df.drop(["dos"], axis=1)
        return clean_df(df)

    def featurize_bandstructure(self, df):
        """Applies the preset band structure featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The direct-gap flag is encoded as 0/1; values whose string form is
        neither "True" nor "False" map to None.
        """
        df = super().featurize_bandstructure(df)

        def _int_map(x):
            if str(x) == "False":
                return 0
            elif str(x) == "True":
                return 1

        df["BandFeaturizer|is_gap_direct"] = df[
            "BandFeaturizer|is_gap_direct"
        ].map(_int_map)

        df = df.drop(["bandstructure"], axis=1)

        return clean_df(df)

    def featurize_site(self, df):
        """Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Columns containing only zeros are dropped.
        """

        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)
        df = df.loc[:, (df != 0).any(axis=0)]

        return clean_df(df)
def predict_log10_eps(
        target: Union[Structure, Composition],
        dielectric_type: str,
        model_type: str,
) -> float:
    """
    Predict a dielectric constant with a stored regression model.

    A descriptor vector is assembled for the target, scaled with the stored
    scaler and passed to the stored model. Model and scaler are loaded from
    ``{dielectric_type}_{model_type}.joblib`` and
    ``{dielectric_type}_{model_type}_scaler.joblib`` next to this file.

    The target passed in is not modified: oxidation states are guessed on an
    internal copy of the structure.

    :param target: structure or composition to predict dielectric constants
    :param dielectric_type: "el" or "ion"
    :param model_type: "comp" or "comp_st"
    :return: the model prediction for the target (first element of the
        model output)
    :raises ValueError: if dielectric_type or model_type is not one of the
        supported values, or if "comp_st" is requested for a Composition
    """
    if dielectric_type not in ["el", "ion"]:
        raise ValueError(
            f'Specify dielectric type "el" or "ion"\nInput: {dielectric_type}')
    if model_type not in ["comp", "comp_st"]:
        raise ValueError(
            f'Specify regression_type "comp" or "comp_st"\nInput: {model_type}'
        )

    if model_type == "comp":
        if isinstance(target, Structure):
            comp = target.composition
        else:
            comp = target

        comp_oxi = comp.add_charges_from_oxi_state_guesses()

        if dielectric_type == "el":
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"), comp)
            valence = ScalarFeaturizer(ValenceOrbital(), comp)
            ion_prop = ScalarFeaturizer(IonProperty(), comp)
            en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi)
            oxi_state = ScalarFeaturizer(OxidationStates.from_preset("deml"),
                                         comp_oxi)
            atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp)

            # The order of the entries must match the order the stored
            # scaler and model were fitted with.
            descriptor = [
                ep.get_from_label("PymatgenData minimum X"),
                ep.get_from_label("PymatgenData range X"),
                ep.get_from_label("PymatgenData std_dev X"),
                ep.get_from_label("PymatgenData mean row"),
                ep.get_from_label("PymatgenData std_dev row"),
                ep.get_from_label("PymatgenData mean group"),
                ep.get_from_label("PymatgenData mean block"),
                ep.get_from_label("PymatgenData std_dev block"),
                ep.get_from_label("PymatgenData mean atomic_mass"),
                ep.get_from_label("PymatgenData std_dev atomic_mass"),
                ep.get_from_label("PymatgenData std_dev atomic_radius"),
                ep.get_from_label("PymatgenData minimum mendeleev_no"),
                ep.get_from_label("PymatgenData range mendeleev_no"),
                ep.get_from_label("PymatgenData std_dev mendeleev_no"),
                ep.get_from_label("PymatgenData mean thermal_conductivity"),
                ep.get_from_label("PymatgenData std_dev thermal_conductivity"),
                ep.get_from_label("PymatgenData mean melting_point"),
                ep.get_from_label("PymatgenData std_dev melting_point"),
                valence.get_from_label("avg s valence electrons"),
                valence.get_from_label("avg d valence electrons"),
                valence.get_from_label("frac s valence electrons"),
                valence.get_from_label("frac p valence electrons"),
                valence.get_from_label("frac d valence electrons"),
                ion_prop.get_from_label("avg ionic char"),
                TMetalFraction().featurize(comp)[0],
                en_diff.get_from_label("maximum EN difference"),
                en_diff.get_from_label("range EN difference"),
                en_diff.get_from_label("mean EN difference"),
                en_diff.get_from_label("std_dev EN difference"),
                BandCenter().featurize(comp)[0],
                oxi_state.get_from_label("std_dev oxidation state"),
                atomic_orbital.get_from_label("HOMO_energy"),
                atomic_orbital.get_from_label("LUMO_energy"),
                atomic_orbital.get_from_label("gap_AO"),
            ]
        elif dielectric_type == "ion":
            stoich = ScalarFeaturizer(Stoichiometry(), comp)
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"), comp)
            valence = ScalarFeaturizer(ValenceOrbital(), comp)
            ion_prop = ScalarFeaturizer(IonProperty(), comp)
            en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi)
            oxi_state = ScalarFeaturizer(OxidationStates.from_preset("deml"),
                                         comp_oxi)
            atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp)
            at_pack_eff = ScalarFeaturizer(AtomicPackingEfficiency(), comp)

            # The order of the entries must match the order the stored
            # scaler and model were fitted with.
            descriptor = [
                stoich.get_from_label("3-norm"),
                stoich.get_from_label("5-norm"),
                ep.get_from_label("PymatgenData mean X"),
                ep.get_from_label("PymatgenData mean row"),
                ep.get_from_label("PymatgenData std_dev row"),
                ep.get_from_label("PymatgenData std_dev group"),
                ep.get_from_label("PymatgenData mean block"),
                ep.get_from_label("PymatgenData std_dev block"),
                ep.get_from_label("PymatgenData maximum atomic_mass"),
                ep.get_from_label("PymatgenData range atomic_mass"),
                ep.get_from_label("PymatgenData mean atomic_mass"),
                ep.get_from_label("PymatgenData std_dev atomic_mass"),
                ep.get_from_label("PymatgenData maximum atomic_radius"),
                ep.get_from_label("PymatgenData range atomic_radius"),
                ep.get_from_label("PymatgenData mean atomic_radius"),
                ep.get_from_label("PymatgenData std_dev atomic_radius"),
                ep.get_from_label("PymatgenData minimum mendeleev_no"),
                ep.get_from_label("PymatgenData mean mendeleev_no"),
                ep.get_from_label("PymatgenData std_dev mendeleev_no"),
                ep.get_from_label("PymatgenData mean thermal_conductivity"),
                ep.get_from_label("PymatgenData std_dev thermal_conductivity"),
                ep.get_from_label("PymatgenData mean melting_point"),
                ep.get_from_label("PymatgenData std_dev melting_point"),
                valence.get_from_label("avg s valence electrons"),
                valence.get_from_label("frac s valence electrons"),
                valence.get_from_label("frac p valence electrons"),
                valence.get_from_label("frac d valence electrons"),
                ion_prop.get_from_label("avg ionic char"),
                TMetalFraction().featurize(comp)[0],
                en_diff.get_from_label("minimum EN difference"),
                en_diff.get_from_label("range EN difference"),
                en_diff.get_from_label("mean EN difference"),
                en_diff.get_from_label("std_dev EN difference"),
                oxi_state.get_from_label("range oxidation state"),
                oxi_state.get_from_label("std_dev oxidation state"),
                atomic_orbital.get_from_label("LUMO_energy"),
                atomic_orbital.get_from_label("gap_AO"),
                at_pack_eff.get_from_label("mean simul. packing efficiency"),
                at_pack_eff.get_from_label(
                    "mean abs simul. packing efficiency"),
                at_pack_eff.get_from_label(
                    "dist from 1 clusters |APE| < 0.010"),
                at_pack_eff.get_from_label(
                    "dist from 3 clusters |APE| < 0.010"),
                at_pack_eff.get_from_label(
                    "dist from 5 clusters |APE| < 0.010"),
            ]

    elif model_type == "comp_st":
        if isinstance(target, Composition):
            raise ValueError(
                'comp_st (Using compositional and structural descriptor) is specified, '
                'but target is composition')
        comp: Composition = target.composition
        comp_oxi = comp.add_charges_from_oxi_state_guesses()

        # Guess oxidation states on a copy so that the caller's structure is
        # left untouched. Decorating it in place would leak into the caller
        # and change `target.composition` for any later call on the same
        # structure.
        target_oxi = deepcopy(target)
        target_oxi.add_oxidation_state_by_guess()

        # One featurizer instance is reused for all sites instead of building
        # a new one per site.
        ewald = EwaldSiteEnergy(accuracy=4)

        if dielectric_type == "el":
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"), comp)
            valence = ScalarFeaturizer(ValenceOrbital(), comp)
            en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi)
            atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp)
            density = ScalarFeaturizer(DensityFeatures(), target_oxi)
            # Computed on the structure without oxidation-state decoration.
            dist_btw_nn = MinimumRelativeDistances().featurize(target)
            opsf = SiteFeaturizer(OPSiteFingerprint(), target_oxi)
            voro_fp = SiteFeaturizer(VoronoiFingerprint(use_symm_weights=True),
                                     target_oxi)
            gsf = SiteFeaturizer(GaussianSymmFunc(), target_oxi)
            lpd = SiteFeaturizer(
                LocalPropertyDifference.from_preset("ward-prb-2017"),
                target_oxi)

            # The order of the entries must match the order the stored
            # scaler and model were fitted with.
            descriptor = [
                ep.get_from_label("PymatgenData std_dev X"),
                ep.get_from_label("PymatgenData mean block"),
                ep.get_from_label("PymatgenData std_dev atomic_mass"),
                valence.get_from_label("frac d valence electrons"),
                TMetalFraction().featurize(comp)[0],
                en_diff.get_from_label("maximum EN difference"),
                en_diff.get_from_label("mean EN difference"),
                atomic_orbital.get_from_label("HOMO_energy"),
                atomic_orbital.get_from_label("LUMO_energy"),
                density.get_from_label("density"),
                np.mean(dist_btw_nn),
                np.std(dist_btw_nn),
                opsf.get_from_label_func("tetrahedral CN_4", np.max),
                opsf.get_from_label_func("rectangular see-saw-like CN_4", np.max),
                np.max([
                    ewald.featurize(target_oxi, i)
                    for i in range(target_oxi.num_sites)
                ]),
                voro_fp.get_from_label_func("Voro_area_std_dev", np.max),
                voro_fp.get_from_label_func("Voro_area_std_dev", np.mean),
                voro_fp.get_from_label_func("Voro_dist_minimum", np.min),
                voro_fp.get_from_label_func("Voro_dist_minimum", np.std),
                gsf.get_from_label_func("G2_20.0", np.std),
                gsf.get_from_label_func("G2_80.0", np.max),
                gsf.get_from_label_func("G4_0.005_4.0_-1.0", np.mean),
                lpd.get_from_label_func("local difference in NdValence", np.mean),
                lpd.get_from_label_func("local difference in NValence", np.min),
                lpd.get_from_label_func("local difference in NValence", np.std),
                lpd.get_from_label_func("local difference in NdUnfilled", np.mean),
                lpd.get_from_label_func("local difference in NUnfilled", np.min),
                lpd.get_from_label_func("local difference in NUnfilled", np.mean),
                lpd.get_from_label_func("local difference in GSmagmom", np.mean)
            ]
        elif dielectric_type == "ion":
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"), comp)
            atomic_orbitals = ScalarFeaturizer(AtomicOrbitals(), comp)
            density = ScalarFeaturizer(DensityFeatures(), target_oxi)
            str_het = ScalarFeaturizer(StructuralHeterogeneity(), target_oxi)
            opsf = SiteFeaturizer(OPSiteFingerprint(), target_oxi)
            voro_fp = SiteFeaturizer(VoronoiFingerprint(use_symm_weights=True),
                                     target_oxi)
            gsf = SiteFeaturizer(GaussianSymmFunc(), target_oxi)
            lpd = SiteFeaturizer(
                LocalPropertyDifference.from_preset("ward-prb-2017"),
                target_oxi)

            # The order of the entries must match the order the stored
            # scaler and model were fitted with.
            descriptor = [
                ep.get_from_label("PymatgenData std_dev row"),
                ep.get_from_label("PymatgenData mean thermal_conductivity"),
                ep.get_from_label("PymatgenData std_dev melting_point"),
                TMetalFraction().featurize(comp)[0],
                atomic_orbitals.get_from_label("gap_AO"),
                density.get_from_label("density"),
                density.get_from_label("packing fraction"),
                str_het.get_from_label("mean neighbor distance variation"),
                str_het.get_from_label("avg_dev neighbor distance variation"),
                opsf.get_from_label_func("sgl_bd CN_1", np.mean),
                opsf.get_from_label_func("bent 150 degrees CN_2", np.mean),
                opsf.get_from_label_func("linear CN_2", np.mean),
                opsf.get_from_label_func("trigonal planar CN_3", np.mean),
                opsf.get_from_label_func("pentagonal planar CN_5", np.std),
                opsf.get_from_label_func("octahedral CN_6", np.max),
                opsf.get_from_label_func("octahedral CN_6", np.std),
                opsf.get_from_label_func("q6 CN_12", np.mean),
                np.max([
                    ewald.featurize(target_oxi, i)
                    for i in range(target_oxi.num_sites)
                ]),
                voro_fp.get_from_label_func("Symmetry_weighted_index_4", np.std),
                voro_fp.get_from_label_func("Voro_vol_maximum", np.mean),
                voro_fp.get_from_label_func("Voro_area_std_dev", np.mean),
                voro_fp.get_from_label_func("Voro_area_minimum", np.std),
                voro_fp.get_from_label_func("Voro_area_maximum", np.min),
                voro_fp.get_from_label_func("Voro_dist_std_dev", np.mean),
                gsf.get_from_label_func("G2_80.0", np.min),
                gsf.get_from_label_func("G4_0.005_4.0_1.0", np.std),
                lpd.get_from_label_func("local difference in Number", np.max),
                lpd.get_from_label_func("local difference in MendeleevNumber", np.max),
                lpd.get_from_label_func("local difference in MendeleevNumber", np.min),
                lpd.get_from_label_func("local difference in AtomicWeight", np.max),
                lpd.get_from_label_func("local difference in AtomicWeight", np.mean),
                lpd.get_from_label_func("local difference in MeltingT", np.mean),
                lpd.get_from_label_func("local difference in Row", np.max),
                lpd.get_from_label_func(
                    "local difference in Electronegativity", np.min),
                lpd.get_from_label_func("local difference in NValence", np.std),
                lpd.get_from_label_func("local difference in NsUnfilled", np.mean),
                lpd.get_from_label_func("local difference in NdUnfilled", np.max),
                lpd.get_from_label_func("local difference in NdUnfilled", np.std),
                lpd.get_from_label_func("local difference in NUnfilled", np.max),
                lpd.get_from_label_func("local difference in NUnfilled", np.min),
                lpd.get_from_label_func("local difference in NUnfilled", np.mean),
                lpd.get_from_label_func("local difference in NUnfilled", np.std),
                lpd.get_from_label_func("local difference in GSvolume_pa", np.max),
                lpd.get_from_label_func("local difference in GSvolume_pa", np.min),
                lpd.get_from_label_func("local difference in SpaceGroupNumber", np.max),
            ]

    # Model and scaler files live next to this module.
    with open(
            f"{os.path.dirname(__file__)}/{dielectric_type}_{model_type}.joblib",
            "rb") as fr:
        model: RandomForestRegressor = joblib.load(fr)

    with open(
            f"{os.path.dirname(__file__)}/{dielectric_type}_{model_type}_scaler.joblib",
            "rb") as fr:
        scaler: StandardScaler = joblib.load(fr)

    # The scaler and the model both expect a 2-D input (one row per sample).
    descriptor = scaler.transform([descriptor])
    return model.predict(descriptor)[0]
def from_preset(preset, **kwargs):
    """
    Create a SiteStatsFingerprint class according to a preset

    Args:
        preset (str) - Name of preset
        kwargs - Options for SiteStatsFingerprint. They are forwarded for
            the CrystalNNFingerprint_*, OPSiteFingerprint and fallback
            coordination-number presets. The "*_ward-prb-2017" and
            "*-dejong2016*" presets fix their own ``stats`` and do not
            forward ``kwargs``.

    Returns:
        SiteStatsFingerprint configured for the requested preset.

    Raises:
        ValueError: if ``preset`` is not one of the named presets and
            building a CoordinationNumber-based fingerprint from it fails.
            The underlying error is attached as ``__cause__``.
    """
    if preset == "CrystalNNFingerprint_cn":
        return SiteStatsFingerprint(
            CrystalNNFingerprint.from_preset("cn", cation_anion=False),
            **kwargs)

    elif preset == "CrystalNNFingerprint_cn_cation_anion":
        return SiteStatsFingerprint(
            CrystalNNFingerprint.from_preset("cn", cation_anion=True),
            **kwargs)

    elif preset == "CrystalNNFingerprint_ops":
        return SiteStatsFingerprint(
            CrystalNNFingerprint.from_preset("ops", cation_anion=False),
            **kwargs)

    elif preset == "CrystalNNFingerprint_ops_cation_anion":
        return SiteStatsFingerprint(
            CrystalNNFingerprint.from_preset("ops", cation_anion=True),
            **kwargs)

    elif preset == "OPSiteFingerprint":
        return SiteStatsFingerprint(OPSiteFingerprint(), **kwargs)

    elif preset == "LocalPropertyDifference_ward-prb-2017":
        return SiteStatsFingerprint(
            LocalPropertyDifference.from_preset("ward-prb-2017"),
            stats=["minimum", "maximum", "range", "mean", "avg_dev"])

    elif preset == "CoordinationNumber_ward-prb-2017":
        return SiteStatsFingerprint(
            CoordinationNumber(nn=VoronoiNN(weight='area'),
                               use_weights="effective"),
            stats=["minimum", "maximum", "range", "mean", "avg_dev"])

    elif preset == "Composition-dejong2016_AD":
        return SiteStatsFingerprint(
            LocalPropertyDifference(properties=[
                "Number", "AtomicWeight", "Column", "Row",
                "CovalentRadius", "Electronegativity"
            ], signed=False),
            stats=['holder_mean::%d' % d for d in range(0, 4 + 1)]
            + ['std_dev'],
        )

    elif preset == "Composition-dejong2016_SD":
        return SiteStatsFingerprint(
            LocalPropertyDifference(properties=[
                "Number", "AtomicWeight", "Column", "Row",
                "CovalentRadius", "Electronegativity"
            ], signed=True),
            stats=['holder_mean::%d' % d for d in [1, 2, 4]] + ['std_dev'],
        )

    elif preset == "BondLength-dejong2016":
        return SiteStatsFingerprint(
            AverageBondLength(VoronoiNN()),
            stats=['holder_mean::%d' % d for d in range(-4, 4 + 1)]
            + ['std_dev', 'geom_std_dev'])

    elif preset == "BondAngle-dejong2016":
        return SiteStatsFingerprint(
            AverageBondAngle(VoronoiNN()),
            stats=['holder_mean::%d' % d for d in range(-4, 4 + 1)]
            + ['std_dev', 'geom_std_dev'])

    else:
        # TODO: Why assume coordination number? Should this just raise an error? - lw
        # One of the various Coordination Number presets:
        # MinimumVIRENN, MinimumDistanceNN, JmolNN, VoronoiNN, etc.
        try:
            return SiteStatsFingerprint(
                CoordinationNumber.from_preset(preset), **kwargs)
        except Exception as err:
            # Only catch ordinary errors (never KeyboardInterrupt/SystemExit)
            # and keep the original failure as the cause so that e.g. a bad
            # keyword argument is not silently reported as a bad preset.
            raise ValueError("Unrecognized preset!") from err
def test_op_site_fingerprint(self):
    # Verifies the label ordering of a default OPSiteFingerprint and a few
    # reference order-parameter values on the simple-cubic (self.sc) and
    # CsCl (self.cscl) fixtures.
    expected_labels = [
        "sgl_bd CN_1", "bent180 CN_2", "bent45 CN_2", "bent90 CN_2",
        "bent135 CN_2", "tri_plan CN_3", "tet CN_3", "T CN_3",
        "sq_plan CN_4", "sq CN_4", "tet CN_4", "see_saw CN_4",
        "tri_pyr CN_4", "pent_plan CN_5", "sq_pyr CN_5",
        "tri_bipyr CN_5", "oct CN_6", "pent_pyr CN_6", "hex_pyr CN_7",
        "pent_bipyr CN_7", "bcc CN_8", "hex_bipyr CN_8",
        "q2 CN_9", "q4 CN_9", "q6 CN_9",
        "q2 CN_10", "q4 CN_10", "q6 CN_10",
        "q2 CN_11", "q4 CN_11", "q6 CN_11",
        "cuboct CN_12", "q2 CN_12", "q4 CN_12", "q6 CN_12",
    ]

    fingerprint = OPSiteFingerprint()
    labels = fingerprint.feature_labels()
    # Compare label by label, in the order the featurizer reports them.
    for position, label in enumerate(labels):
        self.assertEqual(label, expected_labels[position])

    # Simple cubic: the octahedral order parameter should be ~1.
    ops = fingerprint.featurize(self.sc, 0)
    self.assertEqual(len(ops), 35)
    oct_index = fingerprint.feature_labels().index('oct CN_6')
    self.assertAlmostEqual(int(1000 * ops[oct_index]), 999)

    # CsCl with the default distance weighting.
    ops = fingerprint.featurize(self.cscl, 0)
    bcc_index = fingerprint.feature_labels().index('bcc CN_8')
    self.assertAlmostEqual(int(1000 * ops[bcc_index] + 0.5), 895)

    # CsCl again with dist_exp=0.
    fingerprint = OPSiteFingerprint(dist_exp=0)
    ops = fingerprint.featurize(self.cscl, 0)
    bcc_index = fingerprint.feature_labels().index('bcc CN_8')
    self.assertAlmostEqual(int(1000 * ops[bcc_index] + 0.5), 955)
def test_op_site_fingerprint(self):
    # Verifies the label ordering of a default OPSiteFingerprint and a few
    # reference order-parameter values on the simple-cubic (self.sc) and
    # CsCl (self.cscl) fixtures.
    expected_labels = [
        'sgl_bd CN_1', 'L-shaped CN_2', 'water-like CN_2',
        'bent 120 degrees CN_2', 'bent 150 degrees CN_2',
        'linear CN_2', 'trigonal planar CN_3',
        'trigonal non-coplanar CN_3', 'T-shaped CN_3',
        'square co-planar CN_4', 'tetrahedral CN_4',
        'rectangular see-saw-like CN_4', 'see-saw-like CN_4',
        'trigonal pyramidal CN_4', 'pentagonal planar CN_5',
        'square pyramidal CN_5', 'trigonal bipyramidal CN_5',
        'hexagonal planar CN_6', 'octahedral CN_6',
        'pentagonal pyramidal CN_6', 'hexagonal pyramidal CN_7',
        'pentagonal bipyramidal CN_7', 'body-centered cubic CN_8',
        'hexagonal bipyramidal CN_8', 'q2 CN_9', 'q4 CN_9', 'q6 CN_9',
        'q2 CN_10', 'q4 CN_10', 'q6 CN_10',
        'q2 CN_11', 'q4 CN_11', 'q6 CN_11',
        'cuboctahedral CN_12', 'q2 CN_12', 'q4 CN_12', 'q6 CN_12',
    ]

    fingerprint = OPSiteFingerprint()
    labels = fingerprint.feature_labels()
    # Compare label by label, in the order the featurizer reports them.
    for position, label in enumerate(labels):
        self.assertEqual(label, expected_labels[position])

    # Simple cubic: the octahedral order parameter should be ~1.
    ops = fingerprint.featurize(self.sc, 0)
    self.assertEqual(len(ops), 37)
    oct_index = fingerprint.feature_labels().index('octahedral CN_6')
    self.assertAlmostEqual(ops[oct_index], 0.9995, places=7)

    # CsCl with the default distance weighting.
    ops = fingerprint.featurize(self.cscl, 0)
    bcc_index = fingerprint.feature_labels().index(
        'body-centered cubic CN_8')
    self.assertAlmostEqual(ops[bcc_index], 0.8955, places=7)

    # CsCl again with dist_exp=0.
    fingerprint = OPSiteFingerprint(dist_exp=0)
    ops = fingerprint.featurize(self.cscl, 0)
    bcc_index = fingerprint.feature_labels().index(
        'body-centered cubic CN_8')
    self.assertAlmostEqual(ops[bcc_index], 0.9555, places=7)
def featurize_site(df: pd.DataFrame, site_stats=("mean", "std_dev")) -> pd.DataFrame:
    """ Add matminer site-level features to a DataFrame of structures.

    Each site featurizer listed below is wrapped in a
    `matminer.featurizers.structure.SiteStatsFingerprint` and applied to
    the structure column. New columns are prefixed with the featurizer's
    (possibly legacy) name, columns that are zero everywhere are dropped,
    and the result is passed through `clean_df`.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects. The input is
            copied, not modified.
        site_stats (Tuple[str]): the matminer site stats to use in the
            `SiteStatsFingerprint` for all features.

    Returns:
        pandas.DataFrame: the decorated DataFrame.

    """
    logging.info("Applying site featurizers...")

    # Work on a copy and namespace the caller's columns so that they can be
    # told apart from generated feature columns (which lack a "|" at first).
    df = df.copy()
    df.columns = ["Input data|" + x for x in df.columns]

    site_fingerprints = (
        AGNIFingerprints(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        OPSiteFingerprint(),
        CrystalNNFingerprint.from_preset("ops"),
        VoronoiFingerprint(),
        GaussianSymmFunc(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        LocalPropertyDifference(),
        BondOrientationalParameter(),
        AverageBondLength(VoronoiNN()),
        AverageBondAngle(VoronoiNN())
    )

    # rename some features for backwards compatibility with pretrained models
    legacy_names = {
        "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
        "AGNIFingerprints": "AGNIFingerPrint",
        "BondOrientationalParameter": "BondOrientationParameter",
        "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
    }

    for fingerprint in site_fingerprints:
        df = SiteStatsFingerprint(
            fingerprint, stats=site_stats
        ).featurize_dataframe(
            df, "Input data|structure", multiindex=False, ignore_errors=True
        )

        class_name = fingerprint.__class__.__name__
        prefix = legacy_names.get(class_name, class_name)
        # A prefix that already contains "|" is used verbatim.
        if "|" not in prefix:
            prefix += "|"

        # Only columns without "|" are new in this iteration; label them.
        df.columns = [
            col if "|" in col else f"{prefix}{col}" for col in df.columns
        ]

    # Keep only columns with at least one non-zero entry.
    has_nonzero = (df != 0).any(axis=0)
    df = df.loc[:, has_nonzero]

    return clean_df(df)
def get_op_site_features(s, site_idx):
    """
    Featurize one site with a default OPSiteFingerprint.

    Args:
        s: structure passed through to ``OPSiteFingerprint.featurize``.
        site_idx: index of the site within ``s`` to featurize.

    Returns:
        The featurizer output converted with ``.tolist()``.
    """
    return OPSiteFingerprint().featurize(s, site_idx).tolist()
class OPStructureFingerprint(BaseFeaturizer):
    """
    Calculates all order parameters (OPs) for all sites in a crystal
    structure.

    Args:
        op_site_fp (OPSiteFingerprint): defines the types of order
            parameters to be calculated. A default OPSiteFingerprint is
            created when None.
        stats ([str]): list of weighted statistics to compute for each
            feature. A single string is treated as a one-element list.
            If stats is None, for each order parameter, a list is
            returned that contains the calculated parameter for each site
            in the structure.
            *Note for nth mode, stat must be 'n*_mode'; e.g.
            stat='2nd_mode' (the leading digit selects the mode).
        min_oxi (int): minimum site oxidation state for inclusion (e.g.,
            zero means metals/cations only)
        max_oxi (int): maximum site oxidation state for inclusion
    """

    def __init__(self, op_site_fp=None,
                 stats=('mean', 'std_dev', 'minimum', 'maximum'),
                 min_oxi=None, max_oxi=None):
        self.op_site_fp = OPSiteFingerprint() if op_site_fp is None \
            else op_site_fp
        self._labels = self.op_site_fp.feature_labels()
        self.stats = (stats,) if isinstance(stats, str) else stats
        # Highest mode rank requested through an 'n*_mode' stat; 0 when no
        # mode statistic is requested, so the attribute always exists.
        self.nmodes = max(
            (int(stat[0]) for stat in (self.stats or ())
             if '_mode' in stat),
            default=0)
        self.min_oxi = min_oxi
        self.max_oxi = max_oxi

    def featurize(self, s):
        """
        Calculate all sites' local structure order parameters (LSOPs).

        Sites are included only if their oxidation state lies within
        [min_oxi, max_oxi]; a bound that is None is not applied (and the
        oxidation state is not looked up when both bounds are None).
        Order-parameter values reported as None are stored as 0.0.

        Args:
            s: Pymatgen Structure object.

        Returns:
            If ``stats`` is set: a flat list with, for each order
            parameter (in ``op_site_fp.feature_labels()`` order), one
            value per requested statistic, matching ``feature_labels()``.
            Otherwise: a list with one entry per order parameter, each
            being the list of that parameter's values over the included
            sites.
        """
        opvals = [[] for _ in self._labels]
        for i, site in enumerate(s.sites):
            above_min = self.min_oxi is None \
                or site.specie.oxi_state >= self.min_oxi
            # max_oxi is an upper bound, hence "<=".
            below_max = self.max_oxi is None \
                or site.specie.oxi_state <= self.max_oxi
            if above_min and below_max:
                for j, opval in enumerate(self.op_site_fp.featurize(s, i)):
                    opvals[j].append(0.0 if opval is None else opval)

        if not self.stats:
            return opvals

        wants_modes = any('_mode' in stat for stat in self.stats)
        opstats = []
        for op in opvals:
            # Modes are computed once per order parameter and shared by all
            # 'n*_mode' statistics.
            modes = self.n_numerical_modes(op, self.nmodes, 0.01) \
                if wants_modes else None
            for stat in self.stats:
                if '_mode' in stat:
                    opstats.append(modes[int(stat[0]) - 1])
                else:
                    opstats.append(PropertyStats().calc_stat(op, stat))
        return opstats

    def feature_labels(self):
        """
        Return labels matching the output of ``featurize``.

        With ``stats`` set, labels are '<stat> <order parameter label>',
        grouped by order parameter; otherwise the raw order parameter
        labels are returned.
        """
        if not self.stats:
            return self._labels
        return ['%s %s' % (stat, attr)
                for attr in self._labels
                for stat in self.stats]

    def citations(self):
        """Return the BibTeX citation for this featurizer."""
        return ('@article{zimmermann_jain_2017, title={Applications of order'
                ' parameter feature vectors}, journal={in progress}, author={'
                'Zimmermann, N. E. R. and Jain, A.}, year={2017}}')

    def implementors(self):
        """Return the list of people who implemented this featurizer."""
        return (['Nils E. R. Zimmermann', 'Alireza Faghaninia',
                 'Anubhav Jain'])

    @staticmethod
    def n_numerical_modes(data_lst, n=2, dl=0.1):
        """
        Returns the n first modes of a data set that are obtained with
            a finite bin size for the underlying frequency distribution.

        Args:
            data_lst ([float]): data values.
            n (integer): number of most frequent elements to be determined.
            dl (float): bin size of underlying (coarsened) distribution.

        Returns:
            ([float]): first n most frequent entries (or nan if not found).
                Except when all values are identical, a mode is reported
                as the lower edge of its histogram bin.
        """
        # No data (e.g. every site was filtered out): no mode can be found.
        if len(data_lst) == 0:
            return [float('NaN') for _ in range(n)]
        # All values identical: that value is the only mode.
        if len(set(data_lst)) == 1:
            return [data_lst[0]] + [float('NaN') for _ in range(n - 1)]
        hist, bins = np.histogram(
            data_lst,
            bins=np.arange(min(data_lst), max(data_lst), dl),
            density=False)
        # argsort is ascending, so the last n indices are the fullest bins;
        # reverse to list the most frequent first.
        modes = list(bins[np.argsort(hist)[-n:]][::-1])
        return modes + [float('NaN') for _ in range(n - len(modes))]