def test_sine_coulomb_matrix(self):
    """Check the sine Coulomb matrix of the diamond fixture.

    Two cases are covered: the default featurizer must reproduce the
    reference 2x2 matrix, and a featurizer built with ``False`` as its first
    positional argument must return a matrix with a zero diagonal.
    """
    reference = np.array([[36.8581, 6.147068], [6.147068, 36.8581]])

    # Default construction: compare against the reference matrix.
    default_result = SineCoulombMatrix().featurize(self.diamond)
    deviation = np.linalg.norm(default_result - reference)
    self.assertAlmostEqual(deviation, 0.0, places=4)

    # First positional argument False (presumably the diagonal switch):
    # both diagonal entries of the returned matrix must be exactly zero.
    no_diag = SineCoulombMatrix(False).featurize(self.diamond)[0]
    for idx in (0, 1):
        self.assertEqual(no_diag[idx][idx], 0)
def test_sine_coulomb_matrix(self):
    """Exercise SineCoulombMatrix in flattened and matrix modes."""
    # ---- flattened (eigenvalue) output ----
    flat_featurizer = SineCoulombMatrix(flatten=True)
    frame = pd.DataFrame({"s": [self.sc, self.ni3al]})

    # Featurizing before fitting has to be rejected.
    with self.assertRaises(NotFittedError):
        flat_featurizer.featurize_dataframe(frame, "s")

    frame = flat_featurizer.fit_featurize_dataframe(frame, "s")
    labels = flat_featurizer.feature_labels()
    self.assertEqual(labels[0], "sine coulomb matrix eig 0")

    expected_rows = (
        [235.740418, 0.0, 0.0, 0.0],
        [232.578562, 1656.288171, 1403.106576, 1403.106576],
    )
    for row, expected in enumerate(expected_rows):
        self.assertArrayAlmostEqual(
            frame[labels].iloc[row], expected, decimal=5)

    # ---- matrix output ----
    reference = np.array([[36.8581, 6.147068], [6.147068, 36.8581]])
    matrix_result = SineCoulombMatrix(flatten=False).featurize(self.diamond)
    self.assertAlmostEqual(
        np.linalg.norm(matrix_result - reference), 0.0, places=4)

    # Without diagonal elements the diagonal must be exactly zero.
    no_diag = SineCoulombMatrix(
        diag_elems=False, flatten=False).featurize(self.diamond)[0]
    for idx in (0, 1):
        self.assertEqual(no_diag[idx][idx], 0)
def __init__(self, coulomb_matrix=None, token=' - '):
    """
    Store the configuration of the featurizer.

    Args:
        coulomb_matrix: featurizer object kept as ``self.coulomb_matrix``.
            When omitted (``None``), a new
            ``SineCoulombMatrix(flatten=False)`` is created for this
            instance.
        token (str): kept as ``self.token``; defaults to ``' - '``.
    """
    # A default written as `coulomb_matrix=SineCoulombMatrix(...)` is built
    # once, when the function is defined, and then shared by every instance
    # that relies on the default. Building it here gives each instance its
    # own featurizer.
    if coulomb_matrix is None:
        coulomb_matrix = SineCoulombMatrix(flatten=False)
    self.coulomb_matrix = coulomb_matrix
    self.token = token
    # Both start unset; nothing in this constructor populates them.
    self.bag_lens = None
    self.ordered_bonds = None
def featurize_structure(df: pd.DataFrame) -> pd.DataFrame:
    """ Decorate input `pandas.DataFrame` of structures with structural
    features from matminer.

    Currently applies the set of all matminer structure features.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.

    """

    logging.info("Applying structure featurizers...")

    # Work on a copy so the caller's dataframe is left untouched.
    df = df.copy()

    structure_features = [
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        BagofBonds()
    ]

    featurizer = MultipleFeaturizer([feature.fit(df["structure"]) for feature in structure_features])

    df = featurizer.featurize_dataframe(df, "structure", multiindex=True, ignore_errors=True)
    # Collapse the two-level column index into "Featurizer|feature" names.
    df.columns = df.columns.map('|'.join).str.strip('|')

    rdf_column = "RadialDistributionFunction|radial distribution function"

    # Positional access: `df[col][0]` is a *label* lookup and raises KeyError
    # whenever the dataframe index does not contain the label 0.
    dist = df[rdf_column].iloc[0]['distances'][:50]
    for i, d in enumerate(dist):
        _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(d)
        # Bind `i` as a default so the lambda never depends on the loop
        # variable's later value.
        df[_rdf_key] = df[rdf_column].apply(lambda x, i=i: x['distribution'][i])

    df = df.drop(rdf_column, axis=1)

    _crystal_system = {
        "cubic": 1, "tetragonal": 2,
        # "orthorhombic" is the spelling pymatgen reports; the historical
        # misspelt key is kept so any existing input still maps to 3.
        "orthorhombic": 3, "orthorombic": 3,
        "hexagonal": 4, "trigonal": 5, "monoclinic": 6, "triclinic": 7
    }

    def _centro_to_int(value):
        # Rows that failed featurization (ignore_errors=True) hold NaN, and
        # int(NaN) raises ValueError; treat them as "not centrosymmetric".
        return 0 if pd.isna(value) else int(value)

    df["GlobalSymmetryFeatures|crystal_system"] = df["GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
    df["GlobalSymmetryFeatures|is_centrosymmetric"] = df["GlobalSymmetryFeatures|is_centrosymmetric"].map(_centro_to_int)

    return clean_df(df)
def _featurize(self, struct):
    """
    Calculate sine Coulomb matrix from pymatgen structure.

    Parameters
    ----------
    struct : dict
      Json-serializable dictionary representation of pymatgen.core.structure
      https://pymatgen.org/pymatgen.core.structure.html

    Returns
    -------
    features: np.ndarray
      2D sine Coulomb matrix with shape (max_atoms, max_atoms),
      or 1D matrix eigenvalues with shape (max_atoms,).

    """

    # NOTE(review): newer pymatgen releases expose Structure from
    # `pymatgen.core` only - confirm which versions must be supported.
    from pymatgen import Structure
    from matminer.featurizers.structure import SineCoulombMatrix as SCM

    s = Structure.from_dict(struct)

    # Get full N x N SCM
    scm = SCM(flatten=False)
    sine_mat = scm.featurize(s)

    if self.flatten:
        # The unflattened featurizer hands the matrix back inside a
        # one-element sequence, so `eigs` is 2-D with a leading axis of
        # length 1. Index that axis away before padding; copying `eigs`
        # itself into the 1-D buffer fails to broadcast for N > 1.
        eigs, _ = np.linalg.eig(sine_mat)
        zeros = np.zeros((self.max_atoms,))
        zeros[:len(eigs[0])] = eigs[0]
        features = zeros
    else:
        features = pad_array(sine_mat, self.max_atoms)

    features = np.asarray(features)

    return features
def _featurize(self, struct: "pymatgen.Structure"):
    """
    Calculate sine Coulomb matrix from pymatgen structure.

    Parameters
    ----------
    struct : pymatgen.Structure
      A periodic crystal composed of a lattice and a sequence of atomic
      sites with 3D coordinates and elements.

    Returns
    -------
    features: np.ndarray
      2D sine Coulomb matrix with shape (max_atoms, max_atoms),
      or 1D matrix eigenvalues with shape (max_atoms,).

    Raises
    ------
    ValueError
      If matminer is not installed.

    """

    try:
        from matminer.featurizers.structure import SineCoulombMatrix as SCM
    except ModuleNotFoundError as err:
        # Same exception type as before; the original cause is now chained.
        raise ValueError("This class requires matminer to be installed.") from err

    # Get full N x N SCM
    scm = SCM(flatten=False)
    sine_mat = scm.featurize(struct)

    if self.flatten:
        # `sine_mat` carries a leading axis of length 1, so `eigs` has shape
        # (1, N). Pad the single row of eigenvalues into a 1-D buffer so the
        # result really has the documented shape (max_atoms,); the previous
        # (1, max_atoms) buffer only accepted N == 1 or N == max_atoms.
        eigs, _ = np.linalg.eig(sine_mat)
        zeros = np.zeros(self.max_atoms)
        zeros[:len(eigs[0])] = eigs[0]
        features = zeros
    else:
        features = pad_array(sine_mat, self.max_atoms)

    features = np.asarray(features)

    return features
def __init__(self, max_atoms: int = 100, flatten: bool = True):
    """
    Configure the featurizer and create the wrapped matminer object.

    Parameters
    ----------
    max_atoms: int (default 100)
      Maximum number of atoms for any crystal in the dataset. Used to
      pad the Coulomb matrix.
    flatten: bool (default True)
      Return flattened vector of matrix eigenvalues.

    Raises
    ------
    ValueError
      If matminer cannot be imported.

    """

    # matminer is optional, so it is imported only when an instance is built.
    try:
        from matminer.featurizers.structure import (
            SineCoulombMatrix as MatminerSineCoulombMatrix)
    except ModuleNotFoundError:
        raise ValueError("This class requires matminer to be installed.")

    self.max_atoms, self.flatten = max_atoms, flatten
    # The wrapped featurizer is always created with flatten=False.
    self.scm = MatminerSineCoulombMatrix(flatten=False)
def _featurize(self, datapoint: PymatgenStructure, **kwargs) -> np.ndarray:
    """
    Calculate sine Coulomb matrix from pymatgen structure.

    Parameters
    ----------
    datapoint: pymatgen.core.Structure
      A periodic crystal composed of a lattice and a sequence of atomic
      sites with 3D coordinates and elements.
    **kwargs
      Only the deprecated ``struct`` keyword is read; it is used as the
      structure when ``datapoint`` is None.

    Returns
    -------
    features: np.ndarray
      2D sine Coulomb matrix with shape (max_atoms, max_atoms),
      or 1D matrix eigenvalues with shape (max_atoms,).

    Raises
    ------
    ImportError
      If matminer is not installed.

    """
    if 'struct' in kwargs and datapoint is None:
        import warnings

        datapoint = kwargs.get("struct")
        # Previously this *raised* DeprecationWarning, which made the
        # assignment above dead code and turned a deprecation notice into a
        # hard failure. Emit a real warning and carry on.
        warnings.warn(
            'Struct is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    if self.scm is None:
        # Lazily create the matminer featurizer on first use.
        try:
            from matminer.featurizers.structure import SineCoulombMatrix as SCM
        except ModuleNotFoundError as err:
            raise ImportError(
                "This class requires matminer to be installed.") from err
        self.scm = SCM(flatten=False)

    # Get full N x N SCM
    sine_mat = self.scm.featurize(datapoint)

    if self.flatten:
        # `eigs` has a leading axis of length 1; pad its single row with
        # zeros up to max_atoms.
        eigs, _ = np.linalg.eig(sine_mat)
        zeros = np.zeros(self.max_atoms)
        zeros[:len(eigs[0])] = eigs[0]
        features = zeros
    else:
        features = pad_array(sine_mat, self.max_atoms)

    features = np.asarray(features)

    return features
def test_bob(self):
    """Check BagofBonds values, labels, padding and error handling."""
    # --- single fit and featurization on the ni3al fixture ---
    sine_matrix = SineCoulombMatrix(flatten=False)
    bob = BagofBonds(coulomb_matrix=sine_matrix, token=' - ')
    bob.fit([self.ni3al])

    # One Al site, three Ni sites, six Al-Ni bonds and six Ni-Ni bonds.
    expected_values = (
        [235.74041833262768]
        + [1486.4464890775491] * 3
        + [38.69353092306119] * 6
        + [83.33991275736257] * 6
    )
    expected_labels = [
        'Al site #0',
        'Ni site #0', 'Ni site #1', 'Ni site #2',
        'Al - Ni bond #0', 'Al - Ni bond #1', 'Al - Ni bond #2',
        'Al - Ni bond #3', 'Al - Ni bond #4', 'Al - Ni bond #5',
        'Ni - Ni bond #0', 'Ni - Ni bond #1', 'Ni - Ni bond #2',
        'Ni - Ni bond #3', 'Ni - Ni bond #4', 'Ni - Ni bond #5',
    ]
    self.assertArrayAlmostEqual(bob.featurize(self.ni3al), expected_values)
    self.assertEqual(bob.feature_labels(), expected_labels)

    # --- padding after fitting on several structures, via a dataframe ---
    bob.coulomb_matrix = CoulombMatrix(flatten=False)
    bob.fit([self.ni3al, self.cscl, self.diamond_no_oxi])
    frame = pd.DataFrame({'structures': [self.cscl]})
    frame = bob.featurize_dataframe(frame, 'structures')
    self.assertEqual(len(frame.columns.values), 25)
    self.assertAlmostEqual(frame['Cs+ site #0'][0], 7513.468312122532)
    self.assertAlmostEqual(frame['Al site #0'][0], 0.0)
    self.assertAlmostEqual(
        frame['Cs+ - Cl- bond #1'][0], 135.74726437398044, 3)
    self.assertAlmostEqual(frame['Al - Ni bond #0'][0], 0.0)

    # --- error handling for bad fits or null fits ---
    bob = BagofBonds(CoulombMatrix(flatten=False))
    with self.assertRaises(NotFittedError):
        bob.featurize(self.nacl)
    bob.fit([self.ni3al, self.diamond])
    with self.assertRaises(ValueError):
        bob.featurize(self.nacl)
class DeBreuck2020Featurizer(modnet.featurizers.MODFeaturizer):
    """ Featurizer presets used for the paper 'Machine learning
    materials properties for small datasets' by Pierre-Paul De Breuck,
    Geoffroy Hautier & Gian-Marco Rignanese, arXiv:2004.14766 (2020).

    Uses most of the featurizers implemented by matminer at the time of
    writing with their default hyperparameters and presets.

    The matminer imports live in the class body, so they are resolved when
    the class is defined and the featurizer instances below are created once
    as class attributes.

    """

    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        # CohesiveEnergy, - This descriptor was not used in the paper preset
        # ElectronAffinity, - This descriptor was not used in the paper preset
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        # BagofBonds, - This descriptor was not used in the paper preset
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        # PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    oxide_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        # PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        # BagofBonds(),
    )

    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    def featurize_composition(self, df):
        """ Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        """
        df = super().featurize_composition(df)

        # Encode orbital characters and element symbols as numbers.
        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df['AtomicOrbitals|HOMO_character'] = df[
            'AtomicOrbitals|HOMO_character'].map(_orbitals)
        df['AtomicOrbitals|LUMO_character'] = df[
            'AtomicOrbitals|LUMO_character'].map(_orbitals)

        # Non-string entries (e.g. missing values) become -1, otherwise the
        # atomic number of the element.
        df['AtomicOrbitals|HOMO_element'] = df[
            'AtomicOrbitals|HOMO_element'].apply(
                lambda x: -1 if not isinstance(x, str) else Element(x).Z)
        df['AtomicOrbitals|LUMO_element'] = df[
            'AtomicOrbitals|LUMO_element'].apply(
                lambda x: -1 if not isinstance(x, str) else Element(x).Z)

        df = df.replace([np.inf, -np.inf, np.nan], 0)

        return modnet.featurizers.clean_df(df)

    def featurize_structure(self, df):
        """ Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        """
        df = super().featurize_structure(df)

        rdf_column = "RadialDistributionFunction|radial distribution function"

        # Expand the RDF dict of the first row into one column per distance
        # (first 50 distances only), then drop the original dict column.
        dist = df[rdf_column].iloc[0]['distances'][:50]
        for i, d in enumerate(dist):
            _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(
                d)
            df[_rdf_key] = df[rdf_column].apply(
                lambda x, i=i: x['distribution'][i])

        df = df.drop(rdf_column, axis=1)

        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            # "orthorhombic" is the spelling pymatgen reports; the misspelt
            # key is kept so any existing input still maps to 3.
            "orthorhombic": 3,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7
        }

        def _int_map(x):
            # `x == np.nan` is always False (NaN never equals anything), so
            # NaN used to fall through to the truthiness test and map to 1.
            if isinstance(x, float) and np.isnan(x):
                return 0
            return 1 if x else 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"].map(_int_map)

        return modnet.featurizers.clean_df(df)

    def featurize_site(self, df):
        """ Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        """

        # rename some features for backwards compatibility with pretrained models
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)

        # Keep only columns that have at least one non-zero entry.
        df = df.loc[:, (df != 0).any(axis=0)]

        return modnet.featurizers.clean_df(df)
def test_sine_coulomb_matrix(self):
    """Verify the eigenvalue features and the raw matrix of SineCoulombMatrix."""
    # Eigenvalue ("flat") representation, driven through the dataframe API.
    eig_featurizer = SineCoulombMatrix(flatten=True)
    structures = pd.DataFrame({"s": [self.sc, self.ni3al]})

    # An unfitted featurizer must refuse to featurize.
    with self.assertRaises(NotFittedError):
        eig_featurizer.featurize_dataframe(structures, "s")

    featurized = eig_featurizer.fit_featurize_dataframe(structures, "s")
    labels = eig_featurizer.feature_labels()
    self.assertEqual(labels[0], "sine coulomb matrix eig 0")

    first_row = featurized[labels].iloc[0]
    second_row = featurized[labels].iloc[1]
    self.assertArrayAlmostEqual(
        first_row, [235.740418, 0.0, 0.0, 0.0], decimal=5)
    self.assertArrayAlmostEqual(
        second_row,
        [232.578562, 1656.288171, 1403.106576, 1403.106576],
        decimal=5)

    # Matrix representation of the diamond fixture.
    target = np.array([[36.8581, 6.147068], [6.147068, 36.8581]])
    matrix_featurizer = SineCoulombMatrix(flatten=False)
    residual = matrix_featurizer.featurize(self.diamond) - target
    self.assertAlmostEqual(np.linalg.norm(residual), 0.0, places=4)

    # With diag_elems=False both diagonal entries must be exactly zero.
    hollow_featurizer = SineCoulombMatrix(diag_elems=False, flatten=False)
    hollow = hollow_featurizer.featurize(self.diamond)[0]
    for k in range(2):
        self.assertEqual(hollow[k][k], 0)
class SineCoulombMatrix(MaterialStructureFeaturizer):
    """
    Calculate sine Coulomb matrix for crystals.

    A variant of Coulomb matrix for periodic crystals.

    The sine Coulomb matrix is identical to the Coulomb matrix, except
    that the inverse distance function is replaced by the inverse of
    sin**2 of the vector between sites which are periodic in the
    dimensions of the crystal lattice.

    Features are flattened into a vector of matrix eigenvalues by default
    for ML-readiness. To ensure that all feature vectors are equal
    length, the maximum number of atoms (eigenvalues) in the input
    dataset must be specified.

    This featurizer requires the optional dependencies pymatgen and
    matminer. It may be useful when crystal structures with 3D coordinates
    are available.

    See [1]_ for more details.

    References
    ----------
    .. [1] Faber et al. Inter. J. Quantum Chem. 115, 16, 2015.

    Examples
    --------
    >>> import pymatgen as mg
    >>> lattice = mg.Lattice.cubic(4.2)
    >>> structure = mg.Structure(lattice, ["Cs", "Cl"], [[0, 0, 0], [0.5, 0.5, 0.5]])
    >>> featurizer = SineCoulombMatrix(max_atoms=2)
    >>> features = featurizer.featurize([structure])

    Notes
    -----
    This class requires matminer and Pymatgen to be installed.
    """

    def __init__(self, max_atoms: int = 100, flatten: bool = True):
        """
        Parameters
        ----------
        max_atoms: int (default 100)
          Maximum number of atoms for any crystal in the dataset. Used to
          pad the Coulomb matrix.
        flatten: bool (default True)
          Return flattened vector of matrix eigenvalues.

        Raises
        ------
        ValueError
          If matminer cannot be imported.
        """

        # matminer is optional, so it is only imported when an instance is
        # actually created.
        try:
            from matminer.featurizers.structure import (
                SineCoulombMatrix as MatminerSineCoulombMatrix)
        except ModuleNotFoundError:
            raise ValueError("This class requires matminer to be installed.")

        self.max_atoms, self.flatten = max_atoms, flatten
        # The wrapped featurizer always produces the full matrix.
        self.scm = MatminerSineCoulombMatrix(flatten=False)

    def _featurize(self, struct: PymatgenStructure) -> np.ndarray:
        """
        Calculate sine Coulomb matrix from pymatgen structure.

        Parameters
        ----------
        struct: pymatgen.Structure
          A periodic crystal composed of a lattice and a sequence of atomic
          sites with 3D coordinates and elements.

        Returns
        -------
        features: np.ndarray
          2D sine Coulomb matrix with shape (max_atoms, max_atoms),
          or 1D matrix eigenvalues with shape (max_atoms,).
        """

        # Full N x N sine Coulomb matrix from matminer.
        raw_matrix = self.scm.featurize(struct)

        # Matrix mode: pad the matrix and return it as an array.
        if not self.flatten:
            return np.asarray(pad_array(raw_matrix, self.max_atoms))

        # Eigenvalue mode: take the first row of eigenvalues and zero-pad it
        # up to max_atoms entries.
        eigenvalues = np.linalg.eig(raw_matrix)[0]
        leading = eigenvalues[0]
        padded = np.zeros(self.max_atoms)
        padded[:len(leading)] = leading
        return np.asarray(padded)
sizes = np.array([self.length(row) for row in X]) y_pred = self.predict(X) / sizes y_true = y / sizes return sklearn.metrics.r2_score(y_true, y_pred) def length(self, vec): return vec[vec != 0].shape[0] # SCM evaluation DIAG = True print("DIAG ELEMS", DIAG) # Featurize dataframe with sine coulomb matrix and time it start = time.monotonic() scm = SineCoulombMatrix(DIAG) # Set the number of jobs for parallelization scm.set_n_jobs(NJOBS) df = scm.featurize_dataframe(df, 'structure') # Take the eigenvalues of the SCMs to form vector descriptors df['sine coulomb matrix'] = pd.Series( [np.sort(np.linalg.eigvals(s))[::-1] for s in df['sine coulomb matrix']], df.index) finish = time.monotonic() print("TIME TO FEATURIZE SCM %f SECONDS" % (finish - start)) print() # Set up KRR model krr = KrrScm() print(krr.get_params().keys())
# Load the precomputed target arrays from disk.
# NOTE(review): both paths are absolute and machine-specific, and
# allow_pickle=True lets np.load unpickle arbitrary objects - only use it on
# files from a trusted source.
centro_elastic_compliance=np.load('/Users/dennistrujillo/Dropbox/mp_share_dt_ag/elasticity_compliance/centro_elasticity.npy',allow_pickle=True)
centro_dielectric_tensor=np.load('/Users/dennistrujillo/Dropbox/mp_share_dt_ag/dielectric_total/centro_diel.npy',allow_pickle=True)

# Assemble the structures and their ids (both defined outside this excerpt)
# into the dataframe that gets featurized below.
data = {'structures': centrosymmetric_structures, 'ids' : task_ids}
df = pd.DataFrame(data)

# Disabled alternative: SOAP representation.
#soap representation
#from matminer.featurizers.structure import SOAP
#soap = SOAP(periodic=True)
#soap=soap.fit(data['structures'])
#labels=soap.feature_labels()
#df = soap.featurize_dataframe(df,'structures')

# Active featurizer: sine Coulomb matrix with default settings. It is fitted
# on the structures before featurizing, and `labels` keeps the names of the
# generated feature columns.
from matminer.featurizers.structure import SineCoulombMatrix
sine_coulomb = SineCoulombMatrix()
# Hard-coded number of parallel jobs.
sine_coulomb.set_n_jobs(28)
sine_coulomb.fit(centrosymmetric_structures)#data['structures'])
labels=sine_coulomb.feature_labels()
df = sine_coulomb.featurize_dataframe(df, 'structures')#,ignore_errors=True)

# Disabled alternative: AGNI site fingerprints.
#agni
#from matminer.featurizers.site import AGNIFingerprints
#agni=AGNIFingerprints(directions=['x','y','z'])
#agni.set_n_jobs(28)
#labels=agni.feature_labels()
#df = agni.featurize(df['structures'],0)
#df = agni.featurize_dataframe(df, ['structures', 'site'])#,ignore_errors=True)

#get s_vs_ep
# Starts empty; it is filled by code after this excerpt.
ec_list=[]
# NOTE(review): `score` and `length` take `self` and call `self.predict`, so
# they look like methods of an estimator class (presumably KrrScm) whose
# header is outside this excerpt - re-indent them under that class on merge.
def score(self, X, y):
    """Return the R^2 score after dividing predictions and targets by row size.

    The size of a row is the number of its non-zero entries, as computed by
    `length`.
    """
    sizes = np.array([self.length(row) for row in X])
    y_pred = self.predict(X) / sizes
    y_true = y / sizes
    return sklearn.metrics.r2_score(y_true, y_pred)

def length(self, vec):
    """Return the number of non-zero entries in `vec`."""
    return vec[vec != 0].shape[0]

# SCM evaluation
DIAG = True
print ("DIAG ELEMS", DIAG)
# Featurize dataframe with sine coulomb matrix and time it
start = time.monotonic()
# DIAG is passed as the first positional argument of the featurizer.
scm = SineCoulombMatrix(DIAG)
# Set the number of jobs for parallelization
scm.set_n_jobs(NJOBS)
df = scm.featurize_dataframe(df, 'structure')
# Take the eigenvalues of the SCMs to form vector descriptors
# (sorted in descending order for every structure)
df['sine coulomb matrix'] = pd.Series([np.sort(np.linalg.eigvals(s))[::-1] \
                                       for s in df['sine coulomb matrix']], df.index)
finish = time.monotonic()
print ("TIME TO FEATURIZE SCM %f SECONDS" % (finish-start))
print()
# Set up KRR model
krr = KrrScm()
print(krr.get_params().keys())
# Initialize hyperparameter grid search
class FUTURE_PROSPECTS_2021(featurizer.extendedMODFeaturizer):
    """Featurizer preset combining composition, structure, site, DOS and
    band-structure featurizers from matminer.

    The matminer imports live in the class body, so they are resolved when
    the class is defined and the featurizer instances below are created once
    as class attributes.
    """

    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        CohesiveEnergy,
        ElectronAffinity,
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        BagofBonds,
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )
    from matminer.featurizers.dos import (
        DOSFeaturizer,
        SiteDOS,
        Hybridization,
        DosAsymmetry,
    )
    from matminer.featurizers.bandstructure import (
        BandFeaturizer,
        BranchPointEnergy
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    oxid_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        #PartialRadialDistributionFunction(), #Introduces a large amount of features
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
    )

    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    dos_featurizers = (
        DOSFeaturizer(),
        SiteDOS(),
        Hybridization()
    )

    band_featurizers = (
        BandFeaturizer(),
        BranchPointEnergy()
    )

    def __init__(self, n_jobs=None):
        """Store the requested number of jobs in ``self._n_jobs``."""
        self._n_jobs = n_jobs

    def featurize_composition(self, df):
        """Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.
        """
        df = super().featurize_composition(df)

        # Encode orbital characters and element symbols as numbers.
        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map(
            _orbitals
        )
        df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map(
            _orbitals
        )

        # Non-string entries become -1, otherwise the atomic number.
        df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )
        df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )

        return clean_df(df)

    def featurize_structure(self, df):
        """Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.
        """
        df = super().featurize_structure(df)

        rdf_column = "RadialDistributionFunction|radial distribution function"

        # Expand the RDF dict of the first row into one column per distance
        # (first 50 distances only), then drop the original dict column.
        dist = df[rdf_column].iloc[0]["distances"][:50]
        for i, d in enumerate(dist):
            _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(
                d
            )
            df[_rdf_key] = df[rdf_column].apply(
                lambda x, i=i: x["distribution"][i]
            )

        df = df.drop(rdf_column, axis=1)

        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            # "orthorhombic" is the spelling pymatgen reports; the misspelt
            # key is kept so any existing input still maps to 3.
            "orthorhombic": 3,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7,
        }

        def _int_map(x):
            # `x == np.nan` is always False (NaN never equals anything), so
            # NaN used to fall through to the truthiness test and map to 1.
            if isinstance(x, float) and np.isnan(x):
                return 0
            return 1 if x else 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"
        ].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"
        ].map(_int_map)

        return clean_df(df)

    def featurize_dos(self, df):
        """Applies the preset DOS featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.
        """
        df = super().featurize_dos(df)

        # One-hot encode the species columns and replace the originals.
        hotencodeColumns = ["DOSFeaturizer|vbm_specie_1", "DOSFeaturizer|cbm_specie_1"]
        one_hot = pd.get_dummies(df[hotencodeColumns])
        df = df.drop(hotencodeColumns, axis=1).join(one_hot)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df["DOSFeaturizer|vbm_character_1"] = df[
            "DOSFeaturizer|vbm_character_1"
        ].map(_orbitals)
        df["DOSFeaturizer|cbm_character_1"] = df[
            "DOSFeaturizer|cbm_character_1"
        ].map(_orbitals)

        # Splitting one feature into several floating features
        # e.g. number;number;number into three columns
        splitColumns = ["DOSFeaturizer|cbm_location_1", "DOSFeaturizer|vbm_location_1"]

        for column in splitColumns:
            try:
                newColumns = df[column].str.split(";", n=2, expand=True)
                for i in range(0, 3):
                    # Builtin `float`: the `np.float` alias was removed from
                    # NumPy, and the bare `except` used to swallow the
                    # resulting AttributeError, silently dropping these
                    # columns.
                    df[column + "_" + str(i)] = np.array(newColumns[i]).astype(float)
            except Exception:
                # Best effort: a column that cannot be split is skipped, but
                # KeyboardInterrupt/SystemExit are no longer caught.
                continue
        df = df.drop(splitColumns, axis=1)
        df = df.drop(["dos"], axis=1)
        return clean_df(df)

    def featurize_bandstructure(self, df):
        """Applies the preset band structure featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.
        """
        df = super().featurize_bandstructure(df)

        def _int_map(x):
            # Values whose string form is neither "False" nor "True" fall
            # through and yield None.
            if str(x) == "False":
                return 0
            elif str(x) == "True":
                return 1

        df["BandFeaturizer|is_gap_direct"] = df[
            "BandFeaturizer|is_gap_direct"
        ].map(_int_map)

        df = df.drop(["bandstructure"], axis=1)

        return clean_df(df)

    def featurize_site(self, df):
        """Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.
        """

        # Alternative names handed to the parent implementation for some
        # featurizers.
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)

        # Keep only columns that have at least one non-zero entry.
        df = df.loc[:, (df != 0).any(axis=0)]

        return clean_df(df)
class SineCoulombMatrix(MaterialStructureFeaturizer):
    """
    Calculate sine Coulomb matrix for crystals.

    A variant of Coulomb matrix for periodic crystals.

    The sine Coulomb matrix is identical to the Coulomb matrix, except
    that the inverse distance function is replaced by the inverse of
    sin**2 of the vector between sites which are periodic in the
    dimensions of the crystal lattice.

    Features are flattened into a vector of matrix eigenvalues by default
    for ML-readiness. To ensure that all feature vectors are equal
    length, the maximum number of atoms (eigenvalues) in the input
    dataset must be specified.

    This featurizer requires the optional dependencies pymatgen and
    matminer. It may be useful when crystal structures with 3D coordinates
    are available.

    See [1]_ for more details.

    References
    ----------
    .. [1] Faber et al. "Crystal Structure Representations for Machine
       Learning Models of Formation Energies", Inter. J. Quantum Chem.
       115, 16, 2015. https://arxiv.org/abs/1503.07406

    Examples
    --------
    >>> import deepchem as dc
    >>> import pymatgen as mg
    >>> lattice = mg.core.Lattice.cubic(4.2)
    >>> structure = mg.core.Structure(lattice, ["Cs", "Cl"], [[0, 0, 0], [0.5, 0.5, 0.5]])
    >>> featurizer = dc.feat.SineCoulombMatrix(max_atoms=2)
    >>> features = featurizer.featurize([structure])
    >>> type(features[0])
    <class 'numpy.ndarray'>
    >>> features[0].shape  # (max_atoms,)
    (2,)

    Note
    ----
    This class requires matminer and Pymatgen to be installed.
    """

    def __init__(self, max_atoms: int = 100, flatten: bool = True):
        """
        Parameters
        ----------
        max_atoms: int (default 100)
          Maximum number of atoms for any crystal in the dataset. Used to
          pad the Coulomb matrix.
        flatten: bool (default True)
          Return flattened vector of matrix eigenvalues.
        """

        self.max_atoms = max_atoms
        self.flatten = flatten
        # The matminer featurizer is created lazily in `_featurize`, so
        # constructing this object does not require matminer.
        self.scm: Any = None

    def _featurize(self, datapoint: PymatgenStructure, **kwargs) -> np.ndarray:
        """
        Calculate sine Coulomb matrix from pymatgen structure.

        Parameters
        ----------
        datapoint: pymatgen.core.Structure
          A periodic crystal composed of a lattice and a sequence of atomic
          sites with 3D coordinates and elements.
        **kwargs
          Only the deprecated ``struct`` keyword is read; it is used as the
          structure when ``datapoint`` is None.

        Returns
        -------
        features: np.ndarray
          2D sine Coulomb matrix with shape (max_atoms, max_atoms),
          or 1D matrix eigenvalues with shape (max_atoms,).

        Raises
        ------
        ImportError
          If matminer is not installed.
        """
        if 'struct' in kwargs and datapoint is None:
            import warnings

            datapoint = kwargs.get("struct")
            # Previously this *raised* DeprecationWarning, which made the
            # assignment above dead code and turned a deprecation notice
            # into a hard failure. Emit a real warning and carry on.
            warnings.warn(
                'Struct is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)

        if self.scm is None:
            try:
                from matminer.featurizers.structure import SineCoulombMatrix as SCM
            except ModuleNotFoundError as err:
                raise ImportError(
                    "This class requires matminer to be installed.") from err
            self.scm = SCM(flatten=False)

        # Get full N x N SCM
        sine_mat = self.scm.featurize(datapoint)

        if self.flatten:
            # `eigs` has a leading axis of length 1; pad its single row with
            # zeros up to max_atoms.
            eigs, _ = np.linalg.eig(sine_mat)
            zeros = np.zeros(self.max_atoms)
            zeros[:len(eigs[0])] = eigs[0]
            features = zeros
        else:
            features = pad_array(sine_mat, self.max_atoms)

        features = np.asarray(features)

        return features
sizes = np.array([self.length(row) for row in X]) y_pred = self.predict(X) / sizes y_true = y / sizes return sklearn.metrics.r2_score(y_true, y_pred) def length(self, vec): return vec[vec != 0].shape[0] # SCM evaluation DIAG = True print("DIAG ELEMS", DIAG) # Featurize dataframe with sine coulomb matrix and time it start = time.monotonic() scm = SineCoulombMatrix(diag_elems=DIAG, flatten=True) # Set the number of jobs for parallelization scm.set_n_jobs(NJOBS) df = scm.fit_featurize_dataframe(df, 'structure') # Take the eigenvalues of the SCMs to form vector descriptors # df['sine coulomb matrix'] = pd.Series([np.sort(np.linalg.eigvals(s))[::-1] # for s in df['sine coulomb matrix']], # df.index) finish = time.monotonic() print("TIME TO FEATURIZE SCM %f SECONDS" % (finish - start)) print() # Set up KRR model krr = KrrScm() print(krr.get_params().keys())