def featurize_structure(df: pd.DataFrame) -> pd.DataFrame:
    """Decorate a DataFrame of structures with matminer structural features.

    Every featurizer listed in ``structure_features`` is fitted on the
    ``"structure"`` column and applied to it. Afterwards the dict-valued
    radial distribution function feature is expanded into one scalar column
    per distance bin (first 50 bins only), and the two categorical
    ``GlobalSymmetryFeatures`` columns are encoded as integers.

    Args:
        df (pandas.DataFrame): the input dataframe with a ``"structure"``
            column containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame, after being passed
            through ``clean_df``. A copy of the input is decorated.
    """
    logging.info("Applying structure featurizers...")

    df = df.copy()

    structure_features = [
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        BagofBonds(),
    ]

    featurizer = MultipleFeaturizer(
        [feature.fit(df["structure"]) for feature in structure_features]
    )
    df = featurizer.featurize_dataframe(
        df, "structure", multiindex=True, ignore_errors=True
    )
    # Collapse the multi-level column labels into flat "level|level" strings.
    df.columns = df.columns.map("|".join).str.strip("|")

    rdf_column = "RadialDistributionFunction|radial distribution function"
    # Positional access: the frame's index is not guaranteed to hold label 0.
    # NOTE(review): rows whose featurization failed (ignore_errors=True)
    # presumably hold NaN here and would break the subscripting below, as
    # well as the `map(int)` further down -- confirm and handle if needed.
    dist = df[rdf_column].iloc[0]["distances"][:50]
    for i, d in enumerate(dist):
        _rdf_key = "{}|d_{:.2f}".format(rdf_column, d)
        # Bind `i` explicitly so the lambda never depends on the loop variable.
        df[_rdf_key] = df[rdf_column].apply(lambda x, i=i: x["distribution"][i])
    df = df.drop(rdf_column, axis=1)

    _crystal_system = {
        "cubic": 1,
        "tetragonal": 2,
        # pymatgen spells this "orthorhombic"; with only the misspelt key such
        # rows were mapped to NaN. The old key is kept as a harmless alias.
        "orthorhombic": 3,
        "orthorombic": 3,
        "hexagonal": 4,
        "trigonal": 5,
        "monoclinic": 6,
        "triclinic": 7,
    }

    df["GlobalSymmetryFeatures|crystal_system"] = df[
        "GlobalSymmetryFeatures|crystal_system"
    ].map(_crystal_system)
    df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
        "GlobalSymmetryFeatures|is_centrosymmetric"
    ].map(int)

    return clean_df(df)
def test_bob(self):
    """Check BagofBonds fitting, featurization, padding and error handling."""
    # Test a single fit and featurization
    scm = SineCoulombMatrix(flatten=False)
    bob = BagofBonds(coulomb_matrix=scm, token=' - ')
    bob.fit([self.ni3al])
    truth1 = [
        235.74041833262768,
        1486.4464890775491, 1486.4464890775491, 1486.4464890775491,
        38.69353092306119, 38.69353092306119, 38.69353092306119,
        38.69353092306119, 38.69353092306119, 38.69353092306119,
        83.33991275736257, 83.33991275736257, 83.33991275736257,
        83.33991275736257, 83.33991275736257, 83.33991275736257,
    ]
    truth1_labels = [
        'Al site #0',
        'Ni site #0', 'Ni site #1', 'Ni site #2',
        'Al - Ni bond #0', 'Al - Ni bond #1', 'Al - Ni bond #2',
        'Al - Ni bond #3', 'Al - Ni bond #4', 'Al - Ni bond #5',
        'Ni - Ni bond #0', 'Ni - Ni bond #1', 'Ni - Ni bond #2',
        'Ni - Ni bond #3', 'Ni - Ni bond #4', 'Ni - Ni bond #5',
    ]
    # assertAlmostEqual cannot subtract two sequences: called on whole lists
    # it only passes on exact equality and raises TypeError otherwise. Compare
    # element by element so the tolerance is actually applied.
    features = bob.featurize(self.ni3al)
    self.assertEqual(len(features), len(truth1))
    for computed, expected in zip(features, truth1):
        self.assertAlmostEqual(computed, expected)
    self.assertEqual(bob.feature_labels(), truth1_labels)

    # Test padding from fitting and dataframe featurization
    bob.coulomb_matrix = CoulombMatrix(flatten=False)
    bob.fit([self.ni3al, self.cscl, self.diamond_no_oxi])
    df = pd.DataFrame({'structures': [self.cscl]})
    df = bob.featurize_dataframe(df, 'structures')
    self.assertEqual(len(df.columns.values), 25)
    self.assertAlmostEqual(df['Cs site #0'][0], 7513.468312122532)
    self.assertAlmostEqual(df['Al site #0'][0], 0.0)
    self.assertAlmostEqual(df['Cs - Cl bond #1'][0], 135.74726437398044)
    self.assertAlmostEqual(df['Al - Ni bond #0'][0], 0.0)

    # Test error handling for bad fits or null fits
    bob = BagofBonds(CoulombMatrix(flatten=False))
    self.assertRaises(NotFittedError, bob.featurize, self.nacl)
    bob.fit([self.ni3al, self.diamond])
    self.assertRaises(ValueError, bob.featurize, self.nacl)
def test_bob(self):
    """Check BagofBonds fitting, featurization, padding and error handling."""
    # Test a single fit and featurization
    bob = BagofBonds(coulomb_matrix=SineCoulombMatrix(), token=' - ')
    bob.fit([self.ni3al])
    truth1 = [
        235.74041833262768,
        1486.4464890775491, 1486.4464890775491, 1486.4464890775491,
        38.69353092306119, 38.69353092306119, 38.69353092306119,
        38.69353092306119, 38.69353092306119, 38.69353092306119,
        83.33991275736257, 83.33991275736257, 83.33991275736257,
        83.33991275736257, 83.33991275736257, 83.33991275736257,
    ]
    truth1_labels = [
        'Al site #0',
        'Ni site #0', 'Ni site #1', 'Ni site #2',
        'Al - Ni bond #0', 'Al - Ni bond #1', 'Al - Ni bond #2',
        'Al - Ni bond #3', 'Al - Ni bond #4', 'Al - Ni bond #5',
        'Ni - Ni bond #0', 'Ni - Ni bond #1', 'Ni - Ni bond #2',
        'Ni - Ni bond #3', 'Ni - Ni bond #4', 'Ni - Ni bond #5',
    ]
    # assertAlmostEqual cannot subtract two sequences: called on whole lists
    # it only passes on exact equality and raises TypeError otherwise. Compare
    # element by element so the tolerance is actually applied.
    features = bob.featurize(self.ni3al)
    self.assertEqual(len(features), len(truth1))
    for computed, expected in zip(features, truth1):
        self.assertAlmostEqual(computed, expected)
    self.assertEqual(bob.feature_labels(), truth1_labels)

    # Test padding from fitting and dataframe featurization
    bob.coulomb_matrix = CoulombMatrix()
    bob.fit([self.ni3al, self.cscl, self.diamond_no_oxi])
    df = pd.DataFrame({'structures': [self.cscl]})
    df = bob.featurize_dataframe(df, 'structures')
    self.assertEqual(len(df.columns.values), 25)
    self.assertAlmostEqual(df['Cs site #0'][0], 7513.468312122532)
    self.assertAlmostEqual(df['Al site #0'][0], 0.0)
    self.assertAlmostEqual(df['Cs - Cl bond #1'][0], 135.74726437398044)
    self.assertAlmostEqual(df['Al - Ni bond #0'][0], 0.0)

    # Test error handling for bad fits or null fits
    bob = BagofBonds()
    self.assertRaises(NotFittedError, bob.featurize, self.nacl)
    bob.fit([self.ni3al, self.diamond])
    self.assertRaises(ValueError, bob.featurize, self.nacl)