def test_prdf(self):
    """Check selected partial-RDF peaks for diamond, CsCl and Ni3Al.

    The expected numbers were derived by performing the same calculation
    in another code.

    Uses ``assertEqual`` / ``assertAlmostEqual``: the ``assertEquals`` and
    ``assertAlmostEquals`` aliases are deprecated and no longer exist in
    Python 3.12+.
    """
    # Test a few peaks in diamond
    p, r = PartialRadialDistributionFunction().featurize(self.diamond)
    self.assertEqual(len(p), 1)
    self.assertEqual(p[('C', 'C')][int(round(1.4 / 0.1))], 0)
    self.assertAlmostEqual(p[('C', 'C')][int(round(1.5 / 0.1))],
                           1.324451676)
    self.assertAlmostEqual(r.max(), 19.9)
    self.assertAlmostEqual(p[('C', 'C')][int(round(19.9 / 0.1))],
                           0.07197902)

    # Test a few peaks in CsCl, make sure it gets all types correctly
    p, r = PartialRadialDistributionFunction().featurize(self.cscl,
                                                         cutoff=10)
    self.assertEqual(len(p), 4)
    self.assertAlmostEqual(r.max(), 9.9)
    self.assertAlmostEqual(p[('Cs', 'Cl')][int(round(3.6 / 0.1))],
                           0.477823197)
    self.assertAlmostEqual(p[('Cl', 'Cs')][int(round(3.6 / 0.1))],
                           0.477823197)
    self.assertAlmostEqual(p[('Cs', 'Cs')][int(round(3.6 / 0.1))], 0)

    # Do Ni3Al, make sure it captures the antisymmetry of Ni/Al sites
    p, r = PartialRadialDistributionFunction().featurize(
        self.ni3al, cutoff=10, bin_size=0.5)
    self.assertEqual(len(p), 4)
    self.assertAlmostEqual(p[('Ni', 'Al')][int(round(2 / 0.5))],
                           0.125236677)
    self.assertAlmostEqual(p[('Al', 'Ni')][int(round(2 / 0.5))],
                           0.37571003)
    self.assertAlmostEqual(p[('Al', 'Al')][int(round(2 / 0.5))], 0)
def featurize_structure(df: pd.DataFrame) -> pd.DataFrame:
    """
    Decorate input `pandas.DataFrame` of structures with structural
    features from matminer.

    Currently applies the set of all matminer structure features. After
    featurization the multi-index columns are flattened to
    ``"<featurizer>|<feature>"`` names, the first 50 bins of the radial
    distribution function are expanded into one column per distance, and
    the crystal system / centrosymmetry features are encoded as integers.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame (the input is copied,
            not modified), passed through `clean_df`.

    """
    logging.info("Applying structure featurizers...")

    df = df.copy()

    structure_features = [
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        BagofBonds()
    ]

    featurizer = MultipleFeaturizer(
        [feature.fit(df["structure"]) for feature in structure_features])

    df = featurizer.featurize_dataframe(df, "structure", multiindex=True,
                                        ignore_errors=True)
    # Flatten the (featurizer, feature) multi-index into "a|b" names.
    df.columns = df.columns.map('|'.join).str.strip('|')

    rdf_col = "RadialDistributionFunction|radial distribution function"

    # Take the bin distances from the first row *by position*; a label
    # lookup (`[0]`) fails when the frame's index has no label 0.
    dist = df[rdf_col].iloc[0]['distances'][:50]
    for i, d in enumerate(dist):
        _rdf_key = rdf_col + "|d_{:.2f}".format(d)
        # Bind `i` as a default so the lambda never depends on the loop
        # variable's later values.
        df[_rdf_key] = df[rdf_col].apply(
            lambda x, i=i: x['distribution'][i])

    df = df.drop(rdf_col, axis=1)

    _crystal_system = {
        "cubic": 1,
        "tetragonal": 2,
        # Correct spelling, as reported by pymatgen's symmetry analyzer.
        "orthorhombic": 3,
        # Historic misspelling kept so any data using it still maps.
        "orthorombic": 3,
        "hexagonal": 4,
        "trigonal": 5,
        "monoclinic": 6,
        "triclinic": 7
    }

    df["GlobalSymmetryFeatures|crystal_system"] = \
        df["GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)

    df["GlobalSymmetryFeatures|is_centrosymmetric"] = \
        df["GlobalSymmetryFeatures|is_centrosymmetric"].map(int)

    return clean_df(df)
def test_prdf(self):
    """Exercise PartialRadialDistributionFunction from peaks to dataframe.

    Covers, in order: selected PRDF peaks for diamond, CsCl and Ni3Al
    (reference numbers were derived by performing the calculation in
    another code), the ``fit`` element selection, the feature labels,
    ``featurize``, ``featurize_dataframe`` and the label/feature ordering.
    """

    def bin_at(radius, width=0.1):
        # Index of the histogram bin holding `radius` for bins of `width`.
        return int(round(radius / width))

    # --- Diamond: a few peaks -------------------------------------------
    radii, pair_rdf = PartialRadialDistributionFunction().compute_prdf(
        self.diamond)
    self.assertEqual(len(pair_rdf.values()), 1)
    carbon = ('C', 'C')
    self.assertAlmostEqual(pair_rdf[carbon][bin_at(1.4)], 0)
    self.assertAlmostEqual(pair_rdf[carbon][bin_at(1.5)], 1.32445167622)
    self.assertAlmostEqual(max(radii), 19.9)
    self.assertAlmostEqual(pair_rdf[carbon][bin_at(19.9)], 0.07197902)

    # --- CsCl: every ordered pair type must be present -------------------
    cscl_prdf = PartialRadialDistributionFunction(cutoff=10)
    radii, pair_rdf = cscl_prdf.compute_prdf(self.cscl)
    self.assertEqual(len(pair_rdf.values()), 4)
    self.assertAlmostEqual(max(radii), 9.9)
    self.assertAlmostEqual(pair_rdf[('Cs', 'Cl')][bin_at(3.6)],
                           0.477823197)
    self.assertAlmostEqual(pair_rdf[('Cl', 'Cs')][bin_at(3.6)],
                           0.477823197)
    self.assertAlmostEqual(pair_rdf[('Cs', 'Cs')][bin_at(3.6)], 0)

    # --- Ni3Al: Ni/Al sites are not symmetric ----------------------------
    ni3al_prdf = PartialRadialDistributionFunction(cutoff=10, bin_size=0.5)
    radii, pair_rdf = ni3al_prdf.compute_prdf(self.ni3al)
    self.assertEqual(len(pair_rdf.values()), 4)
    self.assertAlmostEqual(pair_rdf[('Ni', 'Al')][bin_at(2, 0.5)],
                           0.125236677)
    self.assertAlmostEqual(pair_rdf[('Al', 'Ni')][bin_at(2, 0.5)],
                           0.37571003)
    self.assertAlmostEqual(pair_rdf[('Al', 'Al')][bin_at(2, 0.5)], 0)

    # --- fit(): element discovery, exclusion and inclusion ---------------
    featurizer = PartialRadialDistributionFunction()
    featurizer.fit([self.diamond, self.cscl, self.ni3al])
    self.assertEqual({'Cs', 'Cl', 'C', 'Ni', 'Al'},
                     set(featurizer.elements_))

    featurizer.exclude_elems = ['Cs', 'Al']
    featurizer.fit([self.diamond, self.cscl, self.ni3al])
    self.assertEqual({'Cl', 'C', 'Ni'}, set(featurizer.elements_))

    featurizer.include_elems = ['H']
    featurizer.fit([self.diamond, self.cscl, self.ni3al])
    self.assertEqual({'H', 'Cl', 'C', 'Ni'}, set(featurizer.elements_))

    # --- feature labels ---------------------------------------------------
    featurizer.exclude_elems = ()
    featurizer.include_elems = ()
    featurizer.elements_ = ['Al', 'Ni']
    labels = featurizer.feature_labels()
    n_bins = len(featurizer._make_bins()) - 1

    self.assertEqual(3 * n_bins, len(labels))
    self.assertIn('Al-Ni PRDF r=0.00-0.10', labels)

    # --- featurize() ------------------------------------------------------
    featurizer.elements_ = ['C']
    features = featurizer.featurize(self.diamond)
    _, pair_rdf = featurizer.compute_prdf(self.diamond)
    self.assertArrayAlmostEqual(features, pair_rdf[('C', 'C')])

    # --- featurize_dataframe() --------------------------------------------
    frame = pd.DataFrame.from_dict(
        {"structure": [self.diamond, self.cscl]})
    featurizer.fit(frame["structure"])
    frame = featurizer.featurize_dataframe(frame, col_id="structure")
    self.assertEqual(frame["Cs-Cl PRDF r=0.00-0.10"][0], 0.0)
    self.assertAlmostEqual(frame["Cl-Cl PRDF r=19.70-19.80"][1], 0.049, 3)
    self.assertEqual(frame["Cl-Cl PRDF r=19.90-20.00"][0], 0.0)

    # --- labels and features must share one ordering ----------------------
    featurizer.elements_ = ['Al', 'Ni']
    features = featurizer.featurize(self.ni3al)
    labels = featurizer.feature_labels()
    _, pair_rdf = featurizer.compute_prdf(self.ni3al)
    self.assertEqual((n_bins * 3, ), features.shape)
    self.assertTrue(labels[0].startswith('Al-Al'))
    self.assertTrue(labels[n_bins].startswith('Al-Ni'))
    self.assertTrue(labels[2 * n_bins].startswith('Ni-Ni'))

    ordered_pairs = (('Al', 'Al'), ('Al', 'Ni'), ('Ni', 'Ni'))
    stacked = np.hstack([pair_rdf[pair] for pair in ordered_pairs])
    self.assertArrayAlmostEqual(features, stacked)
def test_prdf(self):
    """Verify the partial radial distribution function featurizer.

    Peak values for diamond, CsCl and Ni3Al are compared against numbers
    derived by performing the calculation in another code; the remaining
    sections check ``fit``, the feature labels, ``featurize``,
    ``featurize_dataframe`` and that labels and features line up.
    """
    # Diamond peaks
    distances, prdf = PartialRadialDistributionFunction().compute_prdf(
        self.diamond)
    self.assertEqual(len(prdf.values()), 1)
    cc = prdf[('C', 'C')]
    self.assertAlmostEqual(cc[int(round(1.4 / 0.1))], 0)
    self.assertAlmostEqual(cc[int(round(1.5 / 0.1))], 1.32445167622)
    self.assertAlmostEqual(max(distances), 19.9)
    self.assertAlmostEqual(cc[int(round(19.9 / 0.1))], 0.07197902)

    # CsCl peaks: all four ordered element pairs should be reported
    distances, prdf = PartialRadialDistributionFunction(
        cutoff=10).compute_prdf(self.cscl)
    self.assertEqual(len(prdf.values()), 4)
    self.assertAlmostEqual(max(distances), 9.9)
    cscl_expected = (
        (('Cs', 'Cl'), 0.477823197),
        (('Cl', 'Cs'), 0.477823197),
        (('Cs', 'Cs'), 0),
    )
    for pair, value in cscl_expected:
        self.assertAlmostEqual(prdf[pair][int(round(3.6 / 0.1))], value)

    # Ni3Al peaks: Ni-Al and Al-Ni differ (antisymmetry of the sites)
    distances, prdf = PartialRadialDistributionFunction(
        cutoff=10, bin_size=0.5).compute_prdf(self.ni3al)
    self.assertEqual(len(prdf.values()), 4)
    ni3al_expected = (
        (('Ni', 'Al'), 0.125236677),
        (('Al', 'Ni'), 0.37571003),
        (('Al', 'Al'), 0),
    )
    for pair, value in ni3al_expected:
        self.assertAlmostEqual(prdf[pair][int(round(2 / 0.5))], value)

    # Fit operation: default, with exclusions, then with an inclusion
    structures = (self.diamond, self.cscl, self.ni3al)
    featurizer = PartialRadialDistributionFunction()
    featurizer.fit(list(structures))
    self.assertEqual({'Cs', 'Cl', 'C', 'Ni', 'Al'},
                     set(featurizer.elements_))

    featurizer.exclude_elems = ['Cs', 'Al']
    featurizer.fit(list(structures))
    self.assertEqual({'Cl', 'C', 'Ni'}, set(featurizer.elements_))

    featurizer.include_elems = ['H']
    featurizer.fit(list(structures))
    self.assertEqual({'H', 'Cl', 'C', 'Ni'}, set(featurizer.elements_))

    # Feature labels
    featurizer.exclude_elems = ()
    featurizer.include_elems = ()
    featurizer.elements_ = ['Al', 'Ni']
    labels = featurizer.feature_labels()
    bin_edges = featurizer._make_bins()
    n_bins = len(bin_edges) - 1

    self.assertEqual(3 * n_bins, len(labels))
    self.assertIn('Al-Ni PRDF r=0.00-0.10', labels)

    # featurize method
    featurizer.elements_ = ['C']
    features = featurizer.featurize(self.diamond)
    prdf = featurizer.compute_prdf(self.diamond)[1]
    self.assertArrayAlmostEqual(features, prdf[('C', 'C')])

    # featurize_dataframe
    df = pd.DataFrame.from_dict({"structure": [self.diamond, self.cscl]})
    featurizer.fit(df["structure"])
    df = featurizer.featurize_dataframe(df, col_id="structure")
    first_cs_cl = df["Cs-Cl PRDF r=0.00-0.10"][0]
    self.assertEqual(first_cs_cl, 0.0)
    far_cl_cl = df["Cl-Cl PRDF r=19.70-19.80"][1]
    self.assertAlmostEqual(far_cl_cl, 0.049, 3)
    last_cl_cl = df["Cl-Cl PRDF r=19.90-20.00"][0]
    self.assertEqual(last_cl_cl, 0.0)

    # Labels and features must be in the same order
    featurizer.elements_ = ['Al', 'Ni']
    features = featurizer.featurize(self.ni3al)
    labels = featurizer.feature_labels()
    prdf = featurizer.compute_prdf(self.ni3al)[1]
    self.assertEqual((n_bins * 3,), features.shape)
    for block, prefix in enumerate(('Al-Al', 'Al-Ni', 'Ni-Ni')):
        self.assertTrue(labels[block * n_bins].startswith(prefix))
    expected_features = np.hstack(
        [prdf[('Al', 'Al')], prdf[('Al', 'Ni')], prdf[('Ni', 'Ni')]])
    self.assertArrayAlmostEqual(features, expected_features)
CIFfiles.append(i) #List of CIF files #Creates a list of pymatgen.structure objects and a name of each structure structlist = [] namelist = [] structs = [] namecolumns = ['structure'] for i in CIFfiles: structlist.append([Structure.from_file(directoryname + i) ]) #Converts CIF to pymatgen structure object namelist.append(os.path.splitext(i)[0]) #Collects all the structure names structs.append(Structure.from_file(directoryname + i)) #Creates Pandas dataframe with data being a list of structures and the row name being the structure name dftest = pd.DataFrame(data=structlist, index=namelist, columns=namecolumns) p = PartialRadialDistributionFunction() p.fit(np.asarray(structs)) c = CoulombMatrix() c.fit(np.asarray(structs)) erdf = ElectronicRadialDistributionFunction() erdf.cutoff = 10 #longest diagonal of lattice...I picked a number semi-arbitrarily #Featurizes the structures featurizer = MultipleFeaturizer([ ElementProperty.from_preset('magpie'), OxidationStates(), AtomicOrbitals(), BandCenter(), ElectronegativityDiff(),