예제 #1
0
def featurize_structure(df: pd.DataFrame) -> pd.DataFrame:
    """Decorate a DataFrame of structures with matminer structure features.

    Every featurizer in the list below is fitted on the `"structure"`
    column and applied to a copy of the input. The resulting multi-index
    columns are flattened to `"<Featurizer>|<feature>"` names, the radial
    distribution function is expanded into one scalar column per distance
    bin (first 50 bins), and the categorical symmetry features are encoded
    as integers.

    Args:
        df (pandas.DataFrame): the input dataframe with a `"structure"`
            column containing `pymatgen.Structure` objects. It is not
            modified; a copy is featurized.

    Returns:
        pandas.DataFrame: the decorated DataFrame, after being passed
            through `clean_df`.

    """

    logging.info("Applying structure featurizers...")

    df = df.copy()

    structure_features = [
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        BagofBonds(),
    ]

    featurizer = MultipleFeaturizer(
        [feature.fit(df["structure"]) for feature in structure_features]
    )

    df = featurizer.featurize_dataframe(df, "structure", multiindex=True, ignore_errors=True)
    # Flatten the (featurizer, feature) column tuples into single strings.
    df.columns = df.columns.map('|'.join).str.strip('|')

    rdf_column = "RadialDistributionFunction|radial distribution function"

    # Take the distance grid from the first row *by position*: a label lookup
    # (`[0]`) fails when the caller's index does not contain the label 0.
    # NOTE(review): assumes every row shares the same distance grid — confirm.
    dist = df[rdf_column].iloc[0]['distances'][:50]
    for i, d in enumerate(dist):
        _rdf_key = "{}|d_{:.2f}".format(rdf_column, d)
        # `i=i` pins the current bin index to this lambda.
        df[_rdf_key] = df[rdf_column].apply(lambda x, i=i: x['distribution'][i])

    df = df.drop(rdf_column, axis=1)

    # Integer encoding of the crystal system. The correctly spelled
    # "orthorhombic" is what the symmetry analysis reports; the historical
    # misspelling is kept as an alias so no previously handled value regresses.
    _crystal_system = {
        "cubic": 1, "tetragonal": 2, "orthorhombic": 3, "orthorombic": 3,
        "hexagonal": 4, "trigonal": 5, "monoclinic": 6, "triclinic": 7
    }

    df["GlobalSymmetryFeatures|crystal_system"] = df["GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
    df["GlobalSymmetryFeatures|is_centrosymmetric"] = df["GlobalSymmetryFeatures|is_centrosymmetric"].map(int)

    return clean_df(df)
예제 #2
0
    def test_bob(self):
        """Check BagofBonds fitting, featurization, padding and error cases.

        Covers three scenarios: a single-structure fit with an unflattened
        sine Coulomb matrix, a multi-structure fit followed by dataframe
        featurization (features absent from a structure are expected to be
        0.0), and the exceptions raised for unfitted or incompatible fits.
        """

        # Test a single fit and featurization
        scm = SineCoulombMatrix(flatten=False)
        bob = BagofBonds(coulomb_matrix=scm, token=' - ')
        bob.fit([self.ni3al])
        truth1 = [
            235.74041833262768, 1486.4464890775491, 1486.4464890775491,
            1486.4464890775491, 38.69353092306119, 38.69353092306119,
            38.69353092306119, 38.69353092306119, 38.69353092306119,
            38.69353092306119, 83.33991275736257, 83.33991275736257,
            83.33991275736257, 83.33991275736257, 83.33991275736257,
            83.33991275736257
        ]
        truth1_labels = [
            'Al site #0', 'Ni site #0', 'Ni site #1', 'Ni site #2',
            'Al - Ni bond #0', 'Al - Ni bond #1', 'Al - Ni bond #2',
            'Al - Ni bond #3', 'Al - Ni bond #4', 'Al - Ni bond #5',
            'Ni - Ni bond #0', 'Ni - Ni bond #1', 'Ni - Ni bond #2',
            'Ni - Ni bond #3', 'Ni - Ni bond #4', 'Ni - Ni bond #5'
        ]
        # assertAlmostEqual on two sequences only succeeds on exact equality
        # (it subtracts its arguments otherwise, which fails for lists), so
        # compare the floats one by one with the usual tolerance instead.
        features = bob.featurize(self.ni3al)
        self.assertEqual(len(features), len(truth1))
        for computed, expected in zip(features, truth1):
            self.assertAlmostEqual(computed, expected)
        self.assertEqual(bob.feature_labels(), truth1_labels)

        # Test padding from fitting and dataframe featurization
        bob.coulomb_matrix = CoulombMatrix(flatten=False)
        bob.fit([self.ni3al, self.cscl, self.diamond_no_oxi])
        df = pd.DataFrame({'structures': [self.cscl]})
        df = bob.featurize_dataframe(df, 'structures')
        self.assertEqual(len(df.columns.values), 25)
        self.assertAlmostEqual(df['Cs site #0'][0], 7513.468312122532)
        self.assertAlmostEqual(df['Al site #0'][0], 0.0)
        self.assertAlmostEqual(df['Cs - Cl bond #1'][0], 135.74726437398044)
        self.assertAlmostEqual(df['Al - Ni bond #0'][0], 0.0)

        # Test error handling for bad fits or null fits
        bob = BagofBonds(CoulombMatrix(flatten=False))
        self.assertRaises(NotFittedError, bob.featurize, self.nacl)
        bob.fit([self.ni3al, self.diamond])
        self.assertRaises(ValueError, bob.featurize, self.nacl)
예제 #3
0
    def test_bob(self):
        """Check BagofBonds fitting, featurization, padding and error cases.

        Same scenarios as a standard BagofBonds test, here relying on the
        default-constructed Coulomb matrix featurizers: a single-structure
        fit, a multi-structure fit followed by dataframe featurization
        (features absent from a structure are expected to be 0.0), and the
        exceptions raised for unfitted or incompatible fits.
        """

        # Test a single fit and featurization
        bob = BagofBonds(coulomb_matrix=SineCoulombMatrix(), token=' - ')
        bob.fit([self.ni3al])
        truth1 = [235.74041833262768, 1486.4464890775491, 1486.4464890775491,
                  1486.4464890775491, 38.69353092306119, 38.69353092306119,
                  38.69353092306119, 38.69353092306119, 38.69353092306119,
                  38.69353092306119, 83.33991275736257, 83.33991275736257,
                  83.33991275736257, 83.33991275736257, 83.33991275736257,
                  83.33991275736257]
        truth1_labels = ['Al site #0', 'Ni site #0', 'Ni site #1', 'Ni site #2',
                         'Al - Ni bond #0', 'Al - Ni bond #1',
                         'Al - Ni bond #2', 'Al - Ni bond #3',
                         'Al - Ni bond #4', 'Al - Ni bond #5',
                         'Ni - Ni bond #0', 'Ni - Ni bond #1',
                         'Ni - Ni bond #2', 'Ni - Ni bond #3',
                         'Ni - Ni bond #4', 'Ni - Ni bond #5']
        # assertAlmostEqual on two sequences only succeeds on exact equality
        # (it subtracts its arguments otherwise, which fails for lists), so
        # compare the floats one by one with the usual tolerance instead.
        features = bob.featurize(self.ni3al)
        self.assertEqual(len(features), len(truth1))
        for computed, expected in zip(features, truth1):
            self.assertAlmostEqual(computed, expected)
        self.assertEqual(bob.feature_labels(), truth1_labels)

        # Test padding from fitting and dataframe featurization
        bob.coulomb_matrix = CoulombMatrix()
        bob.fit([self.ni3al, self.cscl, self.diamond_no_oxi])
        df = pd.DataFrame({'structures': [self.cscl]})
        df = bob.featurize_dataframe(df, 'structures')
        self.assertEqual(len(df.columns.values), 25)
        self.assertAlmostEqual(df['Cs site #0'][0], 7513.468312122532)
        self.assertAlmostEqual(df['Al site #0'][0], 0.0)
        self.assertAlmostEqual(df['Cs - Cl bond #1'][0], 135.74726437398044)
        self.assertAlmostEqual(df['Al - Ni bond #0'][0], 0.0)

        # Test error handling for bad fits or null fits
        bob = BagofBonds()
        self.assertRaises(NotFittedError, bob.featurize, self.nacl)
        bob.fit([self.ni3al, self.diamond])
        self.assertRaises(ValueError, bob.featurize, self.nacl)