Пример #1
0
def similarity(_parents, target):
    """Score how similar each parent structure is to ``target``.

    Every structure is featurized with the same set of matminer featurizers,
    the feature vectors are scaled with a pre-fitted scaler loaded from disk,
    and the pairwise distance between each parent and the target is mapped
    onto a 0..1 grid (step 0.01) via a pre-computed quantile table.

    Args:
        _parents: Iterable of structures to compare against ``target``.
        target: The reference structure.

    Returns:
        numpy array with one score per parent. Parents whose featurization
        produced at least one null feature get the sentinel value ``-1``.
    """
    featurizer = MultipleFeaturizer([
        SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
        StructuralHeterogeneity(),
        ChemicalOrdering(),
        MaximumPackingEfficiency(),
        SiteStatsFingerprint.from_preset(
            "LocalPropertyDifference_ward-prb-2017"),
        StructureComposition(Stoichiometry()),
        StructureComposition(ElementProperty.from_preset("magpie")),
        StructureComposition(ValenceOrbital(props=["frac"])),
        StructureComposition(IonProperty(fast=True)),
    ])

    # HACK celery doesn't work with multiprocessing (used by matminer)
    try:
        from celery import current_task
        if current_task:
            featurizer.set_n_jobs(1)
    except ImportError:
        # celery is optional; without it there is nothing to work around.
        pass

    labels = featurizer.feature_labels()
    x_target = pd.DataFrame.from_records([featurizer.featurize(target)],
                                         columns=labels)
    x_parent = pd.DataFrame.from_records(
        featurizer.featurize_many(_parents, ignore_errors=True, pbar=False),
        columns=labels,
    )

    # Remember which parents failed (partially) to featurize, then replace
    # the nulls with a large placeholder so the scaler can process the rows.
    nulls = x_parent[x_parent.isnull().any(axis=1)].index.values
    x_parent.fillna(100000, inplace=True)

    # Sort columns alphabetically on both frames so they line up.
    # presumably this is also the column order the scaler was fitted on --
    # TODO confirm.
    x_target = x_target.reindex(sorted(x_target.columns), axis=1)
    x_parent = x_parent.reindex(sorted(x_parent.columns), axis=1)

    # NOTE(review): pickle.load executes arbitrary code from the file; these
    # files must come from a trusted location (settings.rxn_files).
    with open(os.path.join(settings.rxn_files, "scaler2.pickle"), "rb") as f:
        scaler = pickle.load(f)
    with open(os.path.join(settings.rxn_files, "quantiles.pickle"), "rb") as f:
        quantiles = pickle.load(f)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent. The target ends up as the last row of X.
    X = scaler.transform(pd.concat([x_parent, x_target]))

    # Distance of every parent row to the target row (last row of X).
    D = [pairwise_distances(np.array([row, X[-1]]))[0, 1] for row in X[:-1]]

    # Map each distance to the grid point whose quantile value is closest.
    # assumes ``quantiles`` holds one value per grid point (101) -- TODO confirm
    grid = np.linspace(0, 1, 101)
    _res = np.array([grid[np.abs(quantiles - d).argmin()] for d in D])
    _res[nulls] = -1
    return _res
Пример #2
0
    def test_multifeatures(self):
        """MultipleFeaturizer combines a 1D-output and a 2D-output featurizer.

        Checks both direct featurization with multiple arguments and
        featurization of a dataframe with two input columns.
        """
        # Make a test dataset with two input variables
        data = self.make_test_data()
        data['x2'] = [4, 5, 6]

        # Create a second featurizer
        class MultiArgs2(SingleFeaturizerMultiArgs):
            def featurize(self, *x):
                # Making a 2D array to test whether MultipleFeaturizer
                #  can handle featurizers that have both 1D vectors with
                #  singleton dimensions (e.g., shape==(4,1)) and those
                #  without (e.g., shape==(4,))
                return [super(MultiArgs2, self).featurize(*x)]

            def feature_labels(self):
                return ['y2']

        multiargs2 = MultiArgs2()

        # Create featurizer
        multi_f = MultipleFeaturizer([self.multiargs, multiargs2])
        multi_f.set_n_jobs(1)

        # Test featurize with multiple arguments
        features = multi_f.featurize(0, 2)
        self.assertArrayAlmostEqual([2, 2], features)

        # Test dataframe
        data = multi_f.featurize_dataframe(data, ['x', 'x2'])
        # assertEquals is a deprecated alias that was removed in Python 3.12
        self.assertEqual(['y', 'y2'], multi_f.feature_labels())
        self.assertArrayAlmostEqual([[5, 5], [7, 7], [9, 9]],
                                    data[['y', 'y2']])
Пример #3
0
    def test_multifeatures_multiargs(self):
        """Exercise MultipleFeaturizer with two multi-argument featurizers.

        The same checks run once per ``iterate_over_entries`` setting:
        direct featurization, flat dataframe output and multiindex output.
        """
        second = MultiArgs2()
        expected_rows = [[5, 5], [7, 7], [9, 9]]
        input_cols = ['x', 'x2']

        # Cover iteration over entries as well as iteration over featurizers
        for iter_entries in (True, False):
            # Fresh dataset with two input variables for every pass
            frame = self.make_test_data()
            frame['x2'] = [4, 5, 6]

            combined = MultipleFeaturizer(
                [self.multiargs, second],
                iterate_over_entries=iter_entries,
            )

            # Direct call with multiple arguments
            self.assertArrayAlmostEqual([2, 2], combined.featurize(0, 2))

            # Flat dataframe output
            frame = combined.featurize_dataframe(frame, input_cols)
            self.assertEqual(['y', 'y2'], combined.feature_labels())
            self.assertArrayAlmostEqual(expected_rows, frame[['y', 'y2']])

            # Multiindex dataframe output: columns are keyed by
            # (featurizer class name, feature label)
            frame = combined.featurize_dataframe(frame, input_cols,
                                                 multiindex=True)
            self.assertIn(("MultiArgs2", "y2"), frame.columns)
            self.assertIn(("SingleFeaturizerMultiArgs", "y"), frame.columns)
            multi_cols = [
                ("SingleFeaturizerMultiArgs", "y"),
                ("MultiArgs2", "y2"),
            ]
            self.assertArrayAlmostEqual(expected_rows, frame[multi_cols])
def get_structure_properties(structure: Structure, mode: str = 'all') -> dict:
    """Featurize ``structure`` and return the features as a dictionary.

    Args:
        structure: The structure to featurize.
        mode: ``'all'`` selects the full featurizer set; any other value
            selects only the featurizers that do not need a Voronoi
            tessellation.

    Returns:
        Mapping from feature label to feature value, plus a ``'volume'``
        entry taken from ``structure.volume``.
    """
    if mode != 'all':
        # Only the featurizers which do not need a Voronoi tessellation
        components = [
            DensityFeatures(),
            StructureComposition(Stoichiometry()),
            StructureComposition(ElementProperty.from_preset('magpie')),
            StructureComposition(ValenceOrbital(props=['frac'])),
        ]
    else:
        components = [
            SiteStatsFingerprint.from_preset(
                'CoordinationNumber_ward-prb-2017'),
            StructuralHeterogeneity(),
            ChemicalOrdering(),
            DensityFeatures(),
            MaximumPackingEfficiency(),
            SiteStatsFingerprint.from_preset(
                'LocalPropertyDifference_ward-prb-2017'),
            StructureComposition(Stoichiometry()),
            StructureComposition(ElementProperty.from_preset('magpie')),
            StructureComposition(ValenceOrbital(props=['frac'])),
        ]
    featurizer = MultipleFeaturizer(components)

    values = featurizer.featurize(structure)

    # Pair every feature label with its computed value
    properties = dict(zip(featurizer.feature_labels(), values))
    properties['volume'] = structure.volume
    return properties
Пример #5
0
    def test_multifeatures(self):
        """MultipleFeaturizer combines two multi-argument featurizers.

        Checks both direct featurization with multiple arguments and
        featurization of a dataframe with two input columns.
        """
        # Make a test dataset with two input variables
        data = self.make_test_data()
        data['x2'] = [4, 5, 6]

        multiargs2 = MultiArgs2()

        # Create featurizer
        multi_f = MultipleFeaturizer([self.multiargs, multiargs2])

        # Test featurize with multiple arguments
        features = multi_f.featurize(0, 2)
        self.assertArrayAlmostEqual([2, 2], features)

        # Test dataframe
        data = multi_f.featurize_dataframe(data, ['x', 'x2'])
        # assertEquals is a deprecated alias that was removed in Python 3.12
        self.assertEqual(['y', 'y2'], multi_f.feature_labels())
        self.assertArrayAlmostEqual([[5, 5], [7, 7], [9, 9]],
                                    data[['y', 'y2']])
Пример #6
0
    def test_multitype_multifeat(self):
        """Check MultipleFeaturizer output dtypes with a non-numeric feature"""

        # Combine a numeric featurizer with a mixed-type one, single job
        combined = MultipleFeaturizer(
            [SingleFeaturizer(), MultiTypeFeaturizer()])
        combined.set_n_jobs(1)

        # Build the test data and append the feature columns to it
        frame = combined.featurize_dataframe(self.make_test_data(), 'x')

        # The generated columns must carry the expected dtypes
        feature_cols = combined.feature_labels()
        observed_dtypes = frame[feature_cols].dtypes.astype(str).tolist()
        self.assertArrayEqual(['int64', 'object', 'int64'], observed_dtypes)
        self.assertArrayAlmostEqual(frame['y'], [2, 3, 4])
Пример #7
0
    def test_multitype_multifeat(self):
        """Check MultipleFeaturizer output dtypes with a non-numeric feature"""

        expected_dtypes = ['int64', 'object', 'int64']

        # Cover iteration over entries as well as iteration over featurizers
        for iter_entries in (True, False):
            # Combine a numeric featurizer with a mixed-type one, single job
            combined = MultipleFeaturizer(
                [SingleFeaturizer(), MultiTypeFeaturizer()],
                iterate_over_entries=iter_entries,
            )
            combined.set_n_jobs(1)

            # Build the test data and append the feature columns to it
            frame = combined.featurize_dataframe(self.make_test_data(), 'x')

            # The generated columns must carry the expected dtypes
            feature_cols = combined.feature_labels()
            observed_dtypes = frame[feature_cols].dtypes.astype(str).tolist()
            self.assertArrayEqual(expected_dtypes, observed_dtypes)
            self.assertArrayAlmostEqual(frame['y'], [2, 3, 4])
Пример #8
0
       (original_count - len(data), original_count))
 # Keep only the entries whose delta_e lies within [-20, 5], combining the
 # two bound checks with a logical AND
 original_count = len(data)
 data = data[np.logical_and(data['delta_e'] >= -20, data['delta_e'] <= 5)]
 print('Removed %d/%d entries' %
       (original_count - len(data), original_count))
 print(data.head(3))
 # Set up the composition featurizers: stoichiometry, element properties
 # initialised from the Magpie preset, valence-orbital features and ion
 # properties.
 # NOTE(review): the original comment stated that elements are assumed to
 # exist in a single oxidation state, but IonProperty is created with
 # fast=False -- confirm which behaviour is intended.
 feature_calculators = MultipleFeaturizer([
     cf.Stoichiometry(),
     cf.ElementProperty.from_preset("magpie"),
     cf.ValenceOrbital(props=['avg']),
     cf.IonProperty(fast=False)
 ])
 # Get the feature names
 feature_labels = feature_calculators.feature_labels()
 # Compute the features
 data = feature_calculators.featurize_dataframe(data,
                                                col_id='composition_obj')
 print('Generated %d features' % len(feature_labels))
 print('Training set size:',
       'x'.join([str(x) for x in data[feature_labels].shape]))
 # Drop the rows that contain null / missing feature values
 original_count = len(data)
 data = data[~data[feature_labels].isnull().any(axis=1)]
 print('Removed %d/%d entries' %
       (original_count - len(data), original_count))
 print(data.head(3))
 # Use a random forest
 # The "random forest" algorithm works by training many different
 # decision-tree models, each of them trained on a different subset of
 # the dataset.