Пример #1
0
    def has_polymorphs(self):
        """Report whether the task's raw data contains polymorphs.

        Each entry of ``self.df`` is converted to a composition (created
        with ``reduce=True``). The data counts as containing polymorphs
        when at least two entries yield the same composition.

        Returns:
            (bool) True if any composition occurs more than once.

        Raises:
            ValueError: If ``self.metadata.input_type`` is neither
                "composition" nor "structure".
        """
        self._check_is_loaded()
        target_col = "pmg_composition"
        input_type = self.metadata.input_type

        # Pick the converter that matches the kind of input column.
        if input_type == "composition":
            converter = StrToComposition(target_col_id=target_col,
                                         reduce=True)
        elif input_type == "structure":
            converter = StructureToComposition(target_col_id=target_col,
                                               reduce=True)
        else:
            raise ValueError(
                "Cannot check for polymorphs without input type in "
                "(structure, composition)!")

        # The input column is named after the input type itself.
        converted = converter.featurize_dataframe(self.df, input_type)
        compositions = converted[target_col].values

        # Duplicates shrink the set, so differing sizes mean repeats exist.
        return len(set(compositions)) != len(compositions)
Пример #2
0
def tran_feat_composition(
    df,
    var_formula="FORMULA",
    preset_name="magpie",
    append=True,
    ignore_errors=True,
    **kwargs,
):
    r"""Featurize chemical formulae using matminer

    Parse the formula strings of one column into compositions, then compute
    ElementProperty features for them using a matminer preset.

    Args:
        df (DataFrame): Data to featurize
        var_formula (string): Column in df with chemical formula; formula
            given as string
        preset_name (string): Matminer featurization preset
        append (bool): Append results to original columns?
        ignore_errors (bool): Do not throw an error while parsing formulae; set to
            True to return NaN's for invalid formulae. Passed to both
            featurization steps.

    Kwargs:
        Any additional keyword arguments are forwarded to the
        ElementProperty ``featurize_dataframe`` call.

    Returns:
        DataFrame: The computed features, without the formula and composition
        columns; concatenated column-wise onto df when append is True.

    Notes:
        - A pre-processor and wrapper for matminer.featurizers.composition

    References:
        Ward, L., Dunn, A., Faghaninia, A., Zimmermann, N. E. R., Bajaj, S., Wang, Q., Montoya, J. H., Chen, J., Bystrom, K., Dylla, M., Chard, K., Asta, M., Persson, K., Snyder, G. J., Foster, I., Jain, A., Matminer: An open source toolkit for materials data mining. Comput. Mater. Sci. 152, 60-69 (2018).

    Examples:
        >>> import grama as gr
        >>> from grama.tran import tf_feat_composition
        >>> (
        >>>     gr.df_make(FORMULA=["C6H12O6"])
        >>>     >> gr.tf_feat_composition()
        >>> )

    """
    ## Build the element-property featurizer
    featurizer = ElementProperty.from_preset(preset_name=preset_name)

    ## Parse the formula strings into compositions
    df_formula = df[[var_formula]]
    df_comp = StrToComposition().featurize_dataframe(
        df_formula,
        var_formula,
        ignore_errors=ignore_errors,
    )

    ## Compute features, keeping only the newly generated columns
    df_feat = featurizer.featurize_dataframe(
        df_comp,
        col_id="composition",
        ignore_errors=ignore_errors,
        **kwargs,
    )
    df_feat = df_feat.drop(columns=[var_formula, "composition"])

    ## Return features alone, or alongside the original data
    if not append:
        return df_feat
    return concat((df, df_feat), axis=1)
Пример #3
0
 def __init__(self, pbar=False):
     """Set up the random-forest regressor and the composition featurizers.

     Args:
         pbar: Stored on the instance as ``self.pbar``.
     """
     self.pbar = pbar
     self.regressor = RandomForestRegressor(n_estimators=500, n_jobs=-1, verbose=3)
     self.stc = StrToComposition()
     # Magpie element properties plus element fractions, as one featurizer.
     self.featurizer = MultipleFeaturizer(
         [ElementProperty.from_preset("magpie"), ElementFraction()])
Пример #4
0
    def __init__(self):
        """Create the combined composition featurizer and the formula parser."""
        composition_featurizers = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(composition_featurizers)

        # Converts formula strings into composition objects.
        self.str2composition = StrToComposition()
Пример #5
0
def generate(fake_df, ignore_errors=False):
    """Build composition features for a single input value.

    Args:
        fake_df: Value placed into a one-row ``full_formula`` column
            (presumably one chemical formula string -- confirm with callers).
        ignore_errors (bool): Forwarded to both ``featurize_dataframe`` calls.

    Returns:
        pandas.DataFrame: The featurized frame, including the ``composition``
        column and an ``NComp`` column holding ``len(composition)``.
    """
    formula_frame = pd.DataFrame(np.array([fake_df]))
    formula_frame.columns = ['full_formula']

    # Parse the formula; rows with missing values are dropped afterwards.
    with_composition = StrToComposition().featurize_dataframe(
        formula_frame, "full_formula", ignore_errors=ignore_errors).dropna()

    # `feature_calculators` is a module-level featurizer defined elsewhere.
    featurized = feature_calculators.featurize_dataframe(
        with_composition, col_id='composition', ignore_errors=ignore_errors)
    featurized["NComp"] = featurized["composition"].apply(len)
    return featurized
Пример #6
0
    def test_str_to_composition(self):
        """Check StrToComposition with default settings and with reduce=True."""
        frame = DataFrame(data={'comp_str': ["Fe2", "MnO2"]})

        # Default conversion writes to the "composition" column.
        frame = StrToComposition().featurize_dataframe(frame, 'comp_str')
        expected_full = [Composition("Fe2"), Composition("MnO2")]
        self.assertEqual(frame["composition"].tolist(), expected_full)

        # Reduced conversion into a custom target column.
        reducer = StrToComposition(reduce=True,
                                   target_col_id='composition_red')
        frame = reducer.featurize_dataframe(frame, 'comp_str')
        expected_reduced = [Composition("Fe"), Composition("MnO2")]
        self.assertEqual(frame["composition_red"].tolist(), expected_reduced)
def composition_featurizer(df_input: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """Return a DataFrame with all compositional features.

    Args:
        df_input: Frame with a "Compound" column of formula strings.
        **kwargs: Forwarded to the CompositionToOxidComposition constructor.

    Returns:
        The featurized frame, restricted to rows whose
        "minimum oxidation state" is not 0.
    """
    # Parse the "Compound" strings into a "composition" column.
    featurized = StrToComposition().featurize_dataframe(df_input,
                                                        col_id="Compound")

    # Features based on elemental properties (added in place).
    magpie = ElementProperty.from_preset(preset_name="magpie")
    magpie.featurize_dataframe(featurized, col_id="composition", inplace=True)

    # Add the "composition_oxid" column from guessed oxidation states;
    # errors from non-integer stoichiometries are ignored.
    oxid_converter = CompositionToOxidComposition(
        return_original_on_error=True, **kwargs)
    oxid_converter.featurize_dataframe(featurized,
                                       "composition",
                                       ignore_errors=True,
                                       inplace=True)

    # Correct the oxidation states with the helper defined elsewhere.
    featurized = correct_comp_oxid(featurized)

    # Features based on oxidation states (added in place).
    OxidationStates().featurize_dataframe(featurized,
                                          "composition_oxid",
                                          ignore_errors=True,
                                          inplace=True)

    # Remove compounds with predicted oxidation states of 0.
    nonzero_min_state = featurized["minimum oxidation state"] != 0
    return featurized[nonzero_min_state]
Пример #8
0
class FeatureGenerator:
    """
        A wrapper class that generates several types of elemental features
    """
    def __init__(self):
        composition_featurizers = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(composition_featurizers)

        # Converts formula strings into composition objects.
        self.str2composition = StrToComposition()

    def generate(self, df: pd.DataFrame, ignore_errors: bool = False):
        """
            Generate features from a dataframe whose "formula" column holds
            the chemical formulas of the compositions.

            df : a dataframe with a column named "formula"
            ignore_errors : forwarded to both featurization steps

            Returns the featurized dataframe with an extra "NComp" column
            holding len() of each composition. Rows containing missing
            values after formula parsing are dropped.
        """
        with_composition = self.str2composition.featurize_dataframe(
            df, "formula", ignore_errors=ignore_errors).dropna()
        features = self.feature_calculators.featurize_dataframe(
            with_composition, col_id='composition',
            ignore_errors=ignore_errors)
        features["NComp"] = features["composition"].apply(len)
        return features
Пример #9
0
    def test_conversion_overwrite(self):
        """Check how StrToComposition handles a target equal to the source column."""
        df = DataFrame(data={'comp_str': ["Fe2", "MnO2"]})

        # Without overwrite permission both modes must refuse to clobber
        # the source column.
        guarded = StrToComposition(target_col_id='comp_str',
                                   overwrite_data=False)
        for inplace in (True, False):
            with self.assertRaises(ValueError):
                guarded.featurize_dataframe(df, 'comp_str', inplace=inplace)

        # With overwrite permission the column is replaced, not duplicated.
        overwriting = StrToComposition(target_col_id='comp_str',
                                       overwrite_data=True)

        in_place_result = df.copy()
        overwriting.featurize_dataframe(in_place_result, 'comp_str',
                                        inplace=True)
        self.assertListEqual(in_place_result.columns.tolist(), ["comp_str"])

        copied_result = overwriting.featurize_dataframe(df, 'comp_str',
                                                        inplace=False)
        self.assertListEqual(copied_result.columns.tolist(), ["comp_str"])
def test_featurizers():
    """Run the composition and oxidation-state featurizers on ``test.csv``.

    Reads ``test.csv`` (first column as index), featurizes its ``formula``
    column, prints previews of the intermediate frames and the
    ElementProperty citations, and writes the result to ``after_test.csv``.
    """
    frame = pd.read_csv('test.csv', index_col=[0])
    frame = StrToComposition().featurize_dataframe(frame, 'formula')
    print(frame.head())

    # Next, apply a featurizer that adds a series of feature descriptors;
    # the "composition" column is the featurizer input.
    magpie = ElementProperty.from_preset(preset_name='magpie')
    frame = magpie.featurize_dataframe(frame, col_id='composition')
    print(frame.head())
    print(magpie.citations())
    # df.to_csv('将composition特征化后.csv')

    # Now bring in further featurizers: oxidation-state related features.
    to_oxid = CompositionToOxidComposition()
    frame = to_oxid.featurize_dataframe(frame, 'composition')
    oxidation = OxidationStates()
    frame = oxidation.featurize_dataframe(frame, col_id='composition_oxid')
    print(frame.head())
    frame.to_csv('after_test.csv')
Пример #11
0
def generate_data(name):
    """Build the full feature table for the materials listed in ``name``.

    Per the original author's note, the output adds ``gaps``, ``is_daoti``
    and the remaining features (145 in total) to the input table.

    Side effects: reads ``gaps.csv`` from the working directory, writes
    ``general_look_jie.csv`` and ``2d_vector_plus.csv``, and prints progress.

    Args:
        name: Path of the input CSV. Its first column is used as the index
            and it must contain a ``full_formula`` column.
    """
    # name='test_plus_gaps.csv'
    df = pd.read_csv(name, index_col=[0])
    df['gaps'] = -10.0
    df_gap = pd.read_csv("gaps.csv", index_col=[0])
    print(df_gap.index)

    # Single forward scan: a row of `df` receives a gap only when its id
    # equals 'mp-' + the current gap id, after which the scan moves on to
    # the next `df` row.
    i = 0
    n_rows = len(df.index)
    for j in range(len(df_gap.index)):
        if i >= n_rows:
            # Every row is matched already; indexing df.index[i] would
            # raise IndexError.
            break
        if 'mp-' + str(df_gap.index[j]) == df.index[i]:
            df.iloc[i, -1] = df_gap.iloc[j, 0]
            i += 1
    print("合并完毕")

    # In the same way, build the classification label: 1 when the value in
    # the second-to-last column (the gap) is 0, otherwise 0.
    # `.iloc` replaces `DataFrame.ix`, which was removed in pandas 1.0; the
    # original integer keys were positional, so behaviour is unchanged.
    df['is_daoti'] = -2
    for i in range(len(df.index)):
        df.iloc[i, -1] = 1 if df.iloc[i, -2] == 0 else 0
    print("分类feature建立完成")

    # First use describe() to get an overall picture of the data.
    print(df.describe())
    df.describe().to_csv('general_look_jie.csv')

    # Convert the formulas to compositions and featurize them; the
    # "composition" column is the featurizer input.
    df = StrToComposition().featurize_dataframe(df, 'full_formula',
                                                ignore_errors=True)
    print(df.head())
    ep_feat = ElementProperty.from_preset(preset_name='magpie')
    df = ep_feat.featurize_dataframe(df, col_id='composition',
                                     ignore_errors=True)
    print(df.head())

    # Add the oxidation-state related features.
    df = CompositionToOxidComposition().featurize_dataframe(
        df, col_id='composition')
    os_feat = OxidationStates()
    df = os_feat.featurize_dataframe(df, col_id='composition_oxid')
    new_name = '2d_vector_plus.csv'
    df.to_csv(new_name)
Пример #12
0
class FeatureGenerator:
    """
        A wrapper class that generates several types of elemental features
    """
    def __init__(self):
        composition_featurizers = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(composition_featurizers)

        # Converts formula strings into composition objects.
        self.str2composition = StrToComposition()

    def generate(self,
                 df: pd.DataFrame,
                 ignore_errors: bool = False,
                 drop_mode=True):
        """
            Generate features from a dataframe whose "formula" column holds
            the chemical formulas of the compositions.

            df : a dataframe with a column named "formula"
            ignore_errors : ignore errors when generating features
            drop_mode : drop the columns whose name starts with "Magpie"
                and contains "mode"

            Returns the featurized dataframe with an extra "NComp" column
            holding len() of each composition. Rows containing missing
            values after formula parsing are dropped.
        """
        with_composition = self.str2composition.featurize_dataframe(
            df, "formula", ignore_errors=ignore_errors).dropna()
        features = self.feature_calculators.featurize_dataframe(
            with_composition, col_id='composition',
            ignore_errors=ignore_errors)
        features["NComp"] = features["composition"].apply(len)

        if not drop_mode:
            return features

        mode_columns = [
            col for col in features.columns
            if "mode" in col and col.startswith("Magpie")
        ]
        return features.drop(columns=mode_columns)
Пример #13
0
class RFEstimator(BaseTesterEstimator):
    """Random-forest regressor fed with composition features of the inputs."""

    def __init__(self, pbar=False):
        """Create the regressor and featurizers; ``pbar`` is passed to ``featurize_many``."""
        self.pbar = pbar
        self.regressor = RandomForestRegressor(n_estimators=500, n_jobs=-1, verbose=3)
        self.stc = StrToComposition()
        # Magpie element properties plus element fractions, as one featurizer.
        self.featurizer = MultipleFeaturizer(
            [ElementProperty.from_preset("magpie"), ElementFraction()])

    def _generate_features(self, x):
        """Convert the entries of ``x`` to compositions and featurize them.

        Returns:
            The featurizer output as a numpy array.
        """
        converted = self.stc.featurize_many(x, pbar=self.pbar)
        # Each conversion result is a sequence; its first item is kept.
        compositions = [row[0] for row in converted]
        feature_rows = self.featurizer.featurize_many(compositions, pbar=self.pbar)
        return np.asarray(feature_rows)

    def fit(self, x, y):
        """Fit the regressor on the features generated from ``x`` and targets ``y``."""
        self.regressor.fit(self._generate_features(x), y)

    def predict(self, x):
        """Return the regressor's predictions for the features generated from ``x``."""
        return self.regressor.predict(self._generate_features(x))
Пример #14
0
import matminer
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
from matminer.utils.io import store_dataframe_as_json
from matminer.utils.io import load_dataframe_from_json
from matminer.figrecipes.plot import PlotlyFig
'''
#Block 1 - Loading and filtering the experimental dataframe
'''
df = load_dataframe_from_json('data/Batteries_raw.json')

# Select the working ion among {Li, Al, Zr, Mg}
select = 'Li'

# Initial filter based on the selected element: parse the 'Ion' column into
# compositions and keep the rows whose ion consists solely of `select`.
from matminer.featurizers.conversions import StrToComposition
fdf = StrToComposition().featurize_dataframe(df, 'Ion')

select_at = fdf["composition"].apply(lambda x: x.get_atomic_fraction(select))
fdf = fdf[select_at == 1]

# Debug
# `describe` has to be called; formatting the bound method itself printed
# its repr instead of the summary statistics.
print("Remaining samples: {}".format(fdf.describe()))
# Drop the ion composition so the column name is free for the next step.
fdf = fdf.drop(['composition'], axis=1)

## Initial conversion to matminer objects
from matminer.featurizers.conversions import StrToComposition
fdf = StrToComposition().featurize_dataframe(fdf, 'Reduced Formula')

from matminer.featurizers.conversions import CompositionToOxidComposition
fdf = CompositionToOxidComposition().featurize_dataframe(fdf, 'composition')
Пример #15
0
import numpy as np
import pandas as pd
import pickle
'''
#Block 1 - Loading dataframe
'''
# arbitrary inputs - Li must be excluded to ensure consistency
# Each entry is an [Id, Reduced Formula] pair.
data = [['mp-1025496', 'Nb1 Se2'], ['mp-977563', 'Nb1 Ir2'],
        ['mp-864631', 'Nb1 Rh2'], ['mp-3368', 'Nb3 O8']]

fdf = pd.DataFrame(data, columns=['Id', 'Reduced Formula'])

## Initial conversion to matminer objects
from matminer.featurizers.conversions import StrToComposition

# Parse the formula strings; the result is read back from the
# 'composition' column below.
fdf = StrToComposition().featurize_dataframe(fdf, 'Reduced Formula')

from matminer.featurizers.conversions import CompositionToOxidComposition

# Convert the parsed compositions with CompositionToOxidComposition.
fdf = CompositionToOxidComposition().featurize_dataframe(fdf, 'composition')

print("The initial dataset has {}".format(fdf.shape))
print(fdf.head())
'''
Block 2 - Featurization
'''
#
# -- start F1
from matminer.featurizers.composition import ElementProperty

# ElementProperty featurizer configured from the 'magpie' preset.
ep_feat = ElementProperty.from_preset(preset_name='magpie')
    def __init__(self, filepath, dataset, init_samples):
        """Load the raw data and build (or reload) the initial sample set.

        For the ``'bandgap'`` dataset the initial set is cached in
        ``./ALSearch_init_<init_samples>.csv``. When that file is missing it
        is built by sampling rows, discarding compositions whose largest
        per-element amount exceeds 20, and appending a 128-dimensional
        latent vector (columns ``V0`` .. ``V127``) per formula. The prepared
        frame is then de-duplicated, stripped of rows with missing values
        and sorted by the label column into ``self.sorted_df``.

        Args:
            filepath: CSV path providing the columns ``material_id``,
                ``pretty_formula`` and ``band_gap``.
            dataset: Dataset name; only ``'bandgap'`` triggers building or
                reloading the cached initial dataset.
            init_samples: Number of rows to sample; also part of the cache
                file name.
        """
        self.filepath = filepath
        self.df = pd.read_csv(
            self.filepath,
            usecols=['material_id', 'pretty_formula', 'band_gap'])
        self.dataset = dataset
        self.init_samples = init_samples
        self.init_filename = './ALSearch_init_' + str(init_samples) + '.csv'
        # Compare by value: `is` tests object identity and only matched when
        # the caller's string happened to be the same interned object.
        if dataset == 'bandgap':
            if not os.path.exists(self.init_filename):
                # small examples for debugging
                self.df = self.df.sample(n=self.init_samples,
                                         replace=True,
                                         random_state=42)
                latent_columns = ['V' + str(i) for i in range(128)]

                # create composition column
                df_comp = StrToComposition(
                    target_col_id='composition').featurize_dataframe(
                        self.df, 'pretty_formula')

                # largest per-element amount of every composition string
                max_atom_num = []
                for comp_str in df_comp['composition'].astype(str):
                    # keep only the digits of each whitespace-separated token
                    amounts = [
                        int(re.sub(r"\D", "", token))
                        for token in comp_str.split()
                    ]
                    max_atom_num.append(max(amounts))

                # remove rows whose max atom number is above 20
                self.df['max_atom_num'] = max_atom_num
                self.df = self.df[self.df['max_atom_num'] < 21]
                self.df = self.df.drop(['max_atom_num'], axis=1)

                # convert formula to latent vector
                data = []
                for formula in self.df['pretty_formula']:
                    print(formula)
                    onehot_matrix = formula2onehot_matrix(formula, l=20)
                    lat_vec = get_latent_space(onehot_matrix)
                    data.append(lat_vec.tolist()[0])
                    print(formula + 'has been converted into latent vector~')

                # align both frames on a fresh 0..n-1 index before joining
                df_added = pd.DataFrame(data, columns=latent_columns)
                self.df = self.df.reset_index(drop=True)
                df_added = df_added.reset_index(drop=True)
                self.df = pd.concat([self.df, df_added], axis=1)

                # rename columns to eliminate ' '; assigning `columns` works
                # on every pandas version, unlike `set_axis(inplace=...)`
                # whose `inplace` argument was removed in pandas 2.0.
                self.df.columns = ['id', 'composition', 'Eg'] + latent_columns

                self.df.to_csv(self.init_filename, index=False, header=True)

            else:
                self.df = pd.read_csv(self.init_filename)
        print('The shape of initial dataset is ' + str(self.df.shape))
        self.label = ['Eg']

        # drop duplicate values (rows identical in every non-label column)
        feature_columns = [
            col for col in self.df.columns if col not in self.label
        ]
        self.df = self.df.drop_duplicates(subset=feature_columns,
                                          keep='first')
        print('The shape of init dataset after dropping duplicates is ' +
              str(self.df.shape))

        self.df = self.df.dropna()

        # sort dataframe by y value
        self.sorted_df = self.df.sort_values(by=self.label)
# Load the elastic-tensor dataset and show which columns it provides.
df = load_elastic_tensor()
print(df.columns)
"""
Index(['material_id', 'formula', 'nsites', 'space_group', 'volume',
       'structure', 'elastic_anisotropy', 'G_Reuss', 'G_VRH', 'G_Voigt',
       'K_Reuss', 'K_VRH', 'K_Voigt', 'poisson_ratio', 'compliance_tensor',
       'elastic_tensor', 'elastic_tensor_original'],
      dtype='object')
"""
# Columns removed before featurization.
unwanted_columns = ["volume", "nsites", "compliance_tensor", "elastic_tensor",
                    "elastic_tensor_original", "K_Voigt", "G_Voigt", "K_Reuss", "G_Reuss"]
df = df.drop(unwanted_columns, axis=1)

from matminer.featurizers.conversions import StrToComposition

# Parse the 'formula' strings; the 'composition' column is used below.
df = StrToComposition().featurize_dataframe(df, 'formula')

from matminer.featurizers.composition import ElementProperty

# Element-property features from the "magpie" preset.
ep_feat = ElementProperty.from_preset(preset_name="magpie")
df = ep_feat.featurize_dataframe(df, col_id='composition')

from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates

# Convert the compositions; the 'composition_oxid' column is used below.
df = CompositionToOxidComposition().featurize_dataframe(df, "composition")

# Oxidation-state features computed from the 'composition_oxid' column.
os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, "composition_oxid")

from matminer.featurizers.structure import DensityFeatures
# `build_entry` is defined elsewhere; its result is used as a mapping here
# (only `.keys()` is read) -- TODO confirm what the keys represent.
onehot = build_entry()
print(onehot)
onehot_l = list(onehot.keys())
print(onehot_l)

filepath = './Utils/bandgap-magpie.csv'
df = pd.read_csv(filepath)
#df = df.sample(frac=0.001, replace=True, random_state=1)
print('The shape of current dataset is ' + str(df.shape))

# Column names V0 .. V127.
added_columns_name = []
for i in range(128):
    added_columns_name.append('V' + str(i))
data = []
# create composition column
df_comp = StrToComposition(target_col_id='composition').featurize_dataframe(
    df, 'pretty_formula')
# create column with maximum atom number
max_atom_num = []
for st in df_comp[['composition']].astype(str).values:
    atom_list = []
    #print(st[0])
    # st is a one-element row; st[0] is the composition as a string
    s = st[0]
    for item in s.split():
        # keep only the digits of each whitespace-separated token
        num = re.sub(r"\D", "", item)
        atom_list.append(int(num))
    #print(atom_list)
    max_atom_num.append(max(atom_list))

# update dataframe with max_atom_num
df['max_atom_num'] = max_atom_num
# remove rows whose max atom number above 20
Пример #19
0
    def test_conversion_multiindex(self):
        """Check StrToComposition on flat and two-level column indexes."""
        raw = {'comp_str': ["Fe2", "MnO2"]}
        expected = [Composition("Fe2"), Composition("MnO2")]
        source_col = ("custom", "comp_str")

        def two_level_frame():
            # Fresh frame whose columns sit under a "custom" top level.
            frame = DataFrame(data=raw)
            frame.columns = MultiIndex.from_product((["custom"],
                                                     frame.columns.values))
            return frame

        # Flat input: the output lands under the featurizer's own top level.
        flat = DataFrame(data=raw)
        flat = StrToComposition().featurize_dataframe(
            flat, 'comp_str', multiindex=True)
        self.assertEqual(flat[("StrToComposition", "composition")].tolist(),
                         expected)

        # Two-level input with the default target column.
        nested = StrToComposition().featurize_dataframe(
            two_level_frame(), source_col, multiindex=True)
        self.assertEqual(nested[("StrToComposition", "composition")].tolist(),
                         expected)

        # Two-level input with a custom target column name.
        named = StrToComposition(target_col_id='test')
        nested = named.featurize_dataframe(
            two_level_frame(), source_col, multiindex=True)
        self.assertEqual(nested[("StrToComposition", "test")].tolist(),
                         expected)

        # if two level multiindex provided as target, it should be written there
        # here we test converting multiindex in place
        overwriting = StrToComposition(target_col_id=None, overwrite_data=True)
        nested = overwriting.featurize_dataframe(
            two_level_frame(), source_col, multiindex=True, inplace=False)
        self.assertEqual(nested[source_col].tolist(), expected)

        # Try inplace multiindex conversion with return errors
        overwriting = StrToComposition(target_col_id=None, overwrite_data=True)
        nested = overwriting.featurize_dataframe(
            two_level_frame(), source_col, multiindex=True,
            return_errors=True, ignore_errors=True)

        error_col = ("custom", "StrToComposition Exceptions")
        self.assertTrue(all(nested[error_col].isnull()))
Пример #20
0
    # Fetch the data
    query_string = 'mdf.source_name:oqmd AND (oqmd.configuration:static OR ' \
                   'oqmd.configuration:standard) AND dft.converged:True'
    if quick_demo:
        query_string += " AND mdf.scroll_id:<10000"

    data = mdf.get_data(query_string, unwind_arrays=False)
    print(data.head())
    # Rename, preprocess and filter; delta_e is presumably the formation
    # energy (per the original author's note -- confirm).
    data = data[['oqmd.delta_e.value', 'material.composition']]
    data = data.rename(columns={
        'oqmd.delta_e.value': 'delta_e',
        'material.composition': 'composition'
    })
    # Parse the composition strings into a separate 'composition_obj' column.
    data = StrToComposition(
        target_col_id='composition_obj').featurize_dataframe(
            data, 'composition')
    data.sort_values('delta_e', ascending=True, inplace=True)
    print(data.head(3))
    for k in ['delta_e']:
        data[k] = pd.to_numeric(data[k])

    # Drop the rows without a delta_e value and report how many were removed.
    original_count = len(data)
    data = data[~data['delta_e'].isnull()]
    print('Removed %d/%d entries' %
          (original_count - len(data), original_count))

    original_count = len(data)
    # Replace each composition string by the reduced formula of its object.
    data['composition'] = data['composition_obj'].apply(
        lambda x: x.reduced_formula)
    data.sort_values('delta_e', ascending=True, inplace=True)
def generate_data():
    """Featurize the elastic-tensor dataset step by step.

    Loads the dataset, drops unneeded columns and successively adds
    composition, element-property, oxidation-state and density features.
    Every stage is printed and written to its own CSV file in the working
    directory.
    """
    frame = load_elastic_tensor()
    frame.to_csv('原始elastic数据.csv')
    print(frame.columns)

    # Columns that are not needed for the featurization below.
    frame = frame.drop(
        [
            'volume', 'nsites', 'compliance_tensor', 'elastic_tensor',
            'elastic_tensor_original', 'K_Voigt', 'G_Voigt', 'K_Reuss',
            'G_Reuss'
        ],
        axis=1)
    print(frame.head())
    frame.to_csv('扔掉不需要的部分.csv')

    # First use describe() to get an overall picture of the data.
    print(frame.describe())
    frame.describe().to_csv('general_look.csv')

    # Inspection showed nothing abnormal, so parse the formulas next.
    frame = StrToComposition().featurize_dataframe(frame, 'formula')
    print(frame.head())
    frame.to_csv('引入composition.csv')

    # Next, apply a featurizer that adds a series of feature descriptors;
    # the "composition" column is the featurizer input.
    magpie = ElementProperty.from_preset(preset_name='magpie')
    frame = magpie.featurize_dataframe(frame, col_id='composition')
    print(frame.head())
    print(magpie.citations())
    frame.to_csv('将composition特征化后.csv')

    # Now bring in further featurizers: oxidation-state related features.
    to_oxid = CompositionToOxidComposition()
    frame = to_oxid.featurize_dataframe(frame, 'composition')
    oxidation = OxidationStates()
    frame = oxidation.featurize_dataframe(frame, col_id='composition_oxid')
    print(frame.head())
    frame.to_csv('引入氧化态之后.csv')

    # Besides composition-based features there are many others, for example
    # structure-based ones.
    density = DensityFeatures()
    frame = density.featurize_dataframe(frame, 'structure')
    print(frame.head())
    frame.to_csv('引入结构中的密度.csv')
    print(density.feature_labels())
Пример #22
0
from matminer.featurizers.conversions import StrToComposition
from tqdm import tqdm

import pandas as pd

# pd.set_option('display.height', 1000)
# Widen the pandas display limits for the prints below.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

df = load_dataset("glass_ternary_landolt")

# Keep only the formula (renamed to "composition") and the "gfa" column.
df = df.rename(columns={"formula": "composition"})
df = df[["composition", "gfa"]]

# Normalize every composition string to its reduced formula by going through
# a temporary composition-object column.
df = StrToComposition(target_col_id="composition_obj").featurize_dataframe(
    df, "composition")
df["composition"] = [c.reduced_formula for c in df["composition_obj"]]
df = df.drop(columns=["composition_obj"])

# print("Ground truth")
# print(df[df["composition"]=="ZrTi9"])  # should be False in final dataframe also!!
# print(df[df["composition"]=="ZrVCo8"]) # should be True in final dataframe also!
# print(df["gfa"].value_counts())    # proportion is about 5000 GFA 2054 no GFA
# raise ValueError

# Compare the number of rows with the number of distinct compositions.
unique = df["composition"].unique()
print(len(df))
print(len(unique))

# Accumulators; the code that fills them lies beyond this excerpt.
problem_compositions = []
new_df_dict = {"composition": [], "gfa": []}
    df['elastic_tensor_original'][i] = np.array(df['elastic_tensor_original'][i]['data'])

"""
['_id', 'material_id', 'formula', 'nsites', 'space_group', 'volume',
       'structure', 'elastic_anisotropy', 'G_Reuss', 'G_VRH', 'G_Voigt',
       'K_Reuss', 'K_VRH', 'K_Voigt', 'poisson_ratio', 'compliance_tensor',
       'elastic_tensor', 'elastic_tensor_original', 'cif', 'kpoint_density',
       'poscar']
"""
# Columns removed before featurization.
unwanted_columns = ['_id', 'material_id', 'nsites', 'volume',
                    'cif', 'kpoint_density', 'poscar']
df = df.drop(unwanted_columns, axis=1)

from matminer.featurizers.conversions import StrToComposition

# Parse the 'formula' strings; the 'composition' column is used below.
sc_feat = StrToComposition()
df = sc_feat.featurize_dataframe(df, col_id='formula')

from matminer.featurizers.composition import ElementProperty

# Element-property features from the 'magpie' preset.
ep_feat = ElementProperty.from_preset(preset_name='magpie')
df = ep_feat.featurize_dataframe(df, col_id='composition')

from matminer.featurizers.conversions import CompositionToOxidComposition

# Convert the parsed compositions with CompositionToOxidComposition.
co_feat = CompositionToOxidComposition()
df = co_feat.featurize_dataframe(df, col_id='composition')

from matminer.featurizers.composition import OxidationStates

# Oxidation-state featurizer; its application lies beyond this excerpt.
os_feat = OxidationStates()
def AddFeatures(df):  # Add features by Matminer
    """Append matminer composition and structure features to ``df``.

    Args:
        df: DataFrame with a "formula" column and a "structure" column.

    Returns:
        The DataFrame returned by the final featurizer, containing all
        added feature columns.
    """
    from matminer.featurizers.conversions import StrToComposition
    from matminer.featurizers.conversions import CompositionToOxidComposition
    from matminer.featurizers.composition import (
        BandCenter,
        CohesiveEnergy,
        ElectronAffinity,
        ElementProperty,
        Miedema,
        OxidationStates,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )

    # Parse the formulas, then feed the "composition" column to the
    # element-property featurizer.
    df = StrToComposition().featurize_dataframe(df, "formula")
    magpie = ElementProperty.from_preset(preset_name="magpie")
    df = magpie.featurize_dataframe(df, col_id="composition")

    # Oxidation-state decorated compositions and their features.
    df = CompositionToOxidComposition().featurize_dataframe(df, "composition")
    df = OxidationStates().featurize_dataframe(df, "composition_oxid")

    # Featurizers applied to "composition_oxid" with errors ignored; each
    # one is instantiated right before it is used, in this order.
    oxid_factories = (
        ElectronAffinity,
        BandCenter,
        CohesiveEnergy,
        Miedema,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    for make_featurizer in oxid_factories:
        df = make_featurizer().featurize_dataframe(df,
                                                   "composition_oxid",
                                                   ignore_errors=True)

    # This is the border between compositional features and structural features. Comment out the following featurizers to use only compositional features.
    from matminer.featurizers.structure import (
        ChemicalOrdering,
        DensityFeatures,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        MinimumRelativeDistances,
        SiteStatsFingerprint,
        StructuralComplexity,
        StructuralHeterogeneity,
    )
    from matminer.featurizers.site import (
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        CoordinationNumber,
    )
    from pymatgen.analysis.local_env import CrystalNN

    # Featurizers applied to "structure" with errors ignored. Site-level
    # featurizers are wrapped in SiteStatsFingerprint; the lambdas defer
    # construction until the featurizer's turn.
    structure_factories = (
        GlobalSymmetryFeatures,
        StructuralComplexity,
        ChemicalOrdering,
        MaximumPackingEfficiency,
        MinimumRelativeDistances,
        StructuralHeterogeneity,
        lambda: SiteStatsFingerprint(
            AverageBondLength(CrystalNN(search_cutoff=20))),
        lambda: SiteStatsFingerprint(
            AverageBondAngle(CrystalNN(search_cutoff=20))),
        lambda: SiteStatsFingerprint(BondOrientationalParameter()),
        lambda: SiteStatsFingerprint(CoordinationNumber()),
        DensityFeatures,
    )
    for make_featurizer in structure_factories:
        df = make_featurizer().featurize_dataframe(df,
                                                   "structure",
                                                   ignore_errors=True)

    return df
# df = mpdr.get_dataframe({"elasticity": {"$exists": True}, "elasticity.warnings": []},
                        # ['pretty_formula', 'elasticity.K_VRH', 'elasticity.G_VRH']) 
# Query every entry whose elasticity.K_VRH is not None and fetch the listed
# properties.
criteria = {'elasticity.K_VRH': {'$ne': None}}
properties = ['pretty_formula', 'spacegroup.symbol', 'elasticity.K_VRH', 'elasticity.G_VRH','formation_energy_per_atom', 'band_gap',
              'e_above_hull', 'density', 'volume', 'nsites']
df = mpr.get_dataframe(criteria=criteria, properties=properties)
# Merge with a local CSV (on the columns the two frames share), then index
# the result by material_id.
df1=pd.read_csv(r'D:\FYP_files\database\data_after_processing\huizong\huizong.csv')
df=df.reset_index()
df=pd.merge(df,df1)
df=df.set_index("material_id")
# Keep rows with positive K_VRH and e_above_hull below 0.1.
df = df[df['elasticity.K_VRH'] > 0]
df = df[df['e_above_hull'] < 0.1]  
# Volume per site.
df['vpa'] = df['volume']/df['nsites']        
# Poisson ratio computed as (3K - 2G) / (6K + 2G) from K_VRH and G_VRH.
df['poisson_ratio']=df[["elasticity.K_VRH","elasticity.G_VRH"]].apply(lambda x:(3*x["elasticity.K_VRH"]-2*x["elasticity.G_VRH"])/(6*x["elasticity.K_VRH"]+2*x["elasticity.G_VRH"]),axis=1)
from matminer.featurizers.conversions import StrToComposition
df = StrToComposition().featurize_dataframe(df, "pretty_formula")
from matminer.featurizers.composition import ElementProperty
ep_feat = ElementProperty.from_preset(preset_name="magpie")
df = ep_feat.featurize_dataframe(df, col_id="composition")  # input the "composition" column to the featurizer
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates
df = CompositionToOxidComposition().featurize_dataframe(df, "composition")
os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, "composition_oxid")
# Second ElementProperty pass: mean and standard deviation of the listed
# descriptors, using PymatgenData as the data source.
dataset = PymatgenData()
descriptors = ['row', 'group', 'atomic_mass',
               'atomic_radius', 'boiling_point', 'melting_point', 'X']
stats = ["mean", "std_dev"]
ep = ElementProperty(data_source=dataset, features=descriptors, stats=stats)
df = ep.featurize_dataframe(df, "composition")
#Remove NaN values
Пример #26
0
# Turn every elasticity record into the list of its values.
values_list.extend(list(entry.values()) for entry in data_2['elasticity'])

# Build one list of values per tensor property; the position of a property
# in tensor_list selects the matching slot of every record.
for prop in tensor_list:
    position = tensor_list.index(prop)
    prop_value = [record[position] for record in values_list]
    new_cols_val.append(prop_value)

# Attach each collected list to data_2 under its property name.
for prop_name in tensor_list:
    column_position = tensor_list.index(prop_name)
    data_2[prop_name] = new_cols_val[column_position]

# prepare for featurization
from matminer.featurizers.conversions import StrToComposition
formula_converter = StrToComposition()
data_3 = formula_converter.featurize_dataframe(data_2, "pretty_formula")
#data_3.columns

# In[9]:

# Store the intermediate dataset before training data and targets are defined
import numpy as np
np.savez_compressed("heusler_all.npz", data=data_3)

# In[ ]:

# Featurization
# This part follows the matminer examples
from matminer.featurizers.composition import ElementProperty

ep_feat = ElementProperty.from_preset(preset_name="magpie")
    def __init__(self):
        """Load the band-gap/Magpie table and append latent formula vectors.

        Steps, in order:

        1. Read ``./Utils/bandgap-magpie.csv`` and drop rows that repeat a
           ``pretty_formula`` (first occurrence kept).
        2. Draw a sample of the rows (``frac=0.0001`` with replacement,
           fixed seed).
        3. Keep only formulas whose largest per-element atom count is
           below 9.
        4. Convert every remaining formula to a latent vector with
           ``formula2onehot_matrix`` / ``get_latent_space`` and append it as
           columns ``V0`` .. ``V127``.
        5. Drop ``material_id`` and ``max_atom_num``, rename all columns
           positionally and write ``bandgap_df_new_114.csv``.

        The resulting table is kept on ``self.df``.
        """
        self.filepath = './Utils/bandgap-magpie.csv'
        self.df = pd.read_csv(self.filepath)
        # drop duplicate values
        print('The shape of whole dataset before dropping duplicates is ' +
              str(self.df.shape))

        self.df = self.df.drop_duplicates(subset=['pretty_formula'],
                                          keep='first')
        print('The shape of whole dataset after dropping duplicates is ' +
              str(self.df.shape))

        # NOTE(review): frac=0.0001 with replace=True keeps only a tiny sample
        # and can repeat rows -- looks like a debugging setting; confirm.
        self.df = self.df.sample(frac=0.0001, replace=True, random_state=1)
        added_columns_name = ['V' + str(i) for i in range(128)]

        # create composition column
        df_comp = StrToComposition(
            target_col_id='composition').featurize_dataframe(
                self.df, 'pretty_formula')

        # Largest atom count per formula, parsed from the string form of the
        # composition: every whitespace-separated token is reduced to its
        # digits.
        # NOTE(review): a fractional amount such as "0.5" would be read as 5
        # by this digit-only parsing -- assumes integer amounts; confirm.
        max_atom_num = []
        for comp_text in df_comp['composition'].astype(str):
            atom_counts = [
                int(re.sub(r"\D", "", token)) for token in comp_text.split()
            ]
            max_atom_num.append(max(atom_counts))

        # update dataframe with max_atom_num
        self.df['max_atom_num'] = max_atom_num
        # remove rows whose max atom number above 8
        self.df = self.df[self.df['max_atom_num'] < 9]

        # convert formula to latent vector
        data = []
        for formula in self.df['pretty_formula']:
            print(formula)
            onehot_matrix = formula2onehot_matrix(formula, l=8)
            lat_vec = get_latent_space(onehot_matrix)
            data.append(lat_vec.tolist()[0])
            print(formula + 'has been converted into latent vector~')

        # Both frames need a plain 0..n-1 index so concat pairs rows by
        # position; df_added is freshly built and already has one.
        df_added = pd.DataFrame(data, columns=added_columns_name)
        self.df = pd.concat([self.df.reset_index(drop=True), df_added],
                            axis=1)

        # helper columns that must not end up in the exported table
        self.df = self.df.drop(['material_id', 'max_atom_num'], axis=1)

        # rename columns to eliminate ' ': the names are generated in the
        # order formula, band gap, six statistics for each Magpie property,
        # then the latent-vector columns.
        magpie_properties = [
            'Number', 'MendeleevNumber', 'AtomicWeight', 'MeltingT',
            'Column', 'Row', 'CovalentRadius', 'Electronegativity',
            'NsValence', 'NpValence', 'NdValence', 'NfValence', 'NValence',
            'NsUnfilled', 'NpUnfilled', 'NdUnfilled', 'NfUnfilled',
            'NUnfilled', 'GSvolume_pa', 'GSbandgap', 'GSmagmom',
            'SpaceGroupNumber',
        ]
        magpie_stats = [
            'minimum', 'maximum', 'range', 'mean', 'avg_dev', 'mode',
        ]
        column_rename = ['pretty_formula', 'band_gap']
        column_rename += [
            'MagpieData_{}_{}'.format(stat, prop)
            for prop in magpie_properties
            for stat in magpie_stats
        ]
        column_rename += added_columns_name

        # Direct assignment instead of set_axis(..., inplace=False): the
        # `inplace` keyword was removed from set_axis in pandas 2.0. A length
        # mismatch still raises ValueError.
        self.df.columns = column_rename

        self.df.to_csv(r'bandgap_df_new_114.csv', index=False, header=True)
Пример #28
0
    def _tidy_column(self, df, featurizer_type):
        """
        Bring one featurizer input column into the form featurizers expect.

        The type of the column is judged from its first entry only.
        Composition columns given as strings or dicts are converted to
        Composition objects and, if ``self.guess_oxistates`` is set,
        decorated with guessed oxidation states. Any other column is
        converted from dicts to objects; structure columns converted this way
        are additionally decorated with oxidation states when
        ``self.guess_oxistates`` is set.

        Args:
            df (pandas.DataFrame)
            featurizer_type: The key defining the featurizer input, i.e. the
                name of the column to tidy. For example, composition
                featurizers should have featurizer_type of "composition".

        Returns:
            df (pandas.DataFrame): DataFrame with featurizer_type column
                ready for featurization.

        Raises:
            ValueError: If a non-composition column holds strings.
        """
        # todo: Make the following conversions more robust (no [0] type checking)
        first_entry = df[featurizer_type].iloc[0]

        if featurizer_type != self.composition_col:
            # Structure/bs/dos columns: only dict (or str) payloads need work.
            if not isinstance(first_entry, (dict, str)):
                return df

            self.logger.info(
                self._log_prefix.capitalize() +
                "{} detected as string or dict. Attempting conversion to {} "
                "objects...".format(featurizer_type, featurizer_type))
            if isinstance(first_entry, str):
                raise ValueError(
                    "{} column is type {}. Cannot convert.".format(
                        featurizer_type, type(first_entry)))

            dict_converter = DictToObject(overwrite_data=True,
                                          target_col_id=featurizer_type)
            df = dict_converter.featurize_dataframe(df,
                                                    featurizer_type,
                                                    inplace=False)

            # Oxidation states are only guessed for the structure column.
            if not (featurizer_type == self.structure_col
                    and self.guess_oxistates):
                return df

            self.logger.info(
                self._log_prefix +
                "Guessing oxidation states of structures if they were not "
                "present in input.")
            structure_decorator = StructureToOxidStructure(
                target_col_id=featurizer_type,
                overwrite_data=True,
                return_original_on_error=True,
                max_sites=-50)
            try:
                df = structure_decorator.featurize_dataframe(
                    df,
                    featurizer_type,
                    multiindex=self.multiindex,
                    inplace=False)
            except Exception as e:
                # Best effort: the undecorated structures are kept.
                self.logger.info(
                    self._log_prefix +
                    "Could not decorate oxidation states on structures due "
                    "to {}.".format(e))
            return df

        # Composition column: first make sure it holds Composition objects.
        if isinstance(first_entry, str):
            self.logger.info(
                self._log_prefix +
                "Compositions detected as strings. Attempting conversion to "
                "Composition objects...")
            string_converter = StrToComposition(overwrite_data=True,
                                                target_col_id=featurizer_type)
            df = string_converter.featurize_dataframe(
                df,
                featurizer_type,
                multiindex=self.multiindex,
                ignore_errors=True,
                inplace=False)
        elif isinstance(first_entry, dict):
            self.logger.info(
                self._log_prefix +
                "Compositions detected as dicts. Attempting conversion to "
                "Composition objects...")
            df[featurizer_type] = [
                Composition.from_dict(d) for d in df[featurizer_type]
            ]

        if not self.guess_oxistates:
            return df

        # Convert non-oxidstate containing comps to oxidstate comps
        self.logger.info(
            self._log_prefix +
            "Guessing oxidation states of compositions, as they were not "
            "present in input.")
        composition_decorator = CompositionToOxidComposition(
            target_col_id=featurizer_type,
            overwrite_data=True,
            return_original_on_error=True,
            max_sites=-50)
        try:
            df = composition_decorator.featurize_dataframe(
                df,
                featurizer_type,
                multiindex=self.multiindex,
                inplace=False)
        except Exception as e:
            self.logger.info(
                self._log_prefix +
                "Could not decorate oxidation states due to {}. Excluding "
                "featurizers based on composition oxistates".format(e))
            # Featurizers that need oxidation states cannot run now, so
            # their class names are added to the exclusion list.
            classes_require_oxi = [
                c.__class__.__name__
                for c in CompositionFeaturizers().need_oxi
            ]
            self.exclude.extend(classes_require_oxi)
        return df
Пример #29
0
from matminer.featurizers import composition as cf
from matminer.featurizers.conversions import StrToComposition
import numpy as np
import pandas as pd
import csv
import os
import itertools
from pymatgen import Composition
from pymatgen.core.periodic_table import Element

# Read in dataset
filepath = "pifs.csv"
glass_data = pd.read_csv(filepath)
# Make the compositions of the glasses data into pymatgen objects to match
# the data from OQMD. With ignore_errors=True a formula that cannot be parsed
# does not raise; its "composition" entry is left as a NaN placeholder.
comps = StrToComposition().featurize_dataframe(
    glass_data, "formula", ignore_errors=True)["composition"]

# For every entry, record the element with the largest amount in its
# composition (the first one wins on ties).
majority = []
for c in comps:
    print(c)
    # NaN placeholders have no .items(); they are recorded as "" so that
    # `majority` keeps exactly one entry per row of glass_data instead of
    # crashing with AttributeError.
    elements = c.items() if hasattr(c, "items") else ()
    main_element, max_comp = max(elements,
                                 key=lambda pair: pair[1],
                                 default=("", -1))
    majority.append(str(main_element))
Пример #30
0
        json.loads(
            urlopen("http://aflowlib.duke.edu/search/API/?" + MATCHBOOK +
                    ",$paging(0)").read().decode("utf-8")))['compound']

# Every per-system table below gets one row per substrate.
n_substrates = len(substrate)
matrix = pd.DataFrame([metal] * n_substrates)

# Trange is a '-'-separated string; its first three numbers are used as
# start, stop and step of the temperature grid.
Tsplit = list(map(float, Trange.split('-')))
if Tsplit[2] != 0:
    Tlist = np.arange(*Tsplit[:3]).tolist()

# System table: the metal repeated next to each substrate, plus the first
# temperature of the range.
sys_cond_0 = pd.concat([matrix, substrate], axis=1)
sys_cond_0['Temp'] = pd.DataFrame([Tsplit[0]] * n_substrates)
sys_cond_0.columns = ['Metal', 'Substrate', 'Temp']

# Featurize the metal once, then repeat its feature row for every substrate.
metal_matminer = pd.DataFrame([metal], columns=['Metal'])
metal_to_comp = StrToComposition(target_col_id='Me_comp')
metal_matminer = metal_to_comp.featurize_dataframe(metal_matminer, 'Metal')
data_Me = magpie.featurize_dataframe(
    metal_matminer, col_id="Me_comp", ignore_errors=True)
metal_features = pd.DataFrame(
    data_Me.values.tolist() * n_substrates, columns=data_Me.columns)

# Keep the columns whose name contains 'mean', drop the NfUnfilled one and
# tag the rest as metal features.
feature_Me = metal_features.filter(like='mean').drop(
    columns=['MagpieData mean NfUnfilled'])
feature_Me.columns = ['Me_' + name for name in feature_Me.columns]

# Same featurization for the substrates.
substrate_to_comp = StrToComposition(target_col_id='Sub_comp')
sys_cond_0 = substrate_to_comp.featurize_dataframe(sys_cond_0, 'Substrate')
data_Sub = magpie.featurize_dataframe(
    sys_cond_0, col_id="Sub_comp", ignore_errors=True)
feature_Sub = data_Sub.filter(like='mean')