def composition_featurizer(df_input: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """Build the composition-based feature table for ``df_input``.

    The formulas in the ``"Compound"`` column are turned into a
    ``"composition"`` column, featurized with the "magpie"
    ``ElementProperty`` preset, converted to a ``"composition_oxid"`` column,
    passed through ``correct_comp_oxid`` and finally featurized with
    ``OxidationStates``.

    Args:
        df_input: Frame holding a ``"Compound"`` column.
        **kwargs: Forwarded to the ``CompositionToOxidComposition``
            constructor.

    Returns:
        The featurized frame restricted to the rows whose
        ``"minimum oxidation state"`` value is not 0.
    """
    # "Compound" -> "composition"
    featurized = StrToComposition().featurize_dataframe(
        df_input, col_id="Compound"
    )

    # Elemental-property features, appended in place.
    magpie = ElementProperty.from_preset(preset_name="magpie")
    magpie.featurize_dataframe(featurized, col_id="composition", inplace=True)

    # "composition" -> "composition_oxid" (guessed oxidation states).
    # Errors are ignored; per the original author these come from
    # non-integer stoichiometries.
    oxid_converter = CompositionToOxidComposition(
        return_original_on_error=True, **kwargs
    )
    oxid_converter.featurize_dataframe(
        featurized, "composition", ignore_errors=True, inplace=True
    )

    # Post-process the guessed oxidation states.
    featurized = correct_comp_oxid(featurized)

    # Oxidation-state features, appended in place.
    oxidation_stats = OxidationStates()
    oxidation_stats.featurize_dataframe(
        featurized, "composition_oxid", ignore_errors=True, inplace=True
    )

    # Drop the rows whose minimum oxidation state is 0.
    nonzero_minimum = featurized["minimum oxidation state"] != 0
    return featurized[nonzero_minimum]
def generate_data(name):
    """Merge band gaps into a dataset, label it, featurize it and save it.

    Steps, in order:

    1. Read the CSV ``name`` (first column as index) and ``gaps.csv``.
    2. Add a ``gaps`` column (default ``-10.0``) filled from ``gaps.csv``.
    3. Add an ``is_daoti`` column: 1 where ``gaps == 0``, otherwise 0
       (presumably "is conductor" -- confirm with the data owner).
    4. Write ``df.describe()`` to ``general_look_jie.csv``.
    5. Run the matminer composition and oxidation-state featurizers.
    6. Write the final frame to ``2d_vector_plus.csv``.

    The original author's note states that the resulting matrix carries 145
    features in total; that count is not checked here.

    Args:
        name: Path of the input CSV. Its index is compared against
            ``'mp-' + <gaps.csv index>`` and it must provide a
            ``full_formula`` column.

    Returns:
        None. The results are written to disk.
    """
    df = pd.read_csv(name, index_col=[0])
    df['gaps'] = -10.0  # placeholder kept by rows with no match in gaps.csv
    df_gap = pd.read_csv("gaps.csv", index_col=[0])
    print(df_gap.index)

    # Single forward pass over gaps.csv: a gap is taken only when its id
    # equals the id of the current row of df, after which df advances by one
    # row. NOTE(review): this relies on both files listing materials in the
    # same relative order -- confirm against the data.
    gaps_col = df.columns.get_loc('gaps')
    n_rows = len(df.index)
    i = 0
    for j, gap_id in enumerate(df_gap.index):
        if i >= n_rows:
            # Every row of df is matched; stop instead of indexing past the
            # end of df.index.
            break
        if 'mp-' + str(gap_id) == df.index[i]:
            df.iloc[i, gaps_col] = df_gap.iloc[j, 0]
            i += 1
    print("合并完毕")

    # Classification label: 1 where the gap is exactly 0, otherwise 0.
    # Vectorized replacement for the former per-row DataFrame.ix loop
    # (DataFrame.ix no longer exists in pandas >= 1.0).
    df['is_daoti'] = (df['gaps'] == 0).astype('int64')
    print("分类feature建立完成")

    # Overall look at the data before featurization.
    summary = df.describe()
    print(summary)
    summary.to_csv('general_look_jie.csv')

    # Formula string -> composition object.
    df = StrToComposition().featurize_dataframe(
        df, 'full_formula', ignore_errors=True)
    print(df.head())

    # Elemental-property features computed from the "composition" column.
    ep_feat = ElementProperty.from_preset(preset_name='magpie')
    df = ep_feat.featurize_dataframe(
        df, col_id='composition', ignore_errors=True)
    print(df.head())

    # Oxidation-state related features.
    df = CompositionToOxidComposition().featurize_dataframe(
        df, col_id='composition')
    os_feat = OxidationStates()
    df = os_feat.featurize_dataframe(df, col_id='composition_oxid')

    new_name = '2d_vector_plus.csv'
    df.to_csv(new_name)
def test_featurizers():
    """Run the composition and oxidation-state featurizers over ``test.csv``.

    Reads ``test.csv`` (first column as index), featurizes its ``formula``
    column step by step while printing previews, and writes the final frame
    to ``after_test.csv``. Returns None.
    """
    frame = pd.read_csv('test.csv', index_col=[0])

    # Formula string -> "composition" column.
    frame = StrToComposition().featurize_dataframe(frame, 'formula')
    print(frame.head())

    # Next, one featurizer adds a series of feature columns, using the
    # "composition" column as its input.
    magpie = ElementProperty.from_preset(preset_name='magpie')
    frame = magpie.featurize_dataframe(frame, col_id='composition')
    print(frame.head())
    print(magpie.citations())

    # Bring in the oxidation-state related features.
    oxid_converter = CompositionToOxidComposition()
    frame = oxid_converter.featurize_dataframe(frame, 'composition')
    oxidation_stats = OxidationStates()
    frame = oxidation_stats.featurize_dataframe(
        frame, col_id='composition_oxid'
    )
    print(frame.head())

    frame.to_csv('after_test.csv')
def generate_data():
    """Featurize the elastic-tensor dataset, saving a CSV after each stage.

    Loads the data with ``load_elastic_tensor()``, drops a fixed set of
    columns, then adds composition, elemental-property, oxidation-state and
    density features. Each intermediate frame is printed (head only) and
    written to its own CSV file. Returns None.
    """
    data = load_elastic_tensor()
    data.to_csv('原始elastic数据.csv')
    print(data.columns)

    dropped = [
        'volume', 'nsites', 'compliance_tensor', 'elastic_tensor',
        'elastic_tensor_original', 'K_Voigt', 'G_Voigt', 'K_Reuss', 'G_Reuss',
    ]
    data = data.drop(columns=dropped)
    print(data.head())
    data.to_csv('扔掉不需要的部分.csv')

    # Start with describe() for an overall grasp of the data; the original
    # author found nothing unusual in it.
    print(data.describe())
    data.describe().to_csv('general_look.csv')

    # Formula string -> "composition" column.
    data = StrToComposition().featurize_dataframe(data, 'formula')
    print(data.head())
    data.to_csv('引入composition.csv')

    # Next, one featurizer adds a series of feature columns, using the
    # "composition" column as its input.
    magpie = ElementProperty.from_preset(preset_name='magpie')
    data = magpie.featurize_dataframe(data, col_id='composition')
    print(data.head())
    print(magpie.citations())
    data.to_csv('将composition特征化后.csv')

    # Bring in the oxidation-state related features.
    oxid_converter = CompositionToOxidComposition()
    data = oxid_converter.featurize_dataframe(data, 'composition')
    oxidation_stats = OxidationStates()
    data = oxidation_stats.featurize_dataframe(
        data, col_id='composition_oxid'
    )
    print(data.head())
    data.to_csv('引入氧化态之后.csv')

    # Besides composition-based features there are others, e.g. features
    # derived from the structure.
    density = DensityFeatures()
    data = density.featurize_dataframe(data, 'structure')
    print(data.head())
    data.to_csv('引入结构中的密度.csv')
    print(density.feature_labels())
def add_cs_features(df, rdf_flag=False):
    """Add composition- and structure-based feature columns to ``df``.

    Args:
        df: Frame with ``"pretty_formula"`` and ``"structure"`` columns. The
            ``"composition"``, ``"composition_oxid"`` and ``"structure"``
            columns are assigned on this frame before featurization.
        rdf_flag: When true, radial-distribution-function features
            (cutoff 15.0, bin size 0.2) are added as well.

    Returns:
        The frame returned by the last featurizer that ran.
    """
    # Derived input columns for the featurizers below.
    compositions = str_to_composition(df["pretty_formula"])
    df["composition"] = compositions
    df["composition_oxid"] = composition_to_oxidcomposition(compositions)
    df["structure"] = dict_to_object(df["structure"])

    # (featurizer factory, input column), applied in this order. Each
    # featurizer is created right before it is used.
    steps = [
        (ValenceOrbital, "composition"),
        (OxidationStates, "composition_oxid"),
        # structure features
        (DensityFeatures, "structure"),
    ]
    if rdf_flag:
        steps.append((
            lambda: RadialDistributionFunction(cutoff=15.0, bin_size=0.2),
            "structure",
        ))

    for make_featurizer, column in steps:
        df = make_featurizer().featurize_dataframe(df, column)
    return df
# Featurization
# This part follows the matminer examples.

# Elemental-property ("magpie" preset) features from the composition column.
from matminer.featurizers.composition import ElementProperty

ep_feat = ElementProperty.from_preset(preset_name="magpie")
data_3 = ep_feat.featurize_dataframe(data_3, col_id="composition")

# Oxidation-state features: first derive "composition_oxid", then featurize.
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates

data_3 = CompositionToOxidComposition().featurize_dataframe(
    data_3, col_id="composition"
)
os_feat = OxidationStates()
data_3 = os_feat.featurize_dataframe(data_3, col_id="composition_oxid")

# Density features from the structure column.
from matminer.featurizers.structure import DensityFeatures

df_feat = DensityFeatures()
data_3 = df_feat.featurize_dataframe(data_3, col_id="structure")

# Columns removed to form data_4.
unwanted_columns = [
    "elasticity",
    "material_id",
    "nsites",
    "compliance_tensor",
    "elastic_tensor",
    "elastic_tensor_original",
    "K_Voigt",
    "G_Voigt",
    "K_Reuss",
    "G_Reuss",
    "warnings",
]
data_4 = data_3.drop(columns=unwanted_columns)

# In[ ]:
# Collect a shortened name for every column containing 'MagpieData':
# everything after the first 'MagpieData ' (an empty string when that exact
# prefix-with-space is absent).
for col in fdf.columns:
    if 'MagpieData' not in col:
        continue
    new_col = col.split('MagpieData ', 1)[1:]
    fin_col = ''.join(new_col)
    new_cols.append(fin_col)

# Pair the names in magpie_cols with the shortened names and rename.
cols_dict = dict(zip(magpie_cols, new_cols))
fdf = fdf.rename(columns=cols_dict)
# -- end F1

# -- start F3 --
from matminer.featurizers.composition import OxidationStates

os_feat = OxidationStates()
fdf = os_feat.featurize_dataframe(
    fdf, col_id='composition_oxid', ignore_errors=True
)
# -- end F3

# -- start F4 --
from matminer.featurizers.composition import AtomicOrbitals

ao_feat = AtomicOrbitals()
fdf = ao_feat.featurize_dataframe(
    fdf, col_id='composition', ignore_errors=True
)
# -- end F4

# -- start F5
from matminer.featurizers.composition import BandCenter

bce_feat = BandCenter()
# Formula string -> "composition" column.
from matminer.featurizers.conversions import StrToComposition

df = StrToComposition().featurize_dataframe(df, col_id='formula')

# Elemental-property ("magpie" preset) features.
from matminer.featurizers.composition import ElementProperty

ep_feat = ElementProperty.from_preset(preset_name="magpie")
df = ep_feat.featurize_dataframe(df, col_id='composition')

# Oxidation-state features: derive "composition_oxid", then featurize it.
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates

df = CompositionToOxidComposition().featurize_dataframe(
    df, col_id="composition"
)
os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, col_id="composition_oxid")

# Density features from the structure column.
from matminer.featurizers.structure import DensityFeatures

df_feat = DensityFeatures()
df = df_feat.featurize_dataframe(df, col_id='structure')

# Target is K_VRH; X is every column not listed in `excluded`.
y = df['K_VRH'].values
excluded = [
    "G_VRH",
    "K_VRH",
    "elastic_anisotropy",
    "formula",
    "material_id",
    "poisson_ratio",
    "structure",
    "composition",
    "composition_oxid",
]
X = df.drop(columns=excluded)
print("There are {} possible descriptors:\n\n{}".format(X.shape[1], X.columns.values))

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
def AddFeatures(df):
    """Append matminer composition and structure features to ``df``.

    Args:
        df: Frame with a ``"formula"`` column (formula strings) and a
            ``"structure"`` column used by the structural featurizers.

    Returns:
        The frame returned by the last featurizer, carrying the added
        ``"composition"`` / ``"composition_oxid"`` columns and all feature
        columns.
    """
    from matminer.featurizers.conversions import (
        CompositionToOxidComposition,
        StrToComposition,
    )
    from matminer.featurizers.composition import (
        BandCenter,
        CohesiveEnergy,
        ElectronAffinity,
        ElementProperty,
        Miedema,
        OxidationStates,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        ChemicalOrdering,
        DensityFeatures,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        MinimumRelativeDistances,
        SiteStatsFingerprint,
        StructuralComplexity,
        StructuralHeterogeneity,
    )
    from matminer.featurizers.site import (
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        CoordinationNumber,
    )
    from pymatgen.analysis.local_env import CrystalNN

    # Base columns and features: these four run without ignore_errors.
    df = StrToComposition().featurize_dataframe(df, "formula")
    magpie = ElementProperty.from_preset(preset_name="magpie")
    df = magpie.featurize_dataframe(df, col_id="composition")
    df = CompositionToOxidComposition().featurize_dataframe(df, "composition")
    df = OxidationStates().featurize_dataframe(df, "composition_oxid")

    # Compositional featurizers, all fed from "composition_oxid".
    composition_steps = (
        ElectronAffinity,
        BandCenter,
        CohesiveEnergy,
        Miedema,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )

    # This is the border between compositional features and structural
    # features. Remove `structure_steps` from the loop below to use only
    # compositional features.
    structure_steps = (
        GlobalSymmetryFeatures,
        StructuralComplexity,
        ChemicalOrdering,
        MaximumPackingEfficiency,
        MinimumRelativeDistances,
        StructuralHeterogeneity,
        lambda: SiteStatsFingerprint(
            AverageBondLength(CrystalNN(search_cutoff=20))),
        lambda: SiteStatsFingerprint(
            AverageBondAngle(CrystalNN(search_cutoff=20))),
        lambda: SiteStatsFingerprint(BondOrientationalParameter()),
        lambda: SiteStatsFingerprint(CoordinationNumber()),
        DensityFeatures,
    )

    # Each featurizer is created right before it is applied; failures on
    # individual rows are ignored for all of these steps.
    plan = [(make, "composition_oxid") for make in composition_steps]
    plan += [(make, "structure") for make in structure_steps]
    for make_featurizer, column in plan:
        df = make_featurizer().featurize_dataframe(
            df, column, ignore_errors=True
        )
    return df
# sc_feat is created before this point; it featurizes the "formula" column.
df = sc_feat.featurize_dataframe(df, 'formula')

# Elemental-property ("magpie" preset) features.
from matminer.featurizers.composition import ElementProperty

ep_feat = ElementProperty.from_preset(preset_name='magpie')
df = ep_feat.featurize_dataframe(df, 'composition')

# "composition" -> "composition_oxid".
from matminer.featurizers.conversions import CompositionToOxidComposition

co_feat = CompositionToOxidComposition()
df = co_feat.featurize_dataframe(df, 'composition')

# Oxidation-state features.
from matminer.featurizers.composition import OxidationStates

os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, 'composition_oxid')

# Density features from the structure column.
from matminer.featurizers.structure import DensityFeatures

df_feat = DensityFeatures()
df = df_feat.featurize_dataframe(df, 'structure')

# The bare string below is the author's note listing column names; it is an
# expression statement with no effect.
""" formula, structure, elastic_anisotropy, G_Reuss, G_VRH, G_Voigt, K_Reuss, K_VRH, K_Voigt, poisson_ratio, compliance_tensor, elastic_tensor, elastic_tensor_original, composition """

# Target values, and the columns to leave out of the feature matrix.
y = df['K_VRH'].values
excluded = [
    'formula',
    'structure',
    'elastic_anisotropy',
    'G_Reuss',
    'G_VRH',
    'G_Voigt',
    'K_Reuss',
    'K_VRH',
    'K_Voigt',
    'poisson_ratio',
    'compliance_tensor',
    'elastic_tensor',
    'elastic_tensor_original',
    'composition',
    'composition_oxid',
]