def _attach_gaps(df, df_gap):
    """Copy values from ``df_gap`` into ``df['gaps']`` by walking both in order.

    Row ``j`` of ``df_gap`` is matched against the current row of ``df`` when
    ``'mp-' + str(df_gap.index[j])`` equals that row's index label; on a match
    the first column of ``df_gap`` is written into ``df['gaps']`` and the
    cursor into ``df`` advances. Rows of ``df`` that are never matched keep
    whatever value ``df['gaps']`` already held.

    NOTE(review): this single forward pass only finds every match when both
    frames list their ids in the same relative order - confirm against the
    input files.
    """
    # Address the column by label so a pre-existing 'gaps' column that is not
    # the last one is still the one written to.
    gaps_col = df.columns.get_loc('gaps')
    n_rows = len(df.index)
    row = 0
    for j, gap_id in enumerate(df_gap.index):
        if row >= n_rows:
            # Every row of df is already matched; reading df.index[row]
            # would raise IndexError.
            break
        if 'mp-' + str(gap_id) == df.index[row]:
            df.iloc[row, gaps_col] = df_gap.iloc[j, 0]
            row += 1


def _add_zero_gap_flag(df):
    """Add ``df['is_daoti']``: 1 where ``df['gaps'] == 0``, otherwise 0."""
    # Vectorised replacement for the per-row ``df.ix`` loop (``.ix`` no longer
    # exists in pandas >= 1.0).
    df['is_daoti'] = (df['gaps'] == 0).astype(int)


def generate_data(name):
    """Build the feature table for the CSV ``name`` and save it to disk.

    Steps, in order:

    1. Read ``name`` (first column as index) and add a ``gaps`` column
       pre-filled with the sentinel ``-10.0``.
    2. Read ``gaps.csv`` and copy its first column into ``gaps`` for the rows
       whose index equals ``'mp-' + <gaps.csv index>``.
    3. Add ``is_daoti`` (1 where ``gaps == 0``, else 0). Unmatched rows keep
       the ``-10.0`` sentinel and are therefore flagged 0.
    4. Write ``df.describe()`` to ``general_look_jie.csv``.
    5. Featurize ``full_formula`` with ``StrToComposition``, the ``magpie``
       ``ElementProperty`` preset, ``CompositionToOxidComposition`` and
       ``OxidationStates``.
    6. Write the result to ``2d_vector_plus.csv``.

    The original author's note states the final matrix holds 145 features in
    total (not verified here).

    NOTE(review): another ``generate_data`` taking no arguments appears later
    in this source; if both live in the same module the later definition
    shadows this one.

    Args:
        name: Path of the input CSV, e.g. ``'test_plus_gaps.csv'``.

    Returns:
        None. Results are written to CSV files in the working directory.
    """
    df = pd.read_csv(name, index_col=[0])
    df['gaps'] = -10.0  # sentinel for rows that get no gap value
    df_gap = pd.read_csv("gaps.csv", index_col=[0])
    print(df_gap.index)

    _attach_gaps(df, df_gap)
    print("合并完毕")

    # Build the classification target the same way.
    _add_zero_gap_flag(df)
    print("分类feature建立完成")

    # Use describe() to get an overall picture of the data first.
    summary = df.describe()
    print(summary)
    summary.to_csv('general_look_jie.csv')

    # Convert the formula strings to compositions, then featurize them.
    df = StrToComposition().featurize_dataframe(
        df, 'full_formula', ignore_errors=True)
    print(df.head())
    ep_feat = ElementProperty.from_preset(preset_name='magpie')
    # The 'composition' column is the input to the element-property featurizer.
    df = ep_feat.featurize_dataframe(
        df, col_id='composition', ignore_errors=True)
    print(df.head())

    # Add oxidation-state based features.
    df = CompositionToOxidComposition().featurize_dataframe(
        df, col_id='composition')
    os_feat = OxidationStates()
    df = os_feat.featurize_dataframe(df, col_id='composition_oxid')

    new_name = '2d_vector_plus.csv'
    df.to_csv(new_name)
def test_featurizers():
    """Run the composition featurizer chain on ``test.csv``.

    Reads ``test.csv`` (first column as index), applies ``StrToComposition``
    to the ``formula`` column, the ``magpie`` ``ElementProperty`` preset,
    ``CompositionToOxidComposition`` and ``OxidationStates`` in that order,
    printing the head of the frame after each stage, and finally writes the
    result to ``after_test.csv``. Returns None.
    """
    frame = pd.read_csv('test.csv', index_col=[0])

    # Stage 1: formula strings -> composition objects.
    formula_parser = StrToComposition()
    frame = formula_parser.featurize_dataframe(frame, 'formula')
    print(frame.head())

    # Stage 2: element-property features, fed from the 'composition' column.
    magpie = ElementProperty.from_preset(preset_name='magpie')
    frame = magpie.featurize_dataframe(frame, col_id='composition')
    print(frame.head())
    print(magpie.citations())

    # Stage 3: oxidation-state features.
    oxid_converter = CompositionToOxidComposition()
    frame = oxid_converter.featurize_dataframe(frame, 'composition')
    oxidation_states = OxidationStates()
    frame = oxidation_states.featurize_dataframe(
        frame, col_id='composition_oxid')
    print(frame.head())

    frame.to_csv('after_test.csv')
def generate_data():
    """Featurize the elastic-tensor dataset, saving a CSV after each stage.

    Loads the frame returned by ``load_elastic_tensor()``, drops a fixed list
    of columns, writes summary statistics, then applies ``StrToComposition``,
    the ``magpie`` ``ElementProperty`` preset, ``CompositionToOxidComposition``,
    ``OxidationStates`` and ``DensityFeatures`` in that order. The frame is
    printed (head) and written to a CSV file after each stage. Returns None.
    """
    table = load_elastic_tensor()
    table.to_csv('原始elastic数据.csv')
    print(table.columns)

    # Columns that are not wanted downstream.
    dropped = [
        'volume',
        'nsites',
        'compliance_tensor',
        'elastic_tensor',
        'elastic_tensor_original',
        'K_Voigt',
        'G_Voigt',
        'K_Reuss',
        'G_Reuss',
    ]
    table = table.drop(dropped, axis=1)
    print(table.head())
    table.to_csv('扔掉不需要的部分.csv')

    # Overall picture of the data; the original author saw nothing unusual.
    overview = table.describe()
    print(overview)
    overview.to_csv('general_look.csv')

    # Formula strings -> composition objects.
    formula_parser = StrToComposition()
    table = formula_parser.featurize_dataframe(table, 'formula')
    print(table.head())
    table.to_csv('引入composition.csv')

    # Element-property features, fed from the 'composition' column.
    magpie = ElementProperty.from_preset(preset_name='magpie')
    table = magpie.featurize_dataframe(table, col_id='composition')
    print(table.head())
    print(magpie.citations())
    table.to_csv('将composition特征化后.csv')

    # Oxidation-state features.
    oxid_converter = CompositionToOxidComposition()
    table = oxid_converter.featurize_dataframe(table, 'composition')
    oxidation_states = OxidationStates()
    table = oxidation_states.featurize_dataframe(
        table, col_id='composition_oxid')
    print(table.head())
    table.to_csv('引入氧化态之后.csv')

    # Besides composition-based features there are structure-based ones too.
    density = DensityFeatures()
    table = density.featurize_dataframe(table, 'structure')
    print(table.head())
    table.to_csv('引入结构中的密度.csv')
    print(density.feature_labels())
# Imports used by this section of the script.
from matminer.featurizers.composition import ElementProperty
from matminer.featurizers.conversions import (
    CompositionToOxidComposition,
    StrToComposition,
)

# Small sample: one [id, reduced formula] pair per row.
data = [
    ['mp-1025496', 'Nb1 Se2'],
    ['mp-977563', 'Nb1 Ir2'],
    ['mp-864631', 'Nb1 Rh2'],
    ['mp-3368', 'Nb3 O8'],
]
fdf = pd.DataFrame(data, columns=['Id', 'Reduced Formula'])

## Initial conversion to matminer objects
# Formula strings first, then the oxidation-state conversion that reads the
# 'composition' column.
fdf = StrToComposition().featurize_dataframe(fdf, 'Reduced Formula')
fdf = CompositionToOxidComposition().featurize_dataframe(fdf, 'composition')

print("The initial dataset has {}".format(fdf.shape))
print(fdf.head())

'''
Block 2 - Featurization
'''
# -- start F1
ep_feat = ElementProperty.from_preset(preset_name='magpie')
fdf = ep_feat.featurize_dataframe(
    fdf,
    col_id='composition',
    ignore_errors=True,
)

# Collect the columns whose name contains 'MagpieData'. The original comment
# says the string is to be excluded from the column names; that step is not
# part of this section.
magpie_cols = [name for name in fdf.columns if 'MagpieData' in name]
# `quick_demo`, `query_string` and `mdf` are defined outside this section.
if quick_demo:
    query_string += " AND mdf.scroll_id:<10000"

data = mdf.get_data(query_string, unwind_arrays=False)
print(data.head())

# Rename, pre-process and filter. The original author notes that delta_e
# should be the formation energy.
data = data[['oqmd.delta_e.value', 'material.composition']]
data = data.rename(
    columns={
        'oqmd.delta_e.value': 'delta_e',
        'material.composition': 'composition',
    }
)
composition_parser = StrToComposition(target_col_id='composition_obj')
data = composition_parser.featurize_dataframe(data, 'composition')
data.sort_values('delta_e', ascending=True, inplace=True)
print(data.head(3))

# Make sure the listed columns are numeric.
for k in ['delta_e']:
    data[k] = pd.to_numeric(data[k])

# Drop entries that have no delta_e value.
original_count = len(data)
missing_delta_e = data['delta_e'].isnull()
data = data[~missing_delta_e]
removed = original_count - len(data)
print('Removed %d/%d entries' % (removed, original_count))

# Keep one entry per reduced formula: after the ascending sort, the first
# occurrence is the one with the smallest delta_e.
original_count = len(data)
data['composition'] = data['composition_obj'].apply(
    lambda comp: comp.reduced_formula)
data.sort_values('delta_e', ascending=True, inplace=True)
data.drop_duplicates('composition', keep='first', inplace=True)
removed = original_count - len(data)
print('Removed %d/%d entries' % (removed, original_count))