def generate_citrine_features(self):
    """Generate min/max/average Citrine property features per composition.

    Reads the compositions from ``self.dataframe[self.composition_feature]``,
    calls ``self._load_composition`` once per composition (in row order), and
    merges the resulting minimum, maximum and average property tables -- in
    that order -- into the input dataframe through
    ``DataframeUtilities().merge_dataframe_columns``.

    Returns:
        The value returned by the last ``merge_dataframe_columns`` call
        (presumably the input dataframe with the Citrine columns appended;
        confirm against ``DataframeUtilities``).

    Raises:
        utils.MissingColumnError: If ``self.dataframe`` has no column named
            ``self.composition_feature``.
    """
    log.warning(
        'WARNING: You have specified generation of features from Citrine. Based on which'
        ' materials you are interested in, there may be many records to parse through, thus'
        ' this routine may take a long time to complete!')

    try:
        compositions = self.dataframe[self.composition_feature].tolist()
    except KeyError as e:
        log.error(f'original python error: {str(e)}')
        # The message must be an f-string so the missing column's actual name
        # is shown instead of the literal "{self.composition_feature}".
        raise utils.MissingColumnError(
            f'Error! No column named {self.composition_feature} found in your input data file. '
            'To use this feature generation routine, you must supply a material composition for each data point'
        ) from e

    citrine_dict_property_min = {}
    citrine_dict_property_max = {}
    citrine_dict_property_avg = {}

    # Lookups run sequentially. The parallel ``pool.map`` path is disabled, so
    # no worker pool is created here: an unused, never-closed pool would only
    # leak processes.
    for comp in compositions:
        prop_min, prop_max, prop_avg = self._load_composition(comp)
        citrine_dict_property_min[comp] = prop_min
        citrine_dict_property_max[comp] = prop_max
        citrine_dict_property_avg[comp] = prop_avg

    # Row order of the input data; every feature table is aligned to it.
    row_order = self.dataframe[self.composition_feature].tolist()

    dataframe = self.dataframe
    for citrine_dict in (citrine_dict_property_min,
                         citrine_dict_property_max,
                         citrine_dict_property_avg):
        dataframe_citrine = pd.DataFrame.from_dict(data=citrine_dict, orient='index')
        # Reorder (and repeat, for duplicated compositions) the rows so they
        # line up one-to-one with the input dataframe.
        dataframe_citrine = dataframe_citrine.reindex(row_order)
        # Drop the composition index rather than turning it into a column:
        # the input dataframe already carries that column.
        dataframe_citrine = dataframe_citrine.reset_index(drop=True)
        # Merge Citrine feature dataframe with the dataframe built so far
        dataframe = DataframeUtilities().merge_dataframe_columns(
            dataframe1=dataframe, dataframe2=dataframe_citrine)

    return dataframe
def generate_materialsproject_features(self):
    """Generate Materials Project features for every composition.

    Reads the compositions from ``self.dataframe[self.composition_feature]``,
    calls ``self._get_data_from_materials_project`` once per composition (in
    row order), and merges the resulting feature table into the input
    dataframe through ``DataframeUtilities().merge_dataframe_columns``.

    Returns:
        The value returned by ``merge_dataframe_columns`` (presumably the
        input dataframe with the Materials Project columns appended; confirm
        against ``DataframeUtilities``).

    Raises:
        utils.MissingColumnError: If ``self.dataframe`` has no column named
            ``self.composition_feature``.
    """
    try:
        compositions = self.dataframe[self.composition_feature]
    except KeyError as e:
        # Chain the original KeyError so the traceback keeps the root cause.
        raise utils.MissingColumnError(
            f'No column named {self.composition_feature} in csv file') from e

    # Lookups run sequentially. The parallel ``pool.map`` path is disabled, so
    # no worker pool is created here: an unused, never-closed pool would only
    # leak processes.
    mpdata_dict_composition = {
        composition: self._get_data_from_materials_project(composition)
        for composition in compositions
    }

    dataframe_mp = pd.DataFrame.from_dict(data=mpdata_dict_composition, orient='index')
    # Reorder (and repeat, for duplicated compositions) the rows so they line
    # up one-to-one with the input dataframe.
    dataframe_mp = dataframe_mp.reindex(
        self.dataframe[self.composition_feature].tolist())
    # Drop the composition index rather than turning it into a column: the
    # input dataframe already carries that column.
    dataframe_mp = dataframe_mp.reset_index(drop=True)

    # Merge Materials Project feature dataframe with originally supplied dataframe
    return DataframeUtilities().merge_dataframe_columns(
        dataframe1=self.dataframe, dataframe2=dataframe_mp)
def generate_magpie_features(self):
    """Generate Magpie elemental-property features for every composition.

    Steps performed:

    1. NaN cells in ``self.dataframe`` are replaced by empty strings.
    2. Every column whose name contains ``self.composition_feature`` is
       treated as one component of the composition; the components are
       string-concatenated row by row and stored back in
       ``self.dataframe[self.composition_feature]``.
    3. For each composition, ``self._get_computed_magpie_features`` and
       ``self._get_atomic_magpie_features`` are called with
       ``MAGPIE_DATA_PATH``. The atomic features are flattened into
       ``Site<k>_<feature>`` keys, where ``k`` counts the entries returned by
       ``_get_atomic_magpie_features`` starting at 1.
    4. The six resulting feature tables (composition average, arithmetic
       average, max, min, difference, per-site) are merged, in that order,
       into the dataframe through
       ``DataframeUtilities().merge_dataframe_columns``.

    Side effects:
        ``self.dataframe`` is reassigned (NaN filled) and gains/overwrites the
        ``self.composition_feature`` column.

    Returns:
        The value returned by the last ``merge_dataframe_columns`` call
        (presumably the dataframe with the Magpie columns appended; confirm
        against ``DataframeUtilities``).

    Raises:
        utils.MissingColumnError: If no column name contains
            ``self.composition_feature``.
    """
    # Replace empty composition fields with empty string instead of NaN
    self.dataframe = self.dataframe.fillna('')

    composition_components = [
        self.dataframe[column].tolist()
        for column in self.dataframe.columns
        if self.composition_feature in column
    ]
    if not composition_components:
        # Report the column name that was actually searched for, not a
        # hard-coded one.
        raise utils.MissingColumnError(
            f'Error! No column named "{self.composition_feature}" found in your input data file. To use this feature generation routine, you must supply a material composition for each data point'
        )

    # Concatenate the component columns row by row. All components come from
    # the same dataframe, so they have identical length.
    compositions = [
        ''.join(str(component) for component in row_components)
        for row_components in zip(*composition_components)
    ]

    # Add the column of combined material compositions into the dataframe
    self.dataframe[self.composition_feature] = compositions

    # Assign each magpiedata feature set to appropriate composition name
    magpiedata_dict_composition_average = {}
    magpiedata_dict_arithmetic_average = {}
    magpiedata_dict_max = {}
    magpiedata_dict_min = {}
    magpiedata_dict_difference = {}
    magpiedata_dict_atomic_bysite = {}

    for composition in compositions:
        (composition_average,
         arithmetic_average,
         feature_max,
         feature_min,
         feature_difference) = self._get_computed_magpie_features(
            composition=composition, data_path=MAGPIE_DATA_PATH)
        atomic_notparsed = self._get_atomic_magpie_features(
            composition=composition, data_path=MAGPIE_DATA_PATH)

        magpiedata_dict_composition_average[composition] = composition_average
        magpiedata_dict_arithmetic_average[composition] = arithmetic_average
        magpiedata_dict_max[composition] = feature_max
        magpiedata_dict_min[composition] = feature_min
        magpiedata_dict_difference[composition] = feature_difference

        # Add site-specific elemental features. The site number advances once
        # per entry, not once per feature.
        atomic_bysite = {}
        for site_number, entry in enumerate(atomic_notparsed, start=1):
            for magpiefeature, featurevalue in atomic_notparsed[entry].items():
                atomic_bysite[f'Site{site_number}_{magpiefeature}'] = featurevalue
        magpiedata_dict_atomic_bysite[composition] = atomic_bysite

    # Row order of the input data; every feature table is aligned to it.
    row_order = self.dataframe[self.composition_feature].tolist()

    dataframe = self.dataframe
    for magpiedata_dict in (magpiedata_dict_composition_average,
                            magpiedata_dict_arithmetic_average,
                            magpiedata_dict_max,
                            magpiedata_dict_min,
                            magpiedata_dict_difference,
                            magpiedata_dict_atomic_bysite):
        dataframe_magpie = pd.DataFrame.from_dict(data=magpiedata_dict, orient='index')
        # Reorder (and repeat, for duplicated compositions) the rows so they
        # line up one-to-one with the input dataframe.
        dataframe_magpie = dataframe_magpie.reindex(row_order)
        # Drop the composition index rather than turning it into a column:
        # the input dataframe already carries that column.
        dataframe_magpie = dataframe_magpie.reset_index(drop=True)
        # Merge magpie feature dataframe with the dataframe built so far
        dataframe = DataframeUtilities().merge_dataframe_columns(
            dataframe1=dataframe, dataframe2=dataframe_magpie)

    return dataframe