def generate_citrine_features(self, save_to_csv=True):
    logging.info('WARNING: You have specified generation of features from Citrine. Based on which materials you are '
                 'interested in, there may be many records to parse through, thus this routine may take a long time to complete!')
    compositions = self.dataframe['Material compositions'].tolist()
    citrine_dict_property_min = dict()
    citrine_dict_property_max = dict()
    citrine_dict_property_avg = dict()
    for composition in compositions:
        pifquery = self._get_pifquery(composition=composition)
        property_name_list, property_value_list = self._get_pifquery_property_list(pifquery=pifquery)
        property_names_unique, parsed_property_min, parsed_property_max, parsed_property_avg = \
            self._parse_pifquery_property_list(property_name_list=property_name_list,
                                               property_value_list=property_value_list)
        citrine_dict_property_min[composition] = parsed_property_min
        citrine_dict_property_max[composition] = parsed_property_max
        citrine_dict_property_avg[composition] = parsed_property_avg
    dataframe = self.dataframe
    citrine_dict_list = [citrine_dict_property_min, citrine_dict_property_max, citrine_dict_property_avg]
    for citrine_dict in citrine_dict_list:
        dataframe_citrine = pd.DataFrame.from_dict(data=citrine_dict, orient='index')
        # Reorder compositions in the new dataframe to match the input dataframe
        dataframe_citrine = dataframe_citrine.reindex(self.dataframe['Material compositions'].tolist())
        # Make compositions the first column instead of the row index
        dataframe_citrine.index.name = 'Material compositions'
        dataframe_citrine.reset_index(inplace=True)
        # Delete the duplicate column before merging dataframes
        del dataframe_citrine['Material compositions']
        # Merge the Citrine feature dataframe with the originally supplied dataframe
        dataframe = DataframeUtilities().merge_dataframe_columns(dataframe1=dataframe, dataframe2=dataframe_citrine)
    if save_to_csv:
        # Get the y_feature in this dataframe and attach it to the save path
        for column in dataframe.columns.values:
            if column in self.configdict['General Setup']['target_feature']:
                filetag = column
        dataframe.to_csv(self.configdict['General Setup']['save_path'] + "/" +
                         'input_with_citrine_features' + '_' + str(filetag) + '.csv', index=False)
    return dataframe
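# Illustrative, self-contained sketch (toy data; not part of MASTML) of the
# dict -> dataframe merge pattern used by generate_citrine_features above and
# by the other feature generators below: build a dataframe keyed by
# composition, reorder its rows to match the input dataframe, promote the
# index to an ordinary column, then drop the duplicate key column before the
# column-wise merge.
def _example_composition_merge_pattern():
    import pandas as pd
    base = pd.DataFrame({'Material compositions': ['Al2O3', 'Fe2O3']})
    per_composition = {'Fe2O3': {'prop_min': 1.0, 'prop_max': 3.0},
                       'Al2O3': {'prop_min': 0.5, 'prop_max': 2.5}}
    features = pd.DataFrame.from_dict(data=per_composition, orient='index')
    # Reorder rows so they line up with the input dataframe
    features = features.reindex(base['Material compositions'].tolist())
    # Promote the composition index to a regular first column
    features.index.name = 'Material compositions'
    features.reset_index(inplace=True)
    # Drop the duplicate key column, then concatenate column-wise
    del features['Material compositions']
    return pd.concat([base, features], axis=1)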
def unnormalize_features(self, x_features, y_feature, scaler):
    array_unnormalized = scaler.inverse_transform(X=self.dataframe[x_features])
    array_unnormalized = DataframeUtilities().concatenate_arrays(
        X_array=array_unnormalized,
        y_array=np.asarray(self.dataframe[y_feature]).reshape([-1, 1]))
    dataframe_unnormalized = DataframeUtilities().array_to_dataframe(array=array_unnormalized)
    dataframe_unnormalized = DataframeUtilities().assign_columns_as_features(
        dataframe=dataframe_unnormalized, x_features=x_features,
        y_feature=y_feature, remove_first_row=False)
    return dataframe_unnormalized, scaler
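# Minimal sketch (toy data) of the scaler round-trip that unnormalize_features
# relies on: StandardScaler records per-column means and standard deviations
# at fit time, so inverse_transform recovers the original feature values from
# the normalized array.
def _example_scaler_roundtrip():
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
    scaler = StandardScaler().fit(X)
    X_normalized = scaler.transform(X)  # zero mean, unit variance per column
    X_recovered = scaler.inverse_transform(X_normalized)
    assert np.allclose(X, X_recovered)
    return X_recovered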
def test_assign_columns_as_features(self):
    df = DataframeUtilities().assign_columns_as_features(
        dataframe=self.df1, x_features=["Material compositions"],
        y_feature="O_pband_center", remove_first_row=False)
    self.assertIsInstance(df, pd.DataFrame)
    self.assertTrue(df.shape == self.df1.shape)
    return
def test_plot_dataframe_histogram(self):
    configdict = ConfigFileParser(
        configfile='test_unittest_dataoperations.conf').get_config_dict(path_to_file=testdir)
    fname = DataframeUtilities().plot_dataframe_histogram(
        dataframe=self.df1, configdict=configdict, y_feature="O_pband_center_regression")
    self.files.append(fname)
    return
def generate_materialsproject_features(self, save_to_csv=True):
    try:
        compositions = self.dataframe['Material compositions']
    except KeyError:
        print('No column called "Material compositions" exists in the supplied dataframe.')
        sys.exit()
    mpdata_dict_composition = {}
    for composition in compositions:
        composition_data_mp = self._get_data_from_materials_project(composition=composition)
        mpdata_dict_composition[composition] = composition_data_mp
    dataframe = self.dataframe
    dataframe_mp = pd.DataFrame.from_dict(data=mpdata_dict_composition, orient='index')
    # Reorder compositions in the new dataframe to match the input dataframe
    dataframe_mp = dataframe_mp.reindex(self.dataframe['Material compositions'].tolist())
    # Make compositions the first column instead of the row index
    dataframe_mp.index.name = 'Material compositions'
    dataframe_mp.reset_index(inplace=True)
    # Delete the duplicate column before merging dataframes
    del dataframe_mp['Material compositions']
    # Merge the Materials Project feature dataframe with the originally supplied dataframe
    dataframe = DataframeUtilities().merge_dataframe_columns(dataframe1=dataframe, dataframe2=dataframe_mp)
    if save_to_csv:
        # Get the y_feature in this dataframe and attach it to the save path
        for column in dataframe.columns.values:
            if column in self.configdict['General Setup']['target_feature']:
                filetag = column
        dataframe.to_csv(self.configdict['General Setup']['save_path'] + "/" +
                         'input_with_matproj_features' + '_' + str(filetag) + '.csv', index=False)
    return dataframe
def normalize_and_merge_with_original_dataframe(self, x_features, y_feature,
                                                normalize_x_features, normalize_y_feature):
    dataframe_normalized, scaler = self.normalize_features(
        x_features=x_features, y_feature=y_feature,
        normalize_x_features=normalize_x_features,
        normalize_y_feature=normalize_y_feature)
    dataframe = DataframeUtilities().merge_dataframe_columns(
        dataframe1=self.dataframe, dataframe2=dataframe_normalized)
    return dataframe
def _split_csv_file(self):
    # Need the dataframe and its x and y features so the CSV can be split accordingly
    data_path_list = []
    dataframe = DataParser(configdict=self.configdict).import_data(
        datapath=self.configdict['Data Setup']['Initial']['data_path'])
    y_feature = self.configdict['General Setup']['target_feature']
    other_features = []
    for column in dataframe.columns.values:
        if column not in y_feature:
            other_features.append(column)
    dataframe_x = dataframe.loc[:, other_features]
    count = 1
    for feature in y_feature:
        try:
            dataframe_y = dataframe.loc[:, feature]
        except KeyError:
            logging.info('Error detected: The feature names in the csv and input files do not match')
            print('The feature names in the csv and input files do not match. Please fix feature names and re-run MASTML')
            sys.exit()
        dataframe_new = DataframeUtilities.merge_dataframe_columns(
            dataframe1=dataframe_x, dataframe2=dataframe_y)
        # Write the new dataframe to a new CSV, and update data_path_list
        data_path_split = os.path.split(self.configdict['Data Setup']['Initial']['data_path'])
        filename = data_path_split[1].split(".csv")
        data_path = data_path_split[0] + "/" + str(filename[0]) + "_" + str(count) + ".csv"
        dataframe_new.to_csv(data_path, index=False)
        data_path_list.append(data_path)
        count += 1
    # Last, add file data paths that are not part of the original CSV file being split
    for key in self.configdict['Data Setup'].keys():
        if key != 'Initial':
            data_path_list.append(self.configdict['Data Setup'][key]['data_path'])
    return data_path_list
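# Illustrative sketch (hypothetical file and column names) of the per-target
# split performed by _split_csv_file above: each output CSV keeps every
# non-target column plus exactly one target column, and the file name gets a
# numeric suffix per target.
def _example_split_by_target(csv_path='input.csv', targets=('y_a', 'y_b')):
    import os
    import pandas as pd
    dataframe = pd.read_csv(csv_path)
    x_columns = [c for c in dataframe.columns if c not in targets]
    root, _ = os.path.splitext(csv_path)
    paths = []
    for count, target in enumerate(targets, start=1):
        out_path = '%s_%d.csv' % (root, count)
        dataframe[x_columns + [target]].to_csv(out_path, index=False)
        paths.append(out_path)
    return paths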
def generate_magpie_features(self, save_to_csv=True):
    compositions = []
    composition_components = []
    # Replace empty composition fields with an empty string instead of NaN
    self.dataframe = self.dataframe.fillna('')
    for column in self.dataframe.columns:
        if 'Material composition' in column:
            composition_components.append(self.dataframe[column].tolist())
    if len(composition_components) < 1:
        logging.info('ERROR: No column with "Material composition xx" was found in the supplied dataframe')
        sys.exit()
    row = 0
    while row < len(composition_components[0]):
        composition = ''
        for composition_component in composition_components:
            composition += str(composition_component[row])
        compositions.append(composition)
        row += 1
    # Add the column of combined material compositions into the dataframe
    self.dataframe['Material compositions'] = compositions
    # Assign each magpiedata feature set to the appropriate composition name
    magpiedata_dict_composition_average = {}
    magpiedata_dict_arithmetic_average = {}
    magpiedata_dict_max = {}
    magpiedata_dict_min = {}
    magpiedata_dict_difference = {}
    magpiedata_dict_atomic_bysite = {}
    for composition in compositions:
        (magpiedata_composition_average, magpiedata_arithmetic_average, magpiedata_max,
         magpiedata_min, magpiedata_difference) = self._get_computed_magpie_features(composition=composition)
        magpiedata_atomic_notparsed = self._get_atomic_magpie_features(composition=composition)
        magpiedata_dict_composition_average[composition] = magpiedata_composition_average
        magpiedata_dict_arithmetic_average[composition] = magpiedata_arithmetic_average
        magpiedata_dict_max[composition] = magpiedata_max
        magpiedata_dict_min[composition] = magpiedata_min
        magpiedata_dict_difference[composition] = magpiedata_difference
        # Add site-specific elemental features
        count = 1
        magpiedata_atomic_bysite = {}
        for entry in magpiedata_atomic_notparsed.keys():
            for magpiefeature, featurevalue in magpiedata_atomic_notparsed[entry].items():
                magpiedata_atomic_bysite["Site" + str(count) + "_" + str(magpiefeature)] = featurevalue
            count += 1
        magpiedata_dict_atomic_bysite[composition] = magpiedata_atomic_bysite
    magpiedata_dict_list = [magpiedata_dict_composition_average, magpiedata_dict_arithmetic_average,
                            magpiedata_dict_max, magpiedata_dict_min, magpiedata_dict_difference,
                            magpiedata_dict_atomic_bysite]
    dataframe = self.dataframe
    for magpiedata_dict in magpiedata_dict_list:
        dataframe_magpie = pd.DataFrame.from_dict(data=magpiedata_dict, orient='index')
        # Reorder compositions in the new dataframe to match the input dataframe
        dataframe_magpie = dataframe_magpie.reindex(self.dataframe['Material compositions'].tolist())
        # Make compositions the first column instead of the row index
        dataframe_magpie.index.name = 'Material compositions'
        dataframe_magpie.reset_index(inplace=True)
        # Delete the duplicate column before merging dataframes
        del dataframe_magpie['Material compositions']
        # Merge the magpie feature dataframe with the originally supplied dataframe
        dataframe = DataframeUtilities().merge_dataframe_columns(dataframe1=dataframe, dataframe2=dataframe_magpie)
    if save_to_csv:
        # Get the y_feature in this dataframe and attach it to the save path
        for column in dataframe.columns.values:
            if column in self.configdict['General Setup']['target_feature']:
                filetag = column
        fname = (self.configdict['General Setup']['save_path'] + "/" +
                 'input_with_magpie_features' + '_' + str(filetag) + '.csv')
        dataframe.to_csv(fname, index=False)
    return dataframe
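# Toy sketch of how generate_magpie_features joins the per-site
# "Material composition N" columns into a single composition string per row.
# The column names follow the pattern the method looks for; the data is made up.
def _example_join_composition_columns():
    import pandas as pd
    df = pd.DataFrame({'Material composition 1': ['Fe2', 'Al2'],
                       'Material composition 2': ['O3', 'O3']})
    site_columns = [c for c in df.columns if 'Material composition' in c]
    # Empty fields become empty strings so they contribute nothing to the join
    df['Material compositions'] = df[site_columns].fillna('').astype(str).agg(''.join, axis=1)
    return df['Material compositions'].tolist()  # ['Fe2O3', 'Al2O3']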
def normalize_features(self, x_features, y_feature, normalize_x_features,
                       normalize_y_feature, to_csv=True):
    if normalize_x_features and not normalize_y_feature:
        scaler = StandardScaler().fit(X=self.dataframe[x_features])
        array_normalized = scaler.transform(X=self.dataframe[x_features])
        array_normalized = DataframeUtilities().concatenate_arrays(
            X_array=array_normalized,
            y_array=np.asarray(self.dataframe[y_feature]).reshape([-1, 1]))
    elif not normalize_x_features and normalize_y_feature:
        scaler = StandardScaler().fit(X=np.asarray(self.dataframe[y_feature]).reshape([-1, 1]))
        array_normalized = scaler.transform(X=np.asarray(self.dataframe[y_feature]).reshape([-1, 1]))
        array_normalized = DataframeUtilities().concatenate_arrays(
            X_array=np.asarray(self.dataframe[x_features]),
            y_array=array_normalized.reshape([-1, 1]))
    elif normalize_x_features and normalize_y_feature:
        scaler_x = StandardScaler().fit(X=self.dataframe[x_features])
        scaler_y = StandardScaler().fit(X=np.asarray(self.dataframe[y_feature]).reshape([-1, 1]))
        array_normalized_x = scaler_x.transform(X=self.dataframe[x_features])
        array_normalized_y = scaler_y.transform(X=np.asarray(self.dataframe[y_feature]).reshape([-1, 1]))
        array_normalized = DataframeUtilities().concatenate_arrays(
            X_array=array_normalized_x, y_array=array_normalized_y)
    else:
        print('You must specify to normalize either x_features, y_feature, or both, '
              'or set perform_feature_normalization=False in the input file')
        sys.exit()
    dataframe_normalized = DataframeUtilities().array_to_dataframe(array=array_normalized)
    dataframe_normalized = DataframeUtilities().assign_columns_as_features(
        dataframe=dataframe_normalized, x_features=x_features,
        y_feature=y_feature, remove_first_row=False)
    if to_csv:
        # Get the y_feature in this dataframe and attach it to the save path
        for column in dataframe_normalized.columns.values:
            if column in self.configdict['General Setup']['target_feature']:
                filetag = column
        dataframe_normalized.to_csv(self.configdict['General Setup']['save_path'] + "/" +
                                    'input_data_normalized' + '_' + str(filetag) + '.csv',
                                    index=False)
    if not (normalize_x_features and normalize_y_feature):
        return dataframe_normalized, scaler
    return dataframe_normalized, (scaler_x, scaler_y)
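# Minimal sketch (toy arrays) of the x-only branch of normalize_features:
# only the feature matrix is standardized, and the raw target column is
# concatenated back so the returned table keeps both.
def _example_normalize_x_only():
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    X = np.array([[1.0, 4.0], [2.0, 5.0], [3.0, 6.0]])
    y = np.array([10.0, 20.0, 30.0]).reshape([-1, 1])
    scaler = StandardScaler().fit(X)
    X_normalized = scaler.transform(X)
    # Last column stays in the original target units
    return np.concatenate((X_normalized, y), axis=1), scaler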
def _create_data_dict(self):
    data_dict = dict()
    for data_name in self.data_setup.keys():
        data_path = self.configdict['Data Setup'][data_name]['data_path']
        logging.info('Creating data dict for data path %s and data name %s' % (data_path, data_name))
        data_weights = self.data_setup[data_name]['weights']
        if 'labeling_features' in self.general_setup.keys():
            labeling_features = self._string_or_list_input_to_list(self.general_setup['labeling_features'])
        else:
            labeling_features = None
        if 'target_error_feature' in self.general_setup.keys():
            target_error_feature = self.general_setup['target_error_feature']
        else:
            target_error_feature = None
        if 'grouping_feature' in self.general_setup.keys():
            grouping_feature = self.general_setup['grouping_feature']
        else:
            grouping_feature = None
        # Config values may be parsed as booleans or as the string "True"
        if 'Feature Generation' in self.configdict.keys():
            generate_features = self.configdict['Feature Generation']['perform_feature_generation'] in (True, "True")
        else:
            generate_features = False
        if 'Feature Normalization' in self.configdict.keys():
            normalize_x_features = self.configdict['Feature Normalization']['normalize_x_features'] in (True, "True")
            normalize_y_feature = self.configdict['Feature Normalization']['normalize_y_feature'] in (True, "True")
        else:
            normalize_x_features = False
            normalize_y_feature = False
        if 'Feature Selection' in self.configdict.keys():
            select_features = self.configdict['Feature Selection']['perform_feature_selection'] in (True, "True")
        else:
            select_features = False
        logging.info("Feature Generation: %s" % generate_features)
        logging.info("Feature Normalization (x_features): %s" % normalize_x_features)
        logging.info("Feature Normalization (y_feature): %s" % normalize_y_feature)
        logging.info("Feature Selection: %s" % select_features)
        # Parse the input data file
        Xdata, ydata, x_features, y_feature, dataframe = self._parse_input_data(data_path)
        # Plot an initial histogram of the input target data
        DataframeUtilities().plot_dataframe_histogram(configdict=self.configdict,
                                                      dataframe=dataframe, y_feature=y_feature)
        original_x_features = list(x_features)
        original_columns = list(dataframe.columns)
        logging.debug("original columns: %s" % original_columns)
        # Keep a copy of the original dataframe with missing rows removed; the
        # string-valued feature columns are restored from this copy after the merge below
        dataframe_orig_dropped_na = dataframe.dropna()
        # Save off label and grouping data
        dataframe_labeled = pd.DataFrame()
        dataframe_grouped = pd.DataFrame()
        if labeling_features is not None:
            dataframe_labeled = FeatureIO(dataframe=dataframe).keep_custom_features(
                features_to_keep=labeling_features, y_feature=y_feature)
            if normalize_x_features:
                dataframe_labeled, scaler = FeatureNormalization(
                    dataframe=dataframe_labeled, configdict=self.configdict).normalize_features(
                    x_features=labeling_features, y_feature=y_feature,
                    normalize_x_features=True, normalize_y_feature=False)
        if grouping_feature is not None:
            dataframe_grouped = FeatureIO(dataframe=dataframe).keep_custom_features(
                features_to_keep=[grouping_feature], y_feature=y_feature)
        # Generate additional descriptors, as specified in the input file (optional)
        if generate_features:
            dataframe = self._perform_feature_generation(dataframe=dataframe)
            # x_features_NOUSE is required if starting from no features and doing
            # feature generation. Not renaming for now. RJ 7/17
            Xdata, ydata, x_features_NOUSE, y_feature, dataframe = DataParser(
                configdict=self.configdict).parse_fromdataframe(
                dataframe=dataframe, target_feature=y_feature)
        else:
            Xdata, ydata, x_features, y_feature, dataframe = DataParser(
                configdict=self.configdict).parse_fromdataframe(
                dataframe=dataframe, target_feature=y_feature)
        # First remove features containing strings before doing feature normalization
        # or other operations, but don't remove grouping features
        if generate_features:
            nonstring_x_features, dataframe_nostrings = MiscFeatureOperations(
                configdict=self.configdict).remove_features_containing_strings(
                dataframe=dataframe, x_features=x_features_NOUSE)
            # Remove columns containing all entries of NaN
            dataframe_nostrings = dataframe_nostrings.dropna(axis=1, how='all')
            # Re-obtain the x_feature list as some features may have been dropped
            Xdata, ydata, x_features_NOUSE, y_feature, dataframe_nostrings = DataParser(
                configdict=self.configdict).parse_fromdataframe(
                dataframe=dataframe_nostrings, target_feature=y_feature)
        else:
            nonstring_x_features, dataframe_nostrings = MiscFeatureOperations(
                configdict=self.configdict).remove_features_containing_strings(
                dataframe=dataframe, x_features=x_features)
        # Remove columns containing all entries of NaN
        dataframe_nostrings = dataframe_nostrings.dropna(axis=1, how='all')
        # Drop columns that still contain any NaN entries
        dataframe_nostrings = dataframe_nostrings.dropna(axis=1, how='any')
        # Re-obtain the x_feature list as some features may have been dropped
        Xdata, ydata, x_features, y_feature, dataframe_nostrings = DataParser(
            configdict=self.configdict).parse_fromdataframe(
            dataframe=dataframe_nostrings, target_feature=y_feature)
        logging.debug("pre-changes:%s" % dataframe_nostrings.columns)
        # Normalize features (optional)
        if normalize_x_features or normalize_y_feature:
            fn = FeatureNormalization(dataframe=dataframe_nostrings, configdict=self.configdict)
            dataframe_nostrings, scaler = fn.normalize_features(
                x_features=x_features, y_feature=y_feature,
                normalize_x_features=normalize_x_features,
                normalize_y_feature=normalize_y_feature)
            x_features, y_feature = DataParser(configdict=self.configdict).get_features(
                dataframe=dataframe_nostrings, target_feature=y_feature)
        # Perform feature selection and dimensional reduction, as specified in the input file (optional)
        if select_features and (y_feature in dataframe_nostrings.columns):
            # Remove any additional columns that are not x_features to be fit to the data
            features = dataframe_nostrings.columns.values.tolist()
            features_to_remove = []
            for feature in features:
                if feature not in x_features and feature not in y_feature:
                    features_to_remove.append(feature)
            dataframe_nostrings = FeatureIO(dataframe=dataframe_nostrings).remove_custom_features(
                features_to_remove=features_to_remove)
            dataframe_nostrings = self._perform_feature_selection(
                dataframe=dataframe_nostrings, x_features=x_features, y_feature=y_feature)
            x_features, y_feature = DataParser(configdict=self.configdict).get_features(
                dataframe=dataframe_nostrings, target_feature=y_feature)
        logging.debug("post-removal:%s" % dataframe_nostrings.columns)
        # Combine the input dataframe, which has undergone feature generation and
        # normalization, with the grouped and labeled features of the original dataframe.
        # First, generate a dataframe that only has the grouped and labeled features
        grouping_and_labeling_features = []
        duplicate_features = []
        if 'grouping_feature' in self.configdict['General Setup'].keys():
            grouping_and_labeling_features.append(grouping_feature)
        if 'labeling_features' in self.configdict['General Setup'].keys():
            for feature in labeling_features:
                grouping_and_labeling_features.append(feature)
                if feature in x_features:
                    if feature not in duplicate_features:
                        duplicate_features.append(feature)
        # Now merge the dataframes
        dataframe_labeled_grouped = DataframeUtilities().merge_dataframe_columns(
            dataframe1=dataframe_labeled, dataframe2=dataframe_grouped)
        dataframe_merged = DataframeUtilities().merge_dataframe_columns(
            dataframe1=dataframe_nostrings, dataframe2=dataframe_labeled_grouped)
        # Add the string columns back in
        string_x_features = list()
        for my_x_feature in x_features:
            if my_x_feature in nonstring_x_features:
                pass
            else:
                string_x_features.append(my_x_feature)
        logging.debug("string features: %s" % string_x_features)
        for string_x_feature in string_x_features:
            dataframe_merged[string_x_feature] = dataframe_orig_dropped_na[string_x_feature]
        # Need to remove duplicate features after merging
        logging.debug("merged:%s" % dataframe_merged.columns)
        dataframe_rem = FeatureIO(dataframe=dataframe_merged).remove_duplicate_columns()
        myXdata, myydata, myx_features, myy_feature, dataframe_final = DataParser(
            configdict=self.configdict).parse_fromdataframe(
            dataframe=dataframe_rem, target_feature=y_feature)
        combined_x_features = list()
        logging.debug("total features:%s" % myx_features)
        for feature in myx_features:
            # Keep features that were originally designated, or that were created by feature generation
            if (feature in original_x_features) or (feature not in original_columns):
                combined_x_features.append(feature)
        logging.debug("combined x features:%s" % combined_x_features)
        data_dict[data_name] = DataHandler(data=dataframe_final,
                                           input_data=dataframe_final[combined_x_features],
                                           target_data=myydata,
                                           input_features=combined_x_features,
                                           target_feature=myy_feature,
                                           target_error_feature=target_error_feature,
                                           labeling_features=labeling_features,
                                           grouping_feature=grouping_feature)
        #logging.info('Parsed the input data located under %s' % data_path)
        # Get dataframe stats
        DataframeUtilities.save_all_dataframe_statistics(dataframe=dataframe_final,
                                                         configdict=self.configdict)
    return data_dict, y_feature
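# Rough sketch (toy data) of the cleanup _create_data_dict performs before
# normalization: drop string-valued feature columns, then drop columns that are
# entirely (or partially) NaN. select_dtypes stands in here for
# MiscFeatureOperations.remove_features_containing_strings; it is not the
# MASTML implementation.
def _example_drop_string_and_nan_columns():
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({'x1': [1.0, 2.0],
                       'phase': ['fcc', 'bcc'],    # string-valued, removed
                       'empty': [np.nan, np.nan],  # all NaN, removed
                       'partial': [1.0, np.nan]})  # any NaN, removed
    numeric = df.select_dtypes(include=[np.number])
    numeric = numeric.dropna(axis=1, how='all')
    numeric = numeric.dropna(axis=1, how='any')
    return list(numeric.columns)  # ['x1']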
def test_save_all_dataframe_statistics(self):
    fname = DataframeUtilities().save_all_dataframe_statistics(
        dataframe=self.df1, configdict=self.configdict)
    self.files.append(fname)
    return
def test_concatenate_arrays(self):
    arr = DataframeUtilities().concatenate_arrays(X_array=self.arr1, y_array=self.arr2)
    self.assertIsInstance(arr, np.ndarray)
    self.assertFalse(arr.shape == self.arr1.shape)
    return
def test_array_to_dataframe(self):
    df1 = DataframeUtilities().array_to_dataframe(array=self.arr1)
    self.assertNotIsInstance(df1, np.ndarray)
    self.assertIsInstance(df1, pd.DataFrame)
    return
def test_dataframe_to_array(self):
    arr = DataframeUtilities().dataframe_to_array(dataframe=self.df1)
    self.assertNotIsInstance(arr, pd.DataFrame)
    self.assertIsInstance(arr, np.ndarray)
    return
def test_get_dataframe_statistics(self):
    df = DataframeUtilities().get_dataframe_statistics(dataframe=self.df1)
    self.assertIsInstance(df, pd.DataFrame)
    return
def test_merge_dataframe_rows(self):
    df = DataframeUtilities().merge_dataframe_rows(dataframe1=self.df1, dataframe2=self.df2)
    self.assertFalse(df.shape == self.df1.shape)
    self.assertIsInstance(df, pd.DataFrame)
    return