Example #1
 def csv_add_features(self, csvsrc, csvdest):
     afm_dict = dict()
     param_dict = dict()
     # E900 column
     e900_dict = dict()
     for elem in ['P','Ni','Cu','Mn']: #Si, C not used in e900
         e900_dict['wt%s' % elem] = 'wt_percent_%s' % elem
     e900_dict['fluencestr'] = 'fluence_n_cm2'
     e900_dict['tempC'] = 'temperature_C'
     e900_dict['prod_ID'] = 'product_id'
     afm_dict['DBTT.E900'] = dict(e900_dict)
     param_dict['DBTT.E900'] = dict()
     # Get the source dataframe
     csv_dataparser = DataParser()
     csv_dataframe = csv_dataparser.import_data("%s.csv" % os.path.join(self.save_path, csvsrc))
     # Add features
     for afm in afm_dict.keys():
         (feature_name, feature_data) = cf_help.get_custom_feature_data(class_method_str = afm,
             starting_dataframe = csv_dataframe,
             param_dict = dict(param_dict[afm]),
             addl_feature_method_kwargs = dict(afm_dict[afm]))
         fio = FeatureIO(csv_dataframe)
         csv_dataframe = fio.add_custom_features([feature_name], feature_data)
     # Add log10 features
     log10_dict = dict()
     log10_dict['fluence_n_cm2'] = dict()
     log10_dict['flux_n_cm2_sec'] = dict()
     for lkey in log10_dict.keys():
         orig_data = csv_dataframe[lkey]
         log10_data = np.log10(orig_data)
         fio = FeatureIO(csv_dataframe)
         csv_dataframe = fio.add_custom_features(["log(%s)" % lkey], log10_data)
     # Add normalizations
     norm_dict = dict()
     norm_dict['log(fluence_n_cm2)'] = dict()
     norm_dict['log(fluence_n_cm2)']['smin'] = 17
     norm_dict['log(fluence_n_cm2)']['smax'] = 25
     norm_dict['log(flux_n_cm2_sec)'] = dict()
     norm_dict['log(flux_n_cm2_sec)']['smin'] = 10
     norm_dict['log(flux_n_cm2_sec)']['smax'] = 15
     norm_dict['temperature_C'] = dict()
     norm_dict['temperature_C']['smin'] = 270
     norm_dict['temperature_C']['smax'] = 320
     for elem in ["P","C","Cu","Ni","Mn","Si"]:
         norm_dict["at_percent_%s" % elem] = dict()
         norm_dict["at_percent_%s" % elem]['smin'] = 0.0
         norm_dict["at_percent_%s" % elem]['smax'] = 1.717 #max Mn atomic percent
     for nkey in norm_dict.keys():
         fnorm = FeatureNormalization(csv_dataframe)
         scaled_feature = fnorm.minmax_scale_single_feature(nkey,
                             smin=norm_dict[nkey]['smin'],
                             smax=norm_dict[nkey]['smax'])
         fio = FeatureIO(csv_dataframe)
         csv_dataframe = fio.add_custom_features(["N(%s)" % nkey], scaled_feature)
     csv_dataframe.to_csv("%s.csv" % os.path.join(self.save_path, csvdest))
     return
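Note on the normalization step above: the implementation of minmax_scale_single_feature is not shown here, but given the fixed smin/smax bounds it presumably applies a linear min-max rescaling so that smin maps to 0 and smax to 1. A minimal sketch of that assumed behavior in plain pandas (the function name is a hypothetical stand-in):

import numpy as np
import pandas as pd

def minmax_scale(series, smin, smax):
    # Linear rescale: smin -> 0, smax -> 1; values outside the bounds
    # land outside [0, 1]. Hypothetical stand-in for
    # FeatureNormalization.minmax_scale_single_feature.
    return (series - smin) / (smax - smin)

df = pd.DataFrame({"log(fluence_n_cm2)": np.log10([1e18, 1e19, 1e20])})
df["N(log(fluence_n_cm2))"] = minmax_scale(df["log(fluence_n_cm2)"], 17, 25)
print(df)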
Example #2
 def test_parse_fromfile(self):
     Xdata, ydata, x_features, y_feature, dataframe = DataParser(
         configdict=self.configdict).parse_fromfile(datapath=self.datapath,
                                                    as_array=False)
     self.assertIsInstance(Xdata, pd.DataFrame)
     self.assertIsInstance(ydata, pd.Series)
     self.assertIsInstance(dataframe, pd.DataFrame)
     Xdata, ydata, x_features, y_feature, dataframe = DataParser(
         configdict=self.configdict).parse_fromfile(datapath=self.datapath,
                                                    as_array=True)
     self.assertIsInstance(Xdata, np.ndarray)
     self.assertIsInstance(ydata, np.ndarray)
     self.assertIsInstance(dataframe, pd.DataFrame)
     return
Example #3
 def _parse_input_data(self, data_path=""):
     if not os.path.isfile(data_path):
         raise OSError("No file found at %s" % data_path)
     Xdata, ydata, x_features, y_feature, dataframe = DataParser(
         configdict=self.configdict).parse_fromfile(datapath=data_path,
                                                    as_array=False)
     return Xdata, ydata, x_features, y_feature, dataframe
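Examples #2 and #3 pin down the contract of parse_fromfile: it returns the tuple (Xdata, ydata, x_features, y_feature, dataframe), with Xdata/ydata as a pandas DataFrame/Series when as_array=False and as NumPy arrays when as_array=True. A minimal sketch of an equivalent parser, assuming the target column is passed in directly (the real class presumably reads it from configdict['General Setup']['target_feature']):

import numpy as np
import pandas as pd

def parse_fromfile(datapath, target_feature, as_array=False):
    # Hypothetical re-implementation sketch, not the MASTML code.
    dataframe = pd.read_csv(datapath)
    x_features = [c for c in dataframe.columns if c != target_feature]
    Xdata = dataframe[x_features]
    ydata = dataframe[target_feature]
    if as_array:
        Xdata, ydata = np.asarray(Xdata), np.asarray(ydata)
    return Xdata, ydata, x_features, target_feature, dataframe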
Example #4
 def test_get_features(self):
     x_features, y_feature = DataParser(
         configdict=self.configdict).get_features(
             dataframe=self.df1,
             target_feature=self.target_feature,
             from_input_file=False)
     self.assertTrue(type(x_features) is list)
     self.assertTrue(y_feature == self.target_feature)
     return
Example #5
    def _split_csv_file(self):
        # Need the dataframe and the x and y features so the CSV can be split accordingly.
        data_path_list = []

        dataframe = DataParser(configdict=self.configdict).import_data(
            datapath=self.configdict['Data Setup']['Initial']['data_path'])
        y_feature = self.configdict['General Setup']['target_feature']
        other_features = []
        for column in dataframe.columns.values:
            if column not in y_feature:
                other_features.append(column)

        dataframe_x = dataframe.loc[:, other_features]
        count = 1
        for feature in y_feature:
            try:
                dataframe_y = dataframe.loc[:, feature]
            except KeyError:
                logging.error(
                    'The feature names in the csv and input files do not match')
                print(
                    'The feature names in the csv and input files do not match. Please fix feature names and re-run MASTML')
                sys.exit()
            dataframe_new = DataframeUtilities.merge_dataframe_columns(
                dataframe1=dataframe_x, dataframe2=dataframe_y)
            # Write the new dataframe to new CSV, and update data_path_list
            data_path_split = os.path.split(
                self.configdict['Data Setup']['Initial']['data_path'])
            filename = data_path_split[1].split(".csv")[0]
            data_path = os.path.join(data_path_split[0],
                                     "%s_%i.csv" % (filename, count))
            dataframe_new.to_csv(data_path, index=False)
            data_path_list.append(data_path)
            count += 1

        # Last, add file data paths that are not part of original CSV file to split
        for key in self.configdict['Data Setup'].keys():
            if key != 'Initial':
                data_path_list.append(
                    self.configdict['Data Setup'][key]['data_path'])

        return data_path_list
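The method above splits one CSV with several target columns into one CSV per target, each keeping all the x columns. The same effect in a few lines of standalone pandas (file and column names here are hypothetical):

import os
import pandas as pd

data_path = "data/initial.csv"           # hypothetical input path
targets = ["dbtt_shift", "hardening"]    # hypothetical target columns

df = pd.read_csv(data_path)
x_cols = [c for c in df.columns if c not in targets]
root, _ = os.path.splitext(data_path)
for i, target in enumerate(targets, start=1):
    # Each split file keeps all x columns plus exactly one target.
    df[x_cols + [target]].to_csv("%s_%i.csv" % (root, i), index=False)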
Example #6
    def _perform_feature_selection(self, dataframe, x_features, y_feature):
        for k, v in self.configdict['Feature Selection'].items():
            if k == 'remove_constant_features' and v == 'True':
                logging.info(
                    'FEATURE SELECTION: Removing constant features from your feature list'
                )
                dr = DimensionalReduction(dataframe=dataframe,
                                          x_features=x_features,
                                          y_feature=y_feature)
                dataframe = dr.remove_constant_features()
                x_features, y_feature = DataParser(
                    configdict=self.configdict).get_features(
                        dataframe=dataframe, target_feature=y_feature)

            if k == 'feature_selection_algorithm':
                logging.info(
                    'FEATURE SELECTION: Selecting features using a %s algorithm'
                    % v)
                model_to_use = str(self.configdict['Feature Selection']
                                   ['model_to_use_for_learning_curve'])
                fs = FeatureSelection(configdict=self.configdict,
                                      dataframe=dataframe,
                                      x_features=x_features,
                                      y_feature=y_feature,
                                      model_type=model_to_use)
                if v == 'sequential_forward_selection':
                    if int(self.configdict['Feature Selection']
                           ['number_of_features_to_keep']) <= len(x_features):
                        dataframe = fs.sequential_forward_selection(
                            number_features_to_keep=int(
                                self.configdict['Feature Selection']
                                ['number_of_features_to_keep']))
                    else:
                        logging.info(
                            'Warning: you have specified to keep more features than the total number of features in your dataset. Defaulting to keep all features in feature selection'
                        )
                        dataframe = fs.sequential_forward_selection(
                            number_features_to_keep=int(len(x_features)))
                if v == 'recursive_feature_elimination':
                    if int(self.configdict['Feature Selection']
                           ['number_of_features_to_keep']) <= len(x_features):
                        dataframe = fs.feature_selection(
                            feature_selection_type=
                            'recursive_feature_elimination',
                            number_features_to_keep=int(
                                self.configdict['Feature Selection']
                                ['number_of_features_to_keep']),
                            use_mutual_info=self.configdict[
                                'Feature Selection']['use_mutual_information'])
                    else:
                        logging.info(
                            'Warning: you have specified to keep more features than the total number of features in your dataset. Defaulting to keep all features in feature selection'
                        )
                        dataframe = fs.feature_selection(
                            feature_selection_type=
                            'recursive_feature_elimination',
                            number_features_to_keep=int(len(x_features)),
                            use_mutual_info=self.configdict[
                                'Feature Selection']['use_mutual_information'])
                    if self.configdict['Feature Selection'][
                            'generate_feature_learning_curve'] == 'True':
                        learningcurve = LearningCurve(
                            configdict=self.configdict,
                            dataframe=dataframe,
                            model_type=model_to_use)
                        logging.info(
                            'Generating a feature learning curve using a %s algorithm'
                            % v)
                        learningcurve.generate_feature_learning_curve(
                            feature_selection_algorithm=
                            'recursive_feature_elimination')
                if v == 'univariate_feature_selection':
                    if int(self.configdict['Feature Selection']
                           ['number_of_features_to_keep']) <= len(x_features):
                        dataframe = fs.feature_selection(
                            feature_selection_type=
                            'univariate_feature_selection',
                            number_features_to_keep=int(
                                self.configdict['Feature Selection']
                                ['number_of_features_to_keep']),
                            use_mutual_info=self.configdict[
                                'Feature Selection']['use_mutual_information'])
                    else:
                        logging.info(
                            'Warning: you have specified to keep more features than the total number of features in your dataset. Defaulting to keep all features in feature selection'
                        )
                        dataframe = fs.feature_selection(
                            feature_selection_type=
                            'univariate_feature_selection',
                            number_features_to_keep=int(len(x_features)),
                            use_mutual_info=self.configdict[
                                'Feature Selection']['use_mutual_information'])
                    if self.configdict['Feature Selection'][
                            'generate_feature_learning_curve'] == 'True':
                        learningcurve = LearningCurve(
                            configdict=self.configdict,
                            dataframe=dataframe,
                            model_type=model_to_use)
                        logging.info(
                            'Generating a feature learning curve using a %s algorithm'
                            % v)
                        learningcurve.generate_feature_learning_curve(
                            feature_selection_algorithm=
                            'univariate_feature_selection')
        return dataframe
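The remove_constant_features call above is not shown in this excerpt, but dropping zero-variance columns is short in plain pandas; a sketch of the presumable behavior (a hypothetical stand-in, assuming y_feature is a single column name):

import pandas as pd

def remove_constant_features(dataframe, y_feature):
    # Keep the target column plus any x column whose values vary.
    keep = [c for c in dataframe.columns
            if c == y_feature or dataframe[c].nunique() > 1]
    return dataframe[keep]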
Example #7
    def _create_data_dict(self):
        data_dict = dict()
        for data_name in self.data_setup.keys():
            data_path = self.configdict['Data Setup'][data_name]['data_path']

            logging.info(
                'Creating data dict for data path %s and data name %s' %
                (data_path, data_name))

            data_weights = self.data_setup[data_name]['weights']
            if 'labeling_features' in self.general_setup.keys():
                labeling_features = self._string_or_list_input_to_list(
                    self.general_setup['labeling_features'])
            else:
                labeling_features = None
            if 'target_error_feature' in self.general_setup.keys():
                target_error_feature = self.general_setup[
                    'target_error_feature']
            else:
                target_error_feature = None
            if 'grouping_feature' in self.general_setup.keys():
                grouping_feature = self.general_setup['grouping_feature']
            else:
                grouping_feature = None

            if 'Feature Generation' in self.configdict.keys():
                generate_features = self.configdict['Feature Generation'][
                    'perform_feature_generation'] in (True, "True")
            else:
                generate_features = False

            if 'Feature Normalization' in self.configdict.keys():
                normalize_x_features = self.configdict['Feature Normalization'][
                    'normalize_x_features'] in (True, "True")
                normalize_y_feature = self.configdict['Feature Normalization'][
                    'normalize_y_feature'] in (True, "True")
            else:
                normalize_x_features = False
                normalize_y_feature = False

            if 'Feature Selection' in self.configdict.keys():
                select_features = self.configdict['Feature Selection'][
                    'perform_feature_selection'] in (True, "True")
            else:
                select_features = False

            logging.info("Feature Generation: %s" % generate_features)
            logging.info("Feature Normalization (x_features): %s" %
                         normalize_x_features)
            logging.info("Feature Normalization (y_feature): %s" %
                         normalize_y_feature)
            logging.info("Feature Selection: %s" % select_features)
            # Parse input data file
            Xdata, ydata, x_features, y_feature, dataframe = self._parse_input_data(
                data_path)

            # Plot initial histogram of input target data
            DataframeUtilities().plot_dataframe_histogram(
                configdict=self.configdict,
                dataframe=dataframe,
                y_feature=y_feature)

            original_x_features = list(x_features)
            original_columns = list(dataframe.columns)
            logging.debug("original columns: %s" % original_columns)
            # Remove any missing rows from the dataframe; keep a copy so the
            # string columns can be restored after the merge below
            dataframe_orig_dropped_na = dataframe.dropna()

            # Save off label and grouping data
            dataframe_labeled = pd.DataFrame()
            dataframe_grouped = pd.DataFrame()
            if labeling_features is not None:
                dataframe_labeled = FeatureIO(
                    dataframe=dataframe).keep_custom_features(
                        features_to_keep=labeling_features,
                        y_feature=y_feature)
                if normalize_x_features:
                    dataframe_labeled, scaler = FeatureNormalization(
                        dataframe=dataframe_labeled,
                        configdict=self.configdict).normalize_features(
                            x_features=labeling_features, y_feature=y_feature)
            if grouping_feature is not None:
                dataframe_grouped = FeatureIO(
                    dataframe=dataframe).keep_custom_features(
                        features_to_keep=[grouping_feature],
                        y_feature=y_feature)

            # Generate additional descriptors, as specified in input file (optional)
            if generate_features:
                dataframe = self._perform_feature_generation(
                    dataframe=dataframe)
                # Note: x_features_NOUSE is required when starting from no features and doing feature generation. Not renaming for now. RJ 7/17
                Xdata, ydata, x_features_NOUSE, y_feature, dataframe = DataParser(
                    configdict=self.configdict).parse_fromdataframe(
                        dataframe=dataframe, target_feature=y_feature)

            else:
                Xdata, ydata, x_features, y_feature, dataframe = DataParser(
                    configdict=self.configdict).parse_fromdataframe(
                        dataframe=dataframe, target_feature=y_feature)

            # First remove features containing strings before doing feature normalization or other operations, but don't remove grouping features
            if generate_features:
                nonstring_x_features, dataframe_nostrings = MiscFeatureOperations(
                    configdict=self.configdict
                ).remove_features_containing_strings(
                    dataframe=dataframe, x_features=x_features_NOUSE)
                #Remove columns containing all entries of NaN
                dataframe_nostrings = dataframe_nostrings.dropna(axis=1,
                                                                 how='all')
                # Re-obtain x_feature list as some features may have been dropped
                Xdata, ydata, x_features_NOUSE, y_feature, dataframe_nostrings = DataParser(
                    configdict=self.configdict).parse_fromdataframe(
                        dataframe=dataframe_nostrings,
                        target_feature=y_feature)
            else:
                nonstring_x_features, dataframe_nostrings = MiscFeatureOperations(
                    configdict=self.configdict
                ).remove_features_containing_strings(dataframe=dataframe,
                                                     x_features=x_features)

            # Remove columns containing all entries of NaN
            dataframe_nostrings = dataframe_nostrings.dropna(axis=1, how='all')

            # Drop columns that still contain any NaN entries
            dataframe_nostrings = dataframe_nostrings.dropna(axis=1, how='any')

            # Re-obtain x_feature list as some features may have been dropped
            Xdata, ydata, x_features, y_feature, dataframe_nostrings = DataParser(
                configdict=self.configdict).parse_fromdataframe(
                    dataframe=dataframe_nostrings, target_feature=y_feature)

            logging.debug("pre-changes:%s" % dataframe_nostrings.columns)

            # Normalize features (optional)
            if normalize_x_features or normalize_y_feature:
                fn = FeatureNormalization(dataframe=dataframe_nostrings,
                                          configdict=self.configdict)
                dataframe_nostrings, scaler = fn.normalize_features(
                    x_features=x_features,
                    y_feature=y_feature,
                    normalize_x_features=normalize_x_features,
                    normalize_y_feature=normalize_y_feature)
                x_features, y_feature = DataParser(
                    configdict=self.configdict).get_features(
                        dataframe=dataframe_nostrings,
                        target_feature=y_feature)

            # Perform feature selection and dimensional reduction, as specified in the input file (optional)
            if select_features and (y_feature in dataframe_nostrings.columns):
                # Remove any additional columns that are not x_features to be fit to the data
                features = dataframe_nostrings.columns.values.tolist()
                features_to_remove = []
                for feature in features:
                    if feature not in x_features and feature not in y_feature:
                        features_to_remove.append(feature)
                dataframe_nostrings = FeatureIO(
                    dataframe=dataframe_nostrings).remove_custom_features(
                        features_to_remove=features_to_remove)
                dataframe_nostrings = self._perform_feature_selection(
                    dataframe=dataframe_nostrings,
                    x_features=x_features,
                    y_feature=y_feature)
                x_features, y_feature = DataParser(
                    configdict=self.configdict).get_features(
                        dataframe=dataframe_nostrings,
                        target_feature=y_feature)

            logging.debug("post-removal:%s" % dataframe_nostrings.columns)
            # Combine the input dataframe, which has undergone feature generation and normalization, with the grouped and labeled features of original dataframe
            # First, need to generate dataframe that only has the grouped and labeled features
            grouping_and_labeling_features = []
            duplicate_features = []
            if 'grouping_feature' in self.configdict['General Setup'].keys():
                grouping_and_labeling_features.append(grouping_feature)
            if 'labeling_features' in self.configdict['General Setup'].keys():
                for feature in labeling_features:
                    grouping_and_labeling_features.append(feature)
                    if feature in x_features:
                        if feature not in duplicate_features:
                            duplicate_features.append(feature)

            # Now merge dataframes
            dataframe_labeled_grouped = DataframeUtilities(
            ).merge_dataframe_columns(dataframe1=dataframe_labeled,
                                      dataframe2=dataframe_grouped)
            dataframe_merged = DataframeUtilities().merge_dataframe_columns(
                dataframe1=dataframe_nostrings,
                dataframe2=dataframe_labeled_grouped)

            # Add string columns back in
            string_x_features = [f for f in x_features
                                 if f not in nonstring_x_features]
            logging.debug("string features: %s" % string_x_features)
            for string_x_feature in string_x_features:
                dataframe_merged[string_x_feature] = dataframe_orig_dropped_na[
                    string_x_feature]

            # Need to remove duplicate features after merging.
            logging.debug("merged:%s" % dataframe_merged.columns)
            dataframe_rem = FeatureIO(
                dataframe=dataframe_merged).remove_duplicate_columns()

            myXdata, myydata, myx_features, myy_feature, dataframe_final = DataParser(
                configdict=self.configdict).parse_fromdataframe(
                    dataframe=dataframe_rem, target_feature=y_feature)
            combined_x_features = list()
            logging.debug("total features:%s" % myx_features)
            for feature in myx_features:
                if (feature in original_x_features) or not (
                        feature in original_columns
                ):  #originally designated, or created from feature generation
                    combined_x_features.append(feature)
            logging.debug("combined x features:%s" % combined_x_features)
            data_dict[data_name] = DataHandler(
                data=dataframe_final,
                input_data=dataframe_final[combined_x_features],
                target_data=myydata,
                input_features=combined_x_features,
                target_feature=myy_feature,
                target_error_feature=target_error_feature,
                labeling_features=labeling_features,
                grouping_feature=grouping_feature)
            logging.info('Parsed the input data located under %s' % data_path)

            # Get dataframe stats
            DataframeUtilities.save_all_dataframe_statistics(
                dataframe=dataframe_final, configdict=self.configdict)

        return data_dict, y_feature
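Example #7 touches most of the configdict sections used throughout these examples. Below is a minimal skeleton assembled only from keys that actually appear above; every value is hypothetical:

configdict = {
    'General Setup': {
        'target_feature': ['dbtt_shift'],       # hypothetical target list
        'labeling_features': 'alloy_name',      # optional
        'grouping_feature': 'alloy_group',      # optional
    },
    'Data Setup': {
        'Initial': {'data_path': 'data/initial.csv', 'weights': False},
    },
    'Feature Generation': {'perform_feature_generation': 'True'},
    'Feature Normalization': {
        'normalize_x_features': 'True',
        'normalize_y_feature': 'False',
    },
    'Feature Selection': {
        'perform_feature_selection': 'True',
        'remove_constant_features': 'True',
        'feature_selection_algorithm': 'recursive_feature_elimination',
        'number_of_features_to_keep': '10',
        'use_mutual_information': 'False',
        'model_to_use_for_learning_curve': 'linear_model',
        'generate_feature_learning_curve': 'False',
    },
}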
Example #8
 def test_get_data(self):
     # Smoke test: only checks that get_data runs without raising
     Xdata, ydata = DataParser(configdict=self.configdict).get_data(
         dataframe=self.df1,
         x_features=self.x_features,
         y_feature=self.target_feature)
     return
Example #9
 def test_import_data(self):
     dataframe = DataParser(configdict=self.configdict).import_data(
         datapath=self.datapath)
     self.assertIsInstance(dataframe, pd.DataFrame)
     return
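For completeness: the fixtures referenced in the test examples (self.configdict, self.datapath, self.df1, self.x_features, self.target_feature) are not shown in these excerpts. A minimal, hypothetical setUp that would wire them:

import unittest
import pandas as pd

class TestDataParser(unittest.TestCase):
    # Hypothetical fixture wiring; the real tests presumably load the
    # configdict from a MASTML input file and point datapath at a CSV
    # shipped with the test suite.

    def setUp(self):
        self.datapath = 'testing_data.csv'     # hypothetical file
        self.target_feature = 'dbtt_shift'     # hypothetical target
        self.x_features = ['x1', 'x2']
        self.configdict = {
            'General Setup': {'target_feature': self.target_feature}}
        self.df1 = pd.DataFrame({'x1': [0.0, 1.0],
                                 'x2': [1.0, 0.0],
                                 self.target_feature: [1.0, 2.0]})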