def select_features_without_label(features: pd.DataFrame, missing_threshold=0.7, correlation_threshold=0.95) -> pd.DataFrame:
    """Prune *features* using the label-free ``FeatureSelector`` checks.

    Two checks are run: ``identify_missing`` (with ``missing_threshold``)
    and ``identify_single_unique``. The columns they flag are then removed.

    Args:
        features: Feature table handed to ``FeatureSelector``.
        missing_threshold: Forwarded to ``FeatureSelector.identify_missing``.
        correlation_threshold: Accepted for interface compatibility only.
            The collinearity step is disabled here, so this value is
            never read.

    Returns:
        The result of ``FeatureSelector.remove`` for the ``missing`` and
        ``single_unique`` methods.
    """
    selector = FeatureSelector(data=features)
    selector.identify_missing(missing_threshold)
    selector.identify_single_unique()

    applied_methods = ['missing', 'single_unique']
    return selector.remove(methods=applied_methods)
def select_features_without_label(features: pd.DataFrame, missing_threshold=0.90, correlation_threshold=1) -> pd.DataFrame:
    """Prune *features* using the label-free ``FeatureSelector`` checks.

    ``identify_missing`` and ``identify_single_unique`` always run. The
    collinearity check is added only when ``correlation_threshold`` is
    strictly below 1, so the default of 1 leaves it switched off.

    Args:
        features: Feature table handed to ``FeatureSelector``.
        missing_threshold: Forwarded to ``FeatureSelector.identify_missing``.
            It is also echoed to stdout.
        correlation_threshold: Forwarded to
            ``FeatureSelector.identify_collinear`` when it is below 1.

    Returns:
        The result of ``FeatureSelector.remove`` for the methods that ran.
    """
    # NOTE(review): looks like a leftover debug print; kept because callers
    # may currently see this output.
    print(missing_threshold)

    selector = FeatureSelector(data=features)
    selector.identify_missing(missing_threshold)
    selector.identify_single_unique()

    applied_methods = ['missing', 'single_unique']
    if correlation_threshold < 1:
        selector.identify_collinear(correlation_threshold)
        applied_methods.append("collinear")

    return selector.remove(methods=applied_methods)
def runFeatureSelector(self, df):
    """Run every ``FeatureSelector`` check on *df* and dump the reports to CSV.

    The selector is built from *df* with ``self.targets`` as labels. After
    each identification step the matching record is written to a CSV file,
    followed by the feature importances and a summary of ``fs.ops``.

    Args:
        df: Feature table to analyse.

    Returns:
        ``FeatureSelector.remove(methods='all')`` when ``self.drop == 1``;
        otherwise *df* exactly as it was passed in.
    """
    # NOTE(review): the output paths below are hard-coded, Windows-style and
    # relative to the current working directory.
    logging.info("Running Feature Selection")
    selector = FeatureSelector(data=df, labels=self.targets)

    # Columns with too many missing values.
    selector.identify_missing(missing_threshold=0.6)

    # Highly correlated column pairs.
    selector.identify_collinear(correlation_threshold=0.98)
    selector.record_collinear.to_csv(".\\utils\\csv\\record_collinear.csv")

    # Columns holding a single unique value.
    selector.identify_single_unique()
    selector.record_single_unique.to_csv(".\\utils\\csv\\record_single_unique.csv")

    # Features the model assigns zero importance.
    selector.identify_zero_importance(
        task='classification',
        eval_metric='multi_logloss',
        n_iterations=10,
        early_stopping=True,
    )
    selector.record_zero_importance.to_csv(".\\utils\\csv\\record_zero_importance.csv")

    # Features outside the 0.99 cumulative-importance cut.
    selector.identify_low_importance(cumulative_importance=0.99)
    selector.feature_importances.to_csv(".\\utils\\csv\\feature_importance.csv")

    # One row per operation, listing what each one flagged.
    pd.DataFrame.from_dict(selector.ops, orient='index').to_csv(".\\utils\\csv\\summary.csv")

    # Only drop the flagged features when the drop flag is set to 1.
    if self.drop != 1:
        return df
    return selector.remove(methods='all')
def remove_unnecessary_features(self, auto=False):
    """Shrink ``self.processed_data`` to the features worth keeping.

    Args:
        auto: When true, drop the columns listed in
            ``self.predefined_skip_features`` and do nothing else. When
            false, run every ``FeatureSelector`` check against the
            ``"label"`` column and remove whatever is flagged.

    Returns:
        None. ``self.processed_data`` is replaced with the reduced frame.
        In the non-auto path the ``"label"`` column is re-attached after
        the removal.
    """
    if auto:
        self.processed_data = self.processed_data.drop(
            columns=self.predefined_skip_features)
        return

    target = self.processed_data["label"]
    selector = FeatureSelector(
        data=self.processed_data.drop("label", axis=1),
        labels=target,
    )

    selector.identify_missing(missing_threshold=0.6)
    selector.identify_collinear(correlation_threshold=0.98)
    selector.identify_zero_importance(
        task='classification',
        eval_metric='auc',
        n_iterations=10,
        early_stopping=False,
    )
    selector.identify_low_importance(cumulative_importance=0.99)
    selector.identify_single_unique()

    # remove() returns a new frame without the flagged features; the label
    # column was excluded from the selector input, so put it back afterwards.
    self.processed_data = selector.remove(methods='all')
    self.processed_data["label"] = target
def select_best_features(data_file_path, saveto_path="Default"):
    """Reduce the feature set of a labelled CSV and save the result.

    The input is first passed through ``strip_header``, which yields the path
    of the file that is actually parsed (presumably a header-stripped working
    copy; confirm against ``strip_header``). Features are selected with
    ``FeatureSelector`` using the ``Label`` column as the target. The reduced
    table is written to *saveto_path* with one metadata line prepended:
    ``<row count>,<feature count>``, where the feature count excludes the
    ``Label`` column. Finally the working copy is deleted.

    Args:
        data_file_path: Path of the source data file.
        saveto_path: Destination CSV path. The sentinel ``"Default"`` derives
            the destination from *data_file_path* via
            ``replace_ext(data_file_path, '_reduced.csv')``.

    Returns:
        None. The result is the file written to *saveto_path*.
    """
    import warnings

    mod_data_file_path = strip_header(data_file_path)
    if saveto_path == "Default":
        saveto_path = replace_ext(data_file_path, '_reduced.csv')

    X = pd.read_csv(mod_data_file_path)
    y = X['Label']
    X = X.drop(columns=['Label'])

    feature_selector = FeatureSelector(data=X, labels=y)
    feature_selector.identify_single_unique()
    feature_selector.identify_collinear(correlation_threshold=0.98)
    feature_selector.identify_zero_importance(
        task='classification',
        eval_metric='auc',
        n_iterations=10,
        early_stopping=True,
    )
    feature_selector.identify_low_importance(cumulative_importance=0.99)

    X_dash = feature_selector.remove(
        methods=[
            'single_unique', 'collinear', 'zero_importance', 'low_importance'
        ],
        keep_one_hot=False,
    )
    X_dash['Label'] = y
    X_dash.to_csv(saveto_path, index=False)

    # Prepend "<rows>,<features>" as the first line of the saved file.
    # The "- 1" keeps the Label column out of the feature count.
    meta_data = [str(X_dash.shape[0]), str(X_dash.shape[1] - 1)]
    with open(saveto_path, 'r') as fh:
        contents = fh.read()
    contents = ','.join(meta_data) + '\n' + contents
    with open(saveto_path, 'w') as fh:
        fh.write(contents)

    # Delete the working copy without going through a shell: a shell command
    # built from the path breaks on spaces/metacharacters and needs `rm`.
    try:
        os.remove(mod_data_file_path)
    except FileNotFoundError:
        # Same tolerance as `rm -f`: an already-missing file is not an error.
        pass
    except OSError as exc:
        # Cleanup stays best-effort; the reduced file has already been saved.
        warnings.warn(
            "could not remove {!r}: {}".format(mod_data_file_path, exc)
        )
#-- Identify redundant features if(USE_LEARNER_FOR_FEATURE_SELECTION): # NOT COMPLETE fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.98, 'task': 'classification', 'eval_metric': 'auc', 'cumulative_importance': 0.99}) #-- Get valuable features X = fs.remove(methods = 'all', keep_one_hot = True) else: #-- Features with missing values greater than threshold fs.identify_missing(missing_threshold = MISSING_VALUE_THRESHOLD) #-- Correlated features fs.identify_collinear(correlation_threshold = CORRELATION_THRESHOLD) #-- Single unique value fs.identify_single_unique() #-- TO get keys fs.ops.keys() missing_features = list(fs.ops['missing']) corelated_features = list(fs.ops['collinear']) single_value = list(fs.ops['single_unique']) r = set(flatten([missing_features,corelated_features,single_value])) #X = df_feats.drop(r, axis=1) rnk_pval = getPvalStats(df, 'target') feat_types = pd.DataFrame(df_feats.dtypes)