def prepare_data():
    """Read, clean and prepare the training/testing datasets.

    Reads the training/testing CSVs, removes collinear features from the
    training set (mirroring the selection on the test set), shuffles both,
    and builds the model-ready arrays via
    clean_nominals_and_create_our_datasets().

    Returns:
        train_x, train_y, test_x, test_y, labels
    """
    # read our csv files ("id" is just a row index, not a feature)
    # FIX: removed the unused read of "UNSW_NB15_features.csv" -- the
    # resulting frame was never referenced.
    training_df = pd.read_csv("training.csv").drop("id", axis=1)
    testing_df = pd.read_csv("testing.csv").drop("id", axis=1)

    # drop one feature of every highly-correlated pair (threshold 0.85)
    fs = FeatureSelector(data=training_df)
    fs.identify_collinear(correlation_threshold=0.85)
    training_df = fs.remove(methods=['collinear'], keep_one_hot=True)

    # keep the test set aligned with the surviving training columns
    columnlist = list(training_df)
    testing_df = testing_df[columnlist]

    # shuffle rows so batches are not ordered by attack category
    training_df = training_df.sample(frac=1)
    testing_df = testing_df.sample(frac=1)

    train_x, train_y, test_x, test_y, labels = clean_nominals_and_create_our_datasets(training_df, testing_df)

    # drop the target columns only to report the final feature list
    training_df = training_df.drop(["attack_cat", "label"], axis=1)
    print("The features we will use are: ", np.array(list(training_df)))
    return train_x, train_y, test_x, test_y, labels
def test():
    """Smoke-test collinearity detection on the menori spreadsheet."""
    frame = pd.read_excel('../data/menori.xlsx')
    target = frame["nums"]
    print(frame.head())
    frame = frame.drop(columns=["nums"])

    selector = FeatureSelector(data=frame, labels=target)
    selector.identify_collinear(correlation_threshold=0.98)
    # features flagged as collinear above the 0.98 threshold
    print(selector.ops['collinear'])
def select_features_without_label(features: pd.DataFrame, missing_threshold=0.99, correlation_threshold=1.0) -> pd.DataFrame:
    """Prune features without using any label.

    Removes columns that are mostly missing or single-valued; when
    ``correlation_threshold`` < 1, collinear columns are removed as well.

    Returns the pruned DataFrame.
    """
    selector = FeatureSelector(data=features)
    selector.identify_missing(missing_threshold)
    selector.identify_single_unique()

    methods = ['missing', 'single_unique']
    if correlation_threshold < 1:
        selector.identify_collinear(correlation_threshold)
        methods.append("collinear")
    return selector.remove(methods=methods)
def transform_to_nominal():
    """Rebuild the post-selection column list and nominal-column metadata.

    Repeats the same collinearity-based pruning used for training, then
    returns (labels, nominal_cols, columnList) from the cleaned frame.
    """
    # read our csv files ("id" is just a row index)
    training_df = pd.read_csv("training.csv").drop("id", axis=1)

    # Feature selector: drop one of every collinear pair (threshold 0.85)
    selector = FeatureSelector(data=training_df)
    selector.identify_collinear(correlation_threshold=0.85)
    training_df = selector.remove(methods=['collinear'], keep_one_hot=True)

    training_df = training_df.sample(frac=1)  # shuffle rows
    training_df = training_df.drop(["attack_cat", "label"], axis=1)

    columnList = list(training_df)
    labels, nominal_cols = retrieve_classes(training_df)
    return labels, nominal_cols, columnList
def clean_data(df, use_fs=True):
    """Encode, prune and (optionally) analyse the features of ``df``.

    NOTE(review): this function mutates ``df`` in place via
    ``drop(..., inplace=True)``, and when ``use_fs`` is true it references
    a name ``y`` that is not defined in this function -- confirm ``y``
    exists at module level, otherwise this raises NameError.

    Returns the (possibly reassigned) DataFrame.
    """
    # convert object to categorical data
    if 'thal' in df.columns:
        string_labels = ['thal']
        df[string_labels] = df[string_labels].apply(categorize_label, axis=0)
        df = pd.get_dummies(df, drop_first=True)
    # drop some columns
    to_drop = ['fasting_blood_sugar_gt_120_mg_per_dl', 'slope_of_peak_exercise_st_segment']
    df.drop(to_drop, axis=1, inplace=True)
    # normalize high variance columns
    # high_variance_cols = ['resting_blood_pressure']
    # df[high_variance_cols] = np.log(df[high_variance_cols])
    # convert int to float
    # df = df.apply(lambda c : c.astype(float), axis=1)
    if use_fs:
        # rank features by LightGBM importance (analysis only; nothing is
        # removed from df here)
        fs = FeatureSelector(data=df, labels=y)
        fs.identify_zero_importance(task='classification', eval_metric='auc', n_iterations=10, early_stopping=False)
        fs.plot_feature_importances(threshold=0.99, plot_n=14)
        # print(train_removed_all_once)
    # standard scaling
    # scaler = RobustScaler()
    # df[df.columns] = scaler.fit_transform(df[df.columns])
    # print(df.info())
    # print('\nFeature Selector analysis')
    return df
def feature_engineering(self, x_data, y_data, train=None):
    """Feature selection step.

    When ``train`` is truthy, runs FeatureSelector on ``x_data``, records
    the columns to drop (also pickled to 'drop_columns.pkl') and stores the
    pruned frame on ``self.feature_df``. Otherwise it replays the recorded
    drops on the existing ``self.feature_df``.

    NOTE(review): the else branch assumes ``self.drop_columns`` and
    ``self.feature_df`` were set by a previous training call -- confirm.
    Also note ``if train:`` tests truthiness, not ``train is not None``.
    """
    # feature selection
    cols = x_data.columns
    # NOTE(review): the column-group slices below are computed but never
    # used in this method -- presumably kept to document the feature layout.
    # consumption features
    consume_col = cols[0:10]
    # recruiting features
    recruit_col = cols[10:22]
    # acceleration features
    acceleration_col = cols[22:32]
    # building features
    build_col = cols[32:48]
    # technology features
    science_col = cols[48:97]
    # pvp
    pvp_col = cols[97:103]
    # payment features
    pay_col = cols[103:106]
    # label
    # label_col = cols[108]
    if train:
        fs = FeatureSelector(data=x_data, labels=DataFrame(y_data))
        fs.identify_all(
            selection_params={
                'missing_threshold': 0.6,
                'correlation_threshold': 0.98,
                'task': 'classification',
                'eval_metric': 'auc',
                'cumulative_importance': 0.99
            })
        # remember which columns were removed so inference can replay it
        self.drop_columns = fs.ops
        with open('drop_columns.pkl', 'wb') as file:
            pickle.dump(self.drop_columns, file)
        self.feature_df = fs.remove(methods='all', keep_one_hot=False)
    else:
        # replay the recorded drops on the existing feature frame
        drop_list = []
        for key in self.drop_columns.keys():
            for value in self.drop_columns[key]:
                drop_list.append(value)
        self.feature_df.drop(drop_list, axis=1, inplace=True)
    print(self.drop_columns)
def select_top_features(train_data):
    """Return (zero-importance, low-importance) feature-name lists.

    ``train_data`` is a (features, labels) pair; importances come from a
    LightGBM classifier scored with AUC, 6 runs with early stopping.
    """
    data, labels = train_data[0], train_data[1]
    selector = FeatureSelector(data, labels)
    selector.identify_zero_importance(task='classification',
                                      eval_metric='auc',
                                      n_iterations=6,
                                      early_stopping=True)
    selector.identify_low_importance(cumulative_importance=0.99)
    return selector.ops['zero_importance'], selector.ops['low_importance']
def heatmap_ftr_slcor(df):
    """Heatmap feature-selector function: label-encode object columns,
    compute the correlation matrix, flatten it to (group, variable, value)
    rows and write it to indu.csv for the Django heatmap view.

    Always returns None (output is the CSV side effect).
    """
    le = {}
    le_df = df.drop(columns='ANNEE')
    le_df['ADR_CP'] = le_df['ADR_CP'].astype(object)
    for col in le_df.columns:
        ### cai's code: label-encode non-null values of object columns
        if le_df.dtypes[col] == 'object':
            le_df[col] = le_df[col].str.upper()
            le[col] = LabelEncoder()
            result = le[col].fit_transform(le_df[le_df[col].notnull()][col])
            le_df.loc[le_df[le_df[col].notnull()].index, col] = result
    # NOTE(review): ``fs`` is created but never used below -- confirm it is
    # needed (dead code?).
    fs = FeatureSelector(data=le_df, labels=df['REMUNERATION'])
    cor_out = le_df.corr()
    #cor_out.drop(columns=['idCSV','ID_ANO','id','PAYS','SUJET','idCSVDescript'],inplace=True) ## dropping unwanted columns
    cor_out.drop(columns=[
        'idCSV', 'ID_ANO', 'id', 'PAYS', 'SUJET', 'idCSVDescript',
        'SITE_LON', 'SITE_LAT', 'ADR_LAT', 'ADR_LON', 'ENT_LAT', 'ENT_LON'
    ], inplace=True)
    # print(cor_out.columns)
    new_df = pd.DataFrame(columns=['group', 'variable', 'value'])  # new dataframe
    new_df.columns
    k = 0
    li = list(cor_out.columns)
    # print(li)
    length = len(li)
    #cor_out.reset_index(inplace=True, drop=True)
    i_ind = 0
    k = 0
    while i_ind < length:
        ## group all the variables as shown in "indu.csv" so they can be fed
        ## to the heatmap (one row per (group, variable) pair)
        #print(li[i_ind])
        for i in li:
            new_df.loc[k, 'group'] = li[i_ind]
            new_df.loc[k, 'variable'] = i
            new_df.loc[k, 'value'] = cor_out.loc[i, li[i_ind]]
            ##### since all the values are very small, they don't show a
            ##### significant difference in the heatmap; a x10 scaling was
            ##### considered .... THIS HAS TO BE CHECKED
            k = k + 1
        i_ind = i_ind + 1
    # print(new_df.head(3))
    new_df.to_csv(os.path.join(BASE_DIR, 'DjangoWeb V1\Interface\static\indu.csv'), index=False)
    return None
def Bestfeature_from_cummulative_importance(inFile, outFile):
    """Drop zero-importance features from a TSV dataset and save the result.

    Reads ``inFile`` (tab-separated, with a 'class_label' column), ranks the
    features with LightGBM, removes the zero-importance ones, re-attaches
    the labels and writes the pruned table to ``outFile``.

    NOTE(review): low-importance features are identified (cumulative 0.99)
    and the cut-off index is printed, but only 'zero_importance' features
    are actually removed -- confirm that is intended.
    """
    df = pd.read_csv(inFile, sep='\t')
    print(df.shape)
    train_labels = df['class_label']
    train = df.drop(columns=['class_label'])
    fs = FeatureSelector(data=train, labels=train_labels)
    fs.identify_zero_importance(task='classification', eval_metric='auc', n_iterations=10, early_stopping=True)
    zero_importance_features = fs.ops['zero_importance']
    #fs.plot_feature_importances(threshold = 0.99, plot_n = 12)
    # index of the first feature past 99% cumulative importance
    importance_index = np.min(
        np.where(fs.feature_importances['cumulative_importance'] > 0.99))
    fs.identify_low_importance(cumulative_importance=0.99)
    print(importance_index)
    train_removed_all = fs.remove(methods=['zero_importance'], keep_one_hot=False)
    # put the labels back so the output file is self-contained
    train_removed_all = pd.concat([train_removed_all, train_labels], axis=1)
    train_removed_all.to_csv(outFile, sep='\t', index=None)
def featuresSel(train, train_labels, name):
    """Plot and save the feature-importance curve for a dataset.

    Arguments:
        train {pandas.Dataframe} -- Dataset
        train_labels {numpy.ndarray} -- Labels for the dataset
        name {string} -- Name for file
    """
    print('>>> Feature Selection...')
    selector = FeatureSelector(data=train, labels=train_labels)
    selector.identify_zero_importance(task='classification',
                                      eval_metric='auc',
                                      n_iterations=10,
                                      early_stopping=True)

    plt.figure(figsize=(15, 15))
    # NOTE: this project's plot_feature_importances accepts a ``name`` kwarg
    selector.plot_feature_importances(threshold=0.99, plot_n=50, name=name)
    plt.savefig('../../data/figures/rank_{}.png'.format(name))
    plt.close()
y_dev = np.array([x - 1 for x in y_dev]) # In[11]: #Train Distribution d = {'y_train': y_train} df_y_train = pd.DataFrame(d) print(df_y_train["y_train"].value_counts()) df_y_train["y_train"].value_counts().plot.bar(figsize=(10, 8), rot=45) # ## Features # In[12]: fs = FeatureSelector(data=x_train, labels=y_train) # ### Missing Values # The first method for finding features to remove is straightforward: find features with a fraction of missing values above a specified threshold. # In[13]: fs.identify_missing(missing_threshold=0.1) # ### Collinear Features # Collinear features are features that are highly correlated with one another. In machine learning, these lead to decreased generalization performance on the test set due to high variance and less model interpretability. # In[14]: fs.identify_collinear(correlation_threshold=0.70) fs.plot_collinear()
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 2 20:34:58 2019

@author: Animesh
"""
features = []
# FIX: ``results`` was never initialised, so results.append() below raised
# NameError on first use.
results = []
file = open('Training Dataset.arff').read()
# FIX: renamed ``list`` -> ``lines`` to avoid shadowing the builtin.
lines = file.split('\n')
data = np.array(lines)
data1 = [i.split(',') for i in data]
data1 = data1[0:-1]  # last split entry is the trailing empty line
for i in data1:
    results.append(i[30])  # column 30 holds the class label
data1 = np.array(data1)
features = data1[:, :-1]
# hand-picked subset of the 30 phishing-site features
x = features[:, [
    0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16, 17, 22, 23, 24, 25,
    27, 29
]]
y = []
from feature_selector import FeatureSelector
# Features are in train and labels are in train_labels
# NOTE(review): ``train`` and ``train_labels`` are not defined in this
# snippet -- confirm they are created elsewhere before this line runs.
fs = FeatureSelector(data=train, labels=train_labels)
from evaluation import tpr_weight_funtion_lc

# Load train/test tables; 'Tag' is the binary target, 'UID' the row id.
path0 = '../results/'
test = pd.read_csv(path0 + 'test.csv')
train = pd.read_csv(path0 + 'train.csv')
print('tag_value_counts', train['Tag'].value_counts())
train_labels = train['Tag']
train = train.drop(['UID', 'Tag'], axis=1)
X_loc_test = test.drop('UID', axis=1)
from feature_selector import FeatureSelector
# Features are in train and labels are in train_labels
fs = FeatureSelector(data=train, labels=train_labels)
# missing-value statistics (drop threshold 0.5)
fs.identify_missing(0.5)
df_miss_value = fs.missing_stats.sort_values('missing_fraction', ascending=False)
print('df_miss_value', df_miss_value.head(15))
missing_features = fs.ops['missing']
print('missing_features to remove', missing_features[:20])
# single-unique-value feature statistics
fs.identify_single_unique()
print('fs.plot_unique()', fs.plot_unique())
fs.identify_collinear(0.95)
print('plot_collinear()', fs.plot_collinear())
x = train_data.drop(['isNew', 'target'], axis=1) #观察数据中residentAddr的编码方式,重新构造了特征 df.loc[df['isNew'] == 0, 'residentAddr'] = df[df['isNew'] == 0]['residentAddr'].apply( lambda x: x if x == -999 else x - 300000) #特征选择,特征选择的参数解释: """ missing_threshold表示数据特征缺失值比例阈值,当缺失值比例超过0.6时则删除该特征 correlation_threshold表示特征之间的相关性 task指的是进行的任何,eval_metric表示使用的评价指标 cumulative_importance指的是按特征重要性排序后的特征累加,看多少个特征重要性累加可以达到0.95 """ fs = FeatureSelector(data=x, labels=y) fs.identify_all( selection_params={ 'missing_threshold': 0.6, 'correlation_threshold': 0.9, 'task': 'regression', 'eval_metric': 'mse', 'cumulative_importance': 0.95 }) choose = fs.remove(methods=['missing', 'single_unique', 'zero_importance'], keep_one_hot=True) #根据选择得到的特征集来得到训练数据和测试数据集 x = x[choose.columns.values] X_predict = df_predict[choose.columns.values]
from sklearn.feature_selection import SelectKBest, chi2
import pandas as pd
import numpy as np

# Per-team advanced-stats columns (home/away variant kept for reference):
# columns = ['A_TS%', 'A_eFG%', 'A_3PAr', 'A_FTr', 'A_ORB%', 'A_DRB%',
#            'A_TRB%', 'A_AST%', 'A_STL%', 'A_BLK%', 'A_TOV%', 'A_ORtg', 'A_DRtg',
#            'H_TS%', 'H_eFG%', 'H_3PAr', 'H_FTr', 'H_ORB%', 'H_DRB%',
#            'H_TRB%', 'H_AST%', 'H_STL%', 'H_BLK%', 'H_TOV%', 'H_ORtg', 'H_DRtg']
columns = ['TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
           'STL%', 'BLK%', 'TOV%', 'ORtg', 'DRtg']
features = pd.DataFrame(adv_diff_features(None))
labels = pd.DataFrame(adv_diff_labels())
features.columns = columns
labels.columns = ['POINT_DIFF']
print(len(features), len(labels))

fs = FeatureSelector(data=features, labels=labels)
fs.identify_missing(missing_threshold=0.9)
fs.identify_collinear(correlation_threshold=0.5)
fs.plot_collinear()

# FIX: was ``labels[:,]`` -- indexing a DataFrame with a tuple containing a
# slice raises; pass the frame itself.
fs2 = FeatureSelector(data=features, labels=labels)
fs2.identify_zero_importance(eval_metric='l2', task='regression')
# fs2.identify_low_importance()
print(fs.record_collinear.head())
#################################### #-- FEATURE SELECTION #################################### #-- Separate features from labels y = df['target'] train_labels = y df_feats = df.drop(columns = ['target']) #-- Create an instance fs = FeatureSelector(data = df_feats, labels = train_labels) #-- Identify redundant features if(USE_LEARNER_FOR_FEATURE_SELECTION): # NOT COMPLETE fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.98, 'task': 'classification', 'eval_metric': 'auc', 'cumulative_importance': 0.99}) #-- Get valuable features X = fs.remove(methods = 'all', keep_one_hot = True) else: #-- Features with missing values greater than threshold fs.identify_missing(missing_threshold = MISSING_VALUE_THRESHOLD) #-- Correlated features fs.identify_collinear(correlation_threshold = CORRELATION_THRESHOLD)
# Map categorical degree/location columns to numeric codes using the
# *_dic lookup tables defined elsewhere in this file.
# NOTE(review): ``.replace(..., inplace=True)`` on a ``.loc`` slice relies on
# the slice being a view -- confirm this works on the pandas version in use.
x.loc[:, 'county'].replace(county_dic.keys(), county_dic.values(), inplace=True)
x.loc[:, 'state_perm'].replace(state_perm_dic.keys(), state_perm_dic.values(), inplace=True)
x.loc[:, 'major_type10'].replace(major_type10_dic.keys(), major_type10_dic.values(), inplace=True)
x.loc[:, 'major_basic'].replace(major_basic_dic.keys(), major_basic_dic.values(), inplace=True)
x.loc[:, 'DegreeCompletionTermDescr'].replace(DegreeCompletionTermDescr_dic.keys(), DegreeCompletionTermDescr_dic.values(), inplace=True)
x.loc[:, 'DegreeAcadPlan'].replace(DegreeAcadPlan_dic.keys(), DegreeAcadPlan_dic.values(), inplace=True)
x.loc[:, 'DegreeDeptName'].replace(DegreeDeptName_dic.keys(), DegreeDeptName_dic.values(), inplace=True)
x.loc[:, 'DegreeSchoolCollegeName'].replace(DegreeSchoolCollegeName_dic.keys(), DegreeSchoolCollegeName_dic.values(), inplace=True)
x.loc[:, 'DegreeAcadProgramDescr'].replace(DegreeAcadProgramDescr_dic.keys(), DegreeAcadProgramDescr_dic.values(), inplace=True)
x.loc[:, 'DegreeSubPlan'].replace(DegreeSubPlan_dic.keys(), DegreeSubPlan_dic.values(), inplace=True)
# fill Nan
# x = x.fillna(0)
# print(x)
# Features are in train and labels are in labels
fs = FeatureSelector(data=x, labels=label)
# missing-feature analysis
fs.identify_missing(missing_threshold=0.6)
# show the columns with the most missing data
print(fs.missing_stats[:10])
missing_features = fs.ops['missing']
print(missing_features[:5])
fs.plot_missing()
plt.show()
# collinear-feature analysis:
# for each correlated pair, one feature is marked for removal
fs.identify_collinear(correlation_threshold=0.7)
# fs.plot_collinear(plot_all=True)
# plt.show()
# list of collinear features to remove
                          max_depth=2, verbose=1, n_jobs=1)
# keep only columns that are not CHANGE_TYPE one-hot indicators
col_m = []
for col in feature:
    if 'CHANGE_TYPE' not in col:
        col_m.append(col)
feature_matrix = feature[col_m]
# %%
# use feature_selector to filter features
import os
os.chdir('c:\\Users\\SA\\python\\練習py')
from feature_selector import FeatureSelector
fs = FeatureSelector(data=feature_matrix, labels=y)
#%%
# handle missing values
fs.identify_missing(missing_threshold=0.6)
fs.record_missing.head()
fs.plot_missing()
# %%
# handle collinearity
fs.identify_collinear(correlation_threshold=0.8)
fs.record_collinear.head()
fs.plot_collinear()
# %%
# importance ranking via the LightGBM algorithm
fs.identify_zero_importance(task='classification',
                            eval_metric='auc',
                            n_iterations=10,
def featureselect(datas, target):
    """Run the full FeatureSelector pipeline and return the pruned frame.

    Applies missing-value, collinearity, zero-importance and low-importance
    filters, then removes every flagged feature.
    """
    import os
    os.chdir('c:\\Users\\SA\\python\\練習py')
    from feature_selector import FeatureSelector

    selector = FeatureSelector(data=datas, labels=target)
    selector.identify_missing(missing_threshold=0.6)
    selector.identify_collinear(correlation_threshold=0.9)
    selector.identify_zero_importance(task='classification',
                                      eval_metric='auc',
                                      n_iterations=10,
                                      early_stopping=False)
    selector.identify_low_importance(cumulative_importance=0.9)
    return selector.remove(methods='all')
metrics=lambda a,b:AUCPRC(a,b,withACC=T,withAUC=T,withSS=T) # In[24]: aucprc=lambda y_true, y_pred:tuple(list(AUCPRC(y_true,y_pred)[0])+[T]) # ###### Feature Importance # In[25]: fsDataScale = FeatureSelector(data = XTrain.dropCol("Amount"), labels=YTrain) # On redefinit la fonction qui définit les importances en incluant SMOTE # In[26]: def identify_feat_imp(self,n_splits=10): data=self.data dataClass=self.labels skf=StratifiedKFold(n_splits=n_splits,random_state=42,shuffle=T) feature_names=list(data.columns) feature_importance_values = np.zeros(len(feature_names)) scores = np.zeros(n_splits) for i,(train_index, test_index) in tqdm_notebook(enumerate(skf.split(data, dataClass))):
train_data.append(df) # Define name of 12 features set file_name = ["AtomPairs2D","AtomPairs2DCount","EState", "Extended", "Fingerprinterd", "GraphOnly", "KlekotaRoth", "KlekotaRothCount", "MACCS", "Pubchem", "Substructure", "SubstructureCount"] file_name = sorted(file_name) # Sorting name ################# #Load one train data for get labels train_label = pd.read_csv("Data/DILI_data/DILI_train_MF/DILI_train_AtomPairs2D.csv") # Start feature selecting and add labels for each training dataset for train, name in zip(train_data, file_name): feature_columns = [] labels = train_label["class."] X_train = train.drop(labels = "Name", axis = 1) fs = FeatureSelector(data = X_train, labels = labels) fs.identify_all(selection_params = {'missing_threshold': 0.8, 'correlation_threshold': 0.98, 'task': 'classification', 'eval_metric': 'auc', 'cumulative_importance': 0.99,'num_threads':-1}) train_removed_all = fs.remove(methods = 'all', keep_one_hot=False) print('Original Number of Features', train.shape[1]) print('Final Number of Features: ', train_removed_all.shape[1]) train_removed_all.head() feature_columns.extend(train_removed_all.columns) feature_columns = pd.DataFrame(feature_columns,index=None) feature_columns.to_csv('Features_'+ name+'.csv',index = False, header = name) train_removed_all['class.']=labels train_removed_all.to_csv('Data/Feature_Data/Feature_Data/Feature_Train_'+ name + '.csv', index=False, header=True)
    # collinear features above the 0.98 correlation threshold
    fs.identify_collinear(correlation_threshold=0.98)
    fs.record_collinear.to_csv("csv//record_collinear.csv")
    #Identify Single Unique
    fs.identify_single_unique()
    fs.record_single_unique.to_csv("csv//record_single_unique.csv")
    #Zero importance
    fs.identify_zero_importance(task='classification',
                                eval_metric='multi_logloss',
                                n_iterations=10,
                                early_stopping=True)
    fs.record_zero_importance.to_csv("csv//record_zero_importance.csv")
    #Low Importance
    fs.identify_low_importance(cumulative_importance=0.99)
    fs.feature_importances.to_csv("csv//feature_importance.csv")
    #Identified features for removal
    summary = pd.DataFrame.from_dict(fs.ops, orient='index')
    summary.to_csv("csv//summary.csv")


if __name__ == '__main__':
    # load windowed AAPL data and run the analysis above via main()
    __AAPL__ = "D:\\Dropbox\\9. Data\\Mercury Data\\CSV\\CIQ_AAPL.csv"
    data = DataLoader(__AAPL__, window=10, threshold=0.03, drop=1)
    fs = FeatureSelector(data=data.df, labels=data.targets)
    main()
# for f in dis_data or NAN_data_test or NAN_data_train: # encoder=OneHotEncoder(sparse=False) # one_hot_data=train_data[f].values.reshape(-1,1) # encoder.fit(one_hot_data) # train_data[f]=encoder.transform(one_hot_data) print("data preprocessing done!") print("starting feature selection...") #feature selection #correlation calculation(skip) #feature_selector from feature_selector import FeatureSelector fs = FeatureSelector(data=train_data, labels=train_label) #find features with 0 variance fs.identify_single_unique() #recursive feature elimination fs.identify_zero_importance(task='classification', eval_metric='auc', n_iterations=5, early_stopping=True) print("finish zero importance analysis") fs.identify_low_importance(cumulative_importance=0.99) print("finish low importance analysis") train_data = fs.remove(methods='all') print("finish removing train_data")
from feature_selector import FeatureSelector # In[8]: train_labels = train_data['label'] train_features = train_data.drop(columns=[ 'user', 'product_nbr', 'last_year_capture_user_flag', 'label', 'pro_brand_-1', 'pro_brand_Apple', 'pro_brand_三星', 'pro_brand_其他', 'pro_brand_华为', 'pro_brand_小米', 'pro_brand_未知厂商', 'pro_brand_欧珀', 'pro_brand_维沃' ]) # In[14]: fs = FeatureSelector(data=train_features, labels=train_labels) fs.identify_collinear(correlation_threshold=0.9, one_hot=False) # 绘制选择的特征的相关性heatmap fs.plot_collinear() # 列出要删除的共线特征 collinear_features = fs.ops['collinear'] # 查看共线特征的dataframe fs.record_collinear # In[20]: train_data = train_data.drop(columns=collinear_features) # In[21]: train_data.shape
# data_label[emb_list].to_csv('emb_testb.csv',index=False) print('feature done') train_label = data_label[:num] test_label = data_label[num:] features = [x for x in train_label.columns if x not in ['ship','type','time','x','y','diff_time','date','day_nig','direction','speed','hour', 'speed_many','dire_diff','direction_str','speed_str','dis','x_speed','y_speed'] ] target = 'type' # print(len(features), ','.join(features)) from feature_selector import FeatureSelector fs = FeatureSelector(data = train_label[features], labels = train_label[target]) fs.identify_zero_importance(task = 'classification', eval_metric = 'multiclass', n_iterations = 10, early_stopping = True) fs.identify_low_importance(cumulative_importance = 0.97) low_importance_features = fs.ops['low_importance'] print('====low_importance_features=====') print(low_importance_features) for i in low_importance_features: features.remove(i) print('feature number',len(features)) gc.collect()
    Y, test_size=0.2, random_state=0)

# Feature scaling (fit on train only, then apply the same transform to test)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

# Feature selection (remove highly correlated features)
from feature_selector import FeatureSelector
# NOTE(review): ``n`` is computed but never used below -- confirm it is needed.
n = len(X_train.T)
fs = FeatureSelector(data=X_train)
fs.identify_collinear(
    correlation_threshold=0.7)  # select features from training set
corr = fs.ops['collinear']
X_train = fs.remove(methods=['collinear'
                             ])  # remove selected features from training set
to_remove = pd.unique(
    fs.record_collinear['drop_feature'])  # features to remove
X_test = X_test.drop(
    columns=to_remove)  # remove selected features from test set

# Create the artificial neural network
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
train = pd.read_csv(
    'C:/Users/Administrator/Scikit_learn/feature-selector-master/credit_example.csv'
)
train_labels = train['TARGET']
# train is already a DataFrame; .head() defaults to the first five rows
print(train.head())
# drop the TARGET column (pandas.drop with axis=1 removes columns)
train = train.drop(['TARGET'], axis=1)
# In pandas, rows are labelled by ``index`` and columns by ``columns``, e.g.
# df = pd.DataFrame(np.random.randn(5,3), index=list('abcde'), columns=['one','two','three'])
#Create the Instance
fs = FeatureSelector(data=train, labels=train_labels)
# 1 Missing Values
fs.identify_missing(missing_threshold=0.6)
#The features identified for removal can be accessed through the ops dictionary of the FeatureSelector object.
missing_features = fs.ops['missing']
print(missing_features[:20])
fs.plot_missing()
# call plt.show() after each plotting call to display the figure
plt.show()
print(fs.missing_stats.head(20))
# 2 Single Unique Value
def runFeatureSelector(self, df):
    """Run every FeatureSelector analysis on ``df`` and dump CSV reports.

    Writes collinearity/uniqueness/importance reports plus a summary under
    .\\utils\\csv\\. When ``self.drop`` == 1, returns ``df`` with the
    flagged features removed; otherwise returns it unchanged.
    """
    logging.info(("Running Feature Selection"))
    selector = FeatureSelector(data=df, labels=self.targets)

    # Missing values
    selector.identify_missing(missing_threshold=0.6)

    # Collinearity
    selector.identify_collinear(correlation_threshold=0.98)
    selector.record_collinear.to_csv(".\\utils\\csv\\record_collinear.csv")

    # Single unique value
    selector.identify_single_unique()
    selector.record_single_unique.to_csv(
        ".\\utils\\csv\\record_single_unique.csv")

    # Zero importance (LightGBM)
    selector.identify_zero_importance(task='classification',
                                      eval_metric='multi_logloss',
                                      n_iterations=10,
                                      early_stopping=True)
    selector.record_zero_importance.to_csv(
        ".\\utils\\csv\\record_zero_importance.csv")

    # Low importance
    selector.identify_low_importance(cumulative_importance=0.99)
    selector.feature_importances.to_csv(".\\utils\\csv\\feature_importance.csv")

    # Summary of everything flagged for removal
    summary = pd.DataFrame.from_dict(selector.ops, orient='index')
    summary.to_csv(".\\utils\\csv\\summary.csv")

    # Drop the flagged features only when configured to do so
    if self.drop == 1:
        df = selector.remove(methods='all')
    return df
dfm__ = dfm_.reset_index().set_index(['date', 'symbol'])
# binary label: 1 when the stock's monthly total return beats the S&P return
dfm__['win'] = (dfm__['trt1m'] > dfm__['sprtrn']).astype(np.int64)
# excess return over the S&P
dfm__['rtoversp'] = dfm__['trt1m'] - dfm__['sprtrn']
dfm__ = dfm__.dropna()
dfm__.isna().sum()
# copy the derived columns onto the MRQ fundamentals frame
df_mrq['win'] = dfm__.win
df_mrq['trt1m'] = dfm__.trt1m
df_mrq['sprtrn'] = dfm__.sprtrn
df_mrq['rtoversp'] = dfm__.rtoversp
df_mrq = df_mrq.dropna()
# NOTE(review): 'trt1m' and 'sprtrn' remain in the feature set even though
# the 'win' label is derived from them -- possible target leakage; confirm.
train = df_mrq.drop(columns=['dimension', 'win', 'rtoversp'])
train_labels = df_mrq['win']
fs = FeatureSelector(data=train, labels=train_labels)
fs.identify_collinear(correlation_threshold=0.975)
#fs.plot_collinear(plot_all=True)
#fs.identify_zero_importance(task = 'regression', eval_metric = 'auc', n_iterations = 10, early_stopping = True)
#fs.identify_low_importance(cumulative_importance = 0.99)
# only collinearity was identified, so check_removal() returns just those
all_to_remove = fs.check_removal()
print(all_to_remove)
df_mrq_pruned = df_mrq.drop(columns=all_to_remove)
# df_mrq_pruned.to_csv('data/SHARADAR_SF1_montly_combined_universe_MRQ.labelled.csv')
    # log each feature's dtype to the report file opened earlier
    f.write("特征:{} 数据类型:{}\n".format(index + 1, data_type))
print("#-----------------------------------------#")
print("\n")
print("#-----------------------------------------#")
# overview of the numeric columns' distributions
print("查看数值型数据的分布情况")
columne_value_describe = data.describe()
columne_value_describe.to_csv("columne_value_describe.csv", index=True, header=True)
print("#-----------------------------------------#")
print("\n")
print("#-----------------------------------------#")
# preprocess the data with FeatureSelector
print("利用FeatureSelector进行数据预处理")
fs = FeatureSelector(data=data, labels=y_labels)
print("# identify_missing")
fs.identify_missing(missing_threshold=0.65)
missing_features = fs.ops["missing"]
missing_stats = fs.missing_stats
fs.plot_missing()
plt.savefig("missing_features.jpg", dpi=300)
plt.show()
print(fs.missing_stats.head())
print("\n")
print("# identify_single_unique")
fs.identify_single_unique()
single_uniques = fs.ops["single_unique"]
with open("single_unique_feature_count.txt", "w") as f: