# Notebook-style walkthrough of an existing FeatureSelector instance `fs`
# (created outside this snippet). Bare expressions such as `.head(10)` are
# evaluated and discarded unless an interactive front end displays them.

# --- Missing values ---
fs.plot_missing()
fs.missing_stats.head(10)

# --- Features with a single unique value ---
fs.identify_single_unique()
single_unique = fs.ops['single_unique']
single_unique
fs.plot_unique()

# --- Collinear features (correlation threshold 0.975) ---
fs.identify_collinear(correlation_threshold=0.975)
correlated_features = fs.ops['collinear']
correlated_features[:5]
fs.plot_collinear()
fs.record_collinear.head()

# --- Zero-importance features ---
fs.identify_zero_importance(
    task='classification',
    eval_metric='auc',
    n_iterations=10,
    early_stopping=True,
)

# Report how many original vs. one-hot features the selector recorded.
one_hot_features = fs.one_hot_features
base_features = fs.base_features
print('There are %d original features' % len(base_features))
print('There are %d one-hot features' % len(one_hot_features))

fs.plot_feature_importances(threshold=0.99, plot_n=12)
fs.feature_importances.head(10)
# NOTE(review): whitespace-collapsed snippet (single-unique, collinear and zero-importance checks on `fs`); it ends mid-call — fs.identify_zero_importance(...) is cut off after eval_metric — so the statements are left untouched.
fs.identify_single_unique() single_unique = fs.ops['single_unique'] print(single_unique) fs.plot_unique() # none of the plots work well plt.show() print(fs.unique_stats.sample(5)) # 3 Collinear (highly correlated) Feature fs.identify_collinear(correlation_threshold=0.975) correlated_features = fs.ops['collinear'] correlated_features[:5] fs.plot_collinear() plt.show() fs.plot_collinear(plot_all=True) plt.show() fs.identify_collinear(correlation_threshold=0.98) fs.plot_collinear() plt.show() print(fs.record_collinear.head()) # 4. Zero Importance Features: one-hot coding, mainly used to identify zero-correlation features fs.identify_zero_importance(task='classification', eval_metric='auc',
# Keep only the important feature columns and attach the predicted label.
# The explicit .copy() makes `test` an independent object, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
test = data_new2[important_features].copy()
test["Is_Male"] = y_pred_kmeans

# Building ANN now
data_ANN = data.copy()
data_ANN["Is_Male"] = y_pred_kmeans
data_ANN.drop(columns="customer_id", inplace=True)

# NOTE(review): columns at positions 0-41 are used as features and position
# 42 as the target; presumably position 42 is "Is_Male" — confirm against
# the column count of `data`.
X = data_ANN.iloc[:, :42]
y = data_ANN.iloc[:, 42]

# Step 1 - Feature selection
from feature_selector import FeatureSelector

fts = FeatureSelector(X, y)
fts.identify_missing(missing_threshold=0.9)

fts.identify_collinear(correlation_threshold=0.7)
fts.plot_collinear()
collinear_features = fts.ops['collinear']

fts.identify_zero_importance(
    task='classification',
    eval_metric='auc',
    n_iterations=30,
    early_stopping=True,
)
zero_importance_features = fts.ops['zero_importance']
fts.plot_feature_importances(threshold=0.99, plot_n=12)

# Take the first 28 entries of the feature-importance table and rebuild the
# model inputs from just those columns.
Most_important_Features = list(fts.feature_importances["feature"].head(28))
Data_ANN_2 = data_ANN[Most_important_Features]
X = Data_ANN_2.iloc[:, :]
y = data_ANN.iloc[:, 42]
# NOTE(review): whitespace-collapsed snippet (missing, single-unique, collinear and zero-importance statistics via FeatureSelector); it ends mid-call — fs.identify_zero_importance(...) is cut off after n_iterations=10 — so the statements are left untouched.
fs = FeatureSelector(data=train, labels=train_labels) # missing-value statistics fs.identify_missing(0.5) df_miss_value = fs.missing_stats.sort_values('missing_fraction', ascending=False) print('df_miss_value', df_miss_value.head(15)) missing_features = fs.ops['missing'] print('missing_features to remove', missing_features[:20]) # single-value feature statistics fs.identify_single_unique() print('fs.plot_unique()', fs.plot_unique()) fs.identify_collinear(0.95) print('plot_collinear()', fs.plot_collinear()) # list of collinear features to remove collinear_features = fs.ops['collinear'] print('collinear_features', collinear_features) # dataframe of collinear features df_collinear_features = fs.record_collinear.sort_values('corr_value', ascending=False) print('df_collinear_features', df_collinear_features.head(50)) # zero-importance feature statistics # Pass in the appropriate parameters fs.identify_zero_importance(task='classification', eval_metric=tpr_weight_funtion_lc, n_iterations=10,
from feature_selector import FeatureSelector
import matplotlib.pyplot as plt
import pandas as pd

if __name__ == '__main__':
    # Load the cleaned flight data and split off the target column.
    model = pd.read_csv('../Data/FlightClassificationCleaned.csv')
    target = model['ARR_DELAY']
    model.head()  # evaluated and discarded outside an interactive session

    # Remove both delay columns from the feature frame.
    model = model.drop(columns=['ARR_DELAY', 'ARR_DELAY_BIN'])

    # Collinearity check with a 0.9 correlation threshold.
    fs = FeatureSelector(data=model, labels=target)
    fs.identify_collinear(correlation_threshold=0.9)
    correlated_features = fs.ops['collinear']
    print(correlated_features[:5])

    fs.record_collinear.head()
    print(fs.plot_collinear())

    # Left disabled by the original author:
    # fs.record_collinear.head()
    # graph.savefig('VarCorrelation.png')