# Walk through the results of the FeatureSelector checks on the existing
# `fs` object: missing values, single-unique-value features and collinear
# features. Relies on `fs` and `plt` being defined earlier in the script.

# 1 Missing values
# The features identified for removal can be accessed through the `ops`
# dictionary of the FeatureSelector object.
missing_features = fs.ops['missing']
print(missing_features[:20])

fs.plot_missing()
# Adding plt.show() after each plotting call is all that is needed.
plt.show()
print(fs.missing_stats.head(20))

# 2 Single Unique Value
fs.identify_single_unique()
single_unique = fs.ops['single_unique']
print(single_unique)

fs.plot_unique()
# Author's note: none of the plots work well.
plt.show()
print(fs.unique_stats.sample(5))

# 3 Collinear (highly correlated) Feature
fs.identify_collinear(correlation_threshold=0.975)
correlated_features = fs.ops['collinear']
# Print the slice explicitly: as a bare expression in a script its value
# was silently discarded.
print(correlated_features[:5])

fs.plot_collinear()
plt.show()

fs.plot_collinear(plot_all=True)
plt.show()
if TEST_FEATURE_SELECTION:
    # Optional exploratory run of the FeatureSelector checks on the existing
    # `fs` object. NOTE(review): the statements were collapsed onto the `if`
    # line; all of them are assumed to belong to this guarded block.

    # Missing values (missing_threshold=0.6).
    fs.identify_missing(missing_threshold=0.6)
    missing_features = fs.ops['missing']
    # Inspected values are printed explicitly: a bare expression inside an
    # `if` body is evaluated and then thrown away.
    print(missing_features[:10])
    fs.plot_missing()
    print(fs.missing_stats.head(10))

    # Features with a single unique value.
    fs.identify_single_unique()
    single_unique = fs.ops['single_unique']
    print(single_unique)
    fs.plot_unique()

    # Collinear features (correlation_threshold=0.975).
    fs.identify_collinear(correlation_threshold=0.975)
    correlated_features = fs.ops['collinear']
    print(correlated_features[:5])
    fs.plot_collinear()
    print(fs.record_collinear.head())

    # Zero-importance features.
    fs.identify_zero_importance(
        task='classification',
        eval_metric='auc',
        n_iterations=10,
        early_stopping=True,
    )
    one_hot_features = fs.one_hot_features
    base_features = fs.base_features
    print('There are %d original features' % len(base_features))
    print('There are %d one-hot features' % len(one_hot_features))
from feature_selector import FeatureSelector

# Build a FeatureSelector over the training data and report the features
# flagged by the missing-value, single-unique and collinearity checks.

# Features are in train and labels are in train_labels
fs = FeatureSelector(data=train, labels=train_labels)

# Missing-value statistics
fs.identify_missing(missing_threshold=0.5)
df_miss_value = fs.missing_stats.sort_values('missing_fraction', ascending=False)
print('df_miss_value', df_miss_value.head(15))
missing_features = fs.ops['missing']
print('missing_features to remove', missing_features[:20])

# Single-unique-value feature statistics
fs.identify_single_unique()
# NOTE(review): the plot_* methods are called for their plotting side effect
# only. They appear to return nothing, so printing their result just emitted
# "None" -- confirm against the installed feature_selector version.
fs.plot_unique()

# Collinear feature statistics
fs.identify_collinear(correlation_threshold=0.95)
fs.plot_collinear()

# list of collinear features to remove
collinear_features = fs.ops['collinear']
print('collinear_features', collinear_features)

# dataframe of collinear features
df_collinear_features = fs.record_collinear.sort_values('corr_value', ascending=False)
print('df_collinear_features', df_collinear_features.head(50))

# Zero-importance feature statistics
# Pass in the appropriate parameters