# Feature-removal stage of a FeatureSelector workflow: inspect low-importance
# features, remove flagged features in several combinations, and finally rerun
# all identification methods in one call.
# Assumes `fs` is an already-fitted FeatureSelector and `train`/`train_labels`
# are the training frame and labels — both defined earlier, outside this chunk.
low_importance_features = fs.ops['low_importance']
print(low_importance_features[:5])
# Plot cumulative importances up to the 0.99 threshold, showing the top 12.
fs.plot_feature_importances(threshold=0.99, plot_n=12)
plt.show()
# 6 Removing Features
# Removing Features: This method returns the resulting data which we can then use for machine learning.
# The original data will still be accessible in the data attribute of the Feature Selector.
train_no_missing = fs.remove(methods=['missing'])  # identified 17 features to drop
train_no_missing_zero = fs.remove(methods=['missing', 'zero_importance'])  # identified 66+17=83 features
all_to_remove = fs.check_removal()  # list every feature slated for removal
print(all_to_remove[0:])
train_removed = fs.remove(methods='all')  # drop all flagged features
# 7 Handling One-Hot Features
# keep_one_hot=False also drops the one-hot columns created during importance runs.
train_removed_all = fs.remove(methods='all', keep_one_hot=False)
print('Original Number of Features', train.shape[1])
print('Final Number of Features: ', train_removed_all.shape[1])
# 8 Alternative Option for Using all Methods: run every identification method in one command
fs = FeatureSelector(data=train, labels=train_labels)
# NOTE(review): this call is truncated in this chunk — its arguments continue
# beyond the visible source.
fs.identify_all(
# Label construction: align monthly returns on (date, symbol), mark months that
# beat the S&P ('win') and record the excess return ('rtoversp'), then merge the
# labels into the MRQ fundamentals frame and prune collinear features.
# Assumes `dfm_` (monthly returns) and `df_mrq` (fundamentals) exist from earlier.
dfm__ = dfm_.reset_index().set_index(['date', 'symbol'])
beat_sp = dfm__['trt1m'] > dfm__['sprtrn']
dfm__['win'] = beat_sp.astype(np.int64)
dfm__['rtoversp'] = dfm__['trt1m'] - dfm__['sprtrn']
dfm__ = dfm__.dropna()
dfm__.isna().sum()  # NOTE(review): result is discarded — looks like a leftover notebook inspection
# Copy the label columns onto the fundamentals frame (index-aligned assignment).
for label_col in ['win', 'trt1m', 'sprtrn', 'rtoversp']:
    df_mrq[label_col] = dfm__[label_col]
df_mrq = df_mrq.dropna()
# Features exclude the metadata column and anything that leaks the label.
train = df_mrq.drop(columns=['dimension', 'win', 'rtoversp'])
train_labels = df_mrq['win']
fs = FeatureSelector(data=train, labels=train_labels)
fs.identify_collinear(correlation_threshold=0.975)
#fs.plot_collinear(plot_all=True)
#fs.identify_zero_importance(task = 'regression', eval_metric = 'auc', n_iterations = 10, early_stopping = True)
#fs.identify_low_importance(cumulative_importance = 0.99)
all_to_remove = fs.check_removal()
print(all_to_remove)
df_mrq_pruned = df_mrq.drop(columns=all_to_remove)
# df_mrq_pruned.to_csv('data/SHARADAR_SF1_montly_combined_universe_MRQ.labelled.csv')
# Low-importance reporting stage: identify features below the cumulative
# importance threshold, write them to a report file, remove everything flagged
# by all methods, and fill remaining missing values with 0.
# Assumes `fs` is an already-fitted FeatureSelector and `data` is the original
# feature frame — both defined earlier, outside this chunk.
print("\n")
print("# identify_low_importance")
# Keep features up to 0.99 cumulative importance; the rest are low-importance.
fs.identify_low_importance(cumulative_importance=0.99)
low_importance_features = fs.ops["low_importance"]
# Write one line per low-importance feature: "feature count: N feature name: X".
with open("low_importance.txt", "w") as f:
    for index, low_importance_feature in enumerate(low_importance_features):
        f.write("特征个数:{} 特征名称:{}\n".format(index + 1, low_importance_feature))
print("#-----------------------------------------#")
print("\n")
print("#-----------------------------------------#")
print("移除上述方法判断出来的不需要特征")
print("输出需要被移除的特征")
feature_remove = fs.check_removal()
for i in feature_remove:
    print("移除特征:{}".format(i))
# Drop every feature flagged by any identification method.
data_remove_feature = fs.remove(methods="all")
print("原始特征个数:{}".format(data.shape[1]))
print("当前特征个数:{}".format(data_remove_feature.shape[1]))
print("#-----------------------------------------#")
print("\n")
print("#---------------------------------#")
print("剩下特征缺失值使用0来进行填充")
# NOTE(review): np.NaN was removed in NumPy 2.0 — should be np.nan; confirm the
# pinned NumPy version before upgrading.
data = data_remove_feature.replace(np.NaN, 0)
# Sanity check: after the fill, no nulls should remain.
if data.isnull().any().any():
    print("数据集中存在数据缺失")
    print(data.shape[0] - data.count())
# NOTE(review): the else-branch is truncated in this chunk — its body continues
# beyond the visible source.
else: