# Feature-removal stage of a FeatureSelector workflow: inspect low-importance
# features, remove flagged features in several combinations, and finally rerun
# all identification methods in one call.
# Assumes `fs` is an already-fitted FeatureSelector and `train`/`train_labels`
# are the training frame and labels — both defined earlier, outside this chunk.
low_importance_features = fs.ops['low_importance']
print(low_importance_features[:5])
# Plot cumulative importances up to the 0.99 threshold, showing the top 12.
fs.plot_feature_importances(threshold=0.99, plot_n=12)
plt.show()
# 6 Removing Features
# Removing Features: This method returns the resulting data which we can then use for machine learning.
# The original data will still be accessible in the data attribute of the Feature Selector.
train_no_missing = fs.remove(methods=['missing'])  # identified 17 features to drop
train_no_missing_zero = fs.remove(methods=['missing', 'zero_importance'])  # identified 66+17=83 features
all_to_remove = fs.check_removal()  # list every feature slated for removal
print(all_to_remove[0:])
train_removed = fs.remove(methods='all')  # drop all flagged features
# 7 Handling One-Hot Features
# keep_one_hot=False also drops the one-hot columns created during importance runs.
train_removed_all = fs.remove(methods='all', keep_one_hot=False)
print('Original Number of Features', train.shape[1])
print('Final Number of Features: ', train_removed_all.shape[1])
# 8 Alternative Option for Using all Methods: run every identification method in one command
fs = FeatureSelector(data=train, labels=train_labels)
# NOTE(review): this call is truncated in this chunk — its arguments continue
# beyond the visible source.
fs.identify_all(
# Label construction: align monthly returns on (date, symbol), mark months that
# beat the S&P ('win') and record the excess return ('rtoversp'), then merge the
# labels into the MRQ fundamentals frame and prune collinear features.
# Assumes `dfm_` (monthly returns) and `df_mrq` (fundamentals) exist from earlier.
dfm__ = dfm_.reset_index().set_index(['date', 'symbol'])
beat_sp = dfm__['trt1m'] > dfm__['sprtrn']
dfm__['win'] = beat_sp.astype(np.int64)
dfm__['rtoversp'] = dfm__['trt1m'] - dfm__['sprtrn']
dfm__ = dfm__.dropna()
dfm__.isna().sum()  # NOTE(review): result is discarded — looks like a leftover notebook inspection
# Copy the label columns onto the fundamentals frame (index-aligned assignment).
for label_col in ['win', 'trt1m', 'sprtrn', 'rtoversp']:
    df_mrq[label_col] = dfm__[label_col]
df_mrq = df_mrq.dropna()
# Features exclude the metadata column and anything that leaks the label.
train = df_mrq.drop(columns=['dimension', 'win', 'rtoversp'])
train_labels = df_mrq['win']
fs = FeatureSelector(data=train, labels=train_labels)
fs.identify_collinear(correlation_threshold=0.975)
#fs.plot_collinear(plot_all=True)
#fs.identify_zero_importance(task = 'regression', eval_metric = 'auc', n_iterations = 10, early_stopping = True)
#fs.identify_low_importance(cumulative_importance = 0.99)
all_to_remove = fs.check_removal()
print(all_to_remove)
df_mrq_pruned = df_mrq.drop(columns=all_to_remove)
# df_mrq_pruned.to_csv('data/SHARADAR_SF1_montly_combined_universe_MRQ.labelled.csv')
# Low-importance reporting stage: identify features below the cumulative
# importance threshold, write them to a report file, remove everything flagged
# by all methods, and fill remaining missing values with 0.
# Assumes `fs` is an already-fitted FeatureSelector and `data` is the original
# feature frame — both defined earlier, outside this chunk.
print("\n")
print("# identify_low_importance")
# Keep features up to 0.99 cumulative importance; the rest are low-importance.
fs.identify_low_importance(cumulative_importance=0.99)
low_importance_features = fs.ops["low_importance"]
# Write one line per low-importance feature: "feature count: N feature name: X".
with open("low_importance.txt", "w") as f:
    for index, low_importance_feature in enumerate(low_importance_features):
        f.write("特征个数:{} 特征名称:{}\n".format(index + 1, low_importance_feature))
print("#-----------------------------------------#")
print("\n")
print("#-----------------------------------------#")
print("移除上述方法判断出来的不需要特征")
print("输出需要被移除的特征")
feature_remove = fs.check_removal()
for i in feature_remove:
    print("移除特征:{}".format(i))
# Drop every feature flagged by any identification method.
data_remove_feature = fs.remove(methods="all")
print("原始特征个数:{}".format(data.shape[1]))
print("当前特征个数:{}".format(data_remove_feature.shape[1]))
print("#-----------------------------------------#")
print("\n")
print("#---------------------------------#")
print("剩下特征缺失值使用0来进行填充")
# NOTE(review): np.NaN was removed in NumPy 2.0 — should be np.nan; confirm the
# pinned NumPy version before upgrading.
data = data_remove_feature.replace(np.NaN, 0)
# Sanity check: after the fill, no nulls should remain.
if data.isnull().any().any():
    print("数据集中存在数据缺失")
    print(data.shape[0] - data.count())
# NOTE(review): the else-branch is truncated in this chunk — its body continues
# beyond the visible source.
else: