def feature_engineering(self, x_data, y_data, train=None):
    """Run feature selection on *x_data*.

    In training mode (``train`` truthy) a ``FeatureSelector`` is fitted,
    the columns it flags are stored in ``self.drop_columns`` (and pickled
    to ``drop_columns.pkl`` for later inference runs), and the reduced
    frame is stored in ``self.feature_df``.  In inference mode the
    previously identified columns are dropped from the incoming
    ``x_data`` instead.

    Args:
        x_data: feature DataFrame; columns are grouped by in-game event
            category (see the slice bookkeeping below).
        y_data: labels aligned with ``x_data`` (used only when fitting).
        train: truthy to fit the selector; falsy to reuse
            ``self.drop_columns``.

    Side effects: sets ``self.drop_columns`` / ``self.feature_df``,
    writes ``drop_columns.pkl`` in train mode, prints the drop map.
    """
    cols = x_data.columns
    # Column-group bookkeeping (currently unused; kept to document the
    # expected column layout of x_data — TODO confirm against the caller).
    consume_col = cols[0:10]        # consumption events
    recruit_col = cols[10:22]       # recruiting events
    acceleration_col = cols[22:32]  # speed-up events
    build_col = cols[32:48]         # building events
    science_col = cols[48:97]       # research / tech events
    pvp_col = cols[97:103]          # pvp events
    pay_col = cols[103:106]         # payment events
    # label_col = cols[108]
    if train:
        fs = FeatureSelector(data=x_data, labels=DataFrame(y_data))
        fs.identify_all(
            selection_params={
                'missing_threshold': 0.6,
                'correlation_threshold': 0.98,
                'task': 'classification',
                'eval_metric': 'auc',
                'cumulative_importance': 0.99
            })
        self.drop_columns = fs.ops
        # Persist the selection so inference can reproduce it exactly.
        with open('drop_columns.pkl', 'wb') as file:
            pickle.dump(self.drop_columns, file)
        self.feature_df = fs.remove(methods='all', keep_one_hot=False)
    else:
        # Flatten {method: [columns...]} into one flat drop list.
        drop_list = [column
                     for flagged in self.drop_columns.values()
                     for column in flagged]
        # BUG FIX: the original called self.feature_df.drop(..., inplace=True)
        # — i.e. it dropped from stale state left over from a previous call
        # and ignored the incoming x_data entirely. Drop from x_data and
        # store the result instead.
        self.feature_df = x_data.drop(drop_list, axis=1)
    print(self.drop_columns)
# ==== Example #2 ====


# Split the frame into labels and features.
y = df['target']
train_labels = y
df_feats = df.drop(columns=['target'])

# Fit the selector on the feature matrix.
fs = FeatureSelector(data=df_feats, labels=train_labels)

# Identify redundant features.
if not USE_LEARNER_FOR_FEATURE_SELECTION:
    # Cheap, model-free checks only.
    fs.identify_missing(missing_threshold=MISSING_VALUE_THRESHOLD)      # too many NaNs
    fs.identify_collinear(correlation_threshold=CORRELATION_THRESHOLD)  # highly correlated
    fs.identify_single_unique()                                         # constant columns

    # fs.ops maps check name -> flagged columns (fs.ops.keys() to list them).
    missing_features = list(fs.ops['missing'])
    corelated_features = list(fs.ops['collinear'])
else:
    # NOT COMPLETE
    params = {
        'missing_threshold': 0.6,
        'correlation_threshold': 0.98,
        'task': 'classification',
        'eval_metric': 'auc',
        'cumulative_importance': 0.99,
    }
    fs.identify_all(selection_params=params)
    # Keep only the surviving (valuable) features.
    X = fs.remove(methods='all', keep_one_hot=True)
# ==== Example #3 ====
                            early_stopping=True)
# Columns the learner assigned zero importance.
zero_importance_features = fs.ops['zero_importance']
print('zero_importance_features', zero_importance_features)

# Low-importance statistics: top 20 features by importance.
fs.identify_low_importance(cumulative_importance=0.99)
df_low_importance = fs.feature_importances
top20 = df_low_importance.sort_values('importance', ascending=False).head(20)
print(top20)

# Run every identification method at once.
print('go')
selection_params = {
    'missing_threshold': 0.7,
    'correlation_threshold': 0.99,
    'task': 'classification',
    'eval_metric': tpr_weight_funtion_lc,
    'cumulative_importance': 0.999,
}
fs.identify_all(selection_params=selection_params)

# Remove the features flagged by all methods (returns a df).
methods = ['missing', 'single_unique', 'collinear', 'zero_importance',
           'low_importance']
left_feature, removed_feature = fs.remove(methods=methods, keep_one_hot=True)
print('left_feature\n ', left_feature.columns, left_feature.shape)
print(
    'emoved_feature\n',
    len(removed_feature),
       'residentAddr'] = df[df['isNew'] == 0]['residentAddr'].apply(
           lambda x: x if x == -999 else x - 300000)

# Feature selection. Parameter meanings:
#   missing_threshold     - drop a feature when its NaN ratio exceeds the
#                           threshold (0.6 here)
#   correlation_threshold - pairwise correlation cut-off between features
#   task / eval_metric    - the learning task and the metric used to rank
#                           feature importance
#   cumulative_importance - keep features (sorted by importance) until their
#                           cumulative importance reaches 0.95

fs = FeatureSelector(data=x, labels=y)
fs.identify_all(selection_params={
    'missing_threshold': 0.6,
    'correlation_threshold': 0.9,
    'task': 'regression',
    'eval_metric': 'mse',
    'cumulative_importance': 0.95,
})

# Remove the flagged columns (one-hot columns are kept).
choose = fs.remove(methods=['missing', 'single_unique', 'zero_importance'],
                   keep_one_hot=True)

# Restrict both the training and prediction frames to the selected columns.
x = x[choose.columns.values]
X_predict = df_predict[choose.columns.values]
choose.columns  # bare expression: only useful for notebook display

# The classes are imbalanced, so 50% of the label-1 samples are chosen as
# the test set (per the original note). Extract labels per class frame.
label_1 = train_data_1['target']
label_0 = train_data_0['target']