def feature_engineering(self, x_data, y_data, train=None):
    """Run feature selection on *x_data*.

    In training mode (``train`` truthy) a ``FeatureSelector`` is fitted,
    the columns it flags are stored in ``self.drop_columns`` (and pickled
    to ``drop_columns.pkl`` for later inference runs), and the reduced
    frame is stored in ``self.feature_df``.  In inference mode the
    previously identified columns are dropped from the incoming
    ``x_data`` instead.

    Args:
        x_data: feature DataFrame; columns are grouped by in-game event
            category (see the slice bookkeeping below).
        y_data: labels aligned with ``x_data`` (used only when fitting).
        train: truthy to fit the selector; falsy to reuse
            ``self.drop_columns``.

    Side effects: sets ``self.drop_columns`` / ``self.feature_df``,
    writes ``drop_columns.pkl`` in train mode, prints the drop map.
    """
    cols = x_data.columns
    # Column-group bookkeeping (currently unused; kept to document the
    # expected column layout of x_data — TODO confirm against the caller).
    consume_col = cols[0:10]        # consumption events
    recruit_col = cols[10:22]       # recruiting events
    acceleration_col = cols[22:32]  # speed-up events
    build_col = cols[32:48]         # building events
    science_col = cols[48:97]       # research / tech events
    pvp_col = cols[97:103]          # pvp events
    pay_col = cols[103:106]         # payment events
    # label_col = cols[108]
    if train:
        fs = FeatureSelector(data=x_data, labels=DataFrame(y_data))
        fs.identify_all(
            selection_params={
                'missing_threshold': 0.6,
                'correlation_threshold': 0.98,
                'task': 'classification',
                'eval_metric': 'auc',
                'cumulative_importance': 0.99
            })
        self.drop_columns = fs.ops
        # Persist the selection so inference can reproduce it exactly.
        with open('drop_columns.pkl', 'wb') as file:
            pickle.dump(self.drop_columns, file)
        self.feature_df = fs.remove(methods='all', keep_one_hot=False)
    else:
        # Flatten {method: [columns...]} into one flat drop list.
        drop_list = [column
                     for flagged in self.drop_columns.values()
                     for column in flagged]
        # BUG FIX: the original called self.feature_df.drop(..., inplace=True)
        # — i.e. it dropped from stale state left over from a previous call
        # and ignored the incoming x_data entirely. Drop from x_data and
        # store the result instead.
        self.feature_df = x_data.drop(drop_list, axis=1)
    print(self.drop_columns)
# ==== Example #2 ====


# Split the frame into labels and features.
y = df['target']
train_labels = y
df_feats = df.drop(columns=['target'])

# Fit the selector on the feature matrix.
fs = FeatureSelector(data=df_feats, labels=train_labels)

# Identify redundant features.
if not USE_LEARNER_FOR_FEATURE_SELECTION:
    # Cheap, model-free checks only.
    fs.identify_missing(missing_threshold=MISSING_VALUE_THRESHOLD)      # too many NaNs
    fs.identify_collinear(correlation_threshold=CORRELATION_THRESHOLD)  # highly correlated
    fs.identify_single_unique()                                         # constant columns

    # fs.ops maps check name -> flagged columns (fs.ops.keys() to list them).
    missing_features = list(fs.ops['missing'])
    corelated_features = list(fs.ops['collinear'])
else:
    # NOT COMPLETE
    params = {
        'missing_threshold': 0.6,
        'correlation_threshold': 0.98,
        'task': 'classification',
        'eval_metric': 'auc',
        'cumulative_importance': 0.99,
    }
    fs.identify_all(selection_params=params)
    # Keep only the surviving (valuable) features.
    X = fs.remove(methods='all', keep_one_hot=True)
# ==== Example #3 ====
                            early_stopping=True)
# Columns the learner assigned zero importance.
zero_importance_features = fs.ops['zero_importance']
print('zero_importance_features', zero_importance_features)

# Low-importance statistics: top 20 features by importance.
fs.identify_low_importance(cumulative_importance=0.99)
df_low_importance = fs.feature_importances
top20 = df_low_importance.sort_values('importance', ascending=False).head(20)
print(top20)

# Run every identification method at once.
print('go')
selection_params = {
    'missing_threshold': 0.7,
    'correlation_threshold': 0.99,
    'task': 'classification',
    'eval_metric': tpr_weight_funtion_lc,
    'cumulative_importance': 0.999,
}
fs.identify_all(selection_params=selection_params)

# Remove the features flagged by all methods (returns a df).
methods = ['missing', 'single_unique', 'collinear', 'zero_importance',
           'low_importance']
left_feature, removed_feature = fs.remove(methods=methods, keep_one_hot=True)
print('left_feature\n ', left_feature.columns, left_feature.shape)
print(
    'emoved_feature\n',
    len(removed_feature),
       'residentAddr'] = df[df['isNew'] == 0]['residentAddr'].apply(
           lambda x: x if x == -999 else x - 300000)

# Feature selection. Parameter meanings:
#   missing_threshold     - drop a feature when its NaN ratio exceeds the
#                           threshold (0.6 here)
#   correlation_threshold - pairwise correlation cut-off between features
#   task / eval_metric    - the learning task and the metric used to rank
#                           feature importance
#   cumulative_importance - keep features (sorted by importance) until their
#                           cumulative importance reaches 0.95

fs = FeatureSelector(data=x, labels=y)
fs.identify_all(selection_params={
    'missing_threshold': 0.6,
    'correlation_threshold': 0.9,
    'task': 'regression',
    'eval_metric': 'mse',
    'cumulative_importance': 0.95,
})

# Remove the flagged columns (one-hot columns are kept).
choose = fs.remove(methods=['missing', 'single_unique', 'zero_importance'],
                   keep_one_hot=True)

# Restrict both the training and prediction frames to the selected columns.
x = x[choose.columns.values]
X_predict = df_predict[choose.columns.values]
choose.columns  # bare expression: only useful for notebook display

# The classes are imbalanced, so 50% of the label-1 samples are chosen as
# the test set (per the original note). Extract labels per class frame.
label_1 = train_data_1['target']
label_0 = train_data_0['target']