def fit(self, x, y):
        """Fit the selection / expansion / scaling chain used by the 'poly' method.

        When ``self.method`` is ``'poly'`` three transformers are fitted in
        sequence and stored on the instance:

        * ``self.Filter_Selection`` -- ``Filter_Selection('pearson', TopN=self.TopN)``
          fitted on ``(x, y)``.
        * ``self.DChange`` -- ``Data_Preprocess.Data_Change('poly')`` fitted on
          the output of the filter.
        * ``self.Standard`` -- ``Data_Preprocess.Data_Change('avgstd')`` fitted
          on the output of the 'poly' transformer.

        For any other ``self.method`` nothing is fitted and nothing is stored.

        Args:
            x: Feature data handed to the filter's ``fit`` / ``transform``.
            y: Target handed to the filter's ``fit``.

        Returns:
            None.
        """
        if self.method != 'poly':
            return

        # Step 1: keep the features chosen by the 'pearson' filter.
        selector = Filter_Selection('pearson', TopN=self.TopN)
        selector.fit(x, y)
        selected_x = selector.transform(x)

        # Step 2: apply the 'poly' data change to the selected features.
        poly_change = Data_Preprocess.Data_Change('poly')
        poly_change.fit(selected_x)
        expanded_x = poly_change.transform(selected_x)

        # Step 3: fit the 'avgstd' data change on the expanded features.
        scaler = Data_Preprocess.Data_Change('avgstd')
        scaler.fit(expanded_x)

        # Publish the fitted transformers only after every step succeeded.
        self.Filter_Selection = selector
        self.DChange = poly_change
        self.Standard = scaler
    def get_vip(self,isplot=True):
        """Compute the normalised variable importance of the fitted classifier.

        Behaviour depends on ``self.method``:

        * ``'knn'``, ``'dt'``, ``'svm'``, ``'bp'``: no importance is computed
          and ``None`` is returned.
        * ``'logistic'``: absolute values of ``self.cls_model.coef_``; for each
          column of ``coef_`` the mean over its rows is used (presumably one
          row per class -- confirm against the fitted model).
        * ``'rf'``, ``'adaBoost'``, ``'gbm'``, ``'xgb'``: absolute values of
          ``self.cls_model.best_estimator_.feature_importances_``.

        The importances are indexed by ``self.factor_name``, sorted in
        ascending order and then passed through
        ``Data_Preprocess.Data_Change('minmax')``.

        Args:
            isplot: When true, draw the result with
                ``Data_plot.plot_bar_analysis`` and show the figure.

        Returns:
            ``None`` for the methods listed first above, otherwise the output
            of the ``'minmax'`` transform applied to the sorted importances.

        Raises:
            ValueError: If ``self.method`` is none of the methods listed above.
        """
        col_name = 'variable importance'
        if self.method in ['knn','dt','svm','bp']:
            return None

        if self.method in ['logistic']:
            mean_coef = pd.DataFrame(abs(self.cls_model.coef_)).T.mean(axis=1)
            var_importance = pd.DataFrame(mean_coef.values, index=self.factor_name, columns=[col_name])
        elif self.method in ['rf','adaBoost','gbm','xgb']:
            coef = self.cls_model.best_estimator_.feature_importances_.reshape(-1,1)
            var_importance = pd.DataFrame(abs(coef), columns=[col_name], index=self.factor_name)
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # ``var_importance`` further down.
            raise ValueError(
                'get_vip does not support method {!r}'.format(self.method))

        res = var_importance.sort_values(col_name)
        # Normalise the importances.
        Dchange = Data_Preprocess.Data_Change('minmax')
        Dchange.fit(res)
        res = Dchange.transform(res)
        # Draw the bar chart.
        if isplot:
            bar_plot = Data_plot.plot_bar_analysis(res)
            bar_plot.title('variable importance')
            bar_plot.show()

        return res
    def get_vip(self,isplot = True):
        """Compute the normalised variable importance of the fitted regressor.

        Behaviour depends on ``self.method``:

        * ``'svr'``, ``'knn'``, ``'dt'``, ``'bp'``: these algorithms offer no
          way to measure factor importance here, so ``None`` is returned.
        * ``'linear'``: absolute values of ``self.reg_model.coef_``.
        * ``'ridge'``, ``'lasso'``, ``'ElasticNet'``, ``'pls'``: absolute values
          of ``self.reg_model.best_estimator_.coef_``.
        * ``'rf'``, ``'adaBoost'``, ``'gbm'``, ``'xgb'``: absolute values of
          ``self.reg_model.best_estimator_.feature_importances_``.

        The importances are indexed by ``self.factor_name``, sorted in
        descending order and then passed through
        ``Data_Preprocess.Data_Change('minmax')``.

        Args:
            isplot: When true, draw the result with
                ``Data_plot.plot_bar_analysis(res, Top=15)`` and show the figure.

        Returns:
            ``None`` for the methods listed first above, otherwise the output
            of the ``'minmax'`` transform applied to the sorted importances.

        Raises:
            ValueError: If ``self.method`` is none of the methods listed above.
        """
        if self.method in ['svr','knn','dt','bp']:
            # No importance measure is available for these algorithms.
            return None

        col_name = 'variable importance'
        if self.method in ['linear']:
            var_importance = pd.DataFrame(abs(self.reg_model.coef_), columns=[col_name], index=self.factor_name)
        elif self.method in ['ridge','lasso','ElasticNet','pls']:
            coef = self.reg_model.best_estimator_.coef_.reshape(-1,1)
            var_importance = pd.DataFrame(abs(coef), columns=[col_name], index=self.factor_name)
        elif self.method in ['rf','adaBoost','gbm','xgb']:
            coef = self.reg_model.best_estimator_.feature_importances_.reshape(-1,1)
            var_importance = pd.DataFrame(abs(coef), columns=[col_name], index=self.factor_name)
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # ``var_importance`` further down.
            raise ValueError(
                'get_vip does not support method {!r}'.format(self.method))

        res = var_importance.sort_values(col_name, ascending=False)
        # Normalise the importances.
        Dchange = Data_Preprocess.Data_Change('minmax')
        Dchange.fit(res)
        res = Dchange.transform(res)
        # Draw the bar chart.
        if isplot:
            bar_plot = Data_plot.plot_bar_analysis(res, Top=15)
            bar_plot.title('variable importance')
            bar_plot.show()
        return res
示例#4
0
 def data_change(self, method='minmax'):
     """Fit a ``Data_Change`` transformer on ``self.x`` and apply it in place.

     ``self.x`` is replaced by the transformed data, the matching columns of
     ``self.data`` are overwritten with it, and the fitted transformer is
     both appended to ``self.Pipeline_list`` as ``('data_change', ...)`` and
     stored as ``self.data_change_model``.

     Args:
         method: Passed through to ``Data_Preprocess.Data_Change``.
     """
     transformer = Data_Preprocess.Data_Change(method=method)
     transformer.fit(self.x)
     self.x = transformer.transform(self.x)

     # Mirror the transformed features back into the full data set.
     feature_columns = self.x.columns
     self.data.loc[:, feature_columns] = self.x

     # Record this step in the processing pipeline.
     step = ('data_change', transformer)
     self.Pipeline_list.append(step)
     self.data_change_model = transformer
示例#5
0
def keyfeature_check(data, columnslist=None, label_col=None):
    """Key-factor check: compare feature distributions with box plots.

    The selected columns are standardised with
    ``Data_Preprocess.data2avgstd`` and drawn as box plots. When
    ``label_col`` is given the plots are grouped by that column.

    Args:
        data: DataFrame holding the features (and the label column, if any).
        columnslist: Columns to plot. Defaults to every column of ``data``
            except ``label_col``.
        label_col: Optional name of the column to group the box plots by.

    Returns:
        None. The plot is drawn through ``DataFrame.boxplot``.
    """
    if columnslist is None:
        # Leave the label out of the default selection: it must not be
        # standardised, and concatenating it again below would otherwise
        # produce a duplicated column.
        columnslist = [col for col in data.columns if col != label_col]

    # Standardise the data.
    keyfeature = data.loc[:, columnslist]
    keyfeature, _scaler = Data_Preprocess.data2avgstd(keyfeature)
    keyfeature = pd.DataFrame(keyfeature,
                              columns=keyfeature.columns,
                              index=keyfeature.index)
    if label_col is not None:
        keyfeature = pd.concat([keyfeature, data[[label_col]]], axis=1)
        # Group by the caller's label column rather than a hard-coded 'Y'.
        keyfeature.boxplot(column=columnslist, by=label_col)
    else:
        keyfeature.boxplot(column=columnslist)
 def predict(self,x):
     """Predict with every trained sub-model and fuse the results.

     For each name in ``self.listModelName`` the predictions of all
     sub-models in ``self.train_model[name]`` are averaged. The per-model
     averages are then fused according to ``self.stack_method``:

     * ``'avg'``: plain mean across models.
     * ``'weight'``: weighted sum, using ``self.mse_list`` passed through
       ``Data_Preprocess.Data_Change('minmax')`` as weights (assumes one
       MSE entry per model in ``listModelName`` order -- TODO confirm).

     Args:
         x: Input handed unchanged to each sub-model's ``predict``.

     Returns:
         A 1-D numpy array with one fused prediction per sample.

     Raises:
         ValueError: If ``self.stack_method`` is neither ``'avg'`` nor
             ``'weight'``.
     """
     if self.stack_method not in ('avg', 'weight'):
         raise ValueError(
             'unsupported stack_method {!r}'.format(self.stack_method))

     model_preds = []
     for model_name in self.listModelName:
         sub_model_res = [pd.DataFrame(sub_model.predict(x))
                          for sub_model in self.train_model[model_name]]
         # Fuse the sub-model results of this model.
         model_preds.append(pd.concat(sub_model_res, axis=1).mean(axis=1))

     # Fuse the results of the different models.
     stacked = pd.concat(model_preds, axis=1)
     if self.stack_method == 'avg':
         return stacked.mean(axis=1).values

     # Normalise the MSE values.
     # NOTE(review): the scaled MSE is used directly as the weight, so a
     # larger error gives a larger weight -- confirm this is intended.
     mse = pd.DataFrame(self.mse_list)
     Dchange = Data_Preprocess.Data_Change('minmax')
     mse = Dchange.fit_transform(mse)
     # One weight per model column; the row count of ``stacked`` is the
     # number of samples and must not be used to size the weight vector.
     weight = np.array(mse).reshape(-1, 1)
     # np.dot already yields an ndarray; flatten it so both branches return
     # the same 1-D shape.
     return np.dot(stacked.values, weight).ravel()
示例#7
0
 def check_outlier(self, method='3sigma', muti=3):
     """Run outlier handling on ``self.x`` and record the step.

     ``self.x`` is replaced by whatever ``Check_Outlier.fit`` returns, the
     matching columns of ``self.data`` are overwritten with it, and the
     checker is appended to ``self.Pipeline_list`` as
     ``('check_outlier', ...)``.

     Args:
         method: Passed through to ``Data_Preprocess.Check_Outlier``.
         muti: Passed through to ``Data_Preprocess.Check_Outlier``.
     """
     detector = Data_Preprocess.Check_Outlier(method=method, muti=muti)
     self.x = detector.fit(self.x)

     # Mirror the processed features back into the full data set.
     feature_columns = self.x.columns
     self.data.loc[:, feature_columns] = self.x

     # Record this step in the processing pipeline.
     step = ('check_outlier', detector)
     self.Pipeline_list.append(step)
示例#8
0
 def fillnan(self, method=1):
     """Run missing-value filling on ``self.x`` and record the step.

     ``self.x`` is replaced by whatever ``Fillna.fit`` returns, the matching
     columns of ``self.data`` are overwritten with it, and the filler is
     appended to ``self.Pipeline_list`` as ``('fillnan', ...)``.

     Args:
         method: Passed through to ``Data_Preprocess.Fillna``.
     """
     filler = Data_Preprocess.Fillna(method=method)
     self.x = filler.fit(self.x)

     # Mirror the processed features back into the full data set.
     feature_columns = self.x.columns
     self.data.loc[:, feature_columns] = self.x

     # Record this step in the processing pipeline.
     step = ('fillnan', filler)
     self.Pipeline_list.append(step)
示例#9
0
 def get_data_type(self):
     """Return the result of ``Data_Preprocess.get_variable_type`` for ``self.data``."""
     variable_types = Data_Preprocess.get_variable_type(self.data)
     return variable_types