    def get_psi_source_data(self, model_path, feature_file_path, model_type):
        with open(vali_message_path, 'a') as fo:
            fo.write("=" * 50 + '\n')

        feature_extractor = FeatureExtractor()
        ''' The feature path can be specified; the feature label type is either "overdue" or "reject + overdue". '''
        feature_type = "all"
        online_feature_label = feature_extractor.select_feature(
            feature_label_path=feature_file_path,
            feature_type=feature_type,
            begin=self.begin,
            end=self.psi_date,
            is_psi=True)
        online_feature_label2 = feature_extractor.select_feature(
            feature_label_path=feature_file_path,
            feature_type=feature_type,
            begin=self.psi_date,
            end=self.end,
            is_psi=True)
        self.reporter.output_psi_data(online_feature_label,
                                      model_path,
                                      feature_type="online1",
                                      model_type=model_type)
        self.reporter.output_psi_data(online_feature_label2,
                                      model_path,
                                      feature_type="online2",
                                      model_type=model_type)
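
    # Reporter.output_psi_data (above) produces the PSI source data for the two
    # date slices. For reference, a minimal sketch of how a single feature's
    # PSI (population stability index) could be computed is given below; the
    # helper name `_psi_sketch` and the decile bucketing are illustrative
    # assumptions, not this pipeline's actual implementation.
    @staticmethod
    def _psi_sketch(expected, actual, buckets=10):
        import numpy as np
        # Bucket edges from the deciles of the baseline ("expected") slice,
        # widened so every "actual" value falls into some bucket.
        edges = np.percentile(expected, np.linspace(0, 100, buckets + 1))
        edges[0] = min(edges[0], np.min(actual))
        edges[-1] = max(edges[-1], np.max(actual))
        expected_pct = np.histogram(expected, bins=edges)[0] / float(len(expected))
        actual_pct = np.histogram(actual, bins=edges)[0] / float(len(actual))
        # Clip to avoid log(0) and division by zero for empty buckets.
        expected_pct = np.clip(expected_pct, 1e-6, None)
        actual_pct = np.clip(actual_pct, 1e-6, None)
        return float(np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct)))
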
    def due_overdue_model(
            self,
            feature_path='data/train_feature_file.txt',
            model_path='model/model_o2o_stat_lgb_%s_model_20160721_v1.pkl',
            model_type="majority"):
        feature_extractor = FeatureExtractor()
        # Initialize the data set
        feature_type = 'overdue'
        # model_path2 = model_path % (feature_type)
        feature_label = feature_extractor.select_feature(feature_path,
                                                         feature_type,
                                                         begin=self.begin,
                                                         end=self.end)
        best_param = self.lgbclass_by_feature(feature_label=feature_label,
                                              feature_type=feature_type,
                                              model_path=model_path,
                                              model_type=model_type)
        # max_n_est, max_learning_rate, max_max_depth = 150, 0.15, 4
        if self._check_model_cached(model_path):
            # feature_path = 'feature/data/feature_merged_all.txt'
            # Re-read here because the feature labels were removed during training
            feature_label = feature_extractor.select_feature(feature_path,
                                                             feature_type,
                                                             begin=self.begin,
                                                             end=self.end)

            # Full training run
            self.dump_model_file(feature_label,
                                 best_param=best_param,
                                 model_path=model_path,
                                 model_type=model_type)
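
    # `lgbclass_by_feature` (defined elsewhere in this class) performs the
    # parameter search and returns the `best_param` used above. A minimal,
    # hypothetical sketch of a KS-driven LightGBM grid search is shown below
    # for reference; the grid values, the validation split handling and the
    # helper name are assumptions, not the project's actual search logic.
    @staticmethod
    def _lgb_grid_search_sketch(x_train, y_train, x_vali, y_vali):
        import lightgbm as lgb
        from sklearn.metrics import roc_curve
        best_ks, best_param = -1.0, None
        for n_estimators in (100, 150, 200):
            for learning_rate in (0.05, 0.1, 0.15):
                for max_depth in (3, 4, 5):
                    clf = lgb.LGBMClassifier(n_estimators=n_estimators,
                                             learning_rate=learning_rate,
                                             max_depth=max_depth)
                    clf.fit(x_train, y_train)
                    score = clf.predict_proba(x_vali)[:, 1]
                    # KS = maximum gap between TPR and FPR on the validation set.
                    fpr, tpr, _ = roc_curve(y_vali, score)
                    ks = float(max(tpr - fpr))
                    if ks > best_ks:
                        best_ks = ks
                        best_param = {'n_estimators': n_estimators,
                                      'learning_rate': learning_rate,
                                      'max_depth': max_depth}
        return best_param, best_ks
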
    def pass_reject_model(
            self,
            feature_path='data/train_feature_file.txt',
            model_path='/model/model_o2o_stat_lgb_%s_model_20160721_v1.pkl'):
        feature_extractor = FeatureExtractor()
        # Initialize the data set
        # gen_test_train_validation_set.gen_set('./data/user_chae-at_label_11_18_ll.xlsx', head_name='', id_name='',
        # v_type=1, suffix='.txt')
        feature_type = 'all'
        model_path = root + model_path % (feature_type)
        # check_performance_by_model(feature_path, feature_type, model_path, begin='2016-01-01', end='2016-05-31')
        # feature_label = feature_extractor.select_feature(feature_path, feature_type)
        feature_label = feature_extractor.select_feature(feature_path,
                                                         feature_type,
                                                         begin=self.begin,
                                                         end=self.end)
        best_param = self.lgbclass_by_feature(feature_label=feature_label,
                                              feature_type=feature_type,
                                              model_path=model_path)
        if self._check_model_cached(model_path):
            # feature_path = 'feature/data/feature_merged_all.txt'
            feature_label = feature_extractor.select_feature(feature_path,
                                                             feature_type,
                                                             begin=self.begin,
                                                             end=self.end)
            # Full training run
            self.dump_model_file(feature_label,
                                 best_param=best_param,
                                 model_path=model_path)

    def due_overdue_reject_model(
            self,
            feature_path='data/train_feature_file.txt',
            model_path='model/model_o2o_stat_xgb_%s_model_20160721_v1.pkl',
            model_type="majority"):
        """
        Training requires the data set from the first KS pass.
        :param feature_path:
        :param model_path:
        :param model_type:
        :return:
        """
        feature_type = 'overdue'
        model_path2 = model_path % (feature_type)
        self.due_overdue_model(feature_path, model_path2, model_type)
        # The following is used when the model trained on overdue data is used to pick out reject customers
        max_n_est, max_learning_rate, max_max_depth, max_score = self.xgbclass_add_reject(
            xgb_estimators=XGB_estimators,
            xgb_max_feature=XGB_max_feature,
            feature_file_path=feature_path,
            xgb_learning_rate=XGB_learning_rate,
            xgb_tree_depth=XGB_tree_depth,
            feature_type='reject',
            model_path=model_path2,
            model_type=model_type)
        # ''' The feature path can be specified; the feature label type is either "overdue" or "reject + overdue". '''
        # feature_path = 'feature/data/feature_merged_recent.txt'
        feature_path = root + '/../feature/data/additional_feature_reject_overdue_{0}_{1}.txt'.format(
            today, model_type)
        feature_type = 'all'
        model_path = root + '/model_dir/model_o2o_stat_xgb_model_overdue_reject_{0}_{1}.pkl'.format(
            today, model_type)
        # check_performance_by_model(feature_path, feature_type, model_path, begin='2016-01-01', end='2016-05-31', type='vali')
        # check_performance_by_model(feature_path, feature_type, model_path, begin='2016-01-01', end='2016-05-31', type='test')
        # max_n_est, max_learning_rate, max_max_depth = 150, 0.15, 4
        feature_extractor = FeatureExtractor()
        feature_label = feature_extractor.select_feature(feature_path,
                                                         feature_type,
                                                         begin=self.begin,
                                                         end=self.end)
        if self._check_model_cached(model_path):
            self.dump_model_file(feature_label,
                                 xgb_tree_depth=max_max_depth,
                                 xgb_estimators=max_n_est,
                                 xgb_learning_rate=max_learning_rate,
                                 model_path=model_path)
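
    # `xgbclass_add_reject` (defined elsewhere in this class) augments the
    # overdue training data with labels inferred for rejected applicants. A
    # generic reject-inference labelling sketch is shown below for reference;
    # the fitted classifier argument, the cut-off value and the helper name
    # are illustrative assumptions, not the project's actual augmentation logic.
    @staticmethod
    def _reject_inference_sketch(fitted_clf, reject_features, bad_cutoff=0.8):
        # Score the rejected applicants with the model trained on overdue data;
        # applicants above the cut-off are labelled bad (1), the rest good (0).
        scores = fitted_clf.predict_proba(reject_features)[:, 1]
        return [1 if s >= bad_cutoff else 0 for s in scores]
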
    def check_performance_by_model(self,
                                   feature_file_path,
                                   feature_type,
                                   model_path,
                                   type='all',
                                   model_type="minority"):
        with open(vali_message_path, 'a') as fo:
            fo.write("=" * 50 + '\n')
        try:
            current_clf = joblib.load(model_path)
        except Exception as e:
            print(e)
            # The classifier is required below, so re-raise instead of
            # continuing with an undefined current_clf.
            raise
        feature_extractor = FeatureExtractor()
        ''' The feature path can be specified; the feature label type is either "overdue" or "reject + overdue". '''
        ''' Select data before 2016-04-30 for training '''
        feature_label = feature_extractor.select_feature(
            feature_label_path=feature_file_path,
            feature_type=feature_type,
            begin=self.begin,
            end=self.end)
        try:
            label = np.array(feature_label["label"])
        except KeyError:
            # No label column (e.g. pure scoring data): fall back to all zeros.
            label = [0] * len(feature_label)
        uid_df = pd.DataFrame()
        uid_df["uid"] = feature_label['uid']
        new_feature_df = feature_label.iloc[:, 1:]
        if 'label' in new_feature_df.columns:
            del new_feature_df['label']
        feature_names = new_feature_df.columns
        feature = np.array(new_feature_df)
        del new_feature_df
        # Normalize the data set
        # scaler = StandardScaler()
        scaler = joblib.load(model_path + "_scaler.pkl")
        # scaler.fit(x_train)
        big_vali = scaler.transform(feature)
        # Check the final result
        uid_df['label'] = label
        uid_list = list(uid_df['uid'])
        new_uid_list = [str(uid) for uid in uid_list]
        print(len(new_uid_list))
        o2o_user_dao = O2OUserDao()
        # fpre = FeaturePreprocessor()
        vali_uid_df = o2o_user_dao.get_user_phone_label_by_uid(
            uid_list, self.begin, self.end)
        print(len(vali_uid_df))
        new_df = pd.merge(uid_df, vali_uid_df, on=['uid'], how='left')
        print(len(new_df))
        # log transform
        # if len(amount_feature) | len(ratio_feature):
        #     data = get_feature_and_label_one_input_label_file.log_transform(data, reversed_new_feature_pos_to_old_map, \
        #                                                                     amount_feature, ratio_feature)
        # n = len(label)
        # apply_id = []
        # for i in range(n):
        #     apply_id.append(i)

        # if is_export_model_file:
        #     joblib.dump(scaler, model_dir + 'creditlgbscaler.1.2.pkl')
        # Placeholders: this method only evaluates an already trained model, so
        # no grid-search results are available; the zeros are still written
        # into the report string below.
        max_ks = 0
        max_n_est = 0
        max_learning_rate = 0
        max_max_depth = 0
        max_ks_test = 0
        best_clf = current_clf
        best_report = ''
        ks_record_path = "../feature/data/ks_lgbt_{0}.txt".format(feature_type)
        result = best_clf.predict(big_vali)
        # Classification report section
        # y_validation = label
        ks_vali, new_ks_threshold = ks_val(result, label)
        new_df["y_true"] = label
        new_df["y_predict"] = result

        # graph_ks_curve(result, label, 'validation/lgbt_ks_curve.jpg')

        print('KS on the validation set: %f' % ks_vali)

        self.reporter.model_performance_report(
            new_df,
            result,
            y_test=label,
            max_score=new_ks_threshold,
            model_type=model_type,
            key_message_path=vali_message_path,
            feature_type="all")
        self.reporter.record_feature_importance(feature_names,
                                                best_clf,
                                                feature_type=feature_type,
                                                model_type=model_type)
        # ks value
        # ks_vali_dynamic = ks_thres[0]
        # # split threshold
        # threshold_vali = ks_thres[1]

        # Classify the results according to the split threshold
        predict_label = [1 if tmp >= new_ks_threshold else 0 for tmp in result]

        result_report = classification_report(
            label, predict_label, target_names=['bad_user', 'good_user'])
        best_ks_params_path = root + '/../validation/lgb_best_ks_params_{0}.txt'.format(
            feature_type)
        best_ks_string = ("MAX KS in validation set: " + str(ks_vali) + "\t" +
                          "best ks THRESHOLD: " + str(new_ks_threshold) + "\t" +
                          "best n_estimators: " + str(max_n_est) + "\t" +
                          "best learning_rate: " + str(max_learning_rate) + "\t" +
                          "best max_depth: " + str(max_max_depth) + "\n")
        with open(best_ks_params_path, 'a') as fo:
            fo.write(best_ks_string)
            fo.write(result_report + "\n")
            print(best_ks_string)
        with open(vali_message_path, 'a') as fo:
            fo.write(best_ks_string)
            fo.write(result_report + "\n")
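
    # `ks_val` (imported elsewhere in this module) returns the KS statistic and
    # the score threshold at which it is reached. A minimal sketch of an
    # equivalent computation via the ROC curve is shown below; the helper name
    # is an assumption and this is not the project's actual `ks_val` code.
    @staticmethod
    def _ks_sketch(scores, labels):
        import numpy as np
        from sklearn.metrics import roc_curve
        fpr, tpr, thresholds = roc_curve(labels, scores)
        # KS is the maximum gap between the cumulative bad/good distributions,
        # which equals the maximum of TPR - FPR over all thresholds.
        gaps = tpr - fpr
        best = int(np.argmax(gaps))
        return float(gaps[best]), float(thresholds[best])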