def get_psi_source_data(self, model_path, feature_file_path, model_type):
    """Export the two data windows used for a PSI comparison.

    Appends a separator line to the validation message file, selects
    features for the window ``self.begin .. self.psi_date`` and for the
    window ``self.psi_date .. self.end`` (both with ``is_psi=True``), and
    hands each frame to ``self.reporter.output_psi_data`` tagged
    ``"online1"`` and ``"online2"`` respectively.

    :param model_path: forwarded unchanged to ``output_psi_data``.
    :param feature_file_path: path of the feature/label file to read.
    :param model_type: forwarded unchanged to ``output_psi_data``.
    :return: None
    """
    with open(vali_message_path, 'a') as message_file:
        message_file.write("=" * 50 + '\n')

    extractor = FeatureExtractor()

    # The feature path can be chosen by the caller; the label type of the
    # features is either "overdue" or "rejected + overdue".
    shared_kwargs = dict(
        feature_label_path=feature_file_path,
        feature_type="all",
        is_psi=True,
    )

    # Both windows are loaded before anything is reported, matching the
    # original call order (select, select, output, output).
    early_window = extractor.select_feature(
        begin=self.begin, end=self.psi_date, **shared_kwargs)
    late_window = extractor.select_feature(
        begin=self.psi_date, end=self.end, **shared_kwargs)

    for tag, frame in (("online1", early_window), ("online2", late_window)):
        self.reporter.output_psi_data(
            frame, model_path, feature_type=tag, model_type=model_type)
def due_overdue_model(
        self,
        feature_path='data/train_feature_file.txt',
        model_path='model/model_o2o_stat_lgb_%s_model_20160721_v1.pkl',
        model_type="majority"):
    """Tune and then dump a model on the 'overdue' feature set.

    Loads the 'overdue' features between ``self.begin`` and ``self.end``,
    runs ``self.lgbclass_by_feature`` to obtain the best parameters, and,
    when ``self._check_model_cached(model_path)`` is truthy, reloads the
    features and calls ``self.dump_model_file`` with those parameters.

    :param feature_path: path of the training feature file.
    :param model_path: path passed to the tuning, cache-check and dump
        steps; it is used as given (no ``%`` substitution happens here).
    :param model_type: forwarded to the tuning and dump steps.
    :return: None
    """
    label_kind = 'overdue'
    extractor = FeatureExtractor()

    def load_features():
        # Dataset initialisation for the configured date window.
        return extractor.select_feature(
            feature_path, label_kind, begin=self.begin, end=self.end)

    tuning_frame = load_features()
    tuned_params = self.lgbclass_by_feature(
        feature_label=tuning_frame,
        feature_type=label_kind,
        model_path=model_path,
        model_type=model_type)

    # NOTE(review): the final fit runs when _check_model_cached() is truthy;
    # confirm that matches the helper's intended meaning.
    if not self._check_model_cached(model_path):
        return

    # The features are read a second time because, per the original note,
    # the feature/label frame is deleted during the tuning step.
    final_frame = load_features()

    # Full training run with the tuned parameters.
    self.dump_model_file(
        final_frame,
        best_param=tuned_params,
        model_path=model_path,
        model_type=model_type)
def pass_reject_model(
        self,
        feature_path='data/train_feature_file.txt',
        model_path='/model/model_o2o_stat_lgb_%s_model_20160721_v1.pkl'):
    """Tune and then dump a model on the 'all' feature set.

    The effective model path is ``root`` followed by ``model_path`` with its
    ``%s`` placeholder filled by the feature type ``'all'``. Features between
    ``self.begin`` and ``self.end`` are loaded, ``self.lgbclass_by_feature``
    picks the best parameters, and, when ``self._check_model_cached`` is
    truthy for the effective path, the features are reloaded and passed to
    ``self.dump_model_file``.

    :param feature_path: path of the training feature file.
    :param model_path: path template containing one ``%s`` placeholder.
    :return: None
    """
    extractor = FeatureExtractor()
    label_kind = 'all'
    resolved_model_path = root + (model_path % label_kind)

    def load_features():
        # Dataset initialisation for the configured date window.
        return extractor.select_feature(
            feature_path, label_kind, begin=self.begin, end=self.end)

    tuning_frame = load_features()
    tuned_params = self.lgbclass_by_feature(
        feature_label=tuning_frame,
        feature_type=label_kind,
        model_path=resolved_model_path)

    # NOTE(review): the final fit runs when _check_model_cached() is truthy;
    # confirm that matches the helper's intended meaning.
    if not self._check_model_cached(resolved_model_path):
        return

    # Fresh copy of the features for the full training run.
    final_frame = load_features()
    self.dump_model_file(
        final_frame,
        best_param=tuned_params,
        model_path=resolved_model_path)
def due_overdue_reject_model(
        self,
        feature_path='data/train_feature_file.txt',
        model_path='model/model_o2o_stat_xgb_%s_model_20160721_v1.pkl',
        model_type="majority"):
    """Train the overdue model, then an overdue+reject model on top of it.

    Per the original note, training requires the first KS data set.

    Steps, in order:
      1. Fill the ``%s`` in ``model_path`` with ``'overdue'`` and run
         ``self.due_overdue_model`` with it.
      2. Call ``self.xgbclass_add_reject`` (feature type ``'reject'``) to get
         the best estimator count, learning rate and tree depth.
      3. Load the 'all' features from the dated
         ``additional_feature_reject_overdue`` file under ``root``.
      4. When ``self._check_model_cached`` is truthy for the dated output
         path, call ``self.dump_model_file`` with the values from step 2.

    :param feature_path: path of the training feature file.
    :param model_path: path template containing one ``%s`` placeholder.
    :param model_type: forwarded to the training steps and embedded in the
        generated file names.
    :return: None
    """
    overdue_model_path = model_path % 'overdue'
    self.due_overdue_model(feature_path, overdue_model_path, model_type)

    # Used when a model trained on overdue data selects rejected customers.
    # The fourth returned value (a score) is not needed afterwards.
    best_n_est, best_learning_rate, best_depth, _best_score = self.xgbclass_add_reject(
        xgb_estimators=XGB_estimators,
        xgb_max_feature=XGB_max_feature,
        feature_file_path=feature_path,
        xgb_learning_rate=XGB_learning_rate,
        xgb_tree_depth=XGB_tree_depth,
        feature_type='reject',
        model_path=overdue_model_path,
        model_type=model_type)

    # From here on the inputs are the dated overdue+reject artefacts.
    combined_feature_path = (
        root
        + '/../feature/data/additional_feature_reject_overdue_{0}_{1}.txt'.format(
            today, model_type))
    combined_model_path = (
        root
        + '/model_dir/model_o2o_stat_xgb_model_overdue_reject_{0}_{1}.pkl'.format(
            today, model_type))

    extractor = FeatureExtractor()
    combined_frame = extractor.select_feature(
        combined_feature_path, 'all', begin=self.begin, end=self.end)

    if self._check_model_cached(combined_model_path):
        self.dump_model_file(
            combined_frame,
            xgb_tree_depth=best_depth,
            xgb_estimators=best_n_est,
            xgb_learning_rate=best_learning_rate,
            model_path=combined_model_path)
def check_performance_by_model(self, feature_file_path, feature_type, model_path, type='all', model_type="minority"):
    """Score a saved model on the configured date window and report its KS.

    Loads the classifier at ``model_path`` and the scaler at
    ``model_path + "_scaler.pkl"``, selects features between ``self.begin``
    and ``self.end``, predicts on the scaled features, computes KS and its
    threshold with ``ks_val``, and then:

    * calls ``self.reporter.model_performance_report`` and
      ``self.reporter.record_feature_importance``;
    * appends a summary line plus a classification report to the
      ``lgb_best_ks_params_<feature_type>.txt`` file and to the validation
      message file.

    Changes from the previous revision:

    * a failure to load the model is printed and then re-raised at once,
      instead of surfacing later as an unbound-variable error;
    * the two bare ``except:`` clauses only catch ``KeyError`` (a missing
      ``label`` column);
    * the summary line now has a tab between the KS value and
      ``best ks THRESHOLD``.

    :param feature_file_path: path of the feature/label file to evaluate.
    :param feature_type: feature selection type; also used in output names.
    :param model_path: path of the pickled classifier.
    :param type: unused by this method; kept for caller compatibility.
    :param model_type: forwarded to the reporter calls.
    :return: None
    :raises Exception: whatever ``joblib.load`` raises for the model file.
    """
    with open(vali_message_path, 'a') as fo:
        fo.write("=" * 50 + '\n')

    try:
        current_clf = joblib.load(model_path)
    except Exception as e:
        # Nothing below can work without the classifier, so fail right here
        # rather than after the feature load and the DAO query.
        print(e)
        raise

    feature_extractor = FeatureExtractor()
    # The feature path can be chosen by the caller; the label type of the
    # features is either "overdue" or "rejected + overdue".
    feature_label = feature_extractor.select_feature(
        feature_label_path=feature_file_path,
        feature_type=feature_type,
        begin=self.begin,
        end=self.end)

    try:
        label = np.array(feature_label["label"])
    except KeyError:
        # No label column: fall back to an all-zero label vector.
        label = [0] * len(feature_label)

    uid_df = pd.DataFrame()
    uid_df["uid"] = feature_label['uid']

    # Drops the first column before building the feature matrix.
    # NOTE(review): presumably that first column is ``uid`` - confirm.
    new_feature_df = feature_label.iloc[:, 1:]
    try:
        del new_feature_df['label']
    except KeyError:
        pass
    feature_names = new_feature_df.columns
    feature = np.array(new_feature_df)
    del new_feature_df

    # Normalise with the scaler that was saved next to the model.
    scaler = joblib.load(model_path + "_scaler.pkl")
    big_vali = scaler.transform(feature)

    # Attach the labels to the uids and look up the per-user records.
    uid_df['label'] = label
    uid_list = list(uid_df['uid'])
    new_uid_list = [str(uid) for uid in uid_list]
    print(len(new_uid_list))
    o2o_user_dao = O2OUserDao()
    vali_uid_df = o2o_user_dao.get_user_phone_label_by_uid(
        uid_list, self.begin, self.end)
    print(len(vali_uid_df))
    # NOTE(review): the column assignments below assume this left merge keeps
    # exactly one row per uid - confirm the DAO never returns duplicates.
    new_df = pd.merge(uid_df, vali_uid_df, on=['uid'], how='left')
    print(len(new_df))

    # No parameter search happens in this method, so these stay at zero; they
    # are kept only so the summary line retains its existing fields.
    max_n_est = 0
    max_learning_rate = 0
    max_max_depth = 0
    best_clf = current_clf

    result = best_clf.predict(big_vali)

    # Classification report section.
    ks_vali, new_ks_threshold = ks_val(result, label)
    new_df["y_true"] = label
    new_df["y_predict"] = result
    print('在验证集上的ks为: %f' % ks_vali)
    # NOTE(review): feature_type is hard-coded to "all" in this call while
    # the next call passes the caller's feature_type - confirm intent.
    self.reporter.model_performance_report(
        new_df,
        result,
        y_test=label,
        max_score=new_ks_threshold,
        model_type=model_type,
        key_message_path=vali_message_path,
        feature_type="all")
    self.reporter.record_feature_importance(
        feature_names, best_clf, feature_type=feature_type, model_type=model_type)

    # Binarise the scores at the KS threshold.
    predict_label = [1 if score >= new_ks_threshold else 0 for score in result]
    result_report = classification_report(
        label, predict_label, target_names=['bad_user', 'good_user'])

    best_ks_params_path = root + '/../validation/lgb_best_ks_params_{0}.txt'.format(
        feature_type)
    # Tab-separated summary; the first two fields used to be fused together.
    best_ks_string = "\t".join([
        "MAX KS in validation set: " + str(ks_vali),
        "best ks THRESHOLD: " + str(new_ks_threshold),
        "best n_estimators : " + str(max_n_est),
        "best learning_rate: " + str(max_learning_rate),
        "best max_depth: " + str(max_max_depth),
    ]) + "\n"

    with open(best_ks_params_path, 'a') as fo:
        fo.write(best_ks_string)
        fo.write(result_report + "\n")
    print(best_ks_string)
    with open(vali_message_path, 'a') as fo:
        fo.write(best_ks_string)
        fo.write(result_report + "\n")