def test(self, clf, dftest=pd.DataFrame()):
    """Evaluate a fitted sklearn-style classifier on the test set.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict_proba``.
    dftest : optional raw test DataFrame. When non-empty, it is run through
        ``self.preprocess_data`` (together with ``self.dftrain``) and the
        preprocessed train/test splits are stored back on ``self``.

    Side effects: prints a KS/AUC summary and appends it to
    ``self.report_info``.
    """
    info = "\nstart test model ... "
    print(info)
    self.report_info = self.report_info + info + '\n'
    # If a new dftest was passed in, preprocess it again.
    if len(dftest) > 0:
        print('preprocessing test data...\n')
        # Silence all printing during preprocessing. Use try/finally so
        # stdout is restored even if preprocess_data raises, and close the
        # devnull handle (the original leaked it).
        stdout = sys.stdout
        devnull = open(os.devnull, 'w')
        sys.stdout = devnull
        try:
            X_train, y_train, X_test, y_test = self.preprocess_data(
                self.dftrain, dftest)
        finally:
            sys.stdout = stdout
            devnull.close()
        # Preprocessed train and test sets.
        self.X_train, self.y_train = X_train, y_train
        self.X_test, self.y_test = X_test, y_test
    # Probability of the positive (last) class.
    y_test_hat = clf.predict_proba(self.X_test)[:, -1]
    dfks_test = ks.ks_analysis(y_test_hat, np.ravel(self.y_test))
    ks_test = max(dfks_test['ks_value'])
    auc_test = metrics.roc_auc_score(np.ravel(self.y_test), y_test_hat)
    info = 'test: ks = {} \t auc = {} '.format(ks_test, auc_test) + '\n'
    prettyks = ks.print_ks(y_test_hat, np.ravel(self.y_test))
    info = info + str(prettyks) + '\n'
    print(info)
    self.report_info = self.report_info + info + '\n'
def test(self, bst, dftest=pd.DataFrame()):
    """Evaluate a trained XGBoost booster on the test set.

    Parameters
    ----------
    bst : trained ``xgb.Booster`` used for prediction.
    dftest : optional raw test DataFrame. When non-empty, it is run through
        ``self.preprocess_data`` and the preprocessed test split (plus its
        DMatrix) is stored back on ``self``.

    Side effects: prints a KS/AUC summary and appends it to
    ``self.report_info``.
    """
    info = "\nstart test xgboost model ... \n"
    print(info)
    self.report_info = self.report_info + info + '\n'
    # If a new dftest was passed in, preprocess it again.
    if len(dftest) > 0:
        print('preprocessing test data...')
        # Silence all printing during preprocessing. Use try/finally so
        # stdout is restored even if preprocess_data raises, and close the
        # devnull handle (the original leaked it).
        stdout = sys.stdout
        devnull = open(os.devnull, 'w')
        sys.stdout = devnull
        try:
            X_train, y_train, X_test, y_test = self.preprocess_data(
                self.dftrain, dftest)
        finally:
            sys.stdout = stdout
            devnull.close()
        # Preprocessed test set and its DMatrix.
        self.X_test, self.y_test = X_test, y_test
        self.dtest = xgb.DMatrix(self.X_test, self.y_test['label'])
    y_test_hat = bst.predict(self.dtest)
    dfks_test = ks.ks_analysis(y_test_hat, np.ravel(self.y_test))
    ks_test = max(dfks_test['ks_value'])
    auc_test = auc(np.ravel(self.y_test), y_test_hat)
    info = 'test: ks = {} \t auc = {} '.format(ks_test, auc_test) + '\n'
    prettyks = ks.print_ks(y_test_hat, np.ravel(self.y_test))
    info = info + str(prettyks) + '\n'
    print(info)
    self.report_info = self.report_info + info + '\n'
def test_xgb(test_tbl, xgb_model, train_list):
    """Score a fitted XGBoost (sklearn-API) model on a test table; report KS/AUC.

    Parameters
    ----------
    test_tbl : identifier handed to ``data_preprocess``; also used as the
        prefix of the KS-detail output file.
    xgb_model : fitted model exposing ``predict_proba``.
    train_list : feature names in training order, used to align/subset the
        test columns.
    """
    df_test_x, df_test_y, f_list_test, df_median = data_preprocess(test_tbl)
    # Re-order / subset the test features to match the training feature list.
    df_test = pd.DataFrame()
    for e in train_list:
        df_test[e] = df_test_x[e]
    df_test_x = df_test
    # df_test_x.fillna(-1, inplace=True)
    print('Read test done')
    test_y = np.array(df_test_y)
    # Do not shadow the imported xgb module with the model argument.
    model = xgb_model
    test_x = np.array(df_test_x)
    y_proba = model.predict_proba(test_x)
    # (The original also built an unused tmp_df of scores; removed.)
    ks_dict = run_ks(test_y, y_proba[:, 1])
    auc = roc_auc_score(test_y, y_proba[:, 1])
    print("%f\t%f" % (auc, ks_dict['ks']))
    print_ks(ks_dict, test_tbl + '_score_ks_detail')
def test_xgb(data, fea_list_path, model_path,
             filter_name=('name', 'idcard', 'phone', 'loan_dt', 'label')):
    """Load a saved XGBoost booster, score a TSV file, and dump KS/prob output.

    Parameters
    ----------
    data : path to a tab-separated file that contains a 'label' column.
    fea_list_path : path to a feature-name list (one name per line), or ''
        to use all remaining columns.
    model_path : path of the saved booster to load.
    filter_name : ID/label columns dropped from the feature matrix. Tuple
        default avoids the mutable-default-argument pitfall.

    Writes ``<data>_score_ks_detail`` and ``<data>_prob_`` next to the input.
    """
    df = pd.read_csv(data, sep='\t')
    df_label = df['label']
    df_out = df
    # Drop identifier/label columns from the feature frame.
    for key in filter_name:
        if key in df.keys():
            del df[key]
    # Optionally restrict to an explicit feature list (close the file handle;
    # the original bare open() leaked it).
    if fea_list_path != "":
        with open(fea_list_path) as fh:
            fea_list = [x.strip() for x in fh if len(x.strip()) > 0]
        df = df[fea_list]
    # Drop rows whose features are all missing, keeping label/output aligned.
    mask = ~df.isnull().all(axis=1)
    df_label = df_label[mask]
    df_out = df_out[mask]
    df = df[mask]
    test_x = df
    test_Y = df_label
    clf_xgb = xgb.Booster()
    clf_xgb.load_model(model_path)
    print("Read test done")
    # Note: labels are not needed for prediction, so build an unlabeled
    # DMatrix once (the original built a labeled one and discarded it).
    dTest = xgb.DMatrix(test_x)
    y_proba = clf_xgb.predict(dTest)
    print(y_proba)
    ks_ = run_ks(test_Y, y_proba)
    auc_ = roc_auc_score(test_Y, y_proba)
    print("%f\t%f" % (auc_, ks_['ks']))
    df_out['prob'] = y_proba
    print_ks(ks_, data + r'_score_ks_detail')
    df_out.to_csv(data + r'_prob_', index=False, sep='\t')
def xgb_model(sample_tbl, model_path, xgb_params=None):
    """Train an XGBoost booster with 5-fold stratified CV; dump models/reports.

    Parameters
    ----------
    sample_tbl : (df_x, df_y, df_name) triple of features, labels, names.
    model_path : directory receiving clf_* / dump_raw_* / fip_* / score /
        ks_auc artifacts.
    xgb_params : XGBoost parameter dict; defaults to the tuned set below.
        ``None`` sentinel avoids sharing one mutable dict between calls.

    Prints per-split mean/std KS and AUC for train and validation folds.
    """
    if xgb_params is None:
        xgb_params = {'nthread': 4, 'n_estimators': 80, 'max_depth': 3,
                      'min_child_weight': 2, 'gamma': 0.1, 'subsample': 0.4,
                      'learning_rate': 0.06, 'colsample_bytree': 0.5,
                      'scale_pos_weight': 1, 'seed': 100}
    # list() so params are a concrete pair list on both py2 and py3
    # (dict.items() is a lazy view on py3).
    plst = list(xgb_params.items())
    f_score = os.path.join(model_path, "score")
    f_write = os.path.join(model_path, "fip")
    df_x, df_y, df_name = sample_tbl
    x = np.array(df_x)
    y = np.array(df_y)
    # A tree of depth n has 2**(n+1)-1 nodes and 2**n leaves.
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    auc_list = []
    auc_list_train = []
    ks_list = []
    ks_list_train = []
    times = 0
    for train_index, dev_index in kf.split(x, y):
        times += 1
        x_train, x_dev = x[train_index], x[dev_index]
        y_train, y_dev = y[train_index], y[dev_index]
        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_dev, label=y_dev)
        clf_xgb = xgb.train(plst, dtrain,
                            num_boost_round=xgb_params['n_estimators'])
        clf_xgb.save_model(os.path.join(model_path, 'clf_' + str(times)))
        clf_xgb.dump_model(os.path.join(model_path, 'dump_raw_' + str(times)))
        # Per-feature importance from three views of the same booster.
        f_weights_dict = clf_xgb.get_score(importance_type='weight')
        f_gains_dict = clf_xgb.get_score(importance_type='gain')
        f_covers_dict = clf_xgb.get_score(importance_type='cover')
        fea_analysis = []
        for f_key in f_weights_dict:
            # NOTE(review): f_list is not defined inside this function — it
            # must be a module-level feature-name list (possibly df_name);
            # confirm before relying on this mapping.
            fea_analysis.append({'feature': f_list[int(f_key[1:])],
                                 'weight': f_weights_dict[f_key],
                                 'gain': f_gains_dict[f_key],
                                 'cover': f_covers_dict[f_key]})
        fea_analysis_df = pd.DataFrame(
            fea_analysis, columns=['feature', 'weight', 'gain', 'cover'])
        fea_analysis_df.sort_values(['gain'], ascending=False, inplace=True)
        fea_analysis_df.to_csv(f_write + "_" + str(times), index=None,
                               sep='\t')
        y_proba = clf_xgb.predict(dvalid)
        y_proba_train = clf_xgb.predict(dtrain)
        ks_dict = run_ks(y_dev, y_proba)
        ks_dict_train = run_ks(y_train, y_proba_train)
        auc = roc_auc_score(y_dev, y_proba)
        auc_train = roc_auc_score(y_train, y_proba_train)
        ks_list.append(ks_dict['ks'])
        auc_list.append(auc)
        ks_list_train.append(ks_dict_train['ks'])
        auc_list_train.append(auc_train)
    # Final artifacts come from the LAST fold (same on-disk result as the
    # original, which overwrote these paths each iteration).
    print_ks(ks_dict, f_score)
    fea_analysis_df.to_csv(f_write, index=None, sep='\t')
    clf_xgb.save_model(os.path.join(model_path, "clf"))
    clf_xgb.dump_model(os.path.join(model_path, "dump_raw"))
    # Persist per-fold KS/AUC for train and validation.
    dd = dict()
    dd['train_ks'] = ks_list_train
    dd['valida_ks'] = ks_list
    dd['train_auc'] = auc_list_train
    dd['valida_auc'] = auc_list
    train_ks_df = pd.DataFrame(dd)
    train_ks_df.to_csv(os.path.join(model_path, "ks_auc"), index=False,
                       sep='\t')
    ks_mean = np.mean(ks_list)
    ks_var = np.std(ks_list)
    auc_mean = np.mean(auc_list)
    auc_var = np.std(auc_list)
    ks_mean_train = np.mean(ks_list_train)
    ks_var_train = np.std(ks_list_train)
    auc_mean_train = np.mean(auc_list_train)
    auc_var_train = np.std(auc_list_train)
    print('train: ')
    print("ks mean: %f, ks var: %f" % (ks_mean_train, ks_var_train))
    print("auc mean: %f, auc var: %f" % (auc_mean_train, auc_var_train))
    print('validation:')
    print("ks mean: %f, ks var: %f" % (ks_mean, ks_var))
    print("auc mean: %f, auc var: %f" % (auc_mean, auc_var))
def train(self, clf, cv=5, model_idx=5):
    """Fit a sklearn-style classifier, optionally with stratified CV.

    Parameters
    ----------
    clf : estimator exposing ``fit`` / ``predict_proba``.
    cv : number of stratified folds; 0/None fits once on all training data.
    model_idx : which fold's fitted model to return when CV is used.

    Returns the selected fitted estimator; appends KS/AUC report text to
    ``self.report_info``.
    """
    import copy  # stdlib; used to snapshot the fitted model per fold
    if cv:
        k, ks_mean_train, auc_mean_train, ks_mean_validate, auc_mean_validate = 0, 0, 0, 0, 0
        models = {}
        for train_index, validate_index in stratified_kfold(
                self.X_train, np.ravel(self.y_train), nfolds=cv):
            k = k + 1
            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info = '\n{}: k = {}'.format(nowtime, k)
            print(info)
            self.report_info = self.report_info + info + '\n'
            X_train_k = self.X_train.iloc[train_index, :]
            y_train_k = self.y_train.iloc[train_index, :]
            X_validate_k = self.X_train.iloc[validate_index, :]
            y_validate_k = self.y_train.iloc[validate_index, :]
            clf.fit(X_train_k, np.ravel(y_train_k))
            predict_train_k = clf.predict_proba(X_train_k)[:, -1]
            predict_validate_k = clf.predict_proba(X_validate_k)[:, -1]
            dfks_train = ks.ks_analysis(predict_train_k, y_train_k.values)
            dfks_validate = ks.ks_analysis(predict_validate_k,
                                           y_validate_k.values)
            ks_train = max(dfks_train['ks_value'])
            ks_validate = max(dfks_validate['ks_value'])
            auc_validate = metrics.roc_auc_score(np.ravel(y_validate_k),
                                                 predict_validate_k)
            auc_train = metrics.roc_auc_score(np.ravel(y_train_k),
                                              predict_train_k)
            ks_mean_train = ks_mean_train + ks_train
            auc_mean_train = auc_mean_train + auc_train
            ks_mean_validate = ks_mean_validate + ks_validate
            auc_mean_validate = auc_mean_validate + auc_validate
            info = '\ntrain: ks = {} \t auc = {} '.format(ks_train, auc_train)
            prettyks = ks.print_ks(predict_train_k, y_train_k.values)
            info = info + '\n' + str(prettyks) + '\n'
            info = info + '\nvalidate: ks = {} \t auc = {}'.format(
                ks_validate, auc_validate) + '\n'
            prettyks = ks.print_ks(predict_validate_k, y_validate_k.values)
            info = info + str(prettyks) + '\n'
            print(info)
            self.report_info = self.report_info + info
            # BUG FIX: the original stored the same estimator object for
            # every fold (``fit`` refits in place), so models[model_idx] was
            # always the LAST fold's model. Snapshot each fold instead.
            models[k] = copy.deepcopy(clf)
        ks_mean_train = ks_mean_train / float(k)
        auc_mean_train = auc_mean_train / float(k)
        ks_mean_validate = ks_mean_validate / float(k)
        auc_mean_validate = auc_mean_validate / float(k)
        nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        info = '\n================================================================================ %s\n' % nowtime
        info = info + 'train : ks mean {:.5f} ; auc mean {:.5f}'.format(
            ks_mean_train, auc_mean_train) + '\n'
        info = info + 'validate : ks mean {:.5f} ; auc mean {:.5f}'.format(
            ks_mean_validate, auc_mean_validate) + '\n'
        print(info)
        self.report_info = self.report_info + info
        clf = models[model_idx]
    # cv = 0 or cv = None: no cross-validation, single fit on all data.
    else:
        nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        info = '\n================================================================================ %s\n' % nowtime
        print(info)
        self.report_info = self.report_info + info
        clf.fit(self.X_train, np.ravel(self.y_train))
        predict_train = clf.predict_proba(self.X_train)[:, -1]
        dfks_train = ks.ks_analysis(predict_train, self.y_train.values)
        ks_train = max(dfks_train['ks_value'])
        auc_train = metrics.roc_auc_score(np.ravel(self.y_train),
                                          predict_train)
        info = '\ntrain: ks = {} \t auc = {} '.format(
            ks_train, auc_train) + '\n'
        prettyks = ks.print_ks(predict_train, self.y_train.values)
        info = info + str(prettyks) + '\n'
        print(info)
        self.report_info = self.report_info + info
    return clf
def train(self, cv=5, model_idx=5, params_dict=params_dict, n_jobs=4,
          verbose_eval=20):
    """Train an XGBoost booster, optionally with stratified CV.

    Parameters
    ----------
    cv : number of folds; 0/None trains once on ``self.dtrain``.
    model_idx : which fold's booster to return when CV is used.
    params_dict : xgboost parameters. NOTE(review): the default is bound to
        the module-level ``params_dict`` at definition time.
    n_jobs : thread count injected into the params as 'nthread'.
    verbose_eval : evaluation verbosity forwarded to ``train_xgb``.

    Side effects: appends report text to ``self.report_info`` and stores the
    feature-importance table in ``self.dfimportance``. Returns the booster.
    """
    info = "start train xgboost model ..."
    print(info)
    self.report_info = self.report_info + info + '\n'
    # Copy so the caller's (or module-level) params dict is not mutated.
    params_dict_copy = params_dict.copy()
    params_dict_copy.update({'nthread': n_jobs})
    if cv:
        k, ks_mean_train, auc_mean_train, ks_mean_validate, auc_mean_validate = 0, 0, 0, 0, 0
        models = {}
        for train_index, validate_index in stratified_kfold(
                self.X_train, np.ravel(self.y_train), nfolds=cv):
            k = k + 1
            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print('\n================================================================================ %s\n' % nowtime)
            info = 'k = {}'.format(k)
            print(info)
            self.report_info = self.report_info + info + '\n'
            X_train_k = self.X_train.iloc[train_index, :]
            y_train_k = self.y_train.iloc[train_index, :]
            X_validate_k = self.X_train.iloc[validate_index, :]
            y_validate_k = self.y_train.iloc[validate_index, :]
            dtrain_k = xgb.DMatrix(X_train_k, y_train_k['label'])
            dvalid_k = xgb.DMatrix(X_validate_k, y_validate_k['label'])
            bst, _ = train_xgb(params_dict_copy, dtrain_k, dvalid_k, None,
                               verbose_eval)
            predict_train_k = bst.predict(dtrain_k)
            predict_validate_k = bst.predict(dvalid_k)
            dfks_train = ks.ks_analysis(predict_train_k,
                                        dtrain_k.get_label())
            dfks_validate = ks.ks_analysis(predict_validate_k,
                                           dvalid_k.get_label())
            ks_train = max(dfks_train['ks_value'])
            ks_validate = max(dfks_validate['ks_value'])
            auc_train = auc(dtrain_k.get_label(), predict_train_k)
            auc_validate = auc(dvalid_k.get_label(), predict_validate_k)
            ks_mean_train = ks_mean_train + ks_train
            auc_mean_train = auc_mean_train + auc_train
            ks_mean_validate = ks_mean_validate + ks_validate
            auc_mean_validate = auc_mean_validate + auc_validate
            info = '\ntrain: ks = {} \t auc = {} '.format(ks_train,
                                                          auc_train)
            prettyks = ks.print_ks(predict_train_k, dtrain_k.get_label())
            info = info + '\n' + str(prettyks) + '\n'
            info = info + '\nvalidate: ks = {} \t auc = {}'.format(
                ks_validate, auc_validate) + '\n'
            prettyks = ks.print_ks(predict_validate_k, dvalid_k.get_label())
            info = info + str(prettyks) + '\n'
            print(info)
            self.report_info = self.report_info + info
            # Each fold trains a fresh booster, so storing the object is OK.
            models[k] = bst
        ks_mean_train = ks_mean_train / float(k)
        auc_mean_train = auc_mean_train / float(k)
        ks_mean_validate = ks_mean_validate / float(k)
        auc_mean_validate = auc_mean_validate / float(k)
        nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        info = '\n================================================================================ %s\n' % nowtime
        info = info + 'train : ks mean {:.5f} ; auc mean {:.5f}'.format(
            ks_mean_train, auc_mean_train) + '\n'
        info = info + 'validate : ks mean {:.5f} ; auc mean {:.5f}'.format(
            ks_mean_validate, auc_mean_validate) + '\n'
        print(info)
        self.report_info = self.report_info + info
        bst = models[model_idx]
    # cv = 0 or cv = None: no cross-validation, single training run.
    else:
        nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        info = '\n================================================================================ %s\n' % nowtime
        print(info)
        self.report_info = self.report_info + info
        bst, _ = train_xgb(params_dict_copy, self.dtrain, None, None,
                           verbose_eval)
        predict_train = bst.predict(self.dtrain)
        dfks_train = ks.ks_analysis(predict_train, self.y_train.values)
        ks_train = max(dfks_train['ks_value'])
        auc_train = auc(self.dtrain.get_label(), predict_train)
        info = '\ntrain: ks = {} \t auc = {} '.format(
            ks_train, auc_train) + '\n'
        prettyks = ks.print_ks(predict_train, self.y_train.values)
        info = info + str(prettyks) + '\n'
        print(info)
        self.report_info = self.report_info + info
    # Compute feature importance. list() so py3 dict views (keys/values are
    # lazy in py3) are materialized for the DataFrame constructor.
    feature_scores = bst.get_score()
    dfimportance = pd.DataFrame({
        'feature': list(feature_scores.keys()),
        'importance': list(feature_scores.values())
    })
    try:
        dfimportance = dfimportance.sort_values('importance',
                                                ascending=False)
    except AttributeError:
        # Fallback for very old pandas without DataFrame.sort_values.
        dfimportance = dfimportance.sort('importance', ascending=False)
    dfimportance.index = range(len(dfimportance))
    self.dfimportance = dfimportance
    return (bst)