Example #1
    def test(self, clf, dftest=pd.DataFrame()):

        info = "\nstart test model ... "
        print(info)
        self.report_info = self.report_info + info + '\n'

        # If a new dftest is passed in, preprocess the data again
        if len(dftest) > 0:

            print('preprocessing test data...\n')

            # Suppress console output during data preprocessing
            stdout = sys.stdout
            sys.stdout = open(os.devnull, 'w')

            X_train, y_train, X_test, y_test = self.preprocess_data(
                self.dftrain, dftest)

            # Restore console output
            sys.stdout = stdout

            # Preprocessed training and test sets
            self.X_train, self.y_train = X_train, y_train
            self.X_test, self.y_test = X_test, y_test

        # predicted probability of the positive class
        y_test_hat = clf.predict_proba(self.X_test)[:, -1]
        # KS = maximum gap between the cumulative score distributions of positives and negatives
        dfks_test = ks.ks_analysis(y_test_hat, np.ravel(self.y_test))
        ks_test = max(dfks_test['ks_value'])
        auc_test = metrics.roc_auc_score(np.ravel(self.y_test), y_test_hat)

        info = 'test: ks = {} \t auc = {} '.format(ks_test, auc_test) + '\n'
        prettyks = ks.print_ks(y_test_hat, np.ravel(self.y_test))
        info = info + str(prettyks) + '\n'
        print(info)
        self.report_info = self.report_info + info + '\n'
Example #2
    def test(self, bst, dftest=pd.DataFrame()):

        info = "\nstart test xgboost model ... \n"
        print(info)
        self.report_info = self.report_info + info + '\n'

        # If a new dftest is passed in, preprocess the data again
        if len(dftest) > 0:

            print('preprocessing test data...')

            # Suppress console output during data preprocessing
            stdout = sys.stdout
            sys.stdout = open(os.devnull, 'w')

            X_train, y_train, X_test, y_test = self.preprocess_data(
                self.dftrain, dftest)

            # Restore console output
            sys.stdout = stdout

            # Preprocessed test set
            self.X_test, self.y_test = X_test, y_test
            self.dtest = xgb.DMatrix(self.X_test, self.y_test['label'])

        y_test_hat = bst.predict(self.dtest)
        dfks_test = ks.ks_analysis(y_test_hat, np.ravel(self.y_test))
        ks_test = max(dfks_test['ks_value'])
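        # note: auc here appears to be a helper with a roc_auc_score-style signature (y_true, y_score)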
        auc_test = auc(np.ravel(self.y_test), y_test_hat)

        info = 'test: ks = {} \t auc = {} '.format(ks_test, auc_test) + '\n'
        prettyks = ks.print_ks(y_test_hat, np.ravel(self.y_test))
        info = info + str(prettyks) + '\n'
        print(info)
        self.report_info = self.report_info + info + '\n'
Example #3
def test_xgb(test_tbl, xgb_model, train_list):
    df_test_x, df_test_y, f_list_test, df_median = data_preprocess(test_tbl)
    df_test = pd.DataFrame()
    for e in train_list:
        df_test[e] = df_test_x[e]
    df_test_x = df_test
    # df_test_x.fillna(-1, inplace=True)
    print('Read test done')
    test_y = np.array(df_test_y)
    xgb = xgb_model
    test_x = np.array(df_test_x)
    y_proba = xgb.predict_proba(test_x)
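    # column 0 is P(label=0) ("good"), column 1 is P(label=1) ("bad")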
    y_score = [item[0] for item in y_proba]
    y_good = [1 - item for item in test_y]
    tmp_df = pd.DataFrame()
    tmp_df['f'] = y_score
    tmp_df['good'] = y_good
    tmp_df['bad'] = test_y
    ks_dict = run_ks(test_y, y_proba[:, 1])
    auc = roc_auc_score(test_y, y_proba[:, 1])
    print "%f\t%f" % (auc, ks_dict['ks'])
    print_ks(ks_dict, test_tbl + '_score_ks_detail')
Example #4
def test_xgb(data,
             fea_list_path,
             model_path,
             filter_name=['name', 'idcard', 'phone', 'loan_dt', 'label']):
    fea_list = []
    df = pd.read_csv(data, sep='\t')
    df_label = df['label']
    df_out = df
    for key in filter_name:
        if key in df.keys():
            del df[key]
    if fea_list_path != "":
        fea_list = [
            x.strip() for x in open(fea_list_path) if len(x.strip()) > 0
        ]
    df = df[fea_list]
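    # drop rows where every selected feature is NaN, keeping the labels and the output frame aligned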
    df_label = df_label[~df.isnull().all(axis=1)]
    df_out = df_out[~df.isnull().all(axis=1)]
    df = df[~df.isnull().all(axis=1)]
    test_x = df
    test_Y = df_label
    f_list = fea_list

    clf_xgb = xgb.Booster()
    clf_xgb.load_model(model_path)
    print("Read test done")
    # labels are not needed for scoring, so build the DMatrix from the features only
    dTest = xgb.DMatrix(test_x)
    y_proba = clf_xgb.predict(dTest)
    print(y_proba)
    ks_ = run_ks(test_Y, y_proba)
    auc_ = roc_auc_score(test_Y, y_proba)
    print "%f\t%f" % (auc_, ks_['ks'])
    df_out['prob'] = y_proba
    print_ks(ks_, data + r'_score_ks_detail')
    df_out.to_csv(data + r'_prob_', index=False, sep='\t')
Example #5
def xgb_model(sample_tbl, model_path,
              xgb_params={'nthread': 4, 'n_estimators': 80, 'max_depth': 3,
                          'min_child_weight': 2, 'gamma': 0.1, 'subsample': 0.4, 'learning_rate': 0.06,
                          'colsample_bytree': 0.5, 'scale_pos_weight': 1, 'seed': 100}):
    # os.path.join(model_res_path,  'score'), os.path.join(model_res_path, 'fip')
    plst = list(xgb_params.items())  # xgb.train accepts a list of (param, value) pairs
    f_score = os.path.join(model_path, "score")
    f_write = os.path.join(model_path, "fip")

    df_x, df_y, df_name = sample_tbl
    x = np.array(df_x)
    y = np.array(df_y)
    # A tree of depth n has 2**(n+1) - 1 nodes and 2**n leaf nodes
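    # stratified 5-fold CV keeps the good/bad ratio roughly constant across folds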
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    auc_list = []
    auc_list_train = []
    ks_list = []
    ks_list_train = []
    times = 0
    for train_index, dev_index in kf.split(x, y):
        # print "KFold: %d\nauc\tks" % times
        times += 1
        x_train, x_dev = x[train_index], x[dev_index]
        y_train, y_dev = y[train_index], y[dev_index]

        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_dev, label=y_dev)

        # evallist = [(dtrain, 'train'), (dvalid, 'eval')]
        #clf_xgb = xgb.train(plst, dtrain, num_boost_round=xgb_params['n_estimators'], evals=evallist)
        clf_xgb = xgb.train(plst, dtrain, num_boost_round=xgb_params['n_estimators'])


        clf_xgb.save_model(os.path.join(model_path, 'clf_'+str(times)))
        clf_xgb.dump_model(os.path.join(model_path, 'dump_raw_'+str(times)))

        f_weights_dict = clf_xgb.get_score(importance_type='weight')
        f_gains_dict = clf_xgb.get_score(importance_type='gain')
        f_covers_dict = clf_xgb.get_score(importance_type='cover')
        fea_analysis = []
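        # NOTE: f_list is assumed to be the original feature-name list, mapping xgboost's 'f0', 'f1', ... keys back to column names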
        for f_key in f_weights_dict:
            fea_analysis.append(
                {'feature': f_list[int(f_key[1:])], 'weight': f_weights_dict[f_key], 'gain': f_gains_dict[f_key],
                 'cover': f_covers_dict[f_key]})
        fea_analysis_df = pd.DataFrame(fea_analysis, columns=['feature', 'weight', 'gain', 'cover'])
        fea_analysis_df.sort_values(['gain'], ascending=False, inplace=True)
        fea_analysis_df.to_csv(f_write+"_"+str(times), index=None, sep='\t')

        y_proba = clf_xgb.predict(dvalid)
        y_proba_train = clf_xgb.predict(dtrain)

        ks_dict = run_ks(y_dev, y_proba)
        ks_dict_train = run_ks(y_train, y_proba_train)

        auc = roc_auc_score(y_dev, y_proba)
        auc_train = roc_auc_score(y_train, y_proba_train)
        # print "%f\t%f" % (auc, ks)

        ks_list.append(ks_dict['ks'])
        auc_list.append(auc)
        ks_list_train.append(ks_dict_train['ks'])
        auc_list_train.append(auc_train)
        print_ks(ks_dict, f_score)

    # persist the feature table and model from the last CV fold as the final artifacts
    fea_analysis_df.to_csv(f_write, index=None, sep='\t')
    clf_xgb.save_model(os.path.join(model_path, "clf"))
    clf_xgb.dump_model(os.path.join(model_path, "dump_raw"))
    dd = dict()
    dd['train_ks'] = ks_list_train
    dd['valida_ks'] = ks_list
    dd['train_auc'] = auc_list_train
    dd['valida_auc'] = auc_list
    train_ks_df = pd.DataFrame(dd)
    train_ks_df.to_csv(os.path.join(model_path, "ks_auc"), index=False, sep='\t')
    
    #params = clf_xgb.get_params()
    #print params
    ks_mean = np.mean(ks_list)
    ks_std = np.std(ks_list)
    auc_mean = np.mean(auc_list)
    auc_std = np.std(auc_list)
    ks_mean_train = np.mean(ks_list_train)
    ks_std_train = np.std(ks_list_train)
    auc_mean_train = np.mean(auc_list_train)
    auc_std_train = np.std(auc_list_train)

    print('train: ')
    print("ks mean: %f, ks std: %f" % (ks_mean_train, ks_std_train))
    print("auc mean: %f, auc std: %f" % (auc_mean_train, auc_std_train))
    print('validation:')
    print("ks mean: %f, ks std: %f" % (ks_mean, ks_std))
    print("auc mean: %f, auc std: %f" % (auc_mean, auc_std))
Example #6
    def train(self, clf, cv=5, model_idx=5):

        if cv:
            #skf = StratifiedKFold(n_splits = cv,shuffle=True)

            k, ks_mean_train, auc_mean_train, ks_mean_validate, auc_mean_validate = 0, 0, 0, 0, 0

            models = {}

            #for train_index,validate_index in skf.split(self.X_train,np.ravel(self.y_train)):
            for train_index, validate_index in stratified_kfold(
                    self.X_train, np.ravel(self.y_train), nfolds=cv):

                k = k + 1
                nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                info = '\n{}: k = {}'.format(nowtime, k)
                print(info)
                self.report_info = self.report_info + info + '\n'

                X_train_k, y_train_k = self.X_train.iloc[
                    train_index, :], self.y_train.iloc[train_index, :]
                X_validate_k, y_validate_k = self.X_train.iloc[
                    validate_index, :], self.y_train.iloc[validate_index, :]

                clf.fit(X_train_k, np.ravel(y_train_k))
                predict_train_k = clf.predict_proba(X_train_k)[:, -1]
                predict_validate_k = clf.predict_proba(X_validate_k)[:, -1]

                dfks_train = ks.ks_analysis(predict_train_k, y_train_k.values)
                dfks_validate = ks.ks_analysis(predict_validate_k,
                                               y_validate_k.values)

                ks_train, ks_validate = max(dfks_train['ks_value']), max(
                    dfks_validate['ks_value'])

                auc_validate = metrics.roc_auc_score(np.ravel(y_validate_k),
                                                     predict_validate_k)
                auc_train = metrics.roc_auc_score(np.ravel(y_train_k),
                                                  predict_train_k)

                ks_mean_train = ks_mean_train + ks_train
                auc_mean_train = auc_mean_train + auc_train
                ks_mean_validate = ks_mean_validate + ks_validate
                auc_mean_validate = auc_mean_validate + auc_validate

                info = '\ntrain: ks = {} \t auc = {} '.format(
                    ks_train, auc_train)
                prettyks = ks.print_ks(predict_train_k, y_train_k.values)
                info = info + '\n' + str(prettyks) + '\n'
                info = info + '\nvalidate: ks = {} \t auc = {}'.format(
                    ks_validate, auc_validate) + '\n'
                prettyks = ks.print_ks(predict_validate_k, y_validate_k.values)
                info = info + str(prettyks) + '\n'
                print(info)
                self.report_info = self.report_info + info

                # note: clf.fit refits the same object each fold, so every entry in models references the final fitted estimator
                models[k] = clf

            ks_mean_train = ks_mean_train / float(k)
            auc_mean_train = auc_mean_train / float(k)
            ks_mean_validate = ks_mean_validate / float(k)
            auc_mean_validate = auc_mean_validate / float(k)

            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info = '\n================================================================================ %s\n' % nowtime
            info = info + 'train : ks mean {:.5f} ; auc mean {:.5f}'.format(
                ks_mean_train, auc_mean_train) + '\n'
            info = info + 'validate : ks mean {:.5f} ; auc mean {:.5f}'.format(
                ks_mean_validate, auc_mean_validate) + '\n'
            print(info)
            self.report_info = self.report_info + info

            clf = models[model_idx]

        # Handle cv = 0 or cv = None: train once on the full training set without cross-validation
        else:

            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info = '\n================================================================================ %s\n' % nowtime
            print(info)
            self.report_info = self.report_info + info

            clf.fit(self.X_train, np.ravel(self.y_train))
            predict_train = clf.predict_proba(self.X_train)[:, -1]
            dfks_train = ks.ks_analysis(predict_train, self.y_train.values)
            ks_train = max(dfks_train['ks_value'])
            auc_train = metrics.roc_auc_score(np.ravel(self.y_train),
                                              predict_train)

            info = '\ntrain: ks = {} \t auc = {} '.format(ks_train,
                                                          auc_train) + '\n'
            prettyks = ks.print_ks(predict_train, self.y_train.values)
            info = info + str(prettyks) + '\n'
            print(info)
            self.report_info = self.report_info + info

        return (clf)
Example #7
    def train(self,
              cv=5,
              model_idx=5,
              params_dict=params_dict,
              n_jobs=4,
              verbose_eval=20):

        info = "start train xgboost model ..."
        print(info)
        self.report_info = self.report_info + info + '\n'

        params_dict_copy = params_dict.copy()
        params_dict_copy.update({'nthread': n_jobs})
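        # override the thread count on a copy so the module-level params_dict stays untouched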

        if cv:

            k, ks_mean_train, auc_mean_train, ks_mean_validate, auc_mean_validate = 0, 0, 0, 0, 0

            models = {}
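            # keep the booster trained on each fold; model_idx selects which one to return below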

            for train_index, validate_index in stratified_kfold(
                    self.X_train, np.ravel(self.y_train), nfolds=cv):

                k = k + 1
                nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print(
                    '\n================================================================================ %s\n'
                    % nowtime)
                info = 'k = {}'.format(k)
                print(info)
                self.report_info = self.report_info + info + '\n'

                X_train_k, y_train_k = self.X_train.iloc[
                    train_index, :], self.y_train.iloc[train_index, :]
                X_validate_k, y_validate_k = self.X_train.iloc[
                    validate_index, :], self.y_train.iloc[validate_index, :]

                dtrain_k = xgb.DMatrix(X_train_k, y_train_k['label'])
                dvalid_k = xgb.DMatrix(X_validate_k, y_validate_k['label'])

                bst, _ = train_xgb(params_dict_copy, dtrain_k, dvalid_k, None,
                                   verbose_eval)
                predict_train_k = bst.predict(dtrain_k)
                predict_validate_k = bst.predict(dvalid_k)

                dfks_train = ks.ks_analysis(predict_train_k,
                                            dtrain_k.get_label())
                dfks_validate = ks.ks_analysis(predict_validate_k,
                                               dvalid_k.get_label())

                ks_train, ks_validate = max(dfks_train['ks_value']), max(
                    dfks_validate['ks_value'])

                auc_train = auc(dtrain_k.get_label(), predict_train_k)
                auc_validate = auc(dvalid_k.get_label(), predict_validate_k)

                ks_mean_train = ks_mean_train + ks_train
                auc_mean_train = auc_mean_train + auc_train
                ks_mean_validate = ks_mean_validate + ks_validate
                auc_mean_validate = auc_mean_validate + auc_validate

                info = '\ntrain: ks = {} \t auc = {} '.format(
                    ks_train, auc_train)
                prettyks = ks.print_ks(predict_train_k, dtrain_k.get_label())
                info = info + '\n' + str(prettyks) + '\n'
                info = info + '\nvalidate: ks = {} \t auc = {}'.format(
                    ks_validate, auc_validate) + '\n'
                prettyks = ks.print_ks(predict_validate_k,
                                       dvalid_k.get_label())
                info = info + str(prettyks) + '\n'
                print(info)
                self.report_info = self.report_info + info

                models[k] = bst

            ks_mean_train = ks_mean_train / float(k)
            auc_mean_train = auc_mean_train / float(k)
            ks_mean_validate = ks_mean_validate / float(k)
            auc_mean_validate = auc_mean_validate / float(k)

            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info = '\n================================================================================ %s\n' % nowtime
            info = info + 'train : ks mean {:.5f} ; auc mean {:.5f}'.format(
                ks_mean_train, auc_mean_train) + '\n'
            info = info + 'validate : ks mean {:.5f} ; auc mean {:.5f}'.format(
                ks_mean_validate, auc_mean_validate) + '\n'
            print(info)
            self.report_info = self.report_info + info

            bst = models[model_idx]

        # Handle cv = 0 or cv = None: train once on the full training set without cross-validation
        else:

            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info = '\n================================================================================ %s\n' % nowtime
            print(info)
            self.report_info = self.report_info + info

            bst, _ = train_xgb(params_dict_copy, self.dtrain, None, None,
                               verbose_eval)
            predict_train = bst.predict(self.dtrain)
            dfks_train = ks.ks_analysis(predict_train, self.y_train.values)
            ks_train = max(dfks_train['ks_value'])
            auc_train = auc(self.dtrain.get_label(), predict_train)

            info = '\ntrain: ks = {} \t auc = {} '.format(ks_train,
                                                          auc_train) + '\n'
            prettyks = ks.print_ks(predict_train, self.y_train.values)
            info = info + str(prettyks) + '\n'
            print(info)
            self.report_info = self.report_info + info

        # Compute feature importances
        feature_scores = bst.get_score()
        dfimportance = pd.DataFrame({
            'feature': list(feature_scores.keys()),
            'importance': list(feature_scores.values())
        })
        try:
            dfimportance = dfimportance.sort_values('importance',
                                                    ascending=False)
        except AttributeError as err:
            dfimportance = dfimportance.sort('importance', ascending=False)

        dfimportance.index = range(len(dfimportance))

        self.dfimportance = dfimportance

        return (bst)