Пример #1
0
def summarize_perf_metrics(score_dict, all_Y, y_col):
    """
    把模型在不同样本和Y的表现(KS, AUC)汇总到一个表格

    Args:
    score_dict (dict): the key is the label of the sample set. e.g. 'train',
        'test', 'oot' etc. should be the save as data_cat_dict.keys()
        The value is the scores. index is apply_id
    all_Y (pd.DataFrame): should have ['Y_build', 'Y_fid14'], 'Y_build'
        name can be changed. index is apply_id
    y_col (str): the column name of Y

    """
    metrics_result_build = {}
    for label, score in score_dict.items():
        logging.log(18, label + ' data set starts performance calculation')
        merged_data = score['score'].to_frame('score')\
                           .merge(all_Y, left_index=True, right_index=True)
        if merged_data[y_col].count() > 10:
            merged_data = merged_data.loc[merged_data[y_col].notnull()]
            metrics_result_build[label] = {}
            metrics_result_build[label]['AUC'] = mt.Performance(
            ).calculate_auc(merged_data[y_col], 1. / merged_data.score)
            metrics_result_build[label]['KS'] = mt.Performance(
            ).calculate_ks_by_score(merged_data[y_col], merged_data.score)

    perf_result = pd.DataFrame(metrics_result_build)
    perf_result = perf_result.reset_index()
    perf_result = perf_result.rename(columns={'index': 'metrics'})
    return perf_result
Пример #2
0
def get_decile(score,
               varscore,
               all_Y,
               y_col,
               label,
               RESULT_PATH,
               manual_cut_bounds=[]):
    """
    根据提供的score和Y相应的Y定义

    Args:
    score (pd.DataFrame): index is apply_id
    all_Y (pd.DataFrame): should have ['Y_build', 'Y_fid14'], 'Y_build'
        name can be changed. index is apply_id
    y_col (str): column name of Y
    label (str): sample set label

    """
    merged_data = score.to_frame('score')\
                       .merge(all_Y, left_index=True, right_index=True)
    merged_data = merged_data.loc[merged_data[y_col].notnull()]
    if len(merged_data) > 10:
        ks_decile = mt.Performance().calculate_ks_by_decile(merged_data.score, \
                                    np.array(merged_data[y_col]), 'decile', 10,
                                    manual_cut_bounds=manual_cut_bounds)
        ks_decile.loc[:, 'sample_set'] = label
    else:
        ks_decile = pd.DataFrame()

    varscore_new = varscore.merge(score.to_frame('score'),
                                  left_index=True,
                                  right_index=True)

    if not ks_decile.empty:
        if len(manual_cut_bounds) == 0:
            manual_cut_bounds = mt.BinWoe().obtain_boundaries(
                ks_decile['分箱'])['cut_boundaries']

        varscore_new.loc[:,
                         'score_10bin'] = pd.cut(varscore_new.score,
                                                 manual_cut_bounds).astype(str)
        bin_varscore_avg = varscore_new.groupby('score_10bin').mean()\
                                       .reset_index()\
                                       .rename(columns={'score_10bin': '分箱'})
        ks_decile = ks_decile.merge(bin_varscore_avg, on='分箱')
        return ks_decile, manual_cut_bounds
    else:
        return pd.DataFrame(), manual_cut_bounds
Пример #3
0
def score_stability(score_dict, all_Y, y_col, train_set_name, RESULT_PATH,
                    model_label):
    """
    计算所有sample_set的score stability
    Args:
    score_dict (dict): the key is the label of the sample set. e.g. 'train',
        'test', 'oot' etc. should be the save as data_cat_dict.keys()
        The value is the scores. index is apply_id
    all_Y (pd.DataFrame): should have ['Y_build', 'Y_fid14'], 'Y_build'
        name can be changed. index is apply_id
    y_col (str): column name of Y
    """
    all_decile_df = pd.read_excel(os.path.join(RESULT_PATH, 'all_decile.xlsx'))
    the_bool = ((all_decile_df.model_label == model_label)
                & (all_decile_df.sample_set == train_set_name)
                & (all_decile_df.Y_definition == y_col))
    train_decile = all_decile_df.loc[the_bool].copy()
    point_bounds = mt.BinWoe().obtain_boundaries(
        train_decile['分箱'])['cut_boundaries']

    d_list = []
    compare_set_names = [
        i for i in list(score_dict.keys()) if i != train_set_name
    ]
    for label in compare_set_names:
        logging.log(18, label + ' data set starts score PSI calculation')
        tmp = mt.Performance().score_psi(score_dict[train_set_name]['score'],
                                         score_dict[label]['score'],
                                         point_bounds)
        columns_order = tmp.columns
        tmp.loc[:, 'compare_set'] = '%s_vs_%s' % (train_set_name, label)
        d_list.append(tmp)

    columns_order = ['model_label', 'compare_set'] + list(tmp.columns.values)
    all_score_psi = pd.concat(d_list)
    all_score_psi.loc[:, 'model_label'] = model_label
    return all_score_psi[columns_order]
Пример #4
0
# 1)把主键设置成index ;2)在代码前期声名y; 3)train-test模式和train-test-OOT模式

import sys
sys.path.append('/Users/yantingting/Documents/Code/CodePackage/')
from utils3.summary_statistics import *
from utils3.data_bins import *
from utils3.common_fun import *
from utils3.stepregression import *
from utils3.scorecard import *
import utils3.metrics as mt
import pandas as pd
pd.set_option('display.max_columns', None)
import os

pf = mt.Performance()

data_path = '/Users/yantingting/Documents/Code/'
result_path = '/Users/yantingting/Documents/Code/result'
if not os.path.exists(result_path):
    os.makedirs(result_path)

df = pd.read_excel(os.path.join(data_path, 'sample_data.xlsx'))
df.set_index(['company_name_md5'], inplace=True)
df.head()
df['flag'].sum() / df['flag'].count()

# step 1:生成数据字典,方便后期管理建模可用的变量
my_dict = DataSummary().create_dict(df,
                                    is_available='Y',
                                    data_sorce='TYC',
Пример #5
0
def generate_xgb_deployment_documents(model_label,
                                      eda_table,
                                      model_spec,
                                      xgb_importance_score,
                                      var_dict,
                                      model_decile,
                                      model_result,
                                      RESULT_PATH,
                                      test_case_data,
                                      test_nums,
                                      woe_iv_df_coarse,
                                      production_name_map={}):
    """
    生成XGBoost模型部署文档

    Args
    model_label (str): 模型名称
    eda_table (dataFrame): EDA结果
    model_spec (dataFrame): XGBoost输出数据和分箱明细.pkl 文件
    xgb_importance_score (dataFrame): xgboost模型变量重要性排序
    var_dict (dataFrame): 数据字典
    model_decile(dataFrame): 建模decile
    model_result (dict): 模型结果
    RESULT_PATH (str): 结果路径
    test_case_data(dataFrame): 生成testcase的测试数据, 如果用于部署, 请确保apply_id为index
    test_nums(int):生成测试用例的数量
    woe_iv_df_coarse(dataFrame): 建模粗分箱结果
    production_name_map (dict): 当有指标英文线上的名字和线下建模时不一样时传入。key=建模时英文名
        value=线上英文名。 default={} 默认建模时和线上没有命名不一致的情况
    """
    if not os.path.exists(os.path.join(RESULT_PATH, 'deployment')):
        os.makedirs(os.path.join(RESULT_PATH, 'deployment'))

    DEPLOY_PATH = os.path.join(RESULT_PATH, 'deployment')

    eda_table = eda_table
    model_spec = model_spec
    rebin_spec = model_spec['rebin_spec']
    bin_to_label = model_spec['bin_to_label']
    dummy_var_name_map = model_spec['dummy_var_name_map']

    impt_writer = pd.ExcelWriter(
        os.path.join(DEPLOY_PATH, '%s_variable_importance.xlsx' % model_label))
    xgb_importance_score.to_excel(impt_writer, 'local', index=False)
    impt_writer.save()

    xgb_importance_score = xgb_importance_score
    xgb_importance_score = xgb_importance_score.rename(
        columns={
            'feature': 'XGB衍生入模名称',
            'fscore': '指标用于Split数据的数量',
            'imp_pct': '指标用于Split数据的数量占比'
        })

    xgb_importance_score[['XGB变量转换类型', '中间层指标名称']] = xgb_importance_score['XGB衍生入模名称']\
                        .apply(lambda x: pd.Series(mt.BinWoe().xgboost_obtain_raw_variable(x, var_dict)))

    xgb_importance_score['建模时指标名称'] = xgb_importance_score['中间层指标名称'].copy()
    xgb_importance_score['建模时XGB衍生入模名称'] = xgb_importance_score[
        'XGB衍生入模名称'].copy()
    for original, new_name in production_name_map.items():
        a = xgb_importance_score.loc[xgb_importance_score['建模时指标名称']==original, '建模时XGB衍生入模名称']\
                                .apply(lambda x: x.replace(original, new_name))
        xgb_importance_score.loc[xgb_importance_score['建模时指标名称'] == original,
                                 'XGB衍生入模名称'] = a
        if original in rebin_spec:
            rebin_spec[new_name] = rebin_spec.pop(original)
        if original in bin_to_label:
            bin_to_label[new_name] = bin_to_label.pop(original)
        # 这一步很重要,要将字典里边的变量名也改掉,不然rebin_spec_json = mu.process_rebin_spec(rebin_spec, var_dict, num_variables+bin_variables)会出错
        if original in list(var_dict['指标英文']):
            var_dict['指标英文'].replace({original: new_name}, inplace=True)

    xgb_importance_score['中间层指标名称'] = xgb_importance_score['中间层指标名称'].replace(
        production_name_map)

    xgb_importance_score = var_dict[['数据源', '指标英文', '指标中文', '数据类型']]\
                                .rename(columns={'指标英文':'中间层指标名称'})\
                                .merge(xgb_importance_score, on='中间层指标名称', how='right')

    xgb_importance_score.insert(
        5, '输出打分指标名称', xgb_importance_score['XGB衍生入模名称'].apply(
            lambda x: 'mlr_' + str(x) + '_xgb_' + model_label))

    xgb_importance_score = xgb_importance_score.append(
        {'输出打分指标名称': 'mlr_creditscore_xgb_' + model_label}, ignore_index=True)
    xgb_importance_score = xgb_importance_score.append(
        {'输出打分指标名称': 'mlr_prob_xgb_' + model_label}, ignore_index=True)

    model_decile = model_decile

    selected_variables = xgb_importance_score['建模时指标名称'].unique()
    model_eda = eda_table.loc[eda_table['指标英文'].isin(
        selected_variables)].copy()
    model_eda['指标英文'] = model_eda['指标英文'].replace(production_name_map)

    woe_iv_df_coarse_copy = woe_iv_df_coarse.copy()
    if len(production_name_map) > 0:
        for j, k in production_name_map.items():
            woe_iv_df_coarse_copy = woe_iv_df_coarse_copy.replace(j, k)

    xgb_importance_score.rename(columns={'数据源': '指标分类'}, inplace=True)
    xgb_importance_score_bin_result = pd.merge(xgb_importance_score.drop_duplicates('中间层指标名称')[['指标分类','中间层指标名称','XGB衍生入模名称','输出打分指标名称','指标用于Split数据的数量'\
     ,'指标用于Split数据的数量占比','XGB变量转换类型','建模时指标名称','建模时XGB衍生入模名称']]\
         ,woe_iv_df_coarse_copy,left_on='中间层指标名称',right_on='指标英文',how='left')\
    [['指标分类','中间层指标名称','指标中文','XGB衍生入模名称','输出打分指标名称','指标用于Split数据的数量'\
     ,'指标用于Split数据的数量占比','XGB变量转换类型','建模时指标名称','建模时XGB衍生入模名称'\
    ,'数据类型','指标类型','分箱','分箱对应原始分类','N','分布占比','WOE','逾期率']]

    xgb_importance_score_bin_result[
        '分箱'] = xgb_importance_score_bin_result.apply(
            lambda x: x['分箱对应原始分类'] if x['分箱对应原始分类'] else x['分箱'], axis=1)
    xgb_importance_score_bin_result.drop(['分箱对应原始分类'], axis=1, inplace=True)

    writer = pd.ExcelWriter(
        os.path.join(DEPLOY_PATH, '%s部署文档.xlsx' % model_label))
    xgb_importance_score_bin_result.to_excel(writer,
                                             '2_模型变量重要性排序及分箱统计',
                                             index=False)
    model_decile.to_excel(writer, '3_模型decile', index=False)
    model_eda.to_excel(writer, '4_模型EDA', index=False)
    writer.save()
    logging.info("""第六步部署:生成XGBoost部署文档。
    1. 模型部署Excel文档存储于:%s
    2. 需添加『0_文档修订记录』、『1_信息总览』页面。详见其他正式部署文档文件。并存储于『/Seafile/模型共享/模型部署文档/』相应文件夹中
    """ % os.path.join(DEPLOY_PATH, '%s部署文档.xlsx' % model_label))

    model_result = model_result
    derive_name_map = dict(
        zip(xgb_importance_score['建模时XGB衍生入模名称'],
            xgb_importance_score['XGB衍生入模名称']))
    xgbmodel = model_result['model_final']

    var_list = []
    for i in xgbmodel.__dict__['feature_names']:
        try:
            var_list.append(derive_name_map[i])
        except:
            var_list.append(i)
    xgbmodel.__dict__['feature_names'] = var_list

    if len(production_name_map) > 0:
        var_list_copy = []
        for i in var_list:
            for j, k in production_name_map.items():
                r = i.replace(j, k)
            var_list_copy.append(r)
        xgbmodel.__dict__['feature_names'] = var_list_copy

    num_variables = list(xgb_importance_score.loc[
        xgb_importance_score['XGB变量转换类型'] == 'num_vars_origin',
        '中间层指标名称'].unique())
    bin_variables = list(xgb_importance_score.loc[
        xgb_importance_score['XGB变量转换类型'] == 'bin_vars', '中间层指标名称'].unique())
    rebin_spec_json = mu.process_rebin_spec(rebin_spec, var_dict,
                                            num_variables + bin_variables)
    bin_to_label = {
        k: v
        for k, v in bin_to_label.items() if k in bin_variables
    }

    var_transform_method = {}
    var_transform_method['num_vars_origin'] = num_variables
    var_transform_method['bin_vars'] = bin_variables
    var_transform_method['dummy_vars'] = {}
    dummy_vars_df = xgb_importance_score.loc[xgb_importance_score['XGB变量转换类型']
                                             == 'dummy_vars'].copy()
    dummy_vars_df.loc[:, "dummy原始名称"] = dummy_vars_df['建模时XGB衍生入模名称'].apply(
        lambda x: dummy_var_name_map[x])
    dummy_vars_df.loc[:,
                      'dummy原始对应分类'] = dummy_vars_df.loc[:, "dummy原始名称"].apply(
                          lambda x: x.split('DUMMY')[-1])
    for original_variable in dummy_vars_df['中间层指标名称'].unique():
        cat_list = list(
            dummy_vars_df.loc[dummy_vars_df['中间层指标名称'] == original_variable,
                              'dummy原始对应分类'].unique())
        var_transform_method['dummy_vars'][original_variable] = cat_list

    production_dummy_var_name_map = {}
    for new_dummy_name, old_dummy_name in dummy_var_name_map.items():
        if new_dummy_name != old_dummy_name:
            if new_dummy_name.split('DUMMY')[0] in production_name_map:
                var_name = new_dummy_name.split('DUMMY')[0]
                prod_var_name = production_name_map[var_name]
                production_dummy_var_name_map[old_dummy_name.replace(
                    var_name, prod_var_name)] = new_dummy_name.replace(
                        var_name, prod_var_name)
            else:
                production_dummy_var_name_map[old_dummy_name] = new_dummy_name

    var_transform_method['dummy_var_name_map'] = production_dummy_var_name_map

    save_data_to_json(rebin_spec_json, DEPLOY_PATH,
                      '%s_selected_rebin_spec.json' % model_label)
    save_data_to_json(bin_to_label, DEPLOY_PATH,
                      '%s_bin_to_label.json' % model_label)
    save_data_to_python2_pickle(xgbmodel, DEPLOY_PATH,
                                '%s_xgbmodel.pkl' % model_label)
    save_data_to_json(var_transform_method, DEPLOY_PATH,
                      '%s_var_transform_method.json' % model_label)
    ''' 生成模型的testcase '''
    if test_nums < len(test_case_data):
        test_case_data_final = test_case_data[:test_nums]
    else:
        test_case_data_final = test_case_data.copy()
    if len(production_name_map) > 0:
        columns = []
        for i in test_case_data_final.columns:
            for j, k in production_name_map.items():
                r = i.replace(j, k)
            columns.append(r)
        test_case_data_final.columns = columns

    offline_model_score = pd.concat([model_result['p_train'].apply(mt.Performance()\
                                  .p_to_score).to_frame('offline_model_score'),\
                                    model_result['p_test'].apply(mt.Performance()\
                                  .p_to_score).to_frame('offline_model_score')])

    model_used_origin_features = [
        i for i in list(set(xgb_importance_score['中间层指标名称']))
        if str(i) != 'nan' and i != None
    ]
    test_case_data_final_with_xgbScore = pd.merge(
        test_case_data_final[model_used_origin_features],
        offline_model_score,
        left_index=True,
        right_index=True)

    def get_xgb_testcase(test_data, modelName, productName):
        strs = ''' curl -d 'varparams={"modelName":"%s","productName":"%s","applyId": 123, ''' % (
            modelName, productName)
        for i, j in test_data.items():
            strs = strs + '"%s"' % i + ':"%s"' % j + ', '
        final = strs[:-2] + '''}' -X POST http://localhost:8080/modelInvoke'''
        return final

    test_case_data_final_with_xgbScore[
        'test_case'] = test_case_data_final_with_xgbScore[
            test_case_data_final_with_xgbScore.columns.difference(
                ['offline_model_score'])].to_dict('records')
    test_case_data_final_with_xgbScore[
        'curl_script'] = test_case_data_final_with_xgbScore['test_case'].apply(
            get_xgb_testcase, args=(model_label, model_label))
    test_case_data_final_with_xgbScore[[
        'offline_model_score', 'curl_script'
    ]].to_csv(os.path.join(DEPLOY_PATH,
                           '%s_xgbmodel_testcase.csv' % model_label),
              index=None)
    '''为秒算模型部署平台生成测试用例'''
    generate_xgb_testcases(model_label, test_case_data_final, var_dict,
                           DEPLOY_PATH, model_result)

    logging.info("""第六步部署:生成XGBoost部署文档。
    线上部署配置文件存储于%s路径下
    1. %save_data_dict_to_pickle
    2. %s
    3. %s
    4. %s
    """ % (
        DEPLOY_PATH,
        '%s_selected_rebin_spec.json' % model_label,
        '%sbin_to_label.json' % model_label,
        '%s_var_transform_method.json' % model_label,
        '%s_xgbmodel.pkl' % model_label,
    ))
Пример #6
0
def generate_xgb_testcases(model_label,
                           x_with_apply_id,
                           var_dict,
                           RESULT_PATH,
                           model_result=None,
                           is_backscoring=False):
    '''
    :param model_label(str): 模型命名, 发布模型时定义的模型名称
    :param x_with_apply_id(DataFrame): X变量, 必须有apply_id, apply_id务必设置为index
    :param var_dict(DataFrame): 变量字典
    :param model_result(dict): 建模时的模型结果, 建模时的[模型结果.pkl]. 必须有p_train. 务必保证model_result['p_train']的索引为apply_id,model_result['p_test']同理
    :param is_backscoring(boolean): True 用于回溯打分, 不需要建模样本的打分结果; False 用于建模样本生成建模样本的测试用例, 需同时传入model_result
    :return:
    '''
    def auto_xgb_testcase(test_data, modelName):
        '''
        :param test_data(dict):
        :param modelName(str):
        :return :json dict
        '''
        strs = ''' {"modelName":"%s","productName":"%s","applyId": 123, ''' % (
            modelName, modelName)
        for i, j in test_data.items():
            strs = strs + '"%s"' % i + ':"%s"' % j + ', '
        final = strs[:-2] + '''} '''
        return final

    x_columns = list(
        set((var_dict['指标英文']).unique()).intersection(
            set(x_with_apply_id.columns.values)))
    x_columns
    X = x_with_apply_id[x_columns]
    X = mu.process_missing(X, var_dict, verbose=True)
    X = mu.convert_right_data_type(X, var_dict)[0]

    if is_backscoring:
        X['origin_features'] = X.to_dict('records')
        X['var_param'] = X.apply(
            lambda row: auto_xgb_testcase(row['origin_features'], model_label),
            axis=1)
        test_cases = X[['var_param']]
        test_cases['offline_model_score'] = -8887
    else:
        model_sample_score = pd.concat([
            model_result['p_train'].apply(
                mt.Performance().p_to_score).to_frame('offline_model_score'),
            (model_result['p_test'].apply(
                mt.Performance().p_to_score).to_frame('offline_model_score'))
        ])
        model_sample_x_with_score = model_sample_score.merge(x_with_apply_id,
                                                             left_index=True,
                                                             right_index=True,
                                                             how='inner')
        model_sample_x_with_score[
            'origin_features'] = model_sample_x_with_score[
                model_sample_x_with_score.columns.difference(
                    ['offline_model_score'])].to_dict('records')

        model_sample_x_with_score[
            'var_param'] = model_sample_x_with_score.apply(
                lambda row: auto_xgb_testcase(row['origin_features'],
                                              model_label),
                axis=1)
        test_cases = model_sample_x_with_score[[
            'offline_model_score', 'var_param'
        ]]

    result_df = test_cases[['offline_model_score', 'var_param']]
    result_df.reset_index(inplace=True)
    result_df.columns = ['applyId', 'offline_model_score', 'var_param']
    writer = pd.ExcelWriter(
        os.path.join(RESULT_PATH, '%s_testcases.xlsx' % model_label))
    result_df.to_excel(writer, index=False)
    writer.save()
    return
Пример #7
0
def get_score_card(model_label, DATA_PATH, RESULT_PATH, woe_file_name,
                   rebin_spec_name, var_dict):
    """
    为了延展性输入model_label将相应的评分卡计算并存好

    Args:
    model_label (str): 模型版本名
    RESULT_PATH (str): 存储路径
    woe_file_name (str): 存储的woe_iv_df的文件名,用于转换计算评分卡
    rebin_spec_name (str): 变量粗分箱(或者评分卡建模用的分箱)边界spec文件名
    var_dict (pd.DataFrame): 标准变量字典

    Returns:
    存储评分卡至: RESULT_PATH, '%s_score_card.xlsx' % model_label
    """
    rebin_spec = load_data_from_pickle(DATA_PATH, rebin_spec_name)
    model_result = load_data_from_pickle(DATA_PATH, '%s模型结果.pkl' % model_label)
    coefficients = model_result['model_final'].params.to_frame('beta')\
                              .reset_index().rename(columns={'index':'var_code'})

    woe_iv_df_coarse = pd.read_excel(os.path.join(RESULT_PATH, woe_file_name))
    woe_iv_df_coarse.loc[:, '指标英文'] = woe_iv_df_coarse.loc[:,
                                                           '指标英文'].astype(str)
    cleaned_woe = woe_iv_df_coarse.loc[
        woe_iv_df_coarse[u'指标英文'].isin(coefficients.var_code),
        [u'指标英文', u'分箱', '分箱对应原始分类', 'N', u'分布占比', 'WOE', u'逾期率', u'Bad分布占比']]

    # woe表是根据实际数据计算的,可能实际数据中缺少某些字段的某一箱
    should_haves_list = []
    for var_name in cleaned_woe['指标英文'].unique():
        var_type = woe_iv_df_coarse.loc[woe_iv_df_coarse['指标英文'] == var_name,
                                        '数据类型'].iloc[0]
        if var_type == 'varchar':
            if var_name in rebin_spec:
                complete_bins_df = pd.Series(list(
                    rebin_spec[var_name].keys())).to_frame('分箱')
                complete_bins_df.loc[:, '指标英文'] = var_name
            else:
                complete_bins_df = cleaned_woe.loc[
                    cleaned_woe['指标英文'] == var_name, ['指标英文', '分箱']].copy()
        else:
            a = pd.Series(0)
            # 这样拿到的是最全的分箱值,不依赖数据里是否有这一分箱。且顺序是排好的
            complete_bins = [
                str(i) for i in pd.cut(a, rebin_spec[var_name]
                                       ['cut_boundaries']).cat.categories
            ]
            if var_type == 'integer':
                other_categories = [
                    int(i) for i in rebin_spec[var_name]['other_categories']
                ]

            if var_type == 'float':
                other_categories = [
                    float(i) for i in rebin_spec[var_name]['other_categories']
                ]

            complete_bins_list = sorted(other_categories) + list(complete_bins)
            complete_bins_df = pd.Series(complete_bins_list).to_frame('分箱')
            complete_bins_df.loc[:, '指标英文'] = var_name
        should_haves_list.append(complete_bins_df)

    should_haves_df = pd.concat(should_haves_list)
    should_haves_df = should_haves_df.astype(str)
    cleaned_woe[[u'指标英文', u'分箱']] = cleaned_woe[[u'指标英文', u'分箱']].astype(str)
    cleaned_woe = should_haves_df.merge(cleaned_woe,
                                        on=[u'指标英文', u'分箱'],
                                        how='left')
    cleaned_woe['N'] = cleaned_woe['N'].fillna(0)
    cleaned_woe['分布占比'] = cleaned_woe['分布占比'].fillna(0)
    cleaned_woe['WOE'] = cleaned_woe['WOE'].fillna(0)

    score_card = mt.Performance().calculate_score(coefficients, cleaned_woe,
                                                  'var_score', var_dict)
    score_card.to_excel(os.path.join(RESULT_PATH,
                                     '%s_score_card.xlsx' % model_label),
                        index=False)
Пример #8
0
    Data_path, '%s模型结果.pkl' % (model_label + 'XGBHyperopt'))
plot_obj.plot_for_the_model(Result_path, model_result_hyperopt, with_test=True)

# #### 查看XGBHyperopt(超参)对应的每一颗决策树的决策点
model_result_hyperopt['model_final'].dump_model(
    Result_path + '/model_result_hyperopt_trees.txt')

# #### 查看XGBHyperopt(超参)每一颗决策树对应的information和population
train_model_result_hyperopt_leafs = ss.get_xgboost_tree_leaf_dist_and_badRate(
    model_result_hyperopt, X_train[selected], y_train, y_col_name)
train_model_result_hyperopt_leafs.head(10)
"""
10.4 XGBHyperopt模型decile
"""
train_XGBHyperopt_score = model_result_hyperopt['p_train'].apply(
    mt.Performance().p_to_score)
test_XGBHyperopt_score = model_result_hyperopt['p_test'].apply(
    mt.Performance().p_to_score)

# 训练集
ks_decile_XGBHyperopt_train = mt.Performance().calculate_ks_by_decile(
    train_XGBHyperopt_score, np.array(y_train), 'decile', q=10)
ks_decile_XGBHyperopt_train

# 测试集
point_bounds_XGBHyperopt = mt.BinWoe().obtain_boundaries(
    ks_decile_XGBHyperopt_train[u'分箱'])['cut_boundaries']
ks_decile_XGBHyperopt_test = mt.Performance().calculate_ks_by_decile(
    test_XGBHyperopt_score,
    np.array(y_test),
    'decile',
Пример #9
0
def xgb_online_score_validity(data,input_vars,var_dict,var_transform_method,model_booster,model_decile,onLineScoreName):
    '''
    该函数为xgb模型上线以后统计是否存在缺失值未填充、打分错误以及PSI相关函数

    Args:

    data(DataFrame):其中表头必须含有apply_id,var_code,create_at字段,
    大家严格按照部署监控时刻的要求撰写PSI监控时刻的SQL取数则不会出错,如果唯一标识不是apply_id,务必将唯一标识字段名修改为apply_id
    input_vars(list):原始变量
    var_dict(DataFrame):数据字典
    var_transform_method(json):部署时刻的变量转换文件
    model_booster(booster):模型部署时刻的booster文件
    model_decile(DataFrame):model建模时候的decile
    onLineScoreName:xgb模型线上打分字段名

    Returns:
    data_input_vars_missing(DataFrame):未填充缺失值的所有apply
    data_scored_applys_wrong(DataFrame):打分错误明细
    model_decile_summary(DataFrame):模型分PSI
    '''
    data = data.sort_values(['apply_id','var_code','created_at'],ascending=False)\
    .drop_duplicates(['apply_id','var_code'])

    # 将数据进行列行转换
    data_pivot_all = data.pivot(index='apply_id',columns='var_code',values='var_val')

    # 变量出现空值的apply统计
    data_input_vars_missing = data_pivot_all[data_pivot_all[input_vars].isnull().sum(axis=1)>0]
    data_scored_applys = data_pivot_all[data_pivot_all[input_vars].isnull().sum(axis=1)==0]

    # 进模型前的原始数据
    data_scored_applys_origin = data_scored_applys[input_vars]
    # 将数据转化为字典中的格式
    data_scored_applys_origin = mu.convert_right_data_type(data_scored_applys_origin, var_dict)[0]

    try:
        # 将需要做dummy的变量进行dummy转化
        data_dummys = pd.get_dummies(data_scored_applys_origin[list(var_transform_method['dummy_vars'].keys())]\
                                 ,columns=list(var_transform_method['dummy_vars'].keys()), drop_first=False\
                                 , prefix_sep='DUMMY')
        # 进入模型的数据
        data_in_model = pd.merge(data_scored_applys_origin,data_dummys,left_index=True,right_index=True)
    except:
        data_in_model = data_scored_applys_origin
    data_in_model.rename(columns=var_transform_method['dummy_var_name_map'],inplace=True)

    # 转换成xgb格式
    data_in_model_DMatrix = xgb.DMatrix(data_in_model[model_booster.__dict__['feature_names']])

    data_scored_applys['offLine_prob'] = model_booster.predict(data_in_model_DMatrix)
    data_scored_applys['offLineScore'] = data_scored_applys['offLine_prob']\
    .apply(lambda x : mt.Performance().p_to_score(x))

    # 打分错误的applyid
    data_scored_applys_wrong = data_scored_applys[data_scored_applys[[onLineScoreName,'offLineScore']]\
    .apply(lambda x : float(x[onLineScoreName]) != float(x['offLineScore']),axis=1)]

    data_scored_applys_right = data_scored_applys[data_scored_applys[[onLineScoreName,'offLineScore']]\
    .apply(lambda x : float(x[onLineScoreName]) == float(x['offLineScore']),axis=1)]

    # PSI统计
    point_bounds_XGBRandom = mt.BinWoe().obtain_boundaries(model_decile['分箱'])['cut_boundaries']
    model_decile_curr = pd.cut(data_scored_applys_right[onLineScoreName].astype(float)\
       ,point_bounds_XGBRandom).value_counts().sort_index().to_frame('curr_num').reset_index()

    model_decile_final = pd.concat([model_decile,model_decile_curr],axis=1)
    model_decile_final['建模分布'] = model_decile_final['样本数']/model_decile_final['样本数'].sum()
    model_decile_final['curr_dist'] = model_decile_final['curr_num']/model_decile_final['curr_num'].sum()
    model_decile_final['psi'] = model_decile_final.apply(lambda x : (x['建模分布'] - x['curr_dist'])\
                           *log(x['建模分布']/x['curr_dist'])\
                           if x['curr_dist']!=0 and x['建模分布']!=0 else 0\
                           ,axis=1)
    model_decile_summary = model_decile_final[['分箱','样本数','建模分布','curr_num','curr_dist','psi']]
    return data_input_vars_missing,data_scored_applys_wrong,model_decile_summary
Пример #10
0
def verify_var_and_score_validity(data_origin,score_card_deploy,rebin_spec_bin_adjusted_deploy,model_decile_deploy):
    '''
    该函数为上线以后统计是否存在缺失值未填充、打分错误以及PSI相关函数

    Args:

    data_origin(DataFrame):其中表头必须含有apply_id,var_code,create_at字段,
    大家严格按照部署监控时刻的要求撰写PSI监控时刻的SQL取数则不会出错,如果唯一标识不是apply_id,务必将唯一标识字段名修改为apply_id
    score_card_deploy(DataFrame):切记这是部署时刻的评分卡
    rebin_spec_bin_adjusted_deploy(DataFrame):部署时刻的变量分箱.json文件
    model_decile_deploy(DataFrame):部署时刻的model decile就在部署的评分卡中

    Returns:
    wrong_data(DataFrame):未填充缺失值的所有apply
    wrong_score_vars(dict):变量以及其对应的打分错误明细
    wrong_model_score(DataFrame):模型分打分错误明细
    var_dist_data_with_score_card(DataFrame):变量PSI
    score_dist_with_decile(DataFrame):模型分PSI
    '''

    # 将部署的评分卡切换成和建模时对应的字段名
    score_card_deploy.rename(columns={'打分':'变量打分','中间层指标名称':'指标英文'},inplace=True)
    # 选出模型中用到的字段名和其对应的
    model_selected = [i for i in set(list(score_card_deploy['指标英文'])) if str(i)!='nan']
    mlr_selected_vars =  [i for i in set(list(score_card_deploy['输出打分指标名称'])) if str(i)!='nan']

    # 生成数据字典
    var_dict = score_card_deploy.drop_duplicates('指标英文')
    var_dict = var_dict[var_dict['指标英文'].isin(model_selected)]

    # 生成打分时候建模的评分卡
    score_card_modeling = score_card_deploy[score_card_deploy['指标英文'].isin(model_selected)]
    score_card_modeling.ix[len(score_card_modeling)+1,'指标英文'] = 'intercept'
    const_var = [i for i in set(list(score_card_deploy['输出打分指标名称'])) if 'const' in i][0]
    const_value = float(score_card_deploy.ix[score_card_deploy['输出打分指标名称']==const_var,'变量打分'])
    score_card_modeling.ix[score_card_modeling['指标英文']=='intercept','变量打分'] = const_value
    # 生成建模时刻的rebin_spec
    rebin_spec_bin_adjusted_modeling = rebin_spec_bin_adjusted_deploy['numerical']
    rebin_spec_bin_adjusted_modeling.update(rebin_spec_bin_adjusted_deploy['categorical'])

    # 生成变量名称与打分名称对应关系
    modelVar_scoreVar_dict = {}
    for j,i in score_card_deploy[['指标英文','输出打分指标名称']].drop_duplicates(['指标英文']).iterrows():
        if str(i['指标英文'])!='nan':
            modelVar_scoreVar_dict[i['指标英文']] = i['输出打分指标名称']

    # 转换原始数据格式
    try:
        data_origin = data_origin.sort_values(['apply_id','var_code','created_at'],ascending=False)\
        .drop_duplicates(['apply_id','var_code','created_at'])
    except:
        try:
            data_origin = data_origin.sort_values(['apply_id','var_code'],ascending=False)\
            .drop_duplicates(['apply_id','var_code'])
        except:
            pass

    data_origin_pivot = data_origin.pivot(index='apply_id',columns='var_code',values='var_val')

    # 找出变量缺失的用户
    wrong_data = data_origin_pivot[data_origin_pivot.isnull().sum(axis=1)>0]
    wrong_data.reset_index(inplace=True)
    # 找出不缺失打分的用户
    right_data = data_origin_pivot[data_origin_pivot.isnull().sum(axis=1)==0]

    right_data_converted = convert_right_data_type(right_data[model_selected],var_dict)[0]

    # 将数据转换为bin
    bin_obj = mt.BinWoe()
    data_origin_pivot_cat = bin_obj.convert_to_category(right_data_converted[model_selected], var_dict\
                                               , rebin_spec_bin_adjusted_modeling)
    data_origin_pivot_score_all = mt.Performance().calculate_score_by_scrd(data_origin_pivot_cat[model_selected]\
                                                               ,score_card_modeling)
    data_origin_pivot_score_vars = data_origin_pivot_score_all[0]
    data_origin_pivot_score_model = data_origin_pivot_score_all[1]


    data_origin_pivot_cat.columns = [i+'_cat' for i in data_origin_pivot_cat.columns]
    data_origin_pivot_score_vars.columns = [i+'_varScoreBymodel' for i in data_origin_pivot_score_vars.columns]

    data_origin_pivot_all = pd.merge(pd.merge(data_origin_pivot,data_origin_pivot_cat,left_index=True,right_index=True)\
             ,data_origin_pivot_score_vars,left_index=True,right_index=True)

    # 将打分错误的变量对应的申请找出来
    wrong_score_vars = {}
    for i in model_selected:
        wrong_dt =data_origin_pivot_all[data_origin_pivot_all\
                                        .apply(lambda x : float(x[modelVar_scoreVar_dict[i]])!=float(x[i+'_varScoreBymodel']),axis=1)]\
                                        [[i,i+'_cat',i+'_varScoreBymodel',modelVar_scoreVar_dict[i]]]

        wrong_score_vars[i] = wrong_dt.reset_index()


    # 整合线上和线下打分
    mlr_score_vars = [i for i in mlr_selected_vars if 'creditscore' in i][0]
    model_on_off_score = pd.merge(data_origin_pivot_score_model.to_frame('back_score'),data_origin_pivot_all[mlr_score_vars]\
         .to_frame('mlr_score'),left_index=True,right_index=True)

    wrong_model_score = model_on_off_score[model_on_off_score\
                                           .apply(lambda x : float(x['back_score']) != float(x['mlr_score']),axis=1)]
    wrong_model_score.reset_index(inplace=True)

    # 求变量的PSI
    var_dist = []
    for i in model_selected:
        tmp = (data_origin_pivot_cat[i+'_cat'].value_counts())\
        .to_frame('allNum').reset_index()
        tmp['dist'] = tmp['allNum']/len(data_origin_pivot_cat)

        tmp['指标英文'] = i
        tmp.columns = ['分箱','allNum','dist','指标英文']
        var_dist.append(tmp)
    var_dist_data = pd.concat(var_dist)[['指标英文','分箱','allNum','dist']]
    var_dist_data['分箱'] = var_dist_data['分箱'].astype(str)
    var_dist_data_with_score_card = pd.merge(score_card_modeling,var_dist_data,on = ['指标英文','分箱'])
    var_dist_data_with_score_card['PSI'] = var_dist_data_with_score_card.apply(lambda x : (x['dist'] - x['分布占比'])\
                           *log(x['dist']/x['分布占比'])\
                           if x['dist']!=0 and x['分布占比']!=0 else 0\
                           ,axis=1)
    var_dist_data_with_score_card = var_dist_data_with_score_card\
    [['指标分类','指标英文','变量中文','数据类型','分箱','N','分布占比','allNum','dist','PSI']]
    # 求打分的psi
    # 拿到打分的cut_bounds
    point_bounds = mt.BinWoe().obtain_boundaries(model_decile_deploy[u'分箱'])['cut_boundaries']

    mlr_score_dist = (pd.cut(data_origin_pivot_all[mlr_score_vars].astype(float),point_bounds)\
    .value_counts().to_frame('allNum')).reset_index()

    mlr_score_dist['dist'] = mlr_score_dist['allNum']/len(data_origin_pivot_all)
    mlr_score_dist['index'] = mlr_score_dist['index'].astype(str)
    model_decile_deploy['样本分布'] = model_decile_deploy['样本数']/sum(model_decile_deploy['样本数'])
    score_dist_with_decile = pd.merge(model_decile_deploy,mlr_score_dist,left_on='分箱',right_on='index')\
    [['分箱','样本数','样本分布','allNum','dist']]

    score_dist_with_decile['PSI'] = score_dist_with_decile.apply(lambda x : (x['dist'] - x['样本分布'])\
                           *log(x['dist']/x['样本分布'])\
                           if x['dist']!=0 and x['样本分布']!=0 else 0\
                           ,axis=1)

    return wrong_data,wrong_score_vars,wrong_model_score,var_dist_data_with_score_card,score_dist_with_decile