Example #1
def init():
    """
    初始化binning record
    :return: dict
    格式:
        {variable_name1
                -var_table
                    -var_params(province,goods,bads...)
                -is_selected
                -iv
         variable_name2...}
    """
    name = request.form.get("modelName")
    branch = request.form.get("branch")

    # Fetch the parameters of the uploaded file from the in-memory cache, keyed by model and branch
    df_map = global_value.get_value(name + "_" + branch)

    result = tool_model_service.load_model(model_name=name, model_branch=branch)
    # selected_list is stored as JSON in the database; in Python it is a dict of the form {selected_variable: index (the variable's position)}
    selected_list_json = json.loads(result[0].selected_list)
    selected_list = selected_list_json.keys()

    min_val = 0
    df = df_map['df_train']
    init_result = get_init(df, target=result[0].model_target, valid=selected_list)

    # Derive the variable intervals from init_result
    out = get_boundary(init_result, min_val)
    # Sort by IV
    out_sorted_iv = sort_iv(out)
    return rest.responseto(data=out_sorted_iv)
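
The sort_iv helper used above is defined elsewhere. A minimal sketch, assuming `out` is a dict of per-variable records shaped as the docstring describes (each carrying an "iv" entry); the real helper may differ:

# Hypothetical sketch of sort_iv: order the per-variable records by descending IV.
from collections import OrderedDict


def sort_iv(out):
    return OrderedDict(
        sorted(out.items(), key=lambda item: item[1]["iv"], reverse=True)
    )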
Example #2
def divide_manually():
    boundary = request.form.get("boundary")
    variable_name = request.form.get("variable_name")
    branch = request.form.get("branch")
    model_name = request.form.get("model_name")
    type = request.form.get("type")

    df_map = global_value.get_value(model_name + "_" + branch)
    df_train = df_map['df_train']

    boundary_list = []
    if type == "true":
        for s in boundary.split(","):
            temp = []
            temp.extend(map(cmm.transfer, s.split("|")))
            boundary_list.append(temp)
        columns = ['bin_num', variable_name, 'bads', 'goods', 'total', 'total_perc', 'bad_rate', 'woe',
                   'type']

    else:
        for s in boundary.split(","):
            boundary_list.append(float(s))
        columns = ['bin_num', 'min', 'max', 'min_boundary', 'max_boundary', 'bads', 'goods', 'total', 'total_perc',
                   'bad_rate', 'woe',
                   'type']

    target = tool_model_service.load_model(model_name=model_name, model_branch=branch)[0]["model_target"]
    result = bf.adjust_bin(df_train, var_type == "true", variable_name, boundary_list,
                           target=target, expected_column={variable_name})

    iv = result['IV'].sum()
    df = pd.DataFrame(result,
                      columns=columns)
    data = generate_response(variable_name, df, iv)
    return rest.responseto(data=data)
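
cmm.transfer, used when rebuilding categorical boundaries above, lives in the cmm module. A minimal sketch, assuming it only normalizes the literal token "nan" to np.nan and leaves other tokens untouched; the real helper may also cast numeric strings:

# Hypothetical sketch of cmm.transfer as assumed here.
import numpy as np


def transfer(token):
    return np.nan if token == "nan" else token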
Example #3
def variable_verify():
    """
    变量相关性校验
    """
    model_name = request.form.get("modelName")
    branch = request.form.get("branch")
    corr_cap = request.form.get("corrCap")
    if corr_cap is not None:
        corr_cap = float(corr_cap)
    else:
        corr_cap = 1
    variables = request.form.get("variables")

    variable_list = variables.split(",")
    variable_list = [x + "_woe" for x in variable_list]
    df_map = global_value.get_value(model_name + "_" + branch)

    df_train_woe = df_map["df_train_woe"]

    df = pd.DataFrame(df_train_woe, columns=variable_list)

    if not common.is_valid_correlation(df, corr_cap):
        result = common.get_correlation(df)
        return rest.responseto(result.to_dict())
    else:
        return rest.responseto(None)
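
common.is_valid_correlation and common.get_correlation are external helpers. A minimal sketch, under the assumption that the check simply caps the pairwise absolute correlation of the WOE columns; the real module may use a different rule:

# Hypothetical sketch of the correlation helpers assumed above.
import numpy as np


def get_correlation(df):
    """Pairwise Pearson correlation matrix of the WOE columns."""
    return df.corr()


def is_valid_correlation(df, corr_cap):
    """True if no off-diagonal absolute correlation exceeds corr_cap."""
    corr = df.corr().abs().values
    np.fill_diagonal(corr, 0.0)
    return corr.max() <= corr_cap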
Example #4
def parse():
    # Convert and analyze the train file
    model_name = request.form.get("modelName")
    branch = request.form.get("branch")
    # Relative path of the file, as specified by the user
    file_path = request.form.get("filePath")
    target = request.form.get("target")
    root_path = app.config["ROOT_PATH"]

    path = root_path + "/" + file_path
    # Use the model name plus the branch name as the unique key
    key = model_name + "_" + branch
    # df_train = None
    # The flow only continues if the path actually exists
    if os.path.exists(path):
        # Check whether the data has already been loaded
        if not global_value.has_key(key):
            # Load the resource
            df_all = pd.read_excel(path)
            df_train = df_all[df_all['dev_ind'] == 1]
            df_test = df_all[df_all['dev_ind'] == 0]
            df_map = {
                model_name + "_" + branch: {
                    "df_all": df_all,
                    "df_train": df_train,
                    "df_test": df_test
                }
            }
            global_value.set_value(**df_map)
        else:
            df_map = global_value.get_value(key)
            df_train = df_map['df_train']
        df = ba.get_df_summary(df_train)
        # Convert df_train into data for front-end display
        data_map = cmm.df_for_html(df)

        result = tool_model_service.load_model(model_name=model_name,
                                               model_branch=branch)
        branches = []
        v = result[0]
        for n in result:
            branches.append(n.model_branch)

        # data_map["current_model"] = model_name
        data_map["branches"] = branches
        data_map["selected_list"] = v.selected_list
        data_map["target"] = v.model_target

        return rest.responseto(data=data_map)
    else:
        return rest.responseto(message="file not exist", success=False)
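
The global_value module that backs the in-memory cache (get_value / set_value / has_key) is not shown. A minimal sketch, assuming it is simply a module-level dict keyed by "modelName_branch"; the real implementation may add locking or expiry:

# global_value.py -- hypothetical sketch of the in-memory cache used by these handlers.
_global_dict = {}


def set_value(**kwargs):
    # Store each keyword argument under its key, e.g. {"model_branch": {"df_train": ...}}
    _global_dict.update(kwargs)


def get_value(key, default=None):
    # Return the cached entry for the key, or default if nothing was loaded yet
    return _global_dict.get(key, default)


def has_key(key):
    # True if the key has already been cached
    return key in _global_dict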
Example #5
def load_applyed():
    """读取apply后的文件"""
    # 在跨域的情况下,前端会发送OPTIONS请求进行试探,然后再发送POST请求
    if request.method == 'POST':
        # 获取training文件上传的路径
        model_name = request.form.get("model_name")
        branch = request.form.get("branch")
        files = request.files.getlist("file[]")
        for file in files:
            df_map = global_value.get_value(model_name + "_" + branch)
            df = pd.read_excel(file, encoding="utf-8")
            df_map["df_train_woe"] = df[df['dev_ind'] == 1]
            df_map["df_test_woe"] = df[df['dev_ind'] == 0]
    return rest.responseto(data="success")
Example #6
def apply():
    """将train数据得到的woe与test数据进行匹配"""
    req = request.form.get('data')
    var_dict = json.loads(req)
    model_name = var_dict["modelName"]
    branch = var_dict["branch"]
    df_map = global_value.get_value(model_name + "_" + branch)

    data = var_dict["data"]

    # df = df_test.append(df_train)
    df = df_map['df_all'].copy()
    var_list = data.keys()

    for var_name in var_list:
        df[var_name + '_woe'] = df[var_name].apply(lambda var_value: apply_get_woe_value(var_name, var_value, data))

    global withIntercept
    withIntercept = True

    # if withIntercept:
    #    df['intercept_woe'] = 1.0
    df_map["df_train_woe"] = df[df['dev_ind'] == 1]
    df_map["df_test_woe"] = df[df['dev_ind'] == 0]

    global apply_result, safely_apply

    apply_result = df
    safely_apply = True
    output = BytesIO()
    writer = pd.ExcelWriter(output, engine='xlsxwriter')
    df.to_excel(writer, startrow=0, merge_cells=False, sheet_name="Sheet_1")
    workbook = writer.book
    worksheet = writer.sheets["Sheet_1"]
    cell_format = workbook.add_format()
    cell_format.set_bg_color('#eeeeee')
    worksheet.set_column(0, 9, 28, cell_format)
    writer.close()
    output.seek(0)

    file = send_file(output, as_attachment=True, attachment_filename='df_iv.xlsx')
    response = rest.make_response(file)

    return rest.responsePandas(response)
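
apply_get_woe_value maps each raw value to the WOE of its bin. A minimal sketch, assuming data[var_name] is a list of bins with hypothetical "min_boundary", "max_boundary" and "woe" fields for numerical variables; the real helper and bin layout may differ, especially for categorical variables:

# Hypothetical sketch of apply_get_woe_value for numerical bins only.
def apply_get_woe_value(var_name, var_value, data):
    for bin_row in data[var_name]:
        low = float(bin_row["min_boundary"])
        high = float(bin_row["max_boundary"])
        if low <= float(var_value) < high:
            return bin_row["woe"]
    return None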
Example #7
def variable_select_manual():
    """
    手动选择变量
    """
    all_list = request.form.get("all_list")
    selected_list = request.form.get("selected_list")
    target = request.form.get("target")
    with_intercept = request.form.get("with_intercept") == 'true'
    model_name = request.form.get("modelName")
    branch = request.form.get("branch")
    ks_group_num = request.form.get("ks_group_num")

    df_map = global_value.get_value(model_name + "_" + branch)
    df_train_woe = df_map["df_train_woe"]
    df_test_woe = df_map["df_test_woe"]

    ks_group_num = int(ks_group_num) if ks_group_num else 20

    data = lmf.get_logit_backward_manually(df_train_woe, df_test_woe,
                                           all_list.split(","),
                                           selected_list.split(","), target,
                                           ks_group_num, with_intercept)
    return rest.responseto(data=data)
Example #8
def variable_select():
    """
    apply完成后,第一次进入时的变量选择
    """
    model_name = request.form.get("modelName")
    branch = request.form.get("branch")
    var_list = request.form.get("var_list")

    df_map = global_value.get_value(model_name + "_" + branch)

    # If var_list is empty when the endpoint is called, read it from the database instead
    if var_list is None or var_list == '':
        result = tool_model_service.get_selected_variable(model_name,
                                                          branch)[0]
        var_list = result["selected_variable"].decode('utf-8')
    else:
        # Delete the old record, then insert the new one
        if tool_model_service.del_selected_variable(model_name, branch):
            tool_model_service.save_selected_variable(model_name, branch,
                                                      var_list)
        else:
            return rest.responseto(message="failed to save selected variable",
                                   success=False)
    target = request.form.get("target")
    withIntercept = request.form.get("with_intercept") == 'true'
    ks_group_num = request.form.get("ks_group_num")
    ks_group_num = int(ks_group_num) if ks_group_num else 20

    df_train_woe = df_map["df_train_woe"]
    df_test_woe = df_map["df_test_woe"]

    data = lmf.get_logit_backward(df_train_woe,
                                  df_test_woe, target, ks_group_num,
                                  var_list.split(","), withIntercept)
    if data is None:
        return rest.responseto(success=False)
    return rest.responseto(data=data)
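
rest.responseto wraps every payload returned by these handlers in a uniform envelope. A minimal sketch, assuming a plain JSON envelope with success / message / data fields; the real rest module may differ:

# Hypothetical sketch of rest.responseto.
from flask import jsonify


def responseto(data=None, message=None, success=True):
    return jsonify({"success": success, "message": message, "data": data})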
Example #9
def ppp():
    return global_value.get_value("")
Example #10
def divide():
    """
    分裂操作
    先将从data中得到的范围,从excel中筛选相应的数据
    筛选完成后,调用init方法对数据进行初始化,得到一定数据的范围区间
    将该范围区间与原来的区间合并.
    调用adjust方法获得的结果即为分裂后的结果
    :return:
        {variable_name1
                -var_table
                    -var_params{province,goods,bads...}
                -iv
        }
    """
    model_name = request.form.get("modelName")
    branch = request.form.get('branch')
    df_map = global_value.get_value(model_name + "_" + branch)
    df_train = df_map['df_train']
    min_val = 0
    data = request.form.get('data')
    # Parse the JSON payload
    data_map = json.loads(data, object_pairs_hook=OrderedDict)
    name = data_map["name"]
    target = request.form.get("target")
    # Keep only the target and name columns of the training dataframe
    df = pd.DataFrame(df_train, columns=[target, name])

    bound_list = None
    if data_map["selected"]["type"] == 'Numerical':
        # Filter rows by the min/max boundaries
        min_bound = data_map["selected"]["min_boundary"]
        max_bound = data_map["selected"]["max_boundary"]
        df = df[(df[name].astype(float) >= float(min_bound)) & (df[name].astype(float) < float(max_bound))]
        out = get_init(df, target=target, invalid=[], fineMinLeafRate=0)
        bound_list = get_divide_min_bound(out)

        table = data_map["table"]
        # Remove the entry being split
        del table[data_map["selectedIndex"]]

        for v in table:
            bound_list.append(float(v["min_boundary"]))
        # bound_list.append(np.nan)

        result = bf.adjust_bin(df_train, data_map["selected"]["type"] == 'Categorical', name, bound_list,
                               target=target, expected_column={name})
        columns = ['bin_num', 'min', 'max', 'min_boundary', 'max_boundary', 'bads', 'goods', 'total', 'total_perc',
                   'bad_rate', 'woe',
                   'type']
        iv = result['IV'].sum()
        df = pd.DataFrame(result,
                          columns=columns)
        data = generate_response(name, df, iv)
        # data = get_merged(name, df, min_val)

        return rest.responseto(data=data)
    else:
        val = data_map["selected"][name].split("|")
        df[name] = df[name].apply(lambda x: simple_util.float_nan_to_str_nan(x))

        df = df[df[name].isin(val)]

        list = data_map["table"]
        # 删除要被分裂的项
        del list[data_map["selectedIndex"]]

        out = get_init(df, target=target, invalid=[], fineMinLeafRate=0)
        bound_list = get_divide_caterotical_bound(out, name)
        # Index of the entry being split
        index = data_map["selectedIndex"]
        # Merge the remaining original bins into the split result
        for v in table:
            bound_list.append(list(map(cmm.transfer, v[name].split("|"))))
        result = bf.adjust_bin(df_train, data_map["selected"]["type"] == 'Categorical', name, bound_list,
                               target=target, expected_column={name})
        iv = result['IV'].sum()
        columns = ['bin_num', name, 'bads', 'goods', 'total', 'total_perc', 'bad_rate', 'woe',
                   'type']
        df = pd.DataFrame(result,
                          columns=columns)

        data = generate_response(name, df, iv)
        # data = get_merged(name, df, min_val)
        return rest.responseto(data=data)
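
simple_util.float_nan_to_str_nan, used in the categorical branch above, normalizes missing values before the isin comparison. A minimal sketch, assuming Excel-loaded categorical columns carry float NaN for missing cells:

# Hypothetical sketch of simple_util.float_nan_to_str_nan.
import math


def float_nan_to_str_nan(x):
    # Missing categorical cells come in as float NaN; map them to the string "nan"
    # so they can be compared against the "|"-joined category tokens.
    if isinstance(x, float) and math.isnan(x):
        return "nan"
    return x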
Example #11
def merge():
    """归并操作"""
    model_name = request.form.get("modelName")
    branch = request.form.get("branch")
    # Variable to be merged
    var_name = request.form.get('varName')
    # Type of the variable
    var_type = request.form.get('type')
    # Selected range
    boundary = request.form.get('boundary')  # the max of each bin, separated by "&"
    # Full range
    all_boundary = request.form.get('allBoundary')  # the max of each bin, separated by "&"
    # Get the target
    # target = request.form.get('allBoundary').encode('utf-8');
    target = request.form.get('target')
    if target is None:
        target = 'bad_4w'
    expected_column = {var_name}

    min_val = 0

    df_map = global_value.get_value(model_name + "_" + branch)

    result = None
    type_bool = False
    df = None
    if var_type == 'Numerical':
        # Convert the string into a list
        boundary_list = map(eval, boundary.split("&"))
        all_boundary_list = []
        # Convert the string into a list, replacing 'nan' with np.nan
        for a in all_boundary.split("&"):
            if a != 'nan':
                a = float(a)
            else:
                a = np.nan
            all_boundary_list.append(a)
        boundary_list = list(set(all_boundary_list).difference(set(boundary_list)))
        # boundary_list.append(np.nan)
        selected_list = boundary_list

        columns = ['bin_num', 'min', 'max', 'min_boundary', 'max_boundary', 'bads', 'goods', 'total', 'total_perc',
                   'bad_rate', 'woe',
                   'type']
    else:
        type_bool = True
        temp = []
        for s in boundary.split("&"):
            temp.extend(map(cmm.transfer, s.split("|")))

        selected_list = [temp]
        if all_boundary != '':
            for s in all_boundary.split("&"):
                selected_list.append(list(map(cmm.transfer, s.split("|"))))

        columns = ['bin_num', var_name, 'bads', 'goods', 'total', 'total_perc', 'bad_rate', 'woe',
                   'type']

    result = bf.adjust_bin(df_map["df_train"], type_bool, var_name, selected_list, target=target,
                           expected_column=expected_column)  # obtain the merged result
    iv = result['IV'].sum()

    df = pd.DataFrame(result,
                      columns=columns)

    data = generate_response(var_name, df, iv)
    # data = get_merged(var_name, df, min_val)
    return rest.responseto(data=data)
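
generate_response, shared by the binning endpoints above, packages a binning table and its total IV for the front end. A minimal sketch, assuming the {var_table, iv} layout documented in init(); the real helper may add more fields such as is_selected:

# Hypothetical sketch of generate_response.
def generate_response(variable_name, df, iv):
    return {
        variable_name: {
            "var_table": df.to_dict(orient="records"),
            "iv": iv,
        }
    }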