def run_analyse(conf, input_conn, output_conn, tables, fk_filter=None):
    assert isinstance(conf, Config)
    assert isinstance(tables, list)
    if len(tables) > 0:
        assert isinstance(tables[0], tuple) and len(tables[0]) == 2
    else:
        return

    if fk_filter:
        assert isinstance(fk_filter, list)
    else:
        fk_filter = []

    input_helper, output_helper = dynamic_import(conf)
    ana_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    # Clear leftover temporary calculation tables
    tmp_helper.clean_all_tmp_tables(input_conn, conf.tmp_db)

    alias_generator = AliasNameGen(output_conn, conf)  # alias name generator
    # Generate the temporary tables
    node_table_relations, res_fk = generate_tmp_tables(conf, input_conn, output_conn, tables,
                                                       alias_generator, fk_filter)
    output_helper.save_same_cluster_detail(output_conn, res_fk, ana_time, conf.output_schema)

    # Work out the relations between the node tables
    res_relations = do_calc_node_table_relations(conf, input_conn, output_conn, node_table_relations,
                                                 alias_generator, ana_time)
Example #2
def analyse_table_pk_by_sql(conf,
                            input_conn,
                            output_conn,
                            sys_code,
                            ori_table_code,
                            etl_date,
                            date_offset,
                            alg,
                            start_date_str=None):
    """
    Fetch candidate keys for a table directly from the database via SQL.
    :param conf:
    :param input_conn:
    :param output_conn:
    :param sys_code:
    :param ori_table_code:
    :param etl_date:
    :param date_offset:
    :param alg:
    :param start_date_str:
    :return:
    """
    assert isinstance(conf, Config)
    if start_date_str is None:
        start_date_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    input_helper, output_helper = dynamic_import(conf)
    etl_dates = date_trans(etl_date, date_offset)

    table_columns = output_helper.get_table_all_columns(
        output_conn, conf.output_schema, sys_code, ori_table_code)
    feature_dict = output_helper.get_table_feature(output_conn, sys_code,
                                                   sys_code, ori_table_code,
                                                   conf.output_schema)
    candidates, right_count = output_helper.get_candidates(
        output_conn, ori_table_code, conf.output_schema)
    if right_count != len(table_columns):
        logging.warning("No valid candidate key found for table {}!".format(ori_table_code))
        logging.debug("Invalid candidate keys: {}".format(str(candidates)))
        # Update the progress table: no valid candidate key could be found
        res_code = output_helper.\
            update_unfound_candidate_sche(output_conn, conf.output_schema, ori_table_code, start_date_str)
        if res_code == -1:
            logging.error("Table {} has no candidate key; updating the progress table failed".format(ori_table_code))
        return

    checked_single_pk, checked_joint_pk = check_candidate(
        conf, input_conn, output_conn, sys_code, ori_table_code, candidates,
        feature_dict, alg, etl_dates)

    # Validation passed; save the results into the corresponding tables
    save_pk_result(conf, output_conn, output_helper, sys_code, ori_table_code,
                   checked_single_pk, checked_joint_pk, start_date_str)
def run_cluster(conf, output_conn):
    assert isinstance(conf, Config)

    input_helper, output_helper = dynamic_import(conf)
    ana_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Clear previous clustering results
    output_helper.clean_same_cluster_result(output_conn, conf.output_schema)

    # Query all "same" pairs from the detail table
    all_same = output_helper.get_same_pair_in_detail(output_conn, conf.output_schema)
    filtered_relations = {frozenset([t[0], t[1]]) for t in all_same}

    # Merge the pairs into clusters and persist them
    return merge_same_equals_and_save(conf, output_conn, filtered_relations, ana_time)
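# Minimal illustration (not part of the pipeline) of the frozenset de-duplication
# used in run_cluster above: symmetric pairs (a, b) and (b, a) collapse into one
# unordered entry, so each "same" relation is clustered only once.
pairs = [("t1.col_a", "t2.col_b"), ("t2.col_b", "t1.col_a"), ("t3.c", "t4.d")]
assert len({frozenset(p) for p in pairs}) == 2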
Example #4
def analyse_table_fds_by_pandas(conf, sys_code, table_name, alg, etl_dates,
                                start_date_str, fd_sample_size):
    logging.info("{}表使用pandas分析部分函数依赖关系".format(table_name))
    import time
    st_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    assert isinstance(conf, Config)
    input_conn, output_conn = get_input_output_conn(conf)
    input_helper, output_helper = dynamic_import(conf)

    # 1. Sample the data
    if alg == '':
        alg = output_helper.get_tab_alg_single(output_conn, conf.output_schema,
                                               sys_code, table_name)
    if alg == "F5":
        data, size, col_num = input_helper.get_cols_sample(
            input_conn, table_name, fd_sample_size, etl_dates[-1])
    elif alg == "I":
        data, size, col_num = input_helper.get_cols_sample(
            input_conn, table_name, fd_sample_size, etl_dates)
    elif alg == "IU":
        trans_table_name = get_trans_table_name(output_conn,
                                                conf.output_schema, table_name)
        data, size, col_num = input_helper.get_cols_sample(
            input_conn, trans_table_name, fd_sample_size, etl_dates[-1])
    else:
        logging.warning("{}表使用了未知算法{}".format(table_name, alg))
        close_odbc_connection(input_conn)
        close_db2_connection(output_conn)
        return '004'

    if size < conf.min_records:
        logging.warning("{}表数据过少!".format(table_name))
        fds = []
        output_helper.save_table_fd(output_conn, sys_code, table_name, fds,
                                    conf.output_schema, start_date_str, '2')
        close_odbc_connection(input_conn)
        close_db2_connection(output_conn)
        return "001"
    df = pd.DataFrame(data)
    fds = analyse_table_mini_fds(df)
    ed_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    logging.info('Functional dependency analysis of table {} started at: {}'.format(table_name, st_time))
    logging.info("Functional dependency analysis of table {} finished normally at: {}".format(table_name, ed_time))
    output_helper.save_table_fd(output_conn, sys_code, table_name, fds,
                                conf.output_schema, start_date_str, '5')
    close_odbc_connection(input_conn)
    close_db2_connection(output_conn)
    return "000"
def _init_mapping(self, conn, config):
    input_helper, output_helper = dynamic_import(config)
    stat_dict = output_helper.get_config_info(conn, config.output_schema)
    all_table = list(stat_dict.keys())
    mapping_tup, max_id = output_helper.get_alias_mapping(conn, config.output_schema)
    self.origin_to_alias = {origin: alias for alias, sys_code, origin in mapping_tup}
    self.alias_to_origin = {alias: (sys_code, origin) for alias, sys_code, origin in mapping_tup}
    # Tables that do not have an alias yet
    not_in = set(all_table) - set([(sys_code, origin) for alias, sys_code, origin in mapping_tup])
    inserts = []
    for sys_code, origin_name in not_in:
        max_id += 1
        # Aliases are 't' followed by the zero-padded numeric id, e.g. 't000042' for prefix_len 6
        alias = 't' + str(max_id).rjust(self.prefix_len, '0')
        inserts.append((int(max_id), str(sys_code), str(origin_name), str(alias)))
        self.origin_to_alias[origin_name] = alias
        self.alias_to_origin[alias] = (sys_code, origin_name)
    output_helper.save_alias_mapping(conn, inserts, config.output_schema)
Example #6
def run_dim_cluster_main(syscode):
    conf = Config()
    logging.info("{}系统分析开始".format(syscode))

    input_helper, output_helper = dynamic_import(conf)
    input_conn, output_conn = get_input_output_conn(conf)

    tables_schedule = output_helper.get_all_fk_tables(output_conn, conf.output_schema)
    filter_fks = output_helper.get_all_fk_id_in_detail(output_conn, conf.output_schema)

    tables = [tup for tup in tables_schedule if tup[0] == syscode]
    logging.info("分析表数量:{}".format(len(tables)))
    run_analyse(conf, input_conn, output_conn, tables, filter_fks)
    logging.info("{}系统分析结束".format(syscode))
    close_odbc_connection(input_conn)
    close_db2_connection(output_conn)
def merge_same_equals_and_save(conf, output_conn, res_reciprocal_fd, ana_time):
    input_helper, output_helper = dynamic_import(conf)
    cols_distinct = output_helper.get_columns_distinct(output_conn, conf.output_schema)
    res_cluster = list(res_reciprocal_fd.copy())
    temp_save = []
    # Merge sets that share at least one element until all remaining sets are disjoint
    while res_cluster:
        check_set = res_cluster[0]
        flg = True
        for i in range(1, len(res_cluster)):
            other_set = res_cluster[i]
            if len(check_set.intersection(other_set)) > 0:
                new_set = check_set | other_set
                res_cluster.remove(check_set)
                res_cluster.remove(other_set)
                res_cluster.append(new_set)
                flg = False
                break
        if flg:
            temp_save.append(check_set)
            res_cluster.remove(check_set)
    res_cluster = temp_save

    order_res_cluster = []
    for elem_list in res_cluster:
        order_cluster = []
        for elem in elem_list:
            if elem in cols_distinct:
                order_cluster.append((elem, 1, int(cols_distinct[elem])))
            else:
                order_cluster.append((elem, 1, 0))
        order_res_cluster.append(sorted(order_cluster, key=lambda x: x[2], reverse=True))
    new_res_cluster = []

    for cluster_list in order_res_cluster:
        cluster = []
        for i, elem in enumerate(cluster_list):
            cluster.append((elem[0], elem[1], (i + 1)))
        new_res_cluster.append(cluster)
    logging.debug("分析结果:" + str(new_res_cluster))
    return output_helper.save_same_cluster(output_conn, new_res_cluster, ana_time, conf.output_schema)
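# Hedged sketch: the while-loop in merge_same_equals_and_save computes the connected
# components of the "same" relation by repeatedly merging sets that share an element.
# An equivalent formulation, shown only for clarity:
def merge_overlapping(sets):
    components = []
    for s in sets:
        s = set(s)
        touching = [c for c in components if c & s]   # existing components that overlap s
        for c in touching:
            components.remove(c)
            s |= c                                    # absorb them into s
        components.append(s)
    return components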
Example #8
def analyse_joint_fk(conf,
                     main_table_code,
                     sub_sys_code_list,
                     start_date_str=None):
    """
    处理联合外键分析逻辑
    支持:单系统内联合外键分析(main_table_code:SO1中所有表做循环,针对该循环体做并发, sub_sys_code:S01)
         单系统间联合外键分析(main_table_code:SO1中所有表做循环,针对该循环体做并发, sub_sys_code:S02)
         单系统和其他所有系统联合外键分析,包括自己(main_table_code:SO1中所有表做循环,针对该循环体做并发, sub_sys_code:All)
         所有系统联合外键分析,包括自己(main_table_code:所有表做循环,针对该循环体做并发, sub_sys_code:All)
    :param conf: 配置对象
    :param main_table_code: 主系统编号
    :param sub_sys_code_list: 从系统编号列表
    :param start_date_str: 单表外键分析开始时间
    :return:
    """
    assert isinstance(conf, Config)
    assert isinstance(sub_sys_code_list, list)

    inceptor_conn, output_conn = get_input_output_conn(conf)
    input_helper, output_helper = dynamic_import(conf)

    # 1. Look up the already-analysed joint primary keys for the main system code
    tables_pk = output_helper.get_tables_joint_pk(output_conn,
                                                  conf.output_schema,
                                                  main_table_code)

    # 2. For each joint PK, fetch the features of every PK column and, based on those features,
    #    find matching columns in the secondary systems
    for sys_code, table_name in tables_pk:
        try:
            # Get the joint primary key list (joint_pk)
            for _, joint_pk in tables_pk[(sys_code, table_name)].items():
                # Joint PKs with more than 3 or fewer than 2 columns are not analysed; log and skip
                if len(joint_pk) > 3 or len(joint_pk) <= 1:
                    joint_pk_str = " , ".join(pk for pk in joint_pk)
                    logging.warning(
                        "System {} table {} joint PK columns {}: more than 3 or fewer than 2 columns, "
                        "so they cannot be used for joint FK analysis".format(
                            sys_code, table_name, joint_pk_str))
                    continue
                init_capacity = 0
                # Dictionary of candidate FK columns that need to be checked
                all_check_fk_cols = {}
                double_or_time_flg = False
                # Iterate over every column of the joint primary key
                for col in joint_pk:
                    table_schema = sys_code
                    # Fetch the column features of this joint-PK column
                    pk_feature = output_helper.get_col_info_feature(
                        output_conn, sys_code, table_schema, table_name, col,
                        conf.output_schema)
                    # TODO If any joint-PK column is of type DOUBLE, TIMESTAMP, DATE or TIME,
                    #  the joint PK cannot be used for joint FK analysis
                    if pk_feature["COL_TYPE"].rstrip() in ('DOUBLE', 'TIMESTAMP', 'DATE', 'TIME'):
                        double_or_time_flg = True
                    # Initial capacity for the bloom filter
                    init_capacity = int(pk_feature["COL_RECORDS"])
                    # TODO Find every column in sub_sys_code matching the PK features, excluding nullable columns
                    check_fk_cols = output_helper. \
                        get_check_fk_col(output_conn, pk_feature, conf.output_schema, sub_sys_code_list,
                                         distinct_limit=True, nullable=False)
                    # Holds the candidate FK columns per table: key is (fk_sys_code, fk_table_schema,
                    # fk_table_name), value is a list of (candidate FK column name, PK column name)
                    check_fk_cols_dict = {}
                    # Iterate over the matching columns and read SYS_CODE, TABLE_SCHEMA, TABLE_CODE, COL_CODE
                    for check_dict in check_fk_cols:
                        fk_sys_code = check_dict['SYS_CODE']
                        fk_table_schema = check_dict['TABLE_SCHEMA']
                        fk_table_name = check_dict['TABLE_CODE']
                        fk_col_name = check_dict['COL_CODE']
                        if (fk_sys_code, fk_table_schema,
                                fk_table_name) not in check_fk_cols_dict:
                            check_fk_cols_dict[(fk_sys_code, fk_table_schema,
                                                fk_table_name)] = []
                        # key: (fk_sys_code, fk_table_schema, fk_table_name), value: [(fk_col_name, col)]
                        check_fk_cols_dict[(fk_sys_code, fk_table_schema,
                                            fk_table_name)].append(
                                                (fk_col_name, col))
                    all_check_fk_cols[col] = check_fk_cols_dict
                check_fk_values_list = list(all_check_fk_cols.values())
                # 3. Intersect the sets of matching tables; the tables in the intersection are where the
                #    joint PK may appear as a joint FK (checks_tables)
                checks_tables = set(
                    check_fk_values_list[0].keys()).intersection(
                        set(check_fk_values_list[1].keys()))
                # If the joint PK has more than 2 columns, intersect with the remaining ones as well
                if len(check_fk_values_list) > 2:
                    for i in range(2, len(check_fk_values_list)):
                        checks_tables = set(
                            check_fk_values_list[i].keys()).intersection(
                                checks_tables)
                # If any joint-PK column is of type DOUBLE or TIMESTAMP (etc.), skip joint FK analysis
                if double_or_time_flg:
                    continue
                # The matching columns appear only in different tables with no common table: joint FK analysis impossible
                if not checks_tables:
                    continue
                logging.info("主键:{}表{}字段, 待检查外键所在表个数:{}".format(
                    table_name, joint_pk, len(checks_tables)))

                # 4. Build a bloom filter: pull the joint-PK values from ODS using the PK table name,
                #    columns, data date, date offset and unload algorithm
                capacity = init_capacity + conf.bloom_init_capacity
                # Get the unload algorithm of the table containing the joint PK
                table_alg = output_helper.get_tab_alg_single(
                    output_conn, conf.output_schema, sys_code, table_name)
                # Get the data date and date offset of the table containing the joint PK
                etl_dates = None
                etl_date, date_offset = output_helper. \
                    get_tab_date_offset_single(output_conn, conf.output_schema, sys_code, table_name)
                if etl_date and date_offset:
                    etl_dates = date_trans(etl_date, date_offset)
                else:
                    logging.error(
                        "Table {} has a joint PK, but no unload date and date offset were found; "
                        "joint FK analysis cannot continue".format(table_name))
                    exit(-1)

                cursor = None
                if table_alg == "F5":
                    cursor = input_helper.get_mul_col_cursor(
                        inceptor_conn, table_name, joint_pk, etl_dates[-1])
                elif table_alg == "I":
                    cursor = input_helper.get_mul_col_cursor(
                        inceptor_conn, table_name, joint_pk, etl_dates)
                elif table_alg == "IU":
                    trans_table_code = output_helper.get_trans_table_name(
                        output_conn, conf.output_schema, table_name)
                    cursor = input_helper.get_mul_col_cursor(
                        inceptor_conn, trans_table_code, joint_pk,
                        etl_dates[-1])
                else:
                    logging.error("Table {} uses unsupported unload mode {}; the joint PK values cannot be "
                                  "fetched for joint FK analysis".format(table_name, table_alg))
                    close_db2_connection(output_conn)
                    close_odbc_connection(inceptor_conn)
                    exit(-1)
                # Put the joint PK values into the bloom filter
                bloom = generate_mul_col_bloom(conf, capacity, cursor)

                # 5. Iterate over the candidate FK combinations
                joint_fks = []
                for fk_sys_code, fk_table_schema, fk_table_name in checks_tables:
                    # When analysing all->all or S01->S01, the candidate FK table may be the joint-PK
                    # table itself; skip that case
                    if fk_sys_code == sys_code and fk_table_name == table_name:
                        continue
                    lists = []
                    # Collect the candidate columns to check: put [(fk1, pk1), (fk2, pk2), ...] into lists
                    for col, check_dict in all_check_fk_cols.items():
                        lists.append(check_dict[(fk_sys_code, fk_table_schema,
                                                 fk_table_name)])
                    # Build all combinations of the matching candidate FK columns
                    check_lists = comb_lists(lists)
                    # check_tuple: ((fk1, pk1), (fk2, pk2))
                    for check_tuple in check_lists:
                        # check_cols: [fk1, fk2], ordered to match joint_pk
                        pk_to_fk_dict = {p: f for f, p in check_tuple}
                        check_cols = [pk_to_fk_dict[p] for p in joint_pk]
                        # Skip degenerate combinations such as [fk1, fk1]
                        if len(set(check_cols)) != len(check_cols):
                            continue

                        # Get the unload algorithm of the table containing the candidate joint FK
                        fk_table_alg = output_helper. \
                            get_tab_alg_single(output_conn, conf.output_schema, fk_sys_code, fk_table_name)
                        # Get the data date and date offset of the table containing the joint FK
                        fk_etl_dates = None
                        fk_tb_etl_date, fk_tb_date_offset = output_helper. \
                            get_tab_date_offset_single(output_conn, conf.output_schema, fk_sys_code, fk_table_name)
                        if fk_tb_etl_date and fk_tb_date_offset:
                            fk_etl_dates = date_trans(fk_tb_etl_date,
                                                      fk_tb_date_offset)
                        else:
                            logging.error(
                                "Table {} has a candidate joint FK, but no unload date and date offset were "
                                "found; joint FK analysis cannot continue".format(fk_table_name))
                            close_db2_connection(output_conn)
                            close_odbc_connection(inceptor_conn)
                            exit(-1)

                        # Pull the candidate FK column values from ODS using the FK table name, columns, data date, date offset and unload algorithm
                        fk_cursor = None
                        if fk_table_alg == "F5":
                            fk_cursor = input_helper. \
                                get_mul_col_not_null_cursor(inceptor_conn, fk_table_name, check_cols, fk_etl_dates[-1])
                        elif fk_table_alg == "I":
                            fk_cursor = input_helper. \
                                get_mul_col_not_null_cursor(inceptor_conn, fk_table_name, check_cols, fk_etl_dates)
                        elif fk_table_alg == "IU":
                            fk_trans_table_code = output_helper. \
                                get_trans_table_name(output_conn, conf.output_schema, fk_table_name)
                            fk_cursor = input_helper. \
                                get_mul_col_not_null_cursor(inceptor_conn, fk_trans_table_code, check_cols,
                                                            fk_etl_dates[-1])
                        else:
                            logging.error(
                                "While fetching data for joint FK analysis, candidate joint-FK table {} "
                                "was found to use unsupported unload mode {}".format(fk_table_name, fk_table_alg))
                            close_db2_connection(output_conn)
                            close_odbc_connection(inceptor_conn)
                            exit(-1)
                        # Compare against the bloom filter: the share of candidate FK values found in it
                        p = get_contains_percent_from_cursor(bloom, fk_cursor)
                        # FK ratio threshold: the relation is treated as a FK once this share is reached
                        thr = conf.fk_check_threshold
                        # Below this row count the PK table is considered to hold only little data
                        if len(bloom) < conf.fk_little_data:
                            # FK ratio threshold for small data volumes
                            thr = conf.fk_little_data_threshold
                        # The share of candidate FK values found in the bloom filter reaches the threshold
                        if p >= thr:
                            tmp_joint_fk = []
                            for elem in check_tuple:
                                # Meaning of the tuple: column (elem[1]) of table (table_name) in system (sys_code)
                                # is referenced as a FK by column (elem[0]) of table (fk_table_name) in system (fk_sys_code)
                                tmp_joint_fk.append(
                                    ((sys_code, table_name, elem[1]),
                                     (fk_sys_code, fk_table_name, elem[0])))
                            joint_fks.append(tmp_joint_fk)

                # 6. Save the analysis results to the database
                if joint_fks:
                    res_code = output_helper.save_joint_fk_info(
                        output_conn, joint_fks, conf.output_schema,
                        main_table_code, start_date_str)
                    if res_code == -1:
                        sub_sys_str = " , ".join(
                            sub_sys for sub_sys in sub_sys_code_list)
                        logging.error(
                            "Joint FK analysis with {} as the main table and {} as the secondary systems "
                            "found joint FKs, but saving them to the database failed".format(
                                main_table_code, sub_sys_str))

                    close_db2_connection(output_conn)
                    close_odbc_connection(inceptor_conn)
                    return
                else:
                    res_code = output_helper.update_unfound_joint_fk_sche(
                        output_conn, conf.output_schema, main_table_code,
                        start_date_str)

                    if res_code == -1:
                        sub_sys_str = " , ".join(
                            sub_sys for sub_sys in sub_sys_code_list)
                        logging.error(
                            "Joint FK analysis with {} as the main table and {} as the secondary systems "
                            "found no joint FK, and updating the progress table failed".format(
                                main_table_code, sub_sys_str))
                    close_db2_connection(output_conn)
                    close_odbc_connection(inceptor_conn)
                    return
            logging.warning("多个符合{}表联合主键特征的字段出现在不同的表中,没有交集,找不到联合外键".format(
                main_table_code))
            no_intersection_res_code = output_helper. \
                update_unfound_joint_fk_sche(output_conn, conf.output_schema, main_table_code, start_date_str)
            if no_intersection_res_code == -1:
                sub_sys_str = " , ".join(sub_sys
                                         for sub_sys in sub_sys_code_list)
                logging.error("以{}为主表,{}为从系统进行联合外键分析,未能找到联合外键,更新进度表失败".format(
                    main_table_code, sub_sys_str))
        except Exception as ex:
            logging.warning(str(ex))

    # Close database connections
    close_db2_connection(output_conn)
    close_odbc_connection(inceptor_conn)
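# Hedged sketch (an assumption, not the project's implementation) of what
# get_contains_percent_from_cursor computes in step 5 above: the share of candidate
# joint-FK tuples that are present in the joint-PK bloom filter. How rows are encoded
# before the membership test is assumed here.
def contains_percent(bloom, rows):
    total = hits = 0
    for row in rows:                 # row: one tuple of candidate joint-FK column values
        total += 1
        if tuple(row) in bloom:      # bloom filters may report false positives, never false negatives
            hits += 1
    return hits / total if total else 0.0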
def do_calc_node_table_relations(conf, input_conn, output_conn, node_table_relations, alias_gen, ana_time):
    all_same_set = set()
    not_same_set = set()
    all_relation = []
    input_helper, output_helper = dynamic_import(conf)
    for node_1, node_2, wheres, id_info, rel_fk in node_table_relations:
        selects = set()
        left_selects = []

        wheres_left = [t[0] for t in wheres]
        wheres_right = [t[1] for t in wheres]

        left_pk_alias_col_name = get_alias_col_name(node_1.tab_tup, alias_gen)
        selects.add((node_1.tab_tup, left_pk_alias_col_name))
        left_selects.append((node_1.tab_tup, left_pk_alias_col_name))

        for col in node_1.dim_cols:
            if col == node_1.tab_tup[2]:
                continue
            col_tup = (node_1.tab_tup[0], node_1.tab_tup[1], col)
            alias_col_name = get_alias_col_name(col_tup, alias_gen)
            selects.add((col_tup, alias_col_name))
            left_selects.append((col_tup, alias_col_name))

        right_selects = []
        alias_col_name = get_alias_col_name(node_2.tab_tup, alias_gen)
        right_selects.append((node_2.tab_tup, alias_col_name))
        # selects.add((leaf.tab_tup, alias_col_name))
        for col in node_2.dim_cols:
            if col == node_2.tab_tup[2]:
                continue
            col_tup = (node_2.tab_tup[0], node_2.tab_tup[1], col)
            # Compute this column's alias before using it in the select set
            alias_col_name = get_alias_col_name(col_tup, alias_gen)
            if col not in wheres_right:
                selects.add((col_tup, alias_col_name))
            right_selects.append((col_tup, alias_col_name))

        # Generate a temporary table name
        new_tmp_table_name = alias_gen.generate_tmp_table_name()
        # Parameters for the join
        left_table = node_1.tab_tup[1]
        right_table = node_2.tab_tup[1]

        # Create a temporary table from the inner join of the two sides
        tmp_helper.create_table_from_inner_joins(input_conn, new_tmp_table_name, left_table, left_selects, right_table,
                                                 right_selects, wheres)

        # Pre-filter candidate pairs using min/max value lengths
        selects_alias = [a for _, a in selects]
        lengs_dict = tmp_helper.union_get_max_min_length(input_conn, new_tmp_table_name, selects_alias)
        check_same_set = set()
        for c, c1 in comb_lists([[a for _, a in left_selects if a not in wheres_left],
                                 [a for _, a in right_selects if a not in wheres_right]]):
            if c not in selects_alias or c1 not in selects_alias:
                continue
            leng = lengs_dict[c]
            leng1 = lengs_dict[c1]
            if leng[0] is None or leng[1] is None or leng1[0] is None or leng1[1] is None:
                continue
            if leng1[0] <= leng[0] and leng1[1] >= leng[1]:
                check_same_set.add((c, c1))

        logging.debug("检测同名关系数量:{}".format(str(len(check_same_set))))
        check_same_set = check_same_set - not_same_set - all_same_set
        logging.debug("过滤后检测同名关系数量:{}".format(str(len(check_same_set))))

        check_same_tuples = list(check_same_set)
        same_res = tmp_helper.union_check_same(input_conn, check_same_tuples, new_tmp_table_name)
        res_dict = {}

        for comb, is_same in zip(check_same_tuples, same_res):
            if is_same:
                res_dict[comb] = 'same'
            else:
                not_same_set.add(comb)

        # For combinations that are equal after dropping nulls, check whether they are still equal without dropping nulls
        logging.debug("Checking equality relations")
        check_equals_tuples = list(res_dict.keys())
        equals_res = tmp_helper.union_check_equals(input_conn, check_equals_tuples, new_tmp_table_name)
        for comb, is_equals in zip(check_equals_tuples, equals_res):
            if not is_equals:
                continue
            res_dict[comb] = 'equals'

        count = tmp_helper.get_count(input_conn, new_tmp_table_name)
        if count > 3:
            check_fd_tuples = list(set(comb_lists([[a for _, a in left_selects if a not in wheres_left],
                                                   [a for _, a in right_selects if a not in wheres_right]])) - set(
                res_dict.keys()))
            df = tmp_helper.get_cols_sample(input_conn, new_tmp_table_name, conf.fd_sample_size)
            fds = multi_check_fd_by_pandas(df, check_fd_tuples)
            for c, c1, typ in fds:
                res_dict[(c, c1)] = typ
        else:
            logging.debug("由于join后数据量小于3条。不分析函数依赖")

        decode_rel = []
        for fk in rel_fk:
            decode_rel.append(fk)
        for comb, rel_type in res_dict.items():
            logging.debug("Combination: {}, relation type: {}".format(comb, rel_type))
            all_same_set.add(comb)
            c1_tup = alias_gen.get_origin_tuple(comb[0])
            c2_tup = alias_gen.get_origin_tuple(comb[1])
            decode_rel.append((c1_tup, c2_tup, id_info, rel_type))
        # Persist to the database
        output_helper.save_same_cluster_detail(output_conn, decode_rel, ana_time, conf.output_schema)
        all_relation += decode_rel
        # Clean up temporary tables
        if id_info[0] == '0':
            tmp_helper.clean_tmp_tables(input_conn, [right_table, new_tmp_table_name])
        elif id_info[0] == '1':
            tmp_helper.clean_tmp_tables(input_conn, [left_table, right_table, new_tmp_table_name])

    return all_relation
def generate_tmp_tables(conf, input_conn, output_conn, tables, alias_gen, filter_fks):
    res_fk = []
    filter_fks_set = set(filter_fks)
    input_helper, output_helper = dynamic_import(conf)
    fks = output_helper.get_all_fks(output_conn, conf.output_schema)
    fds = output_helper.get_all_fds(output_conn, conf.output_schema)
    cols_distinct = output_helper.get_columns_distinct(output_conn, conf.output_schema)
    cols_flg = output_helper.get_all_flag_columns(output_conn, conf.output_schema)
    analysis_conf_dict = output_helper.get_config_info(output_conn, conf.output_schema)
    joint_fks = output_helper.get_all_joint_fks(output_conn, conf.output_schema)

    def rm_default_cols(system_code, table_code, columns):
        new_cols = []
        for c in columns:
            if (system_code, table_code, c) in cols_flg:
                continue
            if (system_code, table_code, c) in cols_distinct:
                if int(cols_distinct[(system_code, table_code, c)]) > 1:
                    new_cols.append(c)
            else:
                new_cols.append(c)
        return new_cols

    def process_tmp_table(one_node):
        selects = set()
        # Generate a temporary table name
        new_tmp_table_name = alias_gen.generate_tmp_table_name()
        alias_fk_col_name = get_alias_col_name(one_node.tab_tup, alias_gen)
        selects.add((one_node.tab_tup[2], alias_fk_col_name))

        alias_cols = []
        for col in one_node.dim_cols:
            if col == one_node.tab_tup[2]:
                continue
            col_tup = (one_node.tab_tup[0], one_node.tab_tup[1], col)
            alias_col_name = get_alias_col_name(col_tup, alias_gen)
            alias_cols.append(alias_col_name)
            selects.add((col, alias_col_name))

        if (one_node.tab_tup[0], one_node.tab_tup[1]) in analysis_conf_dict:
            table_config = analysis_conf_dict[(one_node.tab_tup[0], one_node.tab_tup[1])]
            alg = table_config['ANA_ALG']
            etl_date = table_config['ETL_DATE']
            date_offset = table_config['DATE_OFFSET']
        else:
            logging.error("System {} table {} was not found in the configuration table!".format(one_node.tab_tup[0], one_node.tab_tup[1]))
            raise OSError("System {} table {} was not found in the configuration table!".format(one_node.tab_tup[0], one_node.tab_tup[1]))

        etl_dates = date_trans(etl_date, date_offset)
        if alg == "F5":
            tmp_helper.create_tmp_table(input_conn, new_tmp_table_name, one_node.tab_tup[1], selects, etl_dates[-1:])
        elif alg == "I":
            tmp_helper.create_tmp_table(input_conn, new_tmp_table_name, one_node.tab_tup[1], selects, etl_dates)
        elif alg == "IU":
            # TODO 增加IU方式创建临时表的函数
            pass
        else:
            logging.ERROR("{}表使用了未知算法{}".format(one_node.tab_tup[1], alg))
            raise OSError("{}表使用了未知算法{}".format(one_node.tab_tup[1], alg))
        one_node.tab_tup = ('tmp', new_tmp_table_name, alias_fk_col_name)
        one_node.dim_cols = alias_cols

    # Note: since a primary key determines every other column of its table, PK nodes should reuse existing nodes
    all_pk_nodes = {}  # stores the PK nodes that already exist
    node_table_relations = []
    # Handle single-column primary keys
    for fk in fks:
        if fk['ID'] in filter_fks_set:
            continue
        if (fk['FK_SYS_CODE'], fk['FK_TABLE_CODE']) not in tables or \
                (fk['SYS_CODE'], fk['TABLE_CODE']) not in tables:
            continue

        rel_fk = [((fk['FK_SYS_CODE'], fk['FK_TABLE_CODE'], fk['FK_COL_CODE']),
                   (fk['SYS_CODE'], fk['TABLE_CODE'], fk['COL_CODE']), ('0', fk['ID']), 'fk')]

        if (fk['FK_SYS_CODE'], fk['FK_TABLE_CODE']) not in fds or \
                (fk['SYS_CODE'], fk['TABLE_CODE']) not in fds:
            res_fk += rel_fk
            continue

        where = [(get_alias_col_name((fk['FK_SYS_CODE'], fk['FK_TABLE_CODE'], fk['FK_COL_CODE']), alias_gen),
                  get_alias_col_name((fk['SYS_CODE'], fk['TABLE_CODE'], fk['COL_CODE']), alias_gen))]
        # Take the attribute closure to obtain all derivable columns
        fk_link_cols = {fk['COL_CODE']}
        fk_tab_fds = []
        for left_fz_set, right_list in fds[(fk['SYS_CODE'], fk['TABLE_CODE'])].items():
            if '' in left_fz_set:
                continue
            for right in right_list:
                fk_tab_fds.append((left_fz_set, right))
        fk_cols_set = closures_cycle(fk_tab_fds, fk_link_cols)
        fk_not_default_cols = list(fk_link_cols) + rm_default_cols(fk['SYS_CODE'], fk['TABLE_CODE'],
                                                                   list(fk_cols_set - fk_link_cols))
        if len(fk_not_default_cols) <= 1:
            res_fk += rel_fk
            continue
        fk_node = DimNode((fk['SYS_CODE'], fk['TABLE_CODE'], fk['COL_CODE']),
                          fk_not_default_cols)
        process_tmp_table(fk_node)

        if (fk['FK_SYS_CODE'], fk['FK_TABLE_CODE'], fk['FK_COL_CODE']) in all_pk_nodes:
            pk_node = all_pk_nodes[(fk['FK_SYS_CODE'], fk['FK_TABLE_CODE'], fk['FK_COL_CODE'])]
        else:
            pk_cols = [tup[2] for tup in cols_distinct if
                       tup[0] == fk['FK_SYS_CODE'] and tup[1] == fk['FK_TABLE_CODE']]
            pk_node = DimNode((fk['FK_SYS_CODE'], fk['FK_TABLE_CODE'], fk['FK_COL_CODE']),
                              rm_default_cols(fk['FK_SYS_CODE'], fk['FK_TABLE_CODE'], pk_cols))
            process_tmp_table(pk_node)
            all_pk_nodes[(fk['FK_SYS_CODE'], fk['FK_TABLE_CODE'], fk['FK_COL_CODE'])] = pk_node
        node_table_relations.append((pk_node, fk_node, where, ('0', fk['ID']), rel_fk))

    # Handle joint primary keys
    for _, one_joint_fk in joint_fks.items():
        first = one_joint_fk[0]
        if first['GROUP_CODE'] in filter_fks_set:
            continue
        if (first['FK_SYS_CODE'], first['FK_TABLE_CODE']) not in tables or \
                (first['SYS_CODE'], first['TABLE_CODE']) not in tables:
            continue

        where = []
        rel_fk = []
        fk_link_cols = set()
        for fk in one_joint_fk:
            where.append((get_alias_col_name((fk['FK_SYS_CODE'], fk['FK_TABLE_CODE'], fk['FK_COL_CODE']), alias_gen),
                          get_alias_col_name((fk['SYS_CODE'], fk['TABLE_CODE'], fk['COL_CODE']), alias_gen)))
            fk_link_cols.add(fk['COL_CODE'])
            rel_fk.append(((fk['FK_SYS_CODE'], fk['FK_TABLE_CODE'], fk['FK_COL_CODE']),
                           (fk['SYS_CODE'], fk['TABLE_CODE'], fk['COL_CODE']), ('1', fk['GROUP_CODE']), 'fk'))

        if (first['FK_SYS_CODE'], first['FK_TABLE_CODE']) not in fds or \
                (first['SYS_CODE'], first['TABLE_CODE']) not in fds:
            res_fk += rel_fk
            continue
        # Take the attribute closure to obtain all derivable columns
        fk_tab_fds = []
        for left_fz_set, right_list in fds[(first['SYS_CODE'], first['TABLE_CODE'])].items():
            if '' in left_fz_set:
                continue
            for right in right_list:
                fk_tab_fds.append((left_fz_set, right))
        fk_cols_set = closures_cycle(fk_tab_fds, fk_link_cols)
        fk_not_default_cols = list(fk_link_cols) + rm_default_cols(first['SYS_CODE'], first['TABLE_CODE'],
                                                                   list(fk_cols_set - fk_link_cols))
        if len(fk_not_default_cols) <= len(fk_link_cols):
            res_fk += rel_fk
            continue
        fk_node = DimNode((first['SYS_CODE'], first['TABLE_CODE'], first['COL_CODE']), fk_not_default_cols)
        process_tmp_table(fk_node)
        pk_cols = [tup[2] for tup in cols_distinct if
                   tup[0] == first['FK_SYS_CODE'] and tup[1] == first['FK_TABLE_CODE']]
        pk_node = DimNode((first['FK_SYS_CODE'], first['FK_TABLE_CODE'], first['FK_COL_CODE']),
                          rm_default_cols(first['FK_SYS_CODE'], first['FK_TABLE_CODE'], pk_cols))
        process_tmp_table(pk_node)
        node_table_relations.append((pk_node, fk_node, where, ('1', first['GROUP_CODE']), rel_fk))

    return node_table_relations, res_fk
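# Hedged sketch of the attribute-closure step used above. closures_cycle is assumed to
# behave like the textbook closure algorithm: starting from the FK columns, repeatedly
# add the right-hand side of every functional dependency whose left-hand side is
# already contained in the closure.
def attribute_closure(fds, seed_cols):
    # fds: iterable of (left_hand_side_set, right_hand_side_column); seed_cols: set of columns
    closure = set(seed_cols)
    changed = True
    while changed:
        changed = False
        for left, right in fds:
            if set(left) <= closure and right not in closure:
                closure.add(right)
                changed = True
    return closure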
Example #11
def analyse_table_pk(conf,
                     input_conn,
                     output_conn,
                     sys_code,
                     ori_table_code,
                     etl_date,
                     date_offset,
                     alg,
                     start_date_str=None):
    """
    主键分析逻辑代码
    :param conf: 配置对象
    :param sys_code: 系统编号
    :param ori_table_code: 原始表编号
    :param etl_date: 函数依赖分析取数时间,用于得到候选联合主键后进行校验
    :param date_offset: 函数依赖分析取数时间偏移量,用于得到候选联合主键后进行校验
    :param alg: 函数依赖分析算法,用于得到联合主键后进行校验
    :param start_date_str: 主键分析开始时间,用于更新分析进度表
    :return:
    """
    assert isinstance(conf, Config)
    input_helper, output_helper = dynamic_import(conf)
    etl_dates = date_trans(etl_date, date_offset)
    # 1、根据系统名和原始表名在函数依赖关系表中查找该表全部的函数依赖关系
    fds = output_helper.get_fd_by_sys_table(output_conn, conf.output_schema,
                                            sys_code, ori_table_code)
    # 2. Analyse the functional dependencies to obtain candidate keys
    # Get all columns of the table to be analysed
    table_columns = output_helper.get_table_all_columns(
        output_conn, conf.output_schema, sys_code, ori_table_code)
    checked_single_pk = []
    checked_joint_pk = []
    if fds:
        # At most 10,000 FDs for the table: analyse all of them
        if len(fds) <= 10000:
            candidates = find_candidate_code(fds, table_columns)
        else:
            # More than 10,000 FDs: only analyse those whose left-hand side has at most 3 columns,
            # otherwise the closure computation becomes intractable
            level_filter_fds = [fd for fd in fds if 0 < len(fd[0]) <= 3]
            candidates = find_candidate_code(level_filter_fds, table_columns)
        # If candidates is empty, no candidate key was found
        if len(candidates) == 0:
            res_code = output_helper. \
                update_unfound_candidate_sche(output_conn, conf.output_schema, ori_table_code, start_date_str)
            if res_code == -1:
                logging.error("Table {} has no candidate key; updating the progress table failed".format(ori_table_code))
            logging.warning("System {} table {}: no candidate key found, cannot continue PK analysis".format(
                sys_code, ori_table_code))
            return

        # 3. Fetch the features of all columns of the table
        feature_dict = output_helper.get_table_feature(output_conn, sys_code,
                                                       sys_code,
                                                       ori_table_code,
                                                       conf.output_schema)
        if feature_dict:
            # 4. Candidate keys of length 1 are validated as single PKs, longer ones as joint PKs
            checked_single_pk, checked_joint_pk = check_candidate(
                conf, input_conn, output_conn, sys_code, ori_table_code,
                candidates, feature_dict, alg, etl_dates)
        else:
            logging.error("System {} table {}: no column features found".format(sys_code, ori_table_code))
    else:
        logging.error("System {} table {}: no functional dependencies found".format(sys_code, ori_table_code))
        if len(table_columns) > 3:
            logging.error("System {} table {}: no functional dependencies found and the table has {} columns, "
                          "please investigate".format(sys_code, ori_table_code, len(table_columns)))
            return
        checked_joint_pk = [table_columns]
    # 5. Validation passed; save the results into the corresponding tables
    save_pk_result(conf, output_conn, output_helper, sys_code, ori_table_code,
                   checked_single_pk, checked_joint_pk, start_date_str)
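# Hedged sketch of what find_candidate_code is assumed to do: enumerate small column
# combinations and keep those whose closure (see the attribute_closure sketch above)
# covers every column and that do not contain a smaller key. Names and the size limit
# are illustrative only.
from itertools import combinations

def find_candidate_keys(fds, all_columns, max_size=3):
    keys = []
    for size in range(1, max_size + 1):
        for combo in combinations(all_columns, size):
            if any(set(k) <= set(combo) for k in keys):
                continue                                   # skip supersets of a known key
            if attribute_closure(fds, set(combo)) >= set(all_columns):
                keys.append(list(combo))
    return keys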
Example #12
def check_candidate(conf, input_conn, output_conn, sys_code, ori_table_code,
                    candidates, feature_dict, alg, etl_dates):
    """
    Check the candidate keys and confirm the primary key(s).
    :param conf:
    :param input_conn:
    :param output_conn:
    :param sys_code:
    :param ori_table_code:
    :param candidates:
    :param feature_dict:
    :param alg:
    :param etl_dates:
    :return:
    """
    input_helper, output_helper = dynamic_import(conf)
    checked_single_pk, checked_joint_pk = [], []
    for candidate in candidates:
        if len(candidate) == 1:
            # The check passes if the column contains no Chinese characters, is not of type DOUBLE,
            # TIMESTAMP, TIME or DATE, and the row count equals the column's distinct count
            if feature_dict[candidate[0]]['HAS_CHINESE'] == '0' and \
                    feature_dict[candidate[0]]['COL_TYPE'].rstrip() not in ("DOUBLE", "TIMESTAMP", "TIME", "DATE"):
                records = int(feature_dict[candidate[0]]['COL_RECORDS'])
                distinct = int(feature_dict[candidate[0]]['COL_DISTINCT'])
                if records == distinct:
                    checked_single_pk.append(candidate)
                elif records >= conf.pk_threshold and (
                        records - distinct) < 5:  # above the threshold, tolerate up to 5 dirty rows
                    checked_single_pk.append(candidate)
        else:
            continue_flag = False
            for col in candidate:
                if feature_dict[col]['HAS_CHINESE'] == '1' or feature_dict[
                        col]['COL_NULLABLE'] == '1':
                    continue_flag = True
                    break
            if continue_flag:
                continue

            if alg == "F5":
                distinct = input_helper. \
                    get_mul_distinct_count(input_conn, ori_table_code, candidate, etl_dates[-1])
                records = input_helper.get_count(input_conn, ori_table_code,
                                                 etl_dates[-1])
            elif alg == "I":
                distinct = input_helper.get_mul_distinct_count(
                    input_conn, ori_table_code, candidate, etl_dates)
                records = input_helper.get_count(input_conn, ori_table_code,
                                                 etl_dates)
            elif alg == "IU":
                trans_table_code = output_helper. \
                    get_trans_table_name(output_conn, conf.output_schema, ori_table_code)
                distinct = input_helper. \
                    get_mul_distinct_count(input_conn, trans_table_code, candidate, etl_dates[-1])
                records = input_helper.get_count(input_conn, trans_table_code,
                                                 etl_dates[-1])
            else:
                logging.error("System {} table {} uses unsupported unload mode {}".format(
                    sys_code, ori_table_code, alg))
                return

            if records == distinct:
                append_flag = True
            elif records >= conf.pk_threshold and (
                    records - distinct) < 5:  # above the threshold, tolerate up to 5 dirty rows
                append_flag = True
            else:
                append_flag = False

            if append_flag:
                checked_joint_pk.append(candidate)
    return checked_single_pk, checked_joint_pk
import logging
from configuration import Config
from utils.log_util import init_log
from utils.common_util import dynamic_import
from helper.same_cluster_helper import run_cluster
from dao import close_db2_connection, close_odbc_connection, get_input_output_conn

if __name__ == '__main__':
    init_log(log_path='../logs/same_cluster', level=logging.DEBUG)
    conf = Config()
    input_helper, output_helper = dynamic_import(conf)
    input_conn, output_conn = get_input_output_conn(conf)

    run_cluster(conf, output_conn)
    close_odbc_connection(input_conn)
    close_db2_connection(output_conn)
Example #14
def analyse_table_fds_by_spark(conf, sys_code, table_name, alg, etl_dates,
                               start_date_str, fd_sample_size):
    logging.info("{}表使用spark分析{}函数依赖关系".format(table_name, fd_sample_size))
    import time
    st_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    assert isinstance(conf, Config)
    input_conn, output_conn = get_input_output_conn(conf)
    input_helper, output_helper = dynamic_import(conf)
    # Path of the csv file the data is unloaded to from hive
    tmp_csv_file = os.path.abspath(
        os.path.join(conf.fd_tmp_path, "{}.tmp".format(table_name)))
    # Path of the analysis result
    tmp_res_path = os.path.abspath(os.path.join(conf.fd_tmp_path,
                                                table_name)).replace(
                                                    "\\", "/")
    # Corresponding HDFS paths
    hdfs_tmp_csv_file = "/tmp/fd/%s.tmp" % table_name
    hdfs_tmp_res_path = "/tmp/fd/%s" % table_name
    logging.info("Starting functional dependency analysis of table {}!".format(table_name))
    if not os.path.exists(tmp_res_path):
        # 1. Sample the data
        try:
            if alg == '':
                alg = output_helper.get_tab_alg_single(output_conn,
                                                       conf.output_schema,
                                                       sys_code, table_name)
            if alg == "F5":
                data, size, col_num = input_helper.get_cols_sample(
                    input_conn, table_name, fd_sample_size, etl_dates[-1])
            elif alg == "I":
                data, size, col_num = input_helper.get_cols_sample(
                    input_conn, table_name, fd_sample_size, etl_dates)
            elif alg == "IU":
                trans_table_name = get_trans_table_name(
                    output_conn, conf.output_schema, table_name)
                data, size, col_num = input_helper.get_cols_sample(
                    input_conn, trans_table_name, fd_sample_size,
                    etl_dates[-1])
            else:
                logging.warning("{}表使用了未知算法{}".format(table_name, alg))
                close_odbc_connection(input_conn)
                close_db2_connection(output_conn)
                return '004'
        except Exception as e:
            logging.error("Exception while sampling data for FD analysis of table {}: {}".format(table_name, e))

        if size < conf.min_records:
            logging.warning("{}表数据过少!".format(table_name))
            fds = []
            output_helper.save_table_fd(output_conn, sys_code, table_name, fds,
                                        conf.output_schema, start_date_str,
                                        '2')
            close_odbc_connection(input_conn)
            close_db2_connection(output_conn)
            return "001"
        df = pd.DataFrame(data)

        # df.to_csv(tmp_csv_file, encoding='utf-8', sep='$', index=False)
        df.to_parquet(tmp_csv_file, compression='UNCOMPRESSED')
        del df

        if conf.spark_mode == 'yarn':
            cmd_hdfs = "hdfs dfs -put -f %s %s" % (tmp_csv_file,
                                                   hdfs_tmp_csv_file)
            execute_command(cmd_hdfs)
            cmd_rm = "hdfs dfs -rm -r -f %s" % hdfs_tmp_res_path
            execute_command(cmd_rm)
            # cmd = "spark-submit  --master yarn --deploy-mode client " + \
            #       "--driver-memory 4G --num-executors 12 --executor-cores 2 --executor-memory 3G " + \
            #       "--conf spark.default.parallelism=50 --conf spark.storage.memoryFraction=0.4 " + \
            #       "--conf spark.sql.shuffle.partitions=50 --conf spark.shuffle.memoryFraction=0.5 " + \
            #       "--class com.bigdata.hyshf.main.Main {} ".format(conf.fd_hdfs_jar_path) + \
            #       "--inputFilePath {} ".format(hdfs_tmp_csv_file) + \
            #       "--outputFilePath {} ".format(hdfs_tmp_res_path) + \
            #       "--inputFileHasHeader true " + \
            #       "--inputFileSeparator $"
            # cmd = "spark-submit  --master yarn --deploy-mode client " + \
            #       "--driver-memory 16G --num-executors 6 --executor-cores 2 --executor-memory 10G " + \
            #       "--conf spark.default.parallelism=50 --conf spark.storage.memoryFraction=0.4 " + \
            #       "--conf spark.sql.shuffle.partitions=50 --conf spark.shuffle.memoryFraction=0.5 " + \
            #       "--class com.bigdata.hyshf.main.Main {} ".format(conf.fd_hdfs_jar_path) + \
            #       "--inputFilePath {} ".format(hdfs_tmp_csv_file) + \
            #       "--outputFilePath {} ".format(hdfs_tmp_res_path) + \
            #       "--inputFileHasHeader true " + \
            #       "--inputFileSeparator $"
            cmd = "spark-submit  --master yarn --deploy-mode cluster " + \
                  "--driver-memory 20G --executor-cores 8 --executor-memory 20G --num-executors 3 " + \
                  "--conf spark.driver.maxResultSize=20G --conf spark.storage.memoryFraction=0.4 " + \
                  "--conf spark.shuffle.memoryFraction=0.5 --conf spark.shuffle.spill.compress=true " + \
                  "--conf spark.kryoserializer.buffer.max=128m --name FD_{} ".format(table_name) + \
                  "--class com.bigdata.hyshf.main.Main {} ".format(conf.fd_hdfs_jar_path) + \
                  "--inputFilePath {} ".format(hdfs_tmp_csv_file) + \
                  "--outputFilePath {} ".format(hdfs_tmp_res_path) + \
                  "--inputFileHasHeader true " + \
                  "--inputFileSeparator $ " + \
                  "--useParquet true"
        else:
            cmd = "spark-submit  --master local[*] " + \
                  "--class com.bigdata.hyshf.main.Main {} ".format(conf.fd_jar_path) + \
                  "--inputFilePath file://{} ".format(tmp_csv_file) + \
                  "--outputFilePath file://{} ".format(os.path.abspath(tmp_res_path)) + \
                  "--inputFileHasHeader true " + \
                  "--inputFileSeparator $" + \
                  "--useParquet true"

        timeout = 60 * 60
        res_int = execute_command(cmd)
        # res_int = execute_command(cmd, timeout=timeout)
        logging.debug("spark执行返回代码:{}".format(res_int))
    else:
        res_int = 0

    if res_int == 0 and conf.spark_mode == 'yarn':
        # logging.info("{}表spark程序完成".format(table_name))
        if os.path.exists(tmp_res_path + "/part-00000"):
            os.remove(tmp_res_path + "/part-00000")
            os.rmdir(tmp_res_path)
        cmd_hdfs = "hdfs dfs -get %s %s" % (hdfs_tmp_res_path, tmp_res_path)
        hdfs_to_local_res = execute_command(cmd_hdfs)
        if hdfs_to_local_res != 0:
            logging.error("FD analysis of table {} finished, but fetching the result from hdfs to local failed".format(table_name))
            return
    if res_int == 0:
        # Fix: there may be no functional dependencies that satisfy the criteria
        try:
            fds = parse_result(tmp_res_path + "/part-00000")
            output_helper.save_table_fd(output_conn, sys_code, table_name, fds,
                                        conf.output_schema, start_date_str,
                                        '1')
        except Exception as e:
            logging.error("{}表函数依赖未正常保存:{}".format(table_name, e))
            close_odbc_connection(input_conn)
            close_db2_connection(output_conn)
            return "005"
        ed_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        logging.info('Functional dependency analysis of table {} started at: {}'.format(table_name, st_time))
        logging.info("Functional dependency analysis of table {} finished normally at: {}".format(table_name, ed_time))
        try:
            # Delete temporary files
            if os.path.exists(tmp_res_path + "/part-00000"):
                os.remove(tmp_res_path + "/part-00000")
            if os.path.exists(tmp_res_path):
                os.rmdir(tmp_res_path)
        except Exception as e:
            logging.error("{}表临时文件删除失败:{}".format(table_name, e))
            close_odbc_connection(input_conn)
            close_db2_connection(output_conn)
            return "006"
        close_odbc_connection(input_conn)
        close_db2_connection(output_conn)
        return "000"
    elif res_int == -1:
        fds = []
        output_helper.save_table_fd(output_conn, sys_code, table_name, fds,
                                    conf.output_schema, start_date_str, '3')
        logging.warning("{}表函数依赖计算超时".format(table_name))
        close_odbc_connection(input_conn)
        close_db2_connection(output_conn)
        return "002"
    else:
        fds = []
        output_helper.save_table_fd(output_conn, sys_code, table_name, fds,
                                    conf.output_schema, start_date_str, '4')
        logging.error("{}表函数依赖计算发生异常".format(table_name))
        close_odbc_connection(input_conn)
        close_db2_connection(output_conn)
        return "003"
Example #15
def analyse_table_feature(conf, sys_code, table_code, alg, etl_dates,
                          start_date_str=None):
    """
    Analyse column features table by table.
    :param conf: configuration object
    :param sys_code: system code
    :param table_code: table code
    :param alg: unload method of the source table
    :param etl_dates: unload dates of the source table
    :param start_date_str: start time of the single-table column feature analysis
    :return:
    """
    assert isinstance(conf, Config)
    assert isinstance(etl_dates, list)

    if start_date_str is None:
        start_date_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    input_conn, output_conn = get_input_output_conn(conf)
    input_helper, output_helper = dynamic_import(conf)

    # Holds the column features
    features = {}
    # Holds the code values of code-like columns
    code_value_dict = {}
    size, data, col_num, distinct_col_count, count, distinct, max_len, min_len = \
        None, None, None, None, None, None, None, None

    # 1. Sample the data and compute the table row count
    try:
        if alg == "F5":
            data, size, col_num = input_helper.\
                get_cols_sample(input_conn, table_code, conf.feature_sample_size, etl_dates[-1])
            count = input_helper.get_count(input_conn, table_code, etl_dates[-1])
        elif alg == "I":
            data, size, col_num = input_helper.get_cols_sample(input_conn, table_code, conf.feature_sample_size, etl_dates)
            count = input_helper.get_count(input_conn, table_code, etl_dates)
        elif alg == "IU":
            trans_table_code = get_trans_table_name(output_conn, conf.output_schema, table_code)
            data, size, col_num = input_helper.\
                get_cols_sample(input_conn, trans_table_code, conf.feature_sample_size, etl_dates[-1])
            count = input_helper.get_count(input_conn, trans_table_code, etl_dates[-1])
        else:
            logging.error("{}表使用了不支持卸数方式{}".format(table_code, alg))
            close_odbc_connection(input_conn)
            close_db2_connection(output_conn)
            exit(-1)
    except Exception as e:
        logging.error("Exception while sampling data for column feature analysis of table {}: {}".format(table_code, e))

    # If the sample size is below the column-feature-analysis threshold, log it
    if size < conf.min_records:
        logging.warning("Table {}: actual sample size {} is below the column feature analysis threshold {}".format(
            table_code, size, conf.min_records))
        # Update the progress table to status 2 because the sample is too small
        res_code = output_helper.update_unana_feature_sche(output_conn, conf.output_schema, table_code, start_date_str)
        if res_code != 0:
            logging.error("Table {}: sample size below threshold, and updating the progress table failed".format(table_code))
        close_odbc_connection(input_conn)
        close_db2_connection(output_conn)
        return

    logging.info("Starting column feature analysis of table {}".format(table_code))

    # Iterate over every column of the table
    for col_name, col_data in data.items():
        # Column value check
        if not isinstance(col_data[0], str):
            logging.warning("Table {} column {} is not of string type; feature analysis is not possible".format(table_code, col_name))
            continue

        feature = Feature()

        # 2) Distinct count, min/max value length, and default-value detection for the column
        if alg == "F5":
            distinct = input_helper.get_distinct_count(input_conn, table_code, col_name, etl_dates[-1])
            min_len, max_len = input_helper.get_min_max_length(input_conn, table_code, col_name, etl_dates[-1])
            distinct_col_count = input_helper.get_distinct_col_count(input_conn, table_code, col_name, etl_dates[-1])
        elif alg == "I":
            distinct = input_helper.get_distinct_count(input_conn, table_code, col_name, etl_dates)
            min_len, max_len = input_helper.get_min_max_length(input_conn, table_code, col_name, etl_dates)
            distinct_col_count = input_helper.get_distinct_col_count(input_conn, table_code, col_name, etl_dates)
        elif alg == "IU":
            trans_table_code = get_trans_table_name(output_conn, conf.output_schema, table_code)
            distinct = input_helper.\
                get_distinct_count(input_conn, trans_table_code, col_name, etl_dates[-1])
            min_len, max_len = input_helper.get_min_max_length(input_conn, trans_table_code, col_name, etl_dates[-1])
            distinct_col_count = input_helper.\
                get_distinct_col_count(input_conn, trans_table_code, col_name, etl_dates[-1])
        else:
            logging.error("{}表使用了不支持卸数方式{}".format(table_code, alg))
            close_odbc_connection(input_conn)
            close_db2_connection(output_conn)
            exit(-1)

        if int(distinct_col_count) == 1:
            feature.default_value = True
        feature.records = count
        feature.distinct = distinct
        feature.max_len = max_len
        feature.min_len = min_len

        # 3) Infer column features from the column values themselves
        feature, code_value_set = \
            infer_feature(conf, col_name, col_data, input_conn, table_code, alg, output_conn, etl_dates, feature=feature)
        # If the column is code-like, store its code values in code_value_dict
        if code_value_set:
            code_value_dict[col_name] = code_value_set
        features[col_name] = feature
    # 4. Save the results
    stat = output_helper.\
        save_table_features(output_conn, sys_code, sys_code, table_code, features, conf.output_schema, start_date_str,
                            col_num, code_value_dict)
    if stat != 0:
        logging.error("Failed to save the analysis results of table {} to the database".format(table_code))

    logging.info("Column feature analysis of table {} finished".format(table_code))

    # Close database connections
    close_odbc_connection(input_conn)
    close_db2_connection(output_conn)
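# Hedged sketch: the per-column statistics gathered above through SQL helpers
# (distinct count, min/max value length, single-valued "default" flag) could be
# approximated on a sampled pandas column like this. Names are illustrative only.
import pandas as pd

def column_stats(col: pd.Series) -> dict:
    lengths = col.dropna().astype(str).str.len()
    distinct = int(col.nunique(dropna=True))
    return {
        "distinct": distinct,
        "min_len": int(lengths.min()) if not lengths.empty else 0,
        "max_len": int(lengths.max()) if not lengths.empty else 0,
        "default_value": distinct == 1,   # every sampled value identical
    }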