Example #1
def fd_main(sys_code, tab_code, etl_date, date_offset, alg, sample_size, start_date_str):
    etl_dates = date_trans(etl_date, date_offset)
    conf = Config()
    output_conn = None
    if conf.output_db == "db2":
        output_conn = get_db2_connect(conf.output_db_url)
    else:
        logging.error("输出配置数据库未适配 :{}".format(conf.output_db))
        exit(-1)
    # Check the output; tables that have already been analysed skip this step
    # Functional-dependency analysis
    fd_sche = get_analysis_schedule_single(output_conn, conf.output_schema, sys_code, tab_code)['FD_SCHE']
    ibm_db.close(output_conn)
    if fd_sche == "1":
        logging.warning("该表已完成函数依赖分析:{}".format(tab_code))
        exit(-1)
    else:
        analyse_table_fds(conf, sys_code, tab_code, alg, etl_dates, start_date_str, sample_size, status=fd_sche)
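
A minimal usage sketch for this entry point; every argument value below is
hypothetical, and date_trans/get_db2_connect/analyse_table_fds are assumed to
come from the project's utils.common_util wildcard import:

import time

if __name__ == "__main__":
    start_date_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # "S01"/"S01_CUSTOMER" are made-up system and table codes, and "tane"
    # assumes a TANE-style FD-discovery algorithm name.
    fd_main("S01", "S01_CUSTOMER", etl_date="2021-01-01", date_offset=1,
            alg="tane", sample_size=100000, start_date_str=start_date_str)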
Example #2
from utils.common_util import *
import logging
import time
import multiprocessing
from main.fk_main import analyse_table_fk
import ibm_db
from dao.output.db2_helper import get_fk_sys

init_log('../logs/fk', level=logging.DEBUG)

if __name__ == "__main__":
    conf = Config()
    start_date_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    inceptor_conn = get_odbc_connect(conf.dsn)
    # Check the output; tables that have already been analysed skip this step
    output_conn = None
    if conf.output_db == "db2":
        output_conn = get_db2_connect(conf.output_db_url)
        import dao.output.db2_helper as output_helper
    else:
        logging.error("输出配置数据库未适配 :{}".format(conf.output_db))
        exit(-1)
    # Get all systems for foreign-key analysis
    fk_sys_all = get_fk_sys(output_conn, conf.output_schema)
    # Get configuration-table info
    analysis_conf_dict = output_helper.get_config_info(output_conn,
                                                       conf.output_schema)
    # Read the analysis progress for all tables
    analysis_schedule_dict = output_helper.get_analysis_schedule(
        output_conn, conf.output_schema)
    # Dict holding tables awaiting single foreign-key analysis
    table_need_analysis_dict = {}
    for (sys_code, ori_table_code) in analysis_conf_dict:
        ...
Example #3
def fd_merge_main(conf, sys_code, table_code, start_date_str):
    schema = conf.output_schema
    conn = None
    if conf.output_db == "db2":
        conn = get_db2_connect(conf.output_db_url)
        import dao.output.db2_helper as output_helper
    else:
        logging.error("输出配置数据库未适配 :{}".format(conf.output_db))
        exit(-1)
    try:
        # key: FD right-hand side; value: tuples of left-hand sides that can
        # derive that right-hand side
        fd_dict_1, fd_dict_2 = output_helper.get_fd_tmp(
            conn, schema, sys_code, table_code)
    except Exception as e:
        logging.error("临时函数依赖关系读取失败 :{}:{}".format(table_code, e))
        return '001'
    # Take the intersection of the FD right-hand sides
    right_cols = list(set(fd_dict_1.keys()) & set(fd_dict_2.keys()))
    merge_res = {}
    try:
        # Iterate over the intersection of FD right-hand sides
        for right_col in tqdm(right_cols):
            fd_intersect = set(fd_dict_1[right_col]) & set(
                fd_dict_2[right_col])
            left_col_list = list(fd_intersect)
            fd_diff_1 = set(fd_dict_1[right_col]) - fd_intersect
            fd_diff_2 = set(fd_dict_2[right_col]) - fd_intersect
            for fd_1 in list(fd_diff_1):
                for fd_2 in list(fd_diff_2):
                    fd_1 = set(fd_1)
                    fd_2 = set(fd_2)
                    if fd_1 & fd_2:
                        fd_new = fd_1 | fd_2
                        fd_new = list(fd_new)
                        fd_new.sort()
                        left_col_list.append(tuple(fd_new))
            left_col_list = list(set(left_col_list))
            print('{}:{}'.format(right_col, len(left_col_list)))
            left_col_list.sort(key=lambda i: len(i))

            # Reduce the dependency relations to their minimal form
            # left_col_list_res = left_col_list.copy()
            # for fd in left_col_list:
            #     for fd_sub in left_col_list_res:
            #         if fd == fd_sub:
            #             continue
            #         else:
            #             if len(fd) < len(fd_sub):
            #                 break
            #             if set(fd_sub).issubset(set(fd)):
            #                 if fd in left_col_list_res:
            #                      left_col_list_res.remove(fd)
            #                 break
            # merge_res[right_col] = left_col_list_res
            # merge_flag = True
            # Keep only minimal left-hand sides: walk the length-sorted list
            # and drop any LHS that is a superset of an earlier, shorter LHS
            if not left_col_list:
                merge_res[right_col] = []
                continue
            fd_sub_num = 0
            left_cols = left_col_list.copy()
            left_cols_tmp = left_cols.copy()
            max_len = max(len(i) for i in left_cols)
            while True:
                fd_sub = left_cols[fd_sub_num]
                if len(fd_sub) == max_len or fd_sub_num == (len(left_cols) -
                                                            1):
                    break
                fd_sub = set(fd_sub)
                # left_cols_tmp = left_cols.copy()
                for fd in left_cols[fd_sub_num + 1:]:
                    if len(fd) == len(fd_sub):
                        continue
                    if fd_sub.issubset(set(fd)):
                        left_cols_tmp.remove(fd)
                left_cols = left_cols_tmp.copy()
                fd_sub_num += 1
            merge_res[right_col] = left_cols_tmp

    except Exception as e:
        logging.error("函数依赖关系合并失败 :{}:{}".format(table_code, e))
        return '002'
    code = output_helper.fd_merge_save(conn, schema, sys_code, table_code,
                                       merge_res, start_date_str)
    if code == 0:
        logging.info("函数依赖关系合并完成 :{}".format(table_code))
    ibm_db.close(conn)
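
The while-loop above reduces each right column's candidate left-hand sides to
a minimal set by dropping any LHS that is a superset of a shorter one. A
self-contained restatement of that step (a sketch on toy data, not the
project's code):

def minimize_left_sides(left_col_list):
    # Visit unique LHS tuples shortest-first; keep a candidate only if no
    # already-kept LHS is a proper subset of it.
    minimal = []
    for cand in sorted(set(left_col_list), key=len):
        if not any(set(kept) < set(cand) for kept in minimal):
            minimal.append(cand)
    return minimal

# ('A',) subsumes ('A', 'B'), so only the minimal left sides survive:
print(minimize_left_sides([('A',), ('A', 'B'), ('B', 'C')]))
# -> [('A',), ('B', 'C')]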
Example #4
def dim_division(conf):
    """
    维度划分逻辑处理
    1、在数据库中读取数据函数依赖关系和外键关系,目前暂时只取FD_LEVEL = 1的函数依赖关系和单一外键关系
    2、将两个结果集转换为pandas的DataFrame
    3、对数据进行初步处理,主要是删除重复项
    4、去重之后的FD关系和FK关系进行开始进行维度划分
        4-1、处理FK关系,对FK关系进行扩展,可以进行FK关系互推的将其合并为一个节点
        4-2、删除fk和fd关系中,指向自身的关系,如
               LEFT          RIGHT           RL
         0     A/B           A/B             FK
         1     A/B           A/B             FK
        都会被删掉
    5、初步找出维度主节点,原则是:外键关系只出不进的认为是维度主节点
    6、整理fd和fk关系
    7、遍历初步判定的维度主节点,找出每个维度节点的属性列表和子集列表,属性列表指的是FD关系,子集列表指的是FK关系
    8、检查初步判定的维度节点之间的关系,检查初步判定的维度节点是否可作为维度主节点,原则是如果一个维度节点不做任何节点的属性,或者做两个以上节点的属性,则该节点认定为维度主节点
    9、对无法作为维度主节点的节点,找到其对应的维度主节点
    10、对同表中可以互推的维度主节点进行合并
    11、整理所有节点所属的维度
    12、保存维度划分结果
    :param conf:
    :return:
    """
    assert isinstance(conf, Config)

    output_conn = None
    output_helper = None
    if conf.output_db == "db2":
        import dao.output.db2_helper as output_helper
        output_conn = get_db2_connect(conf.output_db_url)
    else:
        logging.error("输出配置数据库未适配:{}".format(conf.output_db))
        exit(-1)

    logging.info('Deleting old dimension-division results')
    del_result_code = output_helper.del_old_dim_dive_result(
        output_conn, conf.output_schema)
    if del_result_code == 0:
        logging.info('Old dimension-division results deleted')
    elif del_result_code == -1:
        logging.error('Failed to delete old dimension-division results')
        ibm_db.close(output_conn)
        exit(-1)
    else:
        logging.error('Deleting old dimension-division results returned an '
                      'unknown status code {}'.format(del_result_code))
        ibm_db.close(output_conn)
        exit(-1)

    # 1. Read FD and FK relations from the database; for now only FD relations
    #    with FD_LEVEL = 1 and single-column FK relations are used
    logging.info('Reading data')
    # FD relations with FD_LEVEL = 1
    fd_dict_from_db = output_helper.get_function_dependency(
        output_conn, conf.output_schema)
    # Single-column FK relations
    fk_dict_from_db = output_helper.get_single_fk_relation(
        output_conn, conf.output_schema)

    # FIXME: FD and FK relations are taken from the ANA schema to validate the
    #        program; this can be removed later
    # fd_dict_from_db = output_helper.get_fd_for_dim_dive(output_conn, "ANA")
    # fk_dict_from_db = output_helper.get_fk_for_dim_dive(output_conn, "ANA")

    # 2. Convert the two result sets into pandas DataFrames
    fd_df = pd.DataFrame(fd_dict_from_db)
    fk_df = pd.DataFrame(fk_dict_from_db)

    # 3. Pre-process the data, mainly dropping duplicate rows
    fd_df = fd_df.drop_duplicates()
    fk_df = fk_df.drop_duplicates()
    # All column relations; pd.concat replaces the removed DataFrame.append,
    # and ignore_index=True avoids duplicate index labels across the frames
    all_relation_df = pd.concat([fd_df, fk_df], ignore_index=True)
    logging.info('Data loaded')

    # 4. Run dimension division on the deduplicated FD and FK relations
    # 4-1. Expand the FK relations; nodes that can derive each other via FK
    #      are merged into a single node
    logging.info('Merging mutually-derivable FK nodes')
    unexpand_fk_dict = {}
    # Walk the FK relations to build fk_dict: key is the LEFT side of an FK
    # relation, value is the list of its RIGHT sides
    for index, row in fk_df.iterrows():
        if row['LEFT'] in unexpand_fk_dict.keys():
            unexpand_fk_dict[row['LEFT']].append(row['RIGHT'])
        else:
            unexpand_fk_dict[row['LEFT']] = [row['RIGHT']]

    # Expand fk_dict so each LEFT node maps to every node it can reach through
    # FK relations; with a mutual pair (column A derives B by FK and B derives
    # A), the expansion yields A deriving A
    expand_fk_dict = expand_fk_relation(unexpand_fk_dict)

    # From the expanded FK relations, get the mutually-derivable columns
    dfeo_fd_dict = get_derive_from_each_other_fk(expand_fk_dict)

    # Rewrite the relations, merging mutually-derivable nodes into one node
    if os.path.exists('../tmp/after_merge_fk_df.csv') and os.path.exists(
            '../tmp/after_merge_all_rela_df.csv'):
        logging.info('Merged relations already exist; loading them directly')
        fk_df = pd.read_csv('../tmp/after_merge_fk_df.csv',
                            index_col='index_label')
        all_relation_df = pd.read_csv('../tmp/after_merge_all_rela_df.csv',
                                      index_col='index_label')
    else:
        for key in dfeo_fd_dict.keys():
            fk_df.loc[fk_df.LEFT == key, 'LEFT'] = dfeo_fd_dict[key]
            fk_df.loc[fk_df.RIGHT == key, 'RIGHT'] = dfeo_fd_dict[key]
            all_relation_df.loc[all_relation_df.LEFT == key,
                                'LEFT'] = dfeo_fd_dict[key]
            all_relation_df.loc[all_relation_df.RIGHT == key,
                                'RIGHT'] = dfeo_fd_dict[key]
        fk_df.to_csv('../tmp/after_merge_fk_df.csv', index_label='index_label')
        all_relation_df.to_csv('../tmp/after_merge_all_rela_df.csv',
                               index_label='index_label')
    logging.info('Mutually-derivable FK nodes merged')

    # 4-2. Drop self-referencing relations from the FK set and the full set
    logging.info('Dropping self-referencing FK relations')
    fk_drop_index = []
    for index, row in fk_df.iterrows():
        if row['LEFT'] == row['RIGHT']:
            fk_drop_index.append(index)
    fk_df = fk_df.drop(fk_drop_index, axis=0)
    all_rela_drop_index = []
    for index, row in all_relation_df.iterrows():
        if row['LEFT'] == row['RIGHT']:
            all_rela_drop_index.append(index)
    all_relation_df = all_relation_df.drop(all_rela_drop_index, axis=0)
    logging.info('Self-referencing FK relations dropped')

    # 5. Find candidate dimension master nodes; rule: FK relations only point
    #    outwards, with no incoming FK
    logging.info('Finding candidate dimension master nodes')
    if os.path.exists('../tmp/candidate_dim_node_list.pickle'):
        with open('../tmp/candidate_dim_node_list.pickle', 'rb') as p:
            candidate_dim_node_list = pickle.load(p)
    else:
        candidate_dim_node_list = find_candidate_dim_node(fk_df)
        with open('../tmp/candidate_dim_node_list.pickle', 'wb') as p:
            pickle.dump(candidate_dim_node_list, p)
    logging.info('Candidate dimension master nodes found')

    # 6. Arrange the FD and FK relations to prepare for finding each master
    #    node's attribute set and subset set
    # key: FD LEFT node; value: set of RIGHT nodes it derives by FD
    after_arrange_fd_dict = {}
    # key: FK LEFT node; value: set of RIGHT nodes it derives by FK
    after_arrange_fk_dict = {}
    for index, row in all_relation_df.iterrows():
        if row['RL'] == 'FD':
            if row['LEFT'] in after_arrange_fd_dict.keys():
                # Union: column A derives B by FD, A derives C by FD, ...
                after_arrange_fd_dict[row['LEFT']] = after_arrange_fd_dict[
                    row['LEFT']] | {row['RIGHT']}
            else:
                after_arrange_fd_dict[row['LEFT']] = {row['RIGHT']}
        elif row['RL'] == 'FK':
            if row['LEFT'] in after_arrange_fk_dict.keys():
                # Union: column A derives B by FK, A derives C by FK, ...
                after_arrange_fk_dict[row['LEFT']] = after_arrange_fk_dict[
                    row['LEFT']] | {row['RIGHT']}
            else:
                after_arrange_fk_dict[row['LEFT']] = {row['RIGHT']}
        else:
            logging.error("Unrecognised relation type {}; cannot run "
                          "dimension division".format(row['RL']))
            ibm_db.close(output_conn)
            exit(-1)
    logging.info('FD and FK relations arranged')

    # All attribute relations of every candidate master node,
    # [(dim_node, attr1), (dim_node, attr2), ...]
    all_attr_rela_list = []
    # All subset relations of every candidate master node,
    # [(dim_node, subset1), (dim_node, subset2), ...]
    all_subset_rela_list = []
    # Attribute and subset relations of every candidate master node,
    # {'dim1': [[attr, ...], [subset, ...]], 'dim2': [[], []], ...}
    all_dim_node_rela = {}

    if os.path.exists('../tmp/all_attr_rela.pickle') and os.path.exists('../tmp/all_subset_rela.pickle') \
            and os.path.exists('../tmp/all_dim_node_rela.pickle'):
        with open('../tmp/all_attr_rela.pickle', 'rb') as p:
            all_attr_rela_list = pickle.load(p)
        with open('../tmp/all_subset_rela.pickle', 'rb') as p:
            all_subset_rela_list = pickle.load(p)
        with open('../tmp/all_dim_node_rela.pickle', 'rb') as p:
            all_dim_node_rela = pickle.load(p)

    # 7. For each candidate dimension master node, collect its attribute list
    #    (FD relations) and its subset list (FK relations)
    logging.info("Collecting each dimension node's attribute and subset lists")
    for i in range(len(candidate_dim_node_list)):
        candidate_dim_node = candidate_dim_node_list[i]
        # Skip master nodes that have already been analysed
        if candidate_dim_node in all_dim_node_rela.keys():
            continue
        attr_list, subset_list = find_attr_and_subset(candidate_dim_node,
                                                      after_arrange_fd_dict,
                                                      after_arrange_fk_dict)
        # Prepare the results for writing to pickle files
        all_dim_node_rela[candidate_dim_node] = [attr_list, subset_list]
        for attr in attr_list:
            all_attr_rela_list.append((candidate_dim_node, attr))
        for subset in subset_list:
            all_subset_rela_list.append((candidate_dim_node, subset))
    all_attr_rela_list = list(set(all_attr_rela_list))
    all_subset_rela_list = list(set(all_subset_rela_list))
    with open('../tmp/all_attr_rela.pickle', 'wb') as p:
        pickle.dump(all_attr_rela_list, p)
    with open('../tmp/all_subset_rela.pickle', 'wb') as p:
        pickle.dump(all_subset_rela_list, p)
    with open('../tmp/all_dim_node_rela.pickle', 'wb') as p:
        pickle.dump(all_dim_node_rela, p)
    logging.info('Attribute and subset lists found for every dimension node')

    # 8. Check relations between candidate dimension nodes to decide which can
    #    serve as master nodes; rule: a node that is an attribute of no other
    #    node, or of two or more nodes, is a dimension master node
    dim_main_node_check_res, group_dict, group_num_dict = dim_main_node_check(
        candidate_dim_node_list, all_attr_rela_list, all_dim_node_rela)

    # 9. For nodes that cannot serve as master nodes, find their master node;
    #    in the result, key is a candidate node name, value True means the
    #    node is itself a master node, any other value is its master node
    candidate_node_find_main_node_res = candidate_node_find_main_node(
        dim_main_node_check_res)

    # 10. Merge dimension master nodes in the same table that derive each other
    logging.info('Merging identical dimensions')
    ori_fd_dict = {}
    for index, row in fd_df.iterrows():
        ori_fd_dict.setdefault(row['LEFT'], []).append(row['RIGHT'])
    # Merge dimension nodes in the same table that can derive each other
    same_tab_dim_node_merge_res = same_tab_dim_node_merge(
        candidate_node_find_main_node_res, group_dict, group_num_dict)
    logging.info('Identical dimensions merged')

    # 11. Arrange the dimension each node belongs to
    logging.info('Arranging the dimension each node belongs to')
    res_dict = {
        'node': [],
        'dim': [],
        'orig_dim': [],
        'type': [],
        'del_flag': []
    }
    # Walk all attribute nodes, [(dim_node, attr), (dim_node, attr), ...]
    for attr in all_attr_rela_list:
        res_dict['node'].append(attr[1])
        # The node is a dimension master node
        if same_tab_dim_node_merge_res[attr[0]] is True:
            res_dict['dim'].append(attr[0])
            res_dict['orig_dim'].append(attr[0])
        # Not a master node but merged; this can happen when master nodes in
        # the same table were merged, so same_tab_dim_node_merge_res[attr[0]]
        # is not True
        else:
            if same_tab_dim_node_merge_res[attr[0]][0] == '#':
                res_dict['dim'].append(
                    same_tab_dim_node_merge_res[attr[0]][1:])
                res_dict['orig_dim'].append(attr[0])
            else:
                res_dict['dim'].append(same_tab_dim_node_merge_res[attr[0]])
                res_dict['orig_dim'].append(
                    same_tab_dim_node_merge_res[attr[0]])
        res_dict['type'].append('attr')
        res_dict['del_flag'].append('1')
    # Walk all subset nodes, [(dim_node, subset), (dim_node, subset), ...]
    for subset in all_subset_rela_list:
        # If the subset node is not an attribute of any node, only the subset
        # relation is considered
        if subset[1] not in res_dict['node']:
            res_dict['node'].append(subset[1])
            if same_tab_dim_node_merge_res[subset[0]] is True:
                res_dict['dim'].append(subset[0])
                res_dict['orig_dim'].append(subset[0])
            else:
                if same_tab_dim_node_merge_res[subset[0]][0] == '#':
                    res_dict['dim'].append(
                        same_tab_dim_node_merge_res[subset[0]][1:])
                    res_dict['orig_dim'].append(subset[0])
                else:
                    res_dict['dim'].append(
                        same_tab_dim_node_merge_res[subset[0]])
                    res_dict['orig_dim'].append(
                        same_tab_dim_node_merge_res[subset[0]])
            res_dict['type'].append('subset')
            res_dict['del_flag'].append('1')
        # If the subset node is an attribute of some node, drop the subset
        # relation and keep the attribute relation
        elif subset[1] in res_dict['node']:
            inds = [
                ind for ind in range(len(res_dict['node']))
                if res_dict['node'][ind] == subset[1]
                and res_dict['dim'][ind] == subset[0]
            ]
            del_flag = False
            for ind in inds:
                if res_dict['type'][ind] == 'attr':
                    res_dict['del_flag'][ind] = '0'
                    del_flag = True

            if not del_flag:
                res_dict['node'].append(subset[1])
                if same_tab_dim_node_merge_res[subset[0]] is True:
                    res_dict['dim'].append(subset[0])
                    res_dict['orig_dim'].append(subset[0])
                else:
                    if same_tab_dim_node_merge_res[subset[0]][0] == '#':
                        res_dict['dim'].append(
                            same_tab_dim_node_merge_res[subset[0]][1:])
                        res_dict['orig_dim'].append(subset[0])
                    else:
                        res_dict['dim'].append(
                            same_tab_dim_node_merge_res[subset[0]])
                        res_dict['orig_dim'].append(
                            same_tab_dim_node_merge_res[subset[0]])
                res_dict['type'].append('subset')
                res_dict['del_flag'].append('1')

    after_arrange_result_dict = node_arrange(res_dict)
    logging.info('Node-to-dimension assignment completed')

    # 12. Save the dimension-division result
    logging.info('Saving the dimension-division result')
    dim_division_result_df = pd.DataFrame(after_arrange_result_dict)
    dim_division_result_df = dim_division_result_df.drop_duplicates()

    result_code = output_helper.save_dim_division_result(
        output_conn, conf.output_schema, dim_division_result_df)

    # If the result was saved correctly, delete the temporary files
    if result_code == 0:
        del_temp_file()
        logging.info('Dimension-division result saved')
    elif result_code == -1:
        logging.error("Failed to save the dimension-division result to the database")
    else:
        logging.error("Saving the dimension-division result returned an "
                      "unsupported status code")

    # Close the database connection
    ibm_db.close(output_conn)
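
The helper functions this example relies on (expand_fk_relation,
get_derive_from_each_other_fk, find_candidate_dim_node) are defined elsewhere
in the project. The sketches below are assumptions reconstructed from the
comments above: FK reachability is expanded into a transitive closure,
mutually-derivable columns are collapsed into one merged node (named "A/B" as
in the docstring's example), and candidate dimension master nodes are those
with outgoing but no incoming FK edges.

def expand_fk_relation(fk_dict):
    """For each LEFT node, collect every node reachable through FK edges;
    a mutual pair A->B, B->A makes A able to 'reach' itself."""
    expanded = {}
    for start, rights in fk_dict.items():
        seen, stack = set(), list(rights)
        while stack:
            node = stack.pop()
            if node not in seen:
                seen.add(node)
                stack.extend(fk_dict.get(node, []))
        expanded[start] = seen
    return expanded

def get_derive_from_each_other_fk(expand_fk_dict):
    """Map every member of a mutually-derivable group to one merged node
    name; the 'A/B' join is an assumption based on the docstring."""
    merged = {}
    for node, reach in expand_fk_dict.items():
        group = {node} | {other for other in reach
                          if node in expand_fk_dict.get(other, set())}
        if len(group) > 1:
            merged[node] = '/'.join(sorted(group))
    return merged

def find_candidate_dim_node(fk_df):
    """Step 5's rule: nodes with outgoing FK edges and no incoming ones."""
    return sorted(set(fk_df['LEFT']) - set(fk_df['RIGHT']))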