示例#1
0
def file_tree_test(postfix='.html'):
    """
        Smoke-test the document-tree pipeline on the configured HTML files.

        Each selected file is read with ``tian_chi.read_html``, split into
        its pre-content and body, formatted with
        ``content_format.tags_format`` and turned into a
        ``content_format.FileTree``.
    :param postfix: only file names ending with this suffix are processed.
    :return: the ``FileTree`` built for the last processed document, or
        ``None`` when no document was read.
    """
    # Data locations used in other environments (swap in when needed):
    #   server 220:
    #     path = '/data/hadoop/yisun/data/tianchi2/train/chongzu/html/'
    #     filename = None
    #   Tianchi server:
    #     path = '/home/118_16/data/chongzu_train_html/'
    #     filename = None
    #   local external data (thinkpad):
    #     path = 'E:\\天池大赛\\公告数据\\天池大赛\\announcement_extract\\复赛数据\\复赛新增类型训练数据-20180712\\资产重组\\html\\'
    #     filename = ['20546245.html']
    #   local sample data:
    #     path = '../data/temp2/'
    #     filename = ['19223567.html']
    # The label-file and output-path settings that accompanied these
    # configurations were never read by this function and are omitted.

    # --- active configuration: local external data ---
    path = 'D:\\TianChi_competition\\公告信息抽取\\materials\\复赛\\复赛新增类型训练数据-20180712\\资产重组\\html\\'
    filename = ['9895659.html']

    # ``filename = None`` means "process every file in ``path``".
    if filename is None:
        files_name = os.listdir(path)
    else:
        files_name = filename
    file_list = [name for name in files_name if name.endswith(postfix)]
    html_dict = tian_chi.read_html(filepath=path, filename=file_list)

    # Titles of every processed document; collected for inspection while
    # debugging, not returned.
    titles_by_doc = {}
    # Initialised so that an empty ``html_dict`` returns None instead of
    # raising UnboundLocalError at the ``return``.
    t = None
    for index in html_dict:
        # -------------------------- core process --------------------------
        # --- extract the pre-content ---
        res = tian_chi.extract_pre_content(html_dict[index])
        tag = res[3]
        _major_promption = res[2]
        # --- collect the text + table tags ---
        text_list = tian_chi.get_content(tag)
        # --- convert tags to a text list (first element of the result) ---
        _content = content_format.tags_format(tags_list=text_list)[0]
        major_promption = content_format.tags_format(_major_promption)[0]
        # --- build the document tree ---
        t = content_format.FileTree(content_list=_content,
                                    zhongdashixiangtishi=major_promption)
        t.get_tree_list()
        t.get_file_tree()
        titles_by_doc[index] = t.titles
    return t
示例#2
0
def t2(drop_table=False):
    """
        Build the tree list for the sample document ``23599.html`` located
        under ``../data/temp2/``.
    :param drop_table: when true, ``decompose()`` is called on every tag
        returned by ``find_all('table')`` before extraction (presumably
        BeautifulSoup tags -- confirm against ``tc.read_html``).
    :return: dict keyed like the mapping returned by ``tc.read_html``;
        each value is the result of ``get_tree_list()`` for that document.
    """
    path = '../data/temp2/'
    filename = '23599.html'
    outpath = '../data/temp2/'  # not read below; kept from the original setup
    html_dict = tc.read_html(filepath=path, filename=filename)

    data = {}
    for position, doc_id in enumerate(html_dict):
        print('processing {0} -- total: {1} this: {2}'.format(
            doc_id, len(html_dict), position))

        document = html_dict[doc_id]
        if drop_table:
            # Optionally strip the table tags from the document.
            for table in document.find_all('table'):
                table.decompose()

        pre_content = tc.extract_pre_content(document)
        mulu, shiyi_dict, major_promption, tag = pre_content
        formatted = cf.tags_format(tc.get_content(tag))
        body, part_table, table_failed = formatted

        sections = {
            'mulu': mulu,
            'shiyi': shiyi_dict,
            'major_promption': major_promption,
            'content': body
        }
        tree = cf.file_tree(sections['mulu'], sections['shiyi'],
                            sections['major_promption'], sections['content'])
        data[doc_id] = tree.get_tree_list()
    return data
def html2file_tree(html, drop_table=False):
    """
        Convert one parsed HTML document into its tree list.

        The document is split with ``tian_chi.extract_pre_content``, its
        body is formatted with ``content_format.tags_format`` and the
        pieces are handed to ``content_format.FileTree``.
    :param html: parsed document accepted by
        ``tian_chi.extract_pre_content``; it must provide ``find_all``
        when ``drop_table`` is true (presumably a BeautifulSoup object --
        confirm against the caller).
    :param drop_table: when true, ``decompose()`` is called on every tag
        returned by ``html.find_all('table')`` before extraction. This
        modifies ``html`` in place.
    :return: the value returned by ``FileTree.get_tree_list()``.
    """
    # The flag used to be accepted but ignored; honour it the same way the
    # sibling helpers in this file do.
    if drop_table:
        for table in html.find_all('table'):
            table.decompose()

    mulu, shiyi_dict, major_promption, tag = tian_chi.extract_pre_content(html)
    _content = tian_chi.get_content(tag)
    # Only the formatted content is needed; the two table-related results
    # are discarded.
    _content, part_table, table_failed = content_format.tags_format(_content)

    res = content_format.FileTree(mulu, shiyi_dict, major_promption, _content)
    return res.get_tree_list()
示例#4
0
def main(postfix='.html', batches=20):
    """
        Main entry point of the information-extraction run.

        Reads the configured HTML files with ``tian_chi.read_html`` and
        applies ``tian_chi.extract_pre_content`` to each document.
    :param postfix: only file names ending with this suffix are processed.
    :param batches: not used by this function; kept so existing callers
        keep working.
    :return: dict keyed like the mapping returned by
        ``tian_chi.read_html``; each value is the result of
        ``tian_chi.extract_pre_content`` for that document. Empty when no
        file name matches ``postfix``.
    """
    # Data locations used in other environments (swap in when needed):
    #   server 220:
    #     path = '/data/hadoop/yisun/data/tianchi2/train/chongzu/html/'
    #     filename = None
    #   Tianchi server:
    #     path = '/home/118_16/data/chongzu_train_html/'
    #     filename = None
    #   local sample data:
    #     path = '../data/temp2/'
    #     filename = ['19223567.html']
    # The label-file and output-path settings that accompanied these
    # configurations were never read by this function and are omitted.

    # --- active configuration: local external data ---
    path = 'D:\\TianChi_competition\\公告信息抽取\\materials\\复赛\\复赛新增类型训练数据-20180712\\资产重组\\html\\'
    filename = ['15990796.html']

    # ``filename = None`` means "process every file in ``path``".
    if filename is None:
        files_name = os.listdir(path)
    else:
        files_name = filename
    file_list = [name for name in files_name if name.endswith(postfix)]

    # Read the selected files with a single call. The previous loop issued
    # one identical ``read_html`` call per file and passed the unfiltered
    # ``filename`` instead of ``file_list``. With nothing selected the
    # reader is not called at all, as before.
    if file_list:
        tag_dict = tian_chi.read_html(filepath=path, filename=file_list)
    else:
        tag_dict = {}

    return {
        index: tian_chi.extract_pre_content(tag_dict[index])
        for index in tag_dict
    }
示例#5
0
def get_pre_content(path,
                    filename,
                    drop_table=False,
                    keys=('mulu', 'shiyi', 'major_promption', 'content'),
                    text_trans=False,
                    df_json=True):
    """
        Collect the pre-content of the given HTML file(s).
    :param path: forwarded to ``tian_chi.read_html`` as ``filepath``.
    :param filename: forwarded to ``tian_chi.read_html`` as ``filename``.
    :param drop_table: when true, ``decompose()`` is called on every tag
        returned by ``find_all('table')`` before extraction.
    :param keys: which of ``'mulu'``, ``'shiyi'``, ``'major_promption'``
        and ``'content'`` to keep for each document; any other key raises
        ``KeyError``.
    :param text_trans: when true, the extracted content is passed through
        ``content_format.tags_format``; otherwise the result of
        ``tian_chi.get_content`` is kept as is.
    :param df_json: forwarded to ``content_format.tags_format``; only
        relevant when ``text_trans`` is true.
    :return: dict keyed like the mapping returned by
        ``tian_chi.read_html``; each value is a dict holding the
        requested ``keys``.
    """
    html_dict = tian_chi.read_html(filepath=path, filename=filename)
    contents = {}
    for n, index in enumerate(html_dict):
        print('processing {0} -- total: {1} this: {2}'.format(
            index, len(html_dict), n))
        h = html_dict[index]
        # Optionally strip the table tags from the document.
        if drop_table:
            for table in h.find_all('table'):
                table.decompose()
        mulu, shiyi_dict, major_promption, tag = tian_chi.extract_pre_content(
            h)
        _content = tian_chi.get_content(tag)
        # Optionally convert the tags into a text list.
        if text_trans:
            # Forward the caller's ``df_json``; it used to be hard-coded
            # to True, which silently ignored the parameter.
            _content, part_table, table_failed = content_format.tags_format(
                _content, df_json=df_json)
        content = {
            'mulu': mulu,
            'shiyi': shiyi_dict,
            'major_promption': major_promption,
            'content': _content
        }
        contents[index] = {key: content[key] for key in keys}
    return contents
示例#6
0
def get_file_tree(path,
                  filename,
                  drop_table=False,
                  method='list',
                  df_json=False):
    """
        Build a ``content_format.FileTree`` for every document read from
        the given HTML file(s).
    :param path: forwarded to ``tian_chi.read_html`` as ``filepath``.
    :param filename: forwarded to ``tian_chi.read_html`` as ``filename``.
    :param drop_table: when true, ``decompose()`` is called on every tag
        returned by ``find_all('table')`` before extraction.
    :param method: ``'tree'`` calls ``get_file_tree()``, ``'list'`` calls
        ``get_tree_list()``, ``'both'`` calls both in that order; any
        other value calls neither.
    :param df_json: forwarded to ``content_format.tags_format``.
    :return: dict keyed like the mapping returned by
        ``tian_chi.read_html``; each value is the ``FileTree`` object.
    """
    html_dict = tian_chi.read_html(filepath=path, filename=filename)

    trees = {}
    for position, doc_id in enumerate(html_dict):
        print('processing {0} -- total: {1} this: {2}'.format(
            doc_id, len(html_dict), position))

        document = html_dict[doc_id]
        if drop_table:
            # Optionally strip the table tags from the document.
            for table in document.find_all('table'):
                table.decompose()

        pre_content = tian_chi.extract_pre_content(document)
        mulu, shiyi_dict, major_promption, tag = pre_content
        formatted = content_format.tags_format(
            tian_chi.get_content(tag), df_json=df_json)
        body, part_table, table_failed = formatted

        sections = {
            'mulu': mulu,
            'shiyi': shiyi_dict,
            'major_promption': major_promption,
            'content': body
        }
        tree = content_format.FileTree(sections['mulu'], sections['shiyi'],
                                       sections['major_promption'],
                                       sections['content'])

        # Run the requested build step(s); for 'both' the tree is built
        # before the list.
        if method in ('tree', 'both'):
            tree.get_file_tree()
        if method in ('list', 'both'):
            tree.get_tree_list()
        trees[doc_id] = tree
    return trees