def file_tree_test(postfix='.html'):
    """Build document trees for the hard-coded sample announcement file(s).

    Ad-hoc driver: reads the HTML files selected by the local ``path`` /
    ``filename`` settings, extracts the pre-content and body of each one,
    and builds a ``content_format.FileTree`` from them (both
    ``get_tree_list()`` and ``get_file_tree()`` are run on every tree).

    :param postfix: only file names ending with this suffix are processed.
    :return: the ``FileTree`` built for the last processed file, or ``None``
        when no file was processed.
    """
    # Input location for the machine this is run on.  Other machines used
    # different locations, e.g. a server checkout under
    # '/data/hadoop/yisun/data/tianchi2/train/chongzu/html/' or
    # '/home/118_16/data/chongzu_train_html/'; edit these two values to
    # switch environments.
    path = 'D:\\TianChi_competition\\公告信息抽取\\materials\\复赛\\复赛新增类型训练数据-20180712\\资产重组\\html\\'
    # Set to None to process every file found in ``path``.
    filename = ['9895659.html']

    files_name = os.listdir(path) if filename is None else filename
    file_list = [name for name in files_name if name.endswith(postfix)]
    html_dict = tian_chi.read_html(filepath=path, filename=file_list)

    # Stays None when html_dict is empty, so the final return cannot hit an
    # unbound local.
    tree = None
    for index in html_dict:
        # --- pre-content: item 2 is the major-matters notice, item 3 the
        # --- tag that the body content is extracted from
        pre_content = tian_chi.extract_pre_content(html_dict[index])
        tag = pre_content[3]
        raw_major_promption = pre_content[2]

        # --- body text + tables
        text_list = tian_chi.get_content(tag)

        # --- tags -> text lists (only the first returned item is used)
        body = content_format.tags_format(tags_list=text_list)[0]
        major_promption = content_format.tags_format(raw_major_promption)[0]

        # --- convert to a document tree
        tree = content_format.FileTree(content_list=body,
                                       zhongdashixiangtishi=major_promption)
        tree.get_tree_list()
        tree.get_file_tree()
        # Example query kept for reference:
        # result = tree.get_tree_content(strcture=['第二章', '实际控制人'], method='content')
    return tree
def t2(drop_table=False):
    """Parse the sample file ``23599.html`` and return its tree lists.

    Reads ``../data/temp2/23599.html`` through ``tc.read_html``, runs the
    extraction pipeline on every returned document and collects the result
    of ``get_tree_list()`` for each one.

    :param drop_table: when true, every element returned by
        ``find_all('table')`` is removed with ``decompose()`` before the
        document is processed.
    :return: dict mapping each key of the ``tc.read_html`` result to the
        value returned by ``get_tree_list()`` for that document.
    """
    path = '../data/temp2/'
    filename = '23599.html'
    outpath = '../data/temp2/'  # NOTE(review): never read in this function
    html_dict = tc.read_html(filepath=path, filename=filename)

    contents = {}  # NOTE(review): filled below but never read or returned
    data = {}
    for position, index in enumerate(html_dict):
        progress = 'processing {0} -- total: {1} this: {2}'.format(
            index, len(html_dict), position)
        print(progress)

        document = html_dict[index]
        # Optionally strip the table tags before extraction.
        if drop_table:
            for table in document.find_all('table'):
                table.decompose()

        mulu, shiyi_dict, major_promption, tag = tc.extract_pre_content(
            document)
        raw_tags = tc.get_content(tag)
        # tags_format returns three values; only the first is used here.
        formatted, part_table, table_failed = cf.tags_format(raw_tags)

        contents[index] = {
            'mulu': mulu,
            'shiyi': shiyi_dict,
            'major_promption': major_promption,
            'content': formatted
        }
        tree = cf.file_tree(mulu, shiyi_dict, major_promption, formatted)
        data[index] = tree.get_tree_list()
    return data
def html2file_tree(html, drop_table=False):
    """Convert one parsed HTML document into its tree list.

    :param html: parsed document accepted by
        ``tian_chi.extract_pre_content``; it must also offer
        ``find_all('table')`` when ``drop_table`` is true (presumably a
        BeautifulSoup document -- confirm against the caller).
    :param drop_table: when true, every ``<table>`` element is removed with
        ``decompose()`` before extraction, in the same way as
        ``get_pre_content`` and ``get_file_tree`` do.  This modifies
        ``html`` in place.
    :return: the value returned by ``FileTree.get_tree_list()``.
    """
    # Optionally strip the table tags before extraction.
    if drop_table:
        for table in html.find_all('table'):
            table.decompose()

    mulu, shiyi_dict, major_promption, tag = tian_chi.extract_pre_content(html)
    raw_tags = tian_chi.get_content(tag)
    # tags_format returns three values; only the formatted content is needed.
    formatted, _part_table, _table_failed = content_format.tags_format(raw_tags)

    tree = content_format.FileTree(mulu, shiyi_dict, major_promption,
                                   formatted)
    return tree.get_tree_list()
def get_pre_content(path, filename, drop_table=False,
                    keys=('mulu', 'shiyi', 'major_promption', 'content'),
                    text_trans=False, df_json=True):
    """Extract the preliminary information of each HTML file.

    :param path: passed to ``tian_chi.read_html`` as ``filepath``.
    :param filename: passed to ``tian_chi.read_html`` as ``filename``.
    :param drop_table: when true, every ``<table>`` element is removed with
        ``decompose()`` before extraction.
    :param keys: which entries to keep for every file; each must be one of
        ``'mulu'``, ``'shiyi'``, ``'major_promption'``, ``'content'``.
    :param text_trans: when true, the extracted content is converted with
        ``content_format.tags_format``; otherwise it is returned exactly as
        ``tian_chi.get_content`` produced it.
    :param df_json: forwarded to ``content_format.tags_format`` as
        ``df_json``; only used when ``text_trans`` is true.
    :return: dict mapping each key of the ``read_html`` result to a dict
        holding the requested ``keys``.
    :raises KeyError: if ``keys`` contains a name other than the four above.
    """
    html_dict = tian_chi.read_html(filepath=path, filename=filename)
    contents = {}
    for n, index in enumerate(html_dict):
        print('processing {0} -- total: {1} this: {2}'.format(
            index, len(html_dict), n))

        # Optionally strip the table tags before extraction.
        h = html_dict[index]
        if drop_table:
            for table in h.find_all('table'):
                table.decompose()

        mulu, shiyi_dict, major_promption, tag = tian_chi.extract_pre_content(
            h)
        _content = tian_chi.get_content(tag)

        # Optionally convert tags ---> text list.
        if text_trans:
            _content, _part_table, _table_failed = content_format.tags_format(
                _content, df_json=df_json)

        content = {
            'mulu': mulu,
            'shiyi': shiyi_dict,
            'major_promption': major_promption,
            'content': _content
        }
        contents[index] = {key: content[key] for key in keys}
    return contents
def get_file_tree(path, filename, drop_table=False, method='list',
                  df_json=False):
    """Build a ``content_format.FileTree`` for each HTML file.

    :param path: passed to ``tian_chi.read_html`` as ``filepath``.
    :param filename: passed to ``tian_chi.read_html`` as ``filename``.
    :param drop_table: when true, every ``<table>`` element is removed with
        ``decompose()`` before extraction.
    :param method: ``'tree'`` runs ``get_file_tree()``, ``'list'`` runs
        ``get_tree_list()``, ``'both'`` runs the two in that order; with any
        other value neither is run.
    :param df_json: forwarded to ``content_format.tags_format``.
    :return: dict mapping each key of the ``read_html`` result to its
        ``FileTree`` object.
    """
    # --- pre_main ---
    html_dict = tian_chi.read_html(filepath=path, filename=filename)

    contents = {}  # NOTE(review): filled below but never read or returned
    data = {}
    for position, index in enumerate(html_dict):
        progress = 'processing {0} -- total: {1} this: {2}'.format(
            index, len(html_dict), position)
        print(progress)

        document = html_dict[index]
        # Optionally strip the table tags before extraction.
        if drop_table:
            for table in document.find_all('table'):
                table.decompose()

        mulu, shiyi_dict, major_promption, tag = tian_chi.extract_pre_content(
            document)
        raw_tags = tian_chi.get_content(tag)
        # tags_format returns three values; only the first is used here.
        formatted, part_table, table_failed = content_format.tags_format(
            raw_tags, df_json=df_json)

        contents[index] = {
            'mulu': mulu,
            'shiyi': shiyi_dict,
            'major_promption': major_promption,
            'content': formatted
        }
        tree = content_format.FileTree(mulu, shiyi_dict, major_promption,
                                       formatted)

        # 'both' triggers the two builders, file tree first.
        if method in ('tree', 'both'):
            tree.get_file_tree()
        if method in ('list', 'both'):
            tree.get_tree_list()
        data[index] = tree
    return data