def catch_trick88():
    """Spot check: for a sample of announcements, confirm that every labelled field value
    from chongzu.train can be found again in the entity string produced by convert2txt."""
    from htmlconvert2text import convert2txt

    def read_train_res():
        with open('/home/mm/Documents/aliyun-FDDC-2018-Financial-Challenge-/chongzu.train') as rf:
            train_res = rf.read()
        return train_res

    train_re = read_train_res()
    for i in os.listdir('/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/')[0:2000:20]:
        # sss = convert2txt('/home/html/' + i)
        sss, ent_str = convert2txt("/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/" + i)
        # Grab the labelled row whose first column is the announcement ID and drop the trailing newline.
        row_train_re = re.search(r'{}[^\n。]+\n'.format(i.split(".")[0]), train_re).group()[:-1]
        print("###########################################################{}".format(i))
        for index, res_enti in enumerate(row_train_re.split('\t')):
            if len(res_enti) > 1:
                print("@@@this is the {}th key_value {}".format(index, res_enti))
                res_find = re.findall(r'{}'.format(res_enti), ent_str)
                if len(res_find) > 0:
                    print(res_find)
                else:
                    print("@@@")
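# --- illustration only ---
# catch_trick88() calls .group() on the re.search() result directly, so any sampled file whose
# ID is missing from chongzu.train raises AttributeError. A minimal defensive variant of that
# lookup, assuming the same tab-separated row layout, might look like the hypothetical helper
# below (lookup_train_row is not part of the original code):
def lookup_train_row(train_re, announcement_id):
    """Return the labelled row starting with announcement_id, without its newline, or None."""
    match = re.search(r'{}[^\n。]+\n'.format(announcement_id), train_re)
    return match.group()[:-1] if match else None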
def findall_reg():
    """For a sample of announcements, print every sentence that starts with one of the trigger
    strings in trick_precedences and contains none of the words in list_keywords.
    trick_precedences and list_keywords are assumed to be module-level lists."""
    for i in os.listdir('/home/html/')[100:2770:50]:
        sss = convert2txt('/home/html/' + i)
        for l in trick_precedences:
            reg_out = re.findall(r'{}[^。|]*[。;|]'.format(l), sss, flags=re.X)
            reg_out_final = []
            for j in reg_out:
                # Keep the match only if none of the excluded keywords occur in it.
                list_false_true = [True if k in j else False for k in list_keywords]
                if True not in list_false_true:
                    reg_out_final.append(j)
            print(i)
            for match in reg_out_final:
                print(match)
            print('\n\n')
def catch_trick888():
    """Dump the entity string extracted from a sample of announcements into checkregexentity.txt
    so the entity regexes can be checked by eye; print the file name when extraction comes back
    (almost) empty."""
    from htmlconvert2text import convert2txt
    for i in os.listdir('/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/')[0:2688:18]:
        # sss = convert2txt('/home/html/' + i)
        sss, ent_str = convert2txt("/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/" + i)
        if len(ent_str) > 10:
            with open('checkregexentity.txt', 'a') as af:
                af.write(ent_str + "\n")
            print("OOOOOOOOOOO")
        else:
            print(i)
def tokenize_enti(self, path11):
    texx, entity_string = convert2txt(path11)
    sentences = re.split(r'。', texx)
    # sentences.sort(key=len, reverse=True)
    entities = list(set(re.split(r'[\s~、,;/]', entity_string)))
    entities.sort(key=len)
    entities_arrows_list = list(set([x if '~' in x else '' for x in re.split(r'\s', entity_string)]))
    entities_arrows_list.sort(key=len, reverse=True)
    entities_arrows_list = entities_arrows_list[:-1]  # drop the empty-string placeholder left by the comprehension
    # Find the labelled result rows for this announcement and strip the trailing newline.
    patt_index = re.findall(r'\d{4,10}', path11)[0]
    res_rows = re.findall(r'(?<=\n){}[^\n]+(?=\n)'.format(patt_index), self.train_res)
    # The block below tidies up train_res:
    # walk the results, and whenever a short-name/full-name pair exists, add the matching counterpart.
    """The main goal is to repair the train_res file: it mixes short names and full names
    inconsistently. To make both forms appear, use regexes to extract the matching short or
    full name; values joined by 、 are also split out and used as labelling targets. Shorter
    strings are matched first, and after segmentation the longer ones are re-joined first."""
    res_paired = {}  # temporary mapping that stores the amended train_res values
    for x in range(len(res_rows)):
        res_row = res_rows[x]
        for y in range(6):
            res_paired[str(x) + str(y)] = [re.split(r'\t', res_row)[y]]
    for arrow_str in entities_arrows_list:
        for index, result_row in enumerate(res_rows):
            for indi, res_value in enumerate(re.split(r'\t', result_row)):
                if indi in [0, 1, 4, 5]:
                    continue
                res_value_list = res_value.split('、')
                for res_value_split in res_value_list:
                    if res_value_split in entities and res_value_split in arrow_str:
                        # Find the paired short or full name and add it; equity / valuation-method /
                        # amount values are added directly and the loop continues.
                        niki, fullna = re.split(r'~', arrow_str)
                        fullna_first = fullna.split(',')[0]
                        niki_split_list = re.split(r'[/、]', niki)
                        # The matching full name must satisfy three conditions: short enough, taken
                        # before the first comma, and containing the characters of the short name.
                        if res_value_split in niki_split_list \
                                and len(fullna_first) < 18 \
                                and re.search(re.sub(r'(?<=[^屄\s])', r'\\s?', res_value_split), fullna_first):
                            res_paired[str(index) + str(indi)].append(fullna_first)
                        """When deriving the short name from the full name, candidates containing
                        generic words such as 公司/本公司/上市公司/发起人/申请人 must be dropped."""
                        if res_value_split == fullna_first:
                            # The matching short name must be contained in the full name, be short
                            # enough, and not equal one of the generic words above.
                            for niki_split in niki_split_list:
                                if re.search(re.sub(r'(?<=[^屄\s])', r'\\s?', fullna_first), niki_split) \
                                        and not re.search(r'(^公司$|^本公司$|环境$|^上市公司$|人$|资产|标的|交易|对方|发行|对象|股东|对手|单位)',
                                                          re.sub(r'\s', '', niki_split)):
                                    res_paired[str(index) + str(indi)].append(niki_split)
    # Walk every sentence of the announcement and feed each one through tagging.
    words_n_words = ''
    for i in sentences:
        words = self.segmentor.segment(i)
        words = ' '.join(words)
        words = words + ' ' + '。' + ' '  # re-append the full stop and the space after it
        # Segmentation should use a better strategy: keep spans longer and avoid very short
        # sentences that just waste passes through the loop.
        # The commented block below merged every target entity back together, splitting off
        # pieces like 55%股权 first:
        # for ent in entities:
        #     # Remove the spaces inside every entity occurring in words, using two nested subs.
        #     # Regexes really do need more comments.
        #     """re.sub(r'(?<=\w)(?=\w)', r'\\s?', ent) inserts "\s?" between every pair of characters
        #     in the entity, so the pattern matches the sequence even when segmentation has inserted
        #     spaces; the matching span in words is then replaced by the space-free entity."""
        #     if len(ent) > 1:
        #         if not re.search(r'([\d.]+%的?(?:股权|股份|权益))', ent):
        #             # no equity keyword: build the space-tolerant pattern from the whole entity
        #             patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\\s?', ent)
        #         elif len(ent) > 7:
        #             # equity keyword present and the string is long: strip the equity part and
        #             # build the pattern from the leading entity alone
        #             patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\\s?', re.sub(r'的?[\d.]+%的?(股权|股份|权益)', '', ent))
        #         else:
        #             patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\\s?', ent)
        #         # Re-join any entity that segmentation split apart and put it on its own line; used at test time.
        #         words = re.sub(r'{}'.format(patt_ent), '\s' + ent + '\s', words)
        # # Then replace every space with a newline so words stands vertically.
        # words = re.sub(r'\s', '\n', words)
        # words = re.sub(r'\n+', '\n', words)
        """Suffix every token in words that is a result key value with a tab and the result index
        number, and everything else with a tab and the letter o. The intent is good -- make the
        targets easier to find so the model does not have to detect starts and ends -- but the
        regex is too hard: I cannot pull out every suitable entity, which loses annotations, so
        the task goes back to the model."""
        # for x in range(len(res_rows)):
        #     for y in range(6):
        #         index = str(x)+str(y)
        #         tags_list = res_paired[index]
        for index, tags_list in res_paired.items():
            # Each inner list may hold one or more members; iterating over it tags the 、-separated
            # values as well, without disturbing entity strings that have already been merged.
            for sub_res in sorted(tags_list, key=len, reverse=True):
                if not index.endswith('0') and len(sub_res) > 1:
                    patt_sub_res = re.sub(r'(?<=[^屄\s])', r'\\s?', sub_res)
                    if re.search(r'{}'.format(patt_sub_res), words):
                        spliter = re.findall(patt_sub_res, words)[0]
                        words_split_list = re.split(spliter, words)
                        # Tag the matched span by replacing its inner spaces with the sentinel 屄
                        # plus the field index, then stitch words back together.
                        spliter_tagged = re.sub(r'\s', '屄{}'.format(index[1]), spliter)
                        words = spliter_tagged.join(words_split_list)
                        # print(words)
                        # words = re.sub(patt_sub_res, sub_res)
                        # words = re.sub(r'{}(?=\n)'.format(sub_res), '\n{}\t{}\n'.format(sub_res, index), words)
        # The train_res values are tagged; now tag 'o': every remaining token gets a tab and 'o',
        # and the sentinel is turned back into a tab.
        words = re.sub(r'\s', '\to\n', words)
        words = re.sub(r'(?<=屄\d)', '\n', words)
        words = re.sub(r'屄', '\t', words)
        words_n_words += words
        # print(words)
    with open('/home/mm/FDDC_datasets_dir/tokenized_datasets_for_anago/chongzu/' + res_paired['00'][0] + '.txt', 'w') as af:
        af.write(words_n_words)
    print(path11.split("/")[-1])
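# --- illustration only ---
# tokenize_enti() is written as a method of a class that carries a loaded pyltp Segmentor
# (self.segmentor) and the raw chongzu.train text (self.train_res). A sketch of how it might be
# driven over the training HTML directory; the wrapper class name below is hypothetical:
#
# tok = AnnouncementTokenizer()   # assumed object exposing .segmentor, .train_res and tokenize_enti()
# html_dir = '/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/'
# for name in os.listdir(html_dir):
#     if name.endswith('.html'):
#         tok.tokenize_enti(html_dir + name)   # writes one tagged token file per announcement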
recognizer.load(ner_model_path)
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model path; the file is named `pos.model`
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
source_path = "/home/mm/Downloads/round1_train_20180518/dingzeng/html/"
out_path = "/home/mm/aliyunChallenge/"
listdir = os.listdir(source_path)
postagger = Postagger()  # create the POS tagger instance
postagger.load(pos_model_path)  # load the model
segmentor = Segmentor()  # create the segmentor instance
segmentor.load(cws_model_path)  # load the model
for i in listdir[0:1]:
    html_text = convert2txt(source_path + i)
    words = segmentor.segment(html_text)  # word segmentation
    postags = postagger.postag(words)  # POS tagging
    netags = recognizer.recognize(words, postags)  # named entity recognition
    # Positions whose NE tag ends in "Ni" belong to an organisation name.
    indices = [idx for idx, x in enumerate(list(netags)) if x.endswith("Ni")]
    temp_entity = ""
    new_list = []
    # Re-join multi-token organisation names that segmentation split apart.
    for idx, x in enumerate(words):
        if (idx in indices) and ((idx + 1) in indices) and (idx - 1 not in indices):
            temp_entity = x
        elif (idx - 1 in indices) and (idx + 1 in indices) and (idx in indices):
            temp_entity += x
        elif (idx - 1 in indices) and (idx in indices) and (idx + 1 not in indices):
            temp_entity += x
            new_list.append(temp_entity)
def fill_table(path):
    """Build the submission rows for one announcement from the entity string produced by
    convert2txt. true_res_str is assumed to be the module-level content of the result file."""
    # if index not in random_index:
    #     continue
    # list_true_res = re.findall(r'{}[^\n]+(?=\n)'.format(path.split(".")[0]), true_res_str)
    # text, entity_string = convert2txt('/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/762567.html')
    text, entity_string = convert2txt(
        '/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/' + path)
    official_res_row = re.findall(
        r'{path}[^\n]+\n'.format(path=path.split(".")[0]), true_res_str)
    answer_dic = {"公告ID": path.split(".")[0]}
    _, asset_string, eval_string, money_string = entity_string.split("|||")
    # Rows of the form short_name + 'H&#~' + full_name.
    entities_arrows_list = list(
        set([x if 'H&#~' in x else '' for x in re.split(r'\s', entity_string)]))
    short_name_list = [re.split(r"H&#~", x)[0] for x in entities_arrows_list]
    # Build an alternation of usable short names, dropping generic words.
    reg_short_listr = ""
    for tiy in short_name_list:
        for tiny in re.split(r'[,、/]', tiy):
            if not re.search(
                    r'(^公司$|^本公司$|环境$|^上市公司$|人$|资产|标的|交易|审计|对方|发行|对象|股东|对手|单位|事务所|计划|分公司|日$|董事|独立|书$|承诺|机构|评估|交所|股|认购|局$|律|本次|国家|中央|中国|重组|重大|期$|^元$|^万元$|^亿元$|《|》|股份|股分|利润|报告)',
                    tiny):
                reg_short_listr += tiny
                reg_short_listr += "|"
    reg_short_listr = re.sub(r'^\||\|$', "", reg_short_listr)
    reg_short_listr = re.sub(r'\|\|', "|", reg_short_listr)
    # Default answers: the most frequent candidate for each field.
    answer_dic["估值方法"] = collections.Counter(eval_string.split(" ")).most_common(1)[0][0]
    answer_dic["交易金额"] = collections.Counter(money_string.split(" ")).most_common(1)[0][0]
    answer_dic["交易标的"] = collections.Counter(asset_string.split(" ")).most_common(1)[0][0]
    answer_dic["标的公司"] = ''
    answer_dic["交易对方"] = ''
    for row in entities_arrows_list:
        if len(row) < 2:
            continue
        short, long = row.split('H&#~')
        list_splits_short = re.split(r'/|、', short)
        for short_split in list_splits_short:
            if re.match(r'交易对[手方]|发行对象|认购人', short_split):
                answer_dic["交易对方"] = long
            if re.match(r'标的公司|目标公司', short_split):
                answer_dic["标的公司"] = long
                if "、" in long:
                    """Use LTP to recognise the entities; for the parts split on 、, confirm each
                    contains no verbs or adverbs, then look up each one's equity/asset information
                    in entity_string."""
                    asset_list = []
                    for long_split in long.split("、"):
                        # print("{} long_split is {}".format(path, long_split))
                        # asset_related_target = re.findall(r'{}[\d.%]+的?股[权分]|全部[股债分权]'.format(long_split), entity_string)
                        # if len(asset_related_target) > 0:
                        #     asset_list.append(re.findall(r'[\d.%]+的?股[权分]|全部[股债分权]', asset_related_target[0])[0])
                        # else:
                        if re.findall(r'{ls}[\d.%]+的?[股债分权份]{{2}}|{ls}全部的?[股债分权份资产负利和与]{{2,6}}'.format(ls=long_split), text) \
                                and re.findall(r'[\d.%]+的?[股债分权份资产负利和与]{2,5}|全部的?[股债分权份资产负利和与]{2,6}',
                                               re.findall(r'{ls}[\d.%]+的?[股债分权份资产负利和与]{{2,6}}|全部[股债分权份资产负利和与]{{2,6}}'.format(ls=long_split), text)[0]):
                            # asset_related_target = re.findall(r'{}[\d.%]+的?股[权分]|全部[股债分权]'.format(long_split), text)
                            asset_list.append(re.findall(r'[\d.%]+的?[股债分权份]{2}|全部的?[股债分权份资产负利和与]{2,6}',
                                                         re.findall(r'{ls}[\d.%]+的?[股债分权份]{{2,4}}|全部的?[股债分权份资产负利和与]{{2,5}}'.format(ls=long_split), text)[0])[0])
                    answer_dic["交易标的"] = '|'.join(asset_list)
                    answer_dic["标的公司"] = "|".join(long.split("、"))
            if re.match(r'本次交易|交易标的|标的资产|交易资产|目标资产|标的股权', short_split):
                if re.findall(r'[\d.%]+的?[股债分权份]{2}|全部的?[股债分权份]{2}', long):
                    list_ass = re.findall(r'[\d.%]+的?[股债分权份]{2}|全部的?[股债分权份]{2}', long)
                    # if len(list_ass) > 1:
                    answer_dic["交易标的"] = '|'.join(list_ass)
                    list_tar = re.findall(
                        r'({ls})(?=的?[\d.%]+的?[股债分权份]{{2,3}}|全部的?[股债分权份资产负利和与]{{2,6}})'.format(ls=reg_short_listr),
                        long)
                    answer_dic["标的公司"] = '|'.join(list_tar)
    if answer_dic["标的公司"] == "":
        # Fall back to the most frequent usable short name in the entity string.
        guess_target = collections.Counter(
            re.findall(r'{ls}'.format(ls=reg_short_listr), entity_string)).most_common(8)
        for tar in guess_target:
            if len(tar[0]) > 2:
                answer_dic["标的公司"] = tar[0]
                break
    print(re.findall(r'(?<=[和及、,~的])[^\d和及、,~的股份分权]+(?=[\d.%的]+股[权分份])', entity_string))
    if answer_dic["标的公司"] == "":
        for post_fix in re.findall(
                r'(?<=[和及、,~的])[^\d和及、,~的股份分权资产负利与]+(?=的?[\d.%]+的?[股债分权份]{2,3}|全部的?[股债分权份资产负利和与]{2,6})',
                entity_string):
            if len(post_fix) in [3, 4, 5, 6]:
                answer_dic["标的公司"] = post_fix
    if answer_dic["标的公司"] != "" and answer_dic["交易标的"] != "":
        print("answer dict is ok ")
    else:
        print("f**k it {}".format(path))
    # # explains the transaction
    # #
    # # if "|" in answer_dic['标的公司'] and "|" not in answer_dic['交易标的']:
    # #     answer_dic['交易标的'] == ""
    # #     for target_split in re.split(r'|', answer_dic['标的公司']):
    # #         answer_dic['交易标的'] += re.findall()
    # # if re.match(r'交易标的|标的资产|标的股权|目标资产', short_split):
    # #     if len(answer_dic['交易标的']) > 1 and answer_dic['交易标的'] in long:
    # #         print("the most frequent equity info sits in the 交易标的 glossary entry {}".format(path))
    # if re.search(r'的?[\d.%全部]+股[权分份]]', long):
    #     answer_dic["标的公司"] = "|".join(re.split(r'的?[\d.%全部的]+股[权分份]、?', long))
    #     answer_dic["交易标的"] = "|".join(re.findall(r'[\d.%全部的]+股[权分份]]', long))
    # # elif len(answer_dic['交易标的']) > 1:
    # #     answer_dic['交易标的'] = re.findall(r'([\d.]+%的?(?:股权|股份|权益))', long)[0]
    # Emit one submission row per target/asset pair; repeat the last value when the lists differ in length.
    submit_string = ''
    if '|' in str(answer_dic):
        target_list = answer_dic["标的公司"].split("|")
        asset_list = answer_dic["交易标的"].split("|")
        rows_to_gen = max(len(target_list), len(asset_list))
        for row in range(rows_to_gen):
            index_target = row if row < len(target_list) else len(target_list) - 1
            index_asset = row if row < len(asset_list) else len(asset_list) - 1
            submit_string += answer_dic["公告ID"] + "\t" + asset_list[index_asset] \
                + "\t" + target_list[index_target] + "\t" + answer_dic["交易对方"] \
                + "\t" + answer_dic["交易金额"] + "\t" + answer_dic["估值方法"] + "\n"
    else:
        submit_string = answer_dic["公告ID"] + "\t" + answer_dic["交易标的"] \
            + "\t" + answer_dic["标的公司"] + "\t" + answer_dic["交易对方"] \
            + "\t" + answer_dic["交易金额"] + "\t" + answer_dic["估值方法"] + "\n"
    return submit_string
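# --- illustration only ---
# fill_table() returns one or more tab-separated submission rows for a single announcement.
# A sketch of how a full submission file could be assembled from it; the output file name is
# an assumption, and true_res_str must already be loaded at module level:
#
# html_dir = '/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/'
# with open('chongzu_submit.txt', 'w') as out:          # hypothetical output file
#     for name in os.listdir(html_dir):
#         if name.endswith('.html'):
#             out.write(fill_table(name))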
recognizer.load(ner_model_path)
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model path; the file is named `pos.model`
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
source_path = "/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/"
out_path = "/home/mm/FDDC_datasets_text_dir/chongzu/"
listdir = os.listdir(source_path)
postagger = Postagger()  # create the POS tagger instance
postagger.load(pos_model_path)  # load the model
segmentor = Segmentor()  # create the segmentor instance
segmentor.load(cws_model_path)  # load the model
for i in listdir[0:1]:
    html_text, entity_string = convert2txt(source_path + i)
    words = segmentor.segment(html_text)  # word segmentation
    postags = postagger.postag(words)  # POS tagging
    netags = recognizer.recognize(words, postags)  # named entity recognition
    indices = [idx for idx, x in enumerate(list(netags)) if x.endswith("Ni")]
    temp_entity = ""
    new_list = []
    """The loop below re-joins the parts of the token sequence that form one entity name,
    removing the gaps introduced by segmentation, before each entity is wrapped with the
    special marker {NER#}."""
    for idx, x in enumerate(words):
        if (idx in indices) and ((idx + 1) in indices) and (idx - 1 not in indices):
            temp_entity = x
        elif (idx - 1 in indices) and (idx + 1 in indices) and (idx in indices):
            temp_entity += x
        elif (idx - 1 in indices) and (idx in indices) and (idx + 1 not in indices):
            temp_entity += x
            new_list.append(temp_entity)
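# --- illustration only ---
# The docstring above mentions wrapping each merged entity with the marker {NER#}, but the
# snippet stops after collecting new_list. One possible shape for that step, assuming the
# marker simply brackets each entity in the plain text (this placement is an assumption, not
# code taken from the project):
#
# marked_text = html_text
# for entity in sorted(set(new_list), key=len, reverse=True):  # longest first to avoid nesting partial matches
#     if entity:
#         marked_text = marked_text.replace(entity, '{NER#}' + entity + '{NER#}')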