from ddparser import DDParser
from main import build_conllx, cut_sent
import pandas as pd
import stanza
from stanza.utils.conll import CoNLL
import spacy

sample = open('sample_corpus.txt', 'r', encoding='utf-8').read()
sample_sents = cut_sent(sample)

'''Construct the sample corpus'''
# begin DDParser
ddp = DDParser(use_pos=True)
data = ddp.parse(sample_sents)
build_conllx(data, 'sample_ddparser.conllx')
print('DDParser has finished the parsing.')

# begin spaCy
nlp = spacy.load('zh_core_web_sm')
file_spacy = open('sample_spacy.conllx', 'w', encoding='utf-8')
# file_spacy_gold = open('gold_spacy.conllx', 'r', encoding='utf-8')
for sent in sample_sents:
    file_spacy.write('\n\n')
    # file_spacy_gold.write('\n\n')
    for idx, token in enumerate(nlp(sent)):
        print(token.text)
        print(token.pos_)
        print(token.dep_)
        line = f'{idx+1}\t{token.text}\t{token.pos_}\t{token.dep_}\t{token.head}\t{token.head.i}'
        # the source is truncated here; presumably each line is written out:
        file_spacy.write(line + '\n')
file_spacy.close()
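# The imports above also pull in stanza and its CoNLL utilities, but the stanza
# section is cut off in the source. A minimal sketch of the analogous step,
# assuming a standard Chinese pipeline and the output name 'sample_stanza.conllx'
# (both are assumptions, not taken from the original script):
nlp_stanza = stanza.Pipeline('zh', processors='tokenize,pos,lemma,depparse')
doc = nlp_stanza('\n\n'.join(sample_sents))
CoNLL.write_doc2conll(doc, 'sample_stanza.conllx')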
import re

from ddparser import DDParser


class SVOParser:
    def __init__(self):
        self.parser = DDParser(use_pos=True)
        print('loaded model')

    def split_sents(self, content):
        '''Split the text into sentences; question/exclamation marks, periods,
        semicolons, colons and line breaks act as delimiters.'''
        return [sentence for sentence in re.split(r'[??!!。;;::\n\r]', content) if sentence]

    def build_parse_child_dict(self, words, postags, rel_id, relation):
        '''Dependency parsing: for every word, keep a dict mapping each
        dependency relation to that word's dependents under the relation.'''
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(rel_id)):
                if rel_id[arc_index] == index + 1:  # head indices start at 1
                    # bug fix: child_dict is keyed by relation labels, so the
                    # membership test must use relation, not rel_id
                    if relation[arc_index] in child_dict:
                        child_dict[relation[arc_index]].append(arc_index)
                    else:
                        child_dict[relation[arc_index]] = [arc_index]
            child_dict_list.append(child_dict)
        # resolve the head word of every token
        heads = ['Root' if id == 0 else words[id - 1] for id in rel_id]
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i], heads[i], rel_id[i] - 1,
                 postags[rel_id[i] - 1]]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    def parser_main(self, sentence):
        '''Main parsing entry point.'''
        res = self.parser.parse(sentence)[0]
        words = res["word"]
        postags = res["postag"]
        rel_id = res["head"]
        relation = res["deprel"]
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, rel_id, relation)
        return words, postags, child_dict_list, format_parse_list

    def merge_ATT(self, words, postags, format_parse_list):
        '''Merge every ATT (attribute) span into its head word.'''
        words_ = list(words)  # bug fix: copy, so the caller's list is not mutated
        retain_nodes = set()
        ATTs = []
        ATT = []
        format_parse_list_ = []
        for parse in format_parse_list:
            dep = parse[0]
            if dep in ['ATT', 'ADV']:
                ATT += [parse[2], parse[5]]
            else:
                if ATT:
                    body = ''.join([words[i] for i in sorted(set(ATT))])
                    ATTs.append(body)
                    retain_nodes.add(sorted(set(ATT))[-1])
                    words_[sorted(set(ATT))[-1]] = body
                else:
                    retain_nodes.add(parse[2])
                ATT = []
        for indx, parse in enumerate(format_parse_list):
            if indx in retain_nodes:
                parse_ = [parse[0], words_[indx], indx, postags[indx],
                          words_[parse[5]], parse[5], postags[parse[5]]]
                format_parse_list_.append(parse_)
        return words_, postags, format_parse_list_, retain_nodes

    def extract(self, words, postags, child_dict_list, arcs, retain_nodes):
        '''Extract triples from the merged parse.'''
        svos = []
        for index in range(len(postags)):
            if index not in retain_nodes:
                continue
            if postags[index]:
                # extract fact triples centered on the predicate
                child_dict = child_dict_list[index]
                # subject-verb-object
                if 'SBV' in child_dict and 'VOB' in child_dict:
                    r = words[index]
                    e1 = words[child_dict['SBV'][0]]
                    e2 = words[child_dict['VOB'][0]]
                    if e1.replace(' ', '') and e2.replace(' ', ''):
                        svos.append([e1, r, e2])
                # subject-verb plus complement carrying a prepositional object
                if 'SBV' in child_dict and 'CMP' in child_dict:
                    e1 = words[child_dict['SBV'][0]]
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if 'POB' in child_dict_list[cmp_index]:
                        e2 = words[child_dict_list[cmp_index]['POB'][0]]
                        if e1.replace(' ', '') and e2.replace(' ', ''):
                            svos.append([e1, r, e2])
        return svos

    def ruler2(self, words, postags, child_dict_list, arcs):
        '''Main triple-extraction routine: with no semantic-role labels
        available, extract directly from the dependency parse.'''
        svos = []
        for index in range(len(postags)):
            # if postags[index] == 'v':
            if postags[index]:
                # extract fact triples centered on the predicate
                child_dict = child_dict_list[index]
                # subject-verb-object
                if 'SBV' in child_dict and 'VOB' in child_dict:
                    r = words[index]
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    e2 = self.complete_e(words,
                                         postags, child_dict_list, child_dict['VOB'][0])
                    if e1.replace(' ', '') and e2.replace(' ', ''):
                        svos.append([e1, r, e2])
                # postposed attributive in a verb-object construction
                relation = arcs[index][0]
                # bug fix: in format_parse_list the (0-based) head index is
                # field 5; field 2 is the word's own index
                head = arcs[index][5]
                if relation == 'ATT':
                    if 'VOB' in child_dict:
                        e1 = self.complete_e(words, postags, child_dict_list, head)
                        r = words[index]
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                        temp_string = r + e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            if e1.replace(' ', '') and e2.replace(' ', ''):
                                svos.append([e1, r, e2])
                # subject-verb plus complement carrying a prepositional object
                if 'SBV' in child_dict and 'CMP' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if 'POB' in child_dict_list[cmp_index]:
                        e2 = self.complete_e(words, postags, child_dict_list,
                                             child_dict_list[cmp_index]['POB'][0])
                        if e1.replace(' ', '') and e2.replace(' ', ''):
                            svos.append([e1, r, e2])
        return svos

    def complete_e(self, words, postags, child_dict_list, word_index):
        '''Expand a subject or object with its modifiers.'''
        child_dict = child_dict_list[word_index]
        prefix = ''
        if 'ATT' in child_dict:
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])
        postfix = ''
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix
        return prefix + words[word_index] + postfix

    def triples_main(self, content):
        '''Top-level entry: split the text, parse each sentence, extract triples.'''
        sentences = self.split_sents(content)
        svos = []
        for sentence in sentences:
            print(sentence)
            words, postags, child_dict_list, arcs = self.parser_main(sentence)
            svo = self.ruler2(words, postags, child_dict_list, arcs)
            svos += svo
        return svos
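# A minimal usage sketch for the class above (the test sentence is illustrative,
# not taken from the original file):
if __name__ == '__main__':
    handler = SVOParser()
    svos = handler.triples_main('李克强总理今天来我家了,我感到非常荣幸')
    print('svos:', svos)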
# (The beginning of this file is truncated in the source; the fragment below is
# the tail of a helper that rebuilds a surface string from (word, deprel)
# sub_tokens — the first return presumably sat under a lost single-token guard.)
            return sub_tokens[0][0]
        tokens, _ = zip(*sub_tokens)
        return "".join(tokens)

    def inorder_traversal(self, node):
        """In-order traversal of the dependency tree, skipping COO (coordination) children."""
        lf_list = []
        rf_list = []
        for ln in node.lefts:
            if self.nodes[ln].deprel not in ['COO']:
                lf_list += self.inorder_traversal(self.nodes[ln])
        for rn in node.rights:
            if self.nodes[rn].deprel not in ['COO']:
                rf_list += self.inorder_traversal(self.nodes[rn])
        return lf_list + [(node.word, node.deprel)] + rf_list


if __name__ == "__main__":
    ddp = DDParser(encoding_model='transformer')
    text = ["百度是一家高科技公司"]
    ddp_res = ddp.parse(text)
    print(ddp_res)
    # fine-grained
    fine_info = FineGrainedInfo(ddp_res[0])
    print("fine-grained:", fine_info.parse())
    # coarse-grained
    coarse_info = CoarseGrainedInfo(ddp_res[0])
    print("coarse-grained:", coarse_info.parse())
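    # The same pattern extends to batches: parse() returns one result dict per
    # input sentence, and each can be fed to the extractors independently (the
    # second sentence below is illustrative, not from the original demo):
    for res in ddp.parse(["百度是一家高科技公司", "他买了一本书"]):
        print(FineGrainedInfo(res).parse(), CoarseGrainedInfo(res).parse())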
import pandas as pd

from ddparser import DDParser
# assumption: BasicTokenizer is a project-local helper exposing a _clean_text
# method; the original file does not show where it is imported from
from tokenizer import BasicTokenizer

TRAIN_PATH = 'train.csv'
DEV_PATH = 'dev.csv'
TEST_PATH = 'test.csv'
use_cuda = True

if __name__ == "__main__":
    file_paths = [TRAIN_PATH, DEV_PATH, TEST_PATH]
    tokenizer = BasicTokenizer()
    ddp = DDParser(use_cuda=use_cuda, encoding_model='transformer',
                   buckets=True, batch_size=1000)
    for file_path in file_paths:
        df = pd.read_csv(file_path, sep='\t')
        # parse the cleaned text of both columns; each parse is stored as a
        # stringified result dict
        df['ddp_res_a'] = [
            str(ddp_res) for ddp_res in ddp.parse(
                [tokenizer._clean_text(query) for query in df['text_a'].tolist()])
        ]
        df['ddp_res_b'] = [
            str(ddp_res) for ddp_res in ddp.parse(
                [tokenizer._clean_text(query) for query in df['text_b'].tolist()])
        ]
        output_path = file_path.split('.')[0] + '_ddp.csv'
        df.to_csv(output_path, sep='\t', index=False)
        print(f"{file_path} done!")
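    # Because the parses are serialized with str(), downstream code can recover
    # the dicts with ast.literal_eval. A minimal sketch reading back the train
    # split (file name as produced by the loop above):
    import ast
    df_back = pd.read_csv('train_ddp.csv', sep='\t')
    parses_a = [ast.literal_eval(s) for s in df_back['ddp_res_a']]
    print(parses_a[0]['word'])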
# count the average number of occurrences of a specific deprel label per
# sentence; `head` is the key into each parse dict (e.g. 'deprel') and `info`
# the label to count; returns the mean formatted to two decimal places
def count_rel(text, head, info):
    count = 0
    for i, sentence in enumerate(text):
        if i % 1000 == 0:
            print(str(i) + ' sentences have been processed.')
        count += sentence[head].count(info)
    return format(count / len(text), '.2f')


# parse the texts of the different corpora
# (get_corp and ddp are defined earlier in this script, not shown here)
sents_1950 = get_corp('corpus_50_65.txt')
data_1950 = ddp.parse(sents_1950)
print('Finished parsing treebank_1950.')

sents_1966 = get_corp('corpus_66_76.txt')
data_1966 = ddp.parse(sents_1966)
print('Finished parsing treebank_1966.')

sents_1978 = get_corp('corpus_78_99.txt')
data_1978 = ddp.parse(sents_1978)
print('Finished parsing treebank_1978.')

sents_2000 = get_corp('corpus_00_10.txt')
data_2000 = ddp.parse(sents_2000)
print('Finished parsing treebank_2000.')

sents_web = []
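# (The script is truncated above; sents_web presumably collects a web corpus
# next.) Example use of count_rel: the average number of 'ATT' relations per
# sentence in the 1950-65 treebank ('deprel' is DDParser's relation-label field):
print(count_rel(data_1950, 'deprel', 'ATT'))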