import codecs

from nltk.parse import DependencyGraph


def triples_extraction(path):
    processed_sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf-8'):
        if len(line) == 1:
            # a line containing only '\n' separates two sentences
            processed_sentences.append(sentence)
            sentence = []
        else:
            word = line.split("\t")
            sentence.append(word)

    # re-join each sentence into a single CoNLL-style string
    deps = []
    for sentence in processed_sentences:
        s = u""
        for line in sentence:
            s += u"\t".join(line) + u'\n'
        deps.append(s)

    triples = []
    for sent_dep in deps:
        try:
            graph = DependencyGraph(tree_str=sent_dep)
        except Exception:
            continue  # skip sentences that fail to parse
        try:
            res = extract(graph.triples())  # `extract` is defined elsewhere
        except Exception:
            continue
        triples.append(res)
    return triples
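# Hypothetical usage sketch: the file name and the identity `extract`
# below are assumptions, not part of the snippet above.
def extract(triples):
    return list(triples)

for sent_triples in triples_extraction("parsed_sentences.conll"):
    # graph.triples() yields ((word, tag), relation, (word, tag)) tuples
    for head, rel, dep in sent_triples:
        print(head, rel, dep)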
def draw(self):
    # draw the parse tree
    par_result = ''
    for node in self.tree:
        par_result += "\t" + node[0] + "\t" + 'null' + "\t" + str(node[1]) + "\n"
    conlltree = DependencyGraph(par_result)
    tree = conlltree.tree()  # build the tree structure
    tree.draw()              # display the resulting tree
def build_dep_graph(self):
    if self._check():
        return None
    par_result = ''
    for i in range(len(self.words)):
        if self.arcs[i].head == 0:
            pass  # self.arcs[i].relation = "ROOT"
        par_result += ("\t" + self.words[i] + "(" + self.arcs[i].relation + ")" +
                       "\t" + self.postags[i] +
                       "\t" + str(self.arcs[i].head) +
                       "\t" + self.arcs[i].relation + "\n")
    # print(par_result)
    conlltree = DependencyGraph(par_result)  # convert to a dependency graph
    tree = conlltree.tree()                  # build the tree structure
    tree.draw()                              # display the resulting tree
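# A hedged sketch of where self.words, self.postags and self.arcs might
# come from, assuming the class wraps pyltp (the LTP Python bindings);
# the model paths are placeholders, not part of the snippet above.
from pyltp import Segmentor, Postagger, Parser

segmentor = Segmentor()
segmentor.load("cws.model")    # hypothetical path to the segmenter model
postagger = Postagger()
postagger.load("pos.model")    # hypothetical path to the POS model
parser = Parser()
parser.load("parser.model")    # hypothetical path to the parser model

words = list(segmentor.segment("罗马尼亚位于欧洲东部"))
postags = list(postagger.postag(words))
arcs = parser.parse(words, postags)  # each arc carries .head and .relation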
import os
import subprocess

from nltk.parse import DependencyGraph


def read_syntaxnet_output(sentences):
    # join all sentences into a single string with separating newlines
    all_sentences = "\n".join(sentences)

    # redirect stderr to /dev/null
    FNULL = open(os.devnull, 'w')
    process = subprocess.Popen(
        'MODEL_DIRECTORY=/Users/dbatista/Downloads/Portuguese; '
        'cd /Users/dbatista/models/syntaxnet; '
        'echo \'%s\' | syntaxnet/models/parsey_universal/parse.sh '
        '$MODEL_DIRECTORY 2' % all_sentences,
        shell=True, universal_newlines=True,
        stdout=subprocess.PIPE, stderr=FNULL)
    output = process.communicate()

    processed_sentences = []
    sentence = []
    for line in output[0].split("\n"):
        if len(line) == 0:
            processed_sentences.append(sentence)
            sentence = []
        else:
            word = line.split("\t")
            sentence.append(word)

    # the subprocess output ends with an empty line
    del processed_sentences[-1]

    deps = []
    for sentence in processed_sentences:
        s = ''
        for line in sentence:
            s += "\t".join(line) + '\n'
        deps.append(s)

    for sent_dep in deps:
        graph = DependencyGraph(tree_str=sent_dep)
        print("triples")
        for triple in graph.triples():
            print(triple)
        print()
        tree = graph.tree()
        tree.pretty_print()
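# Hedged usage: the SyntaxNet checkout and the Portuguese Parsey
# Universal model paths are hard-coded above and machine-specific;
# the example sentences are toy inputs.
read_syntaxnet_output(["O cão ladra .", "O gato mia ."])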
from nltk.parse import DependencyGraph


def combined_sentence_lemma(sent1_conllu, sent2_conllu, sent1_id, sent2_id):
    # concatenate the lemmas of both sentences in document order
    sent1_dict = DependencyGraph(sent1_conllu, top_relation_label='root').nodes
    sent2_dict = DependencyGraph(sent2_conllu, top_relation_label='root').nodes
    if sent1_id == sent2_id:
        return extract_node_info(sent1_dict, info="lemma")
    elif sent1_id < sent2_id:
        return (extract_node_info(sent1_dict, info="lemma") +
                extract_node_info(sent2_dict, info="lemma"))
    else:
        return (extract_node_info(sent2_dict, info="lemma") +
                extract_node_info(sent1_dict, info="lemma"))
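# Minimal sketch: `extract_node_info` is not shown above, so a plausible
# stand-in is defined here; the CoNLL-U strings are toy examples.
def extract_node_info(nodes, info="lemma"):
    # node 0 is the artificial ROOT node, so it is skipped
    return [nodes[i][info] for i in sorted(nodes) if i != 0]

sent1 = ("1\tDogs\tdog\tNOUN\tNNS\t_\t2\tnsubj\t_\t_\n"
         "2\tbark\tbark\tVERB\tVBP\t_\t0\troot\t_\t_\n")
sent2 = ("1\tCats\tcat\tNOUN\tNNS\t_\t2\tnsubj\t_\t_\n"
         "2\tmeow\tmeow\tVERB\tVBP\t_\t0\troot\t_\t_\n")
print(combined_sentence_lemma(sent1, sent2, 0, 1))
# ['dog', 'bark', 'cat', 'meow']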
def gold_standard_tree(self, sent: DependencyGraph):
    """Return the arcs of the gold standard tree for the given sentence."""
    root = sent.root["address"]
    # nx_graph() yields (dependent, head) edges and omits the root arc,
    # so flip them into (head, dependent) pairs and add (0, root);
    # edge weights are of no significance and are therefore all set to 1
    edges = sent.nx_graph().edges
    return [(edge[1], edge[0]) for edge in edges] + [(0, root)]
from nltk.parse import DependencyGraph


def extract_stanza_info(row, eid_num, param="ctag"):
    '''
    Given a row of a pandas DataFrame and an eid number (1 or 2),
    extract the Stanza info from the CoNLL-U parse.

    param: 'ctag', 'lemma', or 'word'
    '''
    sent_dict = DependencyGraph(row[f'eid{eid_num}_sent_conllu'],
                                top_relation_label='root').nodes
    tokenid = int(getattr(row, f'eid{eid_num}_token_id'))
    # node 0 is the artificial ROOT, so token ids are offset by one
    return sent_dict[tokenid + 1][param]
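# Hypothetical usage: the column names mirror the f-strings inside
# extract_stanza_info; the DataFrame below is a toy example.
import pandas as pd

conllu = ("1\tDogs\tdog\tNOUN\tNNS\t_\t2\tnsubj\t_\t_\n"
          "2\tbark\tbark\tVERB\tVBP\t_\t0\troot\t_\t_\n")
df = pd.DataFrame([{"eid1_sent_conllu": conllu, "eid1_token_id": 0}])
for _, row in df.iterrows():
    print(extract_stanza_info(row, 1, param="lemma"))  # expected: dog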
from nltk.parse import DependencyGraph


def get_structs(ud_path):
    files = ['en-ud-train.conllu', 'en-ud-dev.conllu', 'en-ud-test.conllu']
    structures = {}
    for file in files:
        with open(ud_path + file, 'r') as f:
            iden = 0
            a = ""
            words = []
            for line in f:
                if line != "\n":
                    a += line
                    words.append(line.split("\t")[1])
                else:
                    # a blank line closes the current sentence
                    iden += 1
                    structure = DependencyGraph(a, top_relation_label='root')
                    sent = " ".join(words)
                    sent_id = file + " " + str(iden)
                    structures[sent_id] = structure
                    a = ""
                    words = []
    return structures
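# Hedged usage: ud_path is a hypothetical directory (note the trailing
# slash, since the function concatenates ud_path + filename directly).
structures = get_structs("/path/to/ud/")
structures["en-ud-train.conllu 1"].tree().pretty_print()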
import re

from nltk.parse import DependencyGraph


def html_ify(s):
    # replace parentheses with entity-style escapes so the bracketed
    # tree strings survive in the TSV output
    html_string = re.sub(r'\)', r'&rcrb;', s)
    html_string = re.sub(r'\(', r'&lcrb;', html_string)
    return html_string


files = ['en-ud-train.conllu', 'en-ud-dev.conllu', 'en-ud-test.conllu']
structures = []
for file in files:
    with open('structures.tsv', 'a') as fout:
        with open(file, 'r') as f:
            sent_num = 0
            a = ""
            words = []
            for line in f:
                if line != "\n":
                    a += line
                    words.append(line.split("\t")[1])
                else:
                    sent_num += 1
                    a = html_ify(a)
                    structure = DependencyGraph(a, top_relation_label='root')
                    sent = " ".join(words)
                    sent = html_ify(sent)
                    sent_id = file + " sent_" + str(sent_num)
                    structures.append(structure)
                    a = ""
                    words = []
                    fout.write(sent_id + "\t" +
                               " ".join(str(structures[-1].tree()).splitlines()) +
                               "\t" + sent + "\n")