def triples_extraction(path):
    """Extract dependency triples for every sentence in a parse file.

    The file at *path* is read as UTF-8. A sentence is a run of consecutive
    non-blank lines; blank lines separate sentences. Each sentence is passed
    as ``tree_str`` to ``DependencyGraph`` and the graph's ``triples()`` are
    handed to ``extract`` (both names come from elsewhere in this module).

    Processing is best effort: a sentence whose graph cannot be built, or
    whose triples cannot be extracted, is skipped.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one ``extract`` result per successfully processed
        sentence, in file order. Empty when the file has no sentences.
    """
    import logging

    log = logging.getLogger(__name__)

    sentences = []
    current = []
    with codecs.open(path, 'r', 'utf-8') as handle:
        for line in handle:
            row = line.rstrip(u"\r\n")
            if row.strip():
                current.append(row)
            elif current:
                # A blank line closes the sentence collected so far.
                sentences.append(current)
                current = []
    # Keep the last sentence even when the file lacks a final blank line.
    if current:
        sentences.append(current)

    triples = []
    for rows in sentences:
        tree_str = u"\n".join(rows) + u"\n"
        try:
            # Build and extract in a single guarded step so a failed build
            # can never fall through to extracting from a stale graph left
            # over from the previous sentence.
            graph = DependencyGraph(tree_str=tree_str)
            res = extract(graph.triples())
        except Exception:
            log.debug("skipping sentence that could not be processed",
                      exc_info=True)
            continue
        triples.append(res)
    return triples
def read_syntaxnet_output(sentences):
    """Run SyntaxNet's ``parse.sh`` on *sentences* and print the parses.

    The sentences are joined one per line and written to the standard input
    of ``syntaxnet/models/parsey_universal/parse.sh``, which is started
    through the shell from the hard-coded SyntaxNet checkout with the
    hard-coded Portuguese model directory. The script's standard error is
    discarded. Its standard output is decoded as UTF-8 and split into
    sentences on blank lines; for each sentence a ``DependencyGraph`` is
    built and its triples and pretty-printed tree are written to stdout.

    Args:
        sentences: Iterable of sentence strings, one sentence each.

    Returns:
        None. Results are printed.
    """
    # Feed the text through stdin instead of interpolating it into the
    # shell command: a sentence containing a single quote would otherwise
    # terminate the quoted `echo` argument and be executed as shell code.
    # The trailing newline mirrors the one `echo` used to append.
    payload = "\n".join(sentences) + "\n"
    if not isinstance(payload, bytes):
        payload = payload.encode("utf8")

    command = (
        'MODEL_DIRECTORY=/Users/dbatista/Downloads/Portuguese; '
        'cd /Users/dbatista/models/syntaxnet; '
        'syntaxnet/models/parsey_universal/parse.sh '
        '$MODEL_DIRECTORY 2'
    )

    # Redirect the child's stderr to the null device, and close that handle
    # once the child has finished.
    with open(os.devnull, 'w') as devnull:
        process = subprocess.Popen(
            command,
            shell=True,
            universal_newlines=False,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=devnull)
        stdout_data, _ = process.communicate(payload)

    # Decode once at the I/O boundary, then work on text only.
    text = stdout_data.decode("utf8")

    parsed_sentences = []
    current = []
    for line in text.split(u"\n"):
        if line.strip():
            current.append(line)
        elif current:
            # Blank line: the sentence is complete. Empty sentences (such as
            # the one produced by the output's final newline) are ignored.
            parsed_sentences.append(current)
            current = []
    if current:
        parsed_sentences.append(current)

    for rows in parsed_sentences:
        graph = DependencyGraph(tree_str=u"\n".join(rows) + u"\n")
        # Single-argument parenthesised prints emit the same text whether
        # `print` is a statement or a function.
        print("triples")
        for triple in graph.triples():
            print(triple)
        print("")
        tree = graph.tree()
        tree.pretty_print()