def conll_file_demo(wf, c, lst, i):
    """Compute dependency shortest paths between annotated token pairs.

    Reads tab-separated annotations from ``./test/test<i>.txt`` (columns
    used: 0 = sent_id, 1 and 4 = token indices, 3 = event, 6 = reltype),
    pairs each annotation row with the corresponding CoNLL entry in
    ``lst``, finds the shortest path between the two token indices on the
    undirected dependency graph, and writes one line per pair to ``wf``::

        sent_id \\t reltype=<reltype> \\t [words on shortest path]

    :param wf: writable file object receiving the output lines
    :param c: mapping sent_id -> {token_index: word} used to recover words
    :param lst: list of CoNLL-formatted sentence strings
    :param i: test-file index used to build the input path
    """
    print('Mass conll_read demo...')
    graphs = [DependencyGraph(entry) for entry in lst if entry]
    ars = []
    sent_ids = []
    events = []
    reltypes = []
    test_path = "./test/test{:0>2d}.txt".format(i)
    # Read-only scan: 'r', not 'r+' (the file is never written), and
    # iterate the file lazily instead of readlines().
    with open(test_path, 'r') as f1:
        for line in f1:
            cols = line.split("\t")
            ars.append((cols[1], cols[4]))
            sent_ids.append(cols[0])
            events.append(cols[3])
            reltypes.append(cols[6])
    print(len(graphs), len(ars))
    for l, graph, ar, sent_id_tmp, event, reltype in zip(
            lst, graphs, ars, sent_ids, events, reltypes):
        reltype = reltype.strip()
        depgraph = DependencyGraph(l.strip())
        depgraph.tree = graph.tree()
        dep_nx = nxGraphWroot(depgraph)
        dep_nx = dep_nx.to_undirected()
        shortest_path = nx.shortest_path(dep_nx,
                                         source=int(ar[0]),
                                         target=int(ar[1]))
        # Distinct loop name: the original shadowed the parameter ``i``.
        shortest_path_word = [c[sent_id_tmp][node] for node in shortest_path]
        write_line = sent_id_tmp + '\t' + "reltype=" + reltype + '\t' + str(
            shortest_path_word) + '\n'
        wf.write(write_line)
def parse(self, tokens):
    """
    Parses the list of tokens subject to the projectivity constraint
    and the productions in the parser's grammar.  This uses a method
    similar to the span-concatenation algorithm defined in Eisner (1996).
    It returns the most probable parse derived from the parser's
    probabilistic dependency grammar.

    :param tokens: the list of input tokens to parse
    :return: ``[most_probable_tree, score]``, or ``[]`` when some token
        has no tag in the grammar (parse impossible)
    """
    self._tokens = list(tokens)
    chart = []
    for i in range(0, len(self._tokens) + 1):
        chart.append([])
        for j in range(0, len(self._tokens) + 1):
            chart[i].append(ChartCell(i, j))
            if i == j + 1:
                if tokens[i - 1] in self._grammar._tags:
                    for tag in self._grammar._tags[tokens[i - 1]]:
                        chart[i][j].add(
                            DependencySpan(i - 1, i, i - 1, [-1], [tag]))
                else:
                    # Python 3 print function (was a Python 2 print
                    # statement, which is a syntax error under Python 3).
                    print("No tag found for input token '%s', "
                          "parse is impossible." % tokens[i - 1])
                    return []
    # Combine adjacent spans bottom-up (span concatenation).
    for i in range(1, len(self._tokens) + 1):
        for j in range(i - 2, -1, -1):
            for k in range(i - 1, j, -1):
                for span1 in chart[k][j]._entries:
                    for span2 in chart[i][k]._entries:
                        for newspan in self.concatenate(span1, span2):
                            chart[i][j].add(newspan)
    max_parse = None
    max_score = 0
    for parse in chart[len(self._tokens)][0]._entries:
        conll_format = ""
        for i in range(len(tokens)):
            conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
                i + 1, tokens[i], tokens[i], parse._tags[i], parse._tags[i],
                "null", parse._arcs[i] + 1, "null", "-", "-")
        dg = DependencyGraph(conll_format)
        score = self.compute_prob(dg)
        if score > max_score:
            max_parse = dg.tree()
            max_score = score
    return [max_parse, max_score]
def parse(self, tokens):
    """
    Parses the list of tokens subject to the projectivity constraint
    and the productions in the parser's grammar.  This uses a method
    similar to the span-concatenation algorithm defined in Eisner (1996).
    It returns the most probable parse derived from the parser's
    probabilistic dependency grammar.

    Out-of-vocabulary tokens fall back to a ``u'NULL'`` tag rather than
    aborting the parse.

    :param tokens: the list of input tokens to parse
    :return: an iterator over ``(score, tree)`` pairs, best score first
    """
    self._tokens = list(tokens)
    chart = []
    for i in range(0, len(self._tokens) + 1):
        chart.append([])
        for j in range(0, len(self._tokens) + 1):
            chart[i].append(ChartCell(i, j))
            if i == j + 1:
                if tokens[i - 1] in self._grammar._tags:
                    for tag in self._grammar._tags[tokens[i - 1]]:
                        chart[i][j].add(
                            DependencySpan(i - 1, i, i - 1, [-1], [tag]))
                else:
                    # Unknown token: seed with a NULL tag instead of failing.
                    chart[i][j].add(
                        DependencySpan(i - 1, i, i - 1, [-1], [u'NULL']))
    for i in range(1, len(self._tokens) + 1):
        for j in range(i - 2, -1, -1):
            for k in range(i - 1, j, -1):
                for span1 in chart[k][j]._entries:
                    for span2 in chart[i][k]._entries:
                        for newspan in self.concatenate(span1, span2):
                            chart[i][j].add(newspan)
    trees = []
    for parse in chart[len(self._tokens)][0]._entries:
        conll_format = ""
        for i in range(len(tokens)):
            # 'ROOT' relation complies with the recent DependencyGraph
            # change requiring a ROOT element.
            conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (
                i + 1, tokens[i], tokens[i], parse._tags[i], parse._tags[i],
                'null', parse._arcs[i] + 1, 'ROOT', '-', '-')
        dg = DependencyGraph(conll_format)
        score = self.compute_prob(dg)
        trees.append((score, dg.tree()))
    trees.sort(key=lambda e: -e[0])
    # Idiomatic emptiness test (was ``trees == []``); guarantee at least
    # one (score, tree) result with a trivial fallback tree.
    if not trees:
        trees = [(0.0, Tree(tokens[0], tokens[1:]))]
    # Return an iterator, matching the original generator interface,
    # without the redundant identity generator expression.
    return iter(trees)
def parse(self, tokens):
    """
    Parses the list of tokens subject to the projectivity constraint
    and the productions in the parser's grammar.  This uses a method
    similar to the span-concatenation algorithm defined in Eisner (1996).
    It returns the most probable parse derived from the parser's
    probabilistic dependency grammar.

    :param tokens: the list of input tokens to parse
    :return: ``[most_probable_tree, score]``, or ``[]`` when some token
        has no tag in the grammar (parse impossible)
    """
    self._tokens = list(tokens)
    chart = []
    for i in range(0, len(self._tokens) + 1):
        chart.append([])
        for j in range(0, len(self._tokens) + 1):
            chart[i].append(ChartCell(i, j))
            if i == j + 1:
                if tokens[i - 1] in self._grammar._tags:
                    for tag in self._grammar._tags[tokens[i - 1]]:
                        chart[i][j].add(
                            DependencySpan(i - 1, i, i - 1, [-1], [tag]))
                else:
                    print(
                        'No tag found for input token \'%s\', parse is impossible.'
                        % tokens[i - 1])
                    return []
    # Combine adjacent spans bottom-up (span concatenation).
    for i in range(1, len(self._tokens) + 1):
        for j in range(i - 2, -1, -1):
            for k in range(i - 1, j, -1):
                for span1 in chart[k][j]._entries:
                    for span2 in chart[i][k]._entries:
                        for newspan in self.concatenate(span1, span2):
                            chart[i][j].add(newspan)
    # Track only the best-scoring parse (dead ``graphs``/``trees``/
    # ``malt_format`` accumulators removed).
    max_parse = None
    max_score = 0
    for parse in chart[len(self._tokens)][0]._entries:
        conll_format = ""
        for i in range(len(tokens)):
            conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (
                i + 1, tokens[i], tokens[i], parse._tags[i], parse._tags[i],
                'null', parse._arcs[i] + 1, 'null', '-', '-')
        dg = DependencyGraph(conll_format)
        score = self.compute_prob(dg)
        if score > max_score:
            max_parse = dg.tree()
            max_score = score
    return [max_parse, max_score]
def parse(self, tokens):
    """
    Performs a projective dependency parse on the list of tokens using
    a chart-based, span-concatenation algorithm similar to Eisner (1996).

    :param tokens: The list of input tokens.
    :type tokens: list(str)
    :return: An iterator over parse trees.
    :rtype: iter(Tree)
    """
    self._tokens = list(tokens)
    n = len(self._tokens)
    # Build the (n+1) x (n+1) chart; cells immediately below the diagonal
    # are seeded with single-token spans tagged "null".
    chart = [[ChartCell(row, col) for col in range(n + 1)]
             for row in range(n + 1)]
    for row in range(1, n + 1):
        chart[row][row - 1].add(
            DependencySpan(row - 1, row, row - 1, [-1], ["null"]))
    # Combine adjacent spans bottom-up.
    for row in range(1, n + 1):
        for col in range(row - 2, -1, -1):
            for mid in range(row - 1, col, -1):
                for left in chart[mid][col]._entries:
                    for right in chart[row][mid]._entries:
                        for combined in self.concatenate(left, right):
                            chart[row][col].add(combined)
    # Emit one tree per complete parse spanning the whole sentence.
    for parse in chart[n][0]._entries:
        rows = []
        for idx, word in enumerate(tokens):
            # 'ROOT' relation complies with the new DependencyGraph
            # requirement that there be a root element.
            rows.append("\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
                idx + 1, word, word, "null", "null", "null",
                parse._arcs[idx] + 1, "ROOT", "-", "-"))
        yield DependencyGraph("".join(rows)).tree()
def parse(self, tokens):
    """
    Performs a projective dependency parse on the list of tokens using
    a chart-based, span-concatenation algorithm similar to Eisner (1996).

    :param tokens: The list of input tokens.
    :type tokens: list(str)
    :return: A list of parse trees.
    :rtype: list(Tree)
    """
    self._tokens = list(tokens)
    chart = []
    for i in range(0, len(self._tokens) + 1):
        chart.append([])
        for j in range(0, len(self._tokens) + 1):
            chart[i].append(ChartCell(i, j))
            if i == j + 1:
                chart[i][j].add(
                    DependencySpan(i - 1, i, i - 1, [-1], ["null"]))
    # Combine adjacent spans bottom-up (span concatenation).
    for i in range(1, len(self._tokens) + 1):
        for j in range(i - 2, -1, -1):
            for k in range(i - 1, j, -1):
                for span1 in chart[k][j]._entries:
                    for span2 in chart[i][k]._entries:
                        for newspan in self.concatenate(span1, span2):
                            chart[i][j].add(newspan)
    # Dead ``graphs`` accumulator removed: it was filled but never used.
    trees = []
    for parse in chart[len(self._tokens)][0]._entries:
        conll_format = ""
        for i in range(len(tokens)):
            conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
                i + 1, tokens[i], tokens[i], "null", "null", "null",
                parse._arcs[i] + 1, "null", "-", "-")
        dg = DependencyGraph(conll_format)
        trees.append(dg.tree())
    return trees
def parse(self, tokens):
    """
    Performs a projective dependency parse on the list of tokens using
    a chart-based, span-concatenation algorithm similar to Eisner (1996).

    :param tokens: The list of input tokens.
    :type tokens: list(str)
    :return: An iterator over parse trees.
    :rtype: iter(Tree)
    """
    self._tokens = list(tokens)
    size = len(self._tokens) + 1
    # Square chart of cells; length-one spans are seeded with 'null'.
    chart = [[ChartCell(i, j) for j in range(size)] for i in range(size)]
    for i in range(1, size):
        chart[i][i - 1].add(
            DependencySpan(i - 1, i, i - 1, [-1], ['null']))
    # Span concatenation: grow spans from short to long.
    for i in range(1, size):
        for j in range(i - 2, -1, -1):
            for k in range(i - 1, j, -1):
                for span1 in chart[k][j]._entries:
                    for span2 in chart[i][k]._entries:
                        for newspan in self.concatenate(span1, span2):
                            chart[i][j].add(newspan)
    for parse in chart[size - 1][0]._entries:
        # 'ROOT' relation complies with the new DependencyGraph
        # requirement that there be a root element.
        lines = [
            '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (
                pos + 1, tok, tok, 'null', 'null', 'null',
                parse._arcs[pos] + 1, 'ROOT', '-', '-')
            for pos, tok in enumerate(tokens)
        ]
        yield DependencyGraph(''.join(lines)).tree()
def parse(self, tokens):
    """
    Parses the list of tokens subject to the projectivity constraint
    and the productions in the parser's grammar.  This uses a method
    similar to the span-concatenation algorithm defined in Eisner (1996).
    It returns the most probable parse derived from the parser's
    probabilistic dependency grammar.

    Out-of-vocabulary tokens fall back to a ``u'NULL'`` tag rather than
    aborting the parse.

    :param tokens: the list of input tokens to parse
    :return: an iterator over ``(score, tree)`` pairs, best score first
    """
    self._tokens = list(tokens)
    chart = []
    for i in range(0, len(self._tokens) + 1):
        chart.append([])
        for j in range(0, len(self._tokens) + 1):
            chart[i].append(ChartCell(i, j))
            if i == j + 1:
                if tokens[i - 1] in self._grammar._tags:
                    for tag in self._grammar._tags[tokens[i - 1]]:
                        chart[i][j].add(
                            DependencySpan(i - 1, i, i - 1, [-1], [tag]))
                else:
                    # Unknown token: seed with a NULL tag instead of failing.
                    chart[i][j].add(
                        DependencySpan(i - 1, i, i - 1, [-1], [u'NULL']))
    for i in range(1, len(self._tokens) + 1):
        for j in range(i - 2, -1, -1):
            for k in range(i - 1, j, -1):
                for span1 in chart[k][j]._entries:
                    for span2 in chart[i][k]._entries:
                        for newspan in self.concatenate(span1, span2):
                            chart[i][j].add(newspan)
    trees = []
    for parse in chart[len(self._tokens)][0]._entries:
        conll_format = ""
        for i in range(len(tokens)):
            # 'ROOT' relation complies with the recent DependencyGraph
            # change requiring a ROOT element.
            conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (
                i + 1, tokens[i], tokens[i], parse._tags[i], parse._tags[i],
                'null', parse._arcs[i] + 1, 'ROOT', '-', '-')
        dg = DependencyGraph(conll_format)
        score = self.compute_prob(dg)
        trees.append((score, dg.tree()))
    trees.sort(key=lambda e: -e[0])
    # Idiomatic emptiness test (was ``trees == []``); guarantee at least
    # one (score, tree) result with a trivial fallback tree.
    if not trees:
        trees = [(0.0, Tree(tokens[0], tokens[1:]))]
    # Return an iterator, matching the original generator interface,
    # without the redundant identity generator expression.
    return iter(trees)
def conll_file_demo(wf, c, lst, i):
    """Compute dependency shortest paths for event pairs listed in MAT.txt.

    Reads tab-separated annotations from ``MAT.txt``, keeping only rows
    whose two sentence ids (columns 0 and 7) differ and whose numeric id
    (characters 1-3 of column 0) equals ``i``.  For each kept row the
    shortest path between the two token indices (columns 1 and 4) is
    found on the undirected dependency graph, and one line is written to
    ``wf``::

        sent_id \\t [words on shortest path] \\t reltype=<reltype>

    :param wf: writable file object receiving the output lines
    :param c: mapping sent_id -> {token_index: word} used to recover words
    :param lst: list of CoNLL-formatted sentence strings
    :param i: document index used to filter MAT.txt rows
    """
    print('Mass conll_read demo...')
    graphs = [DependencyGraph(entry) for entry in lst if entry]
    ars = []
    sent_ids = []
    events = []
    reltypes = []
    with open("MAT.txt", 'r') as f1:
        for line in f1:
            cols = line.split("\t")
            # Keep only cross-sentence pairs belonging to document ``i``.
            if cols[0] != cols[7] and int(cols[0][1:4]) == i:
                ars.append((cols[1], cols[4]))
                sent_ids.append(cols[0])
                events.append(cols[3])
                reltypes.append(cols[14])
    for l, graph, ar, sent_id_tmp, event, reltype in zip(
            lst, graphs, ars, sent_ids, events, reltypes):
        reltype = reltype.strip()
        depgraph = DependencyGraph(l.strip())
        depgraph.tree = graph.tree()
        dep_nx = nxGraphWroot(depgraph)
        dep_nx = dep_nx.to_undirected()
        shortest_path = nx.shortest_path(dep_nx,
                                         source=int(ar[0]),
                                         target=int(ar[1]))
        # Distinct loop name: the original shadowed the parameter ``i``.
        # A stray per-node debug print of sent_id_tmp was removed.
        shortest_path_word = [c[sent_id_tmp][node] for node in shortest_path]
        write_line = sent_id_tmp + '\t' + str(
            shortest_path_word) + '\t' + "reltype=" + reltype + '\n'
        wf.write(write_line)
def parse(self, tokens):
    """
    Performs a projective dependency parse on the list of tokens using
    a chart-based, span-concatenation algorithm similar to Eisner (1996).

    :param tokens: The list of input tokens.
    :type tokens: list(str)
    :return: A list of parse trees.
    :rtype: list(Tree)
    """
    self._tokens = list(tokens)
    chart = []
    for i in range(0, len(self._tokens) + 1):
        chart.append([])
        for j in range(0, len(self._tokens) + 1):
            chart[i].append(ChartCell(i, j))
            if i == j + 1:
                chart[i][j].add(
                    DependencySpan(i - 1, i, i - 1, [-1], ['null']))
    # Combine adjacent spans bottom-up (span concatenation).
    for i in range(1, len(self._tokens) + 1):
        for j in range(i - 2, -1, -1):
            for k in range(i - 1, j, -1):
                for span1 in chart[k][j]._entries:
                    for span2 in chart[i][k]._entries:
                        for newspan in self.concatenate(span1, span2):
                            chart[i][j].add(newspan)
    # Dead ``graphs`` accumulator removed: it was filled but never used.
    trees = []
    for parse in chart[len(self._tokens)][0]._entries:
        conll_format = ""
        for i in range(len(tokens)):
            conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (
                i + 1, tokens[i], tokens[i], 'null', 'null', 'null',
                parse._arcs[i] + 1, 'null', '-', '-')
        dg = DependencyGraph(conll_format)
        trees.append(dg.tree())
    return trees
# Demo input: the same tweet once with Twitter boilerplate and once cleaned.
text_data = [
    '!!!!!!""@__BrighterDays: I can not just sit up and HATE on another bitch .. I got too much shit going on!""',
    'I can not just sit up and HATE on another bitch .. I got too much shit going on!'
]
try:
    # Parse the raw strings into two different language representation
    # formats.  The Stanford call is issued for its server-side effect;
    # only the CoNLL result is consumed below.
    result_stanford = tweebo_api.parse_stanford(text_data)
    result_conll = tweebo_api.parse_conll(text_data)
    nltk_result = add_root_node(result_conll)
    dep_graphs = [DependencyGraph(nltk_result[0]),
                  DependencyGraph(nltk_result[1])]
    parse_trees = [g.tree() for g in dep_graphs]
    # Print each tree followed by all of its subtrees.
    for parsed in parse_trees:
        print(parsed)
        for node in parsed.subtrees():
            print(node)
    # TODO test a multi-sentence string!!
except ServerError as e:
    print(f'{e}\n{e.message}')