def to_depgraph(self, rel=None):
    """Convert this tree into a ``DependencyGraph``.

    A fresh graph's node list is handed to ``self._to_depgraph`` (together
    with ``0`` and ``'ROOT'``) to be filled in.  Afterwards every node's
    ``'deps'`` list receives the addresses of the nodes whose ``'head'``
    equals that node's position in the node list.

    :param rel: Not used by this method; kept for interface compatibility.
    :return: The populated graph, whose ``root`` is the node at index 1.
    :raises IndexError: If the node list has fewer than two entries.
    """
    depgraph = DependencyGraph()
    nodelist = depgraph.nodelist

    self._to_depgraph(nodelist, 0, 'ROOT')

    # Group the dependents by head in one pass instead of rescanning the
    # whole node list for every node.  Index 0 is skipped as a dependent,
    # and list order is preserved within each group.
    dependents_by_head = {}
    for dependent in nodelist[1:]:
        dependents_by_head.setdefault(dependent['head'], []).append(dependent)

    # Only nodes that actually have dependents are touched.
    for node_addr, node in enumerate(nodelist):
        dependents = dependents_by_head.get(node_addr)
        if dependents:
            node['deps'].extend(dep['address'] for dep in dependents)

    depgraph.root = nodelist[1]
    return depgraph
def to_depgraph(self, rel=None):
    """Convert this tree into an NLTK ``DependencyGraph``.

    The graph's ``nodes`` mapping is handed to ``self._to_depgraph``
    (together with ``0`` and ``'ROOT'``) to be filled in.  Afterwards each
    node's ``'deps'`` list receives the addresses of all nodes whose
    ``'rel'`` is not ``'TOP'`` and whose ``'head'`` equals that node's key.

    :param rel: Not used by this method; kept for interface compatibility.
    :return: The populated graph, whose ``root`` is ``nodes[1]``.
    """
    from nltk.parse.dependencygraph import DependencyGraph

    depgraph = DependencyGraph()
    nodes = depgraph.nodes

    self._to_depgraph(nodes, 0, 'ROOT')

    # Group the non-TOP nodes by head in one pass instead of re-filtering
    # and rescanning every node for every address.  Iteration order of
    # ``nodes`` is preserved within each group.
    dependents_by_head = {}
    for dependent in nodes.values():
        if dependent['rel'] != 'TOP':
            dependents_by_head.setdefault(dependent['head'], []).append(dependent)

    # Only nodes that actually have dependents are touched.
    for address, node in nodes.items():
        dependents = dependents_by_head.get(address)
        if dependents:
            node['deps'].extend(dep['address'] for dep in dependents)

    depgraph.root = nodes[1]
    return depgraph
def to_depgraph(self, rel=None):
    """Convert this tree into an NLTK ``DependencyGraph``.

    The graph's ``nodelist`` is handed to ``self._to_depgraph`` (together
    with ``0`` and ``"ROOT"``) to be filled in.  Afterwards each node's
    ``"deps"`` list receives the addresses of the nodes whose ``"head"``
    equals that node's index in the list.

    :param rel: Not used by this method; kept for interface compatibility.
    :return: The populated graph, whose ``root`` is the node at index 1.
    :raises IndexError: If the node list has fewer than two entries.
    """
    from nltk.parse.dependencygraph import DependencyGraph

    depgraph = DependencyGraph()
    nodelist = depgraph.nodelist

    self._to_depgraph(nodelist, 0, "ROOT")

    # One grouping pass replaces the former nested scan (every node used
    # to walk the full list looking for its dependents).  Index 0 is never
    # treated as a dependent; list order is kept within each group.
    dependents_by_head = {}
    for dependent in nodelist[1:]:
        dependents_by_head.setdefault(dependent["head"], []).append(dependent)

    # Only nodes that actually have dependents are touched.
    for node_addr, node in enumerate(nodelist):
        dependents = dependents_by_head.get(node_addr)
        if dependents:
            node["deps"].extend(dep["address"] for dep in dependents)

    depgraph.root = nodelist[1]
    return depgraph
def to_depgraph(self, rel=None):
    """Convert this tree into an NLTK ``DependencyGraph``.

    The graph's ``nodes`` mapping is handed to ``self._to_depgraph``
    (together with ``0`` and ``'ROOT'``) to be filled in.  Afterwards each
    node's ``'deps'`` mapping is extended: for every node whose ``'rel'``
    is not ``'TOP'`` and whose ``'head'`` equals this node's key, the
    dependent's address is appended under the dependent's relation.

    :param rel: Not used by this method; kept for interface compatibility.
    :return: The populated graph, whose ``root`` is ``nodes[1]``.
    """
    from nltk.parse.dependencygraph import DependencyGraph

    depgraph = DependencyGraph()
    nodes = depgraph.nodes

    self._to_depgraph(nodes, 0, 'ROOT')

    # Group the non-TOP nodes by head in one pass instead of re-filtering
    # and rescanning every node for every address.  Iteration order of
    # ``nodes`` is preserved within each group.
    dependents_by_head = {}
    for dependent in nodes.values():
        if dependent['rel'] != 'TOP':
            dependents_by_head.setdefault(dependent['head'], []).append(dependent)

    for address, node in nodes.items():
        for dependent in dependents_by_head.get(address, ()):
            relation = dependent['rel']
            node['deps'].setdefault(relation, []).append(dependent['address'])

    depgraph.root = nodes[1]
    return depgraph
def to_depgraph(self, rel=None):
    """Convert this tree into an NLTK ``DependencyGraph``.

    ``self._to_depgraph`` fills the graph's ``nodes`` mapping (it is called
    with the mapping, ``0`` and ``"ROOT"``).  Each node's ``"deps"``
    mapping is then populated: the address of every non-``"TOP"`` node
    whose ``"head"`` equals this node's key is appended to the list stored
    under that dependent's ``"rel"``.

    :param rel: Not used by this method; kept for interface compatibility.
    :return: The populated graph, whose ``root`` is ``nodes[1]``.
    """
    from nltk.parse.dependencygraph import DependencyGraph

    depgraph = DependencyGraph()
    nodes = depgraph.nodes

    self._to_depgraph(nodes, 0, "ROOT")

    # Index dependents by head once; the previous nested loop compared
    # every node against every address.  Iteration order of ``nodes`` is
    # preserved within each group.
    dependents_by_head = {}
    for dependent in nodes.values():
        if dependent["rel"] != "TOP":
            dependents_by_head.setdefault(dependent["head"], []).append(dependent)

    for address, node in nodes.items():
        for dependent in dependents_by_head.get(address, ()):
            relation = dependent["rel"]
            node["deps"].setdefault(relation, []).append(dependent["address"])

    depgraph.root = nodes[1]
    return depgraph
def tree_to_graph(tree):
    '''Converts a tree structure to a graph structure.

    This is for the accuracy() function.

    Args:
        tree: the tree to convert

    Returns:
        a graph representing the tree. Note that this graph is really only
        usable in accuracy() (the only attribute we bother setting is
        'head').
    '''
    # Nodes are dictionaries, which are mutable, so we copy them to be able
    # to change attributes without changing the original nodes.
    tree2 = tree_map(copy.copy, tree)

    # Set the head attribute of each node according to the tree structure:
    # the top element gets head 0, every child gets its parent's address.
    def set_heads(subtree, parent=0):
        node = label(subtree)
        node['head'] = parent
        if isinstance(subtree, Tree):
            for child in subtree:
                set_heads(child, node['address'])

    set_heads(tree2)

    # Collect all elements ("labels") of the tree into a flat list.
    def all_elems(subtree):
        elems = [label(subtree)]
        if isinstance(subtree, Tree):
            for child in subtree:
                elems += all_elems(child)
        return elems

    dg = DependencyGraph()
    dg.root = dg.nodelist[0]

    elems = all_elems(tree2)
    # nodelist should be ordered by address
    elems.sort(key=lambda t: label(t)['address'])
    dg.nodelist += elems
    return dg
def cabocha2depgraph(t):
    """Build a ``DependencyGraph`` from a chunked parse given as text.

    The input is read line by line (presumably CaboCha lattice output,
    judging by the function name -- confirm against the caller):

    * A line starting with ``*`` and not ending with ``*`` opens a new
      chunk.  Its third space-separated cell must match
      ``<integer><A|D|I|P>``: the integer is the index of the chunk this
      one depends on (``-1`` marks the root) and the letter is stored as
      the chunk's ``'rel'``.
    * A line starting with ``EOS`` is ignored.
    * Any other non-blank line is a morpheme of the most recent chunk:
      tab-separated, first cell the surface form, second cell a
      comma-separated feature list.  The surface is appended to the
      chunk's ``'word'`` list and the feature tuple to its ``'tag'`` list.

    :param t: The parser output as a single string.
    :return: The graph; chunk ``i`` is stored at ``dg.nodelist[i]``.
    """
    dg = DependencyGraph()
    i = 0
    for line in t.splitlines():
        if not line.strip():
            # Blank lines carry no information; previously they crashed
            # on the missing tab-separated feature cell.
            continue

        if line.startswith("*") and not line.endswith('*'):
            # start of bunsetsu and not the real *
            cells = line.strip().split(" ", 3)
            m = re.match(r"([\-0-9]*)([ADIP])", cells[2])
            dep_parent = int(m.group(1))

            # Make sure both this chunk and its parent exist *before*
            # indexing the node list.  Padding used to happen after
            # ``dg.nodelist[i]``, so a chunk no earlier chunk pointed at
            # raised IndexError.
            while len(dg.nodelist) < max(i, dep_parent) + 1:
                dg.nodelist.append({'word': [], 'deps': [], 'tag': []})

            node = dg.nodelist[i]
            node.update({
                'address': i,
                'rel': m.group(2),  # dep_type
                'word': [],
                'tag': [],
                'str': ""
            })

            if dep_parent == -1:
                dg.root = node
            else:
                dg.nodelist[dep_parent]['deps'].append(i)

            i += 1
        elif not line.startswith("EOS"):
            # normal morph
            cells = line.strip().split("\t")
            morph = (cells[0], tuple(cells[1].split(',')))
            dg.nodelist[i - 1]['word'].append(morph[0])
            dg.nodelist[i - 1]['tag'].append(morph[1])

    return dg
def make_dep_tree(sent, deps):
    """Build a ``DependencyGraph`` for a sentence from dependency triples.

    :param sent: Mapping with ``"x"`` (the words) and ``"pos"`` (the tags),
        both indexable by token position.  Position 0 is skipped when
        nodes are created.
    :param deps: Iterable of ``(head, modifier, relation)`` triples.
    :return: A graph with one node per position ``1 .. len(sent["x"]) - 1``.
        Its ``root`` is the node of the first dependent listed for
        address 0.
    :raises KeyError: If some position in ``1 .. n - 1`` never occurs as a
        modifier in ``deps``.
    """
    # head address -> list of modifier addresses
    adj = merge_with(cons, [], *[{h: [m]} for h, m, _ in deps])
    heads = {m: h for h, m, _ in deps}
    rels = {m: r for _, m, r in deps}

    words = sent["x"]
    pos = sent["pos"]
    n = len(words)

    nodelist = defaultdict(lambda: {"address": -1, "head": -1, "deps": [],
                                    "rel": "", "tag": "", "word": None})
    for i in range(1, n):
        node = nodelist[i]
        node["address"] = i
        node["head"] = heads[i]
        # ``in`` replaces dict.has_key(), which was removed in Python 3.
        node["deps"] = adj[i] if i in adj else []
        node["tag"] = pos[i]
        node["word"] = words[i]
        node["rel"] = rels[i]

    g = DependencyGraph()
    g.get_by_address(0)["deps"] = adj[0] if 0 in adj else []
    for node in nodelist.values():
        g.add_node(node)

    # NOTE(review): unlike the guarded lookup above, this assumes address 0
    # has at least one dependent -- confirm that callers guarantee it.
    g.root = nodelist[adj[0][0]]
    return g
def parse(self, tokens):
    """
    Parses the input tokens with respect to the parser's grammar.  Parsing
    is accomplished by representing the search-space of possible parses as
    a fully-connected directed graph.  Arcs that would lead to ungrammatical
    parses are removed and a lattice is constructed of length n, where n is
    the number of input tokens, to represent all possible grammatical
    traversals.  All possible paths through the lattice are then enumerated
    to produce the set of non-projective parses.

    :param tokens: A list of tokens to parse.
    :type tokens: list(str)
    :return: An iterator of non-projective parses.
    :rtype: iter(DependencyGraph)
    """
    # Create graph representation of tokens
    self._graph = DependencyGraph()

    for index, token in enumerate(tokens):
        self._graph.nodes[index] = {
            'word': token,
            'deps': [],
            'rel': 'NTOP',
            'address': index,
        }

    # For every node, store the addresses of all nodes whose word differs
    # from its own and for which self._grammar.contains(head word,
    # dependent word) holds.
    for head_node in self._graph.nodes.values():
        deps = []
        for dep_node in self._graph.nodes.values():
            if (self._grammar.contains(head_node['word'], dep_node['word'])
                    and head_node['word'] != dep_node['word']):
                deps.append(dep_node['address'])
        head_node['deps'] = deps

    # Create lattice of possible heads: possible_heads[i] lists every index
    # j != i with self._grammar.contains(tokens[j], tokens[i]); tokens with
    # no candidate head are collected in ``roots``.
    roots = []
    possible_heads = []
    for i, word in enumerate(tokens):
        heads = []
        for j, head in enumerate(tokens):
            if (i != j) and self._grammar.contains(head, word):
                heads.append(j)
        if len(heads) == 0:
            roots.append(i)
        possible_heads.append(heads)

    # Set roots to attempt.  With two or more headless tokens nothing below
    # runs, so no parse is yielded.
    if len(roots) < 2:
        if len(roots) == 0:
            for i in range(len(tokens)):
                roots.append(i)

        # Traverse lattice.  The loop variable ``root`` is not used inside
        # the body; the traversal is simply repeated once per entry.
        analyses = []
        for root in roots:
            # ``stack`` remembers [position, head] choices made where more
            # than one head was available, so they can be revisited.
            stack = []
            # analysis[i] becomes the head index chosen for token i, or -1
            # when token i has no candidate head.
            analysis = [[] for i in range(len(possible_heads))]
            i = 0
            forward = True
            while i >= 0:
                if forward:
                    if len(possible_heads[i]) == 1:
                        analysis[i] = possible_heads[i][0]
                    elif len(possible_heads[i]) == 0:
                        analysis[i] = -1
                    else:
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                if not forward:
                    index_on_stack = False
                    for stack_item in stack:
                        if stack_item[0] == i:
                            index_on_stack = True
                    orig_length = len(possible_heads[i])

                    if index_on_stack and orig_length == 0:
                        # All candidates at this position were tried: give
                        # them back to the lattice and keep moving backward.
                        # ``range`` replaces the Python-2-only ``xrange``.
                        for j in range(len(stack) - 1, -1, -1):
                            stack_item = stack[j]
                            if stack_item[0] == i:
                                possible_heads[i].append(stack.pop(j)[1])

                    elif index_on_stack and orig_length > 0:
                        # Untried candidates remain: take the next one and
                        # resume the forward sweep.
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                        forward = True

                if i + 1 == len(possible_heads):
                    # Reached the last position: record a copy of the
                    # current assignment and start backtracking.
                    analyses.append(analysis[:])
                    forward = False
                if forward:
                    i += 1
                else:
                    i -= 1

        # Filter parses
        # ensure 1 root, every thing has 1 head
        for analysis in analyses:
            if analysis.count(-1) > 1:
                # there are several root elements!
                continue

            graph = DependencyGraph()
            # NOTE(review): list.index raises ValueError when the analysis
            # contains no -1 (i.e. every token received a head) -- confirm
            # whether that case needs handling.
            graph.root = graph.nodes[analysis.index(-1) + 1]

            # Graph addresses are 1-based: token k lives at address k + 1
            # and head index -1 maps to address 0.
            for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1):
                head_address = head_index + 1

                node = graph.nodes[address]
                node.update({
                    'word': token,
                    'address': address,
                })

                if head_address == 0:
                    rel = 'ROOT'
                else:
                    rel = ''
                graph.nodes[head_index + 1]['deps'][rel].append(address)

            # TODO: check for cycles
            yield graph
def parse(self, tokens):
    """
    Parses the input tokens with respect to the parser's grammar.  Parsing
    is accomplished by representing the search-space of possible parses as
    a fully-connected directed graph.  Arcs that would lead to ungrammatical
    parses are removed and a lattice is constructed of length n, where n is
    the number of input tokens, to represent all possible grammatical
    traversals.  All possible paths through the lattice are then enumerated
    to produce the set of non-projective parses.

    :param tokens: A list of tokens to parse.
    :type tokens: list(str)
    :return: An iterator of non-projective parses.
    :rtype: iter(DependencyGraph)
    """
    # Create graph representation of tokens
    self._graph = DependencyGraph()

    for index, token in enumerate(tokens):
        self._graph.nodes[index] = {
            'word': token,
            'deps': [],
            'rel': 'NTOP',
            'address': index,
        }

    # For every node, store the addresses of all nodes whose word differs
    # from its own and for which self._grammar.contains(head word,
    # dependent word) holds.
    for head_node in self._graph.nodes.values():
        deps = []
        for dep_node in self._graph.nodes.values():
            if (
                self._grammar.contains(head_node['word'], dep_node['word'])
                and head_node['word'] != dep_node['word']
            ):
                deps.append(dep_node['address'])
        head_node['deps'] = deps

    # Create lattice of possible heads: possible_heads[i] lists every index
    # j != i with self._grammar.contains(tokens[j], tokens[i]); tokens with
    # no candidate head are collected in ``roots``.
    roots = []
    possible_heads = []
    for i, word in enumerate(tokens):
        heads = []
        for j, head in enumerate(tokens):
            if (i != j) and self._grammar.contains(head, word):
                heads.append(j)
        if len(heads) == 0:
            roots.append(i)
        possible_heads.append(heads)

    # Set roots to attempt.  With two or more headless tokens nothing below
    # runs, so no parse is yielded.
    if len(roots) < 2:
        if len(roots) == 0:
            for i in range(len(tokens)):
                roots.append(i)

        # Traverse lattice.  The loop variable ``root`` is not used inside
        # the body; the traversal is simply repeated once per entry.
        analyses = []
        for root in roots:
            # ``stack`` remembers [position, head] choices made where more
            # than one head was available, so they can be revisited.
            stack = []
            # analysis[i] becomes the head index chosen for token i, or -1
            # when token i has no candidate head.
            analysis = [[] for i in range(len(possible_heads))]
            i = 0
            forward = True
            while i >= 0:
                if forward:
                    if len(possible_heads[i]) == 1:
                        analysis[i] = possible_heads[i][0]
                    elif len(possible_heads[i]) == 0:
                        analysis[i] = -1
                    else:
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                if not forward:
                    index_on_stack = False
                    for stack_item in stack:
                        if stack_item[0] == i:
                            index_on_stack = True
                    orig_length = len(possible_heads[i])

                    if index_on_stack and orig_length == 0:
                        # All candidates at this position were tried: give
                        # them back to the lattice and keep moving backward.
                        for j in range(len(stack) - 1, -1, -1):
                            stack_item = stack[j]
                            if stack_item[0] == i:
                                possible_heads[i].append(stack.pop(j)[1])

                    elif index_on_stack and orig_length > 0:
                        # Untried candidates remain: take the next one and
                        # resume the forward sweep.
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                        forward = True

                if i + 1 == len(possible_heads):
                    # Reached the last position: record a copy of the
                    # current assignment and start backtracking.
                    analyses.append(analysis[:])
                    forward = False
                if forward:
                    i += 1
                else:
                    i -= 1

        # Filter parses
        # ensure 1 root, every thing has 1 head
        for analysis in analyses:
            if analysis.count(-1) > 1:
                # there are several root elements!
                continue

            graph = DependencyGraph()
            # NOTE(review): list.index raises ValueError when the analysis
            # contains no -1 (i.e. every token received a head) -- confirm
            # whether that case needs handling.
            graph.root = graph.nodes[analysis.index(-1) + 1]

            # Graph addresses are 1-based: token k lives at address k + 1
            # and head index -1 maps to address 0.
            for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1):
                head_address = head_index + 1

                node = graph.nodes[address]
                node.update(
                    {
                        'word': token,
                        'address': address,
                    }
                )

                if head_address == 0:
                    rel = 'ROOT'
                else:
                    rel = ''
                graph.nodes[head_index + 1]['deps'][rel].append(address)

            # TODO: check for cycles
            yield graph
def as_dependencygraph(self, keep_dummy_root=False, add_morph=True):
    ''' Returns this tree as NLTK's DependencyGraph object.

        Note that this method constructs 'zero_based' graph,
        where counting of the words starts from 0 and the
        root index is -1 (not 0, as in Malt-TAB format);

        Parameters
        -----------
        add_morph : bool
            Specifies whether the morphological information
            (information about word lemmas, part-of-speech, and
            features) should be added to graph nodes.
            Note that even if **add_morph==True**, morphological
            information is only added if it is available via
            estnltk's layer  token['analysis'];
            When a word has several analyses, the distinct values are
            joined with '|' in sorted order and spaces are replaced
            by '_';
            Default: True
        keep_dummy_root : bool
            Specifies whether the graph should include a dummy
            TOP / ROOT node, which does not refer to any word,
            and yet is the topmost node of the tree.
            If the dummy root node is not used, then the root
            node is the word node headed by -1;
            Default: False

        For more information about NLTK's DependencyGraph, see:
         http://www.nltk.org/_modules/nltk/parse/dependencygraph.html
    '''
    from nltk.parse.dependencygraph import DependencyGraph
    graph = DependencyGraph(zero_based=True)
    all_tree_nodes = [self] + self.get_children()
    #
    # 0) Fix the root
    #
    if keep_dummy_root:
        #  Note: we have to re-construct the root node manually,
        #  as DependencyGraph's current interface seems to provide
        #  no easy/convenient means for fixing the root node;
        graph.nodes[-1] = graph.nodes[0]
        graph.nodes[-1].update({'address': -1})
        graph.root = graph.nodes[-1]
    # Address 0 must be free for the first word of the zero-based graph.
    del graph.nodes[0]
    #
    # 1) Update / Add nodes of the graph
    #
    for child in all_tree_nodes:
        rel = 'xxx' if not child.labels else '|'.join(child.labels)
        address = child.word_id
        graph.nodes[address].update({
            'address': address,
            'word': child.text,
            'rel': rel,
        })
        if not keep_dummy_root and child == self:
            # If we do not keep the dummy root node, set this tree
            # as the root node
            graph.root = graph.nodes[address]
        if add_morph and child.morph:
            # Add morphological information, if possible.  Values are
            # sorted so the joined strings do not depend on set
            # iteration order.
            lemmas = {analysis[LEMMA] for analysis in child.morph}
            postags = {analysis[POSTAG] for analysis in child.morph}
            forms = {analysis[FORM] for analysis in child.morph}
            lemma = '|'.join(sorted(lemmas)).replace(' ', '_')
            postag = '|'.join(sorted(postags)).replace(' ', '_')
            feats = '|'.join(sorted(forms)).replace(' ', '_')
            graph.nodes[address].update({
                # The key used to be 'tag ' (trailing space), which left
                # the node's actual 'tag' field unset.
                'tag': postag,
                'ctag': postag,
                'feats': feats,
                'lemma': lemma,
            })
    #
    # 2) Update / Add arcs of the graph
    #
    for child in all_tree_nodes:
        # Connect children of given word
        deps = [] if not child.children else [c.word_id for c in child.children]
        head_address = child.word_id
        for dep in deps:
            graph.add_arc(head_address, dep)
        if child.parent is None and keep_dummy_root:
            graph.add_arc(-1, head_address)
        # Connect the parent of given node
        head = -1 if not child.parent else child.parent.word_id
        graph.nodes[head_address].update({
            'head': head,
        })
    return graph