import logging

from nltk.parse.dependencygraph import DependencyGraph

logger = logging.getLogger(__name__)


def construct_dependency_graph(sentence_graph):
    """Given node addresses and arcs, construct a dependency graph."""
    tokens = sentence_graph.tokens
    pos = sentence_graph.features["pos"]
    parse = sentence_graph.features["depparse"]
    dep_graph = DependencyGraph()
    dep_graph.remove_by_address(0)
    dep_graph.nodes[-1].update({
        'tag': 'TOP',
        'address': -1,
    })
    for head_address, address, relation in parse:
        node = {
            'tag': pos[address],
            'address': address,
            'head': head_address,
            'rel': relation,
            'word': tokens[address],
        }
        dep_graph.add_node(node)
    for head_address, address, _ in parse:
        dep_graph.add_arc(head_address, address)
    return dep_graph
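# A minimal usage sketch for construct_dependency_graph. The SentenceGraph
# namedtuple and the sample values below are hypothetical stand-ins; all that
# is assumed is what the function itself uses: a `tokens` lookup plus
# "pos"/"depparse" features addressable by token position, with "depparse"
# holding (head_address, address, relation) triples and head 0 marking the root.
from collections import namedtuple

SentenceGraph = namedtuple('SentenceGraph', ['tokens', 'features'])

example_sentence = SentenceGraph(
    tokens={1: 'dogs', 2: 'bark'},
    features={
        'pos': {1: 'NNS', 2: 'VBP'},
        'depparse': [(2, 1, 'nsubj'), (0, 2, 'root')],
    },
)
example_graph = construct_dependency_graph(example_sentence)
print(example_graph)  # node dicts keyed by address, with head/rel/deps filled in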
def parse(self, tokens, tags):
    """
    Parses a list of tokens in accordance with the MST parsing algorithm
    for non-projective dependency parses. Assumes that the tokens to
    be parsed have already been tagged and those tags are provided.
    Various scoring methods can be used by implementing the
    ``DependencyScorerI`` interface and passing it to the training
    algorithm.

    :type tokens: list(str)
    :param tokens: A list of words or punctuation to be parsed.
    :type tags: list(str)
    :param tags: A list of tags corresponding by index to the words in
        the tokens list.
    :return: An iterator of non-projective parses.
    :rtype: iter(DependencyGraph)
    """
    self.inner_nodes = {}

    # Initialize g_graph
    g_graph = DependencyGraph()
    for index, token in enumerate(tokens):
        g_graph.nodes[index + 1].update({
            'word': token,
            'tag': tags[index],
            'rel': 'NTOP',
            'address': index + 1,
        })

    # Fully connect non-root nodes in g_graph
    g_graph.connect_graph()
    original_graph = DependencyGraph()
    for index, token in enumerate(tokens):
        original_graph.nodes[index + 1].update({
            'word': token,
            'tag': tags[index],
            'rel': 'NTOP',
            'address': index + 1,
        })

    b_graph = DependencyGraph()
    c_graph = DependencyGraph()
    for index, token in enumerate(tokens):
        c_graph.nodes[index + 1].update({
            'word': token,
            'tag': tags[index],
            'rel': 'NTOP',
            'address': index + 1,
        })

    # Assign initial scores to g_graph edges
    self.initialize_edge_scores(g_graph)
    logger.debug(self.scores)

    # Initialize a list of unvisited vertices (by node address)
    unvisited_vertices = [vertex['address'] for vertex in c_graph.nodes.values()]

    # Iterate over unvisited vertices
    nr_vertices = len(tokens)
    betas = {}
    while unvisited_vertices:
        # Mark current node as visited
        current_vertex = unvisited_vertices.pop(0)
        logger.debug('current_vertex: %s', current_vertex)

        # Get corresponding node n_i to vertex v_i
        current_node = g_graph.get_by_address(current_vertex)
        logger.debug('current_node: %s', current_node)

        # Get best in-edge node b for current node
        best_in_edge = self.best_incoming_arc(current_vertex)
        betas[current_vertex] = self.original_best_arc(current_vertex)
        logger.debug('best in arc: %s --> %s', best_in_edge, current_vertex)

        # b_graph = Union(b_graph, b)
        for new_vertex in [current_vertex, best_in_edge]:
            b_graph.nodes[new_vertex].update({
                'word': 'TEMP',
                'rel': 'NTOP',
                'address': new_vertex,
            })
        b_graph.add_arc(best_in_edge, current_vertex)

        # Beta(current node) = b - stored for parse recovery
        # If b_graph contains a cycle, collapse it
        cycle_path = b_graph.contains_cycle()
        if cycle_path:
            # Create a new node v_n+1 with address = len(nodes) + 1
            new_node = {
                'word': 'NONE',
                'rel': 'NTOP',
                'address': nr_vertices + 1,
            }

            # c_graph = Union(c_graph, v_n+1)
            c_graph.add_node(new_node)

            # Collapse all nodes in cycle C into v_n+1
            self.update_edge_scores(new_node, cycle_path)
            self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
            for cycle_index in cycle_path:
                c_graph.add_arc(new_node['address'], cycle_index)
                # self.replaced_by[cycle_index] = new_node['address']

            self.inner_nodes[new_node['address']] = cycle_path

            # Add v_n+1 to list of unvisited vertices
            unvisited_vertices.insert(0, nr_vertices + 1)

            # Increment the node counter
            nr_vertices += 1

            # Remove cycle nodes from b_graph; B = B - cycle c
            for cycle_node_address in cycle_path:
                b_graph.remove_by_address(cycle_node_address)

        logger.debug('g_graph: %s', g_graph)
        logger.debug('b_graph: %s', b_graph)
        logger.debug('c_graph: %s', c_graph)
        logger.debug('Betas: %s', betas)
        logger.debug('replaced nodes %s', self.inner_nodes)

    # Recover parse tree
    logger.debug('Final scores: %s', self.scores)
    logger.debug('Recovering parse...')
    for i in range(len(tokens) + 1, nr_vertices + 1):
        betas[betas[i][1]] = betas[i]
    logger.debug('Betas: %s', betas)
    for node in original_graph.nodes.values():
        # TODO: It's dangerous to assume that deps is a dictionary,
        # because it's a default dictionary. Ideally, we should not be
        # concerned here with how dependencies are stored inside of a
        # dependency graph.
        node['deps'] = {}
    for i in range(1, len(tokens) + 1):
        original_graph.add_arc(betas[i][0], betas[i][1])

    logger.debug('Done.')
    yield original_graph
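# Usage sketch: this parse() matches the method of NLTK's
# ProbabilisticNonprojectiveParser (nltk.parse.nonprojectivedependencyparser).
# Assuming that context, the module's own DemoScorer can drive it end to end;
# DemoScorer hard-codes an edge-score matrix for a three-token input, so no
# real training data is needed.
from nltk.parse.nonprojectivedependencyparser import (
    DemoScorer,
    ProbabilisticNonprojectiveParser,
)

npp = ProbabilisticNonprojectiveParser()
npp.train([], DemoScorer())  # DemoScorer ignores the (empty) training graphs
for parse_graph in npp.parse(['v1', 'v2', 'v3'], [None, None, None]):
    print(parse_graph)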
def as_dependencygraph(self, keep_dummy_root=False, add_morph=True):
    '''Returns this tree as NLTK's DependencyGraph object.

    Note that this method constructs a 'zero_based' graph, where the
    counting of the words starts from 0 and the root index is -1
    (not 0, as in the Malt-TAB format).

    Parameters
    -----------
    add_morph : bool
        Specifies whether morphological information (word lemmas,
        part-of-speech tags, and features) should be added to the
        graph nodes. Note that even if **add_morph==True**,
        morphological information is only added if it is available
        via estnltk's layer token['analysis'];
        Default: True
    keep_dummy_root : bool
        Specifies whether the graph should include a dummy TOP / ROOT
        node, which does not refer to any word, and yet is the topmost
        node of the tree. If the dummy root node is not used, then the
        root node is the word node headed by -1;
        Default: False

    For more information about NLTK's DependencyGraph, see:
        http://www.nltk.org/_modules/nltk/parse/dependencygraph.html
    '''
    from nltk.parse.dependencygraph import DependencyGraph

    graph = DependencyGraph(zero_based=True)
    all_tree_nodes = [self] + self.get_children()
    #
    # 0) Fix the root
    #
    if keep_dummy_root:
        # Note: we have to re-construct the root node manually,
        # as DependencyGraph's current interface seems to provide
        # no easy/convenient means for fixing the root node;
        graph.nodes[-1] = graph.nodes[0]
        graph.nodes[-1].update({'address': -1})
        graph.root = graph.nodes[-1]
        del graph.nodes[0]
    #
    # 1) Update / add nodes of the graph
    #
    for child in all_tree_nodes:
        rel = 'xxx' if not child.labels else '|'.join(child.labels)
        address = child.word_id
        graph.nodes[address].update({
            'address': address,
            'word': child.text,
            'rel': rel,
        })
        if not keep_dummy_root and child == self:
            # If we do not keep the dummy root node, set this tree
            # as the root node
            graph.root = graph.nodes[address]
        if add_morph and child.morph:
            # Add morphological information, if possible
            lemmas = set(analysis[LEMMA] for analysis in child.morph)
            postags = set(analysis[POSTAG] for analysis in child.morph)
            feats = set(analysis[FORM] for analysis in child.morph)
            lemma = '|'.join(lemmas).replace(' ', '_')
            postag = '|'.join(postags).replace(' ', '_')
            feats = '|'.join(feats).replace(' ', '_')
            graph.nodes[address].update({
                'tag': postag,
                'ctag': postag,
                'feats': feats,
                'lemma': lemma,
            })
    #
    # 2) Update / add arcs of the graph
    #
    for child in all_tree_nodes:
        # Connect the children of the given word
        deps = [] if not child.children else [c.word_id for c in child.children]
        head_address = child.word_id
        for dep in deps:
            graph.add_arc(head_address, dep)
        if child.parent is None and keep_dummy_root:
            graph.add_arc(-1, head_address)
        # Connect the parent of the given node
        head = -1 if not child.parent else child.parent.word_id
        graph.nodes[head_address].update({'head': head})
    return graph
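# Usage sketch, assuming `tree` is an already-built estnltk syntax tree (an
# instance of the class this method belongs to); how such a tree is produced
# by estnltk's syntactic parsers is not shown here.
graph = tree.as_dependencygraph(keep_dummy_root=True)
# Word nodes are zero-based; the dummy TOP node sits at address -1.
for address in sorted(a for a in graph.nodes if a >= 0):
    node = graph.nodes[address]
    print(address, node['word'], node['rel'], '-> head:', node['head'])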