import logging
from collections import defaultdict

from nltk.parse.dependencygraph import DependencyGraph

logger = logging.getLogger(__name__)


def make_dep_tree(sent, deps):
    """Build a DependencyGraph from a tagged sentence and a list of
    (head, modifier, relation) triples."""
    # Adjacency map: head address -> list of modifier addresses
    adj = defaultdict(list)
    for h, m, _ in deps:
        adj[h].append(m)
    heads = {m: h for h, m, _ in deps}
    rels = {m: r for _, m, r in deps}

    n = len(sent["x"])
    pos = sent["pos"]
    x = sent["x"]

    nodelist = defaultdict(lambda: {
        "address": -1,
        "head": -1,
        "deps": [],
        "rel": "",
        "tag": "",
        "word": None,
    })
    for i in range(1, n):
        node = nodelist[i]
        node["address"] = i
        node["head"] = heads[i]
        node["deps"] = adj.get(i, [])
        node["tag"] = pos[i]
        node["word"] = x[i]
        node["rel"] = rels[i]

    g = DependencyGraph()
    g.get_by_address(0)["deps"] = adj.get(0, [])
    for node in nodelist.values():
        g.add_node(node)
    g.root = nodelist[adj[0][0]]
    return g
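
# A minimal usage sketch for make_dep_tree. The input layout is an
# assumption: sent["x"] holds a root placeholder at index 0 followed by the
# words, sent["pos"] is aligned with it, and each dep is a
# (head, modifier, relation) triple over those addresses, with 0 as the
# artificial root.
def _demo_make_dep_tree():
    sent = {
        "x": ["<ROOT>", "John", "saw", "Mary"],
        "pos": ["<ROOT>", "NNP", "VBD", "NNP"],
    }
    deps = [(0, 2, "root"), (2, 1, "nsubj"), (2, 3, "dobj")]
    tree = make_dep_tree(sent, deps)
    print(tree.root["word"])       # 'saw'
    print(tree.get_by_address(1))  # node for 'John', headed by 'saw'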
def parse(self, tokens, tags):
    """
    Parses a list of tokens in accordance with the MST parsing algorithm
    for non-projective dependency parses. Assumes that the tokens to be
    parsed have already been tagged and those tags are provided. Various
    scoring methods can be used by implementing the ``DependencyScorerI``
    interface and passing it to the training algorithm.

    :type tokens: list(str)
    :param tokens: A list of words or punctuation to be parsed.
    :type tags: list(str)
    :param tags: A list of tags corresponding by index to the words in the
        tokens list.
    :return: An iterator of non-projective parses.
    :rtype: iter(DependencyGraph)
    """
    self.inner_nodes = {}

    # Initialize g_graph
    g_graph = DependencyGraph()
    for index, token in enumerate(tokens):
        g_graph.nodes[index + 1].update({
            'word': token,
            'tag': tags[index],
            'rel': 'NTOP',
            'address': index + 1,
        })

    # Fully connect non-root nodes in g_graph
    g_graph.connect_graph()

    original_graph = DependencyGraph()
    for index, token in enumerate(tokens):
        original_graph.nodes[index + 1].update({
            'word': token,
            'tag': tags[index],
            'rel': 'NTOP',
            'address': index + 1,
        })

    b_graph = DependencyGraph()
    c_graph = DependencyGraph()
    for index, token in enumerate(tokens):
        c_graph.nodes[index + 1].update({
            'word': token,
            'tag': tags[index],
            'rel': 'NTOP',
            'address': index + 1,
        })

    # Assign initial scores to g_graph edges
    self.initialize_edge_scores(g_graph)
    logger.debug(self.scores)

    # Initialize a list of unvisited vertices (by node address)
    unvisited_vertices = [
        vertex['address'] for vertex in c_graph.nodes.values()
    ]

    # Iterate over unvisited vertices
    nr_vertices = len(tokens)
    betas = {}
    while unvisited_vertices:
        # Mark current node as visited
        current_vertex = unvisited_vertices.pop(0)
        logger.debug('current_vertex: %s', current_vertex)

        # Get corresponding node n_i to vertex v_i
        current_node = g_graph.get_by_address(current_vertex)
        logger.debug('current_node: %s', current_node)

        # Get best in-edge node b for current node
        best_in_edge = self.best_incoming_arc(current_vertex)
        # Beta(current node) = b - stored for parse recovery
        betas[current_vertex] = self.original_best_arc(current_vertex)
        logger.debug('best in arc: %s --> %s', best_in_edge, current_vertex)

        # b_graph = Union(b_graph, b)
        for new_vertex in [current_vertex, best_in_edge]:
            b_graph.nodes[new_vertex].update({
                'word': 'TEMP',
                'rel': 'NTOP',
                'address': new_vertex,
            })
        b_graph.add_arc(best_in_edge, current_vertex)

        # If b_graph contains a cycle, collapse it
        cycle_path = b_graph.contains_cycle()
        if cycle_path:
            # Create a new node v_n+1 with address = len(nodes) + 1
            new_node = {
                'word': 'NONE',
                'rel': 'NTOP',
                'address': nr_vertices + 1,
            }
            # c_graph = Union(c_graph, v_n+1)
            c_graph.add_node(new_node)

            # Collapse all nodes in cycle C into v_n+1
            self.update_edge_scores(new_node, cycle_path)
            self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
            for cycle_index in cycle_path:
                c_graph.add_arc(new_node['address'], cycle_index)
                # self.replaced_by[cycle_index] = new_node['address']

            self.inner_nodes[new_node['address']] = cycle_path

            # Add v_n+1 to list of unvisited vertices
            unvisited_vertices.insert(0, nr_vertices + 1)

            # Increment # of nodes counter
            nr_vertices += 1

            # Remove cycle nodes from b_graph; B = B - cycle c
            for cycle_node_address in cycle_path:
                b_graph.remove_by_address(cycle_node_address)

        logger.debug('g_graph: %s', g_graph)
        logger.debug('b_graph: %s', b_graph)
        logger.debug('c_graph: %s', c_graph)
        logger.debug('Betas: %s', betas)
        logger.debug('replaced nodes %s', self.inner_nodes)

    # Recover parse tree
    logger.debug('Final scores: %s', self.scores)
    logger.debug('Recovering parse...')
    for i in range(len(tokens) + 1, nr_vertices + 1):
        betas[betas[i][1]] = betas[i]
    logger.debug('Betas: %s', betas)
    for node in original_graph.nodes.values():
        # TODO: It's dangerous to assume that deps is a dictionary
        # because it's a default dictionary. Ideally, here we should not
        # be concerned how dependencies are stored inside of a dependency
        # graph.
        node['deps'] = {}
    for i in range(1, len(tokens) + 1):
        original_graph.add_arc(betas[i][0], betas[i][1])
    logger.debug('Done.')
    yield original_graph
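
# A minimal sketch, assuming the parse() above is NLTK's
# ProbabilisticNonprojectiveParser.parse. DemoScorer ships with
# nltk.parse.nonprojectivedependencyparser and hard-codes edge scores for
# this three-token example (after Hall 2007); a trained scorer such as
# NaiveBayesDependencyScorer could be passed to train() instead.
def _demo_nonprojective_parse():
    from nltk.parse.nonprojectivedependencyparser import (
        DemoScorer,
        ProbabilisticNonprojectiveParser,
    )

    npp = ProbabilisticNonprojectiveParser()
    npp.train([], DemoScorer())  # DemoScorer needs no training graphs
    for parse_graph in npp.parse(['v1', 'v2', 'v3'], [None, None, None]):
        print(parse_graph)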