def compose_edge(self, edge, max_size, minimal_only=False, aligned_nodes=()):
    """Add every composition of `edge` with its tails' child edges.

    Each tail of `edge` may either be kept as it is or be replaced by the
    tails of one of the edges returned by ``tail.get_child_edges(self)``.
    Every combination of those per-tail choices that has at most
    `max_size` tails, and whose tails differ from ``edge.tails``, is
    turned into a new ``Edge`` with the same head and passed to
    ``self.add`` together with its weight: ``self.weights[edge]``
    multiplied by the weight of every child edge that was composed in.
    The new edge's ``composed_edges`` attribute lists `edge` followed by
    those child edges.

    Args:
        edge: The edge to expand; must be a key of ``self.weights``.
        max_size: Maximum number of tails a composed edge may have.
        minimal_only: When true, tails found in `aligned_nodes` are never
            expanded, and a child edge is only used if each of its tails
            is in `aligned_nodes` or has ``is_terminal_flag`` set.
        aligned_nodes: Container of aligned nodes; only consulted when
            `minimal_only` is true.

    Returns:
        None. Results are registered through ``self.add``.
    """
    tail_count = len(edge.tails)
    # Exactly one choice list is built per tail, so an edge that already has
    # more than max_size tails can never produce anything; stop before doing
    # any work.
    if tail_count > max_size:
        return

    # Replacing a single tail by k tails gives tail_count - 1 + k tails, so
    # a child edge with more tails than this can never fit within max_size.
    max_child_tails = max_size - tail_count + 1

    tail_choices = []
    for tail in edge.tails:
        # Each choice is (replacement tails, weight factor, composed edge).
        # The first choice always keeps the tail untouched.
        choices = [((tail,), 1.0, None)]

        # Composing through virtual nodes leads to counting a lot of things
        # twice, and it never adds any extra edges either.  In minimal mode
        # we additionally refuse to compose through aligned nodes.
        expandable = not tail.is_virtual and (
            not minimal_only or tail not in aligned_nodes)
        if expandable:
            for child_edge in tail.get_child_edges(self):
                # In minimal mode every tail of the child edge must be an
                # aligned node or a terminal.
                if minimal_only and not all(
                        t in aligned_nodes or t.is_terminal_flag
                        for t in child_edge.tails):
                    continue
                if len(child_edge.tails) <= max_child_tails:
                    choices.append(
                        (child_edge.tails, self.weights[child_edge], child_edge))
        tail_choices.append(choices)

    # enumerate_subsets is defined elsewhere; presumably it yields every way
    # of picking one entry from each of the per-tail choice lists.
    for chosen_child_edges in enumerate_subsets(tail_choices):
        new_tails = []
        new_weight = self.weights[edge]
        composed_edges = [edge]
        for tails, weight, internal_edge in chosen_child_edges:
            assert len(tails) >= 1
            new_tails.extend(tails)
            new_weight *= weight
            if internal_edge is not None:
                composed_edges.append(internal_edge)

        if len(new_tails) > max_size:
            continue

        new_edge = Edge(edge.head, tuple(new_tails), True)
        new_edge.composed_edges = tuple(composed_edges)
        # Keeping every tail as it is just reproduces the original edge.
        if edge.tails != new_edge.tails:
            self.add(new_edge, new_weight)
def add_experimental_virtual_edges(target_tree, source_tree, s2t_node_alignments, t2s_node_alignments, target_terminals):
    """Add composed edges to `source_tree` and virtual edges to `target_tree`.

    First, for every source node, the minimal ways of representing it are
    collected: a terminal or a node aligned to exactly one target node
    represents itself, while any other node is represented by combining the
    representations of the tails of each of its edges.

    Then, for every source edge and every target node its head is aligned
    to, each combination of tail representations yields a source edge that
    skips the intermediate nodes (added to `source_tree` when it differs
    from the original edge), and each combination of target alignments of
    its non-terminal tails yields an edge that is added to `target_tree`.
    Positions of the target head's span that no chosen target tail covers
    are filled with the corresponding entries of `target_terminals`.

    Args:
        target_tree: Receives the virtual edges through ``add``.
        source_tree: Provides ``topsort()``, ``head_index`` and ``edges``,
            and receives the composed edges through ``add``.
        s2t_node_alignments: Maps each source node to a sized collection of
            the target nodes it is aligned to.
        t2s_node_alignments: Unused; kept so existing callers keep working.
        target_terminals: Target terminal nodes, indexed by span position.

    Returns:
        None. Both trees are modified in place.
    """
    def project(source_node):
        # A source node projects onto a target node only when it is aligned
        # to exactly one; zero or several alignments give None.
        alignments = s2t_node_alignments[source_node]
        return list(alignments)[0] if len(alignments) == 1 else None

    # derivations[source_node] holds the minimal way(s) of representing
    # source_node using minimal constituents, as (nodes, skipped edges) pairs.
    # For terminals and well-aligned NTs there is only one such way: the node
    # itself.  For NTs that are not node aligned, we find sets of minimally
    # aligned children that cover source_node.
    # NOTE(review): this presumably relies on topsort() yielding tails before
    # their heads, so derivations[tail] already exists -- confirm.
    derivations = {}
    for source_node in source_tree.topsort():
        derivations[source_node] = []
        if source_node.is_terminal_flag or project(source_node) is not None:
            derivations[source_node].append(((source_node,), []))
            continue

        for edge in source_tree.head_index[source_node]:
            for subset in enumerate_subsets([derivations[tail] for tail in edge.tails]):
                derivation = reduce(operator.add, [nodes for nodes, _ in subset])
                skipped_edges = reduce(operator.add, [edges for _, edges in subset])
                for node in derivation:
                    assert len(s2t_node_alignments[node]) >= 1 or node.is_terminal_flag
                derivations[source_node].append((derivation, [edge] + skipped_edges))

    # Iterate over a copy: source_tree.add() is called inside the loop.
    for edge in source_tree.edges.copy():
        source_head = edge.head
        for target_head in s2t_node_alignments[source_head]:
            for source_subset in enumerate_subsets([derivations[tail] for tail in edge.tails]):
                source_tails = reduce(operator.add, [nodes for nodes, _ in source_subset])
                skipped_edges = reduce(operator.add, [edges for _, edges in source_subset])

                composed_edge = Edge(source_head, source_tails)
                if len(skipped_edges) > 0:
                    composed_edge.composed_edges = tuple([edge] + skipped_edges)
                    composed_edge.is_composed = True
                    assert len(edge.composed_edges) == 0
                if composed_edge != edge:
                    assert len(skipped_edges) > 0
                    source_tree.add(composed_edge)

                aligned_targets = [list(s2t_node_alignments[tail])
                                   for tail in source_tails
                                   if not tail.is_terminal_flag]
                for target_subset in enumerate_subsets(aligned_targets):
                    # Work on a private copy so the object handed out by
                    # enumerate_subsets is never modified.
                    target_tails = list(target_subset)
                    # Fill every position of the target head's span that no
                    # tail chosen so far covers with its terminal.
                    for i in range(*target_head.span):
                        is_included = False
                        for tail in target_tails:
                            if i >= tail.span.start and i < tail.span.end:
                                is_included = True
                                break
                        if not is_included:
                            target_tails.append(target_terminals[i])
                    target_tails = tuple(sorted(target_tails, key=lambda node: node.span.start))
                    target_tree.add(Edge(target_head, target_tails))