def read_tree_file(self, treefile):
    """Read an indented derivation tree from a file and return a Hypergraph.

    Only lines that contain '|||' (parsed with ``Rule.fromstr``) or whose
    stripped text is exactly 'None' are used; every other line is ignored.
    The number of leading spaces encodes the depth: a line indented by
    exactly two more spaces than the current depth makes the previously
    read node the parent of the nodes that follow.

    Args:
        treefile: path of the tree file to read.

    Returns:
        The topologically sorted Hypergraph built from the node at the
        bottom of the parent stack.

    Raises:
        ValueError: if the first tree line is already indented by one
            level, so there is no node that could be its parent.
        IndexError: if no node is left on the parent stack once the file
            has been read (e.g. no tree line was found).
    """
    indent_level = 2
    current_indent = 0
    node = None
    stack = []
    # 'with' closes the file even when parsing raises half-way through.
    with open(treefile) as f:
        for line in f:
            # TODO: why are there None node lines?
            is_none_line = line.strip() == 'None'
            if '|||' not in line and not is_none_line:
                continue
            # a tree line always holds a non-space character, so this is
            # the count of leading spaces
            indent = len(line) - len(line.lstrip(' '))
            # TODO: hack. None lines have wrong indent in max derivation
            # viterbi trees (the `indent -= 2` correction is disabled)
            if indent == current_indent + indent_level:
                if node is None:
                    raise ValueError(
                        'first tree line of %s is indented; it has no parent'
                        % treefile)
                # one level deeper: the previous node becomes the parent
                current_indent = indent
                stack.append(node)
            elif indent < current_indent:
                # back up: drop one parent per level we left
                npop = (current_indent - indent) // indent_level
                current_indent = indent
                for _ in range(npop):
                    stack.pop()
            node = Node()
            if stack:
                stack[-1].incoming[0].add_tail(node)
            # TODO: why are there None nodes? these nodes have incoming
            # edges. they are just a nonterminal as a leaf.
            if not is_none_line:
                rule = Rule()
                rule.fromstr(line)
                edge = PhraseHGEdge()
                edge.rule = rule
                node.add_incoming(edge)
    if not stack:
        raise IndexError('no parent node was read from %s' % treefile)
    hg = Hypergraph(stack[0])
    hg.topo_sort()
    return hg
# Example #2
# 0
    def run(self):
        """Decode ``self.line`` and write the translation and statistics.

        In order: updates the per-sentence grammars, opens the log file
        ``<FLAGS.run_dir>/log_<self.suffix>`` as ``self.flog``, runs the
        decoder selected by ``FLAGS.decoding_method``, stores the result in
        ``self.out`` (plus ``self.kbest`` and ``self.hg`` when a goal item
        was built), writes the statistics to the log and finally calls
        ``write_output_file`` and, if requested, ``write_kbest_to_file``.
        The log file is closed on every exit path.

        Raises:
            AssertionError: if ``FLAGS.decoding_method`` is not one of
                'agenda', 'cyk' or 'earley'.
        """
        # update per-sentence grammars, if there's any
        for g in self.grammars:
            g.update(self.id)
        self.flog = open('%s/%s_%s' % (FLAGS.run_dir, 'log', self.suffix), 'w')
        # try/finally so the log is closed even if decoding or writing raises
        try:
            if FLAGS.show_time:
                self.flog.write('running on %s\n\n' % socket.gethostname())
                self.flog.flush()

            fwords = self.line.split()
            if FLAGS.preprocess:
                # remember, per source position, the replacement of each
                # '$number' / '$date' placeholder (taken from self.special
                # in order of appearance)
                self.fidx2replacement = {}
                j = 0
                for i, token in enumerate(fwords):
                    if token in ('$number', '$date'):
                        self.fidx2replacement[i] = self.special[j][1]
                        j += 1

            self.flog.write('[%s][%s words] %s\n' %
                            (self.id, len(fwords), self.line))

            decoder = Decoder(fwords, self.grammars, self.features)

            begin_time = time()
            method = FLAGS.decoding_method
            if method == 'agenda':
                item = decoder.decode()
            elif method == 'cyk':
                item = decoder.decode_cyk()
            elif method == 'earley':
                item = decoder.decode_earley()
            else:
                # explicit raise: an `assert` statement disappears under -O
                # and would leave `item` unbound
                raise AssertionError('"%s" not valid decoding option'
                                     % method)
            self.time = time() - begin_time

            if item is None:
                self.out = '[decoder failed to build a goal item]'
            else:
                hg = Hypergraph(item)
                hg.set_semiring(hypergraph.SHORTEST_PATH)
                hg.set_functions(lambda x: x.cost, None, None)
                hg.topo_sort()
                self.kbest = hg.root.best_paths()
                best = self.kbest[0]
                output_tokens = best.translation[:]

                if FLAGS.preprocess:
                    # put the original values back in place of placeholders
                    for i, token in enumerate(output_tokens):
                        if token in ('$number', '$date'):
                            fidx = best.composed_rule.we2f[i]
                            if fidx is not None:
                                output_tokens[i] = self.fidx2replacement[fidx]

                # drop lm_order - 1 tokens from each end (presumably the
                # sentence-boundary padding -- TODO confirm). An explicit
                # end index keeps the tokens when lm_order == 1, where the
                # slice [0:0] would return nothing.
                pad = FLAGS.lm_order - 1
                self.out = ' '.join(
                    output_tokens[pad:len(output_tokens) - pad])
                self.hg = hg
                if FLAGS.output_hypergraph:
                    self.write_hypergraph()

            self.flog.write('%s\n' % self.out)
            self.flog.write('\n')
            if item is not None:
                self.flog.write(self.kbest[0].tree_str())
                self.flog.write('\n')
                self.flog.write(hg.stats())
                self.flog.write('\n')
            self.flog.write(decoder.agenda_stats())
            self.flog.write('\n')
            self.flog.write(decoder.chart.stats())
            self.flog.write('\n')
            for dotchart in decoder.dotcharts:
                self.flog.write(dotchart.stats())
                self.flog.write('\n')

            if FLAGS.show_time:
                timeline = '{:<35}{:>15.2f}\n'.format('[time]:', self.time)
                self.flog.write(timeline)
            self.write_output_file()
            if FLAGS.output_kbest:
                self.write_kbest_to_file()
        finally:
            self.flog.close()
# Example #3
# 0
class ABCParser(object):
    """Bilingual parser that glues hiero rules into a hypergraph with
    ABC glue grammar."""
    def __init__(self, n1, n2, phrases):
        """Seed the chart, the agenda and the neighbor index with `phrases`.

        n1: French length
        n2: English length
        phrases: a list of PhraseHGNodes that have been partially linked
          according to hiero rule extraction
        """
        self.n1 = n1
        self.n2 = n2
        self.phrases = phrases
        # glue nodes created while parsing
        self.glue_nodes = []
        # keys of the item pairs that were already combined
        self.edge_index = set()
        # (fi, fj, ei, ej) -> {nonterminal: node}
        self.chart = {}
        self.agenda = []
        self.neighbor_index = NeighborIndex()
        for node in phrases:
            box = (node.fi, node.fj, node.ei, node.ej)
            self.chart.setdefault(box, {})[node.nt] = node
            self.agenda.append(node)
            self.neighbor_index.add(node)

    # not used, too slow
    def parse(self):
        """Glue chart items exhaustively over all boxes and split points.

        For every box produced by ``bi_cyk_spans(self.n1, self.n2)`` and
        every interior split point (k1, k2), the items of the two sub-boxes
        are combined, first in straight and then in inverted orientation,
        and the new item is handed to ``chart_add``. A right-hand item
        whose nonterminal equals the orientation being built is skipped.

        NOTE(review): the closing ``self.stats()`` call reads ``self.hg``,
        which neither this method nor ``__init__`` assigns -- confirm
        before reviving this method.
        """
        self.glue_nodes = []
        for i1, j1, i2, j2 in bi_cyk_spans(self.n1, self.n2):
            for k1 in range(i1 + 1, j1):
                for k2 in range(i2 + 1, j2):
                    for inverted in (False, True):
                        if inverted:
                            left_box = (i1, k1, k2, j2)
                            right_box = (k1, j1, i2, k2)
                        else:
                            left_box = (i1, k1, i2, k2)
                            right_box = (k1, j1, k2, j2)
                        left_bin = self.chart.get(left_box, {})
                        right_bin = self.chart.get(right_box, {})
                        for left in left_bin.values():
                            for right in right_bin.values():
                                banned_nt = INVERTED if inverted else STRAIGHT
                                if right.nt != banned_nt:
                                    glued = self.make_item(left,
                                                           right,
                                                           inverted)
                                    self.chart_add(glued)
        self.stats()

    def parse_agenda(self):
        """Agenda-driven glue parsing; returns the resulting Hypergraph.

        Items are popped from the agenda and combined with their neighbors.
        Each new glue item that ``chart_add`` reports as new is pushed back
        onto the agenda, indexed as a neighbor and recorded in
        ``self.glue_nodes``. When the agenda is empty, ``final_glue``
        provides the root, the hypergraph is stored in ``self.hg`` and
        topologically sorted.
        """
        while self.agenda:
            current = self.agenda.pop()
            if logger.level >= 4:
                logger.writeln('pop: %s' % current)
            for left, right, inverted in self.neighboring_pairs(current):
                if logger.level >= 4:
                    logger.writeln('neighbors: %s %s' % (left, right))
                # avoid duplicated edges. note that in ABC grammar,
                # if the boxes of both items are given, the nt of the
                # new item is fixed
                pair_key = (left.nt, left.fi, left.fj, left.ei, left.ej,
                            right.nt, right.fi, right.fj, right.ei, right.ej)
                if pair_key in self.edge_index:
                    continue
                self.edge_index.add(pair_key)
                glued = self.make_item(left, right, inverted)
                if not self.chart_add(glued):
                    # merged into an existing chart item; nothing to expand
                    continue
                self.agenda.append(glued)
                self.neighbor_index.add(glued)
                self.glue_nodes.append(glued)
                if logger.level >= 4:
                    logger.writeln('push: %s' % glued)
        self.hg = Hypergraph(self.final_glue())
        self.hg.topo_sort()
        self.stats()
        return self.hg

    def final_glue(self):
        """Pick root nodes and attach them to a new START node.

        Candidates (phrases plus glue nodes) are sorted and consumed from
        the end. Each popped node becomes a root: a "top root" when its box
        is (0, n1, 0, n2), otherwise an "other root". After each pop, the
        candidates reachable from that root are dropped, and so are
        non-PHRASE candidates that compare less than the root.

        Side effects: sets ``self.top_roots`` and ``self.other_roots``.

        Returns:
            A PhraseHGNode labelled START covering (0, n1, 0, n2), with one
            incoming edge per top root and, when ``glue_missing_phrases``
            is set or there is no top root, one extra edge whose tails are
            all other roots.
        """
        # topo sort. root node at the end
        candidates = sorted(self.phrases + self.glue_nodes)
        self.top_roots = []
        self.other_roots = []
        while candidates:
            root = candidates.pop()
            covers_everything = (root.fi == 0 and
                                 root.fj == self.n1 and
                                 root.ei == 0 and
                                 root.ej == self.n2)
            if covers_everything:
                self.top_roots.append(root)
            else:
                self.other_roots.append(root)
            hg = Hypergraph(root)
            hg.find_reachable_nodes()
            candidates = [n for n in candidates
                          if id(n) not in hg.found and
                          (n.nt == PHRASE or not n < root)]
        top_node = PhraseHGNode(START, 0, self.n1, 0, self.n2)
        # add one edge for each top root
        for root in self.top_roots:
            rule = Rule()
            rule.lhs = START
            rule.f = [root.nt]
            rule.e = [root.nt]
            rule.e2f = [0]
            edge = PhraseHGEdge(rule)
            edge.add_tail(root)
            top_node.add_incoming(edge)
        # add one edge for all other roots
        if (glue_missing_phrases or not self.top_roots) and self.other_roots:
            rule = Rule()
            rule.lhs = START
            edge = PhraseHGEdge(rule)
            for root in self.other_roots:
                rule.f.append(root.nt)
                rule.e.append(root.nt)
                edge.add_tail(root)
            # monotone alignment: i-th target symbol maps to i-th source one
            rule.e2f = list(range(len(rule.f)))
            top_node.add_incoming(edge)
        return top_node

    # not used
    def final_glue1(self):
        """Try to cover all phrases AND glue nodes under one START edge.

        Candidates are sorted and consumed from the end; each popped node
        becomes a root and every candidate reachable from it is dropped.
        All roots are then attached, in pop order, as tails of a single
        START edge.

        Returns:
            A PhraseHGNode labelled START covering (0, n1, 0, n2) whose only
            incoming edge has every root as a tail.
        """
        # An earlier idea (disabled) restricted the glue candidates to glue
        # nodes whose box also holds a PHRASE item in the chart.
        #
        # The glue nodes live in `self.glue_nodes` (filled by parse_agenda);
        # the class has no `glue_rules` attribute, so reading it raised
        # AttributeError.
        candidates = self.phrases + self.glue_nodes
        # topo sort. root node at the end
        candidates.sort()
        roots = []
        while candidates:
            root = candidates.pop()
            print('pop: %s' % root)
            roots.append(root)
            hg = Hypergraph(root)
            hg.find_reachable_nodes()
            candidates = [n for n in candidates if id(n) not in hg.found]
        top_rule = Rule()
        top_rule.lhs = START
        top_edge = PhraseHGEdge(top_rule)
        for root in roots:
            top_rule.f.append(root.nt)
            top_rule.e.append(root.nt)
            top_edge.add_tail(root)
        # monotone alignment: i-th target symbol maps to i-th source one
        top_rule.e2f = list(range(len(top_rule.f)))
        top_node = PhraseHGNode(START, 0, self.n1, 0, self.n2)
        top_node.add_incoming(top_edge)
        return top_node

    def neighboring_pairs(self, item):
        """
        return value is items in the order they appear on f side, and whether
        they are inverted.
        The constraint of ABC grammar is also applied here.
        """
        for neighbor in self.neighbor_index.get((item.fi, item.ej), 0):
            if item.nt != INVERTED:
                yield neighbor, item, True
        for neighbor in self.neighbor_index.get((item.fi, item.ei), 1):
            if item.nt != STRAIGHT:
                yield neighbor, item, False
        for neighbor in self.neighbor_index.get((item.fj, item.ei), 2):
            if neighbor.nt != INVERTED:
                yield item, neighbor, True
        for neighbor in self.neighbor_index.get((item.fj, item.ej), 3):
            if neighbor.nt != STRAIGHT:
                yield item, neighbor, False

    def make_item(self, item1, item2, inverted):
        """Build the glue node that combines `item1` and `item2`.

        item1 and item2 are always given in the order they appear on the
        f side. The new node spans item1.fi..item2.fj on the f side; on the
        e side it spans item2.ei..item1.ej when `inverted`, otherwise
        item1.ei..item2.ej. Its single incoming edge carries a binary rule
        labelled INVERTED or STRAIGHT with both items as tails.
        """
        if inverted:
            lhs = INVERTED
            e_side = [item2.nt, item1.nt]
            alignment = [1, 0]
            e_span = (item2.ei, item1.ej)
        else:
            lhs = STRAIGHT
            e_side = [item1.nt, item2.nt]
            alignment = [0, 1]
            e_span = (item1.ei, item2.ej)

        rule = Rule()
        rule.f = [item1.nt, item2.nt]
        rule.lhs = lhs
        rule.e = e_side
        rule.e2f = alignment

        edge = PhraseHGEdge(rule)
        for tail in (item1, item2):
            edge.add_tail(tail)

        glued = PhraseHGNode(rule.lhs, item1.fi, item2.fj,
                             e_span[0], e_span[1])
        glued.add_incoming(edge)
        return glued

    def chart_add(self, item):
        bin = self.chart.setdefault((item.fi,
                                     item.fj,
                                     item.ei,
                                     item.ej),
                                    {})
        added = False
        # the ABCParser applies only glue rules. this test says glue rules
        # are used only when a PHRASE is not already derived for the box
        # if PHRASE not in bin:
        old_item = bin.get(item.nt)
        if old_item:
            old_item.add_incoming(item.incoming[0])
        else:
            added = True
            bin[item.nt] = item
        return added


    def stats(self):
        result = '--ABCParser Stats--\n'

        top_bin = self.chart.get((0, self.n1, 0, self.n2))
        if top_bin is None:
            result += 'parse failed\n'
        else:
            result += 'parse succeeded\n'

        result += self.hg.stats()

        # self.hg.show()

        hiero_rules = 0
        glue_rules = []
        for edge in self.hg.edges():
            if edge.rule.lhs == PHRASE:
                hiero_rules += 1
            else:
                glue_rules.append(edge)
        result += 'hiero rules: %s\n' % hiero_rules
        result += 'glue rules: %s\n' % len(glue_rules)

        rules = []
        for node in self.phrases:
            for edge in node.incoming:
                rules.append(edge.rule)
        hg_rules = set()
        for edge in self.hg.edges():
            hg_rules.add(id(edge.rule))
        unglued_rules = []
        for rule in rules:
            if id(rule) not in hg_rules:
                unglued_rules.append(rule)

        roots = self.top_roots + self.other_roots
        result += 'roots: %s\n' % len(roots)
        for node in roots:
            result += '%s\n' % node

        result += 'unglued rules: %s\n' % len(unglued_rules)
        for rule in unglued_rules:
            result += '%s\n' % rule
        return result
    def run(self):
        """Decode ``self.line`` with a sentence-local grammar and log it.

        In order: updates the per-sentence grammars, opens the log file
        ``<FLAGS.run_dir>/log_<self.suffix>`` as ``self.flog``, builds a
        local grammar from ``self.oov_idx`` and ``self.ner_items`` (if they
        yield any rule), runs the decoder selected by
        ``FLAGS.decoding_method`` and stores the result in ``self.out``
        (plus ``self.kbest`` and ``self.hg`` when a goal item was built).
        On success ``self.out`` is the first element returned by the best
        translation's ``to_amr_format()``, with every line stripped and the
        lines concatenated. Finally the statistics are logged and
        ``write_output_file`` / ``write_kbest_to_file`` are called. The log
        file is closed on every exit path.

        Raises:
            AssertionError: if ``FLAGS.decoding_method`` is not one of
                'agenda', 'cyk' or 'earley'.
        """
        # update per-sentence grammars, if there's any
        for g in self.grammars:
            g.update(self.id)

        self.flog = open('%s/%s_%s' % (FLAGS.run_dir, 'log', self.suffix),
                         'w')
        # try/finally so the log is closed even if decoding or writing raises
        try:
            if FLAGS.show_time:
                self.flog.write('running on %s\n\n' % socket.gethostname())
                self.flog.flush()

            fwords = self.line.strip().split()

            # added by freesunshine, build the local grammar for oov words
            # for each sentence
            rules = []
            if self.oov_idx is not None:
                oov_weight = 0.0001
                for idx in self.oov_idx:
                    # every OOV source word is mapped to "."
                    rule_str = "[A0-0] ||| %s ||| %s ||| %lf %lf %lf" % (
                        fwords[idx], ".", oov_weight, oov_weight, oov_weight)
                    rr = Rule()
                    rr.fromstr(rule_str)
                    rules.append(rr)

            if self.ner_items is not None:
                concept_weight = 10.0
                for ner_item in self.ner_items:
                    # ner_item[1] is the rule text up to the feature
                    # weights; ner_item[0] (a span into fwords) is not used
                    # by the rule built here.
                    ew = ner_item[1]
                    # NOTE(review): the parsed digit is unused; the call is
                    # kept because it raises when the third character of
                    # `ew` is not a digit -- confirm whether that check is
                    # intended.
                    int(ew[2])
                    # the source text used to be special-cased when it was
                    # ';', but both branches applied the same weights
                    rule_str = "%s ||| " % ew
                    rule_str += "%lf %lf %lf" % (concept_weight,
                                                 concept_weight,
                                                 concept_weight)
                    rr = Rule()
                    rr.fromstr(rule_str)
                    rules.append(rr)

            local_gr = None
            if rules:
                local_gr = Grammar(FLAGS.rule_bin_size)
                local_gr.build(rules, self.grammars[0].features)

            if FLAGS.preprocess:
                # remember, per source position, the replacement of each
                # '$number' / '$date' placeholder (taken from self.special
                # in order of appearance)
                self.fidx2replacement = {}
                j = 0
                for i, token in enumerate(fwords):
                    if token in ('$number', '$date'):
                        self.fidx2replacement[i] = self.special[j][1]
                        j += 1

            self.flog.write('[%s][%s words] %s\n' %
                            (self.id, len(fwords), self.line))

            decoder = Decoder(fwords,
                              self.grammars,
                              self.features,
                              local_gr)

            begin_time = time()
            method = FLAGS.decoding_method
            if method == 'agenda':
                item = decoder.decode()
            elif method == 'cyk':
                item = decoder.decode_cyk()
            elif method == 'earley':
                item = decoder.decode_earley()
            else:
                # explicit raise: an `assert` statement disappears under -O
                # and would leave `item` unbound
                raise AssertionError('"%s" not valid decoding option'
                                     % method)
            self.time = time() - begin_time

            if item is None:
                self.out = '[decoder failed to build a goal item]'
            else:
                # the decoder returns a pair; its second element is not
                # used here
                item, _succ = item
                hg = Hypergraph(item)
                hg.set_semiring(hypergraph.SHORTEST_PATH)
                hg.set_functions(lambda x: x.cost, None, None)
                hg.topo_sort()
                self.kbest = hg.root.best_paths()
                best = self.kbest[0]

                self.flog.write('Decuction Tree:\n%s\n' % best.tree_str())
                # @freesunshine: the output is the AMR form of the best
                # translation instead of the target-side token string (the
                # '$number' / '$date' substitution is not applied here)
                amr_text = best.translation.to_amr_format()[0]
                self.out = "".join(
                    part.strip() for part in amr_text.split('\n'))

                self.hg = hg
                if FLAGS.output_hypergraph:
                    self.write_hypergraph()

            self.flog.write('%s\n' % self.out)
            self.flog.write('\n')
            self.flog.write(decoder.agenda_stats())
            self.flog.write('\n')
            self.flog.write(decoder.chart.stats())
            self.flog.write('\n')
            for dotchart in decoder.dotcharts:
                self.flog.write(dotchart.stats())
                self.flog.write('\n')

            if FLAGS.show_time:
                timeline = '{:<35}{:>15.2f}\n'.format('[time]:', self.time)
                self.flog.write(timeline)
            self.write_output_file()
            if FLAGS.output_kbest:
                self.write_kbest_to_file()
        finally:
            self.flog.close()