Пример #1
0
    def test_pcfg(self):
        """Check PCFG counting and scoring on two mirrored toy trees."""
        grammar = pcfg.PCFG()
        first = Tree('S', (Tree('NP', ('foo', )), Tree('VP', ('bar', ))))

        # After one update, every production of the tree and every
        # left-hand side must be recorded with a count of exactly 1.
        grammar.update_counts(first)
        rules = first.productions()
        self.assertSetEqual({(rule, 1) for rule in rules},
                            set(grammar.production_counts.items()))
        self.assertSetEqual({(rule.lhs(), 1) for rule in rules},
                            set(grammar.lhs_counts.items()))
        grammar.update_counts(first)

        # Same words, but with the NP/VP labels swapped; counted twice too.
        second = Tree('S', (Tree('VP', ('foo', )), Tree('NP', ('bar', ))))
        for _ in range(2):
            grammar.update_counts(second)

        # Six distinct productions, each expected twice.
        self.assertEqual(6, len(grammar.production_counts))
        for seen in grammar.production_counts.values():
            self.assertEqual(2, seen)
        # Three distinct left-hand sides, each expected four times.
        self.assertEqual(3, len(grammar.lhs_counts))
        for seen in grammar.lhs_counts.values():
            self.assertEqual(4, seen)

        # -0.69314718055 is ln(1/2), matching a production count of 2 out of
        # an LHS total of 4 -- presumably compute_scores stores log relative
        # frequencies; confirm against the PCFG implementation.
        expected_score = -0.69314718055
        grammar.compute_scores()
        for production, score in grammar.scored_productions.items():
            self.assertAlmostEqual(expected_score,
                                   score,
                                   msg='%s' % production)
Пример #2
0
    def test_pcfg(self):
        """Exercise PCFG.update_counts and PCFG.compute_scores end to end."""

        def make_tree(left_label, right_label):
            # 'foo' always sits under the left child, 'bar' under the right.
            return Tree('S', (Tree(left_label, ('foo',)),
                              Tree(right_label, ('bar',))))

        model = pcfg.PCFG()
        np_vp = make_tree('NP', 'VP')

        # A single update must register each production and each
        # left-hand side of the tree exactly once.
        model.update_counts(np_vp)
        self.assertSetEqual(
            {(rule, 1) for rule in np_vp.productions()},
            set(model.production_counts.items()))
        self.assertSetEqual(
            {(rule.lhs(), 1) for rule in np_vp.productions()},
            set(model.lhs_counts.items()))
        model.update_counts(np_vp)

        # Add the label-swapped tree twice so that all counts stay uniform.
        vp_np = make_tree('VP', 'NP')
        model.update_counts(vp_np)
        model.update_counts(vp_np)

        production_counts = model.production_counts
        self.assertEqual(6, len(production_counts))
        for tally in production_counts.values():
            self.assertEqual(2, tally)

        lhs_counts = model.lhs_counts
        self.assertEqual(3, len(lhs_counts))
        for tally in lhs_counts.values():
            self.assertEqual(4, tally)

        # Every production is expected to score ln(2/4) = ln(0.5).
        model.compute_scores()
        for production, score in model.scored_productions.items():
            self.assertAlmostEqual(
                -0.69314718055, score, msg='%s' % production)
Пример #3
0
    def _extract_cfg_rules(self, docs, include_lexical, print_progress=False):
        """Collect the grammar productions of each document as strings.

        Every document is parsed with ``self.parser.raw_parse`` and the
        result is wrapped in a tree rooted at "START". The productions of
        that tree are stringified, and the artificial "START -> ROOT"
        production is always dropped.

        :param docs: documents to hand to ``self.parser.raw_parse``; must
            support ``len()`` when ``print_progress`` is enabled.
        :param include_lexical: when true, keep every production; otherwise
            keep only productions whose ``is_nonlexical()`` is true.
        :param print_progress: when true, print a progress line for every
            500th document (starting with the first).
        :return: a list holding one list of production strings per document.
        """
        per_doc_rules = []
        for index, doc in enumerate(docs):
            # Parse the document and hang the result under a START root.
            syntax_tree = Tree("START", self.parser.raw_parse(doc))

            rules = []
            for production in syntax_tree.productions():
                text = str(production)
                if text == "START -> ROOT":
                    # Artifact of the wrapper node above, not a real rule.
                    continue
                # NOTE(review): the original author warns that is_lexical is
                # not simply the negation of is_nonlexical -- verify against
                # the library version in use before swapping the predicate.
                if include_lexical or production.is_nonlexical():
                    rules.append(text)
            per_doc_rules.append(rules)

            # Progress report; the index is zero-based.
            if print_progress and index % 500 == 0:
                print("{0}/{1} docs processed...".format(index, len(docs)))

        return per_doc_rules
Пример #4
0
 def build(self, examples=tuple()):
     """
     Induce a grammar from the pooled productions of the given examples.

     Each example is passed through ``self.grammarify`` and the result is
     wrapped under a fresh "S" root; the productions of all wrapped trees
     are concatenated in input order and handed to ``nltk.induce_pcfg``
     with ``Nonterminal("S")`` as the start symbol.

     :param examples:    tuple or list of nltk Trees
     :return:            whatever ``nltk.induce_pcfg`` returns for the
                         pooled productions
     """
     pooled = [
         production
         for example in examples
         for production in Tree("S", [self.grammarify(example)]).productions()
     ]
     return nltk.induce_pcfg(Nonterminal("S"), pooled)