def test_pcfg(self):
    """Check PCFG production/LHS counting and the scores derived from them.

    A single update must record each production and each left-hand side
    exactly once.  After feeding two different trees twice each, there
    must be 6 distinct productions (count 2 each) and 3 distinct
    left-hand sides (count 4 each), and every score produced by
    ``compute_scores`` must be approximately -0.69314718055 (ln 0.5).
    """
    grammar = pcfg.PCFG()

    # One update: every production and every left-hand side is seen once.
    np_vp = Tree('S', (Tree('NP', ('foo', )), Tree('VP', ('bar', ))))
    grammar.update_counts(np_vp)
    expected_productions = {(rule, 1) for rule in np_vp.productions()}
    self.assertSetEqual(expected_productions,
                        set(grammar.production_counts.items()))
    expected_lhs = {(rule.lhs(), 1) for rule in np_vp.productions()}
    self.assertSetEqual(expected_lhs, set(grammar.lhs_counts.items()))

    # Same tree once more, then a tree with NP and VP swapped, twice.
    grammar.update_counts(np_vp)
    vp_np = Tree('S', (Tree('VP', ('foo', )), Tree('NP', ('bar', ))))
    for _ in range(2):
        grammar.update_counts(vp_np)

    self.assertEqual(6, len(grammar.production_counts))
    for seen in grammar.production_counts.values():
        self.assertEqual(2, seen)

    self.assertEqual(3, len(grammar.lhs_counts))
    for seen in grammar.lhs_counts.values():
        self.assertEqual(4, seen)

    grammar.compute_scores()
    for production, score in grammar.scored_productions.items():
        self.assertAlmostEqual(-0.69314718055, score, msg='%s' % production)
def test_pcfg(self):
    """Exercise ``update_counts`` and ``compute_scores`` on two small trees.

    Expectations asserted below:
      * after the first update, production and LHS counts are all 1;
      * after each of the two trees has been counted twice, there are
        6 productions with count 2 and 3 left-hand sides with count 4;
      * every scored production is approximately -0.69314718055.
    """
    # Build both fixtures up front; they differ only in which child
    # carries the NP label and which carries the VP label.
    tree_a = Tree('S', (Tree('NP', ('foo',)), Tree('VP', ('bar',))))
    tree_b = Tree('S', (Tree('VP', ('foo',)), Tree('NP', ('bar',))))

    counter = pcfg.PCFG()
    counter.update_counts(tree_a)

    once_per_production = {(p, 1) for p in tree_a.productions()}
    observed_productions = set(counter.production_counts.items())
    self.assertSetEqual(once_per_production, observed_productions)

    once_per_lhs = {(p.lhs(), 1) for p in tree_a.productions()}
    observed_lhs = set(counter.lhs_counts.items())
    self.assertSetEqual(once_per_lhs, observed_lhs)

    # Bring tree_a to two observations, then count tree_b twice as well.
    counter.update_counts(tree_a)
    counter.update_counts(tree_b)
    counter.update_counts(tree_b)

    production_counts = counter.production_counts
    self.assertEqual(6, len(production_counts))
    for value in production_counts.values():
        self.assertEqual(2, value)

    lhs_counts = counter.lhs_counts
    self.assertEqual(3, len(lhs_counts))
    for value in lhs_counts.values():
        self.assertEqual(4, value)

    counter.compute_scores()
    for production, score in counter.scored_productions.items():
        self.assertAlmostEqual(
            -0.69314718055, score, msg='%s' % production)
def _extract_cfg_rules(self, docs, include_lexical, print_progress=False):
    """Collect the CFG production rules of each document as strings.

    Each document is parsed with ``self.parser.raw_parse`` and the result
    is wrapped under a synthetic ``START`` node before its productions
    are read.

    :param docs: iterable of documents to parse; ``len(docs)`` is also
        taken when ``print_progress`` is true.
    :param include_lexical: if true, keep every production; otherwise
        keep only those for which ``is_nonlexical()`` is true.
    :param print_progress: if true, print a progress line on every 500th
        document (starting with the first).
    :return: list with one entry per document, each a list of production
        strings in the order ``productions()`` yields them.
    """
    all_rules = []
    for index, doc in enumerate(docs):
        # Parse the document and hang the parse under a START node.
        syntax_tree = Tree("START", self.parser.raw_parse(doc))

        # Gather the production rules of this document's tree.
        rules_for_doc = []
        for rule in syntax_tree.productions():
            rule_text = str(rule)
            if rule_text == "START -> ROOT":
                # Artifact of the synthetic START wrapper; not a real rule.
                continue
            # is_lexical() is not the opposite of is_nonlexical(), so the
            # non-lexical filter has to call is_nonlexical() explicitly.
            if include_lexical or rule.is_nonlexical():
                rules_for_doc.append(rule_text)
        all_rules.append(rules_for_doc)

        # Report progress.
        if print_progress and index % 500 == 0:
            print("{0}/{1} docs processed...".format(index, len(docs)))
    return all_rules
def build(self, examples=tuple()):
    """Induce a PCFG from the productions of the given examples.

    Each example is passed through ``self.grammarify``, wrapped under a
    fresh ``S`` root node, and its productions are pooled; the pooled
    list is handed to ``nltk.induce_pcfg`` with ``S`` as start symbol.

    :param examples: tuple or list of nltk Trees
    :return: the grammar returned by ``nltk.induce_pcfg``
    """
    pooled_productions = []
    for example in examples:
        # Give every converted example a common "S" root.
        rooted = Tree("S", [self.grammarify(example)])
        pooled_productions.extend(rooted.productions())
    return nltk.induce_pcfg(Nonterminal("S"), pooled_productions)