def test_horz_markov_0(self):
    """Check the productions learned with horzMarkov=0.

    With no horizontal context, the helper node created while binarizing
    the three-child NP is expected to be named 'NP|<>'.
    """
    t = Tree.fromstring("(NP (Det el) (Noun gato) (Adj negro))")

    # UPCFG overwrites the root label of every training tree with `start`
    # (default 'sentence'), so the start symbol must be given explicitly
    # for the 'NP' productions asserted below to exist.
    model = UPCFG([t], start='NP', horzMarkov=0)

    prods = model.productions()

    prods2 = [
        # the right-binarized productions:
        ProbabilisticProduction(N('NP'), [N('Det'), N('NP|<>')], prob=1.0),
        ProbabilisticProduction(N('NP|<>'), [N('Noun'), N('Adj')], prob=1.0),

        # unlexicalized pre-terminals: each POS tag rewrites to itself
        ProbabilisticProduction(N('Det'), ['Det'], prob=1.0),
        ProbabilisticProduction(N('Noun'), ['Noun'], prob=1.0),
        ProbabilisticProduction(N('Adj'), ['Adj'], prob=1.0),
    ]

    self.assertEqual(set(prods), set(prods2))
def test_horz_markov_None(self):
    """Check the productions learned with the default horzMarkov (None).

    The helper node introduced by binarization is expected to carry the
    labels of the remaining siblings: 'NP|<Noun-Adj>'.
    """
    tree = Tree.fromstring("(NP (Det el) (Noun gato) (Adj negro))")

    # Deviation from the official test: the start symbol is passed
    # explicitly. horzMarkov is left at its default value, None.
    grammar_model = UPCFG([tree], start='NP')

    expected = {
        # right-binarized NP
        ProbabilisticProduction(
            N('NP'), [N('Det'), N('NP|<Noun-Adj>')], prob=1.0),
        ProbabilisticProduction(
            N('NP|<Noun-Adj>'), [N('Noun'), N('Adj')], prob=1.0),
        # unlexicalized pre-terminals
        ProbabilisticProduction(N('Det'), ['Det'], prob=1.0),
        ProbabilisticProduction(N('Noun'), ['Noun'], prob=1.0),
        ProbabilisticProduction(N('Adj'), ['Adj'], prob=1.0),
    }

    learned = set(grammar_model.productions())
    self.assertEqual(learned, expected)
def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
    """
    Induce an unlexicalized PCFG from training trees and build a CKY
    parser on top of it.

    parsed_sents -- list of training trees.
    start -- label of the grammar's start symbol; it is also written over
             the root label of every training tree.
    horzMarkov -- forwarded unchanged to chomsky_normal_form.
    """
    # The start non-terminal comes from the `start` argument, not from the
    # root labels found in parsed_sents.
    self.start = N(start)
    self.horzMarkov = horzMarkov

    # Duplicates are kept on purpose: the probabilities are induced from
    # the full list of productions.
    collected = []
    for tree in parsed_sents:
        # All transformations run on a deep copy of the training tree.
        skeleton = unlexicalize(tree.copy(deep=True))

        # Force the root label to the requested start symbol.
        skeleton.set_label(start)

        skeleton.chomsky_normal_form(horzMarkov=horzMarkov)

        # NOTE(review): the root IS eligible for collapsing here
        # (collapseRoot=True); a previous revision's comment said the
        # opposite -- confirm which behaviour is intended.
        skeleton.collapse_unary(collapsePOS=True, collapseRoot=True)

        collected.extend(skeleton.productions())

    grammar = induce_pcfg(self.start, collected)
    self.pcfg = grammar
    self._probabilistic_productions = grammar.productions()
    self._parser = CKYParser(grammar)
def test_productions(self):
    """Check rules and probabilities learned from a single S tree.

    NP is expanded in two different ways in the tree, so each NP rule is
    expected with probability 0.5; every other rule has probability 1.0.
    """
    bracketed = """ (S (NP (Det el) (Noun gato)) (VP (Verb come) (NP (Noun pescado) (Adj crudo))) ) """
    tree = Tree.fromstring(bracketed)

    # Deviation from the official test: the start symbol is passed
    # explicitly.
    grammar_model = UPCFG([tree], start='S')

    expected = {
        # phrase-level rules
        ProbabilisticProduction(N('S'), [N('NP'), N('VP')], prob=1.0),
        ProbabilisticProduction(N('VP'), [N('Verb'), N('NP')], prob=1.0),
        ProbabilisticProduction(N('NP'), [N('Det'), N('Noun')], prob=0.5),
        ProbabilisticProduction(N('NP'), [N('Noun'), N('Adj')], prob=0.5),
        # unlexicalized pre-terminals
        ProbabilisticProduction(N('Det'), ['Det'], prob=1.0),
        ProbabilisticProduction(N('Noun'), ['Noun'], prob=1.0),
        ProbabilisticProduction(N('Verb'), ['Verb'], prob=1.0),
        ProbabilisticProduction(N('Adj'), ['Adj'], prob=1.0),
    }

    learned = set(grammar_model.productions())
    self.assertEqual(learned, expected)