def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
    """
    Estimate an unlexicalized PCFG from training trees and build a CKY
    parser over it.

    parsed_sents -- list of training trees.
    start -- start symbol.
    horzMarkov -- None for default. A number n >= 0 for horizontal markov.
    """
    self.start = start

    # rhs_counts[X][rhs] == number of times the production X -> rhs was seen.
    rhs_counts = defaultdict(lambda: defaultdict(int))

    for tree in parsed_sents:
        # Work on a deep copy: the caller's trees must not be modified by
        # the in-place transformations applied below.
        work = unlexicalize(tree.copy(deep=True))
        # Binarize (with the requested horizontal markovization), then
        # collapse single-child chains, including those above POS tags.
        work.chomsky_normal_form(horzMarkov=horzMarkov)
        work.collapse_unary(collapsePOS=True)
        for production in work.productions():
            rhs_counts[production.lhs()][production.rhs()] += 1

    # Relative-frequency estimate: q(X -> rhs) = count(X -> rhs) / count(X),
    # where count(X) is the sum of the counts of all expansions of X.
    productions = []
    for lhs, by_rhs in rhs_counts.items():
        lhs_total = float(sum(by_rhs.values()))
        productions.extend(
            ProbabilisticProduction(lhs, rhs, prob=count / lhs_total)
            for rhs, count in by_rhs.items()
        )

    self.production = productions
    grammar = PCFG(Nonterminal(start), productions)
    self.parser = CKYParser(grammar)
def test_unlexicalize_does_change_tree(self):
    """unlexicalize() must modify the tree it is given, in place.

    After the call, every leaf of the sample tree is expected to have been
    replaced by the label of its parent (POS) node.
    """
    expected = Tree.fromstring(
        """ (S (NP (Det Det) (Noun Noun)) (VP (Verb Verb) (NP (Noun Noun) (Adj Adj))) ) """)
    tree = Tree.fromstring(
        """ (S (NP (Det el) (Noun gato)) (VP (Verb come) (NP (Noun pescado) (Adj crudo))) ) """)

    # The return value is deliberately ignored: only the side effect on
    # `tree` is under test.
    unlexicalize(tree)

    self.assertEqual(tree, expected)
def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
    """
    Estimate an unlexicalized PCFG from training trees and build a CKY
    parser over it.

    parsed_sents -- list of training trees.
    start -- start symbol used for the grammar handed to the CKY parser.
    horzMarkov -- forwarded unchanged to chomsky_normal_form.
    """
    self.start = start
    self.prods = []  # list of ProbabilisticProduction

    rule_counts = defaultdict(int)  # { A -> B : count(A -> B) }
    head_counts = defaultdict(int)  # { A : count(A) }

    # unlexicalize is applied to a deep copy of each tree so that the
    # caller's trees are left unmodified.
    trees = [unlexicalize(sent.copy(deep=True)) for sent in parsed_sents]

    for tree in trees:
        tree.chomsky_normal_form(horzMarkov=horzMarkov)
        tree.collapse_unary(collapsePOS=True, collapseRoot=True)
        # Each item is a Production; it is used directly as the dict key.
        for rule in tree.productions():
            rule_counts[rule] += 1
            head_counts[rule.lhs()] += 1

    # Maximum-likelihood estimate: q(A -> B) = count(A -> B) / count(A).
    for rule, n_rule in rule_counts.items():
        head = rule.lhs()
        q_ml = float(n_rule) / head_counts[head]
        self.prods.append(
            ProbabilisticProduction(head, rule.rhs(), prob=q_ml))

    # PCFG takes a Nonterminal start symbol and the list of productions.
    grammar = PCFG(Nonterminal(start), self.prods)
    self.my_parser = CKYParser(grammar)
def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
    """
    Induce an unlexicalized PCFG from training trees and build a CKY
    parser over it.

    parsed_sents -- list of training trees.
    start -- start symbol; every training tree's root is relabeled to it.
    horzMarkov -- forwarded unchanged to chomsky_normal_form.
    """
    # The grammar's start symbol comes from the `start` argument, not from
    # whatever label the roots of the training trees originally carry.
    self.start = N(start)
    self.horzMarkov = horzMarkov

    # Productions are accumulated WITH repetitions: induce_pcfg is handed
    # the raw list so the probabilities can be derived from it.
    observed = []
    for tree in parsed_sents:
        # Deep copy so the caller's tree is not touched.
        work = unlexicalize(tree.copy(deep=True))
        # Overwrite the root label with the requested start symbol.
        work.set_label(start)
        work.chomsky_normal_form(horzMarkov=horzMarkov)
        # NOTE(review): an earlier comment here said the root was NOT
        # collapsed (collapseRoot=False), but the call passes
        # collapseRoot=True -- confirm which behavior is intended.
        work.collapse_unary(collapsePOS=True, collapseRoot=True)
        observed.extend(work.productions())

    grammar = induce_pcfg(self.start, observed)
    self.pcfg = grammar
    self._probabilistic_productions = grammar.productions()
    self._parser = CKYParser(grammar)