def read_productions(self, productions_filename):
    """Load productions from a UTF-8 text file and store the grammar on ``self``.

    Each non-blank line is expected to hold three ``+``-separated fields::

        LHS+SYMBOL SYMBOL ...+PROBABILITY

    The left-hand side and every whitespace-separated right-hand-side token
    are wrapped in ``Nonterminal``. Blank lines are skipped, and whitespace
    around fields or between symbols is ignored, so padding or doubled spaces
    no longer produce empty-named symbols.

    Args:
        productions_filename: Path of the file to read.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If the probability field cannot be parsed as a float.

    Side effects:
        Sets ``self.grammar`` to a ``PCFG`` whose start symbol is
        ``Nonterminal('S')``.
    """
    productions = []
    with io.open(productions_filename, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines previously crashed on components[1].
                continue
            components = line.split(u'+')
            lhs = Nonterminal(components[0].strip())
            # split() with no argument collapses runs of whitespace, so no
            # empty-string symbols are created.
            rhs = tuple(Nonterminal(token) for token in components[1].split())
            prob = float(components[2])
            productions.append(ProbabilisticProduction(lhs, rhs, prob=prob))
    self.grammar = PCFG(Nonterminal('S'), productions)
def pcfg_learn(treebank, n):
    """Collect productions from treebank trees and build a PCFG rooted at ``S``.

    For every ``i`` in ``range(n)`` the first ``i + 1`` parsed sentences are
    processed again, so (when the treebank has at least ``n`` trees) the tree
    at index ``k`` contributes ``n - k`` times.
    NOTE(review): this prefix weighting is kept from the original; confirm it
    is intended rather than a single pass over the first ``n`` trees.

    Args:
        treebank: Object exposing ``parsed_sents()`` returning a sliceable
            sequence of trees.
        n: Number of prefixes (and therefore leading trees) to use.

    Returns:
        ``PCFG(Nonterminal('S'), get_productions(collected))``.

    Unlike the previous hand-rolled ``next()`` loop, errors raised while
    generating productions now propagate instead of silently truncating the
    list, and a tree that yields no productions no longer leaks
    ``StopIteration``.
    """
    productions = []
    for i in range(n):
        for tree in treebank.parsed_sents()[:i + 1]:
            # Return value is unused; the call is made for its effect on
            # `tree` (presumably in-place binarisation -- confirm).
            chomsky_normal_form(tree, factor='right', horzMarkov=1,
                                vertMarkov=1, childChar='|', parentChar='^')
            for production in tree_to_productions(tree):
                if not production:
                    # Kept from the original loop: a falsy item ends the
                    # traversal of this tree.
                    break
                productions.append(production)
    productions = get_productions(productions)
    return PCFG(Nonterminal('S'), productions)
def pcfg_learn1(treebank, n):
    """Collect ``NP``-headed productions and build a PCFG rooted at ``NP``.

    For every ``i`` in ``range(n)`` the first ``i + 1`` parsed sentences are
    processed again, so earlier trees are counted more often than later ones.
    NOTE(review): this prefix weighting is kept from the original; confirm it
    is intended.

    Each item yielded by ``tree_to_productions(tree, "BOT")`` is indexed with
    ``[0]`` to obtain the production; only productions whose left-hand side
    equals ``Nonterminal('NP')`` are kept.

    Args:
        treebank: Object exposing ``parsed_sents()`` returning a sliceable
            sequence of trees.
        n: Number of prefixes (and therefore leading trees) to use.

    Returns:
        A ``(PCFG, dist)`` tuple, where both the grammar's productions and
        ``dist`` come from ``get_productions``.

    Unlike the previous hand-rolled ``next()`` loop, errors raised while
    generating productions now propagate instead of silently truncating the
    list, and a tree that yields nothing no longer leaks ``StopIteration``.
    """
    np_symbol = Nonterminal('NP')
    productions = []
    for i in range(n):
        for tree in treebank.parsed_sents()[:i + 1]:
            for item in tree_to_productions(tree, "BOT"):
                production = item[0]
                if not production:
                    # Kept from the original loop: a falsy production ends
                    # the traversal of this tree.
                    break
                if production.lhs() == np_symbol:
                    productions.append(production)
    productions, dist = get_productions(productions)
    return PCFG(np_symbol, productions), dist
def create_pcfg(start_symbol, productions):
    """Estimate a PCFG from observed productions by relative frequency.

    Each distinct production receives the probability
    ``count(production) / count(productions sharing its left-hand side)``.

    Args:
        start_symbol: Start symbol handed to ``PCFG``.
        productions: Iterable of hashable productions exposing ``lhs()`` and
            ``rhs()``; repeated entries act as repeated observations.

    Returns:
        ``PCFG(start_symbol, weighted_productions)``, with one weighted
        production per distinct input production, in first-seen order.
    """
    rule_freq = {}
    head_freq = {}
    for rule in productions:
        head = rule.lhs()
        head_freq[head] = head_freq.get(head, 0) + 1
        rule_freq[rule] = rule_freq.get(rule, 0) + 1

    weighted = []
    for rule, freq in rule_freq.items():
        head = rule.lhs()
        weighted.append(
            ProbabilisticProduction(head, rule.rhs(), prob=freq / head_freq[head])
        )
    # NOTE: a previously sketched (disabled) step pruned single-symbol lexical
    # productions with probability below a 5e-3 threshold and rebuilt the
    # grammar; it is not applied here.
    return PCFG(start_symbol, weighted)
def renormalize(self, height=10**4, tol=10**(-17), min_height=100):
    """Return a renormalized copy of ``self.grammar``.

    Coverages are obtained from ``self.list_coverages(height, tol,
    min_height)``; every production probability is then rescaled by the
    coverages of its right-hand-side nonterminals and divided by the coverage
    of its left-hand side (Chi paper, eq. 22).

    Args:
        height: Maximal height of parse trees whose coverage is calculated.
        tol: Tolerance used as stopping condition for the coverage
            computation; also the threshold below which a coverage is
            treated as zero here.
        min_height: Overrides the tolerance stopping condition so that
            coverage of all heights <= min_height is calculated; it also
            determines over how many previous steps the change is measured.

    Returns:
        A new ``PCFG`` with the same start symbol and rescaled probabilities.

    Raises:
        ValueError: If at least one nonterminal has coverage below ``tol``
            (the offending nonterminals are printed first).
    """
    coverage = self.list_coverages(height, tol, min_height)

    if min(coverage[symbol] for symbol in coverage) < tol:
        # Show which nonterminals make the division impossible.
        print([symbol for symbol in coverage if coverage[symbol] < tol])
        raise ValueError(
            "Not all coverages are positive, so"
            " renormalization cannot be performed since zero"
            " division."
        )

    def rescale(production):
        """Rescaled probability p~ of ``production`` (Chi paper, eq. 22)."""
        scaled = production.prob()
        for child in production.rhs():
            # Terminals contribute a factor of one.
            if isinstance(child, Nonterminal):
                scaled *= coverage[child]
        return scaled / coverage[production.lhs()]

    renormalized = []
    for production in self.grammar.productions():
        renormalized.append(
            ProbabilisticProduction(
                production.lhs(), production.rhs(), prob=rescale(production)
            )
        )
    return PCFG(self.grammar.start(), renormalized)
def update_grammar(words, grammar, smoothing=None):
    """Return a new PCFG with ``words`` added as ``NN`` productions.

    Two schemes reserve probability mass for the new words:

    * ``smoothing is None`` (add-one): every existing ``NN`` production gets
      ``(count + 1) / denominator`` and every new word ``1 / denominator``,
      where ``denominator`` is the number of ``NN`` productions plus
      ``len(words)`` plus the number of distinct ``NN`` productions.
    * any other value: the new words share a total mass of ``avg`` equally,
      and ``avg / (number of NN productions)`` is subtracted from each
      existing ``NN`` production. ``avg`` is ``1 / (number of NN
      productions)``.

    When the grammar has no ``NN`` production, the left-hand side falls back
    to ``Nonterminal('NN')`` and ``avg`` is 1.0 so the new words carry all of
    the mass. Previously this case raised ``ZeroDivisionError`` before the
    fallback could run.

    Args:
        words: Sized iterable of strings; surrounding single quotes are
            stripped from each word before it becomes a right-hand side.
        grammar: Grammar exposing ``productions()`` and ``start()``.
        smoothing: ``None`` selects add-one; any non-``None`` value selects
            the mass-shifting scheme.

    Returns:
        ``PCFG(grammar.start(), new_productions)``: the new word productions
        first, followed by every production of ``grammar`` (``NN`` ones
        re-weighted, all others unchanged).
    """
    # if smoothing is None use Add One.
    pcount = {}
    lcount = 0
    lhs = None
    for prod in grammar.productions():
        if str(prod.lhs()) == 'NN':
            lhs = prod.lhs()
            lcount += 1
            pcount[prod] = pcount.get(prod, 0) + 1
    if lhs is None:
        lhs = Nonterminal('NN')

    denominator = lcount + len(words) + len(pcount)
    # With no existing NN productions there is nothing to take mass from, so
    # the new words must carry all of it.
    avg = 1 / lcount if lcount else 1.0

    new_prods = []
    for word in words:
        rhs = (word.strip("'"), )
        prob = 1 / denominator if smoothing is None else avg / len(words)
        new_prods.append(ProbabilisticProduction(lhs, rhs, prob=prob))

    for p in grammar.productions():
        if str(p.lhs()) == 'NN':
            # lcount is at least 1 here, so both divisions are safe.
            if smoothing is None:
                prob = (pcount[p] + 1) / denominator
            else:
                prob = p.prob() - (avg / lcount)
            p = ProbabilisticProduction(p.lhs(), p.rhs(), prob=prob)
        new_prods.append(p)
    return PCFG(grammar.start(), new_prods)