예제 #1
0
 def read_productions(self, productions_filename):
     productions = []
     with io.open(productions_filename, 'r', encoding='utf8') as f:
         for line in f:
             line = line.strip()
             components = line.split(u'+')
             lhs = Nonterminal(components[0])
             rhs = tuple([
                 Nonterminal(nt.strip()) for nt in components[1].split(u' ')
             ])
             prob = float(components[2])
             pp = ProbabilisticProduction(lhs, rhs, prob=prob)
             productions.append(pp)
     self.grammar = PCFG(Nonterminal('S'), productions)
예제 #2
0
def pcfg_learn(treebank, n):
    productions = list()
    for i in range(n):
        for tree in treebank.parsed_sents()[:i+1]:
            chomsky_normal_form(tree, factor='right', horzMarkov=1, vertMarkov=1, childChar='|', parentChar='^')
            prod_gen = tree_to_productions(tree)
            tree_to_append = next(prod_gen)
            while tree_to_append:
                productions.append(tree_to_append)
                try:
                    tree_to_append = next(prod_gen)
                except Exception as e:
                    tree_to_append = False
    productions = get_productions(productions)
    return PCFG(Nonterminal('S'), productions)
예제 #3
0
def pcfg_learn1(treebank, n):
    productions = list()
    for i in range(n):
        for tree in treebank.parsed_sents()[:i + 1]:
            prod_gen = tree_to_productions(tree, "BOT")
            tree_to_append = next(prod_gen)[0]
            while tree_to_append:
                if tree_to_append.lhs() == Nonterminal('NP'):
                    productions.append(tree_to_append)
                try:
                    tree_to_append = next(prod_gen)[0]
                except Exception as e:
                    tree_to_append = False
    productions, dist = get_productions(productions)
    return PCFG(Nonterminal('NP'), productions), dist
예제 #4
0
def create_pcfg(start_symbol, productions):
    pcount = {}
    lcount = {}

    for prod in productions:
        lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1
        pcount[prod] = pcount.get(prod, 0) + 1

    prods = [
        ProbabilisticProduction(p.lhs(), p.rhs(), prob=pcount[p] / lcount[p.lhs()])
        for p in pcount
    ]

    # threshold= 5e-3
    # to_remove = [p for p in prods if p.is_lexical() and len(p) == 1 and p.prob() < threshold]
    
    # if to_remove:
    #     return create_pcfg(start_symbol, [p for p in prods if p.is_lexical() and len(p) == 1 and p.prob() > threshold])

    return PCFG(start_symbol, prods)
예제 #5
0
 def renormalize(self, height=10**4, tol=10**(-17), min_height=100):
     """Return renormalized grammar. 
     
     Raise ValueError if for at least one nonterminal, its coverage
     equals zero.
     Input:
         height - maximal height of parse trees of which the
             coverage is calculated of.
         tol - tolerance as a stopping condition. If change
             is smaller than the input tolerance, then it stops.
         min_height - overrides tolerance stopping condition and
             calculates coverage of all heights <= min_height. It
             also determines for how many previous steps the change
             is measured, i.e. for levels (height-1 - min_height/2).
         verbosity - if set to > 0, it prints stopping probability
             change, height and input tolerance.
     """
     coverages_dict = self.list_coverages(height, tol, min_height)
     if min(coverages_dict[A] for A in coverages_dict) < tol:  # input tol
         print([A for A in coverages_dict if coverages_dict[A] < tol])
         raise ValueError("Not all coverages are positive, so"
                         + " renormalization cannot be performed since zero"
                         + " division.")
     def chi(prod, coverages_dict):
         """Renormalizes production probability p^~ as in Chi paper(22)."""
         subprobabs = prod.prob()
         for symbol in prod.rhs():
             if not isinstance(symbol, Nonterminal):
                 continue  # or subprobabs = 1
             else:
                 subprobabs *= coverages_dict[symbol]
         return subprobabs/coverages_dict[prod.lhs()]
     prods = [ProbabilisticProduction(prod.lhs(), prod.rhs(),
                                     prob=chi(prod, coverages_dict))
             for prod in self.grammar.productions()]
     return PCFG(self.grammar.start(), prods)
예제 #6
0
def update_grammar(words, grammar, smoothing=None):
    # if smoothing is None use Add One.
    pcount = {}
    lcount = 0
    new_prods = []
    lhs = None
    for prod in grammar.productions():
        if str(prod.lhs()) == 'NN':
            lhs = prod.lhs()
            lcount += 1
            pcount[prod] = pcount.get(prod, 0) + 1

    add = len(words) + len(pcount)
    avg = 1 / lcount

    if lhs is None:
        lhs = Nonterminal('NN')

    for word in words:
        rhs = (word.strip("'"), )
        if smoothing is None:
            prob = 1 / (lcount + add)
        else:
            prob = avg / len(words)
        prod = ProbabilisticProduction(lhs, rhs, prob=prob)
        new_prods.append(prod)

    for p in grammar.productions():
        if str(p.lhs()) == 'NN':
            if smoothing is None:
                p = ProbabilisticProduction(p.lhs(), p.rhs(), prob= (pcount[p] + 1) / (lcount + add))
            else:
                p = ProbabilisticProduction(p.lhs(), p.rhs(), prob= p.prob() - (avg / lcount))
        new_prods.append(p)

    return PCFG(grammar.start(), new_prods)