def description_length(pcfg, sequences):
    """Return the description length of `sequences` encoded with `pcfg`.

    The result is the sum of two parts, both measured with the same
    per-symbol cost ``log2(N)``, where ``N`` is the number of distinct
    right-hand-side symbols (compared by their string form) in the grammar:

    * grammar part: every production of `pcfg` costs
      ``(1 + len(rhs)) * log2(N)``;
    * data part: for every sequence, every production used in its most
      probable parse costs ``(1 + len(rhs)) * log2(N)``.

    Both partial sums are printed before the total is returned.

    :param pcfg: grammar object offering ``productions()``; it is also
        handed to ``InsideChartParser``.
    :param sequences: iterable of token sequences to parse.
    :returns: grammar part + data part.
    :raises IndexError: if a sequence has no parse at all (same exception
        type as before, now with a message naming the sequence).
    """
    N = len(np.unique([str(r) for p in pcfg.productions() for r in p.rhs()]))
    logN = log2(N)
    pcfg_dl = sum((1 + len(p.rhs())) * logN for p in pcfg.productions())
    print(pcfg_dl)

    # most probable parse for each sequence; the parser only depends on the
    # grammar, so build it once instead of once per sequence
    parser = InsideChartParser(pcfg)
    best_parses = []
    for seq in sequences:
        candidates = parser.parse_all(seq)
        if not candidates:
            raise IndexError("no parse found for sequence %r" % (seq,))
        # max() returns the first of equally probable parses, exactly like
        # taking element 0 of a stable descending sort
        best_parses.append(max(candidates, key=lambda t: t.prob()))

    seq_dl = sum((1 + len(r.rhs())) * logN
                 for tree in best_parses for r in tree.productions())
    print(seq_dl)
    return pcfg_dl + seq_dl
def to_grammar(sequences, sections):
    """Induce a PCFG from `sequences`, then parse the sequences with it.

    Prints the induced grammar and the value computed by ``mean_probs`` for
    the resulting parses. Nothing is returned in the part of the function
    visible here.

    :param sequences: list of numpy arrays; negative entries are dropped
        and a fresh end symbol (global maximum + 1) is appended to each.
    :param sections: forwarded unchanged to ``to_tree``.
    """
    terminator = np.hstack(sequences).max() + 1
    # for now removing -1 (but deal with it later!)
    cleaned = [np.append(seq[seq >= 0], terminator) for seq in sequences]
    labelled = to_productions(cleaned, terminator)

    # one tree per labelled sequence: element 0 is passed to to_tree as its
    # third argument, the remainder as the body
    trees = []
    for entry in labelled:
        trees.append(Tree.fromstring(to_tree(entry[1:], sections, entry[0])))
    rules = []
    for tree in trees:
        rules.extend(tree.productions())
    weighted_rules = induce_pcfg(Nonterminal('S'), rules).productions()

    # strip the quotes around every non-'S' head symbol in the textual
    # grammar before it is parsed back into a PCFG
    grammar_text = '\n'.join(map(str, weighted_rules))
    heads = {entry[0] for entry in labelled if entry[0] != 'S'}
    for head in heads:
        bare = str(head)
        grammar_text = grammar_text.replace("'" + bare + "'", bare)
    grammar = PCFG.fromstring(grammar_text)
    print(grammar)

    parser = InsideChartParser(grammar)
    #parser.trace(1)
    sentences = []
    for seq in cleaned:
        # the appended end symbol is left out of the sentence
        sentences.append(Tree.fromstring(to_tree(seq[:-1], sections)).leaves())
    parses = flatten(multiprocess('parsing', parser.parse_all, sentences), 1)
    probs = mean_probs(parses, grammar)
    print(probs)
# Keep only the transitions seen at least 5 times; each kept entry stores
# (count, count / total). `transitions`, `total` and `filt_trans` come from
# earlier in the script.
for k, v in transitions.items():
    if v >= 5:
        filt_trans[k] = (v, v / total)

# Re-normalise over the kept transitions only. Previously `over_5` stayed 0
# and the whole (count, prob) tuple was divided by it, which raises as soon
# as filt_trans is non-empty.
# NOTE(review): assumes the intended denominator is the summed count of the
# kept transitions - confirm.
over_5 = sum(count for count, _ in filt_trans.values())
filt_trans = {k: (count, count / over_5)
              for k, (count, _) in filt_trans.items()}
filt_trans

from nltk import induce_pcfg
from nltk import InsideChartParser

# The set comprehension de-duplicates, so every distinct treebank production
# is handed to induce_pcfg exactly once.
prods = list({
    production
    for sent in treebank.parsed_sents()
    for production in sent.productions()
})
g_pfcg = induce_pcfg(Nonterminal('S'), prods)
p_parser = InsideChartParser(g_pfcg, beam_size=400)

sents = [
    'Mr. Vinken is chairman .'.split(),
    'Stocks rose .'.split(),
    'Alan introduced a plan .'.split()
]
for sent in sents:
    print(sent)
    for p in p_parser.parse(sent):
        print(p)

# NOTE(review): `parse` is not defined anywhere in the visible script -
# presumably left over from an earlier interactive cell; confirm or remove.
list(parse)
list(p_parser.parse(['you', 'are', 'sleeping']))
def __init__(self, treebank, rootsymbol='S', wrap=False, cnf=True,
             cleanup=True, normalize=False, extratags=(),
             parser=InsideChartParser, **parseroptions):
    """ initialize a DOP model given a treebank. uses the Goodman
    reduction of a STSG to a PCFG.  after initialization,
    self.parser will contain an InsideChartParser (or a
    BitParChartParser when that class is requested).

    >>> tree = Tree("(S (NP mary) (VP walks))")
    >>> d = GoodmanDOP([tree])
    >>> print d.grammar
    Grammar with 8 productions (start state = S)
        NP -> 'mary' [1.0]
        NP@1 -> 'mary' [1.0]
        S -> NP VP [0.25]
        S -> NP VP@2 [0.25]
        S -> NP@1 VP [0.25]
        S -> NP@1 VP@2 [0.25]
        VP -> 'walks' [1.0]
        VP@2 -> 'walks' [1.0]
    >>> print d.parser.parse("mary walks".split())
    (S (NP mary) (VP@2 walks)) (p=0.25)

    @param treebank: a list of Tree objects. Caveat lector:
        terminals may not have (non-terminals as) siblings.
        With cnf=True the trees are converted in place.
    @param rootsymbol: label of the start symbol; also the label of the
        wrapper node when wrap is set
    @param wrap: boolean specifying whether to add the start symbol
        to each tree
    @param cnf: whether to call chomsky_normal_form() on every tree
    @param cleanup: passed on to BitParChartParser only
    @param normalize: whether to normalize frequencies (BitPar rules only)
    @param extratags: (word, tag) pairs added to the BitPar rules and
        lexicon; not used by the InsideChartParser branch
    @param parser: a class which will be instantiated with the DOP
        model as its grammar. Supports BitParChartParser; an instance
        of it is accepted as well. Any other value results in an
        InsideChartParser.
    @param parseroptions: extra keyword arguments for BitParChartParser;
        not used by the InsideChartParser branch

    instance variables:
    - self.grammar a WeightedGrammar containing the PCFG reduction
        (InsideChartParser branch only)
    - self.fcfg a list of strings containing the PCFG reduction
        with frequencies instead of probabilities (BitPar branch only)
    - self.parser an InsideChartParser or BitParChartParser object
    - self.exemplars dictionary of known parse trees (memoization)"""
    from bitpar import BitParChartParser
    nonterminalfd, subtreefd = FreqDist(), FreqDist()
    ids = count(1)
    self.exemplars = {}
    if wrap:
        # wrap trees in a common root symbol (eg. for morphology)
        treebank = [Tree(rootsymbol, [a]) for a in treebank]
    if cnf:
        # CNF conversion is destructive: the list is copied, but the
        # trees themselves are modified in place
        treebank = list(treebank)
        for a in treebank:
            a.chomsky_normal_form()  #todo: sibling annotation necessary?

    # add unique IDs to nodes
    utreebank = [(tree, decorate_with_ids(tree, ids)) for tree in treebank]

    # count node frequencies
    for tree, utree in utreebank:
        nodefreq(tree, utree, subtreefd, nonterminalfd)

    # `parser` is documented (and defaults) as a class, for which
    # isinstance() is always False; test the class relationship first and
    # fall back to isinstance() for callers that pass an instance.
    try:
        use_bitpar = issubclass(parser, BitParChartParser)
    except TypeError:
        # issubclass() rejects non-class arguments
        use_bitpar = isinstance(parser, BitParChartParser)

    if use_bitpar:
        lexicon = set(x for a, b in utreebank for x in a.pos() + b.pos())
        # this takes the most time, produce CFG rules:
        cfg = FreqDist(chain(*(self.goodman(tree, utree)
                               for tree, utree in utreebank)))
        # NOTE(review): lexicon presumably holds (word, tag) pairs, so
        # `w not in lexicon` looks like it is always true - confirm.
        cfg.update("%s\t%s" % (t, w) for w, t in extratags
                   if w not in lexicon)
        lexicon.update(a for a in extratags if a not in lexicon)
        # annotate rules with frequencies
        self.fcfg = frequencies(cfg, subtreefd, nonterminalfd, normalize)
        self.parser = BitParChartParser(self.fcfg, lexicon, rootsymbol,
                                        cleanup=cleanup, **parseroptions)
    else:
        cfg = FreqDist(chain(*(self.goodman(tree, utree, False)
                               for tree, utree in utreebank)))
        probs = probabilities(cfg, subtreefd, nonterminalfd)
        #for a in probs: print a
        self.grammar = WeightedGrammar(Nonterminal(rootsymbol), probs)
        self.parser = InsideChartParser(self.grammar)

    #stuff for self.mccparse
    #the highest id
    #self.addresses = ids.next()
    #a list of interior + exterior nodes,
    #ie., non-terminals with and without ids
    #self.nonterminals = nonterminalfd.keys()
    #a mapping of ids to nonterminals without their IDs
    #self.nonterminal = dict(a.split("@")[::-1] for a in
    #    nonterminalfd.keys() if "@" in a)
class GoodmanDOP:
    """DOP model built from a treebank through the Goodman reduction.

    The constructor turns the treebank into a PCFG and stores a parser for
    it in self.parser; the other methods parse sentences with that parser.
    """

    def __init__(self, treebank, rootsymbol='S', wrap=False, cnf=True,
                 cleanup=True, normalize=False, extratags=(),
                 parser=InsideChartParser, **parseroptions):
        """ initialize a DOP model given a treebank. uses the Goodman
        reduction of a STSG to a PCFG.  after initialization,
        self.parser will contain an InsideChartParser (or a
        BitParChartParser when that class is requested).

        >>> tree = Tree("(S (NP mary) (VP walks))")
        >>> d = GoodmanDOP([tree])
        >>> print d.grammar
        Grammar with 8 productions (start state = S)
            NP -> 'mary' [1.0]
            NP@1 -> 'mary' [1.0]
            S -> NP VP [0.25]
            S -> NP VP@2 [0.25]
            S -> NP@1 VP [0.25]
            S -> NP@1 VP@2 [0.25]
            VP -> 'walks' [1.0]
            VP@2 -> 'walks' [1.0]
        >>> print d.parser.parse("mary walks".split())
        (S (NP mary) (VP@2 walks)) (p=0.25)

        @param treebank: a list of Tree objects. Caveat lector:
            terminals may not have (non-terminals as) siblings.
            With cnf=True the trees are converted in place.
        @param rootsymbol: label of the start symbol; also the label of
            the wrapper node when wrap is set
        @param wrap: boolean specifying whether to add the start symbol
            to each tree
        @param cnf: whether to call chomsky_normal_form() on every tree
        @param cleanup: passed on to BitParChartParser only
        @param normalize: whether to normalize frequencies (BitPar rules
            only)
        @param extratags: (word, tag) pairs added to the BitPar rules and
            lexicon; not used by the InsideChartParser branch
        @param parser: a class which will be instantiated with the DOP
            model as its grammar. Supports BitParChartParser; an instance
            of it is accepted as well. Any other value results in an
            InsideChartParser.
        @param parseroptions: extra keyword arguments for
            BitParChartParser; not used by the InsideChartParser branch

        instance variables:
        - self.grammar a WeightedGrammar containing the PCFG reduction
            (InsideChartParser branch only)
        - self.fcfg a list of strings containing the PCFG reduction
            with frequencies instead of probabilities (BitPar branch only)
        - self.parser an InsideChartParser or BitParChartParser object
        - self.exemplars dictionary of known parse trees (memoization)"""
        from bitpar import BitParChartParser
        nonterminalfd, subtreefd = FreqDist(), FreqDist()
        ids = count(1)
        self.exemplars = {}
        if wrap:
            # wrap trees in a common root symbol (eg. for morphology)
            treebank = [Tree(rootsymbol, [a]) for a in treebank]
        if cnf:
            # CNF conversion is destructive: the list is copied, but the
            # trees themselves are modified in place
            treebank = list(treebank)
            for a in treebank:
                a.chomsky_normal_form()  #todo: sibling annotation necessary?

        # add unique IDs to nodes
        utreebank = [(tree, decorate_with_ids(tree, ids))
                     for tree in treebank]

        # count node frequencies
        for tree, utree in utreebank:
            nodefreq(tree, utree, subtreefd, nonterminalfd)

        # `parser` is documented (and defaults) as a class, for which
        # isinstance() is always False; test the class relationship first
        # and fall back to isinstance() for callers that pass an instance.
        try:
            use_bitpar = issubclass(parser, BitParChartParser)
        except TypeError:
            # issubclass() rejects non-class arguments
            use_bitpar = isinstance(parser, BitParChartParser)

        if use_bitpar:
            lexicon = set(x for a, b in utreebank
                          for x in a.pos() + b.pos())
            # this takes the most time, produce CFG rules:
            cfg = FreqDist(chain(*(self.goodman(tree, utree)
                                   for tree, utree in utreebank)))
            # NOTE(review): lexicon presumably holds (word, tag) pairs, so
            # `w not in lexicon` looks like it is always true - confirm.
            cfg.update("%s\t%s" % (t, w) for w, t in extratags
                       if w not in lexicon)
            lexicon.update(a for a in extratags if a not in lexicon)
            # annotate rules with frequencies
            self.fcfg = frequencies(cfg, subtreefd, nonterminalfd,
                                    normalize)
            self.parser = BitParChartParser(self.fcfg, lexicon, rootsymbol,
                                            cleanup=cleanup,
                                            **parseroptions)
        else:
            cfg = FreqDist(chain(*(self.goodman(tree, utree, False)
                                   for tree, utree in utreebank)))
            probs = probabilities(cfg, subtreefd, nonterminalfd)
            #for a in probs: print a
            self.grammar = WeightedGrammar(Nonterminal(rootsymbol), probs)
            self.parser = InsideChartParser(self.grammar)

        #stuff for self.mccparse
        #the highest id
        #self.addresses = ids.next()
        #a list of interior + exterior nodes,
        #ie., non-terminals with and without ids
        #self.nonterminals = nonterminalfd.keys()
        #a mapping of ids to nonterminals without their IDs
        #self.nonterminal = dict(a.split("@")[::-1] for a in
        #    nonterminalfd.keys() if "@" in a)

    def goodman(self, tree, utree, bitparfmt=True):
        """ given a parsetree from a treebank, yield a goodman
        reduction of eight rules per node (in the case of a binary tree).

        >>> tree = Tree("(S (NP mary) (VP walks))")
        >>> d = GoodmanDOP([tree])
        >>> utree = decorate_with_ids(tree, count(1))
        >>> sorted(d.goodman(tree, utree, False))
        [(NP, ('mary',)), (NP@1, ('mary',)), (S, (NP, VP)), (S, (NP, VP@2)), (S, (NP@1, VP)), (S, (NP@1, VP@2)), (VP, ('walks',)), (VP@2, ('walks',))]

        @param tree: the original tree
        @param utree: the same tree with IDs added to its nodes; its
            productions are paired one-to-one with those of tree
        @param bitparfmt: if true, yield each rule as one tab-separated
            string (left-hand side first); otherwise yield
            (lhs, rhs) tuples
        @raise ValueError: if a production has an empty right-hand side
        """
        # linear: nr of nodes
        sep = "\t"
        for p, up in zip(tree.productions(), utree.productions()):
            if len(p.rhs()) == 0:
                raise ValueError
            if len(p.rhs()) == 1:
                if not isinstance(p.rhs()[0], Nonterminal):
                    # single terminal child: only the plain right-hand side
                    rhs = (p.rhs(), )
                else:
                    rhs = (p.rhs(), up.rhs())
            #else: rhs = product(*zip(p.rhs(), up.rhs()))
            else:
                # every combination of plain / ID-annotated children
                if all(isinstance(a, Nonterminal) for a in up.rhs()):
                    rhs = set(product(*zip(p.rhs(), up.rhs())))
                else:
                    rhs = product(*zip(p.rhs(), up.rhs()))

            # constant factor: 8
            #for l, r in product(*((p.lhs(), up.lhs()), rhs)):
            for l, r in product(set((p.lhs(), up.lhs())), rhs):
                #yield Production(l, r)
                if bitparfmt:
                    yield "%s%s%s" % (l, sep, sep.join(map(unicode, r)))
                else:
                    yield l, r
                # yield a delayed computation that also gives the
                # frequencies given a distribution of nonterminals
                #yield (lambda fd: WeightedProduction(l, r, prob=
                #    reduce(mul, map(lambda z: '@' in z and
                #    fd[z] or 1, r)) / float(fd[l])))

    def parse(self, sent):
        """most probable derivation (not very good).

        @param sent: a sequence of terminals
        @return: whatever self.parser.parse returns for sent"""
        return self.parser.parse(sent)

    def mostprobableparse(self, sent, sample=None):
        """warning: this problem is NP-complete. using an unsorted
        chart parser avoids unnecessary sorting (since we need all
        derivations anyway).

        the probabilities of all derivations that are equal once their
        IDs are removed are added up; the tree with the largest total
        is returned.

        @param sent: a sequence of terminals
        @param sample: None or int; if int then sample that many parses
        @return: a ProbabilisticTree whose prob is the summed probability
        @raise ValueError: if there is no parse"""
        p = FreqDist()
        for a in self.parser.nbest_parse(sent, sample):
            p.inc(removeids(a).freeze(), a.prob())
        # look the winner up once instead of on every use
        best = p.max()
        if not best:
            raise ValueError("no parse")
        return ProbabilisticTree(best.node, best, prob=p[best])

    def mostconstituentscorrect(self, sent):
        """ not working yet. almost verbatim translation of Goodman's
        (1996) most constituents correct parsing algorithm, except for
        python's zero-based indexing. needs to be modified to return
        the actual parse tree. expects a pcfg in the form of a dictionary
        from productions to probabilities

        NOTE(review): as written this cannot run. `rootsymbol` and `n`
        are not defined in this method, and self.pcfg, self.nonterminals,
        self.addresses and self.nonterminal are never assigned by
        __init__ (the corresponding assignments there are commented
        out). `max_x` is a maximum value, yet it is used as a key into
        sumx."""
        def g(s, t, x):
            def f(s, t, x):
                return self.pcfg[Production(rootsymbol,
                    sent[1:s] + [x] + sent[s+1:])]
            def e(s, t, x):
                return self.pcfg[Production(x, sent[s:t+1])]
            return f(s, t, x) * e(s, t, x ) / e(1, n, rootsymbol)

        sumx = defaultdict(int) #zero
        maxc = defaultdict(int) #zero
        for length in range(2, len(sent)+1):
            for s in range(1, len(sent) + length):
                t = s + length - 1
                for x in self.nonterminals:
                    sumx[x] = g(s, t, x)
                for k in range(self.addresses):
                    #ordered dictionary here
                    x = self.nonterminal[k]
                    sumx[x] += g(s, t, "%s@%d" % (x, k))
                max_x = max(sumx[x] for x in self.nonterminals)
                #for x in self.nonterminals:
                #    max_x = argmax(sumx, x) #???
                best_split = max(maxc[(s,r)] + maxc[(r+1,t)]
                                    for r in range(s, t))
                #for r in range(s, t):
                #    best_split = max(maxc[(s,r)] + maxc[(r+1,t)])
                maxc[(s,t)] = sumx[max_x] + best_split

        return maxc[(1, len(sent) + 1)]