def TrainPCFG(CPDist, Start='S'): PcfgProds = [] for node in CPDist.conditions(): #Populate PCFG for sample in CPDist[node].samples(): RHS = [] #try: if isinstance(sample, str): RHS = [sample] elif isinstance(sample, Nonterminal): RHS.append(sample) elif isinstance(sample, Tree): RHS.append(Nonterminal(sample.node)) elif isinstance(sample, Token): RHS.append(sample) elif isinstance(sample, list) or isinstance(sample, tuple): for token in sample: if isinstance(token, Tree): RHS.append(Nonterminal(token.node)) elif isinstance(token, Nonterminal): RHS.append(token) elif isinstance(token, Token): RHS.append(token) else: RHS.append(token) else: RHS.append(token.node) print 'Unknown ', node, sample PcfgProds.append( PCFGProduction(Nonterminal(node), RHS, prob=CPDist[node].prob(sample))) #except: print 'missed on', node,sample return HashPCFG(Nonterminal(Start), PcfgProds)
def get_rand_rhs(self, lhs):
    """
    Sample one production for *lhs*, weighted by rule probability.

    Strings are uppercased and wrapped as Nonterminals first.  Returns
    None if the cumulative probability never exceeds the random draw
    (e.g. due to rounding).
    """
    if not isinstance(lhs, Nonterminal):
        lhs = Nonterminal(lhs.upper())
    threshold = random.random()
    cumulative = 0
    for idx in self._index[lhs]:
        production = self._productions[idx]
        cumulative += production.prob()
        if cumulative > threshold:
            return production
    return None
def get_rand_rhs(self, lhs):
    """
    Pick a production for *lhs* at random, proportionally to its
    probability.  Non-Nonterminal arguments are uppercased and wrapped.
    Returns None when no rule's cumulative mass exceeds the draw.
    """
    if not isinstance(lhs, Nonterminal):
        lhs = Nonterminal(lhs.upper())
    target = random.random()
    mass = 0
    candidates = [self._productions[j] for j in self._index[lhs]]
    for rule in candidates:
        mass = mass + rule.prob()
        if mass > target:
            return rule
    return None
def get_prob_rhs(self, lhs, prob):
    """
    Return productions for *lhs*, in index order, stopping once their
    cumulative probability exceeds *prob*.

    *prob* may be a fraction (0-1) or a percentage (strictly between 1
    and 100); percentages are scaled down to a fraction first.
    """
    if not isinstance(lhs, Nonterminal):
        lhs = Nonterminal(lhs.upper())
    rhs = []
    if prob > 1 and prob < 100:
        # BUG FIX: under Python 2, `prob /= 100` truncated an integer
        # percentage to 0, so the loop returned after one rule regardless
        # of the requested mass.  Force true (float) division.
        prob = prob / 100.0
    p = 0.0
    for r in self._index[lhs]:
        rule = self._productions[r]
        p += rule.prob()
        rhs.append(rule)
        if p > prob:
            break
    return rhs
def get_prob_rhs(self, lhs, prob):
    """
    Collect productions for *lhs* (in index order) until their cumulative
    probability passes *prob*, then return them.

    *prob* accepts either a fraction in [0, 1] or a percentage strictly
    between 1 and 100, which is converted to a fraction.
    """
    if not isinstance(lhs, Nonterminal):
        lhs = Nonterminal(lhs.upper())
    rhs = []
    if prob > 1 and prob < 100:
        # BUG FIX: Python 2 integer division made `prob /= 100` yield 0
        # for int percentages; divide by 100.0 so the threshold survives.
        prob = prob / 100.0
    cumulative = 0.0
    for r in self._index[lhs]:
        rule = self._productions[r]
        cumulative += rule.prob()
        rhs.append(rule)
        if cumulative > prob:
            break
    return rhs
def demo2():
    """
    Run the CFG demo window on a small toy grammar and sentence.
    """
    from nltk.cfg import Nonterminal, CFGProduction, CFG
    S, VP, NP, PP, P, N, Name, V, Det = [
        Nonterminal(tag) for tag in 'S VP NP PP P N Name V Det'.split()]
    # Syntactic productions.
    syntax = [
        CFGProduction(S, NP, VP),
        CFGProduction(NP, Det, N),
        CFGProduction(NP, NP, PP),
        CFGProduction(VP, VP, PP),
        CFGProduction(VP, V, NP, PP),
        CFGProduction(VP, V, NP),
        CFGProduction(PP, P, NP),
        CFGProduction(PP),
        CFGProduction(PP, 'up', 'over', NP),
    ]
    # Lexical productions.
    lexicon = [
        CFGProduction(NP, 'I'),
        CFGProduction(Det, 'the'),
        CFGProduction(Det, 'a'),
        CFGProduction(N, 'man'),
        CFGProduction(V, 'saw'),
        CFGProduction(P, 'in'),
        CFGProduction(P, 'with'),
        CFGProduction(N, 'park'),
        CFGProduction(N, 'dog'),
        CFGProduction(N, 'statue'),
        CFGProduction(Det, 'my'),
    ]
    grammar = CFG(S, tuple(syntax + lexicon))
    text = 'I saw a man in the park'.split()
    d = CFGDemo(grammar, text)
    d.mainloop()
def print_all_rhs(self, lhs=''): for i, p in enumerate(self.get_all_rhs(lhs)): print p if not lhs: print len(self._lexicon_grammar), 'Lexicon Rules;', print len(self._nt_grammar), 'Nonterminal Rules;', else: print len([ i for i in self._index[Nonterminal(lhs)] if i in self._lexicon_grammar ]), 'Lexicon Rules;', print len([ i for i in self._index[Nonterminal(lhs)] if i in self._nt_grammar ]), 'Nonterminal Rules;', print i, 'Rules.'
def CountSubTree(CFDist, token): Output = [] for child in token: if isinstance(child, Tree): Output.append(Nonterminal(child.node)) CountSubTree(CFDist, child) elif isinstance(child, Token): text_token = child['TEXT'].lower() Output.append(text_token) else: print 'Unmatched token', token, child CFDist[token.node].inc(tuple(Output))
def demo(): from nltk.cfg import Nonterminal, CFGProduction, CFG nonterminals = 'S VP NP PP P N Name V Det' (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()] productions = ( # Syntactic Productions CFGProduction(S, NP, VP), CFGProduction(NP, Det, N), CFGProduction(NP, NP, PP), CFGProduction(VP, VP, PP), CFGProduction(VP, V, NP, PP), CFGProduction(VP, V, NP), CFGProduction(PP, P, NP), CFGProduction(PP), CFGProduction(PP, 'up', 'over', NP), # Lexical Productions CFGProduction(NP, 'I'), CFGProduction(Det, 'the'), CFGProduction(Det, 'a'), CFGProduction(N, 'man'), CFGProduction(V, 'saw'), CFGProduction(P, 'in'), CFGProduction(P, 'with'), CFGProduction(N, 'park'), CFGProduction(N, 'dog'), CFGProduction(N, 'statue'), CFGProduction(Det, 'my'), ) #productions *= 10 grammar = CFG(S, productions) def cb(cfg): print cfg top = Tk() editor = CFGEditor(top, grammar, cb) Label(top, text='\nTesting CFG Editor\n').pack() Button(top, text='Quit', command=top.destroy).pack() top.mainloop()
def demo():
    """
    Create a shift reduce parser demo, using a simple grammar and text.
    """
    from nltk.cfg import Nonterminal, CFGProduction, CFG
    S, VP, NP, PP, P, N, Name, V, Det = [
        Nonterminal(sym) for sym in 'S VP NP PP P N Name V Det'.split()]
    rules = []
    # Syntactic productions.
    for lhs, rhs in [(S, [NP, VP]), (NP, [Det, N]), (NP, [NP, PP]),
                     (VP, [VP, PP]), (VP, [V, NP, PP]), (VP, [V, NP]),
                     (PP, [P, NP])]:
        rules.append(CFGProduction(lhs, rhs))
    # Lexical productions.
    for lhs, word in [(NP, 'I'), (Det, 'the'), (Det, 'a'), (N, 'man'),
                      (V, 'saw'), (P, 'in'), (P, 'with'), (N, 'park'),
                      (N, 'dog'), (N, 'statue'), (Det, 'my')]:
        rules.append(CFGProduction(lhs, [word]))
    grammar = CFG(S, tuple(rules))

    # tokenize the sentence
    sent = Token(TEXT='my dog saw a man in the park with a statue')
    WhitespaceTokenizer().tokenize(sent)

    ShiftReduceParserDemo(grammar, sent).mainloop()
def __init__(self, parent, cfg=None, set_cfg_callback=None):
    """
    Create a CFG-editor window as a child of *parent*.

    cfg: the grammar to edit; defaults to an empty grammar whose start
        symbol is S.
    set_cfg_callback: if given, invoked with the new CFG when the user's
        edits are applied.
    """
    self._parent = parent
    if cfg is not None:
        self._cfg = cfg
    else:
        self._cfg = CFG(Nonterminal('S'), [])
    self._set_cfg_callback = set_cfg_callback
    # Flag read elsewhere in the class; enabled (1) by default.
    self._highlight_matching_nonterminals = 1

    # Create the top-level window.
    self._top = Toplevel(parent)
    self._init_bindings()

    # Build and lay out the frames: start symbol on top, productions in
    # the middle (the only frame that grows), buttons on the bottom.
    self._init_startframe()
    self._startframe.pack(side='top', fill='x', expand=0)
    self._init_prodframe()
    self._prodframe.pack(side='top', fill='both', expand=1)
    self._init_buttons()
    self._buttonframe.pack(side='bottom', fill='x', expand=0)

    # Give keyboard focus to the production text widget.
    self._textwidget.focus()
def regexp():
    """
    Demo regexp grammar
    """
    from nltk.cfg import Nonterminal, CFGProduction, CFG
    NP, N, AND = [Nonterminal(name) for name in 'NP N AND'.split()]
    grammar = CFG(NP, (
        CFGProduction(NP, NP, AND, NP),
        CFGProduction(NP, N),
        CFGProduction(N, 'cabbages'),
        CFGProduction(AND, 'and'),
        CFGProduction(N, 'kings'),
    ))
    tokens = WSTokenizer().tokenize('cabbages and kings')
    RecursiveDescentParserDemo(grammar, tokens).mainloop()
def get_rand_rhs(self, lhs):
    """
    Return a uniformly random production for *lhs*; string arguments are
    uppercased and wrapped as Nonterminals first.
    """
    if not isinstance(lhs, Nonterminal):
        lhs = Nonterminal(lhs.upper())
    candidates = self._index[lhs]
    return self._productions[random.choice(candidates)]
def _apply(self, *e):
    """
    Parse the editor's productions into a CFG and hand it to the
    registered callback, if any.
    """
    rules = self._parse_productions()
    root = Nonterminal(self._start.get())
    grammar = CFG(root, rules)
    if self._set_cfg_callback is not None:
        self._set_cfg_callback(grammar)
def get_max_rhs(self, lhs):
    """
    Return the first-indexed production for *lhs* (presumably the
    highest-probability one -- index ordering is set elsewhere).
    """
    if not isinstance(lhs, Nonterminal):
        lhs = Nonterminal(lhs.upper())
    first = self._index[lhs][0]
    return self._productions[first]
def get_all_rhs(self, lhs=''):
    """
    Return every production in the grammar, or only those for *lhs* when
    one is given (strings are uppercased and wrapped as Nonterminals).
    """
    if not lhs:
        return self._productions
    if not isinstance(lhs, Nonterminal):
        lhs = Nonterminal(lhs.upper())
    return [self._productions[idx] for idx in self._index[lhs]]
def get_max_rhs(self, lhs):
    """
    Fetch the production stored first in *lhs*'s index (presumably the
    most probable rule -- the index order is established elsewhere).
    """
    key = lhs if isinstance(lhs, Nonterminal) else Nonterminal(lhs.upper())
    return self._productions[self._index[key][0]]
def get_rand_rhs(self, lhs):
    """
    Pick one of *lhs*'s productions with equal probability; non-Nonterminal
    arguments are uppercased and wrapped first.
    """
    key = lhs if isinstance(lhs, Nonterminal) else Nonterminal(lhs.upper())
    chosen = random.choice(self._index[key])
    return self._productions[chosen]
def get_all_rhs(self, lhs=''):
    """
    List the grammar's productions, optionally restricted to *lhs*.
    When no *lhs* is given, the full production list is returned as-is.
    """
    if not lhs:
        return self._productions
    key = lhs if isinstance(lhs, Nonterminal) else Nonterminal(lhs.upper())
    result = []
    for n in self._index[key]:
        result.append(self._productions[n])
    return result