def if_then_else_demo():
    """
    Launch the recursive descent parser application on a small demo grammar.

    Despite the function's name, the grammar built here describes
    arithmetic expressions (sums and products of the identifiers
    ``a``, ``b`` and ``c``, with optional parentheses).  The text
    ``a * b + c`` is handed to ``RecursiveDescentApp`` together with
    the grammar, and the application's main loop is started.
    """
    from nltk.grammar import Nonterminal, Production, ContextFreeGrammar

    symbol_names = 'E E1 PLUS T T1 TIMES F LPAREN RPAREN ID'.split()
    E, E1, PLUS, T, T1, TIMES, F, LPAREN, RPAREN, ID = [
        Nonterminal(name) for name in symbol_names
    ]

    # (left-hand side, right-hand side) pairs, in grammar order.
    rules = (
        (E, [T, E1]),
        (E1, [PLUS, T, E1]),
        (E1, []),
        (T, [F, T1]),
        (T1, [TIMES, F, T1]),
        (T1, []),
        (F, [LPAREN, E, RPAREN]),
        (F, [ID]),
        (PLUS, ['+']),
        (TIMES, ['*']),
        (LPAREN, ['(']),
        (RPAREN, [')']),
        (ID, ['a']),
        (ID, ['b']),
        (ID, ['c']),
    )
    productions = tuple(Production(lhs, rhs) for lhs, rhs in rules)

    grammar = ContextFreeGrammar(E, productions)
    tokens = "a * b + c".split()
    RecursiveDescentApp(grammar, tokens).mainloop()
def demo2():
    """
    Build a small English grammar by hand and open it in ``CFGDemo``.

    The grammar is assembled from explicit ``Production`` objects
    (syntactic rules first, lexical rules second) and shown together
    with the tokenized sentence ``I saw a man in the park``.
    """
    from nltk import Nonterminal, Production, ContextFreeGrammar

    S, VP, NP, PP, P, N, Name, V, Det = [
        Nonterminal(name) for name in 'S VP NP PP P N Name V Det'.split()
    ]

    # Syntactic rules as (lhs, rhs) pairs.
    syntactic = [
        (S, [NP, VP]),
        (NP, [Det, N]),
        (NP, [NP, PP]),
        (VP, [VP, PP]),
        (VP, [V, NP, PP]),
        (VP, [V, NP]),
        (PP, [P, NP]),
        (PP, []),
        (PP, ['up', 'over', NP]),
    ]
    # Lexical rules as (lhs, word) pairs.
    lexical = [
        (NP, 'I'),
        (Det, 'the'),
        (Det, 'a'),
        (N, 'man'),
        (V, 'saw'),
        (P, 'in'),
        (P, 'with'),
        (N, 'park'),
        (N, 'dog'),
        (N, 'statue'),
        (Det, 'my'),
    ]

    productions = tuple(
        [Production(lhs, rhs) for lhs, rhs in syntactic]
        + [Production(lhs, [word]) for lhs, word in lexical]
    )

    grammar = ContextFreeGrammar(S, productions)
    tokens = 'I saw a man in the park'.split()

    d = CFGDemo(grammar, tokens)
    d.mainloop()
def demo():
    """
    Open a ``CFGEditor`` on a small English grammar for manual testing.

    A root Tk window is created with a caption and a Quit button; the
    editor is attached to it with a callback that prints whatever
    grammar the editor hands back.
    """
    from nltk import Nonterminal, ContextFreeGrammar

    S, VP, NP, PP, P, N, Name, V, Det = map(
        Nonterminal, 'S VP NP PP P N Name V Det'.split())

    grammar = ContextFreeGrammar.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    Det -> 'my'
    NP -> 'I'
    N -> 'dog'
    N -> 'man'
    N -> 'park'
    N -> 'statue'
    V -> 'saw'
    P -> 'in'
    P -> 'up'
    P -> 'over'
    P -> 'with'
    """)

    def cb(grammar):
        # Callback invoked by the editor with the edited grammar.
        print(grammar)

    top = Tk()
    editor = CFGEditor(top, grammar, cb)

    caption = Label(top, text='\nTesting CFG Editor\n')
    caption.pack()
    quit_button = Button(top, text='Quit', command=top.destroy)
    quit_button.pack()

    top.mainloop()
def demo(N=23):
    """
    Print the demo grammar and the first ``N`` sentences generated from it.

    :param N: number of sentences requested from ``generate``.
    """
    from nltk.grammar import ContextFreeGrammar

    header = 'Generating the first %d sentences for demo grammar:' % (N, )
    print(header)
    print(demo_grammar)

    grammar = ContextFreeGrammar.fromstring(demo_grammar)
    sentences = generate(grammar, n=N)

    # Number the sentences starting from 1.
    for position, words in enumerate(sentences, 1):
        text = ' '.join(words)
        print('%3d. %s' % (position, text))
def add_new_vocab_rule(self, rule):
    """
    Append one vocabulary rule and rebuild the grammar and parser.

    ``rule[0]`` is wrapped in ``NT`` and used as the left-hand side;
    ``rule[1]`` is passed through unchanged as the right-hand side.
    After the rule is stored in ``self.rules``, ``self.cfg`` and
    ``self.parser`` are recreated from the full rule list.
    """
    new_production = Production(NT(rule[0]), rule[1])
    self.rules.append(new_production)

    # Rebuild from scratch so the grammar and parser see the new rule.
    rebuilt_cfg = ContextFreeGrammar(NT("S"), self.rules)
    self.cfg = rebuilt_cfg
    self.parser = EarleyChartParser(rebuilt_cfg, trace=0)
def set_grammar(self, grammar):
    """
    Assign a new grammar to the parser.

    Args:
        - grammar: passed as the second argument to
          :class:`ContextFreeGrammar`, with ``S`` as the start symbol.
    """
    start_symbol = Nonterminal('S')
    new_grammar = ContextFreeGrammar(start_symbol, grammar)
    self._grammar = new_grammar
def __init__(self, parent, cfg=None, set_cfg_callback=None):
    """
    Create the editor window and lay out its three frames.

    :param parent: widget the top-level editor window is attached to.
    :param cfg: grammar to edit; when omitted, an empty grammar with
        start symbol ``S`` is used.
    :param set_cfg_callback: stored for later use by the editor; may
        be None.
    """
    self._parent = parent
    self._cfg = ContextFreeGrammar(Nonterminal('S'), []) if cfg is None else cfg
    self._set_cfg_callback = set_cfg_callback
    self._highlight_matching_nonterminals = 1

    # Create the top-level window.
    self._top = Toplevel(parent)
    self._init_bindings()

    # Each frame is built by its _init_* method and packed right away.
    self._init_startframe()
    self._startframe.pack(side='top', fill='x', expand=0)

    self._init_prodframe()
    self._prodframe.pack(side='top', fill='both', expand=1)

    self._init_buttons()
    self._buttonframe.pack(side='bottom', fill='x', expand=0)

    self._textwidget.focus()
def parse_NP(self, sen):
    """
    Parse a partial sentence (usually a noun phrase).

    A temporary grammar with start symbol ``NP`` is built from
    ``self.rules`` and used to parse the space-separated tokens of
    ``sen``.

    Returns the first parse when one exists.  Otherwise prints
    ``failure`` and returns None; if building the grammar or parsing
    raised an exception, the traceback is printed first.
    """
    try:
        cfg_temp = ContextFreeGrammar(NT("NP"), self.rules)
        parser_temp = EarleyChartParser(cfg_temp, trace=0)
        parse = parser_temp.nbest_parse(sen.strip().split(" "), trace=0)
    except Exception:
        # Report the problem but keep going.  Catching Exception rather
        # than using a bare except lets KeyboardInterrupt and SystemExit
        # propagate.
        print(traceback.format_exc())
    else:
        if parse:
            return parse[0]
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("failure")
    return None
def fail_demo():
    """
    Launch the recursive descent parser application on a grammar that,
    per the original author, should not work with backtracking for all
    inputs.

    The grammar is ``S -> A S A | A A`` and ``A -> 'a'``; the text is
    six ``a`` tokens.
    """
    from nltk.grammar import Nonterminal, Production, ContextFreeGrammar

    S, A = Nonterminal('S'), Nonterminal('A')

    rules = (
        (S, [A, S, A]),
        (S, [A, A]),
        (A, ['a']),
    )
    productions = tuple(Production(lhs, rhs) for lhs, rhs in rules)

    grammar = ContextFreeGrammar(S, productions)
    tokens = "a a a a a a".split()
    RecursiveDescentApp(grammar, tokens).mainloop()
def app():
    """
    Start the shift-reduce parser application on a simple grammar
    and sentence.

    The grammar is built from explicit ``Production`` objects; the
    sentence is ``my dog saw a man in the park with a statue``.
    """
    from nltk.grammar import Nonterminal, Production, ContextFreeGrammar

    S, VP, NP, PP, P, N, Name, V, Det = map(
        Nonterminal, 'S VP NP PP P N Name V Det'.split())

    # Syntactic rules as (lhs, rhs) pairs.
    syntactic = [
        (S, [NP, VP]),
        (NP, [Det, N]),
        (NP, [NP, PP]),
        (VP, [VP, PP]),
        (VP, [V, NP, PP]),
        (VP, [V, NP]),
        (PP, [P, NP]),
    ]
    # Lexical rules as (lhs, word) pairs.
    lexical = [
        (NP, 'I'),
        (Det, 'the'),
        (Det, 'a'),
        (N, 'man'),
        (V, 'saw'),
        (P, 'in'),
        (P, 'with'),
        (N, 'park'),
        (N, 'dog'),
        (N, 'statue'),
        (Det, 'my'),
    ]

    productions = tuple(
        [Production(lhs, rhs) for lhs, rhs in syntactic]
        + [Production(lhs, [word]) for lhs, word in lexical]
    )
    grammar = ContextFreeGrammar(S, productions)

    # Tokenize the sentence.
    tokens = 'my dog saw a man in the park with a statue'.split()

    ShiftReduceApp(grammar, tokens).mainloop()
def __init__(self, rules_file="rules.gr", vocab_file="vocabulary.gr"):
    """
    Read grammar rules (from ``rules_file``) and vocabulary rules (from
    ``vocab_file``), then create ``self.cfg`` (a ContextFreeGrammar with
    start symbol ``S``) and ``self.parser`` (an EarleyChartParser).

    In both files, blank lines and lines whose first non-blank
    character is ``#`` are skipped.  For every other line the first two
    characters are dropped and the remainder is split at the first tab
    into a left-hand side and a space-separated right-hand side.

    - Grammar rules: every right-hand-side item becomes a nonterminal.
    - Vocabulary rules: the right-hand side is lower-cased and its
      items are kept as plain strings.
    """
    def read_entries(path):
        """Return the ``(lhs, rhs_text)`` pairs found in the file at ``path``."""
        entries = []
        # The with-block closes the file even if reading fails part-way.
        with open(path, "r") as handle:
            for line in handle:
                stripped = line.strip()
                if stripped == "" or stripped.startswith("#"):
                    continue
                # Drop the two leading characters of the raw line
                # (presumably a one-character prefix column and its
                # separator -- TODO confirm against the .gr file format).
                lhs, _, rhs = line[2:].partition("\t")
                entries.append((lhs.strip(), rhs.strip()))
        return entries

    self.rules = []

    # Grammar rules: the right-hand side consists of nonterminals.
    for lhs, rhs in read_entries(rules_file):
        symbols = [NT(x) for x in rhs.split(" ")]
        self.rules.append(Production(NT(lhs), symbols))

    # Vocabulary rules: the right-hand side consists of lower-cased words.
    for lhs, rhs in read_entries(vocab_file):
        words = rhs.lower().split(" ")
        self.rules.append(Production(NT(lhs), words))

    # Create the grammar and parser.
    self.cfg = ContextFreeGrammar(NT("S"), self.rules)
    self.parser = EarleyChartParser(self.cfg, trace=0)
def app():
    """
    Start the recursive descent parser application on a simple grammar
    and sentence.

    The grammar is parsed from its textual form with
    ``ContextFreeGrammar.fromstring``; the sentence is
    ``the dog saw a man in the park``.
    """
    from nltk.grammar import ContextFreeGrammar

    grammar_text = """
    # Grammatical productions.
        S -> NP VP
        NP -> Det N PP | Det N
        VP -> V NP PP | V NP | V
        PP -> P NP
    # Lexical productions.
        NP -> 'I'
        Det -> 'the' | 'a'
        N -> 'man' | 'park' | 'dog' | 'telescope'
        V -> 'ate' | 'saw'
        P -> 'in' | 'under' | 'with'
    """
    grammar = ContextFreeGrammar.fromstring(grammar_text)

    tokens = 'the dog saw a man in the park'.split()

    application = RecursiveDescentApp(grammar, tokens)
    application.mainloop()
class CFGEditor(object):
    """
    A dialog window for creating and editing context free grammars.
    ``CFGEditor`` imposes the following restrictions:

    - All nonterminals must be strings consisting of word characters.
    - All terminals must be strings consisting of word characters
      and space characters.
    """
    # Regular expressions used by _analyze_line.  Precompile them, so
    # we can process the text faster.  Raw strings are used wherever a
    # pattern contains a backslash escape such as \s.
    ARROW = SymbolWidget.SYMBOLS['rightarrow']
    _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|(" + ARROW + "))")
    _ARROW_RE = re.compile(r"\s*(->|(" + ARROW + r"))\s*")
    _PRODUCTION_RE = re.compile(r"(^\s*\w+\s*)" +              # LHS
                                r"(->|(" + ARROW + r"))\s*" +  # arrow
                                r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$")  # RHS
    _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|(" + ARROW + ")")
    _BOLD = ('helvetica', -12, 'bold')

    def __init__(self, parent, cfg=None, set_cfg_callback=None):
        """
        Create the editor window.

        :param parent: widget the editor's top-level window belongs to.
        :param cfg: grammar to edit; when None, an empty grammar with
            start symbol ``S`` is used.
        :param set_cfg_callback: called with a grammar by ``_apply``
            and ``_reset``; may be None.
        """
        self._parent = parent
        if cfg is not None:
            self._cfg = cfg
        else:
            self._cfg = ContextFreeGrammar(Nonterminal('S'), [])
        self._set_cfg_callback = set_cfg_callback

        self._highlight_matching_nonterminals = 1

        # Create the top-level window.
        self._top = Toplevel(parent)
        self._init_bindings()

        self._init_startframe()
        self._startframe.pack(side='top', fill='x', expand=0)
        self._init_prodframe()
        self._prodframe.pack(side='top', fill='both', expand=1)
        self._init_buttons()
        self._buttonframe.pack(side='bottom', fill='x', expand=0)

        self._textwidget.focus()

    def _init_startframe(self):
        """Build the frame holding the start-symbol entry and its labels."""
        frame = self._startframe = Frame(self._top)
        self._start = Entry(frame)
        self._start.pack(side='right')
        Label(frame, text='Start Symbol:').pack(side='right')
        Label(frame, text='Productions:').pack(side='left')
        self._start.insert(0, self._cfg.start().symbol())

    def _init_buttons(self):
        """Build the frame holding the Ok/Apply/Reset/Cancel/Help buttons."""
        frame = self._buttonframe = Frame(self._top)
        Button(frame, text='Ok', command=self._ok,
               underline=0, takefocus=0).pack(side='left')
        Button(frame, text='Apply', command=self._apply,
               underline=0, takefocus=0).pack(side='left')
        Button(frame, text='Reset', command=self._reset,
               underline=0, takefocus=0).pack(side='left')
        Button(frame, text='Cancel', command=self._cancel,
               underline=0, takefocus=0).pack(side='left')
        Button(frame, text='Help', command=self._help,
               underline=0, takefocus=0).pack(side='right')

    def _init_bindings(self):
        """Set the window title and bind the keyboard shortcuts."""
        self._top.title('CFG Editor')
        # Control-x and Control-c are deliberately absent from the
        # cancel shortcuts: their bindings were disabled in the
        # original code.
        shortcuts = (
            (self._cancel, ('<Control-q>', '<Alt-q>', '<Control-d>',
                            '<Alt-x>', '<Escape>', '<Alt-c>')),
            (self._ok, ('<Control-o>', '<Alt-o>')),
            (self._apply, ('<Control-a>', '<Alt-a>')),
            (self._reset, ('<Control-r>', '<Alt-r>')),
            (self._help, ('<Control-h>', '<Alt-h>', '<F1>')),
        )
        for handler, sequences in shortcuts:
            for sequence in sequences:
                self._top.bind(sequence, handler)

    def _init_prodframe(self):
        """
        Build the production text widget with its scrollbar, and fill
        it with the productions of the current grammar.
        """
        self._prodframe = Frame(self._top)

        # Create the basic Text widget & scrollbar.
        self._textwidget = Text(self._prodframe, background='#e0e0e0',
                                exportselection=1)
        self._textscroll = Scrollbar(self._prodframe, takefocus=0,
                                     orient='vertical')
        self._textwidget.config(yscrollcommand=self._textscroll.set)
        self._textscroll.config(command=self._textwidget.yview)
        self._textscroll.pack(side='right', fill='y')
        self._textwidget.pack(expand=1, fill='both', side='left')

        # Initialize the colorization tags.  Each nonterminal gets its
        # own tag, so they aren't listed here.
        self._textwidget.tag_config('terminal', foreground='#006000')
        self._textwidget.tag_config('arrow', font='symbol')
        self._textwidget.tag_config('error', background='red')

        # Keep track of what line they're on.  We use that to remember
        # to re-analyze a line whenever they leave it.
        self._linenum = 0

        # Expand "->" to an arrow.
        self._top.bind('>', self._replace_arrows)

        # Re-colorize lines when appropriate.
        self._top.bind('<<Paste>>', self._analyze)
        self._top.bind('<KeyPress>', self._check_analyze)
        self._top.bind('<ButtonPress>', self._check_analyze)

        # Tab cycles focus. (why doesn't this work??)
        def cycle(e, textwidget=self._textwidget):
            textwidget.tk_focusNext().focus()
        self._textwidget.bind('<Tab>', cycle)

        # Merge adjacent productions that share a left-hand side into
        # one entry with several alternatives.  Empty right-hand sides
        # are never merged.  Walk backwards so deleting entry i does
        # not disturb the indices still to be visited.
        prod_tuples = [(p.lhs(), [p.rhs()]) for p in self._cfg.productions()]
        for i in range(len(prod_tuples) - 1, 0, -1):
            if prod_tuples[i][0] == prod_tuples[i - 1][0]:
                if () in prod_tuples[i][1]:
                    continue
                if () in prod_tuples[i - 1][1]:
                    continue
                prod_tuples[i - 1][1].extend(prod_tuples[i][1])
                del prod_tuples[i]

        # Write one "LHS -> alt | alt ..." line per entry; nonterminals
        # are written with %s and terminals with %r.
        for lhs, rhss in prod_tuples:
            alternatives = []
            for rhs in rhss:
                alternatives.append(''.join(
                    (' %s' % elt) if isinstance(elt, Nonterminal)
                    else (' %r' % elt)
                    for elt in rhs))
            line = '%s ->' % lhs + ' |'.join(alternatives) + '\n'
            self._textwidget.insert('end', line)

        self._analyze()

    def _clear_tags(self, linenum):
        """
        Remove all tags (except ``arrow`` and ``sel``) from the given
        line of the text widget used for editing the productions.
        """
        start = '%d.0' % linenum
        end = '%d.end' % linenum
        for tag in self._textwidget.tag_names():
            if tag not in ('arrow', 'sel'):
                self._textwidget.tag_remove(tag, start, end)

    def _check_analyze(self, *e):
        """
        Check if we've moved to a new line.  If we have, then remove
        all colorization from the line we moved to, and re-colorize
        the line that we moved from.
        """
        linenum = int(self._textwidget.index('insert').split('.')[0])
        if linenum != self._linenum:
            self._clear_tags(linenum)
            self._analyze_line(self._linenum)
            self._linenum = linenum

    def _replace_arrows(self, *e):
        """
        Replace any ``'->'`` text strings with arrows (char \\256, in
        symbol font).  This searches the whole buffer, but is fast
        enough to be done anytime they press '>'.
        """
        # First pass: swap each "->" for the arrow character, with a
        # tab inserted at the same position.
        arrow = '1.0'
        while True:
            arrow = self._textwidget.search('->', arrow, 'end+1char')
            if arrow == '':
                break
            self._textwidget.delete(arrow, arrow + '+2char')
            self._textwidget.insert(arrow, self.ARROW, 'arrow')
            self._textwidget.insert(arrow, '\t')

        # Second pass: tag the arrow characters found in the buffer.
        arrow = '1.0'
        while True:
            arrow = self._textwidget.search(self.ARROW, arrow + '+1char',
                                            'end+1char')
            if arrow == '':
                break
            self._textwidget.tag_add('arrow', arrow, arrow + '+1char')

    def _analyze_token(self, match, linenum):
        """
        Given a line number and a regexp match for a token on that
        line, colorize the token.  Note that the regexp match gives us
        the token's text, start index (on the line), and end index (on
        the line).
        """
        # What type of token is it?
        if match.group()[0] in "'\"":
            tag = 'terminal'
        elif match.group() in ('->', self.ARROW):
            tag = 'arrow'
        else:
            # If it's a nonterminal, then set up new bindings, so we
            # can highlight all instances of that nonterminal when we
            # put the mouse over it.
            tag = 'nonterminal_' + match.group()
            if tag not in self._textwidget.tag_names():
                self._init_nonterminal_tag(tag)

        start = '%d.%d' % (linenum, match.start())
        end = '%d.%d' % (linenum, match.end())
        self._textwidget.tag_add(tag, start, end)

    def _init_nonterminal_tag(self, tag, foreground='blue'):
        """
        Configure the tag for one nonterminal and, when highlighting
        is enabled, change its background while the mouse is over it.
        """
        self._textwidget.tag_config(tag, foreground=foreground,
                                    font=CFGEditor._BOLD)
        if not self._highlight_matching_nonterminals:
            return

        def enter(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background='#80ff80')

        def leave(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background='')

        self._textwidget.tag_bind(tag, '<Enter>', enter)
        self._textwidget.tag_bind(tag, '<Leave>', leave)

    def _analyze_line(self, linenum):
        """
        Colorize a given line.
        """
        # Get rid of any tags that were previously on the line.
        self._clear_tags(linenum)

        # Get the line's text string.
        line = self._textwidget.get('%d.0' % linenum, '%d.end' % linenum)

        # If it's a valid production, then colorize each token.
        if CFGEditor._PRODUCTION_RE.match(line):
            # It's valid; use _TOKEN_RE to tokenize the production,
            # and call analyze_token on each token.
            def analyze_token(match, self=self, linenum=linenum):
                self._analyze_token(match, linenum)
                return ''
            CFGEditor._TOKEN_RE.sub(analyze_token, line)
        elif line.strip() != '':
            # It's invalid; show the user where the error is.
            self._mark_error(linenum, line)

    def _mark_error(self, linenum, line):
        """
        Mark the location of an error in a line.
        """
        arrowmatch = CFGEditor._ARROW_RE.search(line)
        if not arrowmatch:
            # If there's no arrow at all, highlight the whole line.
            start = '%d.0' % linenum
            end = '%d.end' % linenum
        elif not CFGEditor._LHS_RE.match(line):
            # Otherwise, if the LHS is bad, highlight it.
            start = '%d.0' % linenum
            end = '%d.%d' % (linenum, arrowmatch.start())
        else:
            # Otherwise, highlight the RHS.
            start = '%d.%d' % (linenum, arrowmatch.end())
            end = '%d.end' % linenum

        # If we're highlighting 0 chars, highlight the whole line.
        if self._textwidget.compare(start, '==', end):
            start = '%d.0' % linenum
            end = '%d.end' % linenum
        self._textwidget.tag_add('error', start, end)

    def _analyze(self, *e):
        """
        Replace ``->`` with arrows, and colorize the entire buffer.
        """
        self._replace_arrows()
        numlines = int(self._textwidget.index('end').split('.')[0])
        for linenum in range(1, numlines + 1):  # line numbers start at 1.
            self._analyze_line(linenum)

    def _parse_productions(self):
        """
        Parse the current contents of the textwidget buffer, to create
        a list of productions.
        """
        productions = []

        # Get the text, normalize it, and split it into lines.  Plain
        # str.replace is used because both substitutions are literal.
        text = self._textwidget.get('1.0', 'end')
        text = text.replace(self.ARROW, '->')
        text = text.replace('\t', ' ')
        lines = text.split('\n')

        # Convert each line to a CFG production.
        for line in lines:
            line = line.strip()
            if line == '':
                continue
            productions += parse_cfg_production(line)

        return productions

    def _destroy(self, *e):
        """Destroy the editor window; do nothing if already destroyed."""
        if self._top is None:
            return
        self._top.destroy()
        self._top = None

    def _ok(self, *e):
        """Apply the current buffer, then close the window."""
        self._apply()
        self._destroy()

    def _apply(self, *e):
        """
        Build a grammar from the buffer and the start-symbol entry,
        and pass it to the callback (if one was given).
        """
        productions = self._parse_productions()
        start = Nonterminal(self._start.get())
        cfg = ContextFreeGrammar(start, productions)
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(cfg)

    def _reset(self, *e):
        """
        Refill the buffer with the productions of the original grammar
        and pass that grammar to the callback (if one was given).
        """
        self._textwidget.delete('1.0', 'end')
        for production in self._cfg.productions():
            self._textwidget.insert('end', '%s\n' % production)
        self._analyze()
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(self._cfg)

    def _cancel(self, *e):
        """Try to reset the editor, then close the window."""
        try:
            self._reset()
        except Exception:
            # Best effort: a failed reset must not prevent closing.
            pass
        self._destroy()

    def _help(self, *e):
        """Show the editor's help text in a separate window."""
        # The default font's not very legible; try using 'fixed' instead.
        try:
            ShowText(self._parent, 'Help: Chart Parser Demo',
                     (_CFGEditor_HELP).strip(), width=75, font='fixed')
        except Exception:
            # Fall back to the default font.
            ShowText(self._parent, 'Help: Chart Parser Demo',
                     (_CFGEditor_HELP).strip(), width=75)
def _apply(self, *e):
    """
    Build a grammar from the editor's contents and hand it to the
    callback.

    The productions come from ``self._parse_productions()`` and the
    start symbol from the ``self._start`` entry.  Nothing is reported
    when no callback was registered.
    """
    parsed = self._parse_productions()
    start_symbol = Nonterminal(self._start.get())
    new_grammar = ContextFreeGrammar(start_symbol, parsed)

    notify = self._set_cfg_callback
    if notify is None:
        return
    notify(new_grammar)
class CFGEditor(object):
    """
    A dialog window for creating and editing context free grammars.
    C{CFGEditor} places the following restrictions on what C{CFG}s can
    be edited:

        - All nonterminals must be strings consisting of word
          characters.
        - All terminals must be strings consisting of word characters
          and space characters.
    """
    # Regular expressions used by _analyze_line.  Precompile them, so
    # we can process the text faster.  Raw strings are used wherever a
    # pattern contains a backslash escape such as \s.
    ARROW = SymbolWidget.SYMBOLS['rightarrow']
    _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|(" + ARROW + "))")
    _ARROW_RE = re.compile(r"\s*(->|(" + ARROW + r"))\s*")
    _PRODUCTION_RE = re.compile(r"(^\s*\w+\s*)" +              # LHS
                                r"(->|(" + ARROW + r"))\s*" +  # arrow
                                r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$")  # RHS
    _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|(" + ARROW + ")")
    _BOLD = ('helvetica', -12, 'bold')

    def __init__(self, parent, cfg=None, set_cfg_callback=None):
        """
        Create the editor window.

        @param parent: widget the editor's top-level window belongs to.
        @param cfg: grammar to edit; when None, an empty grammar with
            start symbol C{S} is used.
        @param set_cfg_callback: called with a grammar by C{_apply}
            and C{_reset}; may be None.
        """
        self._parent = parent
        if cfg is not None:
            self._cfg = cfg
        else:
            self._cfg = ContextFreeGrammar(Nonterminal('S'), [])
        self._set_cfg_callback = set_cfg_callback

        self._highlight_matching_nonterminals = 1

        # Create the top-level window.
        self._top = Toplevel(parent)
        self._init_bindings()

        self._init_startframe()
        self._startframe.pack(side='top', fill='x', expand=0)
        self._init_prodframe()
        self._prodframe.pack(side='top', fill='both', expand=1)
        self._init_buttons()
        self._buttonframe.pack(side='bottom', fill='x', expand=0)

        self._textwidget.focus()

    def _init_startframe(self):
        """Build the frame holding the start-symbol entry and its labels."""
        frame = self._startframe = Frame(self._top)
        self._start = Entry(frame)
        self._start.pack(side='right')
        Label(frame, text='Start Symbol:').pack(side='right')
        Label(frame, text='Productions:').pack(side='left')
        self._start.insert(0, self._cfg.start().symbol())

    def _init_buttons(self):
        """Build the frame holding the Ok/Apply/Reset/Cancel/Help buttons."""
        frame = self._buttonframe = Frame(self._top)
        Button(frame, text='Ok', command=self._ok,
               underline=0, takefocus=0).pack(side='left')
        Button(frame, text='Apply', command=self._apply,
               underline=0, takefocus=0).pack(side='left')
        Button(frame, text='Reset', command=self._reset,
               underline=0, takefocus=0).pack(side='left')
        Button(frame, text='Cancel', command=self._cancel,
               underline=0, takefocus=0).pack(side='left')
        Button(frame, text='Help', command=self._help,
               underline=0, takefocus=0).pack(side='right')

    def _init_bindings(self):
        """Set the window title and bind the keyboard shortcuts."""
        self._top.title('CFG Editor')
        # Control-x and Control-c are deliberately absent from the
        # cancel shortcuts: their bindings were disabled in the
        # original code.
        shortcuts = (
            (self._cancel, ('<Control-q>', '<Alt-q>', '<Control-d>',
                            '<Alt-x>', '<Escape>', '<Alt-c>')),
            (self._ok, ('<Control-o>', '<Alt-o>')),
            (self._apply, ('<Control-a>', '<Alt-a>')),
            (self._reset, ('<Control-r>', '<Alt-r>')),
            (self._help, ('<Control-h>', '<Alt-h>', '<F1>')),
        )
        for handler, sequences in shortcuts:
            for sequence in sequences:
                self._top.bind(sequence, handler)

    def _init_prodframe(self):
        """
        Build the production text widget with its scrollbar, and fill
        it with the productions of the current grammar.
        """
        self._prodframe = Frame(self._top)

        # Create the basic Text widget & scrollbar.
        self._textwidget = Text(self._prodframe, background='#e0e0e0',
                                exportselection=1)
        self._textscroll = Scrollbar(self._prodframe, takefocus=0,
                                     orient='vertical')
        self._textwidget.config(yscrollcommand=self._textscroll.set)
        self._textscroll.config(command=self._textwidget.yview)
        self._textscroll.pack(side='right', fill='y')
        self._textwidget.pack(expand=1, fill='both', side='left')

        # Initialize the colorization tags.  Each nonterminal gets its
        # own tag, so they aren't listed here.
        self._textwidget.tag_config('terminal', foreground='#006000')
        self._textwidget.tag_config('arrow', font='symbol')
        self._textwidget.tag_config('error', background='red')

        # Keep track of what line they're on.  We use that to remember
        # to re-analyze a line whenever they leave it.
        self._linenum = 0

        # Expand "->" to an arrow.
        self._top.bind('>', self._replace_arrows)

        # Re-colorize lines when appropriate.
        self._top.bind('<<Paste>>', self._analyze)
        self._top.bind('<KeyPress>', self._check_analyze)
        self._top.bind('<ButtonPress>', self._check_analyze)

        # Tab cycles focus. (why doesn't this work??)
        def cycle(e, textwidget=self._textwidget):
            textwidget.tk_focusNext().focus()
        self._textwidget.bind('<Tab>', cycle)

        # Merge adjacent productions that share a left-hand side into
        # one entry with several alternatives.  Empty right-hand sides
        # are never merged.  Walk backwards so deleting entry i does
        # not disturb the indices still to be visited.
        prod_tuples = [(p.lhs(), [p.rhs()]) for p in self._cfg.productions()]
        for i in range(len(prod_tuples) - 1, 0, -1):
            if prod_tuples[i][0] == prod_tuples[i - 1][0]:
                if () in prod_tuples[i][1]:
                    continue
                if () in prod_tuples[i - 1][1]:
                    continue
                prod_tuples[i - 1][1].extend(prod_tuples[i][1])
                del prod_tuples[i]

        # Write one "LHS -> alt | alt ..." line per entry; nonterminals
        # are written with %s and terminals with %r.
        for lhs, rhss in prod_tuples:
            alternatives = []
            for rhs in rhss:
                alternatives.append(''.join(
                    (' %s' % elt) if isinstance(elt, Nonterminal)
                    else (' %r' % elt)
                    for elt in rhs))
            line = '%s ->' % lhs + ' |'.join(alternatives) + '\n'
            self._textwidget.insert('end', line)

        self._analyze()

    def _clear_tags(self, linenum):
        """
        Remove all tags (except C{arrow} and C{sel}) from the given
        line of the text widget used for editing the productions.
        """
        start = '%d.0' % linenum
        end = '%d.end' % linenum
        for tag in self._textwidget.tag_names():
            if tag not in ('arrow', 'sel'):
                self._textwidget.tag_remove(tag, start, end)

    def _check_analyze(self, *e):
        """
        Check if we've moved to a new line.  If we have, then remove
        all colorization from the line we moved to, and re-colorize
        the line that we moved from.
        """
        linenum = int(self._textwidget.index('insert').split('.')[0])
        if linenum != self._linenum:
            self._clear_tags(linenum)
            self._analyze_line(self._linenum)
            self._linenum = linenum

    def _replace_arrows(self, *e):
        """
        Replace any C{'->'} text strings with arrows (char \\256, in
        symbol font).  This searches the whole buffer, but is fast
        enough to be done anytime they press '>'.
        """
        # First pass: swap each "->" for the arrow character, with a
        # tab inserted at the same position.
        arrow = '1.0'
        while True:
            arrow = self._textwidget.search('->', arrow, 'end+1char')
            if arrow == '':
                break
            self._textwidget.delete(arrow, arrow + '+2char')
            self._textwidget.insert(arrow, self.ARROW, 'arrow')
            self._textwidget.insert(arrow, '\t')

        # Second pass: tag the arrow characters found in the buffer.
        arrow = '1.0'
        while True:
            arrow = self._textwidget.search(self.ARROW, arrow + '+1char',
                                            'end+1char')
            if arrow == '':
                break
            self._textwidget.tag_add('arrow', arrow, arrow + '+1char')

    def _analyze_token(self, match, linenum):
        """
        Given a line number and a regexp match for a token on that
        line, colorize the token.  Note that the regexp match gives us
        the token's text, start index (on the line), and end index (on
        the line).
        """
        # What type of token is it?
        if match.group()[0] in "'\"":
            tag = 'terminal'
        elif match.group() in ('->', self.ARROW):
            tag = 'arrow'
        else:
            # If it's a nonterminal, then set up new bindings, so we
            # can highlight all instances of that nonterminal when we
            # put the mouse over it.
            tag = 'nonterminal_' + match.group()
            if tag not in self._textwidget.tag_names():
                self._init_nonterminal_tag(tag)

        start = '%d.%d' % (linenum, match.start())
        end = '%d.%d' % (linenum, match.end())
        self._textwidget.tag_add(tag, start, end)

    def _init_nonterminal_tag(self, tag, foreground='blue'):
        """
        Configure the tag for one nonterminal and, when highlighting
        is enabled, change its background while the mouse is over it.
        """
        self._textwidget.tag_config(tag, foreground=foreground,
                                    font=CFGEditor._BOLD)
        if not self._highlight_matching_nonterminals:
            return

        def enter(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background='#80ff80')

        def leave(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background='')

        self._textwidget.tag_bind(tag, '<Enter>', enter)
        self._textwidget.tag_bind(tag, '<Leave>', leave)

    def _analyze_line(self, linenum):
        """
        Colorize a given line.
        """
        # Get rid of any tags that were previously on the line.
        self._clear_tags(linenum)

        # Get the line's text string.  The index is built with string
        # formatting instead of the backtick repr syntax, which only
        # exists in Python 2.
        line = self._textwidget.get('%d.0' % linenum, '%d.end' % linenum)

        # If it's a valid production, then colorize each token.
        if CFGEditor._PRODUCTION_RE.match(line):
            # It's valid; use _TOKEN_RE to tokenize the production,
            # and call analyze_token on each token.
            def analyze_token(match, self=self, linenum=linenum):
                self._analyze_token(match, linenum)
                return ''
            CFGEditor._TOKEN_RE.sub(analyze_token, line)
        elif line.strip() != '':
            # It's invalid; show the user where the error is.
            self._mark_error(linenum, line)

    def _mark_error(self, linenum, line):
        """
        Mark the location of an error in a line.
        """
        arrowmatch = CFGEditor._ARROW_RE.search(line)
        if not arrowmatch:
            # If there's no arrow at all, highlight the whole line.
            start = '%d.0' % linenum
            end = '%d.end' % linenum
        elif not CFGEditor._LHS_RE.match(line):
            # Otherwise, if the LHS is bad, highlight it.
            start = '%d.0' % linenum
            end = '%d.%d' % (linenum, arrowmatch.start())
        else:
            # Otherwise, highlight the RHS.
            start = '%d.%d' % (linenum, arrowmatch.end())
            end = '%d.end' % linenum

        # If we're highlighting 0 chars, highlight the whole line.
        if self._textwidget.compare(start, '==', end):
            start = '%d.0' % linenum
            end = '%d.end' % linenum
        self._textwidget.tag_add('error', start, end)

    def _analyze(self, *e):
        """
        Replace C{->} with arrows, and colorize the entire buffer.
        """
        self._replace_arrows()
        numlines = int(self._textwidget.index('end').split('.')[0])
        for linenum in range(1, numlines + 1):  # line numbers start at 1.
            self._analyze_line(linenum)

    def _parse_productions(self):
        """
        Parse the current contents of the textwidget buffer, to create
        a list of productions.
        """
        productions = []

        # Get the text, normalize it, and split it into lines.  Plain
        # str.replace is used because both substitutions are literal.
        text = self._textwidget.get('1.0', 'end')
        text = text.replace(self.ARROW, '->')
        text = text.replace('\t', ' ')
        lines = text.split('\n')

        # Convert each line to a CFG production.
        for line in lines:
            line = line.strip()
            if line == '':
                continue
            productions += parse_cfg_production(line)

        return productions

    def _destroy(self, *e):
        """Destroy the editor window; do nothing if already destroyed."""
        if self._top is None:
            return
        self._top.destroy()
        self._top = None

    def _ok(self, *e):
        """Apply the current buffer, then close the window."""
        self._apply()
        self._destroy()

    def _apply(self, *e):
        """
        Build a grammar from the buffer and the start-symbol entry,
        and pass it to the callback (if one was given).
        """
        productions = self._parse_productions()
        start = Nonterminal(self._start.get())
        cfg = ContextFreeGrammar(start, productions)
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(cfg)

    def _reset(self, *e):
        """
        Refill the buffer with the productions of the original grammar
        and pass that grammar to the callback (if one was given).
        """
        self._textwidget.delete('1.0', 'end')
        for production in self._cfg.productions():
            self._textwidget.insert('end', '%s\n' % production)
        self._analyze()
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(self._cfg)

    def _cancel(self, *e):
        """Try to reset the editor, then close the window."""
        try:
            self._reset()
        except Exception:
            # Best effort: a failed reset must not prevent closing.
            pass
        self._destroy()

    def _help(self, *e):
        """Show the editor's help text in a separate window."""
        # The default font's not very legible; try using 'fixed' instead.
        try:
            ShowText(self._parent, 'Help: Chart Parser Demo',
                     (_CFGEditor_HELP).strip(), width=75, font='fixed')
        except Exception:
            # Fall back to the default font.
            ShowText(self._parent, 'Help: Chart Parser Demo',
                     (_CFGEditor_HELP).strip(), width=75)
#
# Build a PCFG extracted from the Penn treebank
#
import nltk
from nltk.corpus import treebank
from nltk.grammar import ContextFreeGrammar, Nonterminal, Production

# Extract the productions from the treebank (with repetitions, so that
# they can be counted below).
production_list = list(
    production
    for sent in treebank.parsed_sents()
    for production in sent.productions()
)

# Grammar over the distinct productions, with 'S' as start symbol.
tbank_productions = set(production_list)
tbank_grammar = ContextFreeGrammar(Nonterminal('S'), list(tbank_productions))
grammar_productions = tbank_grammar.productions()

lhsCount = {}
prodCount = {}
probs = {}

# Prepare the data structures for count(a -> b) / count(a), where a is
# a nonterminal: tally every left-hand side and every production.
for production in production_list:
    lhsCount[production.lhs()] = lhsCount.get(production.lhs(), 0) + 1
    prodCount[production] = prodCount.get(production, 0) + 1
def parse(self, p_string):
    """
    Parses a string and stores the resulting hierarchy of "domains",
    "hierarchies" and "tables".

    The string is parsed using the nltk context free grammar library.

    A query is a "sentence" and can either be a domain, hierarchy or a
    table.
    A domain is simply a word.
    A hierarchy is expressed as "domain/domain".
    A table is expressed as "table(sentence, sentence, sentence)".

    Internally the query is represented as a nltk.parse.tree.

    Process:
      1. string is tokenized
      2. develop a context free grammar
      3. parse
      4. convert to a tree representation

    On success, ``self.parseList``, ``self.nltktree``,
    ``self.parseTree`` and ``self.xml`` are set.  When the query cannot
    be parsed, a message is printed, ``self.nltktree`` is left as None
    and the method returns early.
    """
    self.nltktree = None

    # Store the query string.
    self.string = p_string

    # Tokenize the query string, allowing only strings, parentheses,
    # forward slashes and commas.  The tokens are materialized into a
    # list once, so they can be used both to build the grammar and to
    # parse (an iterator would be exhausted after the first use).
    re_all = r'table[(]|\,|[)]|[/]|\w+'
    tokens = list(tokenize.regexp_tokenize(self.string, re_all))

    # Develop a context free grammar.
    # O = sentence (start symbol), T = table, H = hierarchy, D = domain
    O, T, H, D = nonterminals('O, T, H, D')

    # Specify the grammar.
    productions = (
        # A sentence can be either a table, hierarchy or domain.
        Production(O, [D]), Production(O, [H]), Production(O, [T]),

        # A table must be the following sequence:
        # "table(", sentence, comma, sentence, comma, sentence, ")"
        Production(T, ['table(', O, ',', O, ',', O, ')']),

        # A hierarchy must be the following sequence:
        # domain, forward slash, domain
        Production(H, [D, '/', D]),
        # domain, forward slash, another operator
        Production(H, [D, '/', O]))

    # Add domains to the cfg productions.
    # A domain is a token that is entirely word chars.
    re_domain = compile(r'^\w+$')
    # Add each distinct matching token once, in order of first
    # appearance, so repeated tokens do not create duplicate
    # productions.
    seen_domains = set()
    for tok in tokens:
        if re_domain.match(tok) and tok not in seen_domains:
            seen_domains.add(tok)
            productions = productions + (Production(D, [tok]),)

    # Make a grammar out of our productions.
    grammar = ContextFreeGrammar(O, productions)
    rd_parser = parse.RecursiveDescentParser(grammar)

    # Store the parsing.
    # Only the first one, as the grammar should be completely
    # nonambiguous.
    try:
        self.parseList = rd_parser.get_parse_list(tokens)[0]
    except IndexError:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Could not parse query.")
        return

    # Set the nltk.parse.tree tree for this query to the global sentence.
    string = str(self.parseList)
    string2 = string.replace(":", "").replace("')'", "").replace(
        "table(", "").replace("','", "").replace("'", "").replace("/", "")
    self.nltktree = parse.tree.bracket_parse(string2)

    # Store the resulting nltk.parse.tree tree.
    self.parseTree = QuerySentence(self.nltktree)
    self.xml = self.parseTree.toXML()
import nltk
from nltk.corpus import treebank
from nltk.grammar import ContextFreeGrammar, Nonterminal

# Collect the distinct productions used by the parsed treebank sentences.
tbank_productions = set(production
                        for sent in treebank.parsed_sents()
                        for production in sent.productions())

# Build a grammar over those productions, with 'S' as start symbol.
tbank_grammar = ContextFreeGrammar(Nonterminal('S'), list(tbank_productions))

# Single-argument print(...) behaves the same on Python 2 and 3,
# unlike the Python 2-only print statement.
print(tbank_grammar)