示例#1
0
def if_then_else_demo():
    """
    Demo if-then-else grammar
    """
    from nltk.grammar import Nonterminal, Production, ContextFreeGrammar
    nonterminals = 'E E1 PLUS T T1 TIMES F LPAREN RPAREN ID'
    (E, E1, PLUS, T, T1, TIMES, F, LPAREN, RPAREN,
     ID) = [Nonterminal(s) for s in nonterminals.split()]
    productions = (
        Production(E, [T, E1]),
        Production(E1, [PLUS, T, E1]),
        Production(E1, []),
        Production(T, [F, T1]),
        Production(T1, [TIMES, F, T1]),
        Production(T1, []),
        Production(F, [LPAREN, E, RPAREN]),
        Production(F, [ID]),
        Production(PLUS, ['+']),
        Production(TIMES, ['*']),
        Production(LPAREN, ['(']),
        Production(RPAREN, [')']),
        Production(ID, ['a']),
        Production(ID, ['b']),
        Production(ID, ['c']),
    )
    grammar = ContextFreeGrammar(E, productions)

    text = "a * b + c".split()
    RecursiveDescentApp(grammar, text).mainloop()
示例#2
0
文件: cfg.py 项目: wrand/tweater
def demo2():
    from nltk import Nonterminal, Production, ContextFreeGrammar
    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V,
     Det) = [Nonterminal(s) for s in nonterminals.split()]
    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        Production(PP, []),
        Production(PP, ['up', 'over', NP]),

        # Lexical Productions
        Production(NP, ['I']),
        Production(Det, ['the']),
        Production(Det, ['a']),
        Production(N, ['man']),
        Production(V, ['saw']),
        Production(P, ['in']),
        Production(P, ['with']),
        Production(N, ['park']),
        Production(N, ['dog']),
        Production(N, ['statue']),
        Production(Det, ['my']),
    )
    grammar = ContextFreeGrammar(S, productions)

    text = 'I saw a man in the park'.split()
    d = CFGDemo(grammar, text)
    d.mainloop()
示例#3
0
文件: cfg.py 项目: sfu-natlang/nltk
def demo():
    from nltk import Nonterminal, ContextFreeGrammar
    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s)
                                           for s in nonterminals.split()]

    grammar = ContextFreeGrammar.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    Det -> 'my'
    NP -> 'I'
    N -> 'dog'
    N -> 'man'
    N -> 'park'
    N -> 'statue'
    V -> 'saw'
    P -> 'in'
    P -> 'up'
    P -> 'over'
    P -> 'with'
    """)

    def cb(grammar): print(grammar)
    top = Tk()
    editor = CFGEditor(top, grammar, cb)
    Label(top, text='\nTesting CFG Editor\n').pack()
    Button(top, text='Quit', command=top.destroy).pack()
    top.mainloop()
示例#4
0
def demo(N=23):
    from nltk.grammar import ContextFreeGrammar

    print('Generating the first %d sentences for demo grammar:' % (N, ))
    print(demo_grammar)
    grammar = ContextFreeGrammar.fromstring(demo_grammar)
    for n, sent in enumerate(generate(grammar, n=N), 1):
        print('%3d. %s' % (n, ' '.join(sent)))
示例#5
0
 def add_new_vocab_rule(self, rule):
     """
     Adds a new vocabulary rule to the set of rules, and
     recreates self.cfg and self.parser.
     """
     self.rules.append(Production(NT(rule[0]), rule[1]))
     self.cfg = ContextFreeGrammar(NT("S"), self.rules)
     self.parser = EarleyChartParser(self.cfg, trace=0)
示例#6
0
    def set_grammar(self, grammar):
        """
        Asign a new grammar to the parser

        Args:
            - parser_args: needs grammar of type :class:`ContextFreeGrammar`
        """
        self._grammar = ContextFreeGrammar(Nonterminal('S'), grammar)
示例#7
0
文件: cfg.py 项目: wrand/tweater
    def __init__(self, parent, cfg=None, set_cfg_callback=None):
        self._parent = parent
        if cfg is not None: self._cfg = cfg
        else: self._cfg = ContextFreeGrammar(Nonterminal('S'), [])
        self._set_cfg_callback = set_cfg_callback

        self._highlight_matching_nonterminals = 1

        # Create the top-level window.
        self._top = Toplevel(parent)
        self._init_bindings()

        self._init_startframe()
        self._startframe.pack(side='top', fill='x', expand=0)
        self._init_prodframe()
        self._prodframe.pack(side='top', fill='both', expand=1)
        self._init_buttons()
        self._buttonframe.pack(side='bottom', fill='x', expand=0)

        self._textwidget.focus()
示例#8
0
    def parse_NP(self, sen):
        """
        Parses a partial sentence (that is, usually a noun phrase.
        Returns the parse, or returns a tuple.
        """
        try:
            cfg_temp = ContextFreeGrammar(NT("NP"), self.rules)
            parser_temp = EarleyChartParser(cfg_temp, trace=0)
            parse = parser_temp.nbest_parse(sen.strip().split(" "), trace=0)
        except:
            print traceback.format_exc()
        else:
            if parse:
                return parse[0]

        print "failure"
        return None
示例#9
0
def fail_demo():
    """
    Demo grammar that should not work with backtracking for all inputs
    """
    from nltk.grammar import Nonterminal, Production, ContextFreeGrammar
    S = Nonterminal('S')
    A = Nonterminal('A')
    productions = (
        Production(S, [ A, S, A ]),
        Production(S, [ A, A ]),
        Production(A, [ 'a' ]),
        )
    grammar = ContextFreeGrammar(S, productions)

    text = "a a a a a a".split()
    #text = "a a a a".split()

    RecursiveDescentApp(grammar, text).mainloop()
示例#10
0
文件: cfg.py 项目: sfu-natlang/nltk
    def __init__(self, parent, cfg=None, set_cfg_callback=None):
        self._parent = parent
        if cfg is not None: self._cfg = cfg
        else: self._cfg = ContextFreeGrammar(Nonterminal('S'), [])
        self._set_cfg_callback = set_cfg_callback

        self._highlight_matching_nonterminals = 1

        # Create the top-level window.
        self._top = Toplevel(parent)
        self._init_bindings()

        self._init_startframe()
        self._startframe.pack(side='top', fill='x', expand=0)
        self._init_prodframe()
        self._prodframe.pack(side='top', fill='both', expand=1)
        self._init_buttons()
        self._buttonframe.pack(side='bottom', fill='x', expand=0)

        self._textwidget.focus()
示例#11
0
def app():
    """
    Create a shift reduce parser app, using a simple grammar and
    text.
    """

    from nltk.grammar import Nonterminal, Production, ContextFreeGrammar
    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V,
     Det) = [Nonterminal(s) for s in nonterminals.split()]

    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),

        # Lexical Productions
        Production(NP, ['I']),
        Production(Det, ['the']),
        Production(Det, ['a']),
        Production(N, ['man']),
        Production(V, ['saw']),
        Production(P, ['in']),
        Production(P, ['with']),
        Production(N, ['park']),
        Production(N, ['dog']),
        Production(N, ['statue']),
        Production(Det, ['my']),
    )

    grammar = ContextFreeGrammar(S, productions)

    # tokenize the sentence
    sent = 'my dog saw a man in the park with a statue'.split()

    ShiftReduceApp(grammar, sent).mainloop()
示例#12
0
    def __init__(self, rules_file="rules.gr", vocab_file="vocabulary.gr"):
        """
        Reads in grammar rules (from rules_file) and vocab rules (from
        vocab_file) and creates self.cfg (a ContextFreeGrammar) and
        self.parser (a EarleyChartParser).
        """
        self.rules = []
        test_sentences = []

        # get the rules from rules_file
        grammar = open(rules_file, "r")
        line = grammar.readline()
        while line:
            if line.strip() != "" and not line.strip().startswith("#"):
                line = line[2:]
                parts = line.partition("\t")
                lhs = parts[0].strip()
                rhs = [NT(x) for x in parts[2].strip().split(" ")]
                self.rules.append(Production(NT(lhs), rhs))
            line = grammar.readline()
        grammar.close()

        # get the rules from vocab_file
        vocab = open(vocab_file, "r")
        line = vocab.readline()
        while line:
            if line.strip() != "" and not line.strip().startswith("#"):
                line = line[2:]
                parts = line.partition("\t")
                lhs = parts[0].strip()
                rhs = parts[2].strip().lower().split(" ")
                self.rules.append(Production(NT(lhs), rhs))
            line = vocab.readline()
        vocab.close()

        # create the grammar and parser
        self.cfg = ContextFreeGrammar(NT("S"), self.rules)
        self.parser = EarleyChartParser(self.cfg, trace=0)
示例#13
0
def app():
    """
    Create a recursive descent parser demo, using a simple grammar and
    text.
    """
    from nltk.grammar import ContextFreeGrammar
    grammar = ContextFreeGrammar.fromstring("""
    # Grammatical productions.
        S -> NP VP
        NP -> Det N PP | Det N
        VP -> V NP PP | V NP | V
        PP -> P NP
    # Lexical productions.
        NP -> 'I'
        Det -> 'the' | 'a'
        N -> 'man' | 'park' | 'dog' | 'telescope'
        V -> 'ate' | 'saw'
        P -> 'in' | 'under' | 'with'
    """)

    sent = 'the dog saw a man in the park'.split()

    RecursiveDescentApp(grammar, sent).mainloop()
示例#14
0
def demo():
    from nltk import Nonterminal, ContextFreeGrammar
    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V,
     Det) = [Nonterminal(s) for s in nonterminals.split()]

    grammar = ContextFreeGrammar.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    Det -> 'my'
    NP -> 'I'
    N -> 'dog'
    N -> 'man'
    N -> 'park'
    N -> 'statue'
    V -> 'saw'
    P -> 'in'
    P -> 'up'
    P -> 'over'
    P -> 'with'
    """)

    def cb(grammar):
        print(grammar)

    top = Tk()
    editor = CFGEditor(top, grammar, cb)
    Label(top, text='\nTesting CFG Editor\n').pack()
    Button(top, text='Quit', command=top.destroy).pack()
    top.mainloop()
示例#15
0
文件: cfg.py 项目: sfu-natlang/nltk
class CFGEditor(object):
    """
    A dialog window for creating and editing context free grammars.
    ``CFGEditor`` imposes the following restrictions:

    - All nonterminals must be strings consisting of word
      characters.
    - All terminals must be strings consisting of word characters
      and space characters.
    """
    # Regular expressions used by _analyze_line.  Precompile them, so
    # we can process the text faster.
    ARROW = SymbolWidget.SYMBOLS['rightarrow']
    _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|("+ARROW+"))")
    _ARROW_RE = re.compile("\s*(->|("+ARROW+"))\s*")
    _PRODUCTION_RE = re.compile(r"(^\s*\w+\s*)" +              # LHS
                                "(->|("+ARROW+"))\s*" +        # arrow
                                r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$") # RHS
    _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|("+ARROW+")")
    _BOLD = ('helvetica', -12, 'bold')

    def __init__(self, parent, cfg=None, set_cfg_callback=None):
        self._parent = parent
        if cfg is not None: self._cfg = cfg
        else: self._cfg = ContextFreeGrammar(Nonterminal('S'), [])
        self._set_cfg_callback = set_cfg_callback

        self._highlight_matching_nonterminals = 1

        # Create the top-level window.
        self._top = Toplevel(parent)
        self._init_bindings()

        self._init_startframe()
        self._startframe.pack(side='top', fill='x', expand=0)
        self._init_prodframe()
        self._prodframe.pack(side='top', fill='both', expand=1)
        self._init_buttons()
        self._buttonframe.pack(side='bottom', fill='x', expand=0)

        self._textwidget.focus()

    def _init_startframe(self):
        frame = self._startframe = Frame(self._top)
        self._start = Entry(frame)
        self._start.pack(side='right')
        Label(frame, text='Start Symbol:').pack(side='right')
        Label(frame, text='Productions:').pack(side='left')
        self._start.insert(0, self._cfg.start().symbol())

    def _init_buttons(self):
        frame = self._buttonframe = Frame(self._top)
        Button(frame, text='Ok', command=self._ok,
               underline=0, takefocus=0).pack(side='left')
        Button(frame, text='Apply', command=self._apply,
               underline=0, takefocus=0).pack(side='left')
        Button(frame, text='Reset', command=self._reset,
               underline=0, takefocus=0,).pack(side='left')
        Button(frame, text='Cancel', command=self._cancel,
               underline=0, takefocus=0).pack(side='left')
        Button(frame, text='Help', command=self._help,
               underline=0, takefocus=0).pack(side='right')

    def _init_bindings(self):
        self._top.title('CFG Editor')
        self._top.bind('<Control-q>', self._cancel)
        self._top.bind('<Alt-q>', self._cancel)
        self._top.bind('<Control-d>', self._cancel)
        #self._top.bind('<Control-x>', self._cancel)
        self._top.bind('<Alt-x>', self._cancel)
        self._top.bind('<Escape>', self._cancel)
        #self._top.bind('<Control-c>', self._cancel)
        self._top.bind('<Alt-c>', self._cancel)

        self._top.bind('<Control-o>', self._ok)
        self._top.bind('<Alt-o>', self._ok)
        self._top.bind('<Control-a>', self._apply)
        self._top.bind('<Alt-a>', self._apply)
        self._top.bind('<Control-r>', self._reset)
        self._top.bind('<Alt-r>', self._reset)
        self._top.bind('<Control-h>', self._help)
        self._top.bind('<Alt-h>', self._help)
        self._top.bind('<F1>', self._help)

    def _init_prodframe(self):
        self._prodframe = Frame(self._top)

        # Create the basic Text widget & scrollbar.
        self._textwidget = Text(self._prodframe, background='#e0e0e0',
                                exportselection=1)
        self._textscroll = Scrollbar(self._prodframe, takefocus=0,
                                     orient='vertical')
        self._textwidget.config(yscrollcommand = self._textscroll.set)
        self._textscroll.config(command=self._textwidget.yview)
        self._textscroll.pack(side='right', fill='y')
        self._textwidget.pack(expand=1, fill='both', side='left')

        # Initialize the colorization tags.  Each nonterminal gets its
        # own tag, so they aren't listed here.
        self._textwidget.tag_config('terminal', foreground='#006000')
        self._textwidget.tag_config('arrow', font='symbol')
        self._textwidget.tag_config('error', background='red')

        # Keep track of what line they're on.  We use that to remember
        # to re-analyze a line whenever they leave it.
        self._linenum = 0

        # Expand "->" to an arrow.
        self._top.bind('>', self._replace_arrows)

        # Re-colorize lines when appropriate.
        self._top.bind('<<Paste>>', self._analyze)
        self._top.bind('<KeyPress>', self._check_analyze)
        self._top.bind('<ButtonPress>', self._check_analyze)

        # Tab cycles focus. (why doesn't this work??)
        def cycle(e, textwidget=self._textwidget):
            textwidget.tk_focusNext().focus()
        self._textwidget.bind('<Tab>', cycle)

        prod_tuples = [(p.lhs(),[p.rhs()]) for p in self._cfg.productions()]
        for i in range(len(prod_tuples)-1,0,-1):
            if (prod_tuples[i][0] == prod_tuples[i-1][0]):
                if () in prod_tuples[i][1]: continue
                if () in prod_tuples[i-1][1]: continue
                print(prod_tuples[i-1][1])
                print(prod_tuples[i][1])
                prod_tuples[i-1][1].extend(prod_tuples[i][1])
                del prod_tuples[i]

        for lhs, rhss in prod_tuples:
            print(lhs, rhss)
            s = '%s ->' % lhs
            for rhs in rhss:
                for elt in rhs:
                    if isinstance(elt, Nonterminal): s += ' %s' % elt
                    else: s += ' %r' % elt
                s += ' |'
            s = s[:-2] + '\n'
            self._textwidget.insert('end', s)

        self._analyze()

#         # Add the producitons to the text widget, and colorize them.
#         prod_by_lhs = {}
#         for prod in self._cfg.productions():
#             if len(prod.rhs()) > 0:
#                 prod_by_lhs.setdefault(prod.lhs(),[]).append(prod)
#         for (lhs, prods) in prod_by_lhs.items():
#             self._textwidget.insert('end', '%s ->' % lhs)
#             self._textwidget.insert('end', self._rhs(prods[0]))
#             for prod in prods[1:]:
#                 print '\t|'+self._rhs(prod),
#                 self._textwidget.insert('end', '\t|'+self._rhs(prod))
#             print
#             self._textwidget.insert('end', '\n')
#         for prod in self._cfg.productions():
#             if len(prod.rhs()) == 0:
#                 self._textwidget.insert('end', '%s' % prod)
#         self._analyze()

#     def _rhs(self, prod):
#         s = ''
#         for elt in prod.rhs():
#             if isinstance(elt, Nonterminal): s += ' %s' % elt.symbol()
#             else: s += ' %r' % elt
#         return s

    def _clear_tags(self, linenum):
        """
        Remove all tags (except ``arrow`` and ``sel``) from the given
        line of the text widget used for editing the productions.
        """
        start = '%d.0'%linenum
        end = '%d.end'%linenum
        for tag in self._textwidget.tag_names():
            if tag not in ('arrow', 'sel'):
                self._textwidget.tag_remove(tag, start, end)

    def _check_analyze(self, *e):
        """
        Check if we've moved to a new line.  If we have, then remove
        all colorization from the line we moved to, and re-colorize
        the line that we moved from.
        """
        linenum = int(self._textwidget.index('insert').split('.')[0])
        if linenum != self._linenum:
            self._clear_tags(linenum)
            self._analyze_line(self._linenum)
            self._linenum = linenum

    def _replace_arrows(self, *e):
        """
        Replace any ``'->'`` text strings with arrows (char \\256, in
        symbol font).  This searches the whole buffer, but is fast
        enough to be done anytime they press '>'.
        """
        arrow = '1.0'
        while True:
            arrow = self._textwidget.search('->', arrow, 'end+1char')
            if arrow == '': break
            self._textwidget.delete(arrow, arrow+'+2char')
            self._textwidget.insert(arrow, self.ARROW, 'arrow')
            self._textwidget.insert(arrow, '\t')

        arrow = '1.0'
        while True:
            arrow = self._textwidget.search(self.ARROW, arrow+'+1char',
                                            'end+1char')
            if arrow == '': break
            self._textwidget.tag_add('arrow', arrow, arrow+'+1char')

    def _analyze_token(self, match, linenum):
        """
        Given a line number and a regexp match for a token on that
        line, colorize the token.  Note that the regexp match gives us
        the token's text, start index (on the line), and end index (on
        the line).
        """
        # What type of token is it?
        if match.group()[0] in "'\"": tag = 'terminal'
        elif match.group() in ('->', self.ARROW): tag = 'arrow'
        else:
            # If it's a nonterminal, then set up new bindings, so we
            # can highlight all instances of that nonterminal when we
            # put the mouse over it.
            tag = 'nonterminal_'+match.group()
            if tag not in self._textwidget.tag_names():
                self._init_nonterminal_tag(tag)

        start = '%d.%d' % (linenum, match.start())
        end = '%d.%d' % (linenum, match.end())
        self._textwidget.tag_add(tag, start, end)

    def _init_nonterminal_tag(self, tag, foreground='blue'):
        self._textwidget.tag_config(tag, foreground=foreground,
                                    font=CFGEditor._BOLD)
        if not self._highlight_matching_nonterminals:
            return
        def enter(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background='#80ff80')
        def leave(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background='')
        self._textwidget.tag_bind(tag, '<Enter>', enter)
        self._textwidget.tag_bind(tag, '<Leave>', leave)

    def _analyze_line(self, linenum):
        """
        Colorize a given line.
        """
        # Get rid of any tags that were previously on the line.
        self._clear_tags(linenum)

        # Get the line line's text string.
        line = self._textwidget.get(repr(linenum)+'.0', repr(linenum)+'.end')

        # If it's a valid production, then colorize each token.
        if CFGEditor._PRODUCTION_RE.match(line):
            # It's valid; Use _TOKEN_RE to tokenize the production,
            # and call analyze_token on each token.
            def analyze_token(match, self=self, linenum=linenum):
                self._analyze_token(match, linenum)
                return ''
            CFGEditor._TOKEN_RE.sub(analyze_token, line)
        elif line.strip() != '':
            # It's invalid; show the user where the error is.
            self._mark_error(linenum, line)

    def _mark_error(self, linenum, line):
        """
        Mark the location of an error in a line.
        """
        arrowmatch = CFGEditor._ARROW_RE.search(line)
        if not arrowmatch:
            # If there's no arrow at all, highlight the whole line.
            start = '%d.0' % linenum
            end = '%d.end' % linenum
        elif not CFGEditor._LHS_RE.match(line):
            # Otherwise, if the LHS is bad, highlight it.
            start = '%d.0' % linenum
            end = '%d.%d' % (linenum, arrowmatch.start())
        else:
            # Otherwise, highlight the RHS.
            start = '%d.%d' % (linenum, arrowmatch.end())
            end = '%d.end' % linenum

        # If we're highlighting 0 chars, highlight the whole line.
        if self._textwidget.compare(start, '==', end):
            start = '%d.0' % linenum
            end = '%d.end' % linenum
        self._textwidget.tag_add('error', start, end)

    def _analyze(self, *e):
        """
        Replace ``->`` with arrows, and colorize the entire buffer.
        """
        self._replace_arrows()
        numlines = int(self._textwidget.index('end').split('.')[0])
        for linenum in range(1, numlines+1):  # line numbers start at 1.
            self._analyze_line(linenum)

    def _parse_productions(self):
        """
        Parse the current contents of the textwidget buffer, to create
        a list of productions.
        """
        productions = []

        # Get the text, normalize it, and split it into lines.
        text = self._textwidget.get('1.0', 'end')
        text = re.sub(self.ARROW, '->', text)
        text = re.sub('\t', ' ', text)
        lines = text.split('\n')

        # Convert each line to a CFG production
        for line in lines:
            line = line.strip()
            if line=='': continue
            productions += parse_cfg_production(line)
            #if line.strip() == '': continue
            #if not CFGEditor._PRODUCTION_RE.match(line):
            #    raise ValueError('Bad production string %r' % line)
            #
            #(lhs_str, rhs_str) = line.split('->')
            #lhs = Nonterminal(lhs_str.strip())
            #rhs = []
            #def parse_token(match, rhs=rhs):
            #    token = match.group()
            #    if token[0] in "'\"": rhs.append(token[1:-1])
            #    else: rhs.append(Nonterminal(token))
            #    return ''
            #CFGEditor._TOKEN_RE.sub(parse_token, rhs_str)
            #
            #productions.append(Production(lhs, *rhs))

        return productions

    def _destroy(self, *e):
        if self._top is None: return
        self._top.destroy()
        self._top = None

    def _ok(self, *e):
        self._apply()
        self._destroy()

    def _apply(self, *e):
        productions = self._parse_productions()
        start = Nonterminal(self._start.get())
        cfg = ContextFreeGrammar(start, productions)
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(cfg)

    def _reset(self, *e):
        self._textwidget.delete('1.0', 'end')
        for production in self._cfg.productions():
            self._textwidget.insert('end', '%s\n' % production)
        self._analyze()
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(self._cfg)

    def _cancel(self, *e):
        try: self._reset()
        except: pass
        self._destroy()

    def _help(self, *e):
        # The default font's not very legible; try using 'fixed' instead.
        try:
            ShowText(self._parent, 'Help: Chart Parser Demo',
                     (_CFGEditor_HELP).strip(), width=75, font='fixed')
        except:
            ShowText(self._parent, 'Help: Chart Parser Demo',
                     (_CFGEditor_HELP).strip(), width=75)
示例#16
0
文件: cfg.py 项目: wrand/tweater
 def _apply(self, *e):
     productions = self._parse_productions()
     start = Nonterminal(self._start.get())
     cfg = ContextFreeGrammar(start, productions)
     if self._set_cfg_callback is not None:
         self._set_cfg_callback(cfg)
示例#17
0
文件: cfg.py 项目: wrand/tweater
class CFGEditor(object):
    """
    A dialog window for creating and editing context free grammars.
    C{CFGEditor} places the following restrictions on what C{CFG}s can
    be edited:
        - All nonterminals must be strings consisting of word
          characters.
        - All terminals must be strings consisting of word characters
          and space characters.
    """
    # Regular expressions used by _analyze_line.  Precompile them, so
    # we can process the text faster.
    ARROW = SymbolWidget.SYMBOLS['rightarrow']
    _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|(" + ARROW + "))")
    _ARROW_RE = re.compile("\s*(->|(" + ARROW + "))\s*")
    _PRODUCTION_RE = re.compile(r"(^\s*\w+\s*)" +  # LHS
                                "(->|(" + ARROW + "))\s*" +  # arrow
                                r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$")  # RHS
    _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|(" + ARROW + ")")
    _BOLD = ('helvetica', -12, 'bold')

    def __init__(self, parent, cfg=None, set_cfg_callback=None):
        self._parent = parent
        if cfg is not None: self._cfg = cfg
        else: self._cfg = ContextFreeGrammar(Nonterminal('S'), [])
        self._set_cfg_callback = set_cfg_callback

        self._highlight_matching_nonterminals = 1

        # Create the top-level window.
        self._top = Toplevel(parent)
        self._init_bindings()

        self._init_startframe()
        self._startframe.pack(side='top', fill='x', expand=0)
        self._init_prodframe()
        self._prodframe.pack(side='top', fill='both', expand=1)
        self._init_buttons()
        self._buttonframe.pack(side='bottom', fill='x', expand=0)

        self._textwidget.focus()

    def _init_startframe(self):
        frame = self._startframe = Frame(self._top)
        self._start = Entry(frame)
        self._start.pack(side='right')
        Label(frame, text='Start Symbol:').pack(side='right')
        Label(frame, text='Productions:').pack(side='left')
        self._start.insert(0, self._cfg.start().symbol())

    def _init_buttons(self):
        frame = self._buttonframe = Frame(self._top)
        Button(frame, text='Ok', command=self._ok, underline=0,
               takefocus=0).pack(side='left')
        Button(frame,
               text='Apply',
               command=self._apply,
               underline=0,
               takefocus=0).pack(side='left')
        Button(
            frame,
            text='Reset',
            command=self._reset,
            underline=0,
            takefocus=0,
        ).pack(side='left')
        Button(frame,
               text='Cancel',
               command=self._cancel,
               underline=0,
               takefocus=0).pack(side='left')
        Button(frame,
               text='Help',
               command=self._help,
               underline=0,
               takefocus=0).pack(side='right')

    def _init_bindings(self):
        self._top.title('CFG Editor')
        self._top.bind('<Control-q>', self._cancel)
        self._top.bind('<Alt-q>', self._cancel)
        self._top.bind('<Control-d>', self._cancel)
        #self._top.bind('<Control-x>', self._cancel)
        self._top.bind('<Alt-x>', self._cancel)
        self._top.bind('<Escape>', self._cancel)
        #self._top.bind('<Control-c>', self._cancel)
        self._top.bind('<Alt-c>', self._cancel)

        self._top.bind('<Control-o>', self._ok)
        self._top.bind('<Alt-o>', self._ok)
        self._top.bind('<Control-a>', self._apply)
        self._top.bind('<Alt-a>', self._apply)
        self._top.bind('<Control-r>', self._reset)
        self._top.bind('<Alt-r>', self._reset)
        self._top.bind('<Control-h>', self._help)
        self._top.bind('<Alt-h>', self._help)
        self._top.bind('<F1>', self._help)

    def _init_prodframe(self):
        self._prodframe = Frame(self._top)

        # Create the basic Text widget & scrollbar.
        self._textwidget = Text(self._prodframe,
                                background='#e0e0e0',
                                exportselection=1)
        self._textscroll = Scrollbar(self._prodframe,
                                     takefocus=0,
                                     orient='vertical')
        self._textwidget.config(yscrollcommand=self._textscroll.set)
        self._textscroll.config(command=self._textwidget.yview)
        self._textscroll.pack(side='right', fill='y')
        self._textwidget.pack(expand=1, fill='both', side='left')

        # Initialize the colorization tags.  Each nonterminal gets its
        # own tag, so they aren't listed here.
        self._textwidget.tag_config('terminal', foreground='#006000')
        self._textwidget.tag_config('arrow', font='symbol')
        self._textwidget.tag_config('error', background='red')

        # Keep track of what line they're on.  We use that to remember
        # to re-analyze a line whenever they leave it.
        self._linenum = 0

        # Expand "->" to an arrow.
        self._top.bind('>', self._replace_arrows)

        # Re-colorize lines when appropriate.
        self._top.bind('<<Paste>>', self._analyze)
        self._top.bind('<KeyPress>', self._check_analyze)
        self._top.bind('<ButtonPress>', self._check_analyze)

        # Tab cycles focus. (why doesn't this work??)
        def cycle(e, textwidget=self._textwidget):
            textwidget.tk_focusNext().focus()

        self._textwidget.bind('<Tab>', cycle)

        prod_tuples = [(p.lhs(), [p.rhs()]) for p in self._cfg.productions()]
        for i in range(len(prod_tuples) - 1, 0, -1):
            if (prod_tuples[i][0] == prod_tuples[i - 1][0]):
                if () in prod_tuples[i][1]: continue
                if () in prod_tuples[i - 1][1]: continue
                print prod_tuples[i - 1][1]
                print prod_tuples[i][1]
                prod_tuples[i - 1][1].extend(prod_tuples[i][1])
                del prod_tuples[i]

        for lhs, rhss in prod_tuples:
            print lhs, rhss
            s = '%s ->' % lhs
            for rhs in rhss:
                for elt in rhs:
                    if isinstance(elt, Nonterminal): s += ' %s' % elt
                    else: s += ' %r' % elt
                s += ' |'
            s = s[:-2] + '\n'
            self._textwidget.insert('end', s)

        self._analyze()

#         # Add the producitons to the text widget, and colorize them.
#         prod_by_lhs = {}
#         for prod in self._cfg.productions():
#             if len(prod.rhs()) > 0:
#                 prod_by_lhs.setdefault(prod.lhs(),[]).append(prod)
#         for (lhs, prods) in prod_by_lhs.items():
#             self._textwidget.insert('end', '%s ->' % lhs)
#             self._textwidget.insert('end', self._rhs(prods[0]))
#             for prod in prods[1:]:
#                 print '\t|'+self._rhs(prod),
#                 self._textwidget.insert('end', '\t|'+self._rhs(prod))
#             print
#             self._textwidget.insert('end', '\n')
#         for prod in self._cfg.productions():
#             if len(prod.rhs()) == 0:
#                 self._textwidget.insert('end', '%s' % prod)
#         self._analyze()

#     def _rhs(self, prod):
#         s = ''
#         for elt in prod.rhs():
#             if isinstance(elt, Nonterminal): s += ' %s' % elt.symbol()
#             else: s += ' %r' % elt
#         return s

    def _clear_tags(self, linenum):
        """
        Remove all tags (except C{arrow} and C{sel}) from the given
        line of the text widget used for editing the productions.
        """
        start = '%d.0' % linenum
        end = '%d.end' % linenum
        for tag in self._textwidget.tag_names():
            if tag not in ('arrow', 'sel'):
                self._textwidget.tag_remove(tag, start, end)

    def _check_analyze(self, *e):
        """
        Check if we've moved to a new line.  If we have, then remove
        all colorization from the line we moved to, and re-colorize
        the line that we moved from.
        """
        linenum = int(self._textwidget.index('insert').split('.')[0])
        if linenum != self._linenum:
            self._clear_tags(linenum)
            self._analyze_line(self._linenum)
            self._linenum = linenum

    def _replace_arrows(self, *e):
        """
        Replace any C{'->'} text strings with arrows (char \\256, in
        symbol font).  This searches the whole buffer, but is fast
        enough to be done anytime they press '>'.
        """
        arrow = '1.0'
        while 1:
            arrow = self._textwidget.search('->', arrow, 'end+1char')
            if arrow == '': break
            self._textwidget.delete(arrow, arrow + '+2char')
            self._textwidget.insert(arrow, self.ARROW, 'arrow')
            self._textwidget.insert(arrow, '\t')

        arrow = '1.0'
        while 1:
            arrow = self._textwidget.search(self.ARROW, arrow + '+1char',
                                            'end+1char')
            if arrow == '': break
            self._textwidget.tag_add('arrow', arrow, arrow + '+1char')

    def _analyze_token(self, match, linenum):
        """
        Given a line number and a regexp match for a token on that
        line, colorize the token.  Note that the regexp match gives us
        the token's text, start index (on the line), and end index (on
        the line).
        """
        # What type of token is it?
        if match.group()[0] in "'\"": tag = 'terminal'
        elif match.group() in ('->', self.ARROW): tag = 'arrow'
        else:
            # If it's a nonterminal, then set up new bindings, so we
            # can highlight all instances of that nonterminal when we
            # put the mouse over it.
            tag = 'nonterminal_' + match.group()
            if tag not in self._textwidget.tag_names():
                self._init_nonterminal_tag(tag)

        start = '%d.%d' % (linenum, match.start())
        end = '%d.%d' % (linenum, match.end())
        self._textwidget.tag_add(tag, start, end)

    def _init_nonterminal_tag(self, tag, foreground='blue'):
        self._textwidget.tag_config(tag,
                                    foreground=foreground,
                                    font=CFGEditor._BOLD)
        if not self._highlight_matching_nonterminals:
            return

        def enter(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background='#80ff80')

        def leave(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background='')

        self._textwidget.tag_bind(tag, '<Enter>', enter)
        self._textwidget.tag_bind(tag, '<Leave>', leave)

    def _analyze_line(self, linenum):
        """
        Colorize a given line.
        """
        # Get rid of any tags that were previously on the line.
        self._clear_tags(linenum)

        # Get the line line's text string.
        line = self._textwidget.get( ` linenum ` + '.0', ` linenum ` + '.end')

        # If it's a valid production, then colorize each token.
        if CFGEditor._PRODUCTION_RE.match(line):
            # It's valid; Use _TOKEN_RE to tokenize the production,
            # and call analyze_token on each token.
            def analyze_token(match, self=self, linenum=linenum):
                self._analyze_token(match, linenum)
                return ''

            CFGEditor._TOKEN_RE.sub(analyze_token, line)
        elif line.strip() != '':
            # It's invalid; show the user where the error is.
            self._mark_error(linenum, line)

    def _mark_error(self, linenum, line):
        """
        Mark the location of an error in a line.
        """
        arrowmatch = CFGEditor._ARROW_RE.search(line)
        if not arrowmatch:
            # If there's no arrow at all, highlight the whole line.
            start = '%d.0' % linenum
            end = '%d.end' % linenum
        elif not CFGEditor._LHS_RE.match(line):
            # Otherwise, if the LHS is bad, highlight it.
            start = '%d.0' % linenum
            end = '%d.%d' % (linenum, arrowmatch.start())
        else:
            # Otherwise, highlight the RHS.
            start = '%d.%d' % (linenum, arrowmatch.end())
            end = '%d.end' % linenum

        # If we're highlighting 0 chars, highlight the whole line.
        if self._textwidget.compare(start, '==', end):
            start = '%d.0' % linenum
            end = '%d.end' % linenum
        self._textwidget.tag_add('error', start, end)

    def _analyze(self, *e):
        """
        Replace C{->} with arrows, and colorize the entire buffer.
        """
        self._replace_arrows()
        numlines = int(self._textwidget.index('end').split('.')[0])
        for linenum in range(1, numlines + 1):  # line numbers start at 1.
            self._analyze_line(linenum)

    def _parse_productions(self):
        """
        Parse the current contents of the textwidget buffer, to create
        a list of productions.
        """
        productions = []

        # Get the text, normalize it, and split it into lines.
        text = self._textwidget.get('1.0', 'end')
        text = re.sub(self.ARROW, '->', text)
        text = re.sub('\t', ' ', text)
        lines = text.split('\n')

        # Convert each line to a CFG production
        for line in lines:
            line = line.strip()
            if line == '': continue
            productions += parse_cfg_production(line)
            #if line.strip() == '': continue
            #if not CFGEditor._PRODUCTION_RE.match(line):
            #    raise ValueError('Bad production string %r' % line)
            #
            #(lhs_str, rhs_str) = line.split('->')
            #lhs = Nonterminal(lhs_str.strip())
            #rhs = []
            #def parse_token(match, rhs=rhs):
            #    token = match.group()
            #    if token[0] in "'\"": rhs.append(token[1:-1])
            #    else: rhs.append(Nonterminal(token))
            #    return ''
            #CFGEditor._TOKEN_RE.sub(parse_token, rhs_str)
            #
            #productions.append(Production(lhs, *rhs))

        return productions

    def _destroy(self, *e):
        if self._top is None: return
        self._top.destroy()
        self._top = None

    def _ok(self, *e):
        self._apply()
        self._destroy()

    def _apply(self, *e):
        productions = self._parse_productions()
        start = Nonterminal(self._start.get())
        cfg = ContextFreeGrammar(start, productions)
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(cfg)

    def _reset(self, *e):
        self._textwidget.delete('1.0', 'end')
        for production in self._cfg.productions():
            self._textwidget.insert('end', '%s\n' % production)
        self._analyze()
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(self._cfg)

    def _cancel(self, *e):
        try:
            self._reset()
        except:
            pass
        self._destroy()

    def _help(self, *e):
        # The default font's not very legible; try using 'fixed' instead.
        try:
            ShowText(self._parent,
                     'Help: Chart Parser Demo', (_CFGEditor_HELP).strip(),
                     width=75,
                     font='fixed')
        except:
            ShowText(self._parent,
                     'Help: Chart Parser Demo', (_CFGEditor_HELP).strip(),
                     width=75)
示例#18
0
#
# Creazione di una PCFG estratta dal treebank Penn
#

import nltk
from nltk.corpus import treebank
from nltk.grammar import ContextFreeGrammar, Nonterminal, Production

#estrazione delle produzioni dal treebank
production_list = list(production for sent in treebank.parsed_sents()
                        for production in sent.productions())
tbank_productions = set(production_list)
tbank_grammar = ContextFreeGrammar(Nonterminal('S'), list(tbank_productions))

grammar_productions = tbank_grammar.productions()


lhsCount={}
prodCount={}
probs={}

#preparo le strutture dati: count(a->b)/count(a) con a non terminale
for production in production_list:
	if production.lhs() in lhsCount:
		lhsCount[production.lhs()] = lhsCount[production.lhs()] + 1
	else:
		lhsCount[production.lhs()] = 1
	if production in prodCount:
		prodCount[production] = prodCount[production] + 1
	else:
		prodCount[production] = 1
示例#19
0
    def parse(self, p_string):
        """
        Parses a string and stores the resulting hierarchy of "domains"
        "hierarchies" and "tables"

        For the sake of NLP I've parsed the string using the nltk 
        context free grammar library.

        A query is a "sentence" and can either be a domain, hierarchy or a table.
        A domain is simply a word.
        A hierarchy is expressed as "domain/domain"
        A table is exressed as "table(sentence, sentence, sentence)"

        Internally the query is represented as a nltk.parse.tree

        Process:
          1. string is tokenized
          2. develop a context free grammar
          3. parse
          4. convert to a tree representation
        """
        self.nltktree = None

        # Store the query string
        self.string = p_string

        # Tokenize the query string, allowing only strings, parentheses,
        # forward slashes and commas.
        re_all = r'table[(]|\,|[)]|[/]|\w+'
        data_tokens = tokenize.regexp_tokenize(self.string, re_all)

        # Develop a context free grammar
        # S = sentence, T = table, H = hierarchy, D = domain
        O, T, H, D = nonterminals('O, T, H, D')

        # Specify the grammar
        productions = (
            # A sentence can be either a table, hierarchy or domain
            Production(O, [D]),
            Production(O, [H]),
            Production(O, [T]),

            # A table must be the following sequence:
            # "table(", sentence, comma, sentence, comma, sentence, ")"
            Production(T, ['table(', O, ',', O, ',', O, ')']),

            # A hierarchy must be the following sequence:
            # domain, forward slash, domain
            Production(H, [D, '/', D]),
            # domain, forward slash, another operator
            Production(H, [D, '/', O]))

        # Add domains to the cfg productions
        # A domain is a token that is entirely word chars
        re_domain = compile(r'^\w+$')
        # Try every token and add if it matches the above regular expression
        for tok in data_tokens:
            if re_domain.match(tok):
                prod = Production(D, [tok]),
                productions = productions + prod

        # Make a grammar out of our productions
        grammar = ContextFreeGrammar(O, productions)
        rd_parser = parse.RecursiveDescentParser(grammar)

        # Tokens need to be redefined.
        # It disappears after first use, and I don't know why.
        tokens = tokenize.regexp_tokenize(self.string, re_all)
        toklist = list(tokens)

        # Store the parsing.
        # Only the first one, as the grammar should be completely nonambiguous.
        try:
            self.parseList = rd_parser.get_parse_list(toklist)[0]
        except IndexError:
            print "Could not parse query."
            return

        # Set the nltk.parse.tree tree for this query to the global sentence
        string = str(self.parseList)
        string2 = string.replace(":", "").replace("')'", "").replace(
            "table(", "").replace("','", "").replace("'", "").replace("/", "")
        self.nltktree = parse.tree.bracket_parse(string2)

        # Store the resulting nltk.parse.tree tree
        self.parseTree = QuerySentence(self.nltktree)
        self.xml = self.parseTree.toXML()
示例#20
0
import nltk
from nltk.corpus import treebank
from nltk.grammar import ContextFreeGrammar, Nonterminal

tbank_productions = set(production for sent in treebank.parsed_sents()
                        for production in sent.productions())
tbank_grammar = ContextFreeGrammar(Nonterminal('S'), list(tbank_productions))

print tbank_grammar