Exemplo n.º 1
0
def exploreCFG(cfg, length_limit):
    """
    Enumerate the terminal-only expressions derivable from ``cfg``.

    Derivations always expand the leftmost nonterminal, and any
    intermediate expression longer than ``length_limit`` symbols is
    abandoned.

    Because of that pruning, some strings of length <= length_limit that
    the grammar can produce are never reached.  For example, with
    S -> 1S | e and length_limit = 2, "11" is not generated: the
    derivation S -> 1S -> 11S -> 11 passes through 11S, which is too
    long, so exploration stops there.

    :param cfg: grammar to explore; it is rebuilt with every production
        passed through ``splitProdRhs`` before exploring.
    :param length_limit: maximum number of symbols allowed in any
        expression, intermediate or final.
    :return: set of tuples containing no ``Nonterminal``.
    """
    cfg = CFG(cfg.start(), [splitProdRhs(rule) for rule in cfg.productions()])

    complete = set()  # Expressions with no nonterminals left
    expanded = set()  # Expressions with nonterminals, already expanded
    stack = [(cfg.start(), )]

    while stack:
        current = stack.pop()
        if len(current) > length_limit or current in expanded:
            continue

        # Position of the leftmost nonterminal, or None if there is none.
        pos = next(
            (k for k, symbol in enumerate(current)
             if isinstance(symbol, Nonterminal)),
            None,
        )
        if pos is None:
            complete.add(current)
            continue
        expanded.add(current)

        # Substitute every right-hand side for that nonterminal.
        before, after = current[:pos], current[pos + 1:]
        stack.extend(before + rule.rhs() + after
                     for rule in cfg.productions(lhs=current[pos]))

    return complete
Exemplo n.º 2
0
Arquivo: cfg.py Projeto: Geolem/nltk
class CFGEditor:
    """
    A dialog window for creating and editing context free grammars.
    ``CFGEditor`` imposes the following restrictions:

    - All nonterminals must be strings consisting of word
      characters.
    - All terminals must be strings consisting of word characters
      and space characters.

    Productions are edited as text, one line per left-hand side; each
    line is colorized as the user moves between lines, and lines that
    do not look like a production are marked with the ``error`` tag.
    """

    # Regular expressions used by _analyze_line and _mark_error.
    # Precompile them, so we can process the text faster.
    ARROW = SymbolWidget.SYMBOLS["rightarrow"]
    # Either the ASCII arrow or the arrow glyph.  The glyph is escaped so
    # that it is always matched literally, whatever character it is.
    _ARROW_ALT = r"(->|(" + re.escape(ARROW) + r"))"
    _LHS_RE = re.compile(r"(^\s*\w+\s*)" + _ARROW_ALT)
    _ARROW_RE = re.compile(r"\s*" + _ARROW_ALT + r"\s*")
    _PRODUCTION_RE = re.compile(
        r"(^\s*\w+\s*)"  # LHS
        + _ARROW_ALT  # arrow
        + r"\s*"
        + r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$"  # RHS
    )
    _TOKEN_RE = re.compile(
        r"\w+|->|'[\w ]+'|\"[\w ]+\"|(" + re.escape(ARROW) + r")")
    _BOLD = ("helvetica", -12, "bold")

    def __init__(self, parent, cfg=None, set_cfg_callback=None):
        """
        Create the editor window as a ``Toplevel`` child of ``parent``.

        :param parent: widget that owns the new top-level window (also
            used as the parent of the help window).
        :param cfg: grammar to edit; when None, an empty grammar with
            start symbol ``S`` is used.
        :param set_cfg_callback: optional callable invoked with a CFG
            whenever the grammar is applied or reset.
        """
        self._parent = parent
        if cfg is None:
            cfg = CFG(Nonterminal("S"), [])
        self._cfg = cfg
        self._set_cfg_callback = set_cfg_callback

        self._highlight_matching_nonterminals = True

        # Create the top-level window.
        self._top = Toplevel(parent)
        self._init_bindings()

        self._init_startframe()
        self._startframe.pack(side="top", fill="x", expand=0)
        self._init_prodframe()
        self._prodframe.pack(side="top", fill="both", expand=1)
        self._init_buttons()
        self._buttonframe.pack(side="bottom", fill="x", expand=0)

        self._textwidget.focus()

    def _init_startframe(self):
        """Build the header row holding the start-symbol entry."""
        frame = self._startframe = Frame(self._top)
        self._start = Entry(frame)
        self._start.pack(side="right")
        Label(frame, text="Start Symbol:").pack(side="right")
        Label(frame, text="Productions:").pack(side="left")
        self._start.insert(0, self._cfg.start().symbol())

    def _init_buttons(self):
        """Build the button row (Ok/Apply/Reset/Cancel on the left, Help on the right)."""
        frame = self._buttonframe = Frame(self._top)
        # (label, handler, side) in packing order.
        buttons = (
            ("Ok", self._ok, "left"),
            ("Apply", self._apply, "left"),
            ("Reset", self._reset, "left"),
            ("Cancel", self._cancel, "left"),
            ("Help", self._help, "right"),
        )
        for label, handler, side in buttons:
            Button(frame,
                   text=label,
                   command=handler,
                   underline=0,
                   takefocus=0).pack(side=side)

    def _init_bindings(self):
        """Set the window title and the keyboard shortcuts of the dialog."""
        self._top.title("CFG Editor")
        # NOTE: <Control-x> and <Control-c> were already disabled for
        # cancel in the original bindings and stay unbound here.
        shortcuts = (
            (self._cancel, ("<Control-q>", "<Alt-q>", "<Control-d>",
                            "<Alt-x>", "<Escape>", "<Alt-c>")),
            (self._ok, ("<Control-o>", "<Alt-o>")),
            (self._apply, ("<Control-a>", "<Alt-a>")),
            (self._reset, ("<Control-r>", "<Alt-r>")),
            (self._help, ("<Control-h>", "<Alt-h>", "<F1>")),
        )
        for handler, sequences in shortcuts:
            for sequence in sequences:
                self._top.bind(sequence, handler)

    def _init_prodframe(self):
        """
        Build the production editor (text widget plus scrollbar), hook
        up the re-colorizing bindings, and fill the editor with the
        productions of the current grammar.
        """
        self._prodframe = Frame(self._top)

        # Create the basic Text widget & scrollbar.
        self._textwidget = Text(self._prodframe,
                                background="#e0e0e0",
                                exportselection=1)
        self._textscroll = Scrollbar(self._prodframe,
                                     takefocus=0,
                                     orient="vertical")
        self._textwidget.config(yscrollcommand=self._textscroll.set)
        self._textscroll.config(command=self._textwidget.yview)
        self._textscroll.pack(side="right", fill="y")
        self._textwidget.pack(expand=1, fill="both", side="left")

        # Initialize the colorization tags.  Each nonterminal gets its
        # own tag, so they aren't listed here.
        self._textwidget.tag_config("terminal", foreground="#006000")
        self._textwidget.tag_config("arrow", font="symbol")
        self._textwidget.tag_config("error", background="red")

        # Keep track of what line they're on.  We use that to remember
        # to re-analyze a line whenever they leave it.
        self._linenum = 0

        # Expand "->" to an arrow.
        self._top.bind(">", self._replace_arrows)

        # Re-colorize lines when appropriate.
        self._top.bind("<<Paste>>", self._analyze)
        self._top.bind("<KeyPress>", self._check_analyze)
        self._top.bind("<ButtonPress>", self._check_analyze)

        # Tab cycles focus.  Returning "break" stops the event here;
        # otherwise the Text widget's own <Tab> binding still runs
        # afterwards and inserts a tab character.
        def cycle(e, textwidget=self._textwidget):
            textwidget.tk_focusNext().focus()
            return "break"

        self._textwidget.bind("<Tab>", cycle)

        for line in self._production_lines():
            self._textwidget.insert("end", line)

        self._analyze()

    def _production_lines(self):
        """
        Render the grammar's productions as editor text.

        Adjacent productions that share a left-hand side are merged
        into one ``LHS -> a | b`` line, except when either of them has
        an empty right-hand side.

        :return: list of newline-terminated strings, one per line.
        """
        groups = [(p.lhs(), [p.rhs()]) for p in self._cfg.productions()]
        # Walk backwards so deleting entry i never shifts an entry that
        # is still to be visited.
        for i in range(len(groups) - 1, 0, -1):
            lhs, rhss = groups[i]
            prev_lhs, prev_rhss = groups[i - 1]
            if lhs == prev_lhs and () not in rhss and () not in prev_rhss:
                prev_rhss.extend(rhss)
                del groups[i]

        def show(elt):
            # Nonterminals are shown bare, terminals as quoted literals.
            return str(elt) if isinstance(elt, Nonterminal) else repr(elt)

        lines = []
        for lhs, rhss in groups:
            alternatives = [
                "".join(" " + show(elt) for elt in rhs) for rhs in rhss
            ]
            lines.append("%s ->%s\n" % (lhs, " |".join(alternatives)))
        return lines

    def _clear_tags(self, linenum):
        """
        Remove all tags (except ``arrow`` and ``sel``) from the given
        line of the text widget used for editing the productions.
        """
        start = "%d.0" % linenum
        end = "%d.end" % linenum
        for tag in self._textwidget.tag_names():
            if tag not in ("arrow", "sel"):
                self._textwidget.tag_remove(tag, start, end)

    def _check_analyze(self, *e):
        """
        Check if we've moved to a new line.  If we have, then remove
        all colorization from the line we moved to, and re-colorize
        the line that we moved from.
        """
        linenum = int(self._textwidget.index("insert").split(".")[0])
        if linenum != self._linenum:
            self._clear_tags(linenum)
            self._analyze_line(self._linenum)
            self._linenum = linenum

    def _replace_arrows(self, *e):
        """
        Replace any ``'->'`` text strings with arrows (char \\256, in
        symbol font).  This searches the whole buffer, but is fast
        enough to be done anytime they press '>'.
        """
        text = self._textwidget

        # Pass 1: swap every "->" for a tab followed by the arrow glyph.
        arrow = "1.0"
        while True:
            arrow = text.search("->", arrow, "end+1char")
            if arrow == "":
                break
            text.delete(arrow, arrow + "+2char")
            text.insert(arrow, self.ARROW, "arrow")
            # Inserted at the same index, so the tab lands before the glyph.
            text.insert(arrow, "\t")

        # Pass 2: tag every arrow glyph found in the buffer.
        arrow = "1.0"
        while True:
            arrow = text.search(self.ARROW, arrow + "+1char", "end+1char")
            if arrow == "":
                break
            text.tag_add("arrow", arrow, arrow + "+1char")

    def _analyze_token(self, match, linenum):
        """
        Given a line number and a regexp match for a token on that
        line, colorize the token.  Note that the regexp match gives us
        the token's text, start index (on the line), and end index (on
        the line).
        """
        token = match.group()
        # What type of token is it?
        if token[0] in "'\"":
            tag = "terminal"
        elif token in ("->", self.ARROW):
            tag = "arrow"
        else:
            # If it's a nonterminal, then set up new bindings, so we
            # can highlight all instances of that nonterminal when we
            # put the mouse over it.
            tag = "nonterminal_" + token
            if tag not in self._textwidget.tag_names():
                self._init_nonterminal_tag(tag)

        start = "%d.%d" % (linenum, match.start())
        end = "%d.%d" % (linenum, match.end())
        self._textwidget.tag_add(tag, start, end)

    def _init_nonterminal_tag(self, tag, foreground="blue"):
        """
        Configure the text tag used for one nonterminal and, when
        matching-nonterminal highlighting is enabled, make every
        occurrence light up while the mouse is over any of them.
        """
        self._textwidget.tag_config(tag,
                                    foreground=foreground,
                                    font=CFGEditor._BOLD)
        if not self._highlight_matching_nonterminals:
            return

        def enter(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background="#80ff80")

        def leave(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background="")

        self._textwidget.tag_bind(tag, "<Enter>", enter)
        self._textwidget.tag_bind(tag, "<Leave>", leave)

    def _analyze_line(self, linenum):
        """
        Colorize a given line.
        """
        # Get rid of any tags that were previously on the line.
        self._clear_tags(linenum)

        # Get the line's text string.
        line = self._textwidget.get("%d.0" % linenum, "%d.end" % linenum)

        if CFGEditor._PRODUCTION_RE.match(line):
            # It's a valid production; colorize each of its tokens.
            for match in CFGEditor._TOKEN_RE.finditer(line):
                self._analyze_token(match, linenum)
        elif line.strip() != "":
            # It's invalid; show the user where the error is.
            self._mark_error(linenum, line)

    def _mark_error(self, linenum, line):
        """
        Mark the location of an error in a line.
        """
        whole_line = ("%d.0" % linenum, "%d.end" % linenum)

        arrowmatch = CFGEditor._ARROW_RE.search(line)
        if not arrowmatch:
            # If there's no arrow at all, highlight the whole line.
            start, end = whole_line
        elif not CFGEditor._LHS_RE.match(line):
            # Otherwise, if the LHS is bad, highlight it.
            start = "%d.0" % linenum
            end = "%d.%d" % (linenum, arrowmatch.start())
        else:
            # Otherwise, highlight the RHS.
            start = "%d.%d" % (linenum, arrowmatch.end())
            end = "%d.end" % linenum

        # If we're highlighting 0 chars, highlight the whole line.
        if self._textwidget.compare(start, "==", end):
            start, end = whole_line
        self._textwidget.tag_add("error", start, end)

    def _analyze(self, *e):
        """
        Replace ``->`` with arrows, and colorize the entire buffer.
        """
        self._replace_arrows()
        numlines = int(self._textwidget.index("end").split(".")[0])
        for linenum in range(1, numlines + 1):  # line numbers start at 1.
            self._analyze_line(linenum)

    def _parse_productions(self):
        """
        Parse the current contents of the textwidget buffer, to create
        a list of productions.

        Blank lines are skipped; every other line is handed to
        ``_read_cfg_production``, whose results are concatenated.
        """
        # Get the text and normalize it: arrow glyphs back to "->",
        # tabs to spaces.
        text = self._textwidget.get("1.0", "end")
        text = text.replace(self.ARROW, "->").replace("\t", " ")

        # Convert each non-blank line to CFG productions.
        productions = []
        for line in text.split("\n"):
            line = line.strip()
            if line == "":
                continue
            productions += _read_cfg_production(line)

        return productions

    def _destroy(self, *e):
        """Close the window; does nothing if it is already closed."""
        if self._top is None:
            return
        self._top.destroy()
        self._top = None

    def _ok(self, *e):
        """Apply the edited grammar, then close the window."""
        self._apply()
        self._destroy()

    def _apply(self, *e):
        """
        Build a CFG from the start-symbol entry and the edited
        productions and pass it to the callback, if one was given.
        """
        productions = self._parse_productions()
        start = Nonterminal(self._start.get())
        cfg = CFG(start, productions)
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(cfg)

    def _reset(self, *e):
        """
        Discard the edits: refill the editor with the productions of
        the original grammar (one per line) and pass that grammar to
        the callback, if one was given.
        """
        self._textwidget.delete("1.0", "end")
        for production in self._cfg.productions():
            self._textwidget.insert("end", "%s\n" % production)
        self._analyze()
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(self._cfg)

    def _cancel(self, *e):
        """Try to restore the original grammar, then close the window."""
        try:
            self._reset()
        except Exception:
            # Best effort only: a failed reset must not prevent the
            # window from closing.
            pass
        self._destroy()

    def _help(self, *e):
        """Show the editor's help text in a separate window."""
        # NOTE(review): the title says "Chart Parser Demo" although this
        # is the CFG editor's help -- confirm whether that is intended.
        title = "Help: Chart Parser Demo"
        text = (_CFGEditor_HELP).strip()
        # The default font's not very legible; try using 'fixed' instead.
        try:
            ShowText(self._parent, title, text, width=75, font="fixed")
        except Exception:
            # Fall back to the default font.
            ShowText(self._parent, title, text, width=75)
Exemplo n.º 3
0
class CFGEditor(object):
    """
    A dialog window for creating and editing context free grammars.
    ``CFGEditor`` imposes the following restrictions:

    - All nonterminals must be strings consisting of word
      characters.
    - All terminals must be strings consisting of word characters
      and space characters.

    Productions are edited as text, one line per left-hand side; each
    line is colorized as the user moves between lines, and lines that
    do not look like a production are marked with the ``error`` tag.
    """

    # Regular expressions used by _analyze_line and _mark_error.
    # Precompile them, so we can process the text faster.  Every pattern
    # fragment is a raw string so that ``\s`` and ``\w`` reach the regex
    # engine instead of being (invalid) string escapes.
    ARROW = SymbolWidget.SYMBOLS['rightarrow']
    # Either the ASCII arrow or the arrow glyph.  The glyph is escaped so
    # that it is always matched literally, whatever character it is.
    _ARROW_ALT = r"(->|(" + re.escape(ARROW) + r"))"
    _LHS_RE = re.compile(r"(^\s*\w+\s*)" + _ARROW_ALT)
    _ARROW_RE = re.compile(r"\s*" + _ARROW_ALT + r"\s*")
    _PRODUCTION_RE = re.compile(
        r"(^\s*\w+\s*)"  # LHS
        + _ARROW_ALT  # arrow
        + r"\s*"
        + r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$"  # RHS
    )
    _TOKEN_RE = re.compile(
        r"\w+|->|'[\w ]+'|\"[\w ]+\"|(" + re.escape(ARROW) + r")"
    )
    _BOLD = ('helvetica', -12, 'bold')

    def __init__(self, parent, cfg=None, set_cfg_callback=None):
        """
        Create the editor window as a ``Toplevel`` child of ``parent``.

        :param parent: widget that owns the new top-level window (also
            used as the parent of the help window).
        :param cfg: grammar to edit; when None, an empty grammar with
            start symbol ``S`` is used.
        :param set_cfg_callback: optional callable invoked with a CFG
            whenever the grammar is applied or reset.
        """
        self._parent = parent
        if cfg is None:
            cfg = CFG(Nonterminal('S'), [])
        self._cfg = cfg
        self._set_cfg_callback = set_cfg_callback

        self._highlight_matching_nonterminals = True

        # Create the top-level window.
        self._top = Toplevel(parent)
        self._init_bindings()

        self._init_startframe()
        self._startframe.pack(side='top', fill='x', expand=0)
        self._init_prodframe()
        self._prodframe.pack(side='top', fill='both', expand=1)
        self._init_buttons()
        self._buttonframe.pack(side='bottom', fill='x', expand=0)

        self._textwidget.focus()

    def _init_startframe(self):
        """Build the header row holding the start-symbol entry."""
        frame = self._startframe = Frame(self._top)
        self._start = Entry(frame)
        self._start.pack(side='right')
        Label(frame, text='Start Symbol:').pack(side='right')
        Label(frame, text='Productions:').pack(side='left')
        self._start.insert(0, self._cfg.start().symbol())

    def _init_buttons(self):
        """Build the button row (Ok/Apply/Reset/Cancel on the left, Help on the right)."""
        frame = self._buttonframe = Frame(self._top)
        # (label, handler, side) in packing order.
        buttons = (
            ('Ok', self._ok, 'left'),
            ('Apply', self._apply, 'left'),
            ('Reset', self._reset, 'left'),
            ('Cancel', self._cancel, 'left'),
            ('Help', self._help, 'right'),
        )
        for label, handler, side in buttons:
            Button(
                frame, text=label, command=handler, underline=0, takefocus=0
            ).pack(side=side)

    def _init_bindings(self):
        """Set the window title and the keyboard shortcuts of the dialog."""
        self._top.title('CFG Editor')
        # NOTE: <Control-x> and <Control-c> were already disabled for
        # cancel in the original bindings and stay unbound here.
        shortcuts = (
            (
                self._cancel,
                ('<Control-q>', '<Alt-q>', '<Control-d>', '<Alt-x>',
                 '<Escape>', '<Alt-c>'),
            ),
            (self._ok, ('<Control-o>', '<Alt-o>')),
            (self._apply, ('<Control-a>', '<Alt-a>')),
            (self._reset, ('<Control-r>', '<Alt-r>')),
            (self._help, ('<Control-h>', '<Alt-h>', '<F1>')),
        )
        for handler, sequences in shortcuts:
            for sequence in sequences:
                self._top.bind(sequence, handler)

    def _init_prodframe(self):
        """
        Build the production editor (text widget plus scrollbar), hook
        up the re-colorizing bindings, and fill the editor with the
        productions of the current grammar.
        """
        self._prodframe = Frame(self._top)

        # Create the basic Text widget & scrollbar.
        self._textwidget = Text(
            self._prodframe, background='#e0e0e0', exportselection=1
        )
        self._textscroll = Scrollbar(self._prodframe, takefocus=0, orient='vertical')
        self._textwidget.config(yscrollcommand=self._textscroll.set)
        self._textscroll.config(command=self._textwidget.yview)
        self._textscroll.pack(side='right', fill='y')
        self._textwidget.pack(expand=1, fill='both', side='left')

        # Initialize the colorization tags.  Each nonterminal gets its
        # own tag, so they aren't listed here.
        self._textwidget.tag_config('terminal', foreground='#006000')
        self._textwidget.tag_config('arrow', font='symbol')
        self._textwidget.tag_config('error', background='red')

        # Keep track of what line they're on.  We use that to remember
        # to re-analyze a line whenever they leave it.
        self._linenum = 0

        # Expand "->" to an arrow.
        self._top.bind('>', self._replace_arrows)

        # Re-colorize lines when appropriate.
        self._top.bind('<<Paste>>', self._analyze)
        self._top.bind('<KeyPress>', self._check_analyze)
        self._top.bind('<ButtonPress>', self._check_analyze)

        # Tab cycles focus.  Returning 'break' stops the event here;
        # otherwise the Text widget's own <Tab> binding still runs
        # afterwards and inserts a tab character.
        def cycle(e, textwidget=self._textwidget):
            textwidget.tk_focusNext().focus()
            return 'break'

        self._textwidget.bind('<Tab>', cycle)

        for line in self._production_lines():
            self._textwidget.insert('end', line)

        self._analyze()

    def _production_lines(self):
        """
        Render the grammar's productions as editor text.

        Adjacent productions that share a left-hand side are merged
        into one ``LHS -> a | b`` line, except when either of them has
        an empty right-hand side.

        :return: list of newline-terminated strings, one per line.
        """
        groups = [(p.lhs(), [p.rhs()]) for p in self._cfg.productions()]
        # Walk backwards so deleting entry i never shifts an entry that
        # is still to be visited.
        for i in range(len(groups) - 1, 0, -1):
            lhs, rhss = groups[i]
            prev_lhs, prev_rhss = groups[i - 1]
            if lhs == prev_lhs and () not in rhss and () not in prev_rhss:
                prev_rhss.extend(rhss)
                del groups[i]

        def show(elt):
            # Nonterminals are shown bare, terminals as quoted literals.
            return str(elt) if isinstance(elt, Nonterminal) else repr(elt)

        lines = []
        for lhs, rhss in groups:
            alternatives = [
                ''.join(' ' + show(elt) for elt in rhs) for rhs in rhss
            ]
            lines.append('%s ->%s\n' % (lhs, ' |'.join(alternatives)))
        return lines

    def _clear_tags(self, linenum):
        """
        Remove all tags (except ``arrow`` and ``sel``) from the given
        line of the text widget used for editing the productions.
        """
        start = '%d.0' % linenum
        end = '%d.end' % linenum
        for tag in self._textwidget.tag_names():
            if tag not in ('arrow', 'sel'):
                self._textwidget.tag_remove(tag, start, end)

    def _check_analyze(self, *e):
        """
        Check if we've moved to a new line.  If we have, then remove
        all colorization from the line we moved to, and re-colorize
        the line that we moved from.
        """
        linenum = int(self._textwidget.index('insert').split('.')[0])
        if linenum != self._linenum:
            self._clear_tags(linenum)
            self._analyze_line(self._linenum)
            self._linenum = linenum

    def _replace_arrows(self, *e):
        """
        Replace any ``'->'`` text strings with arrows (char \\256, in
        symbol font).  This searches the whole buffer, but is fast
        enough to be done anytime they press '>'.
        """
        text = self._textwidget

        # Pass 1: swap every "->" for a tab followed by the arrow glyph.
        arrow = '1.0'
        while True:
            arrow = text.search('->', arrow, 'end+1char')
            if arrow == '':
                break
            text.delete(arrow, arrow + '+2char')
            text.insert(arrow, self.ARROW, 'arrow')
            # Inserted at the same index, so the tab lands before the glyph.
            text.insert(arrow, '\t')

        # Pass 2: tag every arrow glyph found in the buffer.
        arrow = '1.0'
        while True:
            arrow = text.search(self.ARROW, arrow + '+1char', 'end+1char')
            if arrow == '':
                break
            text.tag_add('arrow', arrow, arrow + '+1char')

    def _analyze_token(self, match, linenum):
        """
        Given a line number and a regexp match for a token on that
        line, colorize the token.  Note that the regexp match gives us
        the token's text, start index (on the line), and end index (on
        the line).
        """
        token = match.group()
        # What type of token is it?
        if token[0] in "'\"":
            tag = 'terminal'
        elif token in ('->', self.ARROW):
            tag = 'arrow'
        else:
            # If it's a nonterminal, then set up new bindings, so we
            # can highlight all instances of that nonterminal when we
            # put the mouse over it.
            tag = 'nonterminal_' + token
            if tag not in self._textwidget.tag_names():
                self._init_nonterminal_tag(tag)

        start = '%d.%d' % (linenum, match.start())
        end = '%d.%d' % (linenum, match.end())
        self._textwidget.tag_add(tag, start, end)

    def _init_nonterminal_tag(self, tag, foreground='blue'):
        """
        Configure the text tag used for one nonterminal and, when
        matching-nonterminal highlighting is enabled, make every
        occurrence light up while the mouse is over any of them.
        """
        self._textwidget.tag_config(tag, foreground=foreground, font=CFGEditor._BOLD)
        if not self._highlight_matching_nonterminals:
            return

        def enter(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background='#80ff80')

        def leave(e, textwidget=self._textwidget, tag=tag):
            textwidget.tag_config(tag, background='')

        self._textwidget.tag_bind(tag, '<Enter>', enter)
        self._textwidget.tag_bind(tag, '<Leave>', leave)

    def _analyze_line(self, linenum):
        """
        Colorize a given line.
        """
        # Get rid of any tags that were previously on the line.
        self._clear_tags(linenum)

        # Get the line's text string.
        line = self._textwidget.get('%d.0' % linenum, '%d.end' % linenum)

        if CFGEditor._PRODUCTION_RE.match(line):
            # It's a valid production; colorize each of its tokens.
            for match in CFGEditor._TOKEN_RE.finditer(line):
                self._analyze_token(match, linenum)
        elif line.strip() != '':
            # It's invalid; show the user where the error is.
            self._mark_error(linenum, line)

    def _mark_error(self, linenum, line):
        """
        Mark the location of an error in a line.
        """
        whole_line = ('%d.0' % linenum, '%d.end' % linenum)

        arrowmatch = CFGEditor._ARROW_RE.search(line)
        if not arrowmatch:
            # If there's no arrow at all, highlight the whole line.
            start, end = whole_line
        elif not CFGEditor._LHS_RE.match(line):
            # Otherwise, if the LHS is bad, highlight it.
            start = '%d.0' % linenum
            end = '%d.%d' % (linenum, arrowmatch.start())
        else:
            # Otherwise, highlight the RHS.
            start = '%d.%d' % (linenum, arrowmatch.end())
            end = '%d.end' % linenum

        # If we're highlighting 0 chars, highlight the whole line.
        if self._textwidget.compare(start, '==', end):
            start, end = whole_line
        self._textwidget.tag_add('error', start, end)

    def _analyze(self, *e):
        """
        Replace ``->`` with arrows, and colorize the entire buffer.
        """
        self._replace_arrows()
        numlines = int(self._textwidget.index('end').split('.')[0])
        for linenum in range(1, numlines + 1):  # line numbers start at 1.
            self._analyze_line(linenum)

    def _parse_productions(self):
        """
        Parse the current contents of the textwidget buffer, to create
        a list of productions.

        Blank lines are skipped; every other line is handed to
        ``_read_cfg_production``, whose results are concatenated.
        """
        # Get the text and normalize it: arrow glyphs back to "->",
        # tabs to spaces.
        text = self._textwidget.get('1.0', 'end')
        text = text.replace(self.ARROW, '->').replace('\t', ' ')

        # Convert each non-blank line to CFG productions.
        productions = []
        for line in text.split('\n'):
            line = line.strip()
            if line == '':
                continue
            productions += _read_cfg_production(line)

        return productions

    def _destroy(self, *e):
        """Close the window; does nothing if it is already closed."""
        if self._top is None:
            return
        self._top.destroy()
        self._top = None

    def _ok(self, *e):
        """Apply the edited grammar, then close the window."""
        self._apply()
        self._destroy()

    def _apply(self, *e):
        """
        Build a CFG from the start-symbol entry and the edited
        productions and pass it to the callback, if one was given.
        """
        productions = self._parse_productions()
        start = Nonterminal(self._start.get())
        cfg = CFG(start, productions)
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(cfg)

    def _reset(self, *e):
        """
        Discard the edits: refill the editor with the productions of
        the original grammar (one per line) and pass that grammar to
        the callback, if one was given.
        """
        self._textwidget.delete('1.0', 'end')
        for production in self._cfg.productions():
            self._textwidget.insert('end', '%s\n' % production)
        self._analyze()
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(self._cfg)

    def _cancel(self, *e):
        """Try to restore the original grammar, then close the window."""
        try:
            self._reset()
        except Exception:
            # Best effort only: a failed reset must not prevent the
            # window from closing.
            pass
        self._destroy()

    def _help(self, *e):
        """Show the editor's help text in a separate window."""
        # NOTE(review): the title says "Chart Parser Demo" although this
        # is the CFG editor's help -- confirm whether that is intended.
        title = 'Help: Chart Parser Demo'
        text = (_CFGEditor_HELP).strip()
        # The default font's not very legible; try using 'fixed' instead.
        try:
            ShowText(self._parent, title, text, width=75, font='fixed')
        except Exception:
            # Fall back to the default font.
            ShowText(self._parent, title, text, width=75)
            stimtrees.append(
                ("(ROOT" + m, num)
            )  # add ROOT tag back at the beginning of the tree, and output a tuple with tree and the stimulus ID (num)

"""Fix up the last tree in stimulus 0.. stimulus 0 excluded the last IU (="stating that") in the speaker's turn, to make it fit with the desired IU count. Those two words were included in the text given to the parser in case the omission would have caused the parser difficulty, but we don't want to include them in our analysis since the subjects didn't actually hear them.. I removed those two words from the stanford parse tree text file that we read in earlier, but now I need to add in the final parentheses to make the parse processable by the Tree function
"""
# Append the closing parentheses missing from tree 5 so that
# Tree.fromstring can parse it; the stimulus ID is kept unchanged.
stimtrees[5] = (stimtrees[5][0] + ")))))))\n(. ?))\n(. .)))\n\n", stimtrees[5][1])

# Build a Tree structure from the text of each (tree text, stimulus ID) pair.
processed_trees = [Tree.fromstring(entry[0]) for entry in stimtrees]
# Bare expression: shows the tree for stimulus 0 when evaluated
# interactively; it has no effect when the file runs as a plain script.
processed_trees[0]

# Collect the productions of every tree into one flat list.  A
# comprehension avoids repeated list concatenation and does not depend
# on a `reduce` name being importable.
rules = [prod for tree in processed_trees for prod in tree.productions()]
mycfg = CFG(Nonterminal("ROOT"), rules)
mycfg.start()
# Productions whose left-hand side is the given nonterminal (e.g. "PP",
# a prepositional phrase: PP -> whatever).  Like the bare expressions
# above, the result is only shown when evaluated interactively.
mycfg.productions(lhs=Nonterminal("PP"))

#%%
# ==============================================================================
# Loop through Production rules to extract Syntactic Tags and Terminal Words, keep track of Clause boundaries by looking for the first word appearing after an "S" tag
# ==============================================================================
words = []
counter = 0
tags = []
ruleset = []
# Below, this variable will be set to True if the rule begins with 'S'
# (clause boundary), or False if the rule contains a terminal.
ClauseBoundary = False