Exemplo n.º 1
0
    def __init__(self):

        """
        Class initializer.

        Calls the Parser initializer, records the bracket pairs that
        are treated as containers, and appends the token patterns for
        comments, preprocessor directives, strings and identifiers to
        self._token_matches.
        """

        # Call the superclass initializer

        Parser.__init__(self)

        # Set the containers we're going to recognize

        self._containers = [('(', ')'), ('{', '}'), ('[', ']')]

        # Define our token pattern matches. The list is presumably
        # created by Parser.__init__() -- it is not assigned here.

        tmat = self._token_matches

        # Block comments: '/*' through the nearest '*/'. They can span
        # lines, so '.' must also match newlines. DOTALL is passed as
        # a compile flag because an inline (?s) that is not at the very
        # start of the pattern is deprecated since Python 3.6 and is an
        # error from Python 3.11.

        tmat.append(TokenMatch(re.compile(r"(/\*).*?(\*/)", re.DOTALL),
                               [CommentToken]))

        # Line comments: '//' to the end of the line

        tmat.append(TokenMatch(re.compile(r"//.*"),
                               [CommentToken]))

        # Preprocessor directives: '#' and everything after it on the
        # line, stopping short of the start of a '/*' or '//' comment

        tmat.append(TokenMatch(re.compile(r"#(?:(?!/\*|//).)*"),
                               [PreprocToken]))

        # Strings: an opening delimiter, then the shortest run up to
        # the same delimiter when it is preceded by an even number of
        # backslashes (i.e. when it is not itself escaped).
        # NOTE(review): the delimiter class also contains '/', so
        # '/.../' on one line matches as a string -- confirm that this
        # is intended.

        tmat.append(TokenMatch(re.compile(r'''(["/']).*?(?<!\\)(\\\\)*\1'''),
                               [StringToken]))

        # Identifiers: a letter or underscore followed by word characters

        tmat.append(TokenMatch(re.compile(r"[a-zA-Z_][\w]*"),
                               [IdentifierToken]))
Exemplo n.º 2
0
    def __init__(self):

        """
        Class initializer.

        Calls the Parser initializer, records the bracket pairs that
        are treated as containers, and appends the token patterns for
        Python source (comments, strings, decorators, definitions,
        identifiers, numbers, delimiters, operators and separators)
        to self._token_matches.
        """

        # Call the superclass initializer

        Parser.__init__(self)

        # Set the containers we're going to recognize

        self._containers = [('(', ')'), ('{', '}'), ('[', ']')]

        # Define our token pattern matches. The list is presumably
        # created by Parser.__init__() -- it is not assigned here.

        tmat = self._token_matches

        # Search order matters, here. Patterns will be matched in
        # the order in which they appear in this list. Look for
        # comments, first, as they override everything, and then
        # look for multi-line strings, then strings, then backticks.
        # Backticks are deprecated, and removed in Python 3, but
        # retained here for backwards compatibility.

        tmat.append(TokenMatch(re.compile(r"#.*"),
                               [CommentToken]))

        # Multi-line strings open with three identical quote
        # characters and close with the same three, provided the
        # closing quotes are preceded by an even number of backslashes.
        # The opening delimiter is spelled out as an alternation so
        # that only ''' and """ qualify (a character class would also
        # accept mixed triples), and [\s\S] is used for the body
        # because it has to match newlines; a backreference such as \1
        # is not available inside a character class, where it is read
        # as an octal escape.

        tmat.append(TokenMatch(re.compile(r"r?(\"{3}|'{3})[\s\S]*?" +
                                          r"(?<!\\)(\\\\)*\1"),
                               [MLStringToken]))

        # Single-line strings. Only the two quote characters are
        # delimiters; '|' must not be in the class, or an expression
        # like "a | b | c" would be partly matched as a string.

        tmat.append(TokenMatch(re.compile(r'''r?(["']).*?(?<!\\)(\\\\)*\1'''),
                               [StringToken]))
        tmat.append(TokenMatch(re.compile(r"r?([`]).*?(?<!\\)(\\\\)*\1"),
                               [BacktickToken]))

        # Decorators and definitions go next. Note that we include
        # periods within the match for a decorator, unlike for regular
        # identifiers

        tmat.append(TokenMatch(re.compile(r"@[a-zA-Z_][\w\.]*"),
                               [DecoratorToken]))
        tmat.append(TokenMatch(re.compile(r"(def)(\s+)([a-zA-Z_][\w]*)"),
                               [KeywordToken,
                                WhitespaceToken,
                                DefinitionToken]))
        tmat.append(TokenMatch(re.compile(r"(class)(\s+)([a-zA-Z_][\w]*)"),
                               [KeywordToken,
                                WhitespaceToken,
                                DefinitionToken]))

        # Match regular identifiers, next. First check if an identifier
        # is preceded by a period, since if it is, we should not treat
        # it as a builtin or a keyword. If it's not, then label it a
        # FirstIdentifierToken, and _get_token() will replace it with a
        # KeywordToken or a BuiltinToken, if necessary.

        tmat.append(TokenMatch(re.compile(r"(\.)([a-zA-Z_][\w]*)"),
                               [SeparatorToken, IdentifierToken]))
        tmat.append(TokenMatch(re.compile(r"[a-zA-Z_][\w]*"),
                               [FirstIdentifierToken]))

        # Match numbers. Start with floats which start with a number
        # rather than a period, then floats which start with a period.
        # Note that we cannot make numbers both before and after the
        # period as optional in the same regular expression, or a plain
        # period will match as a float.
        #
        # Then match integers. Include an optional j on the end to
        # catch complex numbers. Note that, currently, the real and
        # imaginary parts of a complex number will be captured as two
        # separate numbers with an operator between them, which is
        # probably not ideal, and may be changed in the future.

        tmat.append(TokenMatch(re.compile(r"[0-9]+[\.][0-9]*((e|E)[\+\-]" +
                                          r"?[0-9]+)?(J|j)?"),
                               [FloatToken]))
        tmat.append(TokenMatch(re.compile(r"[\.][0-9]+((e|E)[\+\-]?" +
                                          r"[0-9]+)?(j|J)?"),
                               [FloatToken]))

        # Hexadecimal literals are tried before decimal ones and accept
        # the digits a-f, so that "0xFF" is one integer rather than
        # "0" followed by the identifier "xFF".

        tmat.append(TokenMatch(re.compile(r"(0[xX][0-9a-fA-F]+|[0-9]+)" +
                                          r"(L|l)?(J|j)?"),
                               [IntegerToken]))

        # Look for assignment delimiter tokens, except the
        # regular '=' operator. Longer alternatives are listed
        # before their shorter prefixes.

        tmat.append(TokenMatch(re.compile(r"(\+=|\-=|\*\*=|\*=|//=|/=|%=|" +
                                          r"&=|\|=|\^=|<<=|>>=)"),
                               [DelimiterToken]))

        # Look for multi-character operators

        tmat.append(TokenMatch(re.compile(r"(\*\*|<<|>>|<=|>=|<>|==|!=|//)"),
                               [OperatorToken]))

        # Look for the '=' operator only after matching any
        # multi-character operators, in particular we would never
        # match the '==' operator if we looked for the '='
        # operator first

        tmat.append(TokenMatch(re.compile(r"="),
                               [DelimiterToken]))

        # Look for single character operators

        tmat.append(TokenMatch(re.compile(r"[\+\*\-\/%~&\^\|<>]"),
                               [OperatorToken]))

        # Finally, look for single character separators

        tmat.append(TokenMatch(re.compile(r"[,:\.]"),
                               [SeparatorToken]))