Example #1
def copyTokenToRepeater(t, l, s):
    # parse action: route the tokens just matched into the enclosing
    # `rep` expression (filled with <<, so defined as a Forward outside this snippet)
    if t:
        if len(t) == 1:
            rep << t[0]
        else:
            # flatten t tokens
            tflat = _flatten(t)
            rep << And(Literal(tt) for tt in tflat)
    else:
        rep << Empty()
Example #2
def makeHTMLTags(tagStr, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Helper to construct opening and closing tag expressions for HTML,
    given a tag name. Matches tags in either upper or lower case,
    attributes with namespaces and with quoted or unquoted values.
    """
    if isinstance(tagStr, text):
        resname = tagStr
        tagStr = Keyword(tagStr, caseless=True)
    else:
        resname = tagStr.parser_name

    tagAttrName = Word(alphas, alphanums + "_-:")
    tagAttrValue = quotedString.addParseAction(removeQuotes) | Word(
        printables, exclude=">"
    )
    simpler_name = "".join(resname.replace(":", " ").title().split())

    openTag = (
        (
            suppress_LT
            + tagStr("tag")
            + OpenDict(ZeroOrMore(Group(
                tagAttrName.addParseAction(downcaseTokens)
                + Optional(Suppress("=") + tagAttrValue)
            )))
            + Optional(
                "/", default=[False]
            )("empty").addParseAction(lambda t, l, s: t[0] == "/")
            + suppress_GT
        )
        .set_token_name("start" + simpler_name)
        .set_parser_name("<%s>" % resname)
    )

    closeTag = (
        Combine(Literal("</") + tagStr + ">")
        .set_token_name("end" + simpler_name)
        .set_parser_name("</%s>" % resname)
    )

    # openTag.tag = resname
    # closeTag.tag = resname
    # openTag.tag_body = SkipTo(closeTag)

    return openTag, closeTag
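A short usage sketch (not from the library source): the returned pair is combined with SkipTo to capture an element body. The sample markup, the "body" results name, and the name-based lookup on the result follow pyparsing conventions and are assumptions here.

a_start, a_end = makeHTMLTags("A")
link = a_start + SkipTo(a_end)("body") + a_end
result = link.parseString('<a href="https://example.com">click here</a>')
print(result["body"])  # -> click here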
Example #3
def QuotedString(
    quote_char,
    esc_char=None,
    esc_quote=None,
    multiline=False,
    unquote_results=True,
    end_quote_char="",
    convert_whitespace_escape=True,
):
    r"""
    Token for matching strings that are delimited by quoting characters.

    Defined with the following parameters:

        - quote_char - string of one or more characters defining the
          quote delimiting string
        - esc_char - character to escape quotes, typically backslash
          (default= ``None``)
        - esc_quote - special quote sequence to escape an embedded quote
          string (such as SQL's ``""`` to escape an embedded ``"``)
          (default= ``None``)
        - multiline - boolean indicating whether quotes can span
          multiple lines (default= ``False``)
        - unquote_results - boolean indicating whether the matched text
          should be unquoted (default= ``True``)
        - end_quote_char - string of one or more characters defining the
          end of the quote delimited string (default= ``""`` => same as
          quote_char)
        - convert_whitespace_escape - convert escaped whitespace
          (``'\t'``, ``'\n'``, etc.) to actual whitespace
          (default= ``True``)

    """
    quote_char = quote_char.strip()
    end_quote_char = end_quote_char.strip() or quote_char

    if not quote_char:
        Log.error("quote_char cannot be the empty string")
    if not end_quote_char:
        Log.error("end_quote_char cannot be the empty string")

    excluded = Literal(end_quote_char)

    if multiline:
        anychar = AnyChar()
    else:
        anychar = Char(exclude="\n")
        excluded |= Char("\r\n")

    included = ~Literal(end_quote_char) + anychar

    if esc_quote:
        included = Literal(esc_quote) | included
    if esc_char:
        excluded |= Literal(esc_char)
        included = esc_char + Char(printables) | included
        esc_char_replace_pattern = re.escape(esc_char) + "(.)"

    prec, pattern = (
        Literal(quote_char) + ((~excluded + anychar) | included)[0:]
    ).__regex__()
    # IMPORTANT: THE end_quote_char IS OUTSIDE THE Regex BECAUSE OF PATHOLOGICAL BACKTRACKING
    output = Combine(Regex(pattern) + Literal(end_quote_char))

    def post_parse(tokens):
        ret = tokens[0]
        if unquote_results:
            # strip off quotes
            ret = ret[len(quote_char) : -len(end_quote_char)]

            if isinstance(ret, text):
                # replace escaped whitespace
                if "\\" in ret and convert_whitespace_escape:
                    ws_map = {
                        r"\t": "\t",
                        r"\n": "\n",
                        r"\f": "\f",
                        r"\r": "\r",
                    }
                    for wslit, wschar in ws_map.items():
                        ret = ret.replace(wslit, wschar)

                # replace escaped characters
                if esc_char:
                    ret = re.sub(esc_char_replace_pattern, r"\g<1>", ret)

                # replace escaped quotes
                if esc_quote:
                    ret = ret.replace(esc_quote, end_quote_char)

        return ParseResults(tokens.type, tokens.start, tokens.end, [ret])

    return output.addParseAction(post_parse).streamline()
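A minimal usage sketch, assuming parseString and list-style results behave as in the delimitedList docstring further below; the sample strings are illustrative:

sql_string = QuotedString("'", esc_quote="''")
print(sql_string.parseString("'It''s here'"))         # -> ["It's here"]

c_string = QuotedString('"', esc_char="\\")
print(c_string.parseString('"a \\"quoted\\" word"'))   # -> ['a "quoted" word']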
Example #4
def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString):
    """Helper method for defining nested lists enclosed in opening and
    closing delimiters ("(" and ")" are the default).

    Parameters:
     - opener - opening character for a nested list
       (default= ``"("``); can also be a mo_parsing expression
     - closer - closing character for a nested list
       (default= ``")"``); can also be a mo_parsing expression
     - content - expression for items within the nested lists
       (default= ``None``)
     - ignoreExpr - expression for ignoring opening and closing
       delimiters (default= `quotedString`)

    If an expression is not provided for the content argument, the
    nested expression will capture all whitespace-delimited content
    between delimiters as a list of separate values.

    Use the ``ignoreExpr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quotedString or
    a comment expression.  Specify multiple expressions using an
    `Or` or `MatchFirst`. The default is
    `quotedString`, but if no expressions are to be ignored, then
    pass ``None`` for this argument.

    """
    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")
    if content is None:
        if not isinstance(opener, text) or not isinstance(closer, text):
            raise ValueError(
                "opening and closing arguments must be strings if no content expression"
                " is given"
            )

        ignore_chars = engine.CURRENT.white_chars
        with Engine(""):

            def scrub(t):
                return t[0].strip()

            if len(opener) == 1 and len(closer) == 1:
                if ignoreExpr is not None:
                    content = Combine(OneOrMore(
                        ~ignoreExpr
                        + CharsNotIn(opener + closer + "".join(ignore_chars), exact=1,)
                    )).addParseAction(scrub)
                else:
                    content = Empty + CharsNotIn(
                        opener + closer + "".join(ignore_chars)
                    ).addParseAction(scrub)
            else:
                if ignoreExpr is not None:
                    content = Combine(OneOrMore(
                        ~ignoreExpr
                        + ~Literal(opener)
                        + ~Literal(closer)
                        + CharsNotIn(ignore_chars, exact=1)
                    )).addParseAction(scrub)
                else:
                    content = Combine(OneOrMore(
                        ~Literal(opener)
                        + ~Literal(closer)
                        + CharsNotIn(ignore_chars, exact=1)
                    )).addParseAction(scrub)
    ret = Forward()
    if ignoreExpr is not None:
        ret <<= Group(
            Suppress(opener) + ZeroOrMore(ignoreExpr | ret | content) + Suppress(closer)
        )
    else:
        ret <<= Group(Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer))
    ret.set_parser_name("nested %s%s expression" % (opener, closer))
    return ret
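A usage sketch with illustrative input; per the docstring, whitespace-delimited content between the delimiters comes back as nested lists:

data = nestedExpr().parseString("(a (b c) d)")
print(data)  # -> [['a', ['b', 'c'], 'd']]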
Example #5
dblQuotedString = Combine(
    #       0         1         2         3         4         5
    #       012345678901234567890123456789012345678901234567890123456789
    Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*')
    + '"'
).set_parser_name("string enclosed in double quotes")
sglQuotedString = Combine(
    Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*") + "'"
).set_parser_name("string enclosed in single quotes")
quotedString = Combine(
    Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*') + '"'
    | Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*") + "'"
).set_parser_name("quotedString using single or double quotes")
unicodeString = Combine(
    Literal("u") + quotedString
).set_parser_name("unicode string literal")


def countedArray(expr, intExpr=None):
    """Helper to define a counted list of expressions.

    This helper defines a pattern of the form::

        integer expr expr expr...

    where the leading integer tells how many expr expressions follow.
    The matched tokens are returned as a list of expr tokens; the
    leading count token is suppressed.

    If ``intExpr`` is specified, it should be a mo_parsing expression
Example #6
from mo_parsing.utils import regex_range

# import later
And, Or, MatchFirst = [None] * 3

dblQuotedString = Combine(
    Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*') +
    '"').set_parser_name("string enclosed in double quotes")
sglQuotedString = Combine(
    Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*") +
    "'").set_parser_name("string enclosed in single quotes")
quotedString = Combine(
    Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*') + '"'
    | Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*") +
    "'").set_parser_name("quotedString using single or double quotes")
unicodeString = Combine(Literal("u") +
                        quotedString).set_parser_name("unicode string literal")


def delimitedList(expr, separator=",", combine=False):
    """
    PARSE DELIMITED LIST OF expr
    Example::

        delimitedList(Word(alphas)).parseString("aa,bb,cc") # -> ['aa', 'bb', 'cc']
        delimitedList(Word(hexnums), separator=':', combine=True).parseString("AA:BB:CC:DD:EE") # -> ['AA:BB:CC:DD:EE']
    """
    if combine:
        return Combine(expr + ZeroOrMore(separator + expr))
    else:
        return expr + ZeroOrMore(Suppress(separator) + expr)
Example #7
def infixNotation(baseExpr,
                  spec,
                  lpar=Suppress(Literal("(")),
                  rpar=Suppress(Literal(")"))):
    """
    :param baseExpr: expression representing the most basic element of the
       nested expression grammar
    :param spec: list of tuples, one for each operator precedence level
       in the expression grammar; each tuple is of the form ``(opExpr,
       numTerms, rightLeftAssoc, parseAction)``, where:

       - opExpr is the mo_parsing expression for the operator; may also
         be a string, which will be converted to a Literal; if numTerms
         is 3, opExpr is a tuple of two expressions, for the two
         operators separating the 3 terms
       - numTerms is the number of terms for this operator (must be 1,
         2, or 3)
       - rightLeftAssoc is the indicator whether the operator is right
         or left associative, using the mo_parsing-defined constants
         ``RIGHT_ASSOC`` and ``LEFT_ASSOC``.
       - parseAction is the parse action to be associated with
         expressions matching this operator expression
    :param lpar: expression for matching left-parentheses
       (default= ``Suppress('(')``)
    :param rpar: expression for matching right-parentheses
       (default= ``Suppress(')')``)
    :return: ParserElement
    """

    all_op = {}

    def norm(op):
        if op == None:
            op = _no_op
        output = all_op.get(id(op))
        if output:
            return output

        def record_self(tok):
            ParseResults(tok.type, tok.start, tok.end, [tok.type.parser_name])

        output = engine.CURRENT.normalize(op)
        is_suppressed = isinstance(output, Suppress)
        if is_suppressed:
            output = output.expr
        output = output.addParseAction(record_self)
        all_op[id(op)] = is_suppressed, output
        return is_suppressed, output

    opList = []
    """
    SCRUBBED LIST OF OPERATORS
    * expr - used exclusively for ParseResult(expr, [...]), not used to match
    * op - used to match 
    * arity - same
    * assoc - same
    * parse_actions - same
    """

    for operDef in spec:
        op, arity, assoc, rest = operDef[0], operDef[1], operDef[2], operDef[
            3:]
        parse_actions = list(map(wrap_parse_action, listwrap(
            rest[0]))) if rest else []
        if arity == 1:
            is_suppressed, op = norm(op)
            if assoc == RIGHT_ASSOC:
                opList.append((
                    Group(baseExpr + op),
                    op,
                    is_suppressed,
                    arity,
                    assoc,
                    parse_actions,
                ))
            else:
                opList.append((
                    Group(op + baseExpr),
                    op,
                    is_suppressed,
                    arity,
                    assoc,
                    parse_actions,
                ))
        elif arity == 2:
            is_suppressed, op = norm(op)
            opList.append((
                Group(baseExpr + op + baseExpr),
                op,
                is_suppressed,
                arity,
                assoc,
                parse_actions,
            ))
        elif arity == 3:
            is_suppressed, op = zip(norm(op[0]), norm(op[1]))
            opList.append((
                Group(baseExpr + op[0] + baseExpr + op[1] + baseExpr),
                op,
                is_suppressed,
                arity,
                assoc,
                parse_actions,
            ))
    opList = tuple(opList)

    def record_op(op):
        def output(tokens):
            return ParseResults(NO_PARSER, tokens.start, tokens.end,
                                [(tokens, op)])

        return output

    prefix_ops = MatchFirst([
        op.addParseAction(record_op(op))
        for expr, op, is_suppressed, arity, assoc, pa in opList
        if arity == 1 and assoc == RIGHT_ASSOC
    ])
    suffix_ops = MatchFirst([
        op.addParseAction(record_op(op))
        for expr, op, is_suppressed, arity, assoc, pa in opList
        if arity == 1 and assoc == LEFT_ASSOC
    ])
    ops = Or([
        opPart.addParseAction(record_op(opPart)) for opPart in set(
            opPart for expr, op, is_suppressed, arity, assoc, pa in opList
            if arity > 1 for opPart in (op if isinstance(op, tuple) else [op]))
    ])

    def make_tree(tokens, loc, string):
        flat_tokens = list(tokens)
        num = len(opList)
        op_index = 0
        while len(flat_tokens) > 1 and op_index < num:
            expr, op, is_suppressed, arity, assoc, parse_actions = opList[
                op_index]
            if arity == 1:
                if assoc == RIGHT_ASSOC:
                    # PREFIX OPERATOR -3
                    todo = list(reversed(list(enumerate(flat_tokens[:-1]))))
                    for i, (r, o) in todo:
                        if o == op:
                            tok = flat_tokens[i + 1][0]
                            if is_suppressed:
                                result = ParseResults(expr, tok.start, tok.end,
                                                      (tok, ))
                            else:
                                result = ParseResults(expr, r.start, tok.end,
                                                      (r, tok))
                            break
                    else:
                        op_index += 1
                        continue
                else:
                    # SUFFIX OPERATOR 3!
                    todo = list(enumerate(flat_tokens[1:]))
                    for i, (r, o) in todo:
                        if o == op:
                            tok = flat_tokens[i][0]
                            if is_suppressed:
                                result = ParseResults(expr, tok.start, tok.end,
                                                      (tok, ))
                            else:
                                result = ParseResults(expr, tok.start, r.end, (
                                    tok,
                                    r,
                                ))
                            break
                    else:
                        op_index += 1
                        continue
            elif arity == 2:
                todo = list(enumerate(flat_tokens[1:-1]))
                if assoc == RIGHT_ASSOC:
                    todo = list(reversed(todo))

                for i, (r, o) in todo:
                    if o == op:
                        if is_suppressed:
                            result = ParseResults(
                                expr,
                                flat_tokens[i][0].start,
                                flat_tokens[i + 2][0].end,
                                (flat_tokens[i][0], flat_tokens[i + 2][0]),
                            )
                        else:
                            result = ParseResults(
                                expr,
                                flat_tokens[i][0].start,
                                flat_tokens[i + 2][0].end,
                                (flat_tokens[i][0], r, flat_tokens[i + 2][0]),
                            )
                        break
                else:
                    op_index += 1
                    continue

            else:  # arity==3
                todo = list(enumerate(flat_tokens[1:-3]))
                if assoc == RIGHT_ASSOC:
                    todo = list(reversed(todo))

                for i, (r0, o0) in todo:
                    if o0 == op[0]:
                        r1, o1 = flat_tokens[i + 3]
                        if o1 == op[1]:
                            seq = [
                                flat_tokens[i][0],
                                flat_tokens[i + 2][0],
                                flat_tokens[i + 4][0],
                            ]
                            s0, s1 = is_suppressed
                            if not s1:
                                seq.insert(2, r1)
                            if not s0:
                                seq.insert(1, r0)

                            result = ParseResults(expr, seq[0].start,
                                                  seq[-1].end, seq)
                            break
                else:
                    op_index += 1
                    continue

            for p in parse_actions:
                result = p(result, -1, string)
            offset = (0, 2, 3, 5)[arity]
            flat_tokens[i:i + offset] = [(result, (expr, ))]
            op_index = 0

        result = flat_tokens[0][0]
        result.end = tokens.end
        return result

    flat = Forward()
    iso = lpar.suppress() + flat + rpar.suppress()
    atom = (baseExpr | iso).addParseAction(record_op(baseExpr))
    modified = ZeroOrMore(prefix_ops) + atom + ZeroOrMore(suffix_ops)
    flat << (modified + ZeroOrMore(ops + modified)
             ).addParseAction(make_tree).streamline()

    return flat.streamline()
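A minimal arithmetic sketch, assuming Word, nums, RIGHT_ASSOC and LEFT_ASSOC are importable from mo_parsing as they are used above; plain operator strings are normalized to Literal by the helper:

integer = Word(nums)
arith = infixNotation(integer, [
    ("-", 1, RIGHT_ASSOC),  # prefix unary minus
    ("*", 2, LEFT_ASSOC),   # multiplication binds tighter than addition
    ("+", 2, LEFT_ASSOC),
])
print(arith.parseString("1 + 2 * -3"))  # grouped as 1 + (2 * (-3))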
Example #8
def hex_to_char(t):
    # convert a matched hex escape (e.g. "\x41") to a Literal for that character
    return Literal(unichr(int(t.value().lower().split("x")[1], 16)))
Example #9
    elif mode in "*?":
        return ZeroOrMore(operand)
    elif mode in "+?":
        return OneOrMore(operand)
    elif mode == "?":
        return Optional(operand)
    else:
        Log.error("not expected")


PLAIN_ENGINE.use()

#########################################################################################
# SQUARE BRACKETS

any_whitechar = Literal("\\s").addParseAction(lambda: Char(whitespace))
not_whitechar = Literal("\\S").addParseAction(lambda: Char(exclude=whitespace))
any_wordchar = Literal("\\w").addParseAction(lambda: Char(alphanums + "_"))
not_wordchar = Literal("\\W").addParseAction(lambda: Char(exclude=alphanums + "_"))
any_digitchar = Literal("\\d").addParseAction(lambda: Char(nums))
not_digitchar = Literal("\\D").addParseAction(lambda: Char(exclude=nums))
bs_char = Literal("\\\\").addParseAction(lambda: Literal("\\"))
tab_char = Literal("\\t").addParseAction(lambda: Literal("\t"))
CR = Literal("\\n").addParseAction(lambda: Literal("\n"))
LF = Literal("\\r").addParseAction(lambda: Literal("\r"))
any_char = Literal(".").addParseAction(lambda: AnyChar())

macro = (
    any_whitechar
    | any_wordchar
    | any_digitchar