def name_token(tokens):
    """
    Wrap the captured "value" of `tokens` in a `Combine` and give it a token name.

    The token name is `tokens["name"]` when that is truthy; otherwise the
    string form of the module-level `num_captures` is used instead.
    All work happens inside the `PLAIN_ENGINE` context.
    """
    with PLAIN_ENGINE:
        label = tokens["name"]
        captured = tokens["value"]
        # fall back to the capture counter when no explicit name was given
        token_name = label if label else str(num_captures)
        return Combine(captured).set_token_name(token_name)
def delimitedList(expr, separator=",", combine=False):
    """
    PARSE DELIMITED LIST OF expr

    Builds ``expr (separator expr)*``.

    - expr - expression for a single list item
    - separator - text (or expression) between items (default= ``","``)
    - combine - when true the whole list, separators included, is wrapped
      in ``Combine``; otherwise each separator is wrapped in ``Suppress``
      (default= ``False``)

    Example::

        delimitedList(Word(alphas)).parseString("aa,bb,cc") # -> ['aa', 'bb', 'cc']
        delimitedList(Word(hexnums), separator=':', combine=True).parseString("AA:BB:CC:DD:EE") # -> ['AA:BB:CC:DD:EE']
    """
    if combine:
        repeated = ZeroOrMore(separator + expr)
        return Combine(expr + repeated)

    repeated = ZeroOrMore(Suppress(separator) + expr)
    return expr + repeated
def makeHTMLTags(tagStr, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """
    Build the opening-tag and closing-tag expressions for one HTML tag.

    - tagStr - tag name as a string (turned into a caseless ``Keyword``),
      or an existing expression, in which case its ``parser_name`` is used
      for naming the results
    - suppress_LT - expression consumed at the start of the opening tag
    - suppress_GT - expression consumed at the end of the opening tag

    Attribute names may contain ``_``, ``-`` and ``:``; attribute values may
    be quoted strings (quotes removed) or unquoted runs of printable
    characters other than ``>``.

    Returns the pair ``(openTag, closeTag)``.
    """
    if not isinstance(tagStr, text):
        resname = tagStr.parser_name
    else:
        resname = tagStr
        tagStr = Keyword(tagStr, caseless=True)

    attr_name = Word(alphas, alphanums + "_-:")
    quoted_value = quotedString.addParseAction(removeQuotes)
    attr_value = quoted_value | Word(printables, exclude=">")

    # "ns:tag" -> "NsTag", used to label the start/end results
    name_words = resname.replace(":", " ").title().split()
    simpler_name = "".join(name_words)

    # one attribute: lower-cased name, optionally followed by "=value"
    attribute = Group(
        attr_name.addParseAction(downcaseTokens)
        + Optional(Suppress("=") + attr_value)
    )
    # optional trailing "/" reported under "empty" as a boolean
    empty_marker = Optional("/", default=[False])("empty").addParseAction(
        lambda t, l, s: t[0] == "/"
    )

    open_body = (
        suppress_LT
        + tagStr("tag")
        + OpenDict(ZeroOrMore(attribute))
        + empty_marker
        + suppress_GT
    )
    open_tag = open_body.set_token_name("start" + simpler_name)
    open_tag = open_tag.set_parser_name("<%s>" % resname)

    close_body = Combine(Literal("</") + tagStr + ">")
    close_tag = close_body.set_token_name("end" + simpler_name)
    close_tag = close_tag.set_parser_name("</%s>" % resname)

    return open_tag, close_tag
def QuotedString(
    quote_char,
    esc_char=None,
    esc_quote=None,
    multiline=False,
    unquote_results=True,
    end_quote_char="",
    convert_whitespace_escape=True,
):
    r"""
    Build an expression for matching strings that are delimited by quoting
    characters.

    Defined with the following parameters:

     - quote_char - string of one or more characters defining the quote
       delimiting string; surrounding whitespace is stripped before use
     - esc_char - character to escape quotes, typically backslash
       (default= ``None``)
     - esc_quote - special quote sequence to escape an embedded quote string
       (such as SQL's ``""`` to escape an embedded ``"``)
       (default= ``None``)
     - multiline - boolean indicating whether quotes can span multiple lines
       (default= ``False``)
     - unquote_results - boolean indicating whether the matched text should
       be unquoted (default= ``True``)
     - end_quote_char - string of one or more characters defining the end of
       the quote delimited string (default= ``""`` => same as quote_char)
     - convert_whitespace_escape - convert escaped whitespace (``'\t'``,
       ``'\n'``, etc.) to actual whitespace (default= ``True``); only
       applied when ``unquote_results`` is true

    An empty ``quote_char`` (after stripping) is reported through
    ``Log.error``.

    Returns the combined expression with the ``post_parse`` action attached
    and ``streamline()`` applied.
    """
    quote_char = quote_char.strip()
    # a blank end quote means "same as the opening quote"
    end_quote_char = end_quote_char.strip() or quote_char

    if not quote_char:
        Log.error("quote_char cannot be the empty string")
    # NOTE(review): because of the fallback above, end_quote_char can only be
    # empty here when quote_char is empty too - this check looks unreachable
    # if Log.error raises; confirm
    if not end_quote_char:
        Log.error("end_quote_char cannot be the empty string")

    # `excluded` collects what an ordinary character must not start with
    excluded = Literal(end_quote_char)

    if multiline:
        anychar = AnyChar()
    else:
        anychar = Char(exclude="\n")
        excluded |= Char("\r\n")

    # `included` collects the alternatives allowed inside the quotes
    included = ~Literal(end_quote_char) + anychar

    if esc_quote:
        included = Literal(esc_quote) | included

    if esc_char:
        excluded |= Literal(esc_char)
        # `+` binds tighter than `|`: (esc_char + Char(printables)) | included
        included = esc_char + Char(printables) | included
        # only defined when esc_char is set; post_parse reads it under the same guard
        esc_char_replace_pattern = re.escape(esc_char) + "(.)"

    # only the second element is used below; `prec` is never read in this function
    prec, pattern = (
        Literal(quote_char) + ((~excluded + anychar) | included)[0:]
    ).__regex__()
    # IMPORTANT: THE end_quote_char IS OUTSIDE THE Regex BECAUSE OF PATHOLOGICAL BACKTRACKING
    output = Combine(Regex(pattern) + Literal(end_quote_char))

    def post_parse(tokens):
        # tokens[0] is the matched text, still carrying its delimiters
        ret = tokens[0]
        if unquote_results:
            # strip off quotes
            ret = ret[len(quote_char) : -len(end_quote_char)]

            if isinstance(ret, text):
                # replace escaped whitespace
                # (keyed on a literal backslash, whatever esc_char is)
                # NOTE(review): plain str.replace also rewrites a "\n" whose
                # backslash was itself escaped - confirm this is acceptable
                if "\\" in ret and convert_whitespace_escape:
                    ws_map = {
                        r"\t": "\t",
                        r"\n": "\n",
                        r"\f": "\f",
                        r"\r": "\r",
                    }
                    for wslit, wschar in ws_map.items():
                        ret = ret.replace(wslit, wschar)

                # replace escaped characters
                if esc_char:
                    ret = re.sub(esc_char_replace_pattern, r"\g<1>", ret)

                # replace escaped quotes
                if esc_quote:
                    ret = ret.replace(esc_quote, end_quote_char)

        # same type/start/end as the incoming result, with the cleaned text as its only token
        return ParseResults(tokens.type, tokens.start, tokens.end, [ret])

    return output.addParseAction(post_parse).streamline()
def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString):
    """Helper method for defining nested lists enclosed in opening and
    closing delimiters ("(" and ")" are the default).

    Parameters:
     - opener - opening character for a nested list
       (default= ``"("``); can also be a mo_parsing expression
     - closer - closing character for a nested list
       (default= ``")"``); can also be a mo_parsing expression
     - content - expression for items within the nested lists
       (default= ``None``)
     - ignoreExpr - expression for ignoring opening and closing
       delimiters (default= `quotedString`)

    If an expression is not provided for the content argument, the
    nested expression will capture all whitespace-delimited content
    between delimiters as a list of separate values.

    Use the ``ignoreExpr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quotedString or
    a comment expression.  Specify multiple expressions using an
    `Or` or `MatchFirst`. The default is
    `quotedString`, but if no expressions are to be ignored, then
    pass ``None`` for this argument.

    Raises ``ValueError`` when ``opener == closer``, or when ``content`` is
    ``None`` and either delimiter is not a string.

    Returns a ``Forward`` defined as a ``Group`` of the suppressed opener,
    zero or more items (ignored expressions, nested groups, content), and
    the suppressed closer.
    """
    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")
    if content is None:
        # the default content expression is built from the delimiter text itself
        if not isinstance(opener, text) or not isinstance(closer, text):
            raise ValueError(
                "opening and closing arguments must be strings if no content expression"
                " is given"
            )

        # remember the whitespace characters of the engine current at call
        # time, before switching to the Engine("") context below
        ignore_chars = engine.CURRENT.white_chars
        # presumably Engine("") disables implicit whitespace skipping so the
        # content expressions see every character - TODO confirm
        with Engine(""):

            def scrub(t):
                # trim surrounding whitespace from the first token
                return t[0].strip()

            if len(opener) == 1 and len(closer) == 1:
                # single-character delimiters: exclude them by character class
                if ignoreExpr is not None:
                    content = Combine(OneOrMore(
                        ~ignoreExpr
                        + CharsNotIn(opener + closer + "".join(ignore_chars), exact=1,)
                    )).addParseAction(scrub)
                else:
                    # NOTE(review): `Empty` is used here without being called,
                    # and `.addParseAction(scrub)` binds to the CharsNotIn
                    # operand only, not to the sum - confirm both are intended
                    content = Empty + CharsNotIn(
                        opener + closer + "".join(ignore_chars)
                    ).addParseAction(scrub)
            else:
                # multi-character delimiters: guard each character with
                # `~Literal(...)` on the opener and closer
                if ignoreExpr is not None:
                    content = Combine(OneOrMore(
                        ~ignoreExpr
                        + ~Literal(opener)
                        + ~Literal(closer)
                        + CharsNotIn(ignore_chars, exact=1)
                    )).addParseAction(scrub)
                else:
                    content = Combine(OneOrMore(
                        ~Literal(opener)
                        + ~Literal(closer)
                        + CharsNotIn(ignore_chars, exact=1)
                    )).addParseAction(scrub)
    # Forward lets the group refer to itself for arbitrarily deep nesting
    ret = Forward()
    if ignoreExpr is not None:
        # ignoreExpr is listed first among the alternatives
        ret <<= Group(
            Suppress(opener) + ZeroOrMore(ignoreExpr | ret | content) + Suppress(closer)
        )
    else:
        ret <<= Group(Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer))
    ret.set_parser_name("nested %s%s expression" % (opener, closer))
    return ret
# replace escaped characters if esc_char: ret = re.sub(esc_char_replace_pattern, r"\g<1>", ret) # replace escaped quotes if esc_quote: ret = ret.replace(esc_quote, end_quote_char) return ParseResults(tokens.type, tokens.start, tokens.end, [ret]) return output.addParseAction(post_parse).streamline() dblQuotedString = Combine( # 0 1 2 3 4 5 # 012345678901234567890123456789012345678901234567890123456789 Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*') + '"' ).set_parser_name("string enclosed in double quotes") sglQuotedString = Combine( Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*") + "'" ).set_parser_name("string enclosed in single quotes") quotedString = Combine( Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*') + '"' | Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*") + "'" ).set_parser_name("quotedString using single or double quotes") unicodeString = Combine( Literal("u") + quotedString ).set_parser_name("unicode string literal") def countedArray(expr, intExpr=None):
# Alternation of the shorthand/escape expressions defined elsewhere in this
# module (presumably the regex macros such as \s, \w, \d and their negations,
# judging by the names - verify against their definitions).
macro = (
    any_whitechar
    | any_wordchar
    | any_digitchar
    | not_digitchar
    | not_wordchar
    | not_whitechar
    | CR
    | LF
    | any_char
    | bs_char
    | tab_char
)
# A backslash followed by any one character, provided the text is not one of
# the macros above; the result is a Literal of the character after the
# backslash (index 1 of the combined text).
escapedChar = (
    ~macro + Combine("\\" + AnyChar())
).addParseAction(lambda t: Literal(t.value()[1]))
# Any single character other than a backslash or "]", turned into a Literal.
plainChar = Char(exclude=r"\]").addParseAction(lambda t: Literal(t.value()))

# "\0x", "\x" or "\X" followed by one or more hex digits; conversion is
# delegated to hex_to_char (defined elsewhere).
escapedHexChar = Combine(
    (Literal("\\0x") | Literal("\\x") | Literal("\\X"))  # lookup literals is faster
    + OneOrMore(Char(hexnums))
).addParseAction(hex_to_char)

# "\0" followed by one or more octal digits: the first two characters are
# sliced off and the rest is read as a base-8 code point.
escapedOctChar = Combine(
    Literal("\\0") + OneOrMore(Char("01234567"))
).addParseAction(lambda t: Literal(unichr(int(t.value()[2:], 8))))

# Alternatives are listed hex, octal, generic escape, plain character;
# assuming `|` tries alternatives in order, "\0x.." is taken as hex before
# the octal form can claim the "\0" prefix - TODO confirm.
singleChar = escapedHexChar | escapedOctChar | escapedChar | plainChar

# "min-max" pair of single characters, post-processed by to_range.
charRange = Group(singleChar("min") + "-" + singleChar("max")).addParseAction(to_range)