def test_unescape():
    """Check unescape() on every two-digit hex escape against Python's own parser."""
    from rpython.rlib.parsing.regexparse import unescape

    high_digits = "0123456789abcdefABCDEF"
    low_digits = "0123456789ABCDEFabcdef"

    # Build one long string made of backslash-x escapes covering every
    # combination of upper/lower-case hex digits.
    pieces = []
    for high in high_digits:
        for low in low_digits:
            pieces.append("\\x%s%s" % (high, low))
    escaped = "".join(pieces)

    actual = unescape(escaped)
    # Python's string-literal parser serves as the reference implementation.
    expected = eval("'" + escaped + "'")
    assert actual == expected
def test_unescape():
    """Compare unescape() with Python's literal parser over all hex escapes."""
    from rpython.rlib.parsing.regexparse import unescape

    first_digits = "0123456789abcdefABCDEF"
    second_digits = "0123456789ABCDEFabcdef"

    # One generator expression yields each escape; join concatenates them.
    text = "".join(
        "\\x%s%s" % (first, second)
        for first in first_digits
        for second in second_digits
    )

    decoded = unescape(text)
    # Evaluating the text as a quoted literal gives the expected decoding.
    reference = eval("'" + text + "'")
    assert decoded == reference
def create_pcre_pickle(file, dumper):
    """Create a filtered PCRE test file for the test.

    The text read from ``file`` is parsed as a sequence of test sets.  Each
    set starts with a regex enclosed in a non-alphanumeric delimiter (the
    regex may span several lines and may be followed by an optional
    backslash and optional flags), followed by test strings, each of which
    is followed by its result lines.  A blank line ends the set.

    Regexes are dropped when they carry flags or use features that are
    filtered out here: ``*?``/``+?``/``}?``/``(?`` constructs, ``\\1``
    back-references, POSIX character classes, a caret in the middle of the
    pattern, or one of the unsupported backslash escapes.  Regexes left
    without any usable test are dropped too.

    :param file: object whose ``readlines()`` returns the input lines.
    :param dumper: object whose ``dump(suite)`` receives the result, a list
        of ``[regex, flags, [(test, result), ...]]`` entries where
        ``result`` is the matched text or ``None`` for no match.
    :raises Exception: if an empty line is found where a result line was
        expected ("Lost sync in output.").
    :raises IndexError: if the input ends inside an unterminated regex.
    """
    from collections import deque

    # A deque gives O(1) removal from the front; list.pop(0) is O(n) and
    # made the whole parse quadratic in the number of lines.
    lines = deque(file.readlines())

    # Look for things to skip...
    no_escape = r'(^|[^\\])(\\\\)*'  # Make sure there's no escaping \
    greedy_ops = re.compile(no_escape + r'[*?+}\(]\?')  # Look for *? +? }? (?
    back_refs = re.compile(no_escape + r'\(.*' + no_escape + r'\\1')  # find a \1
    caret_in_middle = re.compile(no_escape + r'[^\[\\]\^')
    posix_char_classes = re.compile(
        no_escape + r'\[[^]]*\[:[^]]+:\][^]]*\]')  # like [[:digit:]]
    # PCRE allows \Q.....\E to quote substrings, we don't.
    bad_backslashes = re.compile(
        no_escape +
        r'(\\Q|\\E|\\G|\\P|\\8|\\9|\\A|\\Z|\\F|\\R|\\B|\\b|\\h|\\H|\\v|\\V|\\z|\\N)')

    # Perl allows single-digit hex escapes. Change \x0 -> \x00, for example
    expand_perl_hex = re.compile(r'\\x([0-9a-fA-F]{1})(?=[^0-9a-fA-F]|$)')
    # Two-digit hex escapes, as they appear in the expected-result lines.
    hex_escape = re.compile(r'\\x([0-9a-fA-F]{2})')

    def pad_short_hex(m):
        # \xN -> \x0N
        return r'\x0' + m.group(1)

    def decode_hex(m):
        # \xNN -> the character with that code
        return chr(int(m.group(1), 16))

    # The delimiter must be a printable, non-alphanumeric character.
    allowed_delims = (set(string.printable) - set(string.ascii_letters)
                      - set(string.digits))

    # suite = [
    #     [regex, flags, [(test,result),(test,result),...]]
    #     [regex, flags, [(test,result),(test,result),...]]
    # ]
    suite = []
    while lines:
        # Suppress blank lines before the delimiter.  Handling them here
        # lets trailing blank lines end the loop cleanly instead of popping
        # from an empty sequence.
        if not lines[0].strip():
            lines.popleft()
            continue

        # A regex is marked by a start-delimiter and an end-delimiter.
        # The delimiter is non-alphanumeric.
        # If a backslash follows the delimiter, then the backslash should
        # be appended to the end. (Otherwise, \ + delim would not be a
        # delim anymore!)
        regex = lines.popleft()
        delim = regex.strip()[0]
        assert delim in allowed_delims
        # Escape the delimiter so that characters such as + or | are taken
        # literally.  The last two groups are an optional backslash and
        # optional flags.
        test_re = re.compile(
            r'%(delim)s(([^%(delim)s]|\\%(delim)s)*([^\\]))%(delim)s(\\?)([^\n\r]*)'
            % {'delim': re.escape(delim)})
        matches = test_re.findall(regex)
        while not matches:
            # The regex continues on the following line(s).
            regex += lines.popleft()
            matches = test_re.findall(regex)

        assert len(matches) == 1  # check to make sure we matched right

        regex = matches[0][0]
        regex += matches[0][-2]  # Add the backslash, if we gotta
        flags = matches[0][-1]  # Get the flags for the regex

        # Gotta tolerate Perl's short hexes
        regex = expand_perl_hex.sub(pad_short_hex, regex)

        tests = []
        unsupported = (
            # Complex features we can't do
            greedy_ops.search(regex) or back_refs.search(regex)
            # Any test that requires PCRE flags
            or flags
            or posix_char_classes.search(regex)
            or caret_in_middle.search(regex)
            or bad_backslashes.search(regex))
        if not unsupported:
            # The tests of an unsupported regex are still parsed below (to
            # consume their lines) but never reach the suite.
            suite.append([regex, flags, tests])

        # Now find the test and expected result
        while lines:
            test = lines.popleft().strip()
            if not test:
                break  # blank line ends the set
            if test.endswith('\\'):
                # Tests that end in \ expect the \ to be chopped off
                # Make sure not three \'s. otherwise this check will get
                # ridiculous
                assert not test.endswith('\\\\\\')
                if not test.endswith('\\\\'):  # Two \'s means a real \
                    test = test[:-1]
            test = expand_perl_hex.sub(pad_short_hex, test)

            disqualify_test = bad_backslashes.search(test)

            try:
                test = unescape(test)
            except Exception:
                # Best effort: a test we cannot unescape is only skipped.
                disqualify_test = True
                print("Warning: could not unescape %r" % test)

            # Third line in the OUTPUT is the result, either:
            # ' 0: ...' for a match (but this is ONLY escaped by \x__ types)
            # 'No match' for no match
            # (other kinds exist, but we ignore them)
            match = None
            have_result = False
            while lines:
                match = lines.popleft().rstrip('\r\n')
                match = hex_escape.sub(decode_hex, match)
                if match.startswith(('No match', 'Error', 'Partial')):
                    match = None
                    have_result = True
                    break
                elif match.startswith(' 0:'):
                    match = match[4:]
                    # Now we need to eat any further lines like:
                    # ' 1: ....' a subgroup match
                    # ' 0+ ...' is also possible here
                    # The slice (rather than an index) tolerates lines
                    # shorter than three characters.
                    while lines and lines[0].strip():
                        if lines[0][2:3] in (':', '+'):
                            lines.popleft()
                        else:
                            break
                    have_result = True
                    break
                elif not match:
                    print(" *** %r ***" % match)
                    raise Exception("Lost sync in output.")
            # If the input ended before a result line was seen there is
            # nothing trustworthy to record for this test.
            if have_result and not disqualify_test:
                tests.append((test, match))

    # Last step, if there are regexes that don't have any tests,
    # might as well strip them out
    suite = [entry for entry in suite if entry[2]]
    dumper.dump(suite)
def create_pcre_pickle(file, dumper):
    """Create a filtered PCRE test file for the test.

    The text read from ``file`` is parsed as a sequence of test sets.  Each
    set starts with a regex enclosed in a non-alphanumeric delimiter (the
    regex may span several lines and may be followed by an optional
    backslash and optional flags), followed by test strings, each of which
    is followed by its result lines.  A blank line ends the set.

    Regexes are dropped when they carry flags or use features that are
    filtered out here: ``*?``/``+?``/``}?``/``(?`` constructs, ``\\1``
    back-references, POSIX character classes, a caret in the middle of the
    pattern, or one of the unsupported backslash escapes.  Regexes left
    without any usable test are dropped too.

    :param file: object whose ``readlines()`` returns the input lines.
    :param dumper: object whose ``dump(suite)`` receives the result, a list
        of ``[regex, flags, [(test, result), ...]]`` entries where
        ``result`` is the matched text or ``None`` for no match.
    :raises Exception: if an empty line is found where a result line was
        expected ("Lost sync in output.").
    :raises IndexError: if the input ends inside an unterminated regex.
    """
    from collections import deque

    # A deque gives O(1) removal from the front; list.pop(0) is O(n) and
    # made the whole parse quadratic in the number of lines.
    lines = deque(file.readlines())

    # Look for things to skip...
    no_escape = r'(^|[^\\])(\\\\)*'  # Make sure there's no escaping \
    greedy_ops = re.compile(no_escape + r'[*?+}\(]\?')  # Look for *? +? }? (?
    back_refs = re.compile(no_escape + r'\(.*' + no_escape + r'\\1')  # find a \1
    caret_in_middle = re.compile(no_escape + r'[^\[\\]\^')
    posix_char_classes = re.compile(
        no_escape + r'\[[^]]*\[:[^]]+:\][^]]*\]')  # like [[:digit:]]
    # PCRE allows \Q.....\E to quote substrings, we don't.
    bad_backslashes = re.compile(
        no_escape +
        r'(\\Q|\\E|\\G|\\P|\\8|\\9|\\A|\\Z|\\F|\\R|\\B|\\b|\\h|\\H|\\v|\\V|\\z|\\N)')

    # Perl allows single-digit hex escapes. Change \x0 -> \x00, for example
    expand_perl_hex = re.compile(r'\\x([0-9a-fA-F]{1})(?=[^0-9a-fA-F]|$)')
    # Two-digit hex escapes, as they appear in the expected-result lines.
    hex_escape = re.compile(r'\\x([0-9a-fA-F]{2})')

    def pad_short_hex(m):
        # \xN -> \x0N
        return r'\x0' + m.group(1)

    def decode_hex(m):
        # \xNN -> the character with that code
        return chr(int(m.group(1), 16))

    # The delimiter must be a printable, non-alphanumeric character.
    allowed_delims = (set(string.printable) - set(string.ascii_letters)
                      - set(string.digits))

    # suite = [
    #     [regex, flags, [(test,result),(test,result),...]]
    #     [regex, flags, [(test,result),(test,result),...]]
    # ]
    suite = []
    while lines:
        # Suppress blank lines before the delimiter.  Handling them here
        # lets trailing blank lines end the loop cleanly instead of popping
        # from an empty sequence.
        if not lines[0].strip():
            lines.popleft()
            continue

        # A regex is marked by a start-delimiter and an end-delimiter.
        # The delimiter is non-alphanumeric.
        # If a backslash follows the delimiter, then the backslash should
        # be appended to the end. (Otherwise, \ + delim would not be a
        # delim anymore!)
        regex = lines.popleft()
        delim = regex.strip()[0]
        assert delim in allowed_delims
        # Escape the delimiter so that characters such as + or | are taken
        # literally.  The last two groups are an optional backslash and
        # optional flags.
        test_re = re.compile(
            r'%(delim)s(([^%(delim)s]|\\%(delim)s)*([^\\]))%(delim)s(\\?)([^\n\r]*)'
            % {'delim': re.escape(delim)})
        matches = test_re.findall(regex)
        while not matches:
            # The regex continues on the following line(s).
            regex += lines.popleft()
            matches = test_re.findall(regex)

        assert len(matches) == 1  # check to make sure we matched right

        regex = matches[0][0]
        regex += matches[0][-2]  # Add the backslash, if we gotta
        flags = matches[0][-1]  # Get the flags for the regex

        # Gotta tolerate Perl's short hexes
        regex = expand_perl_hex.sub(pad_short_hex, regex)

        tests = []
        unsupported = (
            # Complex features we can't do
            greedy_ops.search(regex) or back_refs.search(regex)
            # Any test that requires PCRE flags
            or flags
            or posix_char_classes.search(regex)
            or caret_in_middle.search(regex)
            or bad_backslashes.search(regex))
        if not unsupported:
            # The tests of an unsupported regex are still parsed below (to
            # consume their lines) but never reach the suite.
            suite.append([regex, flags, tests])

        # Now find the test and expected result
        while lines:
            test = lines.popleft().strip()
            if not test:
                break  # blank line ends the set
            if test.endswith('\\'):
                # Tests that end in \ expect the \ to be chopped off
                # Make sure not three \'s. otherwise this check will get
                # ridiculous
                assert not test.endswith('\\\\\\')
                if not test.endswith('\\\\'):  # Two \'s means a real \
                    test = test[:-1]
            test = expand_perl_hex.sub(pad_short_hex, test)

            disqualify_test = bad_backslashes.search(test)

            try:
                test = unescape(test)
            except Exception:
                # Best effort: a test we cannot unescape is only skipped.
                disqualify_test = True
                print("Warning: could not unescape %r" % test)

            # Third line in the OUTPUT is the result, either:
            # ' 0: ...' for a match (but this is ONLY escaped by \x__ types)
            # 'No match' for no match
            # (other kinds exist, but we ignore them)
            match = None
            have_result = False
            while lines:
                match = lines.popleft().rstrip('\r\n')
                match = hex_escape.sub(decode_hex, match)
                if match.startswith(('No match', 'Error', 'Partial')):
                    match = None
                    have_result = True
                    break
                elif match.startswith(' 0:'):
                    match = match[4:]
                    # Now we need to eat any further lines like:
                    # ' 1: ....' a subgroup match
                    # ' 0+ ...' is also possible here
                    # The slice (rather than an index) tolerates lines
                    # shorter than three characters.
                    while lines and lines[0].strip():
                        if lines[0][2:3] in (':', '+'):
                            lines.popleft()
                        else:
                            break
                    have_result = True
                    break
                elif not match:
                    print(" *** %r ***" % match)
                    raise Exception("Lost sync in output.")
            # If the input ended before a result line was seen there is
            # nothing trustworthy to record for this test.
            if have_result and not disqualify_test:
                tests.append((test, match))

    # Last step, if there are regexes that don't have any tests,
    # might as well strip them out
    suite = [entry for entry in suite if entry[2]]
    dumper.dump(suite)