def test_not_literal(self):
    """Check a negated one-character class, alone and repeated up to '$'."""
    cases = [
        (r"[^a]", [("A", True), ("a", False)]),
        (r"[^a]+$", [("Bx123", True), ("--a--", False)]),
    ]
    for pattern, probes in cases:
        code = get_code(pattern)
        for subject, should_match in probes:
            # Only the truthiness of the match result is checked.
            assert bool(rsre_core.match(code, subject)) == should_match
def test_group_branch(self):
    """Check the marks of group 1 for each alternative of ``(ab|c)``."""
    code = get_code(r'<abc>(ab|c)</abc>')
    accepted = [
        ('<abc>ab</abc>def', (5, 7)),
        ('<abc>c</abc>def', (5, 6)),
    ]
    for subject, marks in accepted:
        found = rsre_core.match(code, subject)
        assert (found.get_mark(0), found.get_mark(1)) == marks
    # Neither alternative matches "de", so there is no match at all.
    assert rsre_core.match(code, '<abc>de</abc>def') is None
def test_not_literal_ignore(self):
    """Check a negated one-character class under the ``(?i)`` flag."""
    cases = [
        (r"(?i)[^a]", [("G", True), ("a", False), ("A", False)]),
        (r"(?i)[^a]+$", [("Gx123", True), ("--A--", False)]),
    ]
    for pattern, probes in cases:
        code = get_code(pattern)
        for subject, should_match in probes:
            assert bool(rsre_core.match(code, subject)) == should_match
def test_lookbehind(self):
    """Check a positive lookbehind ``(?<=de)`` placed after a greedy group."""
    code = get_code(r"([a-z]*)(?<=de)")
    assert rsre_core.match(code, "ade")

    found = rsre_core.match(code, "adefg")
    assert found is not None
    # Mark 1 is expected right after "ade".
    assert found.get_mark(1) == 3

    for rejected in ("abc", "X", "eX"):
        assert not rsre_core.match(code, rejected)
def test_group_branch_max_until(self):
    """Check the marks left by a repeated group that contains a branch."""
    subject = '<abc>ccabcccaba</abc>def'

    plain = get_code(r'<abc>(ab|c)*a</abc>')
    found = rsre_core.match(plain, subject)
    assert (found.get_mark(0), found.get_mark(1)) == (12, 14)

    # Same shape, but each alternative now has its own capturing group.
    nested = get_code(r'<abc>((ab)|(c))*a</abc>')
    found = rsre_core.match(nested, subject)
    expected_pairs = [(0, (12, 14)), (2, (12, 14)), (4, (11, 12))]
    for first_mark, marks in expected_pairs:
        pair = (found.get_mark(first_mark), found.get_mark(first_mark + 1))
        assert pair == marks
def test_min_until_0_65535(self):
    """Check a non-greedy ``*?`` repeat of a two-character group."""
    code = get_code(r'<abc>(?:xy)*?xy</abc>')
    # The trailing "xy" is mandatory, so an empty body must not match.
    assert rsre_core.match(code, '<abc></abc>def') is None

    accepted = [
        '<abc>xy</abc>def',
        '<abc>xyxyxy</abc>def',
        '<abc>' + 'xy'*1000 + '</abc>def',
    ]
    for subject in accepted:
        assert rsre_core.match(code, subject) is not None
def test_groupref_exists(self):
    """Check conditional groups ``(?(2)...)`` with and without an else part."""
    cases = [
        # Without an else branch.
        (r"((a)|(b))c(?(2)d)$", [
            ("ac", False),
            ("acd", True),
            ("bc", True),
            ("bcd", False),
        ]),
        # With an else branch.
        (r"((a)|(b))c(?(2)d|e)$", [
            ("ac", False),
            ("acd", True),
            ("ace", False),
            ("bc", False),
            ("bcd", False),
            ("bce", True),
        ]),
    ]
    for pattern, probes in cases:
        code = get_code(pattern)
        for subject, should_match in probes:
            assert bool(rsre_core.match(code, subject)) == should_match
def test_bigcharset(self):
    """Check randomly built character sets against random code points."""
    for _round in range(100):
        # Between 1 and 24 characters drawn from [0x100, 0xD000).
        size = random.randrange(1, 25)
        chars = [unichr(random.randrange(0x100, 0xD000))
                 for _ in range(size)]
        code = get_code(u'[%s]' % (u''.join(chars),))

        # Every member of the set must match on its own.
        for member in chars:
            assert rsre_core.match(code, member)

        # A random character matches exactly when it belongs to the set.
        for _probe in range(200):
            candidate = unichr(random.randrange(0x0, 0xD000))
            found = rsre_core.match(code, candidate)
            assert (found is not None) == (candidate in chars)
def test_empty_minuntil(self):
    """Check non-greedy repeats of a possibly-empty group that cannot match."""
    # The second field says whether the stdlib ``re`` result is checked too.
    # It is skipped for two patterns: per the original note, a CPython bug
    # (at least 2.5) eats all memory on ``r.match('z')`` for them.
    cases = [
        (r'(a?)+?y', False),
        (r'(a?){4,6}?y', True),
        (r'(a?)*?y', False),
    ]
    for pattern, check_stdlib in cases:
        code, compiled = get_code_and_re(pattern)
        if check_stdlib:
            assert not compiled.match('z')
        assert not rsre_core.match(code, 'z')
def test_empty_maxuntil(self):
    """Check greedy repeats of a possibly-empty group followed by 'y'."""
    for pattern in (r'(a?)+y', r'(a?){4,6}y', r'(a?)*y'):
        code, compiled = get_code_and_re(pattern)
        # Check the stdlib ``re`` object first, then rsre_core.
        assert compiled.match('y')
        assert rsre_core.match(code, 'y')
def split_tokens(text):
    """Split the raw text of a trifle program into token-like strings.

    The (token, regexp) pairs of TOKENS are tried in order against the
    start of the remaining text, and the first one that matches consumes
    the matched prefix. Text matched as WHITESPACE or COMMENT is dropped;
    all other matched text is collected.

    Returns the list of collected strings, in source order.
    Raises LexFailed when no entry of TOKENS matches the remaining text.
    """
    pieces = []

    while text:
        for kind, pattern in TOKENS:
            hit = rsre_core.match(pattern, text)
            if not hit:
                continue

            # NOTE(review): a match ending at offset 0 would consume no
            # text here - confirm no TOKENS regexp can match the empty
            # string.
            lexeme = text[:hit.match_end]
            text = text[hit.match_end:]

            if kind not in [WHITESPACE, COMMENT]:
                pieces.append(lexeme)
            break
        else:
            # No token matched at this position.
            # TODO: It would be nice to suggest where open
            # brackets/quotation marks started, to give the user a hint.
            raise LexFailed(u"Could not lex remainder: '%s'" % text)

    return pieces
def test_min_until_3_5(self):
    """Check a non-greedy ``{3,5}?`` repeat against 0..7 copies of 'xy'."""
    code, compiled = get_code_and_re(r'<abc>(?:xy){3,5}?xy</abc>')
    for copies in range(8):
        subject = '<abc>' + 'xy'*copies + '</abc>defdefdefdefdef'
        # One copy is taken by the trailing literal "xy", so the repeat
        # itself sees copies-1 of them.
        should_match = 3 <= copies - 1 <= 5
        assert (compiled.match(subject) is not None) is should_match
        found = rsre_core.match(code, subject)
        assert (found is not None) is should_match
def test_min_repeat_one_with_backref():
    # Python 3.5 compiles "(.)\1*?b" using MIN_REPEAT_ONE
    opcodes = (
        [MARK, 0, ANY, MARK, 1]
        + [MIN_REPEAT_ONE, 6, 0, MAXREPEAT, GROUPREF, 0, SUCCESS]
        + [LITERAL, 98, SUCCESS]        # 98 == ord('b')
    )
    compiled = rsre_core.CompiledPattern(opcodes)
    found = rsre_core.match(compiled, "aaab")
    assert found.match_end == 4
def run_external(t, use_search): from rpython.rlib.rsre.test.re_tests import SUCCEED, FAIL, SYNTAX_ERROR pattern, s, outcome = t[:3] if len(t) == 5: repl, expected = t[3:5] else: assert len(t) == 3 print 'trying:', t try: obj = get_code(pattern) except re.error: if outcome == SYNTAX_ERROR: return # Expected a syntax error raise if outcome == SYNTAX_ERROR: raise Exception("this should have been a syntax error") # if use_search: result = rsre_core.search(obj, s) else: # Emulate a poor man's search() with repeated match()s for i in range(len(s)+1): result = rsre_core.match(obj, s, start=i) if result: break # if outcome == FAIL: if result is not None: raise Exception("succeeded incorrectly") elif outcome == SUCCEED: if result is None: raise Exception("failed incorrectly") # Matched, as expected, so now we compute the # result string and compare it to our expected result. start, end = result.span(0) vardict={'found': result.group(0), 'groups': result.group(), }#'flags': result.re.flags} for i in range(1, 100): try: gi = result.group(i) # Special hack because else the string concat fails: if gi is None: gi = "None" except IndexError: gi = "Error" vardict['g%d' % i] = gi #for i in result.re.groupindex.keys(): # try: # gi = result.group(i) # if gi is None: # gi = "None" # except IndexError: # gi = "Error" # vardict[i] = gi repl = eval(repl, vardict) if repl != expected: raise Exception("grouping error: %r should be %r" % (repl, expected))
def test_minuntil_lastmark_restore(self):
    """Check the group marks of ``(x|yz)+?(y)??c`` against stdlib ``re``."""
    code, compiled = get_code_and_re(r'(x|yz)+?(y)??c')
    expected = [(1, (3, 4)), (2, (-1, -1))]     # (group number, span)

    reference = compiled.match('xyzxc')
    for group, span in expected:
        assert reference.span(group) == span

    found = rsre_core.match(code, 'xyzxc')
    for group, span in expected:
        # Group g is read through marks 2*(g-1) and 2*(g-1)+1.
        low = 2 * (group - 1)
        assert (found.get_mark(low), found.get_mark(low + 1)) == span
def test_minuntil_bug(self):
    """Check the group marks of ``((x|yz)+?(y)??c)*`` on 'xycxyzxc'."""
    code, compiled = get_code_and_re(r'((x|yz)+?(y)??c)*')
    subject = 'xycxyzxc'

    reference = compiled.match(subject)
    assert reference.span(2) == (6, 7)
    # span(3) == (1, 2) is not checked on the stdlib object: the original
    # note calls that a bug of CPython.

    found = rsre_core.match(code, subject)
    for low, span in ((2, (6, 7)), (4, (1, 2))):
        assert (found.get_mark(low), found.get_mark(low + 1)) == span
def test_range_ignore(self):
    """Patch the compiled RANGE opcode into RANGE_IGNORE and match."""
    from rpython.rlib.unicodedata import unicodedb
    rsre_char.set_unicode_db(unicodedb)

    range_op = 27               # OPCODE_RANGE
    range_ignore_op = 32        # OPCODE_RANGE_IGNORE

    code = get_code(u"[\U00010428-\U0001044f]", re.I)
    assert code.count(range_op) == 1
    position = code.index(range_op)
    code[position] = range_ignore_op
    assert rsre_core.match(code, u"\U00010428")
def test_assert_not_group(self):
    """Check group spans after a negative lookahead containing a group."""
    code = get_code(r"abc(?!(de)f)(.)")
    found = rsre_core.match(code, "abcdeFghi")
    assert found is not None

    span_after_lookahead = found.span(2)
    assert span_after_lookahead == (3, 4)

    # this I definitely classify as Horrendously Implementation Dependent.
    # CPython answers (3, 5).
    span_inside_lookahead = found.span(1)
    assert span_inside_lookahead == (-1, -1)
def test_repeat_one_with_backref():
    # Python 3.5 compiles "(.)\1*" using REPEAT_ONE instead of REPEAT:
    # it's a valid optimization because \1 is always one character long
    opcodes = (
        [MARK, 0, ANY, MARK, 1]
        + [REPEAT_ONE, 6, 0, MAXREPEAT, GROUPREF, 0, SUCCESS]
        + [SUCCESS]
    )
    compiled = rsre_core.CompiledPattern(opcodes)
    found = rsre_core.match(compiled, "aaa")
    assert found.match_end == 3
def test_bigcharset(self):
    """Check randomly built character sets against random code points."""
    for _round in range(100):
        # Between 1 and 24 characters drawn from [0x100, 0xD000).
        how_many = random.randrange(1, 25)
        chars = [unichr(random.randrange(0x100, 0xD000))
                 for _ in range(how_many)]
        body = u''.join(chars)
        code = get_code(u'[%s]' % (body, ))

        # Every member of the set must match on its own.
        for member in chars:
            assert rsre_core.match(code, member)

        # A random character matches exactly when it belongs to the set.
        for _probe in range(200):
            candidate = unichr(random.randrange(0x0, 0xD000))
            found = rsre_core.match(code, candidate)
            assert (found is not None) == (candidate in chars)
def test_range_ignore(self):
    """Patch the compiled RANGE opcode into RANGE_IGNORE and match."""
    from rpython.rlib.unicodedata import unicodedb
    rsre_char.set_unicode_db(unicodedb)

    range_op = 27               # OPCODE_RANGE
    range_ignore_op = 32        # OPCODE_RANGE_IGNORE

    compiled = get_code(u"[\U00010428-\U0001044f]", re.I)
    assert compiled.pattern.count(range_op) == 1
    # The opcode list is patched in place on the compiled object.
    opcodes = compiled.pattern
    opcodes[compiled.pattern.index(range_op)] = range_ignore_op
    assert rsre_core.match(compiled, u"\U00010428")
def test_groupref_ignore(self):
    """Check a case-insensitive backreference on mixed-case runs of 'x'."""
    code = get_code(r"(?i)(xx+)\1+$")     # match non-prime numbers of x
    probes = [
        ("xX", False),
        ("xxX", False),
        ("Xxxx", True),
        ("xxxXx", False),
        ("xXxxxx", True),
        ("xxxXxxx", False),
        ("xxxxxxXx", True),
        ("xxxXxxxxx", True),
    ]
    for subject, should_match in probes:
        assert bool(rsre_core.match(code, subject)) == should_match
def test_groupref(self):
    """Check a backreference pattern on runs of 2 to 9 'x' characters."""
    code = get_code(r"(xx+)\1+$")     # match non-prime numbers of x
    probes = [
        ("xx", False),
        ("xxx", False),
        ("xxxx", True),
        ("xxxxx", False),
        ("xxxxxx", True),
        ("xxxxxxx", False),
        ("xxxxxxxx", True),
        ("xxxxxxxxx", True),
    ]
    for subject, should_match in probes:
        assert bool(rsre_core.match(code, subject)) == should_match
def test_empty_maxuntil(self):
    """Check greedy repeats of a possibly-empty group followed by 'y'."""
    code, compiled = get_code_and_re(r'(a?)+y')
    assert compiled.match('y')
    # The match must stop at the first "y".
    assert compiled.match('aaayaaay').span() == (0, 4)
    assert rsre_core.match(code, 'y')
    found = rsre_core.match(code, 'aaayaaay')
    assert found and found.span() == (0, 4)

    # The bounded and the star variants are only checked on plain "y".
    for pattern in (r'(a?){4,6}y', r'(a?)*y'):
        code, compiled = get_code_and_re(pattern)
        assert compiled.match('y')
        assert rsre_core.match(code, 'y')
def test_group_branch_repeat_complex_case(self):
    """Check the group marks of ``((a)|(b))*`` against stdlib ``re``."""
    code, compiled = get_code_and_re(r'<abc>((a)|(b))*</abc>')
    subject = '<abc>ab</abc>'
    # (group number, span), in the order the checks are made.
    expected = [(1, (6, 7)), (3, (6, 7)), (2, (5, 6))]

    reference = compiled.match(subject)
    for group, span in expected:
        assert reference.span(group) == span

    found = rsre_core.match(code, subject)
    for group, span in expected:
        # Group g is read through marks 2*(g-1) and 2*(g-1)+1.
        low = 2 * (group - 1)
        assert (found.get_mark(low), found.get_mark(low + 1)) == span
def test_group_7(self):
    """Check the group marks of ``((a)?(b))*`` against stdlib ``re``."""
    code, compiled = get_code_and_re(r'<abc>((a)?(b))*</abc>')
    subject = '<abc>bbbabbbb</abc>'
    # (group number, span), in the order the checks are made.
    expected = [(1, (12, 13)), (3, (12, 13)), (2, (8, 9))]

    reference = compiled.match(subject)
    for group, span in expected:
        assert reference.span(group) == span

    found = rsre_core.match(code, subject)
    for group, span in expected:
        # Group g is read through marks 2*(g-1) and 2*(g-1)+1.
        low = 2 * (group - 1)
        assert (found.get_mark(low), found.get_mark(low + 1)) == span
def entrypoint1(r, string, repeat):
    """Match ``string`` against ``r`` ``repeat`` times and report the end.

    ``r`` is converted with ``array2list`` and ``string`` with ``hlstr``
    before matching. Returns ``match_end`` of the last attempt, or -1 when
    that attempt did not match or no attempt was made (``repeat`` == 0).
    """
    code = array2list(r)
    subject = hlstr(string)
    last = None
    for _ in range(repeat):
        last = rsre_core.match(code, subject)
    return -1 if last is None else last.match_end
def test_empty_maxuntil_2(self):
    """Check ``X(.*?)+X`` spans against stdlib ``re``; skip if uncompilable."""
    subject = 'XfooXbarX'
    try:
        code, compiled = get_code_and_re(r'X(.*?)+X')
    except re.error as e:
        py.test.skip("older version of the stdlib: %s" % (e,))

    # Reference behaviour of the stdlib object (one match call per check).
    assert compiled.match(subject).span() == (0, 5)
    assert compiled.match(subject).span(1) == (4, 4)

    found = rsre_core.match(code, subject)
    assert found.span() == (0, 5)
    assert found.span(1) == (4, 4)
def test_match_end(self):
    """Check how the ``end`` argument limits matching of 'ab' in 'abc'."""
    code = get_code("ab")
    assert rsre_core.match(code, "abc")
    # Limits that leave at least the two characters "ab" available.
    for limit in (333, 3, 2):
        assert rsre_core.match(code, "abc", end=limit)
    # Limits that are too small, including a negative one.
    for limit in (1, 0, -1):
        assert not rsre_core.match(code, "abc", end=limit)
def test_in_ignore(self):
    """Check a character range under ``(?i)``, alone and repeated to '$'."""
    cases = [
        (r"(?i)[a-f]", [("b", True), ("C", True), ("g", False)]),
        (r"(?i)[a-f]+$", [("bCdEf", True), ("g", False), ("aaagaaa", False)]),
    ]
    for pattern, probes in cases:
        code = get_code(pattern)
        for subject, should_match in probes:
            assert bool(rsre_core.match(code, subject)) == should_match
def test_match_start(self):
    """Check '^' with a non-zero ``start``, with and without ``(?m)``."""
    # The second field is the expected result right after a newline,
    # which is the only check where the two patterns differ.
    cases = [
        (r"^ab", False),
        (r"(?m)^ab", True),
    ]
    for pattern, matches_after_newline in cases:
        code = get_code(pattern)
        assert rsre_core.match(code, "abc")
        assert not rsre_core.match(code, "xxxabc", start=3)
        after_newline = rsre_core.match(code, "xx\nabc", start=3)
        assert bool(after_newline) == matches_after_newline
def test_bug1(self):
    """Check nested repeats, each of which must match its subject."""
    cases = [
        # REPEAT_ONE inside REPEAT
        (r"(?:.+)?B", "AB"),
        (r"(?:AA+?)+B", "AAAB"),
        (r"(?:AA+)+?B", "AAAB"),
        (r"(?:AA+?)+?B", "AAAB"),
        # REPEAT inside REPEAT
        (r"(?:(?:xy)+)?B", "xyB"),
        (r"(?:xy(?:xy)+?)+B", "xyxyxyB"),
        (r"(?:xy(?:xy)+)+?B", "xyxyxyB"),
        (r"(?:xy(?:xy)+?)+?B", "xyxyxyB"),
    ]
    for pattern, subject in cases:
        code = get_code(pattern)
        assert rsre_core.match(code, subject) is not None
def test_category(self):
    """Check a set that mixes the whitespace category with a literal 'x'."""
    code = get_code(r"[\sx]")
    for accepted in ("x", " "):
        assert rsre_core.match(code, accepted)
    assert not rsre_core.match(code, "n")
def test_repeated_set(self):
    """Check a repeated character set followed by a literal 'f'."""
    code = get_code(r"[a0x]+f")
    accepted = rsre_core.match(code, "a0af")
    assert accepted
    # "y" is not in the set and breaks the run before "f" is reached.
    rejected = rsre_core.match(code, "a0yaf")
    assert not rejected
def test_repeat_one_with_backref():
    # Python 3.5 compiles "(.)\1*" using REPEAT_ONE instead of REPEAT:
    # it's a valid optimization because \1 is always one character long
    opcodes = (
        [MARK, 0, ANY, MARK, 1]
        + [REPEAT_ONE, 6, 0, MAXREPEAT, GROUPREF, 0, SUCCESS]
        + [SUCCESS]
    )
    # This variant passes the opcode list directly to match().
    found = rsre_core.match(opcodes, "aaa")
    assert found.match_end == 3
def test_min_repeat_one_with_backref():
    # Python 3.5 compiles "(.)\1*?b" using MIN_REPEAT_ONE
    opcodes = (
        [MARK, 0, ANY, MARK, 1]
        + [MIN_REPEAT_ONE, 6, 0, MAXREPEAT, GROUPREF, 0, SUCCESS]
        + [LITERAL, 98, SUCCESS]        # 98 == ord('b')
    )
    # This variant passes the opcode list directly to match().
    found = rsre_core.match(opcodes, "aaab")
    assert found.match_end == 4