def test_or_operator(self):
    # Alternation patterns translated with anchors=False: every expected
    # regex is the source expression wrapped as '^(...)$(?!\n\Z)'.
    regex = translate_pattern('0|1', anchors=False)
    self.assertEqual(regex, r'^(0|1)$(?!\n\Z)')

    pattern = re.compile(regex)
    self.assertEqual(pattern.search('0').group(0), '0')
    self.assertEqual(pattern.search('1').group(0), '1')
    # A trailing newline, empty input or extra characters must not match.
    self.assertIsNone(pattern.search('1\n'))
    self.assertIsNone(pattern.search(''))
    self.assertIsNone(pattern.search('2'))
    self.assertIsNone(pattern.search('01'))
    self.assertIsNone(pattern.search('1\n '))

    regex = translate_pattern(r'\d+[%]|\d*\.\d+[%]', anchors=False)
    self.assertEqual(regex, r'^(\d+[%]|\d*\.\d+[%])$(?!\n\Z)')

    pattern = re.compile(regex)
    self.assertEqual(pattern.search('99%').group(0), '99%')
    self.assertEqual(pattern.search('99.9%').group(0), '99.9%')
    self.assertEqual(pattern.search('.90%').group(0), '.90%')
    self.assertIsNone(pattern.search('%'))
    self.assertIsNone(pattern.search('90.%'))

    # This pattern embeds real newline/CR/tab characters, so the expected
    # value cannot be a raw string. The backslash of the final \Z is
    # written doubled because a bare backslash-Z in a non-raw literal is
    # an invalid escape sequence (SyntaxWarning on recent Python versions).
    regex = translate_pattern('([ -~]|\n|\r|\t)*', anchors=False)
    self.assertEqual(regex, '^(([ -~]|\n|\r|\t)*)$(?!\\n\\Z)')

    pattern = re.compile(regex)
    self.assertEqual(pattern.search('ciao\t-~ ').group(0), 'ciao\t-~ ')
    self.assertEqual(pattern.search('\r\r').group(0), '\r\r')
    self.assertEqual(pattern.search('\n -.abc').group(0), '\n -.abc')
    self.assertIsNone(pattern.search('à'))
    self.assertIsNone(pattern.search('\t\n à'))
def test_invalid_hyphen(self):
    # With default arguments an unescaped '-' between two ranges is an error.
    with self.assertRaises(RegexError) as raised:
        translate_pattern('[a-b-c]')
    message = str(raised.exception)
    self.assertIn("unescaped character '-' at position 4", message)

    # With xsd_version='1.1' the same class is accepted and must translate
    # to the same regex as the forms with a leading or trailing hyphen.
    expected = translate_pattern('[a-b-c]', xsd_version='1.1')
    self.assertEqual(expected, '[\\-a-c]')
    for source in ('[-a-bc]', '[a-bc-]'):
        self.assertEqual(translate_pattern(source), expected)
def test_possessive_quantifiers(self):
    # Note: possessive quantifiers (*+, ++, ?+, {m,n}+) are supported in Python 3.11+
    # The translator is expected to reject the extra '+' and to report
    # its position in the error message.
    rejected = (
        ('^[abcd]*+$', "unexpected meta character '+' at position 8"),
        ('^[abcd]{1,5}+$', "unexpected meta character '+' at position 12"),
    )
    for source, fragment in rejected:
        with self.assertRaises(RegexError) as raised:
            translate_pattern(source)
        self.assertIn(fragment, str(raised.exception))
def test_back_references(self):
    # Pairs of (source pattern, expected translation) for the default call.
    # With a single group '\11' is expected to become '\1' followed by a
    # literal '1'; with twelve groups it is kept as a reference to group 11.
    translations = (
        ('(a)\\1', '(a)\\1'),
        ('(a)\\11', '(a)\\1[1]'),
        ('((((((((((((a))))))))))))\\11', '((((((((((((a))))))))))))\\11'),
    )
    for source, expected in translations:
        self.assertEqual(translate_pattern(source), expected)

    # Back-references are rejected when explicitly disabled.
    with self.assertRaises(RegexError) as raised:
        translate_pattern('(a)\\1', back_references=False)
    self.assertIn("not allowed escape sequence", str(raised.exception))
def test_backslash_and_escapes(self):
    # A lone backslash is expected to be returned unchanged.
    self.assertEqual(translate_pattern('\\'), '\\')

    # Each escape below must expand to a character class that starts
    # with the given prefix.
    expected_prefixes = (
        ('\\i', '[:A-Z_a-z'),
        ('\\I', '[^:A-Z_a-z'),
        ('\\c', '[-.0-9:A-Z_a-z'),
        ('\\C', '[^-.0-9:A-Z_a-z'),
    )
    for source, prefix in expected_prefixes:
        translated = translate_pattern(source)
        self.assertTrue(translated.startswith(prefix))
def test_character_class_subtraction(self):
    # Subtracting the vowels from a-z leaves only consonant ranges.
    self.assertEqual(translate_pattern('[a-z-[aeiuo]]'), '[b-df-hj-np-tv-z]')

    # W3C XSD 1.1 test group RegexTest_422
    self.assertEqual(translate_pattern('[^0-9-[a-zAE-Z]]'), '[^0-9AE-Za-z]')

    translated = translate_pattern(r'^([^0-9-[a-zAE-Z]]|[\w-[a-zAF-Z]])+$')
    search = re.compile(translated).search
    self.assertIsNone(search('azBCDE1234567890BCDEFza'))
    found = search('BCD')
    self.assertEqual(found.group(0), 'BCD')
def test_anchors(self):
    # With default arguments '^' and '$' found in the source are kept
    # as they are ('$' gets the '(?!\n\Z)' lookahead appended).
    # With anchors=False they are escaped as literals and the whole
    # expression is wrapped as '^(...)$(?!\n\Z)'.
    #
    # The expected values are non-raw literals, so every backslash is
    # doubled: a bare backslash-Z would be an invalid escape sequence.
    regex = translate_pattern('a^b')
    self.assertEqual(regex, 'a^b')

    regex = translate_pattern('a^b', anchors=False)
    self.assertEqual(regex, '^(a\\^b)$(?!\\n\\Z)')

    regex = translate_pattern('ab$')
    self.assertEqual(regex, 'ab$(?!\\n\\Z)')

    regex = translate_pattern('ab$', anchors=False)
    self.assertEqual(regex, '^(ab\\$)$(?!\\n\\Z)')
def test_category_escape(self):
    # A single Unicode block escape expands to its code point range.
    basic_latin = translate_pattern('^\\p{IsBasicLatin}*$')
    self.assertEqual(basic_latin, '^[\x00-\x7f]*$(?!\\n\\Z)')

    search = re.compile(basic_latin).search
    for text in ('', 'e'):
        self.assertEqual(search(text).group(0), text)
    self.assertIsNone(search('è'))

    # Two adjacent blocks inside one class are merged into a single range.
    latin_1 = translate_pattern('^[\\p{IsBasicLatin}\\p{IsLatin-1Supplement}]*$')
    self.assertEqual(latin_1, '^[\x00-\xff]*$(?!\\n\\Z)')

    search = re.compile(latin_1).search
    for text in ('e', 'è'):
        self.assertEqual(search(text).group(0), text)
    self.assertIsNone(search('Ĭ'))
def test_issue_079(self):
    # Do not escape special characters in character class
    translated = translate_pattern('[^\n\t]+', anchors=False)
    self.assertEqual(translated, '^([^\t\n]+)$(?!\\n\\Z)')

    search = re.compile(translated).search
    # Tabs are excluded by the class, plain spaces are not.
    self.assertIsNone(search('first\tsecond\tthird'))
    sentence = 'first second third'
    self.assertEqual(search(sentence).group(0), sentence)
def test_digit_shortcut(self):
    # '\d' is expected to be kept as-is in the translated regex.
    translated = translate_pattern(r'\d{1,3}\.\d{1,2}', anchors=False)
    self.assertEqual(translated, r'^(\d{1,3}\.\d{1,2})$(?!\n\Z)')

    search = re.compile(translated).search
    for text in ('12.40', '867.00'):
        self.assertEqual(search(text).group(0), text)
    # Trailing characters, too many digits or a leading letter must fail.
    for text in ('867.00\n', '867.00 ', '867.000', '1867.0', 'a1.13'):
        self.assertIsNone(search(text))

    translated = translate_pattern(r'[-+]?(\d+|\d+(\.\d+)?%)', anchors=False)
    self.assertEqual(translated, r'^([\+\-]?(\d+|\d+(\.\d+)?%))$(?!\n\Z)')

    search = re.compile(translated).search
    self.assertEqual(search('78.8%').group(0), '78.8%')
    self.assertIsNone(search('867.00'))
def test_dot_wildcard(self):
    # The '.' wildcard is expected to be translated to '[^\r\n]'.
    translated = translate_pattern('.+', anchors=False)
    self.assertEqual(translated, '^([^\r\n]+)$(?!\\n\\Z)')

    search = re.compile(translated).search
    for text in ('line1\rline2\r', 'line1\nline2', ''):
        self.assertIsNone(search(text))
    self.assertIsNotNone(search('\\'))
    self.assertEqual(search('abc').group(0), 'abc')

    translated = translate_pattern('.+T.+(Z|[+-].+)', anchors=False)
    self.assertEqual(
        translated,
        '^([^\r\n]+T[^\r\n]+(Z|[\\+\\-][^\r\n]+))$(?!\\n\\Z)'
    )

    search = re.compile(translated).search
    for text in ('12T0A3+36', '12T0A3Z'):
        self.assertEqual(search(text).group(0), text)
    for text in ('', '12T0A3Z2'):
        self.assertIsNone(search(text))
def test_ending_newline_match(self):
    # Related with xmlschema's issue #223
    options = {
        'back_references': False,
        'lazy_quantifiers': False,
        'anchors': False,
    }
    translated = translate_pattern(pattern=r"\d{2}:\d{2}:\d{6,7}", **options)

    match = re.compile(translated).match
    self.assertIsNotNone(match("38:36:000031"))
    # A trailing newline must not be accepted.
    self.assertIsNone(match("38:36:000031\n"))
def test_occurrences_qualifiers(self):
    # '{n}' and '?' quantifiers are kept; the character class ranges are
    # expected to be reordered in the translated regex.
    translated = translate_pattern('#[0-9a-fA-F]{3}([0-9a-fA-F]{3})?', anchors=False)
    self.assertEqual(translated, r'^(#[0-9A-Fa-f]{3}([0-9A-Fa-f]{3})?)$(?!\n\Z)')

    search = re.compile(translated).search
    for text in ('#F3D', '#F3DA30'):
        self.assertEqual(search(text).group(0), text)
    for text in ('#F3D\n', '#F3', '#F3D ', 'F3D', ''):
        self.assertIsNone(search(text))
def test_invalid_character_class(self):
    # Pairs of (malformed pattern, fragment expected in the error message).
    malformed = (
        ('[[]', "invalid character '['"),
        ('ab]d', "unexpected meta character ']'"),
        ('[abc\\1]', "illegal back-reference in character class"),
        ('[--a]', "invalid character range '--'"),
        ('[a-z-[c-q', "unterminated character class"),
    )
    for source, fragment in malformed:
        with self.assertRaises(RegexError) as raised:
            translate_pattern(source)
        self.assertIn(fragment, str(raised.exception))
def test_empty_character_class(self):
    # A subtraction that removes every character is expected to be
    # translated to the never-matching class '[^\w\W]'.
    translated = translate_pattern('[a-[a-f]]', anchors=False)
    self.assertEqual(translated, r'^([^\w\W])$(?!\n\Z)')

    # A literally empty class is a syntax error.
    with self.assertRaises(RegexError):
        translate_pattern('[]')

    empty_class = r'[^\w\W]'
    sources = (
        r'[\w-[\w]]',
        r'[\s-[\s]]',
        r'[\c-[\c]]',
        r'[\i-[\i]]',
        '[a-[ab]]',
        '[^a-[^a]]',
    )
    for source in sources:
        self.assertEqual(translate_pattern(source), empty_class)
def test_not_spaces(self):
    # '\S' inside a class is expected to be expanded to explicit ranges.
    translated = translate_pattern(r"[\S' ']{1,10}", anchors=False)
    # NOTE(review): this guard is always true on Python 3; kept as it is.
    if sys.version_info >= (3,):
        expected = "^([\x00-\x08\x0b\x0c\x0e-\x1f!-\U0010ffff ']{1,10})$(?!\\n\\Z)"
        self.assertEqual(translated, expected)

    search = re.compile(translated).search
    self.assertIsNone(search('alpha\r'))
    self.assertEqual(search('beta').group(0), 'beta')
    for text in ('beta\n', 'beta\n ', '', 'over the maximum length!'):
        self.assertIsNone(search(text))
    self.assertIsNotNone(search('\\'))
    self.assertEqual(search('abc').group(0), 'abc')
def test_character_class_reordering(self):
    # The members of a character class are expected to come out sorted by
    # code point (and merged into ranges where adjacent) in the translated
    # regex, whatever their order in the source pattern.
    #
    # Expected values written as non-raw literals double every backslash:
    # a bare backslash-Z would be an invalid escape sequence.
    regex = translate_pattern('[A-Z ]', anchors=False)
    self.assertEqual(regex, '^([ A-Z])$(?!\\n\\Z)')

    pattern = re.compile(regex)
    self.assertEqual(pattern.search('A').group(0), 'A')
    self.assertEqual(pattern.search('Z').group(0), 'Z')
    self.assertEqual(pattern.search('Q').group(0), 'Q')
    self.assertEqual(pattern.search(' ').group(0), ' ')
    # The class matches exactly one character: two spaces (like two
    # letters) must be rejected. A single space is a valid match, as
    # asserted just above.
    self.assertIsNone(pattern.search('  '))
    self.assertIsNone(pattern.search('AA'))

    regex = translate_pattern(r'[0-9.,DHMPRSTWYZ/:+\-]+', anchors=False)
    self.assertEqual(regex, r'^([\+-\-\.-:DHMPR-TWYZ]+)$(?!\n\Z)')

    pattern = re.compile(regex)
    self.assertEqual(pattern.search('12,40').group(0), '12,40')
    self.assertEqual(pattern.search('YYYY:MM:DD').group(0), 'YYYY:MM:DD')
    self.assertIsNone(pattern.search(''))
    self.assertIsNone(pattern.search('C'))

    regex = translate_pattern('[^: \n\r\t]+', anchors=False)
    self.assertEqual(regex, '^([^\t\n\r :]+)$(?!\\n\\Z)')

    pattern = re.compile(regex)
    self.assertEqual(pattern.search('56,41').group(0), '56,41')
    self.assertIsNone(pattern.search('56,41\n'))
    self.assertIsNone(pattern.search('13:20'))

    regex = translate_pattern(r'^[A-Za-z0-9_\-]+(:[A-Za-z0-9_\-]+)?$')
    self.assertEqual(regex, r'^[\-0-9A-Z_a-z]+(:[\-0-9A-Z_a-z]+)?$(?!\n\Z)')

    pattern = re.compile(regex)
    self.assertEqual(pattern.search('fa9').group(0), 'fa9')
    self.assertIsNone(pattern.search('-x_1:_tZ-\n'))
    self.assertEqual(pattern.search('-x_1:_tZ-').group(0), '-x_1:_tZ-')
    self.assertIsNone(pattern.search(''))
    self.assertIsNone(pattern.search('+78'))

    regex = translate_pattern(r'[!%\^\*@~;#,|/]', anchors=False)
    self.assertEqual(regex, r'^([!#%\*,/;@\^\|~])$(?!\n\Z)')

    pattern = re.compile(regex)
    self.assertEqual(pattern.search('#').group(0), '#')
    self.assertEqual(pattern.search('!').group(0), '!')
    self.assertEqual(pattern.search('^').group(0), '^')
    self.assertEqual(pattern.search('|').group(0), '|')
    self.assertEqual(pattern.search('*').group(0), '*')
    self.assertIsNone(pattern.search('**'))
    self.assertIsNone(pattern.search('b'))
    self.assertIsNone(pattern.search(''))

    regex = translate_pattern('[A-Za-z]+:[A-Za-z][A-Za-z0-9\\-]+', anchors=False)
    self.assertEqual(regex, '^([A-Za-z]+:[A-Za-z][\\-0-9A-Za-z]+)$(?!\\n\\Z)')

    pattern = re.compile(regex)
    self.assertEqual(pattern.search('zk:xy-9s').group(0), 'zk:xy-9s')
    self.assertIsNone(pattern.search('xx:y'))
def test_invalid_quantifiers(self):
    # Pairs of (malformed pattern, fragment expected in the error message).
    malformed = (
        ('{1}', "unexpected quantifier '{'"),
        ('.{1,2,3}', "invalid quantifier '{'"),
        ('*', "unexpected quantifier '*'"),
    )
    for source, fragment in malformed:
        with self.assertRaises(RegexError) as raised:
            translate_pattern(source)
        self.assertIn(fragment, str(raised.exception))
def test_invalid_pattern_groups(self):
    # Pairs of (malformed pattern, fragment expected in the error message).
    malformed = (
        ('(?.*)', "invalid '(?...)' extension notation"),
        ('(.*))', "unbalanced parenthesis ')'"),
        ('((.*)', "unterminated subpattern in expression"),
    )
    for source, fragment in malformed:
        with self.assertRaises(RegexError) as raised:
            translate_pattern(source)
        self.assertIn(fragment, str(raised.exception))
def test_character_class_shortcuts(self):
    # Each section translates one pattern, compiles the result and checks
    # a few accepted and rejected inputs.
    search = re.compile(translate_pattern(r"^[\i-[:]][\c-[:]]*$")).search
    self.assertEqual(search('x11').group(0), 'x11')
    self.assertIsNone(search('3a'))

    search = re.compile(translate_pattern(r"^\w*$")).search
    self.assertEqual(search('aA_x7').group(0), 'aA_x7')
    for text in ('.', '-'):
        self.assertIsNone(search(text))

    search = re.compile(translate_pattern(r"\W*", anchors=False)).search
    self.assertIsNone(search('aA_x7'))
    self.assertEqual(search('.-').group(0), '.-')

    search = re.compile(translate_pattern(r"^\d*$")).search
    self.assertEqual(search('6410').group(0), '6410')
    for text in ('a', '-'):
        self.assertIsNone(search(text))

    search = re.compile(translate_pattern(r"^\D*$")).search
    self.assertIsNone(search('6410'))
    for text in ('a', '-'):
        self.assertEqual(search(text).group(0), text)

    # Pull Request 114
    search = re.compile(translate_pattern(r"^[\w]{0,5}$")).search
    self.assertEqual(search('abc').group(0), 'abc')
    self.assertIsNone(search('.'))

    search = re.compile(translate_pattern(r"^[\W]{0,5}$")).search
    self.assertEqual(search('.').group(0), '.')
    self.assertIsNone(search('abc'))
def test_verbose_patterns(self):
    # With re.VERBOSE the whitespace embedded in the source pattern is
    # expected to be dropped before the escapes are interpreted.
    verbose_cases = (
        ('\\ s*[a-z]+', '\\s*[a-z]+'),
        ('\\ p{ Is BasicLatin}+', '[\x00-\x7f]+'),
    )
    for source, expected in verbose_cases:
        translated = translate_pattern(source, flags=re.VERBOSE)
        self.assertEqual(translated, expected)
def test_lazy_quantifiers(self):
    # Lazy quantifiers are accepted by the default call.
    accepted = (
        ('.*?', '[^\r\n]*?'),
        ('[a-z]{2,3}?', '[a-z]{2,3}?'),
        ('[a-z]*?', '[a-z]*?'),
    )
    for source, expected in accepted:
        self.assertEqual(translate_pattern(source), expected)

    # A greedy quantifier is still fine when lazy ones are disabled.
    greedy = translate_pattern('[a-z]*', lazy_quantifiers=False)
    self.assertEqual(greedy, '[a-z]*')

    with self.assertRaises(RegexError) as raised:
        translate_pattern('.*?', lazy_quantifiers=False)
    self.assertEqual(
        str(raised.exception),
        "unexpected meta character '?' at position 2: '.*?'"
    )

    # Every other lazy form must be rejected too.
    for source in ('[a-z]{2,3}?', r'[a-z]{2,3}?\s+', r'[a-z]+?\s+'):
        with self.assertRaises(RegexError):
            translate_pattern(source, lazy_quantifiers=False)
def test_character_class_range(self):
    # A trailing hyphen is a literal: it is expected to be escaped and
    # moved to the front of the translated class.
    translated = translate_pattern('[bc-]')
    expected = r'[\-bc]'
    self.assertEqual(translated, expected)
def test_block_escapes(self):
    # Category escapes: only the beginning of the expanded class is checked.
    for source, prefix in (('\\p{P}', '[!-#%-'), ('\\P{P}', '[^!-#%-')):
        self.assertTrue(translate_pattern(source).startswith(prefix))

    # Block escapes: with re.IGNORECASE the class is expected to be
    # wrapped in a '(?-i:...)' group.
    self.assertEqual(translate_pattern('\\p{IsBasicLatin}'), '[\x00-\x7f]')
    case_insensitive = translate_pattern('\\p{IsBasicLatin}', flags=re.IGNORECASE)
    self.assertEqual(case_insensitive, '(?-i:[\x00-\x7f])')

    # Malformed or unknown escapes, with the expected message fragment.
    malformed = (
        ('\\px', "a '{' expected"),
        ('\\p{Pu', "truncated unicode block escape"),
        ('\\p{Unknown}', "'Unknown' doesn't match to any Unicode category"),
    )
    for source, fragment in malformed:
        with self.assertRaises(RegexError) as raised:
            translate_pattern(source)
        self.assertIn(fragment, str(raised.exception))

    # An unknown block name is accepted only with xsd_version='1.1'.
    permissive = translate_pattern('\\p{IsUnknown}', xsd_version='1.1')
    self.assertEqual(permissive, '[\x00-\U0010fffe]')

    with self.assertRaises(RegexError) as raised:
        translate_pattern('\\p{IsUnknown}')
    self.assertIn(
        "'IsUnknown' doesn't match to any Unicode block",
        str(raised.exception)
    )