class RegularExpressionUnitTests(unittest.TestCase):
    """Exercise ``eval_regex`` on concrete patterns and strings.

    Every test runs inside a ``StateSpaceContext`` wrapping a fresh
    ``SimpleStateSpace``; the context is entered in ``setUp`` and exited in
    ``tearDown``.  Calls below have the form
    ``eval_regex(pattern, flags, string, offset)``; a result of ``None``
    means "no match", anything else is treated as a match object.
    """

    def setUp(self) -> None:
        """Enter a fresh state-space context before each test."""
        self.space_ctx_ = StateSpaceContext(SimpleStateSpace())
        self.space_ctx_.__enter__()

    def tearDown(self) -> None:
        """Exit the state-space context entered in ``setUp``."""
        self.space_ctx_.__exit__(None, None, None)

    def test_handle_simple(self) -> None:
        """A literal pattern matches the same string but not a shorter one."""
        self.assertIsNotNone(eval_regex("abc", 0, "abc", 0))
        self.assertIsNone(eval_regex("abc", 0, "ab", 0))

    def test_handle_or(self) -> None:
        """Alternation picks whichever branch matches at the start."""
        self.assertIsNotNone(eval_regex("a|bc", 0, "bc", 0))
        self.assertEqual(eval_regex("a|bc", 0, "bc", 0).span(), (0, 2))

        self.assertIsNotNone(eval_regex("a|bc", 0, "ab", 0))
        self.assertEqual(eval_regex("a|bc", 0, "ab", 0).span(), (0, 1))

        self.assertIsNone(eval_regex("a|bc", 0, "c", 0))
        self.assertIsNone(eval_regex("a|bc", 0, "bd", 0))

    def test_handle_start_markers(self) -> None:
        """``^`` and ``\\A`` work at offset 0; ``^`` with an offset is unhandled."""
        self.assertIsNotNone(eval_regex(r"^ab", 0, "abc", 0))
        self.assertIsNotNone(eval_regex(r"\Aab", 0, "abc", 0))
        with self.assertRaises(ReUnhandled):
            # Surprisingly!: re.compile('^bc').match('abc', 1) is None
            # Even more surprisingly, the end markers work differently.
            # We simply don't handle start markers with offset:
            eval_regex(r"^bc", 0, "abc", 1)

    def test_handle_end_markers(self) -> None:
        """``$`` and ``\\Z`` are handled; ``$`` under ``re.MULTILINE`` is not."""
        self.assertIsNotNone(eval_regex(r"abc$", 0, "abc", 0))
        # NOTE(review): the fifth argument (3) presumably plays the role of
        # an end position, like ``endpos`` in ``Pattern.match`` -- confirm.
        self.assertIsNotNone(eval_regex(r"abc$", 0, "abcd", 0, 3))
        self.assertIsNotNone(eval_regex(r"abc\Z", 0, "abc", 0))
        self.assertIsNotNone(eval_regex(r"abc\Z", re.MULTILINE, "abc", 0))
        with self.assertRaises(ReUnhandled):
            eval_regex("abc$", re.MULTILINE, "abc", 0)

    def test_handle_range(self) -> None:
        """A character range accepts members and rejects non-members."""
        self.assertIsNotNone(eval_regex("[a-z]7", 0, "b7", 0))
        self.assertIsNotNone(eval_regex("[a-z]7", 0, "z7", 0))
        self.assertIsNone(eval_regex("[a-z]7", 0, "A7", 0))

    def test_handle_ascii_wildcard(self) -> None:
        """``.`` under ``re.A`` matches a letter and a NUL character."""
        self.assertIsNotNone(eval_regex("1.2", re.A, "1x2", 0))
        self.assertIsNotNone(eval_regex("1.2", re.A, "1\x002", 0))
        self.assertIsNone(eval_regex("1.2", re.A, "111", 0))

    def test_handle_repeats(self) -> None:
        """``+``, ``*`` and ``{m,n}`` repeat counts, alone and combined."""
        self.assertIsNotNone(eval_regex("a+a", 0, "aa", 0))
        self.assertEqual(eval_regex("s", 0, "ssss", 0).span(), (0, 1))
        self.assertEqual(eval_regex("ss", 0, "ssss", 0).span(), (0, 2))
        self.assertIsNotNone(eval_regex("s{1,2}x", 0, "sx", 0))
        self.assertIsNotNone(eval_regex("s{1,2}x", 0, "ssx", 0))
        self.assertIsNone(eval_regex("s{1,2}x", 0, "sssx", 0))
        self.assertIsNone(eval_regex("s{1,2}x", 0, "x", 0))
        self.assertIsNotNone(eval_regex("s{2,3}", 0, "ssss", 0))
        self.assertEqual(eval_regex("s{2,3}", 0, "ssss", 0).span(), (0, 3))
        self.assertIsNotNone(eval_regex("y*", 0, "y", 0))
        self.assertEqual(eval_regex("y*", 0, "y", 0).span(), (0, 1))
        self.assertIsNotNone(eval_regex("y*e+", 0, "ye", 0))
        self.assertIsNotNone(eval_regex("y*e", 0, "yye", 0))
        self.assertEqual(eval_regex("y*e", 0, "yye", 0).span(), (0, 3))
        self.assertIsNotNone(eval_regex("y*e+s{2,3}x", 0, "yessx", 0))
        self.assertIsNotNone(eval_regex("y*e+s{2,3}x", 0, "essx", 0))
        self.assertIsNone(eval_regex("y*e+s{2,3}x", 0, "yyessssx", 0))
        self.assertIsNone(eval_regex("y*e+s{2,3}x", 0, "yssx", 0))
        self.assertIsNone(eval_regex("y*e+s{2,3}x", 0, "ex", 0))

    def test_handle_ascii_numeric(self) -> None:
        r"""``\d`` under ``re.A`` accepts digits and rejects ``-``."""
        self.assertIsNotNone(eval_regex(r"a\d", re.A, "a3", 0))
        self.assertIsNotNone(eval_regex(r"a\d", re.A, "a0", 0))
        self.assertIsNone(eval_regex(r"a\d", re.A, "a-", 0))

    def test_handle_noncapturing_group(self) -> None:
        """A ``(?:...)`` group participates in matching."""
        self.assertIsNotNone(eval_regex("(?:a|b)c", 0, "ac", 0))
        self.assertIsNotNone(eval_regex("(?:a|b)c", 0, "bc", 0))
        self.assertIsNone(eval_regex("(?:a|b)c", 0, "a", 0))

    def test_handle_capturing_group(self) -> None:
        """A capturing group's text is reported by ``groups()``."""
        self.assertIsNotNone(eval_regex("(a|b)c", 0, "ac", 0))
        self.assertIsNone(eval_regex("(a|b)c", 0, "a", 0))
        self.assertEqual(eval_regex("(a|b)c", 0, "bc", 0).groups(), ("b", ))

    def test_handle_named_groups(self) -> None:
        """A named group can be looked up by name on the match result."""
        self.assertIsNotNone(eval_regex("(?P<foo>a|b)c", 0, "bc", 0))
        self.assertEqual(eval_regex("(?P<foo>a|b)c", 0, "bc", 0)["foo"], "b")

    def test_handle_nested_groups(self) -> None:
        """Nested and optional groups yield the expected ``groups()`` tuple."""
        self.assertIsNotNone(eval_regex("(a|b(xx))+(c)?", 0, "bxxc", 0))
        self.assertEqual(
            eval_regex("(bxx)(c)?", 0, "bxxc", 0).groups(), ("bxx", "c"))
        self.assertEqual(
            eval_regex("(a|b(xx))+(c)?", 0, "bxxc", 0).groups(),
            ("bxx", "xx", "c"))
        # Groups that did not participate in the match are reported as None.
        self.assertEqual(
            eval_regex("(a|b(xx))+(c)?", 0, "a", 0).groups(),
            ("a", None, None))

    def test_with_fuzzed_inputs(self) -> None:
        """Compare ``eval_regex`` with Python's ``re`` on pseudo-random inputs.

        The generator is seeded with a fixed value, so the 100 trials are the
        same on every run.  Each trial is reported as its own sub-test.
        """
        rand = random.Random(253209)

        def check(pattern: str, literal_string: str, offset: int) -> None:
            """Assert both engines agree on one pattern/string/offset triple."""
            flags = re.ASCII | re.DOTALL
            sym_match = eval_regex(pattern, flags, literal_string, offset)
            py_match = re.compile(pattern, flags).match(literal_string, offset)
            # The engines must agree on whether there is a match at all;
            # assertEqual is used here so the failure shows both results.
            if (sym_match is None) != (py_match is None):
                self.assertEqual(py_match, sym_match)
            if py_match is None:
                return
            self.assertEqual(py_match.span(), sym_match.span())
            self.assertEqual(py_match.group(0), sym_match.group(0))
            self.assertEqual(py_match.groups(), sym_match.groups())
            self.assertEqual(py_match.pos, sym_match.pos)
            self.assertEqual(py_match.endpos, sym_match.endpos)
            self.assertEqual(py_match.lastgroup, sym_match.lastgroup)

        # ``trial`` rather than ``iter`` so the builtin is not shadowed.
        for trial in range(100):
            # Duplicated entries in the choice lists weight the distribution.
            literal_string = "".join(
                rand.choice(["a", "5", "_"])
                for _ in range(rand.choice([0, 1, 1, 2, 2, 3, 4])))
            pattern = "".join(
                rand.choice(["a", "5", "."]) + rand.choice(["", "", "+", "*"])
                for _ in range(rand.choice([0, 1, 1, 2, 2])))
            offset = rand.choice([0, 0, 0, 0, 1])
            with self.subTest(
                    msg=
                    f'Trial {trial}: evaluating pattern "{pattern}" against "{literal_string}" at {offset}'
            ):
                check(pattern, literal_string, offset)
# Example no. 2
# 0
class RegularExpressionUnitTests(unittest.TestCase):
    """Exercise ``eval_regex`` on concrete patterns and strings.

    Every test runs inside a ``StateSpaceContext`` wrapping a fresh
    ``SimpleStateSpace``; the context is entered in ``setUp`` and exited in
    ``tearDown``.  Calls below have the form
    ``eval_regex(pattern, flags, string, offset)``; a result of ``None``
    means "no match", anything else is treated as a match object.
    """

    def setUp(self) -> None:
        """Enter a fresh state-space context before each test."""
        self.space_ctx_ = StateSpaceContext(SimpleStateSpace())
        self.space_ctx_.__enter__()

    def tearDown(self) -> None:
        """Exit the state-space context entered in ``setUp``."""
        self.space_ctx_.__exit__(None, None, None)

    def test_handle_simple(self) -> None:
        """A literal pattern matches the same string but not a shorter one."""
        self.assertIsNotNone(eval_regex('abc', 0, 'abc', 0))
        self.assertIsNone(eval_regex('abc', 0, 'ab', 0))

    def test_handle_or(self) -> None:
        """Alternation picks whichever branch matches at the start."""
        self.assertIsNotNone(eval_regex('a|bc', 0, 'bc', 0))
        self.assertEqual(eval_regex('a|bc', 0, 'bc', 0).span(), (0, 2))

        self.assertIsNotNone(eval_regex('a|bc', 0, 'ab', 0))
        self.assertEqual(eval_regex('a|bc', 0, 'ab', 0).span(), (0, 1))

        self.assertIsNone(eval_regex('a|bc', 0, 'c', 0))
        self.assertIsNone(eval_regex('a|bc', 0, 'bd', 0))

    def test_handle_range(self) -> None:
        """A character range accepts members and rejects non-members."""
        self.assertIsNotNone(eval_regex('[a-z]7', 0, 'b7', 0))
        self.assertIsNotNone(eval_regex('[a-z]7', 0, 'z7', 0))
        self.assertIsNone(eval_regex('[a-z]7', 0, 'A7', 0))

    def test_handle_ascii_wildcard(self) -> None:
        """``.`` under ``re.A`` matches a letter and a NUL character."""
        self.assertIsNotNone(eval_regex('1.2', re.A, '1x2', 0))
        self.assertIsNotNone(eval_regex('1.2', re.A, '1\x002', 0))
        self.assertIsNone(eval_regex('1.2', re.A, '111', 0))

    def test_handle_repeats(self) -> None:
        """``+``, ``*`` and ``{m,n}`` repeat counts, alone and combined."""
        self.assertIsNotNone(eval_regex('a+a', 0, 'aa', 0))
        self.assertEqual(eval_regex('s', 0, 'ssss', 0).span(), (0, 1))
        self.assertEqual(eval_regex('ss', 0, 'ssss', 0).span(), (0, 2))
        self.assertIsNotNone(eval_regex('s{1,2}x', 0, 'sx', 0))
        self.assertIsNotNone(eval_regex('s{1,2}x', 0, 'ssx', 0))
        self.assertIsNone(eval_regex('s{1,2}x', 0, 'sssx', 0))
        self.assertIsNone(eval_regex('s{1,2}x', 0, 'x', 0))
        self.assertIsNotNone(eval_regex('s{2,3}', 0, 'ssss', 0))
        self.assertEqual(eval_regex('s{2,3}', 0, 'ssss', 0).span(), (0, 3))
        self.assertIsNotNone(eval_regex('y*', 0, 'y', 0))
        self.assertEqual(eval_regex('y*', 0, 'y', 0).span(), (0, 1))
        self.assertIsNotNone(eval_regex('y*e+', 0, 'ye', 0))
        self.assertIsNotNone(eval_regex('y*e', 0, 'yye', 0))
        self.assertEqual(eval_regex('y*e', 0, 'yye', 0).span(), (0, 3))
        self.assertIsNotNone(eval_regex('y*e+s{2,3}x', 0, 'yessx', 0))
        self.assertIsNotNone(eval_regex('y*e+s{2,3}x', 0, 'essx', 0))
        self.assertIsNone(eval_regex('y*e+s{2,3}x', 0, 'yyessssx', 0))
        self.assertIsNone(eval_regex('y*e+s{2,3}x', 0, 'yssx', 0))
        self.assertIsNone(eval_regex('y*e+s{2,3}x', 0, 'ex', 0))

    def test_handle_ascii_numeric(self) -> None:
        r"""``\d`` under ``re.A`` accepts digits and rejects ``-``."""
        self.assertIsNotNone(eval_regex(r'a\d', re.A, 'a3', 0))
        self.assertIsNotNone(eval_regex(r'a\d', re.A, 'a0', 0))
        self.assertIsNone(eval_regex(r'a\d', re.A, 'a-', 0))

    def test_handle_noncapturing_subgroup(self) -> None:
        """A ``(?:...)`` group participates in matching."""
        self.assertIsNotNone(eval_regex('(?:a|b)c', 0, 'ac', 0))
        self.assertIsNotNone(eval_regex('(?:a|b)c', 0, 'bc', 0))
        self.assertIsNone(eval_regex('(?:a|b)c', 0, 'a', 0))

    def test_handle_capturing_subgroup(self) -> None:
        """A capturing group's text is reported by ``groups()``."""
        self.assertIsNotNone(eval_regex('(a|b)c', 0, 'ac', 0))
        self.assertIsNone(eval_regex('(a|b)c', 0, 'a', 0))
        self.assertEqual(eval_regex('(a|b)c', 0, 'bc', 0).groups(), ('b', ))

    def test_handle_nested_subgroups(self) -> None:
        """Nested and optional groups yield the expected ``groups()`` tuple."""
        self.assertIsNotNone(eval_regex('(a|b(xx))+(c)?', 0, 'bxxc', 0))
        self.assertEqual(
            eval_regex('(bxx)(c)?', 0, 'bxxc', 0).groups(), ('bxx', 'c'))
        self.assertEqual(
            eval_regex('(a|b(xx))+(c)?', 0, 'bxxc', 0).groups(),
            ('bxx', 'xx', 'c'))
        # Groups that did not participate in the match are reported as None.
        self.assertEqual(
            eval_regex('(a|b(xx))+(c)?', 0, 'a', 0).groups(),
            ('a', None, None))

    def test_with_fuzzed_inputs(self) -> None:
        """Compare ``eval_regex`` with Python's ``re`` on pseudo-random inputs.

        The generator is seeded with a fixed value, so the 100 trials are the
        same on every run.  Each trial is reported as its own sub-test.
        """
        rand = random.Random(253209)

        def check(pattern: str, literal_string: str, offset: int) -> None:
            """Assert both engines agree on one pattern/string/offset triple."""
            flags = re.ASCII | re.DOTALL
            sym_match = eval_regex(pattern, flags, literal_string, offset)
            py_match = re.compile(pattern, flags).match(literal_string, offset)
            # The engines must agree on whether there is a match at all;
            # assertEqual is used here so the failure shows both results.
            if (sym_match is None) != (py_match is None):
                self.assertEqual(py_match, sym_match)
            if py_match is None:
                return
            self.assertEqual(py_match.span(), sym_match.span())
            self.assertEqual(py_match.group(0), sym_match.group(0))
            self.assertEqual(py_match.groups(), sym_match.groups())
            self.assertEqual(py_match.pos, sym_match.pos)
            self.assertEqual(py_match.endpos, sym_match.endpos)
            self.assertEqual(py_match.lastgroup, sym_match.lastgroup)

        # ``trial`` rather than ``iter`` so the builtin is not shadowed.
        for trial in range(100):
            # Duplicated entries in the choice lists weight the distribution.
            literal_string = ''.join(
                rand.choice(['a', '5', '_'])
                for _ in range(rand.choice([0, 1, 1, 2, 2, 3, 4])))
            pattern = ''.join(
                rand.choice(['a', '5', '.']) + rand.choice(['', '', '+', '*'])
                for _ in range(rand.choice([0, 1, 1, 2, 2])))
            offset = rand.choice([0, 0, 0, 0, 1])
            with self.subTest(
                    msg=
                    f'Trial {trial}: evaluating pattern "{pattern}" against "{literal_string}" at {offset}'
            ):
                check(pattern, literal_string, offset)