def test_deterministic_generation():
    """Check the exact strings produced by generators built with ``dgen``."""
    repeated_ab = dgen(r'(ab)*')
    # (requested length, expected string) pairs for the even lengths.
    for length, expected in ((0, ''), (2, 'ab'), (4, 'abab')):
        assert repeated_ab.generate_string(length) == expected
    # For the odd length 3 the generator is expected to return None.
    assert repeated_ab.generate_string(3) is None

    even_as_at_most_seven = revex.compile('(aa)*') & revex.compile('a{0,7}')
    generated = list(dgen(even_as_at_most_seven).matching_strings_iter())
    assert generated == ['', 'aa', 'aaaa', 'aaaaaa']

    # Order is not checked for these two patterns, only the set of strings.
    assert set(dgen(r'abc').matching_strings_iter()) == {'abc'}
    assert set(dgen(r'abc|def').matching_strings_iter()) == {'abc', 'def'}
def test_overflow_example():
    """Regression test for float overflow in computing the probability
    distribution.
    """
    # ``bits`` is chosen so that doubling 2.**bits overflows to infinity; the
    # assertion below double-checks that property on the running platform.
    bits = int(math.ceil(math.log(float_info.max) / math.log(2))) - 1
    assert 2. ** bits * 2. == float('inf')

    builtin_pattern = re.compile(r'^[01]+$')
    revex_regex = revex.compile(r'[01]+')
    gen = rgen(revex_regex, alphabet=list('01'))
    # Both requested lengths exceed ``bits``.
    for length in (bits + 1, bits * 2):
        assert builtin_pattern.match(gen.generate_string(length))
def test_random_walk_matches_regex(regex):
    """Randomly generated strings must match ``regex`` under both engines.

    ``regex`` is a pattern string; presumably supplied by a decorator or
    fixture that is not visible in this block.
    """
    actual = re.compile('^%s$' % regex)
    revex_regex = revex.compile(regex)
    gen = rgen(revex_regex, alphabet=list(set(regex)))

    # Lazily draw 10 strings for each of the first (up to) 10 valid lengths.
    samples = (
        gen.generate_string(length)
        for length in islice(gen.valid_lengths_iter(), 10)
        for _ in range(10)
    )
    for rand_string in samples:
        assert actual.match(rand_string), (
            '%s should match %s' % (regex, rand_string))
        assert revex_regex.match(rand_string), (
            '%s should match %s' % (regex, rand_string))
def test_repeat():
    """Check ``{m,n}`` / ``{n}`` repetition and brace handling."""
    bounded = RE('a{0,2}[a-z]')
    # Zero, one or two leading 'a's are accepted; three are not.
    for count in range(3):
        assert bounded.match('a' * count + 'q')
    assert not bounded.match('a' * 3 + 'q')

    # A counted repetition must compile to the same thing as its expansion.
    equivalent_pairs = (
        ('a{3}', 'aaa'),
        ('ba{3}', 'baaa'),
        ('(ba){3}', 'bababa'),
    )
    for counted, expanded in equivalent_pairs:
        assert compile(counted) == compile(expanded)

    # These patterns are asserted to match their own text.
    for literal in ('{', 'a{}'):
        assert RE(literal).match(literal)
def test_valid_lengths_iter():
    """Check the lengths yielded by ``valid_lengths_iter`` for several languages."""
    alphabet = 'abc'

    def leading_lengths(regex, count=50):
        # First ``count`` lengths reported for ``regex`` over ``alphabet``.
        generator = RandomRegularLanguageGenerator(regex.as_dfa(alphabet))
        return list(islice(generator.valid_lengths_iter(), 0, count))

    evens = list(range(0, 100, 2))
    assert evens == leading_lengths(revex.compile('(ab)*'))
    assert evens == leading_lengths(revex.compile('(aa)*(bb)*'))

    # Intersecting even-length and multiple-of-three-length runs of 'a'.
    multiples_of_six = revex.compile('(aa)*') & revex.compile('(aaa)*')
    assert list(range(0, 300, 6)) == leading_lengths(multiples_of_six)

    # A finite language: the iterator is consumed in full rather than sliced.
    bounded_dfa = (
        revex.compile('(aa)*') & revex.compile('a{0,16}')).as_dfa('a')
    bounded_lengths = set(
        RandomRegularLanguageGenerator(bounded_dfa).valid_lengths_iter())
    assert bounded_lengths == set(range(0, 17, 2))

    empty_generator = RandomRegularLanguageGenerator(EMPTY.as_dfa())
    assert [] == list(empty_generator.valid_lengths_iter())
def test_longest_string():
    """Exercise ``has_finite_language`` and ``longest_string`` on several DFAs."""
    ip_alphabet = '0123456789.'

    def ab_intersection(left, right):
        # Fresh DFA over 'ab' for the intersection of two patterns; built
        # anew on every call, exactly as each assertion did inline.
        return (revex.compile(left) & revex.compile(right)).as_dfa('ab')

    ip = revex.compile(
        r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)')
    assert ip.as_dfa(ip_alphabet).has_finite_language
    # The longest string is expected to have 15 characters and match ``ip``.
    assert len(ip.as_dfa(ip_alphabet).longest_string) == 15
    assert ip.match(ip.as_dfa(ip_alphabet).longest_string)

    with pytest.raises(InfiniteLanguageError):
        ab_intersection(r'([ab]{4})*', r'([ab]{3})*').longest_string

    assert ab_intersection(r'(ab)*', r'(ba)*').has_finite_language
    assert ab_intersection(r'(ab)*', r'(ba)*').longest_string == ''

    assert ab_intersection(r'(ab)+', r'(ba)+').has_finite_language
    with pytest.raises(EmptyLanguageError):
        ab_intersection(r'(ab)+', r'(ba)+').longest_string

    assert EPSILON.as_dfa().longest_string == ''
def test_is_empty():
    """Check the ``is_empty`` property on DFAs of several languages."""
    # Neither EPSILON nor its complement should produce an empty DFA.
    for regex in (~EPSILON, EPSILON):
        assert not regex.as_dfa().is_empty

    assert EMPTY.as_dfa().is_empty

    # This intersection is expected to be empty.
    disjoint = revex.compile('a*|b*') & revex.compile('c+')
    assert disjoint.as_dfa('abc').is_empty
import re
from typing import Set  # noqa
from typing import Tuple  # noqa

import pytest
import six  # noqa
from hypothesis import given, example
from hypothesis import strategies as st

import revex
from revex.derivative import EPSILON, EMPTY
from revex.dfa import DFA, get_equivalent_states, minimize_dfa, \
    InfiniteLanguageError, EmptyLanguageError

# The same pattern prepared three ways: compiled by revex, built by
# ``revex.build_dfa`` over the alphabet 'abcd', and compiled (anchored) by the
# builtin ``re`` module, which the tests below use as the reference.
example_regex = revex.compile(r'a[abc]*b[abc]*c')
example_dfa = revex.build_dfa(r'a[abc]*b[abc]*c', alphabet='abcd')
example_builtin_regex = re.compile(r'^a[abc]*b[abc]*c$')


@given(st.text(alphabet='abcd'))
@example('abbbbc')
def test_derivative_matches_builtin(s):
    """``example_regex.match`` must agree with the builtin ``re`` on ``s``."""
    expected = bool(example_builtin_regex.match(s))
    assert example_regex.match(s) == expected


@given(st.text(alphabet='abcd'))
@example('abbbbc')
def test_dfa_matches_builtin(s):
    """``example_dfa.match`` must agree with the builtin ``re`` on ``s``."""
    expected = bool(example_builtin_regex.match(s))
    assert example_dfa.match(s) == expected
def test_longest_string():
    """Check ``has_finite_language`` / ``longest_string`` for finite,
    infinite and empty languages.
    """
    digits_and_dot = '0123456789.'
    ip = revex.compile(
        r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
    )

    def star_intersection():
        # Newly built DFA for (ab)* intersected with (ba)*.
        return (revex.compile(r'(ab)*')
                & revex.compile(r'(ba)*')).as_dfa('ab')

    def plus_intersection():
        # Newly built DFA for (ab)+ intersected with (ba)+.
        return (revex.compile(r'(ab)+')
                & revex.compile(r'(ba)+')).as_dfa('ab')

    # Each assertion works on its own freshly constructed DFA.
    assert ip.as_dfa(digits_and_dot).has_finite_language
    assert len(ip.as_dfa(digits_and_dot).longest_string) == 15
    assert ip.match(ip.as_dfa(digits_and_dot).longest_string)

    with pytest.raises(InfiniteLanguageError):
        lengths_of_four = revex.compile(r'([ab]{4})*')
        lengths_of_three = revex.compile(r'([ab]{3})*')
        (lengths_of_four & lengths_of_three).as_dfa('ab').longest_string

    assert star_intersection().has_finite_language
    assert star_intersection().longest_string == ''

    assert plus_intersection().has_finite_language
    with pytest.raises(EmptyLanguageError):
        plus_intersection().longest_string

    assert EPSILON.as_dfa().longest_string == ''
def test():
    """Check that random generation follows the expected distributions.

    Strings are drawn from ``(a|bb|ccc)*`` and from its complement over the
    alphabet 'abc', and the observed frequencies are compared with the
    distributions the test expects (equal weight for every string of the
    requested length).
    """
    dfa = revex.build_dfa(r'(a|bb|ccc)*', alphabet='abc')
    gen = RandomRegularLanguageGenerator(dfa)
    neg_dfa = (~revex.compile(r'(a|bb|ccc)*')).as_dfa(alphabet='abc')
    neg_gen = RandomRegularLanguageGenerator(neg_dfa)
    regex = re.compile(r'^(a|bb|ccc)*$')

    # These assertions are mostly probabilistic, so the numbers are chosen so
    # as to make the tests quite likely to pass.
    assert {gen.generate_string(0) for _ in range(10)} == {''}
    assert {gen.generate_string(1) for _ in range(10)} == {'a'}

    negs_1 = Counter(neg_gen.generate_string(1) for _ in range(1000))
    assert_dist_approximately_equal(negs_1, {'b': 0.5, 'c': 0.5})

    pos_2 = Counter(gen.generate_string(2) for _ in range(1000))
    negs_2 = Counter(neg_gen.generate_string(2) for _ in range(1000))
    assert_dist_approximately_equal(pos_2, {'aa': 0.5, 'bb': 0.5})
    # Of the 9 two-letter strings over 'abc', only 'aa' and 'bb' match, which
    # leaves these 7. Float literals keep the probabilities from truncating
    # to 0 when true division is not enabled.
    non_matching_pairs = ['ab', 'ba', 'cc', 'ca', 'cb', 'bc', 'ac']
    assert_dist_approximately_equal(
        negs_2, {pair: 1.0 / 7 for pair in non_matching_pairs})

    pos_6 = Counter(gen.generate_string(6) for _ in range(10000))
    # Every length-6 string in the language, grouped by which alternatives
    # ('a', 'bb', 'ccc') it is made of. There are 24 in total; the previous
    # list omitted 'acccaa', 'aaccca', 'abbabb', 'abbbba' and 'bbabba', which
    # made the expected per-string probability 1/19 instead of 1/24.
    possibilities = [
        # a single alternative repeated
        'aaaaaa', 'bbbbbb', 'cccccc',
        # one 'ccc' and three 'a'
        'cccaaa', 'acccaa', 'aaccca', 'aaaccc',
        # one each of 'a', 'bb' and 'ccc'
        'abbccc', 'acccbb', 'bbaccc', 'bbccca', 'cccabb', 'cccbba',
        # one 'bb' and four 'a'
        'bbaaaa', 'abbaaa', 'aabbaa', 'aaabba', 'aaaabb',
        # two 'bb' and two 'a'
        'aabbbb', 'abbabb', 'abbbba', 'bbaabb', 'bbabba', 'bbbbaa',
    ]
    # Guard the hand-written list against duplicates and non-matching entries.
    assert len(set(possibilities)) == len(possibilities) == 24
    assert all(regex.match(possibility) for possibility in possibilities)
    assert_dist_approximately_equal(
        pos_6,
        {possibility: 1.0 / len(possibilities)
         for possibility in possibilities})

    # Whatever the length, positive samples must match and negative samples
    # must not.
    for length in range(1, 15):
        for _ in range(100):
            pos = gen.generate_string(length)
            neg = neg_gen.generate_string(length)
            assert regex.match(pos), pos
            assert not regex.match(neg), neg
def dgen(regex, alphabet=None):
    """Build a ``DeterministicRegularLanguageGenerator`` for ``regex``.

    ``regex`` may be a ``RegularExpression`` or anything ``revex.compile``
    accepts. When ``alphabet`` is missing or empty, it defaults to the
    distinct characters of the text form of ``regex``.
    """
    if not alphabet:
        # Derived from the argument as given, before any compilation.
        alphabet = list(set(six.text_type(regex)))
    if isinstance(regex, RegularExpression):
        compiled = regex
    else:
        compiled = revex.compile(regex)
    return DeterministicRegularLanguageGenerator(compiled.as_dfa(alphabet))
def rgen(regex, alphabet=None):
    """Build a ``RandomRegularLanguageGenerator`` for ``regex``.

    ``regex`` may be a ``RegularExpression`` or anything ``revex.compile``
    accepts. When ``alphabet`` is missing or empty, it defaults to the
    distinct characters of ``str(regex)``.
    """
    if not alphabet:
        # Derived from the argument as given, before any compilation.
        alphabet = list(set(str(regex)))
    if isinstance(regex, RegularExpression):
        compiled = regex
    else:
        compiled = revex.compile(regex)
    return RandomRegularLanguageGenerator(compiled.as_dfa(alphabet))
def test_empty():
    """The empty pattern and empty alternatives must accept the empty string."""
    assert compile('') == EPSILON
    # An alternation with an empty branch matches both '' and 'a', with or
    # without the surrounding group.
    for pattern in ('(a|)', 'a|'):
        for text in ('', 'a'):
            assert RE(pattern).match(text)
def __init__(self, pattern):
    """Store two compiled forms of ``pattern``.

    ``base_re`` is the standard-library ``re`` pattern, wrapped in a group
    and anchored with ``\\A`` / ``\\Z``. ``re`` is the result of calling
    ``compile`` on the raw pattern.
    """
    anchored_source = r'\A(%s)\Z' % pattern
    self.base_re = re.compile(anchored_source)
    # NOTE(review): ``compile`` is presumably the project's regex compiler
    # imported at module level rather than the builtin - confirm.
    self.re = compile(pattern)