def test_random_seed(self):
    # A seed can be supplied for the random generation, so that the same
    # samples can be reproduced later if needed.
    first_seed, second_seed = 1234, 5678
    grammar = self._simple_cfg()

    def draw(**sampler_kwargs):
        # Build a fresh sampler and return its first sample as a string.
        return str(cfgsampler.CFGSampler(grammar, **sampler_kwargs).sample())

    result_a = draw()
    result_b = draw(random_seed=first_seed)
    result_c = draw(random_seed=second_seed)
    result_d = draw(random_seed=first_seed)

    # Only B and D share a seed, so only they are expected to agree.
    self.assertNotEqual(result_a, result_b)
    self.assertNotEqual(result_a, result_c)
    self.assertNotEqual(result_a, result_d)
    self.assertNotEqual(result_b, result_c)
    self.assertEqual(result_b, result_d)
    self.assertNotEqual(result_c, result_d)

    # The seeded results are known exactly.
    seed_1_output = "xyxzxyxzxyxzxyxzxzxyxyxw"
    seed_2_output = "xyxzxyxyxyxyxzxzxzxzxyxzxyxzxw"
    self.assertEqual(result_b, seed_1_output)
    self.assertEqual(result_c, seed_2_output)
    self.assertEqual(result_d, seed_1_output)
def test_random_choices_are_internal(self):
    # External calls to random.random and numpy.random... must not affect
    # the internal state of the generator.
    grammar = self._simple_cfg()
    seed = 1234
    expected = [
        "xyxzxyxzxyxzxyxzxzxyxyxw",
        "xzxzxyxyxzxzxzxyxzxzxzxzxzxyxzxw",
        "xyxyxyxyxyxzxyxyxyxyxyxzxzxzxzxyxzxyxzxw",
    ]

    # The "normal" case: three consecutive samples from one seeded sampler.
    undisturbed = cfgsampler.CFGSampler(grammar, random_seed=seed)
    baseline = [str(undisturbed.sample()) for _ in range(3)]
    for got, want in zip(baseline, expected):
        self.assertEqual(got, want)

    # Now sample while other random functions, which change the global
    # state, are called in between.
    disturbed = cfgsampler.CFGSampler(grammar, random_seed=seed)
    interleaved = []
    random.random()
    interleaved.append(str(disturbed.sample()))
    random.random()
    random.random()
    interleaved.append(str(disturbed.sample()))
    numpy.random.sample([1, 2, 3])
    numpy.random.random()
    interleaved.append(str(disturbed.sample()))

    # If the random state is internal to CFGSampler, the results are the same.
    for got, want in zip(interleaved, expected):
        self.assertEqual(got, want)
def _check_length_distribution(self, cfg, length_mean, length_dev, expected_ps):
    """Assert that the sampler's length distribution for ``cfg`` is as expected.

    Builds a ``CFGSampler`` for ``cfg``, asks it for the distribution of
    lengths around ``length_mean`` (via ``_build_random_valid_distribution``)
    and checks both the candidate lengths and their probabilities.

    Args:
        cfg: The grammar to sample from; its start symbol is used.
        length_mean: Integer centre of the length window.
        length_dev: Integer deviation; the window spans +/- 2 deviations.
        expected_ps: Expected probability for each length in the window,
            in increasing order of length.
    """
    sampler = cfgsampler.CFGSampler(cfg)
    start_idx = sampler._nonterminal_index(cfg.start())
    ls, ps = sampler._build_random_valid_distribution(
        start_idx, length_mean, length_dev)

    # Check the sizes are appropriate.
    self.assertEqual(len(ls), len(ps))

    # Check the valid lengths are as expected: lengths can be +/-2 SD.
    # Both sides are normalised to lists so that the comparison does not
    # depend on whether range() returns a list (Python 2) or a lazy range
    # object (Python 3), nor on the concrete sequence type of ``ls``.
    expected_lengths = range(length_mean - 2 * length_dev,
                             length_mean + 2 * length_dev + 1)
    self.assertEqual(list(ls), list(expected_lengths))

    # Check the probabilities sum to 1.
    # N.B. This is how it is checked in numpy.random.choice(), which is what we must satisfy:
    # https://github.com/numpy/numpy/blob/a4dca241647a31a4313238fa183b67e453c1de0f/numpy/random/mtrand/mtrand.pyx#L1129
    self.assertAlmostEqual(sum(ps), 1)

    # Check the probabilities are the exact distribution we expect.
    self.assertEqual(len(ps), len(expected_ps))
    for actual, exp in zip(ps, expected_ps):
        if exp == 0:
            # Special handling for the 0 cases, which must really be 0,
            # not just "close enough".
            self.assertEqual(actual, 0)
        else:
            self.assertAlmostEqual(actual, exp)
def __init__(self, config=None, random_seed=None):
    """Set up the generator's configuration, grammar sampler and RNG state.

    Args:
        config: Optional mapping of overrides applied on top of
            ``DEFAULT_CONFIG``. ``None`` (the default) means no overrides.
        random_seed: Optional hashable seed. When given, it seeds the CFG
            sampler and (via ``hash``) the other random sources held by
            this object; when ``None`` they are left unseeded.
    """
    import copy

    # Work on a (shallow) copy: updating DEFAULT_CONFIG in place would leak
    # one instance's overrides into the class-level defaults and therefore
    # into every other instance.
    self.config = copy.copy(self.DEFAULT_CONFIG)
    if config is not None:
        self.config.update(config)

    # Sets self.cfg, which is to be treated as read-only from here on.
    self._build_cfg()
    self._cfgsampler = cfgsampler.CFGSampler(self.cfg,
                                             random_seed=random_seed)

    self._jsgenerator_seed = (
        hash(random_seed) if random_seed is not None else None)

    self._constants_random = random.Random()
    if random_seed is not None:
        # 541 is not magic, it's the 100th prime, just to start from a
        # different spot than _jsgenerator_seed.
        self._constants_random.seed(hash(random_seed) * 541)

    self._constant_generating_functions = [
        self._generate_boolean_constant_value,
        self._generate_integer_constant_value,
        self._generate_string_constant_value,
        self._generate_constant_integer_set,
        self._generate_constant_string_set,
        self._generate_regex,
    ]
def test_cfgsampler_simple_cfg(self):
    # A simple "overall" check that we can generate strings in the expected pattern.
    cfg = self._simple_cfg()
    sampler = cfgsampler.CFGSampler(cfg)
    result = sampler.sample()
    self.assertIsInstance(result, cfgsampler.TerminalString)
    # assertRegexpMatches uses search semantics, so the pattern is anchored
    # to verify the whole string rather than merely that it contains "xw".
    self.assertRegexpMatches(str(result), r'^(x[yz])*xw$')
def generate_single_regex(seed=None, length_mean=None, length_dev=None):
    """Sample one regex from the regex grammar.

    Args:
        seed: Optional random seed passed to the ``CFGSampler``.
        length_mean: Target mean length; defaults to
            ``REGEX_GENERATOR_LENGTH_MEAN`` when ``None``.
        length_dev: Length standard deviation; defaults to
            ``REGEX_GENERATOR_LENGTH_DEV`` when ``None``.

    Returns:
        The value returned by ``CFGSampler.sample`` for the regex grammar.
    """
    if length_mean is None:
        length_mean = REGEX_GENERATOR_LENGTH_MEAN
    if length_dev is None:
        length_dev = REGEX_GENERATOR_LENGTH_DEV

    sampler = cfgsampler.CFGSampler(regex_cfg(), random_seed=seed)
    return sampler.sample(length_mean=length_mean, length_std_dev=length_dev)
def test_grammar_counts_ambiguous(self):
    # This documents current behaviour rather than desired behaviour: a
    # string which an ambiguous grammar can generate in multiple ways is
    # counted once per derivation.
    sampler = cfgsampler.CFGSampler(self._ambiguous_cfg())
    start_symbol = nltk.grammar.Nonterminal("S")
    s_idx = sampler._nonterminal_index(start_symbol)

    # N.B. This grammar can only produce a single string "xx", but there are
    # two S-productions which produce it.
    counts_at_length_two = sampler.f(s_idx, 2)
    self.assertEqual(counts_at_length_two, [1, 1])
def test_non_string_terminals(self):
    # The main test is that there's no crash when terminals are not strings.
    # We also check that all the values appear in a medium-size string.
    sampler = cfgsampler.CFGSampler(self._non_string_cfg_mixed_types())
    sample = sampler.sample(length_mean=30, length_std_dev=2)
    self.assertIsInstance(sample, cfgsampler.TerminalString)

    terminals = sample.as_list()
    self.assertIn(len(sample), range(26, 35))

    expected_terminals = (1234, (5, 6, 7), frozenset([8, 9]), True)
    for terminal in expected_terminals:
        self.assertIn(terminal, terminals)
def test_cfgsampler_simple_cfg_multiple_generation(self):
    # Repeated sampling from one sampler must keep producing strings in the
    # expected pattern, and must not produce the same string every time.
    num_samples = 10
    cfg = self._simple_cfg()
    sampler = cfgsampler.CFGSampler(cfg)
    strings = set()
    for _ in range(num_samples):
        new_str = str(sampler.sample())
        strings.add(new_str)
        # assertRegexpMatches uses search semantics, so the pattern is
        # anchored to verify the whole string rather than merely that it
        # contains "xw".
        self.assertRegexpMatches(new_str, r'^(x[yz])*xw$')

    # Check we actually generated some different strings.
    self.assertGreater(len(strings), 1)
def test_cfgsampler_generated_length(self):
    # Sample repeatedly with a target mean length and check every result
    # falls inside the default window around that mean.
    sample_count = 10
    sampler = cfgsampler.CFGSampler(self._simple_cfg())
    observed_lengths = set()
    for _ in range(sample_count):
        sample_text = str(sampler.sample(length_mean=20))
        sample_length = len(sample_text)
        observed_lengths.add(sample_length)
        # By default the string length is within 40% each way.
        self.assertIn(sample_length, range(12, 29))

    # Check we actually generated some different string lengths.
    self.assertGreater(len(observed_lengths), 1)
def test_cfgsampler_generated_length_deviation(self):
    # As test_cfgsampler_generated_length, but with an explicit deviation:
    # the generated strings must fall within the new, narrower range.
    sample_count = 10
    sampler = cfgsampler.CFGSampler(self._simple_cfg())
    observed_lengths = set()
    for _ in range(sample_count):
        sample = sampler.sample(length_mean=20, length_std_dev=2)
        sample_length = len(str(sample))
        observed_lengths.add(sample_length)
        # Lengths can be +/-2 SD.
        self.assertIn(sample_length, range(16, 25))

    # Check we actually generated some different string lengths.
    self.assertGreater(len(observed_lengths), 1)
def test_length_is_valid(self):
    # Check we do not generate any lengths which the CFG cannot generate.
    sample_count = 10
    grammar = self._simple_cfg()
    sampler = cfgsampler.CFGSampler(grammar)
    start_idx = sampler._nonterminal_index(grammar.start())

    chosen_lengths = set(
        sampler._choose_random_valid_length(start_idx, 20, 5)
        for _ in range(sample_count))

    # Check we didn't generate any "bad" lengths.
    # N.B. _simple_cfg can only generate even length strings.
    for length in chosen_lengths:
        self.assertEqual(length % 2, 0)
def test_length_is_random_and_in_expected_range(self):
    # N.B. This is essentially a dupe of test_cfgsampler_generated_length.
    # Check that over a few tries, we get different lengths, all in range.
    sample_count = 10
    grammar = self._simple_cfg()
    sampler = cfgsampler.CFGSampler(grammar)
    start_idx = sampler._nonterminal_index(grammar.start())

    observed_lengths = set()
    for _ in range(sample_count):
        chosen = sampler._choose_random_valid_length(start_idx, 20, 5)
        observed_lengths.add(chosen)
        # Lengths can be +/-2 SD.
        self.assertIn(chosen, range(10, 31))

    # Check we actually generated some different string lengths.
    self.assertGreater(len(observed_lengths), 1)
def test_internal_state(self):
    # A general test that the implementation matches hand-worked notes on
    # the arithmetic CFG.
    # See also CFGSampler._dump_preprocessor_info()
    sampler = cfgsampler.CFGSampler(self._arithmetic_cfg())
    s_idx = sampler._nonterminal_index(nltk.grammar.Nonterminal("S"))

    # Number of nonterminals, mostly for use later.
    nonterminal_count = len(sampler.ordered_nonterminals)
    self.assertEqual(nonterminal_count, 1)

    # Check the s_i values.
    s_values = []
    for i in cfgsampler._inclusive_range(1, nonterminal_count):
        s_values.append(sampler.s(i))
    self.assertEqual(s_values, [8])

    # Check the t_ij values.
    t_values = []
    for i in cfgsampler._inclusive_range(1, nonterminal_count):
        row = []
        for j in cfgsampler._inclusive_range(1, sampler.s(i)):
            row.append(sampler.t(i, j))
        t_values.append(row)
    self.assertEqual(t_values, [[1, 1, 1, 3, 3, 3, 3, 3]])

    # Check some values of f(i, n), as (n, expected) pairs.
    f_cases = [
        (1, [1, 1, 1, 0, 0, 0, 0, 0]),
        (2, [0, 0, 0, 0, 0, 0, 0, 0]),
        (3, [0, 0, 0, 9, 9, 9, 9, 3]),
    ]
    for n, expected in f_cases:
        self.assertEqual(sampler.f(s_idx, n), expected)

    # Check some values of f_prime(i, j, k, n), as ((j, k, n), expected).
    f_prime_cases = [
        ((1, 1, 1), [1]),
        ((4, 1, 1), []),
        ((4, 3, 1), [3]),
        ((8, 3, 1), [1]),
        ((1, 1, 3), [0]),
        ((4, 1, 3), [9]),
        ((4, 3, 3), [39]),
        ((8, 1, 3), [3]),
        ((8, 3, 3), [0]),
        ((1, 1, 5), [0]),
        ((4, 1, 5), [117, 0, 117]),
        ((4, 3, 5), [975]),
        ((8, 1, 5), [39]),
        ((8, 3, 5), [0]),
    ]
    for (j, k, n), expected in f_prime_cases:
        self.assertEqual(sampler.f_prime(s_idx, j, k, n), expected)
def test_grammar_counts(self):
    # Check the counts of strings which can be generated by each nonterminal
    # at each length.
    sampler = cfgsampler.CFGSampler(self._simple_cfg())
    s_idx = sampler._nonterminal_index(nltk.grammar.Nonterminal("S"))
    t_idx = sampler._nonterminal_index(nltk.grammar.Nonterminal("T"))

    # (length, expected per-production counts) for S.
    s_cases = [
        (0, [0]),
        (1, [0]),
        (2, [1]),
        (3, [0]),
        (4, [2]),
        (5, [0]),
        (6, [4]),
        (10, [16]),
    ]
    for length, expected in s_cases:
        self.assertEqual(sampler.f(s_idx, length), expected)

    # (length, expected per-production counts) for T.
    t_cases = [
        (0, [0, 0, 0]),
        (1, [0, 0, 1]),
        (2, [0, 0, 0]),
        (3, [1, 1, 0]),
        (4, [0, 0, 0]),
        (5, [2, 2, 0]),
        (11, [16, 16, 0]),
    ]
    for length, expected in t_cases:
        self.assertEqual(sampler.f(t_idx, length), expected)
def test_cfgsampler_unproductive_nonterminals(self):
    # There should be no problem with unproductive nonterminals.
    # The "real" test here is that no exception is raised.
    sampler = cfgsampler.CFGSampler(self._unproductive_cfg())
    generated = str(sampler.sample())
    self.assertRegexpMatches(generated, 'xy*')
def test_cfgsampler_empty_productions(self):
    # Constructing a sampler for a grammar with empty productions must
    # raise CFGError.
    # The grammar is built outside the assertRaises block so that only the
    # CFGSampler constructor can satisfy the expected exception.
    cfg = self._empty_productions_cfg()
    with self.assertRaises(cfgsampler.CFGSampler.CFGError):
        cfgsampler.CFGSampler(cfg)
def test_cfgsampler_generated_length_minimum_deviation(self):
    # Asking for a deviation below the minimum must fail with
    # GenerationError rather than produce a sample.
    sampler = cfgsampler.CFGSampler(self._simple_cfg())
    expected_error = cfgsampler.CFGSampler.GenerationError
    with self.assertRaises(expected_error):
        # Minimum deviation is 1.
        sampler.sample(length_mean=20, length_std_dev=0.1)