def test_daterange_generator_reverse(self):
    """
    Checks that the date range generator counts backwards when the start
    date comes after the end date
    """

    # days from 1996-01-10 down to 1996-01-01, in yyyy-mm-dd format
    first = datetime.date(1996, 1, 10)
    last = datetime.date(1996, 1, 1)
    expected_output = [f"1996-01-{day:02d}" for day in range(10, 0, -1)]
    date_gen = ParamInjector.generate_daterange("%Y-%m-%d", first, last,
                                                "D")
    self.assertEqual(expected_output, list(date_gen))

    # months from 1996-05 down to 1996-01, in yyyy/mm format
    first = datetime.date(1996, 5, 1)
    last = datetime.date(1996, 1, 1)
    expected_output = [f"1996/{month:02d}" for month in range(5, 0, -1)]
    date_gen = ParamInjector.generate_daterange("%Y/%m", first, last, "M")
    self.assertEqual(expected_output, list(date_gen))

    # years from 2005 down to 1996, in yy format (default frequency)
    first = datetime.date(2005, 1, 1)
    last = datetime.date(1996, 1, 1)
    expected_output = [str(year)[-2:] for year in range(2005, 1995, -1)]
    date_gen = ParamInjector.generate_daterange("%y", first, last)
    self.assertEqual(expected_output, list(date_gen))
def test_daterange_generator_error_invalid_range(self):
    """
    Checks that consuming the date range generator raises ValueError when
    the start/end date is missing or is not a date object
    """

    # NOTE(review): every case below passes an empty date format, which
    # test_daterange_generator_error_empty_format expects to raise
    # ValueError on its own - confirm these cases really exercise the
    # range validation
    valid_start = datetime.date(1996, 1, 1)
    valid_end = datetime.date(2005, 1, 1)
    invalid_ranges = [
        # no end date
        (valid_start, None),
        # no start date
        (None, valid_end),
        # invalid start date (string instead of a date)
        ("1996-01-01", valid_end),
        # invalid end date (string instead of a date)
        (valid_start, "2004-12-31"),
    ]

    for start_date, end_date in invalid_ranges:
        date_gen = ParamInjector.generate_daterange("", start_date,
                                                    end_date)
        # the error is only expected once the generator is consumed
        with self.assertRaises(ValueError):
            list(date_gen)
def test_daterange_generator_error_invalid_frequency(self):
    """
    Checks that consuming the date range generator raises ValueError when
    the frequency argument is not an accepted value
    """

    # NOTE(review): the date format is empty in every case, which
    # test_daterange_generator_error_empty_format expects to raise
    # ValueError on its own - confirm these cases really exercise the
    # frequency validation
    start_date = datetime.date(1996, 1, 1)
    end_date = datetime.date(1996, 1, 10)

    # frequencies tried, in order: None, "YEAR" and the empty string
    for frequency in (None, "YEAR", ""):
        date_gen = ParamInjector.generate_daterange("", start_date,
                                                    end_date, frequency)
        with self.assertRaises(ValueError):
            list(date_gen)
def test_number_generator_corner_cases(self):
    """
    Checks the number generator on empty, single-element and two-element
    ranges
    """

    corner_cases = [
        # empty range
        ((1, 0), []),
        # empty with reversed first and last elements
        ((0, 1, -1), []),
        # single element
        ((0, 0), ['0']),
        # two elements with last > first
        ((0, 1), ['0', '1']),
        # two elements with last < first
        ((1, 0, -1), ['1', '0']),
    ]

    for arguments, expected_output in corner_cases:
        num_gen = ParamInjector.generate_num_sequence(*arguments)
        self.assertEqual(expected_output, list(num_gen))
def test_process_code_generator(self):
    """
    Checks the process codes produced by the format generator, with the
    verification function result placed at index 1
    """

    template = "{:07d}{:02d}{:04d}{:03d}{:04d}"
    limits = [(0, 5), (2018, 2019), [402], [0, 9999]]
    # 6 first sequential entries for each year, for each origin
    expected_output = [
        '00000007520184020000', '00000005120184029999',
        '00000004120194020000', '00000001720194029999',
        '00000016020184020000', '00000013620184029999',
        '00000012620194020000', '00000010220194029999',
        '00000024520184020000', '00000022120184029999',
        '00000021120194020000', '00000028420194029999',
        '00000033020184020000', '00000030620184029999',
        '00000039320194020000', '00000036920194029999',
        '00000041520184020000', '00000048820184029999',
        '00000047820194020000', '00000045420194029999',
        '00000059720184020000', '00000057320184029999',
        '00000056320194020000', '00000053920194029999',
    ]
    proc_gen = ParamInjector.generate_format(template, limits,
                                             self.verif_code, 1)
    self.assertEqual(expected_output, list(proc_gen))

    # Same codes in a different order: the positional fields of the
    # template are rearranged so that the sequential number varies first,
    # then the origin, then the year
    template = "{4:07d}{1:02d}{0:04d}{2:03d}{3:04d}"
    limits = [(2018, 2019), [402], [0, 9999], (0, 5)]
    # 6 first sequential entries for each year, for each origin
    expected_output = [
        '00000007520184020000', '00000016020184020000',
        '00000024520184020000', '00000033020184020000',
        '00000041520184020000', '00000059720184020000',
        '00000005120184029999', '00000013620184029999',
        '00000022120184029999', '00000030620184029999',
        '00000048820184029999', '00000057320184029999',
        '00000004120194020000', '00000012620194020000',
        '00000021120194020000', '00000039320194020000',
        '00000047820194020000', '00000056320194020000',
        '00000001720194029999', '00000010220194029999',
        '00000028420194029999', '00000036920194029999',
        '00000045420194029999', '00000053920194029999',
    ]
    proc_gen = ParamInjector.generate_format(template, limits,
                                             self.verif_code_switched, 1)
    self.assertEqual(expected_output, list(proc_gen))

    # Empty list of limits combined with a fixed template
    template = "test"
    limits = []
    expected_output = []
    proc_gen = ParamInjector.generate_format(template, limits)
    # NOTE(review): this call already consumes the generator, so the
    # assertion below always compares against an exhausted generator -
    # presumably unintended; confirm the expected output for this case
    # before removing it
    list(proc_gen)
    self.assertEqual(expected_output, list(proc_gen))
def test_alphabetic_generator_all(self):
    """
    Checks the alphabetic search patterns generated when uppercase letters
    are included (third argument set to False)
    """

    # one letter, one word: all lowercase patterns, then all uppercase
    alphabet = [chr(code) for code in range(ord("a"), ord("z") + 1)]
    alphabet += [chr(code) for code in range(ord("A"), ord("Z") + 1)]
    expected_output = [letter + "*" for letter in alphabet]
    pattern_gen = ParamInjector.generate_alpha(1, 1, False)
    self.assertEqual(expected_output, list(pattern_gen))

    # For two-letter words only the first 5 and last 5 entries are checked
    partial_cases = [
        # two letters, one word
        (1,
         ["aa*", "ab*", "ac*", "ad*", "ae*"],
         ["ZV*", "ZW*", "ZX*", "ZY*", "ZZ*"]),
        # two letters, two words
        (2,
         ["aa* aa*", "aa* ab*", "aa* ac*", "aa* ad*", "aa* ae*"],
         ["ZZ* ZV*", "ZZ* ZW*", "ZZ* ZX*", "ZZ* ZY*", "ZZ* ZZ*"]),
    ]

    for word_count, expected_first, expected_last in partial_cases:
        pattern_gen = ParamInjector.generate_alpha(2, word_count, False)

        # first 5 entries
        head = [next(pattern_gen) for _ in range(5)]
        self.assertEqual(expected_first, head)

        # last 5 entries: a bounded deque keeps only the tail of the
        # remaining values
        trailing = deque(pattern_gen, maxlen=5)
        tail = [trailing.popleft() for _ in range(5)]
        self.assertEqual(expected_last, tail)
def test_process_code_error_invalid_param(self):
    """
    Checks that requesting a value from the format generator raises
    ValueError when the list of parameter limits has an invalid entry
    """

    # NOTE(review): no verification index is passed along with
    # self.verif_code, which is the situation that
    # test_process_code_error_verif_index expects to raise ValueError by
    # itself - confirm these cases really exercise the limits validation
    template = "{:07d}{:02d}{:04d}{:03d}{:04d}"
    invalid_limits = [
        # one of the entries is an integer
        [(0, 5), (2018, 2019), [402], 9999],
        # one of the entries is a tuple with 3 elements
        [(0, 5), (2018, 2019), [402], (1, 2, 3)],
    ]

    for limits in invalid_limits:
        proc_gen = ParamInjector.generate_format(template, limits,
                                                 self.verif_code)
        with self.assertRaises(ValueError):
            next(proc_gen)
def test_number_generator_simple(self):
    """
    Checks simple increasing number sequences generated with the fourth
    argument (padding) set to False
    """

    # stringified numbers from 0 to 9, from 0 to 99 and from 0 to 100
    for upper_bound in (9, 99, 100):
        expected_output = [str(value) for value in range(upper_bound + 1)]
        num_gen = ParamInjector.generate_num_sequence(0, upper_bound, 1,
                                                      False)
        self.assertEqual(expected_output, list(num_gen))
def test_number_generator_padding(self):
    """
    Checks simple increasing number sequences generated with the default
    padding behaviour
    """

    # (last value, digits expected in every entry):
    # 0 to 9 with 1 digit, 0 to 99 with 2 digits, 0 to 100 with 3 digits
    padded_cases = [(9, 1), (99, 2), (100, 3)]

    for upper_bound, digits in padded_cases:
        expected_output = [
            str(value).zfill(digits) for value in range(upper_bound + 1)
        ]
        num_gen = ParamInjector.generate_num_sequence(0, upper_bound)
        self.assertEqual(expected_output, list(num_gen))
def test_const_generator(self):
    """
    Checks that the constant value generator yields its input exactly once
    """

    # a simple value, followed by the empty string ("no data")
    for input_data in ("test", ""):
        expected_output = [input_data]
        const_gen = ParamInjector.generate_constant(input_data)
        self.assertEqual(expected_output, list(const_gen))
def test_daterange_generator_error_empty_format(self):
    """
    Checks that consuming the date range generator raises ValueError when
    the date format is an empty string
    """

    first = datetime.date(1996, 1, 1)
    last = datetime.date(2005, 1, 1)
    date_gen = ParamInjector.generate_daterange("", first, last)

    # the error is only expected once the generator is consumed
    with self.assertRaises(ValueError):
        list(date_gen)
def test_alphabetic_generator_error_corner_cases(self):
    """
    Checks that consuming the alphabetic generator raises ValueError for
    zero or negative lengths / word counts
    """

    # Arguments are (length, number of words), the same positional order
    # used by the other alphabetic generator tests
    invalid_arguments = [
        # length two, zero words
        (2, 0),
        # zero length, two words
        (0, 2),
        # zero length, negative word count
        (0, -2),
        # negative length, zero words
        (-2, 0),
    ]

    for length, num_words in invalid_arguments:
        pattern_gen = ParamInjector.generate_alpha(length, num_words)
        with self.assertRaises(ValueError):
            list(pattern_gen)
def test_alphabetic_generator_lower(self):
    """
    Checks the alphabetic search patterns generated with the default
    settings (all lowercase)
    """

    # one letter, one word
    expected_output = [
        chr(code) + "*" for code in range(ord("a"), ord("z") + 1)
    ]
    pattern_gen = ParamInjector.generate_alpha(1, 1)
    self.assertEqual(expected_output, list(pattern_gen))

    # For two-letter words only the first 5 and last 5 entries are checked
    partial_cases = [
        # two letters, one word
        (1,
         ["aa*", "ab*", "ac*", "ad*", "ae*"],
         ["zv*", "zw*", "zx*", "zy*", "zz*"]),
        # two letters, two words
        (2,
         ["aa* aa*", "aa* ab*", "aa* ac*", "aa* ad*", "aa* ae*"],
         ["zz* zv*", "zz* zw*", "zz* zx*", "zz* zy*", "zz* zz*"]),
    ]

    for word_count, expected_first, expected_last in partial_cases:
        pattern_gen = ParamInjector.generate_alpha(2, word_count)

        # first 5 entries
        head = [next(pattern_gen) for _ in range(5)]
        self.assertEqual(expected_first, head)

        # last 5 entries: a bounded deque keeps only the tail of the
        # remaining values
        trailing = deque(pattern_gen, maxlen=5)
        tail = [trailing.popleft() for _ in range(5)]
        self.assertEqual(expected_last, tail)
def test_number_generator_reverse(self):
    """
    Checks decreasing number sequences (step of -1) generated with the
    fourth argument (padding) set to False
    """

    # stringified numbers from 9 to 0, from 99 to 0 and from 100 to 0
    for upper_bound in (9, 99, 100):
        expected_output = [
            str(value) for value in range(upper_bound, -1, -1)
        ]
        num_gen = ParamInjector.generate_num_sequence(upper_bound, 0, -1,
                                                      False)
        self.assertEqual(expected_output, list(num_gen))
def test_list_generator(self):
    """
    Checks the predefined list generator against comma-separated inputs
    """

    list_cases = [
        # Simple case
        ("a,b,c,d,e", ["a", "b", "c", "d", "e"]),
        # Another simple case
        ("alice,bob,charlie,david,emily",
         ["alice", "bob", "charlie", "david", "emily"]),
        # Leading and trailing spaces are ignored
        ("a , b ,c, d, e", ["a", "b", "c", "d", "e"]),
        # No elements: a single empty string is expected
        ("", [""]),
        # Single element
        ("singletest", ["singletest"]),
    ]

    for list_str, expected_output in list_cases:
        list_gen = ParamInjector.generate_list(list_str)
        self.assertEqual(expected_output, list(list_gen))
def test_process_code_error_verif_index(self):
    """
    Checks that ValueError is raised when a verification function is given
    to the format generator without the index where its result should go
    """

    template = "{:07d}{:02d}{:04d}{:03d}{:04d}"
    limits = [(0, 5), (2018, 2019), [402], [0, 9999]]
    proc_gen = ParamInjector.generate_format(template, limits,
                                             self.verif_code)

    # asking for a single process code must fail, since the verification
    # digit index was not supplied
    with self.assertRaises(ValueError):
        next(proc_gen)
def test_daterange_generator_limits(self):
    """
    Checks the date range generator with start/end dates placed where an
    off-by-one error could add or drop an entry
    """

    # months from 01/01/1996 to 30/04/1996, in yyyy/mm format:
    # January to April must be present, May must not
    first = datetime.date(1996, 1, 1)
    last = datetime.date(1996, 4, 30)
    expected_output = [f"1996/{month:02d}" for month in range(1, 5)]
    date_gen = ParamInjector.generate_daterange("%Y/%m", first, last, "M")
    self.assertEqual(expected_output, list(date_gen))

    # years from 01/01/1996 to 31/12/2004, in yy format:
    # 1996 to 2004 must be present, 2005 must not
    first = datetime.date(1996, 1, 1)
    last = datetime.date(2004, 12, 31)
    expected_output = [str(year)[-2:] for year in range(1996, 2005)]
    date_gen = ParamInjector.generate_daterange("%y", first, last)
    self.assertEqual(expected_output, list(date_gen))
# Template handed to ParamInjector.generate_format for "process_code"
# parameters
_PROCESS_CODE_FORMAT = '{:07d}-{:02d}.{:04d}.{}.{:02d}.{:04d}'


def _build_parameter_generator(param, probe=None, entries_list=None,
                               cons_misses=None, apply_filter=False):
    """
    Creates the ParamInjector generator described by a single parameter
    configuration, optionally shrinking its upper limit beforehand

    :param param:        dict with the parameter configuration; the keys
                         read depend on the value of 'parameter_type'
    :param probe:        object forwarded to the RangeInference filters
                         (only used when apply_filter is True)
    :param entries_list: values of the other parameters, forwarded to the
                         RangeInference filters (only used when
                         apply_filter is True)
    :param cons_misses:  forwarded to the RangeInference filters as the
                         cons_misses keyword (only used when apply_filter
                         is True)
    :param apply_filter: whether the end of the range should be replaced
                         by the value returned by RangeInference

    :returns: the generator created by ParamInjector

    :raises ValueError: if the parameter type is not recognized
    """

    param_type = param['parameter_type']

    if param_type == "process_code":
        first_year = int(param['first_year_proc_param'])
        last_year = int(param['last_year_proc_param'])
        segment_ids = param['segment_ids_proc_param'].split(",")
        court_ids = param['court_ids_proc_param'].split(",")
        origin_ids = param['origin_ids_proc_param'].split(",")

        # turn string lists into integers
        segment_ids = list(map(int, segment_ids))
        court_ids = list(map(int, court_ids))
        origin_ids = list(map(int, origin_ids))

        max_seq = 9999999
        if apply_filter:
            # Filter the process_code range
            max_seq = RangeInference.filter_process_code(
                first_year, last_year, segment_ids, court_ids, origin_ids,
                probe, entries_list, cons_misses=cons_misses)

        subparam_list = [
            # sequential identifier
            (0, max_seq),
            # year
            (first_year, last_year),
            # segment identifiers
            segment_ids,
            # court identifiers
            court_ids,
            # origin identifiers
            origin_ids
        ]

        return ParamInjector.generate_format(
            code_format=_PROCESS_CODE_FORMAT,
            param_limits=subparam_list,
            verif=ParamInjector.process_code_verification,
            verif_index=1)

    if param_type == "number_seq":
        begin = int(param['first_num_param'])
        end = int(param['last_num_param'])

        if apply_filter:
            # Filter the number range
            end = RangeInference.filter_numeric_range(
                begin, end, probe, entries_list, cons_misses=cons_misses)

        return ParamInjector.generate_num_sequence(
            first=begin,
            last=end,
            step=int(param['step_num_param']),
            leading=param['leading_num_param'],
        )

    if param_type == 'date_seq':
        begin = datetime.date.fromisoformat(param['start_date_date_param'])
        end = datetime.date.fromisoformat(param['end_date_date_param'])
        frequency = param['frequency_date_param']
        date_format = param['date_format_date_param']

        if apply_filter:
            # Filter the date range
            end = RangeInference.filter_daterange(
                begin, end, probe, frequency, date_format, entries_list,
                cons_misses=cons_misses)

        return ParamInjector.generate_daterange(
            date_format=date_format,
            start_date=begin,
            end_date=end,
            frequency=frequency,
        )

    if param_type == 'alpha_seq':
        # We don't do anything different here when filtering, since
        # alphabetic sequences can't be filtered
        length = int(param['length_alpha_param'])
        num_words = int(param['num_words_alpha_param'])
        no_upper = param['no_upper_alpha_param']

        return ParamInjector.generate_alpha(length=length,
                                            num_words=num_words,
                                            no_upper=no_upper)

    if param_type == 'value_list':
        # No filtering applied to this parameter
        return ParamInjector.generate_list(
            elements=param['value_list_param'])

    if param_type == 'const_value':
        # No filtering applied to this parameter
        return ParamInjector.generate_constant(
            value=param['value_const_param'])

    raise ValueError(f"Invalid parameter type: {param_type}")


def create_parameter_generators(probe, parameter_handlers, filter_limits=True):
    """
    Loads the parameter information and creates a list of the respective
    generators from the ParamInjector module, while filtering the ranges as
    necessary

    The work is done in two passes: the first one creates an unfiltered
    generator for every parameter and records the first value each of them
    produces; the second one (skipped when filter_limits is False) uses
    those first values to re-create, with a filtered range, the generators
    of the parameters whose 'filter_range' entry is truthy

    :param probe:              object forwarded to the RangeInference
                               filters
    :param parameter_handlers: sequence of dicts, one per parameter, each
                               with a 'parameter_type' entry plus the
                               entries required by that type; it is
                               iterated once per pass
    :param filter_limits:      whether the filtering pass should run

    :returns: list with one generator per entry of parameter_handlers, in
              the same order

    :raises ValueError: if a parameter has an unrecognized type
    """

    url_injectors = []
    initial_values = []

    # First pass: build every generator without filtering and record the
    # first value each one yields
    for param in parameter_handlers:
        param_gen = _build_parameter_generator(param)

        # Create a copy of the generator, to extract the first value
        # without consuming the one stored in the result list
        param_gen, param_gen_first = itertools.tee(param_gen)
        initial_values.append(next(param_gen_first))
        url_injectors.append(param_gen)

    if not filter_limits:
        # Don't filter limits unless required
        return url_injectors

    # Second pass: re-create the generators which should be filtered
    for param_index, param in enumerate(parameter_handlers):
        if not param['filter_range']:
            # This parameter should not be filtered
            continue

        # Configure the list of extra parameters for the range inference:
        # the initial value of every other parameter, with None in the
        # position of the parameter being filtered
        entries_list = initial_values.copy()
        entries_list[param_index] = None
        cons_misses = int(param['cons_misses'])

        # Replace the unfiltered generator created in the first pass
        url_injectors[param_index] = _build_parameter_generator(
            param, probe, entries_list, cons_misses, apply_filter=True)

    return url_injectors
def preprocess(entry):
    """
    Formats a candidate entry together with the initial values of the
    other parameters

    A copy of initial_values receives entry at position index and is then
    handed, as a tuple, to ParamInjector.format_params. The names
    initial_values, index, code_format, verif and verif_index are not
    defined here; presumably they come from the enclosing scope, which is
    not visible in this section.
    """
    candidate_values = initial_values.copy()
    candidate_values[index] = entry
    formatted = ParamInjector.format_params(code_format,
                                            tuple(candidate_values),
                                            verif, verif_index)
    return formatted
def verif(seq):
    """
    Returns the result of ParamInjector.process_code_verification for the
    given sequential value

    The names year, segment, court and origin are not defined here;
    presumably they come from the enclosing scope, which is not visible in
    this section.
    """
    code_fields = (seq, year, segment, court, origin)
    return ParamInjector.process_code_verification(*code_fields)