def test_numeric_corner_cases(self):
    """
    Checks how the numeric range inference behaves in some corner cases
    """

    # Empty range (no entries in the original search space): nothing can
    # be inferred, so the result must be None
    no_hits = RangeInferenceTest.dummy_hit_check(0, -1)
    self.assertIsNone(
        RangeInference.filter_numeric_range(0, 100, no_hits, 10))

    # Each case is (first hit, last hit, expected last entry); all of
    # them are searched over [0, 123] with the same last argument (10)
    corner_cases = [
        # Entire search space is filled
        (0, 123, 123),
        # Only one entry at the beginning
        (0, 0, 0),
        # Only one entry near the beginning
        (5, 5, 5),
    ]
    for hit_first, hit_last, expected in corner_cases:
        checker = RangeInferenceTest.dummy_hit_check(hit_first, hit_last)
        inferred = RangeInference.filter_numeric_range(0, 123, checker, 10)
        self.assertEqual(inferred, expected)
def test_formatted_code_corner_cases(self):
    """
    Checks how the formatted code inference behaves in some corner cases
    """
    lower, upper = 1, 100

    def infer_last_values(probe):
        # Both sub-parameters share the same limits and both are
        # filtered; the argument lists are rebuilt for every call
        filtered = RangeInference.filter_formatted_code(
            "{:02}-{:02}",
            [(lower, upper), (lower, upper)],
            [True, True],
            probe)
        # Keep only the last element of each resulting entry
        return [limits[-1] for limits in filtered]

    # Entire search space is filled
    full_probe = RangeInferenceTest.dummy_code_probe("-", [upper, upper])
    self.assertEqual(infer_last_values(full_probe), [upper, upper])

    # Only one entry at the beginning
    single_probe = RangeInferenceTest.dummy_code_probe("-", [1, 1])
    self.assertEqual(infer_last_values(single_probe), [1, 1])
def test_numeric_range_inference(self):
    """
    Exercises the inference on simple numeric ranges

    NOTE(review): another method named test_numeric_range_inference is
    defined in this file; if both live in the same class, the later
    definition replaces this one - confirm which is meant to run.
    """

    def infer(checker, cons_misses):
        # Every scenario searches the same [0, 200] space
        return RangeInference.filter_numeric_range(0, 200, checker,
                                                   cons_misses)

    # Last entry at position number 50
    upto_50 = RangeInferenceTest.dummy_hit_check(0, 50)
    self.assertEqual(infer(upto_50, 10), 50)

    # Interval beginning at 25 and ending at 48
    in_25_48 = RangeInferenceTest.dummy_hit_check(25, 48)
    self.assertEqual(infer(in_25_48, 10), 48)

    # Case where the module is unable to work: there is a gap in the
    # sequence, and it is larger than the cons_misses parameter
    in_90_95 = RangeInferenceTest.dummy_hit_check(90, 95)

    def in_either_interval(value):
        # Hit when the value belongs to any of the two intervals
        return (in_25_48(value) or in_90_95(value))

    self.assertEqual(infer(in_either_interval, 10), 48)

    # Same case as above, but with a higher value for cons_misses, in
    # which case the module finds the correct answer
    self.assertEqual(infer(in_either_interval, 50), 95)
def test_numeric_range_inference(self):
    """
    Exercises the inference on simple numeric ranges
    """

    # Last entry at position number 50
    probe = RangeInferenceTest.dummy_entry_probe(0, 50)
    inferred = RangeInference.filter_numeric_range(0, 200, probe,
                                                   cons_misses=10)
    self.assertEqual(inferred, 50)

    # Interval beginning at 25 and ending at 48
    probe = RangeInferenceTest.dummy_entry_probe(25, 48)
    inferred = RangeInference.filter_numeric_range(0, 200, probe,
                                                   cons_misses=10)
    self.assertEqual(inferred, 48)

    # Simple case similar to the first, but with extra parameters
    probe = RangeInferenceTest.dummy_entry_probe(0, 50)
    inferred = RangeInference.filter_numeric_range(0, 200, probe,
                                                   [None, 2, 'test'],
                                                   cons_misses=10)
    self.assertEqual(inferred, 50)

    # Case where the module is unable to work: there is a gap in the
    # sequence, and it is larger than the cons_misses parameter
    def in_either_interval(entry):
        # The probed number is the first element of the entry; accept it
        # when it falls in [25, 48] or in [90, 95]
        return (25 <= entry[0] <= 48) or (90 <= entry[0] <= 95)

    # Create a specific mock of EntryProbing for this
    probe = mock.Mock(spec=EntryProbing, check_entry=in_either_interval)
    inferred = RangeInference.filter_numeric_range(0, 200, probe,
                                                   cons_misses=10)
    self.assertEqual(inferred, 48)

    # Same case as above, but with a higher value for cons_misses, in
    # which case the module finds the correct answer
    inferred = RangeInference.filter_numeric_range(0, 200, probe,
                                                   cons_misses=50)
    self.assertEqual(inferred, 95)
def test_date_corner_cases(self):
    """
    Checks how the date range inference behaves in some corner cases
    """
    search_begin = date(2000, 1, 1)
    search_end = date(2020, 1, 1)

    # Empty range (no entries in the original search space): the probed
    # interval ends (2009) before it begins (2010)
    probe = RangeInferenceTest.dummy_entry_probe(date(2010, 1, 1),
                                                 date(2009, 1, 1))
    # Yearly, monthly and daily resolutions, in this order
    for frequency in ('Y', 'M', 'D'):
        inferred = RangeInference.filter_daterange(search_begin, search_end,
                                                   probe, frequency, 10)
        self.assertIsNone(inferred)

    # Entire search space is filled
    probe = RangeInferenceTest.dummy_entry_probe(search_begin, search_end)

    # Yearly: only the year is compared
    inferred = RangeInference.filter_daterange(search_begin, search_end,
                                               probe, 'Y', 10)
    self.assertEqual(inferred.year, search_end.year)

    # Monthly: year and month are compared
    inferred = RangeInference.filter_daterange(search_begin, search_end,
                                               probe, 'M', 10)
    self.assertEqual(inferred.year, search_end.year)
    self.assertEqual(inferred.month, search_end.month)

    # Daily: the full date is compared
    inferred = RangeInference.filter_daterange(search_begin, search_end,
                                               probe, 'D', 10)
    self.assertEqual(inferred, search_end)

    # Only one entry at the beginning
    probe = RangeInferenceTest.dummy_entry_probe(search_begin, search_begin)
    inferred = RangeInference.filter_daterange(search_begin, search_end,
                                               probe, 'D', 10)
    self.assertEqual(inferred, search_begin)

    # Only one entry near the beginning
    lone_entry = date(2000, 1, 5)
    probe = RangeInferenceTest.dummy_entry_probe(lone_entry, lone_entry)
    inferred = RangeInference.filter_daterange(search_begin, search_end,
                                               probe, 'D', 10)
    self.assertEqual(inferred, lone_entry)
def test_formatted_code_inference(self):
    """
    Exercises the inference on simple formatted codes
    """
    # Both sub-parameters are searched in [1, 100], and the probe is
    # built with 10 as the last entry of each (codes 01-01 to 10-10)
    range_begin, range_end = 1, 100
    last_entry = 10

    probe = RangeInferenceTest.dummy_code_probe("-",
                                                [last_entry, last_entry])

    # Each scenario is (which sub-parameters to filter, expected last
    # value of each sub-parameter); an unfiltered one keeps range_end
    scenarios = [
        # Filter both parameters
        ([True, True], [last_entry, last_entry]),
        # Only filter the first parameter
        ([True, False], [last_entry, range_end]),
        # Only filter the second parameter
        ([False, True], [range_end, last_entry]),
    ]

    for filter_flags, expected in scenarios:
        filtered = RangeInference.filter_formatted_code(
            "{:02}-{:02}",
            [(range_begin, range_end), (range_begin, range_end)],
            filter_flags,
            probe)
        last_values = [limits[-1] for limits in filtered]
        self.assertEqual(last_values, expected)
def test_process_code_inference(self):
    """
    Checks the inference of process codes
    """
    highest_valid = 20

    def accepts(entry):
        # The first element of the entry is the code string; its leading
        # number (the text before the first dash) decides the hit
        leading_number = int(entry[0].split("-")[0])
        return leading_number <= highest_valid

    probe = mock.Mock(spec=EntryProbing, check_entry=accepts)

    years = (2010, 2020)
    segments = [4]
    courts = [2]
    origins = [0, 9999]

    inferred = RangeInference.filter_process_code(years[0], years[1],
                                                  segments, courts, origins,
                                                  probe)
    self.assertEqual(inferred, highest_valid)
def test_daterange_inference(self):
    """
    Exercises the inference on simple date ranges
    """
    first_hit = date(2010, 1, 1)
    search_end = date(2020, 1, 1)

    def check_all_resolutions(last_hit):
        # One hit checker is shared by the three resolutions
        checker = RangeInferenceTest.dummy_hit_check(first_hit, last_hit)

        # Yearly resolution: only the year value matters
        yearly = RangeInference.filter_daterange(first_hit, search_end,
                                                 checker, 'Y', 10)
        self.assertEqual(yearly.year, last_hit.year)

        # Monthly resolution: check year and month
        monthly = RangeInference.filter_daterange(first_hit, search_end,
                                                  checker, 'M', 10)
        self.assertEqual(monthly.year, last_hit.year)
        self.assertEqual(monthly.month, last_hit.month)

        # Daily resolution: the results can be compared directly
        daily = RangeInference.filter_daterange(first_hit, search_end,
                                                checker, 'D', 10)
        self.assertEqual(daily, last_hit)

    # Last entry at day 01/01/2012
    check_all_resolutions(date(2012, 1, 1))

    # Same checks with the last entry at 31/12/2011, to catch off-by-one
    # errors
    check_all_resolutions(date(2011, 12, 31))
def create_parameter_generators(probe, parameter_handlers, filter_limits=True):
    """
    Loads the parameter information and creates a list of the respective
    generators from the ParamInjector module, while filtering the ranges as
    necessary

    Arguments:
    probe -- object handed over to the RangeInference filters when a range
             needs to be narrowed
    parameter_handlers -- sequence of dicts, one per parameter. Each one
             must have a 'parameter_type' key and the keys read for that
             type below. When filter_limits is set, 'filter_range' is read
             for every parameter, and 'cons_misses' for the ones being
             filtered. It is iterated twice, so it must be re-iterable
    filter_limits -- when false, the ranges are used as configured and no
             range inference is done

    Returns a list with one generator per entry of parameter_handlers, in
    the same order.

    Raises ValueError if a parameter has an unknown 'parameter_type'.
    """

    def build_generator(param, entries_list=None, cons_misses=None,
                        filtering=False):
        """
        Creates the ParamInjector generator for a single parameter. When
        filtering is set, the upper limit of the filterable types is first
        narrowed through RangeInference, using entries_list and cons_misses
        """
        # NOTE(review): the RangeInference filters may return None when no
        # entry is found; the value is forwarded as-is to ParamInjector -
        # confirm the generators accept None as a limit

        param_type = param['parameter_type']

        if param_type == "process_code":
            process_format = '{:07d}-{:02d}.{:04d}.{}.{:02d}.{:04d}'

            first_year = int(param['first_year_proc_param'])
            last_year = int(param['last_year_proc_param'])
            raw_segments = param['segment_ids_proc_param'].split(",")
            raw_courts = param['court_ids_proc_param'].split(",")
            raw_origins = param['origin_ids_proc_param'].split(",")

            # turn string lists into integers
            segment_ids = [int(value) for value in raw_segments]
            court_ids = [int(value) for value in raw_courts]
            origin_ids = [int(value) for value in raw_origins]

            max_seq = 9999999
            if filtering:
                # Filter the process_code range
                max_seq = RangeInference.filter_process_code(
                    first_year, last_year, segment_ids, court_ids,
                    origin_ids, probe, entries_list,
                    cons_misses=cons_misses)

            subparam_list = [
                # sequential identifier
                (0, max_seq),
                # year
                (first_year, last_year),
                # segment identifiers
                segment_ids,
                # court identifiers
                court_ids,
                # origin identifiers
                origin_ids
            ]

            return ParamInjector.generate_format(
                code_format=process_format,
                param_limits=subparam_list,
                verif=ParamInjector.process_code_verification,
                verif_index=1)

        if param_type == "number_seq":
            begin = int(param['first_num_param'])
            end = int(param['last_num_param'])

            if filtering:
                # Filter the number range
                end = RangeInference.filter_numeric_range(
                    begin, end, probe, entries_list,
                    cons_misses=cons_misses)

            return ParamInjector.generate_num_sequence(
                first=begin,
                last=end,
                step=int(param['step_num_param']),
                leading=param['leading_num_param'],
            )

        if param_type == 'date_seq':
            begin = datetime.date.fromisoformat(
                param['start_date_date_param'])
            end = datetime.date.fromisoformat(param['end_date_date_param'])
            frequency = param['frequency_date_param']
            date_format = param['date_format_date_param']

            if filtering:
                # Filter the date range
                end = RangeInference.filter_daterange(
                    begin, end, probe, frequency, date_format,
                    entries_list, cons_misses=cons_misses)

            return ParamInjector.generate_daterange(
                date_format=date_format,
                start_date=begin,
                end_date=end,
                frequency=frequency,
            )

        if param_type == 'alpha_seq':
            # We don't do anything different here when filtering, since
            # alphabetic sequences can't be filtered
            length = int(param['length_alpha_param'])
            num_words = int(param['num_words_alpha_param'])
            no_upper = param['no_upper_alpha_param']

            return ParamInjector.generate_alpha(length=length,
                                                num_words=num_words,
                                                no_upper=no_upper)

        if param_type == 'value_list':
            # No filtering applied to this parameter
            return ParamInjector.generate_list(
                elements=param['value_list_param'])

        if param_type == 'const_value':
            # No filtering applied to this parameter
            return ParamInjector.generate_constant(
                value=param['value_const_param'])

        raise ValueError(f"Invalid parameter type: {param_type}")

    url_injectors = []
    initial_values = []

    # First pass: create the unfiltered generator of every parameter and
    # record its first value, which the second pass needs
    for param in parameter_handlers:
        # Work on a copy of the generator to extract the first value
        # without consuming it from the generator that is returned
        param_gen, param_gen_first = itertools.tee(build_generator(param))
        initial_values.append(next(param_gen_first))
        url_injectors.append(param_gen)

    if not filter_limits:
        # Don't filter limits unless required
        return url_injectors

    # Second pass: rebuild, with a filtered range, the generator of each
    # parameter that asked for it
    for param_index, param in enumerate(parameter_handlers):
        if not param['filter_range']:
            # This parameter should not be filtered
            continue

        # Extra parameters for the range inference: the first value of
        # every other parameter, with None in the slot being filtered
        entries_list = initial_values.copy()
        entries_list[param_index] = None
        cons_misses = int(param['cons_misses'])

        url_injectors[param_index] = build_generator(
            param, entries_list, cons_misses, filtering=True)

    return url_injectors