def test_ocr_skip(self):
    """Run the pipeline over the "ocr/good" test data with images skipped.

    A scan specification whose configuration sets ``skip_mime_types`` to
    ``["image/*"]`` is queued on "os2ds_scan_specs" and the pipeline is
    run. Every unhandled message must then be on the "os2ds_results"
    queue and must have a falsy "matched" field; a message on any other
    queue fails the test.
    """
    source_json = FilesystemSource(
        os.path.join(test_data_path, "ocr", "good")).to_json_object()
    rule_json = CPRRule(
        modulus_11=False, ignore_irrelevant=False).to_json_object()

    scan_spec = {
        "scan_tag": {
            "scanner": {
                "name": "integration_test",
                "pk": 0,
            },
            "time": "2020-01-01T00:00:00+00:00",
        },
        "source": source_json,
        "rule": rule_json,
        "configuration": {
            "skip_mime_types": ["image/*"],
        },
    }
    self.messages.append((scan_spec, "os2ds_scan_specs"))
    self.run_pipeline()

    for message, queue in self.unhandled:
        # self.fail() raises, so nothing below runs for a stray queue.
        if queue != "os2ds_results":
            self.fail("unexpected message in queue {0}".format(queue))
        self.assertFalse(
            message["matched"], "OCR match found with OCR disabled")
def test_corrupted_container(self):
    """Run the pipeline over the "pdf/corrupted" test data directory.

    A scan specification with an empty configuration is queued on
    "os2ds_scan_specs" and the pipeline is run. Exactly one unhandled
    message is expected, and the "origin" field of that message must be
    "os2ds_problems".
    """
    source_json = FilesystemSource(
        os.path.join(test_data_path, "pdf", "corrupted")).to_json_object()
    rule_json = CPRRule(
        modulus_11=False, ignore_irrelevant=False).to_json_object()

    scan_spec = {
        "scan_tag": "integration_test",
        "source": source_json,
        "rule": rule_json,
        "configuration": {},
    }
    self.messages.append((scan_spec, "os2ds_scan_specs"))
    self.run_pipeline()

    # Report the queue contents through the assertion message rather than
    # printing them unconditionally: the detail is still available when
    # the check fails, and passing runs stay quiet.
    self.assertEqual(
        len(self.unhandled), 1,
        "expected exactly one unhandled message, got: {0!r}".format(
            self.unhandled))
    self.assertEqual(self.unhandled[0][0]["origin"], "os2ds_problems")
if reload_content: h = FilesystemHandle.make_handle(fpath) content = get_content_from_handle(h) # newrule = CPRRule(modulus_11=True, ignore_irrelevant=False, # examine_context=True) # newrule.extract_surrounding_words = MethodType(extract_surrounding_words_fixed, newrule) rules = [ (CPRSimple(modulus_11=True, ignore_irrelevant=False, examine_context=True), "simple w. context"), (CPRComplicated(modulus_11=True, ignore_irrelevant=False, examine_context=True), "'accepted' w. context"), (CPRRule(modulus_11=True, ignore_irrelevant=False, examine_context=True), "current w. context"), # (CPRRule(modulus_11=True, ignore_irrelevant=False, examine_context=True), # "current w. context"), (CPROld(modulus_11=True, ignore_irrelevant=False, examine_context=False), "old wo. context"), (CPROld(modulus_11=True, ignore_irrelevant=False, examine_context=True), "old w. context"), # (newrule, "new w. context"), ] for rule, description in rules: print(description) @timing def f(rule):
def setUp(self):
    """Store a ``CPRRule`` on ``self.rule`` for the tests to use.

    The rule is built with ``modulus_11`` and ``ignore_irrelevant`` both
    set to ``False``.
    """
    rule_options = {
        "modulus_11": False,
        "ignore_irrelevant": False,
    }
    self.rule = CPRRule(**rule_options)
"""@timing decorator """ @wraps(func) def wrap(*args, **kw): ts = time() * 1000 result = func(*args, **kw) te = time() * 1000 print('func:{!r}, took: {:.4f} ms'.format(func.__name__, te - ts)) return result return cast(F, wrap) rules = [ (CPRRule(modulus_11=True, ignore_irrelevant=False, examine_context=False, blacklist=None), matches, "match all"), (CPRRule(modulus_11=True, ignore_irrelevant=False, examine_context=True, blacklist=None), [matches[i] for i in [0, 1, 2, 3, 5, 6, 19]], "match using context rules"), (CPRRule(modulus_11=True, ignore_irrelevant=False, examine_context=True, blacklist=None, whitelist=None), [matches[i] for i in [0, 2, 3, 5, 6, 19]], "match setting `whitelist=None`"), (CPRRule(modulus_11=True, ignore_irrelevant=False, examine_context=True,
def test_simplerule_matches(self):
    """Check JSON round-tripping and matching for a table of simple rules.

    Each candidate is a ``(rule, input, expected)`` triple. For every
    candidate two sub-tests run:

    * the rule is converted with ``to_json_object()``, rebuilt with
      ``from_json_object()``, and the result must equal the original;
    * ``rule.match(input)`` is evaluated. When ``expected`` is truthy the
      "match" field of each result must equal ``expected`` in order;
      otherwise (``None`` or an empty list) no results may be produced.
    """
    # Inputs shared by several candidates.
    criminals_text = (
        " 2205995008: forbryder, 230500 0003: forbryder,"
        " 240501-0006: forbryder, 250501-1987: forbryder"
    )
    p_number_text = (
        " Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nr. 2205995008"
        " Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nummer: 2305000003"
        " 240501-0006"
    )
    potato_text = (
        " one one potato two potato three potato four"
        " five potato six potato seven potato more!"
    )
    cutoff = datetime(2019, 12, 24, 23, 59, 59, tzinfo=timezone.utc)

    candidates = [
        (
            CPRRule(modulus_11=False, ignore_irrelevant=False),
            criminals_text,
            ["2205XXXXXX", "2305XXXXXX", "2405XXXXXX", "2505XXXXXX"],
        ),
        (
            CPRRule(modulus_11=True, ignore_irrelevant=True),
            criminals_text,
            ["2205XXXXXX", "2305XXXXXX", "2405XXXXXX"],
        ),
        (
            CPRRule(modulus_11=True, ignore_irrelevant=True,
                    examine_context=False),
            p_number_text,
            ["2205XXXXXX", "2305XXXXXX", "2405XXXXXX"],
        ),
        (
            CPRRule(modulus_11=True, ignore_irrelevant=True,
                    examine_context=True),
            p_number_text,
            ["2405XXXXXX"],
        ),
        (
            RegexRule("((four|six)( [aopt]+)?|(one|seven) [aopt]+)"),
            potato_text,
            ["one potato", "four", "six potato", "seven potato"],
        ),
        (
            LastModifiedRule(cutoff),
            datetime(2019, 12, 31, 23, 59, 59, tzinfo=timezone.utc),
            ["2019-12-31T23:59:59+0000"],
        ),
        (
            LastModifiedRule(cutoff),
            datetime(2019, 5, 22, 0, 0, 1, tzinfo=timezone.utc),
            None,
        ),
        (
            DimensionsRule(width_range=range(0, 16385),
                           height_range=range(0, 16385),
                           min_dim=256),
            (128, 256),
            [[128, 256]],
        ),
        (
            DimensionsRule(width_range=range(0, 16385),
                           height_range=range(0, 16385),
                           min_dim=256),
            (128, 255),
            [],
        ),
        (
            DimensionsRule(width_range=range(256, 1024),
                           height_range=range(256, 1024),
                           min_dim=0),
            (256, 256),
            [[256, 256]],
        ),
        (
            DimensionsRule(width_range=range(256, 1024),
                           height_range=range(256, 1024),
                           min_dim=0),
            (32, 32),
            [],
        ),
    ]

    for rule, in_value, expected in candidates:
        # Sub-test 1: the rule survives a trip through its JSON form.
        with self.subTest(rule):
            serialised = rule.to_json_object()
            restored = rule.from_json_object(serialised)
            self.assertEqual(rule, restored)

        # Sub-test 2: matching the input yields the expected results.
        with self.subTest(rule):
            found = rule.match(in_value)
            if not expected:
                # None and [] both mean "no results at all".
                self.assertFalse(list(found))
            else:
                self.assertEqual(
                    [hit["match"] for hit in found], expected)