def test_rule_names(self):
    """Compound rules should render their components' names into a
    readable, correctly punctuated presentation string."""
    frag_a = RegexRule("A", name="Fragment A")
    frag_b = RegexRule("B", name="Fragment B")
    frag_c = OrRule(
            RegexRule("C1", name="Fragment C1"),
            RegexRule("C2", name="Fragment C2"),
            name="Fragment C")

    # Two components: plain "X and Y"
    conjunction = AndRule(frag_a, frag_b)
    self.assertEqual(
            conjunction.presentation,
            "(Fragment A and Fragment B)")

    # Three components: Oxford comma before the final "or"
    disjunction = OrRule(frag_a, frag_b, frag_c)
    self.assertEqual(
            disjunction.presentation,
            "(Fragment A, Fragment B, or Fragment C)")
def test_sensitivity_matches(self):
    """run_rule should report each match at the sensitivity of the most
    sensitive sub-rule that actually matched the input."""
    rule = AndRule(
            RegexRule("bad thing"),
            OrRule(
                    RegexRule("very bad", sensitivity=Sensitivity.CRITICAL),
                    RegexRule("moderately bad",
                              sensitivity=Sensitivity.PROBLEM),
                    RegexRule("slightly bad",
                              sensitivity=Sensitivity.WARNING),
                    FallbackRule(sensitivity=Sensitivity.INFORMATION)))
    expected = [
        ("very bad thing", Sensitivity.CRITICAL),
        ("moderately bad thing", Sensitivity.PROBLEM),
        ("moderately bad very bad thing", Sensitivity.CRITICAL),
        ("slightly moderately bad thing", Sensitivity.PROBLEM),
        ("moderately slightly bad thing", Sensitivity.WARNING),
        ("bad thing", Sensitivity.INFORMATION),
        ("moderately quite bad thing", Sensitivity.INFORMATION),
    ]

    for in_v, sensitivity in expected:
        # subTest so one failing input doesn't mask the rest
        with self.subTest(in_v):
            matched, results = run_rule(rule, in_v)
            self.assertTrue(matched)
            # The highest sensitivity among the sub-rules that matched.
            # (sub_rule deliberately does not shadow the outer `rule`,
            # which the original comprehension did.)
            highest = max(
                    (sub_rule.sensitivity
                     for sub_rule, matches in results.items()
                     if sub_rule.sensitivity is not None and matches),
                    key=lambda s: s.value)
            self.assertEqual(sensitivity, highest)
from os2datascanner.engine2.pipeline import (explorer, processor, matcher, tagger, exporter) here_path = os.path.dirname(__file__) test_data_path = os.path.join(here_path, "..", "data") data = """Hwæt! wē Gār-Dena in gēar-dagum þēod-cyninga þrym gefrūnon, hū ðā æþeling as ell en fremedon. Oft Scyld Scēfing sceaþena þrēatum, monegum mǣgþum meodo-setla oftēah.""" data_url = "data:text/plain;base64,{0}".format( base64.encodebytes(data.encode("utf-8")).decode("ascii")) rule = OrRule( RegexRule("Æthelred the Unready", name="Check for ill-advised kings"), RegexRule("Scyld S(.*)g", sensitivity=Sensitivity.CRITICAL), RegexRule("Professor James Moriarty")) expected_matches = [{ "rule": { "type": "regex", "name": "Check for ill-advised kings", "sensitivity": None, "expression": "Æthelred the Unready" }, "matches": None }, { "rule": { "type": "regex", "name": None, "sensitivity": Sensitivity.CRITICAL.value,
def run(self, user=None):
    """Schedules a scan to be run by the pipeline. Returns the scan tag of
    the resulting scan on success.

    An exception will be raised if the underlying source is not available,
    and a pika.exceptions.AMQPError (or a subclass) will be raised if it
    was not possible to communicate with the pipeline."""
    # Truncate to whole seconds so the tag timestamp round-trips cleanly
    # through ISO 8601 serialisation
    now = datetime.datetime.now(tz=tz.gettz()).replace(microsecond=0)

    # Create a new engine2 scan specification
    rule = OrRule.make(*[
            r.make_engine2_rule()
            for r in self.rules.all().select_subclasses()])

    configuration = {}

    prerules = []
    if self.do_last_modified_check:
        # Only rescan objects changed since the last run (if there was one)
        last = self.e2_last_run_at
        if last:
            prerules.append(LastModifiedRule(last))

    if self.do_ocr:
        # If we are doing OCR, then filter out any images smaller than
        # 128x32 (or 32x128)...
        cr = make_if(
                HasConversionRule(OutputType.ImageDimensions),
                DimensionsRule(
                        width_range=range(32, 16385),
                        height_range=range(32, 16385),
                        min_dim=128),
                True)
        prerules.append(cr)
    else:
        # ... and, if we're not, then skip all of the image files
        configuration["skip_mime_types"] = ["image/*"]

    rule = AndRule.make(*prerules, rule)

    # The scan tag identifies this scan run throughout the pipeline
    scan_tag = {
        'time': now.isoformat(),
        'user': user.username if user else None,
        'scanner': {
            'pk': self.pk,
            'name': self.name
        },
        'organisation': {
            'name': self.organization.name,
            'uuid': str(self.organization.uuid)
        },
        'destination': 'pipeline_collector'
    }

    # Build ScanSpecMessages for all Sources
    message_template = messages.ScanSpecMessage(
            scan_tag=scan_tag, rule=rule, configuration=configuration,
            source=None, progress=None)
    outbox = []
    source_count = 0
    for source in self.generate_sources():
        outbox.append((
                settings.AMQP_PIPELINE_TARGET,
                message_template._replace(source=source)))
        source_count += 1

    # Also build ConversionMessages for the objects that we should try to
    # scan again (our pipeline_collector is responsible for eventually
    # deleting these reminders)
    message_template = messages.ConversionMessage(
            scan_spec=message_template,
            handle=None,
            progress=messages.ProgressFragment(
                    rule=None,
                    matches=[]))
    for reminder in self.checkups.all():
        ib = reminder.interested_before
        # Only re-examine the reminder if it has changed since we last
        # looked at it (when we know when that was)
        rule_here = AndRule.make(
                LastModifiedRule(ib) if ib else True,
                rule)
        outbox.append((
                settings.AMQP_CONVERSION_TARGET,
                message_template._deep_replace(
                        scan_spec__source=reminder.handle.source,
                        handle=reminder.handle,
                        progress__rule=rule_here)))

    # NOTE(review): state is persisted before publishing; if publishing
    # fails below, e2_last_run_at has already advanced — confirm this is
    # the intended trade-off
    self.e2_last_run_at = now
    self.save()

    # OK, we're committed now! Create a model object to track the status of
    # this scan...
    ScanStatus.objects.create(
            scanner=self, scan_tag=scan_tag,
            total_sources=source_count,
            total_objects=self.checkups.count())

    # ... and dispatch the scan specifications to the pipeline
    with PikaPipelineSender(write={queue for queue, _ in outbox}) as pps:
        for queue, message in outbox:
            pps.publish_message(queue, message.to_json_object())

    return scan_tag
def run(self, type, blocking=False, user=None):
    """Run a scan with the Scanner.

    Returns the scan tag (an ISO 8601 timestamp string) if the scan was
    submitted to the pipeline, or an error description string if the
    underlying source was not accessible.

    (`type` and `blocking` are accepted for interface compatibility but
    are not used here.)"""
    local_tz = tz.gettz()
    now = datetime.datetime.now().replace(microsecond=0)

    # Check that this source is accessible, and return the resulting error
    # if it isn't: pulling one handle out of the generator is enough to
    # force the source to be opened
    source = self.make_engine2_source()
    with SourceManager() as sm, closing(source.handles(sm)) as handles:
        try:
            next(handles, True)
        except ResourceUnavailableError as ex:
            return ", ".join(str(a) for a in ex.args[1:])

    # Create a new engine2 scan specification and submit it to the
    # pipeline
    rule = OrRule.make(*[
            r.make_engine2_rule()
            for r in self.rules.all().select_subclasses()])

    prerules = []
    if self.do_last_modified_check:
        # Make sure that the timestamp we give to LastModifiedRule is
        # timezone-aware; engine2's serialisation code requires this
        # for all datetime.datetimes, so LastModifiedRule will raise a
        # ValueError if we try to give it a naive one
        last = self.e2_last_run_at
        if last:
            if not last.tzinfo or last.tzinfo.utcoffset(last) is None:
                last = last.replace(tzinfo=local_tz)
            prerules.append(LastModifiedRule(last))

    if self.do_ocr:
        # Skip OCR for images too small to contain readable text
        cr = make_if(
                HasConversionRule(OutputType.ImageDimensions),
                DimensionsRule(
                        width_range=range(32, 16385),
                        height_range=range(32, 16385),
                        min_dim=128),
                True)
        prerules.append(cr)
    rule = AndRule.make(*prerules, rule)

    message = {
        'scan_tag': now.isoformat(),
        'source': source.to_json_object(),
        'rule': rule.to_json_object()
    }
    queue_name = settings.AMQP_PIPELINE_TARGET

    self.e2_last_run_at = now
    self.save()

    scan = now.isoformat()

    # Serialise once and reuse (the original called json.dumps twice and
    # also printed the payload to stdout as a debug leftover)
    payload = json.dumps(message)

    amqp_connection_manager.start_amqp(queue_name)
    amqp_connection_manager.send_message(queue_name, payload)
    amqp_connection_manager.close_connection()

    return scan
class RuleTests(unittest.TestCase):
    """Tests for engine2's simple and compound rule behaviour:
    matching, lazy evaluation order, JSON round-tripping, and
    presentation strings."""

    def test_simplerule_matches(self):
        """Each simple rule finds exactly the expected matches in its
        input (or rejects it), and survives a JSON round trip."""
        candidates = [
            (CPRRule(modulus_11=False, ignore_irrelevant=False),
             """
2205995008: forbryder,
230500 0003: forbryder,
240501-0006: forbryder,
250501-1987: forbryder""",
             ["2205XXXXXX", "2305XXXXXX", "2405XXXXXX", "2505XXXXXX"]),
            (CPRRule(modulus_11=True, ignore_irrelevant=True),
             """
2205995008: forbryder,
230500 0003: forbryder,
240501-0006: forbryder,
250501-1987: forbryder""",
             ["2205XXXXXX", "2305XXXXXX", "2405XXXXXX"]),
            (CPRRule(modulus_11=True, ignore_irrelevant=True,
                     examine_context=False),
             """
Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nr. 2205995008
Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nummer: 2305000003
240501-0006""",
             ["2205XXXXXX", "2305XXXXXX", "2405XXXXXX"]),
            (CPRRule(modulus_11=True, ignore_irrelevant=True,
                     examine_context=True),
             """
Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nr. 2205995008
Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nummer: 2305000003
240501-0006""",
             ["2405XXXXXX"]),
            (RegexRule("((four|six)( [aopt]+)?|(one|seven) [aopt]+)"),
             """
one
one potato
two potato
three potato
four
five potato
six potato
seven potato
more!""",
             ["one potato", "four", "six potato", "seven potato"]),
            (LastModifiedRule(
                    datetime(2019, 12, 24, 23, 59, 59,
                             tzinfo=timezone.utc)),
             datetime(2019, 12, 31, 23, 59, 59, tzinfo=timezone.utc),
             ["2019-12-31T23:59:59+0000"]),
            (LastModifiedRule(
                    datetime(2019, 12, 24, 23, 59, 59,
                             tzinfo=timezone.utc)),
             datetime(2019, 5, 22, 0, 0, 1, tzinfo=timezone.utc),
             None),
            (DimensionsRule(
                    width_range=range(0, 16385),
                    height_range=range(0, 16385),
                    min_dim=256),
             (128, 256),
             [[128, 256]]),
            (DimensionsRule(
                    width_range=range(0, 16385),
                    height_range=range(0, 16385),
                    min_dim=256),
             (128, 255),
             []),
            (DimensionsRule(
                    width_range=range(256, 1024),
                    height_range=range(256, 1024),
                    min_dim=0),
             (256, 256),
             [[256, 256]]),
            (DimensionsRule(
                    width_range=range(256, 1024),
                    height_range=range(256, 1024),
                    min_dim=0),
             (32, 32),
             []),
        ]

        for rule, in_value, expected in candidates:
            with self.subTest(rule):
                # Serialisation must be lossless
                # (`representation` rather than `json`, which would
                # shadow the stdlib module name)
                representation = rule.to_json_object()
                back_again = rule.from_json_object(representation)
                self.assertEqual(rule, back_again)

            with self.subTest(rule):
                matches = rule.match(in_value)
                if expected:
                    self.assertEqual(
                            [match["match"] for match in matches],
                            expected)
                else:
                    self.assertFalse(list(matches))

    # (rule, [(input, expected outcome, expected evaluation count)]) —
    # the counts pin down lazy short-circuit evaluation behaviour
    compound_candidates = [
        (AndRule(
                RegexRule("A"),
                OrRule(RegexRule("B"), RegexRule("C"))),
         [("A", False, 3), ("AB", True, 2), ("ABC", True, 2),
          ("BC", False, 1), ("AC", True, 3)]),
        (NotRule(
                AndRule(
                        RegexRule("A"),
                        OrRule(RegexRule("B"), RegexRule("C")))),
         [("A", True, 3), ("AB", False, 2), ("ABC", False, 2),
          ("BC", True, 1), ("AC", False, 3)]),
        (AndRule(
                NotRule(OrRule(RegexRule("B"), RegexRule("C"))),
                RegexRule("A")),
         [("A", True, 3), ("AB", False, 1), ("ABC", False, 1),
          ("BC", False, 1), ("AC", False, 2)]),
    ]

    def test_compound_rule_matches(self):
        """Stepping through a compound rule with Rule.split produces the
        expected verdict in the expected number of evaluations.
        (Debug print statements from the original have been removed.)"""
        for rule, tests in RuleTests.compound_candidates:
            for input_string, outcome, evaluation_count in tests:
                now = rule
                evaluations = 0
                while True:
                    # split() yields the next simple rule to evaluate
                    # plus the continuations for both verdicts
                    head, pve, nve = now.split()
                    evaluations += 1
                    match = list(head.match(input_string))
                    now = pve if match else nve
                    # A bool means the compound rule is fully decided
                    if isinstance(now, bool):
                        break

                self.assertEqual(
                        outcome, now,
                        "{0}: wrong result".format(input_string))
                self.assertEqual(
                        evaluation_count, evaluations,
                        "{0}: wrong evaluation count".format(input_string))

    def test_json_round_trip(self):
        """Compound rules must also survive a JSON round trip."""
        for rule, _ in RuleTests.compound_candidates:
            with self.subTest(rule):
                representation = rule.to_json_object()
                back_again = rule.from_json_object(representation)
                self.assertEqual(rule, back_again)

    def test_oxford_comma(self):
        """oxford_comma joins one, two, and three-plus items correctly."""
        self.assertEqual(
                oxford_comma(["Monday"], "and"),
                "Monday")
        self.assertEqual(
                oxford_comma(["Monday", "Tuesday"], "and"),
                "Monday and Tuesday")
        self.assertEqual(
                oxford_comma(["Monday", "Tuesday", "Wednesday"], "and"),
                "Monday, Tuesday, and Wednesday")

    def test_rule_names(self):
        """Compound rules render their components' names into a readable
        presentation string."""
        A = RegexRule("A", name="Fragment A")
        B = RegexRule("B", name="Fragment B")
        C1 = RegexRule("C1", name="Fragment C1")
        C2 = RegexRule("C2", name="Fragment C2")
        C = OrRule(C1, C2, name="Fragment C")
        self.assertEqual(
                AndRule(A, B).presentation,
                "(Fragment A and Fragment B)")
        self.assertEqual(
                OrRule(A, B, C).presentation,
                "(Fragment A, Fragment B, or Fragment C)")