def test_rule_names(self):
    """Check the presentation strings that compound rules build from the
    names of their components.

    The conjunction of two named fragments is expected to read
    "(Fragment A and Fragment B)"; a disjunction of three components, the
    last of which is itself a named OrRule, is expected to be presented as
    a comma-separated list ending in ", or Fragment C"."""
    first = RegexRule("A", name="Fragment A")
    second = RegexRule("B", name="Fragment B")
    # The third component is a compound rule carrying its own name
    third = OrRule(
        RegexRule("C1", name="Fragment C1"),
        RegexRule("C2", name="Fragment C2"),
        name="Fragment C")

    conjunction = AndRule(first, second)
    self.assertEqual(
        conjunction.presentation, "(Fragment A and Fragment B)")

    disjunction = OrRule(first, second, third)
    self.assertEqual(
        disjunction.presentation,
        "(Fragment A, Fragment B, or Fragment C)")
Пример #2
0
    def test_sensitivity_matches(self):
        """Check which sensitivity is the strongest among the matched
        components of a compound rule.

        Every input is expected to match the compound rule as a whole. The
        sensitivity compared against the expectation is the one with the
        largest value among those components in the results that both
        carry a sensitivity and produced matches."""
        compound = AndRule(
            RegexRule("bad thing"),
            OrRule(
                RegexRule("very bad", sensitivity=Sensitivity.CRITICAL),
                RegexRule("moderately bad", sensitivity=Sensitivity.PROBLEM),
                RegexRule("slightly bad", sensitivity=Sensitivity.WARNING),
                FallbackRule(sensitivity=Sensitivity.INFORMATION)))

        # (input text, sensitivity expected to win)
        cases = [
            ("very bad thing", Sensitivity.CRITICAL),
            ("moderately bad thing", Sensitivity.PROBLEM),
            ("moderately bad very bad thing", Sensitivity.CRITICAL),
            ("slightly moderately bad thing", Sensitivity.PROBLEM),
            ("moderately slightly bad thing", Sensitivity.WARNING),
            ("bad thing", Sensitivity.INFORMATION),
            ("moderately quite bad thing", Sensitivity.INFORMATION),
        ]

        for text, wanted in cases:
            matched, results = run_rule(compound, text)
            self.assertEqual(matched, True)

            # Components without a sensitivity, or without any matches, do
            # not take part in the comparison
            levels = [
                component.sensitivity
                for component, hits in results.items()
                if component.sensitivity is not None and hits
            ]
            strongest = max(levels, key=lambda level: level.value)
            self.assertEqual(wanted, strongest)
Пример #3
0
from os2datascanner.engine2.pipeline import (explorer, processor, matcher,
                                             tagger, exporter)

# Directory containing this module, and the "data" directory one level up
# from it
here_path = os.path.dirname(__file__)
test_data_path = os.path.join(here_path, os.pardir, "data")

# Text fixture that the rule below is run against
data = """Hwæt! wē Gār-Dena in gēar-dagum
þēod-cyninga þrym gefrūnon,
hū ðā æþeling as ell en fremedon.
Oft Scyld Scēfing sceaþena þrēatum,
monegum mǣgþum meodo-setla oftēah."""

# The same text, UTF-8 encoded and wrapped up as a base64 data: URL
data_url = "data:text/plain;base64," + (
    base64.encodebytes(data.encode("utf-8")).decode("ascii"))

# Three alternative expressions; only the second one ("Scyld S(.*)g") has
# anything to find in the text above
rule = OrRule(
    RegexRule("Æthelred the Unready", name="Check for ill-advised kings"),
    RegexRule("Scyld S(.*)g", sensitivity=Sensitivity.CRITICAL),
    RegexRule("Professor James Moriarty"),
)

expected_matches = [{
    "rule": {
        "type": "regex",
        "name": "Check for ill-advised kings",
        "sensitivity": None,
        "expression": "Æthelred the Unready"
    },
    "matches": None
}, {
    "rule": {
        "type": "regex",
        "name": None,
        "sensitivity": Sensitivity.CRITICAL.value,
Пример #4
0
    def run(self, user=None):
        """Schedules a scan to be run by the pipeline. Returns the scan tag of
        the resulting scan on success.

        One scan specification message is prepared for every source yielded
        by generate_sources(), and one conversion message for every stored
        checkup. The time of this run is saved as e2_last_run_at, and a
        ScanStatus object is created, before any message is published.

        An exception will be raised if the underlying source is not available,
        and a pika.exceptions.AMQPError (or a subclass) will be raised if it
        was not possible to communicate with the pipeline."""
        now = datetime.datetime.now(tz=tz.gettz()).replace(microsecond=0)

        # The content rule is the disjunction of every rule attached to this
        # scanner
        content_rule = OrRule.make(*(
            model_rule.make_engine2_rule()
            for model_rule in self.rules.all().select_subclasses()))

        configuration = {}
        prerules = []

        # Only add a modification time check if there was a previous run to
        # compare against
        if self.do_last_modified_check and self.e2_last_run_at:
            prerules.append(LastModifiedRule(self.e2_last_run_at))

        if not self.do_ocr:
            # If we are not doing OCR, then skip all of the image files...
            configuration["skip_mime_types"] = ["image/*"]
        else:
            # ... and, if we are, then filter out any images smaller than
            # 128x32 (or 32x128)
            has_dimensions = HasConversionRule(OutputType.ImageDimensions)
            big_enough = DimensionsRule(
                width_range=range(32, 16385),
                height_range=range(32, 16385),
                min_dim=128)
            prerules.append(make_if(has_dimensions, big_enough, True))

        # The prerules are checked before the content rule itself
        full_rule = AndRule.make(*prerules, content_rule)

        username = user.username if user else None
        scan_tag = {
            'time': now.isoformat(),
            'user': username,
            'scanner': {
                'pk': self.pk,
                'name': self.name
            },
            'organisation': {
                'name': self.organization.name,
                'uuid': str(self.organization.uuid)
            },
            'destination': 'pipeline_collector'
        }

        # Build ScanSpecMessages for all Sources
        spec_template = messages.ScanSpecMessage(
            scan_tag=scan_tag,
            rule=full_rule,
            configuration=configuration,
            source=None,
            progress=None)
        outbox = [
            (settings.AMQP_PIPELINE_TARGET,
             spec_template._replace(source=source))
            for source in self.generate_sources()
        ]
        # At this point the outbox holds exactly one entry per source
        source_count = len(outbox)

        # Also build ConversionMessages for the objects that we should try to
        # scan again (our pipeline_collector is responsible for eventually
        # deleting these reminders)
        conversion_template = messages.ConversionMessage(
            scan_spec=spec_template,
            handle=None,
            progress=messages.ProgressFragment(rule=None, matches=[]))
        for reminder in self.checkups.all():
            cutoff = reminder.interested_before
            modified_check = LastModifiedRule(cutoff) if cutoff else True
            recheck_rule = AndRule.make(modified_check, full_rule)
            recheck_message = conversion_template._deep_replace(
                scan_spec__source=reminder.handle.source,
                handle=reminder.handle,
                progress__rule=recheck_rule)
            outbox.append((settings.AMQP_CONVERSION_TARGET, recheck_message))

        self.e2_last_run_at = now
        self.save()

        # OK, we're committed now! Create a model object to track the status of
        # this scan...
        ScanStatus.objects.create(
            scanner=self,
            scan_tag=scan_tag,
            total_sources=source_count,
            total_objects=self.checkups.count())

        # ... and dispatch the scan specifications to the pipeline
        target_queues = {queue for queue, _ in outbox}
        with PikaPipelineSender(write=target_queues) as sender:
            for queue, message in outbox:
                sender.publish_message(queue, message.to_json_object())

        return scan_tag
Пример #5
0
    def run(self, type, blocking=False, user=None):
        """Run a scan with the Scanner.

        The source is first probed by requesting its first handle. If that
        raises ResourceUnavailableError, nothing is submitted and the
        exception's details (its arguments after the first) are returned as
        a comma-separated string.

        Otherwise a scan specification is sent to the pipeline queue and the
        scan tag, which is the start time of the scan as an ISO 8601 string,
        is returned. The connection to the queue is closed again even if
        sending the message fails.

        The type, blocking and user parameters are accepted but are not used
        by this implementation.
        """
        local_tz = tz.gettz()
        # NOTE(review): this timestamp is naive, which is presumably why the
        # value read back from e2_last_run_at below may need a timezone
        # attaching to it -- confirm before changing either
        now = datetime.datetime.now().replace(microsecond=0)

        # Check that this source is accessible, and return the resulting error
        # if it isn't
        source = self.make_engine2_source()
        with SourceManager() as sm, closing(source.handles(sm)) as handles:
            try:
                print(next(handles, True))
            except ResourceUnavailableError as ex:
                return ", ".join([str(a) for a in ex.args[1:]])

        # Create a new engine2 scan specification and submit it to the
        # pipeline
        rule = OrRule.make(*[
            r.make_engine2_rule()
            for r in self.rules.all().select_subclasses()
        ])

        prerules = []
        if self.do_last_modified_check:
            # Make sure that the timestamp we give to LastModifiedRule is
            # timezone-aware; engine2's serialisation code requires this
            # for all datetime.datetimes, so LastModifiedRule will raise a
            # ValueError if we try to give it a naive one
            last = self.e2_last_run_at
            if last:
                if not last.tzinfo or last.tzinfo.utcoffset(last) is None:
                    last = last.replace(tzinfo=local_tz)
                prerules.append(LastModifiedRule(last))
        if self.do_ocr:
            # Only images for which dimensions are available are subjected
            # to the size check; everything else passes straight through
            cr = make_if(
                HasConversionRule(OutputType.ImageDimensions),
                DimensionsRule(width_range=range(32, 16385),
                               height_range=range(32, 16385),
                               min_dim=128), True)
            prerules.append(cr)

        rule = AndRule.make(*prerules, rule)

        # The scan tag doubles as the return value of this method
        scan_tag = now.isoformat()
        message = {
            'scan_tag': scan_tag,
            'source': source.to_json_object(),
            'rule': rule.to_json_object()
        }
        queue_name = settings.AMQP_PIPELINE_TARGET

        self.e2_last_run_at = now
        self.save()

        # Serialise the message once and reuse the result
        payload = json.dumps(message)

        print(queue_name, payload)
        amqp_connection_manager.start_amqp(queue_name)
        try:
            amqp_connection_manager.send_message(queue_name, payload)
        finally:
            # Don't leave the connection open if the message can't be sent
            amqp_connection_manager.close_connection()

        return scan_tag
class RuleTests(unittest.TestCase):
    def test_simplerule_matches(self):
        candidates = [
            (CPRRule(modulus_11=False, ignore_irrelevant=False), """
2205995008: forbryder,
230500 0003: forbryder,
240501-0006: forbryder,
250501-1987: forbryder""",
             ["2205XXXXXX", "2305XXXXXX", "2405XXXXXX", "2505XXXXXX"]),
            (CPRRule(modulus_11=True, ignore_irrelevant=True), """
2205995008: forbryder,
230500 0003: forbryder,
240501-0006: forbryder,
250501-1987: forbryder""", ["2205XXXXXX", "2305XXXXXX", "2405XXXXXX"]),
            (CPRRule(modulus_11=True,
                     ignore_irrelevant=True,
                     examine_context=False), """
Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nr. 2205995008
Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nummer: 2305000003
240501-0006""", ["2205XXXXXX", "2305XXXXXX", "2405XXXXXX"]),
            (CPRRule(modulus_11=True,
                     ignore_irrelevant=True,
                     examine_context=True), """
Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nr. 2205995008
Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nummer: 2305000003
240501-0006""", ["2405XXXXXX"]),
            (RegexRule("((four|six)( [aopt]+)?|(one|seven) [aopt]+)"), """
one
one potato
two potato
three potato
four
five potato
six potato
seven potato
more!""", ["one potato", "four", "six potato", "seven potato"]),
            (LastModifiedRule(
                datetime(2019, 12, 24, 23, 59, 59, tzinfo=timezone.utc)),
             datetime(2019, 12, 31, 23, 59, 59,
                      tzinfo=timezone.utc), ["2019-12-31T23:59:59+0000"]),
            (LastModifiedRule(
                datetime(2019, 12, 24, 23, 59, 59, tzinfo=timezone.utc)),
             datetime(2019, 5, 22, 0, 0, 1, tzinfo=timezone.utc), None),
            (DimensionsRule(width_range=range(0, 16385),
                            height_range=range(0, 16385),
                            min_dim=256), (128, 256), [[128, 256]]),
            (DimensionsRule(width_range=range(0, 16385),
                            height_range=range(0, 16385),
                            min_dim=256), (128, 255), []),
            (DimensionsRule(width_range=range(256, 1024),
                            height_range=range(256, 1024),
                            min_dim=0), (256, 256), [[256, 256]]),
            (DimensionsRule(width_range=range(256, 1024),
                            height_range=range(256, 1024),
                            min_dim=0), (32, 32), []),
        ]

        for rule, in_value, expected in candidates:
            with self.subTest(rule):
                json = rule.to_json_object()
                back_again = rule.from_json_object(json)
                self.assertEqual(rule, back_again)

            with self.subTest(rule):
                matches = rule.match(in_value)
                if expected:
                    self.assertEqual([match["match"] for match in matches],
                                     expected)
                else:
                    self.assertFalse(list(matches))

    # Compound rules paired with their test cases. Each case is a triple of
    # (input string, expected final outcome, expected number of component
    # rules evaluated on the way to that outcome)
    compound_candidates = [
        (
            AndRule(
                RegexRule("A"),
                OrRule(RegexRule("B"), RegexRule("C"))),
            [
                ("A", False, 3),
                ("AB", True, 2),
                ("ABC", True, 2),
                ("BC", False, 1),
                ("AC", True, 3),
            ],
        ),
        (
            NotRule(
                AndRule(
                    RegexRule("A"),
                    OrRule(RegexRule("B"), RegexRule("C")))),
            [
                ("A", True, 3),
                ("AB", False, 2),
                ("ABC", False, 2),
                ("BC", True, 1),
                ("AC", False, 3),
            ],
        ),
        (
            AndRule(
                NotRule(OrRule(RegexRule("B"), RegexRule("C"))),
                RegexRule("A")),
            [
                ("A", True, 3),
                ("AB", False, 1),
                ("ABC", False, 1),
                ("BC", False, 1),
                ("AC", False, 2),
            ],
        ),
    ]

    def test_compound_rule_matches(self):
        """Evaluate every compound rule step by step against its inputs.

        A rule is repeatedly split into a head rule and two continuations.
        The head is matched against the input, and evaluation proceeds with
        the first continuation if it produced any matches and with the
        second if it did not, until a continuation is a plain boolean. That
        boolean, and the number of heads evaluated to reach it, are compared
        with the expected outcome and evaluation count.

        Each (rule, input) combination runs in its own subtest, so one
        failing case is reported without hiding the others."""
        for rule, tests in RuleTests.compound_candidates:
            for input_string, outcome, evaluation_count in tests:
                with self.subTest(rule=rule, input_string=input_string):
                    now = rule
                    evaluations = 0

                    while True:
                        head, pve, nve = now.split()
                        evaluations += 1
                        # Any match at all selects the positive continuation
                        if list(head.match(input_string)):
                            now = pve
                        else:
                            now = nve
                        if isinstance(now, bool):
                            break

                    self.assertEqual(
                        outcome, now,
                        "{0}: wrong result".format(input_string))
                    self.assertEqual(
                        evaluation_count, evaluations,
                        "{0}: wrong evaluation count".format(input_string))

    def test_json_round_trip(self):
        """Check that each compound candidate rule is equal to the rule
        rebuilt from its own JSON representation."""
        for rule, _cases in RuleTests.compound_candidates:
            with self.subTest(rule):
                restored = rule.from_json_object(rule.to_json_object())
                self.assertEqual(rule, restored)

    def test_oxford_comma(self):
        """Check oxford_comma with "and" for lists of one, two and three
        items."""
        # (items to join, expected text)
        cases = (
            (["Monday"], "Monday"),
            (["Monday", "Tuesday"], "Monday and Tuesday"),
            (["Monday", "Tuesday", "Wednesday"],
             "Monday, Tuesday, and Wednesday"),
        )
        for days, joined in cases:
            self.assertEqual(oxford_comma(days, "and"), joined)

    def test_rule_names(self):
        """Check the presentation strings that compound rules build from
        the names of their components.

        The conjunction of two named fragments is expected to read
        "(Fragment A and Fragment B)"; a disjunction of three components,
        the last of which is itself a named OrRule, is expected to be
        presented as a comma-separated list ending in ", or Fragment C"."""
        first = RegexRule("A", name="Fragment A")
        second = RegexRule("B", name="Fragment B")
        # The third component is a compound rule carrying its own name
        third = OrRule(
            RegexRule("C1", name="Fragment C1"),
            RegexRule("C2", name="Fragment C2"),
            name="Fragment C")

        conjunction = AndRule(first, second)
        self.assertEqual(
            conjunction.presentation, "(Fragment A and Fragment B)")

        disjunction = OrRule(first, second, third)
        self.assertEqual(
            disjunction.presentation,
            "(Fragment A, Fragment B, or Fragment C)")