Exemplo n.º 1
0
    def test_simple_regex_match(self):
        """Queue one scan specification, run the pipeline and check the match.

        The specification is appended to ``self.messages`` for the
        ``os2ds_scan_specs`` queue. After ``run_pipeline()`` exactly two
        unhandled messages are expected; the one whose ``origin`` is
        ``os2ds_matches`` must report a match equal to ``expected_matches``.
        """
        print(Source.from_url(data_url).to_json_object())
        scanner = {"name": "integration_test", "pk": 0}
        scan_tag = {"scanner": scanner, "time": "2020-01-01T00:00:00+00:00"}
        scan_spec = {
            "scan_tag": scan_tag,
            "source": Source.from_url(data_url).to_json_object(),
            "rule": rule.to_json_object(),
        }

        self.messages.append((scan_spec, "os2ds_scan_specs"))
        self.run_pipeline()

        self.assertEqual(len(self.unhandled), 2)

        # Index the unhandled message bodies by the stage that produced them.
        by_origin = {}
        for body, _ in self.unhandled:
            by_origin[body["origin"]] = body

        self.assertTrue(
            by_origin["os2ds_matches"]["matched"],
            "RegexRule match failed")
        self.assertEqual(
            by_origin["os2ds_matches"]["matches"],
            expected_matches,
            "RegexRule match did not produce expected result")
Exemplo n.º 2
0
    def test_simple_regex_match(self):
        """Publish one scan specification and check the resulting match.

        The specification is published to the ``os2ds_scan_specs`` queue and
        the runner's consumer loop is started. The loop is expected to end by
        raising ``StopHandling``; afterwards the ``os2ds_matches`` message held
        in ``self.runner.messages`` must report a match equal to
        ``expected_matches``.
        """
        print(Source.from_url(data_url).to_json_object())
        obj = {
            "scan_tag": {
                "scanner": {
                    "name": "integration_test",
                    "pk": 0
                },
                "time": "2020-01-01T00:00:00+00:00"
            },
            "source": Source.from_url(data_url).to_json_object(),
            "rule": rule.to_json_object()
        }

        self.runner.channel.basic_publish(exchange='',
                                          routing_key="os2ds_scan_specs",
                                          body=dumps(obj).encode())

        # Require the StopHandling exit explicitly: if the consumer returned
        # normally, assertions placed in an ``except`` handler would never run
        # and the test would pass without checking anything.
        with self.assertRaises(StopHandling):
            self.runner.run_consumer()

        self.assertTrue(self.runner.messages["os2ds_matches"]["matched"],
                        "RegexRule match failed")
        self.assertEqual(
            self.runner.messages["os2ds_matches"]["matches"],
            expected_matches,
            "RegexRule match did not produce expected result")
Exemplo n.º 3
0
    def test_simple_regex_match(self):
        """Publish one scan specification and check the resulting match.

        The specification is published to ``os2ds_scan_specs`` and results are
        consumed from ``os2ds_results``. The consumer callback stores each
        decoded body under its ``origin`` and raises ``StopHandling`` once two
        distinct origins have been seen; the ``os2ds_matches`` body must then
        report a match equal to ``expected_matches``.
        """
        print(Source.from_url(data_url).to_json_object())
        obj = {
            "scan_tag": "integration_test",
            "source": Source.from_url(data_url).to_json_object(),
            "rule": rule.to_json_object()
        }

        self.channel.basic_publish(exchange='',
                                   routing_key="os2ds_scan_specs",
                                   body=dumps(obj).encode())

        messages = {}

        def result_received(channel, method, properties, body):
            # Only the raw body (fourth positional argument) is used here.
            decoded = loads(body.decode("utf-8"))
            messages[decoded["origin"]] = decoded
            if len(messages) == 2:
                raise StopHandling()

        self.channel.basic_consume("os2ds_results", result_received)

        # Require the StopHandling exit explicitly: if consumption ended
        # without it, assertions placed in an ``except`` handler would be
        # skipped and the test would pass without checking anything.
        with self.assertRaises(StopHandling):
            self.channel.start_consuming()

        self.assertTrue(messages["os2ds_matches"]["matched"],
                        "RegexRule match failed")
        self.assertEqual(
            messages["os2ds_matches"]["matches"], expected_matches,
            "RegexRule match did not produce expected result")
Exemplo n.º 4
0
    def process(self, source, sm, depth=0):
        """Recursively explore *source* and verify every test vector in it.

        Each handle yielded by ``source.handles(sm)`` is printed, indented by
        *depth*, and then treated in one of three ways:

        * if it can be reinterpreted as a Source (with or without the help of
          *sm*), that Source is explored recursively;
        * if it is named ``url``, its content is read as a UTF-8 URL and the
          Source built from that URL is explored recursively;
        * if it is named ``test-vector``, or *source* is a ``DataSource``, the
          resource behind it is checked: ``check()`` must succeed, the
          reported size must equal both the stream and the file length, the
          stream and file bytes must be identical, and both must decode to
          ``self.correct_content``.

        Args:
            source: the Source to explore.
            sm: the SourceManager used to open handles and resources.
            depth: recursion depth; 0 for the top-level call, which must be
                given a Source that has no backing handle.
        """
        if depth == 0:
            self.assertIsNone(source.handle,
                              "{0}: unexpected backing handle".format(source))
        for handle in source.handles(sm):
            print("{0}{1}".format("  " * depth, handle))
            guessed = Source.from_handle(handle)
            computed = Source.from_handle(handle, sm)

            if computed or guessed:
                # Prefer the Source computed with the SourceManager's help.
                self.process(computed or guessed, sm, depth + 1)

            elif handle.name == "url":
                with handle.follow(sm).make_stream() as fp:
                    url = fp.read().decode("utf-8")
                self.process(Source.from_url(url), sm, depth + 1)

            elif handle.name == "test-vector" or isinstance(
                    source, DataSource):
                r = handle.follow(sm)

                self.assertTrue(r.check(), "check() method failed")
                reported_size = r.get_size()
                last_modified = r.get_last_modified()

                # Read the same resource through both access paths so that
                # they can be compared against each other.
                with r.make_stream() as fp:
                    stream_raw = fp.read()
                    stream_size = len(stream_raw)
                    stream_content = stream_raw.decode("utf-8")
                with r.make_path() as p:
                    with open(p, "rb") as fp:
                        file_raw = fp.read()
                        file_size = len(file_raw)
                        file_content = file_raw.decode("utf-8")

                self.assertIsInstance(last_modified, SingleResult,
                                      ("{0}: last modification date is not a"
                                       " SingleResult").format(handle))
                self.assertIsInstance(
                    last_modified.value, datetime,
                    ("{0}: last modification date value is not a"
                     " datetime.datetime").format(handle))

                self.assertIsInstance(reported_size, SingleResult,
                                      ("{0}: resource length is not a"
                                       " SingleResult").format(handle))
                self.assertEqual(
                    stream_size, reported_size.value,
                    "{0}: model stream length invalid".format(handle))
                self.assertEqual(
                    file_size, reported_size.value,
                    "{0}: model file length invalid".format(handle))
                self.assertEqual(
                    file_raw, stream_raw,
                    "{0}: model file and stream not equal".format(handle))
                self.assertEqual(stream_content, self.correct_content,
                                 "{0}: model stream invalid".format(handle))
                self.assertEqual(file_content, self.correct_content,
                                 "{0}: model file invalid".format(handle))
Exemplo n.º 5
0
 def test_incomplete_json(self):
     """Check that JSON objects missing a top-level "type" are rejected.

     Both ``Source.from_json_object`` and ``Handle.from_json_object`` must
     raise ``DeserialisationError`` for the descriptions below.
     """
     source_description = {"hostname": "gopher.invalid"}
     handle_description = {
         "source": {"type": "gopher", "hostname": "gopher.invalid"},
         "path": "/Reference",
     }

     self.assertRaises(
         DeserialisationError, Source.from_json_object, source_description)
     self.assertRaises(
         DeserialisationError, Handle.from_json_object, handle_description)
Exemplo n.º 6
0
 def test_invalid_json(self):
     """Check that JSON objects of type "gopher" raise UnknownSchemeError.

     The same expectation applies to ``Source.from_json_object`` and to
     ``Handle.from_json_object``.
     """
     gopher_source = {"type": "gopher", "hostname": "gopher.invalid"}
     gopher_handle = {
         "type": "gopher",
         "source": {"type": "gopher", "hostname": "gopher.invalid"},
         "path": "/Reference",
     }

     self.assertRaises(
         UnknownSchemeError, Source.from_json_object, gopher_source)
     self.assertRaises(
         UnknownSchemeError, Handle.from_json_object, gopher_handle)
Exemplo n.º 7
0
 def test_corrupted_doc(self):
     """Check that the truncated .doc test file yields no handles at all."""
     doc_path = os.path.join(
         test_data_path, "msoffice/corrupted/test.trunc.doc")
     doc_source = Source.from_handle(FilesystemHandle.make_handle(doc_path))
     with SourceManager() as sm:
         found = list(doc_source.handles(sm))
         self.assertEqual(
             found, [],
             "unrecognised CDFV2 document should be empty and wasn't")
Exemplo n.º 8
0
    def test_derived_source(self):
        """Check that a Source derived from a handle keeps a backing handle."""
        with SourceManager() as sm:
            zip_handle = FilesystemHandle(
                FilesystemSource(test_data_path),
                "data/engine2/zip-here/test-vector.zip")
            derived = Source.from_handle(zip_handle)

            self.assertIsNotNone(
                derived.handle,
                "{0}: derived source has no handle".format(derived))
Exemplo n.º 9
0
def try_apply(sm, source):
    """Return the text representation of the first usable handle in *source*.

    Handles are visited in the order ``source.handles(sm)`` yields them. A
    handle that can be reinterpreted as a Source is searched recursively;
    any other handle is followed and converted to ``OutputType.Text``, and
    the value of that representation is returned immediately.

    Args:
        sm: the SourceManager used to explore sources and follow handles.
        source: the Source to search.

    Returns:
        The converted text value, or None if *source* (including everything
        nested inside it) yields nothing to convert.
    """
    for handle in source.handles(sm):
        derived = Source.from_handle(handle, sm)
        if derived:
            # Propagate content found inside the derived source; without this
            # the result of the recursive search would be thrown away.
            nested = try_apply(sm, derived)
            if nested is not None:
                return nested
        else:
            resource = handle.follow(sm)
            representation = convert(resource, OutputType.Text)
            return representation.value
    return None
Exemplo n.º 10
0
 def test_libreoffice_size(self):
     """Check that each .html handle of the test spreadsheet is under 1 MiB."""
     ods_path = os.path.join(
         test_data_path, "libreoffice/html-explosion.ods")
     ods_source = Source.from_handle(FilesystemHandle.make_handle(ods_path))
     with SourceManager() as sm:
         for handle in ods_source.handles(sm):
             if not handle.name.endswith(".html"):
                 continue
             resource = handle.follow(sm)
             self.assertLess(resource.get_size().value, 1048576,
                             "LibreOffice HTML output was too big")
Exemplo n.º 11
0
 def test_smbc_url(self):
     """Explore the test Samba share, skipping the test if it is unreachable.

     Reachability is probed by requesting the first handle of the source;
     any exception raised by that probe skips the test.
     """
     with SourceManager() as sm:
         share = Source.from_url(
             "smbc://*****:*****@samba/general")
         try:
             with contextlib.closing(share.handles(sm)) as probe:
                 next(probe)
         except Exception:
             self.skipTest("test Samba server not up (not running in CI?)")
         self.process(share, sm)
Exemplo n.º 12
0
def try_apply(sm, source, rule):
    """Yield every match *rule* produces anywhere inside *source*.

    Handles that can be reinterpreted as a Source are explored recursively.
    Every other handle is followed and converted to the representation
    named by ``rule.operates_on``; when the conversion gives a truthy
    result, the matches for its value are yielded.
    """
    for handle in source.handles(sm):
        nested = Source.from_handle(handle, sm)
        if not nested:
            representation = convert(handle.follow(sm), rule.operates_on)
            if representation:
                yield from rule.match(representation.value)
            continue
        yield from try_apply(sm, nested, rule)
Exemplo n.º 13
0
 def test_eml_files(self):
     """Check that every test-data file becomes a MailSource of mail parts."""
     root = FilesystemSource(test_data_path)
     with SourceManager() as sm:
         for file_handle in root.handles(sm):
             mail_source = Source.from_handle(file_handle)
             self.assertIsInstance(
                 mail_source, MailSource,
                 "conversion of {0} to MailSource failed".format(file_handle))
             for part_handle in mail_source.handles(sm):
                 self.assertIsInstance(part_handle, MailPartHandle)
Exemplo n.º 14
0
 def handle(self, **kwargs):
     """Print every source named by the ``urls`` keyword argument.

     The ``guess`` and ``summarise`` keyword arguments are passed through
     to ``url_explorer.print_source``. A URL that raises
     ``UnknownSchemeError`` is silently skipped.
     """
     targets = kwargs['urls']
     guess = kwargs['guess']
     summarise = kwargs['summarise']
     with SourceManager() as sm:
         for target in targets:
             try:
                 url_explorer.print_source(
                     sm, Source.from_url(target),
                     guess=guess, summarise=summarise)
             except UnknownSchemeError:
                 continue
    def test_simple_regex_match(self):
        """Publish one scan specification and check the resulting match.

        The specification is published to ``os2ds_scan_specs`` and results are
        consumed from ``os2ds_results``. The consumer callback acknowledges
        each delivery, stores the decoded body under its ``origin`` and raises
        ``StopHandling`` once two distinct origins have been seen; the
        ``os2ds_matches`` body must then report a match equal to
        ``expected_matches``.
        """
        print(Source.from_url(data_url).to_json_object())
        obj = {
            "scan_tag": {
                "scanner": {
                    "name": "integration_test",
                    "pk": 0
                },
                "time": "2020-01-01T00:00:00+00:00"
            },
            "source": Source.from_url(data_url).to_json_object(),
            "rule": rule.to_json_object()
        }

        self.runner.channel.basic_publish(exchange='',
                                          routing_key="os2ds_scan_specs",
                                          body=dumps(obj).encode())

        messages = {}

        def result_received(channel, method, properties, body):
            channel.basic_ack(method.delivery_tag)
            body = loads(body.decode("utf-8"))
            messages[body["origin"]] = body
            if len(messages) == 2:
                raise StopHandling()

        self.runner.channel.basic_consume("os2ds_results", result_received)

        # Require the StopHandling exit explicitly: if the consumer returned
        # normally, assertions placed in an ``except`` handler would never run
        # and the test would pass without checking anything.
        with self.assertRaises(StopHandling):
            self.runner.run_consumer()

        self.assertTrue(messages["os2ds_matches"]["matched"],
                        "RegexRule match failed")
        self.assertEqual(
            messages["os2ds_matches"]["matches"], expected_matches,
            "RegexRule match did not produce expected result")
Exemplo n.º 16
0
 def test_invalid_url(self):
     """Check that text which is not a URL raises UnknownSchemeError."""
     not_a_url = "Well, this just isn't a URL at all!"
     self.assertRaises(UnknownSchemeError, Source.from_url, not_a_url)
Exemplo n.º 17
0
 def test_handles_failure(self):
     """Check that listing handles for an unresolvable host raises."""
     with self.assertRaises(Exception), SourceManager() as sm:
         unreachable = Source.from_url("http://example.invalid./")
         with contextlib.closing(unreachable.handles(sm)) as listing:
             next(listing)
Exemplo n.º 18
0
 def test_invalid_scheme(self):
     """Check that a URL with an unregistered scheme raises UnknownSchemeError."""
     bad_url = "xxx-invalid://data/20"
     self.assertRaises(UnknownSchemeError, Source.from_url, bad_url)
Exemplo n.º 19
0
 def test_local_url(self):
     """Explore the test data directory through a ``file://`` URL."""
     local_url = "file://" + test_data_path
     with SourceManager() as sm:
         local_source = Source.from_url(local_url)
         self.process(local_source, sm)
Exemplo n.º 20
0
    "name": "test.txt",
}

# JSON `data` content needs to be base64-encoded, so the gzip payload is passed
# through b64encode before being embedded in the source description.
json_gzip = {
    "type": "data",
    "content": b64encode(gzip_content),
    "mime": "application/gzip",
    "name": "test.txt",
}

for j in (
        json_data,
        json_gzip,
):
    s = Source.from_json_object(j)
    while True:
        h_generator = s.handles(sm)
        h = next(h_generator)
        r = h.follow(sm)

        #if h.guess_type() == "text/plain":
        print(f"handle\t{h}")
        print(f"resource\t{r}")

        print("raw content:")
        with r.make_stream() as fp:
            print("\t\t{0}".format(fp.read()))

        # should succeed for text -> text conversion
        try:
Exemplo n.º 21
0
 def run_rule_on_handle(self, handle):
     """Turn *handle* into a Source and run the rule over that Source.

     The test fails if *handle* cannot be made into a Source.
     """
     with SourceManager() as sm:
         derived = Source.from_handle(handle, sm)
         failure_text = "{0} couldn't be made into a Source".format(handle)
         self.assertIsNotNone(derived, failure_text)
         self.run_rule(derived, sm)
Exemplo n.º 22
0

# Request description: a regex rule and an inline base64 "data" source.
body = {
    "rule": {"type": "regex", "expression": "[Tt]est"},
    "source": {
        "type": "data",
        "content": "VGhpcyBpcyBvbmx5IGEgdGVzdA==",
        "mime": "text/plain",
        "name": "test.txt",
    },
}

source = Source.from_json_object(body["source"])
top_type = _get_top(source).type_label

rule = Rule.from_json_object(body["rule"])

# Wrap the source and rule in a scan specification message and keep its
# JSON form.
message = messages.ScanSpecMessage(
    scan_tag=messages.ScanTagFragment(
        time=time_now(),
        user=None,
        scanner=messages.ScannerFragment(pk=0, name="API server demand scan"),
        organisation=messages.OrganisationFragment(
            name="API server",
            uuid=uuid4(),
        ),
    ),
    source=source,
    rule=rule,
    configuration={},
    progress=None,
).to_json_object()
Exemplo n.º 23
0
def get_content_from_handle(handle):
    """Return whatever ``try_apply`` extracts from the Source behind *handle*.

    An assertion fails if *handle* cannot be made into a Source.
    """
    with SourceManager() as manager:
        derived = Source.from_handle(handle, manager)
        assert derived is not None, (
            f"{handle} cound not be made into a Source")
        content = try_apply(manager, derived)
        return content
Exemplo n.º 24
0
    with r.make_stream() as fp:
        content = fp.read()
        # same as r.compute_type() implemented in FileResource
        # we could only read the first 512 bytes to get mime type
        mtype = mime.from_buffer(content)
        with open(fname, 'wb') as fh:
            fh.write(content)

# To see how the pipeline can work with data sources of all kinds without
# knowing what they are, we can try working with the JSON form of ToySource:
from os2datascanner.engine2.model.core import Source, SourceManager

# Build a Source purely from its JSON description ("type": "toy") and print
# the relative path of every handle it yields.
sm = SourceManager()
generic_source = Source.from_json_object({
    "type": "toy",
    "username": "******",
    "password": "******"
})
print([h.relative_path for h in generic_source.handles(sm)])
""" The description of Handles earlier glossed them as references to "objects".
But what is an object?

To some extent this depends on the Source. In a filesystem, an object is a file:
a named stream of bytes with some metadata. In an email account, an object is an
email. In a case management system, an object is a case.

But sometimes the lines are blurrier than that. For example, consider a Zip
file. It is a file: it's a stream of bytes with a name, a size, and some
metadata. It can also, however, be viewed as a container for other files, each
of which in turn also has these properties.