def test_corrupted_ocr(self):
    """Check that text conversion of every corrupted file yields None.

    Each handle found under the "corrupted" test data folder is followed
    and converted to OutputType.Text; the conversion is expected to
    return None.
    """
    fs = FilesystemSource(os.path.join(test_data_path, "corrupted"))
    with SourceManager() as sm:
        for h in fs.handles(sm):
            resource = h.follow(sm)
            # Identity check: an object that merely compares equal to None
            # must not be accepted as a successful "no result" outcome.
            self.assertIsNone(
                convert(resource, OutputType.Text),
                "{0}: error handling failed".format(h))
def test_ocr_conversions(self):
    """Check the text extracted from every file in the "good" folder.

    Each handle is followed and converted to OutputType.Text, and the
    value of the conversion is compared with expected_result.
    """
    good_folder = os.path.join(test_data_path, "good")
    source = FilesystemSource(good_folder)
    with SourceManager() as sm:
        for handle in source.handles(sm):
            converted = convert(handle.follow(sm), OutputType.Text)
            self.assertEqual(
                converted.value,
                expected_result,
                "{0}: content failed".format(handle))
def test_size_computation(self):
    """Check the image dimensions computed for every test data file.

    Each handle under test_data_path is followed and converted to
    OutputType.ImageDimensions; the value of the conversion is compared
    with expected_size. A falsy conversion result for a path containing
    "rgba32" skips the test instead of failing it.
    """
    fs = FilesystemSource(test_data_path)
    with SourceManager() as sm:
        for h in fs.handles(sm):
            resource = h.follow(sm)
            size = convert(resource, OutputType.ImageDimensions)
            # NOTE(review): the else is paired with the outer "if not
            # size", the only reading in which .value is taken from a
            # truthy conversion result -- confirm against the original
            # layout.
            if not size:
                if "rgba32" in h.relative_path:
                    self.skipTest("Pillow RGBA bug detected -- skipping")
            else:
                size = size.value
            # The message is formatted with the handle so that a failure
            # identifies the file that produced the wrong size.
            self.assertEqual(
                size,
                expected_size,
                "{0}: size failed".format(h))
def test_eml_files(self):
    """Check that every test data file becomes a MailSource.

    Each handle under test_data_path is passed to Source.from_handle; the
    result must be a MailSource, and every handle that source yields must
    be a MailPartHandle.
    """
    root = FilesystemSource(test_data_path)
    with SourceManager() as sm:
        for mail_handle in root.handles(sm):
            mail_source = Source.from_handle(mail_handle)
            failure_text = "conversion of {0} to MailSource failed".format(
                mail_handle)
            self.assertIsInstance(mail_source, MailSource, failure_text)

            for part in mail_source.handles(sm):
                self.assertIsInstance(part, MailPartHandle)
def test_ocr_skip(self):
    """Run the pipeline with "image/*" listed under skip_mime_types.

    A scan specification for the "ocr/good" test data folder is queued on
    os2ds_scan_specs and the pipeline is run. Every unhandled message must
    then be on os2ds_results with a false "matched" entry; a message on
    any other queue fails the test.
    """
    source_json = FilesystemSource(
        os.path.join(test_data_path, "ocr", "good")).to_json_object()
    rule_json = CPRRule(
        modulus_11=False, ignore_irrelevant=False).to_json_object()
    scan_spec = {
        "scan_tag": {
            "scanner": {"name": "integration_test", "pk": 0},
            "time": "2020-01-01T00:00:00+00:00",
        },
        "source": source_json,
        "rule": rule_json,
        "configuration": {"skip_mime_types": ["image/*"]},
    }

    self.messages.append((scan_spec, "os2ds_scan_specs"))
    self.run_pipeline()

    for message, queue in self.unhandled:
        if queue != "os2ds_results":
            self.fail("unexpected message in queue {0}".format(queue))
        self.assertFalse(
            message["matched"],
            "OCR match found with OCR disabled")
def test_derived_source(self):
    """Check that a Source derived from a handle exposes a handle.

    Source.from_handle is called on a FilesystemHandle pointing at a zip
    test vector, and the resulting object's handle attribute must not be
    None.
    """
    with SourceManager():
        zip_handle = FilesystemHandle(
            FilesystemSource(test_data_path),
            "data/engine2/zip-here/test-vector.zip")
        derived = Source.from_handle(zip_handle)
        self.assertIsNotNone(
            derived.handle,
            "{0}: derived source has no handle".format(derived))
def get_different_filesystemhandle(file_ending, folder_level):
    """Return a FilesystemHandle with a randomly generated relative path.

    The path starts with "/", followed by folder_level folder names of ten
    random lowercase ASCII letters each (every one terminated by "/"),
    then a single random lowercase letter and finally file_ending. The
    handle always belongs to the same hard-coded FilesystemSource.
    """
    letters = string.ascii_lowercase
    folders = []
    for _ in range(folder_level):
        folder_name = ''.join(random.choice(letters) for _ in range(10))
        folders.append(folder_name + '/')
    path = '/' + ''.join(folders)

    source = FilesystemSource("/mnt/fs01.magenta.dk/brugere/af")
    stem = random.choice(letters)
    return FilesystemHandle(
        source, "{0}{1}{2}".format(path, stem, file_ending))
def test_encrypted_zip(self):
    """Follow every handle an encrypted zip yields and compute its type."""
    # An encrypted Zip file is allowed to give us no ZipHandles at all, but
    # any handle it *does* give us has to be usable, so each one is
    # followed and asked for its type.
    container = FilesystemHandle(
        FilesystemSource(test_data_path),
        "encrypted-test-vector.zip")
    encrypted_file = ZipSource(container)
    with SourceManager() as sm:
        for member in encrypted_file.handles(sm):
            resource = member.follow(sm)
            resource.compute_type()
def test_corrupted_container(self):
    """Run the pipeline over the "pdf/corrupted" test data folder.

    A scan specification is queued on os2ds_scan_specs and the pipeline is
    run. Exactly one unhandled message is expected, and the "origin" entry
    of its first element must be "os2ds_problems".
    """
    source_json = FilesystemSource(
        os.path.join(test_data_path, "pdf", "corrupted")).to_json_object()
    rule_json = CPRRule(
        modulus_11=False, ignore_irrelevant=False).to_json_object()
    scan_spec = {
        "scan_tag": "integration_test",
        "source": source_json,
        "rule": rule_json,
        "configuration": {},
    }

    self.messages.append((scan_spec, "os2ds_scan_specs"))
    self.run_pipeline()

    print(self.unhandled)
    self.assertEqual(len(self.unhandled), 1)
    first_message = self.unhandled[0][0]
    self.assertEqual(first_message["origin"], "os2ds_problems")
import os.path
import unittest

from os2datascanner.utils.metadata import guess_responsible_party
from os2datascanner.engine2.model.core import Handle, Source, SourceManager
from os2datascanner.engine2.model.file import (FilesystemHandle,
                                               FilesystemSource)
from os2datascanner.engine2.model.derived.libreoffice import (
    LibreOfficeObjectHandle, LibreOfficeSource)


# Folder containing this module, and the test data folder beneath it.
here_path = os.path.dirname(__file__)
test_data_path = os.path.join(here_path, "data")

# Handle for the HTML object inside the embedded-cpr.odt test document.
test_handle = LibreOfficeObjectHandle(
    LibreOfficeSource(
        FilesystemHandle(
            FilesystemSource(test_data_path),
            "libreoffice/embedded-cpr.odt")),
    "embedded-cpr.html")


class CountingProxy:
    """Stand-in for a handle that counts forwarded attribute lookups.

    Attribute lookups that the proxy cannot satisfy itself are forwarded
    to the wrapped object, and the number of such lookups is recorded per
    attribute name.
    """

    def __init__(self, real_handle):
        """Wrap real_handle and start with no recorded accesses."""
        self.__attr_accesses = {}
        self._real_handle = real_handle

    def __getattr__(self, attr):
        """Record one more access to attr, then fetch it from the wrapped
        object."""
        # Python only calls __getattr__ when normal lookup fails, so the
        # proxy's own attributes and methods are never counted.
        previous = self.get_attr_access_count(attr)
        self.__attr_accesses[attr] = previous + 1
        return getattr(self._real_handle, attr)

    def get_attr_access_count(self, attr):
        """Return how often attr has been forwarded (0 if never)."""
        counts = self.__attr_accesses
        return counts[attr] if attr in counts else 0
def from_json_object(obj):
    """Build a FilesystemSource from the "path" entry of obj."""
    path = obj["path"]
    return FilesystemSource(path=path)
def test_sources(self):
    """Check the URL generated by to_url() for a range of sources.

    Each source is paired with the URL it is expected to produce; every
    pair is checked in its own subTest so that one mismatch does not hide
    the others.
    """
    # The SMB URLs below carry their credentials in the authority part as
    # [domain;]user[:password]@host, the same layout as the smbc:// entry.
    # NOTE(review): the user/password portions of four smb:// expectations
    # were unreadable in the reviewed text and have been rebuilt from the
    # constructor arguments -- confirm against SMBSource.to_url().
    sources_and_urls = [
        (FilesystemSource("/usr"), "file:///usr"),
        (
            SMBSource("//10.0.0.30/Share$/Documents"),
            "smb://10.0.0.30/Share%24/Documents",
        ),
        (
            SMBSource("//10.0.0.30/Share$/Documents", "FaithfullA"),
            "smb://FaithfullA@10.0.0.30/Share%24/Documents",
        ),
        (
            SMBSource(
                "//10.0.0.30/Share$/Documents",
                "FaithfullA",
                "secretpassword",
            ),
            "smb://FaithfullA:secretpassword@10.0.0.30/Share%24/Documents",
        ),
        (
            SMBSource(
                "//10.0.0.30/Share$/Documents",
                "FaithfullA",
                "secretpassword",
                "SYSGRP",
            ),
            "smb://SYSGRP;FaithfullA:secretpassword@10.0.0.30/Share%24"
            "/Documents",
        ),
        (
            SMBSource(
                "//10.0.0.30/Share$/Documents",
                "FaithfullA",
                None,
                "SYSGRP",
            ),
            "smb://SYSGRP;FaithfullA@10.0.0.30/Share%24/Documents",
        ),
        (
            SMBCSource(
                "//INT-SRV-01/Q$",
                "FaithfullA",
                None,
                "SYSGRP",
            ),
            "smbc://SYSGRP;FaithfullA@INT-SRV-01/Q%24",
        ),
        (WebSource("http://www.example.com"), "http://www.example.com"),
        (
            SecureWebSource("https://www.example.com"),
            "https://www.example.com",
        ),
        (
            DataSource(b"This is a test", "text/plain"),
            "data:text/plain;base64,VGhpcyBpcyBhIHRlc3Q=",
        ),
    ]

    for source, url in sources_and_urls:
        with self.subTest(url):
            generated_url = source.to_url()
            self.assertEqual(url, generated_url)
def test_json_round_trip(self):
    """Check that handles survive a trip through their JSON form.

    Every example handle is serialised with to_json_object() and rebuilt
    with from_json_object(); the rebuilt handle must compare equal to the
    original. The handle and its serialised form are printed along the
    way.
    """
    example_handles = [
        FilesystemHandle(
            FilesystemSource("/usr/share/common-licenses"),
            "GPL-3"),
        DataHandle(DataSource(b"Test", "text/plain"), "file"),
        FilteredHandle(
            GzipSource(
                FilesystemHandle(
                    FilesystemSource("/usr/share/doc/coreutils"),
                    "changelog.Debian.gz")),
            "changelog.Debian"),
        SMBHandle(
            SMBSource("//SERVER/Resource", "username"),
            "~ocument.docx"),
        SMBCHandle(
            SMBCSource(
                "//SERVER/Resource",
                "username", "topsecret", "WORKGROUP8"),
            "~ocument.docx"),
        ZipHandle(
            ZipSource(
                SMBCHandle(
                    SMBCSource(
                        "//SERVER/Resource",
                        "username", driveletter="W"),
                    "Confidential Documents.zip")),
            "doc/Personal Information.docx"),
        WebHandle(
            WebSource("https://secret.data.invalid/"),
            "lottery-numbers-for-next-week.txt"),
        TarHandle(
            TarSource(
                FilesystemHandle(
                    FilesystemSource("/home/user"),
                    "Downloads/data.tar.gz")),
            "data0.txt"),
        MailPartHandle(
            MailSource(
                EWSMailHandle(
                    EWSAccountSource(
                        domain="cloudy.example",
                        server=CLOUD,
                        admin_user="******",
                        admin_password="******",
                        user="******"),
                    "SW5ib3hJRA==.TWVzc2dJRA==",
                    "Re: Castles in the sky")),
            "1/pictograph.jpeg",
            "image/jpeg"),
        PDFObjectHandle(
            PDFPageSource(
                PDFPageHandle(
                    PDFSource(
                        FilesystemHandle(
                            FilesystemSource("/home/kiddw"
                                             "/Documents"),
                            "1699 Gardiners trip/"
                            "treasure_map.pdf")),
                    "10")),
            "X-marks-the-spot_000-0.png"),
        LibreOfficeObjectHandle(
            LibreOfficeSource(
                FilesystemHandle(
                    FilesystemSource("/media/user/USB STICK"),
                    "What I Did On My Holidays.doc")),
            "What I Did On My Holidays.html"),
    ]

    for handle in example_handles:
        with self.subTest(handle):
            serialised = handle.to_json_object()
            print(handle)
            print(serialised)
            rebuilt = handle.from_json_object(serialised)
            self.assertEqual(handle, rebuilt)
            print("--")
time2 = "2020-10-28T14:36:20+01:00" scan_tag0 = { "scanner": "Dummy test scanner", "time": time0 } scan_tag1 = { "scanner": "Dummy test scanner", "time": time1 } scan_tag2 = { "scanner": "Dummy test scanner", "time": time2 } common_handle = FilesystemHandle( FilesystemSource("/mnt/fs01.magenta.dk/brugere/af"), "OS2datascanner/Dokumenter/Verdensherredømme - plan.txt") common_rule = RegexRule("Vores hemmelige adgangskode er", sensitivity=Sensitivity.WARNING) dimension_rule = DimensionsRule() common_scan_spec = messages.ScanSpecMessage( scan_tag=None, # placeholder source=common_handle.source, rule=common_rule, configuration={}, progress=None) positive_match = messages.MatchesMessage( scan_spec=common_scan_spec._replace(scan_tag=scan_tag0),
MSGraphFileHandle) from os2datascanner.engine2.model.derived.filtered import (GzipSource, FilteredHandle) from os2datascanner.engine2.model.derived.libreoffice import ( LibreOfficeSource, LibreOfficeObjectHandle) from os2datascanner.engine2.model.derived.mail import (MailSource, MailPartHandle) from os2datascanner.engine2.model.derived.pdf import (PDFSource, PDFPageHandle, PDFPageSource, PDFObjectHandle) from os2datascanner.engine2.model.derived.tar import TarSource, TarHandle from os2datascanner.engine2.model.derived.zip import ZipSource, ZipHandle example_handles = [ FilesystemHandle(FilesystemSource("/usr/share/common-licenses"), "GPL-3"), DataHandle(DataSource(b"Test", "text/plain"), "file"), FilteredHandle( GzipSource( FilesystemHandle(FilesystemSource("/usr/share/doc/coreutils"), "changelog.Debian.gz")), "changelog.Debian"), SMBHandle(SMBSource("//SERVER/Resource", "username"), "~ocument.docx"), SMBCHandle( SMBCSource("//SERVER/Resource", "username", "topsecret", "WORKGROUP8"), "~ocument.docx"), ZipHandle( ZipSource( SMBCHandle( SMBCSource("//SERVER/Resource", "username", driveletter="W"), "Confidential Documents.zip")), "doc/Personal Information.docx"),
Handle, SourceManager, UnknownSchemeError, DeserialisationError, ) # are we running from console? Then set __file__ try: __file__ except: __file__ = str(Path("./derived.py").resolve()) datadir = (Path(__file__).parents[1] / "data/files").resolve() fwd = datadir.absolute() testfile = FilesystemHandle(FilesystemSource(fwd), "test.txt") fs = FilesystemSource(fwd) fh = FilesystemHandle(fs, "cpr-test-single.zip") zs = ZipSource(fh) zh = ZipHandle(zs, "cpr-test-single.txt") zsm = ZipSource( FilesystemHandle(FilesystemSource(fwd), "cpr-test-multiple.zip")) zhm1 = ZipHandle(zsm, "cpr-test/cpr-test2.zip") zhm2 = ZipHandle(zsm, "cpr-test/cpr-test3.zip") zsmd1 = ZipSource(zhm1) zhmd1 = ZipHandle(zsmd1, "cpr2-test.txt") sm = SourceManager()
def test_relative_filesystemsource(self):
    """A FilesystemSource built from a "../"-relative path must raise
    ValueError."""
    self.assertRaises(
        ValueError,
        FilesystemSource,
        "../../projects/admin/tests/data/")
def test_relative_filesystemsource(self):
    """A FilesystemSource built from the relative path "data/" must raise
    ValueError."""
    self.assertRaises(ValueError, FilesystemSource, "data/")