def test_derived_source(self):
    # A Source built from a Handle must expose that Handle again through its
    # "handle" attribute.
    with SourceManager():
        root = FilesystemSource(test_data_path)
        zip_handle = FilesystemHandle(
            root, "data/engine2/zip-here/test-vector.zip")
        derived = Source.from_handle(zip_handle)
        derived_handle = derived.handle
        message = "{0}: derived source has no handle".format(derived)
        self.assertIsNotNone(derived_handle, message)
def test_corrupted_doc(self):
    # Exploring the corrupted test document must not yield any handles.
    doc_path = os.path.join(
        test_data_path, "msoffice/corrupted/test.trunc.doc")
    corrupted_doc_handle = FilesystemHandle.make_handle(doc_path)
    corrupted_doc = Source.from_handle(corrupted_doc_handle)
    with SourceManager() as sm:
        found = list(corrupted_doc.handles(sm))
        self.assertEqual(
            found,
            [],
            "unrecognised CDFV2 document should be empty and wasn't")
def get_different_filesystemhandle(file_ending, folder_level):
    """Return a FilesystemHandle with a randomly generated relative path.

    The path starts with "/" and contains folder_level directory names of ten
    random lowercase ASCII letters each, followed by a file name consisting
    of one random lowercase ASCII letter and file_ending. The handle always
    belongs to the same hard-coded FilesystemSource.
    """
    letters = string.ascii_lowercase
    folders = [
        ''.join(random.choice(letters) for _ in range(10))
        for _ in range(folder_level)
    ]
    path = '/' + ''.join(folder + '/' for folder in folders)
    first_letter = random.choice(letters)
    source = FilesystemSource("/mnt/fs01.magenta.dk/brugere/af")
    return FilesystemHandle(
        source, "{0}{1}{2}".format(path, first_letter, file_ending))
def test_alternative_trimming(self):
    # Exploring alternative.eml as a MailSource must give exactly one handle.
    eml_path = os.path.join(test_data_path, "alternative.eml")
    alternative_source = MailSource(FilesystemHandle.make_handle(eml_path))
    with SourceManager() as sm:
        handle_count = len(list(alternative_source.handles(sm)))
        self.assertEqual(handle_count, 1, "text/plain trimming failed")
def test_libreoffice_size(self):
    # Every handle whose name ends in ".html" must follow to a resource whose
    # reported size value is below 1048576.
    ods_path = os.path.join(test_data_path, "libreoffice/html-explosion.ods")
    large_doc_handle = FilesystemHandle.make_handle(ods_path)
    large_doc = Source.from_handle(large_doc_handle)
    with SourceManager() as sm:
        for handle in large_doc.handles(sm):
            if not handle.name.endswith(".html"):
                continue
            resource = handle.follow(sm)
            self.assertLess(
                resource.get_size().value,
                1048576,
                "LibreOffice HTML output was too big")
def test_encrypted_zip(self):
    # Every ZipHandle that an encrypted Zip file gives us has to be usable:
    # following it and computing its type must not fail. Getting no handles
    # at all is acceptable; getting broken ones is not.
    zip_handle = FilesystemHandle(
        FilesystemSource(test_data_path), "encrypted-test-vector.zip")
    encrypted_file = ZipSource(zip_handle)
    with SourceManager() as sm:
        for entry in encrypted_file.handles(sm):
            resource = entry.follow(sm)
            resource.compute_type()
import os.path
import unittest

from os2datascanner.utils.metadata import guess_responsible_party
from os2datascanner.engine2.model.core import Handle, Source, SourceManager
from os2datascanner.engine2.model.file import (FilesystemHandle,
                                               FilesystemSource)
from os2datascanner.engine2.model.derived.libreoffice import (
    LibreOfficeObjectHandle, LibreOfficeSource)


here_path = os.path.dirname(__file__)
test_data_path = os.path.join(here_path, "data")


test_handle = LibreOfficeObjectHandle(
    LibreOfficeSource(
        FilesystemHandle(FilesystemSource(test_data_path),
                         "libreoffice/embedded-cpr.odt")),
    "embedded-cpr.html")


class CountingProxy:
    """Forward attribute lookups to a wrapped object and count them.

    Any attribute that is not found on the proxy itself is fetched from the
    wrapped object, and the number of such lookups is recorded per attribute
    name.
    """

    def __init__(self, real_handle):
        """Wrap real_handle and start with no recorded accesses."""
        self.__attr_accesses = {}
        self._real_handle = real_handle

    def __getattr__(self, attr):
        """Record one access to attr, then return it from the wrapped object.

        Python only calls this when normal lookup on the proxy fails. The
        access is counted before forwarding, so a lookup that ends in
        AttributeError on the wrapped object is counted as well.
        """
        # The proxy's own state is normally found by regular lookup. If it is
        # missing (for instance on an instance created without __init__),
        # touching it below would re-enter this method forever, so fail with
        # the ordinary AttributeError instead.
        if attr in ("_real_handle", "_CountingProxy__attr_accesses"):
            raise AttributeError(attr)
        self.__attr_accesses[attr] = self.get_attr_access_count(attr) + 1
        return getattr(self._real_handle, attr)

    def get_attr_access_count(self, attr):
        """Return how often attr was forwarded so far (0 if never)."""
        return self.__attr_accesses.get(attr, 0)
def test_pdf_source(self):
    # Run the rule against a PDFSource wrapping pdf/embedded-cpr.pdf.
    pdf_path = os.path.join(test_data_path, "pdf/embedded-cpr.pdf")
    pdf_handle = FilesystemHandle.make_handle(pdf_path)
    self.run_rule(PDFSource(pdf_handle))
def test_libreoffice_source(self):
    # Run the rule against a LibreOfficeSource wrapping
    # libreoffice/embedded-cpr.odt.
    odt_path = os.path.join(test_data_path, "libreoffice/embedded-cpr.odt")
    odt_handle = FilesystemHandle.make_handle(odt_path)
    self.run_rule(LibreOfficeSource(odt_handle))
import os.path
import unittest

from os2datascanner.engine2.model.core import SourceManager
from os2datascanner.engine2.model.file import FilesystemHandle
from os2datascanner.engine2.conversions.types import OutputType
from os2datascanner.engine2.conversions.registry import convert


here_path = os.path.dirname(__file__)

# Handles for the two fixture files used by the conversion tests.
image_handle = FilesystemHandle.make_handle(
    os.path.join(here_path, "data/ocr/cpr.png"))
html_handle = FilesystemHandle.make_handle(
    os.path.join(here_path, "data/html/simple.html"))


class Engine2ConversionTest(unittest.TestCase):
    def setUp(self):
        # Each test gets its own SourceManager plus the followed image and
        # HTML fixture handles.
        manager = SourceManager()
        self._sm = manager
        self._ir = image_handle.follow(manager)
        self._hr = html_handle.follow(manager)

    def tearDown(self):
        # Clear the per-test SourceManager created in setUp.
        self._sm.clear()

    def test_last_modified(self):
        # Converting the image resource to LastModified must give a value.
        last_modified = convert(self._ir, OutputType.LastModified)
        self.assertIsNotNone(last_modified.value)
def test_json_round_trip(self):
    # Each example handle is converted to its JSON object form and back
    # again; the result has to compare equal to the handle we started with.
    licence_file = FilesystemHandle(
        FilesystemSource("/usr/share/common-licenses"), "GPL-3")
    inline_data = DataHandle(DataSource(b"Test", "text/plain"), "file")
    gzipped_changelog = FilteredHandle(
        GzipSource(
            FilesystemHandle(
                FilesystemSource("/usr/share/doc/coreutils"),
                "changelog.Debian.gz")),
        "changelog.Debian")
    smb_document = SMBHandle(
        SMBSource("//SERVER/Resource", "username"), "~ocument.docx")
    smbc_document = SMBCHandle(
        SMBCSource(
            "//SERVER/Resource", "username", "topsecret", "WORKGROUP8"),
        "~ocument.docx")
    zipped_document = ZipHandle(
        ZipSource(
            SMBCHandle(
                SMBCSource(
                    "//SERVER/Resource", "username", driveletter="W"),
                "Confidential Documents.zip")),
        "doc/Personal Information.docx")
    web_file = WebHandle(
        WebSource("https://secret.data.invalid/"),
        "lottery-numbers-for-next-week.txt")
    tar_member = TarHandle(
        TarSource(
            FilesystemHandle(
                FilesystemSource("/home/user"), "Downloads/data.tar.gz")),
        "data0.txt")
    mail_part = MailPartHandle(
        MailSource(
            EWSMailHandle(
                EWSAccountSource(
                    domain="cloudy.example",
                    server=CLOUD,
                    admin_user="******",
                    admin_password="******",
                    user="******"),
                "SW5ib3hJRA==.TWVzc2dJRA==",
                "Re: Castles in the sky")),
        "1/pictograph.jpeg",
        "image/jpeg")
    pdf_object = PDFObjectHandle(
        PDFPageSource(
            PDFPageHandle(
                PDFSource(
                    FilesystemHandle(
                        FilesystemSource("/home/kiddw" "/Documents"),
                        "1699 Gardiners trip/" "treasure_map.pdf")),
                "10")),
        "X-marks-the-spot_000-0.png")
    office_object = LibreOfficeObjectHandle(
        LibreOfficeSource(
            FilesystemHandle(
                FilesystemSource("/media/user/USB STICK"),
                "What I Did On My Holidays.doc")),
        "What I Did On My Holidays.html")

    example_handles = [
        licence_file,
        inline_data,
        gzipped_changelog,
        smb_document,
        smbc_document,
        zipped_document,
        web_file,
        tar_member,
        mail_part,
        pdf_object,
        office_object,
    ]

    for handle in example_handles:
        with self.subTest(handle):
            serialised = handle.to_json_object()
            print(handle)
            print(serialised)
            restored = handle.from_json_object(serialised)
            self.assertEqual(handle, restored)
            print("--")
MSGraphFileHandle) from os2datascanner.engine2.model.derived.filtered import (GzipSource, FilteredHandle) from os2datascanner.engine2.model.derived.libreoffice import ( LibreOfficeSource, LibreOfficeObjectHandle) from os2datascanner.engine2.model.derived.mail import (MailSource, MailPartHandle) from os2datascanner.engine2.model.derived.pdf import (PDFSource, PDFPageHandle, PDFPageSource, PDFObjectHandle) from os2datascanner.engine2.model.derived.tar import TarSource, TarHandle from os2datascanner.engine2.model.derived.zip import ZipSource, ZipHandle example_handles = [ FilesystemHandle(FilesystemSource("/usr/share/common-licenses"), "GPL-3"), DataHandle(DataSource(b"Test", "text/plain"), "file"), FilteredHandle( GzipSource( FilesystemHandle(FilesystemSource("/usr/share/doc/coreutils"), "changelog.Debian.gz")), "changelog.Debian"), SMBHandle(SMBSource("//SERVER/Resource", "username"), "~ocument.docx"), SMBCHandle( SMBCSource("//SERVER/Resource", "username", "topsecret", "WORKGROUP8"), "~ocument.docx"), ZipHandle( ZipSource( SMBCHandle( SMBCSource("//SERVER/Resource", "username", driveletter="W"), "Confidential Documents.zip")), "doc/Personal Information.docx"),
import os.path import unittest from os2datascanner.engine2.model.core import SourceManager from os2datascanner.engine2.model.file import FilesystemHandle here_path = os.path.dirname(__file__) doc_handle = FilesystemHandle.make_handle( os.path.join( here_path, "data", "msoffice", "test.doc")) docx_handle = FilesystemHandle.make_handle( os.path.join( here_path, "data", "msoffice", "test.docx")) class Engine2MIMETests(unittest.TestCase): def test_doc_mime(self): self.assertEqual( doc_handle.guess_type(), "application/msword", ".doc MIME guess is incorrect") with SourceManager() as sm: self.assertEqual( doc_handle.follow(sm).compute_type(), "application/msword", ".doc MIME computation is incorrect") def test_docx_mime(self): self.assertEqual( docx_handle.guess_type(),
def handle(self, **kwargs):
    # For every path in kwargs['FILE'], print the path together with the
    # result of guess_responsible_party for that file.
    with SourceManager() as sm:
        for path in kwargs['FILE']:
            file_handle = FilesystemHandle.make_handle(path)
            guesses = guess_responsible_party(file_handle, sm)
            print("{0}: {1}".format(path, guesses))
Handle, SourceManager, UnknownSchemeError, DeserialisationError, ) # are we running from console? Then set __file__ try: __file__ except: __file__ = str(Path("./derived.py").resolve()) datadir = (Path(__file__).parents[1] / "data/files").resolve() fwd = datadir.absolute() testfile = FilesystemHandle(FilesystemSource(fwd), "test.txt") fs = FilesystemSource(fwd) fh = FilesystemHandle(fs, "cpr-test-single.zip") zs = ZipSource(fh) zh = ZipHandle(zs, "cpr-test-single.txt") zsm = ZipSource( FilesystemHandle(FilesystemSource(fwd), "cpr-test-multiple.zip")) zhm1 = ZipHandle(zsm, "cpr-test/cpr-test2.zip") zhm2 = ZipHandle(zsm, "cpr-test/cpr-test3.zip") zsmd1 = ZipSource(zhm1) zhmd1 = ZipHandle(zsmd1, "cpr2-test.txt") sm = SourceManager()
def test_xlsx(self):
    # Run the rule on the handle for msoffice/test.xlsx.
    xlsx_path = os.path.join(test_data_path, "msoffice/test.xlsx")
    xlsx_handle = FilesystemHandle.make_handle(xlsx_path)
    self.run_rule_on_handle(xlsx_handle)
def test_ods(self):
    # Run the rule on the handle for libreoffice/test.ods.
    ods_path = os.path.join(test_data_path, "libreoffice/test.ods")
    ods_handle = FilesystemHandle.make_handle(ods_path)
    self.run_rule_on_handle(ods_handle)
def test_docx(self):
    # Run the rule on the handle for msoffice/embedded-cpr.docx.
    docx_path = os.path.join(test_data_path, "msoffice/embedded-cpr.docx")
    docx_handle = FilesystemHandle.make_handle(docx_path)
    self.run_rule_on_handle(docx_handle)
def test_odt(self):
    # Run the rule on the handle for libreoffice/embedded-cpr.odt.
    odt_path = os.path.join(test_data_path, "libreoffice/embedded-cpr.odt")
    odt_handle = FilesystemHandle.make_handle(odt_path)
    self.run_rule_on_handle(odt_handle)
try: cwd = Path(__file__).parent.absolute() except: cwd = Path().absolute() fpath = cwd / '../data/files/document.docx' #fpath = cwd / '../data/files/cpr-examples.odt' reload_content = True reload_content = False try: content except: reload_content = True if reload_content: h = FilesystemHandle.make_handle(fpath) content = get_content_from_handle(h) # newrule = CPRRule(modulus_11=True, ignore_irrelevant=False, # examine_context=True) # newrule.extract_surrounding_words = MethodType(extract_surrounding_words_fixed, newrule) rules = [ (CPRSimple(modulus_11=True, ignore_irrelevant=False, examine_context=True), "simple w. context"), (CPRComplicated(modulus_11=True, ignore_irrelevant=False, examine_context=True), "'accepted' w. context"), (CPRRule(modulus_11=True, ignore_irrelevant=False, examine_context=True), "current w. context"), # (CPRRule(modulus_11=True, ignore_irrelevant=False, examine_context=True),
time2 = "2020-10-28T14:36:20+01:00" scan_tag0 = { "scanner": "Dummy test scanner", "time": time0 } scan_tag1 = { "scanner": "Dummy test scanner", "time": time1 } scan_tag2 = { "scanner": "Dummy test scanner", "time": time2 } common_handle = FilesystemHandle( FilesystemSource("/mnt/fs01.magenta.dk/brugere/af"), "OS2datascanner/Dokumenter/Verdensherredømme - plan.txt") common_rule = RegexRule("Vores hemmelige adgangskode er", sensitivity=Sensitivity.WARNING) dimension_rule = DimensionsRule() common_scan_spec = messages.ScanSpecMessage( scan_tag=None, # placeholder source=common_handle.source, rule=common_rule, configuration={}, progress=None) positive_match = messages.MatchesMessage( scan_spec=common_scan_spec._replace(scan_tag=scan_tag0),