Exemplo n.º 1
0
    def test_derived_source(self):
        # A Source derived from a handle must expose a non-None handle.
        with SourceManager() as sm:
            archive_handle = FilesystemHandle(
                FilesystemSource(test_data_path),
                "data/engine2/zip-here/test-vector.zip")
            derived = Source.from_handle(archive_handle)

            self.assertIsNotNone(
                derived.handle,
                "{0}: derived source has no handle".format(derived))
Exemplo n.º 2
0
 def test_corrupted_doc(self):
     # Exploring the truncated .doc test file is expected to yield no handles.
     doc_path = os.path.join(
         test_data_path, "msoffice/corrupted/test.trunc.doc")
     source = Source.from_handle(FilesystemHandle.make_handle(doc_path))
     with SourceManager() as sm:
         found = list(source.handles(sm))
         self.assertEqual(
             found, [],
             "unrecognised CDFV2 document should be empty and wasn't")
def get_different_filesystemhandle(file_ending, folder_level):
    """Return a FilesystemHandle whose relative path is randomly generated.

    The path starts with "/", followed by ``folder_level`` folder names of
    ten random lowercase ASCII letters each (every one terminated by "/"),
    then a single random lowercase letter and finally ``file_ending``.
    """
    letters = string.ascii_lowercase
    folders = []
    for _ in range(0, folder_level):
        folders.append(''.join(random.choice(letters) for _ in range(10)))
    path = '/' + ''.join(folder + '/' for folder in folders)

    source = FilesystemSource("/mnt/fs01.magenta.dk/brugere/af")
    relative_path = "{0}{1}{2}".format(
        path, random.choice(letters), file_ending)
    return FilesystemHandle(source, relative_path)
Exemplo n.º 4
0
 def test_alternative_trimming(self):
     # The mail source built from alternative.eml must expose exactly one
     # handle.
     eml_path = os.path.join(test_data_path, "alternative.eml")
     mail = MailSource(FilesystemHandle.make_handle(eml_path))
     with SourceManager() as sm:
         handle_count = len(list(mail.handles(sm)))
         self.assertEqual(handle_count, 1, "text/plain trimming failed")
Exemplo n.º 5
0
 def test_libreoffice_size(self):
     # Every handle whose name ends in ".html" must point at a resource
     # whose reported size is below 1048576 bytes.
     ods_path = os.path.join(
         test_data_path, "libreoffice/html-explosion.ods")
     source = Source.from_handle(FilesystemHandle.make_handle(ods_path))
     with SourceManager() as sm:
         for handle in source.handles(sm):
             if not handle.name.endswith(".html"):
                 continue
             resource = handle.follow(sm)
             self.assertLess(resource.get_size().value, 1048576,
                             "LibreOffice HTML output was too big")
Exemplo n.º 6
0
 def test_encrypted_zip(self):
     # An encrypted Zip file may yield no ZipHandles at all, and that is
     # acceptable -- but every handle it does yield has to be usable, so
     # follow each one and compute its type.
     zip_handle = FilesystemHandle(
             FilesystemSource(test_data_path), "encrypted-test-vector.zip")
     encrypted_file = ZipSource(zip_handle)
     with SourceManager() as sm:
         for member in encrypted_file.handles(sm):
             resource = member.follow(sm)
             resource.compute_type()
import os.path
import unittest

from os2datascanner.utils.metadata import guess_responsible_party
from os2datascanner.engine2.model.core import Handle, Source, SourceManager
from os2datascanner.engine2.model.file import (FilesystemHandle,
                                               FilesystemSource)
from os2datascanner.engine2.model.derived.libreoffice import (
    LibreOfficeObjectHandle, LibreOfficeSource)

# Directory containing this file, and the "data" directory beneath it.
here_path = os.path.dirname(__file__)
test_data_path = os.path.join(here_path, "data")
# Handle naming the "embedded-cpr.html" object inside the LibreOfficeSource
# built from data/libreoffice/embedded-cpr.odt.
test_handle = LibreOfficeObjectHandle(
    LibreOfficeSource(
        FilesystemHandle(FilesystemSource(test_data_path),
                         "libreoffice/embedded-cpr.odt")), "embedded-cpr.html")


class CountingProxy:
    """Proxy that forwards attribute lookups to a wrapped object and counts
    how many times each attribute name has been requested through it.

    Only lookups that are not satisfied by the proxy itself reach
    ``__getattr__``, so the proxy's own attributes and methods are never
    counted. A lookup is counted even if the wrapped object then raises
    AttributeError for it.
    """

    def __init__(self, real_handle):
        # Maps attribute name -> number of forwarded lookups so far.
        self.__attr_accesses = {}
        self._real_handle = real_handle

    def __getattr__(self, attr):
        """Count the lookup of ``attr`` and forward it to the wrapped object.

        Raises AttributeError if the wrapped object lacks ``attr``, or if
        this proxy has not been initialised yet.
        """
        # __getattr__ only runs after normal lookup has failed. If the
        # proxy's own state is missing (for example on an instance created
        # with __new__ by copy or pickle), touching self.__attr_accesses or
        # self._real_handle below would re-enter this method and recurse
        # until RecursionError. Fail cleanly with AttributeError instead.
        state = self.__dict__
        if ("_real_handle" not in state
                or "_CountingProxy__attr_accesses" not in state):
            raise AttributeError(attr)
        self.__attr_accesses[attr] = self.get_attr_access_count(attr) + 1
        return getattr(self._real_handle, attr)

    def get_attr_access_count(self, attr):
        """Return how many times ``attr`` has been forwarded (0 if never)."""
        return self.__attr_accesses.get(attr, 0)

Exemplo n.º 8
0
 def test_pdf_source(self):
     # Run the rule against a PDFSource wrapping pdf/embedded-cpr.pdf.
     pdf_path = os.path.join(test_data_path, "pdf/embedded-cpr.pdf")
     source = PDFSource(FilesystemHandle.make_handle(pdf_path))
     self.run_rule(source)
Exemplo n.º 9
0
 def test_libreoffice_source(self):
     # Run the rule against a LibreOfficeSource wrapping
     # libreoffice/embedded-cpr.odt.
     odt_path = os.path.join(test_data_path, "libreoffice/embedded-cpr.odt")
     source = LibreOfficeSource(FilesystemHandle.make_handle(odt_path))
     self.run_rule(source)
Exemplo n.º 10
0
import os.path
import unittest

from os2datascanner.engine2.model.core import SourceManager
from os2datascanner.engine2.model.file import FilesystemHandle
from os2datascanner.engine2.conversions.types import OutputType
from os2datascanner.engine2.conversions.registry import convert


# Directory containing this file; the test data lives in "data" beneath it.
here_path = os.path.dirname(__file__)
# Handle for the PNG test file data/ocr/cpr.png.
image_handle = FilesystemHandle.make_handle(
        os.path.join(here_path, "data/ocr/cpr.png"))
# Handle for the HTML test file data/html/simple.html.
html_handle = FilesystemHandle.make_handle(
        os.path.join(here_path, "data/html/simple.html"))



class Engine2ConversionTest(unittest.TestCase):
    # Each test gets a fresh SourceManager plus the resources obtained by
    # following the module-level image and HTML handles through it.

    def setUp(self):
        manager = SourceManager()
        self._sm = manager

        self._ir = image_handle.follow(manager)
        self._hr = html_handle.follow(manager)

    def tearDown(self):
        # Clear the SourceManager created in setUp.
        self._sm.clear()

    def test_last_modified(self):
        # Converting the image resource to LastModified must give a result
        # whose value is not None.
        converted = convert(self._ir, OutputType.LastModified)
        self.assertIsNotNone(converted.value)
Exemplo n.º 11
0
    def test_json_round_trip(self):
        # Serialise one example handle of each kind with to_json_object() and
        # check that from_json_object() rebuilds a handle equal to the
        # original. None of the paths, servers or credentials below are
        # opened here; the handles are only constructed, serialised and
        # compared.
        example_handles = [
            FilesystemHandle(FilesystemSource("/usr/share/common-licenses"),
                             "GPL-3"),
            DataHandle(DataSource(b"Test", "text/plain"), "file"),
            FilteredHandle(
                GzipSource(
                    FilesystemHandle(
                        FilesystemSource("/usr/share/doc/coreutils"),
                        "changelog.Debian.gz")), "changelog.Debian"),
            SMBHandle(SMBSource("//SERVER/Resource", "username"),
                      "~ocument.docx"),
            SMBCHandle(
                SMBCSource("//SERVER/Resource", "username", "topsecret",
                           "WORKGROUP8"), "~ocument.docx"),
            ZipHandle(
                ZipSource(
                    SMBCHandle(
                        SMBCSource("//SERVER/Resource",
                                   "username",
                                   driveletter="W"),
                        "Confidential Documents.zip")),
                "doc/Personal Information.docx"),
            WebHandle(WebSource("https://secret.data.invalid/"),
                      "lottery-numbers-for-next-week.txt"),
            TarHandle(
                TarSource(
                    FilesystemHandle(FilesystemSource("/home/user"),
                                     "Downloads/data.tar.gz")), "data0.txt"),
            MailPartHandle(
                MailSource(
                    EWSMailHandle(
                        EWSAccountSource(domain="cloudy.example",
                                         server=CLOUD,
                                         admin_user="******",
                                         admin_password="******",
                                         user="******"),
                        "SW5ib3hJRA==.TWVzc2dJRA==",
                        "Re: Castles in the sky")), "1/pictograph.jpeg",
                "image/jpeg"),
            PDFObjectHandle(
                PDFPageSource(
                    PDFPageHandle(
                        PDFSource(
                            FilesystemHandle(
                                FilesystemSource("/home/kiddw"
                                                 "/Documents"),
                                "1699 Gardiners trip/"
                                "treasure_map.pdf")), "10")),
                "X-marks-the-spot_000-0.png"),
            LibreOfficeObjectHandle(
                LibreOfficeSource(
                    FilesystemHandle(FilesystemSource("/media/user/USB STICK"),
                                     "What I Did On My Holidays.doc")),
                "What I Did On My Holidays.html")
        ]

        # One subTest per handle, so a failure for one handle does not stop
        # the remaining handles from being checked.
        for handle in example_handles:
            with self.subTest(handle):
                json = handle.to_json_object()
                # Print the handle and its JSON form alongside the check.
                print(handle)
                print(json)
                self.assertEqual(handle, handle.from_json_object(json))
                print("--")
Exemplo n.º 12
0
                                                        MSGraphFileHandle)

from os2datascanner.engine2.model.derived.filtered import (GzipSource,
                                                           FilteredHandle)
from os2datascanner.engine2.model.derived.libreoffice import (
    LibreOfficeSource, LibreOfficeObjectHandle)
from os2datascanner.engine2.model.derived.mail import (MailSource,
                                                       MailPartHandle)
from os2datascanner.engine2.model.derived.pdf import (PDFSource, PDFPageHandle,
                                                      PDFPageSource,
                                                      PDFObjectHandle)
from os2datascanner.engine2.model.derived.tar import TarSource, TarHandle
from os2datascanner.engine2.model.derived.zip import ZipSource, ZipHandle

example_handles = [
    FilesystemHandle(FilesystemSource("/usr/share/common-licenses"), "GPL-3"),
    DataHandle(DataSource(b"Test", "text/plain"), "file"),
    FilteredHandle(
        GzipSource(
            FilesystemHandle(FilesystemSource("/usr/share/doc/coreutils"),
                             "changelog.Debian.gz")), "changelog.Debian"),
    SMBHandle(SMBSource("//SERVER/Resource", "username"), "~ocument.docx"),
    SMBCHandle(
        SMBCSource("//SERVER/Resource", "username", "topsecret", "WORKGROUP8"),
        "~ocument.docx"),
    ZipHandle(
        ZipSource(
            SMBCHandle(
                SMBCSource("//SERVER/Resource", "username", driveletter="W"),
                "Confidential Documents.zip")),
        "doc/Personal Information.docx"),
Exemplo n.º 13
0
import os.path
import unittest

from os2datascanner.engine2.model.core import SourceManager
from os2datascanner.engine2.model.file import FilesystemHandle


# Directory containing this file; the test data lives in "data" beneath it.
here_path = os.path.dirname(__file__)
# Handle for the test file data/msoffice/test.doc.
doc_handle = FilesystemHandle.make_handle(
        os.path.join(
                here_path, "data", "msoffice", "test.doc"))
# Handle for the test file data/msoffice/test.docx.
docx_handle = FilesystemHandle.make_handle(
        os.path.join(
                here_path, "data", "msoffice", "test.docx"))


class Engine2MIMETests(unittest.TestCase):
    def test_doc_mime(self):
        # Both the type guessed from the handle and the type computed from
        # the followed resource must be application/msword.
        expected = "application/msword"
        guessed = doc_handle.guess_type()
        self.assertEqual(guessed, expected, ".doc MIME guess is incorrect")
        with SourceManager() as sm:
            computed = doc_handle.follow(sm).compute_type()
            self.assertEqual(
                    computed, expected, ".doc MIME computation is incorrect")

    def test_docx_mime(self):
        self.assertEqual(
                docx_handle.guess_type(),
Exemplo n.º 14
0
 def handle(self, **kwargs):
     # For every path listed under the 'FILE' keyword argument, print the
     # path together with the result of guess_responsible_party() for it.
     with SourceManager() as sm:
         for path in kwargs['FILE']:
             file_handle = FilesystemHandle.make_handle(path)
             guesses = guess_responsible_party(file_handle, sm)
             print("{0}: {1}".format(path, guesses))
Exemplo n.º 15
0
    Handle,
    SourceManager,
    UnknownSchemeError,
    DeserialisationError,
)

# When this code is run from an interactive console, __file__ is not defined;
# in that case point it at "derived.py" in the current working directory.
# Only NameError is caught, so KeyboardInterrupt/SystemExit are not swallowed.
try:
    __file__
except NameError:
    __file__ = str(Path("./derived.py").resolve())

# Test data directory: "data/files" under the parent of this file's directory.
datadir = (Path(__file__).parents[1] / "data/files").resolve()
fwd = datadir.absolute()

# Plain file in the data directory.
testfile = FilesystemHandle(FilesystemSource(fwd), "test.txt")

# Zip source built on cpr-test-single.zip, and a handle naming
# cpr-test-single.txt inside it.
fs = FilesystemSource(fwd)
fh = FilesystemHandle(fs, "cpr-test-single.zip")
zs = ZipSource(fh)
zh = ZipHandle(zs, "cpr-test-single.txt")

# Zip source built on cpr-test-multiple.zip, with handles naming two zip
# files under cpr-test/ inside it.
zsm = ZipSource(
    FilesystemHandle(FilesystemSource(fwd), "cpr-test-multiple.zip"))
zhm1 = ZipHandle(zsm, "cpr-test/cpr-test2.zip")
zhm2 = ZipHandle(zsm, "cpr-test/cpr-test3.zip")

# Zip source derived from the nested zip handle zhm1, and a handle naming
# cpr2-test.txt inside it.
zsmd1 = ZipSource(zhm1)
zhmd1 = ZipHandle(zsmd1, "cpr2-test.txt")

sm = SourceManager()
Exemplo n.º 16
0
 def test_xlsx(self):
     # Run the rule on the msoffice/test.xlsx test file.
     xlsx_path = os.path.join(test_data_path, "msoffice/test.xlsx")
     self.run_rule_on_handle(FilesystemHandle.make_handle(xlsx_path))
Exemplo n.º 17
0
 def test_ods(self):
     # Run the rule on the libreoffice/test.ods test file.
     ods_path = os.path.join(test_data_path, "libreoffice/test.ods")
     self.run_rule_on_handle(FilesystemHandle.make_handle(ods_path))
Exemplo n.º 18
0
 def test_docx(self):
     # Run the rule on the msoffice/embedded-cpr.docx test file.
     docx_path = os.path.join(test_data_path, "msoffice/embedded-cpr.docx")
     self.run_rule_on_handle(FilesystemHandle.make_handle(docx_path))
Exemplo n.º 19
0
 def test_odt(self):
     # Run the rule on the libreoffice/embedded-cpr.odt test file.
     odt_path = os.path.join(test_data_path, "libreoffice/embedded-cpr.odt")
     self.run_rule_on_handle(FilesystemHandle.make_handle(odt_path))
Exemplo n.º 20
0
# Base directory for the data files: the directory of this file, or the
# current working directory when __file__ is not defined (e.g. when the code
# is pasted into an interactive session). Only NameError is caught so that
# KeyboardInterrupt/SystemExit and genuine errors are not swallowed.
try:
    cwd = Path(__file__).parent.absolute()
except NameError:
    cwd = Path().absolute()
fpath = cwd / '../data/files/document.docx'
#fpath = cwd / '../data/files/cpr-examples.odt'

# Developer toggle: set to True to force the content to be loaded again even
# if a previous run in the same session already defined `content`.
reload_content = False
try:
    content
except NameError:
    # No content loaded yet in this session, so it has to be loaded.
    reload_content = True

if reload_content:
    h = FilesystemHandle.make_handle(fpath)
    content = get_content_from_handle(h)

# newrule = CPRRule(modulus_11=True, ignore_irrelevant=False,
#                   examine_context=True)
# newrule.extract_surrounding_words = MethodType(extract_surrounding_words_fixed, newrule)

rules = [
    (CPRSimple(modulus_11=True, ignore_irrelevant=False,
               examine_context=True), "simple w. context"),
    (CPRComplicated(modulus_11=True,
                    ignore_irrelevant=False,
                    examine_context=True), "'accepted' w. context"),
    (CPRRule(modulus_11=True, ignore_irrelevant=False,
             examine_context=True), "current w. context"),
    # (CPRRule(modulus_11=True, ignore_irrelevant=False, examine_context=True),
time2 = "2020-10-28T14:36:20+01:00"
# Three scan tags that share a scanner name and differ only in their time
# (time0 and time1 are defined earlier in the file).
scan_tag0 = {
    "scanner": "Dummy test scanner",
    "time": time0
}
scan_tag1 = {
    "scanner": "Dummy test scanner",
    "time": time1
}
scan_tag2 = {
    "scanner": "Dummy test scanner",
    "time": time2
}

# Handle and rules shared by the fixtures that follow.
common_handle = FilesystemHandle(
        FilesystemSource("/mnt/fs01.magenta.dk/brugere/af"),
        "OS2datascanner/Dokumenter/Verdensherredømme - plan.txt")
common_rule = RegexRule("Vores hemmelige adgangskode er",
                        sensitivity=Sensitivity.WARNING)
dimension_rule = DimensionsRule()


# Template scan specification; scan_tag is left as None here and filled in
# through _replace(scan_tag=...) where the spec is used.
common_scan_spec = messages.ScanSpecMessage(
        scan_tag=None, # placeholder
        source=common_handle.source,
        rule=common_rule,
        configuration={},
        progress=None)

positive_match = messages.MatchesMessage(
        scan_spec=common_scan_spec._replace(scan_tag=scan_tag0),