class Engine2ConversionTest(unittest.TestCase):
    """Run convert() against an image resource and an HTML resource."""

    def setUp(self):
        # Both resources are followed through the same SourceManager, which
        # tearDown() clears again after each test.
        self._sm = SourceManager()
        manager = self._sm
        self._ir = image_handle.follow(manager)
        self._hr = html_handle.follow(manager)

    def tearDown(self):
        self._sm.clear()

    def test_last_modified(self):
        """The image resource has a LastModified value that is not None."""
        converted = convert(self._ir, OutputType.LastModified)
        self.assertIsNotNone(converted.value)

    def test_image_dimensions(self):
        """The image resource reports dimensions of 896 by 896."""
        converted = convert(self._ir, OutputType.ImageDimensions)
        self.assertEqual(converted.value, (896, 896))

    def test_fallback(self):
        """The Fallback conversion of the image resource equals True."""
        converted = convert(self._ir, OutputType.Fallback)
        self.assertEqual(converted.value, True)

    def test_dummy(self):
        """Requesting the Dummy output type raises KeyError."""
        with self.assertRaises(KeyError):
            convert(self._ir, OutputType.Dummy)

    def test_html(self):
        """The text of the HTML resource contains the known sentence."""
        converted = convert(self._hr, OutputType.Text)
        self.assertIn("This is only a test.", converted.value)
def handle_message(body, channel):
    """Dispatch a raw message to the stage that consumes *channel*.

    This is a generator: it yields whatever the selected stage's
    message_received_raw() yields. A channel that matches none of the
    names below yields nothing.
    """
    if channel == "os2ds_scan_specs":
        with SourceManager() as sm:
            yield from explorer.message_received_raw(
                body, channel, sm,
                "os2ds_conversions", "os2ds_problems", None)
        return

    if channel == "os2ds_conversions":
        with SourceManager() as sm:
            yield from processor.message_received_raw(
                body, channel, sm,
                "os2ds_representations", "os2ds_scan_specs",
                ["os2ds_problems"])
        return

    if channel == "os2ds_representations":
        # The matcher is the only stage called without a SourceManager.
        yield from matcher.message_received_raw(
            body, channel,
            ["os2ds_matches"], "os2ds_handles", "os2ds_conversions")
        return

    if channel == "os2ds_handles":
        with SourceManager() as sm:
            yield from tagger.message_received_raw(
                body, channel, sm,
                "os2ds_metadata", "os2ds_problems")
        return

    exporter_channels = ("os2ds_matches", "os2ds_metadata", "os2ds_problems")
    if channel in exporter_channels:
        yield from exporter.message_received_raw(
            body, channel, False, "os2ds_results")
def test_exploration_index(self):
    """indexed_mapped_site must expose exactly 6 handles."""
    with SourceManager() as sm:
        found = sum(1 for _ in indexed_mapped_site.handles(sm))
    self.assertEqual(
        found,
        6,
        "embedded site with sitemap index should have 6 handles")
def test_exploration_data_sitemap(self):
    """embedded_mapped_site must expose exactly 4 handles."""
    with SourceManager() as sm:
        found = sum(1 for _ in embedded_mapped_site.handles(sm))
    self.assertEqual(
        found,
        4,
        "embedded site with data: sitemap should have 4 handles")
def test_generator_exception(self):
    """A BrokenSource that fails to open must not be left in the manager."""
    broken = BrokenSource()
    with SourceManager() as sm:
        with self.assertRaises(ValueError):
            sm.open(broken)
        still_registered = broken in sm
        self.assertFalse(
            still_registered,
            "_generate_state failed, but Source still open")
def test_odt_extraction(self):
    """Check the "od-modifier" metadata and that follow() was never accessed."""
    with SourceManager() as sm:
        metadata = guess_responsible_party(self.handle_proxy, sm)
        modifier = metadata["od-modifier"]
        self.assertEqual(
            modifier,
            "Alexander John Faithfull",
            "metadata extraction failed")
        follow_accesses = self.handle_proxy.get_attr_access_count("follow")
        self.assertEqual(
            follow_accesses,
            0,
            "metadata extraction from synthetic file attempted")
def test_generator_exception2(self):
    """Opening a BrokenSource raises ValueError on both attempts."""
    broken = BrokenSource()
    with SourceManager() as sm:
        # The second attempt must fail in the same way as the first.
        for _attempt in range(2):
            with self.assertRaises(ValueError):
                sm.open(broken)
def test_missing_headers(self):
    """Check the values a resource reports once its headers are removed.

    Follows the first handle of magenta, deletes the "content-type" and
    OutputType.LastModified entries from its unpacked header, and then
    expects compute_type() to be "application/octet-stream" and
    get_last_modified() to be no earlier than the time recorded just
    before the deletion.
    """
    with SourceManager() as sm:
        # Only the first handle is needed, so close the generator afterwards.
        with contextlib.closing(magenta.handles(sm)) as handles:
            first_thing = next(handles)
        resource = first_thing.follow(sm)

        reference_time = datetime.now()

        # It is not documented anywhere that WebResource.get_header()
        # returns a live dictionary, so don't depend on this behaviour
        header = resource.unpack_header()
        unwanted = ("content-type", OutputType.LastModified)
        for name in unwanted:
            if name in header:
                del header[name]

        mime_type = resource.compute_type()
        self.assertEqual(
            mime_type,
            "application/octet-stream",
            "{0}: unexpected backup MIME type".format(first_thing))

        last_modified = resource.get_last_modified().value
        self.assertGreaterEqual(
            last_modified,
            reference_time,
            "{0}: Last-Modified not fresh".format(first_thing))
def test_exploration():
    """Print each handle's relative path in site, then the total seen."""
    count = 0
    with SourceManager() as sm:
        # enumerate() keeps the running total; it stays 0 for an empty site.
        for count, handle in enumerate(site.handles(sm), start=1):
            print(handle.relative_path)
    print(f"Embedded site should have 3 handles. Have {count}")
def test_corrupted_ocr(self):
    """Text conversion of every file under "corrupted" must equal None."""
    corrupted_dir = os.path.join(test_data_path, "corrupted")
    fs = FilesystemSource(corrupted_dir)
    with SourceManager() as sm:
        for handle in fs.handles(sm):
            converted = convert(handle.follow(sm), OutputType.Text)
            self.assertEqual(
                converted,
                None,
                "{0}: error handling failed".format(handle))
def test_ocr_conversions(self):
    """Text conversion of every file under "good" must equal expected_result."""
    good_dir = os.path.join(test_data_path, "good")
    fs = FilesystemSource(good_dir)
    with SourceManager() as sm:
        for handle in fs.handles(sm):
            converted = convert(handle.follow(sm), OutputType.Text)
            self.assertEqual(
                converted.value,
                expected_result,
                "{0}: content failed".format(handle))
def test_corrupted_doc(self):
    """A truncated .doc file must yield no handles at all."""
    doc_path = os.path.join(
        test_data_path, "msoffice/corrupted/test.trunc.doc")
    corrupted_doc = Source.from_handle(FilesystemHandle.make_handle(doc_path))
    with SourceManager() as sm:
        found = list(corrupted_doc.handles(sm))
        self.assertEqual(
            found,
            [],
            "unrecognised CDFV2 document should be empty and wasn't")
def test_derived_source(self):
    """A source derived from a zip handle must carry a handle of its own."""
    # The manager is entered but never used directly by this test.
    with SourceManager():
        base = FilesystemSource(test_data_path)
        zip_handle = FilesystemHandle(
            base, "data/engine2/zip-here/test-vector.zip")
        derived = Source.from_handle(zip_handle)
        self.assertIsNotNone(
            derived.handle,
            "{0}: derived source has no handle".format(derived))
def test_basic(self):
    """Opening a Tracker twice counts once; leaving the manager resets it."""
    tracker = Tracker()
    with SourceManager() as sm:
        for _attempt in range(2):
            sm.open(tracker)
        self.assertEqual(
            tracker.count,
            1,
            "SourceManager opened the same object twice")
    # Checked only after the manager's context has been left.
    self.assertEqual(
        tracker.count,
        0,
        "SourceManager didn't close the object")
def test_exploration(self):
    """Count magenta's handles, stopping once five have been seen."""
    seen = 0
    with SourceManager() as sm:
        for _ in magenta.handles(sm):
            # Stop on the handle that follows the fifth counted one.
            if seen == 5:
                break
            seen += 1
    self.assertEqual(seen, 5, "magenta.dk should have more than 5 pages")
def test_alternative_trimming(self):
    """The MailSource built from alternative.eml must yield one handle."""
    eml_path = os.path.join(test_data_path, "alternative.eml")
    alternative_source = MailSource(FilesystemHandle.make_handle(eml_path))
    with SourceManager() as sm:
        parts = list(alternative_source.handles(sm))
        self.assertEqual(len(parts), 1, "text/plain trimming failed")
def run_rule(self, source):
    """Apply self.rule to *source* and require the single expected match."""
    expected = [
        {
            "offset": 0,
            "match": "1310XXXXXX",
            "context": "XXXXXX-XXXX",
            "context_offset": 0,
        },
    ]
    with SourceManager() as sm:
        results = list(try_apply(sm, source, self.rule))
        self.assertEqual(results, expected)
def test_broken_page_handling(self):
    """Exactly one outlink must be extracted from broken.html."""
    handle = WebHandle(WebSource("http://localhost:64346/"), "broken.html")
    with SourceManager() as sm:
        resource = handle.follow(sm)
        with resource.make_stream() as fp:
            raw = fp.read()
            content = raw.decode()
        links = list(
            make_outlinks(content, "http://localhost:64346/broken.html"))
        self.assertEqual(
            links,
            ["http://localhost:64346/kontakt.html"],
            "expected one link to be found in broken document")
def test_exploration_sitemap():
    """Print every handle path of mapped_site, plus one modification date.

    The modification date is printed only for "hemmeligheder2.html". The
    final line reports how many handles were seen.
    """
    count = 0
    with SourceManager() as sm:
        for count, handle in enumerate(mapped_site.handles(sm), start=1):
            print(handle.relative_path)
            if handle.relative_path != "hemmeligheder2.html":
                continue
            lm = handle.follow(sm).get_last_modified().value
            print('modification date', lm.year, lm.month, lm.day)
    print(f"embedded site with sitemap should have 5 handles. Have {count}")
def test_followable(self):
    """follow() on each example handle must not raise TypeError.

    Any other exception raised by follow() is ignored; only a TypeError
    is allowed to fail the sub-test.
    """
    with SourceManager() as sm:
        for handle in example_handles:
            with self.subTest(handle):
                try:
                    handle.follow(sm)
                except Exception as exc:
                    # Re-raise TypeError untouched; swallow everything else.
                    if isinstance(exc, TypeError):
                        raise
def test_smbc_url(self):
    """Process the smbc:// test share, skipping if it cannot be listed."""
    with SourceManager() as sm:
        source = Source.from_url(
            "smbc://*****:*****@samba/general")
        # Probe the share by pulling one handle; any failure here
        # (including an empty listing) skips the test.
        try:
            with contextlib.closing(source.handles(sm)) as probe:
                next(probe)
        except Exception:
            self.skipTest("test Samba server not up (not running in CI?)")
        self.process(source, sm)
def test_doc_mime(self):
    """Guessed and computed MIME types of doc_handle must be msword."""
    expected = "application/msword"

    guessed = doc_handle.guess_type()
    self.assertEqual(guessed, expected, ".doc MIME guess is incorrect")

    with SourceManager() as sm:
        computed = doc_handle.follow(sm).compute_type()
        self.assertEqual(
            computed, expected, ".doc MIME computation is incorrect")
def test_libreoffice_size(self):
    """Every .html handle from html-explosion.ods must be under 1048576."""
    ods_path = os.path.join(
        test_data_path, "libreoffice/html-explosion.ods")
    large_doc = Source.from_handle(FilesystemHandle.make_handle(ods_path))
    with SourceManager() as sm:
        for handle in large_doc.handles(sm):
            # Only the HTML output is size-checked.
            if not handle.name.endswith(".html"):
                continue
            resource = handle.follow(sm)
            self.assertLess(
                resource.get_size().value,
                1048576,
                "LibreOffice HTML output was too big")
def test_encrypted_zip(self):
    """compute_type() must succeed for every handle from an encrypted Zip."""
    # Check that all the ZipHandles we get out of an encrypted Zip file
    # actually work. (It's fine if we don't get any, but the ones we *do*
    # need to work!)
    zip_handle = FilesystemHandle(
        FilesystemSource(test_data_path), "encrypted-test-vector.zip")
    encrypted_file = ZipSource(zip_handle)
    with SourceManager() as sm:
        for handle in encrypted_file.handles(sm):
            resource = handle.follow(sm)
            resource.compute_type()
def test_docx_mime(self):
    """Guessed and computed MIME types of docx_handle must be OOXML Word."""
    expected = (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.document")

    guessed = docx_handle.guess_type()
    self.assertEqual(guessed, expected, ".docx MIME guess is incorrect")

    with SourceManager() as sm:
        computed = docx_handle.follow(sm).compute_type()
        self.assertEqual(
            computed, expected, ".docx MIME computation is incorrect")
def test_size_computation(self):
    """Check the image dimensions of every file under test_data_path.

    Each handle is followed and converted to OutputType.ImageDimensions.
    A falsy conversion result skips the test when the handle's relative
    path contains "rgba32"; otherwise the falsy result itself is compared
    against expected_size, which makes the assertion fail.
    """
    fs = FilesystemSource(test_data_path)
    with SourceManager() as sm:
        for h in fs.handles(sm):
            resource = h.follow(sm)
            size = convert(resource, OutputType.ImageDimensions)
            if not size:
                if "rgba32" in h.relative_path:
                    self.skipTest("Pillow RGBA bug detected -- skipping")
            else:
                size = size.value
            # The message must be formatted with the handle; otherwise a
            # failure reports a literal "{0}" instead of the failing file.
            self.assertEqual(
                size,
                expected_size,
                "{0}: size failed".format(h))
def test_sitemap_lm(self):
    """Check the modification date reported for "hemmeligheder2.html".

    Walks the handles of indexed_mapped_site until that file is found and
    asserts that its last-modified date is 2011-12-01. The test fails
    outright if the file is never encountered.
    """
    with SourceManager() as sm:
        for h in indexed_mapped_site.handles(sm):
            if h.relative_path == "hemmeligheder2.html":
                lm = h.follow(sm).get_last_modified().value
                self.assertEqual(
                    (lm.year, lm.month, lm.day),
                    (2011, 12, 1),
                    "secret file's modification date is too late")
                break
        else:
            # The loop ended without a break: the file was not listed.
            self.fail("secret file missing")
def handle(self, **kwargs):
    """Print every source named by the 'urls' option.

    Reads the 'urls', 'guess' and 'summarise' entries of *kwargs*. Each
    URL is turned into a Source and handed to url_explorer.print_source();
    a URL that raises UnknownSchemeError is silently skipped.
    """
    urls = kwargs['urls']
    guess = kwargs['guess']
    summarise = kwargs['summarise']
    options = {"guess": guess, "summarise": summarise}

    with SourceManager() as sm:
        for url in urls:
            try:
                source = Source.from_url(url)
                url_explorer.print_source(sm, source, **options)
            except UnknownSchemeError:
                continue
def verify(self) -> bool:
    """Check that every account from generate_sources() can be opened.

    Each account is opened through its own SourceManager. When the opened
    object has a truthy msg_folder_root, a confirmation line is printed.

    Returns:
        False as soon as opening an account raises
        ErrorNonExistentMailbox (after printing which mailbox is
        missing); True once all accounts have been tried. Any other
        exception propagates to the caller.
    """
    for account in self.generate_sources():
        with SourceManager() as sm:
            try:
                exchangelib_object = sm.open(account)
                if exchangelib_object.msg_folder_root:
                    print(
                        "OS2datascanner has access to mailbox {0}".format(
                            account.address))
            except ErrorNonExistentMailbox:
                # Message typo fixed: "exits" -> "exist".
                print("Mailbox {0} does not exist".format(account.address))
                return False
    return True
def test_eml_files(self):
    """Every file converts to a MailSource that yields MailPartHandles."""
    fs = FilesystemSource(test_data_path)
    with SourceManager() as sm:
        for file_handle in fs.handles(sm):
            mail_source = Source.from_handle(file_handle)
            self.assertIsInstance(
                mail_source,
                MailSource,
                "conversion of {0} to MailSource failed".format(file_handle))
            # A separate name keeps the outer loop variable intact.
            for part_handle in mail_source.handles(sm):
                self.assertIsInstance(part_handle, MailPartHandle)