def process(self, source, sm, depth=0):
    """Walk ``source`` recursively and validate every leaf resource found.

    Each handle yielded by ``source.handles(sm)`` is printed (indented by
    ``depth``) and then treated in one of three ways:

    * if it can be reinterpreted as a Source, with or without the
      SourceManager, that Source is explored recursively (the one built
      with the SourceManager is preferred);
    * if it is named ``"url"``, its content is decoded as UTF-8, turned
      into a Source with ``Source.from_url`` and explored recursively;
    * if it is named ``"test-vector"``, or ``source`` is a DataSource,
      its resource is checked: ``check()`` must succeed, the reported size
      and modification date must be SingleResults, and the stream and the
      on-disk file must agree with each other, with the reported size and
      with ``self.correct_content``.

    Handles matching none of these cases are only printed.

    Args:
        source: the Source to explore.
        sm: the SourceManager used to open handles and resources.
        depth: current recursion depth; at depth 0 the source is required
            to have no backing handle.
    """
    if depth == 0:
        self.assertIsNone(
            source.handle,
            "{0}: unexpected backing handle".format(source))

    for handle in source.handles(sm):
        print("{0}{1}".format(" " * depth, handle))
        guessed = Source.from_handle(handle)
        computed = Source.from_handle(handle, sm)

        if computed or guessed:
            self.process(computed or guessed, sm, depth + 1)
        elif handle.name == "url":
            with handle.follow(sm).make_stream() as fp:
                url = fp.read().decode("utf-8")
            self.process(Source.from_url(url), sm, depth + 1)
        elif handle.name == "test-vector" or isinstance(source, DataSource):
            r = handle.follow(sm)
            self.assertTrue(r.check(), "check() method failed")
            reported_size = r.get_size()
            last_modified = r.get_last_modified()

            # Read the same resource twice: once as a stream and once
            # through a filesystem path, so the two views can be compared.
            with r.make_stream() as fp:
                stream_raw = fp.read()
                stream_size = len(stream_raw)
                stream_content = stream_raw.decode("utf-8")

            with r.make_path() as p:
                with open(p, "rb") as fp:
                    file_raw = fp.read()
                    file_size = len(file_raw)
                    file_content = file_raw.decode("utf-8")

            self.assertIsInstance(
                last_modified, SingleResult,
                ("{0}: last modification date is not a"
                 " SingleResult").format(handle))
            # The two message fragments are joined by implicit string
            # concatenation, so the separating space has to be explicit.
            self.assertIsInstance(
                last_modified.value, datetime,
                ("{0}: last modification date value is not a"
                 " datetime.datetime").format(handle))
            self.assertIsInstance(
                reported_size, SingleResult,
                ("{0}: resource length is not a"
                 " SingleResult").format(handle))

            self.assertEqual(
                stream_size, reported_size.value,
                "{0}: model stream length invalid".format(handle))
            # Distinct message so a failure identifies the file view
            # rather than the stream view.
            self.assertEqual(
                file_size, reported_size.value,
                "{0}: model file length invalid".format(handle))
            self.assertEqual(
                file_raw, stream_raw,
                "{0}: model file and stream not equal".format(handle))
            self.assertEqual(
                stream_content, self.correct_content,
                "{0}: model stream invalid".format(handle))
            self.assertEqual(
                file_content, self.correct_content,
                "{0}: model file invalid".format(handle))
def test_derived_source(self):
    """Check that a Source derived from a handle keeps a backing handle.

    A FilesystemHandle for the ``test-vector.zip`` fixture is converted
    with ``Source.from_handle`` and the resulting Source's ``handle``
    attribute must not be None.
    """
    # The manager is only entered and exited; nothing here needs the object.
    with SourceManager():
        zip_handle = FilesystemHandle(
            FilesystemSource(test_data_path),
            "data/engine2/zip-here/test-vector.zip")
        derived = Source.from_handle(zip_handle)
        self.assertIsNotNone(
            derived.handle,
            "{0}: derived source has no handle".format(derived))
def test_corrupted_doc(self):
    """Check that the truncated .doc fixture yields no handles at all."""
    doc_path = os.path.join(
        test_data_path, "msoffice/corrupted/test.trunc.doc")
    source = Source.from_handle(FilesystemHandle.make_handle(doc_path))
    with SourceManager() as sm:
        found = list(source.handles(sm))
        self.assertEqual(
            found, [],
            "unrecognised CDFV2 document should be empty and wasn't")
def try_apply(sm, source):
    """Return the text of the first leaf resource reachable from ``source``.

    Handles are visited in the order ``source.handles(sm)`` yields them.
    A handle that can itself be turned into a Source is explored
    recursively; any other handle is followed and its resource converted
    to ``OutputType.Text``.

    Args:
        sm: the SourceManager used to open sources and follow handles.
        source: the Source to explore.

    Returns:
        The ``value`` of the first text representation found, or None if
        ``source`` (including everything nested inside it) yields nothing.
    """
    for handle in source.handles(sm):
        derived = Source.from_handle(handle, sm)
        if derived:
            # Propagate what the nested source produced; dropping it would
            # make content inside containers unreachable.
            text = try_apply(sm, derived)
            if text is not None:
                return text
        else:
            resource = handle.follow(sm)
            representation = convert(resource, OutputType.Text)
            return representation.value
    return None
def try_apply(sm, source, rule):
    """Yield every match ``rule`` produces on content found under ``source``.

    Handles that can be reinterpreted as a Source are explored recursively.
    Every other handle is followed, converted to ``rule.operates_on`` and,
    when the conversion result is truthy, passed to ``rule.match``.

    Args:
        sm: the SourceManager used to open sources and follow handles.
        source: the Source to explore.
        rule: object providing ``operates_on`` and ``match``.

    Yields:
        Whatever ``rule.match`` yields, in traversal order.
    """
    for handle in source.handles(sm):
        nested = Source.from_handle(handle, sm)
        if nested:
            yield from try_apply(sm, nested, rule)
            continue

        representation = convert(handle.follow(sm), rule.operates_on)
        if not representation:
            # Nothing usable came out of the conversion; move on.
            continue
        yield from rule.match(representation.value)
def test_libreoffice_size(self):
    """Check that HTML produced from the large .ods fixture stays small.

    Every handle of the derived source whose name ends in ``.html`` must
    report a size below 1048576.
    """
    doc_path = os.path.join(
        test_data_path, "libreoffice/html-explosion.ods")
    large_doc = Source.from_handle(FilesystemHandle.make_handle(doc_path))
    with SourceManager() as sm:
        for handle in large_doc.handles(sm):
            if not handle.name.endswith(".html"):
                continue
            reported = handle.follow(sm).get_size().value
            self.assertLess(
                reported, 1048576,
                "LibreOffice HTML output was too big")
def test_eml_files(self):
    """Check that every file under the test data path becomes a MailSource.

    Each handle of the filesystem source must convert to a MailSource, and
    every handle that MailSource yields must be a MailPartHandle.
    """
    filesystem = FilesystemSource(test_data_path)
    with SourceManager() as sm:
        for file_handle in filesystem.handles(sm):
            mail_source = Source.from_handle(file_handle)
            self.assertIsInstance(
                mail_source, MailSource,
                "conversion of {0} to MailSource failed".format(
                    file_handle))
            # Separate name for the inner loop so it does not shadow the
            # file handle being inspected.
            for part_handle in mail_source.handles(sm):
                self.assertIsInstance(part_handle, MailPartHandle)
def get_content_from_handle(handle):
    """Return what ``try_apply`` extracts from the Source behind ``handle``.

    A SourceManager is opened for the duration of the call, ``handle`` is
    reinterpreted as a Source and that Source is passed to ``try_apply``.

    Args:
        handle: the handle to reinterpret as a Source.

    Returns:
        The result of ``try_apply(sm, source)``.

    Raises:
        AssertionError: if ``handle`` cannot be made into a Source.
    """
    with SourceManager() as sm:
        source = Source.from_handle(handle, sm)
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped when Python runs with -O.
        if source is None:
            raise AssertionError(
                f"{handle} could not be made into a Source")
        return try_apply(sm, source)
#if h.guess_type() == "text/plain": print(f"handle\t{h}") print(f"resource\t{r}") print("raw content:") with r.make_stream() as fp: print("\t\t{0}".format(fp.read())) # should succed for text -> text conversion try: rep = convert(r, OutputType.Text) print(f"Conveted\t{rep.value}") break except KeyError as e: # lets try to reinterpret the handle as a new Source s = Source.from_handle(h) # sz = Source.from_handle(h) # hz = next(sz.handles(sm)) # rz = hz.follow(sm) # with rz.make_stream() as fp: # print("\t\t{0}".format(fp.read())) ## Lets try manual hd = DataHandle(DataSource(content=b64encode(gzip_content), mime="text/plain", name="sitemap.xml.gz"), relpath="sitemap.xml.gz") rd = hd.follow(sm) print("data resource b64encoded gzip - ") with rd.make_stream() as fp:
def run_rule_on_handle(self, handle):
    """Turn ``handle`` into a Source and run ``self.run_rule`` on it.

    The test fails if ``handle`` cannot be made into a Source.
    """
    with SourceManager() as sm:
        source = Source.from_handle(handle, sm)
        failure_message = "{0} couldn't be made into a Source".format(handle)
        self.assertIsNotNone(source, failure_message)
        self.run_rule(source, sm)