def test_warc_with_pages_flag(self, mock_now):
    """When passing the pages flag with a valid pages.jsonl file a pages/pages.jsonl file should be created.

    Args:
        mock_now: mock supplied by the caller (presumably a patch decorator
            outside this view); its return value is pinned to a fixed tuple.
    """
    mock_now.return_value = (2020, 10, 7, 22, 29, 10)
    pages_jsonl = """{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}\n{"id": "1db0ef709a", "url": "http://www.example.com/", "ts": "2020-10-07T21:22:36Z", "title": "Example Domain"}"""

    with tempfile.TemporaryDirectory() as tmpdir:
        wacz_path = os.path.join(tmpdir, "example-collection-valid-url.wacz")

        # The context manager closes (and thereby deletes) the temporary
        # pages file even when the assertion below fails.
        with tempfile.NamedTemporaryFile() as fp:
            fp.write(pages_jsonl.encode("utf-8"))
            # Push the buffered bytes to disk so main() can read them by path.
            fp.flush()
            self.assertEqual(
                main([
                    "create",
                    "-f",
                    os.path.join(TEST_DIR, "example-collection.warc"),
                    "-o",
                    wacz_path,
                    "-p",
                    fp.name,
                ]),
                0,
            )

        with zipfile.ZipFile(wacz_path, "r") as zip_ref:
            zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_pages"))

        self.assertEqual(main(["validate", "-f", wacz_path]), 0)

        wacz_pages = os.path.join(
            tmpdir, "unzipped_valid_pages/pages/pages.jsonl")
        wacz_cdx = os.path.join(
            tmpdir, "unzipped_valid_pages/indexes/index.cdx.gz")
        with gzip.open(wacz_cdx, "rb") as cdx_file:
            cdx_content = cdx_file.read()

        self.assertIn(
            "pages.jsonl",
            os.listdir(os.path.join(tmpdir, "unzipped_valid_pages/pages/")),
        )

        with open(wacz_pages) as f:
            # The first line is the header record, not a page entry.
            next(f)
            for line in f:
                obj = json.loads(line)
                self.assertIn("id", obj)
                self.assertIn("ts", obj)
                self.assertIn("url", obj)
                # assertTrue keeps a failure message from dumping the whole index.
                self.assertTrue(obj["url"].encode() in cdx_content)
def test_warc_with_invalid_url_flag(self):
    """Creating a WACZ with a --url value of http://www.examplefake.com/ must raise ValueError."""
    with tempfile.TemporaryDirectory() as tmpdir, self.assertRaises(ValueError):
        source_warc = os.path.join(TEST_DIR, "example-collection.warc")
        target_wacz = os.path.join(tmpdir, "example-collection.wacz")
        argv = ["create", "-f", source_warc, "-o", target_wacz]
        argv += ["--url", "http://www.examplefake.com/"]
        main(argv)
def setUpClass(self, mock_now):
    """Build example.wacz from the example WARC and unpack it for the tests.

    The archive is unpacked next to this test file, so the attribute paths
    set below point at the extracted files regardless of the current
    working directory.

    Args:
        mock_now: mock supplied by the caller (presumably a patch decorator
            outside this view); its return value is pinned to a fixed tuple.
    """
    mock_now.return_value = (2020, 10, 7, 22, 29, 10)
    here = os.path.dirname(os.path.realpath(__file__))

    main(['-o', os.path.join(TEST_DIR, 'example.wacz'),
          os.path.join(TEST_DIR, 'example-collection.warc')])

    # Extract relative to this file (not the CWD) so it matches the lookups below.
    with zipfile.ZipFile(os.path.join(TEST_DIR, 'example.wacz'), "r") as zip_ref:
        zip_ref.extractall(os.path.join(here, "fixtures/unzipped_wacz"))

    self.wacz_file = os.path.join(TEST_DIR, 'example.wacz')
    self.warc_file = os.path.join(TEST_DIR, 'example-collection.warc')
    self.wacz_archive = os.path.join(
        here, "fixtures/unzipped_wacz/archive/example-collection.warc")
    self.wacz_index_cdx = os.path.join(
        here, "fixtures/unzipped_wacz/indexes/index.cdx.gz")
    self.wacz_index_idx = os.path.join(
        here, "fixtures/unzipped_wacz/indexes/index.idx")
    self.wacz_yaml = os.path.join(
        here, "fixtures/unzipped_wacz/webarchive.yaml")
def test_warc_with_valid_date_flag(self, mock_now):
    """When passing a valid date flag the datapackage should have that as the mainPageTS.

    Args:
        mock_now: mock supplied by the caller (presumably a patch decorator
            outside this view); its return value is pinned to a fixed tuple.
    """
    mock_now.return_value = (2020, 10, 7, 22, 29, 10)
    with tempfile.TemporaryDirectory() as tmpdir:
        wacz_path = os.path.join(tmpdir, "example-collection-valid-date.wacz")
        self.assertEqual(
            main([
                "create",
                "-f",
                os.path.join(TEST_DIR, "example-collection.warc"),
                "-o",
                wacz_path,
                "--date",
                "2020-11-01",
            ]),
            0,
        )

        with zipfile.ZipFile(wacz_path, "r") as zip_ref:
            zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_date"))

        self.wacz_json = os.path.join(
            tmpdir, "unzipped_valid_date/datapackage.json")
        self.wacz_pages = os.path.join(
            tmpdir, "unzipped_valid_date/pages/pages.jsonl")

        # Read through a context manager so the handle is released before
        # the temporary directory is removed.
        with open(self.wacz_json, "rb") as f:
            json_parse = json.loads(f.read())

        self.assertEqual(json_parse["metadata"]["mainPageTS"], "2020-11-01")
def test_warc_with_hash_flag_sha256(self, mock_now):
    """When passing the --hash-type flag with a value of sha256 the datapackage should be hashed using sha256.

    Args:
        mock_now: mock supplied by the caller (presumably a patch decorator
            outside this view); its return value is pinned to a fixed tuple.
    """
    mock_now.return_value = (2020, 10, 7, 22, 29, 10)
    with tempfile.TemporaryDirectory() as tmpdir:
        wacz_path = os.path.join(tmpdir, "example-collection-sha256.wacz")
        self.assertEqual(
            main([
                "create",
                "-f",
                os.path.join(TEST_DIR, "example-collection.warc"),
                "-o",
                wacz_path,
                "--hash-type",
                "sha256",
            ]),
            0,
        )

        with zipfile.ZipFile(wacz_path, "r") as zip_ref:
            zip_ref.extractall(os.path.join(tmpdir, "unzipped_sha256"))

        self.wacz_json = os.path.join(tmpdir, "unzipped_sha256/datapackage.json")
        with open(self.wacz_json, "rb") as f:
            json_parse = json.loads(f.read())

        # A unittest assertion still runs under ``python -O``, unlike a bare assert.
        self.assertIn("sha256", json_parse["resources"][0]["hash"])
def test_warc_with_extra_lists(self):
    """Create and validate a WACZ from the with-lists WARC, then check the archive's member names."""
    # One pages file is expected for each list, alongside the default pages.jsonl.
    expected_members = [
        "archive/example-collection-with-lists.warc",
        "datapackage.json",
        "indexes/index.cdx.gz",
        "indexes/index.idx",
        "pages/example.jsonl",
        "pages/iana.jsonl",
        "pages/pages.jsonl",
    ]
    with tempfile.TemporaryDirectory() as tmpdir:
        source_warc = os.path.join(TEST_DIR, "example-collection-with-lists.warc")
        target_wacz = os.path.join(tmpdir, "example-collection-with-lists.wacz")

        create_status = main(["create", "-f", source_warc, "-o", target_wacz])
        self.assertEqual(create_status, 0)

        validate_status = main(["validate", "-f", target_wacz])
        self.assertEqual(validate_status, 0)

        with zipfile.ZipFile(target_wacz) as zf:
            filelist = list(zf.namelist())
            filelist.sort()
            self.assertEqual(filelist, expected_members)
def setUpClass(self, mock_now):
    """Create and unpack valid_example_1.wacz, then build the two Validation objects.

    One Validation wraps the freshly created archive, the other wraps the
    invalid_example_1.wacz file found under TEST_DIR.
    """
    mock_now.return_value = (2020, 10, 7, 22, 29, 10)
    self.tmpdir = tempfile.TemporaryDirectory()
    workdir = self.tmpdir.name

    source_warc = os.path.join(TEST_DIR, "example-collection.warc")
    created_wacz = os.path.join(workdir, "valid_example_1.wacz")
    main(["create", "-f", source_warc, "-o", created_wacz])

    with zipfile.ZipFile(created_wacz, "r") as archive:
        archive.extractall(os.path.join(workdir, "unzipped_wacz_1"))

    self.validation_class_valid_1 = Validation(
        os.path.join(workdir, "valid_example_1.wacz"))
    invalid_wacz = os.path.join(TEST_DIR, "invalid_example_1.wacz")
    self.validation_class_invalid = Validation(invalid_wacz)
def test_warc_with_other_metadata(self):
    """Create a WACZ from example-warcinfo-metadata.warc and check that it validates."""
    with tempfile.TemporaryDirectory() as tmpdir:
        source_warc = os.path.join(TEST_DIR, "example-warcinfo-metadata.warc")
        target_wacz = os.path.join(tmpdir, "example-warcinfo-metadata.wacz")

        create_status = main(["create", "-f", source_warc, "-o", target_wacz])
        self.assertEqual(create_status, 0)

        validate_status = main(["validate", "-f", target_wacz])
        self.assertEqual(validate_status, 0)
def test_invalid_wacz_missing_datapackage(self):
    """Correctly fail on a wacz with no datapackage"""
    # NOTE(review): datapackage.json is removed only from the extracted copy,
    # while Validation is handed the untouched .wacz path and the expected
    # result is 0. As written this does not appear to exercise the
    # missing-datapackage failure named in the docstring - confirm the intent.
    with tempfile.TemporaryDirectory() as tmpdir:
        wacz_path = os.path.join(tmpdir, "valid_example_1.wacz")
        main([
            "create",
            "-f",
            os.path.join(TEST_DIR, "example-collection.warc"),
            "-o",
            wacz_path,
        ])

        with zipfile.ZipFile(wacz_path, "r") as zip_ref:
            zip_ref.extractall(os.path.join(tmpdir, "unzipped_wacz_1"))

        os.remove(os.path.join(tmpdir, "unzipped_wacz_1/datapackage.json"))

        # Validate the archive built by this test rather than one left behind
        # by class-level setup, so the test is self-contained.
        validation_class = Validation(wacz_path)
        valid = validation_class.check_required_contents()
        self.assertEqual(valid, 0)
def test_ability_to_detect_hash_md5(self):
    """Correctly identify the hash type of a file as md5"""
    # The context manager removes the scratch directory when the test ends.
    with tempfile.TemporaryDirectory() as tmpdir:
        wacz_path = os.path.join(tmpdir, "valid_example_1.wacz")
        main([
            "create",
            "-f",
            os.path.join(TEST_DIR, "example-collection.warc"),
            "-o",
            wacz_path,
            "--hash-type",
            "md5",
        ])

        with zipfile.ZipFile(wacz_path, "r") as zip_ref:
            zip_ref.extractall(os.path.join(tmpdir, "unzipped_wacz_1"))

        # Inspect the md5 archive created above; the archive under
        # self.tmpdir was not built by this test with --hash-type md5.
        validation_class = Validation(wacz_path)
        valid = validation_class.detect_hash_type()
        self.assertEqual(valid, 0)
        self.assertEqual(validation_class.hash_type, "md5")
def setUpClass(self, mock_now):
    """Create valid_example_1.wacz in a temporary directory, unpack it, and record the paths used by the tests."""
    mock_now.return_value = (2020, 10, 7, 22, 29, 10)
    self.tmpdir = tempfile.TemporaryDirectory()
    workdir = self.tmpdir.name

    source_warc = os.path.join(TEST_DIR, "example-collection.warc")
    created_wacz = os.path.join(workdir, "valid_example_1.wacz")
    main(["create", "-f", source_warc, "-o", created_wacz])

    with zipfile.ZipFile(created_wacz, "r") as archive:
        archive.extractall(os.path.join(workdir, "unzipped_wacz_1"))

    self.wacz_file = os.path.join(workdir, "valid_example_1.wacz")
    self.warc_file = os.path.join(TEST_DIR, "example-collection.warc")

    # Attribute name -> location inside the temporary directory.
    extracted_members = {
        "wacz_archive": "unzipped_wacz_1/archive/example-collection.warc",
        "wacz_index_cdx": "unzipped_wacz_1/indexes/index.cdx.gz",
        "wacz_index_idx": "unzipped_wacz_1/indexes/index.idx",
        "wacz_json": "unzipped_wacz_1/datapackage.json",
    }
    for attribute, relative_path in extracted_members.items():
        setattr(self, attribute, os.path.join(workdir, relative_path))
def test_warc_with_only_ts_flag(self):
    """If a user only passes the --ts flag we should return an error and a message about needing to also pass the --url flag"""
    with tempfile.TemporaryDirectory() as tmpdir:
        # main() is expected to exit rather than return, so its return value
        # is deliberately not compared against anything.
        with self.assertRaises(SystemExit):
            main([
                "create",
                "-f",
                os.path.join(TEST_DIR, "example-collection.warc"),
                "-o",
                os.path.join(tmpdir, "example-collection.wacz"),
                "--ts",
                "2020104212236",
            ])
def test_warc_with_both_p_and_d_flag(self):
    """If a user passes both the --pages and --detect-pages flags we should return an error and a message about needing only one"""
    # NOTE(review): the flag below is spelled "--detect_pages" (underscore)
    # while the docstring says "--detect-pages". If the parser only defines
    # the hyphenated form, the SystemExit may come from an unrecognised
    # argument rather than from the both-flags check - confirm against the CLI.
    with tempfile.TemporaryDirectory() as tmpdir:
        # main() is expected to exit rather than return, so its return value
        # is deliberately not compared against anything.
        with self.assertRaises(SystemExit):
            main([
                "create",
                "-f",
                os.path.join(TEST_DIR, "example-collection.warc"),
                "-o",
                os.path.join(tmpdir, "example-collection.wacz"),
                "--detect_pages",
                "-p",
                "test.jsonl",
            ])
def test_warc_with_valid_url_and_ts_flag(self, mock_now):
    """When passing a valid url and ts flag we should see those values represented in the datapackage and pages.jsonl file.

    Args:
        mock_now: mock supplied by the caller (presumably a patch decorator
            outside this view); its return value is pinned to a fixed tuple.
    """
    mock_now.return_value = (2020, 10, 7, 22, 29, 10)
    with tempfile.TemporaryDirectory() as tmpdir:
        wacz_path = os.path.join(
            tmpdir, "example-collection-valid-url-valid-ts.wacz")
        self.assertEqual(
            main([
                "create",
                "-f",
                os.path.join(TEST_DIR, "example-collection.warc"),
                "-o",
                wacz_path,
                "--url",
                "http://www.example.com/",
                "--ts",
                "20201007212236",
            ]),
            0,
        )

        with zipfile.ZipFile(wacz_path, "r") as zip_ref:
            zip_ref.extractall(
                os.path.join(tmpdir, "unzipped_valid_url_valid_ts"))

        self.wacz_json = os.path.join(
            tmpdir, "unzipped_valid_url_valid_ts/datapackage.json")
        self.wacz_pages = os.path.join(
            tmpdir, "unzipped_valid_url_valid_ts/pages/pages.jsonl")

        # Both files are read through context managers so no handle is left
        # open when the temporary directory is removed.
        with open(self.wacz_json, "rb") as f:
            json_parse = json.loads(f.read())
        with open(self.wacz_pages, "rb") as f:
            json_pages = [json.loads(jline) for jline in f.read().splitlines()]

        # Index 1 is the first entry after the header line of pages.jsonl.
        self.assertEqual(json_pages[1]["url"], "http://www.example.com/")
        self.assertEqual(json_parse["metadata"]["mainPageURL"],
                         "http://www.example.com/")
        self.assertEqual(json_parse["metadata"]["mainPageTS"],
                         "20201007212236")
def test_warc_with_invalid_passed_pages(self):
    """If a user passes an invalid pages file via -p, main should return 1."""
    malformed_pages = """{"format": "title": "All Pages"}\n{"http://www.example" "0-10-07T21:22:36Z", "title": "Example Domain"}"""

    with tempfile.TemporaryDirectory() as tmpdir:
        # The context manager closes (and thereby deletes) the temporary
        # pages file even when the assertion below fails.
        with tempfile.NamedTemporaryFile() as fp:
            fp.write(malformed_pages.encode("utf-8"))
            # Push the buffered bytes to disk so main() can read them by path.
            fp.flush()
            self.assertEqual(
                main([
                    "create",
                    "-f",
                    os.path.join(TEST_DIR, "example-collection.warc"),
                    "-o",
                    os.path.join(tmpdir, "example-collection-valid-url.wacz"),
                    "-p",
                    fp.name,
                ]),
                1,
            )
def test_warc_with_text_index_flag(self, mock_now):
    """When passing the text index flag pages/pages.jsonl should be generated with a full and accurate text index.

    Args:
        mock_now: mock supplied by the caller (presumably a patch decorator
            outside this view); its return value is pinned to a fixed tuple.
    """
    mock_now.return_value = (2020, 10, 7, 22, 29, 10)
    with tempfile.TemporaryDirectory() as tmpdir:
        wacz_path = os.path.join(tmpdir, "example-collection-valid-url.wacz")
        self.assertEqual(
            main([
                "create",
                "-f",
                os.path.join(TEST_DIR, "example-collection.warc"),
                "-o",
                wacz_path,
                "-t",
            ]),
            0,
        )

        with zipfile.ZipFile(wacz_path, "r") as zip_ref:
            zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_text"))

        wacz_pages = os.path.join(tmpdir,
                                  "unzipped_valid_text/pages/pages.jsonl")
        wacz_cdx = os.path.join(
            tmpdir, "unzipped_valid_text/indexes/index.cdx.gz")
        with gzip.open(wacz_cdx, "rb") as cdx_file:
            cdx_content = cdx_file.read()

        self.assertIn(
            "pages.jsonl",
            os.listdir(os.path.join(tmpdir, "unzipped_valid_text/pages/")),
        )

        with open(wacz_pages) as f:
            # The first line is the header record, not a page entry.
            next(f)
            for line in f:
                obj = json.loads(line)
                self.assertIn("id", obj)
                self.assertIn("ts", obj)
                self.assertIn("title", obj)
                self.assertIn("url", obj)
                # assertTrue keeps a failure message from dumping the whole index.
                self.assertTrue(obj["url"].encode() in cdx_content)
                self.assertIn("text", obj)
def test_warc_with_valid_url_flag(self, mock_now):
    """When passing a valid url flag the url should be added to the pages.jsonl file and appear in the datapackage.

    Args:
        mock_now: mock supplied by the caller (presumably a patch decorator
            outside this view); its return value is pinned to a fixed tuple.
    """
    mock_now.return_value = (2020, 10, 7, 22, 29, 10)
    with tempfile.TemporaryDirectory() as tmpdir:
        wacz_path = os.path.join(tmpdir, "example-collection-valid-url.wacz")
        self.assertEqual(
            main([
                "create",
                "-f",
                os.path.join(TEST_DIR, "example-collection.warc"),
                "-o",
                wacz_path,
                "--url",
                "http://www.example.com/",
            ]),
            0,
        )

        with zipfile.ZipFile(wacz_path, "r") as zip_ref:
            zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_url"))

        self.wacz_json = os.path.join(
            tmpdir, "unzipped_valid_url/datapackage.json")
        self.wacz_pages = os.path.join(
            tmpdir, "unzipped_valid_url/pages/pages.jsonl")

        # Both files are read through context managers so no handle is left
        # open when the temporary directory is removed.
        with open(self.wacz_json, "rb") as f:
            json_parse = json.loads(f.read())
        with open(self.wacz_pages, "rb") as f:
            json_pages = [json.loads(jline) for jline in f.read().splitlines()]

        # Index 1 is the first entry after the header line of pages.jsonl.
        self.assertEqual(json_pages[1]["url"], "http://www.example.com/")
        self.assertEqual(json_parse["metadata"]["mainPageURL"],
                         "http://www.example.com/")
        # NOTE(review): this inspects only the top-level keys, whereas
        # mainPageURL above is read from the "metadata" object - confirm
        # whether the check should target json_parse["metadata"] instead.
        self.assertNotIn("mainPageTS", json_parse)
# Command-line entry point: run wacz's main() and report its result as the
# process exit status.
import sys

from wacz.main import main

# main() returns a status code; passing it to sys.exit lets shells and CI
# see a failure instead of always getting exit status 0.
sys.exit(main())