def objects_manifest_publish(
    ctx,
    file,
    thread_num,
    append_urls,
    manifest_file_delimiter,
    out_manifest_file,
):
    # Publish the object records described by a manifest file to the endpoint
    # of the auth object obtained from `ctx.obj["auth_factory"]`.
    #
    # NOTE(review): documented with comments rather than a docstring on
    # purpose - if this function is registered as a click command (decorators
    # are not visible here), a docstring would become its `--help` text.
    #
    # Parameters:
    #   ctx                     - context object; `ctx.obj["auth_factory"].get()`
    #                             must return the auth to use.
    #   file                    - manifest path; prompted for when falsy.
    #   thread_num              - forwarded to `index_object_manifest`.
    #   append_urls             - negated and forwarded as `replace_urls`.
    #   manifest_file_delimiter - forwarded unchanged.
    #   out_manifest_file       - forwarded as `output_filename`.
    auth = ctx.obj["auth_factory"].get()

    # The returned loop is not used here; the call is kept for whatever setup
    # it performs (presumably ensuring this thread has an event loop - TODO
    # confirm against the helper's implementation).
    get_or_create_event_loop_for_thread()

    if not file:
        # This command publishes an objects manifest, so ask for one.
        file = click.prompt("Enter objects manifest file path to publish")

    click.echo(
        f"Publishing/writing object data from {file}...\n to: {auth.endpoint}"
    )

    index_object_manifest(
        commons_url=auth.endpoint,
        manifest_file=file,
        thread_num=thread_num,
        auth=auth,
        replace_urls=not append_urls,
        manifest_file_delimiter=manifest_file_delimiter,
        output_filename=out_manifest_file,
        submit_additional_metadata_columns=True,
    )
def test_index_manifest_additional_metadata(gen3_index, gen3_auth):
    """
    With `submit_additional_metadata_columns` enabled, the data of any manifest
    column that is not in indexd should be submitted to the metadata service.
    """
    manifest_path = CURRENT_DIR + "/test_data/manifest_additional_metadata.tsv"
    mds_create_target = "gen3.tools.indexing.index_manifest.Gen3Metadata.create"

    # Replace the metadata-service `create` call so submissions can be inspected.
    with patch(mds_create_target, MagicMock()) as mock_mds_create:
        index_object_manifest(
            manifest_file=manifest_path,
            auth=gen3_auth,
            commons_url=gen3_index.client.url,
            thread_num=1,
            replace_urls=False,
            submit_additional_metadata_columns=True,
        )

    # Map each submitted guid to the metadata it was submitted with.
    mds_records = {}
    for _args, call_kwargs in mock_mds_create.call_args_list:
        mds_records[call_kwargs["guid"]] = call_kwargs["metadata"]
    assert len(mds_records) == 1

    indexd_records = {}
    for rec in gen3_index.get_all_records():
        indexd_records[rec["did"]] = rec
    assert len(indexd_records) == 1

    # Exactly one record exists, so the first key is the one under test.
    guid = next(iter(indexd_records))
    record = indexd_records[guid]
    assert record["file_name"] == "file.txt"
    assert record["size"] == 363455714
    assert record["hashes"] == {"md5": "473d83400bc1bc9dc635e334faddf33c"}
    assert record["authz"] == ["/open"]
    assert record["urls"] == ["s3://my-data-bucket/dg.1234/path/file.txt"]

    # The extra column goes to the metadata service under the same guid.
    assert guid in mds_records
    assert mds_records[guid] == {"fancy_column": "fancy_data"}
def test_index_manifest_packages_failure(data, gen3_index, gen3_auth, logfile):
    """
    Test that the expected errors are logged when the manifest contains
    invalid package rows, and that nothing is created in indexd or submitted
    to the metadata service.

    `data` must provide a `manifest` file name (looked up under `test_data/`)
    and a list of `expected_error_msgs`; `logfile` must expose `read()`.
    """
    with patch(
        "gen3.tools.indexing.index_manifest.Gen3Metadata.create", MagicMock()
    ) as mock_mds_create:
        index_object_manifest(
            manifest_file=f"{CURRENT_DIR}/test_data/{data['manifest']}",
            auth=gen3_auth,
            commons_url=gen3_index.client.url,
            thread_num=1,
            replace_urls=False,
            submit_additional_metadata_columns=True,
        )
        mds_records = {
            kwargs["guid"]: kwargs["metadata"]
            for (_, kwargs) in mock_mds_create.call_args_list
        }
        assert len(mds_records) == 0

    indexd_records = {r["did"]: r for r in gen3_index.get_all_records()}
    assert len(indexd_records) == 0

    # Read the log once. Calling `read()` per message would leave a file-like
    # handle at EOF after the first call, so every later message would be
    # compared against an empty string and fail spuriously.
    log_contents = logfile.read()
    for error in data["expected_error_msgs"]:
        assert error in log_contents
def test_index_manifest(gen3_index, indexd_server):
    """
    Pre-create one indexd record, index `test_data/test.tsv` with
    `replace_urls=False`, then check the six records the manifest touches.
    """
    # NOTE(review): another function named `test_index_manifest` appears later
    # in this source; if both live in the same module the later definition
    # replaces this one and this test is never collected - confirm.
    guids = (
        "255e396f-f1f8-11e9-9a07-0a80fada099c",
        "255e396f-f1f8-11e9-9a07-0a80fada010c",
        "255e396f-f1f8-11e9-9a07-0a80fada098c",
        "255e396f-f1f8-11e9-9a07-0a80fada097c",
        "255e396f-f1f8-11e9-9a07-0a80fada096c",
        "255e396f-f1f8-11e9-9a07-0a80fada012c",
    )

    # Record that exists before the manifest is indexed.
    gen3_index.create_record(
        did=guids[0],
        hashes={"md5": "473d83400bc1bc9dc635e334faddf33c"},
        acl=["DEV", "test"],
        size=363_455_714,
        urls=[
            "s3://testaws/aws/test.txt",
            "gs://test/test.txt",
            "gs://test/test,with,comma.txt",
        ],
    )

    index_object_manifest(
        indexd_server.baseurl,
        CURRENT_DIR + "/test_data/test.tsv",
        1,
        ("admin", "admin"),
        replace_urls=False,
    )

    rec1, rec2, rec3, rec4, rec5, rec6 = [gen3_index.get(guid) for guid in guids]

    # URLs were appended, not replaced: the pre-existing ones are still there.
    expected_rec1_urls = {
        "s3://testaws/aws/test.txt",
        "gs://test/test.txt",
        "s3://pdcdatastore/test1.raw",
        # commas *are* allowed in values of arrays
        "gs://test/test,with,comma.txt",
    }
    assert set(rec1["urls"]) == expected_rec1_urls
    assert rec1["authz"] == []

    assert rec2["hashes"]["md5"] == "473d83400bc1bc9dc635e334fadde33c"
    assert rec2["size"] == 363_455_714
    assert rec2["authz"] == ["/program/DEV/project/test"]
    assert rec2["urls"] == ["s3://pdcdatastore/test5.raw"]

    assert rec3["urls"] == ["s3://pdcdatastore/test2.raw"]
    assert rec3["authz"] == ["/program/DEV/project/test"]

    assert rec4["urls"] == ["s3://pdcdatastore/test3.raw"]
    assert rec4["acl"] == ["phs0001", "phs0002"]

    assert rec5["urls"] == ["s3://pdcdatastore/test4.raw"]
    assert rec5["file_name"] == "test4_file.raw"
    # commas *are* allowed in values of arrays
    assert rec5["acl"] == ["phs0001,", "phs0002"]
    assert rec5["authz"] == ["/program/DEV/project/test"]

    assert rec6["urls"] == ["s3://pdcdatastore/test6 space.raw"]
    assert rec6["authz"] == ["/prog ram/DEV/project/test"]

    # ensure prev_guid worked to create a new version with same baseid
    assert rec6["baseid"] == rec2["baseid"]
def test_index_manifest_with_replace_urls(gen3_index, indexd_server):
    """
    Index `./test.tsv` with `replace_urls=True` and check that the URLs of a
    pre-existing record are replaced by the manifest's URL.
    """
    target_guid = "255e396f-f1f8-11e9-9a07-0a80fada099c"

    # Seed the record with URLs that should not survive the indexing.
    gen3_index.create_record(
        did=target_guid,
        hashes={"md5": "473d83400bc1bc9dc635e334faddf33c"},
        acl=["DEV", "test"],
        size=363_455_714,
        urls=["s3://testaws/aws/test.txt", "gs://test/test.txt"],
    )

    index_object_manifest(
        indexd_server.baseurl,
        "./test.tsv",
        1,
        ("admin", "admin"),
        replace_urls=True,
    )

    updated = gen3_index.get(target_guid)
    assert updated["urls"] == ["s3://pdcdatastore/test1.raw"]
def test_index_non_guid_manifest(gen3_index, indexd_server):
    """
    Index `./test2.tsv` and check that the first returned file entry has a
    guid containing "testprefix" and that its indexd record has the expected
    URL.
    """
    result = index_object_manifest(
        indexd_server.baseurl,
        "./test2.tsv",
        1,
        ("admin", "admin"),
        replace_urls=True,
    )
    # The call returns a pair; only the first element is used here.
    files, _ = result

    first_guid = files[0]["guid"]
    assert "testprefix" in first_guid

    stored = gen3_index.get(first_guid)
    assert stored["urls"] == ["s3://pdcdatastore/test1.raw"]
def test_index_manifest(gen3_index, indexd_server):
    """
    Pre-create one indexd record, index `./test.tsv` with
    `replace_urls=False`, then check the six records the manifest touches.
    """
    # NOTE(review): a function with this same name appears earlier in this
    # source; if both live in the same module this definition replaces the
    # earlier one - confirm.
    guids = (
        "255e396f-f1f8-11e9-9a07-0a80fada099c",
        "255e396f-f1f8-11e9-9a07-0a80fada010c",
        "255e396f-f1f8-11e9-9a07-0a80fada098c",
        "255e396f-f1f8-11e9-9a07-0a80fada097c",
        "255e396f-f1f8-11e9-9a07-0a80fada096c",
        "255e396f-f1f8-11e9-9a07-0a80fada012c",
    )

    # Record that exists before the manifest is indexed.
    gen3_index.create_record(
        did=guids[0],
        hashes={"md5": "473d83400bc1bc9dc635e334faddf33c"},
        acl=["DEV", "test"],
        size=363_455_714,
        urls=["s3://testaws/aws/test.txt", "gs://test/test.txt"],
    )

    index_object_manifest(
        indexd_server.baseurl,
        "./test.tsv",
        1,
        ("admin", "admin"),
        replace_urls=False,
    )

    rec1, rec2, rec3, rec4, rec5, rec6 = [gen3_index.get(guid) for guid in guids]

    # URLs were appended, not replaced: the pre-existing ones are still there.
    expected_rec1_urls = {
        "s3://testaws/aws/test.txt",
        "gs://test/test.txt",
        "s3://pdcdatastore/test1.raw",
    }
    assert set(rec1["urls"]) == expected_rec1_urls
    assert rec1["authz"] == []

    assert rec2["hashes"]["md5"] == "473d83400bc1bc9dc635e334fadde33c"
    assert rec2["size"] == 363_455_714
    assert rec2["authz"] == ["/program/DEV/project/test"]
    assert rec2["urls"] == ["s3://pdcdatastore/test5.raw"]

    assert rec3["urls"] == ["s3://pdcdatastore/test2.raw"]
    assert rec3["authz"] == ["/program/DEV/project/test"]

    assert rec4["urls"] == ["s3://pdcdatastore/test3.raw"]
    assert rec4["acl"] == ["phs0001", "phs0002"]

    assert rec5["urls"] == ["s3://pdcdatastore/test4.raw"]
    assert rec5["file_name"] == "test4_file.raw"
    assert rec5["acl"] == ["phs0001", "phs0002"]
    assert rec5["authz"] == ["/program/DEV/project/test"]

    assert rec6["urls"] == ["s3://pdcdatastore/test6 space.raw"]
    assert rec6["authz"] == ["/prog ram/DEV/project/test"]
def test_index_manifest_packages(gen3_index, gen3_auth):
    """
    When `record_type == package`, packages should be created in the metadata
    service and any `package_contents` values should be parsed and submitted.

    Indexes `test_data/packages_manifest_ok.tsv` with the metadata service's
    `create` call patched out, then checks both the indexd records and the
    metadata that was submitted for each manifest row.
    """
    with patch(
        "gen3.tools.indexing.index_manifest.Gen3Metadata.create", MagicMock()
    ) as mock_mds_create:
        index_object_manifest(
            manifest_file=CURRENT_DIR + "/test_data/packages_manifest_ok.tsv",
            auth=gen3_auth,
            commons_url=gen3_index.client.url,
            thread_num=1,
            replace_urls=False,
            submit_additional_metadata_columns=True,
        )
        # Printed so the recorded calls are available in the test output.
        print("MDS create calls:", mock_mds_create.call_args_list)
        mds_records = {
            kwargs["guid"]: kwargs["metadata"]
            for (_, kwargs) in mock_mds_create.call_args_list
        }
        assert len(mds_records) == 4

    indexd_records = {r["did"]: r for r in gen3_index.get_all_records()}
    assert len(indexd_records) == 5

    # object (not a package) with all fields provided
    guid = "255e396f-f1f8-11e9-9a07-0a80fada0900"
    assert guid in indexd_records
    assert guid not in mds_records

    # package with all fields provided
    # S3 URL
    guid = "255e396f-f1f8-11e9-9a07-0a80fada0901"
    assert guid in indexd_records
    assert indexd_records[guid]["file_name"] == "package.zip"
    assert indexd_records[guid]["size"] == 363455714
    assert indexd_records[guid]["hashes"] == {"md5": "473d83400bc1bc9dc635e334faddf33c"}
    assert indexd_records[guid]["authz"] == ["/open/packages"]
    assert indexd_records[guid]["urls"] == [
        "s3://my-data-bucket/dg.1234/path/package.zip"
    ]
    assert guid in mds_records
    assert mds_records[guid]["type"] == "package"
    assert mds_records[guid]["package"]["version"] == "0.1"
    assert mds_records[guid]["package"]["file_name"] == "package.zip"
    assert mds_records[guid]["package"]["size"] == 363455714
    assert mds_records[guid]["package"]["hashes"] == {
        "md5": "473d83400bc1bc9dc635e334faddf33c"
    }
    assert mds_records[guid]["package"]["contents"] == [
        {
            "hashes": {"md5sum": "2cd6ee2c70b0bde53fbe6cac3c8b8bb1"},
            "file_name": "yes.txt",
            "size": 35,
        },
        {
            "hashes": {"md5sum": "30cf3d7d133b08543cb6c8933c29dfd7"},
            "file_name": "hi.txt",
            "size": 35,
        },
    ]
    assert mds_records[guid]["_buckets"] == ["s3://my-data-bucket"]
    assert mds_records[guid]["_filename"] == "package.zip"
    assert mds_records[guid]["_file_extension"] == ".zip"
    assert mds_records[guid]["_upload_status"] == "uploaded"
    assert mds_records[guid]["_resource_paths"] == ["/open/packages"]

    # package with no "package_contents" provided
    # GS URL
    guid = "255e396f-f1f8-11e9-9a07-0a80fada0902"
    assert guid in indexd_records
    assert indexd_records[guid]["urls"] == [
        "gs://my-google-data-bucket/dg.1234/path/package.zip"
    ]
    assert guid in mds_records
    assert mds_records[guid]["type"] == "package"
    # Identity check: the contents must be exactly None, not merely something
    # that compares equal to it.
    assert mds_records[guid]["package"]["contents"] is None
    assert mds_records[guid]["_buckets"] == ["gs://my-google-data-bucket"]

    # package with no "file_name" provided
    # and 2 URLs with different file names.
    # the file name from the first URL is used as the package file name -
    # depending on the order of the URLs in the indexd record, it could be
    # either one
    guid = "255e396f-f1f8-11e9-9a07-0a80fada0903"
    assert guid in indexd_records
    assert indexd_records[guid]["file_name"] == ""
    assert sorted(indexd_records[guid]["urls"]) == sorted(
        [
            "s3://my-data-bucket/dg.1234/path/package.zip",
            "gs://my-google-data-bucket/dg.1234/path/other_file_name.zip",
        ]
    )
    assert guid in mds_records
    assert sorted(mds_records[guid]["_buckets"]) == sorted(
        ["s3://my-data-bucket", "gs://my-google-data-bucket"]
    )
    assert mds_records[guid]["package"]["file_name"] in [
        "package.zip",
        "other_file_name.zip",
    ]
    assert mds_records[guid]["_filename"] in ["package.zip", "other_file_name.zip"]

    # package with no "guid" provided
    # Every guid written in the manifest shares this prefix, so any other guid
    # in indexd was generated during indexing.
    new_guids = [
        did
        for did in indexd_records
        if not did.startswith("255e396f-f1f8-11e9-9a07-0a80fada09")
    ]
    assert len(new_guids) == 1
    guid = new_guids[0]
    assert guid in mds_records