def test_size_mismatch():
    """
    Merging manifests whose rows share an md5 but disagree on size is
    expected to raise csv.Error.
    """
    input_dir = "tests/merge_manifests/size_mismatch/input"
    output_path = "merged-output-test-manifest.tsv"

    with pytest.raises(csv.Error):
        merge_bucket_manifests(directory=input_dir, output_manifest=output_path)
def test_writing_to_csv():
    """
    The merged manifest can be written as a comma-delimited file; its parsed
    content must equal the parsed expected TSV manifest.
    """
    csv_output = "merged-output-test-manifest.csv"
    expected_manifest = (
        "tests/merge_manifests/regular/expected-merged-output-manifest.tsv"
    )

    merge_bucket_manifests(
        directory="tests/merge_manifests/regular/input/",
        output_manifest=csv_output,
    )

    # The output is read back with "," as the delimiter; the expected file
    # uses the helper's default delimiter.
    actual_rows = _get_tsv_data(csv_output, ",")
    expected_rows = _get_tsv_data(expected_manifest)
    assert actual_rows == expected_rows
def test_multiple_urls():
    """
    Merge an input manifest that contains a row with several urls and compare
    the result against the expected manifest.
    """
    output_path = "merged-output-test-manifest.tsv"
    expected_manifest = (
        "tests/merge_manifests/multiple_urls/expected-merged-output-manifest.tsv"
    )

    merge_bucket_manifests(
        directory="tests/merge_manifests/multiple_urls/input",
        output_manifest=output_path,
    )

    actual_rows = _get_tsv_data(output_path)
    expected_rows = _get_tsv_data(expected_manifest)
    assert actual_rows == expected_rows
def test_duplicate_values():
    """
    Merge input manifests that carry a duplicated value ("sushi" appears in
    both manifest2.tsv and manifest3.tsv) and compare against the expected
    manifest.
    """
    output_path = "merged-output-test-manifest.tsv"
    expected_manifest = (
        "tests/merge_manifests/duplicate_values/expected-merged-output-manifest.tsv"
    )

    merge_bucket_manifests(
        directory="tests/merge_manifests/duplicate_values/input",
        output_manifest=output_path,
    )

    actual_rows = _get_tsv_data(output_path)
    expected_rows = _get_tsv_data(expected_manifest)
    assert actual_rows == expected_rows
def test_regular_merge_bucket_manifests():
    """
    Run merge_bucket_manifests on the "regular" input directory and check
    that the produced manifest matches the expected output manifest.
    """
    output_path = "merged-output-test-manifest.tsv"
    expected_manifest = (
        "tests/merge_manifests/regular/expected-merged-output-manifest.tsv"
    )

    merge_bucket_manifests(
        directory="tests/merge_manifests/regular/input/",
        output_manifest=output_path,
    )

    actual_rows = _get_tsv_data(output_path)
    expected_rows = _get_tsv_data(expected_manifest)
    assert actual_rows == expected_rows
def test_same_guid_for_same_hash():
    """
    Merge input manifests whose rows agree on guid, md5, and size, with
    multiple guids per hash allowed, and compare against the expected manifest.
    """
    output_path = "merged-output-test-manifest.tsv"
    expected_manifest = (
        "tests/merge_manifests/same_guid_for_same_hash/expected-merged-output-manifest.tsv"
    )

    merge_bucket_manifests(
        directory="tests/merge_manifests/same_guid_for_same_hash/input",
        output_manifest=output_path,
        allow_mult_guids_per_hash=True,
    )

    actual_rows = _get_tsv_data(output_path)
    expected_rows = _get_tsv_data(expected_manifest)
    assert actual_rows == expected_rows
def test_multiple_guids_per_hash():
    """
    Merge the "multiple_guids_per_hash" inputs with multiple guids per hash
    allowed and compare against the expected manifest.
    """
    output_path = "merged-output-test-manifest.tsv"
    expected_manifest = (
        "tests/merge_manifests/multiple_guids_per_hash/expected-merged-output-manifest.tsv"
    )

    merge_bucket_manifests(
        directory="tests/merge_manifests/multiple_guids_per_hash/input",
        output_manifest=output_path,
        allow_mult_guids_per_hash=True,
    )

    actual_rows = _get_tsv_data(output_path)
    expected_rows = _get_tsv_data(expected_manifest)
    assert actual_rows == expected_rows
def test_size_mismatch():
    """
    Merging manifests whose rows share an md5 but disagree on size is
    expected to raise csv.Error.
    """
    array_columns = ["extra_data", "more_data", "some_additional_data"]

    with pytest.raises(csv.Error):
        merge_bucket_manifests(
            directory="tests/merge_manifests/size_mismatch/input",
            output_manifest="merged-output-test-manifest.tsv",
            columns_with_arrays=array_columns,
        )
def test_multiple_urls():
    """
    Merge an input manifest that contains a row with several urls and compare
    the result against the expected manifest.
    """
    output_path = "merged-output-test-manifest.tsv"
    expected_manifest = (
        "tests/merge_manifests/multiple_urls/expected-merged-output-manifest.tsv"
    )
    array_columns = ["extra_data", "more_data", "some_additional_data"]

    merge_bucket_manifests(
        directory="tests/merge_manifests/multiple_urls/input",
        output_manifest=output_path,
        columns_with_arrays=array_columns,
    )

    actual_rows = _get_tsv_data(output_path)
    expected_rows = _get_tsv_data(expected_manifest)
    assert actual_rows == expected_rows
def test_regular_merge_bucket_manifests():
    """
    Run merge_bucket_manifests on the "regular" input directory and check
    that the produced manifest matches the expected output manifest.
    """
    output_path = "merged-output-test-manifest.tsv"
    expected_manifest = (
        "tests/merge_manifests/regular/expected-merged-output-manifest.tsv"
    )
    array_columns = ["extra_data", "more_data", "some_additional_data"]

    merge_bucket_manifests(
        directory="tests/merge_manifests/regular/input/",
        output_manifest=output_path,
        columns_with_arrays=array_columns,
    )

    actual_rows = _get_tsv_data(output_path)
    expected_rows = _get_tsv_data(expected_manifest)
    assert actual_rows == expected_rows
def test_duplicate_values():
    """
    Merge input manifests that carry a duplicated value ("sushi" appears in
    both manifest2.tsv and manifest3.tsv) and compare against the expected
    manifest.
    """
    output_path = "merged-output-test-manifest.tsv"
    expected_manifest = (
        "tests/merge_manifests/duplicate_values/expected-merged-output-manifest.tsv"
    )
    # "food" is passed as an array column here in addition to the usual three.
    array_columns = ["extra_data", "more_data", "some_additional_data", "food"]

    merge_bucket_manifests(
        directory="tests/merge_manifests/duplicate_values/input",
        output_manifest=output_path,
        columns_with_arrays=array_columns,
    )

    actual_rows = _get_tsv_data(output_path)
    expected_rows = _get_tsv_data(expected_manifest)
    assert actual_rows == expected_rows
def test_same_guid_for_same_hash():
    """
    Merge input manifests whose rows agree on guid, md5, and size, with
    multiple guids per hash allowed, and compare against the expected manifest.
    """
    output_path = "merged-output-test-manifest.tsv"
    expected_manifest = (
        "tests/merge_manifests/same_guid_for_same_hash/expected-merged-output-manifest.tsv"
    )
    array_columns = ["extra_data", "more_data", "some_additional_data"]

    merge_bucket_manifests(
        directory="tests/merge_manifests/same_guid_for_same_hash/input",
        output_manifest=output_path,
        columns_with_arrays=array_columns,
        allow_mult_guids_per_hash=True,
    )

    actual_rows = _get_tsv_data(output_path)
    expected_rows = _get_tsv_data(expected_manifest)
    assert actual_rows == expected_rows
def test_multiple_guids_per_hash():
    """
    Merge the "multiple_guids_per_hash" inputs with multiple guids per hash
    allowed and compare against the expected manifest.
    """
    output_path = "merged-output-test-manifest.tsv"
    expected_manifest = (
        "tests/merge_manifests/multiple_guids_per_hash/expected-merged-output-manifest.tsv"
    )
    array_columns = ["extra_data", "more_data", "some_additional_data"]

    merge_bucket_manifests(
        directory="tests/merge_manifests/multiple_guids_per_hash/input",
        output_manifest=output_path,
        columns_with_arrays=array_columns,
        allow_mult_guids_per_hash=True,
    )

    actual_rows = _get_tsv_data(output_path)
    expected_rows = _get_tsv_data(expected_manifest)
    assert actual_rows == expected_rows
def test_writing_to_csv():
    """
    The merged manifest can be written as a comma-delimited file; its parsed
    content must equal the parsed expected TSV manifest.
    """
    csv_output = "merged-output-test-manifest.csv"
    expected_manifest = (
        "tests/merge_manifests/regular/expected-merged-output-manifest.tsv"
    )
    array_columns = ["extra_data", "more_data", "some_additional_data"]

    merge_bucket_manifests(
        directory="tests/merge_manifests/regular/input/",
        output_manifest=csv_output,
        columns_with_arrays=array_columns,
    )

    # The output is read back with "," as the delimiter; the expected file
    # uses the helper's default delimiter.
    actual_rows = _get_tsv_data(csv_output, ",")
    expected_rows = _get_tsv_data(expected_manifest)
    assert actual_rows == expected_rows
# Job body: verify the caller's permission, download the input manifests from
# S3, merge them, then upload the log file and merged manifest and log their
# presigned URLs.

# Stop early when the permission check fails; the reason is taken from the
# "message" key of the returned payload.
is_allowed, message = check_user_permission(access_token, access_authz_requirement)
if not is_allowed:
    logging.error(f"[out]: {message['message']}")
    # NOTE(review): sys.exit() without an argument exits with status 0 —
    # confirm that a success status is intended on the permission-denied path.
    sys.exit()

# os.mkdir raises FileExistsError if the directory is already present.
os.mkdir(INPUT_MANIFESTS_DIRECTORY)
s3 = boto3.client("s3")
for i, url in enumerate(input_data_json["URLS"]):
    # Split "s3://<bucket>/<key>" into the bucket name and the object key.
    s3_bucket, s3_object = url.replace("s3://", "").split("/", 1)
    local_file_path = os.path.join(INPUT_MANIFESTS_DIRECTORY, f"manifest{i}.txt")
    logging.info(f"[out] downloading {url} to {local_file_path}")
    s3.download_file(s3_bucket, s3_object, local_file_path)

# The output delimiter is detected from the LAST downloaded manifest
# (local_file_path is the loop variable left over from the final iteration).
# NOTE(review): if "URLS" is empty, local_file_path is never bound and this
# call raises NameError — confirm the caller guarantees at least one URL.
merge_bucket_manifests(
    directory=INPUT_MANIFESTS_DIRECTORY,
    output_manifest_file_delimiter=detect_delimiter(local_file_path),
    output_manifest=OUTPUT_MANIFEST,
)

# Upload both artifacts to the bucket named in creds["bucket"] and emit their
# presigned URLs on a single "[out]" log line.
output_s3_bucket = creds["bucket"]
log_file_presigned_url = upload_file_to_s3_and_generate_presigned_url(
    output_s3_bucket, LOG_FILE)
output_manifest_presigned_url = upload_file_to_s3_and_generate_presigned_url(
    output_s3_bucket, OUTPUT_MANIFEST)
logging.info(
    f"[out] {log_file_presigned_url} {output_manifest_presigned_url}")