def test_size_mismatch():
    """
    Check that merging fails with csv.Error when manifests contain rows that
    share an md5 but disagree on size.
    """
    input_dir = "tests/merge_manifests/size_mismatch/input"
    merged_path = "merged-output-test-manifest.tsv"
    with pytest.raises(csv.Error):
        merge_bucket_manifests(directory=input_dir, output_manifest=merged_path)
def test_writing_to_csv():
    """
    Check that the merged manifest can be written out as a csv file.
    """
    csv_output = "merged-output-test-manifest.csv"
    merge_bucket_manifests(
        directory="tests/merge_manifests/regular/input/",
        output_manifest=csv_output,
    )
    # The produced file is loaded with "," as the helper's second argument;
    # the expected fixture is loaded with the helper's default.
    produced = _get_tsv_data(csv_output, ",")
    expected = _get_tsv_data(
        "tests/merge_manifests/regular/expected-merged-output-manifest.tsv"
    )
    assert produced == expected
def test_multiple_urls():
    """
    Merge an input manifest containing a row with multiple urls and compare
    the result against the expected manifest.
    """
    merged_path = "merged-output-test-manifest.tsv"
    expected_path = (
        "tests/merge_manifests/multiple_urls/expected-merged-output-manifest.tsv"
    )
    merge_bucket_manifests(
        directory="tests/merge_manifests/multiple_urls/input",
        output_manifest=merged_path,
    )
    produced = _get_tsv_data(merged_path)
    expected = _get_tsv_data(expected_path)
    assert produced == expected
def test_duplicate_values():
    """
    Merge input manifests that carry duplicate values ("sushi" appears in
    both manifest2.tsv and manifest3.tsv) and compare with the expected
    manifest.
    """
    case_dir = "tests/merge_manifests/duplicate_values"
    merged_path = "merged-output-test-manifest.tsv"
    merge_bucket_manifests(
        directory=case_dir + "/input",
        output_manifest=merged_path,
    )
    produced = _get_tsv_data(merged_path)
    expected = _get_tsv_data(case_dir + "/expected-merged-output-manifest.tsv")
    assert produced == expected
def test_regular_merge_bucket_manifests():
    """
    Run merge_bucket_manifests on the "regular" input directory and verify
    the manifest it writes equals the expected output manifest.
    """
    merged_path = "merged-output-test-manifest.tsv"
    merge_bucket_manifests(
        directory="tests/merge_manifests/regular/input/",
        output_manifest=merged_path,
    )
    produced = _get_tsv_data(merged_path)
    expected = _get_tsv_data(
        "tests/merge_manifests/regular/expected-merged-output-manifest.tsv"
    )
    assert produced == expected
def test_same_guid_for_same_hash():
    """
    Merge input manifests whose rows have matching guids, md5, and size, with
    multiple guids per hash allowed, and compare with the expected manifest.
    """
    merged_path = "merged-output-test-manifest.tsv"
    merge_options = {
        "directory": "tests/merge_manifests/same_guid_for_same_hash/input",
        "output_manifest": merged_path,
        "allow_mult_guids_per_hash": True,
    }
    merge_bucket_manifests(**merge_options)
    produced = _get_tsv_data(merged_path)
    expected = _get_tsv_data(
        "tests/merge_manifests/same_guid_for_same_hash/expected-merged-output-manifest.tsv"
    )
    assert produced == expected
def test_multiple_guids_per_hash():
    """
    Merge manifests with multiple guids per hash allowed and compare the
    result against the expected manifest.
    """
    case_dir = "tests/merge_manifests/multiple_guids_per_hash"
    merged_path = "merged-output-test-manifest.tsv"
    merge_bucket_manifests(
        directory=case_dir + "/input",
        output_manifest=merged_path,
        allow_mult_guids_per_hash=True,
    )
    produced = _get_tsv_data(merged_path)
    expected = _get_tsv_data(case_dir + "/expected-merged-output-manifest.tsv")
    assert produced == expected
def test_size_mismatch():
    """
    Check that merging fails with csv.Error when manifests contain rows that
    share an md5 but disagree on size.
    """
    array_columns = ["extra_data", "more_data", "some_additional_data"]
    input_dir = "tests/merge_manifests/size_mismatch/input"
    merged_path = "merged-output-test-manifest.tsv"
    with pytest.raises(csv.Error):
        merge_bucket_manifests(
            directory=input_dir,
            output_manifest=merged_path,
            columns_with_arrays=array_columns,
        )
def test_multiple_urls():
    """
    Merge an input manifest containing a row with multiple urls and compare
    the result against the expected manifest.
    """
    array_columns = ["extra_data", "more_data", "some_additional_data"]
    merged_path = "merged-output-test-manifest.tsv"
    merge_bucket_manifests(
        directory="tests/merge_manifests/multiple_urls/input",
        output_manifest=merged_path,
        columns_with_arrays=array_columns,
    )
    produced = _get_tsv_data(merged_path)
    expected = _get_tsv_data(
        "tests/merge_manifests/multiple_urls/expected-merged-output-manifest.tsv"
    )
    assert produced == expected
def test_regular_merge_bucket_manifests():
    """
    Run merge_bucket_manifests on the "regular" input directory and verify
    the manifest it writes equals the expected output manifest.
    """
    merge_options = {
        "directory": "tests/merge_manifests/regular/input/",
        "output_manifest": "merged-output-test-manifest.tsv",
        "columns_with_arrays": [
            "extra_data", "more_data", "some_additional_data"
        ],
    }
    merge_bucket_manifests(**merge_options)
    produced = _get_tsv_data(merge_options["output_manifest"])
    expected = _get_tsv_data(
        "tests/merge_manifests/regular/expected-merged-output-manifest.tsv")
    assert produced == expected
def test_duplicate_values():
    """
    Merge input manifests that carry duplicate values ("sushi" appears in
    both manifest2.tsv and manifest3.tsv) and compare with the expected
    manifest.
    """
    # Unlike the sibling tests, "food" is also passed as an array column.
    array_columns = ["extra_data", "more_data", "some_additional_data", "food"]
    merged_path = "merged-output-test-manifest.tsv"
    merge_bucket_manifests(
        directory="tests/merge_manifests/duplicate_values/input",
        output_manifest=merged_path,
        columns_with_arrays=array_columns,
    )
    produced = _get_tsv_data(merged_path)
    expected = _get_tsv_data(
        "tests/merge_manifests/duplicate_values/expected-merged-output-manifest.tsv"
    )
    assert produced == expected
def test_same_guid_for_same_hash():
    """
    Merge input manifests whose rows have matching guids, md5, and size, with
    multiple guids per hash allowed, and compare with the expected manifest.
    """
    case_dir = "tests/merge_manifests/same_guid_for_same_hash"
    merged_path = "merged-output-test-manifest.tsv"
    array_columns = ["extra_data", "more_data", "some_additional_data"]
    merge_bucket_manifests(
        directory=case_dir + "/input",
        output_manifest=merged_path,
        columns_with_arrays=array_columns,
        allow_mult_guids_per_hash=True,
    )
    produced = _get_tsv_data(merged_path)
    expected = _get_tsv_data(case_dir + "/expected-merged-output-manifest.tsv")
    assert produced == expected
def test_multiple_guids_per_hash():
    """
    Merge manifests with multiple guids per hash allowed and compare the
    result against the expected manifest.
    """
    merged_path = "merged-output-test-manifest.tsv"
    expected_path = (
        "tests/merge_manifests/multiple_guids_per_hash/"
        "expected-merged-output-manifest.tsv"
    )
    merge_bucket_manifests(
        directory="tests/merge_manifests/multiple_guids_per_hash/input",
        output_manifest=merged_path,
        columns_with_arrays=["extra_data", "more_data", "some_additional_data"],
        allow_mult_guids_per_hash=True,
    )
    produced = _get_tsv_data(merged_path)
    expected = _get_tsv_data(expected_path)
    assert produced == expected
def test_writing_to_csv():
    """
    Check that the merged manifest can be written out as a csv file.
    """
    csv_output = "merged-output-test-manifest.csv"
    array_columns = ["extra_data", "more_data", "some_additional_data"]
    merge_bucket_manifests(
        directory="tests/merge_manifests/regular/input/",
        output_manifest=csv_output,
        columns_with_arrays=array_columns,
    )
    # The produced file is loaded with "," as the helper's second argument;
    # the expected fixture is loaded with the helper's default.
    produced = _get_tsv_data(csv_output, ",")
    expected = _get_tsv_data(
        "tests/merge_manifests/regular/expected-merged-output-manifest.tsv"
    )
    assert produced == expected
# Example #15
# 0
    is_allowed, message = check_user_permission(access_token,
                                                access_authz_requirement)
    if not is_allowed:
        logging.error(f"[out]: {message['message']}")
        sys.exit()

    os.mkdir(INPUT_MANIFESTS_DIRECTORY)
    s3 = boto3.client("s3")
    for i, url in enumerate(input_data_json["URLS"]):
        s3_bucket, s3_object = url.replace("s3://", "").split("/", 1)
        local_file_path = os.path.join(INPUT_MANIFESTS_DIRECTORY,
                                       f"manifest{i}.txt")

        logging.info(f"[out] downloading {url} to {local_file_path}")
        s3.download_file(s3_bucket, s3_object, local_file_path)

    merge_bucket_manifests(
        directory=INPUT_MANIFESTS_DIRECTORY,
        output_manifest_file_delimiter=detect_delimiter(local_file_path),
        output_manifest=OUTPUT_MANIFEST,
    )

    output_s3_bucket = creds["bucket"]
    log_file_presigned_url = upload_file_to_s3_and_generate_presigned_url(
        output_s3_bucket, LOG_FILE)
    output_manifest_presigned_url = upload_file_to_s3_and_generate_presigned_url(
        output_s3_bucket, OUTPUT_MANIFEST)

    logging.info(
        f"[out] {log_file_presigned_url} {output_manifest_presigned_url}")