Example #1
import os
from typing import Tuple

from autouri import AutoURI, GCSURI


# Note: the pytest fixtures passed below (gcs_test_path, *_j1_json, etc.) and the
# helper recurse_raise_if_uri_not_exist() are assumed to be provided by the test
# suite's conftest.py / shared test utilities.
def test_gcsuri_localize(
    gcs_test_path,
    local_j1_json,
    local_v41_json,
    local_v421_tsv,
    local_v5_csv,
    local_v6_txt,
    s3_j1_json,
    s3_v41_json,
    s3_v421_tsv,
    s3_v5_csv,
    s3_v6_txt,
    gcs_j1_json,
    gcs_v41_json,
    gcs_v421_tsv,
    gcs_v5_csv,
    gcs_v6_txt,
    url_j1_json,
    url_v41_json,
    url_v421_tsv,
    url_v5_csv,
    url_v6_txt,
) -> Tuple[str, bool]:
    """Recursive localization is supported for the following file extensions:
        .json:
            Files defined only in values (not keys) can be recursively localized.
        .tsv/.csv:
            Files defined in all values can be recursively localized.

    This function tests localizing the file j1.json from each remote storage.
    This JSON file contains file paths, including .tsv and .csv files, which in
    turn reference other files in their contents.
    Therefore, when the recursive flag is on, all files referenced in these JSON,
    TSV and CSV files should be localized recursively with correct file names
    (controlled by cls.loc_prefix and cls.loc_suffix).

    File naming for (recursive) localization:
        cls.loc_prefix + remote_file_path_without_scheme + cls.loc_suffix
        (the suffix is appended for recursively localized files only)

    For example,
    s3://test-bucket/j1.json has some file paths on s3://.

    With recursive localization, all of these files must be localized under loc_prefix with
    a correct directory structure (keeping the original structure of the source: i.e. bucket
    name, path) and the JSON file should be renamed to j1.gcs.json (using the loc_suffix ".gcs"
    of the destination class GCSURI) since its contents are modified to point to the localized
    files. This is done recursively for all files referenced in it too.

    Without recursive localization, autouri doesn't look inside that JSON file and just
    localizes the file itself under loc_prefix, keeping the original filename j1.json.
    (See the path-composition sketch after this test function.)

    Test localizing onto GCS storage from the following source storages:
        local_test_path: local -> gcs
        s3_test_path: s3 -> gcs
        gcs_test_path: gcs -> gcs
        url_test_path: url -> gcs

    Parameters to be tested:
        make_md5_file:
            Make md5 file on destination only when it's REQUIRED.
            It's required only if we need to compare md5 hash of source and target.
            This is already tested in the cp tests and it's actually needed only
            for local storage. Cloud URIs provide md5 hash info in their metadata,
            so an md5 file is not required and hence will not be created even with
            this flag on.
        recursive:
            j1.json
    """
    loc_prefix = os.path.join(gcs_test_path, "test_gcsuri_localize")
    # GCSURI.init_gcsuri(use_gsutil_for_s3=True)

    for j1_json in (gcs_j1_json,):
        # localization from the same storage (GCS -> GCS)
        u_j1_json = AutoURI(j1_json)
        loc_prefix_ = loc_prefix + u_j1_json.__class__.get_loc_suffix()
        basename = u_j1_json.basename

        # For localization both with and without recursive,
        # nothing should actually be localized
        # since the file is already on the same (GCS) storage,
        # so the loc_prefix directory itself shouldn't be created.
        loc_uri, localized = GCSURI.localize(u_j1_json,
                                             recursive=False,
                                             return_flag=True,
                                             loc_prefix=loc_prefix_)
        assert loc_uri == u_j1_json.uri and not localized
        assert not AutoURI(os.path.join(loc_prefix_, basename)).exists

        loc_uri, localized = GCSURI.localize(u_j1_json,
                                             recursive=True,
                                             return_flag=True,
                                             loc_prefix=loc_prefix_)
        assert loc_uri == u_j1_json.uri and not localized
        assert not AutoURI(os.path.join(loc_prefix_, basename)).exists
        # check if all URIs defined in the localized JSON file exist
        recurse_raise_if_uri_not_exist(loc_uri)

    # localization from other storages (local, S3, URL -> GCS)
    for j1_json in (local_j1_json, s3_j1_json, url_j1_json):
        u_j1_json = AutoURI(j1_json)
        loc_prefix_ = loc_prefix + u_j1_json.__class__.get_loc_suffix()
        basename = u_j1_json.basename

        loc_uri, localized = GCSURI.localize(u_j1_json,
                                             recursive=False,
                                             return_flag=True,
                                             loc_prefix=loc_prefix_)
        expected = os.path.join(loc_prefix_, u_j1_json.loc_dirname,
                                u_j1_json.basename)
        assert loc_uri == expected
        assert localized and AutoURI(expected).exists

        loc_uri, localized = GCSURI.localize(u_j1_json,
                                             recursive=True,
                                             return_flag=True,
                                             loc_prefix=loc_prefix_)
        expected = os.path.join(
            loc_prefix_,
            u_j1_json.loc_dirname,
            u_j1_json.basename_wo_ext + GCSURI.get_loc_suffix() +
            u_j1_json.ext,
        )
        assert loc_uri == expected
        assert localized and AutoURI(expected).exists
        # check if all URIs defined in the localized JSON file exist
        recurse_raise_if_uri_not_exist(loc_uri)
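
The following is a minimal sketch (not part of the autouri test suite) illustrating the destination-path composition asserted above. The bucket names and the ".s3" suffix value in the comments are assumptions; the AutoURI/GCSURI calls mirror those used in test_gcsuri_localize().

from autouri import AutoURI, GCSURI

# Hypothetical source file on S3 and a hypothetical GCS destination prefix.
src = AutoURI("s3://test-bucket/data/j1.json")

# The test above appends the *source* class's loc_suffix to the destination
# prefix (per-source subdirectories), mirroring loc_prefix_ in the test.
loc_prefix = "gs://dest-bucket/loc" + src.__class__.get_loc_suffix()

# Non-recursive localization keeps the original basename.
non_recursive_dest = "/".join([loc_prefix, src.loc_dirname, src.basename])

# Recursive localization inserts GCSURI's loc_suffix (".gcs") before the file
# extension, since the JSON contents are rewritten to point to localized files.
recursive_dest = "/".join(
    [loc_prefix, src.loc_dirname, src.basename_wo_ext + GCSURI.get_loc_suffix() + src.ext]
)

print(non_recursive_dest)  # e.g. gs://dest-bucket/loc.s3/test-bucket/data/j1.json
print(recursive_dest)      # e.g. gs://dest-bucket/loc.s3/test-bucket/data/j1.gcs.json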
Example #2
def test_gcsuri_get_loc_suffix() -> str:
    assert GCSURI.get_loc_suffix() == ".gcs"
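
As a companion sketch, assuming AbsPath, S3URI and HTTPURL are importable from the top-level autouri package like GCSURI, the same classmethod reports each storage class's localization suffix. Only GCSURI's ".gcs" is confirmed by the test above; the other values in the comment are assumptions based on the same naming convention.

from autouri import AbsPath, GCSURI, HTTPURL, S3URI

# Each URI class defines the suffix appended to a file's basename when its
# contents are rewritten during recursive localization.
for cls in (AbsPath, S3URI, GCSURI, HTTPURL):
    print(cls.__name__, cls.get_loc_suffix())
# Assumed values: AbsPath ".local", S3URI ".s3", GCSURI ".gcs", HTTPURL ".url"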