예제 #1
0
def test_diff_sizes(tmp_uri_fixture):  # NOQA
    """Check ``diff_sizes`` on a dataset versus itself and versus another.

    Two frozen datasets are created under ``tmp_uri_fixture``. Each holds a
    single item stored under the relpath "file.txt", but sourced from a
    different test file, so the comparison between them is expected to
    report one entry with sizes 2 and 3.
    """
    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.compare import diff_sizes
    from dtoolcore.utils import generate_identifier

    source_files = create_test_files(tmp_uri_fixture)

    # (dataset name, key of the source file placed at "file.txt")
    dataset_specs = (
        ("test_compare_1", "he"),
        ("test_compare_2", "she"),
    )

    frozen_uris = []
    for ds_name, source_key in dataset_specs:
        proto = generate_proto_dataset(
            admin_metadata=generate_admin_metadata(ds_name),
            base_uri=tmp_uri_fixture)
        proto.create()
        proto.put_item(source_files[source_key], "file.txt")
        proto.freeze()
        frozen_uris.append(proto.uri)

    ds_a, ds_b = [DataSet.from_uri(uri) for uri in frozen_uris]

    # A dataset never differs in size from itself.
    assert diff_sizes(ds_a, ds_a) == []

    # Same identifier in both datasets, different sizes.
    assert diff_sizes(ds_a, ds_b) == [
        (generate_identifier("file.txt"), 2, 3),
    ]
예제 #2
0
def test_http_enable_with_presigned_url(tmp_uuid_and_uri):  # NOQA
    """HTTP-enable a dataset with presigned URLs and read it back.

    A minimal frozen dataset is created at the fixture URI, given an
    annotation and two tags, and then HTTP-enabled while the
    ``DTOOL_S3_PUBLISH_EXPIRY`` environment variable is set to "120".
    The dataset loaded from the returned URL must match the original and
    every URL in its HTTP manifest must carry a query string.
    """
    from dtoolcore import DataSet, ProtoDataSet, generate_admin_metadata
    from dtoolcore.compare import diff_content, diff_identifiers, diff_sizes

    uuid, dest_uri = tmp_uuid_and_uri

    admin_metadata = generate_admin_metadata("my_dataset")
    admin_metadata["uuid"] = uuid

    local_file_path = os.path.join(TEST_SAMPLE_DATA, 'tiny.png')

    # Build and freeze a minimal dataset: one item plus a readme.
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.put_readme("---\nproject: testing\n")
    proto_dataset.freeze()

    dataset = DataSet.from_uri(dest_uri)

    # Attach an annotation and tags so they can be checked over HTTP.
    dataset.put_annotation("project", "dtool-testing")
    for tag in ("amazing", "stuff"):
        dataset.put_tag(tag)

    with tmp_env_var("DTOOL_S3_PUBLISH_EXPIRY", "120"):
        access_url = dataset._storage_broker.http_enable()

    # A presigned URL carries its signature in the query string.
    assert "?" in access_url
    assert access_url.startswith("https://")

    dataset_from_http = DataSet.from_uri(access_url)

    # The annotation and the tags must be readable through the HTTP URL.
    assert dataset_from_http.get_annotation("project") == "dtool-testing"
    assert dataset_from_http.list_tags() == ["amazing", "stuff"]

    assert len(diff_identifiers(dataset, dataset_from_http)) == 0
    assert len(diff_sizes(dataset, dataset_from_http)) == 0
    assert len(diff_content(dataset_from_http, dataset)) == 0

    # Every URL recorded in the HTTP manifest must be presigned as well.
    http_manifest = dataset_from_http._storage_broker.http_manifest
    for single_url_key in ("manifest_url", "readme_url"):
        assert "?" in http_manifest[single_url_key]
    for url_map_key in ("item_urls", "annotations"):
        for url in http_manifest[url_map_key].values():
            assert "?" in url
예제 #3
0
def diff(full, dataset_uri, reference_dataset_uri):
    """Report the difference between two datasets.

    1. Checks that the identifiers are identical
    2. Checks that the sizes are identical
    3. Checks that the hashes are identical, if the '--full' option is used

    If a difference is detected in step 1, steps 2 and 3 will not be carried
    out. Similarly if a difference is detected in step 2, step 3 will not be
    carried out.

    When checking that the hashes are identical the hashes for the first
    dataset are recalculated using the hashing algorithm of the reference
    dataset.

    The process exits with status 1, 2 or 3 when a difference is found in
    step 1, 2 or 3 respectively.
    """
    def report_and_exit(differences, desc, prop, exit_code):
        # Print a red header, a column legend, one line per differing
        # item, and then terminate with the step-specific exit status.
        click.secho("Different {}".format(desc), fg="red")
        click.secho("ID, {} in '{}', {} in '{}'".format(
            prop, ds.name, prop, ref_ds.name))
        for entry in differences:
            click.secho("{}, {}, {}".format(entry[0], entry[1], entry[2]))
        sys.exit(exit_code)

    ds = dtoolcore.DataSet.from_uri(dataset_uri)
    ref_ds = dtoolcore.DataSet.from_uri(reference_dataset_uri)

    # The progress bars are sized by the reference dataset's item count.
    num_items = sum(1 for _ in ref_ds.identifiers)

    # Step 1: identifiers.
    ids_diff = diff_identifiers(ds, ref_ds)
    if ids_diff:
        report_and_exit(ids_diff, "identifiers", "present", 1)

    # Step 2: sizes.
    with click.progressbar(length=num_items, label="Comparing sizes") as bar:
        sizes_diff = diff_sizes(ds, ref_ds, bar)
    if sizes_diff:
        report_and_exit(sizes_diff, "sizes", "size", 2)

    # Step 3: content hashes, only on request.
    if not full:
        return
    with click.progressbar(length=num_items, label="Comparing hashes") as bar:
        content_diff = diff_content(ds, ref_ds, bar)
    if content_diff:
        report_and_exit(content_diff, "content", "hash", 3)
예제 #4
0
def test_http_enable(tmp_uuid_and_uri):  # NOQA
    """HTTP-enable a minimal dataset and compare it with the original.

    The dataset loaded from the URL returned by ``http_enable`` must show
    no differences in identifiers, sizes or content.
    """
    from dtoolcore import DataSet, ProtoDataSet, generate_admin_metadata
    from dtoolcore.compare import diff_content, diff_identifiers, diff_sizes

    uuid, dest_uri = tmp_uuid_and_uri

    admin_metadata = generate_admin_metadata("my_dataset")
    admin_metadata["uuid"] = uuid

    local_file_path = os.path.join(TEST_SAMPLE_DATA, 'tiny.png')

    # Build and freeze a minimal dataset: one item plus a readme.
    proto_dataset = ProtoDataSet(uri=dest_uri,
                                 admin_metadata=admin_metadata,
                                 config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.put_readme("---\nproject: testing\n")
    proto_dataset.freeze()

    dataset = DataSet.from_uri(dest_uri)

    access_url = dataset._storage_broker.http_enable()
    assert access_url.startswith("https://")

    dataset_from_http = DataSet.from_uri(access_url)

    # None of the three comparisons may report a difference.
    identifier_diff = diff_identifiers(dataset, dataset_from_http)
    assert len(identifier_diff) == 0
    size_diff = diff_sizes(dataset, dataset_from_http)
    assert len(size_diff) == 0
    content_diff = diff_content(dataset_from_http, dataset)
    assert len(content_diff) == 0
예제 #5
0
def test_copy_and_diff(tmp_uuid_and_uri):  # NOQA
    """Copy a dataset into a temporary directory and diff it with the source.

    The copy must be reported with a "file:/" URI and must not differ from
    the source dataset in identifiers, sizes or content.
    """
    import dtoolcore
    from dtoolcore import DataSet, ProtoDataSet, generate_admin_metadata
    from dtoolcore.compare import (
        diff_content,
        diff_identifiers,
        diff_sizes,
    )

    uuid, dest_uri = tmp_uuid_and_uri

    admin_metadata = generate_admin_metadata("my_dataset")
    admin_metadata["uuid"] = uuid

    local_file_path = os.path.join(TEST_SAMPLE_DATA, 'tiny.png')

    # Build and freeze a minimal dataset: a readme plus one item.
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
    )
    proto_dataset.create()
    proto_dataset.put_readme(content='---\ndescription: test')
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.freeze()

    remote_dataset = DataSet.from_uri(dest_uri)

    with tmp_directory() as local_dir:
        local_uri = dtoolcore.copy(dest_uri, local_dir)
        assert local_uri.startswith("file:/")

        local_dataset = DataSet.from_uri(local_uri)
        # Run the comparisons in order: identifiers, sizes, content.
        for compare in (diff_identifiers, diff_sizes, diff_content):
            assert len(compare(local_dataset, remote_dataset)) == 0
예제 #6
0
def test_http_enable(tmp_uuid_and_uri):  # NOQA
    """Check the generated HTTP manifest and the HTTP-enabled dataset.

    First the manifest produced by ``_generate_http_manifest`` is inspected
    for its expected keys. Then an annotation and two tags are added, the
    dataset is HTTP-enabled, and the dataset loaded from the returned URL
    is compared with the original.
    """
    from dtoolcore import DataSet, ProtoDataSet, generate_admin_metadata
    from dtoolcore.compare import diff_content, diff_identifiers, diff_sizes

    uuid, dest_uri = tmp_uuid_and_uri

    admin_metadata = generate_admin_metadata("my_dataset")
    admin_metadata["uuid"] = uuid

    local_file_path = os.path.join(TEST_SAMPLE_DATA, 'tiny.png')

    # Build and freeze a minimal dataset: one item plus a readme.
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.put_readme("---\nproject: testing\n")
    proto_dataset.freeze()

    dataset = DataSet.from_uri(dest_uri)

    # Inspect the HTTP manifest before anything is published.
    http_manifest = dataset._storage_broker._generate_http_manifest()
    assert "admin_metadata" in http_manifest
    assert http_manifest["admin_metadata"] == dataset._admin_metadata
    remaining_keys = (
        "overlays",
        "readme_url",
        "manifest_url",
        "item_urls",
        "annotations",
        "tags",
    )
    for key in remaining_keys:
        assert key in http_manifest
    # There must be exactly one item URL per dataset identifier.
    assert set(http_manifest["item_urls"]) == set(dataset.identifiers)

    # Attach an annotation and tags so they can be checked over HTTP.
    dataset.put_annotation("project", "dtool-testing")
    for tag in ("amazing", "stuff"):
        dataset.put_tag(tag)

    access_url = dataset._storage_broker.http_enable()
    assert access_url.startswith("https://")

    dataset_from_http = DataSet.from_uri(access_url)

    # The annotation and the tags must be readable through the HTTP URL.
    assert dataset_from_http.get_annotation("project") == "dtool-testing"
    assert dataset_from_http.list_tags() == ["amazing", "stuff"]

    assert len(diff_identifiers(dataset, dataset_from_http)) == 0
    assert len(diff_sizes(dataset, dataset_from_http)) == 0
    assert len(diff_content(dataset_from_http, dataset)) == 0