import time
from urllib.parse import urlparse

import dtoolcore


def main(input_uri, output_prefix, output_storage):
    # List every dataset under the input base URI and copy each one to
    # the destination storage, reporting how long each copy takes.
    StorageBroker = dtoolcore._get_storage_broker(input_uri, None)
    parsed_uri = urlparse(input_uri)
    for uri in StorageBroker.list_dataset_uris(parsed_uri.path, None):
        dataset = dtoolcore.DataSet.from_uri(uri)
        name = dataset.name
        start = time.time()
        dtoolcore.copy(uri, output_prefix, output_storage)
        elapsed = time.time() - start
        print("Copying {} took: {}s".format(name, elapsed))
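# A minimal invocation sketch for main() above; the bucket name and local
# paths are hypothetical placeholders, not values from the original code.
if __name__ == "__main__":
    main(
        input_uri="s3://hypothetical-bucket",
        output_prefix="/tmp/dataset-mirror",
        output_storage="file",
    )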
import os
from urllib.parse import urlparse

import click
import dtoolcore

# CONFIG_PATH is defined in the host CLI module (dtool_cli in the dtool
# project).
from dtool_cli.cli import CONFIG_PATH


def copy(quiet, dataset_uri, dest_location_uri):
    """Copy a dataset to a different location."""
    src_dataset = dtoolcore.DataSet.from_uri(dataset_uri)

    # Generate the destination URI.
    parsed_location_uri = urlparse(dest_location_uri)
    prefix = parsed_location_uri.path
    storage = parsed_location_uri.scheme
    if storage == "":
        storage = "file"
    dest_uri = dtoolcore._generate_uri(
        admin_metadata=src_dataset._admin_metadata,
        prefix=prefix,
        storage=storage)

    # Check if the destination URI is already a dataset
    # and exit gracefully if true.
    if dtoolcore._is_dataset(dest_uri, config_path=CONFIG_PATH):
        raise click.UsageError(
            "Dataset already exists: {}".format(dest_uri))

    # If the destination URI is a "file" dataset one needs to check if
    # the path already exists and exit gracefully if true.
    parsed_dataset_uri = urlparse(dest_uri)
    if storage == "file":
        if os.path.exists(parsed_dataset_uri.path):
            raise click.UsageError(
                "Path already exists: {}".format(parsed_dataset_uri.path))

    # Finally do the copy.
    if quiet:
        dest_uri = dtoolcore.copy(
            src_uri=dataset_uri,
            prefix=prefix,
            storage=storage,
            config_path=CONFIG_PATH)
        click.secho(dest_uri)
    else:
        # dtoolcore.copy advances the progress bar twice per item, hence
        # the length of num_items * 2.
        num_items = len(list(src_dataset.identifiers))
        with click.progressbar(length=num_items * 2,
                               label="Copying dataset") as progressbar:
            dest_uri = dtoolcore.copy(
                src_uri=dataset_uri,
                prefix=prefix,
                storage=storage,
                config_path=CONFIG_PATH,
                progressbar=progressbar)
        click.secho("Dataset copied to:\n{}".format(dest_uri))
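# A sketch of the click wiring this command would typically need; the
# option and argument declarations below are assumptions for illustration,
# not the actual dtool CLI definitions:
#
#     @click.command()
#     @click.option("-q", "--quiet", is_flag=True,
#                   help="Only print the destination URI.")
#     @click.argument("dataset_uri")
#     @click.argument("dest_location_uri")
#     def copy(quiet, dataset_uri, dest_location_uri):
#         ...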
def test_copy(tmp_uri_fixture):  # NOQA
    import dtoolcore

    src_dir = os.path.join(uri_to_path(tmp_uri_fixture), "src")
    dest_dir = os.path.join(uri_to_path(tmp_uri_fixture), "dest")
    for directory in [src_dir, dest_dir]:
        os.mkdir(directory)

    # Create the src dataset to be copied.
    admin_metadata = dtoolcore.generate_admin_metadata("test_copy")
    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=admin_metadata,
        base_uri=tmp_uri_fixture + "/src")
    proto_dataset.create()
    src_uri = proto_dataset.uri

    proto_dataset.put_readme("---\nproject: exciting\n")

    overlay = "file_extension"
    for fname in os.listdir(TEST_SAMPLE_DATA):
        _, ext = os.path.splitext(fname)
        item_fpath = os.path.join(TEST_SAMPLE_DATA, fname)
        proto_dataset.put_item(item_fpath, fname)
        proto_dataset.add_item_metadata(fname, overlay, ext)

    proto_dataset.freeze()

    # Copy the src dataset to dest.
    dest_uri = dtoolcore.copy(src_uri, tmp_uri_fixture + "/dest")

    # Compare the two datasets.
    src_ds = dtoolcore.DataSet.from_uri(src_uri)
    dest_ds = dtoolcore.DataSet.from_uri(dest_uri)

    for key, value in src_ds._admin_metadata.items():
        if key == "frozen_at":
            tolerance = 2  # seconds (number chosen arbitrarily)
            assert dest_ds._admin_metadata[key] >= value
            assert dest_ds._admin_metadata[key] < value + tolerance
        else:
            assert dest_ds._admin_metadata[key] == value

    assert src_ds.identifiers == dest_ds.identifiers
    for i in src_ds.identifiers:
        src_item_props = src_ds.item_properties(i)
        dest_item_props = dest_ds.item_properties(i)
        for key, value in src_item_props.items():
            if key == "utc_timestamp":
                tolerance = 2  # seconds (number chosen arbitrarily)
                assert dest_item_props[key] >= value
                assert dest_item_props[key] < value + tolerance
            else:
                assert dest_item_props[key] == value

    assert src_ds.get_readme_content() == dest_ds.get_readme_content()
    assert src_ds.list_overlay_names() == dest_ds.list_overlay_names()
    assert src_ds.get_overlay(overlay) == dest_ds.get_overlay(overlay)
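# The tmp_uri_fixture used above is assumed to be a pytest fixture that
# yields a "file://" base URI for a throwaway directory; a minimal sketch,
# not the project's actual fixture:
import shutil
import tempfile
from pathlib import Path

import pytest


@pytest.fixture
def tmp_uri_fixture():
    d = tempfile.mkdtemp()
    yield Path(d).as_uri()
    shutil.rmtree(d)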
def test_frozen_at_value_when_copying_dataset(tmp_dir_fixture):  # NOQA
    with dc.DataSetCreator("delete-me", tmp_dir_fixture) as ds_creator:
        src_uri = ds_creator.uri

    dest_base_uri = os.path.join(tmp_dir_fixture, "dest")
    os.mkdir(dest_base_uri)

    src_dataset = dc.DataSet.from_uri(src_uri)
    src_frozen_at = src_dataset._admin_metadata["frozen_at"]

    # Sleep so that wall-clock time moves on; the equality assertion below
    # then proves that copy() preserves the original frozen_at timestamp
    # rather than regenerating it.
    time.sleep(0.1)

    dest_uri = dc.copy(src_uri, dest_base_uri)
    dest_dataset = dc.DataSet.from_uri(dest_uri)
    dest_frozen_at = dest_dataset._admin_metadata["frozen_at"]

    assert src_frozen_at == dest_frozen_at
def test_copy(tmp_dir_fixture):  # NOQA
    import dtoolcore

    src_dir = os.path.join(tmp_dir_fixture, "src")
    dest_dir = os.path.join(tmp_dir_fixture, "dest")
    for directory in [src_dir, dest_dir]:
        os.mkdir(directory)

    # Create the src dataset to be copied.
    admin_metadata = dtoolcore.generate_admin_metadata("test_copy")
    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=admin_metadata,
        prefix=src_dir,
        storage="file")
    proto_dataset.create()
    src_uri = proto_dataset.uri

    proto_dataset.put_readme("---\nproject: exciting\n")

    overlay = "file_extension"
    for fname in os.listdir(TEST_SAMPLE_DATA):
        _, ext = os.path.splitext(fname)
        item_fpath = os.path.join(TEST_SAMPLE_DATA, fname)
        proto_dataset.put_item(item_fpath, fname)
        proto_dataset.add_item_metadata(fname, overlay, ext)

    proto_dataset.freeze()

    # Copy the src dataset to dest.
    dest_uri = dtoolcore.copy(src_uri, dest_dir, "file")

    # Compare the two datasets.
    src_ds = dtoolcore.DataSet.from_uri(src_uri)
    dest_ds = dtoolcore.DataSet.from_uri(dest_uri)

    assert src_ds._admin_metadata == dest_ds._admin_metadata
    assert src_ds.identifiers == dest_ds.identifiers
    for i in src_ds.identifiers:
        assert src_ds.item_properties(i) == dest_ds.item_properties(i)
    assert src_ds.get_readme_content() == dest_ds.get_readme_content()
    assert src_ds.list_overlay_names() == dest_ds.list_overlay_names()
    assert src_ds.get_overlay(overlay) == dest_ds.get_overlay(overlay)
def test_dataset_verify_functional(tmp_dir_fixture):  # NOQA
    from dtool_info.dataset import verify

    # lion_dataset_uri is assumed to point at a small frozen sample
    # dataset shipped with the test suite (it contains a file.txt item).
    uri = dtoolcore.copy(lion_dataset_uri, tmp_dir_fixture, "file")
    dataset = dtoolcore.DataSet.from_uri(uri)

    runner = CliRunner()

    # A pristine copy verifies cleanly.
    result = runner.invoke(verify, [uri])
    assert result.exit_code == 0
    assert result.output.startswith("All good")

    # An item that is not in the manifest is reported as unknown.
    extra_fpath = os.path.join(
        dataset._storage_broker._data_abspath, "extra.txt")
    with open(extra_fpath, "w") as fh:
        fh.write("extra")
    result = runner.invoke(verify, [uri])
    assert result.exit_code == 1
    assert result.output.startswith("Unknown item: ")
    os.unlink(extra_fpath)

    # A deleted item is reported as missing.
    item_fpath = os.path.join(
        dataset._storage_broker._data_abspath, "file.txt")
    os.unlink(item_fpath)
    result = runner.invoke(verify, [uri])
    assert result.exit_code == 1
    assert result.output.startswith("Missing item: ")

    # Changed content is reported by size, and additionally by hash when
    # the --full option is used.
    with open(item_fpath, "w") as fh:
        fh.write("Different content")
    result = runner.invoke(verify, [uri])
    assert result.exit_code == 1
    assert result.output.startswith("Altered item size: ")

    result = runner.invoke(verify, ["--full", uri])
    assert result.exit_code == 1
    assert result.output.startswith("Altered item size: ")
    assert result.output.find("Altered item hash: ") != -1
def test_copy_and_diff(tmp_uuid_and_uri):  # NOQA
    uuid, dest_uri = tmp_uuid_and_uri

    import dtoolcore
    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.compare import (
        diff_identifiers,
        diff_sizes,
        diff_content,
    )

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata)
    proto_dataset.create()
    proto_dataset.put_readme(content='---\ndescription: test')
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.freeze()

    remote_dataset = DataSet.from_uri(dest_uri)

    with tmp_directory() as local_dir:
        local_uri = dtoolcore.copy(dest_uri, local_dir)
        assert local_uri.startswith("file:/")
        local_dataset = DataSet.from_uri(local_uri)
        assert len(diff_identifiers(local_dataset, remote_dataset)) == 0
        assert len(diff_sizes(local_dataset, remote_dataset)) == 0
        assert len(diff_content(local_dataset, remote_dataset)) == 0
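# The tmp_directory helper used above is assumed to be a simple
# self-cleaning context manager along these lines (a sketch, not the
# project's actual helper):
import contextlib
import shutil
import tempfile


@contextlib.contextmanager
def tmp_directory():
    d = tempfile.mkdtemp()
    try:
        yield d
    finally:
        shutil.rmtree(d)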
def test_annotation_functional(tmp_dir_fixture):  # NOQA
    from dtoolcore import (
        DataSet,
        ProtoDataSet,
        DtoolCoreKeyError,
        DtoolCoreInvalidNameError,
        generate_admin_metadata,
        copy,
    )
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset.
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')

    # Test working on annotations with a ProtoDataset.
    with pytest.raises(DtoolCoreKeyError):
        proto_dataset.get_annotation(annotation_name="project")

    proto_dataset.put_annotation(
        annotation_name="project",
        annotation="world-peace")
    assert proto_dataset.get_annotation("project") == "world-peace"

    proto_dataset.put_annotation("project", "food-sustainability")
    assert proto_dataset.get_annotation("project") == "food-sustainability"

    assert proto_dataset.list_annotation_names() == ["project"]

    # Freeze the dataset.
    proto_dataset.put_readme("")
    proto_dataset.freeze()

    # Test working on annotations with a frozen DataSet.
    dataset = DataSet.from_uri(dest_uri)
    with pytest.raises(DtoolCoreKeyError):
        dataset.get_annotation(annotation_name="stars")

    dataset.put_annotation(annotation_name="stars", annotation=0)
    assert dataset.get_annotation("stars") == 0

    dataset.put_annotation("stars", 5)
    assert dataset.get_annotation("stars") == 5

    assert dataset.list_annotation_names() == ["project", "stars"]

    # Test invalid names (spaces, punctuation, over-long).
    invalid_keys = ["with space", "with,comma", "with/slash", "X" * 81]
    for invalid_key in invalid_keys:
        with pytest.raises(DtoolCoreInvalidNameError):
            dataset.put_annotation(invalid_key, "bad")

    # Test invalid keys, name too long.
    with pytest.raises(DtoolCoreInvalidNameError):
        dataset.put_annotation("x" * 81, "bad")

    # Test copy.
    copy_dataset_directory = os.path.join(tmp_dir_fixture, "copy")
    os.mkdir(copy_dataset_directory)
    dest_uri = dataset.base_uri + "/copy"
    copy_uri = copy(dataset.uri, dest_uri)
    copy_dataset = DataSet.from_uri(copy_uri)
    assert copy_dataset.list_annotation_names() == ["project", "stars"]
    assert copy_dataset.get_annotation("stars") == 5
    assert copy_dataset.get_annotation("project") == "food-sustainability"
def test_overlays_functional(tmp_dir_fixture):  # NOQA
    from dtoolcore import (
        DataSet,
        ProtoDataSet,
        DtoolCoreKeyError,
        DtoolCoreTypeError,
        DtoolCoreValueError,
        DtoolCoreInvalidNameError,
        generate_admin_metadata,
        copy,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset.
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')

    # Freeze the dataset.
    proto_dataset.put_readme("")
    proto_dataset.freeze()

    # Load the dataset.
    dataset = DataSet.from_uri(proto_dataset.uri)

    # The overlay has not been added yet.
    with pytest.raises(DtoolCoreKeyError):
        dataset.get_overlay("is_png")

    # Create overlay content.
    expected_identifier = generate_identifier('tiny.png')
    is_png_overlay = {expected_identifier: True}

    with pytest.raises(DtoolCoreTypeError):
        dataset.put_overlay("is_png", "not_a_dict")

    incorrect_identifier_overlay = {"incorrect": True}
    with pytest.raises(DtoolCoreValueError):
        dataset.put_overlay("is_png", incorrect_identifier_overlay)

    invalid_keys = ["with space", "with,comma", "with/slash", "X" * 81]
    for invalid_key in invalid_keys:
        with pytest.raises(DtoolCoreInvalidNameError):
            dataset.put_overlay(invalid_key, is_png_overlay)

    dataset.put_overlay("is_png", is_png_overlay)
    assert dataset.get_overlay("is_png") == is_png_overlay

    # Test copy.
    copy_dataset_directory = os.path.join(tmp_dir_fixture, "copy")
    os.mkdir(copy_dataset_directory)
    dest_uri = dataset.base_uri + "/copy"
    copy_uri = copy(dataset.uri, dest_uri)
    copy_dataset = DataSet.from_uri(copy_uri)
    assert copy_dataset.list_overlay_names() == ["is_png"]
    assert copy_dataset.get_overlay("is_png") == is_png_overlay