def stage_outputs(self, identifier):
    read1_handle = None
    read2_handle = None
    for filename in self.outputs:
        useful_name = self.input_dataset.get_overlay(
            'useful_name')[identifier]
        fpath = os.path.join(self.working_directory, filename)
        relpath = os.path.join(useful_name, filename)
        out_id = self.output_proto_dataset.put_item(fpath, relpath)
        self.output_proto_dataset.add_item_metadata(
            out_id,
            'from',
            "{}/{}".format(self.input_dataset.uri, identifier))

        # Add is_read1 overlay.
        if filename.find("_1") != -1:
            self.output_proto_dataset.add_item_metadata(
                out_id, "is_read1", True)
            read1_handle = out_id
        else:
            self.output_proto_dataset.add_item_metadata(
                out_id, "is_read1", False)
            read2_handle = out_id

    # Add pair_id overlay.
    self.output_proto_dataset.add_item_metadata(
        read1_handle, "pair_id", generate_identifier(read2_handle))
    self.output_proto_dataset.add_item_metadata(
        read2_handle, "pair_id", generate_identifier(read1_handle))
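# A minimal, hypothetical sketch (not part of the pipeline above) of how the
# "is_read1" and "pair_id" overlays written by stage_outputs() could be used
# once the output dataset has been frozen; the dataset URI below is made up.
from dtoolcore import DataSet

ds = DataSet.from_uri("file:///tmp/paired-reads-output")  # hypothetical URI
is_read1 = ds.get_overlay("is_read1")
pair_id = ds.get_overlay("pair_id")
for identifier in ds.identifiers:
    if is_read1[identifier]:
        mate_identifier = pair_id[identifier]  # identifier of the read2 item
        print(
            ds.item_properties(identifier)["relpath"],
            ds.item_properties(mate_identifier)["relpath"],
        )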
def test_diff_identifiers(tmp_uri_fixture):  # NOQA
    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.compare import diff_identifiers

    fpaths = create_test_files(tmp_uri_fixture)

    proto_ds_a = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_1"),
        base_uri=tmp_uri_fixture)
    proto_ds_a.create()
    proto_ds_a.put_item(fpaths["cat"], "a.txt")
    proto_ds_a.freeze()

    proto_ds_b = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_2"),
        base_uri=tmp_uri_fixture)
    proto_ds_b.create()
    proto_ds_b.put_item(fpaths["cat"], "b.txt")
    proto_ds_b.freeze()

    ds_a = DataSet.from_uri(proto_ds_a.uri)
    ds_b = DataSet.from_uri(proto_ds_b.uri)

    assert diff_identifiers(ds_a, ds_a) == []

    expected = [
        (generate_identifier("a.txt"), True, False),
        (generate_identifier("b.txt"), False, True),
    ]
    assert diff_identifiers(ds_a, ds_b) == expected
def put_item(self, fpath, relpath):
    logger.debug("Put item {}".format(self))

    # Here the MD5 checksum is calculated so that it can be uploaded with
    # the item as a piece of metadata. This is needed as the AWS etag is
    # not the md5 sum of the uploaded object for items that are uploaded
    # using multipart uploads (large files).
    # See: https://stackoverflow.com/a/43067788
    checksum = S3StorageBroker.hasher(fpath)
    fname = generate_identifier(relpath)
    dest_path = self.data_key_prefix + fname
    extra_args = {
        'Metadata': {
            'handle': relpath,
            'checksum': checksum,
        }
    }
    _put_item_with_retry(
        s3client=self.s3client,
        s3resource=self.s3resource,
        fpath=fpath,
        bucket=self.bucket,
        dest_path=dest_path,
        extra_args=extra_args
    )
    return relpath
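# A hedged sketch (separate from the broker code above) of how the checksum
# stored as object metadata by put_item() could later be read back with
# boto3; the bucket and key values are hypothetical.
import boto3

s3client = boto3.client("s3")
response = s3client.head_object(
    Bucket="my-dataset-bucket",                 # hypothetical bucket name
    Key="my-uuid/data/" + "<item-identifier>",  # hypothetical data key
)
# User-defined object metadata is returned with lower-case keys.
print(response["Metadata"]["checksum"], response["Metadata"]["handle"])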
def get_item_metadata(self, handle):
    """Return dictionary containing all metadata associated with handle.

    In other words all the metadata added using the ``add_item_metadata``
    method.

    :param handle: handle for accessing an item before the dataset is
                   frozen
    :returns: dictionary containing item metadata
    """
    logger.debug("Get item metadata {}".format(self))
    bucket = self.s3resource.Bucket(self.bucket)

    metadata = {}
    identifier = generate_identifier(handle)
    prefix = self.fragments_key_prefix + '{}'.format(identifier)
    for obj in bucket.objects.filter(Prefix=prefix).all():
        metadata_key = obj.key.split('.')[-2]
        response = obj.get()
        value_as_string = response['Body'].read().decode('utf-8')
        value = json.loads(value_as_string)
        metadata[metadata_key] = value

    return metadata
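# Tiny illustration of how get_item_metadata() above recovers the metadata
# key from a fragment object key; the key value is hypothetical.
fragment_key = "my-uuid/fragments/abc123.ext.json"  # hypothetical object key
print(fragment_key.split('.')[-2])  # -> "ext"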
def test_DataSetCreator(tmp_dir_fixture):  # NOQA

    import dtoolcore
    from dtoolcore.utils import generate_identifier

    name = "my-test-ds"
    base_uri = _sanitise_base_uri(tmp_dir_fixture)
    readme_content = "---\ndescription: a test dataset"
    creator_username = "******"

    local_file_path = os.path.join(TEST_SAMPLE_DATA, "tiny.png")

    with dtoolcore.DataSetCreator(
        name=name,
        base_uri=base_uri,
        readme_content=readme_content,
        creator_username=creator_username
    ) as dataset_creator:
        assert dataset_creator.name == name
        uri = dataset_creator.uri
        handle = dataset_creator.put_item(local_file_path, "subdir/tiny.png")
        dataset_creator.add_item_metadata(handle, "ext", ".png")

    # The below would raise if the dataset was not frozen.
    dataset = dtoolcore.DataSet.from_uri(uri)

    # Check the content.
    expected_identifier = generate_identifier("subdir/tiny.png")
    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1

    # Check item metadata
    expected_ext_overlay = {expected_identifier: ".png"}
    assert dataset.get_overlay("ext") == expected_ext_overlay
def test_basic_workflow(tmp_dir_fixture):  # NOQA

    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.utils import generate_identifier
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.freeze()

    # Read in a dataset
    dataset = DataSet.from_uri(dest_uri)

    expected_identifier = generate_identifier('tiny.png')
    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1
def get_item_metadata(self, handle):
    """Return dictionary containing all metadata associated with handle.

    In other words all the metadata added using the ``add_item_metadata``
    method.

    :param handle: handle for accessing an item before the dataset is
                   frozen
    :returns: dictionary containing item metadata
    """
    metadata = {}
    identifier = generate_identifier(handle)
    prefix = self.fragments_key_prefix + '{}'.format(identifier)
    blob_generator = self._blobservice.list_blobs(
        self.uuid,
        include='metadata',
        prefix=prefix)
    for blob in blob_generator:
        metadata_key = blob.name.split('.')[-2]
        value_as_string = self.get_text(blob.name)
        value = json.loads(value_as_string)
        metadata[metadata_key] = value

    return metadata
def test_item_local_abspath_with_clean_cache(tmp_uuid_and_uri):  # NOQA
    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.utils import generate_identifier

    uuid, dest_uri = tmp_uuid_and_uri

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.freeze()

    identifier = generate_identifier('tiny.png')

    with tmp_directory() as cache_dir:
        with tmp_env_var("DTOOL_S3_CACHE_DIRECTORY", cache_dir):
            dataset = DataSet.from_uri(dest_uri)
            fpath = dataset.item_content_abspath(identifier)
            assert os.path.isfile(fpath)
def test_basic_workflow_on_first_namespace(tmp_uuid_and_uri):  # NOQA
    uuid, dest_uri = tmp_uuid_and_uri

    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.utils import generate_identifier

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.freeze()

    # Read in a dataset
    dataset = DataSet.from_uri(dest_uri)

    expected_identifier = generate_identifier('tiny.png')
    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1
def test_diff_content(tmp_dir_fixture):  # NOQA
    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.compare import diff_content
    from dtoolcore.storagebroker import DiskStorageBroker

    fpaths = create_test_files(tmp_dir_fixture)

    proto_ds_a = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_1"),
        prefix=tmp_dir_fixture,
        storage="file")
    proto_ds_a.create()
    proto_ds_a.put_item(fpaths["cat"], "file.txt")
    proto_ds_a.freeze()

    proto_ds_b = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_2"),
        prefix=tmp_dir_fixture,
        storage="file")
    proto_ds_b.create()
    proto_ds_b.put_item(fpaths["she"], "file.txt")
    proto_ds_b.freeze()

    ds_a = DataSet.from_uri(proto_ds_a.uri)
    ds_b = DataSet.from_uri(proto_ds_b.uri)

    assert diff_content(ds_a, ds_a) == []

    identifier = generate_identifier("file.txt")
    expected = [
        (
            generate_identifier("file.txt"),
            DiskStorageBroker.hasher(ds_a.item_content_abspath(identifier)),
            DiskStorageBroker.hasher(ds_b.item_content_abspath(identifier))
        )
    ]
    assert diff_content(ds_a, ds_b) == expected
def test_basic_workflow_with_nested_handle(tmp_dir_fixture):  # NOQA

    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.utils import generate_identifier
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')
    handle = "subdir/tiny.png"

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, handle)
    proto_dataset.freeze()

    # Read in a dataset
    dataset = DataSet.from_uri(dest_uri)

    expected_identifier = generate_identifier(handle)
    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1

    # Ensure that the file exists in the disk dataset.
    # Particularly on Windows.
    item_abspath = os.path.join(
        tmp_dir_fixture,
        name,
        "data",
        "subdir",
        "tiny.png"
    )
    assert os.path.isfile(item_abspath)
    assert os.path.isfile(dataset.item_content_abspath(expected_identifier))

    # Ensure that the correct abspath is returned.
    # Particularly on Windows.
    assert dataset.item_content_abspath(expected_identifier) == item_abspath  # NOQA
def add_item_metadata(self, handle, key, value):
    """Store the given key:value pair for the item associated with handle.

    :param handle: handle for accessing an item before the dataset is
                   frozen
    :param key: metadata key
    :param value: metadata value
    """
    logger.debug("Add item metadata {}".format(self))
    identifier = generate_identifier(handle)
    suffix = '{}.{}.json'.format(identifier, key)
    bucket_fpath = self.fragments_key_prefix + suffix
    self.s3resource.Object(self.bucket, bucket_fpath).put(
        Body=json.dumps(value))
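# A small illustration (assumption, not broker code) of the fragment key
# layout produced by add_item_metadata() above; the prefix and handle values
# are hypothetical.
from dtoolcore.utils import generate_identifier

fragments_key_prefix = "my-uuid/fragments/"  # hypothetical prefix
handle = "subdir/tiny.png"
key = "ext"
print(fragments_key_prefix + "{}.{}.json".format(generate_identifier(handle), key))
# e.g. my-uuid/fragments/<sha1-of-handle>.ext.json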
def put_item(self, fpath, relpath):
    """Put item with content from fpath at relpath in dataset.

    Missing directories in relpath are created on the fly.

    :param fpath: path to the item on local disk
    :param relpath: relative path name given to the item in the dataset as
                    a handle
    """
    # Put the file into iRODS.
    fname = generate_identifier(relpath)
    dest_path = os.path.join(self._data_abspath, fname)
    _cp(fpath, dest_path)

    # Add the relpath handle as metadata.
    _put_metadata(dest_path, "handle", relpath)

    return relpath
def put_item(self, fpath, relpath):
    identifier = generate_identifier(relpath)

    self._blobservice.create_blob_from_path(
        self.uuid,
        identifier,
        fpath,
        content_settings=ContentSettings(
            content_md5=_get_md5sum(fpath))
    )

    self._blobservice.set_blob_metadata(
        container_name=self.uuid,
        blob_name=identifier,
        metadata={
            "relpath": relpath,
            "type": "item"
        }
    )

    return relpath
def test_DataSetCreator_staging_api_stage_item(tmp_dir_fixture):  # NOQA

    import dtoolcore
    from dtoolcore.utils import generate_identifier

    name = "my-test-ds"
    base_uri = _sanitise_base_uri(tmp_dir_fixture)
    readme_content = "---\ndescription: a test dataset"
    creator_username = "******"

    handle = "subdir/test.txt"

    with dtoolcore.DataSetCreator(
        name=name,
        base_uri=base_uri,
        readme_content=readme_content,
        creator_username=creator_username
    ) as dataset_creator:
        # Ensure that the staging directory exists.
        assert os.path.isdir(dataset_creator.staging_directory)

        # Add an item more programmatically.
        staging_abspath = dataset_creator.prepare_staging_abspath_promise(  # NOQA
            handle)
        with open(staging_abspath, "w") as fh:
            fh.write("Hello world!")

        uri = dataset_creator.uri

    # Ensure that the staging directory has been removed.
    assert not os.path.isdir(dataset_creator.staging_directory)

    # The below would raise if the dataset was not frozen.
    dataset = dtoolcore.DataSet.from_uri(uri)

    # Check the content.
    expected_identifier = generate_identifier(handle)
    assert expected_identifier in dataset.identifiers

    manual_item_props = dataset.item_properties(expected_identifier)
    assert manual_item_props["size_in_bytes"] == 12

    assert len(dataset.identifiers) == 1
def add_item_metadata(self, handle, key, value):
    """Store the given key:value pair for the item associated with handle.

    :param handle: handle for accessing an item before the dataset is
                   frozen
    :param key: metadata key
    :param value: metadata value
    """
    identifier = generate_identifier(handle)

    metadata_blob_suffix = "{}.{}.json".format(identifier, key)
    metadata_blob_name = self.fragments_key_prefix + metadata_blob_suffix

    self._blobservice.create_blob_from_text(
        self.uuid,
        metadata_blob_name,
        json.dumps(value)
    )

    self._blobservice.set_blob_metadata(
        container_name=self.uuid,
        blob_name=metadata_blob_name,
        metadata={"type": "item_metadata"}
    )
def test_diff_sizes(tmp_dir_fixture):  # NOQA
    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.compare import diff_sizes

    fpaths = create_test_files(tmp_dir_fixture)

    proto_ds_a = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_1"),
        prefix=tmp_dir_fixture,
        storage="file")
    proto_ds_a.create()
    proto_ds_a.put_item(fpaths["he"], "file.txt")
    proto_ds_a.freeze()

    proto_ds_b = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_2"),
        prefix=tmp_dir_fixture,
        storage="file")
    proto_ds_b.create()
    proto_ds_b.put_item(fpaths["she"], "file.txt")
    proto_ds_b.freeze()

    ds_a = DataSet.from_uri(proto_ds_a.uri)
    ds_b = DataSet.from_uri(proto_ds_b.uri)

    assert diff_sizes(ds_a, ds_a) == []

    expected = [
        (generate_identifier("file.txt"), 2, 3),
    ]
    assert diff_sizes(ds_a, ds_b) == expected
def item_properties(self, handle):
    """Return properties of the item with the given handle."""
    fname = generate_identifier(handle)
    irods_item_path = os.path.join(self._data_abspath, fname)

    # Get the hash.
    checksum = _get_checksum(irods_item_path)
    checksum_as_hex = base64_to_hex(checksum)

    # Get the UTC timestamp and the size in bytes.
    size, timestamp = self._get_size_and_timestamp_with_cache(
        irods_item_path
    )

    # Get the relpath from the handle metadata.
    relpath = self._get_metadata_with_cache(irods_item_path, "handle")

    properties = {
        'size_in_bytes': int(size),
        'utc_timestamp': timestamp,
        'hash': checksum_as_hex,
        'relpath': relpath
    }
    return properties
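# A hedged guess (assumption, not the actual helper) at what the
# base64_to_hex() call used above might do: decode a base64-encoded digest
# and re-encode it as a hex string.
import base64
import binascii

def base64_to_hex_sketch(value):
    return binascii.hexlify(base64.b64decode(value)).decode("ascii")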
def test_creation_and_reading(tmp_uuid_and_uri):  # NOQA
    from dtoolcore import ProtoDataSet, generate_admin_metadata

    uuid, dest_uri = tmp_uuid_and_uri

    name = "func_test_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)

    # Create a proto dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_readme("")

    assert proto_dataset.name == "func_test_dataset"

    # Test reading from URI.
    proto_dataset = ProtoDataSet.from_uri(dest_uri)
    assert proto_dataset.name == "func_test_dataset"

    # Test get/put readme.
    assert proto_dataset.get_readme_content() == ""
    proto_dataset.put_readme("Hello world!")
    assert proto_dataset.get_readme_content() == "Hello world!"

    # Test putting a local file
    handle = "tiny.png"
    local_file_path = os.path.join(sample_data_path, 'tiny.png')
    proto_dataset.put_item(local_file_path, handle)
    assert handle in list(proto_dataset._storage_broker.iter_item_handles())

    # Test properties of that file
    expected_hash = md5sum_hexdigest(
        os.path.join(sample_data_path, 'tiny.png'))
    item_properties = proto_dataset._storage_broker.item_properties(handle)
    assert item_properties['relpath'] == 'tiny.png'
    assert item_properties['size_in_bytes'] == 276
    assert item_properties['hash'] == expected_hash
    assert 'utc_timestamp' in item_properties

    time_from_item = datetime.datetime.fromtimestamp(
        float(item_properties['utc_timestamp']),
        tz=pytz.UTC
    )
    time_delta = datetime.datetime.now(tz=pytz.UTC) - time_from_item
    assert time_delta.days == 0
    assert time_delta.seconds < 100

    # Add metadata
    proto_dataset.add_item_metadata(handle, 'foo', 'bar')
    proto_dataset.add_item_metadata(
        handle,
        'key',
        {'subkey': 'subval', 'morekey': 'moreval'}
    )

    # Test metadata retrieval
    metadata = proto_dataset._storage_broker.get_item_metadata(handle)
    assert metadata == {
        'foo': 'bar',
        'key': {
            'subkey': 'subval',
            'morekey': 'moreval'
        }
    }

    # Add another item and test manifest
    from dtoolcore import __version__
    from dtoolcore.utils import generate_identifier

    local_file_path = os.path.join(sample_data_path, 'real_text_file.txt')
    proto_dataset.put_item(local_file_path, 'real_text_file.txt')
    second_handle = 'real_text_file.txt'

    generated_manifest = proto_dataset.generate_manifest()
    assert generated_manifest['hash_function'] == 'md5sum_hexdigest'
    assert generated_manifest['dtoolcore_version'] == __version__

    expected_identifier = generate_identifier(second_handle)
    assert expected_identifier in generated_manifest['items']
    assert generated_manifest['items'][expected_identifier]['relpath'] \
        == 'real_text_file.txt'
    expected_hash = md5sum_hexdigest(local_file_path)
    assert generated_manifest['items'][expected_identifier]['hash'] \
        == expected_hash
def _handle_to_fragment_prefixpath(self, handle):
    stem = generate_identifier(handle)
    logger.debug(
        "_handle_to_fragment_prefixpath, handle='{}', stem='{}'".format(
            handle, stem))
    return os.path.join(self._metadata_fragments_path, stem)
def test_generate_identifier():
    from dtoolcore.utils import generate_identifier

    string = "Test me"

    assert generate_identifier(handle=string) == \
        "9940674fb235beddae40df565cbfc688b824b362"
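# As far as the assertion above suggests, generate_identifier() is the SHA-1
# hex digest of the UTF-8 encoded handle, so the expected value can be
# reproduced directly with hashlib (this is an assumption about the
# implementation, not taken from the dtoolcore source):
import hashlib

def sha1_identifier(handle):
    return hashlib.sha1(handle.encode("utf-8")).hexdigest()

assert sha1_identifier("Test me") == "9940674fb235beddae40df565cbfc688b824b362"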
def _handle_to_fragment_absprefixpath(self, handle):
    stem = generate_identifier(handle)
    return os.path.join(self._metadata_fragments_abspath, stem)
def test_proto_dataset_freeze_functional(tmp_dir_fixture):  # NOQA

    from dtoolcore import (
        generate_admin_metadata,
        DataSet,
        ProtoDataSet,
        DtoolCoreTypeError
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "func_test_dataset_freeze"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)

    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None
    )
    proto_dataset.create()

    filenames = ['tiny.png', 'actually_a_png.txt', 'another_file.txt']
    for filename in filenames:
        local_file_path = os.path.join(sample_data_path, filename)
        proto_dataset.put_item(local_file_path, filename)
        proto_dataset.add_item_metadata(
            filename,
            'namelen',
            len(filename)
        )
        proto_dataset.add_item_metadata(
            filename,
            'firstletter',
            filename[0]
        )

    proto_dataset.put_readme(content='Hello world!')

    # We shouldn't be able to load this as a DataSet
    with pytest.raises(DtoolCoreTypeError):
        DataSet.from_uri(dest_uri)

    proto_dataset.freeze()

    # Freezing removes the temporary metadata fragments directory.
    assert not os.path.isdir(
        proto_dataset._storage_broker._metadata_fragments_abspath)

    # Now we shouldn't be able to load as a ProtoDataSet
    with pytest.raises(DtoolCoreTypeError):
        ProtoDataSet.from_uri(dest_uri)

    # But we can as a DataSet
    dataset = DataSet.from_uri(dest_uri)
    assert dataset.name == 'func_test_dataset_freeze'

    # Test identifiers
    expected_identifiers = map(generate_identifier, filenames)
    assert set(dataset.identifiers) == set(expected_identifiers)

    # Test readme contents
    assert dataset.get_readme_content() == "Hello world!"

    # Test item
    expected_identifier = generate_identifier('tiny.png')
    item_properties = dataset.item_properties(expected_identifier)
    assert item_properties['relpath'] == 'tiny.png'
    assert item_properties['size_in_bytes'] == 276
    assert item_properties['hash'] == 'dc73192d2f81d7009ce5a1ee7bad5755'

    # Test accessing item
    expected_identifier = generate_identifier('another_file.txt')
    fpath = dataset.item_content_abspath(expected_identifier)
    with open(fpath) as fh:
        contents = fh.read()
    assert contents == "Hello\n"

    # Test overlays have been created properly
    namelen_overlay = dataset.get_overlay('namelen')
    expected_identifier = generate_identifier('another_file.txt')
    assert namelen_overlay[expected_identifier] == len('another_file.txt')
def test_creation_and_reading(tmp_dir_fixture):  # NOQA
    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "func_test_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)

    # Create a proto dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_readme("")

    assert proto_dataset.name == "func_test_dataset"

    # Test reading from URI.
    proto_dataset = ProtoDataSet.from_uri(dest_uri)
    assert proto_dataset.name == "func_test_dataset"

    # Test get/put readme.
    assert proto_dataset.get_readme_content() == ""
    proto_dataset.put_readme("Hello world!")
    assert proto_dataset.get_readme_content() == "Hello world!"

    # Test putting a local file
    handle = "tiny.png"
    local_file_path = os.path.join(sample_data_path, 'tiny.png')
    proto_dataset.put_item(local_file_path, handle)
    assert handle in list(proto_dataset._storage_broker.iter_item_handles())

    # Test properties of that file
    item_properties = proto_dataset._storage_broker.item_properties(handle)
    assert item_properties['relpath'] == 'tiny.png'
    assert item_properties['size_in_bytes'] == 276
    assert item_properties['hash'] == 'dc73192d2f81d7009ce5a1ee7bad5755'
    assert 'utc_timestamp' in item_properties

    time_from_item = datetime.datetime.fromtimestamp(
        float(item_properties['utc_timestamp']),
        tz=pytz.UTC
    )

    time.sleep(0.1)  # Make tests more robust on Windows.

    time_delta = datetime.datetime.now(tz=pytz.UTC) - time_from_item
    assert time_delta.days == 0
    assert time_delta.seconds < 20

    # Add metadata
    proto_dataset.add_item_metadata(handle, 'foo', 'bar')
    proto_dataset.add_item_metadata(
        handle,
        'key',
        {'subkey': 'subval', 'morekey': 'moreval'}
    )

    # Test metadata retrieval
    metadata = proto_dataset._storage_broker.get_item_metadata(handle)
    assert metadata == {
        'foo': 'bar',
        'key': {
            'subkey': 'subval',
            'morekey': 'moreval'
        }
    }

    # Add another item and test manifest
    from dtoolcore import __version__
    from dtoolcore.utils import generate_identifier

    second_fname = "random_bytes"
    local_file_path = os.path.join(sample_data_path, second_fname)
    proto_dataset.put_item(local_file_path, second_fname)
    second_handle = second_fname

    generated_manifest = proto_dataset.generate_manifest()
    assert generated_manifest['hash_function'] == 'md5sum_hexdigest'
    assert generated_manifest['dtoolcore_version'] == __version__

    expected_identifier = generate_identifier(second_handle)
    assert expected_identifier in generated_manifest['items']
    assert generated_manifest['items'][expected_identifier]['relpath'] \
        == second_handle
    assert generated_manifest['items'][expected_identifier]['hash'] \
        == '5e5ccafa2018a36f8726398cc6589de8'
def test_notify_route(tmp_app_with_users, tmp_dir_fixture):  # NOQA

    bucket_name = 'bucket'

    # Add local directory as base URI and assign URI to the bucket
    base_uri = sanitise_uri(tmp_dir_fixture)
    register_base_uri(base_uri)
    update_permissions({
        'base_uri': base_uri,
        'users_with_search_permissions': ['snow-white'],
        'users_with_register_permissions': ['snow-white'],
    })
    Config.BUCKET_TO_BASE_URI[bucket_name] = base_uri

    # Create test dataset
    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    readme = 'abc: def'
    proto_dataset.put_readme(readme)
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.freeze()

    # Read in a dataset
    dataset = DataSet.from_uri(dest_uri)

    expected_identifier = generate_identifier('tiny.png')
    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1

    # Tell plugin that dataset has been created
    r = tmp_app_with_users.post(
        "/elastic-search/notify/all/{}".format(name),
        json={
            'bucket': bucket_name,
            'metadata': dataset._admin_metadata
        },
    )
    assert r.status_code == 200

    # Check that dataset has actually been registered
    datasets = list_datasets_by_user('snow-white')
    assert len(datasets) == 1
    assert datasets[0]['base_uri'] == base_uri
    assert datasets[0]['uri'] == dest_uri
    assert datasets[0]['uuid'] == admin_metadata['uuid']
    assert datasets[0]['name'] == name

    # Check README
    check_readme = get_readme_from_uri_by_user('snow-white', dest_uri)
    assert check_readme == yaml.safe_load(readme)

    # Update README
    new_readme = 'ghi: jkl'
    dataset.put_readme(new_readme)

    # Notify plugin about the updated readme
    r = tmp_app_with_users.post(
        "/elastic-search/notify/all/{}".format(name),
        json={
            'bucket': bucket_name,
            'metadata': dataset._admin_metadata
        },
    )
    assert r.status_code == 200

    # Check dataset
    datasets = list_datasets_by_user('snow-white')
    assert len(datasets) == 1
    assert datasets[0]['base_uri'] == base_uri
    assert datasets[0]['uri'] == dest_uri
    assert datasets[0]['uuid'] == admin_metadata['uuid']
    assert datasets[0]['name'] == name

    # Check that README has actually been changed
    check_readme = get_readme_from_uri_by_user('snow-white', dest_uri)
    assert check_readme == yaml.safe_load(new_readme)

    # Tell plugin that dataset has been deleted
    r = tmp_app_with_users.delete(
        "/elastic-search/notify/all/{}_{}/dtool".format(
            bucket_name, admin_metadata['uuid']))
    assert r.status_code == 200

    # Check that dataset has been deleted
    datasets = list_datasets_by_user('snow-white')
    assert len(datasets) == 0
def _get_blob_properties(self, handle):
    identifier = generate_identifier(handle)
    return self._blobservice.get_blob_properties(
        self.uuid,
        identifier
    )
def test_proto_dataset_freeze_functional(tmp_uuid_and_uri):  # NOQA
    uuid, dest_uri = tmp_uuid_and_uri

    from dtoolcore import (
        generate_admin_metadata,
        DataSet,
        ProtoDataSet,
        DtoolCoreTypeError
    )
    from dtoolcore.utils import generate_identifier

    name = "func_test_dataset_freeze"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)

    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()

    filenames = ['tiny.png', 'actually_a_png.txt', 'another_file.txt']
    for filename in filenames:
        local_file_path = os.path.join(sample_data_path, filename)
        proto_dataset.put_item(local_file_path, filename)
        proto_dataset.add_item_metadata(filename, 'namelen', len(filename))
        proto_dataset.add_item_metadata(filename, 'firstletter', filename[0])

    # At this point the temporary fragments should exist.
    assert _prefix_contains_something(
        proto_dataset._storage_broker,
        proto_dataset._storage_broker.fragments_key_prefix)

    proto_dataset.put_readme(content='Hello world!')

    # We shouldn't be able to load this as a DataSet
    with pytest.raises(DtoolCoreTypeError):
        DataSet.from_uri(dest_uri)

    proto_dataset.freeze()

    # Freezing removes the temporary metadata fragments.
    assert not _prefix_contains_something(
        proto_dataset._storage_broker,
        proto_dataset._storage_broker.fragments_key_prefix)

    # Now we shouldn't be able to load as a ProtoDataSet
    with pytest.raises(DtoolCoreTypeError):
        ProtoDataSet.from_uri(dest_uri)

    # But we can as a DataSet
    dataset = DataSet.from_uri(dest_uri)
    assert dataset.name == 'func_test_dataset_freeze'

    # Test identifiers
    expected_identifiers = map(generate_identifier, filenames)
    assert set(dataset.identifiers) == set(expected_identifiers)

    # Test readme contents
    assert dataset.get_readme_content() == "Hello world!"

    # Test item
    expected_identifier = generate_identifier('tiny.png')
    expected_hash = md5sum_hexdigest(
        os.path.join(sample_data_path, 'tiny.png'))
    item_properties = dataset.item_properties(expected_identifier)
    assert item_properties['relpath'] == 'tiny.png'
    assert item_properties['size_in_bytes'] == 276
    assert item_properties['hash'] == expected_hash

    # Test accessing item
    expected_identifier = generate_identifier('another_file.txt')
    fpath = dataset.item_content_abspath(expected_identifier)
    with open(fpath) as fh:
        contents = fh.read()
    assert contents == "Hello\n"

    # Test overlays have been created properly
    namelen_overlay = dataset.get_overlay('namelen')
    expected_identifier = generate_identifier('another_file.txt')
    assert namelen_overlay[expected_identifier] == len('another_file.txt')
def test_overlays_functional(tmp_dir_fixture):  # NOQA

    from dtoolcore import (
        DataSet,
        ProtoDataSet,
        DtoolCoreKeyError,
        DtoolCoreTypeError,
        DtoolCoreValueError,
        DtoolCoreInvalidNameError,
        generate_admin_metadata,
        copy,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset.
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')

    # Freeze the dataset
    proto_dataset.put_readme("")
    proto_dataset.freeze()

    # Load the dataset.
    dataset = DataSet.from_uri(proto_dataset.uri)

    # The overlay has not been added yet.
    with pytest.raises(DtoolCoreKeyError):
        dataset.get_overlay("is_png")

    # Create overlay content.
    expected_identifier = generate_identifier('tiny.png')
    is_png_overlay = {expected_identifier: True}

    with pytest.raises(DtoolCoreTypeError):
        dataset.put_overlay("is_png", "not_a_dict")

    incorrect_identifier_overlay = {"incorrect": True}
    with pytest.raises(DtoolCoreValueError):
        dataset.put_overlay("is_png", incorrect_identifier_overlay)

    invalid_keys = ["with space", "with,comma", "with/slash", "X" * 81]
    for invalid_key in invalid_keys:
        with pytest.raises(DtoolCoreInvalidNameError):
            dataset.put_overlay(invalid_key, is_png_overlay)

    dataset.put_overlay("is_png", is_png_overlay)
    assert dataset.get_overlay("is_png") == is_png_overlay

    # Test copy.
    copy_dataset_directory = os.path.join(tmp_dir_fixture, "copy")
    os.mkdir(copy_dataset_directory)
    dest_uri = dataset.base_uri + "/copy"
    copy_uri = copy(dataset.uri, dest_uri)

    copy_dataset = DataSet.from_uri(copy_uri)
    assert copy_dataset.list_overlay_names() == ["is_png"]
    assert copy_dataset.get_overlay("is_png") == is_png_overlay
def _get_item_object(self, handle):
    identifier = generate_identifier(handle)
    item_key = self.data_key_prefix + identifier
    obj = self.s3resource.Object(self.bucket, item_key)
    return obj