Example #1
    def stage_outputs(self, identifier):
        read1_handle = None
        read2_handle = None
        useful_name = self.input_dataset.get_overlay(
            'useful_name')[identifier]

        for filename in self.outputs:
            fpath = os.path.join(self.working_directory, filename)
            relpath = os.path.join(useful_name, filename)
            out_id = self.output_proto_dataset.put_item(fpath, relpath)
            self.output_proto_dataset.add_item_metadata(
                out_id, 'from', "{}/{}".format(self.input_dataset.uri,
                                               identifier))

            # Add is_read1 overlay.
            if "_1" in filename:
                self.output_proto_dataset.add_item_metadata(
                    out_id, "is_read1", True)
                read1_handle = out_id
            else:
                self.output_proto_dataset.add_item_metadata(
                    out_id, "is_read1", False)
                read2_handle = out_id

        # Add pair_id overlay.
        self.output_proto_dataset.add_item_metadata(
            read1_handle, "pair_id", generate_identifier(read2_handle))
        self.output_proto_dataset.add_item_metadata(
            read2_handle, "pair_id", generate_identifier(read1_handle))
Example #2
def test_diff_identifiers(tmp_uri_fixture):  # NOQA

    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.compare import diff_identifiers

    fpaths = create_test_files(tmp_uri_fixture)

    proto_ds_a = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_1"),
        base_uri=tmp_uri_fixture)
    proto_ds_a.create()
    proto_ds_a.put_item(fpaths["cat"], "a.txt")
    proto_ds_a.freeze()

    proto_ds_b = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_2"),
        base_uri=tmp_uri_fixture)
    proto_ds_b.create()
    proto_ds_b.put_item(fpaths["cat"], "b.txt")
    proto_ds_b.freeze()

    ds_a = DataSet.from_uri(proto_ds_a.uri)
    ds_b = DataSet.from_uri(proto_ds_b.uri)

    assert diff_identifiers(ds_a, ds_a) == []

    expected = [(generate_identifier("a.txt"), True, False),
                (generate_identifier("b.txt"), False, True)]
    assert diff_identifiers(ds_a, ds_b) == expected
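
The expected tuples have the shape (identifier, present_in_a, present_in_b). A sketch of behaviour consistent with this test (not the actual dtoolcore.compare implementation):

def diff_identifiers_sketch(ds_a, ds_b):
    # Report identifiers that are not present in both datasets,
    # as (identifier, present_in_a, present_in_b) tuples.
    ids_a = set(ds_a.identifiers)
    ids_b = set(ds_b.identifiers)
    return sorted(
        (i, i in ids_a, i in ids_b)
        for i in ids_a.symmetric_difference(ids_b)
    )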
Example #3
    def put_item(self, fpath, relpath):
        logger.debug("Put item {}".format(self))

        # The MD5 checksum is calculated here so that it can be uploaded
        # with the item as a piece of metadata. This is needed because the
        # AWS ETag is not the MD5 sum of the uploaded object for items
        # uploaded using multipart uploads (large files).
        # See: https://stackoverflow.com/a/43067788
        checksum = S3StorageBroker.hasher(fpath)

        fname = generate_identifier(relpath)
        dest_path = self.data_key_prefix + fname
        extra_args = {
            'Metadata': {
                'handle': relpath,
                'checksum': checksum,
            }
        }
        _put_item_with_retry(s3client=self.s3client,
                             s3resource=self.s3resource,
                             fpath=fpath,
                             bucket=self.bucket,
                             dest_path=dest_path,
                             extra_args=extra_args)

        return relpath
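
The ETag caveat in the comment above is why the broker computes and stores its own checksum. A minimal sketch of a chunked MD5 file hasher in the same spirit as S3StorageBroker.hasher (a hypothetical stand-in, not the project's actual implementation):

import hashlib

def md5_hexdigest_sketch(fpath, chunk_size=1024 * 1024):
    # Stream the file in chunks so that large files never need
    # to fit in memory; the hex digest is what gets stored as
    # item metadata alongside the upload.
    hasher = hashlib.md5()
    with open(fpath, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()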
Example #4
    def get_item_metadata(self, handle):
        """Return dictionary containing all metadata associated with handle.

        In other words, all the metadata added using the
        ``add_item_metadata`` method.

        :param handle: handle for accessing an item before the dataset is
                       frozen
        :returns: dictionary containing item metadata
        """
        logger.debug("Get item metadata {}".format(self))

        bucket = self.s3resource.Bucket(self.bucket)

        metadata = {}

        identifier = generate_identifier(handle)
        prefix = self.fragments_key_prefix + '{}'.format(identifier)
        for obj in bucket.objects.filter(Prefix=prefix).all():
            metadata_key = obj.key.split('.')[-2]
            response = obj.get()
            value_as_string = response['Body'].read().decode('utf-8')
            value = json.loads(value_as_string)

            metadata[metadata_key] = value

        return metadata
Example #5
def test_DataSetCreator(tmp_dir_fixture):  # NOQA

    import dtoolcore
    from dtoolcore.utils import generate_identifier

    name = "my-test-ds"
    base_uri = _sanitise_base_uri(tmp_dir_fixture)
    readme_content = "---\ndescription: a test dataset"
    creator_username = "******"
    local_file_path = os.path.join(TEST_SAMPLE_DATA, "tiny.png")

    with dtoolcore.DataSetCreator(
            name=name,
            base_uri=base_uri,
            readme_content=readme_content,
            creator_username=creator_username) as dataset_creator:
        assert dataset_creator.name == name
        uri = dataset_creator.uri
        handle = dataset_creator.put_item(local_file_path, "subdir/tiny.png")
        dataset_creator.add_item_metadata(handle, "ext", ".png")

    # The following line would raise if the dataset were not frozen.
    dataset = dtoolcore.DataSet.from_uri(uri)

    # Check the content.
    expected_identifier = generate_identifier("subdir/tiny.png")
    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1

    # Check item metadata
    expected_ext_overlay = {expected_identifier: ".png"}
    assert dataset.get_overlay("ext") == expected_ext_overlay
Example #6
def test_basic_workflow(tmp_dir_fixture):  # NOQA

    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.utils import generate_identifier
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')

    proto_dataset.freeze()

    # Read in a dataset
    dataset = DataSet.from_uri(dest_uri)

    expected_identifier = generate_identifier('tiny.png')
    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1
Example #7
    def get_item_metadata(self, handle):
        """Return dictionary containing all metadata associated with handle.

        In other words, all the metadata added using the
        ``add_item_metadata`` method.

        :param handle: handle for accessing an item before the dataset is
                       frozen
        :returns: dictionary containing item metadata
        """

        metadata = {}

        identifier = generate_identifier(handle)
        prefix = self.fragments_key_prefix + '{}'.format(identifier)

        blob_generator = self._blobservice.list_blobs(self.uuid,
                                                      include='metadata',
                                                      prefix=prefix)

        for blob in blob_generator:
            metadata_key = blob.name.split('.')[-2]
            value_as_string = self.get_text(blob.name)
            value = json.loads(value_as_string)

            metadata[metadata_key] = value

        return metadata
Example #8
def test_item_local_abspath_with_clean_cache(tmp_uuid_and_uri):  # NOQA

    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.utils import generate_identifier

    uuid, dest_uri = tmp_uuid_and_uri

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(uri=dest_uri,
                                 admin_metadata=admin_metadata,
                                 config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.freeze()

    identifier = generate_identifier('tiny.png')

    with tmp_directory() as cache_dir:
        with tmp_env_var("DTOOL_S3_CACHE_DIRECTORY", cache_dir):

            dataset = DataSet.from_uri(dest_uri)
            fpath = dataset.item_content_abspath(identifier)

            assert os.path.isfile(fpath)
Example #9
def test_basic_workflow_on_first_namespace(tmp_uuid_and_uri):  # NOQA

    uuid, dest_uri = tmp_uuid_and_uri

    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.utils import generate_identifier

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.freeze()

    # Read in a dataset
    dataset = DataSet.from_uri(dest_uri)

    expected_identifier = generate_identifier('tiny.png')

    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1
Example #10
def test_diff_content(tmp_dir_fixture):  # NOQA

    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.compare import diff_content
    from dtoolcore.storagebroker import DiskStorageBroker

    fpaths = create_test_files(tmp_dir_fixture)

    proto_ds_a = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_1"),
        prefix=tmp_dir_fixture,
        storage="file")
    proto_ds_a.create()
    proto_ds_a.put_item(fpaths["cat"], "file.txt")
    proto_ds_a.freeze()

    proto_ds_b = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_2"),
        prefix=tmp_dir_fixture,
        storage="file")
    proto_ds_b.create()
    proto_ds_b.put_item(fpaths["she"], "file.txt")
    proto_ds_b.freeze()

    ds_a = DataSet.from_uri(proto_ds_a.uri)
    ds_b = DataSet.from_uri(proto_ds_b.uri)

    assert diff_content(ds_a, ds_a) == []

    identifier = generate_identifier("file.txt")
    expected = [
        (generate_identifier("file.txt"),
         DiskStorageBroker.hasher(ds_a.item_content_abspath(identifier)),
         DiskStorageBroker.hasher(ds_b.item_content_abspath(identifier)))
    ]
    assert diff_content(ds_a, ds_b) == expected
Example #11
def test_basic_workflow_with_nested_handle(tmp_dir_fixture):  # NOQA

    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.utils import generate_identifier
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')
    handle = "subdir/tiny.png"

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, handle)

    proto_dataset.freeze()

    # Read in a dataset
    dataset = DataSet.from_uri(dest_uri)

    expected_identifier = generate_identifier(handle)
    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1

    # Ensure that the file exists in the disk dataset.
    # Particularly on Windows.
    item_abspath = os.path.join(
        tmp_dir_fixture,
        name,
        "data",
        "subdir",
        "tiny.png"
    )
    assert os.path.isfile(item_abspath)
    assert os.path.isfile(dataset.item_content_abspath(expected_identifier))

    # Ensure that the correct abspath is returned.
    # Particularly on Windows.
    assert dataset.item_content_abspath(expected_identifier) == item_abspath  # NOQA
Example #12
    def add_item_metadata(self, handle, key, value):
        """Store the given key:value pair for the item associated with handle.

        :param handle: handle for accessing an item before the dataset is
                       frozen
        :param key: metadata key
        :param value: metadata value
        """
        logger.debug("Add item metadata {}".format(self))

        identifier = generate_identifier(handle)
        suffix = '{}.{}.json'.format(identifier, key)
        bucket_fpath = self.fragments_key_prefix + suffix

        self.s3resource.Object(self.bucket,
                               bucket_fpath).put(Body=json.dumps(value))
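
Together with get_item_metadata in Example #4, this implies a per-item fragment layout of <prefix><identifier>.<key>.json. A small sketch of the naming round trip, assuming metadata keys contain no dots:

def fragment_key(prefix, identifier, key):
    # e.g. "fragments/<sha1-of-handle>.<metadata-key>.json"
    return "{}{}.{}.json".format(prefix, identifier, key)

def metadata_key_from_fragment(fragment_name):
    # Mirrors obj.key.split('.')[-2] in Example #4, which is why
    # metadata keys themselves must not contain dots.
    return fragment_name.split(".")[-2]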
Example #13
    def put_item(self, fpath, relpath):
        """Put item with content from fpath at relpath in dataset.

        Missing directories in relpath are created on the fly.

        :param fpath: path to the item on local disk
        :param relpath: relative path name given to the item in the dataset as
                        a handle
        """
        # Put the file into iRODS.
        fname = generate_identifier(relpath)
        dest_path = os.path.join(self._data_abspath, fname)
        _cp(fpath, dest_path)

        # Add the relpath handle as metadata.
        _put_metadata(dest_path, "handle", relpath)

        return relpath
Example #14
    def put_item(self, fpath, relpath):

        identifier = generate_identifier(relpath)

        self._blobservice.create_blob_from_path(
            self.uuid,
            identifier,
            fpath,
            content_settings=ContentSettings(content_md5=_get_md5sum(fpath)))

        self._blobservice.set_blob_metadata(container_name=self.uuid,
                                            blob_name=identifier,
                                            metadata={
                                                "relpath": relpath,
                                                "type": "item"
                                            })

        return relpath
Example #15
def test_DataSetCreator_staging_api_stage_item(tmp_dir_fixture):  # NOQA

    import dtoolcore
    from dtoolcore.utils import generate_identifier

    name = "my-test-ds"
    base_uri = _sanitise_base_uri(tmp_dir_fixture)
    readme_content = "---\ndescription: a test dataset"
    creator_username = "******"

    handle = "subdir/test.txt"

    with dtoolcore.DataSetCreator(
            name=name,
            base_uri=base_uri,
            readme_content=readme_content,
            creator_username=creator_username) as dataset_creator:

        # Ensure that the staging directory exists.
        assert os.path.isdir(dataset_creator.staging_directory)

        # Add an item more programmatically.
        staging_abspath = dataset_creator.prepare_staging_abspath_promise(  # NOQA
            handle)
        with open(staging_abspath, "w") as fh:
            fh.write("Hello world!")

        uri = dataset_creator.uri

    # Ensure that the staging directory has been removed.
    assert not os.path.isdir(dataset_creator.staging_directory)

    # The following line would raise if the dataset were not frozen.
    dataset = dtoolcore.DataSet.from_uri(uri)

    # Check the content.
    expected_identifier = generate_identifier(handle)
    assert expected_identifier in dataset.identifiers
    manual_item_props = dataset.item_properties(expected_identifier)
    assert manual_item_props["size_in_bytes"] == 12

    assert len(dataset.identifiers) == 1
Example #16
    def add_item_metadata(self, handle, key, value):
        """Store the given key:value pair for the item associated with handle.

        :param handle: handle for accessing an item before the dataset is
                       frozen
        :param key: metadata key
        :param value: metadata value
        """

        identifier = generate_identifier(handle)

        metadata_blob_suffix = "{}.{}.json".format(identifier, key)
        metadata_blob_name = self.fragments_key_prefix + metadata_blob_suffix

        self._blobservice.create_blob_from_text(self.uuid, metadata_blob_name,
                                                json.dumps(value))

        self._blobservice.set_blob_metadata(container_name=self.uuid,
                                            blob_name=metadata_blob_name,
                                            metadata={"type": "item_metadata"})
Example #17
def test_diff_sizes(tmp_dir_fixture):  # NOQA

    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.compare import diff_sizes

    fpaths = create_test_files(tmp_dir_fixture)

    proto_ds_a = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_1"),
        prefix=tmp_dir_fixture,
        storage="file")
    proto_ds_a.create()
    proto_ds_a.put_item(fpaths["he"], "file.txt")
    proto_ds_a.freeze()

    proto_ds_b = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_2"),
        prefix=tmp_dir_fixture,
        storage="file")
    proto_ds_b.create()
    proto_ds_b.put_item(fpaths["she"], "file.txt")
    proto_ds_b.freeze()

    ds_a = DataSet.from_uri(proto_ds_a.uri)
    ds_b = DataSet.from_uri(proto_ds_b.uri)

    assert diff_sizes(ds_a, ds_a) == []

    expected = [
        (generate_identifier("file.txt"), 2, 3),
    ]
    assert diff_sizes(ds_a, ds_b) == expected
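
The expected sizes 2 and 3 are the byte counts of the two test files. A sketch consistent with this test, built on the item_properties API seen elsewhere in these examples (again, not the library's actual implementation):

def diff_sizes_sketch(ds_a, ds_b):
    # For identifiers present in both datasets, report
    # (identifier, size_in_a, size_in_b) where the sizes differ.
    result = []
    for i in sorted(set(ds_a.identifiers) & set(ds_b.identifiers)):
        size_a = ds_a.item_properties(i)["size_in_bytes"]
        size_b = ds_b.item_properties(i)["size_in_bytes"]
        if size_a != size_b:
            result.append((i, size_a, size_b))
    return result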
Example #18
    def item_properties(self, handle):
        """Return properties of the item with the given handle."""
        fname = generate_identifier(handle)
        irods_item_path = os.path.join(self._data_abspath, fname)

        # Get the hash.
        checksum = _get_checksum(irods_item_path)
        checksum_as_hex = base64_to_hex(checksum)

        # Get the UTC timestamp and the size in bytes.
        size, timestamp = self._get_size_and_timestamp_with_cache(
            irods_item_path
        )

        # Get the relpath from the handle metadata.
        relpath = self._get_metadata_with_cache(irods_item_path, "handle")

        properties = {
            'size_in_bytes': int(size),
            'utc_timestamp': timestamp,
            'hash': checksum_as_hex,
            'relpath': relpath
        }
        return properties
Example #19
def test_creation_and_reading(tmp_uuid_and_uri):  # NOQA
    from dtoolcore import ProtoDataSet, generate_admin_metadata

    uuid, dest_uri = tmp_uuid_and_uri

    name = "func_test_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)

    # Create a proto dataset
    proto_dataset = ProtoDataSet(uri=dest_uri,
                                 admin_metadata=admin_metadata,
                                 config_path=None)
    proto_dataset.create()
    proto_dataset.put_readme("")

    assert proto_dataset.name == "func_test_dataset"

    # Test reading from URI.
    proto_dataset = ProtoDataSet.from_uri(dest_uri)
    assert proto_dataset.name == "func_test_dataset"

    # Test get/put readme.
    assert proto_dataset.get_readme_content() == ""
    proto_dataset.put_readme("Hello world!")
    assert proto_dataset.get_readme_content() == "Hello world!"

    # Test putting a local file
    handle = "tiny.png"
    local_file_path = os.path.join(sample_data_path, 'tiny.png')
    proto_dataset.put_item(local_file_path, handle)
    assert handle in list(proto_dataset._storage_broker.iter_item_handles())

    # Test properties of that file
    expected_hash = md5sum_hexdigest(os.path.join(sample_data_path,
                                                  'tiny.png'))
    item_properties = proto_dataset._storage_broker.item_properties(handle)
    assert item_properties['relpath'] == 'tiny.png'
    assert item_properties['size_in_bytes'] == 276
    assert item_properties['hash'] == expected_hash
    assert 'utc_timestamp' in item_properties
    time_from_item = datetime.datetime.fromtimestamp(
        float(item_properties['utc_timestamp']),
        tz=pytz.UTC
    )
    time_delta = datetime.datetime.now(tz=pytz.UTC) - time_from_item
    assert time_delta.days == 0
    assert time_delta.seconds < 100

    # Add metadata
    proto_dataset.add_item_metadata(handle, 'foo', 'bar')
    proto_dataset.add_item_metadata(handle, 'key', {
        'subkey': 'subval',
        'morekey': 'moreval'
    })

    # Test metadata retrieval
    metadata = proto_dataset._storage_broker.get_item_metadata(handle)
    assert metadata == {
        'foo': 'bar',
        'key': {
            'subkey': 'subval',
            'morekey': 'moreval'
        }
    }

    # Add another item and test manifest
    from dtoolcore import __version__
    from dtoolcore.utils import generate_identifier
    local_file_path = os.path.join(sample_data_path, 'real_text_file.txt')
    proto_dataset.put_item(local_file_path, 'real_text_file.txt')
    second_handle = 'real_text_file.txt'
    generated_manifest = proto_dataset.generate_manifest()
    assert generated_manifest['hash_function'] == 'md5sum_hexdigest'
    assert generated_manifest['dtoolcore_version'] == __version__
    expected_identifier = generate_identifier(second_handle)
    assert expected_identifier in generated_manifest['items']
    assert generated_manifest['items'][expected_identifier]['relpath'] \
        == 'real_text_file.txt'
    expected_hash = md5sum_hexdigest(local_file_path)
    assert generated_manifest['items'][expected_identifier]['hash'] \
        == expected_hash
Example #20
    def _handle_to_fragment_prefixpath(self, handle):
        stem = generate_identifier(handle)
        logger.debug(
            "_handle_to_fragment_prefixpath, handle='{}', stem='{}'".format(
                handle, stem))
        return os.path.join(self._metadata_fragments_path, stem)
Example #21
def test_generate_identifier():
    from dtoolcore.utils import generate_identifier
    string = "Test me"

    assert generate_identifier(handle=string) == \
        "9940674fb235beddae40df565cbfc688b824b362"
Example #22
    def _handle_to_fragment_absprefixpath(self, handle):
        stem = generate_identifier(handle)
        return os.path.join(self._metadata_fragments_abspath, stem)
Example #23
def test_proto_dataset_freeze_functional(tmp_dir_fixture):  # NOQA

    from dtoolcore import (
        generate_admin_metadata,
        DataSet,
        ProtoDataSet,
        DtoolCoreTypeError
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "func_test_dataset_freeze"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)

    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None
    )
    proto_dataset.create()

    filenames = ['tiny.png', 'actually_a_png.txt', 'another_file.txt']
    for filename in filenames:
        local_file_path = os.path.join(sample_data_path, filename)
        proto_dataset.put_item(local_file_path, filename)
        proto_dataset.add_item_metadata(
            filename,
            'namelen',
            len(filename)
        )
        proto_dataset.add_item_metadata(
            filename,
            'firstletter',
            filename[0]
        )

    proto_dataset.put_readme(content='Hello world!')

    # We shouldn't be able to load this as a DataSet
    with pytest.raises(DtoolCoreTypeError):
        DataSet.from_uri(dest_uri)

    proto_dataset.freeze()

    # Freezing removes the temporary metadata fragments directory.
    assert not os.path.isdir(
        proto_dataset._storage_broker._metadata_fragments_abspath)

    # Now we shouldn't be able to load as a ProtoDataSet
    with pytest.raises(DtoolCoreTypeError):
        ProtoDataSet.from_uri(dest_uri)

    # But we can as a DataSet
    dataset = DataSet.from_uri(dest_uri)
    assert dataset.name == 'func_test_dataset_freeze'

    # Test identifiers
    expected_identifiers = map(generate_identifier, filenames)
    assert set(dataset.identifiers) == set(expected_identifiers)

    # Test readme contents
    assert dataset.get_readme_content() == "Hello world!"

    # Test item
    expected_identifier = generate_identifier('tiny.png')
    item_properties = dataset.item_properties(expected_identifier)
    assert item_properties['relpath'] == 'tiny.png'
    assert item_properties['size_in_bytes'] == 276
    assert item_properties['hash'] == 'dc73192d2f81d7009ce5a1ee7bad5755'

    # Test accessing item
    expected_identifier = generate_identifier('another_file.txt')
    fpath = dataset.item_content_abspath(expected_identifier)

    with open(fpath) as fh:
        contents = fh.read()

    assert contents == "Hello\n"

    # Test overlays have been created properly
    namelen_overlay = dataset.get_overlay('namelen')
    expected_identifier = generate_identifier('another_file.txt')
    assert namelen_overlay[expected_identifier] == len('another_file.txt')
Example #24
def test_creation_and_reading(tmp_dir_fixture):  # NOQA
    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "func_test_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)

    # Create a proto dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_readme("")

    assert proto_dataset.name == "func_test_dataset"

    # Test reading from URI.
    proto_dataset = ProtoDataSet.from_uri(dest_uri)
    assert proto_dataset.name == "func_test_dataset"

    # Test get/put readme.
    assert proto_dataset.get_readme_content() == ""
    proto_dataset.put_readme("Hello world!")
    assert proto_dataset.get_readme_content() == "Hello world!"

    # Test putting a local file
    handle = "tiny.png"
    local_file_path = os.path.join(sample_data_path, 'tiny.png')
    proto_dataset.put_item(local_file_path, handle)
    assert handle in list(proto_dataset._storage_broker.iter_item_handles())

    # Test properties of that file
    item_properties = proto_dataset._storage_broker.item_properties(handle)
    assert item_properties['relpath'] == 'tiny.png'
    assert item_properties['size_in_bytes'] == 276
    assert item_properties['hash'] == 'dc73192d2f81d7009ce5a1ee7bad5755'
    assert 'utc_timestamp' in item_properties
    time_from_item = datetime.datetime.fromtimestamp(
        float(item_properties['utc_timestamp']),
        tz=pytz.UTC
    )

    time.sleep(0.1)  # Make tests more robust on Windows.
    time_delta = datetime.datetime.now(tz=pytz.UTC) - time_from_item

    assert time_delta.days == 0
    assert time_delta.seconds < 20

    # Add metadata
    proto_dataset.add_item_metadata(handle, 'foo', 'bar')
    proto_dataset.add_item_metadata(
        handle,
        'key',
        {'subkey': 'subval',
         'morekey': 'moreval'}
    )

    # Test metadata retrieval
    metadata = proto_dataset._storage_broker.get_item_metadata(handle)
    assert metadata == {
        'foo': 'bar',
        'key': {
            'subkey': 'subval',
            'morekey': 'moreval'
        }
    }

    # Add another item and test manifest
    from dtoolcore import __version__
    from dtoolcore.utils import generate_identifier
    second_fname = "random_bytes"
    local_file_path = os.path.join(sample_data_path, second_fname)
    proto_dataset.put_item(local_file_path, second_fname)
    second_handle = second_fname
    generated_manifest = proto_dataset.generate_manifest()
    assert generated_manifest['hash_function'] == 'md5sum_hexdigest'
    assert generated_manifest['dtoolcore_version'] == __version__
    expected_identifier = generate_identifier(second_handle)
    assert expected_identifier in generated_manifest['items']
    assert generated_manifest['items'][expected_identifier]['relpath'] \
        == second_handle
    assert generated_manifest['items'][expected_identifier]['hash'] \
        == '5e5ccafa2018a36f8726398cc6589de8'
Example #25
def test_notify_route(tmp_app_with_users, tmp_dir_fixture):  # NOQA
    bucket_name = 'bucket'

    # Add local directory as base URI and assign URI to the bucket
    base_uri = sanitise_uri(tmp_dir_fixture)
    register_base_uri(base_uri)
    update_permissions({
        'base_uri': base_uri,
        'users_with_search_permissions': ['snow-white'],
        'users_with_register_permissions': ['snow-white'],
    })
    Config.BUCKET_TO_BASE_URI[bucket_name] = base_uri

    # Create test dataset
    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(name=name,
                                              uuid=admin_metadata["uuid"],
                                              base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(uri=dest_uri,
                                 admin_metadata=admin_metadata,
                                 config_path=None)
    proto_dataset.create()
    readme = 'abc: def'
    proto_dataset.put_readme(readme)
    proto_dataset.put_item(local_file_path, 'tiny.png')

    proto_dataset.freeze()

    # Read in a dataset
    dataset = DataSet.from_uri(dest_uri)

    expected_identifier = generate_identifier('tiny.png')
    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1

    # Tell plugin that dataset has been created
    r = tmp_app_with_users.post(
        "/elastic-search/notify/all/{}".format(name),
        json={
            'bucket': bucket_name,
            'metadata': dataset._admin_metadata
        },
    )
    assert r.status_code == 200

    # Check that dataset has actually been registered
    datasets = list_datasets_by_user('snow-white')
    assert len(datasets) == 1
    assert datasets[0]['base_uri'] == base_uri
    assert datasets[0]['uri'] == dest_uri
    assert datasets[0]['uuid'] == admin_metadata['uuid']
    assert datasets[0]['name'] == name

    # Check README
    check_readme = get_readme_from_uri_by_user('snow-white', dest_uri)
    assert check_readme == yaml.safe_load(readme)

    # Update README
    new_readme = 'ghi: jkl'
    dataset.put_readme(new_readme)

    # Notify plugin about updated name
    r = tmp_app_with_users.post(
        "/elastic-search/notify/all/{}".format(name),
        json={
            'bucket': bucket_name,
            'metadata': dataset._admin_metadata
        },
    )
    assert r.status_code == 200

    # Check dataset
    datasets = list_datasets_by_user('snow-white')
    assert len(datasets) == 1
    assert datasets[0]['base_uri'] == base_uri
    assert datasets[0]['uri'] == dest_uri
    assert datasets[0]['uuid'] == admin_metadata['uuid']
    assert datasets[0]['name'] == name

    # Check that README has actually been changed
    check_readme = get_readme_from_uri_by_user('snow-white', dest_uri)
    assert check_readme == yaml.safe_load(new_readme)

    # Tell plugin that dataset has been deleted
    r = tmp_app_with_users.delete(
        "/elastic-search/notify/all/{}_{}/dtool".format(
            bucket_name, admin_metadata['uuid']))
    assert r.status_code == 200

    # Check that dataset has been deleted
    datasets = list_datasets_by_user('snow-white')
    assert len(datasets) == 0
Example #26
    def _get_blob_properties(self, handle):
        identifier = generate_identifier(handle)
        return self._blobservice.get_blob_properties(self.uuid, identifier)
Example #27
def test_proto_dataset_freeze_functional(tmp_uuid_and_uri):  # NOQA

    uuid, dest_uri = tmp_uuid_and_uri

    from dtoolcore import (generate_admin_metadata, DataSet, ProtoDataSet,
                           DtoolCoreTypeError)
    from dtoolcore.utils import generate_identifier

    name = "func_test_dataset_freeze"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)

    proto_dataset = ProtoDataSet(uri=dest_uri,
                                 admin_metadata=admin_metadata,
                                 config_path=None)
    proto_dataset.create()

    filenames = ['tiny.png', 'actually_a_png.txt', 'another_file.txt']
    for filename in filenames:
        local_file_path = os.path.join(sample_data_path, filename)
        proto_dataset.put_item(local_file_path, filename)
        proto_dataset.add_item_metadata(filename, 'namelen', len(filename))
        proto_dataset.add_item_metadata(filename, 'firstletter', filename[0])

    # At this point the temporary fragments should exist.
    assert _prefix_contains_something(
        proto_dataset._storage_broker,
        proto_dataset._storage_broker.fragments_key_prefix)

    proto_dataset.put_readme(content='Hello world!')

    # We shouldn't be able to load this as a DataSet
    with pytest.raises(DtoolCoreTypeError):
        DataSet.from_uri(dest_uri)

    proto_dataset.freeze()

    # Freezing removes the temporary metadata fragments.
    assert not _prefix_contains_something(
        proto_dataset._storage_broker,
        proto_dataset._storage_broker.fragments_key_prefix)

    # Now we shouldn't be able to load as a ProtoDataSet
    with pytest.raises(DtoolCoreTypeError):
        ProtoDataSet.from_uri(dest_uri)

    # But we can as a DataSet
    dataset = DataSet.from_uri(dest_uri)
    assert dataset.name == 'func_test_dataset_freeze'

    # Test identifiers
    expected_identifiers = map(generate_identifier, filenames)
    assert set(dataset.identifiers) == set(expected_identifiers)

    # Test readme contents
    assert dataset.get_readme_content() == "Hello world!"

    # Test item
    expected_identifier = generate_identifier('tiny.png')
    expected_hash = md5sum_hexdigest(os.path.join(sample_data_path,
                                                  'tiny.png'))
    item_properties = dataset.item_properties(expected_identifier)
    assert item_properties['relpath'] == 'tiny.png'
    assert item_properties['size_in_bytes'] == 276
    assert item_properties['hash'] == expected_hash

    # Test accessing item
    expected_identifier = generate_identifier('another_file.txt')
    fpath = dataset.item_content_abspath(expected_identifier)

    with open(fpath) as fh:
        contents = fh.read()

    assert contents == "Hello\n"

    # Test overlays have been created properly
    namelen_overlay = dataset.get_overlay('namelen')
    expected_identifier = generate_identifier('another_file.txt')
    assert namelen_overlay[expected_identifier] == len('another_file.txt')
Example #28
def test_overlays_functional(tmp_dir_fixture):  # NOQA

    from dtoolcore import (
        DataSet,
        ProtoDataSet,
        DtoolCoreKeyError,
        DtoolCoreTypeError,
        DtoolCoreValueError,
        DtoolCoreInvalidNameError,
        generate_admin_metadata,
        copy,
    )

    from dtoolcore.utils import generate_identifier
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(name=name,
                                              uuid=admin_metadata["uuid"],
                                              base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset.
    proto_dataset = ProtoDataSet(uri=dest_uri,
                                 admin_metadata=admin_metadata,
                                 config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')

    # Freeze the dataset
    proto_dataset.put_readme("")
    proto_dataset.freeze()

    # Load the dataset.
    dataset = DataSet.from_uri(proto_dataset.uri)

    # The overlay has not been added yet.
    with pytest.raises(DtoolCoreKeyError):
        dataset.get_overlay("is_png")

    # Create overlay content.
    expected_identifier = generate_identifier('tiny.png')
    is_png_overlay = {expected_identifier: True}

    with pytest.raises(DtoolCoreTypeError):
        dataset.put_overlay("is_png", "not_a_dict")

    incorrect_identifier_overlay = {"incorrect": True}
    with pytest.raises(DtoolCoreValueError):
        dataset.put_overlay("is_png", incorrect_identifier_overlay)

    invalid_keys = ["with space", "with,comma", "with/slash", "X" * 81]
    for invalid_key in invalid_keys:
        with pytest.raises(DtoolCoreInvalidNameError):
            dataset.put_overlay(invalid_key, is_png_overlay)

    dataset.put_overlay("is_png", is_png_overlay)
    assert dataset.get_overlay("is_png") == is_png_overlay

    # Test copy.
    copy_dataset_directory = os.path.join(tmp_dir_fixture, "copy")
    os.mkdir(copy_dataset_directory)
    dest_uri = dataset.base_uri + "/copy"
    copy_uri = copy(dataset.uri, dest_uri)

    copy_dataset = DataSet.from_uri(copy_uri)
    assert copy_dataset.list_overlay_names() == ["is_png"]
    assert copy_dataset.get_overlay("is_png") == is_png_overlay
Example #29
    def _get_item_object(self, handle):
        identifier = generate_identifier(handle)
        item_key = self.data_key_prefix + identifier
        obj = self.s3resource.Object(self.bucket, item_key)
        return obj