Example #1
    def setUp(self):
        self.pkg = Package()
        self.entry_with_hash = PackageEntry(
            PhysicalKey('test-bucket', 'with-hash', 'with-hash'),
            42,
            {
                'type': 'SHA256',
                'value': '0' * 64
            },
            {},
        )
        self.entry_without_hash = PackageEntry(
            PhysicalKey('test-bucket', 'without-hash', 'without-hash'),
            42,
            None,
            {},
        )
        self.pkg.set('with-hash', self.entry_with_hash)
        self.pkg.set('without-hash', self.entry_without_hash)
Example #2
    def prepare_pkg(self, *, copy_data):
        expected_pkg = Package()
        pkg_entries = self.entries.items()
        if copy_data:
            pkg_entries = [(
                lk,
                e.with_physical_key(
                    PhysicalKey(self.dst_bucket, f'{self.dst_pkg_name}/{lk}',
                                'dst_' + e.physical_key.version_id)),
            ) for lk, e in pkg_entries]
        for lk, entry in pkg_entries:
            expected_pkg.set(lk, entry)
        expected_pkg._set_commit_message(None)
        return expected_pkg
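
A minimal usage sketch for context (hypothetical test body; `self.entries`, `self.dst_bucket`, and `self.dst_pkg_name` are fixtures assumed to exist on the test class):

    def test_copy_rewrites_physical_keys(self):
        # With copy_data=True every entry's physical key is rewritten into
        # the destination bucket under the destination package name.
        expected_pkg = self.prepare_pkg(copy_data=True)
        assert all(
            entry.physical_key.bucket == self.dst_bucket
            for _, entry in expected_pkg.walk())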
Example #3
import io
import unittest
from unittest import mock

import pytest
from botocore.stub import Stubber
from quilt3 import Package
from quilt3.packages import PackageEntry
from quilt3.util import PhysicalKey

import t4_lambda_pkgpush


class HashCalculationTest(unittest.TestCase):
    def setUp(self):
        self.pkg = Package()
        self.entry_with_hash = PackageEntry(
            PhysicalKey('test-bucket', 'with-hash', 'with-hash'),
            42,
            {
                'type': 'SHA256',
                'value': '0' * 64
            },
            {},
        )
        self.entry_without_hash = PackageEntry(
            PhysicalKey('test-bucket', 'without-hash', 'without-hash'),
            42,
            None,
            {},
        )
        self.pkg.set('with-hash', self.entry_with_hash)
        self.pkg.set('without-hash', self.entry_without_hash)

    def test_calculate_pkg_hashes(self):
        boto_session = mock.MagicMock()
        with mock.patch.object(
                t4_lambda_pkgpush,
                'calculate_pkg_entry_hash') as calculate_pkg_entry_hash_mock:
            t4_lambda_pkgpush.calculate_pkg_hashes(boto_session, self.pkg)

        calculate_pkg_entry_hash_mock.assert_called_once_with(
            mock.ANY, self.entry_without_hash)

    @mock.patch.object(t4_lambda_pkgpush, 'S3_HASH_LAMBDA_MAX_FILE_SIZE_BYTES',
                       1)
    def test_calculate_pkg_hashes_too_large_file_error(self):
        s3_client = mock.MagicMock()
        with pytest.raises(t4_lambda_pkgpush.FileTooLargeForHashing):
            t4_lambda_pkgpush.calculate_pkg_hashes(s3_client, self.pkg)

    def test_calculate_pkg_entry_hash(self):
        get_s3_client_mock = mock.MagicMock()
        s3_client_mock = get_s3_client_mock.return_value
        s3_client_mock.generate_presigned_url.return_value = 'https://example.com'
        with mock.patch("t4_lambda_pkgpush.invoke_hash_lambda",
                        return_value='0' * 64) as invoke_hash_lambda_mock:
            t4_lambda_pkgpush.calculate_pkg_entry_hash(get_s3_client_mock,
                                                       self.entry_without_hash)

        get_s3_client_mock.assert_called_once_with(
            self.entry_without_hash.physical_key.bucket)
        invoke_hash_lambda_mock.assert_called_once_with(
            s3_client_mock.generate_presigned_url.return_value)
        s3_client_mock.generate_presigned_url.assert_called_once_with(
            ClientMethod='get_object',
            ExpiresIn=t4_lambda_pkgpush.S3_HASH_LAMBDA_SIGNED_URL_EXPIRES_IN_SECONDS,
            Params={
                'Bucket': self.entry_without_hash.physical_key.bucket,
                'Key': self.entry_without_hash.physical_key.path,
                'VersionId': self.entry_without_hash.physical_key.version_id,
            },
        )

        assert self.entry_without_hash.hash == {
            'type': 'SHA256',
            'value': invoke_hash_lambda_mock.return_value,
        }

    def test_invoke_hash_lambda(self):
        lambda_client_stubber = Stubber(t4_lambda_pkgpush.lambda_)
        lambda_client_stubber.activate()
        self.addCleanup(lambda_client_stubber.deactivate)
        test_hash = '0' * 64
        test_url = 'https://example.com'

        lambda_client_stubber.add_response(
            'invoke',
            service_response={
                'Payload': io.BytesIO(b'"%s"' % test_hash.encode()),
            },
            expected_params={
                'FunctionName': t4_lambda_pkgpush.S3_HASH_LAMBDA,
                'Payload': '"%s"' % test_url,
            },
        )

        assert t4_lambda_pkgpush.invoke_hash_lambda(test_url) == test_hash
        lambda_client_stubber.assert_no_pending_responses()

    def test_invoke_hash_lambda_error(self):
        lambda_client_stubber = Stubber(t4_lambda_pkgpush.lambda_)
        lambda_client_stubber.activate()
        self.addCleanup(lambda_client_stubber.deactivate)
        test_url = 'https://example.com'

        lambda_client_stubber.add_response(
            'invoke',
            service_response={
                'FunctionError': 'Unhandled',
                'Payload': io.BytesIO(b'some error info'),
            },
            expected_params={
                'FunctionName': t4_lambda_pkgpush.S3_HASH_LAMBDA,
                'Payload': '"%s"' % test_url,
            },
        )

        with pytest.raises(t4_lambda_pkgpush.S3HashLambdaUnhandledError):
            t4_lambda_pkgpush.invoke_hash_lambda(test_url)
        lambda_client_stubber.assert_no_pending_responses()
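
For reference, a minimal sketch of what `invoke_hash_lambda` would need to look like for these stubs to match; this is inferred from the expected parameters and stubbed responses above, not copied from t4_lambda_pkgpush (`lambda_` and `S3_HASH_LAMBDA` are the module globals the tests reference):

import json

def invoke_hash_lambda(url):
    # The payload is the JSON-encoded presigned URL, matching the stubbed
    # expected_params ('"%s"' % test_url).
    resp = lambda_.invoke(FunctionName=S3_HASH_LAMBDA, Payload=json.dumps(url))
    # A 'FunctionError' key in the response marks an unhandled error in the
    # hash lambda; the error test expects S3HashLambdaUnhandledError.
    if 'FunctionError' in resp:
        raise S3HashLambdaUnhandledError(resp['Payload'].read())
    # On success the payload is the JSON-encoded hex digest ('"%s"' % hash).
    return json.loads(resp['Payload'].read())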
Example #4
    @classmethod
    def setUpClass(cls):
        super().setUpClass()

        pkg = Package()
        pkg._set_commit_message(cls.parent_commit_message)
        pkg._workflow = {
            'config': (f's3://{cls.parent_bucket}/.quilt/workflows/config.yml'
                       '?versionId=configVersion'),
            'id': 'gamma',
            'schemas': {
                'top-secret': (f's3://{cls.parent_bucket}/top-secret.schema.json'
                               '?versionId=schemaVersion'),
            },
        }
        pkg.set_meta({'meta': 'old meta'})
        cls.entries = cls.get_pkg_entries()
        for lk, entry in cls.entries.items():
            pkg.set(lk, entry)
        manifest_buf = io.BytesIO()
        pkg._dump(manifest_buf)
        cls.parent_manifest = manifest_buf.getvalue()
        cls.parent_top_hash = pkg.top_hash
        cls.src_params = {
            'parent': {
                'registry': cls.src_registry,
                'name': cls.parent_pkg_name,
                'top_hash': cls.parent_top_hash,
            },
        }
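
A hypothetical companion stub showing how a test could serve this parent manifest back to the code under test; the `get_object` stub below (and the `s3_stubber` fixture) is an assumption for illustration, not part of the original class:

        cls.s3_stubber.add_response(
            'get_object',
            service_response={'Body': io.BytesIO(cls.parent_manifest)},
            expected_params={
                'Bucket': cls.parent_bucket,
                'Key': f'.quilt/packages/{cls.parent_top_hash}',
            },
        )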
Example #5
    # Used as a context manager (requires `import contextlib`): the caller
    # runs the code under test inside `with`; the assertions after `yield`
    # execute on exit.
    @contextlib.contextmanager
    def _mock_package_build(self,
                            entries,
                            *,
                            message=...,
                            expected_workflow=...):
        if message is ...:
            message = self.dst_commit_message

        # Use a test package to verify manifest entries
        test_pkg = Package()
        test_pkg.set_meta(self.meta)

        # Attach precomputed hashes to the entries (hashing itself is not
        # exercised here)
        for entry in entries:
            pkey = PhysicalKey.from_url(entry['physical_key'])
            hash_obj = {'type': 'SHA256', 'value': entry['hash']}
            test_entry = PackageEntry(pkey, entry['size'], hash_obj,
                                      entry.get('meta'))
            test_pkg.set(entry['logical_key'], entry=test_entry)

        mocked_workflow_data = 'some-workflow-data'
        test_pkg._workflow = mocked_workflow_data

        # Build the manifest from the test package
        test_pkg._set_commit_message(message)
        manifest = io.BytesIO()
        test_pkg.dump(manifest)
        manifest.seek(0)

        self.s3_stubber.add_response(
            'put_object',
            service_response={},
            expected_params={
                'Body': manifest.read(),
                'Bucket': self.dst_bucket,
                'Key': f'.quilt/packages/{test_pkg.top_hash}',
            },
        )
        self.s3_stubber.add_response(
            'put_object',
            service_response={},
            expected_params={
                'Body': test_pkg.top_hash.encode(),
                'Bucket': self.dst_bucket,
                'Key': f'.quilt/named_packages/{self.dst_pkg_name}/{int(self.mock_timestamp)}',
            },
        )
        self.s3_stubber.add_response(
            'put_object',
            service_response={},
            expected_params={
                'Body': test_pkg.top_hash.encode(),
                'Bucket': self.dst_bucket,
                'Key': f'.quilt/named_packages/{self.dst_pkg_name}/latest',
            },
        )
        with mock.patch(
                'quilt3.workflows.validate',
                return_value=mocked_workflow_data) as workflow_validate_mock:
            yield
        workflow_validate_mock.assert_called_once_with(
            registry=get_package_registry(self.dst_registry),
            workflow=expected_workflow,
            name=self.dst_pkg_name,
            pkg=mock.ANY,  # TODO: probably this should be more specific.
            message=message,
        )
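
Since the helper yields once and runs its assertions after the `yield`, a call site wraps the actual operation under test in `with`; a hypothetical example (`entries`, `params`, and `make_request` are illustrative):

        with self._mock_package_build(entries):
            # The code under test builds and pushes the package here; on
            # exit the helper asserts quilt3.workflows.validate was called
            # exactly once with the expected arguments.
            self.make_request(params)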
Example #6
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd
from quilt3 import Package
from tqdm import tqdm

# `file_utils` and `_recursive_clean` are project-local helpers and are
# assumed to be importable from the surrounding module.


def create_package(
    manifest: pd.DataFrame,
    step_pkg_root: Path,
    filepath_columns: List[str] = ["filepath"],
    metadata_columns: List[str] = [],
) -> Tuple[Package, pd.DataFrame]:
    # Make a copy
    relative_manifest = manifest.copy(deep=True)

    # Create empty package
    pkg = Package()

    # Create associate mappings: List[Dict[str, str]]
    # This list is in index order: as we walk down the rows of each column we
    # can simply add a new entry to the associate map already sitting at that
    # list index.
    associates = []

    # Create the metadata reduction map.
    # This is used to clean up and standardize metadata access after the
    # package is constructed. It maps each metadata column name to a boolean:
    # should the values for that column be reduced (collapsed) during the
    # "clean up the package metadata" step?
    #
    # The problem it solves: if multiple files share the same metadata keys
    # but, for one reason or another, one packaged file's value for a key is
    # a list while another's is a single string, the same _type_ of object
    # ends up with a confusingly mixed return-value API. Example:
    # fov/
    #   obj1/
    #      {example_key: "hello"}
    #   obj2/
    #      {example_key: ["hello", "world"]}
    # This commonly happens when a manifest has one row per unique child
    # object but each row also references a shared parent object, e.g. rows
    # describing unique cells that were all generated by the same algorithm,
    # whose details are stored in a column repeated on every row. Files
    # containing a single cell would then get a single string while files
    # with many cells would get a list of the same string repeated.
    #
    # Why spend the time to reduce/collapse the metadata at all? Besides
    # sparing users from writing `obj2.meta["example_key"][0]` every time,
    # and besides standardizing the metadata API, the biggest reason is that
    # S3 objects may carry only 2KB of metadata; without this reduction step,
    # manifests are far more likely to hit that limit and cause a package
    # distribution error.
    metadata_reduction_map = {
        index_col: True
        for index_col in metadata_columns
    }

    # Set all files
    with tqdm(
            total=len(filepath_columns) * len(relative_manifest),
            desc="Constructing package",
    ) as pbar:
        for col in filepath_columns:
            # Update values to the logical key as they are set
            for i, val in enumerate(relative_manifest[col].values):
                # Fully resolve the path
                physical_key = Path(val).expanduser().resolve()

                # Try creating a logical key from the filepath relative to
                # the step's local staging directory
                #
                # Ex:
                # step_pkg_root = "local_staging/raw"
                # physical_key = "local_staging/raw/images/some_file.tiff"
                # produced logical_key = "images/some_file.tiff"
                try:
                    logical_key = str(
                        file_utils._filepath_rel2abs(physical_key).relative_to(
                            file_utils._filepath_rel2abs(step_pkg_root)))

                except ValueError:
                    # Create a logical key by merging the column name and
                    # the filename, removing any obvious "path"-type words
                    # from the column name
                    #
                    # Ex:
                    # physical_key = "/some/abs/path/some_file.tiff"
                    # column = "SourceReadPath"
                    # produced logical_key = "source/some_file.tiff"
                    stripped_col = col.lower().replace("read",
                                                       "").replace("path", "")
                    logical_key = f"{stripped_col}/{physical_key.name}"

                if physical_key.is_file():
                    relative_manifest[col].values[i] = logical_key

                    # Create metadata dictionary to attach to object
                    meta = {}
                    for meta_col in metadata_columns:
                        # Short reference to current metadata value
                        v = relative_manifest[meta_col].values[i]

                        # Enforce simple JSON serializable type
                        # First check if value is a numpy value
                        # It likely is because pandas relies on numpy
                        # All numpy types have the "dtype" attribute and can be cast to
                        # python type by using the `item` function, details here:
                        # https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.item.html
                        if hasattr(v, "dtype"):
                            v = v.item()

                        # Cast to JSON serializable type
                        v = file_utils.make_json_serializable(
                            v, f"Value from column: {meta_col}, index: {i}")

                        # Update metadata with value
                        meta[meta_col] = [v]

                    # Check if object already exists
                    if logical_key in pkg:
                        # Join the two meta dictionaries
                        joined_meta = {}
                        for meta_col, curr_v in pkg[logical_key].meta.items():
                            # Join the values for the current iteration of the metadata
                            joined_values = [*curr_v, *meta[meta_col]]

                            # Only check whether the metadata for this column
                            # can still be reduced while that is undecided.
                            # It is still undecided if the boolean in the
                            # metadata reduction map is True, i.e. so far the
                            # column can be reduced or collapsed.
                            # This check also ensures we never override an
                            # earlier False reduction value: if early on we
                            # encounter an instance of the metadata that must
                            # not be reduced, a later instance that could be
                            # must not flip it back, since we want metadata
                            # access to be uniform across the whole dataset.
                            if metadata_reduction_map[meta_col]:
                                # Update the metadata reduction map for the
                                # current column: while the column is still
                                # considered reducible (we entered this if
                                # block), check whether it remains reducible
                                # after the latest addition. The metadata can
                                # be reduced if the count of the first value
                                # equals the length of the whole list, i.e.
                                # all values are identical. This runs quickly
                                # for small lists, see:
                                # https://stackoverflow.com/questions/3844801/check-if-all-elements-in-a-list-are-identical
                                metadata_reduction_map[meta_col] = (
                                    joined_values.count(joined_values[0])
                                    == len(joined_values)
                                )

                            # Attach the joined values to the joined metadata
                            joined_meta[meta_col] = joined_values

                        # Update meta
                        pkg[logical_key].set_meta(joined_meta)

                    # Object didn't already exist, simply set it
                    else:
                        pkg.set(logical_key, physical_key, meta)

                    # Update associates
                    try:
                        associates[i][col] = logical_key
                    except IndexError:
                        associates.append({col: logical_key})
                else:
                    relative_manifest[col].values[i] = logical_key
                    pkg.set_dir(logical_key, physical_key)

                # Update progress bar
                pbar.update()

        # Clean up package metadata
        pkg = _recursive_clean(pkg, metadata_reduction_map)

        # Attach associates
        for i, associate_mapping in tqdm(
                enumerate(associates),
                desc="Creating associate metadata blocks"):
            for col, lk in associate_mapping.items():
                # Having dictionary expansion in this order means the new
                # associate mapping overrides any prior `associates` key;
                # this is assumed safe because attach_associates was set to
                # True.
                pkg[lk].set_meta({
                    **pkg[lk].meta,
                    "associates": associate_mapping,
                })

        return pkg, relative_manifest
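
The `_recursive_clean` helper referenced above is not included in this snippet. Based on the comments, a minimal sketch of its job (collapsing single-valued metadata lists for columns the reduction map marked as uniform) could look like this; the name comes from the snippet, but the body below is an assumption:

def _recursive_clean(
    pkg: Package, metadata_reduction_map: Dict[str, bool]
) -> Package:
    # Walk every leaf entry and collapse list-valued metadata down to a
    # single value wherever the column stayed uniform across all files.
    for logical_key, entry in pkg.walk():
        meta = entry.meta or {}
        cleaned = {
            col: (values[0] if metadata_reduction_map.get(col, False) else values)
            for col, values in meta.items()
        }
        pkg[logical_key].set_meta(cleaned)
    return pkg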