Example #1
    def process_keys(self):
        """Remove keys from the checkout bucket."""
        for _key in self.keys:
            if DSS_BUNDLE_KEY_REGEX.match(_key):
                for key in self.handle.list(self.checkout_bucket,
                                            _key):  # handles checkout/bundle/*
                    self._verify_delete(self.handle, self.checkout_bucket, key)
                uuid, version = self._parse_key(_key)
                manifest = get_bundle_manifest(replica=self.replica,
                                               uuid=uuid,
                                               version=version)
                if manifest is None:
                    sys.stderr.write(
                        f"Unable to locate manifest for: {self.checkout_bucket}/{_key}"
                    )
                    continue
                for _files in manifest['files']:
                    key = compose_blob_key(_files)
                    self._verify_delete(self.handle, self.checkout_bucket, key)
            elif _key.startswith(FILE_PREFIX):
                # should handle other keys, files/blobs
                file_metadata = self._get_metadata(self.handle,
                                                   self.replica.bucket, _key)
                self._verify_delete(self.handle,
                                    self.checkout_bucket,
                                    key=compose_blob_key(file_metadata))
            else:
                sys.stderr.write(f'Invalid key regex: {_key}')
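Note: every call to compose_blob_key in these examples passes a file-metadata mapping carrying the four checksum fields (see file_data in Example #10). The exact key layout is not visible in these excerpts, so the stand-alone sketch below assumes a "blobs/" prefix joining those checksums, purely for illustration.

# Stand-alone sketch, not the project's implementation: assumes the blob key is
# derived from the four checksum fields carried in DSS file metadata.
def compose_blob_key_sketch(file_metadata: dict) -> str:
    return "blobs/" + ".".join((
        file_metadata["sha256"],
        file_metadata["sha1"],
        file_metadata["s3-etag"],
        file_metadata["crc32c"],
    ))

example_metadata = {  # hypothetical checksum values
    "sha256": "aa11",
    "sha1": "bb22",
    "s3-etag": "cc33",
    "crc32c": "dd44",
}
print(compose_blob_key_sketch(example_metadata))  # blobs/aa11.bb22.cc33.dd44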
Example #2
def resolve_content_item(replica: Replica, blobstore_handle: BlobStore,
                         item: dict):
    try:
        if item["type"] in {"file", "bundle", "collection"}:
            item_metadata = get_json_metadata(item["type"], item["uuid"],
                                              item["version"], replica,
                                              blobstore_handle)
        else:
            item_metadata = get_json_metadata("file", item["uuid"],
                                              item["version"], replica,
                                              blobstore_handle)
            if "fragment" not in item:
                raise Exception(
                    'The "fragment" field is required in collection elements '
                    'other than files, bundles, and collections')
            blob_path = compose_blob_key(item_metadata)
            # check that item is marked as metadata, is json, and is less than max size
            item_doc = json.loads(
                blobstore_handle.get(replica.bucket, blob_path))
            item_content = jsonpointer.resolve_pointer(item_doc,
                                                       item["fragment"])
            return item_content
    except DSSException:
        raise
    except Exception as e:
        raise DSSException(
            requests.codes.unprocessable_entity, "invalid_link",
            'Error while parsing the link "{}": {}: {}'.format(
                item,
                type(e).__name__, e))
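Example #2 resolves the item's "fragment" field as a JSON Pointer into the file's JSON document. A minimal stand-alone illustration of that resolution step, using the jsonpointer package with a made-up document and pointer:

import jsonpointer

item_doc = {"samples": [{"organ": "liver"}, {"organ": "heart"}]}
fragment = "/samples/1/organ"  # analogous to item["fragment"] above
print(jsonpointer.resolve_pointer(item_doc, fragment))  # heart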
Example #3
def verify_file_replication(src_handle, dst_handle, src_bucket, dst_bucket,
                            key):
    """
    Return list of ReplicationAnomaly for files+blobs
    """
    anomalies = list()
    try:
        file_metadata = json.loads(src_handle.get(src_bucket, key))
    except BlobNotFoundError:
        anomalies.append(
            ReplicationAnomaly(key=key, anomaly="missing on source replica"))
    else:
        try:
            target_file_metadata = json.loads(dst_handle.get(dst_bucket, key))
        except BlobNotFoundError:
            anomalies.append(
                ReplicationAnomaly(key=key,
                                   anomaly="missing on target replica"))
        else:
            if file_metadata != target_file_metadata:
                anomalies.append(
                    ReplicationAnomaly(key=key,
                                       anomaly="file metadata mismatch"))
        blob_key = compose_blob_key(file_metadata)
        anomalies.extend(
            verify_blob_replication(src_handle, dst_handle, src_bucket,
                                    dst_bucket, blob_key))
    return anomalies
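verify_file_replication can be exercised without real cloud buckets by feeding it toy handles. In the sketch below, everything except verify_file_replication itself is a stand-in defined locally (the exception class, ReplicationAnomaly, DictHandle, the key helper, and a simplified verify_blob_replication), so run it together with the definition above; the project's real types and behavior may differ.

import json
from collections import namedtuple

class BlobNotFoundError(Exception):  # stand-in for the blobstore's "missing key" error
    pass

ReplicationAnomaly = namedtuple("ReplicationAnomaly", ["key", "anomaly"])

class DictHandle:
    """Toy blobstore handle backed by nested dicts: {bucket: {key: bytes}}."""
    def __init__(self, buckets):
        self.buckets = buckets

    def get(self, bucket, key):
        try:
            return self.buckets[bucket][key]
        except KeyError:
            raise BlobNotFoundError(key)

def compose_blob_key(file_metadata):  # assumed layout, see the sketch after Example #1
    return "blobs/" + ".".join(file_metadata[k] for k in ("sha256", "sha1", "s3-etag", "crc32c"))

def verify_blob_replication(src_handle, dst_handle, src_bucket, dst_bucket, key):
    # simplified stand-in: only reports a blob that is missing from the target bucket
    try:
        dst_handle.get(dst_bucket, key)
        return []
    except BlobNotFoundError:
        return [ReplicationAnomaly(key=key, anomaly="missing on target replica")]

file_meta = {"sha256": "aa", "sha1": "bb", "s3-etag": "cc", "crc32c": "dd"}
manifest_bytes = json.dumps(file_meta).encode()
src = DictHandle({"src-bucket": {"files/f.2020-01-01": manifest_bytes,
                                 "blobs/aa.bb.cc.dd": b"payload"}})
dst = DictHandle({"dst-bucket": {"files/f.2020-01-01": manifest_bytes}})  # blob not replicated
print(verify_file_replication(src, dst, "src-bucket", "dst-bucket", "files/f.2020-01-01"))
# [ReplicationAnomaly(key='blobs/aa.bb.cc.dd', anomaly='missing on target replica')]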
Example #4
    def process_keys(self):
        for _key in self.keys:
            if DSS_BUNDLE_KEY_REGEX.match(_key):
                uuid, version = self._parse_key(_key)
                bundle_manifest = get_bundle_manifest(uuid=uuid,
                                                      replica=self.replica,
                                                      version=version)
                self._sleepy_checkout(bundle_checkout,
                                      bundle_uuid=uuid,
                                      bundle_version=version)
                for _files in bundle_manifest['files']:
                    blob_path = compose_blob_key(_files)
                    self._sleepy_checkout(file_checkout,
                                          file_metadata=_files,
                                          blob_path=blob_path)
            elif _key.startswith(FILE_PREFIX):
                # parse the file manifest; handle.get returns the raw JSON bytes
                file_metadata = json.loads(
                    self.handle.get(self.replica.bucket, _key))
                blob_path = compose_blob_key(file_metadata)
                self._sleepy_checkout(file_checkout,
                                      file_metadata=file_metadata,
                                      blob_path=blob_path)
            else:
                sys.stderr.write(f'Invalid key regex: {_key}')
Example #5
def get_manifest_files(replica: Replica, src_bucket: str, bundle_uuid: str,
                       bundle_version: str):
    bundle_manifest = get_bundle_manifest(bundle_uuid,
                                          replica,
                                          bundle_version,
                                          bucket=src_bucket)
    files = bundle_manifest[BundleMetadata.FILES]
    dst_bundle_prefix = get_dst_bundle_prefix(
        bundle_uuid, bundle_manifest[BundleMetadata.VERSION])

    for file_metadata in files:
        dst_key = "{}/{}".format(dst_bundle_prefix,
                                 file_metadata.get(BundleFileMetadata.NAME))
        src_key = compose_blob_key(file_metadata)
        yield src_key, dst_key
Example #6
    def process_key(self, _key):
        if self._is_file_tombstoned(_key):
            return  # skip if tombstoned

        file_metadata = self._get_metadata(self.handle, self.replica.bucket,
                                           _key)
        if not file_metadata:
            return  # skip if missing metadata (edge case where the file was deleted before we got here)

        # check if file meets cache criteria
        if cache_flow.should_cache_file(file_metadata['content-type'],
                                        file_metadata['size']):
            blob_key = compose_blob_key(file_metadata)
            checked_out = self._verify_blob_existance(
                self.handle, self.replica.checkout_bucket, blob_key)
            if not checked_out:
                print(f'Checking out: {_key}')
                start_file_checkout(replica=self.replica, blob_key=blob_key)
                assert self._verify_blob_existance(
                    self.handle, self.replica.checkout_bucket, blob_key)
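The cache criteria themselves (cache_flow.should_cache_file) are not shown in these excerpts. A hypothetical stand-in with made-up thresholds, only to make the call sites above concrete:

# Hypothetical stand-in, not the project's cache_flow module: cache only
# selected content-type families below an assumed size limit.
CACHEABLE_CONTENT_TYPE_PREFIXES = ("application/json", "text/", "image/")
MAX_CACHED_SIZE_BYTES = 64 * 1024 * 1024  # assumed limit

def should_cache_file(content_type: str, size: int) -> bool:
    return (size <= MAX_CACHED_SIZE_BYTES
            and content_type.startswith(CACHEABLE_CONTENT_TYPE_PREFIXES))

print(should_cache_file("application/json; dcp-type=metadata", 2048))  # True
print(should_cache_file("application/octet-stream", 50 * 1024 ** 3))   # False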
Example #7
    @classmethod
    def _read_file_infos(cls, replica: Replica, fqid: BundleFQID,
                         manifest: JSON) -> List[Tuple[str, JSON]]:
        handle = Config.get_blobstore_handle(replica)
        index_files: List[Tuple[str, JSON]] = list()
        file_infos = manifest[BundleMetadata.FILES]
        assert isinstance(file_infos, list)
        for file_info in file_infos:
            if file_info[BundleFileMetadata.INDEXED]:
                file_name = file_info[BundleFileMetadata.NAME]
                content_type = file_info[BundleFileMetadata.CONTENT_TYPE]
                if content_type.startswith('application/json'):
                    file_blob_key = compose_blob_key(file_info)
                    try:
                        file_string = handle.get(replica.bucket,
                                                 file_blob_key).decode("utf-8")
                    except BlobStoreError as ex:
                        raise RuntimeError(
                            f"{ex} This bundle will not be indexed. Bundle: {fqid}, File Blob Key: "
                            f"{file_blob_key}, File Name: '{file_name}'"
                        ) from ex
                    try:
                        file_json = json.loads(file_string)
                        # TODO (mbaumann) Are there other JSON-related exceptions that should be checked below?
                    except json.decoder.JSONDecodeError as ex:
                        logger.warning(
                            f"In bundle {fqid} the file '{file_name}' is marked for indexing yet could "
                            f"not be parsed. This file will not be indexed. Exception: {ex}"
                        )
                    else:
                        logger.debug(f"Loaded file: {file_name}")
                        index_files.append((file_name, file_json))
                else:
                    logger.warning(
                        f"In bundle {fqid} the file '{file_name}' is marked for indexing yet has "
                        f"content type '{content_type}' instead of the required content type "
                        f"'application/json'. This file will not be indexed.")
        return index_files
Example #8
    def _test_file_get_checkout(self, replica: Replica, scheme: str,
                                test_bucket: str, uploader: Uploader):
        handle = Config.get_blobstore_handle(replica)
        src_key = generate_test_key()
        src_data = os.urandom(1024)
        source_url = f"{scheme}://{test_bucket}/{src_key}"
        file_uuid = str(uuid.uuid4())
        bundle_uuid = str(uuid.uuid4())
        version = datetime_to_version_format(datetime.datetime.utcnow())

        # write dummy file and upload to upload area
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data)
            fh.flush()

            uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

        # upload file to DSS
        self.upload_file(source_url,
                         file_uuid,
                         bundle_uuid=bundle_uuid,
                         version=version)
        url = str(UrlBuilder().set(path="/v1/files/" + file_uuid).add_query(
            "replica", replica.name).add_query("version", version))

        # get uploaded blob key
        file_metadata = json.loads(
            handle.get(test_bucket,
                       f"files/{file_uuid}.{version}").decode("utf-8"))
        file_key = compose_blob_key(file_metadata)

        @eventually(20, 1)
        def test_checkout():
            # assert 302 and verify checksum on checkout completion
            api_get = self.assertGetResponse(url,
                                             requests.codes.found,
                                             headers=get_auth_header(),
                                             redirect_follow_retries=0)
            file_get = requests.get(api_get.response.headers['Location'])
            self.assertTrue(file_get.ok)
            self.assertEqual(file_get.content, src_data)

        with self.subTest(
                f"{replica}: Initiates checkout and returns 301 for GET on 'uncheckedout' file."
        ):
            # assert 301 redirect on first GET
            self.assertGetResponse(url,
                                   requests.codes.moved,
                                   headers=get_auth_header(),
                                   redirect_follow_retries=0)
            test_checkout()

        with self.subTest(
                f"{replica}: Initiates checkout and returns 301 for GET on nearly expired checkout file."
        ):
            now = datetime.datetime.now(datetime.timezone.utc)
            creation_date_fn = (
                "cloud_blobstore.s3.S3BlobStore.get_creation_date"
                if replica.name == "aws" else
                "cloud_blobstore.gs.GSBlobStore.get_creation_date")
            with mock.patch(creation_date_fn) as mock_creation_date:
                blob_ttl_days = int(os.environ['DSS_BLOB_TTL_DAYS'])
                mock_creation_date.return_value = now - datetime.timedelta(
                    days=blob_ttl_days, hours=1, minutes=5)
                self.assertGetResponse(url,
                                       requests.codes.moved,
                                       headers=get_auth_header(),
                                       redirect_follow_retries=0)
            test_checkout()

        with self.subTest(
                f"{replica}: Initiates checkout and returns 302 immediately for GET on stale checkout file."
        ):
            now = datetime.datetime.now(datetime.timezone.utc)
            creation_date = handle.get_creation_date(replica.checkout_bucket,
                                                     file_key)
            creation_date_fn = (
                "cloud_blobstore.s3.S3BlobStore.get_creation_date"
                if replica.name == "aws" else
                "cloud_blobstore.gs.GSBlobStore.get_creation_date")
            with mock.patch(creation_date_fn) as mock_creation_date:
                # assert 302 found on stale file and that last modified refreshes
                blob_ttl_days = int(os.environ['DSS_BLOB_PUBLIC_TTL_DAYS'])
                mock_creation_date.return_value = now - datetime.timedelta(
                    days=blob_ttl_days + 1)
                self.assertGetResponse(url,
                                       requests.codes.found,
                                       headers=get_auth_header(),
                                       redirect_follow_retries=0)
                self.assertTrue(
                    creation_date > handle.get_creation_date(
                        replica.checkout_bucket, file_key),
                    f'\ncurr_creation_date: {creation_date}'
                    f'\nprev_creation_date: {handle.get_creation_date(replica.checkout_bucket, file_key)}'
                )

        handle.delete(test_bucket, f"files/{file_uuid}.{version}")
        handle.delete(replica.checkout_bucket, file_key)
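The @eventually(20, 1) decorator used above (and with keyword arguments in Example #10) retries an assertion-style callable until it passes or a timeout expires. A hypothetical equivalent, for reference only; the project's own helper may differ:

import functools
import time

def eventually(timeout, interval, errors=(Exception,)):
    """Hypothetical retry decorator: call the wrapped function until it stops
    raising one of `errors` or until `timeout` seconds have elapsed."""
    def decorate(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            deadline = time.time() + timeout
            while True:
                try:
                    return fn(*args, **kwargs)
                except tuple(errors):
                    if time.time() >= deadline:
                        raise
                    time.sleep(interval)
        return wrapper
    return decorate

attempts = {"n": 0}

@eventually(timeout=5, interval=0.1, errors={AssertionError})
def flaky_check():
    attempts["n"] += 1
    assert attempts["n"] >= 3, "not ready yet"

flaky_check()
print(attempts["n"])  # 3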
Example #9
def dependencies_exist(source_replica: Replica, dest_replica: Replica,
                       key: str):
    """
    Given a source replica and manifest key, checks if all dependencies of the corresponding DSS object are present in
    dest_replica:
     - Given a file manifest key, checks if blobs exist in dest_replica.
     - Given a bundle manifest key, checks if file manifests exist in dest_replica.
     - Given a collection key, checks if all collection contents exist in dest_replica.
    Returns true if all dependencies exist in dest_replica, false otherwise.
    """
    source_handle = Config.get_blobstore_handle(source_replica)
    dest_handle = Config.get_blobstore_handle(dest_replica)
    if key.endswith(TOMBSTONE_SUFFIX):
        return True
    elif key.startswith(FILE_PREFIX):
        file_id = FileFQID.from_key(key)
        file_manifest = get_json_metadata(
            entity_type="file",
            uuid=file_id.uuid,
            version=file_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        blob_path = compose_blob_key(file_manifest)
        if exists(dest_replica, blob_path):
            return True
    elif key.startswith(BUNDLE_PREFIX):
        # head all file manifests
        bundle_id = BundleFQID.from_key(key)
        bundle_manifest = get_json_metadata(
            entity_type="bundle",
            uuid=bundle_id.uuid,
            version=bundle_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        try:
            with ThreadPoolExecutor(max_workers=20) as e:
                futures = list()
                for file in bundle_manifest[BundleMetadata.FILES]:
                    file_uuid = file[BundleFileMetadata.UUID]
                    file_version = file[BundleFileMetadata.VERSION]
                    futures.append(
                        e.submit(get_json_metadata,
                                 entity_type="file",
                                 uuid=file_uuid,
                                 version=file_version,
                                 replica=dest_replica,
                                 blobstore_handle=source_handle,
                                 max_metadata_size=max_syncable_metadata_size))
                for future in as_completed(futures):
                    future.result()
            return True
        except Exception:
            pass
    elif key.startswith(COLLECTION_PREFIX):
        collection_id = CollectionFQID.from_key(key)
        collection_manifest = get_json_metadata(
            entity_type="collection",
            uuid=collection_id.uuid,
            version=collection_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        try:
            verify_collection(contents=collection_manifest["contents"],
                              replica=dest_replica,
                              blobstore_handle=dest_handle)
            return True
        except Exception:
            pass
    else:
        raise NotImplementedError("Unknown prefix for key {}".format(key))
    return False
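The bundle branch above fans the per-file metadata checks out to a thread pool and treats any exception raised by future.result() as a missing dependency. That concurrency pattern in isolation, with a trivial stand-in for the per-file check:

from concurrent.futures import ThreadPoolExecutor, as_completed

def head_object(key):
    # stand-in for get_json_metadata: raise if the dependency is absent
    present = {"files/a", "files/b"}
    if key not in present:
        raise KeyError(key)
    return key

def all_dependencies_exist(keys):
    try:
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(head_object, key) for key in keys]
            for future in as_completed(futures):
                future.result()  # re-raises if any check failed
        return True
    except Exception:
        return False

print(all_dependencies_exist(["files/a", "files/b"]))  # True
print(all_dependencies_exist(["files/a", "files/c"]))  # False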
Example #10
    def test_dependencies_exist(self):
        file_uuid, file_version = str(uuid.uuid4()), get_version()
        bundle_uuid, bundle_version = str(uuid.uuid4()), get_version()
        collection_data = {
            "contents": [{
                "type": "bundle",
                "uuid": bundle_uuid,
                "version": bundle_version
            }, {
                "type": "file",
                "uuid": file_uuid,
                "version": file_version
            }]
        }
        bundle_data = {
            BundleMetadata.FILES: [{
                BundleFileMetadata.UUID: file_uuid,
                BundleFileMetadata.VERSION: file_version
            }]
        }
        file_data = {
            FileMetadata.SHA256: "sync_test",
            FileMetadata.SHA1: "sync_test",
            FileMetadata.S3_ETAG: "sync_test",
            FileMetadata.CRC32C: str(uuid.uuid4())
        }

        with self.subTest("collection without deps"):
            collection_key = "{}/{}".format(COLLECTION_PREFIX,
                                            get_collection_fqid())
            collection_blob = self.s3_bucket.Object(collection_key)
            collection_blob.put(Body=json.dumps(collection_data).encode())
            self.assertFalse(
                sync.dependencies_exist(Replica.aws, Replica.aws,
                                        collection_key))

        with self.subTest("bundle without deps"):
            bundle_key = "{}/{}".format(
                BUNDLE_PREFIX,
                BundleFQID(uuid=bundle_uuid, version=bundle_version))
            bundle_blob = self.s3_bucket.Object(bundle_key)
            bundle_blob.put(Body=json.dumps(bundle_data).encode())

            self.assertFalse(
                sync.dependencies_exist(Replica.aws, Replica.aws,
                                        collection_key))
            self.assertFalse(
                sync.dependencies_exist(Replica.aws, Replica.aws, bundle_key))

        with self.subTest("file without deps"):
            file_key = "{}/{}".format(
                FILE_PREFIX, FileFQID(uuid=file_uuid, version=file_version))
            file_blob = self.s3_bucket.Object(file_key)
            file_blob.put(Body=json.dumps(file_data).encode())

            @eventually(timeout=8, interval=1, errors={Exception})
            def check_file_revdeps():
                self.assertTrue(
                    sync.dependencies_exist(Replica.aws, Replica.aws,
                                            collection_key))
                self.assertTrue(
                    sync.dependencies_exist(Replica.aws, Replica.aws,
                                            bundle_key))
                self.assertFalse(
                    sync.dependencies_exist(Replica.aws, Replica.aws,
                                            file_key))

            check_file_revdeps()

        with self.subTest(
                "blob presence causes all dependencies to be resolved"):
            blob_key = compose_blob_key(file_data)
            blob_blob = self.s3_bucket.Object(blob_key)
            blob_blob.put(Body=b"sync_test")

            @eventually(timeout=8, interval=1, errors={Exception})
            def check_blob_revdeps():
                self.assertTrue(
                    sync.dependencies_exist(Replica.aws, Replica.aws,
                                            collection_key))
                self.assertTrue(
                    sync.dependencies_exist(Replica.aws, Replica.aws,
                                            bundle_key))
                self.assertTrue(
                    sync.dependencies_exist(Replica.aws, Replica.aws,
                                            file_key))

            check_blob_revdeps()
Example #11
    def process_keys(self):
        """Verify that keys are in the checkout bucket."""
        checkout_status = dict(replica=self.replica.name)
        for _key in self.keys:
            if DSS_BUNDLE_KEY_REGEX.match(
                    _key):  # handles bundles/fqid keys or fqid
                uuid, version = self._parse_key(_key)
                bundle_manifest = get_bundle_manifest(replica=self.replica,
                                                      uuid=uuid,
                                                      version=version)
                checkout_bundle_contents = [
                    x[0] for x in self.handle.list_v2(
                        bucket=self.checkout_bucket,
                        prefix=f'bundles/{uuid}.{version}')
                ]
                bundle_internal_status = list()

                for _file in bundle_manifest['files']:
                    temp = collections.defaultdict(blob_checkout=False,
                                                   bundle_checkout=False,
                                                   should_be_cached=False)
                    bundle_key = f'bundles/{uuid}.{version}/{_file["name"]}'
                    blob_key = compose_blob_key(_file)

                    blob_status = self._verify_blob_existance(
                        self.handle, self.checkout_bucket, blob_key)
                    if blob_status:
                        temp['blob_checkout'] = True
                    if bundle_key in checkout_bundle_contents:
                        temp['bundle_checkout'] = True
                    if cache_flow.should_cache_file(_file['content-type'],
                                                    _file['size']):
                        temp['should_be_cached'] = True

                    for x in ['name', 'uuid', 'version']:
                        temp.update({x: _file[x]})
                    bundle_internal_status.append(temp)
                checkout_status[_key] = bundle_internal_status
            elif _key.startswith(FILE_PREFIX):
                temp = collections.defaultdict(blob_checkout=False,
                                               should_be_cached=False)
                file_metadata = self._get_metadata(self.handle,
                                                   self.replica.bucket, _key)
                if not file_metadata:
                    sys.stderr.write(
                        f'Key not in either main bucket or checkout bucket: {_key}'
                    )
                    continue
                blob_key = compose_blob_key(file_metadata)
                blob_status = self._verify_blob_existance(
                    self.handle, self.checkout_bucket, blob_key)
                if blob_status:
                    temp['blob_checkout'] = True
                if cache_flow.should_cache_file(file_metadata['content-type'],
                                                file_metadata['size']):
                    temp['should_be_cached'] = True

                for x in ['name', 'uuid', 'version']:
                    temp.update({x: file_metadata[x]})
                checkout_status[_key] = collections.defaultdict(uuid=temp)
            else:
                sys.stderr.write(f'Invalid key regex: {_key}')
        print(json.dumps(checkout_status, sort_keys=True, indent=2))
        return checkout_status  # action_handler does not really use this; it's just for testing
Example #12
def get_helper(uuid: str, replica: Replica, version: str = None, token: str = None, directurl: bool = False,
               content_disposition: str = None):

    with tracing.Subsegment('parameterization'):
        handle = Config.get_blobstore_handle(replica)
        bucket = replica.bucket

    if version is None:
        with tracing.Subsegment('find_latest_version'):
            # list the files and find the one that is the most recent.
            prefix = "files/{}.".format(uuid)
            for matching_file in handle.list(bucket, prefix):
                matching_file = matching_file[len(prefix):]
                if version is None or matching_file > version:
                    version = matching_file
    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    # retrieve the file metadata.
    try:
        with tracing.Subsegment('load_file'):
            file_metadata = json.loads(
                handle.get(
                    bucket,
                    f"files/{uuid}.{version}"
                ).decode("utf-8"))
    except BlobNotFoundError:
        key = f"files/{uuid}.{version}"
        item = AsyncStateItem.get(key)
        if isinstance(item, S3CopyEtagError):
            raise DSSException(
                requests.codes.unprocessable,
                "missing_checksum",
                "Incorrect s3-etag"
            )
        elif isinstance(item, AsyncStateError):
            raise item
        else:
            raise DSSException(404, "not_found", "Cannot find file!")

    with tracing.Subsegment('make_path'):
        blob_path = compose_blob_key(file_metadata)

    if request.method == "GET":
        token, ready = _verify_checkout(replica, token, file_metadata, blob_path)
        if ready:
            if directurl:
                response = redirect(str(UrlBuilder().set(
                    scheme=replica.storage_schema,
                    netloc=replica.checkout_bucket,
                    path=get_dst_key(blob_path)
                )))
            else:
                if content_disposition:
                    # tells the browser to treat the response as a download rather than opening it in a new tab
                    response = redirect(handle.generate_presigned_GET_url(
                                        replica.checkout_bucket,
                                        get_dst_key(blob_path),
                                        response_content_disposition=content_disposition))
                else:
                    response = redirect(handle.generate_presigned_GET_url(
                                        replica.checkout_bucket,
                                        get_dst_key(blob_path)))
        else:
            with tracing.Subsegment('make_retry'):
                builder = UrlBuilder(request.url)
                builder.replace_query("token", token)
                response = redirect(str(builder), code=301)
                headers = response.headers
                headers['Retry-After'] = RETRY_AFTER_INTERVAL
                return response

    else:
        response = make_response('', 200)

    with tracing.Subsegment('set_headers'):
        headers = response.headers
        headers['X-DSS-CREATOR-UID'] = file_metadata[FileMetadata.CREATOR_UID]
        headers['X-DSS-VERSION'] = version
        headers['X-DSS-CONTENT-TYPE'] = file_metadata[FileMetadata.CONTENT_TYPE]
        headers['X-DSS-SIZE'] = file_metadata[FileMetadata.SIZE]
        headers['X-DSS-CRC32C'] = file_metadata[FileMetadata.CRC32C]
        headers['X-DSS-S3-ETAG'] = file_metadata[FileMetadata.S3_ETAG]
        headers['X-DSS-SHA1'] = file_metadata[FileMetadata.SHA1]
        headers['X-DSS-SHA256'] = file_metadata[FileMetadata.SHA256]

    return response
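The version-resolution loop in get_helper relies on DSS versions being timestamp strings that sort lexicographically, so the greatest string under the "files/{uuid}." prefix is the newest version. A minimal illustration with made-up keys:

# Made-up keys; the version format mirrors the timestamp-style versions used above.
prefix = "files/8f3f4c6a-0000-4b2a-9f11-000000000000."
keys = [
    prefix + "2018-12-31T235959.000000Z",
    prefix + "2019-06-30T120000.000000Z",
    prefix + "2019-01-01T000000.000000Z",
]

version = None
for matching_file in keys:
    matching_file = matching_file[len(prefix):]
    if version is None or matching_file > version:
        version = matching_file
print(version)  # 2019-06-30T120000.000000Z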