Example #1
    def _start_execution(self, stateMachineArn, name, input):
        def context():
            return MockLambdaContext(StorageVisitation.shutdown_time + self.timeout)

        Config.set_config(BucketConfig.NORMAL)
        input = json.loads(input)
        state = implementation.job_initialize(input, context())
        walker_state = copy.deepcopy(state)
        state['work_result'] = []
        while walker_state['_status'] == 'init':
            walker_state = implementation.walker_initialize(walker_state, context(), 0)
            while walker_state['_status'] == 'walk':
                walker_state = implementation.walker_walk(walker_state, context(), 0)
            walker_state = implementation.walker_finalize(walker_state, context(), 0)
        self.assertEqual(walker_state['_status'], 'end')
        state['work_result'].append(walker_state['work_result'])
        state = implementation.job_finalize(state, context())
        work_result = state['work_result']
        for replica, missing, present in zip(state['replicas'], work_result['missing'], work_result['present']):
            keys = self.keys[Replica[replica]]
            self.assertEqual(self.num_keys, missing + present)
            self.assertEqual(len(keys), present)
        # Return the total number of missing keys (which differs between test runs because the overall number
        # of keys is random). The test can assert this value, proving that this code actually ran.
        return {'executionArn': str(sum(work_result['missing']))}
Example #2
def touch_test_file(replica: Replica, dst_bucket: str) -> bool:
    """
    Write a test file into the specified bucket.
    :param dst_bucket: the bucket to be checked.
    :param replica: the replica to execute the checkout in.
    :return: True if the file could be written; otherwise DestinationBucketNotWritableError is raised.
    """
    randomizer = ''.join(choices(hexdigits, k=2))
    # Spreading the touch test file across a larger range prevents hitting modification rate limits.
    test_object = f"touch/{randomizer}.txt"
    handle = Config.get_blobstore_handle(replica)

    try:
        handle.upload_file_handle(
            dst_bucket,
            test_object,
            io.BytesIO(b""))
        return True
    except Exception as ex:
        raise DestinationBucketNotWritableError(ex)
    finally:
        try:
            Config.get_blobstore_handle(replica).delete(dst_bucket, test_object)
        except Exception:
            pass
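A minimal caller sketch for the helper above; it assumes touch_test_file and DestinationBucketNotWritableError are available from the surrounding checkout module, and the wrapper name is hypothetical:

def bucket_is_writable(replica: Replica, bucket: str) -> bool:
    # True when the touch probe succeeds, False when the helper raises
    # DestinationBucketNotWritableError instead of returning.
    try:
        return touch_test_file(replica, bucket)
    except DestinationBucketNotWritableError:
        return False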
Example #3
def validate_file_dst(dst_bucket: str, dst_key: str, replica: Replica):
    try:
        Config.get_blobstore_handle(replica).get_user_metadata(
            dst_bucket, dst_key)
        return True
    except (BlobNotFoundError, BlobStoreUnknownError):
        return False
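A hedged polling sketch built on validate_file_dst; the timeout and poll interval are illustrative choices, not part of the original code:

import time

def wait_for_file_dst(dst_bucket: str, dst_key: str, replica: Replica, timeout: float = 60.0) -> bool:
    # Poll the destination object until it exists or the deadline passes.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if validate_file_dst(dst_bucket, dst_key, replica):
            return True
        time.sleep(5)
    return False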
Example #4
 def test_s3_checkout_bucket(self):
     Config.set_config(BucketConfig.NORMAL)
     self.assertEqual(Config.get_s3_checkout_bucket(), os.environ["DSS_S3_CHECKOUT_BUCKET"])
     Config.set_config(BucketConfig.TEST)
     self.assertEqual(Config.get_s3_checkout_bucket(), os.environ["DSS_S3_CHECKOUT_BUCKET_TEST"])
     Config.set_config(BucketConfig.TEST_FIXTURE)
     self.assertEqual(Config.get_s3_checkout_bucket(), os.environ["DSS_S3_CHECKOUT_BUCKET_TEST"])
Example #5
 def test_s3_events_bucket(self):
     Config.set_config(BucketConfig.NORMAL)
     self.assertEqual(Config.get_flashflood_bucket(),
                      os.environ["DSS_FLASHFLOOD_BUCKET"])
     Config.set_config(BucketConfig.TEST)
     self.assertEqual(Config.get_flashflood_bucket(),
                      os.environ["DSS_S3_BUCKET_TEST"])
     Config.set_config(BucketConfig.TEST_FIXTURE)
     self.assertEqual(Config.get_flashflood_bucket(),
                      os.environ["DSS_S3_BUCKET_TEST"])
Example #6
def touch_test_file(dst_bucket: str, replica: Replica) -> bool:
    """
    Write a test file into the specified bucket.
    :param dst_bucket: the bucket to be checked.
    :param replica: the replica to execute the check in.
    :return: True if the file could be written, False otherwise.
    """
    test_object = "touch.txt"
    handle = Config.get_blobstore_handle(replica)

    try:
        handle.upload_file_handle(dst_bucket, test_object, io.BytesIO(b""))
        Config.get_blobstore_handle(replica).delete(dst_bucket, test_object)
        return True
    except Exception:
        return False
Example #7
def delete_event_for_bundle(replica: Replica,
                            key: str,
                            flashflood_prefixes: typing.Tuple[str,
                                                              ...] = None):
    """
    Delete a bundle event from flashflood. This operation is eventually consistent, and
    will not take effect until flashflood.update() is called (typically by daemons/dss-event-scribe)
    """
    fqid = key.split("/", 1)[1]
    if flashflood_prefixes is None:
        flashflood_prefixes = replica.flashflood_prefix_write
    for pfx in flashflood_prefixes:
        ff = Config.get_flashflood_handle(pfx)
        try:
            ff.delete_event(fqid)
            logger.info(
                json.dumps(dict(message="Deleted event",
                                replica=replica.name,
                                prefix=pfx,
                                key=key),
                           indent=4))
        except FlashFloodEventNotFound:
            logger.warning(
                json.dumps(dict(message="Cannot delete nonexistent event",
                                replica=replica.name,
                                prefix=pfx,
                                key=key),
                           indent=4))
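The docstring notes that deletions only take effect once flashflood.update() runs, normally in daemons/dss-event-scribe. A hedged sketch that applies them inline instead, reusing update_flashflood from Example #18; the wrapper name is illustrative:

def delete_event_and_apply(replica: Replica, key: str) -> None:
    # Delete the event from every write prefix, then apply the pending updates
    # so the deletion becomes visible without waiting for the scribe daemon.
    delete_event_for_bundle(replica, key)
    for prefix in replica.flashflood_prefix_write:
        update_flashflood(prefix)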
Example #8
def record(argv: typing.List[str], args: argparse.Namespace):
    """
    Record events for `keys` into flashflood prefix `prefix`
    If `keys` is omitted, record an event for each bundle in `replica` via lambda forwarding.
    """
    replica = Replica[args.replica]
    job_id = args.job_id or f"{uuid4()}"
    cmd_template = (f"events record --job-id {job_id} "
                    f"--prefix {args.prefix} "
                    f"--replica {replica.name} "
                    f"--keys {{keys}}")

    if args.keys is None:
        start_time = datetime.now()

        def forward_keys(bundle_fqids):
            with SQSMessenger(command_queue_url) as sqsm:
                for fqid in bundle_fqids:
                    sqsm.send(cmd_template.format(keys=f"bundles/{fqid}"))

        handle = Config.get_blobstore_handle(replica)
        with ThreadPoolExecutor(max_workers=4) as e:
            for c in set(hexdigits.lower()):
                bundle_fqids = Living(handle.list_v2(replica.bucket, f"bundles/{c}"))
                e.submit(forward_keys, bundle_fqids)
        monitor_logs(logs, job_id, start_time)
    else:
        for key in args.keys:
            msg = json.dumps(dict(action="record event", job_id=job_id, replica=replica.name, key=key))
            record_event_for_bundle(Replica[args.replica], key, (args.prefix,), use_version_for_timestamp=True)
            print(msg)
Example #9
def mark_bundle_checkout_started(execution_id: str, replica: Replica,
                                 sts_bucket: str):
    handle = Config.get_blobstore_handle(replica)
    data = {_STATUS_KEY: "RUNNING"}
    handle.upload_file_handle(sts_bucket,
                              _bundle_checkout_status_key(execution_id),
                              io.BytesIO(json.dumps(data).encode("utf-8")))
Example #10
    def _test_bundle_delete(self, replica: Replica, fixtures_bucket: str, authorized: bool):
        schema = replica.storage_schema

        # prep existing bundle
        bundle_uuid = str(uuid.uuid4())
        file_uuid = str(uuid.uuid4())
        resp_obj = self.upload_file_wait(
            f"{schema}://{fixtures_bucket}/test_good_source_data/0",
            replica,
            file_uuid,
            bundle_uuid=bundle_uuid,
        )
        file_version = resp_obj.json['version']

        bundle_version = datetime_to_version_format(datetime.datetime.utcnow())
        self.put_bundle(
            replica,
            bundle_uuid,
            [(file_uuid, file_version, "LICENSE")],
            bundle_version,
        )

        handle = Config.get_blobstore_handle(replica)
        bucket = replica.bucket

        self.delete_bundle(replica, bundle_uuid, authorized=authorized)
        tombstone_exists = test_object_exists(handle, bucket, f"bundles/{bundle_uuid}.dead")
        self.assertEqual(tombstone_exists, authorized)

        self.delete_bundle(replica, bundle_uuid, bundle_version, authorized=authorized)
        tombstone_exists = test_object_exists(handle, bucket, f"bundles/{bundle_uuid}.{bundle_version}.dead")
        self.assertEqual(tombstone_exists, authorized)
Example #11
def get_helper(uuid: str, replica: Replica, version: str = None):
    handle = Config.get_blobstore_handle(replica)
    bucket = replica.bucket

    if version is None:
        # list the files and find the one that is the most recent.
        prefix = "files/{}.".format(uuid)
        for matching_file in handle.list(bucket, prefix):
            matching_file = matching_file[len(prefix):]
            if version is None or matching_file > version:
                version = matching_file

    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    # retrieve the file metadata.
    try:
        file_metadata = json.loads(
            handle.get(bucket, "files/{}.{}".format(uuid,
                                                    version)).decode("utf-8"))
    except BlobNotFoundError as ex:
        raise DSSException(404, "not_found", "Cannot find file!")

    blob_path = "blobs/" + ".".join((
        file_metadata[FileMetadata.SHA256],
        file_metadata[FileMetadata.SHA1],
        file_metadata[FileMetadata.S3_ETAG],
        file_metadata[FileMetadata.CRC32C],
    ))

    if request.method == "GET":
        """
        Probabilistically return "Retry-After" header
        The retry-after interval can be relatively short now, but it sets up downstream
        libraries / users for success when we start integrating this with the checkout service.
        """
        if random.randint(0, 100) < REDIRECT_PROBABILITY_PERCENTS:
            response = redirect(request.url, code=301)
            headers = response.headers
            headers['Retry-After'] = RETRY_AFTER_INTERVAL
            return response

        response = redirect(
            handle.generate_presigned_GET_url(bucket, blob_path))
    else:
        response = make_response('', 200)

    headers = response.headers
    headers['X-DSS-BUNDLE-UUID'] = file_metadata[FileMetadata.BUNDLE_UUID]
    headers['X-DSS-CREATOR-UID'] = file_metadata[FileMetadata.CREATOR_UID]
    headers['X-DSS-VERSION'] = version
    headers['X-DSS-CONTENT-TYPE'] = file_metadata[FileMetadata.CONTENT_TYPE]
    headers['X-DSS-SIZE'] = file_metadata[FileMetadata.SIZE]
    headers['X-DSS-CRC32C'] = file_metadata[FileMetadata.CRC32C]
    headers['X-DSS-S3-ETAG'] = file_metadata[FileMetadata.S3_ETAG]
    headers['X-DSS-SHA1'] = file_metadata[FileMetadata.SHA1]
    headers['X-DSS-SHA256'] = file_metadata[FileMetadata.SHA256]

    return response
Example #12
    def _walk(self) -> None:
        """
        Subclasses typically should not implement this method; it contains the logic for calling
        self.process_item(*args) on each blob visited.
        """

        start_time = time()

        handle = Config.get_blobstore_handle(Replica[self.replica])

        blobs = handle.list_v2(
            self.bucket,
            prefix=self.work_id,
            start_after_key=self.marker,  # type: ignore  # Cannot determine type of 'marker'
            token=self.token,  # type: ignore  # Cannot determine type of 'token'
        )

        for key in blobs:
            if 250 < time() - start_time:
                break
            self.process_item(key)
            self.marker = blobs.start_after_key
            self.token = blobs.token
        else:
            self._status = WalkerStatus.finished.name
Example #13
    def _test_bundle_get_directaccess(self, replica: Replica):
        schema = replica.storage_schema

        bundle_uuid = "011c7340-9b3c-4d62-bf49-090d79daf198"
        version = "2017-06-20T214506.766634Z"

        url = str(UrlBuilder()
                  .set(path="/v1/bundles/" + bundle_uuid)
                  .add_query("replica", replica.name)
                  .add_query("version", version)
                  .add_query("directurls", "true"))

        with override_bucket_config(BucketConfig.TEST_FIXTURE):
            resp_obj = self.assertGetResponse(
                url,
                requests.codes.ok)

        url = resp_obj.json['bundle']['files'][0]['url']
        splitted = urllib.parse.urlparse(url)
        self.assertEqual(splitted.scheme, schema)
        bucket = splitted.netloc
        key = splitted.path[1:]  # ignore the / part of the path.

        handle = Config.get_blobstore_handle(replica)
        contents = handle.get(bucket, key)

        hasher = hashlib.sha1()
        hasher.update(contents)
        sha1 = hasher.hexdigest()
        self.assertEqual(sha1, "2b8b815229aa8a61e483fb4ba0588b8b6c491890")
Example #14
def _configure_logging(test=False, **kwargs):
    root_logger = logging.getLogger()
    global _logging_configured
    if _logging_configured:
        root_logger.info(
            "Logging was already configured in this interpreter process. The currently "
            "registered handlers, formatters, filters and log levels will be left as is."
        )
    else:
        root_logger.setLevel(logging.WARNING)
        if 'AWS_LAMBDA_LOG_GROUP_NAME' in os.environ:
            pass  # On AWS Lambda, we assume that its runtime already configured logging appropriately
        elif len(root_logger.handlers) == 0:
            logging.basicConfig(**kwargs)
        else:
            # If this happens, the process can likely proceed but the underlying issue needs to be investigated. Some
            # module isn't playing nicely and configured logging before we had a chance to do so. The backtrace
            # included in the log message may look scary but it should aid in finding the culprit.
            root_logger.warning(
                "It appears that logging was already configured in this interpreter process. "
                "Currently registered handlers, formatters and filters will be left as is.",
                stack_info=True)
        debug = Config.debug_level()
        log_levels = main_log_levels
        if test:
            log_levels = {**log_levels, **test_log_levels}
        for logger, levels in log_levels.items():
            if isinstance(logger, (str, type(None))):
                logger = logging.getLogger(logger)
            level = levels[min(debug, len(levels) - 1)]
            logger.setLevel(level)
        _logging_configured = True
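The loop above implies the shape of main_log_levels and test_log_levels: keys are logger names (None selects the root logger) or Logger objects, values are sequences indexed by the DSS_DEBUG level. A purely illustrative sketch; the actual mappings in the project differ:

import logging

main_log_levels = {
    None: (logging.WARNING, logging.INFO, logging.DEBUG),  # root logger
    'dss': (logging.INFO, logging.DEBUG),                  # example entries only
    'botocore': (logging.WARNING,),
}

test_log_levels = {
    'dss': (logging.DEBUG,),
}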
Example #15
def create_app():
    app = DSSApp(
        __name__,
        validator_map={
            'body': DSSRequestBodyValidator,
            'parameter': DSSParameterValidator,
        },
    )
    # The Flask/Connexion app's logger has its own multi-line formatter and configuration. Rather than suppressing
    # it, we let it do its thing, give it a special name, and only enable it if DSS_DEBUG > 1. Most of the DSS web
    # app's logging is done through the DSSChaliceApp.app logger, not the Flask app's logger.
    #
    app.app.logger_name = 'dss.api'
    debug = Config.debug_level() > 0
    app.app.debug = debug
    app.app.logger.info('Flask debug is %s.',
                        'enabled' if debug else 'disabled')

    resolver = RestyResolver("dss.api", collection_endpoint_name="list")
    app.add_api('../dss-api.yml',
                resolver=resolver,
                validate_responses=True,
                arguments=os.environ)
    app.add_error_handler(DSSException, dss_exception_handler)
    return app
Example #16
def _verify_checkout(
        replica: Replica, token: typing.Optional[str], file_metadata: dict, blob_path: str,
) -> typing.Tuple[str, bool]:
    cloud_handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)

    try:
        now = datetime.datetime.now(datetime.timezone.utc)
        creation_date = cloud_handle.get_creation_date(replica.checkout_bucket, blob_path)
        stale_after_date = creation_date + datetime.timedelta(days=int(os.environ['DSS_BLOB_PUBLIC_TTL_DAYS']))
        expiration_date = (creation_date
                           + datetime.timedelta(days=int(os.environ['DSS_BLOB_TTL_DAYS']))
                           - datetime.timedelta(hours=1))

        if now < expiration_date:
            if now > stale_after_date:
                start_file_checkout(replica, blob_path)
            if hca_handle.verify_blob_checksum_from_dss_metadata(replica.checkout_bucket,
                                                                 blob_path,
                                                                 file_metadata):
                return "", True
            else:
                logger.error(
                    f"Checksum verification failed for file {replica.checkout_bucket}/{blob_path}")
    except BlobNotFoundError:
        pass

    decoded_token: dict
    if token is None:
        execution_id = start_file_checkout(replica, blob_path)
        start_time = time.time()
        attempts = 0

        decoded_token = {
            CheckoutTokenKeys.EXECUTION_ID: execution_id,
            CheckoutTokenKeys.START_TIME: start_time,
            CheckoutTokenKeys.ATTEMPTS: attempts
        }
    else:
        try:
            decoded_token = json.loads(token)
            decoded_token[CheckoutTokenKeys.ATTEMPTS] += 1
        except (KeyError, ValueError) as ex:
            raise DSSException(requests.codes.bad_request, "illegal_token", "Could not understand token", ex)

    encoded_token = json.dumps(decoded_token)
    return encoded_token, False
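A hedged sketch of driving _verify_checkout with the token it hands back; the retry cadence, attempt limit, and wrapper name are illustrative:

import time

def wait_until_checked_out(replica: Replica, file_metadata: dict, blob_path: str,
                           max_attempts: int = 30) -> None:
    # Feed the returned token back in until the blob is reported available in
    # the checkout bucket.
    token = None
    for _ in range(max_attempts):
        token, available = _verify_checkout(replica, token, file_metadata, blob_path)
        if available:
            return
        time.sleep(10)
    raise TimeoutError(f"checkout of {blob_path} did not complete")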
Example #17
    def test_gcloud_retries(self):
        Config.get_native_handle.cache_clear()
        Config.BLOBSTORE_RETRIES = 1

        handle = Config.get_native_handle(Replica.gcp)
        for adapter in handle._http.adapters.values():
            self.assertEqual(Config.BLOBSTORE_RETRIES,
                             adapter.max_retries.total)
Example #18
def update_flashflood(prefix: str, number_of_updates_to_apply=1000) -> int:
    """
    Apply event updates to existing journals.
    This is typically called after journaling is complete.
    """
    ff = Config.get_flashflood_handle(prefix, confirm_writes=True)
    number_of_updates_applied = ff.update(number_of_updates_to_apply)
    return number_of_updates_applied
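Because the helper returns how many updates it applied, a caller can drain a backlog larger than one batch. A small sketch under the assumption that a return value of 0 means nothing was left to apply:

def drain_flashflood_updates(prefix: str, batch_size: int = 1000) -> int:
    # Apply batches until an update pass reports no more pending events.
    total = 0
    while True:
        applied = update_flashflood(prefix, batch_size)
        total += applied
        if applied == 0:
            return total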
Example #19
    def __init__(self, timeout: float = 60, delay: float = 10) -> None:
        elasticsearch_binary = os.getenv("DSS_TEST_ES_PATH", "elasticsearch")
        tempdir = tempfile.TemporaryDirectory()

        # Set Elasticsearch's initial and max heap to 1.6 GiB, 40% of what's available on Travis, according to
        # guidance from https://www.elastic.co/guide/en/elasticsearch/reference/current/heap-size.html
        env = dict(os.environ, ES_JAVA_OPTIONS="-Xms1638m -Xmx1638m")

        # Work around https://github.com/travis-ci/travis-ci/issues/8408
        if '_JAVA_OPTIONS' in env:  # no coverage
            logger.warning(
                "_JAVA_OPTIONS is set. This may override the options just set via ES_JAVA_OPTIONS."
            )

        port = networking.unused_tcp_port()
        transport_port = networking.unused_tcp_port()

        args = [
            elasticsearch_binary, "-E", f"http.port={port}", "-E",
            f"transport.tcp.port={transport_port}", "-E",
            f"path.data={tempdir.name}", "-E", "logger.org.elasticsearch=" +
            ("info" if Config.debug_level() > 0 else "warn")
        ]
        logger.info("Running %r with environment %r", args, env)
        proc = subprocess.Popen(args, env=env)

        def check():
            status = proc.poll()
            if status is not None:
                tempdir.cleanup()
                raise ChildProcessError(
                    f"ES process died with status {status}")

        deadline = time.time() + timeout
        while True:
            check()
            time.sleep(delay)
            check()
            logger.info('Attempting to connect to ES instance at 127.0.0.1:%i',
                        port)
            try:
                sock = socket.create_connection(("127.0.0.1", port), 1)
            except (ConnectionRefusedError, socket.timeout):
                logger.debug(
                    'Failed connecting to ES instance at 127.0.0.1:%i',
                    port,
                    exc_info=True)
                if time.time() + delay > deadline:
                    proc.kill()
                    tempdir.cleanup()
                    raise
            else:
                sock.close()
                check()
                self.port = port
                self.proc = proc
                self.tempdir = tempdir
                break
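A minimal usage sketch for the constructor above; the class name ElasticsearchServer is an assumption, and the teardown simply mirrors what the constructor itself does when startup fails:

server = ElasticsearchServer()  # hypothetical name for the class owning this __init__
try:
    print(f"Elasticsearch test instance listening on 127.0.0.1:{server.port}")
    # ... run tests against the instance ...
finally:
    server.proc.kill()
    server.tempdir.cleanup()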
Example #20
 def setUp(self):
     dss.Config.set_config(dss.BucketConfig.NORMAL)
     self.gs_bucket_name = dss.Config.get_gs_bucket()
     self.s3_bucket_name = dss.Config.get_s3_bucket()
     self.logger = logging.getLogger(__name__)
     self.gs = Config.get_native_handle(Replica.gcp)
     self.gs_bucket = self.gs.bucket(self.gs_bucket_name)
     self.s3 = boto3.resource("s3")
     self.s3_bucket = self.s3.Bucket(self.s3_bucket_name)
Example #21
def validate_dst_bucket(dst_bucket: str,
                        replica: Replica) -> typing.Tuple[ValidationEnum, str]:
    if not Config.get_blobstore_handle(replica).check_bucket_exists(
            dst_bucket):
        return ValidationEnum.WRONG_DST_BUCKET, f"Bucket {dst_bucket} doesn't exist"
    if not touch_test_file(dst_bucket, replica):
        return ValidationEnum.WRONG_PERMISSIONS_DST_BUCKET, f"Insufficient permissions on bucket {dst_bucket}"

    return ValidationEnum.PASSED, None
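A hedged sketch of acting on the returned tuple; ValidationEnum.PASSED is taken from the code above, while the exception type and wrapper name are illustrative:

def ensure_dst_bucket(dst_bucket: str, replica: Replica) -> None:
    # Convert a failed validation into an exception carrying the cause message.
    code, cause = validate_dst_bucket(dst_bucket, replica)
    if code != ValidationEnum.PASSED:
        raise ValueError(f"{code.name}: {cause}")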
Example #22
 def job_finalize(self):
     super().job_finalize()
     handle = Config.get_blobstore_handle(Replica[self.replica])
     listed_keys = handle.list(self.bucket, prefix=self.prefix)
     k_listed = sum(1 for _ in listed_keys)
     assert self.work_result == k_listed, f'Integration test failed: {self.work_result} != {k_listed}'
     logger.info(
         f"Integration test passed for {self.replica} with {k_listed} key(s) listed"
     )
Example #23
 def _test_delete_event_for_bundle(self, replica, prefixes, key):
     ff = mock.MagicMock()
     with mock.patch("dss.events.Config.get_flashflood_handle", return_value=ff):
         events.delete_event_for_bundle(replica, key, prefixes)
         used_prefixes = prefixes or replica.flashflood_prefix_write
         self.assertEqual(len(used_prefixes), ff.delete_event.call_count)
         for args, pfx in zip(ff.call_args_list, used_prefixes):
             expected = ((resources.s3, Config.get_flashflood_bucket(), pfx),)
             self.assertEqual(args, expected)
Example #24
    def test_boto_timeout(self):
        Config.get_native_handle.cache_clear()
        Config.BLOBSTORE_CONNECT_TIMEOUT = 1
        Config.BLOBSTORE_READ_TIMEOUT = 2
        Config.BLOBSTORE_BOTO_RETRIES = 3

        client_config = Config.get_native_handle(Replica.aws)._client_config
        self.assertEqual(Config.BLOBSTORE_CONNECT_TIMEOUT, client_config.connect_timeout)
        self.assertEqual(Config.BLOBSTORE_READ_TIMEOUT, client_config.read_timeout)
        self.assertEqual(Config.BLOBSTORE_BOTO_RETRIES, client_config.retries['max_attempts'])
Example #25
def put_status_succeeded(execution_id: str, dst_replica: Replica,
                         dst_bucket: str, dst_location: str):
    handle = Config.get_blobstore_handle(Replica.aws)
    data = {
        "status": 'SUCCEEDED',
        "location":
        f"{dst_replica.storage_schema}://{dst_bucket}/{dst_location}"
    }
    handle.upload_file_handle(Replica.aws.checkout_bucket,
                              status_file_name(execution_id),
                              io.BytesIO(json.dumps(data).encode("utf-8")))
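A hedged read-back companion for the status object written above; it assumes status_file_name and the AWS checkout bucket are the same ones the writer used, and that handle.get returns raw bytes as in Example #11:

import json

def get_status(execution_id: str) -> dict:
    # Fetch and decode the JSON status blob written by put_status_succeeded.
    handle = Config.get_blobstore_handle(Replica.aws)
    raw = handle.get(Replica.aws.checkout_bucket, status_file_name(execution_id))
    return json.loads(raw.decode("utf-8"))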
Example #26
 def _test_record_event_for_bundle(self, replica, prefixes, metadata_document, key):
     with mock.patch("dss.events.build_bundle_metadata_document", return_value=metadata_document):
         ff = mock.MagicMock()
         ff.event_exists = mock.MagicMock(return_value=False)
         with mock.patch("dss.events.Config.get_flashflood_handle", return_value=ff):
             ret = events.record_event_for_bundle(replica, key, prefixes)
             used_prefixes = prefixes or replica.flashflood_prefix_write
             self.assertEqual(len(used_prefixes), ff.put.call_count)
             self.assertEqual(metadata_document, ret)
             for args, pfx in zip(ff.call_args_list, used_prefixes):
                 expected = ((resources.s3, Config.get_flashflood_bucket(), pfx),)
                 self.assertEqual(args, expected)
Example #27
    def _walk(self) -> None:
        executor = ThreadPoolExecutor(len(DEFAULT_BACKENDS))
        # We can't use executor as context manager because we don't want shutting it down to block
        try:
            remaining_backend_time = AdjustedRemainingTime(
                actual=self._remaining_time, offset=-self.shutdown_time)
            backend = CompositeIndexBackend(
                executor=executor,
                backends=DEFAULT_BACKENDS,
                remaining_time=remaining_backend_time,
                dryrun=self.dryrun,
                notify=self.notify)
            replica = Replica[self.replica]
            indexer_cls = Indexer.for_replica(replica)
            indexer = indexer_cls(backend, remaining_backend_time)

            handle = Config.get_blobstore_handle(replica)
            if self.bucket != replica.bucket:
                logger.warning(
                    f'Indexing bucket {self.bucket} instead of default {replica.bucket}.'
                )

            blobs: PagedIter = handle.list_v2(self.bucket,
                                              prefix=f'bundles/{self.work_id}',
                                              start_after_key=self.marker,
                                              token=self.token)

            for key in blobs:
                # Timing out while recording paging info could cause an inconsistent paging state, leading to repeats
                # of large amounts of work. This can be avoided by checking for timeouts only during actual
                # re-indexing. The indexer performs this check for every item.
                self.work_result['processed'] += 1
                try:
                    indexer.index_object(key)
                except IndexerTimeout as e:
                    self.work_result['failed'] += 1
                    logger.warning(
                        f'{self.work_id} timed out during index visitation: {e}'
                    )
                    break
                except Exception:
                    self.work_result['failed'] += 1
                    logger.warning(f'Index visitation failed for {key}',
                                   exc_info=True)
                else:
                    self.work_result['indexed'] += 1
                    self.marker = blobs.start_after_key
                    self.token = blobs.token
            else:
                self._status = WalkerStatus.finished.name
        finally:
            executor.shutdown(False)
Example #28
    def _test_file_put_cached(self, replica: Replica, scheme: str,
                              test_bucket: str, test_checkout_bucket: str,
                              uploader: Uploader):
        stored_cache_criteria = os.environ.get('CHECKOUT_CACHE_CRITERIA')
        try:
            os.environ['CHECKOUT_CACHE_CRITERIA'] = \
                '[{"type":"application/json","max_size":12314}]'
            handle = Config.get_blobstore_handle(replica)
            src_key = generate_test_key()
            src_data = b'{"status":"valid"}'
            source_url = f"{scheme}://{test_bucket}/{src_key}"
            file_uuid = str(uuid.uuid4())
            bundle_uuid = str(uuid.uuid4())
            version = datetime_to_version_format(datetime.datetime.utcnow())

            # write dummy file and upload to upload area
            with tempfile.NamedTemporaryFile(delete=True) as fh:
                fh.write(src_data)
                fh.flush()

                uploader.checksum_and_upload_file(fh.name, src_key,
                                                  "application/json")

            # upload file to DSS
            self.upload_file(source_url,
                             file_uuid,
                             bundle_uuid=bundle_uuid,
                             version=version)

            metadata = handle.get_user_metadata(test_bucket, src_key)
            dst_key = ("blobs/" + ".".join([
                metadata['hca-dss-sha256'], metadata['hca-dss-sha1'],
                metadata['hca-dss-s3_etag'], metadata['hca-dss-crc32c']
            ])).lower()

            for wait_to_upload_into_checkout_bucket in range(30):
                try:
                    # get uploaded blob key from the checkout bucket
                    file_metadata = json.loads(
                        handle.get(test_checkout_bucket,
                                   dst_key).decode("utf-8"))
                    break
                except BlobNotFoundError:
                    time.sleep(1)
            else:
                file_metadata = json.loads(
                    handle.get(test_checkout_bucket, dst_key).decode("utf-8"))
            # the file exists in the checkout bucket
            assert file_metadata["status"] == "valid"
        finally:
            os.environ['CHECKOUT_CACHE_CRITERIA'] = stored_cache_criteria
Example #29
def _list_checkout_bundle(
    replica: Replica,
    bundle_uuid: str,
    bundle_version: typing.Optional[str],
) -> typing.List[typing.Tuple[str, dict]]:
    """
    Lists the contents of a bundle in checkout.
    :param replica: Cloud replica
    :param bundle_uuid: Bundle UUID
    :param bundle_version: Bundle version
    :return: List of checkout bundle contents
    """
    handle = Config.get_blobstore_handle(replica)
    prefix = get_dst_bundle_prefix(bundle_uuid, bundle_version)
    return list(handle.list_v2(replica.checkout_bucket, prefix))
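A small consumer sketch that relies only on the List[Tuple[str, dict]] shape documented above; the helper name is illustrative:

import typing

def checkout_bundle_keys(replica: Replica,
                         bundle_uuid: str,
                         bundle_version: typing.Optional[str]) -> typing.List[str]:
    # Collect just the object keys of a checked-out bundle.
    return [key for key, _ in _list_checkout_bundle(replica, bundle_uuid, bundle_version)]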
Example #30
    def setUp(self):
        self.remaining_time = SpecificRemainingTime(10)
        Config.set_config(BucketConfig.TEST)
        self.s3_test_fixtures_bucket = get_env("DSS_S3_BUCKET_TEST_FIXTURES")
        self.gs_test_fixtures_bucket = get_env("DSS_GS_BUCKET_TEST_FIXTURES")
        self.s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
        self.gs_test_bucket = get_env("DSS_GS_BUCKET_TEST")

        class VT(Visitation):
            def walker_walk(self):
                pass

        registered_visitations.registered_visitations['VT'] = VT

        self.job_state = {
            '_visitation_class_name': 'VT',
            'work_ids': ['1', '2', '3', '4'],
            '_number_of_workers': 3,
        }

        self.walker_state = {
            '_visitation_class_name': 'VT',
            'work_ids': [['1', '2'], ['3', '4']],
        }