def setUpClass(cls):
    replica = Replica.aws
    Config.set_config(BucketConfig.TEST_FIXTURE)
    cls.blobstore = Config.get_blobstore_handle(replica)
    cls.test_fixture_bucket = replica.bucket
    Config.set_config(BucketConfig.TEST)
    cls.test_bucket = replica.bucket
def delete(uuid: str, replica: str):
    authenticated_user_email = request.token_info['email']
    es_client = ElasticsearchClient.get()
    try:
        response = es_client.get(index=Config.get_es_index_name(ESIndexType.subscriptions, Replica[replica]),
                                 doc_type=ESDocType.subscription.name,
                                 id=uuid)
    except NotFoundError:
        raise DSSException(requests.codes.not_found, "not_found", "Cannot find subscription!")
    stored_metadata = response['_source']
    if stored_metadata['owner'] != authenticated_user_email:
        # common_error_handler defaults code to capitalized 'Forbidden' for Werkzeug exception. Keeping consistent.
        raise DSSException(requests.codes.forbidden, "Forbidden", "Your credentials can't access this subscription!")
    # get all indexes that use current alias
    alias_name = Config.get_es_alias_name(ESIndexType.docs, Replica[replica])
    doc_indexes = _get_indexes_by_alias(es_client, alias_name)
    _unregister_percolate(es_client, doc_indexes, uuid)
    es_client.delete(index=Config.get_es_index_name(ESIndexType.subscriptions, Replica[replica]),
                     doc_type=ESDocType.subscription.name,
                     id=uuid)
    timestamp = datetime.datetime.utcnow()
    time_deleted = timestamp.strftime("%Y-%m-%dT%H%M%S.%fZ")
    return jsonify({'timeDeleted': time_deleted}), requests.codes.okay
def test_custom_email_claims(self):
    self.addCleanup(self.restore_email_claims, os.environ.pop('OIDC_EMAIL_CLAIM', 'EMPTY'))
    email = '*****@*****.**'
    email_claim = '*****@*****.**'
    tests = [
        ({'email': email, Config.get_OIDC_email_claim(): email_claim}, email_claim),
        ({Config.get_OIDC_email_claim(): email_claim}, email_claim),
        ({'email': email}, email),
    ]
    for param, result in tests:
        with self.subTest(f"no custom claim {param}"):
            self.assertEqual(security.get_token_email(param), result)
    os.environ['OIDC_EMAIL_CLAIM'] = 'TEST_CLAIM'
    for param, result in tests:
        with self.subTest(f"custom claim {param}"):
            self.assertEqual(security.get_token_email(param), result)
    with self.subTest("missing claim"):
        with self.assertRaises(DSSException) as ex:
            security.get_token_email({})
        self.assertEqual(ex.exception.status, 401)
        self.assertEqual(ex.exception.message, 'Authorization token is missing email claims.')
def verify_sync(argv: typing.List[str], args: argparse.Namespace):
    """
    Verify replication for a DSS entity, following references.
    For example, if a bundle key is provided, replication will be verified for the bundle
    and all referenced files and blobs.
    """
    assert args.source_replica != args.destination_replica
    src_replica = Replica[args.source_replica]
    dst_replica = Replica[args.destination_replica]
    src_handle = Config.get_blobstore_handle(src_replica)
    dst_handle = Config.get_blobstore_handle(dst_replica)
    for key in args.keys:
        if key.startswith(BUNDLE_PREFIX):
            verify = verify_bundle_replication
        elif key.startswith(FILE_PREFIX):
            verify = verify_file_replication
        elif key.startswith(BLOB_PREFIX):
            verify = verify_blob_replication
        else:
            raise ValueError(f"cannot handle key {key}")
        for anomaly in verify(src_handle, dst_handle, src_replica.bucket, dst_replica.bucket, key):
            _log_warning(ReplicationAnomaly=dict(key=anomaly.key, anomaly=anomaly.anomaly))
def from_config(cls):
    """
    Create a Notifier instance with global configuration, typically environment variables.
    """
    kwargs = dict(deployment_stage=Config.deployment_stage(),
                  delays=Config.notification_delays(),
                  num_workers=Config.notification_workers(),
                  timeout=Config.notification_timeout())
    return cls(**{k: v for k, v in kwargs.items() if v is not None})
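# A hedged usage sketch (not from the source): the `cls` parameter suggests from_config is a
# @classmethod, so a service would typically build its notifier once at startup with
#     notifier = Notifier.from_config()
# Because None-valued entries are filtered out of `kwargs` above, any configuration value
# that is unset falls back to the Notifier constructor's own default.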
def start_serving(cls):
    Config.set_config(BucketConfig.TEST)
    cls._addr, cls._port = cls.get_addr_port()
    cls.stash_oidc_group_claim()
    cls.stash_openid_provider()
    Config._set_authz_url(f"http://{cls._addr}:{cls._port}")
    logger.info(f"Mock Fusillade server listening at {cls._addr}:{cls._port}")
    cls._server = HTTPServer((cls._addr, cls._port), cls)
    cls._thread = threading.Thread(target=cls._server.serve_forever)
    cls._thread.start()
def sync_blob(source_platform, source_key, dest_platform, context):
    gs = Config.get_native_handle(Replica.gcp)
    logger.info(f"Begin transfer of {source_key} from {source_platform} to {dest_platform}")
    gs_bucket, s3_bucket = gs.bucket(Config.get_gs_bucket()), resources.s3.Bucket(Config.get_s3_bucket())
    if source_platform == "s3" and dest_platform == "gs":
        source = BlobLocation(platform=source_platform, bucket=s3_bucket, blob=s3_bucket.Object(source_key))
        dest = BlobLocation(platform=dest_platform, bucket=gs_bucket, blob=gs_bucket.blob(source_key))
    elif source_platform == "gs" and dest_platform == "s3":
        source = BlobLocation(platform=source_platform, bucket=gs_bucket, blob=gs_bucket.blob(source_key))
        dest = BlobLocation(platform=dest_platform, bucket=s3_bucket, blob=s3_bucket.Object(source_key))
    else:
        raise NotImplementedError()
    if source_platform == "s3" and dest_platform == "gs" and use_gsts:
        sync_s3_to_gcsts(gs.project, source.bucket.name, dest.bucket.name, source_key)
    elif source_platform == "s3" and dest_platform == "gs":
        if dest.blob.exists():
            logger.info(f"Key {source_key} already exists in GS")
            return
        elif source.blob.content_length < part_size["s3"]:
            sync_s3_to_gs_oneshot(source, dest)
        else:
            dispatch_multipart_sync(source, dest, context)
    elif source_platform == "gs" and dest_platform == "s3":
        try:
            dest.blob.load()
            logger.info(f"Key {source_key} already exists in S3")
            return
        except clients.s3.exceptions.ClientError as e:
            if e.response["Error"].get("Message") != "Not Found":
                raise
        source.blob.reload()
        if source.blob.size < part_size["s3"]:
            sync_gs_to_s3_oneshot(source, dest)
        else:
            dispatch_multipart_sync(source, dest, context)
    logger.info(f"Completed transfer of {source_key} from {source.bucket} to {dest.bucket}")
def assert_authorized_issuer(token: typing.Mapping[str, typing.Any]) -> None:
    """
    The token issuer must be either `Config.get_openid_provider()` or in `Config.get_trusted_google_projects()`.
    :param token: dict
    """
    issuer = token['iss']
    if issuer == Config.get_openid_provider():
        return
    service_name, _, service_domain = issuer.partition("@")
    if service_domain in Config.get_trusted_google_projects() and issuer == token['sub']:
        return
    logger.info(f"Token issuer not authorized: {issuer}")
    raise DSSForbiddenException()
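# Hedged illustration (not from the source) of the service-account path above: a Google
# service-account token with, for example, iss == sub == "deployer@my-project.iam.gserviceaccount.com"
# is accepted when "my-project.iam.gserviceaccount.com" appears in
# Config.get_trusted_google_projects(). The account and project names here are made up.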
def exists(replica: Replica, key: str):
    if replica == Replica.aws:
        try:
            resources.s3.Bucket(replica.bucket).Object(key).load()  # type: ignore
            return True
        except clients.s3.exceptions.ClientError:  # type: ignore
            return False
    elif replica == Replica.gcp:
        gs = Config.get_native_handle(Replica.gcp)
        gs_bucket = gs.bucket(Config.get_gs_bucket())
        return gs_bucket.blob(key).exists()
    else:
        raise NotImplementedError()
def get_impl(uuid: str, replica: str, version: str = None):
    uuid = uuid.lower()
    bucket = Replica[replica].bucket
    handle = Config.get_blobstore_handle(Replica[replica])
    tombstone_key = CollectionTombstoneID(uuid, version=None).to_key()
    if test_object_exists(handle, bucket, tombstone_key):
        raise DSSException(404, "not_found", "Could not find collection for UUID {}".format(uuid))
    if version is None:
        # list the collections and find the one that is the most recent.
        prefix = CollectionFQID(uuid, version=None).to_key_prefix()
        for matching_key in handle.list(bucket, prefix):
            matching_key = matching_key[len(prefix):]
            if version is None or matching_key > version:
                version = matching_key
    try:
        collection_blob = handle.get(bucket, CollectionFQID(uuid, version).to_key())
    except BlobNotFoundError:
        raise DSSException(404, "not_found", "Could not find collection for UUID {}".format(uuid))
    return json.loads(collection_blob)
def delete(uuid: str, replica: str):
    authenticated_user_email = security.get_token_email(request.token_info)
    uuid = uuid.lower()
    tombstone_key = CollectionTombstoneID(uuid, version=None).to_key()
    tombstone_object_data = dict(email=authenticated_user_email)
    owner = get_impl(uuid=uuid, replica=replica)["owner"]
    if owner != authenticated_user_email:
        raise DSSException(requests.codes.forbidden, "forbidden", "Collection access denied")
    created, idempotent = idempotent_save(Config.get_blobstore_handle(Replica[replica]),
                                          Replica[replica].bucket,
                                          tombstone_key,
                                          json.dumps(tombstone_object_data).encode("utf-8"))
    if not idempotent:
        raise DSSException(requests.codes.conflict,
                           "collection_tombstone_already_exists",
                           f"collection tombstone with UUID {uuid} already exists")
    status_code = requests.codes.ok
    response_body = dict()  # type: dict
    # update dynamoDB
    owner_lookup.delete_collection_uuid(owner=authenticated_user_email, uuid=uuid)
    return jsonify(response_body), status_code
def enumerate_available_bundles(replica: str = None,
                                prefix: typing.Optional[str] = None,
                                per_page: int = PerPageBounds.per_page_max,
                                search_after: typing.Optional[str] = None,
                                token: typing.Optional[str] = None):
    """
    :returns: dictionary with the bundles that are available, plus the context of the cloud provider's
              internal pagination mechanism.
    :rtype: dictionary
    """
    kwargs = dict(bucket=Replica[replica].bucket, prefix=prefix, k_page_max=per_page)
    if search_after:
        kwargs['start_after_key'] = search_after
    if token:
        kwargs['token'] = token
    storage_handler = Config.get_blobstore_handle(Replica[replica])
    # Note: don't wrap this in enumerate(); it loses the token.
    prefix_iterator = Living(storage_handler.list_v2(**kwargs))
    uuid_list = list()
    for fqid in prefix_iterator:
        uuid_list.append(dict(uuid=fqid.uuid, version=fqid.version))
        if len(uuid_list) >= per_page:
            break
    return dict(search_after=prefix_iterator.start_after_key,
                bundles=uuid_list,
                token=prefix_iterator.token,
                page_count=len(uuid_list))
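# A hedged usage sketch (not from the source) of the pagination contract above: feed the returned
# `search_after` and `token` values back into the next call until a short page is returned. The
# helper name, the "aws" replica, and the page size are illustrative assumptions.
def list_all_bundles(replica: str = "aws", per_page: int = 100):
    search_after, token = None, None
    while True:
        page = enumerate_available_bundles(replica=replica, per_page=per_page,
                                           search_after=search_after, token=token)
        yield from page['bundles']
        if page['page_count'] < per_page:
            break  # a short page means there is nothing left to enumerate
        search_after, token = page['search_after'], page['token']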
def _test_gs_cache(self, src_data, content_type, checkout_bucket):
    replica = Replica.gcp
    checkout_bucket = checkout_bucket if checkout_bucket else replica.checkout_bucket
    test_src_key = infra.generate_test_key()
    gs_blobstore = Config.get_blobstore_handle(Replica.gcp)
    client = storage.Client()
    # upload
    with tempfile.NamedTemporaryFile(delete=True) as fh:
        fh.write(src_data)
        fh.flush()
        fh.seek(0)
        gs_blobstore.upload_file_handle(replica.bucket, test_src_key, fh, content_type)
    # checkout
    test_dst_key = infra.generate_test_key()
    event = gscopyclient.copy_sfn_event(replica.bucket, test_src_key, checkout_bucket, test_dst_key)
    event = gscopyclient.implementation.setup_copy_task(event, None)
    spoof_context = self.SpoofContext()
    # parameters of copy_worker are arbitrary, only passed because required.
    event = gscopyclient.implementation.copy_worker(event, spoof_context)
    # verify
    for retry in [1, 1, 1]:
        try:
            bucket = client.get_bucket(checkout_bucket)
            blob_class = bucket.get_blob(test_dst_key).storage_class
        except AttributeError:
            time.sleep(retry)
        else:
            break
    # cleanup
    gs_blobstore.delete(replica.bucket, test_src_key)
    gs_blobstore.delete(checkout_bucket, test_dst_key)
    return blob_class
def test_multipart_parallel_upload(self):
    data = os.urandom(7 * 1024 * 1024)
    metadata = {'something': "foolish"}
    part_size = 5 * 1024 * 1024
    s3_client = Config.get_native_handle(Replica.aws)
    bucket = os.environ['DSS_S3_BUCKET_TEST']
    with self.subTest("copy multiple parts"):
        with io.BytesIO(data) as fh:
            multipart_parallel_upload(s3_client,
                                      bucket,
                                      "fake_key",
                                      fh,
                                      part_size=part_size,
                                      metadata=metadata,
                                      content_type="application/octet-stream")
    part_size = 14 * 1024 * 1024
    with self.subTest("should work with single part"):
        with io.BytesIO(data) as fh:
            multipart_parallel_upload(s3_client,
                                      bucket,
                                      "fake_key",
                                      fh,
                                      part_size=part_size)
def get(uuid: str, replica: str):
    owner = security.get_token_email(request.token_info)
    es_client = ElasticsearchClient.get()
    try:
        response = es_client.get(index=Config.get_es_index_name(ESIndexType.subscriptions, Replica[replica]),
                                 doc_type=ESDocType.subscription.name,
                                 id=uuid)
    except NotFoundError:
        raise DSSException(requests.codes.not_found, "not_found", "Cannot find subscription!")
    source = response['_source']
    source['uuid'] = uuid
    source['replica'] = replica
    if 'hmac_key_id' in response:
        source['hmac_key_id'] = response['hmac_key_id']
    if 'hmac_secret_key' in source:
        source.pop('hmac_secret_key')
    if source['owner'] != owner:
        # common_error_handler defaults code to capitalized 'Forbidden' for Werkzeug exception. Keeping consistent.
        raise DSSException(requests.codes.forbidden, "Forbidden", "Your credentials can't access this subscription!")
    return jsonify(source), requests.codes.okay
def _prepare_index(self, dryrun):
    shape_descriptor = self.get_shape_descriptor()
    index_name = Config.get_es_index_name(ESIndexType.docs, self.replica, shape_descriptor)
    es_client = ElasticsearchClient.get()
    if not dryrun:
        IndexManager.create_index(es_client, self.replica, index_name)
    return index_name
def refresh_percolate_queries(replica: Replica, index_name: str):
    # When dynamic templates are used and queries for percolation have been added
    # to an index before the index contains mappings of fields referenced by those queries,
    # the queries must be reloaded when the mappings are present for the queries to match.
    # See: https://github.com/elastic/elasticsearch/issues/5750
    subscription_index_name = Config.get_es_index_name(ESIndexType.subscriptions, replica)
    es_client = ElasticsearchClient.get()
    if not es_client.indices.exists(subscription_index_name):
        return
    subscription_queries = [{'_index': index_name,
                             '_type': ESDocType.query.name,
                             '_id': hit['_id'],
                             '_source': hit['_source']['es_query']}
                            for hit in scan(es_client,
                                            index=subscription_index_name,
                                            doc_type=ESDocType.subscription.name,
                                            query={'query': {'match_all': {}}})]
    if subscription_queries:
        try:
            bulk(es_client, iter(subscription_queries), refresh=True)
        except BulkIndexError as ex:
            logger.error(f"Error occurred when adding subscription queries "
                         f"to index {index_name}. Errors: {ex.errors}")
def launch_from_forwarded_event(event, context):
    executions = {}
    for event_record in event["Records"]:
        message = json.loads(json.loads(event_record["body"])["Message"])
        if message['resourceState'] == "not_exists":
            logger.info("Ignoring object deletion event")
        elif message["selfLink"].startswith("https://www.googleapis.com/storage"):
            source_replica = Replica.gcp
            source_key = message["name"]
            bucket = source_replica.bucket
            if source_key.startswith(BLOB_PREFIX) and not BLOB_KEY_REGEX.match(source_key):
                logger.info("Key %s does not match blob key format, skipping sync", source_key)
                continue
            for dest_replica in Config.get_replication_destinations(source_replica):
                if exists(dest_replica, source_key):
                    logger.info("Key %s already exists in %s, skipping sync", source_key, dest_replica)
                    continue
                exec_name = bucket + "/" + message["name"] + ":" + source_replica.name + ":" + dest_replica.name
                exec_input = dict(source_replica=source_replica.name,
                                  dest_replica=dest_replica.name,
                                  source_key=message["name"],
                                  source_obj_metadata=message)
                executions[exec_name] = app.state_machine.start_execution(**exec_input)["executionArn"]
        else:
            raise NotImplementedError()
    return executions
def launch_from_operator_queue(event, context):
    executions = {}
    for event_record in event['Records']:
        message = json.loads(event_record['body'])
        try:
            source_replica = Replica[message['source_replica']]
            dest_replica = Replica[message['dest_replica']]
            key = message['key']
            assert source_replica != dest_replica
        except (KeyError, AssertionError):
            logger.error("Inoperable operation sync message %s", message)
            continue
        bucket = source_replica.bucket
        if exists(dest_replica, key):
            logger.info("Key %s already exists in %s, skipping sync", key, dest_replica)
            continue
        try:
            size = Config.get_blobstore_handle(source_replica).get_size(bucket, key)
        except BlobNotFoundError:
            logger.error("Key %s does not exist on source replica %s", key, source_replica)
            continue
        exec_name = bucket + "/" + key + ":" + source_replica.name + ":" + dest_replica.name
        exec_input = dict(source_replica=source_replica.name,
                          dest_replica=dest_replica.name,
                          source_key=key,
                          source_obj_metadata=dict(size=size))
        executions[exec_name] = app.state_machine.start_execution(**exec_input)["executionArn"]
    return executions
def get_token_email(token_info: typing.Mapping[str, typing.Any]) -> str:
    try:
        email_claim = Config.get_OIDC_email_claim()
        return token_info.get(email_claim) or token_info['email']
    except KeyError:
        raise DSSException(401, 'Unauthorized', 'Authorization token is missing email claims.')
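# Hedged walk-through (not from the source) of the lookup order above, assuming OIDC_EMAIL_CLAIM
# resolves to the hypothetical claim name "https://example.org/email":
#   get_token_email({"https://example.org/email": "a@b.c", "email": "x@y.z"})  # -> "a@b.c"
#   get_token_email({"email": "x@y.z"})                                        # -> "x@y.z"
#   get_token_email({})  # raises DSSException(401, 'Unauthorized', ...)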
def verify_jwt(token: str) -> typing.Optional[typing.Mapping]:
    try:
        unverified_token = jwt.decode(token, verify=False)
    except jwt.DecodeError:
        logger.info(f"Failed to decode JWT: {token}", exc_info=True)
        raise DSSException(401, 'Unauthorized', 'Failed to decode token.')
    assert_authorized_issuer(unverified_token)
    issuer = unverified_token['iss']
    public_keys = get_public_keys(issuer)
    try:
        token_header = jwt.get_unverified_header(token)
        verified_tok = jwt.decode(token,
                                  key=public_keys[token_header["kid"]],
                                  issuer=issuer,
                                  audience=Config.get_audience(),
                                  algorithms=allowed_algorithms)
        logger.info("""{"valid": true, "token": %s}""", json.dumps(verified_tok))
    except jwt.PyJWTError as ex:  # type: ignore
        logger.info("""{"valid": false, "token": %s}""", json.dumps(unverified_token), exc_info=True)
        raise DSSException(401, 'Unauthorized', 'Authorization token is invalid') from ex
    return verified_tok
def do_oneshot_copy(source_replica: Replica, dest_replica: Replica, source_key: str):
    gs = Config.get_native_handle(Replica.gcp)
    if source_replica == Replica.aws and dest_replica == Replica.gcp:
        s3_bucket = resources.s3.Bucket(source_replica.bucket)  # type: ignore
        gs_bucket = gs.bucket(dest_replica.bucket)
        source = BlobLocation(platform="s3", bucket=s3_bucket, blob=s3_bucket.Object(source_key))
        dest = BlobLocation(platform="gs", bucket=gs_bucket, blob=gs_bucket.blob(source_key))
        sync_s3_to_gs_oneshot(source, dest)
    elif source_replica == Replica.gcp and dest_replica == Replica.aws:
        gs_bucket = gs.bucket(source_replica.bucket)
        s3_bucket = resources.s3.Bucket(dest_replica.bucket)  # type: ignore
        source = BlobLocation(platform="gs", bucket=gs_bucket, blob=gs_bucket.blob(source_key))
        source.blob.reload()
        dest = BlobLocation(platform="s3", bucket=s3_bucket, blob=s3_bucket.Object(source_key))
        sync_gs_to_s3_oneshot(source, dest)
    else:
        raise NotImplementedError()
def delete(uuid: str, replica: str):
    owner = security.get_token_email(request.token_info)
    es_client = ElasticsearchClient.get()
    try:
        response = es_client.get(index=Config.get_es_index_name(ESIndexType.subscriptions, Replica[replica]),
                                 doc_type=ESDocType.subscription.name,
                                 id=uuid)
    except NotFoundError:
        raise DSSException(requests.codes.not_found, "not_found", "Cannot find subscription!")
    stored_metadata = response['_source']
    if stored_metadata['owner'] != owner:
        # common_error_handler defaults code to capitalized 'Forbidden' for Werkzeug exception. Keeping consistent.
        raise DSSException(requests.codes.forbidden, "Forbidden", "Your credentials can't access this subscription!")
    _delete_subscription(es_client, uuid)
    timestamp = datetime.datetime.utcnow()
    time_deleted = timestamp.strftime("%Y-%m-%dT%H%M%S.%fZ")
    return jsonify({'timeDeleted': time_deleted}), requests.codes.okay
def index(argv: typing.List[str], args: argparse.Namespace):
    """
    Queue an SQS message to the indexer lambda for each key in object storage beginning with `bundles/{prefix}`.
    If `prefix` is omitted, send a message for each key in object storage beginning with `bundles/`.
    """
    replica = Replica[args.replica]
    handle = Config.get_blobstore_handle(replica)
    index_queue_url = get_queue_url("dss-index-operation-" + os.environ['DSS_DEPLOYMENT_STAGE'])
    if "on-change" == args.send_notifications:
        send_notifications = None
    else:
        send_notifications = ("true" == args.send_notifications)

    def _forward_keys(pfx):
        with SQSMessenger(index_queue_url) as sqsm:
            for key in handle.list(replica.bucket, pfx):
                msg = dict(replica=replica.name, key=key)
                if send_notifications is not None:
                    msg['send_notifications'] = send_notifications
                sqsm.send(json.dumps(msg))

    with ThreadPoolExecutor(max_workers=10) as e:
        futures = [e.submit(_forward_keys, f"bundles/{args.prefix}{c}")
                   for c in set(hexdigits.lower())]
        for f in as_completed(futures):
            f.result()
def _test_aws_cache(self, src_data, content_type, checkout_bucket):
    replica = Replica.aws
    checkout_bucket = checkout_bucket if checkout_bucket else replica.checkout_bucket
    test_src_key = infra.generate_test_key()
    s3_blobstore = Config.get_blobstore_handle(Replica.aws)
    # upload
    with tempfile.NamedTemporaryFile(delete=True) as fh:
        fh.write(src_data)
        fh.flush()
        fh.seek(0)
        s3_blobstore.upload_file_handle(replica.bucket, test_src_key, fh, content_type)
    # checkout
    test_dst_key = infra.generate_test_key()
    event = s3copyclient.copy_sfn_event(replica.bucket, test_src_key, checkout_bucket, test_dst_key)
    event = s3copyclient.implementation.setup_copy_task(event, None)
    spoof_context = self.SpoofContext()
    # parameters of copy_worker are arbitrary, only passed because required.
    event = s3copyclient.implementation.copy_worker(event, spoof_context, 10)
    # verify
    tagging = s3_blobstore.get_user_metadata(checkout_bucket, test_dst_key)
    # cleanup
    s3_blobstore.delete(replica.bucket, test_src_key)
    s3_blobstore.delete(checkout_bucket, test_dst_key)
    return tagging
def __init__(self, argv: typing.List[str], args: argparse.Namespace):
    self.keys = []
    self.replica = Replica[args.replica]
    self.handle = Config.get_blobstore_handle(self.replica)
    self.checkout_bucket = self.replica.checkout_bucket
    self.tombstone_cache: Dict[str, bytes] = {}
    self.tombstone_cache_max_len = 100000
def search(search_after: str = None):
    if search_after is None:
        page = es_client.search(index=Config.get_es_alias_name(ESIndexType.docs, replica),
                                doc_type=ESDocType.doc.name,
                                size=per_page,
                                body=es_query)
    else:
        es_query['search_after'] = search_after.split(',')
        page = es_client.search(index=Config.get_es_alias_name(ESIndexType.docs, replica),
                                doc_type=ESDocType.doc.name,
                                size=per_page,
                                body=es_query)
    return page
def _register_subscription(es_client: Elasticsearch, uuid: str, json_request_body: dict, replica: str):
    index_name = Config.get_es_index_name(ESIndexType.subscriptions, Replica[replica])
    return es_client.index(index=index_name,
                           doc_type=ESDocType.subscription.name,
                           id=uuid,
                           body=json_request_body,
                           refresh=True)
def __init__(self, notify_async: bool = None, *args, **kwargs) -> None:
    """
    :param notify_async: If True, enable asynchronous (and reliable) notifications. If False, disable them.
                         If None, use external configuration to determine whether to enable them.
    """
    super().__init__(*args, **kwargs)
    if notify_async is None:
        notify_async = Config.notification_is_async()
    self.notifier = Notifier.from_config() if notify_async else None
def __init__(self, timeout_seconds: float, state: dict) -> None:
    super().__init__(timeout_seconds, state)
    self.gcp_client = Config.get_native_handle(Replica.gcp)
    self.source_bucket = state[Key.SOURCE_BUCKET]
    self.source_key = state[Key.SOURCE_KEY]
    self.source_crc32c = state[_Key.SOURCE_CRC32C]
    self.destination_bucket = state[Key.DESTINATION_BUCKET]
    self.destination_key = state[Key.DESTINATION_KEY]
    self.size = state[_Key.SIZE]