def is_good_content_entry(entry):
  """True if the ContentEntry is not broken.

  A ContentEntry is broken if it is in the old format (before content
  namespaces were sharded) or the corresponding Google Storage file doesn't
  exist.
  """
  # New entries use the GS file path as their id. The path is always
  # <namespace>/<hash>.
  entry_id = entry.key.id()
  if '/' not in entry_id:
    return False
  # Content is inline, the entity doesn't have a GS file attached -> it is fine.
  if entry.content is not None:
    return True
  # Ensure the GS file exists.
  return bool(gcs.get_file_info(config.settings().gs_bucket, entry_id))
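For illustration only, a minimal sketch of how is_good_content_entry might be used in a cleanup sweep; the purge_broken_entries helper and the plain ContentEntry query are assumptions, not part of the original code.

def purge_broken_entries():
  """Hypothetical sweep that deletes entries failing the check above.

  Assumes model.ContentEntry is the ndb model backing these entries.
  """
  for entry in model.ContentEntry.query():
    if not is_good_content_entry(entry):
      logging.warning('Deleting broken ContentEntry %s', entry.key.id())
      entry.key.delete()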
def storage_helper(self, request, uploaded_to_gs):
  """Implement shared logic between store_inline and finalize_gs."""
  # Validate the token or error out.
  if not request.upload_ticket:
    raise endpoints.BadRequestException(
        'Upload ticket was empty or not provided.')
  try:
    embedded = TokenSigner.validate(
        request.upload_ticket, UPLOAD_MESSAGES[uploaded_to_gs])
  except (auth.InvalidTokenError, ValueError) as error:
    raise endpoints.BadRequestException(
        'Ticket validation failed: %s' % error.message)

  # Read data and convert types.
  digest = embedded['d'].encode('utf-8')
  is_isolated = bool(int(embedded['i']))
  namespace = embedded['n']
  size = int(embedded['s'])

  # Create a key.
  key = entry_key_or_error(namespace, digest)

  # Get content and compressed size.
  if uploaded_to_gs:
    # Ensure that the file was uploaded to GS first.
    # TODO(cmassaro): address analogous TODO from handlers_api.
    file_info = gcs.get_file_info(config.settings().gs_bucket, key.id())
    if not file_info:
      raise endpoints.BadRequestException(
          'File should be in Google Storage.\nFile: \'%s\' Size: %d.' % (
              key.id(), size))
    content = None
    compressed_size = file_info.size
  else:
    content = request.content
    compressed_size = len(content)

  # All is well; create an entry.
  entry = model.new_content_entry(
      key=key,
      is_isolated=is_isolated,
      compressed_size=compressed_size,
      expanded_size=size,
      is_verified=not uploaded_to_gs,
      content=content,
  )

  # DB: assert that embedded content is the data sent by the request.
  if not uploaded_to_gs:
    if (digest, size) != hash_content(content, namespace):
      raise endpoints.BadRequestException(
          'Embedded digest does not match provided data: '
          '(digest, size): (%r, %r); expected: %r' % (
              digest, size, hash_content(content, namespace)))
    entry.put()
  # GCS: enqueue the verification task.
  else:
    try:
      store_and_enqueue_verify_task(entry, utils.get_task_queue_host())
    except (
        datastore_errors.Error,
        runtime.apiproxy_errors.CancelledError,
        runtime.apiproxy_errors.DeadlineExceededError,
        runtime.apiproxy_errors.OverQuotaError,
        runtime.DeadlineExceededError,
        taskqueue.Error) as e:
      raise endpoints.InternalServerErrorException(
          'Unable to store the entity: %s.' % e.__class__.__name__)

  stats.add_entry(
      stats.STORE, entry.compressed_size,
      'GS; %s' % entry.key.id() if uploaded_to_gs else 'inline')
  return PushPing(ok=True)
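store_and_enqueue_verify_task is called above but not defined in this listing. A plausible sketch, modeled on the transactional put-plus-taskqueue pattern that the handle() method later in this listing uses; the real helper may differ.

def store_and_enqueue_verify_task(entry, task_queue_host):
  """Sketch: stores the entry and enqueues its verification task atomically.

  Assumption: mirrors the datastore_utils.transaction pattern with
  taskqueue.add(transactional=True) shown in handle() below.
  """
  def run():
    entry.put()
    taskqueue.add(
        url='/internal/taskqueue/verify/%s' % entry.key.id(),
        queue_name='verify',
        headers={'Host': task_queue_host},
        transactional=True)
  datastore_utils.transaction(run)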
def post(self, namespace, hash_key):
  entry = model.entry_key(namespace, hash_key).get()
  if not entry:
    logging.error('Failed to find entity')
    return
  if entry.is_verified:
    logging.warning('Was already verified')
    return
  if entry.content is not None:
    logging.error('Should not be called with inline content')
    return

  # Get GS file size.
  gs_bucket = config.settings().gs_bucket
  gs_file_info = gcs.get_file_info(gs_bucket, entry.key.id())

  # It's None if the file is missing.
  if not gs_file_info:
    # According to the docs, GS is read-after-write consistent, so a file is
    # missing only if it wasn't stored at all or it was deleted; in any case
    # it's not a valid ContentEntry.
    self.purge_entry(entry, 'No such GS file')
    return

  # Expected stored length and actual length should match.
  if gs_file_info.size != entry.compressed_size:
    self.purge_entry(
        entry,
        'Bad GS file: expected size is %d, actual size is %d',
        entry.compressed_size, gs_file_info.size)
    return

  save_to_memcache = (
      entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED and
      entry.is_isolated)
  expanded_size = 0
  digest = model.get_hash_algo(namespace)
  data = None

  try:
    # Start a loop where it reads the data in blocks.
    stream = gcs.read_file(gs_bucket, entry.key.id())
    if save_to_memcache:
      # Wraps the stream with a generator that accumulates the data.
      stream = Accumulator(stream)

    for data in model.expand_content(namespace, stream):
      expanded_size += len(data)
      digest.update(data)
      # Make sure the data is GC'ed.
      del data

    # Hashes should match.
    if digest.hexdigest() != hash_key:
      self.purge_entry(
          entry,
          'SHA-1 do not match data (%d bytes, %d bytes expanded)',
          entry.compressed_size, expanded_size)
      return
  except gcs.NotFoundError as e:
    # Somebody deleted the file between the get_file_info and read_file calls.
    self.purge_entry(entry, 'File was unexpectedly deleted')
    return
  except (gcs.ForbiddenError, gcs.AuthorizationError) as e:
    # Misconfiguration in Google Storage ACLs. Don't delete the entry, it may
    # be fine. Maybe the ACL problems will be fixed before the next retry.
    logging.warning(
        'CloudStorage auth issues (%s): %s', e.__class__.__name__, e)
    # Abort so the job is retried automatically.
    return self.abort(500)
  except (gcs.FatalError, zlib.error, IOError) as e:
    # ForbiddenError and AuthorizationError inherit FatalError, so this except
    # block should be last.
    # It's broken or unreadable.
    self.purge_entry(
        entry, 'Failed to read the file (%s): %s', e.__class__.__name__, e)
    return

  # Verified. The data matches the hash.
  entry.expanded_size = expanded_size
  entry.is_verified = True
  future = entry.put_async()
  logging.info(
      '%d bytes (%d bytes expanded) verified',
      entry.compressed_size, expanded_size)
  if save_to_memcache:
    model.save_in_memcache(namespace, hash_key, ''.join(stream.accumulated))
  future.wait()
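Accumulator is referenced above but not defined in this listing. A minimal sketch consistent with how it is used here (iterated by model.expand_content, with its chunks later joined from .accumulated); the real class may differ.

class Accumulator(object):
  """Sketch: yields chunks from the wrapped stream while keeping a copy.

  Assumption: the wrapped stream is an iterable of byte chunks and callers
  read the collected data from self.accumulated.
  """

  def __init__(self, stream):
    self.accumulated = []
    self._stream = stream

  def __iter__(self):
    for chunk in self._stream:
      self.accumulated.append(chunk)
      yield chunk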
def handle(self, namespace, hash_key):
  """Handles this request."""
  # Extract relevant request parameters.
  expiration_ts = self.request.get('x')
  item_size = self.request.get('s')
  is_isolated = self.request.get('i')
  uploaded_to_gs = self.request.get('g')
  signature = self.request.get('sig')

  # Build the correct signature.
  expected_sig = self.generate_signature(
      config.settings().global_secret, self.request.method, expiration_ts,
      namespace, hash_key, item_size, is_isolated, uploaded_to_gs)

  # Verify the signature is correct.
  if not utils.constant_time_equals(signature, expected_sig):
    return self.send_error('Incorrect signature.')

  # Convert parameters from strings back to something useful.
  # It can't fail since a matching signature means it was us who generated
  # these strings in the first place.
  expiration_ts = int(expiration_ts)
  item_size = int(item_size)
  is_isolated = bool(int(is_isolated))
  uploaded_to_gs = bool(int(uploaded_to_gs))

  # Verify the signature is not yet expired.
  if time.time() > expiration_ts:
    return self.send_error('Expired signature.')

  if uploaded_to_gs:
    # GS upload finalization uses an empty POST body.
    assert self.request.method == 'POST'
    if self.request.headers.get('content-length'):
      return self.send_error('Expecting empty POST.')
    content = None
  else:
    # Datastore upload uses PUT.
    assert self.request.method == 'PUT'
    if self.request.headers.get('content-length'):
      content = self.request.body
    else:
      content = ''

  # Info about the corresponding GS entry (if it exists).
  gs_bucket = config.settings().gs_bucket
  key = model.entry_key(namespace, hash_key)

  # Verify the data while at it since it's already in memory, but before
  # storing it in memcache and datastore.
  if content is not None:
    # Verify the advertised hash matches the data.
    try:
      hex_digest, expanded_size = hash_content(content, namespace)
      if hex_digest != hash_key:
        raise ValueError(
            'Hash and data do not match, '
            '%d bytes (%d bytes expanded)' % (len(content), expanded_size))
      if expanded_size != item_size:
        raise ValueError(
            'Advertised data length (%d) and actual data length (%d) '
            'do not match' % (item_size, expanded_size))
    except ValueError as err:
      return self.send_error('Inline verification failed.\n%s' % err)
    # Successfully verified!
    compressed_size = len(content)
    needs_verification = False
  else:
    # Fetch the size of the stored file.
    file_info = gcs.get_file_info(gs_bucket, key.id())
    if not file_info:
      # TODO(maruel): Do not fail yet. If the request got up to here, the file
      # is likely there but the service may have trouble fetching the metadata
      # from GS.
      return self.send_error(
          'File should be in Google Storage.\nFile: \'%s\' Size: %d.' %
          (key.id(), item_size))
    compressed_size = file_info.size
    needs_verification = True

  # The data is here and it's too large for DS, so put it in GS. It is likely
  # between MIN_SIZE_FOR_GS <= len(content) < MIN_SIZE_FOR_DIRECT_GS.
  if content is not None and len(content) >= MIN_SIZE_FOR_GS:
    if not gcs.write_file(gs_bucket, key.id(), [content]):
      # Return 503 so the client automatically retries.
      return self.send_error(
          'Unable to save the content to GS.', http_code=503)
    # It's now in GS.
    uploaded_to_gs = True

  # Can create the entity now, everything appears to be legit.
  entry = model.new_content_entry(
      key=key,
      is_isolated=is_isolated,
      compressed_size=compressed_size,
      expanded_size=-1 if needs_verification else item_size,
      is_verified=not needs_verification)

  # If it's not in GS then put it inline.
  if not uploaded_to_gs:
    assert content is not None and len(content) < MIN_SIZE_FOR_GS
    entry.content = content

  # Start saving *.isolated into memcache iff its content is available and
  # it's not in Datastore: there's no point in saving inline blobs in memcache
  # because ndb already memcaches them.
  memcache_store_future = None
  if (content is not None and
      entry.content is None and
      entry.is_isolated and
      entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED):
    memcache_store_future = model.save_in_memcache(
        namespace, hash_key, content, async=True)

  try:
    # If the entry was already verified above (i.e. it is a small inline
    # entry), store it right away, possibly overriding the existing entity.
    # Most of the time it is a new entry anyway (since clients try to upload
    # only new entries).
    if not needs_verification:
      entry.put()
    else:
      # For large entries (that require expensive verification) be more
      # careful and check that it is indeed a new entity. No need to do it in
      # a transaction: a race condition would only lead to a redundant
      # verification task being enqueued, no big deal.
      existing = entry.key.get()
      if existing:
        if existing.is_verified:
          logging.info('Entity exists and already verified')
        else:
          logging.info('Entity exists, but not yet verified')
      else:
        # New entity. Store it and enqueue the verification task,
        # transactionally.
        task_queue_host = utils.get_task_queue_host()
        def run():
          entry.put()
          taskqueue.add(
              url='/internal/taskqueue/verify/%s' % entry.key.id(),
              queue_name='verify',
              headers={'Host': task_queue_host},
              transactional=True)
        datastore_utils.transaction(run)

    # TODO(vadimsh): Fill in details about the entry, such as expiration time.
    self.send_json({'entry': {}})

    # Log stats.
    where = 'inline' if entry.content is not None else 'GS; %s' % entry.key.id()
    stats.add_entry(stats.STORE, entry.compressed_size, where)
  finally:
    # Do not keep dangling futures. Note that errors here are ignored,
    # memcache is just an optimization.
    if memcache_store_future:
      memcache_store_future.wait()
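generate_signature is only called in handle() above and is not shown in this listing. A hedged sketch of one way it could be built, as an HMAC over the same fields handle() passes in; it is written as a free function for brevity, and the HMAC-SHA1 choice and the '\n' field separator are assumptions, not the real implementation.

import hashlib
import hmac


def generate_signature(secret, http_verb, expiration_ts, namespace, hash_key,
                       item_size, is_isolated, uploaded_to_gs):
  """Sketch: deterministic signature over the upload URL parameters.

  Assumptions: HMAC-SHA1 and newline-joined string fields; the real method may
  use a different MAC or encoding.
  """
  msg = '\n'.join(
      str(x) for x in (http_verb, expiration_ts, namespace, hash_key,
                       item_size, is_isolated, uploaded_to_gs))
  return hmac.new(secret, msg, hashlib.sha1).hexdigest()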
def post(self, namespace, hash_key):
  original_request = self.request.get('req')
  entry = model.get_entry_key(namespace, hash_key).get()
  if not entry:
    logging.error('Failed to find entity\n%s', original_request)
    return
  if entry.is_verified:
    logging.warning('Was already verified\n%s', original_request)
    return
  if entry.content is not None:
    logging.error(
        'Should not be called with inline content\n%s', original_request)
    return

  # Get GS file size.
  gs_bucket = config.settings().gs_bucket
  gs_file_info = gcs.get_file_info(gs_bucket, entry.key.id())

  # It's None if the file is missing.
  if not gs_file_info:
    # According to the docs, GS is read-after-write consistent, so a file is
    # missing only if it wasn't stored at all or it was deleted; in any case
    # it's not a valid ContentEntry.
    self.purge_entry(entry, 'No such GS file\n%s', original_request)
    return

  # Expected stored length and actual length should match.
  if gs_file_info.size != entry.compressed_size:
    self.purge_entry(
        entry,
        'Bad GS file: expected size is %d, actual size is %d\n%s',
        entry.compressed_size, gs_file_info.size, original_request)
    return

  save_to_memcache = (
      entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED and
      entry.is_isolated)
  expanded_size = 0
  digest = hashlib.sha1()
  data = None

  try:
    # Start a loop where it reads the data in blocks.
    stream = gcs.read_file(gs_bucket, entry.key.id())
    if save_to_memcache:
      # Wraps the stream with a generator that accumulates the data.
      stream = Accumulator(stream)

    for data in model.expand_content(namespace, stream):
      expanded_size += len(data)
      digest.update(data)
      # Make sure the data is GC'ed.
      del data

    # Hashes should match.
    if digest.hexdigest() != hash_key:
      self.purge_entry(
          entry,
          'SHA-1 do not match data\n'
          '%d bytes, %d bytes expanded, expected %d bytes\n%s',
          entry.compressed_size, expanded_size, entry.expanded_size,
          original_request)
      return
  except gcs.NotFoundError as e:
    # Somebody deleted the file between the get_file_info and read_file calls.
    self.purge_entry(
        entry, 'File was unexpectedly deleted\n%s', original_request)
    return
  except (gcs.ForbiddenError, gcs.AuthorizationError) as e:
    # Misconfiguration in Google Storage ACLs. Don't delete the entry, it may
    # be fine. Maybe the ACL problems will be fixed before the next retry.
    logging.warning(
        'CloudStorage auth issues (%s): %s', e.__class__.__name__, e)
    # Abort so the job is retried automatically.
    return self.abort(500)
  except (gcs.FatalError, zlib.error, IOError) as e:
    # ForbiddenError and AuthorizationError inherit FatalError, so this except
    # block should be last.
    # It's broken or unreadable.
    self.purge_entry(
        entry, 'Failed to read the file (%s): %s\n%s',
        e.__class__.__name__, e, original_request)
    return

  # Verified. The data matches the hash.
  entry.expanded_size = expanded_size
  entry.is_verified = True
  future = entry.put_async()
  logging.info(
      '%d bytes (%d bytes expanded) verified\n%s',
      entry.compressed_size, expanded_size, original_request)
  if save_to_memcache:
    model.save_in_memcache(namespace, hash_key, ''.join(stream.accumulated))
  future.wait()
def storage_helper(request, uploaded_to_gs):
  """Implements shared logic between store_inline and finalize_gs.

  Arguments:
    request: either StorageRequest or FinalizeRequest.
    uploaded_to_gs: bool.
  """
  if not request.upload_ticket:
    raise endpoints.BadRequestException(
        'Upload ticket was empty or not provided.')
  try:
    embedded = TokenSigner.validate(
        request.upload_ticket, UPLOAD_MESSAGES[uploaded_to_gs])
  except (auth.InvalidTokenError, ValueError) as error:
    raise endpoints.BadRequestException(
        'Ticket validation failed: %s' % error.message)

  digest = embedded['d'].encode('utf-8')
  is_isolated = bool(int(embedded['i']))
  namespace = embedded['n']
  size = int(embedded['s'])
  key = entry_key_or_error(namespace, digest)

  if uploaded_to_gs:
    # Ensure that the file was uploaded to GS first.
    file_info = gcs.get_file_info(config.settings().gs_bucket, key.id())
    if not file_info:
      logging.debug('%s', digest)
      raise endpoints.BadRequestException(
          'File should be in Google Storage.\nFile: \'%s\' Size: %d.' % (
              key.id(), size))
    content = None
    compressed_size = file_info.size
  else:
    content = request.content
    compressed_size = len(content)

  # Check if the entity was already stored. Alert in that case but ignore it.
  if key.get():
    # TODO(maruel): Handle these more gracefully.
    logging.warning('Overwriting ContentEntry\n%s', digest)

  entry = model.new_content_entry(
      key=key,
      is_isolated=is_isolated,
      compressed_size=compressed_size,
      expanded_size=size,
      is_verified=not uploaded_to_gs,
      content=content,
  )

  if not uploaded_to_gs:
    # Assert that the embedded content is the data sent by the request.
    logging.debug('%s', digest)
    if (digest, size) != hash_content(content, namespace):
      raise endpoints.BadRequestException(
          'Embedded digest does not match provided data: '
          '(digest, size): (%r, %r); expected: %r' % (
              digest, size, hash_content(content, namespace)))
    entry.put()
  else:
    # Enqueue the verification task transactionally as the entity is stored.
    try:
      store_and_enqueue_verify_task(entry, utils.get_task_queue_host())
    except (
        datastore_errors.Error,
        runtime.apiproxy_errors.CancelledError,
        runtime.apiproxy_errors.DeadlineExceededError,
        runtime.apiproxy_errors.OverQuotaError,
        runtime.DeadlineExceededError,
        taskqueue.Error) as e:
      raise endpoints.InternalServerErrorException(
          'Unable to store the entity: %s.' % e.__class__.__name__)

  stats.add_entry(
      stats.STORE, entry.compressed_size,
      'GS; %s' % entry.key.id() if uploaded_to_gs else 'inline')
  return PushPing(ok=True)
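hash_content is used by storage_helper and handle() but not defined in this listing. A sketch built only from helpers that do appear here (model.expand_content and model.get_hash_algo), returning the (hex digest, expanded size) pair the callers compare against; the real helper may be implemented differently.

def hash_content(content, namespace):
  """Sketch: returns (hex digest, expanded size) for namespaced content.

  Assumption: reuses model.expand_content and model.get_hash_algo, the same
  helpers the verification handlers above rely on.
  """
  digest = model.get_hash_algo(namespace)
  expanded_size = 0
  # expand_content accepts an iterable of chunks; wrap the whole blob once.
  for chunk in model.expand_content(namespace, [content]):
    expanded_size += len(chunk)
    digest.update(chunk)
  return digest.hexdigest(), expanded_size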