def generate_push_urls(self, entry_info, namespace):
  """Generates a pair of URLs to be used by clients to upload an item.

  The GS filename is exactly ContentEntry.key.id().

  The URLs being generated are an 'upload URL' and a 'finalize URL'. The
  client uploads an item to the upload URL (via a PUT request) and then POSTs
  the status of the upload to the finalize URL.

  The finalize URL may be optional (it's None in that case).
  """
  if self.should_push_to_gs(entry_info):
    # Store larger stuff in Google Storage.
    key = model.entry_key(namespace, entry_info.digest)
    upload_url = self.gs_url_signer.get_upload_url(
        filename=key.id(),
        content_type='application/octet-stream',
        expiration=self.DEFAULT_LINK_EXPIRATION)
    finalize_url = self.generate_store_url(
        entry_info, namespace,
        http_verb='POST',
        uploaded_to_gs=True,
        expiration=self.DEFAULT_LINK_EXPIRATION)
  else:
    # Store smallish entries and *.isolated in Datastore directly.
    upload_url = self.generate_store_url(
        entry_info, namespace,
        http_verb='PUT',
        uploaded_to_gs=False,
        expiration=self.DEFAULT_LINK_EXPIRATION)
    finalize_url = None
  return upload_url, finalize_url
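# The following is a minimal client-side sketch of the two-step flow described
# in generate_push_urls() above: PUT the item body to the upload URL, then, if
# a finalize URL was returned (the Google Storage case), POST to it with an
# empty body. The helper name `push_item` and the use of the `requests`
# library are illustrative assumptions; this is not the project's actual
# client code.
import requests


def push_item(upload_url, finalize_url, compressed_data):
  # Step 1: upload the item body via PUT.
  resp = requests.put(
      upload_url,
      data=compressed_data,
      headers={'Content-Type': 'application/octet-stream'})
  resp.raise_for_status()
  # Step 2: for GS uploads, confirm completion with an empty POST so the
  # server can create the corresponding ContentEntry.
  if finalize_url:
    requests.post(finalize_url).raise_for_status()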
def post(self, namespace, timestamp):
  digests = []
  now = utils.timestamp_to_datetime(long(timestamp))
  expiration = config.settings().default_expiration
  try:
    digests = payload_to_hashes(self, namespace)
    # Requests all the entities at once.
    futures = ndb.get_multi_async(
        model.entry_key(namespace, binascii.hexlify(d)) for d in digests)

    to_save = []
    while futures:
      # Return opportunistically the first entity that can be retrieved.
      future = ndb.Future.wait_any(futures)
      futures.remove(future)
      item = future.get_result()
      if item and item.next_tag_ts < now:
        # Update the timestamp. Add a bit of pseudo randomness.
        item.expiration_ts, item.next_tag_ts = model.expiration_jitter(
            now, expiration)
        to_save.append(item)
    if to_save:
      ndb.put_multi(to_save)
    logging.info(
        'Timestamped %d entries out of %s', len(to_save), len(digests))
  except Exception as e:
    logging.error('Failed to stamp entries: %s\n%d entries', e, len(digests))
    raise
def test_finalize_gs_creates_content_entry(self):
  """Assert that finalize_gs_upload creates a content entry."""
  content = pad_string('empathy')
  request = self.store_request(content)
  embedded = validate(
      request.upload_ticket, handlers_endpoints.UPLOAD_MESSAGES[1])
  key = model.entry_key(embedded['n'], embedded['d'])

  # finalize_gs_upload should put a new ContentEntry into the database
  self.mock(gcs, 'get_file_info', get_file_info_factory(content))
  self.call_api('finalize_gs_upload', self.message_to_dict(request), 200)
  stored = key.get()
  self.assertEqual(key, stored.key)

  # assert that expected attributes are present
  self.assertEqual(None, stored.content)
  self.assertEqual(int(embedded['s']), stored.expanded_size)

  # ensure that verification occurs
  self.mock(gcs, 'read_file', lambda _bucket, _key: content)

  # add a side effect in execute_tasks()
  # TODO(cmassaro): there must be a better way than this
  def set_verified():
    stored_entry = stored.key.get()
    if not stored_entry.is_verified:
      stored_entry.is_verified = True
  self.mock_side_effect(self._taskqueue_stub, 'DeleteTask', set_verified)

  # assert that verification occurs in the taskqueue
  self.assertFalse(stored.key.get().is_verified)
  self.assertEqual(1, self.execute_tasks())
  self.assertTrue(stored.key.get().is_verified)
def gen_content(namespace='default', content='Foo'):
  h = model.get_hash_algo(namespace)
  h.update(content)
  hashhex = h.hexdigest()
  key = model.entry_key(namespace, hashhex)
  model.new_content_entry(
      key,
      is_isolated=False,
      content=content,
      compressed_size=len(content),
      expanded_size=len(content),
      is_verified=True).put()
  return hashhex
def test_check_existing_enqueues_tasks(self):
  """Assert that existent entities are enqueued."""
  collection = handlers_endpoints.DigestCollection(
      namespace=handlers_endpoints.Namespace())
  collection.items.append(
      generate_digest('some content', collection.namespace))
  key = model.entry_key(
      collection.namespace.namespace, collection.items[0].digest)

  # guarantee that one digest already exists in the datastore
  model.new_content_entry(key).put()
  self.call_api('preupload', self.message_to_dict(collection), 200)

  # find enqueued tasks
  enqueued_tasks = self.execute_tasks()
  self.assertEqual(1, enqueued_tasks)
def test_store_inline_empty_content(self):
  """Assert that inline content storage works when content is empty."""
  request = self.store_request('')
  embedded = validate(
      request.upload_ticket, handlers_endpoints.UPLOAD_MESSAGES[0])
  key = model.entry_key(embedded['n'], embedded['d'])

  # assert that store_inline puts the correct entity into the datastore
  self.call_api('store_inline', self.message_to_dict(request), 200)
  stored = key.get()
  self.assertEqual(key, stored.key)

  # assert that expected (digest, size) pair is generated by stored content
  self.assertEqual(
      (embedded['d'].encode('utf-8'), int(embedded['s'])),
      handlers_endpoints.hash_content(stored.content, embedded['n']))
def check_entry_infos(entries, namespace):
  """Generator that checks whether EntryInfo entries exist in the Datastore.

  Yields pairs (EntryInfo object, True if such entry exists in Datastore).
  """
  # Kick off all queries in parallel. Build a mapping Future -> EntryInfo.
  futures = {}
  for entry_info in entries:
    key = model.entry_key(namespace, entry_info.digest)
    futures[key.get_async(use_cache=False)] = entry_info

  # Pick the first one that finishes and yield it, rinse, repeat.
  while futures:
    future = ndb.Future.wait_any(futures)
    # TODO(maruel): For items that were present, make sure
    # future.get_result().compressed_size == entry_info.size.
    yield futures.pop(future), bool(future.get_result())
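# A hypothetical usage sketch of check_entry_infos(): collect the digests that
# are not yet present in the Datastore. It assumes `entries` is an iterable of
# EntryInfo-like objects exposing a .digest attribute, as in the generator
# above; the variable names are illustrative only.
missing_digests = [
    entry_info.digest
    for entry_info, exists in check_entry_infos(entries, namespace)
    if not exists
]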
def get(self, namespace, hash_key):  # pylint: disable=W0221
  # Parse 'Range' header if it's present to extract initial offset.
  # Only support single continuous range from some |offset| to the end.
  offset = 0
  range_header = self.request.headers.get('range')
  if range_header:
    match = re.match(r'bytes=(\d+)-', range_header)
    if not match:
      return self.send_error(
          'Unsupported byte range.\n\'%s\'.' % range_header, http_code=416)
    offset = int(match.group(1))

  memcache_entry = memcache.get(hash_key, namespace='table_%s' % namespace)
  if memcache_entry is not None:
    self.send_data(memcache_entry, filename=hash_key, offset=offset)
    stats.add_entry(stats.RETURN, len(memcache_entry) - offset, 'memcache')
    return

  entry = model.entry_key(namespace, hash_key).get()
  if not entry:
    return self.send_error('Unable to retrieve the entry.', http_code=404)

  if entry.content is not None:
    self.send_data(entry.content, filename=hash_key, offset=offset)
    stats.add_entry(stats.RETURN, len(entry.content) - offset, 'inline')
    return

  # Generate signed download URL.
  settings = config.settings()
  # TODO(maruel): The GS object may not exist anymore. Handle this.
  signer = gcs.URLSigner(
      settings.gs_bucket, settings.gs_client_id_email, settings.gs_private_key)
  # The entry key is the GS filepath.
  signed_url = signer.get_download_url(entry.key.id())

  # Redirect client to this URL. If 'Range' header is used, client will
  # correctly pass it to Google Storage to fetch only subrange of file,
  # so update stats accordingly.
  self.redirect(signed_url)
  stats.add_entry(
      stats.RETURN, entry.compressed_size - offset, 'GS; %s' % entry.key.id())
def test_trim_missing(self):
  deleted = self.mock_delete_files()

  def gen_file(i, t=0):
    return (i, gcs.cloudstorage.GCSFileStat(i, 100, 'etag', t))

  mock_files = [
      # Was touched.
      gen_file('d/' + '0' * 40),
      # Is deleted.
      gen_file('d/' + '1' * 40),
      # Too recent.
      gen_file('d/' + '2' * 40, time.time() - 60),
  ]
  self.mock(gcs, 'list_files', lambda _: mock_files)

  model.ContentEntry(key=model.entry_key('d', '0' * 40)).put()
  headers = {'X-AppEngine-Cron': 'true'}
  resp = self.app_backend.get(
      '/internal/cron/cleanup/trigger/trim_lost', headers=headers)
  self.assertEqual(200, resp.status_code)
  self.assertEqual(1, self.execute_tasks())
  self.assertEqual(['d/' + '1' * 40], deleted)
def test_check_existing_finds_existing_entities(self):
  """Assert that existence check is working."""
  collection = generate_collection(
      ['small content', 'larger content', 'biggest content'])
  key = model.entry_key(
      collection.namespace.namespace, collection.items[0].digest)

  # guarantee that one digest already exists in the datastore
  model.new_content_entry(key).put()
  response = self.call_api('preupload', self.message_to_dict(collection), 200)

  # we should see one enqueued task and two new URLs in the response
  items = response.json['items']
  self.assertEqual(2, len(items))
  self.assertEqual([1, 2], [int(item['index']) for item in items])
  for item in items:
    self.assertIsNotNone(item.get('upload_ticket'))

  # remove tasks so tearDown doesn't complain
  _ = self.execute_tasks()
def handle(self, namespace, hash_key):
  """Handles this request."""
  # Extract relevant request parameters.
  expiration_ts = self.request.get('x')
  item_size = self.request.get('s')
  is_isolated = self.request.get('i')
  uploaded_to_gs = self.request.get('g')
  signature = self.request.get('sig')

  # Build the correct signature.
  expected_sig = self.generate_signature(
      config.settings().global_secret, self.request.method, expiration_ts,
      namespace, hash_key, item_size, is_isolated, uploaded_to_gs)

  # Verify the signature is correct.
  if not utils.constant_time_equals(signature, expected_sig):
    return self.send_error('Incorrect signature.')

  # Convert parameters from strings back to something useful. It can't fail
  # since a matching signature means it was us who generated these strings in
  # the first place.
  expiration_ts = int(expiration_ts)
  item_size = int(item_size)
  is_isolated = bool(int(is_isolated))
  uploaded_to_gs = bool(int(uploaded_to_gs))

  # Verify the signature is not yet expired.
  if time.time() > expiration_ts:
    return self.send_error('Expired signature.')

  if uploaded_to_gs:
    # GS upload finalization uses an empty POST body.
    assert self.request.method == 'POST'
    if self.request.headers.get('content-length'):
      return self.send_error('Expecting empty POST.')
    content = None
  else:
    # Datastore upload uses PUT.
    assert self.request.method == 'PUT'
    if self.request.headers.get('content-length'):
      content = self.request.body
    else:
      content = ''

  # Info about the corresponding GS entry (if it exists).
  gs_bucket = config.settings().gs_bucket
  key = model.entry_key(namespace, hash_key)

  # Verify the data while at it, since it's already in memory, but before
  # storing it in memcache and datastore.
  if content is not None:
    # Verify the advertised hash matches the data.
    try:
      hex_digest, expanded_size = hash_content(content, namespace)
      if hex_digest != hash_key:
        raise ValueError(
            'Hash and data do not match, '
            '%d bytes (%d bytes expanded)' % (len(content), expanded_size))
      if expanded_size != item_size:
        raise ValueError(
            'Advertised data length (%d) and actual data length (%d) '
            'do not match' % (item_size, expanded_size))
    except ValueError as err:
      return self.send_error('Inline verification failed.\n%s' % err)
    # Successfully verified!
    compressed_size = len(content)
    needs_verification = False
  else:
    # Fetch the size of the stored file.
    file_info = gcs.get_file_info(gs_bucket, key.id())
    if not file_info:
      # TODO(maruel): Do not fail yet. If the request got up to here, the file
      # is likely there but the service may have trouble fetching the metadata
      # from GS.
      return self.send_error(
          'File should be in Google Storage.\nFile: \'%s\' Size: %d.' %
          (key.id(), item_size))
    compressed_size = file_info.size
    needs_verification = True

  # Data is here and it's too large for DS, so put it in GS. It is likely that
  # MIN_SIZE_FOR_GS <= len(content) < MIN_SIZE_FOR_DIRECT_GS.
  if content is not None and len(content) >= MIN_SIZE_FOR_GS:
    if not gcs.write_file(gs_bucket, key.id(), [content]):
      # Return 503 so the client automatically retries.
      return self.send_error(
          'Unable to save the content to GS.', http_code=503)
    # It's now in GS.
    uploaded_to_gs = True

  # Can create the entity now, everything appears to be legit.
  entry = model.new_content_entry(
      key=key,
      is_isolated=is_isolated,
      compressed_size=compressed_size,
      expanded_size=-1 if needs_verification else item_size,
      is_verified=not needs_verification)

  # If it's not in GS then put it inline.
  if not uploaded_to_gs:
    assert content is not None and len(content) < MIN_SIZE_FOR_GS
    entry.content = content

  # Start saving *.isolated into memcache iff its content is available and
  # it's not in Datastore: there's no point in saving inline blobs in memcache
  # because ndb already memcaches them.
  memcache_store_future = None
  if (content is not None and
      entry.content is None and
      entry.is_isolated and
      entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED):
    memcache_store_future = model.save_in_memcache(
        namespace, hash_key, content, async=True)

  try:
    # If the entry was already verified above (i.e. it is a small inline
    # entry), store it right away, possibly overriding an existing entity.
    # Most of the time it is a new entry anyway (since clients try to upload
    # only new entries).
    if not needs_verification:
      entry.put()
    else:
      # For large entries (that require expensive verification) be more
      # careful and check that it is indeed a new entity. No need to do it in
      # a transaction: a race condition would only lead to a redundant
      # verification task being enqueued, no big deal.
      existing = entry.key.get()
      if existing:
        if existing.is_verified:
          logging.info('Entity exists and already verified')
        else:
          logging.info('Entity exists, but not yet verified')
      else:
        # New entity. Store it and enqueue the verification task,
        # transactionally.
        task_queue_host = utils.get_task_queue_host()
        def run():
          entry.put()
          taskqueue.add(
              url='/internal/taskqueue/verify/%s' % entry.key.id(),
              queue_name='verify',
              headers={'Host': task_queue_host},
              transactional=True)
        datastore_utils.transaction(run)

    # TODO(vadimsh): Fill in details about the entry, such as expiration time.
    self.send_json({'entry': {}})

    # Log stats. Label matches the retrieval handler: 'inline' for entries
    # stored in the Datastore, 'GS; <key>' for entries stored in GS.
    where = (
        'inline' if entry.content is not None else 'GS; %s' % entry.key.id())
    stats.add_entry(stats.STORE, entry.compressed_size, where)
  finally:
    # Do not keep dangling futures. Note that an error here is ignored;
    # memcache is just an optimization.
    if memcache_store_future:
      memcache_store_future.wait()
def post(self, namespace, hash_key):
  entry = model.entry_key(namespace, hash_key).get()
  if not entry:
    logging.error('Failed to find entity')
    return
  if entry.is_verified:
    logging.warning('Was already verified')
    return
  if entry.content is not None:
    logging.error('Should not be called with inline content')
    return

  # Get the GS file size.
  gs_bucket = config.settings().gs_bucket
  gs_file_info = gcs.get_file_info(gs_bucket, entry.key.id())

  # It's None if the file is missing.
  if not gs_file_info:
    # According to the docs, GS is read-after-write consistent, so a file is
    # missing only if it wasn't stored at all or it was deleted; in either
    # case it's not a valid ContentEntry.
    self.purge_entry(entry, 'No such GS file')
    return

  # Expected stored length and actual length should match.
  if gs_file_info.size != entry.compressed_size:
    self.purge_entry(
        entry, 'Bad GS file: expected size is %d, actual size is %d',
        entry.compressed_size, gs_file_info.size)
    return

  save_to_memcache = (
      entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED and
      entry.is_isolated)
  expanded_size = 0
  digest = model.get_hash_algo(namespace)
  data = None

  try:
    # Start a loop that reads the data in blocks.
    stream = gcs.read_file(gs_bucket, entry.key.id())
    if save_to_memcache:
      # Wrap the stream with a generator that accumulates the data.
      stream = Accumulator(stream)

    for data in model.expand_content(namespace, stream):
      expanded_size += len(data)
      digest.update(data)
      # Make sure the data is GC'ed.
      del data

    # Hashes should match.
    if digest.hexdigest() != hash_key:
      self.purge_entry(
          entry, 'SHA-1 does not match data (%d bytes, %d bytes expanded)',
          entry.compressed_size, expanded_size)
      return
  except gcs.NotFoundError as e:
    # Somebody deleted the file between the get_file_info and read_file calls.
    self.purge_entry(entry, 'File was unexpectedly deleted')
    return
  except (gcs.ForbiddenError, gcs.AuthorizationError) as e:
    # Misconfiguration in Google Storage ACLs. Don't delete the entry, it may
    # be fine. Maybe the ACL problems will be fixed before the next retry.
    logging.warning(
        'CloudStorage auth issues (%s): %s', e.__class__.__name__, e)
    # Abort so the job is retried automatically.
    return self.abort(500)
  except (gcs.FatalError, zlib.error, IOError) as e:
    # ForbiddenError and AuthorizationError inherit FatalError, so this except
    # block should be last.
    # It's broken or unreadable.
    self.purge_entry(
        entry, 'Failed to read the file (%s): %s', e.__class__.__name__, e)
    return

  # Verified. Data matches the hash.
  entry.expanded_size = expanded_size
  entry.is_verified = True
  future = entry.put_async()
  logging.info(
      '%d bytes (%d bytes expanded) verified',
      entry.compressed_size, expanded_size)
  if save_to_memcache:
    model.save_in_memcache(namespace, hash_key, ''.join(stream.accumulated))
  future.wait()
def entry_key_or_error(namespace, digest):
  try:
    return model.entry_key(namespace, digest)
  except ValueError as error:
    raise endpoints.BadRequestException(error.message)