def filter_missing():
  futures = {}
  cutoff = time.time() - 60 * 60
  for filepath, filestats in gcs.list_files(gs_bucket):
    # If the file was uploaded in the last hour, ignore it.
    if filestats.st_ctime >= cutoff:
      continue

    # This must match the logic in model.entry_key(). Since this request
    # will in practice touch every item, do not use memcache since it'll
    # mess it up by loading every item into it.
    # TODO(maruel): Batch requests to use get_multi_async() similar to
    # datastore_utils.page_queries().
    future = model.entry_key_from_id(filepath).get_async(
        use_cache=False, use_memcache=False)
    futures[future] = filepath

    if len(futures) > 20:
      future = ndb.Future.wait_any(futures)
      filepath = futures.pop(future)
      if future.get_result():
        continue
      yield filepath

  while futures:
    future = ndb.Future.wait_any(futures)
    filepath = futures.pop(future)
    if future.get_result():
      continue
    yield filepath
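
# A minimal sketch of the TODO above, not part of the original code: checking
# a whole batch of GCS paths against the datastore with a single
# ndb.get_multi_async() call instead of one get_async() per file. The helper
# name and the batching are assumptions for illustration only;
# datastore_utils.page_queries() is referenced but not reproduced here.
def _yield_missing_batch(filepaths):
  """Yields the paths in |filepaths| that have no matching ContentEntry."""
  keys = [model.entry_key_from_id(f) for f in filepaths]
  futures = ndb.get_multi_async(keys, use_cache=False, use_memcache=False)
  for filepath, future in zip(filepaths, futures):
    if not future.get_result():
      yield filepath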
def post(self):
  logging.info('Deleting ContentEntry')
  incremental_delete(
      model.ContentEntry.query().iter(keys_only=True),
      ndb.delete_multi_async)

  gs_bucket = config.settings().gs_bucket
  logging.info('Deleting GS bucket %s', gs_bucket)
  incremental_delete(
      (i[0] for i in gcs.list_files(gs_bucket)),
      lambda filenames: gcs.delete_files(gs_bucket, filenames))

  logging.info('Flushing memcache')
  # High priority (.isolated files) are cached explicitly. Make sure ghosts
  # are zapped too.
  memcache.flush_all()
  logging.info('Finally done!')
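
# Hedged sketch of the incremental_delete() helper called above; its real
# implementation is not shown in this file, so the batch size and the
# future-draining logic below are illustrative assumptions, not the actual
# code. |delete| is an async-friendly callable such as ndb.delete_multi_async
# or the gcs.delete_files lambda used in post().
def _incremental_delete_sketch(items, delete, batch_size=100):
  """Calls |delete| on successive batches of |items|.

  Defensively waits on any ndb futures |delete| returns; non-future return
  values (e.g. a list of failed filenames) are ignored.
  """
  pending = []
  batch = []
  for item in items:
    batch.append(item)
    if len(batch) == batch_size:
      pending.extend(delete(batch) or [])
      batch = []
  if batch:
    pending.extend(delete(batch) or [])
  for f in pending:
    if hasattr(f, 'get_result'):
      f.get_result()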
def _yield_orphan_gcs_files(gs_bucket):
  """Iterates over the whole GCS bucket for unreferenced files.

  Finds files in GCS that are not referenced by a ContentEntry. Only returns
  files at least 1 day old to reduce the risk of failure.

  Yields:
    path of each unreferenced file in the bucket
  """
  good = 0
  orphaned = 0
  size_good = 0
  size_orphaned = 0
  # pylint: disable=too-many-nested-blocks
  try:
    futures = {}
    cutoff = time.time() - 24 * 60 * 60
    # https://cloud.google.com/appengine/docs/standard/python/googlecloudstorageclient/gcsfilestat_class
    for filepath, filestats in gcs.list_files(gs_bucket):
      # If the file was uploaded in the last day, ignore it.
      if filestats.st_ctime >= cutoff:
        continue

      # This must match the logic in model.get_entry_key(). Since this request
      # will touch every ContentEntry, do not use memcache since it'll
      # overflow it by loading every item into it.
      try:
        # TODO(maruel): Handle non-ascii files; in practice they cannot be
        # digests so they must be deleted anyway.
        key = model.entry_key_from_id(str(filepath))
      except AssertionError:
        # It's not even a valid entry.
        orphaned += 1
        size_orphaned += filestats.st_size
        yield filepath
        continue

      futures[key.get_async(use_memcache=False)] = (filepath, filestats)

      if len(futures) > 100:
        ndb.Future.wait_any(futures)
        tmp = {}
        # pylint: disable=redefined-outer-name
        for f, (filepath, filestats) in futures.items():
          if f.done():
            if not f.get_result():
              # Orphaned, delete.
              orphaned += 1
              size_orphaned += filestats.st_size
              yield filepath
            else:
              good += 1
              size_good += filestats.st_size
          else:
            tmp[f] = (filepath, filestats)
        futures = tmp

    while futures:
      ndb.Future.wait_any(futures)
      tmp = {}
      for f, (filepath, filestats) in futures.items():
        if f.done():
          if not f.get_result():
            # Orphaned, delete.
            orphaned += 1
            size_orphaned += filestats.st_size
            yield filepath
          else:
            good += 1
            size_good += filestats.st_size
        else:
          # Not done yet.
          tmp[f] = (filepath, filestats)
      futures = tmp
  finally:
    size_good_tb = size_good / 1024. / 1024. / 1024. / 1024.
    size_orphaned_gb = size_orphaned / 1024. / 1024. / 1024.
    logging.info(
        'Found:\n'
        '- %d good GCS files; %d bytes (%.1fTiB)\n'
        '- %d orphaned files; %d bytes (%.1fGiB)',
        good, size_good, size_good_tb,
        orphaned, size_orphaned, size_orphaned_gb)
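
# Hedged sketch, not from the original module: how a backend cleanup task
# might drive _yield_orphan_gcs_files() to actually delete the orphans. The
# function name and the 500-file batch size are invented for illustration;
# gcs.delete_files() and config.settings() are the same helpers used by
# post() above.
def _cleanup_orphan_gcs_sketch():
  gs_bucket = config.settings().gs_bucket
  batch = []
  for filepath in _yield_orphan_gcs_files(gs_bucket):
    batch.append(filepath)
    if len(batch) == 500:
      gcs.delete_files(gs_bucket, batch)
      batch = []
  if batch:
    gcs.delete_files(gs_bucket, batch)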