Example #1
    def filter_missing():
      futures = {}
      cutoff = time.time() - 60*60
      for filepath, filestats in gcs.list_files(gs_bucket):
        # If the file was uploaded in the last hour, ignore it.
        if filestats.st_ctime >= cutoff:
          continue

        # This must match the logic in model.entry_key(). Since this request
        # will in practice touch every item, do not use memcache since it'll
        # mess it up by loading every item into it.
        # TODO(maruel): Batch requests to use get_multi_async() similar to
        # datastore_utils.page_queries().
        future = model.entry_key_from_id(filepath).get_async(
            use_cache=False, use_memcache=False)
        futures[future] = filepath

        if len(futures) > 20:
          future = ndb.Future.wait_any(futures)
          filepath = futures.pop(future)
          if future.get_result():
            continue
          yield filepath
      while futures:
        future = ndb.Future.wait_any(futures)
        filepath = futures.pop(future)
        if future.get_result():
          continue
        yield filepath
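
The generator above only yields the GCS paths whose ContentEntry lookup came back empty; deleting them is left to the caller. A minimal sketch of such a caller, assuming it lives in the same enclosing scope as filter_missing() (so gs_bucket is available) and reusing the gcs.delete_files() call that appears in the later examples; the batch size of 50 is an arbitrary illustration value.

    def delete_missing():
      # Hypothetical consumer of filter_missing(): collect the yielded paths
      # into small batches and delete them from GCS.
      batch = []
      for filepath in filter_missing():
        batch.append(filepath)
        if len(batch) == 50:
          gcs.delete_files(gs_bucket, batch)
          batch = []
      if batch:
        gcs.delete_files(gs_bucket, batch)
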
Example #2
        def filter_missing():
            futures = {}
            cutoff = time.time() - 60 * 60
            for filepath, filestats in gcs.list_files(gs_bucket):
                # If the file was uploaded in the last hour, ignore it.
                if filestats.st_ctime >= cutoff:
                    continue

                # This must match the logic in model.get_entry_key(). Since this request
                # will in practice touch every item, do not use memcache since it'll
                # mess it up by loading every item into it.
                # TODO(maruel): Batch requests to use get_multi_async() similar to
                # datastore_utils.page_queries().
                future = model.entry_key_from_id(filepath).get_async(
                    use_cache=False, use_memcache=False)
                futures[future] = filepath

                if len(futures) > 20:
                    future = ndb.Future.wait_any(futures)
                    filepath = futures.pop(future)
                    if future.get_result():
                        continue
                    yield filepath
            while futures:
                future = ndb.Future.wait_any(futures)
                filepath = futures.pop(future)
                if future.get_result():
                    continue
                yield filepath
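
Both versions cap the number of in-flight ndb lookups at roughly 20 and call ndb.Future.wait_any() to drain one whenever the cap is exceeded. The same bounded-pending-futures pattern can be sketched with the standard library so it runs outside App Engine; every name below (filter_missing_throttled, exists, paths) is illustrative and not part of the original code.

import concurrent.futures


def filter_missing_throttled(paths, exists, max_pending=20):
    """Yields every path for which exists(path) returns a falsy value."""
    with concurrent.futures.ThreadPoolExecutor() as pool:
        pending = {}  # future -> path
        for path in paths:
            pending[pool.submit(exists, path)] = path
            if len(pending) > max_pending:
                # Wait until at least one lookup finishes, then yield the paths
                # whose lookup came back empty and forget the finished futures.
                done, _ = concurrent.futures.wait(
                    pending, return_when=concurrent.futures.FIRST_COMPLETED)
                for future in done:
                    if not future.result():
                        yield pending[future]
                    del pending[future]
        # Drain whatever is still in flight once the listing is exhausted.
        for future in concurrent.futures.as_completed(list(pending)):
            if not future.result():
                yield pending[future]
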
Example #3
    def post(self):
        logging.info('Deleting ContentEntry')
        incremental_delete(model.ContentEntry.query().iter(keys_only=True),
                           ndb.delete_multi_async)

        gs_bucket = config.settings().gs_bucket
        logging.info('Deleting GS bucket %s', gs_bucket)
        incremental_delete(
            (i[0] for i in gcs.list_files(gs_bucket)),
            lambda filenames: gcs.delete_files(gs_bucket, filenames))

        logging.info('Flushing memcache')
        # High priority (.isolated files) are cached explicitly. Make sure ghosts
        # are zapped too.
        memcache.flush_all()
        logging.info('Finally done!')
Example #4
  def post(self):
    logging.info('Deleting ContentEntry')
    incremental_delete(
        model.ContentEntry.query().iter(keys_only=True),
        ndb.delete_multi_async)

    gs_bucket = config.settings().gs_bucket
    logging.info('Deleting GS bucket %s', gs_bucket)
    incremental_delete(
        (i[0] for i in gcs.list_files(gs_bucket)),
        lambda filenames: gcs.delete_files(gs_bucket, filenames))

    logging.info('Flushing memcache')
    # High priority (.isolated files) are cached explicitly. Make sure ghosts
    # are zapped too.
    memcache.flush_all()
    logging.info('Finally done!')
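
Examples #3 and #4 depend on an incremental_delete() helper that is not shown on this page. A rough sketch of what such a helper could look like, assuming the callback either returns a list of ndb futures (as ndb.delete_multi_async does) or nothing (as the gcs.delete_files lambda does); the project's real helper and its signature may differ.

  def incremental_delete(items, delete_batch, batch_size=100):
    # Hypothetical reconstruction, not the project's actual code: feed items to
    # delete_batch() in fixed-size chunks and wait on any futures it returns.
    futures = []
    batch = []
    for item in items:
      batch.append(item)
      if len(batch) == batch_size:
        futures.extend(delete_batch(batch) or [])
        batch = []
    if batch:
      futures.extend(delete_batch(batch) or [])
    for future in futures:
      future.get_result()
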
Example #5
def _yield_orphan_gcs_files(gs_bucket):
    """Iterates over the whole GCS bucket for unreferenced files.

    Finds files in GCS that are not referred to by a ContentEntry.

    Only returns files at least 1 day old to reduce the risk of failure.

    Yields:
      Paths of unreferenced files in the bucket.
    """
    good = 0
    orphaned = 0
    size_good = 0
    size_orphaned = 0
    # pylint: disable=too-many-nested-blocks
    try:
        futures = {}
        cutoff = time.time() - 24 * 60 * 60
        # https://cloud.google.com/appengine/docs/standard/python/googlecloudstorageclient/gcsfilestat_class
        for filepath, filestats in gcs.list_files(gs_bucket):
            # If the file was uploaded in the last day, ignore it.
            if filestats.st_ctime >= cutoff:
                continue

            # This must match the logic in model.get_entry_key(). Since this request
            # will touch every ContentEntry, do not use memcache since it'll
            # overflow it by loading every item into it.
            try:
                # TODO(maruel): Handle non-ascii files, in practice they cannot be
                # digests so they must be deleted anyway.
                key = model.entry_key_from_id(str(filepath))
            except AssertionError:
                # It's not even a valid entry.
                orphaned += 1
                size_orphaned += filestats.st_size
                yield filepath
                continue

            futures[key.get_async(use_memcache=False)] = (filepath, filestats)

            if len(futures) > 100:
                ndb.Future.wait_any(futures)
                tmp = {}
                # pylint: disable=redefined-outer-name
                for f, (filepath, filestats) in futures.items():
                    if f.done():
                        if not f.get_result():
                            # Orphaned, delete.
                            orphaned += 1
                            size_orphaned += filestats.st_size
                            yield filepath
                        else:
                            good += 1
                            size_good += filestats.st_size
                    else:
                        tmp[f] = (filepath, filestats)
                futures = tmp

        while futures:
            ndb.Future.wait_any(futures)
            tmp = {}
            for f, (filepath, filestats) in futures.items():
                if f.done():
                    if not f.get_result():
                        # Orphaned, delete.
                        orphaned += 1
                        size_orphaned += filestats.st_size
                        yield filepath
                    else:
                        good += 1
                        size_good += filestats.st_size
                else:
                    # Not done yet.
                    tmp[f] = (filepath, filestats)
            futures = tmp
    finally:
        size_good_tb = size_good / 1024. / 1024. / 1024. / 1024.
        size_orphaned_gb = size_orphaned / 1024. / 1024. / 1024.
        logging.info(
            'Found:\n'
            '- %d good GCS files; %d bytes (%.1fTiB)\n'
            '- %d orphaned files; %d bytes (%.1fGiB)', good, size_good,
            size_good_tb, orphaned, size_orphaned, size_orphaned_gb)
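
Like filter_missing(), _yield_orphan_gcs_files() only yields candidate paths; it deletes nothing itself. A short sketch of how it could be wired to the deletion primitives from examples #3 and #4 (the surrounding handler or cron job is assumed, not shown):

def _delete_orphan_gcs_files(gs_bucket):
    # Hypothetical wrapper: feed the orphan paths into incremental_delete()
    # with the same gcs.delete_files() callback used in examples #3 and #4.
    incremental_delete(
        _yield_orphan_gcs_files(gs_bucket),
        lambda filepaths: gcs.delete_files(gs_bucket, filepaths))
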