Example 1
  def handle(self, namespace, hash_key):
    """Handles this request."""
    # Extract relevant request parameters.
    expiration_ts = self.request.get('x')
    item_size = self.request.get('s')
    is_isolated = self.request.get('i')
    uploaded_to_gs = self.request.get('g')
    signature = self.request.get('sig')

    # Build correct signature.
    expected_sig = self.generate_signature(
        config.settings().global_secret, self.request.method, expiration_ts,
        namespace, hash_key, item_size, is_isolated, uploaded_to_gs)

    # Verify signature is correct.
    if not utils.constant_time_equals(signature, expected_sig):
      return self.send_error('Incorrect signature.')

    # Convert parameters from strings back to something useful.
    # It can't fail since a matching signature means it was us who generated
    # these strings in the first place.
    expiration_ts = int(expiration_ts)
    item_size = int(item_size)
    is_isolated = bool(int(is_isolated))
    uploaded_to_gs = bool(int(uploaded_to_gs))

    # Verify signature is not yet expired.
    if time.time() > expiration_ts:
      return self.send_error('Expired signature.')

    if uploaded_to_gs:
      # GS upload finalization uses empty POST body.
      assert self.request.method == 'POST'
      if self.request.headers.get('content-length'):
        return self.send_error('Expecting empty POST.')
      content = None
    else:
      # Datastore upload uses PUT.
      assert self.request.method == 'PUT'
      if self.request.headers.get('content-length'):
        content = self.request.body
      else:
        content = ''

    # Info about corresponding GS entry (if it exists).
    gs_bucket = config.settings().gs_bucket
    key = model.entry_key(namespace, hash_key)

    # Verify the data while at it since it's already in memory but before
    # storing it in memcache and datastore.
    if content is not None:
      # Verify advertised hash matches the data.
      try:
        hex_digest, expanded_size = hash_content(content, namespace)
        if hex_digest != hash_key:
          raise ValueError(
              'Hash and data do not match, '
              '%d bytes (%d bytes expanded)' % (len(content), expanded_size))
        if expanded_size != item_size:
          raise ValueError(
              'Advertised data length (%d) and actual data length (%d) '
              'do not match' % (item_size, expanded_size))
      except ValueError as err:
        return self.send_error('Inline verification failed.\n%s' % err)
      # Successfully verified!
      compressed_size = len(content)
      needs_verification = False
    else:
      # Fetch size of the stored file.
      file_info = gcs.get_file_info(gs_bucket, key.id())
      if not file_info:
        # TODO(maruel): Do not fail yet. If the request got up to here, the file
        # is likely there but the service may have trouble fetching the metadata
        # from GS.
        return self.send_error(
            'File should be in Google Storage.\nFile: \'%s\' Size: %d.' %
            (key.id(), item_size))
      compressed_size = file_info.size
      needs_verification = True

    # Data is here and it's too large for the datastore, so put it in GS. It
    # is likely that MIN_SIZE_FOR_GS <= len(content) < MIN_SIZE_FOR_DIRECT_GS.
    if content is not None and len(content) >= MIN_SIZE_FOR_GS:
      if not gcs.write_file(gs_bucket, key.id(), [content]):
        # Returns 503 so the client automatically retries.
        return self.send_error(
            'Unable to save the content to GS.', http_code=503)
      # It's now in GS.
      uploaded_to_gs = True

    # Can create entity now, everything appears to be legit.
    entry = model.new_content_entry(
        key=key,
        is_isolated=is_isolated,
        compressed_size=compressed_size,
        expanded_size=-1 if needs_verification else item_size,
        is_verified=not needs_verification)

    # If it's not in GS then put it inline.
    if not uploaded_to_gs:
      assert content is not None and len(content) < MIN_SIZE_FOR_GS
      entry.content = content

    # Start saving *.isolated into memcache iff its content is available and
    # it's not in Datastore: there's no point in saving inline blobs in memcache
    # because ndb already memcaches them.
    memcache_store_future = None
    if (content is not None and
        entry.content is None and
        entry.is_isolated and
        entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED):
      memcache_store_future = model.save_in_memcache(
          namespace, hash_key, content, async=True)

    try:
      # If entry was already verified above (i.e. it is a small inline entry),
      # store it right away, possibly overriding existing entity. Most of
      # the time it is a new entry anyway (since clients try to upload only
      # new entries).
      if not needs_verification:
        entry.put()
      else:
        # For large entries (that require expensive verification) be more
        # careful and check that it is indeed a new entity. No need to do it in
        # a transaction: a race condition would merely enqueue a redundant
        # verification task, no big deal.
        existing = entry.key.get()
        if existing:
          if existing.is_verified:
            logging.info('Entity exists and already verified')
          else:
            logging.info('Entity exists, but not yet verified')
        else:
          # New entity. Store it and enqueue verification task, transactionally.
          task_queue_host = utils.get_task_queue_host()
          def run():
            entry.put()
            taskqueue.add(
                url='/internal/taskqueue/verify/%s' % entry.key.id(),
                queue_name='verify',
                headers={'Host': task_queue_host},
                transactional=True)
          datastore_utils.transaction(run)

      # TODO(vadimsh): Fill in details about the entry, such as expiration time.
      self.send_json({'entry': {}})

      # Log stats.
      where = 'inline' if entry.content is not None else 'GS; ' + entry.key.id()
      stats.add_entry(stats.STORE, entry.compressed_size, where)

    finally:
      # Do not keep dangling futures. Note that errors here are ignored since
      # memcache is just an optimization.
      if memcache_store_future:
        memcache_store_future.wait()
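
The handler above trusts its query parameters only after recomputing a signature over them with a server-side secret and comparing it in constant time. As a rough sketch of that scheme (not the project's actual generate_signature or utils.constant_time_equals, and assuming an HMAC-SHA256 construction), the two helpers could look like this:

import hashlib
import hmac


def generate_signature(secret, method, expiration_ts, namespace, hash_key,
                       item_size, is_isolated, uploaded_to_gs):
  # Illustrative only: MAC the same fields the handler recomputes above,
  # joined with an explicit separator.
  msg = '\n'.join(map(str, [
      method, expiration_ts, namespace, hash_key,
      item_size, is_isolated, uploaded_to_gs]))
  return hmac.new(secret, msg, hashlib.sha256).hexdigest()


def constant_time_equals(a, b):
  # Compare without leaking timing information about where the strings differ.
  return hmac.compare_digest(a, b)

Because the expiration timestamp is covered by the signature, the later time.time() > expiration_ts check cannot be bypassed by tampering with the 'x' parameter alone.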
Example 2
  def post(self, namespace, hash_key):
    entry = model.entry_key(namespace, hash_key).get()
    if not entry:
      logging.error('Failed to find entity')
      return
    if entry.is_verified:
      logging.warning('Was already verified')
      return
    if entry.content is not None:
      logging.error('Should not be called with inline content')
      return

    # Get GS file size.
    gs_bucket = config.settings().gs_bucket
    gs_file_info = gcs.get_file_info(gs_bucket, entry.key.id())

    # It's None if the file is missing.
    if not gs_file_info:
      # According to the docs, GS is read-after-write consistent, so a file is
      # missing only if it wasn't stored at all or it was deleted; in either
      # case it's not a valid ContentEntry.
      self.purge_entry(entry, 'No such GS file')
      return

    # Expected stored length and actual length should match.
    if gs_file_info.size != entry.compressed_size:
      self.purge_entry(entry,
          'Bad GS file: expected size is %d, actual size is %d',
          entry.compressed_size, gs_file_info.size)
      return

    save_to_memcache = (
        entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED and
        entry.is_isolated)
    expanded_size = 0
    digest = model.get_hash_algo(namespace)
    data = None

    try:
      # Start a loop that reads the data in blocks.
      stream = gcs.read_file(gs_bucket, entry.key.id())
      if save_to_memcache:
        # Wraps stream with a generator that accumulates the data.
        stream = Accumulator(stream)

      for data in model.expand_content(namespace, stream):
        expanded_size += len(data)
        digest.update(data)
        # Make sure the data is GC'ed.
        del data

      # Hashes should match.
      if digest.hexdigest() != hash_key:
        self.purge_entry(entry,
            'SHA-1 digest does not match data (%d bytes, %d bytes expanded)',
            entry.compressed_size, expanded_size)
        return

    except gcs.NotFoundError as e:
      # Somebody deleted a file between get_file_info and read_file calls.
      self.purge_entry(entry, 'File was unexpectedly deleted')
      return
    except (gcs.ForbiddenError, gcs.AuthorizationError) as e:
      # Misconfiguration in Google Storage ACLs. Don't delete an entry, it may
      # be fine. Maybe ACL problems would be fixed before the next retry.
      logging.warning(
          'CloudStorage auth issues (%s): %s', e.__class__.__name__, e)
      # Abort so the job is retried automatically.
      return self.abort(500)
    except (gcs.FatalError, zlib.error, IOError) as e:
      # ForbiddenError and AuthorizationError inherit FatalError, so this except
      # block should be last.
      # It's broken or unreadable.
      self.purge_entry(entry,
          'Failed to read the file (%s): %s', e.__class__.__name__, e)
      return

    # Verified. Data matches the hash.
    entry.expanded_size = expanded_size
    entry.is_verified = True
    future = entry.put_async()
    logging.info(
        '%d bytes (%d bytes expanded) verified',
        entry.compressed_size, expanded_size)
    if save_to_memcache:
      model.save_in_memcache(namespace, hash_key, ''.join(stream.accumulated))
    future.wait()
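
The task above wraps the Google Storage read stream in an Accumulator whenever the content is small enough to also be cached, and later joins stream.accumulated to store the bytes in memcache. A minimal sketch consistent with that usage (the real Accumulator in this codebase may differ):

class Accumulator(object):
  """Yields chunks from a wrapped iterable while keeping a copy of each one."""

  def __init__(self, source):
    self.accumulated = []
    self._source = source

  def __iter__(self):
    for chunk in self._source:
      self.accumulated.append(chunk)
      yield chunk

Note that the accumulated chunks are the compressed data as read from GS, since the wrapping happens before model.expand_content decompresses the stream.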
Example 3
    def post(self, namespace, hash_key):
        original_request = self.request.get('req')
        entry = model.get_entry_key(namespace, hash_key).get()
        if not entry:
            logging.error('Failed to find entity\n%s', original_request)
            return
        if entry.is_verified:
            logging.warning('Was already verified\n%s', original_request)
            return
        if entry.content is not None:
            logging.error('Should not be called with inline content\n%s',
                          original_request)
            return

        # Get GS file size.
        gs_bucket = config.settings().gs_bucket
        gs_file_info = gcs.get_file_info(gs_bucket, entry.key.id())

        # It's None if the file is missing.
        if not gs_file_info:
            # According to the docs, GS is read-after-write consistent, so a file is
            # missing only if it wasn't stored at all or it was deleted; in either
            # case it's not a valid ContentEntry.
            self.purge_entry(entry, 'No such GS file\n%s', original_request)
            return

        # Expected stored length and actual length should match.
        if gs_file_info.size != entry.compressed_size:
            self.purge_entry(
                entry,
                'Bad GS file: expected size is %d, actual size is %d\n%s',
                entry.compressed_size, gs_file_info.size, original_request)
            return

        save_to_memcache = (
            entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED
            and entry.is_isolated)
        expanded_size = 0
        digest = hashlib.sha1()
        data = None

        try:
            # Start a loop that reads the data in blocks.
            stream = gcs.read_file(gs_bucket, entry.key.id())
            if save_to_memcache:
                # Wraps stream with a generator that accumulates the data.
                stream = Accumulator(stream)

            for data in model.expand_content(namespace, stream):
                expanded_size += len(data)
                digest.update(data)
                # Make sure the data is GC'ed.
                del data

            # Hashes should match.
            if digest.hexdigest() != hash_key:
                self.purge_entry(
                    entry, 'SHA-1 digest does not match the data\n'
                    '%d bytes, %d bytes expanded, expected %d bytes\n%s',
                    entry.compressed_size, expanded_size, entry.expanded_size,
                    original_request)
                return

        except gcs.NotFoundError as e:
            # Somebody deleted a file between get_file_info and read_file calls.
            self.purge_entry(entry, 'File was unexpectedly deleted\n%s',
                             original_request)
            return
        except (gcs.ForbiddenError, gcs.AuthorizationError) as e:
            # Misconfiguration in Google Storage ACLs. Don't delete an entry, it may
            # be fine. Maybe ACL problems would be fixed before the next retry.
            logging.warning('CloudStorage auth issues (%s): %s',
                            e.__class__.__name__, e)
            # Abort so the job is retried automatically.
            return self.abort(500)
        except (gcs.FatalError, zlib.error, IOError) as e:
            # ForbiddenError and AuthorizationError inherit FatalError, so this except
            # block should be last.
            # It's broken or unreadable.
            self.purge_entry(entry, 'Failed to read the file (%s): %s\n%s',
                             e.__class__.__name__, e, original_request)
            return

        # Verified. Data matches the hash.
        entry.expanded_size = expanded_size
        entry.is_verified = True
        future = entry.put_async()
        logging.info('%d bytes (%d bytes expanded) verified\n%s',
                     entry.compressed_size, expanded_size, original_request)
        if save_to_memcache:
            model.save_in_memcache(namespace, hash_key,
                                   ''.join(stream.accumulated))
        future.wait()
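
All three handlers lean on model.expand_content to stream-decompress the stored bytes, so the digest and expanded size can be computed without holding the whole file in memory. A hypothetical sketch of such a generator for a zlib-compressed namespace (the suffix convention and the function body are assumptions; the real model.expand_content may handle namespaces differently):

import zlib


def expand_content(namespace, source):
    # Sketch only: assume namespaces ending in '-gzip' or '-deflate' hold
    # zlib-compressed data, which matches the zlib.error handling above.
    if namespace.endswith(('-gzip', '-deflate')):
        decompressor = zlib.decompressobj()
        for chunk in source:
            yield decompressor.decompress(chunk)
        yield decompressor.flush()
    else:
        # Uncompressed namespace: pass the chunks through untouched.
        for chunk in source:
            yield chunk

Feeding each expanded chunk straight into digest.update() and deleting it afterwards, as the verification loop does, keeps peak memory proportional to the chunk size rather than to the file size.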