示例#1
0
def is_good_content_entry(entry):
  """True if ContentEntry is not broken.

  ContentEntry is broken if it is in old format (before content namespace
  were sharded) or corresponding Google Storage file doesn't exist.
  """
  # New entries use GS file path as ids. File path is always <namespace>/<hash>.
  entry_id = entry.key.id()
  if '/' not in entry_id:
    return False
  # Content is inline, entity doesn't have GS file attached -> it is fine.
  if entry.content is not None:
    return True
  # Ensure GS file exists.
  return bool(gcs.get_file_info(config.settings().gs_bucket, entry_id))
  def storage_helper(self, request, uploaded_to_gs):
    """Implement shared logic between store_inline and finalize_gs."""
    # validate token or error out
    if not request.upload_ticket:
      raise endpoints.BadRequestException(
          'Upload ticket was empty or not provided.')
    try:
      embedded = TokenSigner.validate(
          request.upload_ticket, UPLOAD_MESSAGES[uploaded_to_gs])
    except (auth.InvalidTokenError, ValueError) as error:
      raise endpoints.BadRequestException(
          'Ticket validation failed: %s' % error.message)

    # read data and convert types
    digest = embedded['d'].encode('utf-8')
    is_isolated = bool(int(embedded['i']))
    namespace = embedded['n']
    size = int(embedded['s'])

    # create a key
    key = entry_key_or_error(namespace, digest)

    # get content and compressed size
    if uploaded_to_gs:
      # ensure that file info is uploaded to GS first
      # TODO(cmassaro): address analogous TODO from handlers_api
      file_info = gcs.get_file_info(config.settings().gs_bucket, key.id())
      if not file_info:
        raise endpoints.BadRequestException(
            'File should be in Google Storage.\nFile: \'%s\' Size: %d.' % (
                key.id(), size))
      content = None
      compressed_size = file_info.size
    else:
      content = request.content
      compressed_size = len(content)

    # all is well; create an entry
    entry = model.new_content_entry(
        key=key,
        is_isolated=is_isolated,
        compressed_size=compressed_size,
        expanded_size=size,
        is_verified=not uploaded_to_gs,
        content=content,
    )

    # DB: assert that embedded content is the data sent by the request
    if not uploaded_to_gs:
      if (digest, size) != hash_content(content, namespace):
        raise endpoints.BadRequestException(
            'Embedded digest does not match provided data: '
            '(digest, size): (%r, %r); expected: %r' % (
                digest, size, hash_content(content, namespace)))
      entry.put()

    # GCS: enqueue verification task
    else:
      try:
        store_and_enqueue_verify_task(entry, utils.get_task_queue_host())
      except (
          datastore_errors.Error,
          runtime.apiproxy_errors.CancelledError,
          runtime.apiproxy_errors.DeadlineExceededError,
          runtime.apiproxy_errors.OverQuotaError,
          runtime.DeadlineExceededError,
          taskqueue.Error) as e:
        raise endpoints.InternalServerErrorException(
            'Unable to store the entity: %s.' % e.__class__.__name__)

    stats.add_entry(
        stats.STORE, entry.compressed_size,
        'GS; %s' % entry.key.id() if uploaded_to_gs else 'inline')
    return PushPing(ok=True)
示例#3
0
  def post(self, namespace, hash_key):
    entry = model.entry_key(namespace, hash_key).get()
    if not entry:
      logging.error('Failed to find entity')
      return
    if entry.is_verified:
      logging.warning('Was already verified')
      return
    if entry.content is not None:
      logging.error('Should not be called with inline content')
      return

    # Get GS file size.
    gs_bucket = config.settings().gs_bucket
    gs_file_info = gcs.get_file_info(gs_bucket, entry.key.id())

    # It's None if file is missing.
    if not gs_file_info:
      # According to the docs, GS is read-after-write consistent, so a file is
      # missing only if it wasn't stored at all or it was deleted, in any case
      # it's not a valid ContentEntry.
      self.purge_entry(entry, 'No such GS file')
      return

    # Expected stored length and actual length should match.
    if gs_file_info.size != entry.compressed_size:
      self.purge_entry(entry,
          'Bad GS file: expected size is %d, actual size is %d',
          entry.compressed_size, gs_file_info.size)
      return

    save_to_memcache = (
        entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED and
        entry.is_isolated)
    expanded_size = 0
    digest = model.get_hash_algo(namespace)
    data = None

    try:
      # Start a loop where it reads the data in block.
      stream = gcs.read_file(gs_bucket, entry.key.id())
      if save_to_memcache:
        # Wraps stream with a generator that accumulates the data.
        stream = Accumulator(stream)

      for data in model.expand_content(namespace, stream):
        expanded_size += len(data)
        digest.update(data)
        # Make sure the data is GC'ed.
        del data

      # Hashes should match.
      if digest.hexdigest() != hash_key:
        self.purge_entry(entry,
            'SHA-1 do not match data (%d bytes, %d bytes expanded)',
            entry.compressed_size, expanded_size)
        return

    except gcs.NotFoundError as e:
      # Somebody deleted a file between get_file_info and read_file calls.
      self.purge_entry(entry, 'File was unexpectedly deleted')
      return
    except (gcs.ForbiddenError, gcs.AuthorizationError) as e:
      # Misconfiguration in Google Storage ACLs. Don't delete an entry, it may
      # be fine. Maybe ACL problems would be fixed before the next retry.
      logging.warning(
          'CloudStorage auth issues (%s): %s', e.__class__.__name__, e)
      # Abort so the job is retried automatically.
      return self.abort(500)
    except (gcs.FatalError, zlib.error, IOError) as e:
      # ForbiddenError and AuthorizationError inherit FatalError, so this except
      # block should be last.
      # It's broken or unreadable.
      self.purge_entry(entry,
          'Failed to read the file (%s): %s', e.__class__.__name__, e)
      return

    # Verified. Data matches the hash.
    entry.expanded_size = expanded_size
    entry.is_verified = True
    future = entry.put_async()
    logging.info(
        '%d bytes (%d bytes expanded) verified',
        entry.compressed_size, expanded_size)
    if save_to_memcache:
      model.save_in_memcache(namespace, hash_key, ''.join(stream.accumulated))
    future.wait()
示例#4
0
  def handle(self, namespace, hash_key):
    """Handles this request."""
    # Extract relevant request parameters.
    expiration_ts = self.request.get('x')
    item_size = self.request.get('s')
    is_isolated = self.request.get('i')
    uploaded_to_gs = self.request.get('g')
    signature = self.request.get('sig')

    # Build correct signature.
    expected_sig = self.generate_signature(
        config.settings().global_secret, self.request.method, expiration_ts,
        namespace, hash_key, item_size, is_isolated, uploaded_to_gs)

    # Verify signature is correct.
    if not utils.constant_time_equals(signature, expected_sig):
      return self.send_error('Incorrect signature.')

    # Convert parameters from strings back to something useful.
    # It can't fail since matching signature means it was us who generated
    # this strings in a first place.
    expiration_ts = int(expiration_ts)
    item_size = int(item_size)
    is_isolated = bool(int(is_isolated))
    uploaded_to_gs = bool(int(uploaded_to_gs))

    # Verify signature is not yet expired.
    if time.time() > expiration_ts:
      return self.send_error('Expired signature.')

    if uploaded_to_gs:
      # GS upload finalization uses empty POST body.
      assert self.request.method == 'POST'
      if self.request.headers.get('content-length'):
        return self.send_error('Expecting empty POST.')
      content = None
    else:
      # Datastore upload uses PUT.
      assert self.request.method == 'PUT'
      if self.request.headers.get('content-length'):
        content = self.request.body
      else:
        content = ''

    # Info about corresponding GS entry (if it exists).
    gs_bucket = config.settings().gs_bucket
    key = model.entry_key(namespace, hash_key)

    # Verify the data while at it since it's already in memory but before
    # storing it in memcache and datastore.
    if content is not None:
      # Verify advertised hash matches the data.
      try:
        hex_digest, expanded_size = hash_content(content, namespace)
        if hex_digest != hash_key:
          raise ValueError(
              'Hash and data do not match, '
              '%d bytes (%d bytes expanded)' % (len(content), expanded_size))
        if expanded_size != item_size:
          raise ValueError(
              'Advertised data length (%d) and actual data length (%d) '
              'do not match' % (item_size, expanded_size))
      except ValueError as err:
        return self.send_error('Inline verification failed.\n%s' % err)
      # Successfully verified!
      compressed_size = len(content)
      needs_verification = False
    else:
      # Fetch size of the stored file.
      file_info = gcs.get_file_info(gs_bucket, key.id())
      if not file_info:
        # TODO(maruel): Do not fail yet. If the request got up to here, the file
        # is likely there but the service may have trouble fetching the metadata
        # from GS.
        return self.send_error(
            'File should be in Google Storage.\nFile: \'%s\' Size: %d.' %
            (key.id(), item_size))
      compressed_size = file_info.size
      needs_verification = True

    # Data is here and it's too large for DS, so put it in GS. It is likely
    # between MIN_SIZE_FOR_GS <= len(content) < MIN_SIZE_FOR_DIRECT_GS
    if content is not None and len(content) >= MIN_SIZE_FOR_GS:
      if not gcs.write_file(gs_bucket, key.id(), [content]):
        # Returns 503 so the client automatically retries.
        return self.send_error(
            'Unable to save the content to GS.', http_code=503)
      # It's now in GS.
      uploaded_to_gs = True

    # Can create entity now, everything appears to be legit.
    entry = model.new_content_entry(
        key=key,
        is_isolated=is_isolated,
        compressed_size=compressed_size,
        expanded_size=-1 if needs_verification else item_size,
        is_verified = not needs_verification)

    # If it's not in GS then put it inline.
    if not uploaded_to_gs:
      assert content is not None and len(content) < MIN_SIZE_FOR_GS
      entry.content = content

    # Start saving *.isolated into memcache iff its content is available and
    # it's not in Datastore: there's no point in saving inline blobs in memcache
    # because ndb already memcaches them.
    memcache_store_future = None
    if (content is not None and
        entry.content is None and
        entry.is_isolated and
        entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED):
      memcache_store_future = model.save_in_memcache(
          namespace, hash_key, content, async=True)

    try:
      # If entry was already verified above (i.e. it is a small inline entry),
      # store it right away, possibly overriding existing entity. Most of
      # the time it is a new entry anyway (since clients try to upload only
      # new entries).
      if not needs_verification:
        entry.put()
      else:
        # For large entries (that require expensive verification) be more
        # careful and check that it is indeed a new entity. No need to do it in
        # transaction: a race condition would lead to redundant verification
        # task enqueued, no big deal.
        existing = entry.key.get()
        if existing:
          if existing.is_verified:
            logging.info('Entity exists and already verified')
          else:
            logging.info('Entity exists, but not yet verified')
        else:
          # New entity. Store it and enqueue verification task, transactionally.
          task_queue_host = utils.get_task_queue_host()
          def run():
            entry.put()
            taskqueue.add(
                url='/internal/taskqueue/verify/%s' % entry.key.id(),
                queue_name='verify',
                headers={'Host': task_queue_host},
                transactional=True)
          datastore_utils.transaction(run)

      # TODO(vadimsh): Fill in details about the entry, such as expiration time.
      self.send_json({'entry': {}})

      # Log stats.
      where = 'GS; ' + 'inline' if entry.content is not None else entry.key.id()
      stats.add_entry(stats.STORE, entry.compressed_size, where)

    finally:
      # Do not keep dangling futures. Note that error here is ignored,
      # memcache is just an optimization.
      if memcache_store_future:
        memcache_store_future.wait()
示例#5
0
    def post(self, namespace, hash_key):
        original_request = self.request.get('req')
        entry = model.get_entry_key(namespace, hash_key).get()
        if not entry:
            logging.error('Failed to find entity\n%s', original_request)
            return
        if entry.is_verified:
            logging.warning('Was already verified\n%s', original_request)
            return
        if entry.content is not None:
            logging.error('Should not be called with inline content\n%s',
                          original_request)
            return

        # Get GS file size.
        gs_bucket = config.settings().gs_bucket
        gs_file_info = gcs.get_file_info(gs_bucket, entry.key.id())

        # It's None if file is missing.
        if not gs_file_info:
            # According to the docs, GS is read-after-write consistent, so a file is
            # missing only if it wasn't stored at all or it was deleted, in any case
            # it's not a valid ContentEntry.
            self.purge_entry(entry, 'No such GS file\n%s', original_request)
            return

        # Expected stored length and actual length should match.
        if gs_file_info.size != entry.compressed_size:
            self.purge_entry(
                entry,
                'Bad GS file: expected size is %d, actual size is %d\n%s',
                entry.compressed_size, gs_file_info.size, original_request)
            return

        save_to_memcache = (
            entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED
            and entry.is_isolated)
        expanded_size = 0
        digest = hashlib.sha1()
        data = None

        try:
            # Start a loop where it reads the data in block.
            stream = gcs.read_file(gs_bucket, entry.key.id())
            if save_to_memcache:
                # Wraps stream with a generator that accumulates the data.
                stream = Accumulator(stream)

            for data in model.expand_content(namespace, stream):
                expanded_size += len(data)
                digest.update(data)
                # Make sure the data is GC'ed.
                del data

            # Hashes should match.
            if digest.hexdigest() != hash_key:
                self.purge_entry(
                    entry, 'SHA-1 do not match data\n'
                    '%d bytes, %d bytes expanded, expected %d bytes\n%s',
                    entry.compressed_size, expanded_size, entry.expanded_size,
                    original_request)
                return

        except gcs.NotFoundError as e:
            # Somebody deleted a file between get_file_info and read_file calls.
            self.purge_entry(entry, 'File was unexpectedly deleted\n%s',
                             original_request)
            return
        except (gcs.ForbiddenError, gcs.AuthorizationError) as e:
            # Misconfiguration in Google Storage ACLs. Don't delete an entry, it may
            # be fine. Maybe ACL problems would be fixed before the next retry.
            logging.warning('CloudStorage auth issues (%s): %s',
                            e.__class__.__name__, e)
            # Abort so the job is retried automatically.
            return self.abort(500)
        except (gcs.FatalError, zlib.error, IOError) as e:
            # ForbiddenError and AuthorizationError inherit FatalError, so this except
            # block should be last.
            # It's broken or unreadable.
            self.purge_entry(entry, 'Failed to read the file (%s): %s\n%s',
                             e.__class__.__name__, e, original_request)
            return

        # Verified. Data matches the hash.
        entry.expanded_size = expanded_size
        entry.is_verified = True
        future = entry.put_async()
        logging.info('%d bytes (%d bytes expanded) verified\n%s',
                     entry.compressed_size, expanded_size, original_request)
        if save_to_memcache:
            model.save_in_memcache(namespace, hash_key,
                                   ''.join(stream.accumulated))
        future.wait()
示例#6
0
    def handle(self, namespace, hash_key):
        """Handles this request."""
        # Extract relevant request parameters.
        expiration_ts = self.request.get('x')
        item_size = self.request.get('s')
        is_isolated = self.request.get('i')
        uploaded_to_gs = self.request.get('g')
        signature = self.request.get('sig')

        # Build correct signature.
        expected_sig = self.generate_signature(config.settings().global_secret,
                                               self.request.method,
                                               expiration_ts, namespace,
                                               hash_key, item_size,
                                               is_isolated, uploaded_to_gs)

        # Verify signature is correct.
        if not utils.constant_time_equals(signature, expected_sig):
            return self.send_error('Incorrect signature.')

        # Convert parameters from strings back to something useful.
        # It can't fail since matching signature means it was us who generated
        # this strings in a first place.
        expiration_ts = int(expiration_ts)
        item_size = int(item_size)
        is_isolated = bool(int(is_isolated))
        uploaded_to_gs = bool(int(uploaded_to_gs))

        # Verify signature is not yet expired.
        if time.time() > expiration_ts:
            return self.send_error('Expired signature.')

        if uploaded_to_gs:
            # GS upload finalization uses empty POST body.
            assert self.request.method == 'POST'
            if self.request.headers.get('content-length'):
                return self.send_error('Expecting empty POST.')
            content = None
        else:
            # Datastore upload uses PUT.
            assert self.request.method == 'PUT'
            if self.request.headers.get('content-length'):
                content = self.request.body
            else:
                content = ''

        # Info about corresponding GS entry (if it exists).
        gs_bucket = config.settings().gs_bucket
        key = model.entry_key(namespace, hash_key)

        # Verify the data while at it since it's already in memory but before
        # storing it in memcache and datastore.
        if content is not None:
            # Verify advertised hash matches the data.
            try:
                hex_digest, expanded_size = hash_content(content, namespace)
                if hex_digest != hash_key:
                    raise ValueError('Hash and data do not match, '
                                     '%d bytes (%d bytes expanded)' %
                                     (len(content), expanded_size))
                if expanded_size != item_size:
                    raise ValueError(
                        'Advertised data length (%d) and actual data length (%d) '
                        'do not match' % (item_size, expanded_size))
            except ValueError as err:
                return self.send_error('Inline verification failed.\n%s' % err)
            # Successfully verified!
            compressed_size = len(content)
            needs_verification = False
        else:
            # Fetch size of the stored file.
            file_info = gcs.get_file_info(gs_bucket, key.id())
            if not file_info:
                # TODO(maruel): Do not fail yet. If the request got up to here, the file
                # is likely there but the service may have trouble fetching the metadata
                # from GS.
                return self.send_error(
                    'File should be in Google Storage.\nFile: \'%s\' Size: %d.'
                    % (key.id(), item_size))
            compressed_size = file_info.size
            needs_verification = True

        # Data is here and it's too large for DS, so put it in GS. It is likely
        # between MIN_SIZE_FOR_GS <= len(content) < MIN_SIZE_FOR_DIRECT_GS
        if content is not None and len(content) >= MIN_SIZE_FOR_GS:
            if not gcs.write_file(gs_bucket, key.id(), [content]):
                # Returns 503 so the client automatically retries.
                return self.send_error('Unable to save the content to GS.',
                                       http_code=503)
            # It's now in GS.
            uploaded_to_gs = True

        # Can create entity now, everything appears to be legit.
        entry = model.new_content_entry(
            key=key,
            is_isolated=is_isolated,
            compressed_size=compressed_size,
            expanded_size=-1 if needs_verification else item_size,
            is_verified=not needs_verification)

        # If it's not in GS then put it inline.
        if not uploaded_to_gs:
            assert content is not None and len(content) < MIN_SIZE_FOR_GS
            entry.content = content

        # Start saving *.isolated into memcache iff its content is available and
        # it's not in Datastore: there's no point in saving inline blobs in memcache
        # because ndb already memcaches them.
        memcache_store_future = None
        if (content is not None and entry.content is None and entry.is_isolated
                and entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED):
            memcache_store_future = model.save_in_memcache(namespace,
                                                           hash_key,
                                                           content,
                                                           async=True)

        try:
            # If entry was already verified above (i.e. it is a small inline entry),
            # store it right away, possibly overriding existing entity. Most of
            # the time it is a new entry anyway (since clients try to upload only
            # new entries).
            if not needs_verification:
                entry.put()
            else:
                # For large entries (that require expensive verification) be more
                # careful and check that it is indeed a new entity. No need to do it in
                # transaction: a race condition would lead to redundant verification
                # task enqueued, no big deal.
                existing = entry.key.get()
                if existing:
                    if existing.is_verified:
                        logging.info('Entity exists and already verified')
                    else:
                        logging.info('Entity exists, but not yet verified')
                else:
                    # New entity. Store it and enqueue verification task, transactionally.
                    task_queue_host = utils.get_task_queue_host()

                    def run():
                        entry.put()
                        taskqueue.add(url='/internal/taskqueue/verify/%s' %
                                      entry.key.id(),
                                      queue_name='verify',
                                      headers={'Host': task_queue_host},
                                      transactional=True)

                    datastore_utils.transaction(run)

            # TODO(vadimsh): Fill in details about the entry, such as expiration time.
            self.send_json({'entry': {}})

            # Log stats.
            where = 'GS; ' + 'inline' if entry.content is not None else entry.key.id(
            )
            stats.add_entry(stats.STORE, entry.compressed_size, where)

        finally:
            # Do not keep dangling futures. Note that error here is ignored,
            # memcache is just an optimization.
            if memcache_store_future:
                memcache_store_future.wait()
  def storage_helper(request, uploaded_to_gs):
    """Implement shared logic between store_inline and finalize_gs.

    Arguments:
      request: either StorageRequest or FinalizeRequest.
      uploaded_to_gs: bool.
    """
    if not request.upload_ticket:
      raise endpoints.BadRequestException(
          'Upload ticket was empty or not provided.')
    try:
      embedded = TokenSigner.validate(
          request.upload_ticket, UPLOAD_MESSAGES[uploaded_to_gs])
    except (auth.InvalidTokenError, ValueError) as error:
      raise endpoints.BadRequestException(
          'Ticket validation failed: %s' % error.message)

    digest = embedded['d'].encode('utf-8')
    is_isolated = bool(int(embedded['i']))
    namespace = embedded['n']
    size = int(embedded['s'])
    key = entry_key_or_error(namespace, digest)

    if uploaded_to_gs:
      # Ensure that file info is uploaded to GS first.
      file_info = gcs.get_file_info(config.settings().gs_bucket, key.id())
      if not file_info:
        logging.debug('%s', digest)
        raise endpoints.BadRequestException(
            'File should be in Google Storage.\nFile: \'%s\' Size: %d.' % (
                key.id(), size))
      content = None
      compressed_size = file_info.size
    else:
      content = request.content
      compressed_size = len(content)

    # Look if the entity was already stored. Alert in that case but ignore it.
    if key.get():
      # TODO(maruel): Handle these more gracefully.
      logging.warning('Overwritting ContentEntry\n%s', digest)

    entry = model.new_content_entry(
        key=key,
        is_isolated=is_isolated,
        compressed_size=compressed_size,
        expanded_size=size,
        is_verified=not uploaded_to_gs,
        content=content,
    )

    if not uploaded_to_gs:
      # Assert that embedded content is the data sent by the request.
      logging.debug('%s', digest)
      if (digest, size) != hash_content(content, namespace):
        raise endpoints.BadRequestException(
            'Embedded digest does not match provided data: '
            '(digest, size): (%r, %r); expected: %r' % (
                digest, size, hash_content(content, namespace)))
      entry.put()
    else:
      # Enqueue verification task transactionally as the entity is stored.
      try:
        store_and_enqueue_verify_task(entry, utils.get_task_queue_host())
      except (
          datastore_errors.Error,
          runtime.apiproxy_errors.CancelledError,
          runtime.apiproxy_errors.DeadlineExceededError,
          runtime.apiproxy_errors.OverQuotaError,
          runtime.DeadlineExceededError,
          taskqueue.Error) as e:
        raise endpoints.InternalServerErrorException(
            'Unable to store the entity: %s.' % e.__class__.__name__)

    stats.add_entry(
        stats.STORE, entry.compressed_size,
        'GS; %s' % entry.key.id() if uploaded_to_gs else 'inline')
    return PushPing(ok=True)