Пример #1
0
    def _split_input_from_namespace(cls, app, namespace, entity_kind_name,
                                    shard_count):
        """Return KeyRange objects. Helper for _split_input_from_params.

        Split points are taken from a keys-only query ordered by
        ``__scatter__``. The sampled keys are sorted and, when there are more
        of them than needed, thinned out to ``shard_count - 1`` points.

        Args:
          app: application id; passed as ``_app`` to the query and to every
            KeyRange that is produced.
          namespace: namespace to query and to set on every KeyRange.
          entity_kind_name: entity kind name; shortened with
            util.get_short_name before querying.
          shard_count: requested number of shards.

        Returns:
          A list of key_range.KeyRange objects: a single unbounded range when
          shard_count is 1 or the query returns no keys, otherwise one more
          adjacent ascending range than there are split points.
        """

        raw_entity_kind = util.get_short_name(entity_kind_name)

        if shard_count == 1:
            # With one shard we don't need to calculate any splitpoints at all.
            return [key_range.KeyRange(namespace=namespace, _app=app)]

        # we use datastore.Query instead of ext.db.Query here, because we can't
        # erase ordering on db.Query once we set it.
        ds_query = datastore.Query(kind=raw_entity_kind,
                                   namespace=namespace,
                                   _app=app,
                                   keys_only=True)
        ds_query.Order("__scatter__")
        random_keys = ds_query.Get(shard_count * cls._OVERSAMPLING_FACTOR)
        if not random_keys:
            # This might mean that there are no entities with scatter property
            # or there are no entities at all.
            return [key_range.KeyRange(namespace=namespace, _app=app)]
        random_keys.sort()

        # pick shard_count - 1 points to generate shard_count splits
        split_points_count = shard_count - 1
        if len(random_keys) > split_points_count:
            # downsample; floor division keeps the list index an integer
            # regardless of the interpreter's "/" semantics.
            random_keys = [
                random_keys[len(random_keys) * i // split_points_count]
                for i in range(split_points_count)
            ]

        def make_range(start, end, include_start):
            # Every split range is ascending, excludes its end key, and must
            # carry the same namespace and app as the query it was derived
            # from (the early-return ranges above already do).
            return key_range.KeyRange(key_start=start,
                                      key_end=end,
                                      direction=key_range.KeyRange.ASC,
                                      include_start=include_start,
                                      include_end=False,
                                      namespace=namespace,
                                      _app=app)

        # Open-ended range below the first split point.
        key_ranges = [make_range(None, random_keys[0], False)]

        # One range between each pair of neighbouring split points.
        for lower, upper in zip(random_keys, random_keys[1:]):
            key_ranges.append(make_range(lower, upper, True))

        # Open-ended range from the last split point upwards.
        key_ranges.append(make_range(random_keys[-1], None, True))

        return key_ranges
Пример #2
0
    def testCursors(self):
        """Reads entities ten at a time, resuming from serialized state.

        After each full batch of ten the iterator is serialized with
        to_json() and rebuilt with from_json(); the loop stops at the first
        batch that yields fewer than ten entities.
        """
        query_spec = model.QuerySpec(TestModel, model_class_path=ENTITY_KIND)
        full_range = key_range.KeyRange(key_start=key(1),
                                        key_end=key(10000),
                                        direction="ASC")

        # Serialized iterator state; replaced after every full batch.
        state = {
            'key_range': full_range.to_json(),
            'query_spec': query_spec.to_json(),
            'cursor': None
        }

        collected = []
        resume = True
        while resume:
            iterator = DjangoModelIterator.from_json(state)

            resume = False
            taken = 0
            for entity in iterator:
                taken += 1
                collected.append(entity)
                if taken == 10:
                    # Batch is full: stop here and continue from saved state.
                    resume = True
                    break

            if resume:
                state = iterator.to_json()

        self.assertEquals(100, len(collected))
        self.assertEquals(self.expected_entities, collected)
Пример #3
0
  def _iter_ns_range(self):
    """Yields values for each namespace covered by self._ns_range.

    Per-namespace iteration is delegated to self._iter_key_range().
    self._current_key_range records the position inside the namespace being
    processed, and self._ns_range is narrowed once a namespace is finished.
    """
    while True:
      if self._current_key_range is None:
        # Fetch at most one namespace from what is left of the range.
        matches = self._ns_range.make_datastore_query().Get(1)
        if not matches:
          return

        namespace = matches[0].name() or ""
        self._current_key_range = key_range.KeyRange(
            namespace=namespace, _app=self._ns_range.app)

      # _iter_key_range gets its own copy, so advancing the tracked range
      # below does not touch the object it is iterating with.
      working_copy = copy.deepcopy(self._current_key_range)
      for key, o in self._iter_key_range(working_copy):
        # The caller must consume yielded values so advancing the KeyRange
        # before yielding is safe.
        self._current_key_range.advance(key)
        yield o

      finished = (
          self._ns_range.is_single_namespace or
          self._current_key_range.namespace == self._ns_range.namespace_end)
      if finished:
        return

      # Continue with the namespaces after the one just completed.
      self._ns_range = self._ns_range.with_start_after(
          self._current_key_range.namespace)
      self._current_key_range = None
Пример #4
0
    def __iter__(self):
        """Generate the entities or keys contained in the key range.

        Results are fetched in batches of self.batch_size. Before each entry
        is yielded, self._key_range is replaced by a range that starts just
        after that entry's key, so consumed entries are never fetched again.

        Yields:
          next entry.
        """
        while True:
            batch = self._key_range.make_ascending_query(
                util.for_name(self._entity_kind),
                self._keys_only).fetch(limit=self.batch_size)

            if not batch:
                break

            for item in batch:
                # Items exposing key() are asked for their key; anything else
                # is taken to be the key itself.
                item_key = item.key() if hasattr(item, 'key') else item

                previous = self._key_range
                # Same end, direction and end-inclusiveness; the start moves
                # to the consumed key and becomes exclusive.
                self._key_range = key_range.KeyRange(item_key,
                                                     previous.key_end,
                                                     previous.direction,
                                                     False,
                                                     previous.include_end)
                yield item
Пример #5
0
    def _split_input_from_namespace(cls, app, namespace, entity_kind_name,
                                    shard_count):
        """Return KeyRange objects over 'hash_<40 hex digits>' key names.

        A range over numeric ids from 0 to forty hex 'f' digits is halved
        floor(log2(shard_count)) times. Each resulting boundary id is then
        rewritten as a key name of the form 'hash_%040x'.

        Args:
          app: application id passed to the initial KeyRange as ``_app``.
          namespace: namespace passed to the initial KeyRange.
          entity_kind_name: name resolved with util.for_name; the resolved
            object's kind() is used in the keys.
          shard_count: requested number of shards.

        Returns:
          A list of key_range.KeyRange objects with hash-named boundary keys.
        """
        kind_name = util.for_name(entity_kind_name).kind()

        # Numeric-id boundaries of the whole space.
        lowest = db.Key.from_path(kind_name, 0)
        highest = db.Key.from_path(kind_name, int('f' * 40, base=16))

        pieces = [
            key_range.KeyRange(lowest,
                               highest,
                               None,
                               True,
                               True,
                               namespace=namespace,
                               _app=app)
        ]

        # Each pass splits every range once.
        halvings = int(math.floor(math.log(shard_count, 2)))
        for _ in range(halvings):
            halved = []
            for piece in pieces:
                halved += piece.split_range(1)
            pieces = halved

        def as_hash_key(numeric_key, owner_app):
            # A missing id is treated as 0.
            return db.Key.from_path(numeric_key.kind(),
                                    'hash_%040x' % (numeric_key.id() or 0),
                                    _app=owner_app)

        hashed = []
        for piece in pieces:
            hashed.append(
                key_range.KeyRange(
                    key_start=as_hash_key(piece.key_start, piece._app),
                    key_end=as_hash_key(piece.key_end, piece._app),
                    direction=piece.direction,
                    include_start=piece.include_start,
                    include_end=piece.include_end,
                    namespace=piece.namespace,
                    _app=piece._app))
        return hashed
Пример #6
0
  def testTwoShards(self):
    """Tests two shards: the hash key space is split once at 7fff...."""
    reader_cls = offline_jobs.HashKeyDatastoreInputReader
    result = reader_cls._split_input_from_namespace(
        self.app, self.namespace, self.entity_kind, 2)

    def hash_key(name):
      return db.Key.from_path('Subscription', name, _app=u'my-app-id')

    def hash_range(start_name, end_name, direction, include_start):
      # All expected ranges include their end key and share namespace/app.
      return key_range.KeyRange(
          key_start=hash_key(start_name),
          key_end=hash_key(end_name),
          direction=direction,
          include_start=include_start,
          include_end=True,
          namespace='my-namespace',
          _app='my-app-id')

    expected = [
      hash_range(u'hash_0000000000000000000000000000000000000000',
                 u'hash_7fffffffffffffffffffffffffffffffffffffff',
                 'DESC', True),
      hash_range(u'hash_7fffffffffffffffffffffffffffffffffffffff',
                 u'hash_ffffffffffffffffffffffffffffffffffffffff',
                 'ASC', False),
    ]
    self.assertEquals(expected, result)
Пример #7
0
    def _split_input_from_params(cls, app, namespaces, entity_kind_name,
                                 params, shard_count):
        """Split input like the parent class, but never return no readers.

        Args:
          app: application id, forwarded to the parent and used as ``_app``
            for the fallback key ranges.
          namespaces: namespaces to cover.
          entity_kind_name: entity kind name for the readers.
          params: reader parameters, forwarded to the parent.
          shard_count: requested number of shards, forwarded to the parent.

        Returns:
          The parent's readers when there are any; otherwise a list holding
          one reader with an unbounded KeyRange per namespace.
        """
        parent = super(ConsistentKeyReader, cls)
        readers = parent._split_input_from_params(app, namespaces,
                                                  entity_kind_name, params,
                                                  shard_count)
        if readers:
            return readers

        # We always produce at least one key range because:
        # a) there might be unapplied entities
        # b) it simplifies mapper code
        fallback_ranges = []
        for namespace in namespaces:
            fallback_ranges.append(
                key_range.KeyRange(namespace=namespace, _app=app))
        return [cls(entity_kind_name, fallback_ranges)]
Пример #8
0
  def testOneShard(self):
    """Tests just one shard: a single range spanning the whole hash space."""
    reader_cls = offline_jobs.HashKeyDatastoreInputReader
    result = reader_cls._split_input_from_namespace(
        self.app, self.namespace, self.entity_kind, 1)

    lowest_key = db.Key.from_path(
        'Subscription',
        u'hash_0000000000000000000000000000000000000000',
        _app=u'my-app-id')
    highest_key = db.Key.from_path(
        'Subscription',
        u'hash_ffffffffffffffffffffffffffffffffffffffff',
        _app=u'my-app-id')

    whole_space = key_range.KeyRange(
        key_start=lowest_key,
        key_end=highest_key,
        direction='ASC',
        include_start=True,
        include_end=True,
        namespace='my-namespace',
        _app='my-app-id')

    self.assertEquals([whole_space], result)
Пример #9
0
    def _split_input_from_namespace(cls, app, namespace, entity_kind_name,
                                    shard_count):
        """Return KeyRange objects. Helper for _split_input_from_params.

        Finds the first and last key of the kind (by ``__key__`` order) in the
        given namespace so the key space between them can be split.

        NOTE(review): the code visible here ends right after last_entity_key
        is determined; nothing is built from first_entity_key/last_entity_key
        or returned on that path. Presumably this excerpt is truncated --
        confirm against the full source.

        Args:
          app: application id, passed to the datastore query as ``_app``.
          namespace: namespace to query.
          entity_kind_name: entity kind name; shortened with
            util.get_short_name before querying.
          shard_count: requested number of shards.

        Returns:
          A single unbounded KeyRange in a list when shard_count is 1; an
          empty list when no entity of the kind can be retrieved.
        """

        raw_entity_kind = util.get_short_name(entity_kind_name)

        if shard_count == 1:
            # With one shard we don't need to calculate any splitpoints at all.
            return [key_range.KeyRange(namespace=namespace, _app=app)]

        # we use datastore.Query instead of ext.db.Query here, because we can't
        # erase ordering on db.Query once we set it.
        ds_query = datastore.Query(kind=raw_entity_kind,
                                   namespace=namespace,
                                   _app=app,
                                   keys_only=True)
        # Ascending key order: the single result fetched is the first key.
        ds_query.Order("__key__")
        first_entity_key_list = ds_query.Get(1)
        if not first_entity_key_list:
            logging.warning("Could not retrieve an entity of type %s.",
                            raw_entity_kind)
            return []
        first_entity_key = first_entity_key_list[0]
        # Re-order the same query descending to fetch the last key.
        ds_query.Order(("__key__", datastore.Query.DESCENDING))
        try:
            # Tuple-unpacking requires exactly one result.
            last_entity_key, = ds_query.Get(1)
        except db.NeedIndexError, e:
            # TODO(user): Show this error in the worker log, not the app logs.
            logging.warning(
                "Cannot create accurate approximation of keyspace, "
                "guessing instead. Please address this problem: %s", e)
            # TODO(user): Use a key-end hint from the user input parameters
            # in this case, in the event the user has a good way of figuring out
            # the range of the keyspace.
            last_entity_key = key_range.KeyRange.guess_end_key(
                raw_entity_kind, first_entity_key)
Пример #10
0
class DatastoreInputReader(InputReader):
    """Represents a range in query results.

  DatastoreInputReader yields model instances from the entities in a given key
  range. Iterating over DatastoreInputReader changes its range past consumed
  entries.

  The class shouldn't be instantiated directly. Use the split_input class method
  instead.
  """

    # Number of entities to fetch at once while doing scanning.
    _BATCH_SIZE = 50

    # Maximum number of shards we'll create.
    _MAX_SHARD_COUNT = 256

    # Mapreduce parameters.
    ENTITY_KIND_PARAM = "entity_kind"
    KEYS_ONLY_PARAM = "keys_only"
    BATCH_SIZE_PARAM = "batch_size"
    KEY_RANGE_PARAM = "key_range"

    # TODO(user): Add support for arbitrary queries. It's not possible to
    # support them without cursors since right now you can't even serialize query
    # definition.
    def __init__(self, entity_kind, key_ranges, batch_size=_BATCH_SIZE):
        """Create new DatastoreInputReader object.

    This is internal constructor. Use split_query instead.

    Args:
      entity_kind: entity kind as string.
      key_ranges: a sequence of key_range.KeyRange instances to process.
      batch_size: size of read batch as int.
    """
        self._entity_kind = entity_kind
        # Reverse the KeyRanges so they can be processed in order as a stack of
        # work items.
        self._key_ranges = list(reversed(key_ranges))
        self._batch_size = int(batch_size)

    def __iter__(self):
        """Create a generator for model instances for entities.

    Iterating through entities moves query range past the consumed entities.

    Yields:
      next model instance.
    """
        while True:
            if self._current_key_range is None:
                break

            while True:
                query = self._current_key_range.make_ascending_query(
                    util.for_name(self._entity_kind))
                results = query.fetch(limit=self._batch_size)

                if not results:
                    self._advance_key_range()
                    break

                for model_instance in results:
                    key = model_instance.key()

                    self._current_key_range.advance(key)
                    yield model_instance

    @property
    def _current_key_range(self):
        if self._key_ranges:
            return self._key_ranges[-1]
        else:
            return None

    def _advance_key_range(self):
        if self._key_ranges:
            self._key_ranges.pop()

    # TODO(user): use query splitting functionality when it becomes available
    # instead.
    @classmethod
    def _split_input_from_namespace(cls, app, namespace, entity_kind_name,
                                    shard_count):
        """Return KeyRange objects. Helper for _split_input_from_params."""

        raw_entity_kind = util.get_short_name(entity_kind_name)

        if shard_count == 1:
            # With one shard we don't need to calculate any splitpoints at all.
            return [key_range.KeyRange(namespace=namespace, _app=app)]

        # we use datastore.Query instead of ext.db.Query here, because we can't
        # erase ordering on db.Query once we set it.
        ds_query = datastore.Query(kind=raw_entity_kind,
                                   namespace=namespace,
                                   _app=app,
                                   keys_only=True)
        ds_query.Order("__key__")
        first_entity_key_list = ds_query.Get(1)
        if not first_entity_key_list:
            logging.warning("Could not retrieve an entity of type %s.",
                            raw_entity_kind)
            return []
        first_entity_key = first_entity_key_list[0]
        ds_query.Order(("__key__", datastore.Query.DESCENDING))
        try:
            last_entity_key, = ds_query.Get(1)
        except db.NeedIndexError, e:
            # TODO(user): Show this error in the worker log, not the app logs.
            logging.warning(
                "Cannot create accurate approximation of keyspace, "
                "guessing instead. Please address this problem: %s", e)
            # TODO(user): Use a key-end hint from the user input parameters
            # in this case, in the event the user has a good way of figuring out
            # the range of the keyspace.
            last_entity_key = key_range.KeyRange.guess_end_key(
                raw_entity_kind, first_entity_key)
        full_keyrange = key_range.KeyRange(first_entity_key,
                                           last_entity_key,
                                           None,
                                           True,
                                           True,
                                           namespace=namespace,
                                           _app=app)
        key_ranges = [full_keyrange]
        number_of_half_splits = int(math.floor(math.log(shard_count, 2)))
        for _ in range(0, number_of_half_splits):
            new_ranges = []
            for r in key_ranges:
                new_ranges += r.split_range(1)
            key_ranges = new_ranges
        return key_ranges
Пример #11
0
  def testManyShards(self):
    """Tests having many shards with multiple levels of splits."""
    reader_cls = offline_jobs.HashKeyDatastoreInputReader
    result = reader_cls._split_input_from_namespace(
        self.app, self.namespace, self.entity_kind, 4)

    def hash_key(name):
      return db.Key.from_path('Subscription', name, _app=u'my-app-id')

    def hash_range(start_name, end_name, direction, include_start):
      # All expected ranges include their end key and share namespace/app.
      return key_range.KeyRange(
          key_start=hash_key(start_name),
          key_end=hash_key(end_name),
          direction=direction,
          include_start=include_start,
          include_end=True,
          namespace='my-namespace',
          _app='my-app-id')

    # Boundaries of the four quarters of the hash key space.
    lowest = u'hash_0000000000000000000000000000000000000000'
    first_quarter = u'hash_3fffffffffffffffffffffffffffffffffffffff'
    middle = u'hash_7fffffffffffffffffffffffffffffffffffffff'
    third_quarter = u'hash_bfffffffffffffffffffffffffffffffffffffff'
    highest = u'hash_ffffffffffffffffffffffffffffffffffffffff'

    expected = [
      hash_range(lowest, first_quarter, 'DESC', True),
      hash_range(first_quarter, middle, 'ASC', False),
      hash_range(middle, third_quarter, 'DESC', False),
      hash_range(third_quarter, highest, 'ASC', False),
    ]
    self.assertEquals(expected, result)
Пример #12
0
class DatastoreInputReader(InputReader):
    """Represents a range in query results.

  DatastoreInputReader is a generator for either entities or keys in the key
  range, depending on the value of the keys_only parameter. Iterating over
  DatastoreInputReader changes its range past consumed entries.

  The class shouldn't be instantiated directly. Use split_input class method
  instead.
  """

    _BATCH_SIZE = 50

    _MAX_SHARD_COUNT = 256

    def __init__(self, entity_kind, key_range_param, batch_size, keys_only):
        """Create new DatastoreInputReader object.

    This is internal constructor. Use split_query instead.

    Args:
      entity_kind: entity kind as string.
      key_range_param: key range to process as key_range.KeyRange.
      batch_size: batch size of entity fetching.
      keys_only: if True, then send only keys to the mapper.
    """
        self._entity_kind = entity_kind
        self._key_range = key_range_param
        self.batch_size = batch_size
        self._keys_only = keys_only

    def __iter__(self):
        """Create a generator for entities or keys in the range.

    Iterating through entries moves query range past the consumed entries.

    Yields:
      next entry.
    """
        while True:
            entries_query = self._key_range.make_ascending_query(
                util.for_name(self._entity_kind), self._keys_only)
            entries_list = entries_query.fetch(limit=self.batch_size)

            if not entries_list:
                return

            for entry in entries_list:
                if hasattr(entry, 'key'):
                    key = entry.key()
                else:
                    key = entry

                self._key_range = key_range.KeyRange(
                    key, self._key_range.key_end, self._key_range.direction,
                    False, self._key_range.include_end)
                yield entry

    @classmethod
    def split_input(cls, mapper_spec):
        """Splits query into shards without fetching query results.

    Tries as best as it can to split the whole query result set into equal
    shards. Due to difficulty of making the perfect split, resulting shards'
    sizes might differ significantly from each other. The actual number of
    shards might also be less then requested (even 1), though it is never
    greater.

    Current implementation does key-lexicographic order splitting. It requires
    query not to specify any __key__-based ordering. If an index for
    query.order('-__key__') query is not present, an inaccurate guess at
    sharding will be made by splitting the full key range.

    Args:
      mapper_spec: MapperSpec with params containing 'entity_kind'.
        May also have 'batch_size' in the params to specify the number
        of entities to process in each batch.

    Returns:
      A list of DatastoreInputReader objects of length <= number_of_shards.

    Raises:
      BadReaderParamsError if required parameters are missing or invalid.
    """
        if mapper_spec.input_reader_class() != cls:
            raise BadReaderParamsError("Input reader class mismatch")
        params = mapper_spec.params
        if "entity_kind" not in params:
            raise BadReaderParamsError(
                "Missing mapper parameter 'entity_kind'")

        entity_kind_name = params["entity_kind"]
        entity_kind = util.for_name(entity_kind_name)
        shard_count = mapper_spec.shard_count
        batch_size = int(params.get("batch_size", cls._BATCH_SIZE))
        keys_only = int(params.get("keys_only", False))

        ds_query = entity_kind.all()._get_query()
        ds_query.Order("__key__")
        first_entity = ds_query.Get(1)
        if not first_entity:
            return []
        else:
            first_entity_key = first_entity[0].key()

        ds_query.Order(("__key__", datastore.Query.DESCENDING))
        try:
            last_entity = ds_query.Get(1)
            last_entity_key = last_entity[0].key()
        except db.NeedIndexError, e:
            logging.warning(
                "Cannot create accurate approximation of keyspace, "
                "guessing instead. Please address this problem: %s", e)
            last_entity_key = key_range.KeyRange.guess_end_key(
                entity_kind.kind(), first_entity_key)

        full_keyrange = key_range.KeyRange(first_entity_key, last_entity_key,
                                           None, True, True)
        key_ranges = [full_keyrange]

        number_of_half_splits = int(math.floor(math.log(shard_count, 2)))
        for _ in range(0, number_of_half_splits):
            new_ranges = []
            for r in key_ranges:
                new_ranges += r.split_range(1)
            key_ranges = new_ranges

        return [
            DatastoreInputReader(entity_kind_name, r, batch_size, keys_only)
            for r in key_ranges
        ]