Example #1
def validate(cls, mapper_spec):
  """Validate reader parameters in mapper_spec."""
  if mapper_spec.input_reader_class() != cls:
    raise errors.BadReaderParamsError("Input reader class mismatch")
  params = mapper_spec.params
  if cls.FILES_PARAM not in params:
    raise errors.BadReaderParamsError("Missing files parameter.")
Example #2
    def _validate_filters(cls, filters, model_class):
        """Validate user supplied filters.

        Validate filters are on existing properties and filter values
        have valid semantics.

        Args:
          filters: user supplied filters. Each filter should be a list or
            tuple of format (<property_name_as_str>, <query_operator_as_str>,
            <value_of_certain_type>). Value type is up to the property's type.
          model_class: the db.Model class for the entity type to apply
            filters on.

        Raises:
          BadReaderParamsError: if any filter is invalid in any way.
        """
        if not filters:
            return

        properties = model_class.properties()

        for f in filters:
            prop, _, val = f
            if prop not in properties:
                raise errors.BadReaderParamsError(
                    "Property %s is not defined for entity type %s" %
                    (prop, model_class.kind()))

            # Validate the value of each filter. We need to know filters have
            # valid values to carry out splits.
            try:
                properties[prop].validate(val)
            except db.BadValueError as e:
                raise errors.BadReaderParamsError(e)
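
For illustration, a minimal sketch of filters in the documented (property, operator, value) format. The User model below is hypothetical and not part of the example:

from google.appengine.ext import db

class User(db.Model):
    # Hypothetical model used only to illustrate the filter format.
    age = db.IntegerProperty()
    city = db.StringProperty()

# Each filter names an existing property; the value must pass that
# property's validate() check, as enforced by _validate_filters above.
filters = [("age", ">=", 18), ("city", "=", "London")]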
Example #3
    def validate(cls, job_config):
        """Inherit docs."""
        super(AbstractDatastoreInputReader, cls).validate(job_config)
        params = job_config.input_reader_params

        # Check for the required entity kind parameter.
        if cls.ENTITY_KIND_PARAM not in params:
            raise errors.BadReaderParamsError("Missing input reader parameter "
                                              "'entity_kind'")
        # Validate the batch size parameter.
        if cls.BATCH_SIZE_PARAM in params:
            try:
                batch_size = int(params[cls.BATCH_SIZE_PARAM])
                if batch_size < 1:
                    raise errors.BadReaderParamsError("Bad batch size: %s" %
                                                      batch_size)
            except ValueError as e:
                raise errors.BadReaderParamsError("Bad batch size: %s" % e)
Example #4
def validate(cls, job_config):
    """Inherit docs."""
    super(ModelDatastoreInputReader, cls).validate(job_config)
    params = job_config.input_reader_params
    entity_kind = params[cls.ENTITY_KIND_PARAM]
    # Fail fast if the Model class cannot be located.
    try:
        model_class = util.for_name(entity_kind)
    except ImportError as e:
        raise errors.BadReaderParamsError("Bad entity kind: %s" % e)
Example #5
    def _validate_filters_ndb(cls, filters, model_class):
        """Validate ndb.Model filters."""
        if not filters:
            return

        properties = model_class._properties

        for f in filters:
            prop, _, val = f
            if prop not in properties:
                raise errors.BadReaderParamsError(
                    "Property %s is not defined for entity type %s" %
                    (prop, model_class._get_kind()))

            # Validate the value of each filter. We need to know filters have
            # valid values to carry out splits.
            try:
                properties[prop]._do_validate(val)
            except db.BadValueError as e:
                raise errors.BadReaderParamsError(e)
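
The ndb variant follows the same pattern but reads the model's _properties mapping and calls the protected _do_validate() hook. A hypothetical sketch:

from google.appengine.ext import ndb

class Account(ndb.Model):
    # Hypothetical model: "balance" appears in Account._properties.
    balance = ndb.IntegerProperty()

# The value 100 must pass Account._properties["balance"]._do_validate(100).
filters = [("balance", ">", 100)]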
Example #6
def _get_params(mapper_spec, allowed_keys=None):
    """Obtain input reader parameters.

    Utility function for input reader implementations. Fetches parameters
    from the mapreduce specification, emitting appropriate usage warnings.

    Args:
      mapper_spec: The MapperSpec for the job.
      allowed_keys: set of all allowed keys in parameters as strings. If it
        is not None, then parameters are expected to be in a separate
        "input_reader" subdictionary of the mapper_spec parameters.

    Returns:
      mapper parameters as a dict.

    Raises:
      BadReaderParamsError: if parameters are invalid/missing or not allowed.
    """
    if "input_reader" not in mapper_spec.params:
        message = ("Input reader's parameters should be specified in "
                   "input_reader subdictionary.")
        if allowed_keys:
            raise errors.BadReaderParamsError(message)
        else:
            logging.warning(message)
        params = mapper_spec.params
        params = dict((str(n), v) for n, v in params.iteritems())
    else:
        if not isinstance(mapper_spec.params.get("input_reader"), dict):
            raise errors.BadReaderParamsError(
                "Input reader parameters should be a dictionary")
        params = mapper_spec.params.get("input_reader")
        params = dict((str(n), v) for n, v in params.iteritems())
        if allowed_keys:
            params_diff = set(params.keys()) - allowed_keys
            if params_diff:
                raise errors.BadReaderParamsError(
                    "Invalid input_reader parameters: %s" %
                    ",".join(params_diff))

    return params
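
As a sketch of the two parameter layouts this function accepts (the parameter names here are illustrative, not prescribed by the library):

# Preferred layout: reader parameters live in the "input_reader"
# subdictionary; required whenever allowed_keys is not None.
params_new = {"input_reader": {"entity_kind": "main.User",
                               "batch_size": 50}}

# Legacy layout: top-level parameters, accepted with a warning when
# allowed_keys is None and rejected otherwise.
params_old = {"entity_kind": "main.User", "batch_size": 50}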
Example #7
    def validate(cls, job_config):
        """Inherit docs."""
        super(SampleInputReader, cls).validate(job_config)

        params = job_config.input_reader_params
        # Validate count.
        if cls.COUNT not in params:
            raise errors.BadReaderParamsError("Must specify %s" % cls.COUNT)
        if not isinstance(params[cls.COUNT], int):
            raise errors.BadReaderParamsError(
                "%s should be an int but is %s" %
                (cls.COUNT, type(params[cls.COUNT])))
        if params[cls.COUNT] <= 0:
            raise errors.BadReaderParamsError("%s should be a positive int" %
                                              cls.COUNT)
        # Validate string length.
        if cls.STRING_LENGTH in params and not (
                isinstance(params[cls.STRING_LENGTH], int)
                and params[cls.STRING_LENGTH] > 0):
            raise errors.BadReaderParamsError(
                "%s should be a positive int "
                "but is %s" % (cls.STRING_LENGTH, params[cls.STRING_LENGTH]))
Example #8
  def validate(cls, job_config):
    """Validate mapper specification.

    Args:
      job_config: map_job.JobConfig.

    Raises:
      BadReaderParamsError: if the specification is invalid for any reason such
        as missing the bucket name or providing an invalid bucket name.
    """
    reader_params = job_config.input_reader_params

    # Bucket Name is required
    if cls.BUCKET_NAME_PARAM not in reader_params:
      raise errors.BadReaderParamsError(
          "%s is required for Google Cloud Storage" %
          cls.BUCKET_NAME_PARAM)
    try:
      cloudstorage.validate_bucket_name(
          reader_params[cls.BUCKET_NAME_PARAM])
    except ValueError as error:
      raise errors.BadReaderParamsError("Bad bucket name, %s" % error)
Example #9
    def validate(cls, job_config):
        """Validates relevant parameters.

    This method can validate fields which it deems relevant.

    Args:
      job_config: an instance of map_job.JobConfig.

    Raises:
      errors.BadReaderParamsError: required parameters are missing or invalid.
    """
        if job_config.input_reader_cls != cls:
            raise errors.BadReaderParamsError(
                "Expect input reader class %r, got %r." %
                (cls, job_config.input_reader_cls))
Example #10
    def validate(cls, mapper_spec):
        """Validate mapper specification.

    Args:
      mapper_spec: an instance of model.MapperSpec

    Raises:
      BadReaderParamsError if the specification is invalid for any reason such
        as missing the bucket name or providing an invalid bucket name.
    """
        reader_spec = input_readers._get_params(mapper_spec, allow_old=False)

        # Readset id is required.
        if cls.READSET_ID_PARAM not in reader_spec:
            raise errors.BadReaderParamsError(
                "%s is required for the Genomics API" % cls.READSET_ID_PARAM)
Example #11
def validate(cls, job_config):
    """Inherit docs."""
    super(DatastoreInputReader, cls).validate(job_config)
    params = job_config.input_reader_params
    entity_kind = params[cls.ENTITY_KIND_PARAM]
    # Check for a "." in the entity kind.
    if "." in entity_kind:
        logging.warning(
            ". detected in entity kind %s specified for reader %s. "
            "Assuming the entity kind contains the dot.", entity_kind,
            cls.__name__)
    # Validate the filters parameter.
    if cls.FILTERS_PARAM in params:
        filters = params[cls.FILTERS_PARAM]
        for f in filters:
            if f[1] != "=":
                raise errors.BadReaderParamsError(
                    "Only equality filters are supported: %s" % (f,))
Example #12
class GCSInputReader(map_job.InputReader):
  """Input reader from Google Cloud Storage using the cloudstorage library.

  Required configuration in the mapper_spec.input_reader dictionary.
    BUCKET_NAME_PARAM: name of the bucket to use. No "/" prefix or suffix.
    OBJECT_NAMES_PARAM: a list of object names or prefixes. All objects must be
      in the BUCKET_NAME_PARAM bucket. If the name ends with a * it will be
      treated as prefix and all objects with matching names will be read.
      Entries should not start with a slash unless that is part of the object's
      name. An example list could be:
      ["my-1st-input-file", "directory/my-2nd-file", "some/other/dir/input-*"]
      To retrieve all files, "*" will match every object in the bucket. If a
      file is listed twice or is covered by multiple prefixes, it will be read
      twice; there is no de-duplication.

  Optional configuration in the mapper_spec.input_reader dictionary.
    BUFFER_SIZE_PARAM: the size of the read buffer for each file handle.
    PATH_FILTER_PARAM: an instance of PathFilter. PathFilter is a predicate
      on which filenames to read.
    DELIMITER_PARAM: str. The delimiter that signifies directory.
      If you have too many files to shard on the granularity of individual
      files, you can specify this to enable shallow splitting. In this mode,
      the reader only goes one level deep during "*" expansion and stops when
      the delimiter is encountered.
  """

  # Counters.
  COUNTER_FILE_READ = "file-read"
  COUNTER_FILE_MISSING = "file-missing"

  # Supported parameters
  BUCKET_NAME_PARAM = "bucket_name"
  OBJECT_NAMES_PARAM = "objects"
  BUFFER_SIZE_PARAM = "buffer_size"
  DELIMITER_PARAM = "delimiter"
  PATH_FILTER_PARAM = "path_filter"

  # Internal parameters
  _ACCOUNT_ID_PARAM = "account_id"

  # Other internal configuration constants
  _JSON_PICKLE = "pickle"
  _STRING_MAX_FILES_LISTED = 10  # Max files shown in the str representation

  # The input reader could also take start and end filenames and do a
  # listbucket. That would save space but has two cons.
  # 1. Files to read are less well defined: files can be added or removed
  #    over the lifetime of the MR job.
  # 2. A shard has to process files from a contiguous namespace, which may
  #    introduce straggler shards.
  def __init__(self, filenames, index=0, buffer_size=None, _account_id=None,
               delimiter=None, path_filter=None):
    """Initialize a GoogleCloudStorageInputReader instance.

    Args:
      filenames: A list of Google Cloud Storage filenames of the form
        '/bucket/objectname'.
      index: Index of the next filename to read.
      buffer_size: The size of the read buffer, None to use default.
      _account_id: Internal use only. See cloudstorage documentation.
      delimiter: Delimiter used as path separator. See class doc.
      path_filter: An instance of PathFilter.
    """
    super(GCSInputReader, self).__init__()
    self._filenames = filenames
    self._index = index
    self._buffer_size = buffer_size
    self._account_id = _account_id
    self._delimiter = delimiter
    self._bucket = None
    self._bucket_iter = None
    self._path_filter = path_filter
    self._slice_ctx = None

  def _next_file(self):
    """Find next filename.

    self._filenames may need to be expanded via listbucket.

    Returns:
      None if no more files are left; the filename otherwise.
    """
    while True:
      if self._bucket_iter:
        try:
          return self._bucket_iter.next().filename
        except StopIteration:
          self._bucket_iter = None
          self._bucket = None
      if self._index >= len(self._filenames):
        return
      filename = self._filenames[self._index]
      self._index += 1
      if self._delimiter is None or not filename.endswith(self._delimiter):
        return filename
      self._bucket = cloudstorage.listbucket(filename,
                                             delimiter=self._delimiter)
      self._bucket_iter = iter(self._bucket)

  @classmethod
  def validate(cls, job_config):
    """Validate mapper specification.

    Args:
      job_config: map_job.JobConfig.

    Raises:
      BadReaderParamsError: if the specification is invalid for any reason such
        as missing the bucket name or providing an invalid bucket name.
    """
    reader_params = job_config.input_reader_params

    # Bucket Name is required
    if cls.BUCKET_NAME_PARAM not in reader_params:
      raise errors.BadReaderParamsError(
          "%s is required for Google Cloud Storage" %
          cls.BUCKET_NAME_PARAM)
    try:
      cloudstorage.validate_bucket_name(
          reader_params[cls.BUCKET_NAME_PARAM])
    except ValueError as error:
      raise errors.BadReaderParamsError("Bad bucket name, %s" % error)

    # Object Name(s) are required
    if cls.OBJECT_NAMES_PARAM not in reader_params:
      raise errors.BadReaderParamsError(
          "%s is required for Google Cloud Storage" %
          cls.OBJECT_NAMES_PARAM)
    filenames = reader_params[cls.OBJECT_NAMES_PARAM]
    if not isinstance(filenames, list):
      raise errors.BadReaderParamsError(
          "Object name list is not a list but a %s" %
          filenames.__class__.__name__)
    for filename in filenames:
      if not isinstance(filename, basestring):
        raise errors.BadReaderParamsError(
            "Object name is not a string but a %s" %
            filename.__class__.__name__)

    # Delimiter.
    if cls.DELIMITER_PARAM in reader_params:
      delimiter = reader_params[cls.DELIMITER_PARAM]
      if not isinstance(delimiter, basestring):
        raise errors.BadReaderParamsError(
            "%s is not a string but a %s" %
            (cls.DELIMITER_PARAM, type(delimiter)))

    # Buffer size.
    if cls.BUFFER_SIZE_PARAM in reader_params:
      buffer_size = reader_params[cls.BUFFER_SIZE_PARAM]
      if not isinstance(buffer_size, int):
        raise errors.BadReaderParamsError(
            "%s is not an int but a %s" %
            (cls.BUFFER_SIZE_PARAM, type(buffer_size)))

    # Path filter.
    if cls.PATH_FILTER_PARAM in reader_params:
      path_filter = reader_params[cls.PATH_FILTER_PARAM]
      if not isinstance(path_filter, PathFilter):
        raise errors.BadReaderParamsError(
            "%s is not an instance of PathFilter but %s." %
            (cls.PATH_FILTER_PARAM, type(path_filter)))
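
Putting the class docstring and validate() together, a minimal sketch of reader parameters that would pass validation (bucket and object names are hypothetical):

input_reader_params = {
    "bucket_name": "my-bucket",              # BUCKET_NAME_PARAM, required
    "objects": ["logs/2015-*", "one-file"],  # OBJECT_NAMES_PARAM, required
    "buffer_size": 1024 * 1024,              # optional int
    "delimiter": "/",                        # optional str; enables shallow
                                             # "*" expansion
}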
Example #13
    def _get_range_from_filters(cls, filters, model_class):
        """Get property range from filters user provided.

    This method also validates there is one and only one closed range on a
    single property.

    Args:
      filters: user supplied filters. Each filter should be a list or tuple of
        format (<property_name_as_str>, <query_operator_as_str>,
        <value_of_certain_type>). Value type should satisfy the property's type.
      model_class: the model class for the entity type to apply filters on.

    Returns:
      a tuple of (property, start_filter, end_filter). property is the model's
    field that the range is about. start_filter and end_filter define the
    start and the end of the range. (None, None, None) if no range is found.

    Raises:
      BadReaderParamsError: if any filter is invalid in any way.
    """
        if not filters:
            return None, None, None

        range_property = None
        start_val = None
        end_val = None
        start_filter = None
        end_filter = None
        for f in filters:
            prop, op, val = f

            if op in [">", ">=", "<", "<="]:
                if range_property and range_property != prop:
                    raise errors.BadReaderParamsError(
                        "Range on only one property is supported.")
                range_property = prop

                if val is None:
                    raise errors.BadReaderParamsError(
                        "Range can't be None in filter %s" % (f,))

                if op in [">", ">="]:
                    if start_val is not None:
                        raise errors.BadReaderParamsError(
                            "Operation %s is specified more than once." % op)
                    start_val = val
                    start_filter = f
                else:
                    if end_val is not None:
                        raise errors.BadReaderParamsError(
                            "Operation %s is specified more than once." % op)
                    end_val = val
                    end_filter = f
            elif op != "=":
                raise errors.BadReaderParamsError(
                    "Only < <= > >= = are supported as operations. Got %s" %
                    op)

        if not range_property:
            return None, None, None

        if start_val is None or end_val is None:
            raise errors.BadReaderParamsError(
                "Filter should contain a complete range on property %s" %
                range_property)
        if issubclass(model_class, db.Model):
            property_obj = model_class.properties()[range_property]
        else:
            property_obj = (
                model_class._properties[  # pylint: disable=protected-access
                    range_property])
        supported_properties = (_DISCRETE_PROPERTY_SPLIT_FUNCTIONS.keys() +
                                _CONTINUOUS_PROPERTY_SPLIT_FUNCTIONS.keys())
        if not isinstance(property_obj, tuple(supported_properties)):
            raise errors.BadReaderParamsError(
                "Filtered property %s is not supported by sharding." %
                range_property)
        if not start_val < end_val:
            raise errors.BadReaderParamsError(
                "Start value %s should be smaller than end value %s" %
                (start_val, end_val))

        return property_obj, start_filter, end_filter
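
A sketch of filters that satisfy this method: exactly one property carries the closed range, and every other filter must be an equality (model and values are hypothetical):

filters = [("age", ">=", 18), ("age", "<", 65), ("city", "=", "Paris")]
# _get_range_from_filters would return the age property object together
# with ("age", ">=", 18) as start_filter and ("age", "<", 65) as end_filter.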
Example #14
class AbstractDatastoreInputReader(input_reader.InputReader):
    """Implementation of an abstract base class for a Datastore input reader."""

    # Number of entities to fetch at once while doing scanning.
    _BATCH_SIZE = 50

    # Maximum number of shards we'll create.
    _MAX_SHARD_COUNT = 256

    # The maximum number of namespaces that will be sharded by datastore key
    # before switching to a strategy where sharding is done lexicographically
    # by namespace.
    MAX_NAMESPACES_FOR_KEY_SHARD = 10

    _APP_PARAM = "_app"

    # reader parameters.
    NAMESPACE_PARAM = "namespace"
    ENTITY_KIND_PARAM = "entity_kind"
    KEYS_ONLY_PARAM = "keys_only"
    BATCH_SIZE_PARAM = "batch_size"
    KEY_RANGE_PARAM = "key_range"
    FILTERS_PARAM = "filters"

    _KEY_RANGE_ITER_CLS = db_iters.AbstractKeyRangeIterator

    def __init__(self, iterator):
        """Create new AbstractDatastoreInputReader object.

    This is internal constructor. Use split_input to create readers instead.

    Args:
      iterator: an iterator that generates objects for this input reader.
    """
        self._iter = iterator

    def __iter__(self):
        """Yields whatever the internal iterator yields."""
        for o in self._iter:
            yield o

    def __str__(self):
        """Returns the string representation of this InputReader."""
        return repr(self._iter)

    def to_json(self):
        """Inherit doc."""
        return self._iter.to_json()

    @classmethod
    def from_json(cls, state):
        """Inherit doc."""
        return cls(db_iters.RangeIteratorFactory.from_json(state))

    @classmethod
    def _get_query_spec(cls, params):
        """Construct a model.QuerySpec from model.MapperSpec."""
        entity_kind = params[cls.ENTITY_KIND_PARAM]
        filters = params.get(cls.FILTERS_PARAM)
        app = params.get(cls._APP_PARAM)
        ns = params.get(cls.NAMESPACE_PARAM)

        return model.QuerySpec(
            entity_kind=cls._get_raw_entity_kind(entity_kind),
            keys_only=bool(params.get(cls.KEYS_ONLY_PARAM, False)),
            filters=filters,
            batch_size=int(params.get(cls.BATCH_SIZE_PARAM, cls._BATCH_SIZE)),
            model_class_path=entity_kind,
            app=app,
            ns=ns)

    @classmethod
    def split_input(cls, job_config):
        """Inherit doc."""
        shard_count = job_config.shard_count
        params = job_config.input_reader_params
        query_spec = cls._get_query_spec(params)

        namespaces = None
        if query_spec.ns is not None:
            k_ranges = cls._to_key_ranges_by_shard(query_spec.app,
                                                   [query_spec.ns],
                                                   shard_count, query_spec)
        else:
            ns_keys = namespace_range.get_namespace_keys(
                query_spec.app, cls.MAX_NAMESPACES_FOR_KEY_SHARD + 1)
            # No namespace means the app may have some data but that data is
            # not visible yet. Just return.
            if not ns_keys:
                return
            # If the number of ns is small, we shard each ns by key and assign each
            # shard a piece of a ns.
            elif len(ns_keys) <= cls.MAX_NAMESPACES_FOR_KEY_SHARD:
                namespaces = [ns_key.name() or "" for ns_key in ns_keys]
                k_ranges = cls._to_key_ranges_by_shard(query_spec.app,
                                                       namespaces, shard_count,
                                                       query_spec)
            # When the number of namespaces is large, we can only split
            # lexicographically by namespace.
            else:
                ns_ranges = namespace_range.NamespaceRange.split(
                    n=shard_count,
                    contiguous=False,
                    can_query=lambda: True,
                    _app=query_spec.app)
                k_ranges = [
                    key_ranges.KeyRangesFactory.create_from_ns_range(ns_range)
                    for ns_range in ns_ranges
                ]

        iters = [
            db_iters.RangeIteratorFactory.create_key_ranges_iterator(
                r, query_spec, cls._KEY_RANGE_ITER_CLS) for r in k_ranges
        ]

        return [cls(i) for i in iters]

    @classmethod
    def _to_key_ranges_by_shard(cls, app, namespaces, shard_count, query_spec):
        """Get a list of key_ranges.KeyRanges objects, one for each shard.

    This method uses scatter index to split each namespace into pieces
    and assign those pieces to shards.

    Args:
      app: app_id in str.
      namespaces: a list of namespaces in str.
      shard_count: number of shards to split.
      query_spec: model.QuerySpec.

    Returns:
      a list of key_ranges.KeyRanges objects.
    """
        key_ranges_by_ns = []
        # Split each ns into n splits. If a ns doesn't have enough scatter to
        # split into n, the last few splits are None.
        for namespace in namespaces:
            ranges = cls._split_ns_by_scatter(shard_count, namespace,
                                              query_spec.entity_kind, app)
            # The nth split of each ns will be assigned to the nth shard.
            # Shuffle so that the Nones are not all at the end.
            random.shuffle(ranges)
            key_ranges_by_ns.append(ranges)

        # KeyRanges from different namespaces might be very different in size.
        # Use round robin to make sure each shard can have at most one split
        # or a None from a ns.
        ranges_by_shard = [[] for _ in range(shard_count)]
        for ranges in key_ranges_by_ns:
            for i, k_range in enumerate(ranges):
                if k_range:
                    ranges_by_shard[i].append(k_range)

        key_ranges_by_shard = []
        for ranges in ranges_by_shard:
            if ranges:
                key_ranges_by_shard.append(
                    key_ranges.KeyRangesFactory.create_from_list(ranges))
        return key_ranges_by_shard
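
    # To illustrate the round-robin assignment above with made-up values:
    # given shard_count = 3 and key_ranges_by_ns = [[r_a0, r_a1, None],
    # [r_b0, None, None]] (two namespaces, each shuffled into three splits),
    # the loop yields ranges_by_shard = [[r_a0, r_b0], [r_a1], []]. Each
    # shard holds at most one split per namespace, and the empty third list
    # produces no KeyRanges object.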

    @classmethod
    def _split_ns_by_scatter(cls, shard_count, namespace, raw_entity_kind,
                             app):
        """Split a namespace by scatter index into key_range.KeyRange.

    TODO(user): Power this with key_range.KeyRange.compute_split_points.

    Args:
      shard_count: number of shards.
      namespace: namespace name to split. str.
      raw_entity_kind: low level datastore API entity kind.
      app: app id in str.

    Returns:
      A list of key_range.KeyRange objects. If there are not enough entities to
    splits into requested shards, the returned list will contain KeyRanges
    ordered lexicographically with any Nones appearing at the end.
    """
        if shard_count == 1:
            # With one shard we don't need to calculate any split points at all.
            return [key_range.KeyRange(namespace=namespace, _app=app)]

        ds_query = datastore.Query(kind=raw_entity_kind,
                                   namespace=namespace,
                                   _app=app,
                                   keys_only=True)
        ds_query.Order("__scatter__")
        oversampling_factor = 32
        random_keys = ds_query.Get(shard_count * oversampling_factor)

        if not random_keys:
            # There are no entities with scatter property. We have no idea
            # how to split.
            return ([key_range.KeyRange(namespace=namespace, _app=app)] +
                    [None] * (shard_count - 1))

        random_keys.sort()

        if len(random_keys) >= shard_count:
            # We've got a lot of scatter values. Sample them down.
            random_keys = cls._choose_split_points(random_keys, shard_count)

        k_ranges = []

        k_ranges.append(
            key_range.KeyRange(key_start=None,
                               key_end=random_keys[0],
                               direction=key_range.KeyRange.ASC,
                               include_start=False,
                               include_end=False,
                               namespace=namespace,
                               _app=app))

        for i in range(0, len(random_keys) - 1):
            k_ranges.append(
                key_range.KeyRange(key_start=random_keys[i],
                                   key_end=random_keys[i + 1],
                                   direction=key_range.KeyRange.ASC,
                                   include_start=True,
                                   include_end=False,
                                   namespace=namespace,
                                   _app=app))

        k_ranges.append(
            key_range.KeyRange(key_start=random_keys[-1],
                               key_end=None,
                               direction=key_range.KeyRange.ASC,
                               include_start=True,
                               include_end=False,
                               namespace=namespace,
                               _app=app))

        if len(k_ranges) < shard_count:
            # We need to have as many shards as it was requested. Add some Nones.
            k_ranges += [None] * (shard_count - len(k_ranges))
        return k_ranges

    @classmethod
    def _choose_split_points(cls, sorted_keys, shard_count):
        """Returns the best split points given a random set of datastore.Keys."""
        assert len(sorted_keys) >= shard_count
        index_stride = len(sorted_keys) / float(shard_count)
        return [
            sorted_keys[int(round(index_stride * i))]
            for i in range(1, shard_count)
        ]
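
    # Worked example of the stride arithmetic above (illustrative only):
    # with 8 sorted keys and shard_count = 4, index_stride is 2.0, so the
    # chosen indices are round(2.0 * 1) = 2, round(2.0 * 2) = 4 and
    # round(2.0 * 3) = 6, i.e. sorted_keys[2], sorted_keys[4] and
    # sorted_keys[6]: three split points bounding four key ranges.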

    @classmethod
    def validate(cls, job_config):
        """Inherit docs."""
        super(AbstractDatastoreInputReader, cls).validate(job_config)
        params = job_config.input_reader_params

        # Check for the required entity kind parameter.
        if cls.ENTITY_KIND_PARAM not in params:
            raise errors.BadReaderParamsError("Missing input reader parameter "
                                              "'entity_kind'")
        # Validate the batch size parameter.
        if cls.BATCH_SIZE_PARAM in params:
            try:
                batch_size = int(params[cls.BATCH_SIZE_PARAM])
                if batch_size < 1:
                    raise errors.BadReaderParamsError("Bad batch size: %s" %
                                                      batch_size)
            except ValueError as e:
                raise errors.BadReaderParamsError("Bad batch size: %s" % e)
        # Validate the keys only parameter.
        try:
            bool(params.get(cls.KEYS_ONLY_PARAM, False))
        except Exception:
            raise errors.BadReaderParamsError(
                "keys_only expects a boolean value but got %s" %
                params[cls.KEYS_ONLY_PARAM])
        # Validate the namespace parameter.
        if cls.NAMESPACE_PARAM in params:
            if not isinstance(params[cls.NAMESPACE_PARAM],
                              (str, unicode, type(None))):
                raise errors.BadReaderParamsError(
                    "Expected a single namespace string")

        # Validate the filters parameter.
        if cls.FILTERS_PARAM in params:
            filters = params[cls.FILTERS_PARAM]
            if not isinstance(filters, list):
                raise errors.BadReaderParamsError(
                    "Expected list for filters parameter")
            for f in filters:
                if not isinstance(f, (tuple, list)):
                    raise errors.BadReaderParamsError(
                        "Filter should be a tuple or list: %s" % (f,))
                if len(f) != 3:
                    raise errors.BadReaderParamsError(
                        "Filter should be a 3-tuple: %s" % (f,))
                prop, op, _ = f
                if not isinstance(prop, basestring):
                    raise errors.BadReaderParamsError(
                        "Property should be string: %s" % (prop,))
                if not isinstance(op, basestring):
                    raise errors.BadReaderParamsError(
                        "Operator should be string: %s" % (op,))