Example #1
    def validate(cls, mapper_spec):
        """Validate reader parameters in mapper_spec."""
        if mapper_spec.input_reader_class() != cls:
            raise errors.BadReaderParamsError("Input reader class mismatch")
        params = mapper_spec.params
        if cls.FILES_PARAM not in params:
            raise errors.BadReaderParamsError("Missing files parameter.")
    def _validate_filters(cls, filters, model_class):
        """Validate user supplied filters.

    Validate that filters are on existing properties and that filter values
    have valid semantics.

    Args:
      filters: user supplied filters. Each filter should be a list or tuple of
        format (<property_name_as_str>, <query_operator_as_str>,
        <value_of_certain_type>). Value type is up to the property's type.
      model_class: the db.Model class for the entity type to apply filters on.

    Raises:
      BadReaderParamsError: if any filter is invalid in any way.
    """
        if not filters:
            return

        properties = model_class.properties()

        for f in filters:
            prop, _, val = f
            if prop not in properties:
                raise errors.BadReaderParamsError(
                    "Property %s is not defined for entity type %s", prop,
                    model_class.kind())

            try:
                properties[prop].validate(val)
            except db.BadValueError as e:
                raise errors.BadReaderParamsError(e)
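
# A minimal, GAE-free sketch of the validation pattern above: each filter is a
# (property_name, operator, value) triple, the property must be a known one,
# and the value must pass that property's validator. The names _check_filters
# and validators are illustrative, not part of the library.
def _check_filters(filters, validators):
    for prop, _, val in filters:
        if prop not in validators:
            raise ValueError("Property %s is not defined" % prop)
        validators[prop](val)  # raises on a bad value

# Usage: map property names to validating callables.
_check_filters([("age", ">=", 18)], {"age": int})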
    def validate(cls, job_config):
        """Inherit docs."""
        super(AbstractDatastoreInputReader, cls).validate(job_config)
        params = job_config.input_reader_params

        if cls.ENTITY_KIND_PARAM not in params:
            raise errors.BadReaderParamsError("Missing input reader parameter "
                                              "'entity_kind'")

        if cls.BATCH_SIZE_PARAM in params:
            try:
                batch_size = int(params[cls.BATCH_SIZE_PARAM])
                if batch_size < 1:
                    raise errors.BadReaderParamsError("Bad batch size: %s" %
                                                      batch_size)
            except ValueError as e:
                raise errors.BadReaderParamsError("Bad batch size: %s" % e)

        try:
            bool(params.get(cls.KEYS_ONLY_PARAM, False))
        except Exception:
            raise errors.BadReaderParamsError(
                "keys_only expects a boolean value but "
                "got %s", params[cls.KEYS_ONLY_PARAM])

        if cls.NAMESPACE_PARAM in params:
            if not isinstance(params[cls.NAMESPACE_PARAM], (str, type(None))):
                raise errors.BadReaderParamsError(
                    "Expected a single namespace string")

        if cls.FILTERS_PARAM in params:
            filters = params[cls.FILTERS_PARAM]
            if not isinstance(filters, list):
                raise errors.BadReaderParamsError(
                    "Expected list for filters parameter")
            for f in filters:
                if not isinstance(f, (tuple, list)):
                    raise errors.BadReaderParamsError(
                        "Filter should be a tuple or list: "
                        "%s", f)
                if len(f) != 3:
                    raise errors.BadReaderParamsError(
                        "Filter should be a 3-tuple: %s", f)
                prop, op, _ = f
                if not isinstance(prop, str):
                    raise errors.BadReaderParamsError(
                        "Property should be string: %s", prop)
                if not isinstance(op, str):
                    raise errors.BadReaderParamsError(
                        "Operator should be string: %s", op)
Example #4
    def validate(cls, job_config):
        """Validate mapper specification.

    Args:
      job_config: map_job.JobConfig.

    Raises:
      BadReaderParamsError: if the specification is invalid for any reason such
        as missing the bucket name or providing an invalid bucket name.
    """
        reader_params = job_config.input_reader_params

        if cls.BUCKET_NAME_PARAM not in reader_params:
            raise errors.BadReaderParamsError(
                "%s is required for Google Cloud Storage" %
                cls.BUCKET_NAME_PARAM)
        try:
            cloudstorage.validate_bucket_name(
                reader_params[cls.BUCKET_NAME_PARAM])
        except ValueError as error:
            raise errors.BadReaderParamsError("Bad bucket name, %s" % (error))

        if cls.OBJECT_NAMES_PARAM not in reader_params:
            raise errors.BadReaderParamsError(
                "%s is required for Google Cloud Storage" %
                cls.OBJECT_NAMES_PARAM)
        filenames = reader_params[cls.OBJECT_NAMES_PARAM]
        if not isinstance(filenames, list):
            raise errors.BadReaderParamsError(
                "Object name list is not a list but a %s" %
                filenames.__class__.__name__)
        for filename in filenames:
            if not isinstance(filename, six.string_types):
                raise errors.BadReaderParamsError(
                    "Object name is not a string but a %s" %
                    filename.__class__.__name__)

        if cls.DELIMITER_PARAM in reader_params:
            delimiter = reader_params[cls.DELIMITER_PARAM]
            if not isinstance(delimiter, six.string_types):
                raise errors.BadReaderParamsError(
                    "%s is not a string but a %s" %
                    (cls.DELIMITER_PARAM, type(delimiter)))

        if cls.BUFFER_SIZE_PARAM in reader_params:
            buffer_size = reader_params[cls.BUFFER_SIZE_PARAM]
            if not isinstance(buffer_size, int):
                raise errors.BadReaderParamsError(
                    "%s is not an int but a %s" %
                    (cls.BUFFER_SIZE_PARAM, type(buffer_size)))

        if cls.PATH_FILTER_PARAM in reader_params:
            path_filter = reader_params[cls.PATH_FILTER_PARAM]
            if not isinstance(path_filter, PathFilter):
                raise errors.BadReaderParamsError(
                    "%s is not an instance of PathFilter but %s." %
                    (cls.PATH_FILTER_PARAM, type(path_filter)))
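
# Illustrative reader params accepted by the GCS validate above; the keys match
# the *_PARAM constants ("bucket_name", "objects", "buffer_size", "delimiter")
# and the bucket/object names are invented:
reader_params = {
    "bucket_name": "my-bucket",
    "objects": ["my-1st-input-file", "some/other/dir/input-*"],
    "buffer_size": 1024 * 1024,
    "delimiter": "/",
}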
    def validate(cls, job_config):
        """Inherit docs."""
        super(AbstractDatastoreInputReader, cls).validate(job_config)
        params = job_config.input_reader_params

        if cls.ENTITY_KIND_PARAM not in params:
            raise errors.BadReaderParamsError("Missing input reader parameter "
                                              "'entity_kind'")

        if cls.BATCH_SIZE_PARAM in params:
            try:
                batch_size = int(params[cls.BATCH_SIZE_PARAM])
                if batch_size < 1:
                    raise errors.BadReaderParamsError("Bad batch size: %s" %
                                                      batch_size)
            except ValueError as e:
                raise errors.BadReaderParamsError("Bad batch size: %s" % e)
    def _validate_filters_ndb(cls, filters, model_class):
        """Validate ndb.Model filters."""
        if not filters:
            return

        properties = model_class._properties

        for f in filters:
            prop, _, val = f
            if prop not in properties:
                raise errors.BadReaderParamsError(
                    "Property %s is not defined for entity type %s", prop,
                    model_class._get_kind())

            try:
                properties[prop]._do_validate(val)
            except db.BadValueError as e:
                raise errors.BadReaderParamsError(e)
Example #7
    def validate(cls, job_config):
        """Inherit docs."""
        super(ModelDatastoreInputReader, cls).validate(job_config)
        params = job_config.input_reader_params
        entity_kind = params[cls.ENTITY_KIND_PARAM]

        try:
            model_class = util.for_name(entity_kind)
        except ImportError as e:
            raise errors.BadReaderParamsError("Bad entity kind: %s" % e)
    def validate(cls, job_config):
        """Inherit docs."""
        super(SampleInputReader, cls).validate(job_config)

        params = job_config.input_reader_params

        if cls.COUNT not in params:
            raise errors.BadReaderParamsError("Must specify %s" % cls.COUNT)
        if not isinstance(params[cls.COUNT], int):
            raise errors.BadReaderParamsError(
                "%s should be an int but is %s" %
                (cls.COUNT, type(params[cls.COUNT])))
        if params[cls.COUNT] <= 0:
            raise errors.BadReaderParamsError("%s should be a positive int")

        if cls.STRING_LENGTH in params and not (
                isinstance(params[cls.STRING_LENGTH], int)
                and params[cls.STRING_LENGTH] > 0):
            raise errors.BadReaderParamsError(
                "%s should be a positive int "
                "but is %s" % (cls.STRING_LENGTH, params[cls.STRING_LENGTH]))
Example #9
    def validate(cls, job_config):
        """Validate mapper specification.

    Args:
      job_config: map_job.JobConfig.

    Raises:
      BadReaderParamsError: if the specification is invalid for any reason such
        as missing the bucket name or providing an invalid bucket name.
    """
        reader_params = job_config.input_reader_params

        if cls.BUCKET_NAME_PARAM not in reader_params:
            raise errors.BadReaderParamsError(
                "%s is required for Google Cloud Storage" %
                cls.BUCKET_NAME_PARAM)
        try:
            cloudstorage.validate_bucket_name(
                reader_params[cls.BUCKET_NAME_PARAM])
        except ValueError as error:
            raise errors.BadReaderParamsError("Bad bucket name, %s" % (error))
Example #10
    def validate(cls, job_config):
        """Validates relevant parameters.

    This method can validate fields which it deems relevant.

    Args:
      job_config: an instance of map_job.JobConfig.

    Raises:
      errors.BadReaderParamsError: required parameters are missing or invalid.
    """
        if job_config.input_reader_cls != cls:
            raise errors.BadReaderParamsError(
                "Expect input reader class %r, got %r." %
                (cls, job_config.input_reader_cls))
    def validate(cls, job_config):
        """Inherit docs."""
        super(ModelDatastoreInputReader, cls).validate(job_config)
        params = job_config.input_reader_params
        entity_kind = params[cls.ENTITY_KIND_PARAM]

        try:
            model_class = util.for_name(entity_kind)
        except ImportError as e:
            raise errors.BadReaderParamsError("Bad entity kind: %s" % e)
        if cls.FILTERS_PARAM in params:
            filters = params[cls.FILTERS_PARAM]
            if issubclass(model_class, db.Model):
                cls._validate_filters(filters, model_class)
            else:
                cls._validate_filters_ndb(filters, model_class)
            property_range.PropertyRange(filters, entity_kind)
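
# For reference: util.for_name resolves a dotted path such as
# "myapp.models.Person" (an invented example) to the class object, so a typo in
# entity_kind surfaces here as BadReaderParamsError rather than at query time.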
Example #12
    def validate(cls, job_config):
        """Inherit docs."""
        super(DatastoreInputReader, cls).validate(job_config)
        params = job_config.input_reader_params
        entity_kind = params[cls.ENTITY_KIND_PARAM]

        if "." in entity_kind:
            logging.warning(
                ". detected in entity kind %s specified for reader %s. "
                "Assuming entity kind contains the dot.", entity_kind,
                cls.__name__)

        if cls.FILTERS_PARAM in params:
            filters = params[cls.FILTERS_PARAM]
            for f in filters:
                if f[1] != "=":
                    raise errors.BadReaderParamsError(
                        "Only equality filters are supported: %s", f)
Example #13
    def _get_range_from_filters(cls, filters, model_class):
        """Get property range from filters user provided.

    This method also validates there is one and only one closed range on a
    single property.

    Args:
      filters: user supplied filters. Each filter should be a list or tuple of
        format (<property_name_as_str>, <query_operator_as_str>,
        <value_of_certain_type>). Value type should satisfy the property's type.
      model_class: the model class for the entity type to apply filters on.

    Returns:
      a tuple of (property, start_filter, end_filter). property is the model's
    field that the range is about. start_filter and end_filter define the
    start and the end of the range. (None, None, None) if no range is found.

    Raises:
      BadReaderParamsError: if any filter is invalid in any way.
    """
        if not filters:
            return None, None, None

        range_property = None
        start_val = None
        end_val = None
        start_filter = None
        end_filter = None
        for f in filters:
            prop, op, val = f

            if op in [">", ">=", "<", "<="]:
                if range_property and range_property != prop:
                    raise errors.BadReaderParamsError(
                        "Range on only one property is supported.")
                range_property = prop

                if val is None:
                    raise errors.BadReaderParamsError(
                        "Range can't be None in filter %s", f)

                if op in [">", ">="]:
                    if start_val is not None:
                        raise errors.BadReaderParamsError(
                            "Operation %s is specified more than once.", op)
                    start_val = val
                    start_filter = f
                else:
                    if end_val is not None:
                        raise errors.BadReaderParamsError(
                            "Operation %s is specified more than once.", op)
                    end_val = val
                    end_filter = f
            elif op != "=":
                raise errors.BadReaderParamsError(
                    "Only < <= > >= = are supported as operation. Got %s", op)

        if not range_property:
            return None, None, None

        if start_val is None or end_val is None:
            raise errors.BadReaderParamsError(
                "Filter should contains a complete range on property %s",
                range_property)
        if issubclass(model_class, db.Model):
            property_obj = model_class.properties()[range_property]
        else:
            property_obj = (model_class._properties[range_property])
        supported_properties = (list(_DISCRETE_PROPERTY_SPLIT_FUNCTIONS) +
                                list(_CONTINUOUS_PROPERTY_SPLIT_FUNCTIONS))
        if not isinstance(property_obj, tuple(supported_properties)):
            raise errors.BadReaderParamsError(
                "Filtered property %s is not supported by sharding.",
                range_property)
        if not start_val < end_val:
            raise errors.BadReaderParamsError(
                "Start value %s should be smaller than end value %s",
                start_val, end_val)

        return property_obj, start_filter, end_filter
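
# Illustrative filters accepted by _get_range_from_filters: exactly one
# property carries the closed range, and equality filters on other properties
# pass through. Property names and values are invented:
#
#     filters = [("age", ">=", 18), ("age", "<", 65), ("city", "=", "Kirkland")]
#     # -> (the age property object, ("age", ">=", 18), ("age", "<", 65))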
class AbstractDatastoreInputReader(input_reader.InputReader):
    """Implementation of an abstract base class for a Datastore input reader."""

    _BATCH_SIZE = 50

    _MAX_SHARD_COUNT = 256

    MAX_NAMESPACES_FOR_KEY_SHARD = 10

    _APP_PARAM = "_app"

    NAMESPACE_PARAM = "namespace"
    ENTITY_KIND_PARAM = "entity_kind"
    KEYS_ONLY_PARAM = "keys_only"
    BATCH_SIZE_PARAM = "batch_size"
    KEY_RANGE_PARAM = "key_range"
    FILTERS_PARAM = "filters"

    _KEY_RANGE_ITER_CLS = db_iters.AbstractKeyRangeIterator

    def __init__(self, iterator):
        """Create new AbstractDatastoreInputReader object.

    This is an internal constructor. Use split_input to create readers instead.

    Args:
      iterator: an iterator that generates objects for this input reader.
    """
        self._iter = iterator

    def __iter__(self):
        """Yields whatever the internal iterator yields."""
        for o in self._iter:
            yield o

    def __str__(self):
        """Returns the string representation of this InputReader."""
        return repr(self._iter)

    def to_json(self):
        """Inherit doc."""
        return self._iter.to_json()

    @classmethod
    def from_json(cls, state):
        """Inherit doc."""
        return cls(db_iters.RangeIteratorFactory.from_json(state))

    @classmethod
    def _get_query_spec(cls, params):
        """Construct a model.QuerySpec from model.MapperSpec."""
        entity_kind = params[cls.ENTITY_KIND_PARAM]
        filters = params.get(cls.FILTERS_PARAM)
        app = params.get(cls._APP_PARAM)
        ns = params.get(cls.NAMESPACE_PARAM)

        return model.QuerySpec(
            entity_kind=cls._get_raw_entity_kind(entity_kind),
            keys_only=bool(params.get(cls.KEYS_ONLY_PARAM, False)),
            filters=filters,
            batch_size=int(params.get(cls.BATCH_SIZE_PARAM, cls._BATCH_SIZE)),
            model_class_path=entity_kind,
            app=app,
            ns=ns)

    @classmethod
    def split_input(cls, job_config):
        """Inherit doc."""
        shard_count = job_config.shard_count
        params = job_config.input_reader_params
        query_spec = cls._get_query_spec(params)

        namespaces = None
        if query_spec.ns is not None:
            k_ranges = cls._to_key_ranges_by_shard(query_spec.app,
                                                   [query_spec.ns],
                                                   shard_count, query_spec)
        else:
            ns_keys = namespace_range.get_namespace_keys(
                query_spec.app, cls.MAX_NAMESPACES_FOR_KEY_SHARD + 1)

            if not ns_keys:
                return

            elif len(ns_keys) <= cls.MAX_NAMESPACES_FOR_KEY_SHARD:
                namespaces = [ns_key.name() or "" for ns_key in ns_keys]
                k_ranges = cls._to_key_ranges_by_shard(query_spec.app,
                                                       namespaces, shard_count,
                                                       query_spec)

            else:
                ns_ranges = namespace_range.NamespaceRange.split(
                    n=shard_count,
                    contiguous=False,
                    can_query=lambda: True,
                    _app=query_spec.app)
                k_ranges = [
                    key_ranges.KeyRangesFactory.create_from_ns_range(ns_range)
                    for ns_range in ns_ranges
                ]

        iters = [
            db_iters.RangeIteratorFactory.create_key_ranges_iterator(
                r, query_spec, cls._KEY_RANGE_ITER_CLS) for r in k_ranges
        ]

        return [cls(i) for i in iters]

    @classmethod
    def _to_key_ranges_by_shard(cls, app, namespaces, shard_count, query_spec):
        """Get a list of key_ranges.KeyRanges objects, one for each shard.

    This method uses the scatter index to split each namespace into pieces
    and assign those pieces to shards.

    Args:
      app: app_id in str.
      namespaces: a list of namespaces in str.
      shard_count: number of shards to split.
      query_spec: model.QuerySpec.

    Returns:
      a list of key_ranges.KeyRanges objects.
    """
        key_ranges_by_ns = []

        for namespace in namespaces:
            ranges = cls._split_ns_by_scatter(shard_count, namespace,
                                              query_spec.entity_kind, app)

            random.shuffle(ranges)
            key_ranges_by_ns.append(ranges)

        ranges_by_shard = [[] for _ in range(shard_count)]
        for ranges in key_ranges_by_ns:
            for i, k_range in enumerate(ranges):
                if k_range:
                    ranges_by_shard[i].append(k_range)

        key_ranges_by_shard = []
        for ranges in ranges_by_shard:
            if ranges:
                key_ranges_by_shard.append(
                    key_ranges.KeyRangesFactory.create_from_list(ranges))
        return key_ranges_by_shard
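
    # Sketch of the assignment above: with namespaces ns1 and ns2 split into
    # ranges [r0, r1, r2] and [s0, s1, s2], shard i receives [r_i, s_i]; the
    # per-namespace shuffle randomizes which shard gets which piece.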

    @classmethod
    def _split_ns_by_scatter(cls, shard_count, namespace, raw_entity_kind,
                             app):
        """Split a namespace by scatter index into key_range.KeyRange.

    TODO: Power this with key_range.KeyRange.compute_split_points.

    Args:
      shard_count: number of shards.
      namespace: namespace name to split. str.
      raw_entity_kind: low level datastore API entity kind.
      app: app id in str.

    Returns:
      A list of key_range.KeyRange objects. If there are not enough entities to
    split into the requested shards, the returned list will contain KeyRanges
    ordered lexicographically with any Nones appearing at the end.
    """
        if shard_count == 1:
            return [key_range.KeyRange(namespace=namespace, _app=app)]

        ds_query = datastore.Query(kind=raw_entity_kind,
                                   namespace=namespace,
                                   _app=app,
                                   keys_only=True)
        ds_query.Order("__scatter__")
        oversampling_factor = 32
        random_keys = ds_query.Get(shard_count * oversampling_factor)

        if not random_keys:
            return ([key_range.KeyRange(namespace=namespace, _app=app)] +
                    [None] * (shard_count - 1))

        random_keys.sort()

        if len(random_keys) >= shard_count:
            random_keys = cls._choose_split_points(random_keys, shard_count)

        k_ranges = []

        k_ranges.append(
            key_range.KeyRange(key_start=None,
                               key_end=random_keys[0],
                               direction=key_range.KeyRange.ASC,
                               include_start=False,
                               include_end=False,
                               namespace=namespace,
                               _app=app))

        for i in range(0, len(random_keys) - 1):
            k_ranges.append(
                key_range.KeyRange(key_start=random_keys[i],
                                   key_end=random_keys[i + 1],
                                   direction=key_range.KeyRange.ASC,
                                   include_start=True,
                                   include_end=False,
                                   namespace=namespace,
                                   _app=app))

        k_ranges.append(
            key_range.KeyRange(key_start=random_keys[-1],
                               key_end=None,
                               direction=key_range.KeyRange.ASC,
                               include_start=True,
                               include_end=False,
                               namespace=namespace,
                               _app=app))

        if len(k_ranges) < shard_count:
            k_ranges += [None] * (shard_count - len(k_ranges))
        return k_ranges

    @classmethod
    def _choose_split_points(cls, sorted_keys, shard_count):
        """Returns the best split points given a random set of datastore.Keys."""
        assert len(sorted_keys) >= shard_count
        index_stride = len(sorted_keys) / float(shard_count)
        return [
            sorted_keys[int(round(index_stride * i))]
            for i in range(1, shard_count)
        ]
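
    # Worked example: 6 sorted keys and shard_count == 3 give index_stride ==
    # 2.0, so range(1, 3) picks sorted_keys[2] and sorted_keys[4] as the
    # shard_count - 1 split points.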

    @classmethod
    def validate(cls, job_config):
        """Inherit docs."""
        super(AbstractDatastoreInputReader, cls).validate(job_config)
        params = job_config.input_reader_params

        if cls.ENTITY_KIND_PARAM not in params:
            raise errors.BadReaderParamsError("Missing input reader parameter "
                                              "'entity_kind'")

        if cls.BATCH_SIZE_PARAM in params:
            try:
                batch_size = int(params[cls.BATCH_SIZE_PARAM])
                if batch_size < 1:
                    raise errors.BadReaderParamsError("Bad batch size: %s" %
                                                      batch_size)
            except ValueError as e:
                raise errors.BadReaderParamsError("Bad batch size: %s" % e)

        try:
            bool(params.get(cls.KEYS_ONLY_PARAM, False))
        except Exception:
            raise errors.BadReaderParamsError(
                "keys_only expects a boolean value but "
                "got %s", params[cls.KEYS_ONLY_PARAM])

        if cls.NAMESPACE_PARAM in params:
            if not isinstance(params[cls.NAMESPACE_PARAM], (str, type(None))):
                raise errors.BadReaderParamsError(
                    "Expected a single namespace string")

        if cls.FILTERS_PARAM in params:
            filters = params[cls.FILTERS_PARAM]
            if not isinstance(filters, list):
                raise errors.BadReaderParamsError(
                    "Expected list for filters parameter")
            for f in filters:
                if not isinstance(f, (tuple, list)):
                    raise errors.BadReaderParamsError(
                        "Filter should be a tuple or list: "
                        "%s", f)
                if len(f) != 3:
                    raise errors.BadReaderParamsError(
                        "Filter should be a 3-tuple: %s", f)
                prop, op, _ = f
                if not isinstance(prop, str):
                    raise errors.BadReaderParamsError(
                        "Property should be string: %s", prop)
                if not isinstance(op, str):
                    raise errors.BadReaderParamsError(
                        "Operator should be string: %s", op)
Example #15
class GCSInputReader(map_job.InputReader):
    """Input reader from Google Cloud Storage using the cloudstorage library.

  Required configuration in the mapper_spec.input_reader dictionary.
    BUCKET_NAME_PARAM: name of the bucket to use. No "/" prefix or suffix.
    OBJECT_NAMES_PARAM: a list of object names or prefixes. All objects must be
      in the BUCKET_NAME_PARAM bucket. If the name ends with a * it will be
      treated as prefix and all objects with matching names will be read.
      Entries should not start with a slash unless that is part of the object's
      name. An example list could be:
      ["my-1st-input-file", "directory/my-2nd-file", "some/other/dir/input-*"]
      To retrieve all files, "*" will match every object in the bucket. If a
      file is listed twice or is covered by multiple prefixes, it will be read
      twice; there is no de-duplication.

  Optional configuration in the mapper_spec.input_reader dictionary.
    BUFFER_SIZE_PARAM: the size of the read buffer for each file handle.
    PATH_FILTER_PARAM: an instance of PathFilter. PathFilter is a predicate
      on which filenames to read.
    DELIMITER_PARAM: str. The delimiter that signifies directory.
      If you have too many files to shard on the granularity of individual
      files, you can specify this to enable shallow splitting. In this mode,
      the reader only goes one level deep during "*" expansion and stops when
      the delimiter is encountered.
  """

    COUNTER_FILE_READ = "file-read"
    COUNTER_FILE_MISSING = "file-missing"

    BUCKET_NAME_PARAM = "bucket_name"
    OBJECT_NAMES_PARAM = "objects"
    BUFFER_SIZE_PARAM = "buffer_size"
    DELIMITER_PARAM = "delimiter"
    PATH_FILTER_PARAM = "path_filter"

    _ACCOUNT_ID_PARAM = "account_id"

    _JSON_PICKLE = "pickle"
    _STRING_MAX_FILES_LISTED = 10

    def __init__(self,
                 filenames,
                 index=0,
                 buffer_size=None,
                 _account_id=None,
                 delimiter=None,
                 path_filter=None):
        """Initialize a GoogleCloudStorageInputReader instance.

    Args:
      filenames: A list of Google Cloud Storage filenames of the form
        '/bucket/objectname'.
      index: Index of the next filename to read.
      buffer_size: The size of the read buffer, None to use default.
      _account_id: Internal use only. See cloudstorage documentation.
      delimiter: Delimiter used as path separator. See class doc.
      path_filter: An instance of PathFilter.
    """
        super(GCSInputReader, self).__init__()
        self._filenames = filenames
        self._index = index
        self._buffer_size = buffer_size
        self._account_id = _account_id
        self._delimiter = delimiter
        self._bucket = None
        self._bucket_iter = None
        self._path_filter = path_filter
        self._slice_ctx = None

    def _next_file(self):
        """Find next filename.

    self._filenames may need to be expanded via listbucket.

    Returns:
      None if no more files are left; the next filename otherwise.
    """
        while True:
            if self._bucket_iter:
                try:
                    return next(self._bucket_iter).filename
                except StopIteration:
                    self._bucket_iter = None
                    self._bucket = None
            if self._index >= len(self._filenames):
                return
            filename = self._filenames[self._index]
            self._index += 1
            if self._delimiter is None or not filename.endswith(
                    self._delimiter):
                return filename
            self._bucket = cloudstorage.listbucket(filename,
                                                   delimiter=self._delimiter)
            self._bucket_iter = iter(self._bucket)

    @classmethod
    def validate(cls, job_config):
        """Validate mapper specification.

    Args:
      job_config: map_job.JobConfig.

    Raises:
      BadReaderParamsError: if the specification is invalid for any reason such
        as missing the bucket name or providing an invalid bucket name.
    """
        reader_params = job_config.input_reader_params

        if cls.BUCKET_NAME_PARAM not in reader_params:
            raise errors.BadReaderParamsError(
                "%s is required for Google Cloud Storage" %
                cls.BUCKET_NAME_PARAM)
        try:
            cloudstorage.validate_bucket_name(
                reader_params[cls.BUCKET_NAME_PARAM])
        except ValueError as error:
            raise errors.BadReaderParamsError("Bad bucket name, %s" % (error))

        if cls.OBJECT_NAMES_PARAM not in reader_params:
            raise errors.BadReaderParamsError(
                "%s is required for Google Cloud Storage" %
                cls.OBJECT_NAMES_PARAM)
        filenames = reader_params[cls.OBJECT_NAMES_PARAM]
        if not isinstance(filenames, list):
            raise errors.BadReaderParamsError(
                "Object name list is not a list but a %s" %
                filenames.__class__.__name__)
        for filename in filenames:
            if not isinstance(filename, six.string_types):
                raise errors.BadReaderParamsError(
                    "Object name is not a string but a %s" %
                    filename.__class__.__name__)

        if cls.DELIMITER_PARAM in reader_params:
            delimiter = reader_params[cls.DELIMITER_PARAM]
            if not isinstance(delimiter, six.string_types):
                raise errors.BadReaderParamsError(
                    "%s is not a string but a %s" %
                    (cls.DELIMITER_PARAM, type(delimiter)))

        if cls.BUFFER_SIZE_PARAM in reader_params:
            buffer_size = reader_params[cls.BUFFER_SIZE_PARAM]
            if not isinstance(buffer_size, int):
                raise errors.BadReaderParamsError(
                    "%s is not an int but a %s" %
                    (cls.BUFFER_SIZE_PARAM, type(buffer_size)))

        if cls.PATH_FILTER_PARAM in reader_params:
            path_filter = reader_params[cls.PATH_FILTER_PARAM]
            if not isinstance(path_filter, PathFilter):
                raise errors.BadReaderParamsError(
                    "%s is not an instance of PathFilter but %s." %
                    (cls.PATH_FILTER_PARAM, type(path_filter)))
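
# A minimal sketch of driving the reader by hand, assuming a working
# cloudstorage environment; the filenames are invented:
#
#     reader = GCSInputReader(["/my-bucket/dir/input-0", "/my-bucket/dir/input-1"])
#     print(reader._next_file())  # "/my-bucket/dir/input-0"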