def validate(cls, mapper_spec):
    """Check that mapper_spec targets this reader and names its input files.

    Args:
      mapper_spec: mapper specification exposing input_reader_class() and
        a params mapping.

    Raises:
      BadReaderParamsError: if the spec's input reader class is not cls, or
        if params has no cls.FILES_PARAM entry.
    """
    reader_class = mapper_spec.input_reader_class()
    if reader_class != cls:
        raise errors.BadReaderParamsError("Input reader class mismatch")

    if cls.FILES_PARAM not in mapper_spec.params:
        raise errors.BadReaderParamsError("Missing files parameter.")
def _validate_filters(cls, filters, model_class):
    """Validate user supplied filters.

    Validate filters are on existing properties and filter values
    have valid semantics.

    Args:
      filters: user supplied filters. Each filter should be a list or tuple of
        format (<property_name_as_str>, <query_operator_as_str>,
        <value_of_certain_type>). Value type is up to the property's type.
      model_class: the db.Model class for the entity type to apply filters on.

    Raises:
      BadReaderParamsError: if any filter is invalid in any way.
    """
    if not filters:
        return

    properties = model_class.properties()

    for f in filters:
        prop, _, val = f
        if prop not in properties:
            # Interpolate here: exception constructors do not apply
            # %-formatting to extra positional arguments.
            raise errors.BadReaderParamsError(
                "Property %s is not defined for entity type %s"
                % (prop, model_class.kind()))

        # Let the property itself decide whether the value is acceptable.
        try:
            properties[prop].validate(val)
        except db.BadValueError as e:
            raise errors.BadReaderParamsError(e)
def validate(cls, job_config):
    """Validate the datastore input reader parameters in job_config.

    Runs the parent class validation first, then checks every supported
    parameter present in job_config.input_reader_params.

    Args:
      job_config: job configuration exposing input_reader_params, a mapping
        of reader parameter names to values.

    Raises:
      BadReaderParamsError: if entity_kind is missing, batch_size is not a
        positive integer, keys_only cannot be evaluated as a boolean,
        namespace is neither a string nor None, or filters is not a list of
        (property_str, operator_str, value) triples.
    """
    super(AbstractDatastoreInputReader, cls).validate(job_config)
    params = job_config.input_reader_params

    if cls.ENTITY_KIND_PARAM not in params:
        raise errors.BadReaderParamsError(
            "Missing input reader parameter 'entity_kind'")

    if cls.BATCH_SIZE_PARAM in params:
        # int() raises TypeError for None and other non-numeric objects and
        # ValueError for malformed strings; report both as bad params.
        try:
            batch_size = int(params[cls.BATCH_SIZE_PARAM])
        except (TypeError, ValueError) as e:
            raise errors.BadReaderParamsError("Bad batch size: %s" % e)
        if batch_size < 1:
            raise errors.BadReaderParamsError(
                "Bad batch size: %s" % batch_size)

    keys_only = params.get(cls.KEYS_ONLY_PARAM, False)
    try:
        bool(keys_only)
    except Exception:
        raise errors.BadReaderParamsError(
            "keys_only expects a boolean value but got %s" % (keys_only,))

    if cls.NAMESPACE_PARAM in params:
        if not isinstance(params[cls.NAMESPACE_PARAM], (str, type(None))):
            raise errors.BadReaderParamsError(
                "Expected a single namespace string")

    if cls.FILTERS_PARAM in params:
        filters = params[cls.FILTERS_PARAM]
        if not isinstance(filters, list):
            raise errors.BadReaderParamsError(
                "Expected list for filters parameter")
        for f in filters:
            # Values are wrapped in a 1-tuple before %-formatting so that a
            # tuple-valued filter is printed rather than unpacked.
            if not isinstance(f, (tuple, list)):
                raise errors.BadReaderParamsError(
                    "Filter should be a tuple or list: %s" % (f,))
            if len(f) != 3:
                raise errors.BadReaderParamsError(
                    "Filter should be a 3-tuple: %s" % (f,))
            prop, op, _ = f
            if not isinstance(prop, str):
                raise errors.BadReaderParamsError(
                    "Property should be string: %s" % (prop,))
            if not isinstance(op, str):
                raise errors.BadReaderParamsError(
                    "Operator should be string: %s" % (op,))
def validate(cls, job_config):
    """Check the Google Cloud Storage reader parameters of a job.

    Args:
      job_config: map_job.JobConfig.

    Raises:
      BadReaderParamsError: if the bucket name or the object name list is
        missing or malformed, or if an optional parameter (delimiter,
        buffer size, path filter) has the wrong type.
    """
    params = job_config.input_reader_params

    # Bucket name: mandatory and checked by the cloudstorage library.
    if cls.BUCKET_NAME_PARAM not in params:
        raise errors.BadReaderParamsError(
            "%s is required for Google Cloud Storage" %
            cls.BUCKET_NAME_PARAM)
    bucket_name = params[cls.BUCKET_NAME_PARAM]
    try:
        cloudstorage.validate_bucket_name(bucket_name)
    except ValueError as error:
        raise errors.BadReaderParamsError("Bad bucket name, %s" % (error))

    # Object names: mandatory list of strings.
    if cls.OBJECT_NAMES_PARAM not in params:
        raise errors.BadReaderParamsError(
            "%s is required for Google Cloud Storage" %
            cls.OBJECT_NAMES_PARAM)
    object_names = params[cls.OBJECT_NAMES_PARAM]
    if not isinstance(object_names, list):
        raise errors.BadReaderParamsError(
            "Object name list is not a list but a %s" %
            object_names.__class__.__name__)
    for object_name in object_names:
        if isinstance(object_name, six.string_types):
            continue
        raise errors.BadReaderParamsError(
            "Object name is not a string but a %s" %
            object_name.__class__.__name__)

    # Optional parameters, checked in a fixed order:
    # (parameter name, accepted type(s), message template).
    optional_checks = (
        (cls.DELIMITER_PARAM, six.string_types,
         "%s is not a string but a %s"),
        (cls.BUFFER_SIZE_PARAM, int,
         "%s is not an int but a %s"),
        (cls.PATH_FILTER_PARAM, PathFilter,
         "%s is not an instance of PathFilter but %s."),
    )
    for param_name, accepted, template in optional_checks:
        if param_name not in params:
            continue
        value = params[param_name]
        if not isinstance(value, accepted):
            raise errors.BadReaderParamsError(
                template % (param_name, type(value)))
def validate(cls, job_config):
    """Validate the entity kind and batch size reader parameters.

    Runs the parent class validation first.

    Args:
      job_config: job configuration exposing input_reader_params, a mapping
        of reader parameter names to values.

    Raises:
      BadReaderParamsError: if entity_kind is missing or batch_size is not
        a positive integer.
    """
    super(AbstractDatastoreInputReader, cls).validate(job_config)
    params = job_config.input_reader_params

    if cls.ENTITY_KIND_PARAM not in params:
        raise errors.BadReaderParamsError(
            "Missing input reader parameter 'entity_kind'")

    if cls.BATCH_SIZE_PARAM in params:
        # "except X as e" is accepted by Python 2.6+ and Python 3, unlike the
        # comma form. int() raises TypeError for None and similar objects.
        try:
            batch_size = int(params[cls.BATCH_SIZE_PARAM])
        except (TypeError, ValueError) as e:
            raise errors.BadReaderParamsError("Bad batch size: %s" % e)
        if batch_size < 1:
            raise errors.BadReaderParamsError(
                "Bad batch size: %s" % batch_size)
def _validate_filters_ndb(cls, filters, model_class):
    """Validate ndb.Model filters.

    Args:
      filters: user supplied filters, each a
        (property_name, operator, value) triple.
      model_class: the model class whose _properties mapping and
        _get_kind() describe the entity type the filters apply to.

    Raises:
      BadReaderParamsError: if a filter names an undefined property or the
        property rejects the filter value with db.BadValueError.
    """
    if not filters:
        return

    properties = model_class._properties

    for f in filters:
        prop, _, val = f
        if prop not in properties:
            # Interpolate here: exception constructors do not apply
            # %-formatting to extra positional arguments.
            raise errors.BadReaderParamsError(
                "Property %s is not defined for entity type %s"
                % (prop, model_class._get_kind()))

        # Let the property itself decide whether the value is acceptable.
        try:
            properties[prop]._do_validate(val)
        except db.BadValueError as e:
            raise errors.BadReaderParamsError(e)
def validate(cls, job_config):
    """Validate that the configured entity kind names an importable model.

    Runs the parent class validation first, then resolves the entity kind
    with util.for_name.

    Args:
      job_config: job configuration exposing input_reader_params, a mapping
        that must contain cls.ENTITY_KIND_PARAM.

    Raises:
      BadReaderParamsError: if util.for_name raises ImportError for the
        entity kind.
    """
    super(ModelDatastoreInputReader, cls).validate(job_config)
    params = job_config.input_reader_params
    entity_kind = params[cls.ENTITY_KIND_PARAM]

    # Resolved only to prove the kind is importable; the result is not used.
    # "except X as e" is accepted by Python 2.6+ and Python 3.
    try:
        util.for_name(entity_kind)
    except ImportError as e:
        raise errors.BadReaderParamsError("Bad entity kind: %s" % e)
def validate(cls, job_config):
    """Validate the count and string length reader parameters.

    Runs the parent class validation first.

    Args:
      job_config: job configuration exposing input_reader_params, a mapping
        of reader parameter names to values.

    Raises:
      BadReaderParamsError: if cls.COUNT is missing or is not a positive
        int, or if cls.STRING_LENGTH is present but is not a positive int.
    """
    super(SampleInputReader, cls).validate(job_config)
    params = job_config.input_reader_params

    if cls.COUNT not in params:
        raise errors.BadReaderParamsError("Must specify %s" % cls.COUNT)
    count = params[cls.COUNT]
    if not isinstance(count, int):
        raise errors.BadReaderParamsError(
            "%s should be an int but is %s" % (cls.COUNT, type(count)))
    if count <= 0:
        # The template needs the parameter name; without it the raw "%s"
        # placeholder would be shown to the user.
        raise errors.BadReaderParamsError(
            "%s should be a positive int" % cls.COUNT)

    if cls.STRING_LENGTH in params:
        string_length = params[cls.STRING_LENGTH]
        if not (isinstance(string_length, int) and string_length > 0):
            raise errors.BadReaderParamsError(
                "%s should be a positive int but is %s" %
                (cls.STRING_LENGTH, string_length))
def validate(cls, job_config):
    """Validate mapper specification.

    Args:
      job_config: map_job.JobConfig.

    Raises:
      BadReaderParamsError: if the specification is invalid for any reason
        such as missing the bucket name or providing an invalid bucket name.
    """
    reader_params = job_config.input_reader_params

    if cls.BUCKET_NAME_PARAM not in reader_params:
        raise errors.BadReaderParamsError(
            "%s is required for Google Cloud Storage" %
            cls.BUCKET_NAME_PARAM)

    # "except X as error" is accepted by Python 2.6+ and Python 3, unlike
    # the comma form.
    try:
        cloudstorage.validate_bucket_name(
            reader_params[cls.BUCKET_NAME_PARAM])
    except ValueError as error:
        raise errors.BadReaderParamsError("Bad bucket name, %s" % (error))
def validate(cls, job_config):
    """Confirm that job_config was built for this input reader class.

    Only the input reader class is checked here.

    Args:
      job_config: an instance of map_job.JobConfig.

    Raises:
      errors.BadReaderParamsError: if job_config.input_reader_cls is not
        cls.
    """
    configured_cls = job_config.input_reader_cls
    if configured_cls != cls:
        message = "Expect input reader class %r, got %r." % (
            cls, configured_cls)
        raise errors.BadReaderParamsError(message)
def validate(cls, job_config):
    """Validate the entity kind and, when given, the filters on it.

    Runs the parent class validation first. The entity kind is resolved to
    a model class with util.for_name; filters are then checked with the db
    or ndb validator depending on whether that class derives from db.Model,
    and finally handed to property_range.PropertyRange.

    Args:
      job_config: job configuration exposing input_reader_params.

    Raises:
      BadReaderParamsError: if the entity kind cannot be imported. The
        filter validators and PropertyRange may raise as well.
    """
    super(ModelDatastoreInputReader, cls).validate(job_config)
    reader_params = job_config.input_reader_params
    kind = reader_params[cls.ENTITY_KIND_PARAM]

    try:
        model_class = util.for_name(kind)
    except ImportError as e:
        raise errors.BadReaderParamsError("Bad entity kind: %s" % e)

    # Nothing more to check when the job supplies no filters.
    if cls.FILTERS_PARAM not in reader_params:
        return

    user_filters = reader_params[cls.FILTERS_PARAM]
    is_db_model = issubclass(model_class, db.Model)
    check_filters = (
        cls._validate_filters if is_db_model else cls._validate_filters_ndb)
    check_filters(user_filters, model_class)
    property_range.PropertyRange(user_filters, kind)
def validate(cls, job_config):
    """Validate the entity kind and filters for the raw datastore reader.

    Runs the parent class validation first. A dot in the entity kind is
    only logged as a warning; it does not fail validation.

    Args:
      job_config: job configuration exposing input_reader_params.

    Raises:
      BadReaderParamsError: if any filter uses an operator other than "=".
    """
    super(DatastoreInputReader, cls).validate(job_config)
    params = job_config.input_reader_params
    entity_kind = params[cls.ENTITY_KIND_PARAM]

    if "." in entity_kind:
        logging.warning(
            ". detected in entity kind %s specified for reader %s. "
            "Assuming entity kind contains the dot.",
            entity_kind, cls.__name__)

    if cls.FILTERS_PARAM in params:
        for f in params[cls.FILTERS_PARAM]:
            if f[1] != "=":
                # The filter is a tuple or list, so it is wrapped in a
                # 1-tuple to be printed as a single %s value.
                raise errors.BadReaderParamsError(
                    "Only equality filters are supported: %s" % (f,))
def _get_range_from_filters(cls, filters, model_class):
    """Get property range from filters user provided.

    This method also validates there is one and only one closed range on a
    single property.

    Args:
      filters: user supplied filters. Each filter should be a list or tuple of
        format (<property_name_as_str>, <query_operator_as_str>,
        <value_of_certain_type>). Value type should satisfy the property's
        type.
      model_class: the model class for the entity type to apply filters on.

    Returns:
      a tuple of (property, start_filter, end_filter). property is the model's
      field that the range is about. start_filter and end_filter define the
      start and the end of the range. (None, None, None) if no range is found.

    Raises:
      BadReaderParamsError: if any filter is invalid in any way.
    """
    if not filters:
        return None, None, None

    range_property = None
    start_val = None
    end_val = None
    start_filter = None
    end_filter = None
    for f in filters:
        prop, op, val = f

        if op in (">", ">=", "<", "<="):
            if range_property and range_property != prop:
                raise errors.BadReaderParamsError(
                    "Range on only one property is supported.")
            range_property = prop

            if val is None:
                raise errors.BadReaderParamsError(
                    "Range can't be None in filter %s" % (f,))

            if op in (">", ">="):
                # Lower bound of the range.
                if start_val is not None:
                    raise errors.BadReaderParamsError(
                        "Operation %s is specified more than once." % (op,))
                start_val = val
                start_filter = f
            else:
                # Upper bound of the range.
                if end_val is not None:
                    raise errors.BadReaderParamsError(
                        "Operation %s is specified more than once." % (op,))
                end_val = val
                end_filter = f
        elif op != "=":
            raise errors.BadReaderParamsError(
                "Only < <= > >= = are supported as operation. Got %s" % (op,))

    if not range_property:
        return None, None, None

    if start_val is None or end_val is None:
        raise errors.BadReaderParamsError(
            "Filter should contains a complete range on property %s"
            % (range_property,))

    if issubclass(model_class, db.Model):
        property_obj = model_class.properties()[range_property]
    else:
        property_obj = model_class._properties[range_property]

    # Build the tuple of supported property classes from the dict keys.
    # Converting each mapping to a tuple first works on Python 2 and 3,
    # whereas adding two .keys() results fails on Python 3.
    supported_properties = (
        tuple(_DISCRETE_PROPERTY_SPLIT_FUNCTIONS) +
        tuple(_CONTINUOUS_PROPERTY_SPLIT_FUNCTIONS))
    if not isinstance(property_obj, supported_properties):
        raise errors.BadReaderParamsError(
            "Filtered property %s is not supported by sharding."
            % (range_property,))

    if not start_val < end_val:
        raise errors.BadReaderParamsError(
            "Start value %s should be smaller than end value %s"
            % (start_val, end_val))

    return property_obj, start_filter, end_filter
class AbstractDatastoreInputReader(input_reader.InputReader):
    """Implementation of an abstract base class for a Datastore input reader."""

    # Batch size used when the job does not supply BATCH_SIZE_PARAM.
    _BATCH_SIZE = 50

    _MAX_SHARD_COUNT = 256

    # Up to this many namespaces, split_input shards each namespace by key;
    # above it, sharding is done over namespace ranges instead.
    MAX_NAMESPACES_FOR_KEY_SHARD = 10

    _APP_PARAM = "_app"

    # Names of the reader parameters.
    NAMESPACE_PARAM = "namespace"
    ENTITY_KIND_PARAM = "entity_kind"
    KEYS_ONLY_PARAM = "keys_only"
    BATCH_SIZE_PARAM = "batch_size"
    KEY_RANGE_PARAM = "key_range"
    FILTERS_PARAM = "filters"

    # Iterator class handed to the key ranges iterator factory.
    _KEY_RANGE_ITER_CLS = db_iters.AbstractKeyRangeIterator

    def __init__(self, iterator):
        """Create new AbstractDatastoreInputReader object.

        This is internal constructor. Use split_input to create readers
        instead.

        Args:
          iterator: an iterator that generates objects for this input reader.
        """
        self._iter = iterator

    def __iter__(self):
        """Yields whatever the internal iterator yields."""
        for o in self._iter:
            yield o

    def __str__(self):
        """Returns the string representation of this InputReader."""
        return repr(self._iter)

    def to_json(self):
        """Return the JSON state of the internal iterator."""
        return self._iter.to_json()

    @classmethod
    def from_json(cls, state):
        """Rebuild a reader from state produced by to_json."""
        return cls(db_iters.RangeIteratorFactory.from_json(state))

    @classmethod
    def _get_query_spec(cls, params):
        """Construct a model.QuerySpec from the reader parameters.

        Args:
          params: mapping of reader parameter names to values; must contain
            ENTITY_KIND_PARAM.

        Returns:
          a model.QuerySpec.
        """
        entity_kind = params[cls.ENTITY_KIND_PARAM]
        filters = params.get(cls.FILTERS_PARAM)
        app = params.get(cls._APP_PARAM)
        ns = params.get(cls.NAMESPACE_PARAM)

        return model.QuerySpec(
            entity_kind=cls._get_raw_entity_kind(entity_kind),
            keys_only=bool(params.get(cls.KEYS_ONLY_PARAM, False)),
            filters=filters,
            batch_size=int(params.get(cls.BATCH_SIZE_PARAM, cls._BATCH_SIZE)),
            model_class_path=entity_kind,
            app=app,
            ns=ns)

    @classmethod
    def split_input(cls, job_config):
        """Split the input into readers, one per non-empty shard.

        Args:
          job_config: job configuration exposing shard_count and
            input_reader_params.

        Returns:
          a list of readers, or None when no namespace was specified and
          no namespace keys were found.
        """
        shard_count = job_config.shard_count
        params = job_config.input_reader_params
        query_spec = cls._get_query_spec(params)

        if query_spec.ns is not None:
            # A single explicit namespace: shard it by key.
            k_ranges = cls._to_key_ranges_by_shard(
                query_spec.app, [query_spec.ns], shard_count, query_spec)
        else:
            # Ask for one more than the threshold so that "too many" can be
            # told apart from "exactly at the threshold".
            ns_keys = namespace_range.get_namespace_keys(
                query_spec.app, cls.MAX_NAMESPACES_FOR_KEY_SHARD + 1)
            if not ns_keys:
                # No namespaces found; there is nothing to split.
                return None
            elif len(ns_keys) <= cls.MAX_NAMESPACES_FOR_KEY_SHARD:
                # Few namespaces: shard each one by key.
                namespaces = [ns_key.name() or "" for ns_key in ns_keys]
                k_ranges = cls._to_key_ranges_by_shard(
                    query_spec.app, namespaces, shard_count, query_spec)
            else:
                # Many namespaces: split over namespace ranges instead.
                ns_ranges = namespace_range.NamespaceRange.split(
                    n=shard_count,
                    contiguous=False,
                    can_query=lambda: True,
                    _app=query_spec.app)
                k_ranges = [
                    key_ranges.KeyRangesFactory.create_from_ns_range(ns_range)
                    for ns_range in ns_ranges
                ]

        iters = [
            db_iters.RangeIteratorFactory.create_key_ranges_iterator(
                r, query_spec, cls._KEY_RANGE_ITER_CLS)
            for r in k_ranges
        ]

        return [cls(i) for i in iters]

    @classmethod
    def _to_key_ranges_by_shard(cls, app, namespaces, shard_count, query_spec):
        """Get a list of key_ranges.KeyRanges objects, one for each shard.

        This method uses scatter index to split each namespace into pieces
        and assign those pieces to shards.

        Args:
          app: app_id in str.
          namespaces: a list of namespaces in str.
          shard_count: number of shards to split.
          query_spec: model.QuerySpec.

        Returns:
          a list of key_ranges.KeyRanges objects.
        """
        key_ranges_by_ns = []
        for namespace in namespaces:
            ranges = cls._split_ns_by_scatter(
                shard_count, namespace, query_spec.entity_kind, app)
            # _split_ns_by_scatter pads with trailing Nones; shuffle so the
            # Nones of different namespaces do not all land on the same
            # shard indexes.
            random.shuffle(ranges)
            key_ranges_by_ns.append(ranges)

        # The i-th piece of every namespace goes to the i-th shard.
        ranges_by_shard = [[] for _ in range(shard_count)]
        for ranges in key_ranges_by_ns:
            for i, k_range in enumerate(ranges):
                if k_range:
                    ranges_by_shard[i].append(k_range)

        # Shards that received no range are dropped.
        key_ranges_by_shard = []
        for ranges in ranges_by_shard:
            if ranges:
                key_ranges_by_shard.append(
                    key_ranges.KeyRangesFactory.create_from_list(ranges))
        return key_ranges_by_shard

    @classmethod
    def _split_ns_by_scatter(cls, shard_count, namespace, raw_entity_kind,
                             app):
        """Split a namespace by scatter index into key_range.KeyRange.

        TODO: Power this with key_range.KeyRange.compute_split_points.

        Args:
          shard_count: number of shards.
          namespace: namespace name to split. str.
          raw_entity_kind: low level datastore API entity kind.
          app: app id in str.

        Returns:
          A list of key_range.KeyRange objects. If there are not enough
          entities to splits into requested shards, the returned list will
          contain KeyRanges ordered lexicographically with any Nones
          appearing at the end.
        """
        if shard_count == 1:
            # A single shard covers the whole namespace; no query needed.
            return [key_range.KeyRange(namespace=namespace, _app=app)]

        ds_query = datastore.Query(kind=raw_entity_kind,
                                   namespace=namespace,
                                   _app=app,
                                   keys_only=True)
        ds_query.Order("__scatter__")
        # Fetch more keys than shards, then sample them down below.
        oversampling_factor = 32
        random_keys = ds_query.Get(shard_count * oversampling_factor)

        if not random_keys:
            # No keys to split on: one range for the whole namespace,
            # padded with Nones for the remaining shards.
            return ([key_range.KeyRange(namespace=namespace, _app=app)] +
                    [None] * (shard_count - 1))

        random_keys.sort()

        if len(random_keys) >= shard_count:
            random_keys = cls._choose_split_points(random_keys, shard_count)

        k_ranges = []

        # Open-ended range below the first split point.
        k_ranges.append(
            key_range.KeyRange(key_start=None,
                               key_end=random_keys[0],
                               direction=key_range.KeyRange.ASC,
                               include_start=False,
                               include_end=False,
                               namespace=namespace,
                               _app=app))

        # Half-open ranges between consecutive split points.
        for i in range(0, len(random_keys) - 1):
            k_ranges.append(
                key_range.KeyRange(key_start=random_keys[i],
                                   key_end=random_keys[i + 1],
                                   direction=key_range.KeyRange.ASC,
                                   include_start=True,
                                   include_end=False,
                                   namespace=namespace,
                                   _app=app))

        # Open-ended range from the last split point upwards.
        k_ranges.append(
            key_range.KeyRange(key_start=random_keys[-1],
                               key_end=None,
                               direction=key_range.KeyRange.ASC,
                               include_start=True,
                               include_end=False,
                               namespace=namespace,
                               _app=app))

        if len(k_ranges) < shard_count:
            # Pad so that the caller always gets shard_count entries.
            k_ranges += [None] * (shard_count - len(k_ranges))
        return k_ranges

    @classmethod
    def _choose_split_points(cls, sorted_keys, shard_count):
        """Returns the best split points given a random set of datastore.Keys.

        Args:
          sorted_keys: sorted list of keys, at least shard_count long.
          shard_count: number of shards.

        Returns:
          a list of shard_count - 1 keys taken at evenly spaced indexes.
        """
        assert len(sorted_keys) >= shard_count
        index_stride = len(sorted_keys) / float(shard_count)
        return [
            sorted_keys[int(round(index_stride * i))]
            for i in range(1, shard_count)
        ]

    @classmethod
    def validate(cls, job_config):
        """Validate the datastore input reader parameters in job_config.

        Runs the parent class validation first, then checks every supported
        parameter present in job_config.input_reader_params.

        Args:
          job_config: job configuration exposing input_reader_params.

        Raises:
          BadReaderParamsError: if entity_kind is missing, batch_size is not
            a positive integer, keys_only cannot be evaluated as a boolean,
            namespace is neither a string nor None, or filters is not a
            list of (property_str, operator_str, value) triples.
        """
        super(AbstractDatastoreInputReader, cls).validate(job_config)
        params = job_config.input_reader_params

        # basestring only exists on Python 2, where it covers both str and
        # unicode; fall back to str elsewhere.
        try:
            string_types = (basestring,)
        except NameError:
            string_types = (str,)

        if cls.ENTITY_KIND_PARAM not in params:
            raise errors.BadReaderParamsError(
                "Missing input reader parameter 'entity_kind'")

        if cls.BATCH_SIZE_PARAM in params:
            # int() raises TypeError for None and similar objects and
            # ValueError for malformed strings; report both as bad params.
            try:
                batch_size = int(params[cls.BATCH_SIZE_PARAM])
            except (TypeError, ValueError) as e:
                raise errors.BadReaderParamsError("Bad batch size: %s" % e)
            if batch_size < 1:
                raise errors.BadReaderParamsError(
                    "Bad batch size: %s" % batch_size)

        keys_only = params.get(cls.KEYS_ONLY_PARAM, False)
        try:
            bool(keys_only)
        except Exception:
            raise errors.BadReaderParamsError(
                "keys_only expects a boolean value but got %s" % (keys_only,))

        if cls.NAMESPACE_PARAM in params:
            if not isinstance(params[cls.NAMESPACE_PARAM],
                              string_types + (type(None),)):
                raise errors.BadReaderParamsError(
                    "Expected a single namespace string")

        if cls.FILTERS_PARAM in params:
            filters = params[cls.FILTERS_PARAM]
            if not isinstance(filters, list):
                raise errors.BadReaderParamsError(
                    "Expected list for filters parameter")
            for f in filters:
                # Values are wrapped in a 1-tuple before %-formatting so
                # that a tuple-valued filter is printed, not unpacked.
                if not isinstance(f, (tuple, list)):
                    raise errors.BadReaderParamsError(
                        "Filter should be a tuple or list: %s" % (f,))
                if len(f) != 3:
                    raise errors.BadReaderParamsError(
                        "Filter should be a 3-tuple: %s" % (f,))
                prop, op, _ = f
                if not isinstance(prop, string_types):
                    raise errors.BadReaderParamsError(
                        "Property should be string: %s" % (prop,))
                if not isinstance(op, string_types):
                    raise errors.BadReaderParamsError(
                        "Operator should be string: %s" % (op,))
class GCSInputReader(map_job.InputReader):
    """Input reader from Google Cloud Storage using the cloudstorage library.

    Required configuration in the mapper_spec.input_reader dictionary.
      BUCKET_NAME_PARAM: name of the bucket to use. No "/" prefix or suffix.
      OBJECT_NAMES_PARAM: a list of object names or prefixes. All objects must
        be in the BUCKET_NAME_PARAM bucket. If the name ends with a * it will
        be treated as prefix and all objects with matching names will be read.
        Entries should not start with a slash unless that is part of the
        object's name. An example list could be:
        ["my-1st-input-file", "directory/my-2nd-file", "some/other/dir/input-*"]
        To retrieve all files "*" will match every object in the bucket. If a
        file is listed twice or is covered by multiple prefixes it will be
        read twice, there is no de-duplication.

    Optional configuration in the mapper_spec.input_reader dictionary.
      BUFFER_SIZE_PARAM: the size of the read buffer for each file handle.
      PATH_FILTER_PARAM: an instance of PathFilter. PathFilter is a predicate
        on which filenames to read.
      DELIMITER_PARAM: str. The delimiter that signifies directory.
        If you have too many files to shard on the granularity of individual
        files, you can specify this to enable shallow splitting. In this mode,
        the reader only goes one level deep during "*" expansion and stops
        when the delimiter is encountered.
    """

    # Counters.
    COUNTER_FILE_READ = "file-read"
    COUNTER_FILE_MISSING = "file-missing"

    # Names of the reader parameters.
    BUCKET_NAME_PARAM = "bucket_name"
    OBJECT_NAMES_PARAM = "objects"
    BUFFER_SIZE_PARAM = "buffer_size"
    DELIMITER_PARAM = "delimiter"
    PATH_FILTER_PARAM = "path_filter"

    # Internal parameters.
    _ACCOUNT_ID_PARAM = "account_id"

    # Other internal configuration constants.
    _JSON_PICKLE = "pickle"
    _STRING_MAX_FILES_LISTED = 10

    def __init__(self, filenames, index=0, buffer_size=None, _account_id=None,
                 delimiter=None, path_filter=None):
        """Initialize a GCSInputReader instance.

        Args:
          filenames: A list of Google Cloud Storage filenames of the form
            '/bucket/objectname'.
          index: Index of the next filename to read.
          buffer_size: The size of the read buffer, None to use default.
          _account_id: Internal use only. See cloudstorage documentation.
          delimiter: Delimiter used as path separator. See class doc.
          path_filter: An instance of PathFilter.
        """
        super(GCSInputReader, self).__init__()
        self._filenames = filenames
        self._index = index
        self._buffer_size = buffer_size
        self._account_id = _account_id
        self._delimiter = delimiter
        # Listing currently being expanded, and the iterator over it.
        self._bucket = None
        self._bucket_iter = None
        self._path_filter = path_filter
        self._slice_ctx = None

    def _next_file(self):
        """Find next filename.

        self._filenames may need to be expanded via listbucket.

        Returns:
          None if no more file is left. Filename otherwise.
        """
        while True:
            if self._bucket_iter:
                # Drain the listing in progress first. The next() builtin
                # works on Python 2.6+ and Python 3, unlike the .next()
                # method.
                try:
                    return next(self._bucket_iter).filename
                except StopIteration:
                    self._bucket_iter = None
                    self._bucket = None
            if self._index >= len(self._filenames):
                return
            filename = self._filenames[self._index]
            self._index += 1
            # Only names ending with the delimiter are expanded through a
            # bucket listing; anything else is returned as-is.
            if self._delimiter is None or not filename.endswith(
                    self._delimiter):
                return filename
            self._bucket = cloudstorage.listbucket(filename,
                                                   delimiter=self._delimiter)
            self._bucket_iter = iter(self._bucket)

    @classmethod
    def validate(cls, job_config):
        """Validate mapper specification.

        Args:
          job_config: map_job.JobConfig.

        Raises:
          BadReaderParamsError: if the specification is invalid for any reason
            such as missing the bucket name or providing an invalid bucket
            name.
        """
        reader_params = job_config.input_reader_params

        # basestring only exists on Python 2; fall back to str elsewhere.
        try:
            string_types = (basestring,)
        except NameError:
            string_types = (str,)

        # Bucket name is required.
        if cls.BUCKET_NAME_PARAM not in reader_params:
            raise errors.BadReaderParamsError(
                "%s is required for Google Cloud Storage" %
                cls.BUCKET_NAME_PARAM)
        try:
            cloudstorage.validate_bucket_name(
                reader_params[cls.BUCKET_NAME_PARAM])
        except ValueError as error:
            raise errors.BadReaderParamsError("Bad bucket name, %s" % (error))

        # Object name list is required.
        if cls.OBJECT_NAMES_PARAM not in reader_params:
            raise errors.BadReaderParamsError(
                "%s is required for Google Cloud Storage" %
                cls.OBJECT_NAMES_PARAM)
        filenames = reader_params[cls.OBJECT_NAMES_PARAM]
        if not isinstance(filenames, list):
            raise errors.BadReaderParamsError(
                "Object name list is not a list but a %s" %
                filenames.__class__.__name__)
        for filename in filenames:
            if not isinstance(filename, string_types):
                raise errors.BadReaderParamsError(
                    "Object name is not a string but a %s" %
                    filename.__class__.__name__)

        # Optional parameters.
        if cls.DELIMITER_PARAM in reader_params:
            delimiter = reader_params[cls.DELIMITER_PARAM]
            if not isinstance(delimiter, string_types):
                raise errors.BadReaderParamsError(
                    "%s is not a string but a %s" %
                    (cls.DELIMITER_PARAM, type(delimiter)))

        if cls.BUFFER_SIZE_PARAM in reader_params:
            buffer_size = reader_params[cls.BUFFER_SIZE_PARAM]
            if not isinstance(buffer_size, int):
                raise errors.BadReaderParamsError(
                    "%s is not an int but a %s" %
                    (cls.BUFFER_SIZE_PARAM, type(buffer_size)))

        if cls.PATH_FILTER_PARAM in reader_params:
            path_filter = reader_params[cls.PATH_FILTER_PARAM]
            if not isinstance(path_filter, PathFilter):
                raise errors.BadReaderParamsError(
                    "%s is not an instance of PathFilter but %s." %
                    (cls.PATH_FILTER_PARAM, type(path_filter)))