def testGlobalName(self):
    """Tests when the name has no dots in it.

    for_name must raise ImportError identifying the missing top-level module.
    """
    try:
        util.for_name("this_is_a_bad_module_name")
    except ImportError as e:  # 'as' form: valid in Python 2.6+ and Python 3.
        self.assertTrue(str(e).startswith(
            "Could not find 'this_is_a_bad_module_name' on path "))
    else:
        # Previously the test silently passed when no exception was raised.
        self.fail("Expected ImportError")
def run(self, job_name, sequence_num, namespace, output, complete_fn,
        mapreduce_pipeline_args):
    """Collects map/reduce output and marks the durable job complete or failed.

    Args:
        job_name: str. Name of the durable job being finalized.
        sequence_num: int. Run sequence number for the job.
        namespace: str. Datastore namespace holding the job entity.
        output: Map/reduce output location handed to the GCS input reader.
        complete_fn: str or None. Fully qualified name of an optional
            completion callback, resolved via util.for_name.
        mapreduce_pipeline_args: Arguments forwarded to the callback.
    """
    results = []
    try:
        iterator = input_readers.GoogleCloudStorageInputReader(output, 0)
        for file_reader in iterator:
            for item in file_reader:
                # Map/reduce puts reducer output into blobstore files as a
                # string obtained via "str(result)". Use AST as a safe
                # alternative to eval() to get the Python object back.
                results.append(ast.literal_eval(item))
        if complete_fn:
            util.for_name(complete_fn)(mapreduce_pipeline_args, results)
        with Namespace(namespace):
            db.run_in_transaction(
                DurableJobEntity._complete_job, job_name, sequence_num,
                MapReduceJob.build_output(self.root_pipeline_id, results))
    # Don't know what exceptions are currently, or will be in future,
    # thrown from Map/Reduce or Pipeline libraries; these are under
    # active development.
    #
    # pylint: disable=broad-except
    except Exception as ex:  # 'as' form: valid in Python 2.6+ and Python 3.
        logging.critical('Failed running map/reduce job %s: %s', job_name,
                         str(ex))
        common_utils.log_exception_origin()
        # NOTE: removed a dead `time_completed = time.time()` assignment that
        # was never used in this variant of the failure path.
        with Namespace(namespace):
            db.run_in_transaction(
                DurableJobEntity._fail_job, job_name, sequence_num,
                MapReduceJob.build_output(self.root_pipeline_id, results,
                                          str(ex)))
def testBadClass(self):
    """Tests when the class is found but the function name is missing."""
    try:
        util.for_name("__main__.TestHandlerWithArgs.missing")
    except ImportError as e:  # 'as' form: valid in Python 2.6+ and Python 3.
        self.assertEqual(
            "Could not find 'missing' on path '__main__.TestHandlerWithArgs'",
            str(e))
    else:
        # Previously the test silently passed when no exception was raised.
        self.fail("Expected ImportError")
def testBadModule(self):
    """Tests when the module name is bogus."""
    try:
        util.for_name("this_is_a_bad_module_name.stuff")
    except ImportError as e:  # 'as' form: valid in Python 2.6+ and Python 3.
        self.assertEqual(
            "Could not find 'stuff' on path 'this_is_a_bad_module_name'",
            str(e))
    else:
        # Previously the test silently passed when no exception was raised.
        self.fail("Expected ImportError")
def testBadFunction(self):
    """Tests when the module name is good but the function is missing."""
    try:
        util.for_name("__main__.does_not_exist")
    except ImportError as e:  # 'as' form: valid in Python 2.6+ and Python 3.
        self.assertEqual(
            "Could not find 'does_not_exist' on path '__main__'",
            str(e))
    else:
        # Previously the test silently passed when no exception was raised.
        self.fail("Expected ImportError")
def validate(cls, mapper_spec):
    """Validates mapper spec for the Django model input reader.

    Args:
        mapper_spec: MapperSpec whose params must name a resolvable entity
            kind and must not request a namespace (unsupported here).

    Raises:
        BadReaderParamsError: if a namespace is supplied or the entity kind
            cannot be imported.
    """
    super(DjangoModelInputReader, cls).validate(mapper_spec)
    params = _get_params(mapper_spec)
    if cls.NAMESPACE_PARAM in params:
        raise BadReaderParamsError("Namespaces are not supported.")
    entity_kind_name = params[cls.ENTITY_KIND_PARAM]
    # Fail fast if the model class cannot be resolved.
    try:
        util.for_name(entity_kind_name)
    except ImportError as e:  # 'as' form: valid in Python 2.6+ and Python 3.
        raise BadReaderParamsError("Bad entity kind: %s" % e)
def _to_map_job_config(cls,
                       mr_spec,
                       # TODO(user): Remove this parameter after it can be
                       # read from mr_spec.
                       queue_name):
    """Converts model.MapreduceSpec back to JobConfig.

    This method allows our internal methods to use JobConfig directly.
    This method also allows us to expose JobConfig as an API during
    execution, despite that it is not saved into datastore.

    Args:
        mr_spec: model.MapreduceSpec.
        queue_name: queue name.

    Returns:
        The JobConfig object for this job.
    """
    mapper_spec = mr_spec.mapper
    # 0 means all the old APIs before api_version is introduced.
    api_version = mr_spec.params.get("api_version", 0)
    old_api = api_version == 0

    # We can not always convert MapreduceSpec generated by older API
    # to JobConfig. Thus, mr framework should use/expose the returned JobConfig
    # object with caution when a job is started with an old API.
    # In this case, this method only tries not to blow up and assemble a
    # JobConfig object as accurate as possible.
    return cls(_lenient=old_api,
               job_name=mr_spec.name,
               job_id=mr_spec.mapreduce_id,
               # handler_spec from older API may not have map_job.Mapper type.
               mapper=util.for_name(mapper_spec.handler_spec),
               input_reader_cls=mapper_spec.input_reader_class(),
               input_reader_params=input_readers._get_params(mapper_spec),
               output_writer_cls=mapper_spec.output_writer_class(),
               output_writer_params=output_writers._get_params(mapper_spec),
               shard_count=mapper_spec.shard_count,
               queue_name=queue_name,
               user_params=mr_spec.params.get("user_params"),
               shard_max_attempts=mr_spec.params.get("shard_max_attempts"),
               done_callback_url=mr_spec.params.get("done_callback"),
               _force_writes=mr_spec.params.get("force_writes"),
               _base_path=mr_spec.params["base_path"],
               _task_max_attempts=mr_spec.params.get("task_max_attempts"),
               _task_max_data_processing_attempts=(
                   mr_spec.params.get("task_max_data_processing_attempts")),
               _hooks_cls=util.for_name(mr_spec.hooks_class_name),
               _app=mr_spec.params.get("app_id"),
               _api_version=api_version)
def output_writer_class(self):
    """Resolve the configured output writer class.

    Returns:
        The class object named by ``output_writer_spec``, or the unresolved
        falsy spec value (e.g. None or '') when no writer is configured.
    """
    if not self.output_writer_spec:
        # Mirror the original short-circuit: return the falsy spec itself.
        return self.output_writer_spec
    return util.for_name(self.output_writer_spec)
def _get_params(self, validator_parameter, name_prefix): """Retrieves additional user-supplied params for the job and validates them. Args: validator_parameter: name of the request parameter which supplies validator for this parameter set. name_prefix: common prefix for all parameter names in the request. Raises: Any exception raised by the 'params_validator' request parameter if the params fail to validate. """ params_validator = self.request.get(validator_parameter) user_params = {} for key in self.request.arguments(): if key.startswith(name_prefix): values = self.request.get_all(key) adjusted_key = key[len(name_prefix):] if len(values) == 1: user_params[adjusted_key] = values[0] else: user_params[adjusted_key] = values if params_validator: resolved_validator = util.for_name(params_validator) resolved_validator(user_params) return user_params
def input_reader_class(self):
    """Resolve the configured input reader class.

    Returns:
        The class object referenced by ``input_reader_spec``.
    """
    spec = self.input_reader_spec
    return util.for_name(spec)
def handler_for_name(fq_name):
    """Resolves and instantiates handler by fully qualified name.

    NOTE: This is a clone of a function in the map/reduce module which has
    also been taught that map and reduce functions may be marked with
    @classmethod, as opposed to only member functions of default-constructable
    classes or @staticmethod. It is applied as a monkey-patch to fix the base
    library.

    First resolves the name using for_name call. Then if it resolves to a
    class, instantiates a class, if it resolves to a method - instantiates the
    class and binds method to the instance.

    Args:
        fq_name: fully qualified name of something to find.

    Returns:
        handler instance which is ready to be called.
    """
    resolved = mapreduce_util.for_name(fq_name)
    if isinstance(resolved, (type, types.ClassType)):
        # A class (new- or old-style): instantiate it for a callable handler.
        return resolved()
    if isinstance(resolved, types.MethodType) and resolved.im_self is None:
        # An unbound method: construct the class and bind the method to it.
        return getattr(resolved.im_class(), resolved.__name__)
    # Plain function, classmethod, or staticmethod -- already callable.
    return resolved
def run(self, job_name, mapper_spec, shuffler_spec, reducer_spec,
        input_reader_spec, output_writer_spec=None, mapper_params=None,
        shuffler_params=None, reducer_params=None, shards=None,
        combiner_spec=None):
    """Chains map, (pluggable) shuffle, and reduce child pipelines.

    The shuffler implementation is chosen dynamically: shuffler_spec is
    resolved with util.for_name and instantiated with the map stage's output.
    Temporary files from the map and shuffle stages are cleaned up only after
    the reduce stage completes (pipeline.After ordering).
    """
    map_pipeline = yield MapPipeline(job_name, mapper_spec, input_reader_spec,
                                     params=mapper_params, shards=shards)
    shuffler_pipeline = yield util.for_name(shuffler_spec)(
        job_name, shuffler_params, map_pipeline)
    reducer_pipeline = yield mapreduce_pipeline.ReducePipeline(
        job_name, reducer_spec, output_writer_spec, reducer_params,
        shuffler_pipeline, combiner_spec=combiner_spec)
    # Defer cleanup of intermediate files until the reducer has finished.
    with pipeline.After(reducer_pipeline):
        all_temp_files = yield pipeline_common.Extend(
            map_pipeline, shuffler_pipeline)
        yield mapper_pipeline._CleanupPipeline(all_temp_files)
    # Final value of the whole pipeline is the reducer's output.
    yield pipeline_common.Return(reducer_pipeline)
def __iter__(self):
    """Create a generator for model instances for entities.

    Iterating through entities moves query range past the consumed entities.

    Yields:
        next model instance.
    """
    while self._current_key_range is not None:
        while True:
            query = self._current_key_range.make_ascending_query(
                util.for_name(self._entity_kind))
            batch = query.fetch(limit=self._batch_size)
            if not batch:
                # Exhausted this key range; move on to the next one.
                self._advance_key_range()
                break
            for instance in batch:
                # Consume the entity from the range before handing it out.
                self._current_key_range.advance(instance.key())
                yield instance
def __iter__(self):
    """Create a generator for entities or keys in the range.

    Iterating through entries moves query range past the consumed entries.

    Yields:
        next entry.
    """
    while True:
        model_class = util.for_name(self._entity_kind)
        query = self._key_range.make_ascending_query(
            model_class, self._keys_only)
        batch = query.fetch(limit=self.batch_size)
        if not batch:
            return
        for entry in batch:
            # Entities carry a key() accessor; raw keys are used as-is.
            key = entry.key() if hasattr(entry, 'key') else entry
            # Shrink the range so a resumed iteration skips consumed entries.
            self._key_range = key_range.KeyRange(
                key, self._key_range.key_end, self._key_range.direction,
                False, self._key_range.include_end)
            yield entry
def __iter__(self):
    """Yield Django model instances within the reader's key range."""
    krange = self._key_range
    # Namespaces are not supported by djangoappengine; nothing to yield.
    if krange.namespace:
        return

    queryset = util.for_name(self._query_spec.model_class_path).objects.all()
    if krange.key_start:
        lower_op = 'pk__gte' if krange.include_start else 'pk__gt'
        queryset = queryset.filter(**{lower_op: krange.key_start.id_or_name()})
    if krange.key_end:
        upper_op = 'pk__lte' if krange.include_end else 'pk__lt'
        queryset = queryset.filter(**{upper_op: krange.key_end.id_or_name()})

    queryset = queryset.order_by('pk')
    queryset = set_config(queryset, batch_size=self._query_spec.batch_size)
    if self._cursor:
        queryset = set_cursor(queryset, self._cursor)

    self._query = queryset
    for entity in self._query.iterator():
        yield entity
def run(self, job_id, job_class_str, kwargs):
    """Returns a coroutine which runs the job pipeline and stores results.

    Args:
        job_id: str. The ID of the job to run.
        job_class_str: str. Should uniquely identify each type of job.
        kwargs: dict(str : object). Extra arguments used to build the
            MapreducePipeline.

    Yields:
        MapreducePipeline. Ready to start processing. Expects the output of
        that pipeline to be sent back.
        StoreMapReduceResults. Will be constructed with whatever output the
        caller sends back to the coroutine.
    """
    job_class = mapreduce_util.for_name(job_class_str)
    # Record the root pipeline id so the job can later be traced back to the
    # pipeline that started it.
    job_class.register_start(
        job_id,
        metadata={
            job_class._OUTPUT_KEY_ROOT_PIPELINE_ID: self.root_pipeline_id  # pylint: disable=protected-access
        })

    # TODO(sll): Need try/except/mark-as-canceled here?
    output = yield mapreduce_pipeline.MapreducePipeline(**kwargs)
    yield StoreMapReduceResults(job_id, job_class_str, output)
def __iter__(self):
    """Create a generator for entities or keys in the range.

    Iterating through entries moves query range past the consumed entries.

    Yields:
        next entry.
    """
    while True:
        ascending = self._key_range.make_ascending_query(
            util.for_name(self._entity_kind), self._keys_only)
        fetched = ascending.fetch(limit=self.batch_size)
        if not fetched:
            return
        for item in fetched:
            if hasattr(item, 'key'):
                # A full entity: use its datastore key.
                consumed_key = item.key()
            else:
                # Keys-only mode: the item already is the key.
                consumed_key = item
            # Narrow the range so resumption skips what we already yielded.
            self._key_range = key_range.KeyRange(consumed_key,
                                                 self._key_range.key_end,
                                                 self._key_range.direction,
                                                 False,
                                                 self._key_range.include_end)
            yield item
def split_input(cls, mapper_spec):
    """Splits query into shards without fetching query results.

    Tries as best as it can to split the whole query result set into equal
    shards. Due to difficulty of making the perfect split, resulting shards'
    sizes might differ significantly from each other. The actual number of
    shards might also be less then requested (even 1), though it is never
    greater.

    Current implementation does key-lexicographic order splitting. It
    requires query not to specify any __key__-based ordering. If an index
    for query.order('-__key__') query is not present, an inaccurate guess
    at sharding will be made by splitting the full key range.

    Args:
        mapper_spec: MapperSpec with params containing 'entity_kind'.
            May also have 'batch_size' in the params to specify the number
            of entities to process in each batch.

    Returns:
        A list of InputReader objects of length <= number_of_shards. These
        may be DatastoreInputReader or DatastoreKeyInputReader objects.

    Raises:
        BadReaderParamsError: required parameters are missing or invalid.
    """
    if mapper_spec.input_reader_class() != cls:
        raise BadReaderParamsError("Input reader class mismatch")
    params = mapper_spec.params
    if cls.ENTITY_KIND_PARAM not in params:
        raise BadReaderParamsError("Missing mapper parameter 'entity_kind'")

    entity_kind_name = params[cls.ENTITY_KIND_PARAM]
    shard_count = mapper_spec.shard_count
    app = params.get(cls._APP_PARAM)
    # keys_only remains for backwards compatibility. It may go away.
    if util.parse_bool(params.get(cls.KEYS_ONLY_PARAM, False)):
        raise BadReaderParamsError("The keys_only parameter is obsolete. "
                                   "Use DatastoreKeyInputReader instead.")

    # Fail fast if Model cannot be located.
    util.for_name(entity_kind_name)

    return cls._split_input_from_params(
        app, entity_kind_name, params, shard_count)
def _get_raw_entity_kind(cls, model_classpath):
    """Returns the datastore kind name for a resolvable model classpath.

    Args:
        model_classpath: str. Fully qualified path of a db/ndb model.

    Returns:
        The kind string declared by the model, or the short name of the
        classpath when the resolved object is not a recognized model.
    """
    entity_type = util.for_name(model_classpath)
    # BUG FIX: for_name normally returns the model *class*, for which the
    # original `isinstance(entity_type, db.Model)` check was always False
    # (a class is not an instance of itself), so db models with a custom
    # kind() fell through to get_short_name. Accept db.Model subclasses as
    # well as instances; kind() is usable either way.
    if isinstance(entity_type, db.Model) or (
            isinstance(entity_type, type) and
            issubclass(entity_type, db.Model)):
        return entity_type.kind()
    elif isinstance(entity_type, (ndb.Model, ndb.MetaModel)):
        # ndb model classes are instances of the MetaModel metaclass.
        # pylint: disable=protected-access
        return entity_type._get_kind()
    else:
        return util.get_short_name(model_classpath)
def run(self, job_id, job_class_str, kwargs):
    """Runs the map/reduce pipeline for a job and stores its results.

    Args:
        job_id: str. The ID of the job to run.
        job_class_str: str. Fully qualified name of the job class, resolved
            via mapreduce_util.for_name.
        kwargs: dict. Extra arguments used to build the MapreducePipeline.

    Yields:
        MapreducePipeline, then StoreMapReduceResults built from whatever
        output the pipeline framework sends back.
    """
    job_class = mapreduce_util.for_name(job_class_str)
    # Record the root pipeline id so the job can be traced back to the
    # pipeline that started it.
    job_class.register_start(job_id, metadata={
        job_class._OUTPUT_KEY_ROOT_PIPELINE_ID: self.root_pipeline_id
    })

    # TODO(sll): Need try/except/mark-as-canceled here?
    output = yield mapreduce_pipeline.MapreducePipeline(**kwargs)
    yield StoreMapReduceResults(job_id, job_class_str, output)
def split_input(cls, mapper_spec):
    """Splits query into shards without fetching query results.

    Tries as best as it can to split the whole query result set into equal
    shards. Due to difficulty of making the perfect split, resulting shards'
    sizes might differ significantly from each other. The actual number of
    shards might also be less then requested (even 1), though it is never
    greater.

    Current implementation does key-lexicographic order splitting. It
    requires query not to specify any __key__-based ordering. If an index
    for query.order('-__key__') query is not present, an inaccurate guess
    at sharding will be made by splitting the full key range.

    Args:
        mapper_spec: MapperSpec with params containing 'entity_kind'.
            May also have 'batch_size' in the params to specify the number
            of entities to process in each batch.

    Returns:
        A list of DatastoreInputReader objects of length <= number_of_shards.

    Raises:
        BadReaderParamsError if required parameters are missing or invalid.
    """
    if mapper_spec.input_reader_class() != cls:
        raise BadReaderParamsError("Input reader class mismatch")
    params = mapper_spec.params
    if "entity_kind" not in params:
        raise BadReaderParamsError(
            "Missing mapper parameter 'entity_kind'")

    entity_kind_name = params["entity_kind"]
    entity_kind = util.for_name(entity_kind_name)
    shard_count = mapper_spec.shard_count
    batch_size = int(params.get("batch_size", cls._BATCH_SIZE))
    keys_only = int(params.get("keys_only", False))

    # Probe the first key in ascending __key__ order to anchor the split.
    ds_query = entity_kind.all()._get_query()
    ds_query.Order("__key__")
    first_entity = ds_query.Get(1)
    if not first_entity:
        return []
    else:
        first_entity_key = first_entity[0].key()

    # Probe the last key; this needs a descending __key__ index, so fall
    # back to guessing the end of the keyspace when the index is missing.
    ds_query.Order(("__key__", datastore.Query.DESCENDING))
    try:
        last_entity = ds_query.Get(1)
        last_entity_key = last_entity[0].key()
    except db.NeedIndexError as e:  # 'as' form: valid in Py2.6+ and Py3.
        logging.warning(
            "Cannot create accurate approximation of keyspace, "
            "guessing instead. Please address this problem: %s", e)
        last_entity_key = key_range.KeyRange.guess_end_key(
            entity_kind.kind(), first_entity_key)
def validate(cls, job_config):
    """Inherit docs."""
    super(ModelDatastoreInputReader, cls).validate(job_config)
    params = job_config.input_reader_params
    entity_kind = params[cls.ENTITY_KIND_PARAM]
    # Fail fast if Model cannot be located. The resolved class itself is
    # not needed here, so the previously unused local binding was dropped.
    try:
        util.for_name(entity_kind)
    except ImportError as e:  # 'as' form: valid in Python 2.6+ and Python 3.
        raise errors.BadReaderParamsError("Bad entity kind: %s" % e)
def split_input(cls, mapper_spec):
    """Splits query into shards without fetching query results.

    Tries as best as it can to split the whole query result set into equal
    shards. Due to difficulty of making the perfect split, resulting shards'
    sizes might differ significantly from each other. The actual number of
    shards might also be less then requested (even 1), though it is never
    greater.

    Current implementation does key-lexicographic order splitting. It
    requires query not to specify any __key__-based ordering. If an index
    for query.order('-__key__') query is not present, an inaccurate guess
    at sharding will be made by splitting the full key range.

    Args:
        mapper_spec: MapperSpec with params containing 'entity_kind'.
            May also have 'batch_size' in the params to specify the number
            of entities to process in each batch.

    Returns:
        A list of DatastoreInputReader objects of length <= number_of_shards.

    Raises:
        BadReaderParamsError if required parameters are missing or invalid.
    """
    if mapper_spec.input_reader_class() != cls:
        raise BadReaderParamsError("Input reader class mismatch")
    params = mapper_spec.params
    if "entity_kind" not in params:
        raise BadReaderParamsError("Missing mapper parameter 'entity_kind'")

    entity_kind_name = params["entity_kind"]
    entity_kind = util.for_name(entity_kind_name)
    shard_count = mapper_spec.shard_count
    batch_size = int(params.get("batch_size", cls._BATCH_SIZE))
    keys_only = int(params.get("keys_only", False))

    # Probe the first key in ascending __key__ order to anchor the split.
    ds_query = entity_kind.all()._get_query()
    ds_query.Order("__key__")
    first_entity = ds_query.Get(1)
    if not first_entity:
        return []
    else:
        first_entity_key = first_entity[0].key()

    # Probe the last key; this needs a descending __key__ index, so fall
    # back to guessing the end of the keyspace when the index is missing.
    ds_query.Order(("__key__", datastore.Query.DESCENDING))
    try:
        last_entity = ds_query.Get(1)
        last_entity_key = last_entity[0].key()
    except db.NeedIndexError as e:  # 'as' form: valid in Py2.6+ and Py3.
        logging.warning("Cannot create accurate approximation of keyspace, "
                        "guessing instead. Please address this problem: %s",
                        e)
        last_entity_key = key_range.KeyRange.guess_end_key(
            entity_kind.kind(), first_entity_key)
def run(self, key, values):
    """Combine values for a key and emit encoded KeyValue protos.

    Yields:
        Serialized file_service_pb.KeyValue records, one per combined value.
    """
    if not self._combiner:
        # Lazily resolve the combiner callable from mapper params on first
        # use; it is cached on the instance for subsequent calls.
        mapper_params = context.get().mapreduce_spec.mapper.params
        combine_spec = mapper_params.get(_CombinePipeline.COMBINE_SPEC_PARAM)
        self._combiner = util.for_name(combine_spec)

    for combined in self._combiner(key, values, []):
        record = file_service_pb.KeyValue()
        record.set_key(key)
        record.set_value(combined)
        yield record.Encode()
def validate(cls, mapper_spec):
    """Validates mapper spec and all mapper parameters.

    Args:
        mapper_spec: The MapperSpec for this InputReader.

    Raises:
        BadReaderParamsError: required parameters are missing or invalid.
    """
    cls._common_validate(mapper_spec)
    params = mapper_spec.params
    keys_only = util.parse_bool(params.get(cls.KEYS_ONLY_PARAM, False))
    if keys_only:
        raise BadReaderParamsError("The keys_only parameter is obsolete. "
                                   "Use DatastoreKeyInputReader instead.")

    entity_kind_name = params[cls.ENTITY_KIND_PARAM]
    # Fail fast if Model cannot be located.
    try:
        util.for_name(entity_kind_name)
    except ImportError as e:  # 'as' form: valid in Python 2.6+ and Python 3.
        raise BadReaderParamsError("Bad entity kind: %s" % e)
def get_hooks(self):
    """Returns a hooks.Hooks class or None if no hooks class has been set."""
    if self.__hooks is None and self.hooks_class_name is not None:
        # Resolve lazily on first access; validate before caching.
        resolved = util.for_name(self.hooks_class_name)
        if not isinstance(resolved, type):
            raise ValueError("hooks_class_name must refer to a class, got %s"
                             % type(resolved).__name__)
        if not issubclass(resolved, hooks.Hooks):
            raise ValueError(
                "hooks_class_name must refer to a hooks.Hooks subclass")
        self.__hooks = resolved(self)

    return self.__hooks
def __init__(self, filters, model_class_path):
    """Init.

    Args:
        filters: user supplied filters. Each filter should be a list or tuple
            of format (<property_name_as_str>, <query_operator_as_str>,
            <value_of_certain_type>). Value type should satisfy the property's
            type.
        model_class_path: full path to the model class in str.
    """
    self.filters = filters
    self.model_class_path = model_class_path
    # Resolve the model class eagerly; an unresolvable path fails here
    # rather than on first use.
    self.model_class = util.for_name(self.model_class_path)
    # Derive the single range-forming property and its bounds from the
    # user-supplied filters (validated against the model class).
    self.prop, self.start, self.end = self._get_range_from_filters(
        self.filters, self.model_class)
def run(self, job_id, job_class_str, output):
    """Reads map/reduce results from GCS and registers job completion.

    Args:
        job_id: str. The ID of the job whose results are being stored.
        job_class_str: str. Fully qualified name of the job class, resolved
            via mapreduce_util.for_name.
        output: Output location handed to GoogleCloudStorageInputReader.
    """
    job_class = mapreduce_util.for_name(job_class_str)

    try:
        iterator = input_readers.GoogleCloudStorageInputReader(output, 0)
        results_list = []
        for item_reader in iterator:
            for item in item_reader:
                results_list.append(json.loads(item))
        job_class.register_completion(job_id, results_list)
    # Any failure marks the job failed; the broad catch is deliberate.
    except Exception as e:
        # Hoisted: format the traceback once instead of twice.
        formatted_traceback = traceback.format_exc()
        logging.error(formatted_traceback)
        # Lazy %-style logging args instead of eager string interpolation.
        logging.error("Job %s failed at %s",
                      job_id, utils.get_current_time_in_millisecs())
        job_class.register_failure(job_id, "%s\n%s" % (
            unicode(e), formatted_traceback))
def run(self, job_name, sequence_num, time_started, namespace, output,
        complete_fn, kwargs):
    """Records map/reduce results (or failure) into the durable job entity.

    Args:
        job_name: str. Name of the durable job being finalized.
        sequence_num: int. Run sequence number for the job.
        time_started: float. Epoch seconds when the job started; used to
            compute the job duration stored with the result.
        namespace: str. Datastore namespace holding the job entity.
        output: Map/reduce output location handed to the GCS input reader.
        complete_fn: str or None. Fully qualified name of an optional
            completion callback, resolved via util.for_name.
        kwargs: Arguments forwarded to the completion callback.
    """
    results = []
    # TODO(mgainer): Notice errors earlier in pipeline, and mark job
    # as failed in that case as well.
    try:
        iterator = input_readers.GoogleCloudStorageInputReader(output, 0)
        for file_reader in iterator:
            for item in file_reader:
                # Map/reduce puts reducer output into blobstore files as a
                # string obtained via "str(result)". Use AST as a safe
                # alternative to eval() to get the Python object back.
                results.append(ast.literal_eval(item))
        if complete_fn:
            util.for_name(complete_fn)(kwargs, results)
        time_completed = time.time()
        with Namespace(namespace):
            db.run_in_transaction(
                DurableJobEntity._complete_job, job_name, sequence_num,
                MapReduceJob.build_output(self.root_pipeline_id, results),
                long(time_completed - time_started))
    # Don't know what exceptions are currently, or will be in future,
    # thrown from Map/Reduce or Pipeline libraries; these are under
    # active development.
    #
    # pylint: disable=broad-except
    except Exception as ex:  # 'as' form: valid in Python 2.6+ and Python 3.
        logging.critical('Failed running map/reduce job %s: %s', job_name,
                         str(ex))
        common_utils.log_exception_origin()
        time_completed = time.time()
        with Namespace(namespace):
            db.run_in_transaction(
                DurableJobEntity._fail_job, job_name, sequence_num,
                MapReduceJob.build_output(self.root_pipeline_id, results,
                                          str(ex)),
                long(time_completed - time_started))
def get_handler(self):
    """Get mapper handler instance.

    Returns:
        cached handler instance as callable.
    """
    if self.__handler is not None:
        return self.__handler

    resolved_spec = util.for_name(self.handler_spec)
    if isinstance(resolved_spec, type):
        # A type: instantiate it to obtain a callable handler object.
        self.__handler = resolved_spec()
    elif isinstance(resolved_spec, types.MethodType):
        # An unbound method: instantiate its class and bind the method.
        self.__handler = getattr(resolved_spec.im_class(),
                                 resolved_spec.__name__)
    else:
        # Plain function or already-bound callable.
        self.__handler = resolved_spec
    return self.__handler
def run(self, job_id, job_class_str, output):
    """Reads map/reduce results from GCS and registers job completion.

    Args:
        job_id: str. The ID of the job whose results are being stored.
        job_class_str: str. Fully qualified name of the job class, resolved
            via mapreduce_util.for_name.
        output: Output location handed to GoogleCloudStorageInputReader.
    """
    job_class = mapreduce_util.for_name(job_class_str)

    try:
        iterator = input_readers.GoogleCloudStorageInputReader(output, 0)
        results_list = []
        for item_reader in iterator:
            for item in item_reader:
                results_list.append(json.loads(item))
        job_class.register_completion(job_id, results_list)
    # Any failure marks the job failed; the broad catch is deliberate.
    except Exception as e:
        # Hoisted: format the traceback once instead of twice.
        formatted_traceback = traceback.format_exc()
        logging.error(formatted_traceback)
        # Lazy %-style logging args instead of eager string interpolation.
        logging.error('Job %s failed at %s',
                      job_id, utils.get_current_time_in_millisecs())
        job_class.register_failure(
            job_id, '%s\n%s' % (unicode(e), formatted_traceback))
def _iter_key_range(self, k_range):
    """Yield (key, model_instance) pairs for entities inside k_range.

    Resumes batches with a query cursor so each fetch continues where the
    previous one stopped.
    """
    cursor = None
    while True:
        model_class = util.for_name(self._entity_kind)
        query = k_range.make_ascending_query(model_class)
        if cursor:
            query.with_cursor(cursor)

        batch = query.fetch(limit=self._batch_size)
        if not batch:
            break
        for model_instance in batch:
            yield model_instance.key(), model_instance
        cursor = query.cursor()
def run(self, job_id, job_class_str, output):
    """Reads map/reduce reducer output and registers job completion.

    Args:
        job_id: str. The ID of the job whose results are being stored.
        job_class_str: str. Fully qualified name of the job class, resolved
            via mapreduce_util.for_name.
        output: Output location handed to RecordsReader.
    """
    job_class = mapreduce_util.for_name(job_class_str)

    try:
        iterator = input_readers.RecordsReader(output, 0)
        results_list = []
        for item in iterator:
            # Map/reduce puts reducer output into blobstore files as a
            # string obtained via "str(result)". Use AST as a safe
            # alternative to eval() to get the Python object back.
            results_list.append(ast.literal_eval(item))
        job_class.register_completion(job_id, results_list)
    # Any failure marks the job failed; the broad catch is deliberate.
    except Exception as e:
        # Hoisted: format the traceback once instead of twice.
        formatted_traceback = traceback.format_exc()
        logging.error(formatted_traceback)
        # Lazy %-style logging args instead of eager string interpolation.
        logging.error('Job %s failed at %s',
                      job_id, utils.get_current_time_in_millisecs())
        job_class.register_failure(
            job_id, '%s\n%s' % (unicode(e), formatted_traceback))
def _split_input_from_namespace(cls, app, namespace, entity_kind_name,
                                shard_count):
    """Splits a hash-keyed keyspace into ranges for up to shard_count shards.

    Builds an initial numeric key range covering ids 0..16**40-1, halves it
    floor(log2(shard_count)) times, then rewrites each boundary as a
    'hash_%040x' key name matching the entities' actual key format.

    Args:
        cls: the InputReader class (classmethod-style receiver).
        app: application id for the key ranges, or None.
        namespace: datastore namespace for the key ranges.
        entity_kind_name: fully qualified model path; resolved to get the
            raw datastore kind.
        shard_count: desired number of shards (upper bound on splits).

    Returns:
        A list of key_range.KeyRange objects with 'hash_...' name boundaries.
    """
    entity_kind = util.for_name(entity_kind_name)
    entity_kind_name = entity_kind.kind()

    hex_key_start = db.Key.from_path(entity_kind_name, 0)
    hex_key_end = db.Key.from_path(entity_kind_name, int('f' * 40, base=16))
    hex_range = key_range.KeyRange(hex_key_start, hex_key_end, None, True,
                                   True, namespace=namespace, _app=app)

    key_range_list = [hex_range]
    number_of_half_splits = int(math.floor(math.log(shard_count, 2)))
    # FIX: loop index was unused; also dropped the redundant 0 start arg.
    for _ in xrange(number_of_half_splits):
        new_ranges = []
        for current_range in key_range_list:
            new_ranges.extend(current_range.split_range(1))
        key_range_list = new_ranges

    adjusted_range_list = []
    for current_range in key_range_list:
        # Re-express each numeric boundary as its 'hash_%040x' key name.
        adjusted_range = key_range.KeyRange(
            key_start=db.Key.from_path(
                current_range.key_start.kind(),
                'hash_%040x' % (current_range.key_start.id() or 0),
                _app=current_range._app),
            key_end=db.Key.from_path(
                current_range.key_end.kind(),
                'hash_%040x' % (current_range.key_end.id() or 0),
                _app=current_range._app),
            direction=current_range.direction,
            include_start=current_range.include_start,
            include_end=current_range.include_end,
            namespace=current_range.namespace,
            _app=current_range._app)
        adjusted_range_list.append(adjusted_range)

    return adjusted_range_list
def get_handler(self):
    """Get mapper handler instance.

    Returns:
        cached handler instance as callable.
    """
    if self.__handler is None:
        # NOTE(review): logging the raw handler spec at warning level looks
        # like leftover debug output; kept for behavior. FIX: logging.warn
        # is a deprecated alias -- use logging.warning.
        logging.warning(self.handler_spec)
        resolved_spec = util.for_name(self.handler_spec)
        if isinstance(resolved_spec, type):
            # create new instance if this is type
            self.__handler = resolved_spec()
        elif isinstance(resolved_spec, types.MethodType):
            # bind the method
            self.__handler = getattr(resolved_spec.im_class(),
                                     resolved_spec.__name__)
        else:
            self.__handler = resolved_spec
    return self.__handler
def __iter__(self):
    """Yield model instances (or keys) for the reader's key range.

    Builds an ascending query from the key range and dispatches on its
    concrete type: old-style db.Query objects use with_cursor()/run(),
    anything else (presumably an ndb query -- TODO confirm) uses iter()
    with start_cursor/produce_cursors.
    """
    self._query = self._key_range.make_ascending_query(
        util.for_name(self._query_spec.model_class_path),
        filters=self._query_spec.filters)

    if isinstance(self._query, db.Query):
        # Old db API path: cursor is attached to the query before running.
        if self._cursor:
            self._query.with_cursor(self._cursor)
        for model_instance in self._query.run(
                batch_size=self._query_spec.batch_size,
                keys_only=self._query_spec.keys_only):
            yield model_instance
    else:
        # Non-db path: replace self._query with the iterator so cursors can
        # later be read off it (produce_cursors=True).
        self._query = self._query.iter(batch_size=self._query_spec.batch_size,
                                       keys_only=self._query_spec.keys_only,
                                       start_cursor=self._cursor,
                                       produce_cursors=True)
        for model_instance in self._query:
            yield model_instance
def _split_input_from_namespace(
        cls, app, namespace, entity_kind_name, shard_count):
    """Split the hash-keyed keyspace of a kind into shard key ranges.

    Starts from one numeric range over ids 0..16**40-1, repeatedly halves
    it, then converts each boundary id into the 'hash_%040x' key-name form
    the stored entities actually use.
    """
    model_class = util.for_name(entity_kind_name)
    raw_kind = model_class.kind()

    full_range = key_range.KeyRange(
        db.Key.from_path(raw_kind, 0),
        db.Key.from_path(raw_kind, int('f' * 40, base=16)),
        None, True, True, namespace=namespace, _app=app)

    ranges = [full_range]
    half_split_count = int(math.floor(math.log(shard_count, 2)))
    for _ in xrange(0, half_split_count):
        halved = []
        for piece in ranges:
            halved.extend(piece.split_range(1))
        ranges = halved

    def _as_hash_key(key):
        # Rewrite a numeric-id key boundary as its hash-named equivalent.
        return db.Key.from_path(
            key.kind(), 'hash_%040x' % (key.id() or 0), _app=key._app)

    adjusted = []
    for piece in ranges:
        adjusted.append(key_range.KeyRange(
            key_start=_as_hash_key(piece.key_start),
            key_end=_as_hash_key(piece.key_end),
            direction=piece.direction,
            include_start=piece.include_start,
            include_end=piece.include_end,
            namespace=piece.namespace,
            _app=piece._app))
    return adjusted
def __iter__(self):
    """Create a generator for model instances for entities.

    Iterating through entities moves query range past the consumed entities.

    Yields:
        next model instance.
    """
    while True:
        kind_class = util.for_name(self._entity_kind)
        batch = self._key_range.make_ascending_query(kind_class).fetch(
            limit=self._batch_size)
        if not batch:
            break
        for instance in batch:
            # Advance the range so resumed iteration skips this entity.
            self._key_range.advance(instance.key())
            yield instance
def run(self, job_id, job_class_str, output):
    """Extracts the results of a MR job and registers its completion.

    Args:
        job_id: str. The ID of the job to run.
        job_class_str: str. Should uniquely identify each type of job.
        output: str. The output produced by the job.
    """
    job_class = mapreduce_util.for_name(job_class_str)
    try:
        reader_iterator = input_readers.GoogleCloudStorageInputReader(
            output, 0)
        results_list = []
        for shard_reader in reader_iterator:
            results_list.extend(json.loads(line) for line in shard_reader)
        job_class.register_completion(job_id, results_list)
    except Exception as e:
        logging.exception('Job %s failed at %s' % (
            job_id, utils.get_current_time_in_millisecs()))
        job_class.register_failure(
            job_id, '%s\n%s' % (
                python_utils.UNICODE(e), traceback.format_exc()))
def _get_raw_entity_kind(cls, entity_kind):
    """Returns an datastore entity kind from a Django model.

    Resolves the model classpath and reads the table name Django assigned
    to it.
    """
    return util.for_name(entity_kind)._meta.db_table
def _to_map_job_config(cls,
                       mr_spec,
                       # TODO(user): Remove this parameter after it can be
                       # read from mr_spec.
                       queue_name):
    """Converts model.MapreduceSpec back to JobConfig.

    This method allows our internal methods to use JobConfig directly.
    This method also allows us to expose JobConfig as an API during
    execution, despite that it is not saved into datastore.

    Args:
        mr_spec: model.MapreduceSpec.
        queue_name: queue name.

    Returns:
        The JobConfig object for this job.
    """
    mapper_spec = mr_spec.mapper
    # 0 means all the old APIs before api_version is introduced.
    api_version = mr_spec.params.get("api_version", 0)
    old_api = api_version == 0

    # Deserialize params from json if input_reader/output_writer are new API.
    input_reader_cls = mapper_spec.input_reader_class()
    input_reader_params = input_readers._get_params(mapper_spec)
    if issubclass(input_reader_cls, input_reader.InputReader):
        input_reader_params = input_reader_cls.params_from_json(
            input_reader_params)

    output_writer_cls = mapper_spec.output_writer_class()
    output_writer_params = output_writers._get_params(mapper_spec)
    # TODO(user): Call json (de)serialization for writer.
    # if (output_writer_cls and
    #     issubclass(output_writer_cls, output_writer.OutputWriter)):
    #   output_writer_params = output_writer_cls.params_from_json(
    #       output_writer_params)

    # We can not always convert MapreduceSpec generated by older API
    # to JobConfig. Thus, mr framework should use/expose the returned JobConfig
    # object with caution when a job is started with an old API.
    # In this case, this method only tries not to blow up and assemble a
    # JobConfig object as accurate as possible.
    return cls(_lenient=old_api,
               job_name=mr_spec.name,
               job_id=mr_spec.mapreduce_id,
               # handler_spec from older API may not have map_job.Mapper type.
               mapper=util.for_name(mapper_spec.handler_spec),
               input_reader_cls=input_reader_cls,
               input_reader_params=input_reader_params,
               output_writer_cls=output_writer_cls,
               output_writer_params=output_writer_params,
               shard_count=mapper_spec.shard_count,
               queue_name=queue_name,
               user_params=mr_spec.params.get("user_params"),
               shard_max_attempts=mr_spec.params.get("shard_max_attempts"),
               done_callback_url=mr_spec.params.get("done_callback"),
               _force_writes=mr_spec.params.get("force_writes"),
               _base_path=mr_spec.params["base_path"],
               _task_max_attempts=mr_spec.params.get("task_max_attempts"),
               _task_max_data_processing_attempts=(
                   mr_spec.params.get("task_max_data_processing_attempts")),
               _hooks_cls=util.for_name(mr_spec.hooks_class_name),
               _app=mr_spec.params.get("app_id"),
               _api_version=api_version)
def testClassName(self):
    """Test passing fq class name."""
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(TestHandler, util.for_name("__main__.TestHandler"))
def testMethodName(self):
    """Test passing method name."""
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(TestHandler.process,
                     util.for_name("__main__.TestHandler.process"))
def testClassWithArgs(self):
    """Test passing method name of class with constructor args."""
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(TestHandlerWithArgs.process,
                     util.for_name("__main__.TestHandlerWithArgs.process"))
def testFunctionName(self):
    """Test passing function name."""
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(test_handler_function,
                     util.for_name("__main__.test_handler_function"))
from mapreduce import util

# Resolve 'migrate.process' through the map/reduce name-resolution helper.
# for_name imports the 'migrate' module and returns its 'process' attribute;
# the return value is discarded, so this statement is presumably executed for
# the import side effect and/or to fail fast if the name cannot be resolved
# -- TODO confirm against how this script is invoked.
util.for_name('migrate.process')