def _start_map(cls, name, mapper_spec, mapreduce_params, base_path=None,
               queue_name=None, eta=None, countdown=None,
               hooks_class_name=None, _app=None, transactional=False,
               parent_entity=None):
    """Validate a mapper spec and enqueue the kickoff task for a new job.

    See control.start_map for the public contract.

    For non-transactional starts a MapreduceState entity is created, marked
    active and stored, and that entity is handed to the kickoff task as its
    parent. For transactional starts the caller-supplied parent_entity is
    passed through unchanged.

    Returns:
      The id generated for the new mapreduce job.

    Raises:
      Exception: if parent_entity is given for a non-transactional start.
    """
    if not transactional and parent_entity:
        raise Exception(
            "Parent shouldn't be specfied for non-transactional starts.")

    # The input reader always validates the spec; the output writer only
    # does so when the spec actually configures one.
    reader_cls = mapper_spec.input_reader_class()
    reader_cls.validate(mapper_spec)
    writer_cls = mapper_spec.output_writer_class()
    if writer_cls:
        writer_cls.validate(mapper_spec)

    job_id = model.MapreduceState.new_mapreduce_id()
    job_spec = model.MapreduceSpec(
        name,
        job_id,
        mapper_spec.to_json(),
        mapreduce_params,
        hooks_class_name)

    # Access the handler attribute while a context for this spec is
    # installed; the value is discarded, only the access matters here.
    context.Context._set(context.Context(job_spec, None))
    try:
        mapper_spec.handler
    finally:
        context.Context._set(None)

    if not transactional:
        job_state = model.MapreduceState.create_new(job_spec.mapreduce_id)
        job_state.mapreduce_spec = job_spec
        job_state.active = True
        job_state.active_shards = mapper_spec.shard_count
        if _app:
            job_state.app_id = _app
        job_state.put(config=util.create_datastore_write_config(job_spec))
        # The freshly stored state becomes the kickoff task's parent.
        parent_entity = job_state

    cls._add_kickoff_task(
        base_path,
        job_spec,
        eta,
        countdown,
        parent_entity,
        queue_name,
        transactional,
        _app)
    return job_id
def testProcessNamespace(self):
    """Check the job produced by utils.ProcessNamespace for namespace '1'."""
    # Store one entity in namespace "1", then switch the namespace back.
    namespace_manager.set_namespace("1")
    TestEntity().put()
    namespace_manager.set_namespace(None)

    namespaces_jobs = utils.RunMapForKinds(
        self.operation,
        [TestEntity.kind()],
        'Test job for %(kind)s%(namespace)s',
        '__main__.foo',
        self.reader_class_spec,
        {'test_param': 1})
    testutil.execute_all_tasks(self.taskqueue)

    mocker = mox.Mox()
    mocker.StubOutWithMock(context, "get", use_mock_anything=True)

    first_state = model.MapreduceState.get_by_job_id(namespaces_jobs[0])
    ctx = context.Context(first_state.mapreduce_spec, None)
    # Record two expected context.get() calls, each returning ctx.
    for _ in range(2):
        context.get().AndReturn(ctx)

    mocker.ReplayAll()
    try:
        # The namespace is processed twice; the assertion below expects the
        # combined result to contain a single job.
        jobs = utils.ProcessNamespace('1')
        jobs.extend(utils.ProcessNamespace('1'))
        mocker.VerifyAll()
    finally:
        mocker.UnsetStubs()
    testutil.execute_all_tasks(self.taskqueue)

    self.assertEquals(1, len(jobs))
    state = model.MapreduceState.get_by_job_id(jobs[0])
    self.assertTrue(state)

    spec = state.mapreduce_spec
    self.assertTrue(spec)
    self.assertEquals("Test job for TestEntity in namespace 1", spec.name)

    mapper = spec.mapper
    self.assertTrue(mapper)
    expected_params = {
        'test_param': 1,
        'entity_kind': TestEntity.kind(),
        'namespaces': '1',
    }
    self.assertEquals(expected_params, mapper.params)
    self.assertEquals('__main__.foo', mapper.handler_spec)
    self.assertEquals(self.reader_class_spec, mapper.input_reader_spec)
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(tstate.shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not self._try_acquire_lease(shard_state, tstate): return ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) return if ndb is not None: ndb_ctx = ndb.get_context() ndb_ctx.set_cache_policy(lambda key: False) ndb_ctx.set_memcache_policy(lambda key: False) context.Context._set(ctx) retry_shard = False try: self.process_inputs(tstate.input_reader, shard_state, tstate, ctx) if not shard_state.active: if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and tstate.output_writer): tstate.output_writer.finalize(ctx, shard_state) except Exception, e: retry_shard = self._retry_logic(e, shard_state, tstate, spec.mapreduce_id)
def testOp(self):
    """Apply AllocateMaxId to a context and check the pool, then the flush."""
    ctx = context.Context(None, None)
    allocate_op = copy_handler.AllocateMaxId(
        key('TestEntity', 30), self.app_id)
    allocate_op(ctx)

    # Before the flush the id is only recorded in the context's pool.
    pool = ctx.get_pool('allocate_max_id_test_app_pool')
    expected_pending = {('TestEntity', 1): 30}
    self.assertEqual(expected_pending, pool.key_path_to_max_id)

    ctx.flush()

    # After the flush the range shows up in self.allocated_id_ranges.
    expected_ranges = [
        (key(u'TestEntity', 1, _app=u'test_app'), 1, 30),
    ]
    self.assertEqual(expected_ranges, self.allocated_id_ranges)
def handle(self):
    """Process one slice of a mapper shard.

    Loads the shard state and the job's control record, honours a pending
    ABORT command, and otherwise feeds items from the input reader to
    self.process_data until the reader is exhausted, process_data returns a
    falsy value, or the quota consumer refuses more work. The shard state is
    then saved and, while the shard is still active, another slice is
    scheduled through self.reschedule.

    Returns:
      None. Returns early when the shard state is missing or the job was
      aborted.
    """
    tstate = model.TransientShardState.from_request(self.request)
    spec = tstate.mapreduce_spec
    self._start_time = self._time()
    shard_id = tstate.shard_id

    shard_state, control = db.get([
        model.ShardState.get_key_by_shard_id(shard_id),
        model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
    ])
    if not shard_state:
        logging.error("State not found for shard ID %r; shutting down",
                      shard_id)
        return

    ctx = context.Context(spec, shard_state,
                          task_retry_count=self.task_retry_count())

    if control and control.command == model.MapreduceControl.ABORT:
        logging.info("Abort command received by shard %d of job '%s'",
                     shard_state.shard_number, shard_state.mapreduce_id)
        if tstate.output_writer:
            tstate.output_writer.finalize(ctx, shard_state.shard_number)
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_ABORTED
        shard_state.put(config=util.create_datastore_write_config(spec))
        model.MapreduceControl.abort(spec.mapreduce_id)
        return

    input_reader = tstate.input_reader

    # Quota enforcement is on unless the mapper params disable it.
    if spec.mapper.params.get("enable_quota", True):
        quota_consumer = quota.QuotaConsumer(
            quota.QuotaManager(memcache.Client()),
            shard_id,
            _QUOTA_BATCH_SIZE)
    else:
        quota_consumer = None

    context.Context._set(ctx)
    try:
        # Only start scanning when quota checking is off or quota is left.
        if not quota_consumer or quota_consumer.check():
            scan_aborted = False
            entity = None

            # Quota for an item is consumed before the item is fetched.
            if not quota_consumer or quota_consumer.consume():
                for entity in input_reader:
                    if isinstance(entity, db.Model):
                        shard_state.last_work_item = repr(entity.key())
                    else:
                        shard_state.last_work_item = repr(entity)[:100]

                    # A falsy result from process_data stops the scan.
                    scan_aborted = not self.process_data(
                        entity, input_reader, ctx, tstate)

                    # Reserve quota for the next item before fetching it.
                    if (quota_consumer and not scan_aborted and
                            not quota_consumer.consume()):
                        scan_aborted = True
                    if scan_aborted:
                        break
            else:
                scan_aborted = True

            if not scan_aborted:
                logging.info("Processing done for shard %d of job '%s'",
                             shard_state.shard_number,
                             shard_state.mapreduce_id)
                # The loop consumed quota for an item that never came;
                # hand that one unit back.
                if quota_consumer:
                    quota_consumer.put(1)
                shard_state.active = False
                shard_state.result_status = model.ShardState.RESULT_SUCCESS

        # Measure elapsed time with the same clock that produced
        # self._start_time; mixing it with time.time() gives a meaningless
        # duration whenever self._time is not the wall clock.
        operation.counters.Increment(
            "mapper-walltime-msec",
            int((self._time() - self._start_time) * 1000))(ctx)

        ctx.flush()

        if not shard_state.active:
            # The shard is stopping; finalize the output writer if present.
            if tstate.output_writer:
                tstate.output_writer.finalize(ctx, shard_state.shard_number)
        shard_state.put(config=util.create_datastore_write_config(spec))
    finally:
        context.Context._set(None)
        if quota_consumer:
            quota_consumer.dispose()

    # Rescheduling comes last so it is skipped when anything above raised.
    if shard_state.active:
        self.reschedule(shard_state, tstate)
    gc.collect()
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_id = tstate.shard_id shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: logging.error( "State not found for shard %s; Possible spurious task " "execution. Dropping this task.", shard_id) return if not shard_state.active: logging.error( "Shard %s is not active. Possible spurious task " "execution. Dropping this task.", shard_id) logging.error(str(shard_state)) return if shard_state.retries > tstate.retries: logging.error( "Got shard %s from previous shard retry %s. Possible spurious " "task execution. Dropping this task.", shard_id, tstate.retries) logging.error(str(shard_state)) return elif shard_state.retries < tstate.retries: raise ValueError( "ShardState for %s is behind slice. Waiting for it to catch up", shard_state.shard_id) ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = tstate.input_reader if ndb is not None: ndb_ctx = ndb.get_context() ndb_ctx.set_cache_policy(lambda key: False) ndb_ctx.set_memcache_policy(lambda key: False) context.Context._set(ctx) retry_shard = False try: self.process_inputs(input_reader, shard_state, tstate, ctx) if not shard_state.active: if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and tstate.output_writer): tstate.output_writer.finalize(ctx, shard_state) except Exception, e: retry_shard = self._retry_logic(e, 
shard_state, tstate, spec.mapreduce_id)
def _start_map(cls, name, mapper_spec, mapreduce_params, base_path=None,
               queue_name=None, eta=None, countdown=None,
               hooks_class_name=None, _app=None, transactional=False,
               parent_entity=None):
    """Validate a mapper spec and enqueue the kickoff task for a new job.

    Args:
      name: job name stored in the MapreduceSpec.
      mapper_spec: mapper specification; its reader, optional writer and
        handler are checked before anything is enqueued.
      mapreduce_params: job-level parameters stored in the MapreduceSpec.
      base_path: URL prefix of the handlers. It is concatenated with
        "/kickoffjob_callback", so a string must be supplied.
      queue_name: task queue to use. When empty, the
        HTTP_X_APPENGINE_QUEUENAME environment value is used, then
        "default". Names starting with "_" are replaced by "default".
      eta: passed to the kickoff task.
      countdown: passed to the kickoff task.
      hooks_class_name: hooks class name stored in the MapreduceSpec.
      _app: optional app id recorded in the task params and job state.
      transactional: when True the task is added in the caller's
        transaction; otherwise a new transaction is started here and a
        MapreduceState entity is stored in it.
      parent_entity: parent for the transactional task; only valid for
        transactional starts.

    Returns:
      The id generated for the new mapreduce job.

    Raises:
      Exception: if parent_entity is given for a non-transactional start.
    """
    # An empty header value must not survive to the prefix check below:
    # indexing an empty string raises IndexError.
    queue_name = (queue_name or
                  os.environ.get("HTTP_X_APPENGINE_QUEUENAME") or
                  "default")
    if queue_name.startswith("_"):
        # presumably an internal/special queue - TODO confirm
        queue_name = "default"

    if not transactional and parent_entity:
        raise Exception("Parent shouldn't be specfied "
                        "for non-transactional starts.")

    # The input reader always validates the spec; the output writer only
    # does so when the spec configures one.
    mapper_input_reader_class = mapper_spec.input_reader_class()
    mapper_input_reader_class.validate(mapper_spec)

    mapper_output_writer_class = mapper_spec.output_writer_class()
    if mapper_output_writer_class:
        mapper_output_writer_class.validate(mapper_spec)

    mapreduce_id = model.MapreduceState.new_mapreduce_id()
    mapreduce_spec = model.MapreduceSpec(name,
                                         mapreduce_id,
                                         mapper_spec.to_json(),
                                         mapreduce_params,
                                         hooks_class_name)

    # Resolve the handler while a context for this spec is installed; the
    # result is discarded.
    ctx = context.Context(mapreduce_spec, None)
    context.Context._set(ctx)
    try:
        mapper_spec.get_handler()
    finally:
        context.Context._set(None)

    kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()}
    if _app:
        kickoff_params["app"] = _app
    kickoff_worker_task = util.HugeTask(
        url=base_path + "/kickoffjob_callback",
        params=kickoff_params,
        eta=eta,
        countdown=countdown)

    hooks = mapreduce_spec.get_hooks()
    config = util.create_datastore_write_config(mapreduce_spec)

    def start_mapreduce():
        """Store the job state (non-transactional only) and add the task."""
        parent = parent_entity
        if not transactional:
            state = model.MapreduceState.create_new(
                mapreduce_spec.mapreduce_id)
            state.mapreduce_spec = mapreduce_spec
            state.active = True
            state.active_shards = mapper_spec.shard_count
            if _app:
                state.app_id = _app
            state.put(config=config)
            parent = state

        if hooks is not None:
            try:
                hooks.enqueue_kickoff_task(kickoff_worker_task, queue_name)
            except NotImplementedError:
                # The hooks decline to enqueue; fall through to the
                # default task addition below.
                pass
            else:
                return
        kickoff_worker_task.add(queue_name, transactional=True,
                                parent=parent)

    if transactional:
        start_mapreduce()
    else:
        db.run_in_transaction(start_mapreduce)

    return mapreduce_id
def handle(self):
    """Handle request.

    Processes one slice of a mapper shard: loads the shard state and the
    job's control record, drops the task when the state is missing or the
    shard is inactive, honours an ABORT command, scans the input reader
    under quota control, and finally writes the shard state back inside a
    datastore transaction.
    """
    tstate = model.TransientShardState.from_request(self.request)
    spec = tstate.mapreduce_spec
    self._start_time = self._time()
    shard_id = tstate.shard_id

    # One batch get for the shard state and the control record.
    shard_state, control = db.get([
        model.ShardState.get_key_by_shard_id(shard_id),
        model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
    ])
    if not shard_state:
        logging.error("State not found for shard ID %r; shutting down",
                      shard_id)
        return

    if not shard_state.active:
        logging.error(
            "Shard is not active. Looks like spurious task execution.")
        return

    ctx = context.Context(spec, shard_state,
                          task_retry_count=self.task_retry_count())

    if control and control.command == model.MapreduceControl.ABORT:
        logging.info("Abort command received by shard %d of job '%s'",
                     shard_state.shard_number, shard_state.mapreduce_id)
        # The output writer is finalized even though the shard is aborted.
        if tstate.output_writer:
            tstate.output_writer.finalize(ctx, shard_state.shard_number)
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_ABORTED
        shard_state.put(config=util.create_datastore_write_config(spec))
        model.MapreduceControl.abort(spec.mapreduce_id)
        return

    input_reader = tstate.input_reader

    # Quota enforcement is on unless the mapper params disable it.
    if spec.mapper.params.get("enable_quota", True):
        quota_consumer = quota.QuotaConsumer(
            quota.QuotaManager(memcache.Client()),
            shard_id,
            _QUOTA_BATCH_SIZE)
    else:
        quota_consumer = None

    context.Context._set(ctx)
    try:
        # Only start scanning when quota checking is off or quota is left.
        if not quota_consumer or quota_consumer.check():
            scan_aborted = False
            entity = None

            try:
                # Quota for an item is consumed before the item is fetched.
                if not quota_consumer or quota_consumer.consume():
                    for entity in input_reader:
                        if isinstance(entity, db.Model):
                            shard_state.last_work_item = repr(entity.key())
                        else:
                            shard_state.last_work_item = repr(entity)[:100]

                        # A falsy result from process_data stops the scan.
                        scan_aborted = not self.process_data(
                            entity, input_reader, ctx, tstate)

                        # Reserve quota for the next item up front.
                        if (quota_consumer and not scan_aborted and
                            not quota_consumer.consume()):
                            scan_aborted = True
                        if scan_aborted:
                            break
                else:
                    scan_aborted = True

                if not scan_aborted:
                    logging.info(
                        "Processing done for shard %d of job '%s'",
                        shard_state.shard_number, shard_state.mapreduce_id)
                    # Hand back the one quota unit consumed for an item
                    # that never came.
                    if quota_consumer:
                        quota_consumer.put(1)
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_SUCCESS

                # NOTE(review): the start time comes from self._time() but
                # the elapsed time is measured with time.time(); confirm
                # both clocks agree.
                operation.counters.Increment(
                    context.COUNTER_MAPPER_WALLTIME_MS,
                    int((time.time() - self._start_time) * 1000))(ctx)

                ctx.flush()
            except errors.FailJobError, e:
                # The whole shard is marked failed.
                logging.error("Job failed: %s", e)
                scan_aborted = True
                shard_state.active = False
                shard_state.result_status = model.ShardState.RESULT_FAILED

        if not shard_state.active:
            # The shard is stopping; finalize the output writer if present.
            if tstate.output_writer:
                tstate.output_writer.finalize(ctx, shard_state.shard_number)

        config = util.create_datastore_write_config(spec)

        # Re-read the shard state inside a transaction before overwriting
        # it, so an already-inactive stored state is never replaced by this
        # task's copy.
        @db.transactional(retries=5)
        def tx():
            fresh_shard_state = db.get(
                model.ShardState.get_key_by_shard_id(shard_id))
            # The second condition looks like a fault-injection switch for
            # tests - TODO confirm.
            if (not fresh_shard_state.active or
                "worker_active_state_collision" in _TEST_INJECTED_FAULTS):
                shard_state.active = False
                logging.error(
                    "Spurious task execution. Aborting the shard.")
                return
            fresh_shard_state.copy_from(shard_state)
            fresh_shard_state.put(config=config)
        tx()
        # NOTE(review): the visible source ends here with the outer `try`
        # still open; its except/finally clause is not part of this excerpt.
def handle(self):
    """Handle request.

    Processes one slice of a mapper shard: loads the shard state and the
    job's control record, drops the task when the state is missing or the
    shard is inactive, honours an ABORT command, and scans the input reader
    under quota control. RetrySliceError is re-raised until the task retry
    count exceeds _RETRY_SLICE_ERROR_MAX_RETRIES, after which the shard is
    marked failed; FailJobError marks the shard failed immediately.
    """
    tstate = model.TransientShardState.from_request(self.request)
    spec = tstate.mapreduce_spec
    self._start_time = self._time()
    shard_id = tstate.shard_id

    # One batch get for the shard state and the control record.
    shard_state, control = db.get([
        model.ShardState.get_key_by_shard_id(shard_id),
        model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
    ])
    if not shard_state:
        logging.error("State not found for shard ID %r; shutting down",
                      shard_id)
        return

    if not shard_state.active:
        logging.error("Shard is not active. Looks like spurious task execution.")
        return

    ctx = context.Context(spec, shard_state,
                          task_retry_count=self.task_retry_count())

    if control and control.command == model.MapreduceControl.ABORT:
        logging.info("Abort command received by shard %d of job '%s'",
                     shard_state.shard_number, shard_state.mapreduce_id)
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_ABORTED
        shard_state.put(config=util.create_datastore_write_config(spec))
        model.MapreduceControl.abort(spec.mapreduce_id)
        return

    input_reader = tstate.input_reader

    # Quota enforcement is on unless the mapper params disable it.
    if spec.mapper.params.get("enable_quota", True):
        quota_consumer = quota.QuotaConsumer(
            quota.QuotaManager(memcache.Client()),
            shard_id,
            _QUOTA_BATCH_SIZE)
    else:
        quota_consumer = None

    if ndb is not None:
        # Install policies that answer False for every key on both the
        # in-context cache and memcache.
        ndb_ctx = ndb.get_context()
        ndb_ctx.set_cache_policy(lambda key: False)
        ndb_ctx.set_memcache_policy(lambda key: False)

    context.Context._set(ctx)
    try:
        # Only start scanning when quota checking is off or quota is left.
        if not quota_consumer or quota_consumer.check():
            scan_aborted = False
            entity = None

            try:
                # Quota for an item is consumed before the item is fetched.
                if not quota_consumer or quota_consumer.consume():
                    for entity in input_reader:
                        if isinstance(entity, db.Model):
                            shard_state.last_work_item = repr(entity.key())
                        else:
                            shard_state.last_work_item = repr(entity)[:100]

                        # A falsy result from process_data stops the scan.
                        scan_aborted = not self.process_data(
                            entity, input_reader, ctx, tstate)

                        # Reserve quota for the next item up front.
                        if (quota_consumer and not scan_aborted and
                            not quota_consumer.consume()):
                            scan_aborted = True
                        if scan_aborted:
                            break
                else:
                    scan_aborted = True

                if not scan_aborted:
                    logging.info("Processing done for shard %d of job '%s'",
                                 shard_state.shard_number, shard_state.mapreduce_id)
                    # Hand back the one quota unit consumed for an item
                    # that never came.
                    if quota_consumer:
                        quota_consumer.put(1)
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_SUCCESS

                # NOTE(review): the start time comes from self._time() but
                # the elapsed time is measured with time.time(); confirm
                # both clocks agree.
                operation.counters.Increment(
                    context.COUNTER_MAPPER_WALLTIME_MS,
                    int((time.time() - self._start_time)*1000))(ctx)

                ctx.flush()
            except errors.RetrySliceError, e:
                logging.error("Slice error: %s", e)
                # The retry count is read from the request environment;
                # a missing or empty value counts as 0.
                retry_count = int(
                    os.environ.get("HTTP_X_APPENGINE_TASKRETRYCOUNT") or 0)
                # Re-raise while the retry budget is not exhausted.
                if retry_count <= _RETRY_SLICE_ERROR_MAX_RETRIES:
                    raise
                logging.error("Too many retries: %d, failing the job", retry_count)
                scan_aborted = True
                shard_state.active = False
                shard_state.result_status = model.ShardState.RESULT_FAILED
            except errors.FailJobError, e:
                # The whole shard is marked failed.
                logging.error("Job failed: %s", e)
                scan_aborted = True
                shard_state.active = False
                shard_state.result_status = model.ShardState.RESULT_FAILED
        # NOTE(review): the visible source ends here with the outer `try`
        # still open; its except/finally clause is not part of this excerpt.
def handle(self):
    """Process one slice of a mapper shard described by the request.

    The mapreduce spec is read from the "mapreduce_spec" request parameter.
    A missing shard state or a pending ABORT command ends the request early.
    Otherwise items from the input reader are passed to self.process_entity
    until the reader is exhausted, process_entity returns a falsy value, or
    the quota consumer refuses more work. While the shard stays active the
    next slice is scheduled through self.reschedule.
    """
    spec = model.MapreduceSpec.from_json_str(
        self.request.get("mapreduce_spec"))
    self._start_time = self._time()
    shard_id = self.shard_id()

    logging.debug("post: shard=%s slice=%s headers=%s",
                  shard_id, self.slice_id(), self.request.headers)

    # Fetch the shard state and the control record in a single call.
    state_key = model.ShardState.get_key_by_shard_id(shard_id)
    control_key = model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id)
    shard_state, control = db.get([state_key, control_key])
    if not shard_state:
        logging.error("State not found for shard ID %r; shutting down",
                      shard_id)
        return

    if control and control.command == model.MapreduceControl.ABORT:
        logging.info("Abort command received by shard %d of job '%s'",
                     shard_state.shard_number, shard_state.mapreduce_id)
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_ABORTED
        shard_state.put(config=util.create_datastore_write_config(spec))
        model.MapreduceControl.abort(spec.mapreduce_id)
        return

    input_reader = self.input_reader(spec.mapper)

    # Quota enforcement is on unless the mapper params disable it.
    quota_consumer = None
    if spec.mapper.params.get("enable_quota", True):
        quota_consumer = quota.QuotaConsumer(
            quota.QuotaManager(memcache.Client()),
            shard_id,
            _QUOTA_BATCH_SIZE)

    ctx = context.Context(spec, shard_state,
                          task_retry_count=self.task_retry_count())
    context.Context._set(ctx)

    try:
        # Only start scanning when quota checking is off or quota is left.
        if not quota_consumer or quota_consumer.check():
            stopped_early = False

            # Quota for an item is consumed before the item is fetched.
            if quota_consumer and not quota_consumer.consume():
                stopped_early = True
            else:
                for item in input_reader:
                    shard_state.last_work_item = (
                        repr(item.key()) if isinstance(item, db.Model)
                        else repr(item)[:100])

                    # A falsy result from process_entity stops the scan.
                    if not self.process_entity(item, ctx):
                        stopped_early = True
                        break
                    # Reserve quota for the next item before fetching it.
                    if quota_consumer and not quota_consumer.consume():
                        stopped_early = True
                        break

            if not stopped_early:
                logging.info("Processing done for shard %d of job '%s'",
                             shard_state.shard_number,
                             shard_state.mapreduce_id)
                # Hand back the one quota unit consumed for an item that
                # never came.
                if quota_consumer:
                    quota_consumer.put(1)
                shard_state.active = False
                shard_state.result_status = model.ShardState.RESULT_SUCCESS

        ctx.flush()
    finally:
        context.Context._set(None)
        if quota_consumer:
            quota_consumer.dispose()

    # Rescheduling comes last so it is skipped when anything above raised.
    if shard_state.active:
        self.reschedule(spec, input_reader)