def handle(self): """Handles kick off request.""" spec = model.MapreduceSpec.from_json_str( self._get_required_param("mapreduce_spec")) app_id = self.request.get("app", None) queue_name = os.environ.get("HTTP_X_APPENGINE_QUEUENAME", "default") mapper_input_reader_class = spec.mapper.input_reader_class() state = model.MapreduceState.create_new(spec.mapreduce_id) state.mapreduce_spec = spec state.active = True state.char_url = "" state.sparkline_url = "" if app_id: state.app_id = app_id input_readers = mapper_input_reader_class.split_input(spec.mapper) if not input_readers: logging.warning("Found no mapper input data to process.") state.active = False state.active_shards = 0 state.put(config=util.create_datastore_write_config(spec)) return spec.mapper.shard_count = len(input_readers) state.active_shards = len(input_readers) state.mapreduce_spec = spec state.put(config=util.create_datastore_write_config(spec)) KickOffJobHandler._schedule_shards( spec, input_readers, queue_name, self.base_path()) ControllerCallbackHandler.reschedule( self.base_path(), spec, queue_name=queue_name, serial_id=0)
def flush(self):
  """Flush all information recorded in context."""
  for pool in self._pools.values():
    pool.flush()
  if self.shard_state:
    self.shard_state.put(
        config=util.create_datastore_write_config(self.mapreduce_spec))
def _schedule_shards(cls, spec, input_readers, queue_name, base_path):
  """Prepares shard states and schedules their execution.

  Args:
    spec: mapreduce specification as MapreduceSpec.
    input_readers: list of InputReaders describing shard splits.
    queue_name: The queue to run this job on.
    base_path: The base url path of mapreduce callbacks.
  """
  shard_states = []
  for shard_number, input_reader in enumerate(input_readers):
    shard = model.ShardState.create_new(spec.mapreduce_id, shard_number)
    shard.shard_description = str(input_reader)
    shard_states.append(shard)

  # Retrieve shard states that already exist so we do not overwrite them.
  existing_shard_states = db.get(shard.key() for shard in shard_states)
  existing_shard_keys = set(shard.key() for shard in existing_shard_states
                            if shard is not None)

  # Put only the new shard states.
  db.put((shard for shard in shard_states
          if shard.key() not in existing_shard_keys),
         config=util.create_datastore_write_config(spec))

  for shard_number, input_reader in enumerate(input_readers):
    shard_id = model.ShardState.shard_id_from_number(
        spec.mapreduce_id, shard_number)
    MapperWorkerCallbackHandler.schedule_slice(
        base_path, spec, shard_id, 0, input_reader, queue_name=queue_name)
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(tstate.shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) shard_state = self._try_acquire_lease(shard_state, tstate) if shard_state == self._TASK_STATE.RETRY_TASK: self.retry_task() return if shard_state == self._TASK_STATE.DROP_TASK: return ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) shard_state.set_for_abort() shard_state.put(config=util.create_datastore_write_config(spec)) return ndb_ctx = ndb.get_context() ndb_ctx.set_cache_policy(lambda key: False) ndb_ctx.set_memcache_policy(lambda key: False) context.Context._set(ctx) retry_directive = False try: self.process_inputs( tstate.input_reader, shard_state, tstate, ctx) if not shard_state.active: if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and tstate.output_writer): tstate.output_writer.finalize(ctx, shard_state) except Exception, e: retry_directive = self._retry_logic( e, shard_state, tstate, spec.mapreduce_id)
def _start_map(cls, name, mapper_spec, mapreduce_params,
               base_path=None, queue_name=None, eta=None, countdown=None,
               hooks_class_name=None, _app=None, transactional=False,
               parent_entity=None):
  """See control.start_map."""
  if not transactional and parent_entity:
    raise Exception("Parent shouldn't be specified "
                    "for non-transactional starts.")

  mapper_input_reader_class = mapper_spec.input_reader_class()
  mapper_input_reader_class.validate(mapper_spec)

  mapper_output_writer_class = mapper_spec.output_writer_class()
  if mapper_output_writer_class:
    mapper_output_writer_class.validate(mapper_spec)

  mapreduce_id = model.MapreduceState.new_mapreduce_id()
  mapreduce_spec = model.MapreduceSpec(
      name,
      mapreduce_id,
      mapper_spec.to_json(),
      mapreduce_params,
      hooks_class_name)

  # Resolve the handler now to fail fast on a bad handler spec.
  ctx = context.Context(mapreduce_spec, None)
  context.Context._set(ctx)
  try:
    mapper_spec.handler
  finally:
    context.Context._set(None)

  if not transactional:
    state = model.MapreduceState.create_new(mapreduce_spec.mapreduce_id)
    state.mapreduce_spec = mapreduce_spec
    state.active = True
    state.active_shards = mapper_spec.shard_count
    if _app:
      state.app_id = _app
    config = util.create_datastore_write_config(mapreduce_spec)
    state.put(config=config)
    parent_entity = state

  cls._add_kickoff_task(
      base_path, mapreduce_spec, eta, countdown, parent_entity,
      queue_name, transactional, _app)

  return mapreduce_id
def _start_map(cls, name, mapper_spec, mapreduce_params,
               base_path="/mapreduce", queue_name="default", eta=None,
               countdown=None, hooks_class_name=None, _app=None,
               transactional=False):
  mapper_spec.get_handler()

  mapper_input_reader_class = mapper_spec.input_reader_class()
  mapper_input_reader_class.validate(mapper_spec)

  mapreduce_id = model.MapreduceState.new_mapreduce_id()
  mapreduce_spec = model.MapreduceSpec(
      name,
      mapreduce_id,
      mapper_spec.to_json(),
      mapreduce_params,
      hooks_class_name)

  kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()}
  if _app:
    kickoff_params["app"] = _app
  kickoff_worker_task = taskqueue.Task(
      url=base_path + "/kickoffjob_callback",
      params=kickoff_params,
      eta=eta,
      countdown=countdown)

  hooks = mapreduce_spec.get_hooks()
  config = util.create_datastore_write_config(mapreduce_spec)

  def start_mapreduce():
    if not transactional:
      state = model.MapreduceState.create_new(mapreduce_spec.mapreduce_id)
      state.mapreduce_spec = mapreduce_spec
      state.active = True
      state.active_shards = mapper_spec.shard_count
      if _app:
        state.app_id = _app
      state.put(config=config)

    if hooks is not None:
      try:
        hooks.enqueue_kickoff_task(kickoff_worker_task, queue_name)
      except NotImplementedError:
        pass
      else:
        return
    kickoff_worker_task.add(queue_name, transactional=True)

  if transactional:
    start_mapreduce()
  else:
    db.run_in_transaction(start_mapreduce)

  return mapreduce_id
def _schedule_shards(cls, spec, input_readers, queue_name, base_path,
                     mr_state):
  """Prepares shard states and schedules their execution.

  Args:
    spec: mapreduce specification as MapreduceSpec.
    input_readers: list of InputReaders describing shard splits.
    queue_name: The queue to run this job on.
    base_path: The base url path of mapreduce callbacks.
    mr_state: The MapReduceState of current job.
  """
  shard_states = []
  writer_class = spec.mapper.output_writer_class()
  output_writers = [None] * len(input_readers)
  for shard_number, input_reader in enumerate(input_readers):
    shard_state = model.ShardState.create_new(spec.mapreduce_id, shard_number)
    shard_state.shard_description = str(input_reader)
    if writer_class:
      output_writers[shard_number] = writer_class.create(
          mr_state, shard_state)
    shard_states.append(shard_state)

  # Retrieve shard states that already exist so we do not overwrite them.
  existing_shard_states = db.get(shard.key() for shard in shard_states)
  existing_shard_keys = set(shard.key() for shard in existing_shard_states
                            if shard is not None)

  # Put only the new shard states.
  db.put((shard for shard in shard_states
          if shard.key() not in existing_shard_keys),
         config=util.create_datastore_write_config(spec))

  # Split the configured processing rate evenly across the shards.
  processing_rate = int(spec.mapper.params.get(
      "processing_rate") or model._DEFAULT_PROCESSING_RATE_PER_SEC)
  quota_refill = processing_rate / len(shard_states)
  quota_manager = quota.QuotaManager(memcache.Client())
  for shard_state in shard_states:
    quota_manager.put(shard_state.shard_id, quota_refill)

  for shard_number, (input_reader, output_writer) in enumerate(
      zip(input_readers, output_writers)):
    shard_id = model.ShardState.shard_id_from_number(
        spec.mapreduce_id, shard_number)
    MapperWorkerCallbackHandler._schedule_slice(
        shard_states[shard_number],
        model.TransientShardState(
            base_path, spec, shard_id, 0, input_reader, input_reader,
            output_writer=output_writer),
        queue_name=queue_name)
def _txn():
  state = model.MapreduceState.create_new(mapreduce_spec.mapreduce_id)
  state.mapreduce_spec = mapreduce_spec
  state.active = True
  state.active_shards = mapper_spec.shard_count
  if _app:
    state.app_id = _app
  state.put(config=util.create_datastore_write_config(mapreduce_spec))
  return state
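# Editor's usage note (an assumption, mirroring the other _start_map variants
# in this collection): a closure like _txn is meant to be executed via
#
#   state = db.run_in_transaction(_txn)
#
# so that state creation commits atomically with any task that is enqueued
# with transactional=True inside the same transaction.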
def _put_state():
  fresh_state = model.MapreduceState.get_by_job_id(spec.mapreduce_id)
  if not fresh_state.active:
    logging.warning(
        "Job %s is not active. Looks like spurious task execution. "
        "Dropping controller task.", spec.mapreduce_id)
    return
  config = util.create_datastore_write_config(spec)
  state.put(config=config)
def _schedule_shards(cls, spec, input_readers, queue_name, base_path,
                     mr_state):
  """Prepares shard states and schedules their execution.

  Args:
    spec: mapreduce specification as MapreduceSpec.
    input_readers: list of InputReaders describing shard splits.
    queue_name: The queue to run this job on.
    base_path: The base url path of mapreduce callbacks.
    mr_state: The MapReduceState of current job.
  """
  shard_states = []
  writer_class = spec.mapper.output_writer_class()
  output_writers = [None] * len(input_readers)
  for shard_number, input_reader in enumerate(input_readers):
    shard_state = model.ShardState.create_new(spec.mapreduce_id, shard_number)
    shard_state.shard_description = str(input_reader)
    if writer_class:
      output_writers[shard_number] = writer_class.create(
          mr_state, shard_state)
    shard_states.append(shard_state)

  existing_shard_states = db.get(shard.key() for shard in shard_states)
  existing_shard_keys = set(shard.key() for shard in existing_shard_states
                            if shard is not None)

  db.put((shard for shard in shard_states
          if shard.key() not in existing_shard_keys),
         config=util.create_datastore_write_config(spec))

  for shard_number, (input_reader, output_writer) in enumerate(
      zip(input_readers, output_writers)):
    shard_id = model.ShardState.shard_id_from_number(
        spec.mapreduce_id, shard_number)
    task = MapperWorkerCallbackHandler._state_to_task(
        model.TransientShardState(
            base_path, spec, shard_id, 0, input_reader, input_reader,
            output_writer=output_writer))
    MapperWorkerCallbackHandler._add_task(task,
                                          shard_states[shard_number],
                                          spec,
                                          queue_name)
def handle(self):
  mapreduce_id = self.request.get("mapreduce_id")
  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  if mapreduce_state:
    config = util.create_datastore_write_config(
        mapreduce_state.mapreduce_spec)
    db.delete(model.MapreduceControl.get_key_by_job_id(mapreduce_id),
              config=config)
    shard_states = model.ShardState.find_by_mapreduce_state(mapreduce_state)
    for shard_state in shard_states:
      db.delete(util._HugeTaskPayload.all().ancestor(shard_state),
                config=config)
    db.delete(util._HugeTaskPayload.all().ancestor(mapreduce_state),
              config=config)
def handle(self): """Handles kick off request.""" spec = model.MapreduceSpec.from_json_str( self._get_required_param("mapreduce_spec")) app_id = self.request.get("app", None) queue_name = os.environ.get("HTTP_X_APPENGINE_QUEUENAME", "default") mapper_input_reader_class = spec.mapper.input_reader_class() state = model.MapreduceState.create_new(spec.mapreduce_id) state.mapreduce_spec = spec state.active = True if app_id: state.app_id = app_id input_readers = mapper_input_reader_class.split_input(spec.mapper) if not input_readers: logging.warning("Found no mapper input data to process.") state.active = False state.active_shards = 0 ControllerCallbackHandler._finalize_job(spec, state, self.base_path()) return spec.mapper.shard_count = len(input_readers) state.active_shards = len(input_readers) state.mapreduce_spec = spec output_writer_class = spec.mapper.output_writer_class() if output_writer_class: output_writer_class.init_job(state) output_writers = [] if output_writer_class: for shard_number in range(len(input_readers)): writer = output_writer_class.create(state, shard_number) assert isinstance(writer, output_writer_class) output_writers.append(writer) else: output_writers = [None for ir in input_readers] state.put(config=util.create_datastore_write_config(spec)) KickOffJobHandler._schedule_shards( spec, input_readers, output_writers, queue_name, self.base_path()) ControllerCallbackHandler.reschedule( state, self.base_path(), spec, queue_name=queue_name, serial_id=0)
def _finalize_job(cls, mapreduce_spec, mapreduce_state, base_path):
  """Finalize job execution.

  Invokes the done callback and saves mapreduce state in a transaction,
  and schedules necessary clean ups.

  Args:
    mapreduce_spec: an instance of MapreduceSpec
    mapreduce_state: an instance of MapreduceState
    base_path: handler base path.
  """
  config = util.create_datastore_write_config(mapreduce_spec)
  queue_name = mapreduce_spec.params.get(
      model.MapreduceSpec.PARAM_DONE_CALLBACK_QUEUE, "default")
  done_callback = mapreduce_spec.params.get(
      model.MapreduceSpec.PARAM_DONE_CALLBACK)
  done_task = None
  if done_callback:
    done_task = taskqueue.Task(
        url=done_callback,
        headers={"Mapreduce-Id": mapreduce_spec.mapreduce_id},
        method=mapreduce_spec.params.get("done_callback_method", "POST"))

  @db.transactional(retries=5)
  def _put_state():
    fresh_state = model.MapreduceState.get_by_job_id(
        mapreduce_spec.mapreduce_id)
    if not fresh_state.active:
      logging.warning(
          "Job %s is not active. Looks like spurious task execution. "
          "Dropping controller task.", mapreduce_spec.mapreduce_id)
      return
    mapreduce_state.put(config=config)
    # Enqueue the done callback in the same transaction, unless a hook
    # already handled it.
    if done_task and not _run_task_hook(
        mapreduce_spec.get_hooks(),
        "enqueue_done_task",
        done_task,
        queue_name):
      done_task.add(queue_name, transactional=True)

  _put_state()
  logging.info("Final result for job '%s' is '%s'",
               mapreduce_spec.mapreduce_id, mapreduce_state.result_status)
  cls._clean_up_mr(mapreduce_spec, base_path)
def _update_state_from_shard_states(self, state, shard_states, control):
  """Update mr state by examining shard states.

  Args:
    state: current mapreduce state as MapreduceState.
    shard_states: all shard states (active and inactive), as a list of
      ShardState.
    control: model.MapreduceControl entity.
  """
  active_shards = [s for s in shard_states if s.active]
  failed_shards = [s for s in shard_states
                   if s.result_status == model.ShardState.RESULT_FAILED]
  aborted_shards = [s for s in shard_states
                    if s.result_status == model.ShardState.RESULT_ABORTED]
  spec = state.mapreduce_spec

  state.active = bool(active_shards)
  state.active_shards = len(active_shards)
  state.failed_shards = len(failed_shards)
  state.aborted_shards = len(aborted_shards)
  if not control and (failed_shards or aborted_shards):
    # Issue an abort command if any shard has failed or aborted.
    model.MapreduceControl.abort(spec.mapreduce_id)

  self._aggregate_stats(state, shard_states)
  state.last_poll_time = datetime.datetime.utcfromtimestamp(self._time())

  if not state.active:
    # Derive the final result status from the shard states.
    if failed_shards or not shard_states:
      state.result_status = model.MapreduceState.RESULT_FAILED
    elif aborted_shards:
      state.result_status = model.MapreduceState.RESULT_ABORTED
    else:
      state.result_status = model.MapreduceState.RESULT_SUCCESS
    self._finalize_job(spec, state, self.base_path())
  else:
    config = util.create_datastore_write_config(spec)
    state.put(config=config)
def _finalize_job(cls, mapreduce_spec, mapreduce_state, base_path):
  """Finalize job execution.

  Finalizes output writer, invokes the done callback and saves mapreduce
  state in a transaction, and schedules necessary clean ups.

  Args:
    mapreduce_spec: an instance of MapreduceSpec
    mapreduce_state: an instance of MapreduceState
    base_path: handler base path.
  """
  config = util.create_datastore_write_config(mapreduce_spec)

  if (mapreduce_spec.mapper.output_writer_class() and
      mapreduce_state.result_status == model.MapreduceState.RESULT_SUCCESS):
    mapreduce_spec.mapper.output_writer_class().finalize_job(mapreduce_state)

  queue_name = mapreduce_spec.params.get(
      model.MapreduceSpec.PARAM_DONE_CALLBACK_QUEUE, "default")
  done_callback = mapreduce_spec.params.get(
      model.MapreduceSpec.PARAM_DONE_CALLBACK)
  done_task = None
  if done_callback:
    done_task = taskqueue.Task(
        url=done_callback,
        headers={"Mapreduce-Id": mapreduce_spec.mapreduce_id},
        method=mapreduce_spec.params.get("done_callback_method", "POST"))

  def put_state(state):
    state.put(config=config)
    # Enqueue the done callback in the same transaction, unless a hook
    # already handled it.
    if done_task and not _run_task_hook(
        mapreduce_spec.get_hooks(),
        "enqueue_done_task",
        done_task,
        queue_name):
      done_task.add(queue_name, transactional=True)

  logging.info("Final result for job '%s' is '%s'",
               mapreduce_spec.mapreduce_id, mapreduce_state.result_status)
  db.run_in_transaction_custom_retries(5, put_state, mapreduce_state)
  cls._clean_up_mr(mapreduce_spec, base_path)
def _finalize_job(mapreduce_spec, mapreduce_state, base_path):
  """Finalize job execution.

  Finalizes output writer, invokes the done callback and schedules
  finalize job execution.

  Args:
    mapreduce_spec: an instance of MapreduceSpec
    mapreduce_state: an instance of MapreduceState
    base_path: handler base path.
  """
  config = util.create_datastore_write_config(mapreduce_spec)

  if (mapreduce_spec.mapper.output_writer_class() and
      mapreduce_state.result_status == model.MapreduceState.RESULT_SUCCESS):
    mapreduce_spec.mapper.output_writer_class().finalize_job(mapreduce_state)

  def put_state(state):
    state.put(config=config)
    done_callback = mapreduce_spec.params.get(
        model.MapreduceSpec.PARAM_DONE_CALLBACK)
    if done_callback:
      done_task = taskqueue.Task(
          url=done_callback,
          headers={"Mapreduce-Id": mapreduce_spec.mapreduce_id},
          method=mapreduce_spec.params.get("done_callback_method", "POST"))
      queue_name = mapreduce_spec.params.get(
          model.MapreduceSpec.PARAM_DONE_CALLBACK_QUEUE, "default")
      if not _run_task_hook(mapreduce_spec.get_hooks(),
                            "enqueue_done_task",
                            done_task,
                            queue_name):
        done_task.add(queue_name, transactional=True)
    FinalizeJobHandler.schedule(base_path, mapreduce_spec)

  db.run_in_transaction_custom_retries(5, put_state, mapreduce_state)
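# Illustrative sketch, not part of the library code above: the done callback
# configured via PARAM_DONE_CALLBACK arrives as a task queue POST to the
# user-supplied URL, carrying the job id in the "Mapreduce-Id" header. A
# minimal receiving endpoint might look like this; the class name and its
# route are hypothetical.
import logging

from google.appengine.ext import webapp

from mapreduce import model


class DoneCallbackHandler(webapp.RequestHandler):
  def post(self):
    mapreduce_id = self.request.headers.get("Mapreduce-Id")
    state = model.MapreduceState.get_by_job_id(mapreduce_id)
    logging.info("Mapreduce %s finished with status %s",
                 mapreduce_id, state and state.result_status)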
    if not shard_state.active:
      if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and
          tstate.output_writer):
        tstate.output_writer.finalize(ctx, shard_state)
  except Exception, e:
    retry_shard = self._retry_logic(e, shard_state, tstate, spec.mapreduce_id)
  finally:
    context.Context._set(None)
    if quota_consumer:
      quota_consumer.dispose()

  config = util.create_datastore_write_config(spec)

  @db.transactional(retries=5)
  def tx():
    fresh_shard_state = db.get(
        model.ShardState.get_key_by_shard_id(shard_id))
    if not fresh_shard_state:
      raise db.Rollback()
    if (not fresh_shard_state.active or
        "worker_active_state_collision" in _TEST_INJECTED_FAULTS):
      shard_state.active = False
      logging.error("Spurious task execution. Aborting the shard.")
      return
def handle(self): """Handle request.""" spec = model.MapreduceSpec.from_json_str( self.request.get("mapreduce_spec")) self._start_time = self._time() shard_id = self.shard_id() logging.debug("post: shard=%s slice=%s headers=%s", shard_id, self.slice_id(), self.request.headers) shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: logging.error("State not found for shard ID %r; shutting down", shard_id) return if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = self.input_reader(spec.mapper) if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) context.Context._set(ctx) try: if not quota_consumer or quota_consumer.check(): scan_aborted = False entity = None if not quota_consumer or quota_consumer.consume(): for entity in input_reader: if isinstance(entity, db.Model): shard_state.last_work_item = repr(entity.key()) else: shard_state.last_work_item = repr(entity)[:100] scan_aborted = not self.process_entity(entity, ctx) if (quota_consumer and not scan_aborted and not quota_consumer.consume()): scan_aborted = True if scan_aborted: break else: scan_aborted = True if not scan_aborted: logging.info("Processing done for shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) if quota_consumer: quota_consumer.put(1) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_SUCCESS ctx.flush() finally: context.Context._set(None) if quota_consumer: quota_consumer.dispose() if shard_state.active: self.reschedule(spec, input_reader)
def handle(self): """Handle request.""" spec = model.MapreduceSpec.from_json_str( self.request.get("mapreduce_spec")) logging.debug("post: id=%s headers=%s", spec.mapreduce_id, self.request.headers) state, control = db.get([ model.MapreduceState.get_key_by_job_id(spec.mapreduce_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not state: logging.error("State not found for mapreduce_id '%s'; skipping", spec.mapreduce_id) return shard_states = model.ShardState.find_by_mapreduce_id(spec.mapreduce_id) if state.active and len(shard_states) != spec.mapper.shard_count: logging.error( "Incorrect number of shard states: %d vs %d; " "aborting job '%s'", len(shard_states), spec.mapper.shard_count, spec.mapreduce_id) state.active = False state.result_status = model.MapreduceState.RESULT_FAILED model.MapreduceControl.abort(spec.mapreduce_id) active_shards = [s for s in shard_states if s.active] failed_shards = [ s for s in shard_states if s.result_status == model.ShardState.RESULT_FAILED ] aborted_shards = [ s for s in shard_states if s.result_status == model.ShardState.RESULT_ABORTED ] if state.active: state.active = bool(active_shards) state.active_shards = len(active_shards) state.failed_shards = len(failed_shards) state.aborted_shards = len(aborted_shards) if (not state.active and control and control.command == model.MapreduceControl.ABORT): logging.info("Abort signal received for job '%s'", spec.mapreduce_id) state.result_status = model.MapreduceState.RESULT_ABORTED if not state.active: state.active_shards = 0 if not state.result_status: if [ s for s in shard_states if s.result_status != model.ShardState.RESULT_SUCCESS ]: state.result_status = model.MapreduceState.RESULT_FAILED else: state.result_status = model.MapreduceState.RESULT_SUCCESS logging.info("Final result for job '%s' is '%s'", spec.mapreduce_id, state.result_status) self.aggregate_state(state, shard_states) poll_time = state.last_poll_time state.last_poll_time = datetime.datetime.utcfromtimestamp(self._time()) config = util.create_datastore_write_config(spec) if not state.active: def put_state(state): state.put(config=config) done_callback = spec.params.get( model.MapreduceSpec.PARAM_DONE_CALLBACK) if done_callback: done_task = taskqueue.Task( url=done_callback, headers={"Mapreduce-Id": spec.mapreduce_id}) queue_name = spec.params.get( model.MapreduceSpec.PARAM_DONE_CALLBACK_QUEUE, "default") if not _run_task_hook(spec.get_hooks(), "enqueue_done_task", done_task, queue_name): done_task.add(queue_name, transactional=True) db.run_in_transaction(put_state, state) return else: state.put(config=config) processing_rate = int( spec.mapper.params.get("processing_rate") or model._DEFAULT_PROCESSING_RATE_PER_SEC) self.refill_quotas(poll_time, processing_rate, active_shards) ControllerCallbackHandler.reschedule(self.base_path(), spec, self.serial_id() + 1)
def handle(self): """Handle request.""" spec = model.MapreduceSpec.from_json_str( self.request.get("mapreduce_spec")) self._start_time = self._time() shard_id = self.shard_id() logging.debug("post: shard=%s slice=%s headers=%s", shard_id, self.slice_id(), self.request.headers) shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: logging.error("State not found for shard ID %r; shutting down", shard_id) return if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = self.input_reader(spec.mapper) if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) context.Context._set(ctx) try: if not quota_consumer or quota_consumer.check(): scan_aborted = False entity = None if not quota_consumer or quota_consumer.consume(): for entity in input_reader: if isinstance(entity, db.Model): shard_state.last_work_item = repr(entity.key()) else: shard_state.last_work_item = repr(entity)[:100] scan_aborted = not self.process_entity(entity, ctx) if (quota_consumer and not scan_aborted and not quota_consumer.consume()): scan_aborted = True if scan_aborted: break else: scan_aborted = True if not scan_aborted: logging.info("Processing done for shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) if quota_consumer: quota_consumer.put(1) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_SUCCESS ctx.flush() finally: context.Context._set(None) if quota_consumer: quota_consumer.dispose() if shard_state.active: self.reschedule(spec, input_reader)
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_id = tstate.shard_id shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: logging.error("State not found for shard ID %r; shutting down", shard_id) return if not shard_state.active: logging.error("Shard is not active. Looks like spurious task execution.") return ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = tstate.input_reader if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None if ndb is not None: ndb_ctx = ndb.get_context() ndb_ctx.set_cache_policy(lambda key: False) ndb_ctx.set_memcache_policy(lambda key: False) context.Context._set(ctx) try: if not quota_consumer or quota_consumer.check(): scan_aborted = False entity = None try: if not quota_consumer or quota_consumer.consume(): for entity in input_reader: if isinstance(entity, db.Model): shard_state.last_work_item = repr(entity.key()) else: shard_state.last_work_item = repr(entity)[:100] scan_aborted = not self.process_data( entity, input_reader, ctx, tstate) if (quota_consumer and not scan_aborted and not quota_consumer.consume()): scan_aborted = True if scan_aborted: break else: scan_aborted = True if not scan_aborted: logging.info("Processing done for shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) if quota_consumer: quota_consumer.put(1) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_SUCCESS operation.counters.Increment( context.COUNTER_MAPPER_WALLTIME_MS, int((time.time() - self._start_time)*1000))(ctx) ctx.flush() except errors.RetrySliceError, e: logging.error("Slice error: %s", e) retry_count = int( os.environ.get("HTTP_X_APPENGINE_TASKRETRYCOUNT") or 0) if retry_count <= _RETRY_SLICE_ERROR_MAX_RETRIES: raise logging.error("Too many retries: %d, failing the job", retry_count) scan_aborted = True shard_state.active = False shard_state.result_status = model.ShardState.RESULT_FAILED except errors.FailJobError, e: logging.error("Job failed: %s", e) scan_aborted = True shard_state.active = False shard_state.result_status = model.ShardState.RESULT_FAILED
def handle(self): """Handle request.""" spec = model.MapreduceSpec.from_json_str( self.request.get("mapreduce_spec")) logging.debug("post: id=%s headers=%s spec=%s", spec.mapreduce_id, self.request.headers, self.request.get("mapreduce_spec")) state, control = db.get([ model.MapreduceState.get_key_by_job_id(spec.mapreduce_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not state: logging.error("State not found for mapreduce_id '%s'; skipping", spec.mapreduce_id) return shard_states = model.ShardState.find_by_mapreduce_state(state) if state.active and len(shard_states) != spec.mapper.shard_count: logging.error( "Incorrect number of shard states: %d vs %d; " "aborting job '%s'", len(shard_states), spec.mapper.shard_count, spec.mapreduce_id) state.active = False state.result_status = model.MapreduceState.RESULT_FAILED model.MapreduceControl.abort(spec.mapreduce_id) active_shards = [s for s in shard_states if s.active] failed_shards = [ s for s in shard_states if s.result_status == model.ShardState.RESULT_FAILED ] aborted_shards = [ s for s in shard_states if s.result_status == model.ShardState.RESULT_ABORTED ] if state.active: state.active = bool(active_shards) state.active_shards = len(active_shards) state.failed_shards = len(failed_shards) state.aborted_shards = len(aborted_shards) if not control and failed_shards: model.MapreduceControl.abort(spec.mapreduce_id) if (not state.active and control and control.command == model.MapreduceControl.ABORT): logging.info("Abort signal received for job '%s'", spec.mapreduce_id) state.result_status = model.MapreduceState.RESULT_ABORTED if not state.active: state.active_shards = 0 if not state.result_status: if [ s for s in shard_states if s.result_status != model.ShardState.RESULT_SUCCESS ]: state.result_status = model.MapreduceState.RESULT_FAILED else: state.result_status = model.MapreduceState.RESULT_SUCCESS logging.info("Final result for job '%s' is '%s'", spec.mapreduce_id, state.result_status) self.aggregate_state(state, shard_states) poll_time = state.last_poll_time state.last_poll_time = datetime.datetime.utcfromtimestamp(self._time()) if not state.active: ControllerCallbackHandler._finalize_job(spec, state, self.base_path()) return else: config = util.create_datastore_write_config(spec) state.put(config=config) processing_rate = int( spec.mapper.params.get("processing_rate") or model._DEFAULT_PROCESSING_RATE_PER_SEC) self.refill_quotas(poll_time, processing_rate, active_shards) ControllerCallbackHandler.reschedule(state, self.base_path(), spec, self.serial_id() + 1)
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_id = tstate.shard_id shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: logging.error("State not found for shard ID %r; shutting down", shard_id) return if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) if tstate.output_writer: tstate.output_writer.finalize(None, shard_state.shard_number) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = tstate.input_reader if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) context.Context._set(ctx) try: if not quota_consumer or quota_consumer.check(): scan_aborted = False entity = None if not quota_consumer or quota_consumer.consume(): for entity in input_reader: if isinstance(entity, db.Model): shard_state.last_work_item = repr(entity.key()) else: shard_state.last_work_item = repr(entity)[:100] scan_aborted = not self.process_data( entity, input_reader, ctx, tstate) if (quota_consumer and not scan_aborted and not quota_consumer.consume()): scan_aborted = True if scan_aborted: break else: scan_aborted = True if not scan_aborted: logging.info("Processing done for shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) if quota_consumer: quota_consumer.put(1) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_SUCCESS operation.counters.Increment( "mapper-walltime-msec", int((time.time() - self._start_time) * 1000))(ctx) ctx.flush() if not shard_state.active: if tstate.output_writer: tstate.output_writer.finalize(ctx, shard_state.shard_number) shard_state.put(config=util.create_datastore_write_config(spec)) finally: context.Context._set(None) if quota_consumer: quota_consumer.dispose() if shard_state.active: self.reschedule(shard_state, tstate) gc.collect()
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_id = tstate.shard_id shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: logging.error("State not found for shard ID %r; shutting down", shard_id) return if not shard_state.active: logging.error("Shard is not active. Looks like spurious task execution.") return ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) if tstate.output_writer: tstate.output_writer.finalize(ctx, shard_state.shard_number) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = tstate.input_reader if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None context.Context._set(ctx) try: if not quota_consumer or quota_consumer.check(): scan_aborted = False entity = None try: if not quota_consumer or quota_consumer.consume(): for entity in input_reader: if isinstance(entity, db.Model): shard_state.last_work_item = repr(entity.key()) else: shard_state.last_work_item = repr(entity)[:100] scan_aborted = not self.process_data( entity, input_reader, ctx, tstate) if (quota_consumer and not scan_aborted and not quota_consumer.consume()): scan_aborted = True if scan_aborted: break else: scan_aborted = True if not scan_aborted: logging.info("Processing done for shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) if quota_consumer: quota_consumer.put(1) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_SUCCESS operation.counters.Increment( context.COUNTER_MAPPER_WALLTIME_MS, int((time.time() - self._start_time)*1000))(ctx) ctx.flush() except errors.FailJobError, e: logging.error("Job failed: %s", e) scan_aborted = True shard_state.active = False shard_state.result_status = model.ShardState.RESULT_FAILED if not shard_state.active: if tstate.output_writer: tstate.output_writer.finalize(ctx, shard_state.shard_number) config = util.create_datastore_write_config(spec) @db.transactional(retries=5) def tx(): fresh_shard_state = db.get( model.ShardState.get_key_by_shard_id(shard_id)) if (not fresh_shard_state.active or "worker_active_state_collision" in _TEST_INJECTED_FAULTS): shard_state.active = False logging.error("Spurious task execution. Aborting the shard.") return fresh_shard_state.copy_from(shard_state) fresh_shard_state.put(config=config) tx()
def _save_state_and_schedule_next(self, shard_state, tstate, retry_shard):
  """Save state to datastore and schedule next task for this shard.

  Update and save shard state. Schedule next slice if needed.
  This method handles interactions with datastore and taskqueue.

  Args:
    shard_state: model.ShardState for current shard.
    tstate: model.TransientShardState for current shard.
    retry_shard: whether to retry shard.
  """
  spec = tstate.mapreduce_spec
  config = util.create_datastore_write_config(spec)

  task = None
  if retry_shard:
    task = self._state_to_task(tstate)
  elif shard_state.active:
    shard_state.advance_for_next_slice()
    tstate.advance_for_next_slice()
    countdown = self._get_countdown_for_next_slice(spec)
    task = self._state_to_task(tstate, countdown=countdown)
  queue_name = os.environ.get("HTTP_X_APPENGINE_QUEUENAME", "default")

  @db.transactional(retries=5)
  def _tx():
    fresh_shard_state = model.ShardState.get_by_shard_id(tstate.shard_id)
    if not fresh_shard_state:
      raise db.Rollback()
    if (not fresh_shard_state.active or
        "worker_active_state_collision" in _TEST_INJECTED_FAULTS):
      logging.error("Shard %s is not active. Possible spurious task "
                    "execution. Dropping this task.", tstate.shard_id)
      logging.error("Datastore's %s", str(fresh_shard_state))
      logging.error("Slice's %s", str(shard_state))
      return
    fresh_shard_state.copy_from(shard_state)
    if fresh_shard_state.active:
      assert task is not None
      self._add_task(task, fresh_shard_state, spec, queue_name)
    fresh_shard_state.put(config=config)

  try:
    _tx()
  except (datastore_errors.Error,
          taskqueue.Error,
          runtime.DeadlineExceededError,
          apiproxy_errors.Error), e:
    logging.error(
        "Can't transactionally continue shard. "
        "Will retry slice %s %s for the %s time.",
        tstate.shard_id, tstate.slice_id,
        self.task_retry_count() + 1)
    shard_state.slice_id -= 1
    self._try_free_lease(shard_state)
    raise e
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_id = tstate.shard_id shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: logging.error("State not found for shard ID %r; shutting down", shard_id) return if not shard_state.active: logging.error("Shard is not active. Looks like spurious task execution.") return ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = tstate.input_reader if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None if ndb is not None: ndb_ctx = ndb.get_context() ndb_ctx.set_cache_policy(lambda key: False) ndb_ctx.set_memcache_policy(lambda key: False) context.Context._set(ctx) try: self.process_inputs( input_reader, shard_state, tstate, quota_consumer, ctx) if not shard_state.active: if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and tstate.output_writer): tstate.output_writer.finalize(ctx, shard_state.shard_number) config = util.create_datastore_write_config(spec) @db.transactional(retries=5) def tx(): fresh_shard_state = db.get( model.ShardState.get_key_by_shard_id(shard_id)) if not fresh_shard_state: raise db.Rollback() if (not fresh_shard_state.active or "worker_active_state_collision" in _TEST_INJECTED_FAULTS): shard_state.active = False logging.error("Spurious task execution. Aborting the shard.") return fresh_shard_state.copy_from(shard_state) fresh_shard_state.put(config=config) tx() finally: context.Context._set(None) if quota_consumer: quota_consumer.dispose() if shard_state.active: self.reschedule(shard_state, tstate) gc.collect()
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_id = tstate.shard_id shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: logging.error("State not found for shard ID %r; shutting down", shard_id) return if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) if tstate.output_writer: tstate.output_writer.finalize(None, shard_state.shard_number) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = tstate.input_reader if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) context.Context._set(ctx) try: if not quota_consumer or quota_consumer.check(): scan_aborted = False entity = None if not quota_consumer or quota_consumer.consume(): for entity in input_reader: if isinstance(entity, db.Model): shard_state.last_work_item = repr(entity.key()) else: shard_state.last_work_item = repr(entity)[:100] scan_aborted = not self.process_data( entity, input_reader, ctx, tstate) if (quota_consumer and not scan_aborted and not quota_consumer.consume()): scan_aborted = True if scan_aborted: break else: scan_aborted = True if not scan_aborted: logging.info("Processing done for shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) if quota_consumer: quota_consumer.put(1) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_SUCCESS operation.counters.Increment( "mapper-walltime-msec", int((time.time() - self._start_time)*1000))(ctx) ctx.flush() if not shard_state.active: if tstate.output_writer: tstate.output_writer.finalize(ctx, shard_state.shard_number) shard_state.put(config=util.create_datastore_write_config(spec)) finally: context.Context._set(None) if quota_consumer: quota_consumer.dispose() if shard_state.active: self.reschedule(shard_state, tstate) gc.collect()
def _start_map(cls, name, mapper_spec, mapreduce_params,
               base_path=None, queue_name=None, eta=None, countdown=None,
               hooks_class_name=None, _app=None, transactional=False,
               parent_entity=None):
  queue_name = queue_name or os.environ.get("HTTP_X_APPENGINE_QUEUENAME",
                                            "default")
  if queue_name[0] == "_":
    # We are currently in some special queue. E.g. __cron.
    queue_name = "default"

  if not transactional and parent_entity:
    raise Exception("Parent shouldn't be specified "
                    "for non-transactional starts.")

  # Validate input reader and output writer configuration.
  mapper_input_reader_class = mapper_spec.input_reader_class()
  mapper_input_reader_class.validate(mapper_spec)
  mapper_output_writer_class = mapper_spec.output_writer_class()
  if mapper_output_writer_class:
    mapper_output_writer_class.validate(mapper_spec)

  mapreduce_id = model.MapreduceState.new_mapreduce_id()
  mapreduce_spec = model.MapreduceSpec(
      name,
      mapreduce_id,
      mapper_spec.to_json(),
      mapreduce_params,
      hooks_class_name)

  # Resolve the handler now to fail fast on a bad handler spec.
  ctx = context.Context(mapreduce_spec, None)
  context.Context._set(ctx)
  try:
    mapper_spec.get_handler()
  finally:
    context.Context._set(None)

  kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()}
  if _app:
    kickoff_params["app"] = _app
  kickoff_worker_task = util.HugeTask(
      url=base_path + "/kickoffjob_callback",
      params=kickoff_params,
      eta=eta,
      countdown=countdown)

  hooks = mapreduce_spec.get_hooks()
  config = util.create_datastore_write_config(mapreduce_spec)

  def start_mapreduce():
    parent = parent_entity
    if not transactional:
      state = model.MapreduceState.create_new(mapreduce_spec.mapreduce_id)
      state.mapreduce_spec = mapreduce_spec
      state.active = True
      state.active_shards = mapper_spec.shard_count
      if _app:
        state.app_id = _app
      state.put(config=config)
      parent = state

    if hooks is not None:
      try:
        hooks.enqueue_kickoff_task(kickoff_worker_task, queue_name)
      except NotImplementedError:
        pass
      else:
        return
    kickoff_worker_task.add(queue_name, transactional=True, parent=parent)

  if transactional:
    start_mapreduce()
  else:
    db.run_in_transaction(start_mapreduce)

  return mapreduce_id
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(tstate.shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) lease_acquired = self._try_acquire_lease(shard_state, tstate) if lease_acquired is None: self.retry_task() return if not lease_acquired: return ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) return ndb_ctx = ndb.get_context() ndb_ctx.set_cache_policy(lambda key: False) ndb_ctx.set_memcache_policy(lambda key: False) context.Context._set(ctx) retry_directive = False try: self.process_inputs( tstate.input_reader, shard_state, tstate, ctx) if not shard_state.active: if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and tstate.output_writer): tstate.output_writer.finalize(ctx, shard_state) except Exception, e: retry_directive = self._retry_logic( e, shard_state, tstate, spec.mapreduce_id)
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_id = tstate.shard_id shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: logging.error("State not found for shard ID %r; shutting down", shard_id) return if not shard_state.active: logging.error("Shard is not active. Looks like spurious task execution.") return if shard_state.retries > tstate.retries: logging.error( "Got shard %s from previous shard retry %s. Drop", shard_state.shard_id, tstate.retries) return elif shard_state.retries < tstate.retries: raise ValueError( "ShardState for %s is behind slice. Waiting for it to catch up", shard_state.shard_id) ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = tstate.input_reader if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None if ndb is not None: ndb_ctx = ndb.get_context() ndb_ctx.set_cache_policy(lambda key: False) ndb_ctx.set_memcache_policy(lambda key: False) context.Context._set(ctx) retry_shard = False try: self.process_inputs( input_reader, shard_state, tstate, quota_consumer, ctx) if not shard_state.active: if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and tstate.output_writer): tstate.output_writer.finalize(ctx, shard_state) except Exception, e: retry_shard = self._retry_logic(e, shard_state, tstate, spec.mapreduce_id)
def _try_acquire_lease(self, shard_state, tstate):
  """Validate that datastore and the task payload are consistent.

  If so, attempt to get a lease on this slice's execution.
  See model.ShardState doc on slice_start_time.

  Args:
    shard_state: model.ShardState from datastore.
    tstate: model.TransientShardState from taskqueue payload.

  Returns:
    True if lease is acquired. None if this task should be retried.
    False if this task should be dropped. Only old tasks (compared to
    datastore state) will be dropped. Future tasks are retried until they
    naturally become old, so the MR never gets stuck.
  """
  if not shard_state:
    logging.warning("State not found for shard %s; Possible spurious task "
                    "execution. Dropping this task.",
                    tstate.shard_id)
    return False

  if not shard_state.active:
    logging.warning("Shard %s is not active. Possible spurious task "
                    "execution. Dropping this task.", tstate.shard_id)
    logging.warning(str(shard_state))
    return False

  # Validate shard retry count.
  if shard_state.retries > tstate.retries:
    logging.warning(
        "Got shard %s from previous shard retry %s. Possible spurious "
        "task execution. Dropping this task.",
        tstate.shard_id,
        tstate.retries)
    logging.warning(str(shard_state))
    return False
  elif shard_state.retries < tstate.retries:
    logging.warning(
        "ShardState for %s is behind slice. Waiting for it to catch up",
        shard_state.shard_id)
    return

  # Validate slice id.
  if shard_state.slice_id > tstate.slice_id:
    logging.warning(
        "Task %s-%s is behind ShardState %s. Dropping task.",
        tstate.shard_id, tstate.slice_id, shard_state.slice_id)
    return False
  elif shard_state.slice_id < tstate.slice_id:
    logging.warning(
        "Task %s-%s is ahead of ShardState %s. Waiting for it to catch up.",
        tstate.shard_id, tstate.slice_id, shard_state.slice_id)
    return

  # A lease was taken for this slice; check whether it has expired.
  if shard_state.slice_start_time:
    countdown = self._wait_time(shard_state,
                                _LEASE_GRACE_PERIOD + _SLICE_DURATION_SEC)
    if countdown > 0:
      logging.warning(
          "Last retry of slice %s-%s may still be running. "
          "Will try again in %s seconds",
          tstate.shard_id, tstate.slice_id, countdown)
      time.sleep(countdown)
      return
    else:
      if (not self._old_request_ended(shard_state) and
          self._wait_time(shard_state, _REQUEST_EVENTUAL_TIMEOUT)):
        logging.warning(
            "Last retry of slice %s-%s is still in flight with request_id "
            "%s. Will try again later.",
            tstate.shard_id,
            tstate.slice_id,
            shard_state.slice_request_id)
        return

  config = util.create_datastore_write_config(tstate.mapreduce_spec)

  @db.transactional(retries=5)
  def _tx():
    """Use datastore to set slice_start_time to now.

    If failed for any reason, raise error to retry the task (hence all
    the previous validation code). The task would die naturally eventually.

    Returns:
      True if state commit succeeded. None otherwise.
    """
    fresh_state = model.ShardState.get_by_shard_id(tstate.shard_id)
    if not fresh_state:
      logging.error("ShardState missing.")
      raise db.Rollback()
    if (fresh_state.active and
        fresh_state.slice_id == shard_state.slice_id and
        fresh_state.slice_start_time == shard_state.slice_start_time):
      fresh_state.slice_start_time = datetime.datetime.now()
      fresh_state.slice_request_id = os.environ.get("REQUEST_LOG_ID")
      fresh_state.put(config=config)
      return True
    else:
      logging.warning(
          "Contention on slice %s-%s execution. Will retry again.",
          tstate.shard_id, tstate.slice_id)
      # One proposer should win. In case all lost, back off arbitrarily.
      time.sleep(random.randrange(1, 5))
      return

  return _tx()
    self.process_inputs(input_reader, shard_state, tstate, ctx)

    if not shard_state.active:
      if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and
          tstate.output_writer):
        tstate.output_writer.finalize(ctx, shard_state)
  except Exception, e:
    retry_shard = self._retry_logic(e, shard_state, tstate, spec.mapreduce_id)
  finally:
    context.Context._set(None)

  config = util.create_datastore_write_config(spec)

  @db.transactional(retries=5)
  def tx():
    fresh_shard_state = db.get(
        model.ShardState.get_key_by_shard_id(shard_id))
    if not fresh_shard_state:
      raise db.Rollback()
    if (not fresh_shard_state.active or
        "worker_active_state_collision" in _TEST_INJECTED_FAULTS):
      logging.error(
          "Shard %s is not active. Possible spurious task "
          "execution. Dropping this task.", shard_id)
      logging.error("Datastore's %s", str(fresh_shard_state))
      logging.error("Slice's %s", str(shard_state))
      return
def _start_map(cls, name, mapper_spec, mapreduce_params,
               base_path=None, queue_name="default", eta=None,
               countdown=None, hooks_class_name=None, _app=None,
               transactional=False):
  mapper_spec.get_handler()

  mapper_input_reader_class = mapper_spec.input_reader_class()
  mapper_input_reader_class.validate(mapper_spec)

  mapper_output_writer_class = mapper_spec.output_writer_class()
  if mapper_output_writer_class:
    mapper_output_writer_class.validate(mapper_spec)

  mapreduce_id = model.MapreduceState.new_mapreduce_id()
  mapreduce_spec = model.MapreduceSpec(
      name,
      mapreduce_id,
      mapper_spec.to_json(),
      mapreduce_params,
      hooks_class_name)

  kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()}
  if _app:
    kickoff_params["app"] = _app
  kickoff_worker_task = util.HugeTask(
      url=base_path + "/kickoffjob_callback",
      params=kickoff_params,
      eta=eta,
      countdown=countdown)

  hooks = mapreduce_spec.get_hooks()
  config = util.create_datastore_write_config(mapreduce_spec)

  def start_mapreduce():
    parent = None
    if not transactional:
      state = model.MapreduceState.create_new(mapreduce_spec.mapreduce_id)
      state.mapreduce_spec = mapreduce_spec
      state.active = True
      state.active_shards = mapper_spec.shard_count
      if _app:
        state.app_id = _app
      state.put(config=config)
      parent = state

    if hooks is not None:
      try:
        hooks.enqueue_kickoff_task(kickoff_worker_task, queue_name)
      except NotImplementedError:
        pass
      else:
        return
    kickoff_worker_task.add(queue_name, transactional=True, parent=parent)

  if transactional:
    start_mapreduce()
  else:
    db.run_in_transaction(start_mapreduce)

  return mapreduce_id
def _save_state_and_schedule_next(self, shard_state, tstate, retry_shard):
  """Save state to datastore and schedule next task for this shard.

  Update and save shard state. Schedule next slice if needed.
  This method handles interactions with datastore and taskqueue.

  Args:
    shard_state: model.ShardState for current shard.
    tstate: model.TransientShardState for current shard.
    retry_shard: whether to retry shard.
  """
  spec = tstate.mapreduce_spec
  config = util.create_datastore_write_config(spec)

  task = None
  if retry_shard:
    task = self._state_to_task(tstate, shard_state)
  elif shard_state.active:
    shard_state.advance_for_next_slice()
    tstate.advance_for_next_slice()
    countdown = self._get_countdown_for_next_slice(spec)
    task = self._state_to_task(tstate, shard_state, countdown=countdown)
  queue_name = os.environ.get("HTTP_X_APPENGINE_QUEUENAME", "default")

  @db.transactional(retries=5)
  def _tx():
    fresh_shard_state = model.ShardState.get_by_shard_id(tstate.shard_id)
    if not fresh_shard_state:
      raise db.Rollback()
    if (not fresh_shard_state.active or
        "worker_active_state_collision" in _TEST_INJECTED_FAULTS):
      logging.error("Shard %s is not active. Possible spurious task "
                    "execution. Dropping this task.", tstate.shard_id)
      logging.error("Datastore's %s", str(fresh_shard_state))
      logging.error("Slice's %s", str(shard_state))
      return
    fresh_shard_state.copy_from(shard_state)
    fresh_shard_state.put(config=config)
    if fresh_shard_state.active:
      assert task is not None
      self._add_task(task, spec, queue_name)

  try:
    _tx()
  except (datastore_errors.Error,
          taskqueue.Error,
          runtime.DeadlineExceededError,
          apiproxy_errors.Error), e:
    logging.error(
        "Can't transactionally continue shard. "
        "Will retry slice %s %s for the %s time.",
        tstate.shard_id, tstate.slice_id,
        self.task_retry_count() + 1)
    shard_state.slice_id -= 1
    self._try_free_lease(shard_state)
    raise e
def handle(self): """Handle request.""" spec = model.MapreduceSpec.from_json_str( self.request.get("mapreduce_spec")) logging.debug("post: id=%s headers=%s spec=%s", spec.mapreduce_id, self.request.headers, self.request.get("mapreduce_spec")) state, control = db.get([ model.MapreduceState.get_key_by_job_id(spec.mapreduce_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not state: logging.error("State not found for mapreduce_id '%s'; skipping", spec.mapreduce_id) return shard_states = model.ShardState.find_by_mapreduce_state(state) if state.active and len(shard_states) != spec.mapper.shard_count: logging.error("Incorrect number of shard states: %d vs %d; " "aborting job '%s'", len(shard_states), spec.mapper.shard_count, spec.mapreduce_id) state.active = False state.result_status = model.MapreduceState.RESULT_FAILED model.MapreduceControl.abort(spec.mapreduce_id) active_shards = [s for s in shard_states if s.active] failed_shards = [s for s in shard_states if s.result_status == model.ShardState.RESULT_FAILED] aborted_shards = [s for s in shard_states if s.result_status == model.ShardState.RESULT_ABORTED] if state.active: state.active = bool(active_shards) state.active_shards = len(active_shards) state.failed_shards = len(failed_shards) state.aborted_shards = len(aborted_shards) if (not state.active and control and control.command == model.MapreduceControl.ABORT): logging.info("Abort signal received for job '%s'", spec.mapreduce_id) state.result_status = model.MapreduceState.RESULT_ABORTED if not state.active: state.active_shards = 0 if not state.result_status: if [s for s in shard_states if s.result_status != model.ShardState.RESULT_SUCCESS]: state.result_status = model.MapreduceState.RESULT_FAILED else: state.result_status = model.MapreduceState.RESULT_SUCCESS logging.info("Final result for job '%s' is '%s'", spec.mapreduce_id, state.result_status) self.aggregate_state(state, shard_states) poll_time = state.last_poll_time state.last_poll_time = datetime.datetime.utcfromtimestamp(self._time()) if not state.active: ControllerCallbackHandler._finalize_job( spec, state, self.base_path()) return else: config = util.create_datastore_write_config(spec) state.put(config=config) processing_rate = int(spec.mapper.params.get( "processing_rate") or model._DEFAULT_PROCESSING_RATE_PER_SEC) self.refill_quotas(poll_time, processing_rate, active_shards) ControllerCallbackHandler.reschedule( state, self.base_path(), spec, self.serial_id() + 1)
def handle(self):
  """Handle request."""
  tstate = model.TransientShardState.from_request(self.request)
  spec = tstate.mapreduce_spec
  self._start_time = self._time()
  shard_id = tstate.shard_id

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    logging.error("State not found for shard ID %r; shutting down",
                  shard_id)
    return

  if not shard_state.active:
    logging.error(
        "Shard is not active. Looks like spurious task execution.")
    return

  ctx = context.Context(spec, shard_state,
                        task_retry_count=self.task_retry_count())

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    if tstate.output_writer:
      tstate.output_writer.finalize(ctx, shard_state.shard_number)
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.put(config=util.create_datastore_write_config(spec))
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = tstate.input_reader

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  context.Context._set(ctx)
  try:
    if not quota_consumer or quota_consumer.check():
      scan_aborted = False
      entity = None

      try:
        if not quota_consumer or quota_consumer.consume():
          for entity in input_reader:
            if isinstance(entity, db.Model):
              shard_state.last_work_item = repr(entity.key())
            else:
              shard_state.last_work_item = repr(entity)[:100]

            scan_aborted = not self.process_data(
                entity, input_reader, ctx, tstate)

            if (quota_consumer and not scan_aborted and
                not quota_consumer.consume()):
              scan_aborted = True
            if scan_aborted:
              break
        else:
          scan_aborted = True

        if not scan_aborted:
          logging.info("Processing done for shard %d of job '%s'",
                       shard_state.shard_number, shard_state.mapreduce_id)
          # Return the unspent quota credit.
          if quota_consumer:
            quota_consumer.put(1)
          shard_state.active = False
          shard_state.result_status = model.ShardState.RESULT_SUCCESS

        operation.counters.Increment(
            context.COUNTER_MAPPER_WALLTIME_MS,
            int((time.time() - self._start_time) * 1000))(ctx)
        ctx.flush()
      except errors.FailJobError, e:
        logging.error("Job failed: %s", e)
        scan_aborted = True
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_FAILED

    if not shard_state.active:
      if tstate.output_writer:
        tstate.output_writer.finalize(ctx, shard_state.shard_number)

    config = util.create_datastore_write_config(spec)

    @db.transactional(retries=5)
    def tx():
      fresh_shard_state = db.get(
          model.ShardState.get_key_by_shard_id(shard_id))
      if (not fresh_shard_state.active or
          "worker_active_state_collision" in _TEST_INJECTED_FAULTS):
        shard_state.active = False
        logging.error("Spurious task execution. Aborting the shard.")
        return
      fresh_shard_state.copy_from(shard_state)
      fresh_shard_state.put(config=config)
    tx()
  finally:
    # Reset the per-request context and release the quota consumer.
    context.Context._set(None)
    if quota_consumer:
      quota_consumer.dispose()
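# Illustrative sketch: the quota dance above batch-reserves processing
# credit, spends one credit per entity, and returns unspent credit with
# put(1) when the scan finishes. A self-contained toy with the same
# control flow (ToyQuota is hypothetical, not the library's QuotaConsumer):
class ToyQuota(object):
  """In-memory stand-in for a batched quota consumer."""

  def __init__(self, budget):
    self.budget = budget

  def consume(self):
    if self.budget <= 0:
      return False
    self.budget -= 1
    return True

  def put(self, amount):
    self.budget += amount


def toy_scan(entities, quota_consumer):
  """Processes entities until quota runs out; True if the scan completed."""
  for _ in entities:
    if not quota_consumer.consume():
      return False  # out of quota: stop now and reschedule a later slice
  return True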
def _start_map(cls, name, mapper_spec,
               mapreduce_params,
               base_path=None,
               queue_name=None,
               eta=None,
               countdown=None,
               hooks_class_name=None,
               _app=None,
               transactional=False,
               parent_entity=None):
  queue_name = queue_name or os.environ.get("HTTP_X_APPENGINE_QUEUENAME",
                                            "default")
  if queue_name[0] == "_":
    # We are currently in some special queue. E.g. __cron.
    queue_name = "default"

  if not transactional and parent_entity:
    raise Exception("Parent shouldn't be specified "
                    "for non-transactional starts.")

  # Validate the mapper's input reader and output writer.
  mapper_input_reader_class = mapper_spec.input_reader_class()
  mapper_input_reader_class.validate(mapper_spec)

  mapper_output_writer_class = mapper_spec.output_writer_class()
  if mapper_output_writer_class:
    mapper_output_writer_class.validate(mapper_spec)

  mapreduce_id = model.MapreduceState.new_mapreduce_id()
  mapreduce_spec = model.MapreduceSpec(
      name,
      mapreduce_id,
      mapper_spec.to_json(),
      mapreduce_params,
      hooks_class_name)

  # Check that the handler can be instantiated.
  ctx = context.Context(mapreduce_spec, None)
  context.Context._set(ctx)
  try:
    mapper_spec.get_handler()
  finally:
    context.Context._set(None)

  kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()}
  if _app:
    kickoff_params["app"] = _app
  kickoff_worker_task = util.HugeTask(
      url=base_path + "/kickoffjob_callback",
      params=kickoff_params,
      eta=eta,
      countdown=countdown)

  hooks = mapreduce_spec.get_hooks()
  config = util.create_datastore_write_config(mapreduce_spec)

  def start_mapreduce():
    parent = parent_entity
    if not transactional:
      # Save state in datastore so that UI can see it.
      state = model.MapreduceState.create_new(mapreduce_spec.mapreduce_id)
      state.mapreduce_spec = mapreduce_spec
      state.active = True
      state.active_shards = mapper_spec.shard_count
      if _app:
        state.app_id = _app
      state.put(config=config)
      parent = state

    if hooks is not None:
      try:
        hooks.enqueue_kickoff_task(kickoff_worker_task, queue_name)
      except NotImplementedError:
        pass
      else:
        return
    kickoff_worker_task.add(queue_name, transactional=True, parent=parent)

  if transactional:
    start_mapreduce()
  else:
    db.run_in_transaction(start_mapreduce)

  return mapreduce_id
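# Illustrative usage sketch: _start_map is the internal entry point;
# application code normally reaches it through mapreduce.control.start_map,
# roughly as below. The handler path and entity kind are hypothetical.
from mapreduce import control


def touch(entity):
  """Hypothetical map function: no-op over each mapped entity."""
  pass


def start_touch_job():
  return control.start_map(
      name="Touch all ExampleEntity rows",
      handler_spec=__name__ + ".touch",
      reader_spec="mapreduce.input_readers.DatastoreInputReader",
      mapper_parameters={"entity_kind": "myapp.models.ExampleEntity"},
      shard_count=8)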
def _try_acquire_lease(self, shard_state, tstate):
  """Validate that datastore and the task payload are consistent.

  If so, attempt to get a lease on this slice's execution.
  See model.ShardState doc on slice_start_time.

  Args:
    shard_state: model.ShardState from datastore.
    tstate: model.TransientShardState from the taskqueue payload.

  Returns:
    True if the lease is acquired. False if this task should be dropped.
    Only tasks that are old compared to datastore state are dropped;
    tasks from the future are retried until they naturally become old,
    so that the MR never gets stuck.

  Raises:
    Exception: if the task should be retried by taskqueue.
  """
  if not shard_state:
    logging.warning("State not found for shard %s; possible spurious task "
                    "execution. Dropping this task.",
                    tstate.shard_id)
    return False

  if not shard_state.active:
    logging.warning("Shard %s is not active. Possible spurious task "
                    "execution. Dropping this task.", tstate.shard_id)
    logging.warning(str(shard_state))
    return False

  if shard_state.retries > tstate.retries:
    logging.warning(
        "Got shard %s from previous shard retry %s. Possible spurious "
        "task execution. Dropping this task.",
        tstate.shard_id,
        tstate.retries)
    logging.warning(str(shard_state))
    return False
  elif shard_state.retries < tstate.retries:
    raise ValueError(
        "ShardState for %s is behind slice. Waiting for it to catch up." %
        shard_state.shard_id)

  if shard_state.slice_id > tstate.slice_id:
    logging.warning(
        "Task %s-%s is behind ShardState %s. Dropping task.",
        tstate.shard_id, tstate.slice_id, shard_state.slice_id)
    return False
  elif shard_state.slice_id < tstate.slice_id:
    logging.warning(
        "Task %s-%s is ahead of ShardState %s. Waiting for it to catch up.",
        tstate.shard_id, tstate.slice_id, shard_state.slice_id)
    raise errors.RetrySliceError("Raise an error to trigger retry.")

  if shard_state.slice_start_time:
    countdown = self._lease_countdown(shard_state)
    if countdown > 0:
      logging.warning(
          "Last retry of slice %s-%s may still be running. "
          "Will try again in %s seconds.",
          tstate.shard_id, tstate.slice_id, countdown)
      time.sleep(countdown)
      raise errors.RetrySliceError("Raise an error to trigger retry.")
    else:
      if not self._old_request_ended(shard_state):
        logging.warning(
            "Last retry of slice %s-%s is still in flight with request_id "
            "%s. Will try again later.",
            tstate.shard_id, tstate.slice_id, shard_state.slice_request_id)
        raise errors.RetrySliceError("Raise an error to trigger retry.")

  config = util.create_datastore_write_config(tstate.mapreduce_spec)

  @db.transactional(retries=5)
  def _tx():
    """Use datastore to set slice_start_time to now.

    If this fails for any reason, an error is raised to retry the task
    (hence all the previous validation code). The task will eventually
    die naturally.
    """
    fresh_state = model.ShardState.get_by_shard_id(tstate.shard_id)
    if not fresh_state:
      logging.error("ShardState missing.")
      raise db.Rollback()
    if (fresh_state.active and
        fresh_state.slice_id == shard_state.slice_id and
        fresh_state.slice_start_time == shard_state.slice_start_time):
      fresh_state.slice_start_time = datetime.datetime.now()
      fresh_state.slice_request_id = os.environ.get("REQUEST_LOG_ID")
      fresh_state.put(config=config)
    else:
      logging.warning(
          "Contention on slice %s-%s execution. Will retry again.",
          tstate.shard_id, tstate.slice_id)
      time.sleep(random.randrange(1, 5))
      raise errors.RetrySliceError()

  _tx()
  return True
def handle(self):
  """Handle request."""
  spec = model.MapreduceSpec.from_json_str(
      self.request.get("mapreduce_spec"))

  logging.debug("post: id=%s headers=%s spec=%s",
                spec.mapreduce_id, self.request.headers,
                self.request.get("mapreduce_spec"))

  state, control = db.get([
      model.MapreduceState.get_key_by_job_id(spec.mapreduce_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not state:
    logging.error("State not found for mapreduce_id '%s'; skipping",
                  spec.mapreduce_id)
    return

  shard_states = model.ShardState.find_by_mapreduce_id(spec.mapreduce_id)
  if state.active and len(shard_states) != spec.mapper.shard_count:
    logging.error("Incorrect number of shard states: %d vs %d; "
                  "aborting job '%s'",
                  len(shard_states), spec.mapper.shard_count,
                  spec.mapreduce_id)
    state.active = False
    state.result_status = model.MapreduceState.RESULT_FAILED
    model.MapreduceControl.abort(spec.mapreduce_id)

  active_shards = [s for s in shard_states if s.active]
  failed_shards = [s for s in shard_states
                   if s.result_status == model.ShardState.RESULT_FAILED]
  aborted_shards = [s for s in shard_states
                    if s.result_status == model.ShardState.RESULT_ABORTED]
  if state.active:
    state.active = bool(active_shards)
    state.active_shards = len(active_shards)
    state.failed_shards = len(failed_shards)
    state.aborted_shards = len(aborted_shards)

  if (not state.active and control and
      control.command == model.MapreduceControl.ABORT):
    logging.info("Abort signal received for job '%s'", spec.mapreduce_id)
    state.result_status = model.MapreduceState.RESULT_ABORTED

  if not state.active:
    state.active_shards = 0
    if not state.result_status:
      if [s for s in shard_states
          if s.result_status != model.ShardState.RESULT_SUCCESS]:
        state.result_status = model.MapreduceState.RESULT_FAILED
      else:
        state.result_status = model.MapreduceState.RESULT_SUCCESS
      logging.info("Final result for job '%s' is '%s'",
                   spec.mapreduce_id, state.result_status)

  self.aggregate_state(state, shard_states)
  poll_time = state.last_poll_time
  state.last_poll_time = datetime.datetime.utcfromtimestamp(self._time())
  config = util.create_datastore_write_config(spec)

  if not state.active:
    if spec.mapper.output_writer_class():
      spec.mapper.output_writer_class().finalize_job(state)

    def put_state(state):
      state.put(config=config)
      done_callback = spec.params.get(
          model.MapreduceSpec.PARAM_DONE_CALLBACK)
      if done_callback:
        done_task = taskqueue.Task(
            url=done_callback,
            headers={"Mapreduce-Id": spec.mapreduce_id},
            method=spec.params.get("done_callback_method", "POST"))
        queue_name = spec.params.get(
            model.MapreduceSpec.PARAM_DONE_CALLBACK_QUEUE,
            "default")
        if not _run_task_hook(spec.get_hooks(),
                              "enqueue_done_task",
                              done_task,
                              queue_name):
          done_task.add(queue_name, transactional=True)
      FinalizeJobHandler.schedule(self.base_path(), spec)

    db.run_in_transaction(put_state, state)
    return
  else:
    state.put(config=config)

  processing_rate = int(spec.mapper.params.get(
      "processing_rate") or model._DEFAULT_PROCESSING_RATE_PER_SEC)
  self.refill_quotas(poll_time, processing_rate, active_shards)
  ControllerCallbackHandler.reschedule(
      state, self.base_path(), spec, self.serial_id() + 1)
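# Illustrative sketch: the done_callback branch above enqueues a
# user-supplied completion task. A caller opts in by passing the callback
# URL under the "done_callback" job parameter (PARAM_DONE_CALLBACK),
# along the lines of this hypothetical example:
from mapreduce import control


def start_job_with_completion_hook():
  return control.start_map(
      name="job with completion hook",
      handler_spec="myapp.jobs.process",
      reader_spec="mapreduce.input_readers.DatastoreInputReader",
      mapper_parameters={"entity_kind": "myapp.models.ExampleEntity"},
      mapreduce_parameters={"done_callback": "/example_done_handler"},
      shard_count=4)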
def _try_acquire_lease(self, shard_state, tstate):
  """Validate that datastore and the task payload are consistent.

  If so, attempt to get a lease on this slice's execution.
  See model.ShardState doc on slice_start_time.

  Args:
    shard_state: model.ShardState from datastore.
    tstate: model.TransientShardState from the taskqueue payload.

  Returns:
    A fresh shard state entity if the lease is acquired. A _TASK_STATE
    enum if this task should be retried or dropped. Only tasks that are
    old compared to datastore state are dropped; tasks from the future
    are retried until they naturally become old, so that the MR never
    gets stuck.
  """
  if not shard_state:
    logging.warning("State not found for shard %s; possible spurious task "
                    "execution. Dropping this task.",
                    tstate.shard_id)
    return self._TASK_STATE.DROP_TASK

  if not shard_state.active:
    logging.warning("Shard %s is not active. Possible spurious task "
                    "execution. Dropping this task.", tstate.shard_id)
    logging.warning(str(shard_state))
    return self._TASK_STATE.DROP_TASK

  if shard_state.retries > tstate.retries:
    logging.warning(
        "Got shard %s from previous shard retry %s. Possible spurious "
        "task execution. Dropping this task.",
        tstate.shard_id,
        tstate.retries)
    logging.warning(str(shard_state))
    return self._TASK_STATE.DROP_TASK
  elif shard_state.retries < tstate.retries:
    logging.warning(
        "ShardState for %s is behind slice. Waiting for it to catch up.",
        shard_state.shard_id)
    return self._TASK_STATE.RETRY_TASK

  if shard_state.slice_id > tstate.slice_id:
    logging.warning(
        "Task %s-%s is behind ShardState %s. Dropping task.",
        tstate.shard_id, tstate.slice_id, shard_state.slice_id)
    return self._TASK_STATE.DROP_TASK
  elif shard_state.slice_id < tstate.slice_id:
    logging.warning(
        "Task %s-%s is ahead of ShardState %s. Waiting for it to catch up.",
        tstate.shard_id, tstate.slice_id, shard_state.slice_id)
    return self._TASK_STATE.RETRY_TASK

  if shard_state.slice_start_time:
    countdown = self._wait_time(shard_state,
                                _LEASE_GRACE_PERIOD + _SLICE_DURATION_SEC)
    if countdown > 0:
      logging.warning(
          "Last retry of slice %s-%s may still be running. "
          "Will try again in %s seconds.",
          tstate.shard_id, tstate.slice_id, countdown)
      time.sleep(countdown)
      return self._TASK_STATE.RETRY_TASK
    else:
      if self._wait_time(shard_state, _REQUEST_EVENTUAL_TIMEOUT):
        if not self._old_request_ended(shard_state):
          logging.warning(
              "Last retry of slice %s-%s is still in flight with request_id "
              "%s. Will try again later.",
              tstate.shard_id, tstate.slice_id, shard_state.slice_request_id)
          return self._TASK_STATE.RETRY_TASK
      else:
        logging.warning(
            "Last retry of slice %s-%s has no log entry and has "
            "timed out after %s seconds.",
            tstate.shard_id, tstate.slice_id, _REQUEST_EVENTUAL_TIMEOUT)

  config = util.create_datastore_write_config(tstate.mapreduce_spec)

  @db.transactional(retries=5)
  def _tx():
    """Use datastore to set slice_start_time to now.

    If this fails for any reason, an error is raised to retry the task
    (hence all the previous validation code). The task will eventually
    die naturally.

    Returns:
      A fresh shard state if the state commit succeeded;
      _TASK_STATE.RETRY_TASK otherwise.
    """
    fresh_state = model.ShardState.get_by_shard_id(tstate.shard_id)
    if not fresh_state:
      logging.error("ShardState missing.")
      raise db.Rollback()
    if (fresh_state.active and
        fresh_state.slice_id == shard_state.slice_id and
        fresh_state.slice_start_time == shard_state.slice_start_time):
      fresh_state.slice_start_time = datetime.datetime.now()
      fresh_state.slice_request_id = os.environ.get("REQUEST_LOG_ID")
      fresh_state.acquired_once = True
      fresh_state.put(config=config)
      return fresh_state
    else:
      logging.warning(
          "Contention on slice %s-%s execution. Will retry again.",
          tstate.shard_id, tstate.slice_id)
      time.sleep(random.randrange(1, 5))
      return self._TASK_STATE.RETRY_TASK

  return _tx()