def handle(self): """Handles kick off request.""" spec = model.MapreduceSpec.from_json_str( self._get_required_param("mapreduce_spec")) app_id = self.request.get("app", None) queue_name = os.environ.get("HTTP_X_APPENGINE_QUEUENAME", "default") mapper_input_reader_class = spec.mapper.input_reader_class() # StartJobHandler might have already saved the state, but it's OK # to override it because we're using the same mapreduce id. state = model.MapreduceState.create_new(spec.mapreduce_id) state.mapreduce_spec = spec state.active = True # TODO(user): Initialize UI fields correctly. state.char_url = "" state.sparkline_url = "" if app_id: state.app_id = app_id input_readers = mapper_input_reader_class.split_input(spec.mapper) if not input_readers: # We don't have any data. Finish map. logging.warning("Found no mapper input data to process.") state.active = False state.active_shards = 0 state.put(config=util.create_datastore_write_config(spec)) return # Update state and spec with actual shard count. spec.mapper.shard_count = len(input_readers) state.active_shards = len(input_readers) state.mapreduce_spec = spec output_writer_class = spec.mapper.output_writer_class() if output_writer_class: output_writer_class.init_job(state) output_writers = [] if output_writer_class: for shard_number in range(len(input_readers)): writer = output_writer_class.create(state, shard_number) assert isinstance(writer, output_writer_class) output_writers.append(writer) else: output_writers = [None for ir in input_readers] state.put(config=util.create_datastore_write_config(spec)) KickOffJobHandler._schedule_shards(spec, input_readers, output_writers, queue_name, self.base_path()) ControllerCallbackHandler.reschedule(state, self.base_path(), spec, queue_name=queue_name, serial_id=0)
def handle(self): """Handles kick off request.""" spec = model.MapreduceSpec.from_json_str( self._get_required_param("mapreduce_spec")) app_id = self.request.get("app", None) queue_name = os.environ.get("HTTP_X_APPENGINE_QUEUENAME", "default") mapper_input_reader_class = spec.mapper.input_reader_class() # StartJobHandler might have already saved the state, but it's OK # to override it because we're using the same mapreduce id. state = model.MapreduceState.create_new(spec.mapreduce_id) state.mapreduce_spec = spec state.active = True # TODO(user): Initialize UI fields correctly. state.char_url = "" state.sparkline_url = "" if app_id: state.app_id = app_id input_readers = mapper_input_reader_class.split_input(spec.mapper) if not input_readers: # We don't have any data. Finish map. logging.warning("Found no mapper input data to process.") state.active = False state.active_shards = 0 state.put(config=util.create_datastore_write_config(spec)) return # Update state and spec with actual shard count. spec.mapper.shard_count = len(input_readers) state.active_shards = len(input_readers) state.mapreduce_spec = spec output_writer_class = spec.mapper.output_writer_class() if output_writer_class: output_writer_class.init_job(state) output_writers = [] if output_writer_class: for shard_number in range(len(input_readers)): writer = output_writer_class.create(state, shard_number) assert isinstance(writer, output_writer_class) output_writers.append(writer) else: output_writers = [None for ir in input_readers] state.put(config=util.create_datastore_write_config(spec)) KickOffJobHandler._schedule_shards( spec, input_readers, output_writers, queue_name, self.base_path()) ControllerCallbackHandler.reschedule( state, self.base_path(), spec, queue_name=queue_name, serial_id=0)
def flush(self):
  """Flush all information recorded in context."""
  for pool in self._pools.values():
    pool.flush()
  if self.shard_state:
    self.shard_state.put(
        config=util.create_datastore_write_config(self.mapreduce_spec))
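# Illustrative sketch (not part of the original source): how mapper code
# typically feeds the pools that Context.flush() above drains. Yielded
# operations are buffered in the context and written out in batches when
# flush() runs. The word_count_map name and the entity attribute are
# assumptions for illustration; the operation module is the library's
# documented mapper API.
from mapreduce import operation as op


def word_count_map(entity):
  """Example mapper: marks the entity processed and bumps a counter."""
  entity.processed = True
  yield op.counters.Increment("entities_processed")  # pooled counter update
  yield op.db.Put(entity)  # buffered in the context's mutation pool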
def _schedule_shards(cls, spec, input_readers, queue_name, base_path):
  """Prepares shard states and schedules their execution.

  Args:
    spec: mapreduce specification as MapreduceSpec.
    input_readers: list of InputReaders describing shard splits.
    queue_name: The queue to run this job on.
    base_path: The base url path of mapreduce callbacks.
  """
  # Note: it's safe to re-attempt this handler because:
  # - shard state has deterministic and unique key.
  # - schedule_slice will fall back gracefully if a task already exists.
  shard_states = []
  for shard_number, input_reader in enumerate(input_readers):
    shard = model.ShardState.create_new(spec.mapreduce_id, shard_number)
    shard.shard_description = str(input_reader)
    shard_states.append(shard)

  # Retrieves already existing shards.
  existing_shard_states = db.get(shard.key() for shard in shard_states)
  existing_shard_keys = set(shard.key() for shard in existing_shard_states
                            if shard is not None)

  # Puts only non-existing shards.
  db.put((shard for shard in shard_states
          if shard.key() not in existing_shard_keys),
         config=util.create_datastore_write_config(spec))

  for shard_number, input_reader in enumerate(input_readers):
    shard_id = model.ShardState.shard_id_from_number(
        spec.mapreduce_id, shard_number)
    MapperWorkerCallbackHandler.schedule_slice(
        base_path, spec, shard_id, 0, input_reader, queue_name=queue_name)
def _schedule_shards(cls, spec, input_readers, queue_name, base_path,
                     mr_state):
  """Prepares shard states and schedules their execution.

  Args:
    spec: mapreduce specification as MapreduceSpec.
    input_readers: list of InputReaders describing shard splits.
    queue_name: The queue to run this job on.
    base_path: The base url path of mapreduce callbacks.
    mr_state: The MapreduceState of the current job.
  """
  # Note: it's safe to re-attempt this handler because:
  # - shard state has deterministic and unique key.
  # - _schedule_slice will fall back gracefully if a task already exists.
  shard_states = []
  writer_class = spec.mapper.output_writer_class()
  output_writers = [None] * len(input_readers)
  for shard_number, input_reader in enumerate(input_readers):
    shard_state = model.ShardState.create_new(spec.mapreduce_id,
                                              shard_number)
    shard_state.shard_description = str(input_reader)
    if writer_class:
      output_writers[shard_number] = writer_class.create(
          mr_state, shard_state)
    shard_states.append(shard_state)

  # Retrieves already existing shards.
  existing_shard_states = db.get(shard.key() for shard in shard_states)
  existing_shard_keys = set(shard.key() for shard in existing_shard_states
                            if shard is not None)

  # Puts only non-existing shards.
  db.put((shard for shard in shard_states
          if shard.key() not in existing_shard_keys),
         config=util.create_datastore_write_config(spec))

  # Give each shard some quota to start with.
  processing_rate = int(spec.mapper.params.get(
      "processing_rate") or model._DEFAULT_PROCESSING_RATE_PER_SEC)
  quota_refill = processing_rate / len(shard_states)
  quota_manager = quota.QuotaManager(memcache.Client())
  for shard_state in shard_states:
    quota_manager.put(shard_state.shard_id, quota_refill)

  # Schedule shard tasks.
  for shard_number, (input_reader, output_writer) in enumerate(
      zip(input_readers, output_writers)):
    shard_id = model.ShardState.shard_id_from_number(
        spec.mapreduce_id, shard_number)
    MapperWorkerCallbackHandler._schedule_slice(
        shard_states[shard_number],
        model.TransientShardState(
            base_path, spec, shard_id, 0, input_reader, input_reader,
            output_writer=output_writer),
        queue_name=queue_name)
def _schedule_shards(cls, spec, input_readers, output_writers, queue_name,
                     base_path):
  """Prepares shard states and schedules their execution.

  Args:
    spec: mapreduce specification as MapreduceSpec.
    input_readers: list of InputReaders describing shard splits.
    output_writers: list of output writers, one per input reader.
    queue_name: The queue to run this job on.
    base_path: The base url path of mapreduce callbacks.
  """
  assert len(input_readers) == len(output_writers)

  # Note: it's safe to re-attempt this handler because:
  # - shard state has deterministic and unique key.
  # - _schedule_slice will fall back gracefully if a task already exists.
  shard_states = []
  for shard_number, input_reader in enumerate(input_readers):
    shard_state = model.ShardState.create_new(spec.mapreduce_id,
                                              shard_number)
    shard_state.shard_description = str(input_reader)
    shard_states.append(shard_state)

  # Retrieves already existing shards.
  existing_shard_states = db.get(shard.key() for shard in shard_states)
  existing_shard_keys = set(shard.key() for shard in existing_shard_states
                            if shard is not None)

  # Puts only non-existing shards.
  db.put((shard for shard in shard_states
          if shard.key() not in existing_shard_keys),
         config=util.create_datastore_write_config(spec))

  # Give each shard some quota to start with.
  processing_rate = int(spec.mapper.params.get(
      "processing_rate") or model._DEFAULT_PROCESSING_RATE_PER_SEC)
  quota_refill = processing_rate / len(shard_states)
  quota_manager = quota.QuotaManager(memcache.Client())
  for shard_state in shard_states:
    quota_manager.put(shard_state.shard_id, quota_refill)

  # Schedule shard tasks.
  for shard_number, (input_reader, output_writer) in enumerate(
      zip(input_readers, output_writers)):
    shard_id = model.ShardState.shard_id_from_number(
        spec.mapreduce_id, shard_number)
    MapperWorkerCallbackHandler._schedule_slice(
        shard_states[shard_number],
        model.TransientShardState(
            base_path, spec, shard_id, 0, input_reader,
            output_writer=output_writer),
        queue_name=queue_name)
def handle(self):
  mapreduce_id = self.request.get("mapreduce_id")
  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  if mapreduce_state:
    config = util.create_datastore_write_config(
        mapreduce_state.mapreduce_spec)
    db.delete(model.MapreduceControl.get_key_by_job_id(mapreduce_id),
              config=config)

    shard_states = model.ShardState.find_by_mapreduce_state(mapreduce_state)
    for shard_state in shard_states:
      db.delete(util._HugeTaskPayload.all().ancestor(shard_state),
                config=config)
    db.delete(shard_states, config=config)
    db.delete(util._HugeTaskPayload.all().ancestor(mapreduce_state),
              config=config)
def _finalize_job(mapreduce_spec, mapreduce_state, base_path):
  """Finalize job execution.

  Finalizes the output writer, invokes the done callback, and schedules
  finalize job execution.

  Args:
    mapreduce_spec: an instance of MapreduceSpec.
    mapreduce_state: an instance of MapreduceState.
    base_path: handler base path.
  """
  config = util.create_datastore_write_config(mapreduce_spec)

  # Only finalize the output writers if the job is successful.
  if (mapreduce_spec.mapper.output_writer_class() and
      mapreduce_state.result_status == model.MapreduceState.RESULT_SUCCESS):
    mapreduce_spec.mapper.output_writer_class().finalize_job(
        mapreduce_state)

  # Enqueue done_callback if needed.
  def put_state(state):
    state.put(config=config)
    done_callback = mapreduce_spec.params.get(
        model.MapreduceSpec.PARAM_DONE_CALLBACK)
    if done_callback:
      done_task = taskqueue.Task(
          url=done_callback,
          headers={"Mapreduce-Id": mapreduce_spec.mapreduce_id},
          method=mapreduce_spec.params.get("done_callback_method", "POST"))
      queue_name = mapreduce_spec.params.get(
          model.MapreduceSpec.PARAM_DONE_CALLBACK_QUEUE, "default")
      if not _run_task_hook(mapreduce_spec.get_hooks(),
                            "enqueue_done_task",
                            done_task,
                            queue_name):
        done_task.add(queue_name, transactional=True)
    FinalizeJobHandler.schedule(base_path, mapreduce_spec)

  db.run_in_transaction(put_state, mapreduce_state)
def testForceWrites(self):
  self.spec.params["force_writes"] = "True"
  config = util.create_datastore_write_config(self.spec)
  self.assertTrue(config)
  self.assertTrue(config.force_writes)
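# For reference, a minimal sketch of the helper exercised by this test and
# by the handlers above. This mirrors how callers use it but is an
# assumption, not the library's verbatim code: the optional "force_writes"
# job parameter is parsed and folded into a datastore RPC configuration.
from google.appengine.datastore import datastore_rpc


def create_datastore_write_config(mapreduce_spec):
  """Returns a datastore_rpc.Configuration for this job's writes (sketch)."""
  force_writes = str(
      mapreduce_spec.params.get("force_writes", "false")).lower() == "true"
  if force_writes:
    return datastore_rpc.Configuration(force_writes=force_writes)
  # The dev server doesn't support force_writes; use defaults otherwise.
  return datastore_rpc.Configuration()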
def handle(self): """Handle request.""" spec = model.MapreduceSpec.from_json_str(self.request.get("mapreduce_spec")) self._start_time = self._time() shard_id = self.shard_id() # TODO(user): Make this prettier logging.debug("post: shard=%s slice=%s headers=%s", shard_id, self.slice_id(), self.request.headers) shard_state, control = db.get( [ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ] ) if not shard_state: # We're letting this task to die. It's up to controller code to # reinitialize and restart the task. logging.error("State not found for shard ID %r; shutting down", shard_id) return if control and control.command == model.MapreduceControl.ABORT: logging.info( "Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id ) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = self.input_reader(spec.mapper) if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer(quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) context.Context._set(ctx) try: # consume quota ahead, because we do not want to run a datastore # query if there's not enough quota for the shard. if not quota_consumer or quota_consumer.check(): scan_aborted = False entity = None # We shouldn't fetch an entity from the reader if there's not enough # quota to process it. Perform all quota checks proactively. if not quota_consumer or quota_consumer.consume(): for entity in input_reader: if isinstance(entity, db.Model): shard_state.last_work_item = repr(entity.key()) else: shard_state.last_work_item = repr(entity)[:100] scan_aborted = not self.process_entity(entity, ctx) # Check if we've got enough quota for the next entity. if quota_consumer and not scan_aborted and not quota_consumer.consume(): scan_aborted = True if scan_aborted: break else: scan_aborted = True if not scan_aborted: logging.info( "Processing done for shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id ) # We consumed extra quota item at the end of for loop. # Just be nice here and give it back :) if quota_consumer: quota_consumer.put(1) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_SUCCESS # TODO(user): Mike said we don't want this happen in case of # exception while scanning. Figure out when it's appropriate to skip. ctx.flush() finally: context.Context._set(None) if quota_consumer: quota_consumer.dispose() # Rescheduling work should always be the last statement. It shouldn't happen # if there were any exceptions in code before it. if shard_state.active: self.reschedule(spec, input_reader)
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_id = tstate.shard_id shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: # We're letting this task to die. It's up to controller code to # reinitialize and restart the task. logging.error("State not found for shard ID %r; shutting down", shard_id) return if not shard_state.active: logging.error( "Shard is not active. Looks like spurious task execution.") return ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) if tstate.output_writer: tstate.output_writer.finalize(ctx, shard_state.shard_number) # We recieved a command to abort. We don't care if we override # some data. shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = tstate.input_reader if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None context.Context._set(ctx) try: # consume quota ahead, because we do not want to run a datastore # query if there's not enough quota for the shard. if not quota_consumer or quota_consumer.check(): scan_aborted = False entity = None # We shouldn't fetch an entity from the reader if there's not enough # quota to process it. Perform all quota checks proactively. if not quota_consumer or quota_consumer.consume(): for entity in input_reader: if isinstance(entity, db.Model): shard_state.last_work_item = repr(entity.key()) else: shard_state.last_work_item = repr(entity)[:100] scan_aborted = not self.process_data( entity, input_reader, ctx, tstate) # Check if we've got enough quota for the next entity. if (quota_consumer and not scan_aborted and not quota_consumer.consume()): scan_aborted = True if scan_aborted: break else: scan_aborted = True if not scan_aborted: logging.info("Processing done for shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) # We consumed extra quota item at the end of for loop. # Just be nice here and give it back :) if quota_consumer: quota_consumer.put(1) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_SUCCESS operation.counters.Increment( context.COUNTER_MAPPER_WALLTIME_MS, int((time.time() - self._start_time) * 1000))(ctx) # TODO(user): Mike said we don't want this happen in case of # exception while scanning. Figure out when it's appropriate to skip. ctx.flush() if not shard_state.active: # shard is going to stop. Finalize output writer if any. if tstate.output_writer: tstate.output_writer.finalize(ctx, shard_state.shard_number) config = util.create_datastore_write_config(spec) # We don't want shard state to override active state, since that # may stuck job execution (see issue 116). Do a transactional # verification for status. # TODO(user): this might still result in some data inconsistency # which can be avoided. It doesn't seem to be worth it now, because # various crashes might result in all sort of data consistencies # anyway. 
@db.transactional(retries=5) def tx(): fresh_shard_state = db.get( model.ShardState.get_key_by_shard_id(shard_id)) if (not fresh_shard_state.active or "worker_active_state_collision" in _TEST_INJECTED_FAULTS): shard_state.active = False logging.error( "Spurious task execution. Aborting the shard.") return fresh_shard_state.copy_from(shard_state) fresh_shard_state.put(config=config) tx() finally: context.Context._set(None) if quota_consumer: quota_consumer.dispose() # Rescheduling work should always be the last statement. It shouldn't happen # if there were any exceptions in code before it. if shard_state.active: self.reschedule(shard_state, tstate) gc.collect()
def testDefaultConfig(self):
  config = util.create_datastore_write_config(self.spec)
  self.assertTrue(config)
  self.assertFalse(config.force_writes)
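# Hypothetical fixture for the two config tests above; the test class name
# and the exact MapperSpec arguments are assumptions chosen only to produce
# a valid self.spec with an empty params dict.
def setUp(self):
  super(DatastoreWriteConfigTest, self).setUp()  # assumed test class name
  mapper_spec = model.MapperSpec(
      "handler_name", "input_reader_name", {}, 8).to_json()
  self.spec = model.MapreduceSpec(
      "test-job", "mapreduce-id", mapper_spec, params={})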
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_id = tstate.shard_id shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: # We're letting this task to die. It's up to controller code to # reinitialize and restart the task. logging.error("State not found for shard ID %r; shutting down", shard_id) return if not shard_state.active: logging.error("Shard is not active. Looks like spurious task execution.") return ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) # NOTE: When aborting, specifically do not finalize the output writer # because it might be in a bad state. shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = tstate.input_reader if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None # Tell NDB to never cache anything in memcache or in-process. This ensures # that entities fetched from Datastore input_readers via NDB will not bloat # up the request memory size and Datastore Puts will avoid doing calls # to memcache. Without this you get soft memory limit exits, which hurts # overall throughput. if ndb is not None: ndb_ctx = ndb.get_context() ndb_ctx.set_cache_policy(lambda key: False) ndb_ctx.set_memcache_policy(lambda key: False) context.Context._set(ctx) try: self.process_inputs( input_reader, shard_state, tstate, quota_consumer, ctx) if not shard_state.active: # shard is going to stop. Finalize output writer only when shard is # successful because writer might be stuck in some bad state otherwise. if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and tstate.output_writer): tstate.output_writer.finalize(ctx, shard_state.shard_number) config = util.create_datastore_write_config(spec) # We don't want shard state to override active state, since that # may stuck job execution (see issue 116). Do a transactional # verification for status. # TODO(user): this might still result in some data inconsistency # which can be avoided. It doesn't seem to be worth it now, because # various crashes might result in all sort of data consistencies # anyway. @db.transactional(retries=5) def tx(): fresh_shard_state = db.get( model.ShardState.get_key_by_shard_id(shard_id)) if not fresh_shard_state: raise db.Rollback() if (not fresh_shard_state.active or "worker_active_state_collision" in _TEST_INJECTED_FAULTS): shard_state.active = False logging.error("Spurious task execution. Aborting the shard.") return fresh_shard_state.copy_from(shard_state) fresh_shard_state.put(config=config) tx() finally: context.Context._set(None) if quota_consumer: quota_consumer.dispose() # Rescheduling work should always be the last statement. It shouldn't happen # if there were any exceptions in code before it. if shard_state.active: self.reschedule(shard_state, tstate) gc.collect()
def handle(self): """Handle request.""" spec = model.MapreduceSpec.from_json_str( self.request.get("mapreduce_spec")) # TODO(user): Make this logging prettier. logging.debug("post: id=%s headers=%s spec=%s", spec.mapreduce_id, self.request.headers, self.request.get("mapreduce_spec")) state, control = db.get([ model.MapreduceState.get_key_by_job_id(spec.mapreduce_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not state: logging.error("State not found for mapreduce_id '%s'; skipping", spec.mapreduce_id) return shard_states = model.ShardState.find_by_mapreduce_id(spec.mapreduce_id) if state.active and len(shard_states) != spec.mapper.shard_count: # Some shards were lost logging.error("Incorrect number of shard states: %d vs %d; " "aborting job '%s'", len(shard_states), spec.mapper.shard_count, spec.mapreduce_id) state.active = False state.result_status = model.MapreduceState.RESULT_FAILED model.MapreduceControl.abort(spec.mapreduce_id) active_shards = [s for s in shard_states if s.active] failed_shards = [s for s in shard_states if s.result_status == model.ShardState.RESULT_FAILED] aborted_shards = [s for s in shard_states if s.result_status == model.ShardState.RESULT_ABORTED] if state.active: state.active = bool(active_shards) state.active_shards = len(active_shards) state.failed_shards = len(failed_shards) state.aborted_shards = len(aborted_shards) if (not state.active and control and control.command == model.MapreduceControl.ABORT): # User-initiated abort *after* all shards have completed. logging.info("Abort signal received for job '%s'", spec.mapreduce_id) state.result_status = model.MapreduceState.RESULT_ABORTED if not state.active: state.active_shards = 0 if not state.result_status: # Set final result status derived from shard states. if [s for s in shard_states if s.result_status != model.ShardState.RESULT_SUCCESS]: state.result_status = model.MapreduceState.RESULT_FAILED else: state.result_status = model.MapreduceState.RESULT_SUCCESS logging.info("Final result for job '%s' is '%s'", spec.mapreduce_id, state.result_status) # We don't need a transaction here, since we change only statistics data, # and we don't care if it gets overwritten/slightly inconsistent. self.aggregate_state(state, shard_states) poll_time = state.last_poll_time state.last_poll_time = datetime.datetime.utcfromtimestamp(self._time()) config = util.create_datastore_write_config(spec) if not state.active: # This is the last execution. # Enqueue done_callback if needed. if spec.mapper.output_writer_class(): spec.mapper.output_writer_class().finalize_job(state) def put_state(state): state.put(config=config) done_callback = spec.params.get( model.MapreduceSpec.PARAM_DONE_CALLBACK) if done_callback: done_task = taskqueue.Task( url=done_callback, headers={"Mapreduce-Id": spec.mapreduce_id}, method=spec.params.get("done_callback_method", "POST")) queue_name = spec.params.get( model.MapreduceSpec.PARAM_DONE_CALLBACK_QUEUE, "default") if not _run_task_hook(spec.get_hooks(), "enqueue_done_task", done_task, queue_name): done_task.add(queue_name, transactional=True) FinalizeJobHandler.schedule(self.base_path(), spec) db.run_in_transaction(put_state, state) return else: state.put(config=config) processing_rate = int(spec.mapper.params.get( "processing_rate") or model._DEFAULT_PROCESSING_RATE_PER_SEC) self.refill_quotas(poll_time, processing_rate, active_shards) ControllerCallbackHandler.reschedule( state, self.base_path(), spec, self.serial_id() + 1)
def _start_map(cls, name, mapper_spec, mapreduce_params, base_path="/mapreduce", queue_name="default", eta=None, countdown=None, hooks_class_name=None, _app=None, transactional=False): # Check that handler can be instantiated. mapper_spec.get_handler() # Check that reader can be instantiated and is configured correctly mapper_input_reader_class = mapper_spec.input_reader_class() mapper_input_reader_class.validate(mapper_spec) mapreduce_id = model.MapreduceState.new_mapreduce_id() mapreduce_spec = model.MapreduceSpec(name, mapreduce_id, mapper_spec.to_json(), mapreduce_params, hooks_class_name) kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()} if _app: kickoff_params["app"] = _app kickoff_worker_task = taskqueue.Task(url=base_path + "/kickoffjob_callback", params=kickoff_params, eta=eta, countdown=countdown) hooks = mapreduce_spec.get_hooks() config = util.create_datastore_write_config(mapreduce_spec) def start_mapreduce(): if not transactional: # Save state in datastore so that UI can see it. # We can't save state in foreign transaction, but conventional UI # doesn't ask for transactional starts anyway. state = model.MapreduceState.create_new( mapreduce_spec.mapreduce_id) state.mapreduce_spec = mapreduce_spec state.active = True state.active_shards = mapper_spec.shard_count if _app: state.app_id = _app state.put(config=config) if hooks is not None: try: hooks.enqueue_kickoff_task(kickoff_worker_task, queue_name) except NotImplementedError: # Use the default task addition implementation. pass else: return kickoff_worker_task.add(queue_name, transactional=True) if transactional: start_mapreduce() else: db.run_in_transaction(start_mapreduce) return mapreduce_id
def _start_map(cls, name, mapper_spec, mapreduce_params,
               base_path=None, queue_name=None, eta=None, countdown=None,
               hooks_class_name=None, _app=None, transactional=False,
               parent_entity=None):
  queue_name = queue_name or os.environ.get("HTTP_X_APPENGINE_QUEUENAME",
                                            "default")
  if queue_name[0] == "_":
    # We are currently in some special queue. E.g. __cron.
    queue_name = "default"

  if not transactional and parent_entity:
    raise Exception("Parent shouldn't be specified "
                    "for non-transactional starts.")

  # Check that reader can be instantiated and is configured correctly
  mapper_input_reader_class = mapper_spec.input_reader_class()
  mapper_input_reader_class.validate(mapper_spec)

  mapper_output_writer_class = mapper_spec.output_writer_class()
  if mapper_output_writer_class:
    mapper_output_writer_class.validate(mapper_spec)

  mapreduce_id = model.MapreduceState.new_mapreduce_id()
  mapreduce_spec = model.MapreduceSpec(name,
                                       mapreduce_id,
                                       mapper_spec.to_json(),
                                       mapreduce_params,
                                       hooks_class_name)

  # Check that handler can be instantiated.
  ctx = context.Context(mapreduce_spec, None)
  context.Context._set(ctx)
  try:
    mapper_spec.get_handler()
  finally:
    context.Context._set(None)

  kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()}
  if _app:
    kickoff_params["app"] = _app
  kickoff_worker_task = util.HugeTask(
      url=base_path + "/kickoffjob_callback",
      params=kickoff_params,
      eta=eta,
      countdown=countdown)

  hooks = mapreduce_spec.get_hooks()
  config = util.create_datastore_write_config(mapreduce_spec)

  def start_mapreduce():
    parent = parent_entity
    if not transactional:
      # Save state in datastore so that UI can see it.
      # We can't save state in foreign transaction, but conventional UI
      # doesn't ask for transactional starts anyway.
      state = model.MapreduceState.create_new(mapreduce_spec.mapreduce_id)
      state.mapreduce_spec = mapreduce_spec
      state.active = True
      state.active_shards = mapper_spec.shard_count
      if _app:
        state.app_id = _app
      state.put(config=config)
      parent = state

    if hooks is not None:
      try:
        hooks.enqueue_kickoff_task(kickoff_worker_task, queue_name)
      except NotImplementedError:
        # Use the default task addition implementation.
        pass
      else:
        return
    kickoff_worker_task.add(queue_name, transactional=True, parent=parent)

  if transactional:
    start_mapreduce()
  else:
    db.run_in_transaction(start_mapreduce)

  return mapreduce_id
def handle(self): """Handle request.""" spec = model.MapreduceSpec.from_json_str( self.request.get("mapreduce_spec")) # TODO(user): Make this logging prettier. logging.debug("post: id=%s headers=%s spec=%s", spec.mapreduce_id, self.request.headers, self.request.get("mapreduce_spec")) state, control = db.get([ model.MapreduceState.get_key_by_job_id(spec.mapreduce_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not state: logging.error("State not found for mapreduce_id '%s'; skipping", spec.mapreduce_id) return shard_states = model.ShardState.find_by_mapreduce_state(state) if state.active and len(shard_states) != spec.mapper.shard_count: # Some shards were lost logging.error( "Incorrect number of shard states: %d vs %d; " "aborting job '%s'", len(shard_states), spec.mapper.shard_count, spec.mapreduce_id) state.active = False state.result_status = model.MapreduceState.RESULT_FAILED model.MapreduceControl.abort(spec.mapreduce_id) active_shards = [s for s in shard_states if s.active] failed_shards = [ s for s in shard_states if s.result_status == model.ShardState.RESULT_FAILED ] aborted_shards = [ s for s in shard_states if s.result_status == model.ShardState.RESULT_ABORTED ] if state.active: state.active = bool(active_shards) state.active_shards = len(active_shards) state.failed_shards = len(failed_shards) state.aborted_shards = len(aborted_shards) if (not state.active and control and control.command == model.MapreduceControl.ABORT): # User-initiated abort *after* all shards have completed. logging.info("Abort signal received for job '%s'", spec.mapreduce_id) state.result_status = model.MapreduceState.RESULT_ABORTED if not state.active: state.active_shards = 0 if not state.result_status: # Set final result status derived from shard states. if [ s for s in shard_states if s.result_status != model.ShardState.RESULT_SUCCESS ]: state.result_status = model.MapreduceState.RESULT_FAILED else: state.result_status = model.MapreduceState.RESULT_SUCCESS logging.info("Final result for job '%s' is '%s'", spec.mapreduce_id, state.result_status) # We don't need a transaction here, since we change only statistics data, # and we don't care if it gets overwritten/slightly inconsistent. self.aggregate_state(state, shard_states) poll_time = state.last_poll_time state.last_poll_time = datetime.datetime.utcfromtimestamp(self._time()) config = util.create_datastore_write_config(spec) if not state.active: # This is the last execution. # Enqueue done_callback if needed. if spec.mapper.output_writer_class(): spec.mapper.output_writer_class().finalize_job(state) def put_state(state): state.put(config=config) done_callback = spec.params.get( model.MapreduceSpec.PARAM_DONE_CALLBACK) if done_callback: done_task = taskqueue.Task( url=done_callback, headers={"Mapreduce-Id": spec.mapreduce_id}, method=spec.params.get("done_callback_method", "POST")) queue_name = spec.params.get( model.MapreduceSpec.PARAM_DONE_CALLBACK_QUEUE, "default") if not _run_task_hook(spec.get_hooks(), "enqueue_done_task", done_task, queue_name): done_task.add(queue_name, transactional=True) FinalizeJobHandler.schedule(self.base_path(), spec) db.run_in_transaction(put_state, state) return else: state.put(config=config) processing_rate = int( spec.mapper.params.get("processing_rate") or model._DEFAULT_PROCESSING_RATE_PER_SEC) self.refill_quotas(poll_time, processing_rate, active_shards) ControllerCallbackHandler.reschedule(state, self.base_path(), spec, self.serial_id() + 1)
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_id = tstate.shard_id shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: # We're letting this task to die. It's up to controller code to # reinitialize and restart the task. logging.error("State not found for shard ID %r; shutting down", shard_id) return ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) if tstate.output_writer: tstate.output_writer.finalize(ctx, shard_state.shard_number) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = tstate.input_reader if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None context.Context._set(ctx) try: # consume quota ahead, because we do not want to run a datastore # query if there's not enough quota for the shard. if not quota_consumer or quota_consumer.check(): scan_aborted = False entity = None # We shouldn't fetch an entity from the reader if there's not enough # quota to process it. Perform all quota checks proactively. if not quota_consumer or quota_consumer.consume(): for entity in input_reader: if isinstance(entity, db.Model): shard_state.last_work_item = repr(entity.key()) else: shard_state.last_work_item = repr(entity)[:100] scan_aborted = not self.process_data( entity, input_reader, ctx, tstate) # Check if we've got enough quota for the next entity. if (quota_consumer and not scan_aborted and not quota_consumer.consume()): scan_aborted = True if scan_aborted: break else: scan_aborted = True if not scan_aborted: logging.info("Processing done for shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) # We consumed extra quota item at the end of for loop. # Just be nice here and give it back :) if quota_consumer: quota_consumer.put(1) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_SUCCESS operation.counters.Increment( "mapper-walltime-msec", int((time.time() - self._start_time) * 1000))(ctx) # TODO(user): Mike said we don't want this happen in case of # exception while scanning. Figure out when it's appropriate to skip. ctx.flush() if not shard_state.active: # shard is going to stop. Finalize output writer if any. if tstate.output_writer: tstate.output_writer.finalize(ctx, shard_state.shard_number) shard_state.put(config=util.create_datastore_write_config(spec)) finally: context.Context._set(None) if quota_consumer: quota_consumer.dispose() # Rescheduling work should always be the last statement. It shouldn't happen # if there were any exceptions in code before it. if shard_state.active: self.reschedule(shard_state, tstate) gc.collect()
def handle(self): """Handle request.""" spec = model.MapreduceSpec.from_json_str( self.request.get("mapreduce_spec")) # TODO(user): Make this logging prettier. logging.debug("post: id=%s headers=%s spec=%s", spec.mapreduce_id, self.request.headers, self.request.get("mapreduce_spec")) state, control = db.get([ model.MapreduceState.get_key_by_job_id(spec.mapreduce_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not state: logging.error("State not found for mapreduce_id '%s'; skipping", spec.mapreduce_id) return shard_states = model.ShardState.find_by_mapreduce_state(state) if state.active and len(shard_states) != spec.mapper.shard_count: # Some shards were lost logging.error("Incorrect number of shard states: %d vs %d; " "aborting job '%s'", len(shard_states), spec.mapper.shard_count, spec.mapreduce_id) state.active = False state.result_status = model.MapreduceState.RESULT_FAILED model.MapreduceControl.abort(spec.mapreduce_id) active_shards = [s for s in shard_states if s.active] failed_shards = [s for s in shard_states if s.result_status == model.ShardState.RESULT_FAILED] aborted_shards = [s for s in shard_states if s.result_status == model.ShardState.RESULT_ABORTED] if state.active: state.active = bool(active_shards) state.active_shards = len(active_shards) state.failed_shards = len(failed_shards) state.aborted_shards = len(aborted_shards) if (not state.active and control and control.command == model.MapreduceControl.ABORT): # User-initiated abort *after* all shards have completed. logging.info("Abort signal received for job '%s'", spec.mapreduce_id) state.result_status = model.MapreduceState.RESULT_ABORTED if not state.active: state.active_shards = 0 if not state.result_status: # Set final result status derived from shard states. if [s for s in shard_states if s.result_status != model.ShardState.RESULT_SUCCESS]: state.result_status = model.MapreduceState.RESULT_FAILED else: state.result_status = model.MapreduceState.RESULT_SUCCESS logging.info("Final result for job '%s' is '%s'", spec.mapreduce_id, state.result_status) # We don't need a transaction here, since we change only statistics data, # and we don't care if it gets overwritten/slightly inconsistent. self.aggregate_state(state, shard_states) poll_time = state.last_poll_time state.last_poll_time = datetime.datetime.utcfromtimestamp(self._time()) if not state.active: ControllerCallbackHandler._finalize_job( spec, state, self.base_path()) return else: config = util.create_datastore_write_config(spec) state.put(config=config) processing_rate = int(spec.mapper.params.get( "processing_rate") or model._DEFAULT_PROCESSING_RATE_PER_SEC) self.refill_quotas(poll_time, processing_rate, active_shards) ControllerCallbackHandler.reschedule( state, self.base_path(), spec, self.serial_id() + 1)
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_id = tstate.shard_id shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: # We're letting this task to die. It's up to controller code to # reinitialize and restart the task. logging.error("State not found for shard ID %r; shutting down", shard_id) return if not shard_state.active: logging.error("Shard is not active. Looks like spurious task execution.") return ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) # NOTE: When aborting, specifically do not finalize the output writer # because it might be in a bad state. shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = tstate.input_reader if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None # Tell NDB to never cache anything in memcache or in-process. This ensures # that entities fetched from Datastore input_readers via NDB will not bloat # up the request memory size and Datastore Puts will avoid doing calls # to memcache. Without this you get soft memory limit exits, which hurts # overall throughput. if ndb is not None: ndb_ctx = ndb.get_context() ndb_ctx.set_cache_policy(lambda key: False) ndb_ctx.set_memcache_policy(lambda key: False) context.Context._set(ctx) try: # consume quota ahead, because we do not want to run a datastore # query if there's not enough quota for the shard. if not quota_consumer or quota_consumer.check(): scan_aborted = False entity = None try: # We shouldn't fetch an entity from the reader if there's not enough # quota to process it. Perform all quota checks proactively. if not quota_consumer or quota_consumer.consume(verbose=True): for entity in input_reader: if isinstance(entity, db.Model): shard_state.last_work_item = repr(entity.key()) else: shard_state.last_work_item = repr(entity)[:100] scan_aborted = not self.process_data( entity, input_reader, ctx, tstate) # Check if we've got enough quota for the next entity. if (quota_consumer and not scan_aborted and not quota_consumer.consume(verbose=True)): scan_aborted = True if scan_aborted: break else: scan_aborted = True if not scan_aborted: logging.info("Processing done for shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) # We consumed extra quota item at the end of for loop. # Just be nice here and give it back :) if quota_consumer: quota_consumer.put(1) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_SUCCESS operation.counters.Increment( context.COUNTER_MAPPER_WALLTIME_MS, int((time.time() - self._start_time)*1000))(ctx) # TODO(user): Mike said we don't want this happen in case of # exception while scanning. Figure out when it's appropriate to skip. 
ctx.flush() except errors.RetrySliceError, e: logging.error("Slice error: %s", e) retry_count = int( os.environ.get("HTTP_X_APPENGINE_TASKRETRYCOUNT") or 0) if retry_count <= _RETRY_SLICE_ERROR_MAX_RETRIES: raise logging.error("Too many retries: %d, failing the job", retry_count) scan_aborted = True shard_state.active = False shard_state.result_status = model.ShardState.RESULT_FAILED except errors.FailJobError, e: logging.error("Job failed: %s", e) scan_aborted = True shard_state.active = False shard_state.result_status = model.ShardState.RESULT_FAILED
logging.error("Job failed: %s", e) scan_aborted = True shard_state.active = False shard_state.result_status = model.ShardState.RESULT_FAILED if not shard_state.active: # shard is going to stop. Don't finalize output writer unless the job is # going to be successful, because writer might be stuck in some bad state # otherwise. if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and tstate.output_writer): tstate.output_writer.finalize(ctx, shard_state.shard_number) config = util.create_datastore_write_config(spec) # We don't want shard state to override active state, since that # may stuck job execution (see issue 116). Do a transactional # verification for status. # TODO(user): this might still result in some data inconsistency # which can be avoided. It doesn't seem to be worth it now, because # various crashes might result in all sort of data consistencies # anyway. @db.transactional(retries=5) def tx(): fresh_shard_state = db.get( model.ShardState.get_key_by_shard_id(shard_id)) if not fresh_shard_state: raise db.Rollback() if (not fresh_shard_state.active or "worker_active_state_collision"
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_id = tstate.shard_id shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: # We're letting this task to die. It's up to controller code to # reinitialize and restart the task. logging.error("State not found for shard ID %r; shutting down", shard_id) return if not shard_state.active: logging.error( "Shard is not active. Looks like spurious task execution.") return ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) # NOTE: When aborting, specifically do not finalize the output writer # because it might be in a bad state. shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = tstate.input_reader if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None # Tell NDB to never cache anything in memcache or in-process. This ensures # that entities fetched from Datastore input_readers via NDB will not bloat # up the request memory size and Datastore Puts will avoid doing calls # to memcache. Without this you get soft memory limit exits, which hurts # overall throughput. if ndb is not None: ndb_ctx = ndb.get_context() ndb_ctx.set_cache_policy(lambda key: False) ndb_ctx.set_memcache_policy(lambda key: False) context.Context._set(ctx) try: # consume quota ahead, because we do not want to run a datastore # query if there's not enough quota for the shard. if not quota_consumer or quota_consumer.check(): scan_aborted = False entity = None try: # We shouldn't fetch an entity from the reader if there's not enough # quota to process it. Perform all quota checks proactively. if not quota_consumer or quota_consumer.consume(): for entity in input_reader: if isinstance(entity, db.Model): shard_state.last_work_item = repr(entity.key()) else: shard_state.last_work_item = repr(entity)[:100] scan_aborted = not self.process_data( entity, input_reader, ctx, tstate) # Check if we've got enough quota for the next entity. if (quota_consumer and not scan_aborted and not quota_consumer.consume()): scan_aborted = True if scan_aborted: break else: scan_aborted = True if not scan_aborted: logging.info( "Processing done for shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) # We consumed extra quota item at the end of for loop. # Just be nice here and give it back :) if quota_consumer: quota_consumer.put(1) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_SUCCESS operation.counters.Increment( context.COUNTER_MAPPER_WALLTIME_MS, int((time.time() - self._start_time) * 1000))(ctx) # TODO(user): Mike said we don't want this happen in case of # exception while scanning. Figure out when it's appropriate to skip. 
ctx.flush() except errors.RetrySliceError, e: logging.error("Slice error: %s", e) retry_count = int( os.environ.get("HTTP_X_APPENGINE_TASKRETRYCOUNT") or 0) if retry_count <= _RETRY_SLICE_ERROR_MAX_RETRIES: raise logging.error("Too many retries: %d, failing the job", retry_count) scan_aborted = True shard_state.active = False shard_state.result_status = model.ShardState.RESULT_FAILED except errors.FailJobError, e: logging.error("Job failed: %s", e) scan_aborted = True shard_state.active = False shard_state.result_status = model.ShardState.RESULT_FAILED
def handle(self): """Handle request.""" tstate = model.TransientShardState.from_request(self.request) spec = tstate.mapreduce_spec self._start_time = self._time() shard_id = tstate.shard_id shard_state, control = db.get([ model.ShardState.get_key_by_shard_id(shard_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not shard_state: # We're letting this task to die. It's up to controller code to # reinitialize and restart the task. logging.error("State not found for shard ID %r; shutting down", shard_id) return if not shard_state.active: logging.error("Shard is not active. Looks like spurious task execution.") return ctx = context.Context(spec, shard_state, task_retry_count=self.task_retry_count()) if control and control.command == model.MapreduceControl.ABORT: logging.info("Abort command received by shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) if tstate.output_writer: tstate.output_writer.finalize(ctx, shard_state.shard_number) # We recieved a command to abort. We don't care if we override # some data. shard_state.active = False shard_state.result_status = model.ShardState.RESULT_ABORTED shard_state.put(config=util.create_datastore_write_config(spec)) model.MapreduceControl.abort(spec.mapreduce_id) return input_reader = tstate.input_reader if spec.mapper.params.get("enable_quota", True): quota_consumer = quota.QuotaConsumer( quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE) else: quota_consumer = None context.Context._set(ctx) try: # consume quota ahead, because we do not want to run a datastore # query if there's not enough quota for the shard. if not quota_consumer or quota_consumer.check(): scan_aborted = False entity = None # We shouldn't fetch an entity from the reader if there's not enough # quota to process it. Perform all quota checks proactively. if not quota_consumer or quota_consumer.consume(): for entity in input_reader: if isinstance(entity, db.Model): shard_state.last_work_item = repr(entity.key()) else: shard_state.last_work_item = repr(entity)[:100] scan_aborted = not self.process_data( entity, input_reader, ctx, tstate) # Check if we've got enough quota for the next entity. if (quota_consumer and not scan_aborted and not quota_consumer.consume()): scan_aborted = True if scan_aborted: break else: scan_aborted = True if not scan_aborted: logging.info("Processing done for shard %d of job '%s'", shard_state.shard_number, shard_state.mapreduce_id) # We consumed extra quota item at the end of for loop. # Just be nice here and give it back :) if quota_consumer: quota_consumer.put(1) shard_state.active = False shard_state.result_status = model.ShardState.RESULT_SUCCESS operation.counters.Increment( context.COUNTER_MAPPER_WALLTIME_MS, int((time.time() - self._start_time)*1000))(ctx) # TODO(user): Mike said we don't want this happen in case of # exception while scanning. Figure out when it's appropriate to skip. ctx.flush() if not shard_state.active: # shard is going to stop. Finalize output writer if any. if tstate.output_writer: tstate.output_writer.finalize(ctx, shard_state.shard_number) config = util.create_datastore_write_config(spec) # We don't want shard state to override active state, since that # may stuck job execution (see issue 116). Do a transactional # verification for status. # TODO(user): this might still result in some data inconsistency # which can be avoided. It doesn't seem to be worth it now, because # various crashes might result in all sort of data consistencies # anyway. 
@db.transactional(retries=5) def tx(): fresh_shard_state = db.get( model.ShardState.get_key_by_shard_id(shard_id)) if (not fresh_shard_state.active or "worker_active_state_collision" in _TEST_INJECTED_FAULTS): shard_state.active = False logging.error("Spurious task execution. Aborting the shard.") return fresh_shard_state.copy_from(shard_state) fresh_shard_state.put(config=config) tx() finally: context.Context._set(None) if quota_consumer: quota_consumer.dispose() # Rescheduling work should always be the last statement. It shouldn't happen # if there were any exceptions in code before it. if shard_state.active: self.reschedule(shard_state, tstate) gc.collect()
def handle(self): """Handle request.""" spec = model.MapreduceSpec.from_json_str( self.request.get("mapreduce_spec")) state, control = db.get([ model.MapreduceState.get_key_by_job_id(spec.mapreduce_id), model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id), ]) if not state: logging.error("State not found for mapreduce_id '%s'; skipping", spec.mapreduce_id) return shard_states = model.ShardState.find_by_mapreduce_state(state) if state.active and len(shard_states) != spec.mapper.shard_count: # Some shards were lost logging.error( "Incorrect number of shard states: %d vs %d; " "aborting job '%s'", len(shard_states), spec.mapper.shard_count, spec.mapreduce_id) state.active = False state.result_status = model.MapreduceState.RESULT_FAILED model.MapreduceControl.abort(spec.mapreduce_id) active_shards = [s for s in shard_states if s.active] failed_shards = [ s for s in shard_states if s.result_status == model.ShardState.RESULT_FAILED ] aborted_shards = [ s for s in shard_states if s.result_status == model.ShardState.RESULT_ABORTED ] if state.active: state.active = bool(active_shards) state.active_shards = len(active_shards) state.failed_shards = len(failed_shards) state.aborted_shards = len(aborted_shards) if not control and failed_shards: model.MapreduceControl.abort(spec.mapreduce_id) if (not state.active and control and control.command == model.MapreduceControl.ABORT): # User-initiated abort *after* all shards have completed. logging.info("Abort signal received for job '%s'", spec.mapreduce_id) state.result_status = model.MapreduceState.RESULT_ABORTED if not state.active: state.active_shards = 0 if not state.result_status: # Set final result status derived from shard states. if [ s for s in shard_states if s.result_status != model.ShardState.RESULT_SUCCESS ]: state.result_status = model.MapreduceState.RESULT_FAILED else: state.result_status = model.MapreduceState.RESULT_SUCCESS logging.info("Final result for job '%s' is '%s'", spec.mapreduce_id, state.result_status) # We don't need a transaction here, since we change only statistics data, # and we don't care if it gets overwritten/slightly inconsistent. self.aggregate_state(state, shard_states) poll_time = state.last_poll_time state.last_poll_time = datetime.datetime.utcfromtimestamp(self._time()) if not state.active: ControllerCallbackHandler._finalize_job(spec, state, self.base_path()) return else: config = util.create_datastore_write_config(spec) state.put(config=config) processing_rate = int( spec.mapper.params.get("processing_rate") or model._DEFAULT_PROCESSING_RATE_PER_SEC) self.refill_quotas(poll_time, processing_rate, active_shards) ControllerCallbackHandler.reschedule(state, self.base_path(), spec, self.serial_id() + 1)
def _start_map(cls, name, mapper_spec,
               mapreduce_params,
               base_path=None,
               queue_name=None,
               eta=None,
               countdown=None,
               hooks_class_name=None,
               _app=None,
               transactional=False):
  queue_name = queue_name or os.environ.get("HTTP_X_APPENGINE_QUEUENAME",
                                            "default")
  if queue_name[0] == "_":
    # We are currently in some special queue, e.g. __cron.
    queue_name = "default"

  # Check that the handler can be instantiated.
  mapper_spec.get_handler()

  # Check that the reader can be instantiated and is configured correctly.
  mapper_input_reader_class = mapper_spec.input_reader_class()
  mapper_input_reader_class.validate(mapper_spec)

  mapper_output_writer_class = mapper_spec.output_writer_class()
  if mapper_output_writer_class:
    mapper_output_writer_class.validate(mapper_spec)

  mapreduce_id = model.MapreduceState.new_mapreduce_id()
  mapreduce_spec = model.MapreduceSpec(
      name,
      mapreduce_id,
      mapper_spec.to_json(),
      mapreduce_params,
      hooks_class_name)

  kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()}
  if _app:
    kickoff_params["app"] = _app
  kickoff_worker_task = util.HugeTask(
      url=base_path + "/kickoffjob_callback",
      params=kickoff_params,
      eta=eta,
      countdown=countdown)

  hooks = mapreduce_spec.get_hooks()
  config = util.create_datastore_write_config(mapreduce_spec)

  def start_mapreduce():
    parent = None
    if not transactional:
      # Save the state in the datastore so that the UI can see it. We can't
      # save the state in a foreign transaction, but the conventional UI
      # doesn't ask for transactional starts anyway.
      state = model.MapreduceState.create_new(mapreduce_spec.mapreduce_id)
      state.mapreduce_spec = mapreduce_spec
      state.active = True
      state.active_shards = mapper_spec.shard_count
      if _app:
        state.app_id = _app
      state.put(config=config)
      parent = state

    if hooks is not None:
      try:
        hooks.enqueue_kickoff_task(kickoff_worker_task, queue_name)
      except NotImplementedError:
        # Use the default task addition implementation.
        pass
      else:
        return
    kickoff_worker_task.add(queue_name, transactional=True, parent=parent)

  if transactional:
    start_mapreduce()
  else:
    db.run_in_transaction(start_mapreduce)

  return mapreduce_id
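
# Illustrative caller-side sketch. In the public mapreduce API,
# control.start_map is the documented wrapper around a _start_map-style call;
# the handler path, entity kind, and parameters below are hypothetical
# examples, not values defined in this module.

from mapreduce import control

mapreduce_id = control.start_map(
    name="Word count",
    handler_spec="myapp.jobs.word_count_map",  # hypothetical mapper function
    reader_spec="mapreduce.input_readers.DatastoreInputReader",
    mapper_parameters={"entity_kind": "myapp.models.Document"},
    shard_count=8,
    queue_name="default")
# The returned mapreduce_id can then be used to look up
# model.MapreduceState and poll the job's progress.
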
def handle(self):
  """Handle request."""
  tstate = model.TransientShardState.from_request(self.request)
  spec = tstate.mapreduce_spec
  self._start_time = self._time()
  shard_id = tstate.shard_id

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down",
                  shard_id)
    return

  if not shard_state.active:
    logging.error("Shard is not active. Looks like spurious task execution.")
    return
  if shard_state.retries > tstate.retries:
    logging.error(
        "Got shard %s from previous shard retry %s. Dropping task.",
        shard_state.shard_id,
        tstate.retries)
    return
  elif shard_state.retries < tstate.retries:
    # This happens when the transaction that updates the shard state and
    # enqueues the task fails after the task has been added. That
    # transaction will be retried. Adding the same task again results in
    # TaskAlreadyExistsError, but the error is ignored.
    raise ValueError(
        "ShardState for %s is behind slice. Waiting for it to catch up" %
        shard_state.shard_id)

  ctx = context.Context(spec, shard_state,
                        task_retry_count=self.task_retry_count())

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    # NOTE: When aborting, specifically do not finalize the output writer
    # because it might be in a bad state.
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.put(config=util.create_datastore_write_config(spec))
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = tstate.input_reader

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  # Tell NDB to never cache anything in memcache or in-process. This ensures
  # that entities fetched from Datastore input_readers via NDB will not bloat
  # up the request memory size, and Datastore Puts will avoid doing calls
  # to memcache. Without this you get soft memory limit exits, which hurts
  # overall throughput.
  if ndb is not None:
    ndb_ctx = ndb.get_context()
    ndb_ctx.set_cache_policy(lambda key: False)
    ndb_ctx.set_memcache_policy(lambda key: False)

  context.Context._set(ctx)
  retry_shard = False

  try:
    self.process_inputs(
        input_reader, shard_state, tstate, quota_consumer, ctx)
    if not shard_state.active:
      # Shard is going to stop. Finalize the output writer only when the
      # shard is successful, because the writer might be stuck in some bad
      # state otherwise.
      if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and
          tstate.output_writer):
        tstate.output_writer.finalize(ctx, shard_state)
  # pylint: disable=broad-except
  except Exception as e:
    retry_shard = self._retry_logic(e, shard_state, tstate, spec.mapreduce_id)
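
# A distilled sketch (not library code) of the shard-retry bookkeeping at the
# top of handle() above. Each task carries the retry count it was enqueued
# for (tstate.retries), while the datastore holds the authoritative count
# (shard_state.retries); the three cases are: drop a stale task, raise so the
# task queue retries until the datastore catches up, or proceed normally.

def classify_task(shard_retries, task_retries):
  if shard_retries > task_retries:
    return "drop"     # task left over from a previous shard retry
  if shard_retries < task_retries:
    return "wait"     # shard state hasn't caught up with the slice yet
  return "process"


assert classify_task(shard_retries=2, task_retries=1) == "drop"
assert classify_task(shard_retries=1, task_retries=2) == "wait"
assert classify_task(shard_retries=1, task_retries=1) == "process"
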