Example 1
    def refill_quotas(self, last_poll_time, processing_rate,
                      active_shard_states):
        """Refill quotas for all active shards.

    Args:
      last_poll_time: Datetime with the last time the job state was updated.
      processing_rate: How many items to process per second overall.
      active_shard_states: All active shard states, list of ShardState.
    """
        if not active_shard_states:
            return
        quota_manager = quota.QuotaManager(memcache.Client())

        current_time = int(self._time())
        last_poll_time = time.mktime(last_poll_time.timetuple())
        total_quota_refill = processing_rate * max(
            0, current_time - last_poll_time)
        quota_refill = int(
            math.ceil(1.0 * total_quota_refill / len(active_shard_states)))

        if not quota_refill:
            return

        # TODO(user): use batch memcache API to refill quota in one API call.
        for shard_state in active_shard_states:
            quota_manager.put(shard_state.shard_id, quota_refill)
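All of these examples share one idea: a token bucket per shard, stored in
memcache. The controller periodically put()s tokens into each shard's bucket
(this example and Example 2), and each worker consume()s a token before
processing an entity (Examples 3, 4, 7 and 8). Below is a minimal sketch of
the pattern with a plain dict standing in for memcache; this class is
illustrative only, not the real quota.QuotaManager:

    class DictQuotaManager(object):
        """Token bucket keyed by shard_id; a toy stand-in for quota.QuotaManager."""

        def __init__(self):
            self.buckets = {}

        def put(self, bucket, amount):
            """Adds amount tokens to the bucket."""
            self.buckets[bucket] = self.buckets.get(bucket, 0) + amount

        def consume(self, bucket, amount=1):
            """Takes tokens if available; returns False when the bucket is dry."""
            if self.buckets.get(bucket, 0) >= amount:
                self.buckets[bucket] -= amount
                return True
            return False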
Example 2
    def _schedule_shards(cls, spec, input_readers, output_writers, queue_name,
                         base_path):
        """Prepares shard states and schedules their execution.

    Args:
      spec: mapreduce specification as MapreduceSpec.
      input_readers: list of InputReaders describing shard splits.
      queue_name: The queue to run this job on.
      base_path: The base url path of mapreduce callbacks.
    """
        assert len(input_readers) == len(output_writers)
        # Note: it's safe to re-attempt this handler because:
        # - shard state has deterministic and unique key.
        # - _schedule_slice will fall back gracefully if a task already exists.
        shard_states = []
        for shard_number, input_reader in enumerate(input_readers):
            shard_state = model.ShardState.create_new(spec.mapreduce_id,
                                                      shard_number)
            shard_state.shard_description = str(input_reader)
            shard_states.append(shard_state)

        # Retrieves already existing shards.
        existing_shard_states = db.get(shard.key() for shard in shard_states)
        existing_shard_keys = set(shard.key()
                                  for shard in existing_shard_states
                                  if shard is not None)

        # Puts only the shards that do not exist yet.
        db.put((shard for shard in shard_states
                if shard.key() not in existing_shard_keys),
               config=util.create_datastore_write_config(spec))

        # Give each shard some quota to start with.
        processing_rate = int(
            spec.mapper.params.get("processing_rate")
            or model._DEFAULT_PROCESSING_RATE_PER_SEC)
        quota_refill = processing_rate / len(shard_states)
        quota_manager = quota.QuotaManager(memcache.Client())
        for shard_state in shard_states:
            quota_manager.put(shard_state.shard_id, quota_refill)

        # Schedule shard tasks.
        for shard_number, (input_reader, output_writer) in enumerate(
                zip(input_readers, output_writers)):
            shard_id = model.ShardState.shard_id_from_number(
                spec.mapreduce_id, shard_number)
            MapperWorkerCallbackHandler._schedule_slice(
                shard_states[shard_number],
                model.TransientShardState(base_path,
                                          spec,
                                          shard_id,
                                          0,
                                          input_reader,
                                          output_writer=output_writer),
                queue_name=queue_name)
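Note the arithmetic contrast with Example 1: the initial split here uses plain
integer division (processing_rate / len(shard_states)), which floors under
Python 2, while refill_quotas rounds up with math.ceil. A quick illustration
with hypothetical numbers:

    import math

    processing_rate = 100  # items/sec for the whole job (hypothetical value)
    shard_count = 8

    # Example 2's initial split: Python 2 integer division floors the result.
    initial_refill = processing_rate / shard_count  # 12 under Python 2

    # Example 1's periodic refill rounds up, so any nonzero backlog still
    # yields at least one token per shard.
    elapsed_seconds = 3
    total_refill = processing_rate * elapsed_seconds
    periodic_refill = int(math.ceil(1.0 * total_refill / shard_count))  # 38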
Example 3
    def handle(self):
        """Handle request."""
        tstate = model.TransientShardState.from_request(self.request)
        spec = tstate.mapreduce_spec
        self._start_time = self._time()
        shard_id = tstate.shard_id

        shard_state, control = db.get([
            model.ShardState.get_key_by_shard_id(shard_id),
            model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
        ])
        if not shard_state:
            # We're letting this task die. It's up to the controller code to
            # reinitialize and restart the task.
            logging.error("State not found for shard ID %r; shutting down",
                          shard_id)
            return

        if not shard_state.active:
            logging.error(
                "Shard is not active. Looks like spurious task execution.")
            return

        ctx = context.Context(spec,
                              shard_state,
                              task_retry_count=self.task_retry_count())

        if control and control.command == model.MapreduceControl.ABORT:
            logging.info("Abort command received by shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)
            # NOTE: When aborting, specifically do not finalize the output writer
            # because it might be in a bad state.
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_ABORTED
            shard_state.put(config=util.create_datastore_write_config(spec))
            model.MapreduceControl.abort(spec.mapreduce_id)
            return

        input_reader = tstate.input_reader

        if spec.mapper.params.get("enable_quota", True):
            quota_consumer = quota.QuotaConsumer(
                quota.QuotaManager(memcache.Client()), shard_id,
                _QUOTA_BATCH_SIZE)
        else:
            quota_consumer = None

        # Tell NDB to never cache anything in memcache or in-process. This ensures
        # that entities fetched from Datastore input_readers via NDB will not bloat
        # up the request memory size and Datastore Puts will avoid doing calls
        # to memcache. Without this you get soft memory limit exits, which hurts
        # overall throughput.
        if ndb is not None:
            ndb_ctx = ndb.get_context()
            ndb_ctx.set_cache_policy(lambda key: False)
            ndb_ctx.set_memcache_policy(lambda key: False)

        context.Context._set(ctx)
        try:
            # Consume quota ahead of time, because we do not want to run a
            # datastore query if there's not enough quota for the shard.
            if not quota_consumer or quota_consumer.check():
                scan_aborted = False
                entity = None

                try:
                    # We shouldn't fetch an entity from the reader if there's not enough
                    # quota to process it. Perform all quota checks proactively.
                    if not quota_consumer or quota_consumer.consume():
                        for entity in input_reader:
                            if isinstance(entity, db.Model):
                                shard_state.last_work_item = repr(entity.key())
                            else:
                                shard_state.last_work_item = repr(entity)[:100]

                            scan_aborted = not self.process_data(
                                entity, input_reader, ctx, tstate)

                            # Check if we've got enough quota for the next entity.
                            if (quota_consumer and not scan_aborted
                                    and not quota_consumer.consume()):
                                scan_aborted = True
                            if scan_aborted:
                                break
                    else:
                        scan_aborted = True

                    if not scan_aborted:
                        logging.info(
                            "Processing done for shard %d of job '%s'",
                            shard_state.shard_number, shard_state.mapreduce_id)
                        # We consumed an extra quota item at the end of the
                        # for loop. Just be nice here and give it back. :)
                        if quota_consumer:
                            quota_consumer.put(1)
                        shard_state.active = False
                        shard_state.result_status = model.ShardState.RESULT_SUCCESS

                    operation.counters.Increment(
                        context.COUNTER_MAPPER_WALLTIME_MS,
                        int((time.time() - self._start_time) * 1000))(ctx)

                    # TODO(user): Mike said we don't want this to happen in
                    # case of an exception while scanning. Figure out when
                    # it's appropriate to skip.
                    ctx.flush()
                except errors.RetrySliceError as e:
                    logging.error("Slice error: %s", e)
                    retry_count = int(
                        os.environ.get("HTTP_X_APPENGINE_TASKRETRYCOUNT") or 0)
                    if retry_count <= _RETRY_SLICE_ERROR_MAX_RETRIES:
                        raise
                    logging.error("Too many retries: %d, failing the job",
                                  retry_count)
                    scan_aborted = True
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_FAILED
                except errors.FailJobError as e:
                    logging.error("Job failed: %s", e)
                    scan_aborted = True
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_FAILED
        finally:
            context.Context._set(None)
            if quota_consumer:
                quota_consumer.dispose()
Example 4
    def handle(self):
        """Handle request."""
        tstate = model.TransientShardState.from_request(self.request)
        spec = tstate.mapreduce_spec
        self._start_time = self._time()
        shard_id = tstate.shard_id

        shard_state, control = db.get([
            model.ShardState.get_key_by_shard_id(shard_id),
            model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
        ])
        if not shard_state:
            # We're letting this task die. It's up to the controller code to
            # reinitialize and restart the task.
            logging.error("State not found for shard ID %r; shutting down",
                          shard_id)
            return

        ctx = context.Context(spec,
                              shard_state,
                              task_retry_count=self.task_retry_count())

        if control and control.command == model.MapreduceControl.ABORT:
            logging.info("Abort command received by shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)
            if tstate.output_writer:
                tstate.output_writer.finalize(ctx, shard_state.shard_number)
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_ABORTED
            shard_state.put(config=util.create_datastore_write_config(spec))
            model.MapreduceControl.abort(spec.mapreduce_id)
            return

        input_reader = tstate.input_reader

        if spec.mapper.params.get("enable_quota", True):
            quota_consumer = quota.QuotaConsumer(
                quota.QuotaManager(memcache.Client()), shard_id,
                _QUOTA_BATCH_SIZE)
        else:
            quota_consumer = None

        context.Context._set(ctx)
        try:
            # Consume quota ahead of time, because we do not want to run a
            # datastore query if there's not enough quota for the shard.
            if not quota_consumer or quota_consumer.check():
                scan_aborted = False
                entity = None

                # We shouldn't fetch an entity from the reader if there's not enough
                # quota to process it. Perform all quota checks proactively.
                if not quota_consumer or quota_consumer.consume():
                    for entity in input_reader:
                        if isinstance(entity, db.Model):
                            shard_state.last_work_item = repr(entity.key())
                        else:
                            shard_state.last_work_item = repr(entity)[:100]

                        scan_aborted = not self.process_data(
                            entity, input_reader, ctx, tstate)

                        # Check if we've got enough quota for the next entity.
                        if (quota_consumer and not scan_aborted
                                and not quota_consumer.consume()):
                            scan_aborted = True
                        if scan_aborted:
                            break
                else:
                    scan_aborted = True

                if not scan_aborted:
                    logging.info("Processing done for shard %d of job '%s'",
                                 shard_state.shard_number,
                                 shard_state.mapreduce_id)
                    # We consumed an extra quota item at the end of the for
                    # loop. Just be nice here and give it back. :)
                    if quota_consumer:
                        quota_consumer.put(1)
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_SUCCESS

            operation.counters.Increment(
                "mapper-walltime-msec",
                int((time.time() - self._start_time) * 1000))(ctx)

            # TODO(user): Mike said we don't want this to happen in case of an
            # exception while scanning. Figure out when it's appropriate to
            # skip.
            ctx.flush()

            if not shard_state.active:
                # The shard is going to stop. Finalize the output writer, if any.
                if tstate.output_writer:
                    tstate.output_writer.finalize(ctx,
                                                  shard_state.shard_number)
            shard_state.put(config=util.create_datastore_write_config(spec))
        finally:
            context.Context._set(None)
            if quota_consumer:
                quota_consumer.dispose()

        # Rescheduling work should always be the last statement. It shouldn't
        # happen if there were any exceptions in the code before it.
        if shard_state.active:
            self.reschedule(shard_state, tstate)
        gc.collect()
Example 5
 def setUp(self):
     QuotaTestCase.setUp(self)
     self.quota_manager = quota.QuotaManager(memcache.Client())
     self.consumer = quota.QuotaConsumer(self.quota_manager, "foo", 50)
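Given this setUp, a test would drive the consumer through the same
check/consume/put/dispose cycle that the worker handlers above rely on. A
hedged sketch; the method names are taken from their usage in Examples 3, 4
and 7, and the assertions assume they behave as used there:

     def testConsumeLifecycle(self):
         self.quota_manager.put("foo", 100)        # seed this consumer's bucket
         self.assertTrue(self.consumer.check())    # quota available, none taken
         self.assertTrue(self.consumer.consume())  # take one item's worth
         self.consumer.put(1)                      # hand the unused token back
         self.consumer.dispose()                   # flush locally cached quota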
Example 6
 def setUp(self):
     QuotaTestCase.setUp(self)
     self.quota_manager = quota.QuotaManager(memcache.Client())
Example 7
    def handle(self):
        """Handle request."""
        tstate = model.TransientShardState.from_request(self.request)
        spec = tstate.mapreduce_spec
        self._start_time = self._time()
        shard_id = tstate.shard_id

        shard_state, control = db.get([
            model.ShardState.get_key_by_shard_id(shard_id),
            model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
        ])
        if not shard_state:
            # We're letting this task die. It's up to the controller code to
            # reinitialize and restart the task.
            logging.error("State not found for shard ID %r; shutting down",
                          shard_id)
            return

        if not shard_state.active:
            logging.error(
                "Shard is not active. Looks like spurious task execution.")
            return

        ctx = context.Context(spec,
                              shard_state,
                              task_retry_count=self.task_retry_count())

        if control and control.command == model.MapreduceControl.ABORT:
            logging.info("Abort command received by shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)
            if tstate.output_writer:
                tstate.output_writer.finalize(ctx, shard_state.shard_number)
            # We received a command to abort. We don't care if we override
            # some data.
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_ABORTED
            shard_state.put(config=util.create_datastore_write_config(spec))
            model.MapreduceControl.abort(spec.mapreduce_id)
            return

        input_reader = tstate.input_reader

        if spec.mapper.params.get("enable_quota", True):
            quota_consumer = quota.QuotaConsumer(
                quota.QuotaManager(memcache.Client()), shard_id,
                _QUOTA_BATCH_SIZE)
        else:
            quota_consumer = None

        context.Context._set(ctx)
        try:
            # Consume quota ahead of time, because we do not want to run a
            # datastore query if there's not enough quota for the shard.
            if not quota_consumer or quota_consumer.check():
                scan_aborted = False
                entity = None

                # We shouldn't fetch an entity from the reader if there's not enough
                # quota to process it. Perform all quota checks proactively.
                if not quota_consumer or quota_consumer.consume():
                    for entity in input_reader:
                        if isinstance(entity, db.Model):
                            shard_state.last_work_item = repr(entity.key())
                        else:
                            shard_state.last_work_item = repr(entity)[:100]

                        scan_aborted = not self.process_data(
                            entity, input_reader, ctx, tstate)

                        # Check if we've got enough quota for the next entity.
                        if (quota_consumer and not scan_aborted
                                and not quota_consumer.consume()):
                            scan_aborted = True
                        if scan_aborted:
                            break
                else:
                    scan_aborted = True

                if not scan_aborted:
                    logging.info("Processing done for shard %d of job '%s'",
                                 shard_state.shard_number,
                                 shard_state.mapreduce_id)
                    # We consumed an extra quota item at the end of the for
                    # loop. Just be nice here and give it back. :)
                    if quota_consumer:
                        quota_consumer.put(1)
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_SUCCESS

            operation.counters.Increment(
                context.COUNTER_MAPPER_WALLTIME_MS,
                int((time.time() - self._start_time) * 1000))(ctx)

            # TODO(user): Mike said we don't want this to happen in case of an
            # exception while scanning. Figure out when it's appropriate to
            # skip.
            ctx.flush()

            if not shard_state.active:
                # The shard is going to stop. Finalize the output writer, if any.
                if tstate.output_writer:
                    tstate.output_writer.finalize(ctx,
                                                  shard_state.shard_number)

            config = util.create_datastore_write_config(spec)
            # We don't want shard state to override active state, since that
            # may stall job execution (see issue 116). Do a transactional
            # verification of the status.
            # TODO(user): this might still result in some data inconsistency
            # which can be avoided. It doesn't seem to be worth it now, because
            # various crashes might result in all sorts of data inconsistencies
            # anyway.
            @db.transactional(retries=5)
            def tx():
                fresh_shard_state = db.get(
                    model.ShardState.get_key_by_shard_id(shard_id))
                if (not fresh_shard_state.active
                        or "worker_active_state_collision"
                        in _TEST_INJECTED_FAULTS):
                    shard_state.active = False
                    logging.error(
                        "Spurious task execution. Aborting the shard.")
                    return
                fresh_shard_state.copy_from(shard_state)
                fresh_shard_state.put(config=config)

            tx()
        finally:
            context.Context._set(None)
            if quota_consumer:
                quota_consumer.dispose()

        # Rescheduling work should always be the last statement. It shouldn't
        # happen if there were any exceptions in the code before it.
        if shard_state.active:
            self.reschedule(shard_state, tstate)
        gc.collect()
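The closing transaction is what separates Example 7 from Example 4: instead of
writing shard_state unconditionally, it re-reads the state inside a
transaction so a spurious or retried task cannot flip a finished shard back to
active (the issue 116 mentioned in the comment). Stripped to its core, the
guard looks roughly like this; a standalone sketch, not the library's actual
helper:

    @db.transactional(retries=5)
    def save_if_still_active(shard_key, computed_state):
        fresh = db.get(shard_key)
        if not fresh.active:
            # Another task already finished this shard; drop our write.
            return False
        fresh.copy_from(computed_state)
        fresh.put()
        return True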
Example 8
    def post(self):
        """Handle post request."""
        spec = model.MapreduceSpec.from_json_str(
            self.request.get("mapreduce_spec"))
        self._start_time = self._time()
        shard_id = self.shard_id()

        # TODO(user): Make this prettier
        logging.debug("post: shard=%s slice=%s headers=%s", shard_id,
                      self.slice_id(), self.request.META)

        shard_state, control = db.get([
            model.ShardState.get_key_by_shard_id(shard_id),
            model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
        ])
        if not shard_state:
            # We're letting this task die. It's up to the controller code to
            # reinitialize and restart the task.
            logging.error("State not found for shard ID %r; shutting down",
                          shard_id)
            return

        if control and control.command == model.MapreduceControl.ABORT:
            logging.info("Abort command received by shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_ABORTED
            shard_state.save()
            model.MapreduceControl.abort(spec.mapreduce_id)
            return

        input_reader = self.input_reader(spec.mapper)

        if spec.mapper.params.get("enable_quota", True):
            quota_consumer = quota.QuotaConsumer(
                quota.QuotaManager(cache.Client()), shard_id,
                _QUOTA_BATCH_SIZE)
        else:
            quota_consumer = None

        ctx = context.Context(spec, shard_state)
        context.Context._set(ctx)

        try:
            # Consume quota ahead of time, because we do not want to run a
            # datastore query if there's not enough quota for the shard.
            if not quota_consumer or quota_consumer.check():
                scan_aborted = False
                entity = None

                # We shouldn't fetch an entity from the reader if there's not enough
                # quota to process it. Perform all quota checks proactively.
                if not quota_consumer or quota_consumer.consume():
                    for entity in input_reader:
                        if isinstance(entity, db.Model):
                            shard_state.last_work_item = repr(entity.key())
                        else:
                            shard_state.last_work_item = repr(entity)[:100]

                        scan_aborted = not self.process_entity(entity, ctx)

                        # Check if we've got enough quota for the next entity.
                        if (quota_consumer and not scan_aborted
                                and not quota_consumer.consume()):
                            scan_aborted = True
                        if scan_aborted:
                            break
                else:
                    scan_aborted = True

                if not scan_aborted:
                    logging.info("Processing done for shard %d of job '%s'",
                                 shard_state.shard_number,
                                 shard_state.mapreduce_id)
                    # We consumed an extra quota item at the end of the for
                    # loop. Just be nice here and give it back. :)
                    if quota_consumer:
                        quota_consumer.put(1)
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_SUCCESS

            # TODO(user): Mike said we don't want this to happen in case of an
            # exception while scanning. Figure out when it's appropriate to
            # skip.
            ctx.flush()
        finally:
            context.Context._set(None)
            if quota_consumer:
                quota_consumer.dispose()

        # Rescheduling work should always be the last statement. It shouldn't
        # happen if there were any exceptions in the code before it.
        if shard_state.active:
            self.reschedule(spec, input_reader)