示例#1
0
    def __iter__(self):
        """Iterate over reducer input, merging partial records per key.

        Every record produced by the parent reader is parsed as a
        file_service_pb.KeyValues message. Values from consecutive records
        are accumulated on self.current_values until a record that is not
        flagged partial is seen; the accumulated values are then emitted and
        the per-key state is reset.

        When the current context's mapper params contain "combiner_spec",
        the handler it names is called with (key, new values, accumulated
        values) and the items it yields replace the accumulated values.
        Operation instances yielded by the combiner are invoked with the
        context instead of being stored.

        Yields:
          A (key, values) tuple after each non-partial record, or
          input_readers.ALLOW_CHECKPOINT after each partial record.

        Raises:
          errors.BadCombinerOutputError: util.is_generator rejects the value
            returned by the combiner.
        """
        job_ctx = context.get()

        # Resolve the optional combiner once, before any record is read.
        combiner_name = (
            job_ctx.mapreduce_spec.mapper.params.get("combiner_spec")
            if job_ctx else None)
        combine = (
            util.handler_for_name(combiner_name) if combiner_name else None)

        self.current_key = self.current_values = None

        for raw_record in super(_ReducerReader, self).__iter__():
            key_values = file_service_pb.KeyValues()
            key_values.ParseFromString(raw_record)

            if self.current_key is not None:
                # Continuing a run of partial records: the key must not change.
                assert key_values.key() == self.current_key, (
                    "inconsistent key sequence. Expected %s but got %s" %
                    (self.current_key, key_values.key()))
            else:
                self.current_key = key_values.key()
                self.current_values = []

            if not combine:
                self.current_values.extend(key_values.value_list())
            else:
                combined = combine(self.current_key,
                                   key_values.value_list(),
                                   self.current_values)
                if not util.is_generator(combined):
                    raise errors.BadCombinerOutputError(
                        "Combiner %s should yield values instead of returning them (%s)"
                        % (combine, combined))

                # Rebind before draining: the combiner was handed the previous
                # list, and what it yields becomes the new accumulated values.
                self.current_values = []
                for item in combined:
                    if isinstance(item, operation.Operation):
                        item(job_ctx)
                        continue
                    self.current_values.append(item)

            if key_values.partial():
                # Values for this key continue in the next record.
                yield input_readers.ALLOW_CHECKPOINT
                continue

            finished = (self.current_key, self.current_values)
            self.current_key = self.current_values = None
            yield finished
  def __iter__(self):
    """Iterate over reducer input, merging partial records per key.

    Every record produced by the parent reader is parsed as a
    file_service_pb.KeyValues message. Values from consecutive records are
    accumulated on self.current_values until a record that is not flagged
    partial is seen; the accumulated values are then emitted and the per-key
    state is reset.

    When the current context's mapper params contain "combiner_spec", the
    handler it names is called with (key, new values, accumulated values) and
    the items it yields replace the accumulated values. Operation instances
    yielded by the combiner are invoked with the context instead of being
    stored.

    Yields:
      A (key, values) tuple after each non-partial record, or
      input_readers.ALLOW_CHECKPOINT after each partial record.

    Raises:
      errors.BadCombinerOutputError: util.is_generator rejects the value
        returned by the combiner.
    """
    job_ctx = context.get()

    # Resolve the optional combiner once, before any record is read.
    combiner_name = (
        job_ctx.mapreduce_spec.mapper.params.get("combiner_spec")
        if job_ctx else None)
    combine = util.handler_for_name(combiner_name) if combiner_name else None

    self.current_key = self.current_values = None

    for raw_record in super(_ReducerReader, self).__iter__():
      key_values = file_service_pb.KeyValues()
      key_values.ParseFromString(raw_record)

      if self.current_key is not None:
        # Continuing a run of partial records: the key must not change.
        assert key_values.key() == self.current_key, (
            "inconsistent key sequence. Expected %s but got %s" %
            (self.current_key, key_values.key()))
      else:
        self.current_key = key_values.key()
        self.current_values = []

      if not combine:
        self.current_values.extend(key_values.value_list())
      else:
        combined = combine(
            self.current_key, key_values.value_list(), self.current_values)
        if not util.is_generator(combined):
          raise errors.BadCombinerOutputError(
              "Combiner %s should yield values instead of returning them (%s)" %
              (combine, combined))

        # Rebind before draining: the combiner was handed the previous list,
        # and what it yields becomes the new accumulated values.
        self.current_values = []
        for item in combined:
          if isinstance(item, operation.Operation):
            item(job_ctx)
            continue
          self.current_values.append(item)

      if key_values.partial():
        # Values for this key continue in the next record.
        yield input_readers.ALLOW_CHECKPOINT
        continue

      finished = (self.current_key, self.current_values)
      self.current_key = self.current_values = None
      yield finished
示例#3
0
    def process_data(self, data, input_reader, ctx, transient_shard_state):
        """Run the mapper handler on one datum and report whether to go on.

        Nothing is dispatched for the input_readers.ALLOW_CHECKPOINT marker;
        only the elapsed-time check at the end runs for it.

        Args:
          data: a datum to process, or input_readers.ALLOW_CHECKPOINT.
          input_reader: input reader; when its expand_parameters attribute is
            true the datum is unpacked into positional handler arguments.
          ctx: current execution context; supplies the counters and the
            mapper handler, and is passed to operations and to the writer.
          transient_shard_state: object whose output_writer attribute, when
            set, receives every non-Operation value the handler yields.

        Returns:
          True if scan should be continued, False if the time elapsed since
          self._start_time exceeds _SLICE_DURATION_SEC and the scan should
          be aborted.
        """
        if data is not input_readers.ALLOW_CHECKPOINT:
            ctx.counters.increment(context.COUNTER_MAPPER_CALLS)

            mapper_fn = ctx.mapreduce_spec.mapper.handler
            produced = (mapper_fn(*data) if input_reader.expand_parameters
                        else mapper_fn(data))

            # NOTE(review): the generator check is applied to the handler
            # itself rather than to the value it returned -- confirm that
            # this matches what util.is_generator expects.
            if util.is_generator(mapper_fn):
                for item in produced:
                    if isinstance(item, operation.Operation):
                        # Operations apply themselves to the context.
                        item(ctx)
                        continue

                    writer = transient_shard_state.output_writer
                    if writer:
                        writer.write(item, ctx)
                    else:
                        logging.error(
                            "Handler yielded %s, but no output writer is set.",
                            item)

        over_budget = self._time() - self._start_time > _SLICE_DURATION_SEC
        if not over_budget:
            return True

        logging.debug("Spent %s seconds. Rescheduling",
                      self._time() - self._start_time)
        return False
示例#4
0
  def process_data(self, data, input_reader, ctx, transient_shard_state):
    """Run the mapper handler on one datum and report whether to go on.

    Nothing is dispatched for the input_readers.ALLOW_CHECKPOINT marker; only
    the elapsed-time check at the end runs for it.

    Args:
      data: a datum to process, or input_readers.ALLOW_CHECKPOINT.
      input_reader: input reader; when its expand_parameters attribute is
        true the datum is unpacked into positional handler arguments.
      ctx: current execution context; supplies the counters and the mapper
        handler, and is passed to operations and to the writer.
      transient_shard_state: object whose output_writer attribute, when set,
        receives every non-Operation value the handler yields.

    Returns:
      True if scan should be continued, False if the time elapsed since
      self._start_time exceeds _SLICE_DURATION_SEC and the scan should be
      aborted.
    """
    if data is not input_readers.ALLOW_CHECKPOINT:
      ctx.counters.increment(context.COUNTER_MAPPER_CALLS)

      mapper_fn = ctx.mapreduce_spec.mapper.handler
      produced = (mapper_fn(*data) if input_reader.expand_parameters
                  else mapper_fn(data))

      # NOTE(review): the generator check is applied to the handler itself
      # rather than to the value it returned -- confirm that this matches
      # what util.is_generator expects.
      if util.is_generator(mapper_fn):
        for item in produced:
          if isinstance(item, operation.Operation):
            # Operations apply themselves to the context.
            item(ctx)
            continue

          writer = transient_shard_state.output_writer
          if writer:
            writer.write(item, ctx)
          else:
            logging.error(
                "Handler yielded %s, but no output writer is set.", item)

    over_budget = self._time() - self._start_time > _SLICE_DURATION_SEC
    if not over_budget:
      return True

    logging.debug("Spent %s seconds. Rescheduling",
                  self._time() - self._start_time)
    return False