def __iter__(self):
    """Iterate over (key, values) pairs reconstructed from binary records.

    Reads KeyValues protos from the underlying reader, stitching together
    records that were split across multiple protos (proto.partial() is True
    for all but the last piece of a key's values).  If the mapper params
    name a "combiner_spec", the combiner is applied incrementally to each
    incoming batch of values.

    Yields:
      (key, values) tuples for each complete key, or
      input_readers.ALLOW_CHECKPOINT between partial pieces so the
      framework may checkpoint mid-key.

    Raises:
      errors.BadCombinerOutputError: if the combiner returns instead of
        yielding its values.
    """
    ctx = context.get()

    # Resolve the optional combiner from mapper params, if a context exists.
    combiner = None
    if ctx:
        combiner_spec = ctx.mapreduce_spec.mapper.params.get("combiner_spec")
        if combiner_spec:
            combiner = util.handler_for_name(combiner_spec)

    # Accumulation state for the key currently being assembled.
    self.current_key = None
    self.current_values = None

    for binary_record in super(_ReducerReader, self).__iter__():
        proto = file_service_pb.KeyValues()
        proto.ParseFromString(binary_record)

        if self.current_key is None:
            # First piece of a new key: start accumulating.
            self.current_key = proto.key()
            self.current_values = []
        else:
            # Continuation pieces must belong to the same key.
            assert proto.key() == self.current_key, (
                "inconsistent key sequence. Expected %s but got %s" %
                (self.current_key, proto.key()))

        if combiner:
            # Feed the new batch plus previously-combined values through
            # the combiner; it must be a generator.
            combiner_result = combiner(
                self.current_key, proto.value_list(), self.current_values)

            if not util.is_generator(combiner_result):
                raise errors.BadCombinerOutputError(
                    "Combiner %s should yield values instead of returning them (%s)" %
                    (combiner, combiner_result))

            self.current_values = []
            for value in combiner_result:
                if isinstance(value, operation.Operation):
                    # Operations (e.g. counter increments) are executed
                    # immediately, not accumulated as values.
                    value(ctx)
                else:
                    # with combiner current values always come from combiner
                    self.current_values.append(value)
        else:
            # without combiner we just accumulate values.
            self.current_values.extend(proto.value_list())

        if not proto.partial():
            # Last piece for this key: emit and reset accumulation state.
            key = self.current_key
            values = self.current_values
            # This is final value, don't try to serialize it.
            self.current_key = None
            self.current_values = None
            yield (key, values)
        else:
            # Mid-key: allow the framework to checkpoint here.
            yield input_readers.ALLOW_CHECKPOINT
def mr_wrap(func):
    """Wrap a mapper function so it receives an fblookup as its first arg.

    The returned wrapper calls get_fblookup() per invocation and prepends
    the result to *args.  Generator mappers stay generators: we re-yield
    from the wrapped call (instead of returning the generator directly,
    which would work but not let the wrapper itself be detected as a
    generator by util.is_generator).

    Args:
      func: the mapper callable; may be a plain function or a generator
        function taking (fbl, *args, **kwargs).

    Returns:
      A wrapper with the same generator-ness as func.
    """
    import functools  # local import: keeps block self-contained

    if util.is_generator(func):
        # functools.wraps preserves func's name/docstring for debugging;
        # generator detection is based on code flags, so it is unaffected.
        @functools.wraps(func)
        def map_func(*args, **kwargs):
            fbl = get_fblookup()
            # Re-yield so map_func is itself a generator function.
            for x in func(fbl, *args, **kwargs):
                yield x
    else:
        @functools.wraps(func)
        def map_func(*args, **kwargs):
            fbl = get_fblookup()
            return func(fbl, *args, **kwargs)
    return map_func
def process_data(self, data, input_reader, ctx, transient_shard_state):
    """Process a single data piece.

    Call mapper handler on the data.

    Args:
      data: a datum to process, or input_readers.ALLOW_CHECKPOINT (a
        checkpoint marker that is not passed to the handler).
      input_reader: input reader.
      ctx: current execution context.
      transient_shard_state: transient shard state; its output_writer (if
        any) receives values yielded by generator handlers.

    Returns:
      True if scan should be continued, False if scan should be aborted
      (the slice has exceeded _SLICE_DURATION_SEC and must reschedule).
    """
    if data is not input_readers.ALLOW_CHECKPOINT:
        ctx.counters.increment(context.COUNTER_MAPPER_CALLS)

        handler = ctx.mapreduce_spec.mapper.handler
        # expand_parameters means data is a tuple of positional args.
        if input_reader.expand_parameters:
            result = handler(*data)
        else:
            result = handler(data)

        # Generator handlers yield Operations (executed immediately) or
        # plain values (sent to the output writer, if one is configured).
        if util.is_generator(handler):
            for output in result:
                if isinstance(output, operation.Operation):
                    output(ctx)
                else:
                    output_writer = transient_shard_state.output_writer
                    if not output_writer:
                        logging.error(
                            "Handler yielded %s, but no output writer is set.",
                            output)
                    else:
                        output_writer.write(output, ctx)

    # Checked even for ALLOW_CHECKPOINT markers: abort the slice when its
    # time budget is spent so the shard can be rescheduled.
    if self._time() - self._start_time > _SLICE_DURATION_SEC:
        logging.debug("Spent %s seconds. Rescheduling",
                      self._time() - self._start_time)
        return False
    return True
def testNotGenerator(self):
    """A plain (non-yielding) handler must not be detected as a generator."""
    detected = util.is_generator(test_handler_function)
    self.assertFalse(detected)
def testGenerator(self):
    """A yielding handler must be detected as a generator."""
    detected = util.is_generator(test_handler_yield)
    self.assertTrue(detected)