def _merge_map(k, values, partial=False):
  """A map function used in merge phase.

  Stores (k, values) into KeyValues proto and yields its serialization.

  Args:
    k: values key.
    values: values themselves.
    partial: True if more values for this key will follow, False otherwise.
      Defaults to False so existing two-argument callers keep their current
      behavior (a proto whose partial() accessor reads as False).

  Yields:
    The serialized KeyValues proto for (k, values).
  """
  proto = file_service_pb.KeyValues()
  proto.set_key(k)
  proto.value_list().extend(values)
  # Record explicitly whether this is only a partial value set so downstream
  # readers that check proto.partial() can buffer until the final chunk.
  proto.set_partial(partial)
  yield proto.Encode()
def __iter__(self):
  """Iterates over input, reassembling (key, values) pairs.

  Consecutive KeyValues records with partial() set are accumulated under
  self.current_key until a non-partial record completes the group; only
  then is a full (key, values) pair yielded.  While a group is still
  incomplete, ALLOW_CHECKPOINT is yielded instead so the framework may
  checkpoint between records.  If the mapper params name a combiner_spec,
  that combiner is applied incrementally to each record's values.

  Yields:
    (key, values) tuples for completed groups, or
    input_readers.ALLOW_CHECKPOINT between partial records.

  Raises:
    errors.BadCombinerOutputError: if the combiner returns values instead
      of yielding them (i.e. is not a generator).
  """
  ctx = context.get()
  combiner = None

  # Resolve the optional user-supplied combiner from the mapper params.
  if ctx:
    combiner_spec = ctx.mapreduce_spec.mapper.params.get(
        "combiner_spec")
    if combiner_spec:
      combiner = util.handler_for_name(combiner_spec)

  # Accumulation state for the key group currently being assembled.
  self.current_key = None
  self.current_values = None

  for binary_record in super(_ReducerReader, self).__iter__():
    proto = file_service_pb.KeyValues()
    proto.ParseFromString(binary_record)

    if self.current_key is None:
      # First record of a new group.
      self.current_key = proto.key()
      self.current_values = []
    else:
      # Partial records for one key must arrive back to back.
      assert proto.key() == self.current_key, (
          "inconsistent key sequence. Expected %s but got %s" %
          (self.current_key, proto.key()))

    if combiner:
      combiner_result = combiner(
          self.current_key, proto.value_list(), self.current_values)

      # Combiners must be generators so Operation side effects interleave
      # correctly with value emission.
      if not util.is_generator(combiner_result):
        raise errors.BadCombinerOutputError(
            "Combiner %s should yield values instead of returning them (%s)" %
            (combiner, combiner_result))

      # Replace accumulated values with the combiner's output; Operation
      # instances are executed against the context rather than stored.
      self.current_values = []
      for value in combiner_result:
        if isinstance(value, operation.Operation):
          value(ctx)
        else:
          self.current_values.append(value)
    else:
      # No combiner: just accumulate the raw values.
      self.current_values.extend(proto.value_list())

    if not proto.partial():
      # Group complete — reset state BEFORE yielding so a checkpoint taken
      # at the yield does not resume with stale accumulation state.
      key = self.current_key
      values = self.current_values
      self.current_key = None
      self.current_values = None
      yield (key, values)
    else:
      # Mid-group: allow the framework to checkpoint here.
      yield input_readers.ALLOW_CHECKPOINT
def _merge_map(key, values, partial):
  """Map function for the merge phase.

  Packs (key, values) into a single KeyValues proto and yields the
  encoded record.

  Args:
    key: values key.
    values: values themselves.
    partial: True if more values for this key will follow. False otherwise.

  Yields:
    The serialized KeyValues proto.
  """
  result = file_service_pb.KeyValues()
  result.set_key(key)
  result.set_partial(partial)
  result.value_list().extend(values)
  yield result.Encode()
def testShuffleFiles(self):
  """Test shuffling multiple files.

  Writes 100 sorted (key, value) pairs to one GCS record file, shuffles
  three copies of that file, and verifies each key comes back with three
  copies of its value.
  """
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  bucket_name = "testbucket"
  test_filename = "testfile"
  full_filename = "/%s/%s" % (bucket_name, test_filename)

  # Write the input as KeyValue protos in record format.
  with cloudstorage.open(full_filename, mode="w") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())

  # Shuffle three copies of the same file.
  p = shuffler.ShufflePipeline(
      "testjob", {"bucket_name": bucket_name},
      [full_filename, full_filename, full_filename])
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  # Reload the pipeline to read its outputs.
  p = shuffler.ShufflePipeline.from_id(p.pipeline_id)
  output_files = p.outputs.default.value
  output_data = []
  for output_file in output_files:
    with cloudstorage.open(output_file) as f:
      for record in records.RecordsReader(f):
        proto = file_service_pb.KeyValues()
        proto.ParseFromString(record)
        output_data.append((proto.key(), proto.value_list()))
  output_data.sort()

  # Each key should have collected one value per input file copy.
  expected_data = sorted([
      (str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
  # assertEquals is a deprecated unittest alias; use assertEqual.
  self.assertEqual(expected_data, output_data)
  self.assertEqual(1, len(self.emails))
def testShuffleFiles(self):
  """Test shuffling multiple files.

  Writes 100 sorted (key, value) pairs to one blobstore record file,
  shuffles three copies of that file, and verifies each key comes back
  with three copies of its value.
  """
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  # Write the input as KeyValue protos in record format, then finalize
  # the blob so it becomes readable.
  input_file = files.blobstore.create()
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  # Shuffle three copies of the same file.
  p = shuffler.ShufflePipeline(
      "testjob", [input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  # Reload the pipeline to read its outputs.
  p = shuffler.ShufflePipeline.from_id(p.pipeline_id)
  output_files = p.outputs.default.value
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        proto = file_service_pb.KeyValues()
        proto.ParseFromString(record)
        output_data.append((proto.key(), proto.value_list()))
  output_data.sort()

  # Each key should have collected one value per input file copy.
  expected_data = sorted([(str(k), [str(v), str(v), str(v)])
                          for (k, v) in input_data])
  # assertEquals is a deprecated unittest alias; use assertEqual.
  self.assertEqual(expected_data, output_data)
def __iter__(self):
  """Decodes each binary record into a (key, value_list) pair."""
  record_stream = input_readers.RecordsReader.__iter__(self)
  for raw_record in record_stream:
    kv_proto = file_service_pb.KeyValues()
    kv_proto.ParseFromString(raw_record)
    yield kv_proto.key(), kv_proto.value_list()