Example #1
def _merge_map(k, values):
  """A map function used in merge phase.

  Stores (k, values) into KeyValues proto and yields its serialization.
  """
  proto = file_service_pb.KeyValues()
  proto.set_key(k)
  proto.value_list().extend(values)
  yield proto.Encode()
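
A minimal round-trip sketch of what this map function emits, using only the
KeyValues calls shown in the examples themselves. The import path is an
assumption (the SDK-internal module google.appengine.api.files); the examples
never show their imports.

from google.appengine.api.files import file_service_pb  # assumed import path

record = next(_merge_map("colour", ["red", "blue"]))  # one serialized record
proto = file_service_pb.KeyValues()
proto.ParseFromString(record)
assert proto.key() == "colour"
assert list(proto.value_list()) == ["red", "blue"]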
Example #2
    def __iter__(self):
        ctx = context.get()
        combiner = None

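        # Resolve the optional combiner handler named in the mapper params.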
        if ctx:
            combiner_spec = ctx.mapreduce_spec.mapper.params.get(
                "combiner_spec")
            if combiner_spec:
                combiner = util.handler_for_name(combiner_spec)

        self.current_key = None
        self.current_values = None

        for binary_record in super(_ReducerReader, self).__iter__():
            proto = file_service_pb.KeyValues()
            proto.ParseFromString(binary_record)

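            # Start a new key group, or verify the record continues the current key.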
            if self.current_key is None:
                self.current_key = proto.key()
                self.current_values = []
            else:
                assert proto.key() == self.current_key, (
                    "inconsistent key sequence. Expected %s but got %s" %
                    (self.current_key, proto.key()))

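            # Fold this record's values through the combiner, if one is configured.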
            if combiner:
                combiner_result = combiner(self.current_key,
                                           proto.value_list(),
                                           self.current_values)

                if not util.is_generator(combiner_result):
                    raise errors.BadCombinerOutputError(
                        "Combiner %s should yield values instead of returning them (%s)"
                        % (combiner, combiner_result))

                # Rebuild current_values from the combiner's output, applying
                # any yielded Operation objects to the context immediately.
                self.current_values = []
                for value in combiner_result:
                    if isinstance(value, operation.Operation):
                        value(ctx)
                    else:
                        self.current_values.append(value)
            else:
                # No combiner configured: accumulate this record's values as-is.
                self.current_values.extend(proto.value_list())

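            # A non-partial record completes the key: emit the accumulated pair.
            # A partial record means more values follow, so only a checkpoint
            # marker is yielded.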
            if not proto.partial():
                key = self.current_key
                values = self.current_values

                self.current_key = None
                self.current_values = None
                yield (key, values)
            else:
                yield input_readers.ALLOW_CHECKPOINT
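
A hedged consumer sketch for this reader: each iteration yields either a
complete (key, values) pair or the ALLOW_CHECKPOINT marker, so a reduce loop
has to filter the marker out. The import path and the collect helper are
assumptions for illustration.

from mapreduce import input_readers  # assumed package layout

def collect(reader):
    # reader: an iterable like the _ReducerReader above (assumed constructed).
    results = []
    for entry in reader:
        if entry is input_readers.ALLOW_CHECKPOINT:
            continue  # marker only: a safe checkpoint spot, carries no data
        key, values = entry
        results.append((key, list(values)))
    return results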
Example #3
def _merge_map(key, values, partial):
  """A map function used in merge phase.

  Stores (key, values) into KeyValues proto and yields its serialization.

  Args:
    key: values key.
    values: values themselves.
    partial: True if more values for this key will follow. False otherwise.
  """
  proto = file_service_pb.KeyValues()
  proto.set_key(key)
  proto.value_list().extend(values)
  proto.set_partial(partial)
  yield proto.Encode()
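
A hedged sketch of how the partial flag could be used by a caller: split an
oversized value list into chunks and mark every chunk except the last as
partial, so the reducer-side reader (Example #2) keeps accumulating until the
key is complete. The chunk size and helper name here are hypothetical.

_CHUNK = 100000  # hypothetical per-record value limit

def _emit_chunked(key, values):
  # Yield serialized KeyValues records, flagging all but the last as partial.
  for start in range(0, len(values), _CHUNK):
    chunk = values[start:start + _CHUNK]
    partial = (start + _CHUNK) < len(values)
    for record in _merge_map(key, chunk, partial):
      yield record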
Example #4
  def testShuffleFiles(self):
    """Test shuffling multiple files."""
    input_data = [(str(i), str(i)) for i in range(100)]
    input_data.sort()

    bucket_name = "testbucket"
    test_filename = "testfile"
    full_filename = "/%s/%s" % (bucket_name, test_filename)

    with cloudstorage.open(full_filename, mode="w") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = file_service_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())

    p = shuffler.ShufflePipeline("testjob", {"bucket_name": bucket_name},
                                 [full_filename, full_filename, full_filename])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

    output_files = p.outputs.default.value
    output_data = []
    for output_file in output_files:
      with cloudstorage.open(output_file) as f:
        for record in records.RecordsReader(f):
          proto = file_service_pb.KeyValues()
          proto.ParseFromString(record)
          output_data.append((proto.key(), proto.value_list()))
    output_data.sort()

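    # The same input file is passed to the pipeline three times, so each
    # key's value is expected to come back tripled.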
    expected_data = sorted([
        (str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
    self.assertEquals(expected_data, output_data)
    self.assertEquals(1, len(self.emails))
Example #5
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler.ShufflePipeline("testjob",
                                     [input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

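        # As in Example #4, the input file is shuffled three times over,
        # so every value appears three times per key.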
        expected_data = sorted([(str(k), [str(v), str(v),
                                          str(v)]) for (k, v) in input_data])
        self.assertEquals(expected_data, output_data)
Example #6
    def __iter__(self):
        # Decode each binary record back into a (key, value_list) pair.
        for binary_record in input_readers.RecordsReader.__iter__(self):
            proto = file_service_pb.KeyValues()
            proto.ParseFromString(binary_record)
            yield (proto.key(), proto.value_list())