Example #1
    def testSortFile(self):
        """Test sorting a file."""
        input_file = files.blobstore.create()

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        # After finalizing, switch to the readable blobstore file name for the
        # data that was just written.
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler._SortChunksPipeline("testjob", [input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        # Reload the pipeline record to pick up its filled-in outputs.
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for binary_record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
Example #2
    def testHugeTaskUseDatastore(self):
        """Test map job with huge parameter values."""
        input_file = files.blobstore.create()
        input_data = [str(i) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for record in input_data:
                    w.write(record)
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".TestHandler",
            "mapreduce.input_readers.RecordsReader",
            {
                "file": input_file,
                # The parameter can't be compressed and wouldn't fit into the
                # taskqueue payload.
                "huge_parameter": random_string(900000)
            },
            shard_count=4,
            base_path="/mapreduce_base_path")

        test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(100, len(TestHandler.processed_entites))
        self.assertEquals([], util._HugeTaskPayload.all().fetch(100))
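
The random_string helper used for huge_parameter above isn't shown in the snippet. A minimal sketch of what such a test helper could look like, assuming it just builds a long random string of the requested length (name and behavior inferred from usage, not the library's actual code):

import random
import string


def random_string(length):
    """Return a random lowercase string of the given length (assumed test helper)."""
    return "".join(random.choice(string.ascii_lowercase) for _ in range(length))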
Example #3
  def flush(self):
    """Flush pool contents."""
    # Write data to in-memory buffer first.
    buf = _StringWriter()
    with records.RecordsWriter(buf) as w:
      for record in self._buffer:
        w.write(record)

    str_buf = buf.to_string()
    if not self._exclusive and len(str_buf) > _FILES_API_MAX_SIZE:
      # Shouldn't really happen because of flush size.
      raise errors.Error(
          "Buffer too big. Can't write more than %s bytes in one request: "
          "risk of writes interleaving. Got: %s" %
          (_FILES_API_MAX_SIZE, len(str_buf)))

    # Write data to file.
    start_time = time.time()
    with files.open(self._filename, "a", exclusive_lock=self._exclusive) as f:
      f.write(str_buf)
      if self._ctx:
        operation.counters.Increment(
            COUNTER_IO_WRITE_BYTES, len(str_buf))(self._ctx)
    if self._ctx:
      operation.counters.Increment(
          COUNTER_IO_WRITE_MSEC,
          int((time.time() - start_time) * 1000))(self._ctx)

    # Reset the buffer.
    self._buffer = []
    self._size = 0
    gc.collect()
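
_StringWriter above is only used as an in-memory buffer that records.RecordsWriter writes into, so it needs little more than write() and to_string(). A minimal sketch under that assumption (not the library's actual implementation):

class _StringWriter(object):
  """In-memory buffer; a sketch based on how flush() uses it above."""

  def __init__(self):
    self._chunks = []

  def write(self, data):
    # RecordsWriter calls write() with raw byte strings.
    self._chunks.append(data)

  def to_string(self):
    # Everything written so far, as one string.
    return "".join(self._chunks)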
Example #4
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = TestMergePipeline([input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with files.open(output_file, "r") as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
        self.assertEquals(expected_data, output_data)
Example #5
    def testPartialRecords(self):
        """Test merging into partial key values."""
        try:
            self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
            # Force the max values count to an extremely low value.
            shuffler._MergePipeline._MAX_VALUES_COUNT = 1

            input_data = [('1', 'a'), ('2', 'b'), ('3', 'c')]
            input_data.sort()

            input_file = files.blobstore.create()

            with files.open(input_file, "a") as f:
                with records.RecordsWriter(f) as w:
                    for (k, v) in input_data:
                        proto = file_service_pb.KeyValue()
                        proto.set_key(k)
                        proto.set_value(v)
                        w.write(proto.Encode())
            files.finalize(input_file)
            input_file = files.blobstore.get_file_name(
                files.blobstore.get_blob_key(input_file))

            p = TestMergePipeline([input_file, input_file, input_file])
            p.start()
            test_support.execute_until_empty(self.taskqueue)
            p = TestMergePipeline.from_id(p.pipeline_id)

            output_file = p.outputs.default.value[0]
            output_data = []
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

            expected_data = [
                ('1', ['a'], True),
                ('1', ['a'], True),
                ('1', ['a'], False),
                ('2', ['b'], True),
                ('2', ['b'], True),
                ('2', ['b'], False),
                ('3', ['c'], True),
                ('3', ['c'], True),
                ('3', ['c'], False),
            ]
            self.assertEquals([str(e) for e in expected_data], output_data)
        finally:
            shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
Example #6
    def createMockData(self, data):
        """Create mock data for FetchContentPipeline"""
        input_file = files.blobstore.create()
        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                key = str(data[0])
                value = str(data[1])
                proto = file_service_pb.KeyValue()
                proto.set_key(key)
                proto.set_value(value)
                w.write(proto.Encode())

        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        return input_file
Example #7
    def testRecordsReader(self):
        """End-to-end test for records reader."""
        input_file = files.blobstore.create()
        input_data = [str(i) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for record in input_data:
                    w.write(record)
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".TestHandler",
            "mapreduce.input_readers.RecordsReader", {"file": input_file},
            shard_count=4,
            base_path="/mapreduce_base_path")

        test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(100, len(TestHandler.processed_entites))
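
TestHandler, passed to control.start_map in the examples above, isn't shown either. A minimal sketch, assuming it simply collects every mapped record into a class-level list (the processed_entites attribute name is kept exactly as the tests above spell it):

class TestHandler(object):
    """Map handler that remembers every record it processes (sketch)."""

    # Class-level list the tests inspect; name kept as spelled above.
    processed_entites = []

    def __call__(self, record):
        TestHandler.processed_entites.append(record)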
Example #8
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler.ShufflePipeline("testjob",
                                     [input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

        expected_data = sorted([(str(k), [str(v), str(v),
                                          str(v)]) for (k, v) in input_data])
        self.assertEquals(expected_data, output_data)
Example #9
  def testReadPartial(self):
    input_file = files.blobstore.create()

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        # First record is full
        proto = file_service_pb.KeyValues()
        proto.set_key("key1")
        proto.value_list().extend(["a", "b"])
        w.write(proto.Encode())
        # Second record is partial
        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["a", "b"])
        proto.set_partial(True)
        w.write(proto.Encode())
        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["c", "d"])
        w.write(proto.Encode())

    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    self.assertEquals(
        [("key1", ["a", "b"]),
         input_readers.ALLOW_CHECKPOINT,
         ("key2", ["a", "b", "c", "d"])],
        list(reader))

    # Now test state serialization.
    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    i = reader.__iter__()
    self.assertEquals(
        {"position": 0,
         "current_values": None,
         "current_key": None,
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(("key1", ["a", "b"]), i.next())
    self.assertEquals(
        {"position": 19,
         "current_values": None,
         "current_key": None,
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
    self.assertEquals(
        {"position": 40,
         "current_values": ["a", "b"],
         "current_key": "key2",
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
    self.assertEquals(
        {"position": 59,
         "current_values": None,
         "current_key": None,
         "filenames": [input_file]},
        reader.to_json())

    try:
      i.next()
      self.fail("Exception expected")
    except StopIteration:
      # expected
      pass

    # Now test deserialization after every step.
    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    i = reader.__iter__()
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(("key1", ["a", "b"]), i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    try:
      i.next()
      self.fail("Exception expected")
    except StopIteration:
      # expected
      pass