    def testSortFile(self):
        """Test sorting a file."""
        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = kv_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())

        p = shuffler._SortChunksPipeline("testjob", bucket_name,
                                         [[full_filename]])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with cloudstorage.open(output_file) as f:
                for binary_record in records.RecordsReader(f):
                    proto = kv_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
        self.assertEquals(1, len(self.emails))
Example #2
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = TestMergePipeline([input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with files.open(output_file, "r") as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
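        # Each expected record above is the string form of (key, values, flag):
        # the key, the values gathered from the three identical inputs, and a
        # boolean that appears to mark partial value lists (False here, since
        # each key's list is complete; compare testPartialRecords below).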
        self.assertEquals(expected_data, output_data)
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = kv_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())

        p = TestMergePipeline(bucket_name,
                              [full_filename, full_filename, full_filename])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with cloudstorage.open(output_file) as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
        self.assertEquals(expected_data, output_data)
        self.assertEquals(1, len(self.emails))
Example #4
  def next(self):
    """Returns the next input from this input reader, a record.

    Returns:
      The next input from this input reader in the form of a record read from
      a LevelDB file.

    Raises:
      StopIteration: The ordered set of records has been exhausted.
    """
    while True:
      if not hasattr(self, "_cur_handle") or self._cur_handle is None:
        # If there are no more files, StopIteration is raised here
        self._cur_handle = super(GCSRecordInputReader, self).next()
      if not hasattr(self, "_record_reader") or self._record_reader is None:
        self._record_reader = records.RecordsReader(self._cur_handle)

      try:
        start_time = time.time()
        content = self._record_reader.read()
        self._slice_ctx.incr(self.COUNTER_IO_READ_BYTE, len(content))
        self._slice_ctx.incr(self.COUNTER_IO_READ_MSEC,
                             int((time.time() - start_time) * 1000))
        return content
      except EOFError:
        self._cur_handle = None
        self._record_reader = None
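# A minimal usage sketch (hypothetical helper, not part of the reader above):
# the same RecordsReader/EOFError loop applied to a single, already-open
# records-format file handle. Assumes the same `records` module imported by
# the surrounding tests.
def read_all_records(file_handle):
  """Drain one records-format file handle and return its records as a list."""
  reader = records.RecordsReader(file_handle)
  results = []
  while True:
    try:
      results.append(reader.read())
    except EOFError:
      # RecordsReader signals end-of-input with EOFError, as handled above.
      return results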
Example #5
  def testReadTruncatedBuffer(self):
    """Test reading records from truncated file."""
    writer = StringWriter()

    with records.RecordsWriter(writer) as w:
      # Block 1
      w.write('1' * 2)
      w.write('1' * 2)
      # Block 2
      w.write('1' * 2)
      w.write('1' * 2)

    data = writer.data
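    # The thresholds checked below are consistent with the 7-byte record
    # header (4-byte crc, 2-byte length, 1-byte type) exercised elsewhere in
    # these tests, plus a block size of 20 bytes presumably patched in setUp
    # (not shown): records 1-2 end at bytes 9 and 18, two padding bytes close
    # block 1, and records 3-4 end at bytes 29 and 38.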

    while data:
      data = data[:-1]
      reader = records.RecordsReader(StringReader(data))
      count = len(list(reader))
      if len(data) >= 38:
        self.assertEqual(4, count)
      elif len(data) >= 29:
        self.assertEqual(3, count)
      elif len(data) >= 18:
        self.assertEqual(2, count)
      elif len(data) >= 9:
        self.assertEqual(1, count)
      else:
        self.assertEqual(0, count)
Example #6
    def testSortFile(self):
        """Test sorting a file."""
        input_file = files.blobstore.create()

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler._SortChunksPipeline("testjob", [input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for binary_record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
  def testLotsOfValuesForSingleKey(self):
    TestEntity(data=str(1)).put()
    # Run Mapreduce
    p = mapreduce_pipeline.MapreducePipeline(
        "test",
        __name__ + ".map_yield_lots_of_values",
        __name__ + ".reduce_length",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=
            output_writers.__name__ + ".BlobstoreRecordsOutputWriter",
        mapper_params= {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        shards=16)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline successful:"))

    # Verify reduce output.
    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    output_data = []
    for output_file in p.outputs.default.value:
      with files.open(output_file, "r") as f:
        for record in records.RecordsReader(f):
          output_data.append(record)

    expected_data = ["('1', 50000)"]
    expected_data.sort()
    output_data.sort()
    self.assertEquals(expected_data, output_data)
  def testAppendAndFlush(self):
    self.pool.append("a")
    self.assertEquals("", self.file_service.get_content("tempfile"))
    self.pool.append("b")
    self.assertEquals("", self.file_service.get_content("tempfile"))
    self.pool.flush()
    self.assertEquals(
        ["a", "b"],
        list(records.RecordsReader(files.open("tempfile", "r"))))
    def testMapReduce(self):
        # Prepare test data
        bucket_name = "testbucket"
        job_name = "test_job"
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        # Run Mapreduce
        p = mapreduce_pipeline.MapreducePipeline(
            job_name,
            __name__ + ".test_mapreduce_map",
            __name__ + ".test_mapreduce_reduce",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=(output_writers.__name__ +
                                "._GoogleCloudStorageRecordOutputWriter"),
            mapper_params={
                "entity_kind": __name__ + "." + TestEntity.__name__,
                "bucket_name": bucket_name
            },
            reducer_params={
                "output_writer": {
                    "bucket_name": bucket_name
                },
            },
            shards=16)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        self.assertEquals(1, len(self.emails))
        self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

        # Verify reduce output.
        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                         p.outputs.result_status.value)
        output_data = []
        for output_file in p.outputs.default.value:
            with cloudstorage.open(output_file) as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

        expected_data = [str((str(d), ["", ""])) for d in range(entity_count)]
        expected_data.sort()
        output_data.sort()
        self.assertEquals(expected_data, output_data)

        # Verify that mapreduce doesn't leave intermediate files behind.
        temp_file_stats = cloudstorage.listbucket("/" + bucket_name)
        for stat in temp_file_stats:
            if stat.filename:
                self.assertFalse(
                    stat.filename.startswith("/%s/%s-shuffle-" %
                                             (bucket_name, job_name)))
Example #10
  def testIter(self):
    """Test reader iterator interface."""
    writer = StringWriter()

    with records.RecordsWriter(writer) as w:
      w.write('1' * 1)
      w.write('2' * 2)
      w.write('3' * 3)
      w.write('4' * 4)

    reader = records.RecordsReader(StringReader(writer.data))
    self.assertEqual(['1', '22', '333', '4444'], list(reader))
    def testMapReduceWithShardRetry(self):
        # Prepare test data
        bucket_name = "testbucket"
        entity_count = 200
        db.delete(RetryCount.all())

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        # Run Mapreduce
        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".test_mapreduce_map",
            __name__ + ".test_mapreduce_reduce",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=(__name__ + ".TestFileRecordsOutputWriter"),
            mapper_params={
                "input_reader": {
                    "entity_kind": __name__ + "." + TestEntity.__name__,
                },
            },
            reducer_params={
                "output_writer": {
                    "bucket_name": bucket_name
                },
            },
            shards=16)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        self.assertEquals(1, len(self.emails))
        self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

        # Verify reduce output.
        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                         p.outputs.result_status.value)
        output_data = []
        retries = 0
        for output_file in p.outputs.default.value:
            # Get the number of shard retries by parsing filename.
            retries += (int(output_file[-1]) - 1)
            with cloudstorage.open(output_file) as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

        # Assert file names also suggest the right number of retries.
        self.assertEquals(44, retries)
        expected_data = [str((str(d), ["", ""])) for d in range(entity_count)]
        expected_data.sort()
        output_data.sort()
        self.assertEquals(expected_data, output_data)
Example #12
  def testReadNoRoomForHeader(self):
    """Test reading records that leave <7 bytes in a block."""
    writer = StringWriter()

    with records.RecordsWriter(writer) as w:
      w.write('1' * 10)
      w.write('1' * 10)

    reader = records.RecordsReader(StringReader(writer.data))

    self.assertEqual('1' * 10, reader.read())
    self.assertEqual('1' * 10, reader.read())
    self.assertRaises(EOFError, reader.read)
Example #13
  def testReadHeaderAtTheEndOfTheBlock(self):
    """Test reading records, that leave exactly 7 bytes at the end of block."""
    writer = StringWriter()

    with records.RecordsWriter(writer) as w:
      w.write('1' * 6)
      w.write('1' * 10)

    reader = records.RecordsReader(StringReader(writer.data))

    self.assertEqual('1' * 6, reader.read())
    self.assertEqual('1' * 10, reader.read())
    self.assertRaises(EOFError, reader.read)
Example #14
  def testPadBlockIdempotency(self):
    """Test _pad_block is idempotent."""
    writer = StringWriter()

    with records.RecordsWriter(writer) as w:
      w.write('')
      w._pad_block()
      w._pad_block()
      w._pad_block()
      w._pad_block()

    reader = records.RecordsReader(StringReader(writer.data))
    self.assertEqual('', reader.read())
    self.assertEqual(records._BLOCK_SIZE, len(writer.data))
    def testPartialRecords(self):
        """Test merging into partial key values."""
        try:
            self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
            # force max values count to extremely low value.
            shuffler._MergePipeline._MAX_VALUES_COUNT = 1

            input_data = [("1", "a"), ("2", "b"), ("3", "c")]
            input_data.sort()

            bucket_name = "testbucket"
            test_filename = "testfile"
            full_filename = "/%s/%s" % (bucket_name, test_filename)

            with cloudstorage.open(full_filename, mode="w") as f:
                with records.RecordsWriter(f) as w:
                    for (k, v) in input_data:
                        proto = kv_pb.KeyValue()
                        proto.set_key(k)
                        proto.set_value(v)
                        w.write(proto.Encode())

            p = TestMergePipeline(
                bucket_name, [full_filename, full_filename, full_filename])
            p.start()
            test_support.execute_until_empty(self.taskqueue)
            p = TestMergePipeline.from_id(p.pipeline_id)

            output_file = p.outputs.default.value[0]
            output_data = []
            with cloudstorage.open(output_file) as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

            expected_data = [
                ("1", ["a"], True),
                ("1", ["a"], True),
                ("1", ["a"], False),
                ("2", ["b"], True),
                ("2", ["b"], True),
                ("2", ["b"], False),
                ("3", ["c"], True),
                ("3", ["c"], True),
                ("3", ["c"], False),
            ]
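            # With _MAX_VALUES_COUNT forced to 1, each key's three values are
            # emitted one per output record; the trailing boolean appears to
            # flag whether more values for the same key follow (True) or the
            # key's list is finished (False).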
            self.assertEquals([str(e) for e in expected_data], output_data)
        finally:
            shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
        self.assertEquals(1, len(self.emails))
Example #16
    def testPartialRecords(self):
        """Test merging into partial key values."""
        try:
            self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
            # force max values count to extremely low value.
            shuffler._MergePipeline._MAX_VALUES_COUNT = 1

            input_data = [('1', 'a'), ('2', 'b'), ('3', 'c')]
            input_data.sort()

            input_file = files.blobstore.create()

            with files.open(input_file, "a") as f:
                with records.RecordsWriter(f) as w:
                    for (k, v) in input_data:
                        proto = file_service_pb.KeyValue()
                        proto.set_key(k)
                        proto.set_value(v)
                        w.write(proto.Encode())
            files.finalize(input_file)
            input_file = files.blobstore.get_file_name(
                files.blobstore.get_blob_key(input_file))

            p = TestMergePipeline([input_file, input_file, input_file])
            p.start()
            test_support.execute_until_empty(self.taskqueue)
            p = TestMergePipeline.from_id(p.pipeline_id)

            output_file = p.outputs.default.value[0]
            output_data = []
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

            expected_data = [
                ('1', ['a'], True),
                ('1', ['a'], True),
                ('1', ['a'], False),
                ('2', ['b'], True),
                ('2', ['b'], True),
                ('2', ['b'], False),
                ('3', ['c'], True),
                ('3', ['c'], True),
                ('3', ['c'], False),
            ]
            self.assertEquals([str(e) for e in expected_data], output_data)
        finally:
            shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
Example #17
  def testReadCorruptedRecordOrder_LastThenFull(self):
    """Tests corruption when a last record is followed by a full record."""
    writer = StringWriter()

    with records.RecordsWriter(writer) as w:
      # Fake last record that should be ignored
      w._RecordsWriter__write_record(
          records._RECORD_TYPE_LAST, 'A' * (records._BLOCK_SIZE / 2))

      # Single-block, "full" record.
      w.write('B' * (records._BLOCK_SIZE / 4))

    data = writer.data
    reader = records.RecordsReader(StringReader(data))
    self.assertEqual('B' * (records._BLOCK_SIZE / 4), reader.read())
    self.assertRaises(EOFError, reader.read)
Example #18
  def testAppendAndFlush(self):
    self.pool.append("a")
    self.assertRaises(cloudstorage.errors.NotFoundError, cloudstorage.open,
                      self.filename)
    self.pool.append("b")
    self.assertRaises(cloudstorage.errors.NotFoundError, cloudstorage.open,
                      self.filename)
    self.pool.flush()
    self.assertRaises(cloudstorage.errors.NotFoundError, cloudstorage.open,
                      self.filename)
    # File handle does need to be explicitly closed.
    self.filehandle.close()
    self.assertEquals(32 * 1024, cloudstorage.stat(self.filename).st_size)
    self.assertEquals(
        ["a", "b"],
        list(records.RecordsReader(cloudstorage.open(self.filename))))
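    # Note on the 32 KB assertion above: only two 1-byte records were
    # appended, yet the finalized file is exactly 32 * 1024 bytes, so the
    # pool evidently pads its buffered records out to a full block before
    # writing to GCS (block size inferred from the asserted size).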
Example #19
  def testReadWholeBlocks(self):
    """Test reading record occupying a whole block."""
    writer = StringWriter()

    with records.RecordsWriter(writer) as w:
      w.write('1' * 13)
      w._pad_block()

    with records.RecordsWriter(writer) as w:
      w.write('1' * 13)

    reader = records.RecordsReader(StringReader(writer.data))

    self.assertEqual('1' * 13, reader.read())
    self.assertEqual('1' * 13, reader.read())
    self.assertRaises(EOFError, reader.read)
Example #20
    def testMapReduce(self):
        # Prepare test data
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        # Run Mapreduce
        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".test_mapreduce_map",
            __name__ + ".test_mapreduce_reduce",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=output_writers.__name__ +
            ".BlobstoreRecordsOutputWriter",
            mapper_params={
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            shards=16)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        self.assertEquals(1, len(self.emails))
        self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

        # Verify reduce output.
        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                         p.outputs.result_status.value)
        output_data = []
        for output_file in p.outputs.default.value:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

        expected_data = [str((str(d), ["", ""])) for d in range(entity_count)]
        expected_data.sort()
        output_data.sort()
        self.assertEquals(expected_data, output_data)

        # Verify that mapreduce doesn't leave intermediate files behind.
        blobInfos = blobstore.BlobInfo.all().fetch(limit=1000)
        for blobinfo in blobInfos:
            self.assertTrue(
                re.match(r"test-reduce-.*-output-\d+", blobinfo.filename),
                "Bad filename: %s" % blobinfo.filename)
Example #21
  def testReadEmptyRecord(self):
    """Test reading empty records."""
    writer = StringWriter()

    with records.RecordsWriter(writer) as w:
      w.write('')
      w._pad_block()

    with records.RecordsWriter(writer) as w:
      w.write('')

    reader = records.RecordsReader(StringReader(writer.data))

    self.assertEqual('', reader.read())
    # Should correctly skip padding.
    self.assertEqual('', reader.read())
    self.assertRaises(EOFError, reader.read)
Example #22
  def testReadCorruptedCrcLargeRecord(self):
    """Test reading large records with corrupted crc."""
    writer = StringWriter()

    with records.RecordsWriter(writer) as w:
      # Blocks 1-6
      w.write('1' * 100)
      # Block 7
      w.write('1' * 2)

    data = writer.data
    data = '_' + data[1:]
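    # Corrupting the first byte clobbers the leading record's checksum: the
    # crc appears to occupy the first 4 bytes of the 7-byte header, ahead of
    # the 2-byte length and 1-byte type fields exercised by the other tests.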
    reader = records.RecordsReader(StringReader(data))

    # First record should be completely skipped.
    self.assertEqual('1' * 2, reader.read())
    self.assertRaises(EOFError, reader.read)
Example #23
  def testReadCorruptedRecordOrder_FirstThenFirst(self):
    """Tests corruption when a first record is followed by a first record."""
    writer = StringWriter()

    with records.RecordsWriter(writer) as w:
      # Fake first record that should be ignored
      w._RecordsWriter__write_record(
          records._RECORD_TYPE_FIRST, 'A' * (records._BLOCK_SIZE / 2))

      # Multi-block record. This will cause 'A' to be ignored because
      # of a repeated first record.
      w.write('B' * 2 * records._BLOCK_SIZE)

    data = writer.data
    reader = records.RecordsReader(StringReader(data))
    self.assertEqual('B' * 2 * records._BLOCK_SIZE, reader.read())
    self.assertRaises(EOFError, reader.read)
Example #24
  def testSmoke(self):
    """Smoke test of all cases.

    Other smaller tests are more revealing in particular situations.
    """
    input_size = 0
    # Try many input sizes!
    while input_size < records._BLOCK_SIZE * 3:
      writer = StringWriter()
      inputs = '1' * input_size
      with records.RecordsWriter(writer) as w:
        # Make sure even the smallest input covers more than one block.
        for _ in range(records._BLOCK_SIZE):
          w.write(inputs)

      reader = records.RecordsReader(StringReader(writer.data))
      for _ in range(records._BLOCK_SIZE):
        self.assertEqual(inputs, reader.read())
      self.assertRaises(EOFError, reader.read)
      input_size += 1
    def testHashingMultipleFiles(self):
        """Test hashing files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = kv_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())

        p = shuffler._HashPipeline(
            "testjob", bucket_name,
            [full_filename, full_filename, full_filename])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._HashPipeline.from_id(p.pipeline_id)

        list_of_output_files = p.outputs.default.value
        output_data = []
        for output_files in list_of_output_files:
            for output_file in output_files:
                with cloudstorage.open(output_file) as f:
                    for binary_record in records.RecordsReader(f):
                        proto = kv_pb.KeyValue()
                        proto.ParseFromString(binary_record)
                        output_data.append((proto.key(), proto.value()))

        output_data.sort()
        self.assertEquals(300, len(output_data))
        for i in range(len(input_data)):
            self.assertEquals(input_data[i], output_data[(3 * i)])
            self.assertEquals(input_data[i], output_data[(3 * i) + 1])
            self.assertEquals(input_data[i], output_data[(3 * i) + 2])
        self.assertEquals(1, len(self.emails))
Example #26
  def testReadCorruptedLength(self):
    """Test reading record with corrupted length."""
    writer = StringWriter()

    with records.RecordsWriter(writer) as w:
      # Block 1
      w.write('1' * 2)
      w.write('1' * 2)
      # Block 2
      w.write('1' * 2)
      w.write('1' * 2)

    data = writer.data
    # replace length by 65535
    data = data[:4] + '\xff\xff' + data[6:]
    reader = records.RecordsReader(StringReader(data))

    # First block should be completely skipped.
    self.assertEqual('1' * 2, reader.read())
    self.assertEqual('1' * 2, reader.read())
    self.assertRaises(EOFError, reader.read)
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = kv_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())

        p = shuffler.ShufflePipeline(
            "testjob", {"bucket_name": bucket_name},
            [full_filename, full_filename, full_filename])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with cloudstorage.open(output_file) as f:
                for record in records.RecordsReader(f):
                    proto = kv_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

        expected_data = sorted([(str(k), [str(v), str(v),
                                          str(v)]) for (k, v) in input_data])
        self.assertEquals(expected_data, output_data)
        self.assertEquals(1, len(self.emails))
Example #28
  def testReadLargeRecords(self):
    """Test reading large headers."""
    writer = StringWriter()

    with records.RecordsWriter(writer) as w:
      w.write('1' * 10)
      w.write('1' * 20)
      w.write('1' * 30)
      w.write('1' * 40)
      w.write('1' * 50)
      w.write('1' * 60)
      w.write('1' * 70)

    reader = records.RecordsReader(StringReader(writer.data))

    self.assertEqual('1' * 10, reader.read())
    self.assertEqual('1' * 20, reader.read())
    self.assertEqual('1' * 30, reader.read())
    self.assertEqual('1' * 40, reader.read())
    self.assertEqual('1' * 50, reader.read())
    self.assertEqual('1' * 60, reader.read())
    self.assertEqual('1' * 70, reader.read())
    self.assertRaises(EOFError, reader.read)
Example #29
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler.ShufflePipeline("testjob",
                                     [input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

        expected_data = sorted([(str(k), [str(v), str(v),
                                          str(v)]) for (k, v) in input_data])
        self.assertEquals(expected_data, output_data)
    def _runTest(self, num_shards):
        entity_count = 1000
        bucket_name = "bucket"
        job_name = "test_map"

        for _ in range(entity_count):
            TestEntity().put()

        mapreduce_id = control.start_map(
            job_name,
            __name__ + ".test_handler_yield_key_str",
            DATASTORE_READER_NAME, {
                "entity_kind": __name__ + "." + TestEntity.__name__,
                "output_writer": {
                    "bucket_name": bucket_name,
                },
            },
            shard_count=num_shards,
            output_writer_spec=self.WRITER_NAME)

        test_support.execute_until_empty(self.taskqueue)
        mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
        filenames = self.WRITER_CLS.get_filenames(mapreduce_state)

        self.assertEqual(num_shards, len(set(filenames)))
        total_entries = 0
        for shard in range(num_shards):
            self.assertTrue(filenames[shard].startswith(
                "/%s/%s" % (bucket_name, job_name)))
            data = "".join([
                _ for _ in records.RecordsReader(
                    cloudstorage.open(filenames[shard]))
            ])
            # strip() is used to remove the last newline of each file so that split()
            # does not return extraneous empty entries.
            total_entries += len(data.strip().split("\n"))
        self.assertEqual(entity_count, total_entries)
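# Why strip() is needed before split() in the loop above: a trailing newline
# would otherwise produce an extraneous empty entry. Standalone illustration
# (plain Python, independent of the test fixtures):
assert "k1\nk2\n".split("\n") == ["k1", "k2", ""]
assert "k1\nk2\n".strip().split("\n") == ["k1", "k2"]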