def testSortFile(self):
  """Test sorting a file."""
  bucket_name = "testbucket"
  test_filename = "testfile"
  full_filename = "/%s/%s" % (bucket_name, test_filename)

  input_data = [(str(i), "_" + str(i)) for i in range(100)]

  with cloudstorage.open(full_filename, mode="w") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = kv_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())

  p = shuffler._SortChunksPipeline("testjob", bucket_name, [[full_filename]])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

  input_data.sort()
  output_files = p.outputs.default.value[0]
  output_data = []
  for output_file in output_files:
    with cloudstorage.open(output_file) as f:
      for binary_record in records.RecordsReader(f):
        proto = kv_pb.KeyValue()
        proto.ParseFromString(binary_record)
        output_data.append((proto.key(), proto.value()))

  self.assertEquals(input_data, output_data)
  self.assertEquals(1, len(self.emails))
def testMergeFiles(self):
  """Test merging multiple files."""
  input_data = [(str(i), "_" + str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = TestMergePipeline([input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = TestMergePipeline.from_id(p.pipeline_id)

  output_file = p.outputs.default.value[0]
  output_data = []
  with files.open(output_file, "r") as f:
    for record in records.RecordsReader(f):
      output_data.append(record)

  expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
  self.assertEquals(expected_data, output_data)
def testMergeFiles(self):
  """Test merging multiple files."""
  input_data = [(str(i), "_" + str(i)) for i in range(100)]
  input_data.sort()

  bucket_name = "testbucket"
  test_filename = "testfile"
  full_filename = "/%s/%s" % (bucket_name, test_filename)

  with cloudstorage.open(full_filename, mode="w") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = kv_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())

  p = TestMergePipeline(bucket_name,
                        [full_filename, full_filename, full_filename])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = TestMergePipeline.from_id(p.pipeline_id)

  output_file = p.outputs.default.value[0]
  output_data = []
  with cloudstorage.open(output_file) as f:
    for record in records.RecordsReader(f):
      output_data.append(record)

  expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
  self.assertEquals(expected_data, output_data)
  self.assertEquals(1, len(self.emails))
def next(self):
  """Returns the next input from this input reader, a record.

  Returns:
    The next input from this input reader in the form of a record read from
    a LevelDB-format file.

  Raises:
    StopIteration: The ordered set of records has been exhausted.
  """
  while True:
    if not hasattr(self, "_cur_handle") or self._cur_handle is None:
      # If there are no more files, StopIteration is raised here.
      self._cur_handle = super(GCSRecordInputReader, self).next()
    if not hasattr(self, "_record_reader") or self._record_reader is None:
      self._record_reader = records.RecordsReader(self._cur_handle)

    try:
      start_time = time.time()
      content = self._record_reader.read()

      self._slice_ctx.incr(self.COUNTER_IO_READ_BYTE, len(content))
      # Convert the elapsed time to milliseconds before truncating to int;
      # truncating the seconds first would almost always record zero.
      self._slice_ctx.incr(self.COUNTER_IO_READ_MSEC,
                           int((time.time() - start_time) * 1000))
      return content
    except EOFError:
      self._cur_handle = None
      self._record_reader = None
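# Hedged sketch of the read loop that next() wraps: records.RecordsReader.read()
# raises EOFError at the end of a file, which the reader above turns into
# advancing to the next file (and, once files run out, into StopIteration).
# Assumes RecordsWriter/RecordsReader accept any file-like object with
# write()/read(), as the StringWriter/StringReader test helpers suggest.
#
#   import cStringIO
#
#   buf = cStringIO.StringIO()
#   with records.RecordsWriter(buf) as w:
#     w.write("payload-1")
#     w.write("payload-2")
#
#   reader = records.RecordsReader(cStringIO.StringIO(buf.getvalue()))
#   payloads = []
#   try:
#     while True:
#       payloads.append(reader.read())
#   except EOFError:
#     pass  # exhausted, like the except branch above
#   assert payloads == ["payload-1", "payload-2"]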
def testReadTruncatedBuffer(self):
  """Test reading records from a truncated file."""
  writer = StringWriter()
  with records.RecordsWriter(writer) as w:
    # Block 1
    w.write('1' * 2)
    w.write('1' * 2)
    # Block 2
    w.write('1' * 2)
    w.write('1' * 2)

  data = writer.data
  while data:
    data = data[:-1]
    reader = records.RecordsReader(StringReader(data))
    count = len(list(reader))
    if len(data) >= 38:
      self.assertEqual(4, count)
    elif len(data) >= 29:
      self.assertEqual(3, count)
    elif len(data) >= 18:
      self.assertEqual(2, count)
    elif len(data) >= 9:
      self.assertEqual(1, count)
    else:
      self.assertEqual(0, count)
def testSortFile(self):
  """Test sorting a file."""
  input_file = files.blobstore.create()

  input_data = [(str(i), "_" + str(i)) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler._SortChunksPipeline("testjob", [input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

  input_data.sort()
  output_files = p.outputs.default.value[0]
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for binary_record in records.RecordsReader(f):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        output_data.append((proto.key(), proto.value()))

  self.assertEquals(input_data, output_data)
def testLotsOfValuesForSingleKey(self):
  TestEntity(data=str(1)).put()

  # Run Mapreduce
  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".map_yield_lots_of_values",
      __name__ + ".reduce_length",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=(
          output_writers.__name__ + ".BlobstoreRecordsOutputWriter"),
      mapper_params={
          "entity_kind": __name__ + "." + TestEntity.__name__,
      },
      shards=16)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith(
      "Pipeline successful:"))

  # Verify reduce output.
  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  output_data = []
  for output_file in p.outputs.default.value:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

  expected_data = ["('1', 50000)"]
  expected_data.sort()
  output_data.sort()
  self.assertEquals(expected_data, output_data)
def testAppendAndFlush(self):
  self.pool.append("a")
  self.assertEquals("", self.file_service.get_content("tempfile"))
  self.pool.append("b")
  self.assertEquals("", self.file_service.get_content("tempfile"))
  self.pool.flush()
  self.assertEquals(
      ["a", "b"],
      list(records.RecordsReader(files.open("tempfile", "r"))))
def testMapReduce(self):
  # Prepare test data
  bucket_name = "testbucket"
  job_name = "test_job"
  entity_count = 200

  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  # Run Mapreduce
  p = mapreduce_pipeline.MapreducePipeline(
      job_name,
      __name__ + ".test_mapreduce_map",
      __name__ + ".test_mapreduce_reduce",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=(output_writers.__name__ +
                          "._GoogleCloudStorageRecordOutputWriter"),
      mapper_params={
          "entity_kind": __name__ + "." + TestEntity.__name__,
          "bucket_name": bucket_name
      },
      reducer_params={
          "output_writer": {
              "bucket_name": bucket_name
          },
      },
      shards=16)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

  # Verify reduce output.
  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                   p.outputs.result_status.value)
  output_data = []
  for output_file in p.outputs.default.value:
    with cloudstorage.open(output_file) as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

  expected_data = [str((str(d), ["", ""])) for d in range(entity_count)]
  expected_data.sort()
  output_data.sort()
  self.assertEquals(expected_data, output_data)

  # Verify that mapreduce doesn't leave intermediate files behind.
  temp_file_stats = cloudstorage.listbucket("/" + bucket_name)
  for stat in temp_file_stats:
    if stat.filename:
      self.assertFalse(
          stat.filename.startswith("/%s/%s-shuffle-" %
                                   (bucket_name, job_name)))
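# Hedged sketch of the callbacks referenced by name above. Their real
# definitions live elsewhere in this test module; the bodies below are only
# inferred from the expected output, str((str(d), ["", ""])), and may differ
# in detail from the actual helpers.
#
#   def test_mapreduce_map(entity):
#     """Emits (data, "") for each entity; duplicate puts yield two values."""
#     yield (entity.data, "")
#
#   def test_mapreduce_reduce(key, values):
#     """Yields one record per key containing the key and all of its values."""
#     yield str((key, values))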
def testIter(self):
  """Test reader iterator interface."""
  writer = StringWriter()
  with records.RecordsWriter(writer) as w:
    w.write('1' * 1)
    w.write('2' * 2)
    w.write('3' * 3)
    w.write('4' * 4)

  reader = records.RecordsReader(StringReader(writer.data))
  self.assertEqual(['1', '22', '333', '4444'], list(reader))
def testMapReduceWithShardRetry(self):
  # Prepare test data
  bucket_name = "testbucket"
  entity_count = 200
  db.delete(RetryCount.all())

  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  # Run Mapreduce
  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".test_mapreduce_map",
      __name__ + ".test_mapreduce_reduce",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=(__name__ + ".TestFileRecordsOutputWriter"),
      mapper_params={
          "input_reader": {
              "entity_kind": __name__ + "." + TestEntity.__name__,
          },
      },
      reducer_params={
          "output_writer": {
              "bucket_name": bucket_name
          },
      },
      shards=16)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

  # Verify reduce output.
  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                   p.outputs.result_status.value)
  output_data = []
  retries = 0
  for output_file in p.outputs.default.value:
    # Get the number of shard retries by parsing filename.
    retries += (int(output_file[-1]) - 1)
    with cloudstorage.open(output_file) as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

  # Assert file names also suggest the right number of retries.
  self.assertEquals(44, retries)
  expected_data = [str((str(d), ["", ""])) for d in range(entity_count)]
  expected_data.sort()
  output_data.sort()
  self.assertEquals(expected_data, output_data)
def testReadNoRoomForHeader(self):
  """Test reading records that leave <7 bytes in a block."""
  writer = StringWriter()
  with records.RecordsWriter(writer) as w:
    w.write('1' * 10)
    w.write('1' * 10)

  reader = records.RecordsReader(StringReader(writer.data))
  self.assertEqual('1' * 10, reader.read())
  self.assertEqual('1' * 10, reader.read())
  self.assertRaises(EOFError, reader.read)
def testReadHeaderAtTheEndOfTheBlock(self):
  """Test reading records that leave exactly 7 bytes at the end of a block."""
  writer = StringWriter()
  with records.RecordsWriter(writer) as w:
    w.write('1' * 6)
    w.write('1' * 10)

  reader = records.RecordsReader(StringReader(writer.data))
  self.assertEqual('1' * 6, reader.read())
  self.assertEqual('1' * 10, reader.read())
  self.assertRaises(EOFError, reader.read)
def testPadBlockIdempotency(self):
  """Test _pad_block is idempotent."""
  writer = StringWriter()
  with records.RecordsWriter(writer) as w:
    w.write('')
    w._pad_block()
    w._pad_block()
    w._pad_block()
    w._pad_block()

  reader = records.RecordsReader(StringReader(writer.data))
  self.assertEqual('', reader.read())
  self.assertEqual(records._BLOCK_SIZE, len(writer.data))
def testPartialRecords(self):
  """Test merging into partial key values."""
  try:
    self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
    # force max values count to extremely low value.
    shuffler._MergePipeline._MAX_VALUES_COUNT = 1

    input_data = [("1", "a"), ("2", "b"), ("3", "c")]
    input_data.sort()

    bucket_name = "testbucket"
    test_filename = "testfile"
    full_filename = "/%s/%s" % (bucket_name, test_filename)

    with cloudstorage.open(full_filename, mode="w") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = kv_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())

    p = TestMergePipeline(
        bucket_name, [full_filename, full_filename, full_filename])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = TestMergePipeline.from_id(p.pipeline_id)

    output_file = p.outputs.default.value[0]
    output_data = []
    with cloudstorage.open(output_file) as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

    expected_data = [
        ("1", ["a"], True),
        ("1", ["a"], True),
        ("1", ["a"], False),
        ("2", ["b"], True),
        ("2", ["b"], True),
        ("2", ["b"], False),
        ("3", ["c"], True),
        ("3", ["c"], True),
        ("3", ["c"], False),
    ]
    self.assertEquals([str(e) for e in expected_data], output_data)
  finally:
    shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
  self.assertEquals(1, len(self.emails))
def testPartialRecords(self):
  """Test merging into partial key values."""
  try:
    self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
    # force max values count to extremely low value.
    shuffler._MergePipeline._MAX_VALUES_COUNT = 1

    input_data = [('1', 'a'), ('2', 'b'), ('3', 'c')]
    input_data.sort()

    input_file = files.blobstore.create()

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = file_service_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    p = TestMergePipeline([input_file, input_file, input_file])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = TestMergePipeline.from_id(p.pipeline_id)

    output_file = p.outputs.default.value[0]
    output_data = []
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

    expected_data = [
        ('1', ['a'], True),
        ('1', ['a'], True),
        ('1', ['a'], False),
        ('2', ['b'], True),
        ('2', ['b'], True),
        ('2', ['b'], False),
        ('3', ['c'], True),
        ('3', ['c'], True),
        ('3', ['c'], False),
    ]
    self.assertEquals([str(e) for e in expected_data], output_data)
  finally:
    shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
def testReadCorruptedRecordOrder_LastThenFull(self):
  """Tests corruption when a last record is followed by a full record."""
  writer = StringWriter()
  with records.RecordsWriter(writer) as w:
    # Fake last record that should be ignored
    w._RecordsWriter__write_record(
        records._RECORD_TYPE_LAST, 'A' * (records._BLOCK_SIZE / 2))
    # Single-block, "full" record.
    w.write('B' * (records._BLOCK_SIZE / 4))
  data = writer.data

  reader = records.RecordsReader(StringReader(data))
  self.assertEqual('B' * (records._BLOCK_SIZE / 4), reader.read())
  self.assertRaises(EOFError, reader.read)
def testAppendAndFlush(self):
  self.pool.append("a")
  self.assertRaises(cloudstorage.errors.NotFoundError,
                    cloudstorage.open, self.filename)
  self.pool.append("b")
  self.assertRaises(cloudstorage.errors.NotFoundError,
                    cloudstorage.open, self.filename)
  self.pool.flush()
  self.assertRaises(cloudstorage.errors.NotFoundError,
                    cloudstorage.open, self.filename)
  # The file only becomes readable once the handle is explicitly closed.
  self.filehandle.close()
  self.assertEquals(32 * 1024, cloudstorage.stat(self.filename).st_size)
  self.assertEquals(
      ["a", "b"],
      list(records.RecordsReader(cloudstorage.open(self.filename))))
def testReadWholeBlocks(self):
  """Test reading a record occupying a whole block."""
  writer = StringWriter()
  with records.RecordsWriter(writer) as w:
    w.write('1' * 13)
    w._pad_block()
  with records.RecordsWriter(writer) as w:
    w.write('1' * 13)

  reader = records.RecordsReader(StringReader(writer.data))
  self.assertEqual('1' * 13, reader.read())
  self.assertEqual('1' * 13, reader.read())
  self.assertRaises(EOFError, reader.read)
def testMapReduce(self):
  # Prepare test data
  entity_count = 200

  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  # Run Mapreduce
  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".test_mapreduce_map",
      __name__ + ".test_mapreduce_reduce",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=(
          output_writers.__name__ + ".BlobstoreRecordsOutputWriter"),
      mapper_params={
          "entity_kind": __name__ + "." + TestEntity.__name__,
      },
      shards=16)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

  # Verify reduce output.
  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                   p.outputs.result_status.value)
  output_data = []
  for output_file in p.outputs.default.value:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

  expected_data = [str((str(d), ["", ""])) for d in range(entity_count)]
  expected_data.sort()
  output_data.sort()
  self.assertEquals(expected_data, output_data)

  # Verify that mapreduce doesn't leave intermediate files behind.
  blobInfos = blobstore.BlobInfo.all().fetch(limit=1000)
  for blobinfo in blobInfos:
    # Every remaining blob should be final reduce output, not a temp file.
    self.assertTrue(
        re.match("test-reduce-.*-output-\d+", blobinfo.filename),
        "Bad filename: %s" % blobinfo.filename)
def testReadEmptyRecord(self):
  """Test reading empty records."""
  writer = StringWriter()
  with records.RecordsWriter(writer) as w:
    w.write('')
    w._pad_block()
  with records.RecordsWriter(writer) as w:
    w.write('')

  reader = records.RecordsReader(StringReader(writer.data))
  self.assertEqual('', reader.read())
  # Should correctly skip padding.
  self.assertEqual('', reader.read())
  self.assertRaises(EOFError, reader.read)
def testReadCorruptedCrcLargeRecord(self):
  """Test reading large records with corrupted crc."""
  writer = StringWriter()
  with records.RecordsWriter(writer) as w:
    # Blocks 1-6
    w.write('1' * 100)
    # Block 7
    w.write('1' * 2)

  data = writer.data
  data = '_' + data[1:]

  reader = records.RecordsReader(StringReader(data))
  # First record should be completely skipped.
  self.assertEqual('1' * 2, reader.read())
  self.assertRaises(EOFError, reader.read)
def testReadCorruptedRecordOrder_FirstThenFirst(self):
  """Tests corruption when a first record is followed by a first record."""
  writer = StringWriter()
  with records.RecordsWriter(writer) as w:
    # Fake first record that should be ignored
    w._RecordsWriter__write_record(
        records._RECORD_TYPE_FIRST, 'A' * (records._BLOCK_SIZE / 2))
    # Multi-block record. This will cause 'A' to be ignored because
    # of a repeated first record.
    w.write('B' * 2 * records._BLOCK_SIZE)
  data = writer.data

  reader = records.RecordsReader(StringReader(data))
  self.assertEqual('B' * 2 * records._BLOCK_SIZE, reader.read())
  self.assertRaises(EOFError, reader.read)
def testSmoke(self):
  """Smoke test of all cases.

  Other smaller tests are more revealing in particular situations.
  """
  input_size = 0
  # Try many input sizes!
  while input_size < records._BLOCK_SIZE * 3:
    writer = StringWriter()
    inputs = '1' * input_size
    with records.RecordsWriter(writer) as w:
      # Make sure even the smallest input covers more than one block.
      for _ in range(records._BLOCK_SIZE):
        w.write(inputs)

    reader = records.RecordsReader(StringReader(writer.data))
    for _ in range(records._BLOCK_SIZE):
      self.assertEqual(inputs, reader.read())
    self.assertRaises(EOFError, reader.read)

    input_size += 1
def testHashingMultipleFiles(self):
  """Test hashing files."""
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  bucket_name = "testbucket"
  test_filename = "testfile"
  full_filename = "/%s/%s" % (bucket_name, test_filename)

  with cloudstorage.open(full_filename, mode="w") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = kv_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())

  p = shuffler._HashPipeline(
      "testjob", bucket_name,
      [full_filename, full_filename, full_filename])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler._HashPipeline.from_id(p.pipeline_id)

  list_of_output_files = p.outputs.default.value
  output_data = []
  for output_files in list_of_output_files:
    for output_file in output_files:
      with cloudstorage.open(output_file) as f:
        for binary_record in records.RecordsReader(f):
          proto = kv_pb.KeyValue()
          proto.ParseFromString(binary_record)
          output_data.append((proto.key(), proto.value()))

  output_data.sort()
  self.assertEquals(300, len(output_data))
  for i in range(len(input_data)):
    self.assertEquals(input_data[i], output_data[(3 * i)])
    self.assertEquals(input_data[i], output_data[(3 * i) + 1])
    self.assertEquals(input_data[i], output_data[(3 * i) + 2])
  self.assertEquals(1, len(self.emails))
def testReadCorruptedLength(self):
  """Test reading a record with a corrupted length field."""
  writer = StringWriter()
  with records.RecordsWriter(writer) as w:
    # Block 1
    w.write('1' * 2)
    w.write('1' * 2)
    # Block 2
    w.write('1' * 2)
    w.write('1' * 2)

  data = writer.data
  # replace length by 65535
  data = data[:4] + '\xff\xff' + data[6:]

  reader = records.RecordsReader(StringReader(data))
  # First block should be completely skipped.
  self.assertEqual('1' * 2, reader.read())
  self.assertEqual('1' * 2, reader.read())
  self.assertRaises(EOFError, reader.read)
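# Hedged note on the layout the corruption tests above rely on, inferred from
# the offsets they patch and from the leveldb log format this records module
# follows: each record is a 7-byte header followed by the payload, roughly
#
#   bytes 0-3  checksum of the payload   (testReadCorruptedCrcLargeRecord overwrites byte 0)
#   bytes 4-5  payload length, unsigned  (data[:4] + '\xff\xff' + data[6:] above)
#   byte  6    record type (full/first/middle/last)
#
# A corrupted header makes the reader skip ahead rather than raise, which is
# why these tests only assert on the records that survive.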
def testShuffleFiles(self):
  """Test shuffling multiple files."""
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  bucket_name = "testbucket"
  test_filename = "testfile"
  full_filename = "/%s/%s" % (bucket_name, test_filename)

  with cloudstorage.open(full_filename, mode="w") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = kv_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())

  p = shuffler.ShufflePipeline(
      "testjob", {"bucket_name": bucket_name},
      [full_filename, full_filename, full_filename])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

  output_files = p.outputs.default.value
  output_data = []
  for output_file in output_files:
    with cloudstorage.open(output_file) as f:
      for record in records.RecordsReader(f):
        proto = kv_pb.KeyValues()
        proto.ParseFromString(record)
        output_data.append((proto.key(), proto.value_list()))
  output_data.sort()

  expected_data = sorted(
      [(str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
  self.assertEquals(expected_data, output_data)
  self.assertEquals(1, len(self.emails))
def testReadLargeRecords(self):
  """Test reading large records."""
  writer = StringWriter()
  with records.RecordsWriter(writer) as w:
    w.write('1' * 10)
    w.write('1' * 20)
    w.write('1' * 30)
    w.write('1' * 40)
    w.write('1' * 50)
    w.write('1' * 60)
    w.write('1' * 70)

  reader = records.RecordsReader(StringReader(writer.data))
  self.assertEqual('1' * 10, reader.read())
  self.assertEqual('1' * 20, reader.read())
  self.assertEqual('1' * 30, reader.read())
  self.assertEqual('1' * 40, reader.read())
  self.assertEqual('1' * 50, reader.read())
  self.assertEqual('1' * 60, reader.read())
  self.assertEqual('1' * 70, reader.read())
  self.assertRaises(EOFError, reader.read)
def testShuffleFiles(self):
  """Test shuffling multiple files."""
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler.ShufflePipeline("testjob",
                               [input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

  output_files = p.outputs.default.value
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        proto = file_service_pb.KeyValues()
        proto.ParseFromString(record)
        output_data.append((proto.key(), proto.value_list()))
  output_data.sort()

  expected_data = sorted(
      [(str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
  self.assertEquals(expected_data, output_data)
def _runTest(self, num_shards):
  entity_count = 1000
  bucket_name = "bucket"
  job_name = "test_map"

  for _ in range(entity_count):
    TestEntity().put()

  mapreduce_id = control.start_map(
      job_name,
      __name__ + ".test_handler_yield_key_str",
      DATASTORE_READER_NAME,
      {
          "entity_kind": __name__ + "." + TestEntity.__name__,
          "output_writer": {
              "bucket_name": bucket_name,
          },
      },
      shard_count=num_shards,
      output_writer_spec=self.WRITER_NAME)

  test_support.execute_until_empty(self.taskqueue)

  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  filenames = self.WRITER_CLS.get_filenames(mapreduce_state)

  self.assertEqual(num_shards, len(set(filenames)))
  total_entries = 0
  for shard in range(num_shards):
    self.assertTrue(filenames[shard].startswith(
        "/%s/%s" % (bucket_name, job_name)))
    data = "".join([_ for _ in records.RecordsReader(
        cloudstorage.open(filenames[shard]))])
    # strip() is used to remove the last newline of each file so that split()
    # does not return extraneous empty entries.
    total_entries += len(data.strip().split("\n"))
  self.assertEqual(entity_count, total_entries)
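# Hedged usage sketch: _runTest is written against class-level WRITER_CLS /
# WRITER_NAME attributes, so a concrete test case would pin those and vary the
# shard count. The class name and attribute values below are illustrative only
# (the writer spec string mirrors how output_writer_spec is built in
# testMapReduce above).
#
#   class GoogleCloudStorageRecordOutputWriterEndToEndTest(EndToEndTestBase):
#     WRITER_CLS = output_writers._GoogleCloudStorageRecordOutputWriter
#     WRITER_NAME = (output_writers.__name__ +
#                    "._GoogleCloudStorageRecordOutputWriter")
#
#     def testSingleShard(self):
#       self._runTest(num_shards=1)
#
#     def testManyShards(self):
#       self._runTest(num_shards=16)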