def testHugeTaskUseDatastore(self):
  """Test map job with huge parameter values."""
  input_file = files.blobstore.create()
  input_data = [str(i) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for record in input_data:
        w.write(record)
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".TestHandler",
      "mapreduce.input_readers.RecordsReader",
      {
          "file": input_file,
          # the parameter can't be compressed and wouldn't fit into
          # taskqueue payload
          "huge_parameter": random_string(900000)
      },
      shard_count=4,
      base_path="/mapreduce_base_path")

  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(100, len(TestHandler.processed_entites))
  self.assertEquals([], util._HugeTaskPayload.all().fetch(100))

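# The test above relies on a module-level random_string helper that is not
# part of this file. A minimal sketch, assuming its only job is to produce a
# payload of the requested length that does not compress well:
import random
import string


def random_string(length):
  """Returns a pseudo-random alphanumeric string of the given length."""
  return "".join(
      random.choice(string.ascii_letters + string.digits)
      for _ in range(length))
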
def testRecordsReader(self):
  """End-to-end test for records reader."""
  input_file = files.blobstore.create()
  input_data = [str(i) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for record in input_data:
        w.write(record)
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".TestHandler",
      "mapreduce.input_readers.RecordsReader",
      {
          "file": input_file
      },
      shard_count=4,
      base_path="/mapreduce_base_path")

  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(100, len(TestHandler.processed_entites))

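# These end-to-end tests reference a module-level TestHandler and its
# processed_entites list, neither of which appears in this file. A minimal
# sketch of what such a handler could look like (the class name and the
# attribute name, including its spelling, follow the assertions above; the
# callable-class shape is an assumption):
class TestHandler(object):
  """Map handler that records every value passed to it."""

  processed_entites = []

  def __call__(self, record):
    TestHandler.processed_entites.append(record)
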
def testSortFile(self):
  """Test sorting a file."""
  input_file = files.blobstore.create()
  input_data = [(str(i), "_" + str(i)) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler._SortChunksPipeline("testjob", [input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

  input_data.sort()
  output_files = p.outputs.default.value[0]
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for binary_record in records.RecordsReader(f):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        output_data.append((proto.key(), proto.value()))

  self.assertEquals(input_data, output_data)

def testMultipleRequests(self):
  """Tests restoring the reader state across multiple requests."""
  input_file = files.blobstore.create()

  # Create a file with two records.
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["a", "b"])
      proto.set_partial(True)
      w.write(proto.Encode())

      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["c", "d"])
      w.write(proto.Encode())

  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  # Now read the records in two attempts, serializing and recreating the
  # input reader as if it's a separate request.
  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  it = iter(reader)
  self.assertEquals(input_readers.ALLOW_CHECKPOINT, it.next())

  reader_state = reader.to_json()
  other_reader = mapreduce_pipeline._ReducerReader.from_json(reader_state)
  # Iterate the restored reader, not the original one, so the second half of
  # the record is actually read through the recreated state.
  it = iter(other_reader)
  self.assertEquals(("key2", ["a", "b", "c", "d"]), it.next())

def testSortFile(self):
  """Test sorting a file."""
  input_file = files.blobstore.create()
  input_data = [(str(i), "_" + str(i)) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler._SortChunksPipeline("testjob", [input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

  input_data.sort()
  output_files = p.outputs.default.value[0]
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for binary_record in records.RecordsReader(f):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        output_data.append((proto.key(), proto.value()))

  self.assertEquals(input_data, output_data)

def testMergeFiles(self):
  """Test merging multiple files."""
  input_data = [(str(i), "_" + str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = TestMergePipeline([input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = TestMergePipeline.from_id(p.pipeline_id)

  output_file = p.outputs.default.value[0]
  output_data = []
  with files.open(output_file, "r") as f:
    for record in records.RecordsReader(f):
      output_data.append(record)

  expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
  self.assertEquals(expected_data, output_data)

def testShuffleFiles(self):
  """Test shuffling multiple files."""
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler.ShufflePipeline(
      "testjob", [input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

  output_files = p.outputs.default.value
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        proto = file_service_pb.KeyValues()
        proto.ParseFromString(record)
        output_data.append((proto.key(), proto.value_list()))
  output_data.sort()

  expected_data = sorted(
      [(str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
  self.assertEquals(expected_data, output_data)

def testMergeFiles(self):
  """Test merging multiple files."""
  input_data = [(str(i), "_" + str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = TestMergePipeline([input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = TestMergePipeline.from_id(p.pipeline_id)

  output_file = p.outputs.default.value[0]
  output_data = []
  with files.open(output_file, "r") as f:
    for record in records.RecordsReader(f):
      output_data.append(record)

  expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
  self.assertEquals(expected_data, output_data)

def createMockDataLine(self, data):
  file_name = "myblob_01"
  file_path = files.blobstore.create("text/plain", file_name)
  with files.open(file_path, 'a') as fp:
    fp.write(data)
  files.finalize(file_path)
  blob_key = files.blobstore.get_blob_key(file_path)
  file_name = files.blobstore.get_file_name(blob_key)
  return file_name

def finalize(self, ctx, shard_number):
  """Finalize writer shard-level state.

  Args:
    ctx: an instance of context.Context.
    shard_number: shard number as integer.
  """
  finalized_filenames = []
  for filename in self._filenames:
    files.finalize(filename)

def createGSData(self, file_count, data):
  file_paths = []
  for file_number in range(file_count):
    file_path = "/gs/foo/bar%d" % file_number
    write_path = files.gs.create(file_path, mime_type='text/plain',
                                 acl='public-read')
    with files.open(write_path, 'a') as fp:
      fp.write(data)
    files.finalize(write_path)
    file_paths.append(file_path)
  return file_paths

def createInvalidMockData(self):
  blob_keys = []
  url = "invalidScheme://test_url.com"
  file_path = files.blobstore.create("text/plain", url)
  with files.open(file_path, 'a') as fp:
    fp.write(url)
  files.finalize(file_path)
  blob_key = files.blobstore.get_blob_key(file_path)
  file_name = files.blobstore.get_file_name(blob_key)
  blob_keys.append(str(file_name))
  return blob_keys

def createMockData(self, url_count, shard):
  blob_keys = []
  for num in range(shard):
    file_name = "myblob_%d" % num
    urls = "\n".join(["http://test_url_%d.com" % i
                      for i in range(url_count)])
    file_path = files.blobstore.create("text/plain", file_name)
    with files.open(file_path, 'a') as fp:
      fp.write(urls)
    files.finalize(file_path)
    blob_key = files.blobstore.get_blob_key(file_path)
    file_name = files.blobstore.get_file_name(blob_key)
    blob_keys.append(str(file_name))
  return blob_keys

def finalize_job(cls, mapreduce_state):
  """Finalize job-level writer state.

  Args:
    mapreduce_state: an instance of model.MapreduceState describing current
      job.
  """
  state = BlobstoreOutputWriter._State.from_json(
      mapreduce_state.writer_state)
  files.finalize(state.filename)
  state.filename = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(state.filename))
  mapreduce_state.writer_state = state.to_json()

def createMockData(self, url_count, shard):
  blob_keys = []
  for num in range(shard):
    file_name = "myblob_%d" % num
    urls = "\n".join(["http://test_url_%d.com" % i
                      for i in range(url_count)])
    file_path = files.blobstore.create("text/plain", file_name)
    with files.open(file_path, 'a') as fp:
      fp.write(urls)
    files.finalize(file_path)
    blob_key = files.blobstore.get_blob_key(file_path)
    file_name = files.blobstore.get_file_name(blob_key)
    blob_keys.append(str(file_name))
  return blob_keys

def finalize_job(cls, mapreduce_state):
  """Finalize job-level writer state.

  Args:
    mapreduce_state: an instance of model.MapreduceState describing current
      job. State can be modified during finalization.
  """
  finalized_filenames = []
  for filename in mapreduce_state.writer_state["filenames"]:
    files.finalize(filename)
    finalized_filenames.append(
        files.blobstore.get_file_name(
            files.blobstore.get_blob_key(filename)))
  mapreduce_state.writer_state = {"filenames": finalized_filenames}

def finalize(self, ctx, shard_number):
  """Finalize writer shard-level state.

  Args:
    ctx: an instance of context.Context.
    shard_number: shard number as integer.
  """
  mapreduce_spec = ctx.mapreduce_spec
  output_sharding = _get_output_sharding(mapper_spec=mapreduce_spec.mapper)
  if output_sharding == self.OUTPUT_SHARDING_INPUT_SHARDS:
    # Finalize our file because we're responsible for it.
    # Do it here and not in finalize_job to spread out finalization
    # into multiple tasks.
    files.finalize(self._filename)

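# Sketch of the _get_output_sharding helper used above and in the
# finalize_job methods below. This is an assumption, not taken from this
# file: the exact parameter name, default value, and keyword handling may
# differ in the real writer. The idea is that it reads the mapper's
# "output_sharding" parameter (from either the mapreduce state or the mapper
# spec) and normalizes it to lower case.
def _get_output_sharding(mapreduce_state=None, mapper_spec=None):
  """Returns the normalized output sharding mode for the job."""
  if mapper_spec is None:
    mapper_spec = mapreduce_state.mapreduce_spec.mapper
  return mapper_spec.params.get("output_sharding", "none").lower()
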
def callback(self, **kwargs):
  if "error" in kwargs:
    self.abort("Error from shuffle service: %s" % kwargs["error"])
    return

  output_files = self.outputs._output_files.value
  for filename in output_files:
    files.finalize(filename)

  finalized_file_names = []
  for filename in output_files:
    finalized_file_names.append(
        files.blobstore.get_file_name(
            files.blobstore.get_blob_key(filename)))
  self.complete(finalized_file_names)

def testPartialRecords(self):
  """Test merging into partial key values."""
  try:
    self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
    # force max values count to extremely low value.
    shuffler._MergePipeline._MAX_VALUES_COUNT = 1

    input_data = [('1', 'a'), ('2', 'b'), ('3', 'c')]
    input_data.sort()

    input_file = files.blobstore.create()

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = file_service_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    p = TestMergePipeline([input_file, input_file, input_file])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = TestMergePipeline.from_id(p.pipeline_id)

    output_file = p.outputs.default.value[0]
    output_data = []
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

    expected_data = [
        ('1', ['a'], True),
        ('1', ['a'], True),
        ('1', ['a'], False),
        ('2', ['b'], True),
        ('2', ['b'], True),
        ('2', ['b'], False),
        ('3', ['c'], True),
        ('3', ['c'], True),
        ('3', ['c'], False),
        ]
    self.assertEquals([str(e) for e in expected_data], output_data)
  finally:
    shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count

def testPartialRecords(self):
  """Test merging into partial key values."""
  try:
    self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
    # force max values count to extremely low value.
    shuffler._MergePipeline._MAX_VALUES_COUNT = 1

    input_data = [("1", "a"), ("2", "b"), ("3", "c")]
    input_data.sort()

    input_file = files.blobstore.create()

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = file_service_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    p = TestMergePipeline([input_file, input_file, input_file])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = TestMergePipeline.from_id(p.pipeline_id)

    output_file = p.outputs.default.value[0]
    output_data = []
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

    expected_data = [
        ("1", ["a"], True),
        ("1", ["a"], True),
        ("1", ["a"], False),
        ("2", ["b"], True),
        ("2", ["b"], True),
        ("2", ["b"], False),
        ("3", ["c"], True),
        ("3", ["c"], True),
        ("3", ["c"], False),
        ]
    self.assertEquals([str(e) for e in expected_data], output_data)
  finally:
    shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count

def createMockData(self, data):
  """Create mock data for FetchContentPipeline"""
  input_file = files.blobstore.create()
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      key = str(data[0])
      value = str(data[1])
      proto = file_service_pb.KeyValue()
      proto.set_key(key)
      proto.set_value(value)
      w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))
  return input_file

def _sort_records(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new blobstore file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  proto_records = [None] * l

  # TODO(user): demote these log statements.
  logging.info("parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    proto_records[i] = proto

  logging.info("sorting")
  proto_records.sort(cmp=_compare_keys)

  logging.info("writing")
  blob_file_name = (ctx.mapreduce_spec.name + "-" +
                    ctx.mapreduce_id + "-output")
  output_path = files.blobstore.create(
      _blobinfo_uploaded_filename=blob_file_name)
  with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
    for proto in proto_records:
      pool.append(proto.Encode())

  logging.info("finalizing")
  files.finalize(output_path)
  output_path = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(output_path))

  entity = _OutputFile(key_name=output_path,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()

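# Sketch of the _compare_keys comparator that _sort_records passes to
# sort(). This is an assumption, not taken from this file: since the list
# being sorted holds parsed KeyValue protos, the comparator presumably
# orders them by their key field using Python 2's cmp().
def _compare_keys(proto1, proto2):
  """Compares two KeyValue protos by key."""
  return cmp(proto1.key(), proto2.key())
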
def requestCrawlerOutput(self):
  logging.debug("Request Crawler Output")

  file_content = 'DEFAULT_CONTENT'
  url = "http://web.ist.utl.pt/ist163512/crawler.txt"
  result = urllib2.urlopen(url)
  file_content = result.read()

  file_name = files.blobstore.create(mime_type='application/octet-stream')
  self.crawler_file_name = file_name

  with files.open(file_name, 'a') as f:
    f.write("%s" % file_content)
  files.finalize(file_name)
  logging.debug("File saved successfully")

  key = files.blobstore.get_blob_key(file_name)
  return key

def finalize_job(cls, mapreduce_state):
  """Finalize job-level writer state.

  Args:
    mapreduce_state: an instance of model.MapreduceState describing current
      job.
  """
  state = cls._State.from_json(mapreduce_state.writer_state)
  output_sharding = _get_output_sharding(mapreduce_state=mapreduce_state)
  finalized_filenames = []
  for filename in state.filenames:
    if output_sharding != cls.OUTPUT_SHARDING_INPUT_SHARDS:
      files.finalize(filename)
    finalized_filenames.append(
        files.blobstore.get_file_name(
            files.blobstore.get_blob_key(filename)))

  state.filenames = finalized_filenames
  mapreduce_state.writer_state = state.to_json()

def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new blobstore file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  blob_file_name = (ctx.mapreduce_spec.name + "-" +
                    ctx.mapreduce_id + "-output")
  output_path = files.blobstore.create(
      _blobinfo_uploaded_filename=blob_file_name)
  with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  files.finalize(output_path)
  output_path = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(output_path))

  entity = _OutputFile(key_name=output_path,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()

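# Sketch of the _OutputFile datastore model that _sort_records and
# _sort_records_map write to. This is an assumption, not taken from this
# file (the kind name in particular is illustrative): each entity records
# one finalized output filename, parented under a per-job root key so all
# files for a shuffle job can be queried together.
from google.appengine.ext import db


class _OutputFile(db.Model):
  """Entity to store output filenames of pending output files."""

  @classmethod
  def kind(cls):
    return "_AE_MR_OutputFile"

  @classmethod
  def get_root_key(cls, job_id):
    return db.Key.from_path(cls.kind(), job_id)
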
def get(self):
  filekey = self.request.get("filekey")
  #key = ManageCrawlerOutput.requestCrawlerOutput(self)
  # str_key = str(key)
  self.response.headers['Content-Type'] = 'text/plain'

  query = db.GqlQuery("SELECT * FROM WebSiteInfo")

  file_name = files.blobstore.create(mime_type='application/octet-stream')
  with files.open(file_name, 'a') as f:
    for q in query:
      w = WebSiteInfo()
      w = q
      title = str(w.title)
      title = re.sub(r" ", "", title)
      #title.replace(" ","-")
      self.response.out.write("parsed title %s\n" % title)
      f.write("%s %s\n" % (title, w.siteLinks))
  files.finalize(file_name)
  logging.debug("File saved successfully")

  key = files.blobstore.get_blob_key(file_name)
  # info = blobstore.get(key)
  # reader = info.open()
  # file_content = reader.read(501900)
  # self.response.out.write("\n\n")
  # self.response.out.write("%s" % file_content)

  pipeline = PageRankPipeline(filekey, str(key))
  pipeline.start()

  self.redirect(pipeline.base_path + "/status?root=" + pipeline.pipeline_id)

def finalize_job(cls, mapreduce_state):
  """Finalize job-level writer state.

  Args:
    mapreduce_state: an instance of model.MapreduceState describing current
      job.
  """
  state = cls._State.from_json(mapreduce_state.writer_state)
  output_sharding = cls._get_output_sharding(mapreduce_state=mapreduce_state)
  filesystem = cls._get_filesystem(mapreduce_state.mapreduce_spec.mapper)
  finalized_filenames = []
  for create_filename, request_filename in itertools.izip(
      state.filenames, state.request_filenames):
    if output_sharding != cls.OUTPUT_SHARDING_INPUT_SHARDS:
      files.finalize(create_filename)
    finalized_filenames.append(cls._get_finalized_filename(
        filesystem, create_filename, request_filename))

  state.filenames = finalized_filenames
  state.request_filenames = []
  mapreduce_state.writer_state = state.to_json()

def finalize_job(cls, mapreduce_state):
  """Finalize job-level writer state.

  Args:
    mapreduce_state: an instance of model.MapreduceState describing current
      job.
  """
  state = cls._State.from_json(mapreduce_state.writer_state)
  output_sharding = _get_output_sharding(mapreduce_state=mapreduce_state)
  finalized_filenames = []
  for filename in state.filenames:
    if output_sharding != cls.OUTPUT_SHARDING_INPUT_SHARDS:
      files.finalize(filename)
    finalized_filenames.append(
        files.blobstore.get_file_name(
            files.blobstore.get_blob_key(filename)))

  state.filenames = finalized_filenames
  mapreduce_state.writer_state = state.to_json()

def testRecordsReader(self):
  """End-to-end test for records reader."""
  input_file = files.blobstore.create()
  input_data = [str(i) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for record in input_data:
        w.write(record)
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".TestHandler",
      "mapreduce.input_readers.RecordsReader",
      {"file": input_file},
      shard_count=4,
      base_path="/mapreduce_base_path")

  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(100, len(TestHandler.processed_entites))

def finalize_job(cls, mapreduce_state):
  """Finalize job-level writer state.

  Args:
    mapreduce_state: an instance of model.MapreduceState describing current
      job.
  """
  state = cls._State.from_json(mapreduce_state.writer_state)
  output_sharding = cls._get_output_sharding(
      mapreduce_state=mapreduce_state)
  filesystem = cls._get_filesystem(mapreduce_state.mapreduce_spec.mapper)
  finalized_filenames = []
  for create_filename, request_filename in itertools.izip(
      state.filenames, state.request_filenames):
    if output_sharding != cls.OUTPUT_SHARDING_INPUT_SHARDS:
      files.finalize(create_filename)
    finalized_filenames.append(
        cls._get_finalized_filename(filesystem,
                                    create_filename,
                                    request_filename))

  state.filenames = finalized_filenames
  state.request_filenames = []
  mapreduce_state.writer_state = state.to_json()

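# Sketch of the _get_finalized_filename helper the two finalize_job variants
# above rely on. This is an assumption, not taken from this file; the
# filesystem identifiers and the Google Storage path shape are illustrative.
# The intent is that blobstore-backed files exchange their writable name for
# the permanent name derived from the blob key, while Google Storage files
# keep the path that was originally requested.
def _get_finalized_filename(cls, filesystem, create_filename,
                            request_filename):
  """Returns the publicly readable name for a finalized file."""
  if filesystem == "blobstore":
    return files.blobstore.get_file_name(
        files.blobstore.get_blob_key(create_filename))
  elif filesystem == "gs":
    return "/gs/%s" % request_filename
  else:
    raise ValueError("Unsupported filesystem: %s" % filesystem)
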
def testShuffleFiles(self):
  """Test shuffling multiple files."""
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler.ShufflePipeline(
      "testjob", [input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

  output_files = p.outputs.default.value
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        proto = file_service_pb.KeyValues()
        proto.ParseFromString(record)
        output_data.append((proto.key(), proto.value_list()))
  output_data.sort()

  expected_data = sorted(
      [(str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
  self.assertEquals(expected_data, output_data)

def createGSData(self, file_path, data):
  write_path = files.gs.create(file_path, mime_type='text/plain',
                               acl='public-read')
  with files.open(write_path, 'a') as fp:
    fp.write(data)
  files.finalize(write_path)

def testReadPartial(self):
  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      # First record is full
      proto = file_service_pb.KeyValues()
      proto.set_key("key1")
      proto.value_list().extend(["a", "b"])
      w.write(proto.Encode())
      # Second record is partial
      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["a", "b"])
      proto.set_partial(True)
      w.write(proto.Encode())

      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["c", "d"])
      w.write(proto.Encode())

  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  self.assertEquals(
      [("key1", ["a", "b"]),
       input_readers.ALLOW_CHECKPOINT,
       ("key2", ["a", "b", "c", "d"])],
      list(reader))

  # now test state serialization
  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  i = reader.__iter__()
  self.assertEquals(
      {"position": 0,
       "current_values": None,
       "current_key": None,
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(("key1", ["a", "b"]), i.next())
  self.assertEquals(
      {"position": 19,
       "current_values": None,
       "current_key": None,
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
  self.assertEquals(
      {"position": 40,
       "current_values": ["a", "b"],
       "current_key": "key2",
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
  self.assertEquals(
      {"position": 59,
       "current_values": None,
       "current_key": None,
       "filenames": [input_file]},
      reader.to_json())

  try:
    i.next()
    self.fail("Exception expected")
  except StopIteration:
    # expected
    pass

  # now do test deserialization at every moment.
  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  i = reader.__iter__()
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(("key1", ["a", "b"]), i.next())
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  try:
    i.next()
    self.fail("Exception expected")
  except StopIteration:
    # expected
    pass

def testSingleRequest(self):
  """Tests when a key can be handled during a single request."""
  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      # First record is full
      proto = file_service_pb.KeyValues()
      proto.set_key("key1")
      proto.value_list().extend(["a", "b"])
      w.write(proto.Encode())
      # Second record is partial
      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["a", "b"])
      proto.set_partial(True)
      w.write(proto.Encode())

      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["c", "d"])
      w.write(proto.Encode())

  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  self.assertEquals(
      [("key1", ["a", "b"]),
       input_readers.ALLOW_CHECKPOINT,
       ("key2", ["a", "b", "c", "d"])],
      list(reader))

  # now test state serialization
  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  i = reader.__iter__()
  self.assertEquals(
      {"position": 0,
       "current_values": "Ti4=",
       "current_key": "Ti4=",
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(("key1", ["a", "b"]), i.next())
  self.assertEquals(
      {"position": 19,
       "current_values": "Ti4=",
       "current_key": "Ti4=",
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
  self.assertEquals(
      {"position": 40,
       "current_values": "KGxwMApTJ2EnCnAxCmFTJ2InCnAyCmEu",
       "current_key": "UydrZXkyJwpwMAou",
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
  self.assertEquals(
      {"position": 59,
       "current_values": "Ti4=",
       "current_key": "Ti4=",
       "filenames": [input_file]},
      reader.to_json())

  try:
    i.next()
    self.fail("Exception expected")
  except StopIteration:
    # expected
    pass

  # now do test deserialization at every moment.
  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  i = reader.__iter__()
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(("key1", ["a", "b"]), i.next())
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  try:
    i.next()
    self.fail("Exception expected")
  except StopIteration:
    # expected
    pass