def testSortFile(self):
  """Test sorting a file."""
  bucket_name = "testbucket"
  test_filename = "testfile"
  full_filename = "/%s/%s" % (bucket_name, test_filename)

  input_data = [(str(i), "_" + str(i)) for i in range(100)]

  with cloudstorage.open(full_filename, mode="w") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = kv_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())

  p = shuffler._SortChunksPipeline("testjob", bucket_name, [[full_filename]])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

  input_data.sort()
  output_files = p.outputs.default.value[0]
  output_data = []
  for output_file in output_files:
    with cloudstorage.open(output_file) as f:
      for binary_record in records.RecordsReader(f):
        proto = kv_pb.KeyValue()
        proto.ParseFromString(binary_record)
        output_data.append((proto.key(), proto.value()))

  self.assertEquals(input_data, output_data)
  self.assertEquals(1, len(self.emails))
def write(self, data):
  """Write data.

  Args:
    data: actual data yielded from handler. Type is writer-specific.
  """
  ctx = context.get()
  if len(data) != 2:
    logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                  len(data), data)

  try:
    key = str(data[0])
    value = str(data[1])
  except TypeError:
    logging.error("Expecting a tuple, but got %s: %s",
                  data.__class__.__name__, data)

  file_index = key.__hash__() % len(self._filehandles)

  # Work-around: Since we don't have access to the context in the to_json()
  # function, but we need to flush each pool before we serialize the
  # filehandle, we rely on a member variable instead of using context for
  # pool management.
  pool = self._pools[file_index]
  if pool is None:
    filehandle = self._filehandles[file_index]
    pool = output_writers.GCSRecordsPool(filehandle=filehandle, ctx=ctx)
    self._pools[file_index] = pool

  proto = kv_pb.KeyValue()
  proto.set_key(key)
  proto.set_value(value)
  pool.append(proto.Encode())
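# A minimal sketch (illustrative only, not part of the writer above) of the
# partitioning invariant that write() relies on: the target file index is
# derived from the key's hash, so every record with a given key is routed to
# the same output file regardless of when it is written.
def _file_index_for_key_sketch(key, num_files):
  """Returns a stable file index in [0, num_files) for the given key."""
  return str(key).__hash__() % num_files

# Example: repeated calls with one key always pick the same file, so the later
# sort/merge phases can assume all values for a key live in a single stream.
#   _file_index_for_key_sketch("42", 4) == _file_index_for_key_sketch("42", 4)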
def testMergeFiles(self):
  """Test merging multiple files."""
  input_data = [(str(i), "_" + str(i)) for i in range(100)]
  input_data.sort()

  bucket_name = "testbucket"
  test_filename = "testfile"
  full_filename = "/%s/%s" % (bucket_name, test_filename)

  with cloudstorage.open(full_filename, mode="w") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = kv_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())

  p = TestMergePipeline(bucket_name,
                        [full_filename, full_filename, full_filename])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = TestMergePipeline.from_id(p.pipeline_id)

  output_file = p.outputs.default.value[0]
  output_data = []
  with cloudstorage.open(output_file) as f:
    for record in records.RecordsReader(f):
      output_data.append(record)

  expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
  self.assertEquals(expected_data, output_data)
  self.assertEquals(1, len(self.emails))
def testHashingMultipleFiles(self):
  """Test hashing files."""
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  bucket_name = "testbucket"
  test_filename = "testfile"
  full_filename = "/%s/%s" % (bucket_name, test_filename)

  with cloudstorage.open(full_filename, mode="w") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = kv_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())

  p = shuffler._HashPipeline(
      "testjob", bucket_name,
      [full_filename, full_filename, full_filename])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler._HashPipeline.from_id(p.pipeline_id)

  list_of_output_files = p.outputs.default.value
  output_data = []
  for output_files in list_of_output_files:
    for output_file in output_files:
      with cloudstorage.open(output_file) as f:
        for binary_record in records.RecordsReader(f):
          proto = kv_pb.KeyValue()
          proto.ParseFromString(binary_record)
          output_data.append((proto.key(), proto.value()))

  output_data.sort()
  self.assertEquals(300, len(output_data))
  for i in range(len(input_data)):
    self.assertEquals(input_data[i], output_data[(3 * i)])
    self.assertEquals(input_data[i], output_data[(3 * i) + 1])
    self.assertEquals(input_data[i], output_data[(3 * i) + 2])
  self.assertEquals(1, len(self.emails))
def _hashing_map(binary_record):
  """A map function used in hash phase.

  Reads KeyValue from binary record.

  Args:
    binary_record: The binary record.

  Yields:
    The (key, value).
  """
  proto = kv_pb.KeyValue()
  proto.ParseFromString(binary_record)
  yield (proto.key(), proto.value())
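# A small sketch of the encode/parse round trip that _hashing_map relies on,
# using the same kv_pb.KeyValue API as the surrounding code. The helper name
# is illustrative and not part of the library.
def _hashing_map_roundtrip_sketch(key, value):
  """Encodes a (key, value) pair and feeds it back through _hashing_map."""
  proto = kv_pb.KeyValue()
  proto.set_key(key)
  proto.set_value(value)
  binary_record = proto.Encode()
  # Expected result: [(key, value)].
  return list(_hashing_map(binary_record))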
def testPartialRecords(self):
  """Test merging into partial key values."""
  try:
    self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
    # Force max values count to an extremely low value.
    shuffler._MergePipeline._MAX_VALUES_COUNT = 1

    input_data = [("1", "a"), ("2", "b"), ("3", "c")]
    input_data.sort()

    bucket_name = "testbucket"
    test_filename = "testfile"
    full_filename = "/%s/%s" % (bucket_name, test_filename)

    with cloudstorage.open(full_filename, mode="w") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = kv_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())

    p = TestMergePipeline(
        bucket_name, [full_filename, full_filename, full_filename])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = TestMergePipeline.from_id(p.pipeline_id)

    output_file = p.outputs.default.value[0]
    output_data = []
    with cloudstorage.open(output_file) as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

    expected_data = [
        ("1", ["a"], True),
        ("1", ["a"], True),
        ("1", ["a"], False),
        ("2", ["b"], True),
        ("2", ["b"], True),
        ("2", ["b"], False),
        ("3", ["c"], True),
        ("3", ["c"], True),
        ("3", ["c"], False),
        ]
    self.assertEquals([str(e) for e in expected_data], output_data)
  finally:
    shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count

  self.assertEquals(1, len(self.emails))
def write(self, data):
  """Write a single (key, value) pair as an encoded KeyValue record."""
  if len(data) != 2:
    logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                  len(data), data)

  try:
    key = str(data[0])
    value = str(data[1])
  except TypeError:
    logging.error("Expecting a tuple, but got %s: %s",
                  data.__class__.__name__, data)

  proto = kv_pb.KeyValue()
  proto.set_key(key)
  proto.set_value(value)
  GoogleCloudStorageRecordOutputWriter.write(self, proto.Encode())
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into a new GCS file. Creates an _OutputFile entity to record the
  resulting file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = kv_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  mapper_spec = ctx.mapreduce_spec.mapper
  params = input_readers._get_params(mapper_spec)
  bucket_name = params.get("bucket_name")
  filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
              ctx.shard_id + "-" + str(int(time.time())))
  full_filename = "/%s/%s" % (bucket_name, filename)
  filehandle = cloudstorage.open(full_filename, mode="w")
  with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  filehandle.close()

  entity = _OutputFile(key_name=full_filename,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
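# _compare_keys is referenced above but not shown in this section. Below is a
# minimal sketch of a comparator compatible with key_records.sort(cmp=...),
# assuming each element is a (key, serialized_record) pair ordered by key
# only; it is an illustration, not necessarily the library's implementation.
def _compare_keys_sketch(key_record1, key_record2):
  """Compares two (key, record) pairs by key for use with sort(cmp=...)."""
  return cmp(key_record1[0], key_record2[0])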
def testShuffleFiles(self):
  """Test shuffling multiple files."""
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  bucket_name = "testbucket"
  test_filename = "testfile"
  full_filename = "/%s/%s" % (bucket_name, test_filename)

  with cloudstorage.open(full_filename, mode="w") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = kv_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())

  p = shuffler.ShufflePipeline(
      "testjob", {"bucket_name": bucket_name},
      [full_filename, full_filename, full_filename])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

  output_files = p.outputs.default.value
  output_data = []
  for output_file in output_files:
    with cloudstorage.open(output_file) as f:
      for record in records.RecordsReader(f):
        proto = kv_pb.KeyValues()
        proto.ParseFromString(record)
        output_data.append((proto.key(), proto.value_list()))
  output_data.sort()

  expected_data = sorted([
      (str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
  self.assertEquals(expected_data, output_data)
  self.assertEquals(1, len(self.emails))
def __iter__(self):
  """Iterate over records in input files.

  self._offsets is always correctly updated so that stopping iterations
  doesn't skip records and doesn't read the same record twice.

  Raises:
    Exception: when the files list and offsets do not match.

  Yields:
    The result.
  """
  ctx = context.get()
  mapper_spec = ctx.mapreduce_spec.mapper
  shard_number = ctx._shard_state.shard_number
  filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

  if len(filenames) != len(self._offsets):
    raise Exception("Files list and offsets do not match.")

  # Heap with (Key, Value, Index, reader) pairs.
  readers = []

  # Initialize heap.
  for (i, filename) in enumerate(filenames):
    offset = self._offsets[i]
    # TODO(user): Shrinking the buffer size is a workaround until
    # a tiered/segmented merge is implemented.
    reader = records.RecordsReader(
        cloudstorage.open(filename, read_buffer_size=self.GCS_BUFFER_SIZE))
    reader.seek(offset)
    readers.append((None, None, i, reader))

  # Read records from the heap and merge values with the same key.

  # current_result is yielded and consumed by _merge_map.
  # current_result = (key, value, is_partial)
  current_result = None
  current_count = 0
  current_size = 0
  while readers:
    (key, value, index, reader) = readers[0]

    if key is not None:
      current_count += 1
      current_size += len(value)

      should_yield = False
      if current_result:
        if key != current_result[0]:
          # New key encountered.
          should_yield = True
        elif (self._max_values_count != -1 and
              current_count >= self._max_values_count):
          # Maximum number of values encountered.
          current_result[2] = True
          should_yield = True
        elif (self._max_values_size != -1 and
              current_size >= self._max_values_size):
          # Maximum size of values encountered.
          current_result[2] = True
          should_yield = True

      if should_yield:
        # New key encountered or maximum count hit. Yield current key.
        yield current_result
      if not current_result or should_yield:
        current_result = [key, [], False]
        current_count = 0
        current_size = 0
      current_result[1].append(value)

    # Read next key/value from reader.
    try:
      self._offsets[index] = reader.tell()
      start_time = time.time()
      binary_record = reader.read()
      # Update counters.
      if context.get():
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_BYTES,
            len(binary_record))(context.get())
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_MSEC,
            int((time.time() - start_time) * 1000))(context.get())
      proto = kv_pb.KeyValue()
      proto.ParseFromString(binary_record)
      # Put read data back into the heap.
      heapq.heapreplace(readers,
                        (proto.key(), proto.value(), index, reader))
    except EOFError:
      heapq.heappop(readers)

  # Yield leftovers.
  if current_result:
    yield current_result
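# A self-contained sketch of the k-way merge idea implemented by __iter__
# above, using plain in-memory key-sorted lists of (key, value) pairs instead
# of RecordsReader streams, GCS files, offset bookkeeping, and partial-result
# limits. All names below are illustrative and not part of the library.
import heapq

def _merge_sorted_streams_sketch(streams):
  """Yields (key, [values]) by heap-merging key-sorted (key, value) streams."""
  iters = [iter(s) for s in streams]
  heap = []
  for index, it in enumerate(iters):
    try:
      key, value = next(it)
      heap.append((key, value, index))
    except StopIteration:
      pass
  heapq.heapify(heap)

  current_key = None
  current_values = []
  while heap:
    key, value, index = heap[0]
    if current_key is not None and key != current_key:
      # New key encountered: emit the previous group.
      yield (current_key, current_values)
      current_values = []
    current_key = key
    current_values.append(value)
    try:
      # Pull the next record from the same stream and restore the heap.
      next_key, next_value = next(iters[index])
      heapq.heapreplace(heap, (next_key, next_value, index))
    except StopIteration:
      heapq.heappop(heap)

  # Yield leftovers.
  if current_key is not None:
    yield (current_key, current_values)

# Example:
#   list(_merge_sorted_streams_sketch([[("a", "1"), ("b", "2")], [("a", "3")]]))
#   -> [("a", ["1", "3"]), ("b", ["2"])]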