def __next__(self):
    """Return the next input from this input reader, a record.

    Records are read from the current file handle. When that file is
    exhausted (the record reader raises EOFError), the handle is dropped
    and the next one is requested from the parent class.

    Returns:
      The next input from this input reader in the form of a record read
      from a LevelDB file.

    Raises:
      StopIteration: The ordered set of records has been exhausted.
    """
    while True:
        if getattr(self, "_cur_handle", None) is None:
            # Call __next__ explicitly: next(super(...)) fails on Python 3
            # because a super proxy is not itself an iterator. Any
            # StopIteration raised by the parent propagates to the caller.
            self._cur_handle = super(GCSRecordInputReader, self).__next__()
        if getattr(self, "_record_reader", None) is None:
            self._record_reader = records.RecordsReader(self._cur_handle)

        start_time = time.time()
        try:
            content = self._record_reader.read()
        except EOFError:
            # Current file is finished; loop around to fetch the next handle.
            self._cur_handle = None
            self._record_reader = None
            continue

        # Scale to milliseconds *before* truncating, otherwise every
        # sub-second read would be counted as 0 ms.
        elapsed_msec = int((time.time() - start_time) * 1000)
        self._slice_ctx.incr(self.COUNTER_IO_READ_BYTE, len(content))
        self._slice_ctx.incr(self.COUNTER_IO_READ_MSEC, elapsed_msec)
        return content
def __iter__(self):
    """Iterate over records in input files, merging values that share a key.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.

    Raises:
      Exception: when Files list and offsets do not match.

    Yields:
      Lists of the form [key, values, is_partial]. ``values`` collects the
      values of consecutively merged records with the same key.
      ``is_partial`` is set to True when the group is emitted early because
      self._max_values_count or self._max_values_size was reached (a limit
      of -1 disables that check).
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx._shard_state.shard_number
    filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(filenames) != len(self._offsets):
        raise Exception("Files list and offsets do not match.")

    # Heap of (has_key, key, value, index, reader) entries.
    #
    # The leading has_key flag keeps not-yet-read placeholders (key None)
    # ahead of real records without ever ordering None against a key,
    # which raises TypeError on Python 3. ``index`` is unique per reader,
    # so comparisons never reach the reader object itself.
    readers = []
    for (i, filename) in enumerate(filenames):
        offset = self._offsets[i]
        reader = records.RecordsReader(
            cloudstorage.open(filename,
                              read_buffer_size=self.GCS_BUFFER_SIZE))
        reader.seek(offset)
        # Placeholders in ascending index order already form a valid heap.
        readers.append((False, None, None, i, reader))

    # current_result is [key, values, is_partial], or None before the
    # first record has been seen.
    current_result = None
    current_count = 0
    current_size = 0
    while readers:
        (_, key, value, index, reader) = readers[0]

        if key is not None:
            current_count += 1
            current_size += len(value)

            should_yield = False
            if current_result:
                if key != current_result[0]:
                    # New key encountered.
                    should_yield = True
                elif (self._max_values_count != -1 and
                      current_count >= self._max_values_count):
                    # Maximum number of values reached: emit a partial group.
                    current_result[2] = True
                    should_yield = True
                elif (self._max_values_size != -1 and
                      current_size >= self._max_values_size):
                    # Maximum size of values reached: emit a partial group.
                    current_result[2] = True
                    should_yield = True

            if should_yield:
                yield current_result
            if not current_result or should_yield:
                current_result = [key, [], False]
                current_count = 0
                current_size = 0
            current_result[1].append(value)

        # Read the next key/value from the reader at the top of the heap.
        try:
            # Record the position before reading so that a resumed
            # iteration re-reads this record instead of skipping it.
            self._offsets[index] = reader.tell()
            start_time = time.time()
            binary_record = reader.read()
            active_ctx = context.get()
            if active_ctx:
                operation.counters.Increment(
                    input_readers.COUNTER_IO_READ_BYTES,
                    len(binary_record))(active_ctx)
                operation.counters.Increment(
                    input_readers.COUNTER_IO_READ_MSEC,
                    int((time.time() - start_time) * 1000))(active_ctx)
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            record_key = proto.key()
            # Put the freshly read record back into the heap.
            heapq.heapreplace(
                readers,
                (record_key is not None, record_key, proto.value(),
                 index, reader))
        except EOFError:
            # This file is exhausted; drop its reader from the merge.
            heapq.heappop(readers)

    # Yield leftovers.
    if current_result:
        yield current_result