示例#1
0
    def __next__(self):
        """Return the next record from this input reader.

        Handles are obtained one at a time from the parent class. Each handle
        is wrapped in a ``records.RecordsReader`` and read until it raises
        ``EOFError``, at which point the next handle is requested.

        Returns:
          The next record, as returned by ``records.RecordsReader.read``.

        Raises:
          StopIteration: Not raised directly here; presumably propagated from
            the parent reader once it has no more handles (confirm against
            the parent class).
        """
        while True:
            if getattr(self, "_cur_handle", None) is None:
                # The parent's __next__ must be called explicitly:
                # ``next(super(...))`` raises TypeError because ``next()``
                # looks the method up on the ``super`` type itself, which is
                # not an iterator.
                # NOTE(review): assumes the parent class defines __next__.
                self._cur_handle = super(GCSRecordInputReader,
                                         self).__next__()
            if getattr(self, "_record_reader", None) is None:
                self._record_reader = records.RecordsReader(self._cur_handle)

            try:
                start_time = time.time()
                content = self._record_reader.read()
            except EOFError:
                # Current handle is exhausted; drop it and move on to the
                # next one on the following loop iteration.
                self._cur_handle = None
                self._record_reader = None
                continue

            self._slice_ctx.incr(self.COUNTER_IO_READ_BYTE, len(content))
            # Scale to milliseconds *before* truncating to int; truncating
            # first would report 0 ms for every read shorter than a second.
            self._slice_ctx.incr(self.COUNTER_IO_READ_MSEC,
                                 int((time.time() - start_time) * 1000))
            return content
示例#2
0
    def __iter__(self):
        """Iterate over records in input files, grouping values by key.

        One ``records.RecordsReader`` is opened per input file and the
        readers are merged through a heap ordered by record key, so that
        consecutive records with the same key are collected together.

        self._offsets is always correctly updated so that stopping iterations
        doesn't skip records and doesn't read the same record twice.

        Raises:
          Exception: when Files list and offsets do not match.

        Yields:
          Lists of the form ``[key, values, partial]`` where ``values`` is the
          list of values collected for ``key`` and ``partial`` is True when
          the group was cut short because ``self._max_values_count`` or
          ``self._max_values_size`` was reached (-1 disables either limit).
        """
        ctx = context.get()
        mapper_spec = ctx.mapreduce_spec.mapper
        shard_number = ctx._shard_state.shard_number
        filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

        if len(filenames) != len(self._offsets):
            raise Exception("Files list and offsets do not match.")

        # Heap entries are (has_key, key, value, index, reader).
        # The leading boolean keeps the not-yet-read placeholders (key None)
        # ahead of real records without ever order-comparing None against a
        # key, which raises TypeError on Python 3. ``index`` is unique per
        # entry, so the comparison never reaches the reader object.
        readers = []

        for (i, filename) in enumerate(filenames):
            offset = self._offsets[i]

            reader = records.RecordsReader(
                cloudstorage.open(filename,
                                  read_buffer_size=self.GCS_BUFFER_SIZE))
            reader.seek(offset)
            # Placeholders with increasing index already satisfy the heap
            # invariant, so no heapify is needed.
            readers.append((False, None, None, i, reader))

        current_result = None
        current_count = 0
        current_size = 0
        while readers:
            (_, key, value, index, reader) = readers[0]

            if key is not None:
                current_count += 1
                current_size += len(value)

                should_yield = False
                if current_result:
                    if key != current_result[0]:
                        # A new key starts: the previous group is complete.
                        should_yield = True
                    elif (self._max_values_count != -1
                          and current_count >= self._max_values_count):
                        # Value-count limit hit: emit a partial group.
                        current_result[2] = True
                        should_yield = True
                    elif (self._max_values_size != -1
                          and current_size >= self._max_values_size):
                        # Value-size limit hit: emit a partial group.
                        current_result[2] = True
                        should_yield = True

                if should_yield:
                    yield current_result
                if not current_result or should_yield:
                    current_result = [key, [], False]
                    current_count = 0
                    current_size = 0
                current_result[1].append(value)

            try:
                # Record the position *before* reading, so the stored offset
                # points at the record that will sit on the heap (not yet
                # yielded) rather than past it.
                self._offsets[index] = reader.tell()
                start_time = time.time()
                binary_record = reader.read()

                active_ctx = context.get()
                if active_ctx:
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_BYTES,
                        len(binary_record))(active_ctx)
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_MSEC,
                        int((time.time() - start_time) * 1000))(active_ctx)
                proto = file_service_pb.KeyValue()
                proto.ParseFromString(binary_record)

                record_key = proto.key()
                heapq.heapreplace(
                    readers,
                    (record_key is not None, record_key, proto.value(), index,
                     reader))
            except EOFError:
                # This reader is exhausted; remove it from the merge.
                heapq.heappop(readers)

        if current_result:
            yield current_result