def get(self, filename):
    # Build the /gs/<object> path understood by the Files API and open it.
    path = '/gs/%s' % urllib.quote(filename)
    fp = files.BufferedFile(path)
    # Read up to ~2 GB in one call and serve the bytes as a download.
    data = fp.read(1000 * 1024 * 1024 * 2)
    self.response.content_type = 'application/octet-stream'
    self.response.write(data)
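For context, this handler assumes the legacy App Engine Python 2.7 runtime with webapp2 and the deprecated Files API. A minimal sketch of the surrounding scaffolding follows; the class name and URL pattern are illustrative and do not appear in the original snippet.

import urllib

import webapp2
from google.appengine.api import files


class GcsServeHandler(webapp2.RequestHandler):  # hypothetical name
    def get(self, filename):
        # Same body as the example above: open /gs/<object> with
        # files.BufferedFile and stream its bytes back to the client.
        path = '/gs/%s' % urllib.quote(filename)
        data = files.BufferedFile(path).read(1000 * 1024 * 1024 * 2)
        self.response.content_type = 'application/octet-stream'
        self.response.write(data)


# Hypothetical routing: the capture group becomes the `filename` argument.
app = webapp2.WSGIApplication([(r'/serve/(.+)', GcsServeHandler)])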
Example #2
    def __iter__(self):
        """Iterate over records in input files.

        self._offsets is always correctly updated so that stopping iterations
        doesn't skip records and doesn't read the same record twice.
        """
        ctx = context.get()
        mapper_spec = ctx.mapreduce_spec.mapper
        shard_number = ctx.shard_state.shard_number
        filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

        if len(filenames) != len(self._offsets):
            raise Exception("Files list and offsets do not match.")

        # Heap of (key, value, index, reader) tuples, one entry per input file.
        readers = []

        for (i, filename) in enumerate(filenames):
            offset = self._offsets[i]
            reader = records.RecordsReader(files.BufferedFile(filename))
            reader.seek(offset)
            readers.append((None, None, i, reader))

        # current_result accumulates (key, [values]) for the current smallest key.
        current_result = None
        while readers:
            (key, value, index, reader) = readers[0]

            if key is not None:
                if current_result and key != current_result[0]:

                    yield current_result
                if not current_result or key != current_result[0]:
                    current_result = (key, [])
                current_result[1].append(value)

            try:
                self._offsets[index] = reader.tell()
                start_time = time.time()
                binary_record = reader.read()

                if context.get():
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_BYTES,
                        len(binary_record))(context.get())
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_MSEC,
                        int((time.time() - start_time) * 1000))(context.get())
                proto = file_service_pb.KeyValue()
                proto.ParseFromString(binary_record)

                # Put the freshly read record back onto the heap.
                heapq.heapreplace(readers,
                                  (proto.key(), proto.value(), index, reader))
            except EOFError:
                # This reader is exhausted; drop it from the heap.
                heapq.heappop(readers)

        if current_result:
            yield current_result
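The loop above is a k-way merge: each open reader contributes its current record to a heap, heapq keeps the smallest key on top, and heapreplace/heappop advance or retire readers as they are consumed. A self-contained sketch of the same pattern over plain, already-sorted Python streams (the function name is illustrative and not part of the mapreduce library):

import heapq

def k_way_merge(sorted_streams):
    """Merge already-sorted (key, value) streams, grouping values by key.

    Simplified sketch of the merge done by __iter__ above; it yields plain
    (key, [values]) pairs instead of the reader's result tuples.
    """
    iters = [iter(s) for s in sorted_streams]
    heap = []
    for index, it in enumerate(iters):
        try:
            key, value = next(it)
            heap.append((key, value, index))
        except StopIteration:
            pass
    heapq.heapify(heap)

    current = None
    while heap:
        key, value, index = heap[0]
        if current and key != current[0]:
            # New key on top of the heap: emit the finished group.
            yield current
            current = None
        if current is None:
            current = (key, [])
        current[1].append(value)
        try:
            next_key, next_value = next(iters[index])
            heapq.heapreplace(heap, (next_key, next_value, index))
        except StopIteration:
            heapq.heappop(heap)
    if current:
        yield current

# Example: list(k_way_merge([[('a', 1), ('c', 3)], [('a', 2), ('b', 5)]]))
# -> [('a', [1, 2]), ('b', [5]), ('c', [3])]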
Example #3
  def __iter__(self):
    """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx._shard_state.shard_number
    filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(filenames) != len(self._offsets):
      raise Exception("Files list and offsets do not match.")

    # Heap of (key, value, index, reader) tuples, one entry per input file.
    readers = []

    # Initialize heap
    for (i, filename) in enumerate(filenames):
      offset = self._offsets[i]
      reader = records.RecordsReader(files.BufferedFile(filename))
      reader.seek(offset)
      readers.append((None, None, i, reader))

    # Read records from heap and merge values with the same key.

    # current_result is yielded and consumed by _merge_map.
    # current_result = [key, values, is_partial]
    current_result = None
    current_count = 0
    current_size = 0
    while readers:
      (key, value, index, reader) = readers[0]

      if key is not None:
        current_count += 1
        current_size += len(value)

        should_yield = False
        if current_result:
          if key != current_result[0]:
            # New key encountered
            should_yield = True
          elif (self._max_values_count != -1 and
              current_count >= self._max_values_count):
            # Maximum number of values encountered.
            current_result[2] = True
            should_yield = True
          elif (self._max_values_size != -1 and
              current_size >= self._max_values_size):
            # Maximum size of values encountered.
            current_result[2] = True
            should_yield = True

        if should_yield:
          # New key encountered or maximum count hit. Yield current key.
          yield current_result
        if not current_result or should_yield:
          current_result = [key, [], False]
          current_count = 0
          current_size = 0
        current_result[1].append(value)

      # Read next key/value from reader.
      try:
        self._offsets[index] = reader.tell()
        start_time = time.time()
        binary_record = reader.read()
        # update counters
        if context.get():
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_BYTES,
              len(binary_record))(context.get())
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_MSEC,
              int((time.time() - start_time) * 1000))(context.get())
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        # Put read data back into heap.
        heapq.heapreplace(readers,
                          (proto.key(), proto.value(), index, reader))
      except EOFError:
        heapq.heappop(readers)

    # Yield leftovers.
    if current_result:
      yield current_result
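Each yielded result is a [key, values, is_partial] list: when a batch is cut off by _max_values_count or _max_values_size, is_partial is True and further batches for the same key follow, with the final batch for a key carrying is_partial False. A hypothetical consumer that re-joins such split batches might look like this (illustrative only; the library's real consumer is the shuffler's _merge_map):

def merge_values(merged_iterator):
    """Re-join value batches that were split by the count/size limits.

    Hypothetical helper, assuming the [key, values, is_partial] results
    produced by the reader above; yields one (key, all_values) per key.
    """
    buffered = []
    for key, values, is_partial in merged_iterator:
        buffered.extend(values)
        if is_partial:
            # More values for this key arrive in the next batch.
            continue
        yield key, buffered
        buffered = []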