Example #1
    def testSortFile(self):
        """Test sorting a file."""
        input_file = files.blobstore.create()

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler._SortChunksPipeline("testjob", [input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for binary_record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
Example #2
  def testSortFile(self):
    """Test sorting a file."""
    bucket_name = "testbucket"
    test_filename = "testfile"
    full_filename = "/%s/%s" % (bucket_name, test_filename)

    input_data = [
        (str(i), "_" + str(i)) for i in range(100)]

    with cloudstorage.open(full_filename, mode="w") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = file_service_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())

    p = shuffler._SortChunksPipeline("testjob", bucket_name, [[full_filename]])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

    input_data.sort()
    output_files = p.outputs.default.value[0]
    output_data = []
    for output_file in output_files:
      with cloudstorage.open(output_file) as f:
        for binary_record in records.RecordsReader(f):
          proto = file_service_pb.KeyValue()
          proto.ParseFromString(binary_record)
          output_data.append((proto.key(), proto.value()))

    self.assertEquals(input_data, output_data)
    self.assertEquals(1, len(self.emails))
Example #3
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = TestMergePipeline([input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with files.open(output_file, "r") as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
        self.assertEquals(expected_data, output_data)
Example #4
  def write(self, data):
    """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
    """
    ctx = context.get()
    if len(data) != 2:
      logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                    len(data), data)

    try:
      key = str(data[0])
      value = str(data[1])
    except TypeError:
      logging.error("Expecting a tuple, but got %s: %s",
                    data.__class__.__name__, data)

    file_index = key.__hash__() % len(self._filenames)
    pool_name = "kv_pool%d" % file_index
    filename = self._filenames[file_index]

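    # Lazily register a RecordsPool for this output file on first use.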
    if ctx.get_pool(pool_name) is None:
      ctx.register_pool(pool_name,
                        output_writers.RecordsPool(filename=filename, ctx=ctx))
    proto = file_service_pb.KeyValue()
    proto.set_key(key)
    proto.set_value(value)
    ctx.get_pool(pool_name).append(proto.Encode())
Example #5
    def write(self, data):
        """Write data.

        Args:
            data: actual data yielded from handler. Type is writer-specific.
        """
        ctx = context.get()
        if len(data) != 2:
            logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                          len(data), data)

        try:
            key = str(data[0])
            value = str(data[1])
        except TypeError:
            logging.error("Expecting a tuple, but got %s: %s",
                          data.__class__.__name__, data)

        file_index = key.__hash__() % len(self._filehandles)

        # Work-around: Since we don't have access to the context in the to_json()
        # function, but we need to flush each pool before we serialize the
        # filehandle, we rely on a member variable instead of using context for
        # pool management.
        pool = self._pools[file_index]
        if pool is None:
            filehandle = self._filehandles[file_index]
            pool = output_writers.GCSRecordsPool(filehandle=filehandle,
                                                 ctx=ctx)
            self._pools[file_index] = pool

        proto = file_service_pb.KeyValue()
        proto.set_key(key)
        proto.set_value(value)
        pool.append(proto.Encode())
Example #6
  def testMergeFiles(self):
    """Test merging multiple files."""
    input_data = [(str(i), "_" + str(i)) for i in range(100)]
    input_data.sort()

    bucket_name = "testbucket"
    test_filename = "testfile"
    full_filename = "/%s/%s" % (bucket_name, test_filename)

    with cloudstorage.open(full_filename, mode="w") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = file_service_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())

    p = TestMergePipeline(bucket_name,
                          [full_filename, full_filename, full_filename])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = TestMergePipeline.from_id(p.pipeline_id)

    output_file = p.outputs.default.value[0]
    output_data = []
    with cloudstorage.open(output_file) as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

    expected_data = [
        str((k, [v, v, v], False)) for (k, v) in input_data]
    self.assertEquals(expected_data, output_data)
    self.assertEquals(1, len(self.emails))
Example #7
def _hashing_map(binary_record):
  """A map function used in hash phase.

  Reads KeyValue from binary record and yields (key, value).
  """
  proto = file_service_pb.KeyValue()
  proto.ParseFromString(binary_record)
  yield (proto.key(), proto.value())
Example #8
    def __iter__(self):
        """Iterate over records in input files.

        self._offsets is always correctly updated so that stopping iterations
        doesn't skip records and doesn't read the same record twice.
        """
        ctx = context.get()
        mapper_spec = ctx.mapreduce_spec.mapper
        shard_number = ctx.shard_state.shard_number
        filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

        if len(filenames) != len(self._offsets):
            raise Exception("Files list and offsets do not match.")

        readers = []
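        # Heap of (key, value, index, reader) tuples; seeded below with one
        # reader per input file, each seeked to its saved offset.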

        for (i, filename) in enumerate(filenames):
            offset = self._offsets[i]
            reader = records.RecordsReader(files.BufferedFile(filename))
            reader.seek(offset)
            readers.append((None, None, i, reader))

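        # Pop records off the heap, merging values that share the same key.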
        current_result = None
        while readers:
            (key, value, index, reader) = readers[0]

            if key is not None:
                if current_result and key != current_result[0]:
                    # New key encountered. Yield the accumulated result.
                    yield current_result
                if not current_result or key != current_result[0]:
                    current_result = (key, [])
                current_result[1].append(value)

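            # Read the next record from this reader; when it hits EOF, drop it from the heap.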
            try:
                self._offsets[index] = reader.tell()
                start_time = time.time()
                binary_record = reader.read()
                # Update the read-bytes and read-msec counters.
                if context.get():
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_BYTES,
                        len(binary_record))(context.get())
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_MSEC,
                        int((time.time() - start_time) * 1000))(context.get())
                proto = file_service_pb.KeyValue()
                proto.ParseFromString(binary_record)
                # Put the read key/value back into the heap.
                heapq.heapreplace(readers,
                                  (proto.key(), proto.value(), index, reader))
            except EOFError:
                heapq.heappop(readers)

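        # Yield the values accumulated for the last key.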
        if current_result:
            yield current_result
Example #9
  def testHashingMultipleFiles(self):
    """Test hashing files."""
    input_data = [(str(i), str(i)) for i in range(100)]
    input_data.sort()

    bucket_name = "testbucket"
    test_filename = "testfile"
    full_filename = "/%s/%s" % (bucket_name, test_filename)

    with cloudstorage.open(full_filename, mode="w") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = file_service_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())

    p = shuffler._HashPipeline("testjob", bucket_name,
                               [full_filename, full_filename, full_filename])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = shuffler._HashPipeline.from_id(p.pipeline_id)

    list_of_output_files = p.outputs.default.value
    output_data = []
    for output_files in list_of_output_files:
      for output_file in output_files:
        with cloudstorage.open(output_file) as f:
          for binary_record in records.RecordsReader(f):
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            output_data.append((proto.key(), proto.value()))

    output_data.sort()
    self.assertEquals(300, len(output_data))
    for i in range(len(input_data)):
      self.assertEquals(input_data[i], output_data[(3 * i)])
      self.assertEquals(input_data[i], output_data[(3 * i) + 1])
      self.assertEquals(input_data[i], output_data[(3 * i) + 2])
    self.assertEquals(1, len(self.emails))
Example #10
  def testPartialRecords(self):
    """Test merging into partial key values."""
    try:
      self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
      # force max values count to extremely low value.
      shuffler._MergePipeline._MAX_VALUES_COUNT = 1

      input_data = [("1", "a"), ("2", "b"), ("3", "c")]
      input_data.sort()

      bucket_name = "testbucket"
      test_filename = "testfile"
      full_filename = "/%s/%s" % (bucket_name, test_filename)

      with cloudstorage.open(full_filename, mode="w") as f:
        with records.RecordsWriter(f) as w:
          for (k, v) in input_data:
            proto = file_service_pb.KeyValue()
            proto.set_key(k)
            proto.set_value(v)
            w.write(proto.Encode())

      p = TestMergePipeline(bucket_name,
                            [full_filename, full_filename, full_filename])
      p.start()
      test_support.execute_until_empty(self.taskqueue)
      p = TestMergePipeline.from_id(p.pipeline_id)

      output_file = p.outputs.default.value[0]
      output_data = []
      with cloudstorage.open(output_file) as f:
        for record in records.RecordsReader(f):
          output_data.append(record)

      expected_data = [
          ("1", ["a"], True),
          ("1", ["a"], True),
          ("1", ["a"], False),
          ("2", ["b"], True),
          ("2", ["b"], True),
          ("2", ["b"], False),
          ("3", ["c"], True),
          ("3", ["c"], True),
          ("3", ["c"], False),
          ]
      self.assertEquals([str(e) for e in expected_data], output_data)
    finally:
      shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
    self.assertEquals(1, len(self.emails))
Example #11
    def testPartialRecords(self):
        """Test merging into partial key values."""
        try:
            self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
            # force max values count to extremely low value.
            shuffler._MergePipeline._MAX_VALUES_COUNT = 1

            input_data = [('1', 'a'), ('2', 'b'), ('3', 'c')]
            input_data.sort()

            input_file = files.blobstore.create()

            with files.open(input_file, "a") as f:
                with records.RecordsWriter(f) as w:
                    for (k, v) in input_data:
                        proto = file_service_pb.KeyValue()
                        proto.set_key(k)
                        proto.set_value(v)
                        w.write(proto.Encode())
            files.finalize(input_file)
            input_file = files.blobstore.get_file_name(
                files.blobstore.get_blob_key(input_file))

            p = TestMergePipeline([input_file, input_file, input_file])
            p.start()
            test_support.execute_until_empty(self.taskqueue)
            p = TestMergePipeline.from_id(p.pipeline_id)

            output_file = p.outputs.default.value[0]
            output_data = []
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

            expected_data = [
                ('1', ['a'], True),
                ('1', ['a'], True),
                ('1', ['a'], False),
                ('2', ['b'], True),
                ('2', ['b'], True),
                ('2', ['b'], False),
                ('3', ['c'], True),
                ('3', ['c'], True),
                ('3', ['c'], False),
            ]
            self.assertEquals([str(e) for e in expected_data], output_data)
        finally:
            shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
Example #12
  def write(self, data, ctx):
    if len(data) != 2:
      logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                    len(data), data)

    try:
      key = str(data[0])
      value = str(data[1])
    except TypeError:
      logging.error("Expecting a tuple, but got %s: %s",
                    data.__class__.__name__, data)

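    # Encode the pair as a KeyValue proto and delegate the actual write to FileRecordsOutputWriter.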
    proto = file_service_pb.KeyValue()
    proto.set_key(key)
    proto.set_value(value)
    FileRecordsOutputWriter.write(self, proto.Encode(), ctx)
Example #13
def _sort_records_map(records):
    """Map function sorting records.

    Converts records to KeyValue protos, sorts them by key and writes them
    into new GCS file. Creates _OutputFile entity to record resulting
    file name.

    Args:
        records: list of records which are serialized KeyValue protos.
    """
    ctx = context.get()
    l = len(records)
    key_records = [None] * l

    logging.debug("Parsing")
    for i in range(l):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(records[i])
        key_records[i] = (proto.key(), records[i])

    logging.debug("Sorting")
    key_records.sort(cmp=_compare_keys)

    logging.debug("Writing")
    mapper_spec = ctx.mapreduce_spec.mapper
    params = input_readers._get_params(mapper_spec)
    bucket_name = params.get("bucket_name")
    filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
                ctx.shard_id + "-" + str(int(time.time())))
    full_filename = "/%s/%s" % (bucket_name, filename)
    filehandle = cloudstorage.open(full_filename, mode="w")
    with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
        for key_record in key_records:
            pool.append(key_record[1])

    logging.debug("Finalizing")
    filehandle.close()

    entity = _OutputFile(key_name=full_filename,
                         parent=_OutputFile.get_root_key(ctx.mapreduce_id))
    entity.put()
Example #14
def _sort_records(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new blobstore file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  proto_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    proto_records[i] = proto

  logging.debug("Sorting")
  proto_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  blob_file_name = (ctx.mapreduce_spec.name + "-" +
                    ctx.mapreduce_id + "-output")
  output_path = files.blobstore.create(
      _blobinfo_uploaded_filename=blob_file_name)
  with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
    for proto in proto_records:
      pool.append(proto.Encode())

  logging.debug("Finalizing")
  files.finalize(output_path)
  time.sleep(1)
  output_path = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(output_path))

  entity = _OutputFile(key_name=output_path,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
Example #15
  def testShuffleFiles(self):
    """Test shuffling multiple files."""
    input_data = [(str(i), str(i)) for i in range(100)]
    input_data.sort()

    bucket_name = "testbucket"
    test_filename = "testfile"
    full_filename = "/%s/%s" % (bucket_name, test_filename)

    with cloudstorage.open(full_filename, mode="w") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = file_service_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())

    p = shuffler.ShufflePipeline("testjob", {"bucket_name": bucket_name},
                                 [full_filename, full_filename, full_filename])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

    output_files = p.outputs.default.value
    output_data = []
    for output_file in output_files:
      with cloudstorage.open(output_file) as f:
        for record in records.RecordsReader(f):
          proto = file_service_pb.KeyValues()
          proto.ParseFromString(record)
          output_data.append((proto.key(), proto.value_list()))
    output_data.sort()

    expected_data = sorted([
        (str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
    self.assertEquals(expected_data, output_data)
    self.assertEquals(1, len(self.emails))
Example #16
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler.ShufflePipeline("testjob",
                                     [input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

        expected_data = sorted([(str(k), [str(v), str(v),
                                          str(v)]) for (k, v) in input_data])
        self.assertEquals(expected_data, output_data)
Example #17
  def __iter__(self):
    """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx._shard_state.shard_number
    filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(filenames) != len(self._offsets):
      raise Exception("Files list and offsets do not match.")

    # Heap with (Key, Value, Index, reader) pairs.
    readers = []

    # Initialize heap
    for (i, filename) in enumerate(filenames):
      offset = self._offsets[i]
      reader = records.RecordsReader(files.BufferedFile(filename))
      reader.seek(offset)
      readers.append((None, None, i, reader))

    # Read records from heap and merge values with the same key.

    # current_result is yielded and consumed by _merge_map.
    # current_result = (key, values, is_partial)
    current_result = None
    current_count = 0
    current_size = 0
    while readers:
      (key, value, index, reader) = readers[0]

      if key is not None:
        current_count += 1
        current_size += len(value)

        should_yield = False
        if current_result:
          if key != current_result[0]:
            # New key encountered
            should_yield = True
          elif (self._max_values_count != -1 and
              current_count >= self._max_values_count):
            # Maximum number of values encountered.
            current_result[2] = True
            should_yield = True
          elif (self._max_values_size != -1 and
              current_size >= self._max_values_size):
            # Maximum size of values encountered
            current_result[2] = True
            should_yield = True

        if should_yield:
          # New key encountered or maximum count hit. Yield current key.
          yield current_result
        if not current_result or should_yield:
          current_result = [key, [], False]
          current_count = 0
          current_size = 0
        current_result[1].append(value)

      # Read next key/value from reader.
      try:
        self._offsets[index] = reader.tell()
        start_time = time.time()
        binary_record = reader.read()
        # update counters
        if context.get():
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_BYTES,
              len(binary_record))(context.get())
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_MSEC,
              int((time.time() - start_time) * 1000))(context.get())
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        # Put read data back into heap.
        heapq.heapreplace(readers,
                          (proto.key(), proto.value(), index, reader))
      except EOFError:
        heapq.heappop(readers)

    # Yield leftovers.
    if current_result:
      yield current_result
Example #18
    def __iter__(self):
        """Iterate over records in input files.

        self._offsets is always correctly updated so that stopping iterations
        doesn't skip records and doesn't read the same record twice.

        Raises:
            Exception: when Files list and offsets do not match.

        Yields:
            The result.
        """
        ctx = context.get()
        mapper_spec = ctx.mapreduce_spec.mapper
        shard_number = ctx._shard_state.shard_number
        filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

        if len(filenames) != len(self._offsets):
            raise Exception("Files list and offsets do not match.")

        readers = []
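        # Heap of (key, value, index, reader) tuples; seeded below with one
        # buffered GCS reader per input file, each seeked to its saved offset.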

        for (i, filename) in enumerate(filenames):
            offset = self._offsets[i]

            reader = records.RecordsReader(
                cloudstorage.open(filename,
                                  read_buffer_size=self.GCS_BUFFER_SIZE))
            reader.seek(offset)
            readers.append((None, None, i, reader))

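        # Pop records off the heap, merging values that share the same key.
        # current_result is a [key, values, is_partial] list consumed by the merge step.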
        current_result = None
        current_count = 0
        current_size = 0
        while readers:
            (key, value, index, reader) = readers[0]

            if key is not None:
                current_count += 1
                current_size += len(value)

                should_yield = False
                if current_result:
                    if key != current_result[0]:
                        # New key encountered.
                        should_yield = True
                    elif (self._max_values_count != -1
                          and current_count >= self._max_values_count):
                        # Maximum number of values for this key encountered.
                        current_result[2] = True
                        should_yield = True
                    elif (self._max_values_size != -1
                          and current_size >= self._max_values_size):
                        # Maximum total size of values for this key encountered.
                        current_result[2] = True
                        should_yield = True

                if should_yield:
                    # New key encountered or a limit was hit. Yield the current result.
                    yield current_result
                if not current_result or should_yield:
                    current_result = [key, [], False]
                    current_count = 0
                    current_size = 0
                current_result[1].append(value)

            try:
                self._offsets[index] = reader.tell()
                start_time = time.time()
                binary_record = reader.read()
                # Update the read-bytes and read-msec counters.
                if context.get():
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_BYTES,
                        len(binary_record))(context.get())
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_MSEC,
                        int((time.time() - start_time) * 1000))(context.get())
                proto = file_service_pb.KeyValue()
                proto.ParseFromString(binary_record)
                # Put the read key/value back into the heap.
                heapq.heapreplace(readers,
                                  (proto.key(), proto.value(), index, reader))
            except EOFError:
                heapq.heappop(readers)

        if current_result:
            yield current_result