def testSortFile(self):
        """Test sorting a file."""
        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = kv_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())

        p = shuffler._SortChunksPipeline("testjob", bucket_name,
                                         [[full_filename]])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with cloudstorage.open(output_file) as f:
                for binary_record in records.RecordsReader(f):
                    proto = kv_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
        self.assertEquals(1, len(self.emails))
Example #2
    def write(self, data):
        """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
    """
        ctx = context.get()
        if len(data) != 2:
            logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                          len(data), data)

        try:
            key = str(data[0])
            value = str(data[1])
        except TypeError:
            logging.error("Expecting a tuple, but got %s: %s",
                          data.__class__.__name__, data)

        file_index = key.__hash__() % len(self._filehandles)

        # Work-around: Since we don't have access to the context in the to_json()
        # function, but we need to flush each pool before we serialize the
        # filehandle, we rely on a member variable instead of using context for
        # pool management.
        pool = self._pools[file_index]
        if pool is None:
            filehandle = self._filehandles[file_index]
            pool = output_writers.GCSRecordsPool(filehandle=filehandle,
                                                 ctx=ctx)
            self._pools[file_index] = pool

        proto = kv_pb.KeyValue()
        proto.set_key(key)
        proto.set_value(value)
        pool.append(proto.Encode())
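# A minimal, framework-free sketch of the partitioning idea used by write()
# above: hash each key to one of N output files and lazily create a buffering
# "pool" per file. SimplePool and SimplePartitioningWriter are hypothetical
# stand-ins for output_writers.GCSRecordsPool and the real writer; they only
# illustrate the shape of the logic, not the GCS-backed implementation.
class SimplePool(object):
    """Buffers appended records and writes them out on flush()."""

    def __init__(self, filehandle):
        self._filehandle = filehandle
        self._buffer = []

    def append(self, data):
        self._buffer.append(data)

    def flush(self):
        for item in self._buffer:
            self._filehandle.write(item)
        self._buffer = []


class SimplePartitioningWriter(object):
    """Routes (key, value) pairs to one of several pools by key hash."""

    def __init__(self, filehandles):
        self._filehandles = filehandles
        self._pools = [None] * len(filehandles)

    def write(self, key, value):
        file_index = hash(key) % len(self._filehandles)
        if self._pools[file_index] is None:
            # Same lazy-creation pattern as write() above.
            self._pools[file_index] = SimplePool(self._filehandles[file_index])
        self._pools[file_index].append("%s\t%s\n" % (key, value))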
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = kv_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())

        p = TestMergePipeline(bucket_name,
                              [full_filename, full_filename, full_filename])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with cloudstorage.open(output_file) as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
        self.assertEquals(expected_data, output_data)
        self.assertEquals(1, len(self.emails))
    def testHashingMultipleFiles(self):
        """Test hashing files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = kv_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())

        p = shuffler._HashPipeline(
            "testjob", bucket_name,
            [full_filename, full_filename, full_filename])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._HashPipeline.from_id(p.pipeline_id)

        list_of_output_files = p.outputs.default.value
        output_data = []
        for output_files in list_of_output_files:
            for output_file in output_files:
                with cloudstorage.open(output_file) as f:
                    for binary_record in records.RecordsReader(f):
                        proto = kv_pb.KeyValue()
                        proto.ParseFromString(binary_record)
                        output_data.append((proto.key(), proto.value()))

        output_data.sort()
        self.assertEquals(300, len(output_data))
        for i in range(len(input_data)):
            self.assertEquals(input_data[i], output_data[(3 * i)])
            self.assertEquals(input_data[i], output_data[(3 * i) + 1])
            self.assertEquals(input_data[i], output_data[(3 * i) + 2])
        self.assertEquals(1, len(self.emails))
Example #5
def _hashing_map(binary_record):
    """A map function used in hash phase.

  Reads KeyValue from binary record.

  Args:
    binary_record: The binary record.

  Yields:
    The (key, value).
  """
    proto = kv_pb.KeyValue()
    proto.ParseFromString(binary_record)
    yield (proto.key(), proto.value())
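# Framework-free stand-in for _hashing_map, assuming a toy record encoding
# (two length-prefixed byte strings) instead of the real kv_pb.KeyValue proto.
# It only illustrates the parse-then-yield shape of the map function.
import struct


def _simple_hashing_map(binary_record):
    key_len, = struct.unpack_from(">I", binary_record, 0)
    key = binary_record[4:4 + key_len]
    value_len, = struct.unpack_from(">I", binary_record, 4 + key_len)
    value = binary_record[8 + key_len:8 + key_len + value_len]
    yield (key, value)


# Round-trip check with a hand-built record.
_record = struct.pack(">I", 3) + b"foo" + struct.pack(">I", 3) + b"bar"
assert list(_simple_hashing_map(_record)) == [(b"foo", b"bar")]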
    def testPartialRecords(self):
        """Test merging into partial key values."""
        try:
            self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
            # Force the max values count to an extremely low value.
            shuffler._MergePipeline._MAX_VALUES_COUNT = 1

            input_data = [("1", "a"), ("2", "b"), ("3", "c")]
            input_data.sort()

            bucket_name = "testbucket"
            test_filename = "testfile"
            full_filename = "/%s/%s" % (bucket_name, test_filename)

            with cloudstorage.open(full_filename, mode="w") as f:
                with records.RecordsWriter(f) as w:
                    for (k, v) in input_data:
                        proto = kv_pb.KeyValue()
                        proto.set_key(k)
                        proto.set_value(v)
                        w.write(proto.Encode())

            p = TestMergePipeline(
                bucket_name, [full_filename, full_filename, full_filename])
            p.start()
            test_support.execute_until_empty(self.taskqueue)
            p = TestMergePipeline.from_id(p.pipeline_id)

            output_file = p.outputs.default.value[0]
            output_data = []
            with cloudstorage.open(output_file) as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

            expected_data = [
                ("1", ["a"], True),
                ("1", ["a"], True),
                ("1", ["a"], False),
                ("2", ["b"], True),
                ("2", ["b"], True),
                ("2", ["b"], False),
                ("3", ["c"], True),
                ("3", ["c"], True),
                ("3", ["c"], False),
            ]
            self.assertEquals([str(e) for e in expected_data], output_data)
        finally:
            shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
        self.assertEquals(1, len(self.emails))
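# The (key, values, is_partial) triples asserted above can be collapsed back
# into complete (key, values) pairs on the consumer side. combine_partial is
# a hypothetical helper (not part of the shuffler API) that relies on the
# property shown by the test: chunks for a key arrive in order and the last
# one carries is_partial=False.
def combine_partial(chunks):
    pending_key, pending_values = None, []
    for key, values, is_partial in chunks:
        pending_key = key
        pending_values.extend(values)
        if not is_partial:
            yield pending_key, pending_values
            pending_key, pending_values = None, []


_chunks = [("1", ["a"], True), ("1", ["a"], True), ("1", ["a"], False)]
assert list(combine_partial(_chunks)) == [("1", ["a", "a", "a"])]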
Example #7
  def write(self, data):
    if len(data) != 2:
      logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                    len(data), data)

    try:
      key = str(data[0])
      value = str(data[1])
    except TypeError:
      logging.error("Expecting a tuple, but got %s: %s",
                    data.__class__.__name__, data)

    proto = kv_pb.KeyValue()
    proto.set_key(key)
    proto.set_value(value)
    GoogleCloudStorageRecordOutputWriter.write(self, proto.Encode())
Example #8
def _sort_records_map(records):
    """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new GCS file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
    ctx = context.get()
    l = len(records)
    key_records = [None] * l

    logging.debug("Parsing")
    for i in range(l):
        proto = kv_pb.KeyValue()
        proto.ParseFromString(records[i])
        key_records[i] = (proto.key(), records[i])

    logging.debug("Sorting")
    key_records.sort(cmp=_compare_keys)

    logging.debug("Writing")
    mapper_spec = ctx.mapreduce_spec.mapper
    params = input_readers._get_params(mapper_spec)
    bucket_name = params.get("bucket_name")
    filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
                ctx.shard_id + "-" + str(int(time.time())))
    full_filename = "/%s/%s" % (bucket_name, filename)
    filehandle = cloudstorage.open(full_filename, mode="w")
    with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
        for key_record in key_records:
            pool.append(key_record[1])

    logging.debug("Finalizing")
    filehandle.close()

    entity = _OutputFile(key_name=full_filename,
                         parent=_OutputFile.get_root_key(ctx.mapreduce_id))
    entity.put()
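# _compare_keys is defined elsewhere in the shuffler module. A plausible
# sketch, assuming it simply orders the (key, binary_record) tuples built
# above by their key element (Python 2 cmp-style comparator); the library's
# actual implementation may differ.
def _compare_keys(key_record1, key_record2):
    return cmp(key_record1[0], key_record2[0])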
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = kv_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())

        p = shuffler.ShufflePipeline(
            "testjob", {"bucket_name": bucket_name},
            [full_filename, full_filename, full_filename])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with cloudstorage.open(output_file) as f:
                for record in records.RecordsReader(f):
                    proto = kv_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

        expected_data = sorted([(str(k), [str(v), str(v),
                                          str(v)]) for (k, v) in input_data])
        self.assertEquals(expected_data, output_data)
        self.assertEquals(1, len(self.emails))
Example #10
    def __iter__(self):
        """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.

    Raises:
      Exception: when Files list and offsets do not match.

    Yields:
      The result.
    """
        ctx = context.get()
        mapper_spec = ctx.mapreduce_spec.mapper
        shard_number = ctx._shard_state.shard_number
        filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

        if len(filenames) != len(self._offsets):
            raise Exception("Files list and offsets do not match.")

        # Heap with (Key, Value, Index, reader) pairs.
        readers = []

        # Initialize heap
        for (i, filename) in enumerate(filenames):
            offset = self._offsets[i]
            # TODO(user): Shrinking the buffer size is a workaround until
            # a tiered/segmented merge is implemented.
            reader = records.RecordsReader(
                cloudstorage.open(filename,
                                  read_buffer_size=self.GCS_BUFFER_SIZE))
            reader.seek(offset)
            readers.append((None, None, i, reader))

        # Read records from heap and merge values with the same key.

        # current_result is yielded and consumed by _merge_map.
        # current_result = (key, value, is_partial)
        current_result = None
        current_count = 0
        current_size = 0
        while readers:
            (key, value, index, reader) = readers[0]

            if key is not None:
                current_count += 1
                current_size += len(value)

                should_yield = False
                if current_result:
                    if key != current_result[0]:
                        # New key encountered
                        should_yield = True
                    elif (self._max_values_count != -1
                          and current_count >= self._max_values_count):
                        # Maximum number of values encountered.
                        current_result[2] = True
                        should_yield = True
                    elif (self._max_values_size != -1
                          and current_size >= self._max_values_size):
                        # Maximum size of values encountered
                        current_result[2] = True
                        should_yield = True

                if should_yield:
                    # New key encountered or maximum count hit. Yield current key.
                    yield current_result
                if not current_result or should_yield:
                    current_result = [key, [], False]
                    current_count = 0
                    current_size = 0
                current_result[1].append(value)

            # Read next key/value from reader.
            try:
                self._offsets[index] = reader.tell()
                start_time = time.time()
                binary_record = reader.read()
                # update counters
                if context.get():
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_BYTES,
                        len(binary_record))(context.get())
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_MSEC,
                        int((time.time() - start_time) * 1000))(context.get())
                proto = kv_pb.KeyValue()
                proto.ParseFromString(binary_record)
                # Put read data back into heap.
                heapq.heapreplace(readers,
                                  (proto.key(), proto.value(), index, reader))
            except EOFError:
                heapq.heappop(readers)

        # Yield leftovers.
        if current_result:
            yield current_result
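# A toy, in-memory version of the merge performed by __iter__ above, using
# plain sorted lists of (key, value) pairs instead of GCS-backed record
# readers. merge_sorted_streams is a hypothetical illustration only; it
# yields [key, values, is_partial] triples and reproduces the
# _MAX_VALUES_COUNT cut-off behaviour exercised by testPartialRecords.
import heapq


def merge_sorted_streams(streams, max_values_count=None):
    iterators = [iter(s) for s in streams]
    heap = []
    for index, iterator in enumerate(iterators):
        for key, value in iterator:
            heap.append((key, value, index))
            break
    heapq.heapify(heap)

    current = None  # [key, values, is_partial]
    while heap:
        key, value, index = heapq.heappop(heap)
        starting_new_chunk = (
            current is None or key != current[0] or
            (max_values_count is not None and
             len(current[1]) >= max_values_count))
        if starting_new_chunk:
            if current is not None:
                # Being cut off mid-key means the yielded chunk is partial.
                current[2] = (current[0] == key)
                yield current
            current = [key, [], False]
        current[1].append(value)
        # Refill the heap from the stream we just consumed from.
        for next_key, next_value in iterators[index]:
            heapq.heappush(heap, (next_key, next_value, index))
            break
    if current is not None:
        yield current


_streams = [[("1", "a")], [("1", "a")], [("1", "a")]]
assert list(merge_sorted_streams(_streams, max_values_count=1)) == [
    ["1", ["a"], True], ["1", ["a"], True], ["1", ["a"], False]]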