def get_recordio_protobuf_dmatrix(path,
                                  is_pipe=False,
                                  subsample_ratio_on_read=None):
    """Get Data Matrix from recordio-protobuf data.

    :param path: Path where recordio-protobuf formatted training data resides, either directory, file, or SageMaker pipe
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param subsample_ratio_on_read: None or a value in (0, 1) to indicate how much of the dataset should
            be read into memory.
    :return: xgb.DMatrix, or None when the reader has no example to peek at
    :raises exc.UserError: if any step of reading or assembling the data fails;
            the original exception is attached as the cause
    """
    try:
        # Only the dataset differs between pipe mode and file mode; the reader
        # is configured identically in both cases.
        if is_pipe:
            dataset = [mlio.SageMakerPipe(path)]
        else:
            dataset = mlio.list_files(path)
        reader = mlio.RecordIOProtobufReader(
            dataset=dataset,
            batch_size=BATCH_SIZE,
            subsample_ratio=subsample_ratio_on_read)

        exm = reader.peek_example()
        if exm is None:
            return None

        # Recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
        if isinstance(exm['values'], mlio.DenseTensor):
            to_matrix = as_numpy
            vstack = np.vstack
        else:
            to_matrix = to_coo_matrix
            vstack = scipy_vstack

        all_values = []
        all_labels = []
        for example in reader:
            all_values.append(to_matrix(example['values']))

            # squeeze() turns a batch holding a single one-element label into
            # a 0-d array, which np.concatenate refuses ("zero-dimensional
            # arrays cannot be concatenated"). atleast_1d restores a length-1
            # vector for that case and leaves every other shape untouched.
            labels = as_numpy(example['label_values']).squeeze()
            all_labels.append(np.atleast_1d(labels))

        all_values = vstack(all_values)
        all_labels = np.concatenate(all_labels)

        return xgb.DMatrix(all_values, label=all_labels)
    except Exception as e:
        # Boundary handler: surface every failure as a UserError, but keep the
        # original exception chained for debugging.
        raise exc.UserError(
            "Failed to load recordio-protobuf data with exception:\n{}".format(
                e)) from e
Пример #2
0
def test_recordio_protobuf_reader_params():
    """Check that a reader built from DataReaderParams reads 'test.pbr'.

    Reads one example with a batch size of 1 and verifies that its first
    feature squeezes to 1 and its second to [0, 0, 0]. Then builds a second
    reader from the same params object to confirm the params can be reused.
    """
    pbr_path = os.path.join(resources_dir, 'test.pbr')
    params = mlio.DataReaderParams(dataset=[mlio.File(pbr_path)],
                                   batch_size=1)
    first_reader = mlio.RecordIOProtobufReader(params)

    features = [as_numpy(tensor) for tensor in first_reader.read_example()]
    assert features[0].squeeze() == np.array(1)
    assert np.all(features[1].squeeze() == np.array([0, 0, 0]))

    # Parameters should be reusable
    second_reader = mlio.RecordIOProtobufReader(params)
    assert second_reader.peek_example()
def recordio_protobuf_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
    """Convert a RecordIO-Protobuf byte representation to a DMatrix object.

    Only the 'values' tensor of each example is used; labels, if present, are
    ignored. Dense data is stacked with numpy; sparse data is stacked with
    scipy and converted to CSR before being handed to XGBoost.

    Args:
        string_like (bytes): RecordIO-Protobuf bytes.
    Returns:
        (xgb.DMatrix): XGBoost DataMatrix
    """
    buf = bytes(string_like)
    dataset = [mlio.InMemoryStore(buf)]
    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=100)
    reader = mlio.RecordIOProtobufReader(reader_params)

    # NOTE(review): peek_example() presumably returns None for an empty
    # payload, in which case the subscript below raises TypeError — confirm
    # whether callers can send empty input.
    first_example = reader.peek_example()
    is_dense_tensor = isinstance(first_example['values'], mlio.DenseTensor)

    # Pick the conversion once instead of re-testing the flag per batch.
    to_matrix = as_numpy if is_dense_tensor else to_coo_matrix

    # Ignore labels if present
    examples = [to_matrix(example['values']) for example in reader]

    if is_dense_tensor:
        data = np.vstack(examples)
    else:
        data = scipy_vstack(examples).tocsr()
    return xgb.DMatrix(data)
def recordio_protobuf_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
    """Convert a RecordIO-Protobuf byte representation to a DMatrix object.

    Only the 'values' tensor of each example is used; labels, if present, are
    ignored. Dense batches are stacked with numpy, anything else with scipy.

    Args:
        string_like (bytes): RecordIO-Protobuf bytes.
    Returns:
        (xgb.DMatrix): XGBoost DataMatrix
    """
    buf = bytes(string_like)
    dataset = [mlio.InMemoryStore(buf)]
    reader = mlio.RecordIOProtobufReader(dataset=dataset, batch_size=100)

    # NOTE(review): peek_example() presumably returns None for an empty
    # payload, in which case the subscript below raises TypeError — confirm
    # whether callers can send empty input.
    first_example = reader.peek_example()
    if isinstance(first_example['values'], mlio.core.DenseTensor):
        to_matrix, vstack = as_numpy, np.vstack
    else:
        to_matrix, vstack = to_coo_matrix, scipy_vstack

    # Ignore labels if present
    examples = [to_matrix(example['values']) for example in reader]

    return xgb.DMatrix(vstack(examples))
Пример #5
0
 def pipe_iterator(self, fifo_id=0):
     """Return a RecordIO-Protobuf reader over the SageMaker pipe at self.path.

     The FIFO id used is the value returned by ``self.increment()``; the
     ``fifo_id`` argument is overwritten before it is read, so the value a
     caller passes has no effect. The reader is configured from
     ``self.reader_params`` and a message is printed before opening the pipe.
     """
     fifo_id = self.increment()
     print(f"opening pipe iterator {self.path}:{fifo_id}")
     dataset = [mlio.SageMakerPipe(self.path, fifo_id=fifo_id)]
     params = mlio.DataReaderParams(dataset=dataset, **self.reader_params)
     return mlio.RecordIOProtobufReader(params)
Пример #6
0
def get_recordio_protobuf_dmatrix(path, is_pipe=False):
    """Get Data Matrix from recordio-protobuf data.

    :param path: Path where recordio-protobuf formatted training data resides, either directory, file, or SageMaker pipe
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix, or None when the reader has no example to peek at
    :raises exc.UserError: if any step of reading or assembling the data fails;
            the original exception is attached as the cause
    """
    try:
        # Only the dataset differs between pipe mode and file mode; the reader
        # is configured identically in both cases.
        if is_pipe:
            dataset = [mlio.SageMakerPipe(path)]
        else:
            dataset = mlio.list_files(path)
        reader = mlio.RecordIOProtobufReader(dataset=dataset,
                                             batch_size=BATCH_SIZE)

        # Peek once and reuse the result for both the emptiness check and the
        # dense/sparse decision.
        first_example = reader.peek_example()
        if first_example is None:
            return None

        # recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
        if isinstance(first_example['values'], mlio.core.DenseTensor):
            to_matrix = as_numpy
            vstack = np.vstack
        else:
            to_matrix = to_coo_matrix
            vstack = scipy_vstack

        all_features = []
        all_labels = []
        for example in reader:
            all_features.append(to_matrix(example['values']))
            all_labels.append(as_numpy(example['label_values']))

        all_features = vstack(all_features)
        # axis=None flattens every batch's labels into one 1-D vector.
        all_labels = np.concatenate(all_labels, axis=None)
        return xgb.DMatrix(all_features, label=all_labels)

    except Exception as e:
        # Boundary handler: surface every failure as a UserError, but keep the
        # original exception chained for debugging.
        raise exc.UserError(
            "Failed to load recordio-protobuf data with exception:\n{}".format(
                e)) from e