def recordio_protobuf_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
    """Build an XGBoost DMatrix from RecordIO-Protobuf encoded bytes.

    The payload is wrapped in an in-memory mlio store and read with a
    RecordIOProtobufReader configured with a batch size of 100. The 'values'
    field of the first (peeked) example selects the conversion path:

    * exactly an mlio.DenseTensor: every example is converted with as_numpy
      and the results are stacked with np.vstack;
    * anything else: every example is converted with to_coo_matrix, the
      results are stacked with scipy_vstack and converted to CSR.

    Only the 'values' field of each example is read; labels, if present,
    are ignored.

    Args:
        string_like (bytes): RecordIO-Protobuf bytes.

    Returns:
        (xgb.DMatrix): XGBoost DataMatrix
    """
    store = mlio.InMemoryStore(bytes(string_like))
    params = mlio.DataReaderParams(dataset=[store], batch_size=100)
    reader = mlio.RecordIOProtobufReader(params)

    # Peek (rather than read) so the first example is still yielded below.
    first_values = reader.peek_example()['values']

    if type(first_values) is mlio.DenseTensor:
        dense_parts = [as_numpy(example['values']) for example in reader]
        data = np.vstack(dense_parts)
    else:
        sparse_parts = [to_coo_matrix(example['values']) for example in reader]
        data = scipy_vstack(sparse_parts).tocsr()

    return xgb.DMatrix(data)
def recordio_protobuf_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
    """Build an XGBoost DMatrix from RecordIO-Protobuf encoded bytes.

    The payload is wrapped in an in-memory mlio store and read with a
    RecordIOProtobufReader using a batch size of 100. The 'values' field of
    the first (peeked) example selects how examples are converted and
    stacked: as_numpy with np.vstack when it is exactly an
    mlio.core.DenseTensor, to_coo_matrix with scipy_vstack otherwise.

    Only the 'values' field of each example is read; labels, if present,
    are ignored.

    Args:
        string_like (bytes): RecordIO-Protobuf bytes.

    Returns:
        (xgb.DMatrix): XGBoost DataMatrix
    """
    store = mlio.InMemoryStore(bytes(string_like))
    reader = mlio.RecordIOProtobufReader(dataset=[store], batch_size=100)

    # Peek (rather than read) so the first example is still yielded below.
    is_dense = type(reader.peek_example()['values']) is mlio.core.DenseTensor
    convert, stack = (as_numpy, np.vstack) if is_dense else (to_coo_matrix, scipy_vstack)

    matrices = [convert(example['values']) for example in reader]
    return xgb.DMatrix(stack(matrices))
def _get_data(source):
    """Resolve `source` into the mlio data store object(s) it should be read from.

    Resolution rules, applied in order:

    1. If `source` is a `bytes` object, it is wrapped in a mlio.InMemoryStore.
    2. If `source` is a mlio.core.File, its `id` is used in place of `source` for the remaining steps.
    3. If the SM_INPUT_DATA_CONFIG environment variable is not defined, `source` is treated as a file or
       directory and listed with mlio.list_files.
    4. Otherwise SM_INPUT_DATA_CONFIG is parsed as JSON and the basename of `source` is looked up as a channel
       name. When that channel's "TrainingInputMode" is "Pipe", a mlio.SageMakerPipe is created from `source`;
       for any other mode `source` is listed with mlio.list_files.

    Here is an example of SM_INPUT_DATA_CONFIG with two channels ("code" and "train").
    SM_INPUT_DATA_CONFIG=
    {
        "code": {
            "ContentType": "application/x-code",
            "RecordWrapperType": "None",
            "S3DistributionType": "FullyReplicated",
            "TrainingInputMode": "File"
        },
        "train": {
            "ContentType": "text/csv",
            "RecordWrapperType": "None",
            "S3DistributionType": "ShardedByS3Key",
            "TrainingInputMode": "File"
        }
    }

    Parameters
    ----------
    source: str, bytes or mlio.core.File
        Name of the SageMaker channel, file, or directory from which the data is being read, or
        the Python bytes object holding the data itself.

    Returns
    -------
    list of mlio.InMemoryStore:
        A single-element list, when `source` is a bytes object.

    list of mlio.SageMakerPipe:
        A single-element list, when the channel's "TrainingInputMode" in SM_INPUT_DATA_CONFIG is "Pipe".

    result of mlio.list_files:
        The value of `mlio.list_files(source, pattern="*")` in every other case (no SM_INPUT_DATA_CONFIG,
        or a non-"Pipe" input mode).

    Raises
    ------
    KeyError
        If SM_INPUT_DATA_CONFIG has no entry for the channel name, or the entry has no "TrainingInputMode".
    """
    if isinstance(source, bytes):
        return [mlio.InMemoryStore(source)]

    # For a mlio File object, continue with its `id` instead of the object itself.
    location = source.id if isinstance(source, mlio.core.File) else source

    raw_config = os.environ.get("SM_INPUT_DATA_CONFIG")
    if raw_config is not None:
        channels = json.loads(raw_config)
        channel_name = os.path.basename(location)

        try:
            channel = channels[channel_name]
        except KeyError:
            raise KeyError(
                "Configuration for channel name {} is not provided in SM_INPUT_DATA_CONFIG.".format(
                    channel_name))

        try:
            input_mode = channel["TrainingInputMode"]
        except KeyError:
            raise KeyError(
                "SM_INPUT_DATA_CONFIG is malformed. TrainingInputMode is not found for channel name {}".format(
                    channel_name))

        if input_mode == "Pipe":
            return [mlio.SageMakerPipe(location)]

    # No SageMaker config, or a non-Pipe ('File') channel: list everything under the location.
    return mlio.list_files(location, pattern="*")