def get_recordio_protobuf_dmatrix(path, is_pipe=False):
    """Get Data Matrix from recordio-protobuf data.

    :param path: Path where recordio-protobuf formatted training data resides, either directory, file, or SageMaker pipe
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix or None
    """
    try:
        if is_pipe:
            pipes_path = path if isinstance(path, list) else [path]
            dataset = [
                mlio.SageMakerPipe(pipe_path) for pipe_path in pipes_path
            ]
        else:
            dataset = mlio.list_files(path)

        reader_params = mlio.DataReaderParams(dataset=dataset,
                                              batch_size=BATCH_SIZE)
        reader = mlio.RecordIOProtobufReader(reader_params)

        exm = reader.peek_example()
        if exm is not None:
            # recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
            is_dense_tensor = isinstance(exm['values'], mlio.DenseTensor)

            all_features = []
            all_labels = []
            for example in reader:
                features = as_numpy(
                    example['values']) if is_dense_tensor else to_coo_matrix(
                        example['values'])
                all_features.append(features)

                labels = as_numpy(example['label_values'])
                all_labels.append(labels)

            all_features = np.vstack(
                all_features) if is_dense_tensor else scipy_vstack(
                    all_features).tocsr()
            all_labels = np.concatenate(all_labels, axis=None)
            dmatrix = xgb.DMatrix(all_features, label=all_labels)
            return dmatrix
        else:
            return None

    except Exception as e:
        raise exc.UserError(
            "Failed to load recordio-protobuf data with exception:\n{}".format(
                e))
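A minimal usage sketch (an illustration, not part of the original example): the training channel path and the hyperparameters below are hypothetical, and only get_recordio_protobuf_dmatrix and xgb come from the code above.

# Hypothetical usage: build a DMatrix from a training channel and fit a booster.
train_dmatrix = get_recordio_protobuf_dmatrix("/opt/ml/input/data/train")
if train_dmatrix is not None:
    booster = xgb.train(
        {"objective": "reg:squarederror", "max_depth": 5},  # illustrative hyperparameters
        train_dmatrix,
        num_boost_round=100)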
Example #2
    async def __preprocess_chunk(self, chunk):
        t0 = time.time()
        preproc_fn = self.params.preproc_fn
        Xy = np.column_stack([as_numpy(feature) for feature in chunk])
        X, y = preproc_fn(Xy, self.params.label_col_idx)
        logging.debug('t_preproc_chunk={:.2f}'.format(time.time() - t0))
        return X, y
    def _initialize_state(self, first_batch):
        super()._initialize_state(first_batch)

        # Estimate the size of items in each column using the first batch.
        for i in range(self._n_columns):
            column = as_numpy(first_batch[i]).flatten()
            self._row_nbytes += _get_size_total(column) / column.shape[0]
    def _construct_features_array_data(self, batch):
        """Stacks numpy columns created from an incoming data batch into a numpy array."""
        return np.column_stack([
            as_numpy(batch[column_index]).flatten()
            for column_index in range(self._n_columns)
            if column_index != self.target_column_index
        ])
Example #5
def _test_dedupe_column_names(tmpdir,
                              input_column_names: List[str],
                              input_data: List[int],
                              expected_column_names: List[str],
                              expected_data: List[int],
                              dedupe_column_names: bool = True,
                              **kwargs) -> None:

    header_str = ','.join(input_column_names)
    data_str = ','.join(str(x) for x in input_data)
    csv_file = tmpdir.join("test.csv")
    csv_file.write(header_str + '\n' + data_str)

    dataset = [mlio.File(str(csv_file))]
    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=1)
    csv_params = mlio.CsvParams(dedupe_column_names=dedupe_column_names,
                                **kwargs)
    reader = mlio.CsvReader(reader_params, csv_params)

    example = reader.read_example()
    names = [desc.name for desc in example.schema.descriptors]
    assert names == expected_column_names

    record = [as_numpy(feature) for feature in example]
    assert np.all(np.array(record).squeeze() == np.array(expected_data))
def recordio_protobuf_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
    """Convert a RecordIO-Protobuf byte representation to a DMatrix object.
    Args:
        string_like (bytes): RecordIO-Protobuf bytes.
    Returns:
    (xgb.DMatrix): XGBoost DataMatrix
    """
    buf = bytes(string_like)
    dataset = [mlio.InMemoryStore(buf)]
    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=100)
    reader = mlio.RecordIOProtobufReader(reader_params)

    is_dense_tensor = type(reader.peek_example()['values']) is mlio.DenseTensor

    examples = []
    for example in reader:
        # Ignore labels if present
        values = as_numpy(
            example['values']) if is_dense_tensor else to_coo_matrix(
                example['values'])
        examples.append(values)

    data = np.vstack(examples) if is_dense_tensor else scipy_vstack(
        examples).tocsr()
    dmatrix = xgb.DMatrix(data)
    return dmatrix
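A hedged serving-side sketch: model_path and request_body are assumptions standing in for whatever the surrounding inference code provides; only recordio_protobuf_to_dmatrix and xgb come from above.

# Hypothetical usage: deserialize a RecordIO-Protobuf request body and predict.
booster = xgb.Booster()
booster.load_model(model_path)  # model_path is an assumed, preconfigured location
dmatrix = recordio_protobuf_to_dmatrix(request_body)  # request_body: raw request bytes
predictions = booster.predict(dmatrix)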
Example #7
def to_pandas(example):
    """
    Converts the specified ``Example`` to a pandas DataFrame.
    """
    data = {attr.name: as_numpy(ftr).flatten()
            for attr, ftr in zip(example.schema.attributes, example)}

    return pd.DataFrame(data)
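A small sketch of how to_pandas might be combined with the CSV readers shown elsewhere in this listing; the file name is hypothetical and the reader setup may need adjusting to the mlio version in use.

# Hypothetical usage: read one batch from a CSV file and inspect it as a DataFrame.
dataset = [mlio.File("train.csv")]
reader = mlio.CsvReader(mlio.DataReaderParams(dataset=dataset, batch_size=100))
example = reader.read_example()
df = to_pandas(example)
print(df.head())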
    def _construct_features_array_data(self, batch):
        """Creates a list of `self._n_features` arrays containing data from each column in the batch.

        Note that the arrays are interpreted as strings here, in order to easily extract itemsize and estimate size.
        """
        return [
            as_numpy(batch[i]).flatten().astype(str)
            for i in range(self._n_columns) if i != self.target_column_index
        ]
def get_recordio_protobuf_dmatrix(path,
                                  is_pipe=False,
                                  subsample_ratio_on_read=None):
    """Get Data Matrix from recordio-protobuf data.

    :param path: Path where recordio-protobuf formatted training data resides, either directory, file, or SageMaker pipe
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param subsample_ratio_on_read: None or a value in (0, 1) to indicate how much of the dataset should
            be read into memory.
    :return: xgb.DMatrix or None
    """
    try:
        if is_pipe:
            dataset = [mlio.SageMakerPipe(path)]
        else:
            dataset = mlio.list_files(path)

        reader = mlio.RecordIOProtobufReader(
            dataset=dataset,
            batch_size=BATCH_SIZE,
            subsample_ratio=subsample_ratio_on_read)

        exm = reader.peek_example()
        if exm is None:
            return None

        # Recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
        if isinstance(exm['values'], mlio.DenseTensor):
            to_matrix = as_numpy
            vstack = np.vstack
        else:
            to_matrix = to_coo_matrix
            vstack = scipy_vstack

        all_values = []
        all_labels = []
        for example in reader:
            values = to_matrix(example['values'])
            all_values.append(values)

            labels = as_numpy(example['label_values']).squeeze()
            all_labels.append(labels)

        all_values = vstack(all_values)
        all_labels = np.concatenate(all_labels)

        return xgb.DMatrix(all_values, label=all_labels)
    except Exception as e:
        raise exc.UserError(
            "Failed to load recordio-protobuf data with exception:\n{}".format(
                e))
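A hedged example of the subsampling path; the channel path and ratio are illustrative only.

# Hypothetical usage: read roughly half of the dataset into memory when it is too
# large to load in full.
dmatrix = get_recordio_protobuf_dmatrix("/opt/ml/input/data/train",
                                        is_pipe=False,
                                        subsample_ratio_on_read=0.5)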
Example #10
def to_tf(tensor):
    if isinstance(tensor, DenseTensor):
        return tf.convert_to_tensor(as_numpy(tensor))

    mtx = to_coo_matrix(tensor).tocsr()

    non_zero_row_col = mtx.nonzero()
    indices = np.asmatrix([non_zero_row_col[0], non_zero_row_col[1]])
    indices = indices.transpose()

    return tf.SparseTensor(indices, mtx.data, mtx.shape)
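A minimal sketch of feeding to_tf from a reader loop; the reader setup mirrors the RecordIO-Protobuf examples in this listing and the file name is hypothetical.

# Hypothetical usage: convert each batch's 'values' tensor for TensorFlow.
dataset = [mlio.File("train.pbr")]
reader = mlio.RecordIOProtobufReader(
    mlio.DataReaderParams(dataset=dataset, batch_size=100))
for example in reader:
    tf_values = to_tf(example['values'])  # dense -> tf.Tensor, sparse -> tf.SparseTensor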
Example #11
def test_csv_params():
    filename = os.path.join(resources_dir, 'test.csv')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset, batch_size=1)
    csv_prm = mlio.CsvParams(header_row_index=None)
    reader = mlio.CsvReader(rdr_prm, csv_prm)

    example = reader.read_example()
    record = [as_numpy(feature) for feature in example]
    assert np.all(np.array(record).squeeze() == np.array([1, 0, 0, 0]))

    reader2 = mlio.CsvReader(rdr_prm, csv_prm)
    assert reader2.peek_example()
Example #12
    async def __train_old(self):
        chunk_size = self.params.chunk_size
        dataset = mlio.list_files(self.params.dataset_path, pattern='*.csv')
        logging.debug('mlio dataset={}'.format(dataset))
        preproc_fn = self.params.preproc_fn
        reader_params = mlio.DataReaderParams(
            dataset=dataset,
            batch_size=chunk_size,
            num_prefetched_batches=self.params.num_prefetched_chunks)
        reader = mlio.CsvReader(reader_params)
        logging.debug('mlio reader={}'.format(reader))
        num_epochs = self.params.num_epochs  # Number of boosting rounds applied to each chunk.
        # Use the eta (learning-rate) parameter.
        eta = 0.01
        if self.params.ml_lib == 'snap':
            eta = 0.1
            from pai4sk import BoostingMachine as Booster
        else:
            from sklearn.tree import DecisionTreeRegressor

        logging.debug('starting training')
        models = []
        # CsvReader is simply an iterator over mini-batches of data.
        for chunk_idx, chunk in enumerate(reader):
            rand_state = self.params.rand_state
            # Transform the mini-batch into a NumPy array.
            chunk_train_Xy = np.column_stack(
                [as_numpy(feature) for feature in chunk])
            chunk_train_X, chunk_train_y = preproc_fn(
                chunk_train_Xy, self.params.label_col_idx)
            #print(chunk_train_X)
            if self.params.ml_lib == 'snap':
                bl = Booster(**self.params.ml_opts_dict)
                bl.fit(chunk_train_X, chunk_train_y)
                models.append(bl)
            else:
                z_train = np.zeros(chunk_train_X.shape[0])
                for epoch in range(num_epochs):
                    #logging.debug('chunk idx={} chunk={}'.format(chunk_idx, chunk))

                    target = chunk_train_y - z_train
                    bl = DecisionTreeRegressor(max_depth=3,
                                               max_features='sqrt',
                                               random_state=rand_state)
                    bl.fit(chunk_train_X, target)
                    u_train = bl.predict(chunk_train_X)
                    z_train = z_train + eta * u_train
                    models.append(bl)
        return models
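For context, a hedged sketch of how the staged models returned above could be combined at prediction time in the scikit-learn branch; predict_staged and X_test are illustrative names, and the snap branch (one full Booster per chunk) would be combined differently.

# Hypothetical prediction with the staged DecisionTreeRegressor ensemble:
# each stage adds eta times its prediction, mirroring the z_train update above.
def predict_staged(models, X_test, eta=0.01):
    z = np.zeros(X_test.shape[0])
    for bl in models:
        z = z + eta * bl.predict(X_test)
    return z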
Example #13
def test_recordio_protobuf_reader_params():
    filename = os.path.join(resources_dir, 'test.pbr')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset, batch_size=1)
    reader = mlio.RecordIOProtobufReader(rdr_prm)

    example = reader.read_example()
    record = [as_numpy(feature) for feature in example]
    assert record[0].squeeze() == np.array(1)
    assert np.all(record[1].squeeze() == np.array([0, 0, 0]))

    # Parameters should be reusable
    reader2 = mlio.RecordIOProtobufReader(rdr_prm)
    assert reader2.peek_example()
Example #14
def test_csv_nonutf_encoding_with_encoding_param():
    filename = os.path.join(resources_dir, 'test_iso8859_5.csv')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset,
                                    batch_size=2)
    csv_params = mlio.CsvParams(encoding='ISO-8859-5')

    reader = mlio.CsvReader(rdr_prm, csv_params)
    example = reader.read_example()
    nonutf_feature = example['col_3']

    try:
        as_numpy(nonutf_feature)
    except SystemError as err:
        pytest.fail("Unexpected exception thrown: {}".format(err))
def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights):
    """Get Data Matrix from CSV data in pipe mode.

    :param pipe_path: SageMaker pipe path where CSV formatted training data is piped
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :return: xgb.DMatrix or None
    """
    try:
        pipes_path = pipe_path if isinstance(pipe_path, list) else [pipe_path]
        dataset = [mlio.SageMakerPipe(path) for path in pipes_path]
        reader_params = mlio.DataReaderParams(dataset=dataset,
                                              batch_size=BATCH_SIZE)
        csv_params = mlio.CsvParams(header_row_index=None)
        reader = mlio.CsvReader(reader_params, csv_params)

        # Check if data is present in reader
        if reader.peek_example() is not None:
            examples = []
            for example in reader:
                # Write each feature (column) of example into a single numpy array
                tmp = [as_numpy(feature).squeeze() for feature in example]
                tmp = np.array(tmp)
                if len(tmp.shape) > 1:
                    # Columns are written as rows, needs to be transposed
                    tmp = tmp.T
                else:
                    # If tmp is a 1-D array, it needs to be reshaped as a matrix
                    tmp = np.reshape(tmp, (1, tmp.shape[0]))
                examples.append(tmp)

            data = np.vstack(examples)
            del examples

            if csv_weights == 1:
                dmatrix = xgb.DMatrix(data[:, 2:],
                                      label=data[:, 0],
                                      weight=data[:, 1])
            else:
                dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])

            return dmatrix
        else:
            return None

    except Exception as e:
        raise exc.UserError(
            "Failed to load csv data with exception:\n{}".format(e))
def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights,
                               subsample_ratio_on_read):
    """Get Data Matrix from CSV data in pipe mode.

    :param pipe_path: SageMaker pipe path where CSV formatted training data is piped
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :param subsample_ratio_on_read: None or a value in (0, 1) to indicate how much of the dataset should
            be read into memory.
    :return: xgb.DMatrix or None
    """
    try:
        dataset = [mlio.SageMakerPipe(pipe_path, fifo_id=0)]
        reader = mlio.CsvReader(dataset=dataset,
                                batch_size=BATCH_SIZE,
                                header_row_index=None,
                                subsample_ratio=subsample_ratio_on_read)

        # Check if data is present in reader
        if reader.peek_example() is None:
            return None

        batches = []
        for example in reader:
            batch = np.column_stack([as_numpy(f) for f in example])
            batches.append(batch)

        data = np.vstack(batches)
        del batches

        if csv_weights == 1:
            dmatrix = xgb.DMatrix(data[:, 2:],
                                  label=data[:, 0],
                                  weight=data[:, 1])
        else:
            dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])

        return dmatrix
    except Exception as e:
        raise exc.UserError(
            "Failed to load csv data with exception:\n{}".format(e))
    def _construct_target_array_data(self, batch):
        if self._split_target:
            return as_numpy(
                batch[self.target_column_index]).flatten().astype(str)
        return None