Example #1
def _test_dedupe_column_names(tmpdir,
                              input_column_names: List[str],
                              input_data: List[int],
                              expected_column_names: List[str],
                              expected_data: List[int],
                              dedupe_column_names: bool = True,
                              **kwargs) -> None:

    header_str = ','.join(input_column_names)
    data_str = ','.join(str(x) for x in input_data)
    csv_file = tmpdir.join("test.csv")
    csv_file.write(header_str + '\n' + data_str)

    dataset = [mlio.File(str(csv_file))]
    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=1)
    csv_params = mlio.CsvParams(dedupe_column_names=dedupe_column_names,
                                **kwargs)
    reader = mlio.CsvReader(reader_params, csv_params)

    example = reader.read_example()
    names = [desc.name for desc in example.schema.descriptors]
    assert names == expected_column_names

    record = [as_numpy(feature) for feature in example]
    assert np.all(np.array(record).squeeze() == np.array(expected_data))
Example #2
def recordio_protobuf_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
    """Convert a RecordIO-Protobuf byte representation to a DMatrix object.
    Args:
        string_like (bytes): RecordIO-Protobuf bytes.
    Returns:
    (xgb.DMatrix): XGBoost DataMatrix
    """
    buf = bytes(string_like)
    dataset = [mlio.InMemoryStore(buf)]
    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=100)
    reader = mlio.RecordIOProtobufReader(reader_params)

    is_dense_tensor = type(reader.peek_example()['values']) is mlio.DenseTensor

    examples = []
    for example in reader:
        # Ignore labels if present
        values = as_numpy(
            example['values']) if is_dense_tensor else to_coo_matrix(
                example['values'])
        examples.append(values)

    data = np.vstack(examples) if is_dense_tensor else scipy_vstack(
        examples).tocsr()
    dmatrix = xgb.DMatrix(data)
    return dmatrix
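A minimal usage sketch for the converter above; the imports (including the mlio.integ helper modules for as_numpy and to_coo_matrix) and the hypothetical train.pbr source file are assumptions, not part of the original snippet:

import mlio
import numpy as np
import xgboost as xgb
from mlio.integ.numpy import as_numpy
from mlio.integ.scipy import to_coo_matrix
from scipy.sparse import vstack as scipy_vstack

# Read a RecordIO-Protobuf payload from disk (hypothetical file name).
with open("train.pbr", "rb") as f:
    recordio_bytes = f.read()

dmatrix = recordio_protobuf_to_dmatrix(recordio_bytes)
print(dmatrix.num_row(), dmatrix.num_col())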
Example #3
def _get_reader(source, batch_size):
    """Returns 'CsvReader' for the given source.

       Parameters
       ----------
       source: str or bytes
           Name of the SageMaker Channel, File, or directory from which the data is being read or
           the Python buffer object from which the data is being read.

       batch_size : int
           The batch size in rows to read from the source.

       Returns
       -------
       mlio.CsvReader
           CsvReader configured with a SageMaker Pipe, File or InMemory buffer
       """
    data_reader_params = mlio.DataReaderParams(dataset=_get_data(source),
                                               batch_size=batch_size,
                                               warn_bad_instances=False)
    csv_params = mlio.CsvParams(default_data_type=mlio.DataType.STRING,
                                header_row_index=None,
                                allow_quoted_new_lines=True)
    return mlio.CsvReader(data_reader_params=data_reader_params,
                          csv_params=csv_params)
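A short sketch of how the returned reader might be consumed, assuming the _get_data helper (not shown here) accepts a local CSV path; the file name is hypothetical:

# Hypothetical local CSV file; a SageMaker channel or in-memory buffer would
# also work, depending on what _get_data() supports.
reader = _get_reader("train_data.csv", batch_size=1000)

example = reader.peek_example()
if example is not None:
    # header_row_index=None means the first row is treated as data, and every
    # column is read back as a string feature (default_data_type=STRING).
    print(len(example.schema.descriptors), "columns")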
Example #4
    def pipe_iterator(self, fifo_id=0):
        fifo_id = self.increment()
        print(f"opening pipe iterator {self.path}:{fifo_id}")
        pipe = mlio.SageMakerPipe(self.path, fifo_id=fifo_id)
        reader_params = mlio.DataReaderParams(dataset=[pipe],
                                              **self.reader_params)
        reader = mlio.RecordIOProtobufReader(reader_params)
        return reader
Example #5
def test_csv_params():
    filename = os.path.join(resources_dir, 'test.csv')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset, batch_size=1)
    csv_prm = mlio.CsvParams(header_row_index=None)
    reader = mlio.CsvReader(rdr_prm, csv_prm)

    example = reader.read_example()
    record = [as_numpy(feature) for feature in example]
    assert np.all(np.array(record).squeeze() == np.array([1, 0, 0, 0]))

    reader2 = mlio.CsvReader(rdr_prm, csv_prm)
    assert reader2.peek_example()
Example #6
    async def __predict(self):
        if self.params.ml_lib == 'snap':
            from pai4sk import BoostingMachine as Booster
        else:
            from sklearn.tree import DecisionTreeRegressor
        chunk_size = self.params.chunk_size  # getattr(self.params, "chunk_size")
        dataset = mlio.list_files(getattr(self.params, "dataset_test_path"),
                                  pattern='*.csv')
        logging.debug('mlio dataset={}'.format(dataset))
        reader_params = mlio.DataReaderParams(
            dataset=dataset,
            batch_size=chunk_size,
            num_prefetched_batches=self.params.num_prefetched_chunks)
        reader = mlio.CsvReader(reader_params)
        logging.debug('mlio reader={}'.format(reader))

        logging.debug('starting inference')
        score_norm = 0.0
        score = 0.0
        # preamble: read and preprocess the first chunk
        chunkim1 = reader.read_example()
        if chunkim1 is not None:
            X_im1, y_im1 = await self.__preprocess_chunk(chunkim1)
        chunki = reader.read_example()
        i = 1
        logging.debug('chunk{}={}'.format(0, chunkim1))
        logging.debug('chunk{}={}'.format(i, chunki))
        while chunki is not None:
            logging.debug('chunk{}={}'.format(i, chunki))
            # Overlap work: predict on the previous chunk while the current
            # chunk is being preprocessed.
            task_predict = asyncio.create_task(
                self.__predict_chunk(X_im1, y_im1))
            task_preprocess = asyncio.create_task(
                self.__preprocess_chunk(chunki))
            X_i, y_i = await task_preprocess
            s, n = await task_predict
            score += s
            score_norm += n
            X_im1 = X_i
            y_im1 = y_i
            chunkim1 = chunki
            chunki = reader.read_example()
            i += 1
        # postamble: predict on the final chunk
        if chunkim1 is not None:
            logging.debug('y{}m1={}'.format(i, y_im1))
            s, n = await self.__predict_chunk(X_im1, y_im1)
            score += s
            score_norm += n
        score /= score_norm
        return score
Example #7
def test_recordio_protobuf_reader_params():
    filename = os.path.join(resources_dir, 'test.pbr')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset, batch_size=1)
    reader = mlio.RecordIOProtobufReader(rdr_prm)

    example = reader.read_example()
    record = [as_numpy(feature) for feature in example]
    assert record[0].squeeze() == np.array(1)
    assert np.all(record[1].squeeze() == np.array([0, 0, 0]))

    # Parameters should be reusable
    reader2 = mlio.RecordIOProtobufReader(rdr_prm)
    assert reader2.peek_example()
Example #8
def test_image_reader_jpeg_no_resize():
    filename = os.path.join(resources_dir, 'test_image_0.jpg')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset, batch_size=1)
    img_prm = mlio.ImageReaderParams(img_frame=mlio.ImageFrame.NONE,
                                     image_dimensions=[3, 50, 50],
                                     to_rgb=1)

    reader = mlio.ImageReader(rdr_prm, img_prm)
    example = reader.read_example()
    tensor = example['value']

    assert tensor.shape == (1, 50, 50, 3)
    assert tensor.strides == (7500, 150, 3, 1)
Example #9
    async def __train_old(self):
        chunk_size = self.params.chunk_size  # getattr(self.params, "chunk_size")
        dataset = mlio.list_files(getattr(self.params, "dataset_path"),
                                  pattern='*.csv')
        logging.debug('mlio dataset={}'.format(dataset))
        preproc_fn = self.params.preproc_fn
        reader_params = mlio.DataReaderParams(
            dataset=dataset,
            batch_size=chunk_size,
            num_prefetched_batches=self.params.num_prefetched_chunks)
        reader = mlio.CsvReader(reader_params)
        logging.debug('mlio reader={}'.format(reader))
        num_epochs = self.params.num_epochs  # Number of times to read the full dataset.
        # use eta parameter
        eta = 0.01
        if self.params.ml_lib == 'snap':
            eta = 0.1
            from pai4sk import BoostingMachine as Booster
        else:
            from sklearn.tree import DecisionTreeRegressor

        logging.debug('starting training')
        models = []
        # The CSV reader is simply an iterator over mini-batches of data.
        for chunk_idx, chunk in enumerate(reader):
            rand_state = self.params.rand_state
            # Transform the mini-batch into a NumPy array.
            chunk_train_Xy = np.column_stack(
                [as_numpy(feature) for feature in chunk])
            chunk_train_X, chunk_train_y = preproc_fn(
                chunk_train_Xy, self.params.label_col_idx)
            #print(chunk_train_X)
            if self.params.ml_lib == 'snap':
                bl = Booster(**self.params.ml_opts_dict)
                bl.fit(chunk_train_X, chunk_train_y)
                models.append(bl)
            else:
                z_train = np.zeros(chunk_train_X.shape[0])
                for epoch in range(num_epochs):
                    #logging.debug('chunk idx={} chunk={}'.format(chunk_idx, chunk))

                    target = chunk_train_y - z_train
                    bl = DecisionTreeRegressor(max_depth=3,
                                               max_features='sqrt',
                                               random_state=rand_state)
                    bl.fit(chunk_train_X, target)
                    u_train = bl.predict(chunk_train_X)
                    z_train = z_train + eta * u_train
                    models.append(bl)
        return models
Example #10
def get_recordio_protobuf_dmatrix(path, is_pipe=False):
    """Get Data Matrix from recordio-protobuf data.

    :param path: Path where recordio-protobuf formatted training data resides, either directory, file, or SageMaker pipe
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix or None
    """
    try:
        if is_pipe:
            pipes_path = path if isinstance(path, list) else [path]
            dataset = [
                mlio.SageMakerPipe(pipe_path) for pipe_path in pipes_path
            ]
        else:
            dataset = mlio.list_files(path)

        reader_params = mlio.DataReaderParams(dataset=dataset,
                                              batch_size=BATCH_SIZE)
        reader = mlio.RecordIOProtobufReader(reader_params)

        if reader.peek_example() is not None:
            # recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
            is_dense_tensor = type(
                reader.peek_example()['values']) is mlio.DenseTensor

            all_features = []
            all_labels = []
            for example in reader:
                features = as_numpy(
                    example['values']) if is_dense_tensor else to_coo_matrix(
                        example['values'])
                all_features.append(features)

                labels = as_numpy(example['label_values'])
                all_labels.append(labels)

            all_features = np.vstack(
                all_features) if is_dense_tensor else scipy_vstack(
                    all_features).tocsr()
            all_labels = np.concatenate(all_labels, axis=None)
            dmatrix = xgb.DMatrix(all_features, label=all_labels)
            return dmatrix
        else:
            return None

    except Exception as e:
        raise exc.UserError(
            "Failed to load recordio-protobuf data with exception:\n{}".format(
                e))
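A hedged usage sketch for the loader above; the channel path and the training call are illustrative only, not taken from the original source:

import xgboost as xgb

# File mode: 'path' points at a directory of RecordIO-Protobuf files
# (hypothetical SageMaker-style path).
dmatrix = get_recordio_protobuf_dmatrix("/opt/ml/input/data/train")

# Pipe mode would pass the pipe prefix instead:
# dmatrix = get_recordio_protobuf_dmatrix("/opt/ml/input/data/train", is_pipe=True)

if dmatrix is not None:
    booster = xgb.train({"objective": "reg:squarederror"}, dmatrix, num_boost_round=10)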
Example #11
def test_image_reader_recordio():
    filename = os.path.join(resources_dir, 'test_image_0.rec')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset, batch_size=1)
    img_prm = mlio.ImageReaderParams(img_frame=mlio.ImageFrame.RECORDIO,
                                     resize=100,
                                     image_dimensions=[3, 100, 100],
                                     to_rgb=1)

    reader = mlio.ImageReader(rdr_prm, img_prm)
    example = reader.read_example()
    tensor = example['value']

    assert tensor.shape == (1, 100, 100, 3)
    assert tensor.strides == (30000, 300, 3, 1)
Example #12
def test_csv_nonutf_encoding_with_encoding_param():
    filename = os.path.join(resources_dir, 'test_iso8859_5.csv')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset,
                                    batch_size=2)
    csv_params = mlio.CsvParams(encoding='ISO-8859-5')

    reader = mlio.CsvReader(rdr_prm, csv_params)
    example = reader.read_example()
    nonutf_feature = example['col_3']

    try:
        as_numpy(nonutf_feature)
    except SystemError:
        pytest.fail("Unexpected exception thrown")
Example #13
def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights):
    """Get Data Matrix from CSV data in pipe mode.

    :param pipe_path: SageMaker pipe path where CSV formatted training data is piped
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :return: xgb.DMatrix or None
    """
    try:
        pipes_path = pipe_path if isinstance(pipe_path, list) else [pipe_path]
        dataset = [mlio.SageMakerPipe(path) for path in pipes_path]
        reader_params = mlio.DataReaderParams(dataset=dataset,
                                              batch_size=BATCH_SIZE)
        csv_params = mlio.CsvParams(header_row_index=None)
        reader = mlio.CsvReader(reader_params, csv_params)

        # Check if data is present in reader
        if reader.peek_example() is not None:
            examples = []
            for example in reader:
                # Write each feature (column) of example into a single numpy array
                tmp = [as_numpy(feature).squeeze() for feature in example]
                tmp = np.array(tmp)
                if len(tmp.shape) > 1:
                    # Columns are written as rows, needs to be transposed
                    tmp = tmp.T
                else:
                    # If tmp is a 1-D array, it needs to be reshaped as a matrix
                    tmp = np.reshape(tmp, (1, tmp.shape[0]))
                examples.append(tmp)

            data = np.vstack(examples)
            del examples

            if csv_weights == 1:
                dmatrix = xgb.DMatrix(data[:, 2:],
                                      label=data[:, 0],
                                      weight=data[:, 1])
            else:
                dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])

            return dmatrix
        else:
            return None

    except Exception as e:
        raise exc.UserError(
            "Failed to load csv data with exception:\n{}".format(e))
Example #14
    async def __train(self):
        chunk_size = self.params.chunk_size  # getattr(self.params, "chunk_size")
        dataset = mlio.list_files(getattr(self.params, "dataset_path"),
                                  pattern='*.csv')
        logging.debug('mlio dataset={}'.format(dataset))
        reader_params = mlio.DataReaderParams(
            dataset=dataset,
            batch_size=chunk_size,
            num_prefetched_batches=self.params.num_prefetched_chunks)
        reader = mlio.CsvReader(reader_params)
        logging.debug('mlio reader={}'.format(reader))
        num_epochs = self.params.num_epochs  # Number of times to read the full dataset.
        # use eta parameter
        eta = 0.01
        if self.params.ml_lib == 'snap':
            eta = 0.1
            from pai4sk import BoostingMachine as Booster
        else:
            from sklearn.tree import DecisionTreeRegressor

        logging.debug('starting training')
        models = []
        # preamble: read and preprocess the first chunk
        chunkim1 = reader.read_example()
        if chunkim1 is not None:
            X_im1, y_im1 = await self.__preprocess_chunk(chunkim1)
        chunki = reader.read_example()
        i = 1
        logging.debug('chunk{}={}'.format(0, chunkim1))
        logging.debug('chunk{}={}'.format(i, chunki))
        while chunki is not None:
            logging.debug('chunk{}={}'.format(i, chunki))
            # Overlap work: train on the previous chunk while the current
            # chunk is being preprocessed.
            task_preprocess = asyncio.create_task(
                self.__preprocess_chunk(chunki))
            task_train = asyncio.create_task(self.__train_chunk(X_im1, y_im1))
            X_i, y_i = await task_preprocess
            models.extend(await task_train)
            X_im1 = X_i
            y_im1 = y_i
            chunkim1 = chunki
            chunki = reader.read_example()
            i += 1
        # postamble: train on the final chunk
        if chunkim1 is not None:
            logging.debug('y{}m1={}'.format(i, y_im1))
            models.extend(await self.__train_chunk(X_im1, y_im1))
        return models
Example #15
def test_data_reader_params_members():
    filename = os.path.join(resources_dir, 'test.pbr')
    dataset = [mlio.File(filename)]
    rdr_prm = mlio.DataReaderParams(dataset=dataset, batch_size=1)

    assert rdr_prm.dataset == dataset
    assert rdr_prm.batch_size == 1
    assert rdr_prm.num_prefetched_batches == 0
    assert rdr_prm.num_parallel_reads == 0
    assert rdr_prm.last_batch_handling == mlio.LastBatchHandling.NONE
    assert rdr_prm.bad_batch_handling == mlio.BadBatchHandling.ERROR
    assert rdr_prm.num_instances_to_skip == 0
    assert rdr_prm.num_instances_to_read is None
    assert rdr_prm.shard_index == 0
    assert rdr_prm.num_shards == 0
    assert rdr_prm.shuffle_instances is False
    assert rdr_prm.shuffle_window == 0
    assert rdr_prm.shuffle_seed is None
    assert rdr_prm.reshuffle_each_epoch is True
    assert rdr_prm.subsample_ratio is None

    rdr_prm.batch_size = 2
    assert rdr_prm.batch_size == 2