def test_regression_split(reg_dataset):
    """Check the index split produced by ``_split`` on ``reg_dataset``.

    Two splits are exercised: the default fractions and an explicit
    0.5 / 0.3 / 0.2 split. For each one the test checks that the train
    indices are a ``numpy.ndarray``, that the three index arrays have the
    expected lengths, and that the mean of the last feature column of
    every subset lies strictly between 45 and 55.
    """
    # NOTE(review): a function with this same name is defined again later
    # in this file; the later definition replaces this one in the module
    # namespace.
    splitter = StratifiedSplitter()

    # Default fractions: expected sizes are 80 / 10 / 10.
    idx_train, idx_valid, idx_test = splitter._split(reg_dataset)
    assert type(idx_train) is numpy.ndarray
    sizes = tuple(idx.shape[0] for idx in (idx_train, idx_valid, idx_test))
    assert sizes == (80, 10, 10)
    for indices in (idx_train, idx_valid, idx_test):
        subset = NumpyTupleDataset(*reg_dataset.features[indices])
        assert 45.0 < subset.features[:, -1].mean() < 55.0

    # Explicit fractions: expected sizes are 50 / 30 / 20.
    idx_train, idx_valid, idx_test = splitter._split(
        reg_dataset, frac_train=0.5, frac_valid=0.3, frac_test=0.2)
    assert type(idx_train) is numpy.ndarray
    sizes = tuple(idx.shape[0] for idx in (idx_train, idx_valid, idx_test))
    assert sizes == (50, 30, 20)
    for indices in (idx_train, idx_valid, idx_test):
        subset = NumpyTupleDataset(*reg_dataset.features[indices])
        assert 45.0 < subset.features[:, -1].mean() < 55.0
def test_classification_split_by_labels_ndarray(cls_dataset, cls_label):
    """Check ``_split`` on ``cls_dataset`` when ``labels`` is passed.

    Exercises the default fractions and an explicit 0.5 / 0.3 / 0.2 split.
    Each time the test checks the type of the train indices, the lengths
    of the three index arrays, and how many entries of the last feature
    column equal 1 in every subset.
    """
    # NOTE(review): a function with this same name is defined again later
    # in this file; the later definition replaces this one in the module
    # namespace.
    splitter = StratifiedSplitter()

    # Default fractions: sizes 24 / 3 / 3 with 8 / 1 / 1 ones.
    idx_train, idx_valid, idx_test = splitter._split(cls_dataset,
                                                     labels=cls_label)
    assert type(idx_train) is numpy.ndarray
    sizes = tuple(idx.shape[0] for idx in (idx_train, idx_valid, idx_test))
    assert sizes == (24, 3, 3)
    for indices, n_ones in ((idx_train, 8), (idx_valid, 1), (idx_test, 1)):
        subset = NumpyTupleDataset(*cls_dataset.features[indices])
        assert (subset.features[:, -1] == 1).sum() == n_ones

    # Explicit fractions: sizes 15 / 9 / 6 with 5 / 3 / 2 ones.
    idx_train, idx_valid, idx_test = splitter._split(
        cls_dataset, labels=cls_label,
        frac_train=0.5, frac_valid=0.3, frac_test=0.2)
    assert type(idx_train) is numpy.ndarray
    sizes = tuple(idx.shape[0] for idx in (idx_train, idx_valid, idx_test))
    assert sizes == (15, 9, 6)
    for indices, n_ones in ((idx_train, 5), (idx_valid, 3), (idx_test, 2)):
        subset = NumpyTupleDataset(*cls_dataset.features[indices])
        assert (subset.features[:, -1] == 1).sum() == n_ones
def test_split_fail_by_frac_ratio(cls_dataset):
    """Check that ``_split`` rejects the fractions 0.4 / 0.3 / 0.2.

    The three fractions passed here add up to 0.9; the test expects the
    call to raise ``AssertionError``.
    """
    # NOTE(review): a function with this same name is defined again later
    # in this file; the later definition replaces this one in the module
    # namespace.
    splitter = StratifiedSplitter()
    with pytest.raises(AssertionError):
        # The return value is deliberately not unpacked: the call itself
        # is expected to raise, so the result would never be used.
        splitter._split(cls_dataset,
                        frac_train=0.4,
                        frac_valid=0.3,
                        frac_test=0.2)
def test_regression_split(reg_dataset):
    """Verify ``StratifiedSplitter._split`` on the regression fixture.

    The default split must give index arrays of length 80 / 10 / 10 and
    the 0.5 / 0.3 / 0.2 split must give 50 / 30 / 20. In both cases the
    train indices must be a ``numpy.ndarray`` and the last feature column
    of each subset must average strictly between 45 and 55.
    """
    splitter = StratifiedSplitter()

    def last_column_mean(indices):
        # Rebuild a dataset from the selected rows and average its last
        # feature column.
        subset = NumpyTupleDataset(*reg_dataset.features[indices])
        return subset.features[:, -1].mean()

    # --- default fractions ---
    parts = splitter._split(reg_dataset)
    tr_idx, va_idx, te_idx = parts
    assert type(tr_idx) is numpy.ndarray
    assert tr_idx.shape[0] == 80
    assert va_idx.shape[0] == 10
    assert te_idx.shape[0] == 10
    tr_mean = last_column_mean(tr_idx)
    va_mean = last_column_mean(va_idx)
    te_mean = last_column_mean(te_idx)
    assert 45.0 < tr_mean < 55.0
    assert 45.0 < va_mean < 55.0
    assert 45.0 < te_mean < 55.0

    # --- explicit fractions ---
    parts = splitter._split(reg_dataset,
                            frac_train=0.5,
                            frac_valid=0.3,
                            frac_test=0.2)
    tr_idx, va_idx, te_idx = parts
    assert type(tr_idx) is numpy.ndarray
    assert tr_idx.shape[0] == 50
    assert va_idx.shape[0] == 30
    assert te_idx.shape[0] == 20
    tr_mean = last_column_mean(tr_idx)
    va_mean = last_column_mean(va_idx)
    te_mean = last_column_mean(te_idx)
    assert 45.0 < tr_mean < 55.0
    assert 45.0 < va_mean < 55.0
    assert 45.0 < te_mean < 55.0
def test_classification_split_by_labels_ndarray(cls_dataset, cls_label):
    """Verify ``_split`` on the classification fixture with ``labels``.

    With default fractions the index arrays must have lengths 24 / 3 / 3
    and the subsets must hold 8 / 1 / 1 entries equal to 1 in their last
    feature column. With fractions 0.5 / 0.3 / 0.2 the lengths must be
    15 / 9 / 6 and the counts 5 / 3 / 2. The train indices must be a
    ``numpy.ndarray`` in both cases.
    """
    splitter = StratifiedSplitter()

    def count_ones(indices):
        # Rebuild a dataset from the selected rows and count the entries
        # of its last feature column that equal 1.
        subset = NumpyTupleDataset(*cls_dataset.features[indices])
        return (subset.features[:, -1] == 1).sum()

    # --- default fractions ---
    tr_idx, va_idx, te_idx = splitter._split(cls_dataset, labels=cls_label)
    assert type(tr_idx) is numpy.ndarray
    assert tr_idx.shape[0] == 24
    assert va_idx.shape[0] == 3
    assert te_idx.shape[0] == 3
    tr_ones = count_ones(tr_idx)
    va_ones = count_ones(va_idx)
    te_ones = count_ones(te_idx)
    assert tr_ones == 8
    assert va_ones == 1
    assert te_ones == 1

    # --- explicit fractions ---
    tr_idx, va_idx, te_idx = splitter._split(cls_dataset,
                                             labels=cls_label,
                                             frac_train=0.5,
                                             frac_valid=0.3,
                                             frac_test=0.2)
    assert type(tr_idx) is numpy.ndarray
    assert tr_idx.shape[0] == 15
    assert va_idx.shape[0] == 9
    assert te_idx.shape[0] == 6
    tr_ones = count_ones(tr_idx)
    va_ones = count_ones(va_idx)
    te_ones = count_ones(te_idx)
    assert tr_ones == 5
    assert va_ones == 3
    assert te_ones == 2
def test_split_fail_by_frac_ratio(cls_dataset):
    """Expect ``AssertionError`` when the split fractions sum to 0.9.

    ``_split`` is called with ``frac_train=0.4``, ``frac_valid=0.3`` and
    ``frac_test=0.2``.
    """
    splitter = StratifiedSplitter()
    with pytest.raises(AssertionError):
        # No unpacking of the result: the call is expected to raise, so
        # binding train/valid/test names here would only create unused
        # locals.
        splitter._split(cls_dataset,
                        frac_train=0.4,
                        frac_valid=0.3,
                        frac_test=0.2)
def test_split_fail_by_invalid_task_type(cls_dataset):
    """Check that ``_split`` raises ``ValueError`` for ``task_type='mix'``.

    The fractions 0.5 / 0.3 / 0.2 are passed so that only the task type
    is unusual in this call.
    """
    # NOTE(review): a function with this same name is defined again later
    # in this file; the later definition replaces this one in the module
    # namespace.
    splitter = StratifiedSplitter()
    with pytest.raises(ValueError):
        # The result is intentionally not unpacked inside the ``raises``
        # block: tuple unpacking can itself raise ValueError, which would
        # let the test pass even if ``_split`` accepted the task type.
        splitter._split(cls_dataset,
                        frac_train=0.5,
                        frac_valid=0.3,
                        frac_test=0.2,
                        task_type='mix')
def test_split_fail_by_invalid_task_type(cls_dataset):
    """Expect ``ValueError`` when ``_split`` receives ``task_type='mix'``.

    The call also passes the fractions 0.5 / 0.3 / 0.2.
    """
    splitter = StratifiedSplitter()
    with pytest.raises(ValueError):
        # Do not unpack the return value here. Unpacking a value of the
        # wrong length raises ValueError too, so it could satisfy
        # ``pytest.raises`` without the task-type check ever firing.
        splitter._split(cls_dataset,
                        frac_train=0.5,
                        frac_valid=0.3,
                        frac_test=0.2,
                        task_type='mix')
def test_regression_split_fix_seed(reg_dataset):
    """Check that two ``_split`` calls with ``seed=44`` agree.

    The train, valid and test index arrays of the first call must be
    equal, element for element, to those of the second call.
    """
    # NOTE(review): a function with this same name is defined again later
    # in this file; the later definition replaces this one in the module
    # namespace.
    splitter = StratifiedSplitter()
    first = splitter._split(reg_dataset, seed=44)
    second = splitter._split(reg_dataset, seed=44)

    # Unpack so that a result of the wrong length still fails the test.
    first_train, first_valid, first_test = first
    second_train, second_valid, second_test = second

    pairs = ((first_train, second_train),
             (first_valid, second_valid),
             (first_test, second_test))
    for lhs, rhs in pairs:
        assert numpy.array_equal(lhs, rhs)
def test_regression_split_fix_seed(reg_dataset):
    """Verify that ``_split`` is reproducible for a fixed ``seed``.

    ``_split`` is called twice on ``reg_dataset`` with ``seed=44`` and
    each of the three returned index arrays is compared between calls.
    """
    splitter = StratifiedSplitter()
    tr_a, va_a, te_a = splitter._split(reg_dataset, seed=44)
    tr_b, va_b, te_b = splitter._split(reg_dataset, seed=44)

    same_train = numpy.array_equal(tr_a, tr_b)
    assert same_train
    same_valid = numpy.array_equal(va_a, va_b)
    assert same_valid
    same_test = numpy.array_equal(te_a, te_b)
    assert same_test
def test_train_valid_regression_split_return_dataset(reg_dataset):
    """Check ``train_valid_split`` with ``return_index=False``.

    Both returned objects must be exactly ``NumpyTupleDataset`` instances
    of length 90 and 10, and the mean of the last feature column of each
    must lie strictly between 45 and 55.
    """
    # NOTE(review): a function with this same name is defined again later
    # in this file; the later definition replaces this one in the module
    # namespace.
    splitter = StratifiedSplitter()
    train, valid = splitter.train_valid_split(reg_dataset,
                                              return_index=False)

    for subset in (train, valid):
        assert type(subset) is NumpyTupleDataset
    assert (len(train), len(valid)) == (90, 10)
    for subset in (train, valid):
        assert 45.0 < subset.features[:, -1].mean() < 55.0
def test_train_valid_classification_split_return_dataset(cls_dataset):
    """Check ``train_valid_split`` on ``cls_dataset`` returning datasets.

    With ``return_index=False`` both results must be exactly
    ``NumpyTupleDataset`` instances of length 27 and 3, holding 9 and 1
    entries equal to 1 in their last feature column.
    """
    # NOTE(review): a function with this same name is defined again later
    # in this file; the later definition replaces this one in the module
    # namespace.
    splitter = StratifiedSplitter()
    train, valid = splitter.train_valid_split(cls_dataset,
                                              return_index=False)

    for subset in (train, valid):
        assert type(subset) is NumpyTupleDataset
    assert (len(train), len(valid)) == (27, 3)
    for subset, n_ones in ((train, 9), (valid, 1)):
        assert (subset.features[:, -1] == 1).sum() == n_ones
def test_train_valid_classification_split_return_dataset(cls_dataset):
    """Verify the datasets returned by ``train_valid_split``.

    The split is requested with ``return_index=False``. The train part
    must be a ``NumpyTupleDataset`` of length 27 with 9 entries equal to 1
    in its last feature column; the valid part must be a
    ``NumpyTupleDataset`` of length 3 with 1 such entry.
    """
    splitter = StratifiedSplitter()
    result = splitter.train_valid_split(cls_dataset, return_index=False)
    train_set, valid_set = result

    assert type(train_set) is NumpyTupleDataset
    assert type(valid_set) is NumpyTupleDataset

    assert len(train_set) == 27
    assert len(valid_set) == 3

    train_ones = (train_set.features[:, -1] == 1).sum()
    valid_ones = (valid_set.features[:, -1] == 1).sum()
    assert train_ones == 9
    assert valid_ones == 1
def test_train_valid_regression_split_return_dataset(reg_dataset):
    """Verify the datasets returned by ``train_valid_split``.

    The split is requested with ``return_index=False``. The train part
    must be a ``NumpyTupleDataset`` of length 90 and the valid part one of
    length 10; the last feature column of each must average strictly
    between 45 and 55.
    """
    splitter = StratifiedSplitter()
    result = splitter.train_valid_split(reg_dataset, return_index=False)
    train_set, valid_set = result

    assert type(train_set) is NumpyTupleDataset
    assert type(valid_set) is NumpyTupleDataset

    assert len(train_set) == 90
    assert len(valid_set) == 10

    train_mean = train_set.features[:, -1].mean()
    valid_mean = valid_set.features[:, -1].mean()
    assert 45.0 < train_mean < 55.0
    assert 45.0 < valid_mean < 55.0
def test_train_valid_classification_split(cls_dataset):
    """Check the indices returned by ``train_valid_split`` by default.

    The train indices must be a ``numpy.ndarray``; the index arrays must
    have lengths 27 and 3; and the datasets rebuilt from those indices
    must hold 9 and 1 entries equal to 1 in their last feature column.
    """
    # NOTE(review): a function with this same name is defined again later
    # in this file; the later definition replaces this one in the module
    # namespace.
    splitter = StratifiedSplitter()
    idx_train, idx_valid = splitter.train_valid_split(cls_dataset)
    assert type(idx_train) is numpy.ndarray
    assert (idx_train.shape[0], idx_valid.shape[0]) == (27, 3)

    for indices, n_ones in ((idx_train, 9), (idx_valid, 1)):
        subset = NumpyTupleDataset(*cls_dataset.features[indices])
        assert (subset.features[:, -1] == 1).sum() == n_ones
def test_train_valid_regression_split(reg_dataset):
    """Check the indices returned by ``train_valid_split`` by default.

    The train indices must be a ``numpy.ndarray``; the index arrays must
    have lengths 90 and 10; and the datasets rebuilt from those indices
    must have a last feature column whose mean lies strictly between 45
    and 55.
    """
    # NOTE(review): a function with this same name is defined again later
    # in this file; the later definition replaces this one in the module
    # namespace.
    splitter = StratifiedSplitter()
    idx_train, idx_valid = splitter.train_valid_split(reg_dataset)
    assert type(idx_train) is numpy.ndarray
    assert (idx_train.shape[0], idx_valid.shape[0]) == (90, 10)

    for indices in (idx_train, idx_valid):
        subset = NumpyTupleDataset(*reg_dataset.features[indices])
        assert 45.0 < subset.features[:, -1].mean() < 55.0
def test_train_valid_classification_split(cls_dataset):
    """Verify the default output of ``train_valid_split`` on ``cls_dataset``.

    The two results are treated as index arrays: the first must be a
    ``numpy.ndarray`` of length 27 and the second must have length 3.
    Datasets rebuilt from them must contain 9 and 1 entries equal to 1 in
    the last feature column.
    """
    splitter = StratifiedSplitter()
    tr_idx, va_idx = splitter.train_valid_split(cls_dataset)

    assert type(tr_idx) is numpy.ndarray
    assert tr_idx.shape[0] == 27
    assert va_idx.shape[0] == 3

    # Rebuild both subsets first, then inspect their last feature column.
    features = cls_dataset.features
    train_set = NumpyTupleDataset(*features[tr_idx])
    valid_set = NumpyTupleDataset(*features[va_idx])
    train_ones = (train_set.features[:, -1] == 1).sum()
    valid_ones = (valid_set.features[:, -1] == 1).sum()
    assert train_ones == 9
    assert valid_ones == 1
def test_train_valid_regression_split(reg_dataset):
    """Verify the default output of ``train_valid_split`` on ``reg_dataset``.

    The two results are treated as index arrays: the first must be a
    ``numpy.ndarray`` of length 90 and the second must have length 10.
    Datasets rebuilt from them must have a last feature column averaging
    strictly between 45 and 55.
    """
    splitter = StratifiedSplitter()
    tr_idx, va_idx = splitter.train_valid_split(reg_dataset)

    assert type(tr_idx) is numpy.ndarray
    assert tr_idx.shape[0] == 90
    assert va_idx.shape[0] == 10

    # Rebuild both subsets first, then inspect their last feature column.
    features = reg_dataset.features
    train_set = NumpyTupleDataset(*features[tr_idx])
    valid_set = NumpyTupleDataset(*features[va_idx])
    train_mean = train_set.features[:, -1].mean()
    valid_mean = valid_set.features[:, -1].mean()
    assert 45.0 < train_mean < 55.0
    assert 45.0 < valid_mean < 55.0
def test_train_valid_classification_split_ndarray_return_dataset(
        cls_ndarray_dataset):
    """Check ``train_valid_split`` on the ``cls_ndarray_dataset`` fixture.

    The fixture is unpacked into a dataset and its labels. With
    ``return_index=False`` both results must be exactly ``numpy.ndarray``
    objects of length 27 and 3, containing 9 and 1 elements equal to 1.
    """
    # NOTE(review): a function with this same name is defined again later
    # in this file; the later definition replaces this one in the module
    # namespace.
    dataset, labels = cls_ndarray_dataset
    splitter = StratifiedSplitter()
    train, valid = splitter.train_valid_split(
        dataset, labels=labels, return_index=False)

    for part in (train, valid):
        assert type(part) is numpy.ndarray
    assert (len(train), len(valid)) == (27, 3)
    for part, n_ones in ((train, 9), (valid, 1)):
        assert (part == 1).sum() == n_ones
def test_train_valid_classification_split_ndarray_return_dataset(
        cls_ndarray_dataset):
    """Verify ``train_valid_split`` on a (dataset, labels) fixture pair.

    The split is requested with the labels passed explicitly and
    ``return_index=False``. The train result must be a ``numpy.ndarray``
    of length 27 with 9 elements equal to 1; the valid result must be a
    ``numpy.ndarray`` of length 3 with 1 element equal to 1.
    """
    data, label = cls_ndarray_dataset
    splitter = StratifiedSplitter()
    split_kwargs = dict(labels=label, return_index=False)
    train_part, valid_part = splitter.train_valid_split(data, **split_kwargs)

    assert type(train_part) is numpy.ndarray
    assert type(valid_part) is numpy.ndarray

    assert len(train_part) == 27
    assert len(valid_part) == 3

    train_ones = (train_part == 1).sum()
    valid_ones = (valid_part == 1).sum()
    assert train_ones == 9
    assert valid_ones == 1
def test_train_valid_test_cls_split_by_labels_return_dataset(
        cls_dataset, cls_label):
    """Check ``train_valid_test_split`` with labels, returning datasets.

    With ``return_index=False`` the three results must be exactly
    ``NumpyTupleDataset`` instances of length 24 / 3 / 3, holding
    8 / 1 / 1 entries equal to 1 in their last feature column.
    """
    # NOTE(review): a function with this same name is defined again later
    # in this file; the later definition replaces this one in the module
    # namespace.
    splitter = StratifiedSplitter()
    train, valid, test = splitter.train_valid_test_split(
        cls_dataset, labels=cls_label, return_index=False)

    for subset in (train, valid, test):
        assert type(subset) is NumpyTupleDataset
    assert (len(train), len(valid), len(test)) == (24, 3, 3)
    for subset, n_ones in ((train, 8), (valid, 1), (test, 1)):
        assert (subset.features[:, -1] == 1).sum() == n_ones
def test_train_valid_test_cls_split_by_labels_return_dataset(cls_dataset,
                                                             cls_label):
    """Verify the three datasets from ``train_valid_test_split``.

    The split is requested with ``labels=cls_label`` and
    ``return_index=False``. Each result must be a ``NumpyTupleDataset``;
    the lengths must be 24, 3 and 3; and the last feature column must
    contain 8, 1 and 1 entries equal to 1 respectively.
    """
    splitter = StratifiedSplitter()
    split_kwargs = dict(labels=cls_label, return_index=False)
    parts = splitter.train_valid_test_split(cls_dataset, **split_kwargs)
    train_set, valid_set, test_set = parts

    assert type(train_set) is NumpyTupleDataset
    assert type(valid_set) is NumpyTupleDataset
    assert type(test_set) is NumpyTupleDataset

    assert len(train_set) == 24
    assert len(valid_set) == 3
    assert len(test_set) == 3

    train_ones = (train_set.features[:, -1] == 1).sum()
    valid_ones = (valid_set.features[:, -1] == 1).sum()
    test_ones = (test_set.features[:, -1] == 1).sum()
    assert train_ones == 8
    assert valid_ones == 1
    assert test_ones == 1