def test_seq_dataset_shuffle(): dataset1 = ArrayDataset(X, y, sample_weight, seed=42) dataset2 = CSRDataset(X_csr.data, X_csr.indptr, X_csr.indices, y, sample_weight, seed=42) # not shuffled for i in range(5): _, _, _, idx1 = dataset1._next_py() _, _, _, idx2 = dataset2._next_py() assert_equal(idx1, i) assert_equal(idx2, i) for i in range(5): _, _, _, idx1 = dataset1._random_py() _, _, _, idx2 = dataset2._random_py() assert_equal(idx1, idx2) seed = 77 dataset1._shuffle_py(seed) dataset2._shuffle_py(seed) for i in range(5): _, _, _, idx1 = dataset1._next_py() _, _, _, idx2 = dataset2._next_py() assert_equal(idx1, idx2) _, _, _, idx1 = dataset1._random_py() _, _, _, idx2 = dataset2._random_py() assert_equal(idx1, idx2)
def test_seq_dataset(): dataset1 = ArrayDataset(X, y, sample_weight, seed=42) dataset2 = CSRDataset(X_csr.data, X_csr.indptr, X_csr.indices, y, sample_weight, seed=42) for dataset in (dataset1, dataset2): for i in range(5): # next sample xi_, yi, swi, idx = dataset._next_py() xi = sp.csr_matrix((xi_), shape=(1, X.shape[1])) assert_array_equal(xi.data, X_csr[idx].data) assert_array_equal(xi.indices, X_csr[idx].indices) assert_array_equal(xi.indptr, X_csr[idx].indptr) assert_equal(yi, y[idx]) assert_equal(swi, sample_weight[idx]) # random sample xi_, yi, swi, idx = dataset._random_py() xi = sp.csr_matrix((xi_), shape=(1, X.shape[1])) assert_array_equal(xi.data, X_csr[idx].data) assert_array_equal(xi.indices, X_csr[idx].indices) assert_array_equal(xi.indptr, X_csr[idx].indptr) assert_equal(yi, y[idx]) assert_equal(swi, sample_weight[idx])
def make_dataset(X, y, sample_weight, random_state=None): """Create ``Dataset`` abstraction for sparse and dense inputs. This also returns the ``intercept_decay`` which is different for sparse datasets. """ rng = check_random_state(random_state) # seed should never be 0 in SequentialDataset seed = rng.randint(1, np.iinfo(np.int32).max) if sp.issparse(X): dataset = CSRDataset(X.data, X.indptr, X.indices, y, sample_weight, seed=seed) intercept_decay = SPARSE_INTERCEPT_DECAY else: dataset = ArrayDataset(X, y, sample_weight, seed=seed) intercept_decay = 1.0 return dataset, intercept_decay