def test_lots_of_data_with_multiple_backend(self, repo_300_filled_samples):
    """A batched numpy dataset over a 300-sample repo yields (10, 5, 7) arrays.

    With ``drop_last=True`` and batch_size 10 every emitted batch is a full
    numpy array of exactly ten samples.
    """
    checkout = repo_300_filled_samples.checkout()
    column = checkout.columns['aset']
    dataset = make_numpy_dataset([column], batch_size=10, drop_last=True)
    for batch in dataset:
        assert isinstance(batch, np.ndarray)
        assert batch.shape == (10, 5, 7)
    checkout.close()
def test_nested_column(self, repo_20_filled_subsamples):
    """Nested (subsampled) columns are yielded as dicts; batching wraps them.

    Unbatched iteration produces one subsample-dict per column; batched
    iteration produces tuples of such dicts, one per sample in the batch.
    """
    checkout = repo_20_filled_subsamples.checkout()
    first = checkout['writtenaset']
    second = checkout['second_aset']

    # Unbatched: each element is a dict of subsamples, keys aligned
    # across the two columns.
    for d1, d2 in make_numpy_dataset([first, second]):
        assert isinstance(d1, dict)
        assert isinstance(d2, dict)
        assert tuple(d1.keys()) == tuple(d2.keys())

    # batch_size=1: each element is a length-1 tuple holding one dict.
    for d1, d2 in make_numpy_dataset([first, second], batch_size=1,
                                     drop_last=True):
        assert type(d1) is type(d2) is tuple
        assert len(d1) == len(d2) == 1
        assert tuple(d1[0].keys()) == tuple(d2[0].keys())

    # batch_size=2: two dicts per batch.
    for d1, d2 in make_numpy_dataset([first, second], batch_size=2,
                                     drop_last=True):
        assert len(d1) == len(d2) == 2

    checkout.close()
def test_shuffle(self, repo_20_filled_samples):
    """``shuffle=False`` preserves the given key order; ``shuffle=True`` permutes it.

    Sample content encodes its key (the first scalar of each sample equals
    the integer key), so the iteration order can be read back directly.
    """
    checkout = repo_20_filled_samples.checkout()
    column = checkout.columns['writtenaset']
    keys = [str(i) for i in range(15)]
    expected_order = list(range(15))

    ordered = make_numpy_dataset((column,), keys=keys, shuffle=False)
    received_order = [int(sample[0][0]) for sample in ordered]
    assert received_order == expected_order

    shuffled = make_numpy_dataset((column,), keys=keys, shuffle=True)
    shuffled_order = [int(sample[0][0]) for sample in shuffled]
    # NOTE(review): could spuriously fail if the random permutation happens
    # to equal the identity (probability 1/15!).
    assert shuffled_order != expected_order

    checkout.close()
def test_multiple_dataset_batched_loader(self, repo_20_filled_samples):
    """Batching over two columns: batch shapes and ``drop_last`` semantics.

    Checks that a non-dividing batch size drops the remainder, batch_size=1
    keeps every sample, drop_last without batching raises, and batch_size=0
    disables batching entirely.
    """
    checkout = repo_20_filled_samples.checkout()
    first = checkout.columns['writtenaset']
    second = checkout.columns['second_aset']

    # 20 samples with batch_size=6: drop_last discards the trailing 2.
    loader = make_numpy_dataset([first, second], batch_size=6, drop_last=True)
    seen = 0
    for batch1, batch2 in loader:
        seen += batch1.shape[0]
        assert batch1.shape == (6, 5, 7)
        assert batch2.shape == (6, 5, 7)
    assert seen == 18

    # batch_size=1: drop_last has nothing to discard.
    loader = make_numpy_dataset([first, second], batch_size=1, drop_last=True)
    seen = 0
    for batch1, batch2 in loader:
        seen += batch1.shape[0]
        assert batch1.shape == (1, 5, 7)
        assert batch2.shape == (1, 5, 7)
    assert seen == 20

    # drop_last without batching is rejected outright.
    with pytest.raises(RuntimeError, match="Setting `drop_last` is a no-op when "
                                           "batching is not enabled"):
        make_numpy_dataset([first, second], batch_size=0, drop_last=True)

    # batch_size=0 disables batching: samples come back without a batch axis.
    loader = make_numpy_dataset([first, second], batch_size=0)
    seen = 0
    for sample1, sample2 in loader:
        seen += 1
        assert sample1.shape == (5, 7)
        assert sample2.shape == (5, 7)
    assert seen == 20

    checkout.close()
def test_collate_fn(self, repo_20_filled_subsamples):
    """Default collation vs a user-supplied ``collate_fn`` on mixed key specs.

    A ``(sample, ...)`` key pulls the whole subsample dict from the first
    column while ``(sample, subsample)`` pulls a single array from the
    second; the custom collate function then stacks one chosen subsample
    per dict into a plain array batch.
    """
    checkout = repo_20_filled_subsamples.checkout()
    nested = checkout['writtenaset']
    flat = checkout['second_aset']
    keys = (((0, ...), (0, 1)), ((1, ...), (1, 4)))

    # Default collation: dicts stay as a tuple, arrays get stacked.
    dataset = make_numpy_dataset([nested, flat], keys=keys, shuffle=False,
                                 batch_size=2)
    batch1, batch2 = next(iter(dataset))
    assert isinstance(batch1, tuple)
    assert isinstance(batch2, np.ndarray)
    assert list(batch1[0].keys()) == [1, 2, 3]
    assert list(batch1[1].keys()) == [4, 5, 6]
    assert np.allclose(batch2, np.stack((flat[0][1], flat[1][4])))

    def collate_fn(data_arr):
        # For each element pick one arbitrary subsample (the third key of
        # the dict) and keep the plain array from the second column.
        picked = []
        plain = []
        for elem in data_arr:
            key = list(elem[0].keys())[2]
            picked.append(elem[0][key])
            plain.append(elem[1])
        return np.stack(picked), np.stack(plain)

    dataset = make_numpy_dataset([nested, flat], keys=keys, shuffle=False,
                                 batch_size=2, collate_fn=collate_fn)
    batch1, batch2 = next(iter(dataset))
    assert np.allclose(batch1, np.stack((nested[0][3], nested[1][6])))
    assert np.allclose(batch2, np.stack((flat[0][1], flat[1][4])))

    checkout.close()