def test_multiple_dataset_loader(self, repo_with_20_samples):
    """Argument validation, size-mismatch warning, and batch shapes for two arraysets.

    One sample is deleted from ``second_aset`` first so the two arraysets
    have unequal lengths (20 vs 19).
    """
    repo = repo_with_20_samples
    write_co = repo.checkout(write=True)
    del write_co.arraysets['second_aset']['10']
    write_co.commit('deleting')
    write_co.close()

    read_co = repo.checkout()
    first_aset = read_co.arraysets['writtenaset']
    second_aset = read_co.arraysets['second_aset']

    # an empty dataset list is rejected outright
    with pytest.raises(ValueError):  # emtpy list
        make_torch_dataset([])
    # multiple datasets must be wrapped in a list/tuple, not passed as varargs
    with pytest.raises(TypeError):  # if more than one dataset, those should be in a list/tuple
        make_torch_dataset(first_aset, first_aset)

    with pytest.warns(UserWarning, match='Arraysets do not contain equal number of samples'):
        torch_dset = make_torch_dataset([first_aset, second_aset])

    loader = DataLoader(torch_dset, batch_size=6, drop_last=True)
    seen = 0
    for dset1, dset2 in loader:
        seen += dset1.shape[0]
        assert dset1.shape == (6, 5, 7)
        assert dset2.shape == (6, 5, 7)
    assert seen == 18  # drop last is True
    read_co.close()
def test_field_names(self, repo_with_20_samples):
    """Bad ``field_names`` arguments raise; good ones yield attribute access.

    NOTE(review): another test with this same name appears later in the
    file — if both live in the same class, the later definition shadows
    this one; confirm and rename if so.
    """
    repo = repo_with_20_samples
    co = repo.checkout()
    aset_a = co.arraysets['writtenaset']
    aset_b = co.arraysets['second_aset']

    # number of dsets and field_names are different
    with pytest.raises(ValueError):
        make_torch_dataset([aset_a, aset_b], field_names=('input',))
    # field_names's type is wrong
    with pytest.raises(TypeError):
        make_torch_dataset([aset_a, aset_b],
                           field_names={'input': '', 'target': ''})

    torch_dset = make_torch_dataset([aset_a, aset_b],
                                    field_names=('input', 'target'))
    assert hasattr(torch_dset[1], 'input')
    assert hasattr(torch_dset[1], 'target')

    # batch-level attribute access is only checked on newer torch releases
    if torch.__version__ > '1.0.1':
        for sample in DataLoader(torch_dset, batch_size=5):
            assert hasattr(sample, 'input')
            assert hasattr(sample, 'target')
    co.close()
def test_dataset_loader_fails_with_write_enabled_checkout(self, repo_with_20_samples):
    """Arraysets from a write-enabled checkout are rejected with TypeError."""
    repo = repo_with_20_samples
    co = repo.checkout(write=True)
    aset_a = co.arraysets['writtenaset']
    aset_b = co.arraysets['second_aset']
    with pytest.raises(TypeError):
        make_torch_dataset([aset_a, aset_b])
    co.close()
def test_warns_experimental(self, repo_with_20_samples):
    """Building a torch dataset emits the 'experimental' UserWarning."""
    repo = repo_with_20_samples
    co = repo.checkout()
    aset_a = co.arraysets['writtenaset']
    aset_b = co.arraysets['second_aset']
    with pytest.warns(UserWarning, match='Dataloaders are experimental'):
        make_torch_dataset([aset_a, aset_b])
    co.close()
def test_warns_arrayset_sample_size_mismatch(self, repo_with_20_samples):
    """Unequal arrayset lengths trigger the sample-size-mismatch UserWarning."""
    repo = repo_with_20_samples
    # remove one sample from the second arrayset so lengths differ
    write_co = repo.checkout(write=True)
    del write_co.arraysets['second_aset']['10']
    write_co.commit('deleting')
    write_co.close()

    read_co = repo.checkout()
    with pytest.warns(UserWarning, match='Arraysets do not contain equal number of samples'):
        make_torch_dataset([read_co.arraysets['writtenaset'],
                            read_co.arraysets['second_aset']])
    read_co.close()
def test_with_keys(self, repo_with_20_samples):
    """Only samples whose keys were requested appear in the loader's batches."""
    repo = repo_with_20_samples
    co = repo.checkout()
    aset = co.arraysets['writtenaset']

    # with keys
    wanted = ['2', '4', '5', '6', '7', '9', '15', '18', '19']
    # tensors for keys deliberately left out of ``wanted``
    excluded = [aset[k] for k in ('0', '1', '3', '8')]

    torch_dset = make_torch_dataset(aset, keys=wanted)
    loader = DataLoader(torch_dset, batch_size=3)
    n_batches = 0
    for batch in loader:
        assert batch[0].size(0) == 3
        n_batches += 1
        for sample in batch:
            for bad in excluded:
                assert not np.allclose(sample, bad)
    assert n_batches == 3  # 9 keys / batch_size 3
    co.close()
def test_field_names(self, repo_with_20_samples):
    """Batches from a field-named dataset are namedtuples with the given fields.

    NOTE(review): a test with this same name appears earlier in the file —
    if both live in the same class, this definition shadows the earlier
    one; confirm and rename if so.
    """
    repo = repo_with_20_samples
    co = repo.checkout()
    aset_a = co.arraysets['writtenaset']
    aset_b = co.arraysets['second_aset']

    # number of dsets and field_names are different
    with pytest.raises(ValueError):
        make_torch_dataset([aset_a, aset_b], field_names=('input',))
    # field_names's type is wrong
    with pytest.raises(TypeError):
        make_torch_dataset([aset_a, aset_b],
                           field_names={'input': '', 'target': ''})

    torch_dset = make_torch_dataset([aset_a, aset_b],
                                    field_names=('input', 'target'))
    assert len(torch_dset) == 20
    for sample in DataLoader(torch_dset, batch_size=5):
        assert type(sample).__name__ == 'BatchTuple_input_target'
        assert sample._fields == ('input', 'target')
    co.close()
def test_lots_of_data_with_multiple_backend(self, repo_with_10000_samples):
    """A 10000-sample repo streams through the loader in full 1000-sample batches.

    NOTE(review): a test with this same name appears later in the file —
    if both live in the same class, the later definition shadows this
    one; confirm and rename if so.
    """
    repo = repo_with_10000_samples
    co = repo.checkout()
    torch_dset = make_torch_dataset([co.arraysets['aset']])
    for data in DataLoader(torch_dset, batch_size=1000, drop_last=True):
        assert data.aset.shape == (1000, 5, 7)
    co.close()
def test_lots_of_data_with_multiple_backend_multiple_worker_dataloader(self, repo_with_10000_samples):
    """Multi-worker loading over 10000 samples yields correctly named full batches."""
    repo = repo_with_10000_samples
    co = repo.checkout()
    torch_dset = make_torch_dataset([co.arraysets['aset']])
    loader = DataLoader(torch_dset, batch_size=1000, drop_last=True,
                        num_workers=2)
    for data in loader:
        assert type(data).__name__ == 'BatchTuple_aset'
        assert data.aset.shape == (1000, 5, 7)
    co.close()
def test_lots_of_data_with_multiple_backend(self, repo_300_filled_samples):
    """300-sample repo (columns API) streams through the loader in 10-sample batches.

    NOTE(review): a test with this same name appears earlier in the file —
    if both live in the same class, this definition shadows the earlier
    one; confirm and rename if so.
    """
    repo = repo_300_filled_samples
    co = repo.checkout()
    torch_dset = make_torch_dataset([co.columns['aset']])
    for data in DataLoader(torch_dset, batch_size=10, drop_last=True):
        assert type(data).__name__ == 'BatchTuple_aset'
        assert data.aset.shape == (10, 5, 7)
    co.close()
def test_local_without_data_fails_data_unavailable(self, written_two_cmt_server_repo, managed_tmpdir):
    """Requesting keys whose data was never fetched raises FileNotFoundError."""
    clone_dir = pjoin(managed_tmpdir, 'new')
    mkdir(clone_dir)
    server, _ = written_two_cmt_server_repo

    repo = Repository(path=clone_dir, exists=False)
    repo.clone('name', '[email protected]', server, remove_old=True)
    co = repo.checkout()
    aset = co.arraysets['writtenaset']
    with pytest.raises(FileNotFoundError):
        make_torch_dataset(aset, keys=['1', '2'])
    co.close()
    repo._env._close_environments()
def test_local_without_data_fails_no_common_no_local(
        self, written_two_cmt_server_repo, managed_tmpdir):
    """With no local data at all, building a dataset raises ValueError."""
    clone_dir = pjoin(managed_tmpdir, 'new')
    mkdir(clone_dir)
    server, _ = written_two_cmt_server_repo

    repo = Repository(path=clone_dir, exists=False)
    repo.clone('name', '[email protected]', server, remove_old=True)
    co = repo.checkout()
    with pytest.raises(ValueError):
        make_torch_dataset(co.columns['writtenaset'])
    co.close()
    repo._env._close_environments()
def test_two_aset_loader_two_worker_dataloader(self, repo_with_20_samples):
    """Two arraysets with two dataloader workers yield paired, named batches.

    The assertions rely on ``second_aset`` samples being the element-wise
    negation of their ``writtenaset`` counterparts (checked via
    ``np.allclose`` below).
    """
    repo = repo_with_20_samples
    co = repo.checkout()
    first_aset = co.arraysets['writtenaset']
    second_aset = co.arraysets['second_aset']
    torch_dset = make_torch_dataset([first_aset, second_aset])
    loader = DataLoader(torch_dset, batch_size=2, drop_last=True,
                        num_workers=2)
    count = 0
    for asets_batch in loader:
        assert type(asets_batch).__name__ == 'BatchTuple_writtenaset_second_aset'
        assert isinstance(asets_batch, tuple)
        assert len(asets_batch) == 2
        assert asets_batch._fields == ('writtenaset', 'second_aset')
        assert asets_batch.writtenaset.shape == (2, 5, 7)
        assert asets_batch.second_aset.shape == (2, 5, 7)
        assert np.allclose(asets_batch.writtenaset, -asets_batch.second_aset)
        count += 1
    assert count == 10  # 20 samples / batch_size 2
    # BUGFIX: the checkout was never closed, leaking the read handle;
    # every sibling test closes its checkout before returning.
    co.close()
def test_with_index_range(self, repo_with_20_samples):
    """``index_range`` restricts the dataset to samples inside the slice."""
    repo = repo_with_20_samples
    co = repo.checkout()
    aset = co.arraysets['writtenaset']

    # with keys — tensors at indices excluded by the range below
    skipped0 = aset['0']
    skipped1 = aset['1']

    # with index range
    torch_dset = make_torch_dataset(aset, index_range=slice(2, 20))
    loader = DataLoader(torch_dset, batch_size=3)
    n_batches = 0
    for batch in loader:
        assert batch[0].size(0) == 3
        n_batches += 1
        for sample in batch:
            assert not np.allclose(sample, skipped0)
            assert not np.allclose(sample, skipped1)
    assert n_batches == 6  # 18 samples / batch_size 3
    co.close()
# --- CLI arguments ----------------------------------------------------------
parser = ArgumentParser()
parser.add_argument('--gpus', type=int, default=None)
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--max_epochs', type=int, default=1)
parser.add_argument('--max_elems', type=int, default=60000)
parser.add_argument('--hangar', action='store_true')
args = parser.parse_args()

# Open the hangar repository stored next to this script.
repo = Repository(path=Path(__file__).parent / "hangar")
co = repo.checkout()

# Pick the data source: hangar-backed dataset or the torchvision MNIST copy.
if args.hangar:
    dataset = make_torch_dataset(
        [co.columns['digits'], co.columns['label']],
        index_range=slice(0, args.max_elems))
else:
    dataset = MNIST(os.getcwd(), download=True,
                    transform=transforms.ToTensor())

print(len(dataset))
datapoint, label = dataset[0]
print(type(datapoint), type(label))

print("making a loader!")
train_loader = DataLoader(dataset,
                          batch_size=args.batch_size,
                          num_workers=16,
                          shuffle=False)
# init model