def test_time(tmpdir):
    global DATA_PATH
    cl_dataset = CIFAR100(data_path=DATA_PATH, download=False, train=True,
                          labels_type="category", task_labels="lifelong")
    # in practice the construction is done part by part to reduce the data load,
    # but here we do it at once
    x, y, t = cl_dataset.get_data()

    h5_filename = os.path.join(tmpdir, "test_time_h5.hdf5")
    h5dataset = H5Dataset(x, y, t, data_path=h5_filename)
    task_set = H5TaskSet(h5_filename,
                         y=h5dataset.get_class_vector(),
                         t=h5dataset.get_task_indexes(),
                         trsf=None)

    start = time.time()
    for i in range(10000):
        a = task_set[5]
    end = time.time()
    print(f"normal __getitem__ {end - start}")

    start = time.time()
    with h5py.File(h5_filename, 'r') as hf:
        for i in range(10000):
            x = hf['x'][5]
            y = hf['y'][5]
            if 't' in hf.keys():
                t = hf['t'][5]
            else:
                t = -1
    end = time.time()
    print(f"open only once __getitem__ {end - start}")
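# Illustrative sketch (not part of the tests): the "part by part" construction
# mentioned above would build the H5Dataset from a first chunk and append the
# remaining chunks with `add_data`, so the full array never sits in memory at
# once. The chunking below is a hypothetical example.
#
#   chunk_ids = np.array_split(np.arange(len(y)), 10)
#   h5dataset = H5Dataset(x[chunk_ids[0]], y[chunk_ids[0]], t[chunk_ids[0]],
#                         data_path=h5_filename)
#   for ids in chunk_ids[1:]:
#       h5dataset.add_data(x[ids], y[ids], t[ids])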
def test_slice_h5(
        tmpdir, dataset, keep_classes, discard_classes, keep_tasks, discard_tasks, error, ids
):
    dataset = H5Dataset(*dataset, data_path=os.path.join(tmpdir, "test.h5"))

    if error:
        with pytest.raises(Exception):
            sliced_dataset = dataset.slice(
                os.path.join(tmpdir, "test_bis.h5"),
                keep_classes, discard_classes,
                keep_tasks, discard_tasks
            )
        return
    else:
        sliced_dataset = dataset.slice(
            os.path.join(tmpdir, "test_bis.h5"),
            keep_classes, discard_classes,
            keep_tasks, discard_tasks
        )

    h5_path, _, _ = sliced_dataset.get_data()
    assert h5_path == os.path.join(tmpdir, "test_bis.h5")

    with h5py.File(h5_path, 'r') as hf:
        x = hf['x'][:]

    assert (np.unique(x) == np.array(ids)).all(), (np.unique(x), ids)
def test_concatenate_h5dataset(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    h5dataset.add_data(x_, y_, t_)

    assert len(h5dataset.get_class_vector()) == 2 * len(y_)
def test_h5dataset_to_taskset(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    task_set = h5dataset.to_taskset()

    loader = DataLoader(task_set)
    for _ in loader:
        pass
def test_h5dataset_add_data(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    h5dataset.add_data(x_, y_, t_)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)

    assert scenario.nb_tasks == nb_task
def test_creation_h5dataset(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    x_0, y_0, t_0 = h5dataset.get_data()

    assert isinstance(x_0, str)  # x is only the path to the file
    assert len(y_0) == len(y_)
    assert len(t_0) == len(t_)
def test_h5dataset_reloading(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    # create dataset
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    # destroy object
    del h5dataset

    # reload data set
    h5dataset_reloaded = H5Dataset(x=None, y=None, t=None, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset_reloaded)

    for task_set in scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert scenario.nb_tasks == nb_task
def test_h5dataset_reloading_slow(tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")
    nb_tasks = 5

    cl_dataset = CIFAR100(data_path=DATA_PATH, download=False, train=True,
                          labels_type="category", task_labels="lifelong")
    x, y, t = cl_dataset.get_data()

    # create dataset
    h5dataset = H5Dataset(x, y, t, data_path=filename_h5)
    # destroy object
    del h5dataset

    # reload data set
    h5dataset_reloaded = H5Dataset(x=None, y=None, t=None, data_path=filename_h5)
    scenario = ContinualScenario(h5dataset_reloaded)

    for task_set in scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert scenario.nb_tasks == nb_tasks

    task_order = np.arange(nb_tasks)
    sub_scenario = create_subscenario(scenario, task_order[:-1])
    assert sub_scenario.nb_tasks == nb_tasks - 1

    np.random.shuffle(task_order)
    sub_scenario = create_subscenario(scenario, task_order)
    assert sub_scenario.nb_tasks == nb_tasks
def test_h5dataset_ContinualScenario(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)

    assert scenario.nb_tasks == nb_task

    data_indexes = np.where(t_ == 0)[0]
    assert len(data_indexes) == len(scenario[0])
def test_h5dataset_loading(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)

    for task_set in scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert scenario.nb_tasks == nb_task
def test_h5dataset_get_raw(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)

    for task_set in scenario:
        indexes = np.random.randint(len(task_set), size=len(task_set) // 2)
        # ndarray.sort() sorts in place and returns None, so sort first and
        # pass the sorted array explicitly
        indexes.sort()
        _, _, _ = task_set.get_raw_samples(indexes)

        # test with no indexes
        _, _, _ = task_set.get_raw_samples()

    assert scenario.nb_tasks == nb_task
def test_create_subscenario_h5dataset(data, tmpdir):
    from continuum.scenarios import create_subscenario

    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)
    sub_scenario = create_subscenario(scenario, np.arange(nb_task - 1))

    for task_set in sub_scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert sub_scenario.nb_tasks == nb_task - 1
def test_h5dataset_IncrementalScenario(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    nb_task = 2

    h5dataset = H5Dataset(x_, y_, None, data_path=filename_h5)

    scenario = ClassIncremental(h5dataset, nb_tasks=nb_task)
    assert scenario.nb_tasks == nb_task

    tot_len = 0
    for task_set in scenario:
        tot_len += len(task_set)
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert tot_len == len(y_)
def test_h5dataset_split_train_test(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)

    for task_set in scenario:
        task_set_tr, task_set_val = split_train_val(task_set)

        loader_tr = DataLoader(task_set_tr)
        for _ in loader_tr:
            pass

        loader_val = DataLoader(task_set_val)
        for _ in loader_val:
            pass

    assert scenario.nb_tasks == nb_task
def encode_into_dataset(model, scenario, batch_size, filename, inference_fct=None):
    """Encode a scenario into an h5 dataset so that the same scenario can be
    reproduced later with pre-computed features.

    :param model: model used to encode the data.
    :param scenario: scenario to encode.
    :param batch_size: batch size used to load the data.
    :param filename: filename of the h5 dataset.
    :param inference_fct: optional function allowing a more sophisticated way to extract features.
    """
    training_mode = model.training

    if inference_fct is None:
        inference_fct = (lambda model, x: model.to(torch.device('cuda:0'))(x.to(torch.device('cuda:0'))))

    # we extract features in eval mode
    model.eval()

    encoded_dataset = None
    for task_id, taskset in enumerate(scenario):
        # we need to load the data through a loader to apply the transformations, if any
        loader = DataLoader(taskset, shuffle=False, batch_size=batch_size)
        for i, (x, y, t) in enumerate(loader):
            features = inference_fct(model, x)

            if t is None:
                t = (torch.ones(len(y)) * task_id).long()

            if task_id == 0 and i == 0:
                encoded_dataset = H5Dataset(features.cpu().numpy(), y, t, data_path=filename)
            else:
                encoded_dataset.add_data(features.cpu().numpy(), y, t)

    model.train(training_mode)
    return encoded_dataset
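# Minimal usage sketch for `encode_into_dataset` (illustrative only; the feature
# extractor and file name below are hypothetical, and `inference_fct` is overridden
# so the sketch does not assume a GPU):
#
#   model = torchvision.models.resnet18(pretrained=True)
#   encoded = encode_into_dataset(model, scenario, batch_size=64,
#                                 filename="encoded_features.hdf5",
#                                 inference_fct=lambda m, x: m(x))
#   encoded_scenario = ContinualScenario(encoded)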
def test_on_array_dataset(tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_CIFAR100_h5.hdf5")

    cl_dataset = CIFAR100(data_path=DATA_PATH, download=False, train=True,
                          labels_type="category", task_labels="lifelong")
    # in practice the construction is done part by part to reduce the data load,
    # but here we do it at once
    x, y, t = cl_dataset.get_data()
    h5dataset = H5Dataset(x, y, t, data_path=filename_h5)

    scenario = ContinualScenario(h5dataset)

    for task_set in scenario:
        loader = DataLoader(task_set, batch_size=64)
        for x, y, t in loader:
            assert x.shape == torch.Size([64, 3, 32, 32])
            break

    assert scenario.nb_tasks == 5  # number of tasks in the CIFAR100 lifelong setting
def test_on_array_dataset_incremental(tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_CIFAR100_h5.hdf5")
    nb_tasks = 10

    cl_dataset = CIFAR100(data_path=DATA_PATH, download=False, train=True)
    # in practice the construction is done part by part to reduce the data load,
    # but here we do it at once
    x, y, t = cl_dataset.get_data()
    h5dataset = H5Dataset(x, y, t, data_path=filename_h5)

    scenario = ClassIncremental(h5dataset, nb_tasks=nb_tasks)

    for task_set in scenario:
        loader = DataLoader(task_set, batch_size=64)
        for x, y, t in loader:
            assert x.shape == torch.Size([64, 3, 32, 32])
            break

    assert scenario.nb_tasks == nb_tasks
def test_create_subscenario_shuffle_h5dataset(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)

    task_order = np.arange(nb_task)
    np.random.shuffle(task_order)
    sub_scenario = create_subscenario(scenario, task_order)

    for task_set in sub_scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert sub_scenario.nb_tasks == nb_task