def split_train_val(dataset: TaskSet, val_split: float = 0.1) -> Tuple[TaskSet, TaskSet]:
    """Split a train dataset into two datasets, one for training and one for validation.

    :param dataset: A torch dataset, with .x and .y attributes.
    :param val_split: Fraction of samples to allocate for validation, in [0, 1[.
    :return: A tuple of datasets, respectively for train and validation.
    """
    random_state = np.random.RandomState(seed=1)

    indexes = np.arange(len(dataset))
    random_state.shuffle(indexes)

    train_indexes = indexes[int(val_split * len(indexes)):]
    val_indexes = indexes[:int(val_split * len(indexes))]

    x_train, y_train, t_train = dataset.get_raw_samples(train_indexes)
    train_dataset = TaskSet(x_train, y_train, t_train,
                            trsf=dataset.trsf,
                            data_type=dataset.data_type)

    x_val, y_val, t_val = dataset.get_raw_samples(val_indexes)
    val_dataset = TaskSet(x_val, y_val, t_val,
                          trsf=dataset.trsf,
                          data_type=dataset.data_type)

    return train_dataset, val_dataset
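# Example usage (a minimal sketch; assumes TaskSet and split_train_val are
# importable from continuum.tasks as in this repo). The split is deterministic
# since the RandomState above is seeded with 1:
#
#     import numpy as np
#     from continuum.tasks import TaskSet, split_train_val
#
#     x = np.random.rand(100, 2, 2, 3)
#     y = np.ones((100,))
#     t = np.ones((100,))
#     taskset = TaskSet(x, y, t, None)
#
#     train_set, val_set = split_train_val(taskset, val_split=0.1)
#     assert len(val_set) == 10 and len(train_set) == 90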
def test_get_random_samples(nb_samples):
    x = np.ones((10, 2, 2, 3))
    y = np.ones((10,))
    t = np.ones((10,))

    base_set = TaskSet(x, y, t, None)
    base_set.get_random_samples(nb_samples=nb_samples)
def test_get_raw_samples(nb_samples):
    x = np.ones((10, 2, 2, 3))
    y = np.ones((10,))
    t = np.ones((10,))

    base_set = TaskSet(x, y, t, None)
    data, y_, t_ = base_set.get_raw_samples(indexes=range(nb_samples))

    assert (x[:nb_samples] == data).all()
    assert (y[:nb_samples] == y_).all()
    assert (t[:nb_samples] == t_).all()
def __getitem__(self, task_index: Union[int, slice]):
    """Returns a task by its unique index.

    :param task_index: The unique index of a task. As for lists, you can use
                       indexing between [0, len], negative indexing, or even slices.
    :return: A train PyTorch Dataset.
    """
    if isinstance(task_index, slice):
        raise NotImplementedError(
            f"You cannot select multiple tasks ({task_index}) on the "
            "OnlineFellowship scenario yet"
        )

    self.cl_dataset = self.cl_datasets[task_index]
    if isinstance(self.cl_dataset, _ContinuumDataset):
        x, y, _ = self.cl_dataset.get_data()
        t = np.ones(len(y)) * task_index

        taskset = TaskSet(x, y, t,
                          trsf=self._get_trsf(task_index, self.transformations),
                          target_trsf=self._get_label_trsf(task_index),
                          data_type=self.cl_dataset.data_type,
                          bounding_boxes=self.cl_dataset.bounding_boxes)
    else:
        if not isinstance(self.cl_dataset, BaseTaskSet):
            raise ValueError(
                "self.cl_datasets can only contain _ContinuumDataset or TaskSet"
            )
        taskset = self.cl_dataset

    return taskset
def concat(task_sets: List[TaskSet]) -> TaskSet:
    """Concatenate a dataset A with one or many *other* datasets.

    The transformations will be those of the first dataset.

    :param task_sets: A list of task sets.
    :return: A concatenated task set.
    """
    x, y, t = [], [], []

    data_type = task_sets[0].data_type

    for task_set in task_sets:
        if task_set.data_type != data_type:
            raise ValueError(
                f"Invalid data type {task_set.data_type} != {data_type}"
            )

        x.append(task_set._x)
        y.append(task_set._y)
        t.append(task_set._t)

    return TaskSet(
        np.concatenate(x),
        np.concatenate(y),
        np.concatenate(t),
        trsf=task_sets[0].trsf,
        data_type=data_type
    )
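# Example usage (a sketch; assumes concat and TaskSet come from continuum.tasks,
# and the x_a/y_a/t_a arrays are hypothetical placeholders):
#
#     from continuum.tasks import TaskSet, concat
#
#     part_a = TaskSet(x_a, y_a, t_a, None)
#     part_b = TaskSet(x_b, y_b, t_b, None)
#
#     merged = concat([part_a, part_b])   # keeps part_a's transformations
#     assert len(merged) == len(part_a) + len(part_b)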
def test_concat_method(nb_others):
    x = np.random.rand(10, 2, 2, 3)
    y = np.ones((10,))
    t = np.ones((10,))

    base_set = TaskSet(x, y, t, None)
    initial_len = len(base_set)

    others = [
        TaskSet(np.copy(x), np.copy(y), np.copy(t), None)
        for _ in range(nb_others)
    ]
    base_set.concat(*others)

    assert len(base_set) == initial_len + nb_others * initial_len

    loader = DataLoader(base_set)
    for x, y, t in loader:
        pass
def test_split_train_val(val_split, nb_val):
    x = np.random.rand(10, 2, 2, 3)
    y = np.ones((10,))
    t = np.ones((10,))

    base_set = TaskSet(x, y, t, None)
    train_set, val_set = split_train_val(base_set, val_split)

    assert len(val_set) == nb_val
    assert len(train_set) + len(val_set) == len(base_set)
def __getitem__(self, task_index: Union[int, slice]):
    """Returns a task by its unique index.

    :param task_index: The unique index of a task. As for lists, you can use
                       indexing between [0, len], negative indexing, or even slices.
    :return: A train PyTorch Dataset.
    """
    x, y, t = self._select_data_by_task(task_index)
    return TaskSet(x, y, t, self.trsf, data_type=self.cl_dataset.data_type)
def taskset_subset(taskset: TaskSet, indices: np.ndarray) -> TaskSet:
    x, y, t = taskset.get_raw_samples(indices)

    # TODO: Not sure if/how to handle the `bounding_boxes` attribute here.
    bounding_boxes = taskset.bounding_boxes
    if bounding_boxes is not None:
        bounding_boxes = bounding_boxes[indices]

    return replace_taskset_attributes(
        taskset, x=x, y=y, t=t, bounding_boxes=bounding_boxes
    )
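# Example usage (a sketch; `replace_taskset_attributes` is a helper defined
# elsewhere in this codebase, and the index array below is illustrative):
#
#     import numpy as np
#
#     first_half = taskset_subset(taskset, np.arange(len(taskset) // 2))
#     assert len(first_half) == len(taskset) // 2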
def test_split_train_val_loading():
    x = np.random.rand(10, 2, 2, 3)
    y = np.ones((10,))
    t = np.ones((10,))

    base_set = TaskSet(x, y, t, None)
    train_set, val_set = split_train_val(base_set, 0.2)

    for task_set in (train_set, val_set):
        loader = DataLoader(task_set, batch_size=32)
        for x, y, t in loader:
            pass
def __getitem__(self, task_index):
    """Returns a task by its unique index.

    :param task_index: The unique index of a task, between 0 and len(loader) - 1.
                       It can also be a list, a numpy array, or even a slice.
    :return: A train PyTorch Dataset.
    """
    x, y, _ = self.dataset

    if isinstance(task_index, slice):
        # Convert a slice to a list and respect Python's advanced indexing conventions
        start = task_index.start if task_index.start is not None else 0
        stop = task_index.stop if task_index.stop is not None else len(self)
        step = task_index.step if task_index.step is not None else 1

        task_index = list(range(start, stop, step))
        if len(task_index) == 0:
            raise ValueError(
                f"Invalid slicing resulting in no data (start={start}, end={stop}, step={step})."
            )
    elif isinstance(task_index, np.ndarray):
        task_index = list(task_index)
    elif isinstance(task_index, int):
        task_index = [task_index]
    else:
        raise TypeError(f"Invalid type of task index {type(task_index).__name__}.")

    task_index = set([
        _handle_negative_indexes(ti, len(self)) for ti in task_index
    ])

    t = np.concatenate([
        (np.ones(len(x)) * ti).astype(np.int32) for ti in task_index
    ])
    x = np.concatenate([
        x for _ in range(len(task_index))
    ])

    if self.shared_label_space:
        y = np.concatenate([
            y for _ in range(len(task_index))
        ])
    else:
        # Different transformations have different labels even though
        # the original images were the same
        y = np.concatenate([
            y + ti * self.num_classes_per_task for ti in task_index
        ])

    trsf = [  # Non-used tasks have a None trsf
        self.get_task_transformation(ti) if ti in task_index else None
        for ti in range(len(self))
    ]

    return TaskSet(x, y, t, trsf, data_type=self.cl_dataset.data_type)
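# Example of the indexing semantics above (a sketch; assumes a
# transformation-based scenario, e.g. continuum's Permutations, with 5 tasks;
# each selected task repeats the same base images under its own transformation):
#
#     taskset = scenario[0]     # data seen under the first transformation
#     taskset = scenario[:3]    # tasks 0, 1 and 2 merged into a single TaskSet
#     taskset = scenario[-1]    # negative indexing selects the last task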
def test_target_trsf(nb_classes):
    x = np.random.rand(10, 2, 2, 3)
    y = np.arange(10)
    t = np.ones((10,))

    target_trsf = transforms.Lambda(lambda x: x % nb_classes)
    taskset = TaskSet(x, y, t, None, target_trsf=target_trsf)

    # Note: using `print(...)` as an assert message always evaluates to None;
    # a plain string is required for the message to show up on failure.
    assert taskset.nb_classes == nb_classes, "target transform not applied in get_classes"

    loader = DataLoader(taskset)
    for x, y, t in loader:
        pass
def test_concat_function(nb_others):
    x = np.random.rand(10, 2, 2, 3)
    y = np.ones((10,))
    t = np.ones((10,))

    task_sets = [
        TaskSet(np.copy(x), np.copy(y), np.copy(t), None)
        for _ in range(nb_others)
    ]
    concatenation = concat(task_sets)

    assert len(concatenation) == nb_others * 10

    loader = DataLoader(concatenation)
    for x, y, t in loader:
        pass
def test_sampler_function(log):
    np.random.seed(1)
    torch.manual_seed(1)

    x = np.random.rand(100, 2, 2, 3)
    y = np.ones((100,), dtype=np.int64)
    y[0] = 0
    t = np.ones((100,))

    taskset = TaskSet(x, y, t, None)
    sampler = get_balanced_sampler(taskset, log=log)

    loader = DataLoader(taskset, sampler=sampler, batch_size=1)
    nb_0 = 0
    for x, y, t in loader:
        if 0 in y:
            nb_0 += 1
    assert nb_0 > 1
def __getitem__(self, task_index: Union[int, slice]) -> TaskSet:
    """Returns a task by its unique index.

    :param task_index: The unique index of a task. As for lists, you can use
                       indexing between [0, len], negative indexing, or even slices.
    :return: A train PyTorch Dataset.
    """
    if isinstance(task_index, slice) and task_index.step is not None:
        raise ValueError("Step in slice for segmentation is not supported.")

    x, y, t, task_index = self._select_data_by_task(task_index)
    t = self._get_task_ids(t, task_index)

    return TaskSet(
        x, y, t,
        self.trsf,
        target_trsf=self._get_label_transformation(task_index),
        data_type=self.cl_dataset.data_type
    )
def __getitem__(self, task_index):
    """Returns a task by its unique index.

    :param task_index: The unique index of a task, between 0 and len(loader) - 1.
    :return: A train PyTorch Dataset.
    """
    if isinstance(task_index, slice):
        raise ValueError(
            "Incremental training based on transformations "
            "does not support slices, please provide only an integer."
        )
    elif task_index < 0:  # Support for negative indexes, e.g. -1 == last
        while task_index < 0:
            task_index += len(self)

    self.update_task_indexes(task_index)
    if not self.shared_label_space:
        self.update_labels(task_index)

    train = self._select_data_by_task(task_index)
    trsf = self.get_task_transformation(task_index)

    return TaskSet(*train, trsf, data_type=self.cl_dataset.data_type)
def __getitem__(self, task_index: Union[int, slice]):
    """Returns a task by its unique index.

    :param task_index: The unique index of a task. As for lists, you can use
                       indexing between [0, len], negative indexing, or even slices.
    :return: A train PyTorch Dataset.
    """
    if isinstance(task_index, slice) and isinstance(self.trsf, list):
        raise ValueError(
            f"You cannot select multiple tasks ({task_index}) when you have a "
            "different set of transformations per task"
        )

    x, y, t, _, data_indexes = self._select_data_by_task(task_index)

    return TaskSet(
        x, y, t,
        trsf=self.trsf[task_index] if isinstance(self.trsf, list) else self.trsf,
        data_type=self.cl_dataset.data_type,
        bounding_boxes=self.cl_dataset.bounding_boxes,
        data_indexes=data_indexes
    )
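# Example usage (a sketch; assumes a ClassIncremental scenario from continuum,
# with torch.utils.data.DataLoader; the dataset path below is illustrative):
#
#     from continuum import ClassIncremental
#     from continuum.datasets import MNIST
#     from torch.utils.data import DataLoader
#
#     scenario = ClassIncremental(MNIST("my/data/path", train=True), increment=2)
#     taskset = scenario[0]    # the first task's TaskSet
#     taskset = scenario[:3]   # tasks 0, 1 and 2 merged, when trsf is shared
#     loader = DataLoader(taskset, batch_size=32)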