def run_experiment(self, raise_error=False, expected_msg=None):
    dummy_exp = DummyExperiment()
    dummy_exp = self.messenger_cls(dummy_exp, **self.messenger_kwargs)
    dset_train = DummyDataset(10)
    dset_test = DummyDataset(10)
    dmgr_train = DataManager(dset_train, 2, 1, None)
    dmgr_test = DataManager(dset_test, 2, 1, None)

    with self.assertLogs(logger, level='INFO') as cm:
        if raise_error:
            with self.assertRaises(RuntimeError):
                dummy_exp.run(dmgr_train, dmgr_test, raise_error=True,
                              **self.run_kwargs)
        else:
            dummy_exp.run(dmgr_train, dmgr_test, raise_error=False,
                          **self.run_kwargs)

        # emit a dummy message so assertLogs always captures something
        if not expected_msg:
            logger.info("NoExpectedMessage")

    if not expected_msg:
        self.assertEqual(cm.output,
                         ["INFO:UnitTestMessenger:NoExpectedMessage"])
    else:
        self.assertEqual(cm.output, expected_msg)
def test_datamanager(self):
    batch_size = 16
    np.random.seed(1)
    dset = DummyDataset(600, [0.5, 0.3, 0.2])
    manager = DataManager(dset, batch_size, n_process_augmentation=0,
                          transforms=None)

    self.assertIsInstance(manager.get_batchgen(), Augmenter)

    # create the first batch manually for comparison
    data, labels = [], []
    for i in range(batch_size):
        data.append(dset[i]["data"])
        labels.append(dset[i]["label"])

    batch_dict = {"data": np.asarray(data), "label": np.asarray(labels)}

    augmenter = manager.get_batchgen()
    augmenter_iter = iter(augmenter)

    # the first generated batch must match the manually created one
    for key, val in next(augmenter_iter).items():
        self.assertTrue((val == batch_dict[key]).all())

    # subsequent batches must have the configured batchsize
    for key, val in next(augmenter_iter).items():
        self.assertEqual(len(val), batch_size)
def run_experiment(experiment_cls, config, network_cls, len_train, len_test,
                   **kwargs):
    assert issubclass(experiment_cls, BaseExperiment)
    exp = experiment_cls(config, network_cls, **kwargs)

    dset_train = DummyDataset(len_train)
    dset_test = DummyDataset(len_test)

    dmgr_train = DataManager(dset_train, 16, 4, None)
    dmgr_test = DataManager(dset_test, 16, 1, None)

    return exp.run(dmgr_train, dmgr_test)
def kfold_experiment(self, raise_error=False, expected_msg=None):
    kfold_kwargs = copy.deepcopy(self.run_kwargs)
    kfold_kwargs.pop("fold")

    dummy_exp = DummyExperiment()
    dummy_exp = self.messenger_cls(dummy_exp, **self.messenger_kwargs)
    dset = DummyDataset(10)
    dmgr = DataManager(dset, 2, 1, None)

    with self.assertLogs(logger, level='INFO') as cm:
        if raise_error:
            with self.assertRaises(RuntimeError):
                dummy_exp.kfold(data=dmgr, metrics={}, num_splits=2,
                                raise_error=True, **kfold_kwargs)
        else:
            dummy_exp.kfold(data=dmgr, metrics={}, num_splits=2,
                            raise_error=False, **kfold_kwargs)

        # emit a dummy message so assertLogs always captures something
        if expected_msg is None:
            logger.info("NoExpectedMessage")

    if expected_msg is None:
        self.assertEqual(cm.output,
                         ["INFO:UnitTestMessenger:NoExpectedMessage"])
    else:
        self.assertEqual(cm.output, expected_msg)
def kfold_experiment(experiment_cls, config, network_cls, len_data,
                     shuffle=True, split_type="random", num_splits=2,
                     val_split=None, **kwargs):
    assert issubclass(experiment_cls, BaseExperiment)
    metric_keys = kwargs.pop("metric_keys", None)
    exp = experiment_cls(config, network_cls, **kwargs)

    dset = DummyDataset(len_data)
    dmgr = DataManager(dset, 16, 1, None)

    return exp.kfold(data=dmgr, metrics=config.nested_get("metrics"),
                     shuffle=shuffle, split_type=split_type,
                     num_splits=num_splits, val_split=val_split,
                     metric_keys=metric_keys)
def test_experiment(experiment_cls, config, network_cls, len_test, **kwargs):
    assert issubclass(experiment_cls, BaseExperiment)
    exp = experiment_cls(config, network_cls, **kwargs)

    dset_test = DummyDataset(len_test)
    dmgr_test = DataManager(dset_test, 16, 1, None)

    model = network_cls()

    return exp.test(model, dmgr_test, config.nested_get("metrics", {}),
                    kwargs.get("metric_keys", None))
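# Minimal, hypothetical usage sketch for the three helpers above.
# ``SomeExperiment``, ``some_config`` and ``SomeNetwork`` are placeholders
# (assumptions, not defined here); ``some_config`` is expected to provide
# the metrics via ``nested_get("metrics")``.
#
#     run_experiment(SomeExperiment, some_config, SomeNetwork,
#                    len_train=50, len_test=50)
#     kfold_experiment(SomeExperiment, some_config, SomeNetwork, len_data=50,
#                      num_splits=2, val_split=0.1)
#     test_experiment(SomeExperiment, some_config, SomeNetwork, len_test=50)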
model = torch.jit.load(checkpoint_path)
model.eval()

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)

print("Load Data")
dset = WholeLegDataset(root_path=data_path, include_flipped=True,
                       img_size=img_size, contourwidth=25)  # , bone_label=1)

dmgr = DataManager(dset, 1, 4, transforms, sampler_cls=SequentialSampler)

print("Start Predictions")
predictor = Predictor(
    model, key_mapping={"x": "data"},
    convert_batch_to_npy_fn=convert_torch_to_numpy,
    prepare_batch_fn=partial(model.prepare_batch, input_device=device,
                             output_device=device),
)

os.makedirs(os.path.join(save_path, "per_class_all_masks"), exist_ok=True)
# os.makedirs(os.path.join(save_path, "per_mask_all_classes"), exist_ok=True)
                      include_flipped=True, img_size=config.img_size,
                      bone_label=bone_label)

# idx_train, idx_val = train_test_split(
#     list(range(len(dset))), test_size=config.val_split,
#     random_state=config.seed)
# dset_train = dset.get_subset(idx_train)
# dset_val = dset.get_subset(idx_val)

dset_train = dset
dset_val = dset_test

mgr_train = DataManager(dset_train, config.batchsize, 4, train_transforms,
                        sampler_cls=RandomSampler)
mgr_val = DataManager(dset_val, config.batchsize, 4, test_transforms,
                      sampler_cls=SequentialSampler)
mgr_test = DataManager(dset_test, config.batchsize, 4, test_transforms,
                       sampler_cls=SequentialSampler)

experiment = PyTorchExperiment(
    config,
def predict_data_mgr(self, datamgr: DataManager, batchsize=None, metrics=None,
                     metric_keys=None, verbose=False, **kwargs):
    """
    Defines a routine to predict data obtained from a batchgenerator
    without explicitly caching anything

    Parameters
    ----------
    datamgr : :class:`DataManager`
        Manager producing a generator holding the batches
    batchsize : int
        Artificial batchsize (sampling will be done with batchsize 1
        and sampled data will be stacked to match the artificial
        batchsize) (default: None)
    metrics : dict
        the metrics to calculate
    metric_keys : dict
        the ``batch_dict`` items to use for metric calculation
    verbose : bool
        whether to show a progress-bar or not, default: False
    kwargs :
        keyword arguments passed to :func:`prepare_batch_fn`

    Yields
    ------
    dict
        a dictionary containing all predictions of the current batch
    dict
        a dictionary containing all metrics of the current batch

    """
    if metrics is None:
        metrics = {}

    orig_num_aug_processes = datamgr.n_process_augmentation
    orig_batch_size = datamgr.batch_size

    if batchsize is None:
        batchsize = orig_batch_size

    datamgr.batch_size = 1
    batchgen = datamgr.get_batchgen()

    n_batches = datamgr.n_batches

    if verbose:
        iterable = tqdm(enumerate(batchgen), unit=' sample',
                        total=n_batches, desc=self._tqdm_desc)
    else:
        iterable = enumerate(batchgen)

    batch_list = []

    for i, batch in iterable:
        Predictor._at_iter_begin(self, iter_num=i)

        # shrink the artificial batchsize for the remainder so the last
        # samples are not cut off
        if not batch_list and (n_batches - i) < batchsize:
            batchsize = n_batches - i
            logger.debug("Set Batchsize down to %d to avoid cutting "
                         "off the last batches" % batchsize)

        batch_list.append(batch)

        # if queue is full, process queue:
        if batchsize is None or len(batch_list) >= batchsize:
            # stack the collected single-sample batches into one batch dict
            batch_dict = {}
            for _batch in batch_list:
                for key, val in _batch.items():
                    if key in batch_dict.keys():
                        batch_dict[key].append(val)
                    else:
                        batch_dict[key] = [val]

            for key, val_list in batch_dict.items():
                batch_dict[key] = np.concatenate(val_list)

            batch_dict = self._prepare_batch(batch_dict)
            preds = self.predict(batch_dict, already_prepared=True, **kwargs)

            # convert batch_dict back to numpy (self.predict may convert it
            # to a backend-specific tensor type) - no-op if already numpy
            batch_dict = self._convert_to_npy_fn(**batch_dict)[1]

            preds_batch = LookupConfig()
            # explicitly free memory of old lookup config
            gc.collect()
            preds_batch.update(batch_dict)
            preds_batch.update(preds)

            # calculate metrics for predicted batch
            _metric_vals = self.calc_metrics(preds_batch, metrics=metrics,
                                             metric_keys=metric_keys)

            self._at_iter_end(data_dict={**batch_dict, **preds_batch},
                              metrics={"val_" + k: v
                                       for k, v in _metric_vals.items()},
                              iter_num=i)

            yield preds, _metric_vals

            batch_list = []

    # restore the original manager settings
    datamgr.batch_size = orig_batch_size
    datamgr.n_process_augmentation = orig_num_aug_processes

    return
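# Minimal usage sketch for the generator above (not part of the class;
# the predictor instance, data manager and metric function are assumptions):
#
#     predictor = Predictor(model, key_mapping={"x": "data"},
#                           convert_batch_to_npy_fn=convert_torch_to_numpy)
#     dmgr = DataManager(dset, 16, 1, None)
#     for preds, metric_vals in predictor.predict_data_mgr(
#             dmgr, batchsize=8, metrics={"mse": mse_fn}, verbose=True):
#         ...  # consume predictions batch-wise; nothing is cached internally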
def kfold(self, data: DataManager, metrics: dict, num_epochs=None,
          num_splits=None, shuffle=False, random_seed=None,
          split_type="random", val_split=0.2, label_key="label",
          train_kwargs: dict = None, metric_keys: dict = None,
          test_kwargs: dict = None, config=None, verbose=False, **kwargs):
    """
    Performs a k-Fold cross-validation

    Parameters
    ----------
    data : :class:`DataManager`
        the data to use for training(, validation) and testing. Will be
        split based on ``split_type`` and ``val_split``
    metrics : dict
        dictionary containing the metrics to evaluate during k-fold
    num_epochs : int or None
        number of epochs to train (if not given, will either be extracted
        from ``config``, ``self.config`` or ``self.n_epochs``)
    num_splits : int or None
        the number of splits to extract from ``data``.
        If None: uses a default of 10
    shuffle : bool
        whether to shuffle the data before splitting or not (implemented
        by index-shuffling rather than actual data-shuffling to retain
        the potentially lazy behavior of datasets)
    random_seed : None
        seed to seed numpy, the splitting functions and the used
        backend-framework
    split_type : str
        must be one of ['random', 'stratified']
        if 'random': uses random data splitting
        if 'stratified': uses stratified data splitting. Stratification
        will be based on ``label_key``
    val_split : float or None
        the fraction of the train data to use as validation set.
        If None: no validation will be done during training; only testing
        for each fold after the training is complete
    label_key : str
        the label to use for stratification. Will be ignored unless
        ``split_type`` is 'stratified'. Default: 'label'
    train_kwargs : dict or None
        kwargs to update the behavior of the :class:`DataManager`
        containing the train data. If None: an empty dict will be passed
    metric_keys : dict of tuples
        the batch_dict keys to use for each metric to calculate.
        Should contain a value for each key in ``metrics``.
        If no values are given for a key, ``pred`` and ``label`` will be
        used for metric calculation by default
    test_kwargs : dict or None
        kwargs to update the behavior of the :class:`DataManager`
        containing the test and validation data.
        If None: an empty dict will be passed
    config : :class:`DeliraConfig` or None
        the training and model parameters (will be merged with
        ``self.config``)
    verbose : bool
        verbosity
    **kwargs :
        additional keyword arguments

    Returns
    -------
    dict
        all predictions from all folds
    dict
        all metric values from all folds

    Raises
    ------
    ValueError
        if ``split_type`` is neither 'random' nor 'stratified'

    See Also
    --------
    * :class:`sklearn.model_selection.KFold` and
      :class:`sklearn.model_selection.ShuffleSplit` for random
      data-splitting
    * :class:`sklearn.model_selection.StratifiedKFold` and
      :class:`sklearn.model_selection.StratifiedShuffleSplit` for
      stratified data-splitting
    * :meth:`DataManager.update_state_from_dict` for updating the
      data managers by kwargs
    * :meth:`BaseExperiment.run` for the training
    * :meth:`BaseExperiment.test` for the testing

    Notes
    -----
    Using stratified splits may be slow during split-calculation, since
    each item must be loaded once to obtain the labels necessary for
    stratification.
""" # set number of splits if not specified if num_splits is None: num_splits = 10 logger.warning("num_splits not defined, using default value of \ 10 splits instead ") metrics_test, outputs = {}, {} split_idxs = list(range(len(data.dataset))) if train_kwargs is None: train_kwargs = {} if test_kwargs is None: test_kwargs = {} # switch between differnt kfold types if split_type == "random": split_cls = KFold val_split_cls = ShuffleSplit # split_labels are ignored for random splitting, set them to # split_idxs just ensures same length split_labels = split_idxs elif split_type == "stratified": split_cls = StratifiedKFold val_split_cls = StratifiedShuffleSplit # iterate over dataset to get labels for stratified splitting split_labels = [ data.dataset[_idx][label_key] for _idx in split_idxs ] else: raise ValueError("split_type must be one of " "['random', 'stratified'], but got: %s" % str(split_type)) fold = split_cls(n_splits=num_splits, shuffle=shuffle, random_state=random_seed) if random_seed is not None: np.random.seed(random_seed) # iterate over folds for idx, (train_idxs, test_idxs) in enumerate(fold.split(split_idxs, split_labels)): # extract data from single manager train_data = data.get_subset(train_idxs) test_data = data.get_subset(test_idxs) train_data.update_state_from_dict(copy.deepcopy(train_kwargs)) test_data.update_state_from_dict(copy.deepcopy(test_kwargs)) val_data = None if val_split is not None: if split_type == "random": # split_labels are ignored for random splitting, set them # to split_idxs just ensures same length train_labels = train_idxs elif split_type == "stratified": # iterate over dataset to get labels for stratified # splitting train_labels = [ train_data.dataset[_idx][label_key] for _idx in train_idxs ] else: raise ValueError("split_type must be one of " "['random', 'stratified'], but got: %s" % str(split_type)) _val_split = val_split_cls(n_splits=1, test_size=val_split, random_state=random_seed) for _train_idxs, _val_idxs in _val_split.split( train_idxs, train_labels): val_data = train_data.get_subset(_val_idxs) val_data.update_state_from_dict(copy.deepcopy(test_kwargs)) train_data = train_data.get_subset(_train_idxs) model = self.run(train_data=train_data, val_data=val_data, config=config, num_epochs=num_epochs, fold=idx, **kwargs) _outputs, _metrics_test = self.test(model, test_data, metrics=metrics, metric_keys=metric_keys, verbose=verbose) outputs[str(idx)] = _outputs metrics_test[str(idx)] = _metrics_test return outputs, metrics_test