def batch_bdc2rws_contour(dcm_dir, bdc_dir, rws_dir, **kwargs):
    """ Convert BDC format annotation to rws format.

    Args:
        dcm_dir (str): dicom files directory.
        bdc_dir (str): bdc annotation files directory.
        rws_dir (str): rws annotation files directory.

    N.B. dicom title should be exactly the same with annotation file title.
    e.g. 123.dcm, 123.txt
    """
    mv.mkdirs(rws_dir)

    dcm_names = mv.listdir(dcm_dir)
    bdc_names = mv.listdir(bdc_dir)
    dcm_titles = [mv.splitext(name)[0] for name in dcm_names]
    bdc_titles = [mv.splitext(name)[0] for name in bdc_names]

    # only titles present on both sides get converted
    shared_titles = list(set(dcm_titles) & set(bdc_titles))

    # warn when dicoms and annotations are not in 1:1 correspondence
    if (len(dcm_names) != len(bdc_names)
            or len(shared_titles) != len(dcm_names)):
        logging.warning('dicoms & annotations do not exactly match')

    for title in tqdm(shared_titles):
        bdc2rws_contour(mv.joinpath(dcm_dir, title + '.dcm'),
                        mv.joinpath(bdc_dir, title + '.txt'),
                        mv.joinpath(rws_dir, title + '.json'),
                        **kwargs)
def save_checkpoint(model, path, optimizer=None, metadata=None):
    """ Save checkpoint to file.

    The checkpoint will have 3 fields: ``metadata``, ``state_dict``
    and ``optimizer``.

    Args:
        model (Module): module whose params are to be saved.
        path (str): path to save the checkpoint file.
        optimizer ('Optimizer', optional): optimizer to be saved.
        metadata (dict, optional): metadata to be saved in checkpoint.
    """
    assert isinstance(metadata, (dict, type(None)))
    if metadata is None:
        metadata = {}

    mv.mkdirs(mv.parentdir(path))

    # if wrapped by nn.DataParallel, remove the wrapper
    if hasattr(model, 'module'):
        model = model.module

    # make a checkpoint; metadata can never be None at this point (it is
    # defaulted to {} above), so the original trailing
    # `if metadata is not None` guard was dead code and is removed.
    checkpoint = {
        'state_dict': _weights_to_cpu(model.state_dict()),
        'metadata': metadata,
    }
    if optimizer is not None:
        checkpoint['optimizer'] = optimizer.state_dict()

    torch.save(checkpoint, path)
def test_mkdirs():
    # creating an already-existing directory must be a silent no-op
    with not_raises(FileExistsError):
        mv.mkdirs(DATA_DIR)

    subdir = mv.joinpath(DATA_DIR, 'temporary_subdir')
    mv.mkdirs(subdir)
    assert mv.isdir(subdir)
    mv.rmtree(subdir)
def test_imread_imwrite(img):
    # round-trip an image through imwrite/imread and compare pixel data
    out_dir = mv.joinpath(DATA_DIR, 'temporary_subdir')
    out_path = mv.joinpath(out_dir, mv.basename(PNG_IMG_PATH))
    mv.mkdirs(out_dir)

    assert mv.imwrite(img, out_path)
    reloaded = mv.imread(out_path, mv.ImreadMode.UNCHANGED)
    assert_image_equal(img, reloaded)

    mv.rmtree(out_dir)
def save_cls_dsmd(dsmd_path, data, auto_mkdirs=True):
    # Write classification dataset metadata, one 'key,value' record per line.
    if auto_mkdirs:
        mv.mkdirs(mv.parentdir(dsmd_path))

    dsmd = mv.make_dsmd(data)
    with open(dsmd_path, 'w') as fd:
        for key, value in dsmd.items():
            if mv.isarrayinstance(value):  # handle multi-label case
                value = ','.join(str(entry) for entry in value)
            fd.write('%s,%s\n' % (str(key), str(value)))
def init_logging(log_dir=None, config_file=None):
    """Configure logging from a YAML dict-config, writing log files under
    `log_dir` (defaults to the current working directory)."""
    log_dir = os.getcwd() if log_dir is None else log_dir
    if config_file is None:
        config_file = mv.joinpath(
            mv.parentdir(mv.parentdir(__file__)),
            'configs/default_log_config.yaml')

    with open(config_file, 'rt') as f:
        config = yaml.safe_load(f.read())

    # redirect the file handlers into log_dir before applying the config
    handlers = config['handlers']
    handlers['info_file_handler']['filename'] = \
        mv.joinpath(log_dir, 'info.log')
    handlers['error_file_handler']['filename'] = \
        mv.joinpath(log_dir, 'error.log')

    mv.mkdirs(log_dir)
    logging.config.dictConfig(config)
def test_copyfiles():
    dst_dir = mv.joinpath(DATA_DIR, 'temporary_subdir')
    mv.mkdirs(dst_dir)
    filenames = ['brain_001.dcm', 'brain_002.dcm']

    mv.copyfiles(filenames, dst_dir, DCM_DIR)
    assert len(mv.listdir(dst_dir)) == 2

    # overwriting is tolerated when non_overwrite is False ...
    with not_raises(FileExistsError):
        mv.copyfiles(filenames, dst_dir, DCM_DIR, non_overwrite=False)
    # ... and rejected when non_overwrite is True
    with pytest.raises(FileExistsError):
        mv.copyfiles(filenames, dst_dir, DCM_DIR, non_overwrite=True)

    # emptying keeps the directory itself but removes its contents
    mv.empty_dir(dst_dir)
    assert mv.isdir(dst_dir)
    assert len(mv.listdir(dst_dir)) == 0
    mv.rmtree(dst_dir)
def batch_mask2rws(mask_dir, rws_dir, **kwargs):
    """ Convert mask format annotation to rws format.

    Args:
        mask_dir (str): mask files directory.
        rws_dir (str): rws annotation files directory.

    N.B. dicom file title should be exactly the same with mask file title.
    e.g. 123.dcm, 123.png
    """
    mv.mkdirs(rws_dir)

    titles = [mv.splitext(name)[0] for name in mv.listdir(mask_dir)]
    for title in tqdm(titles):
        mask2rws(mv.joinpath(mask_dir, title + '.png'),
                 mv.joinpath(rws_dir, title + '.json'),
                 **kwargs)
def test_gen_cls_ds():
    tmp_dir = mv.joinpath(DATA_DIR, 'temporary_subdir')
    mv.mkdirs(tmp_dir)
    tmp_c2l_path = mv.joinpath(tmp_dir, 'tmp_c2l.txt')
    tmp_dsmd_path = mv.joinpath(tmp_dir, 'tmp_dsmd.txt')

    # dsmd / class-to-label files generated from the DataFolder must match
    # the reference files shipped with the test data
    mv.gen_cls_dsmd_file_from_datafolder(DF_DIR, tmp_c2l_path, tmp_dsmd_path)
    assert_equal_dsmds(mv.load_dsmd(DSMD_DF), mv.load_dsmd(tmp_dsmd_path))
    assert_equal_dsmds(mv.load_dsmd(CLS2LBL), mv.load_dsmd(tmp_c2l_path))

    # regenerating the flat dataset must collect all 8 images
    mv.empty_dir(tmp_dir)
    mv.gen_cls_ds_from_datafolder(DF_DIR, tmp_dir)
    assert len(mv.listdir(tmp_dir)) == 8
    mv.rmtree(tmp_dir)
def save_dsmd(dsmd, file_path, auto_mkdirs=True):
    """ Save dataset metadata to specified file.

    Args:
        dsmd (dict): dataset metadata.
        file_path (str): file path to save dataset metadata.
        auto_mkdirs (bool): If the parent folder of `file_path` does not
            exist, whether to create it automatically.
    """
    if auto_mkdirs:
        mv.mkdirs(mv.parentdir(file_path))

    # records are written sorted by key in natural order; iterating the
    # sorted pairs directly is equivalent to building an OrderedDict first
    with open(file_path, 'w') as fd:
        for key, value in natsorted(dsmd.items(), key=lambda t: t[0]):
            if mv.isarrayinstance(value):  # for multi label case
                value = ', '.join(str(entry) for entry in value)
            fd.write('%s, %s\n' % (str(key), str(value)))
def test_split_dsmd_file(dsmd_file):
    tmp_dir = mv.joinpath(DATA_DIR, 'temporary_subdir')
    tmp_path = mv.joinpath(tmp_dir, 'tmp_dsmd.txt')
    mv.mkdirs(tmp_dir)
    mv.cp(dsmd_file, tmp_path)
    datasplit = {'train': 0.9, 'val': 0.1, 'test': 0.0}

    def check_split():
        # a 0.0 fraction must not produce a file; 0.9/0.1 of 20 -> 18/2
        train_path = mv.joinpath(tmp_dir, 'train.csv')
        val_path = mv.joinpath(tmp_dir, 'val.csv')
        test_path = mv.joinpath(tmp_dir, 'test.csv')
        assert mv.isfile(train_path)
        assert mv.isfile(val_path)
        assert not mv.isfile(test_path)
        train = mv.load_dsmd(train_path)
        val = mv.load_dsmd(val_path)
        assert len(train) == 18
        assert len(val) == 2
        return train, val

    # shuffle
    mv.split_dsmd_file(tmp_path, datasplit)
    check_split()

    # non shuffle: record order must be preserved, so the first record
    # lands in train and the 19th in val
    mv.split_dsmd_file(tmp_path, datasplit, shuffle=False)
    train_dsmd, val_dsmd = check_split()
    assert 'brain_001.dcm' in train_dsmd
    assert 'brain_019.dcm' in val_dsmd

    mv.rmtree(tmp_dir)
def imwrite(file_path, img, auto_mkdirs=True):
    """ Save image to specified file.

    Args:
        file_path (str): specified file path to save to.
        img (ndarray): image array to be written.
        auto_mkdirs (bool): If the parent folder of `file_path` does not
            exist, whether to create it automatically.

    Returns:
        (bool): returns whether the image is saved successfully.

    Note:
        If the given image is a color image. It should be in RGB format.
    """
    if auto_mkdirs:
        mv.mkdirs(mv.parentdir(file_path))

    # OpenCV expects BGR channel order, so convert 3-dim (color) images
    out = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) if img.ndim == 3 else img
    return cv2.imwrite(file_path, out)
def test_has_duplicated_files():
    dst_dir = mv.joinpath(DATA_DIR, 'temporary_subdir')
    mv.mkdirs(dst_dir)
    src_paths = ['brain_001.dcm', 'brain_002.dcm', 'brain_003.dcm']

    # distinct files only -> nothing reported
    mv.copyfiles(src_paths, dst_dir, DCM_DIR)
    assert len(mv.find_duplicated_files(dst_dir)) == 0

    # one duplicate pair -> one group containing both paths
    mv.non_overwrite_cp(mv.joinpath(DCM_DIR, src_paths[0]),
                        mv.joinpath(dst_dir, 'dup_0.dcm'))
    duplicated_files = mv.find_duplicated_files(dst_dir)
    assert len(duplicated_files) == 1
    group = duplicated_files[0]
    assert mv.joinpath(dst_dir, 'brain_001.dcm') in group
    assert mv.joinpath(dst_dir, 'dup_0.dcm') in group

    # a second duplicate pair -> two groups
    mv.non_overwrite_cp(mv.joinpath(DCM_DIR, src_paths[1]),
                        mv.joinpath(dst_dir, 'dup_1.dcm'))
    assert len(mv.find_duplicated_files(dst_dir)) == 2

    mv.rmtree(dst_dir)
def save_det_dsmd(dsmd_path, data, class2label, auto_mkdirs=True):
    """ Save dataset metadata to specified file.

    Args:
        dsmd_path (str): file path to save dataset metadata.
        data (dict): dataset metadata, refer to 'load_dsmd'.
        class2label (str or dict): class-to-label file or class2label dict.
        auto_mkdirs (bool): If the parent folder of `file_path` does not
            exist, whether to create it automatically.
    """
    if auto_mkdirs:
        mv.mkdirs(mv.parentdir(dsmd_path))

    # get label->class mapping, loading the class2label file first if needed
    if isinstance(class2label, str):
        class2label = mv.load_c2l(class2label)
    label2class = dict(zip(class2label.values(), class2label.keys()))

    # write dataset metadata loop
    dsmd = mv.make_dsmd(data)
    with open(dsmd_path, 'w') as fd:
        for key, value in dsmd.items():
            _write_record(fd, key, value, label2class)
def gen_cls_ds_from_datafolder(
        in_dir, out_dir, auto_mkdirs=True, classnames=None):
    """ Generate classification dataset from DataFolder.

    This function will make a copy of each image in the DataFolder to the
    specified directory. Original DataFolder is left unchanged.

    Args:
        in_dir (str): DataFolder root directory.
        out_dir (str): directory to save all the images in DataFolder.
        auto_mkdirs (bool): If `out_dir` does not exist, whether to create
            it automatically.
        classnames (list[str]): names of specified classes to be collected.
            If not given, all classes are considered.

    Note:
        This function is expected to be used together with
        gen_cls_dsmd_file_from_datafolder(). Filename of each image in
        DataFolder should be unique. Otherwise, A FileExistsError will be
        thrown. DataFolder is described in
        'gen_cls_dsmd_file_from_datafolder()'.
    """
    assert mv.isdir(in_dir)

    # clean output directory
    if auto_mkdirs:
        # BUGFIX: create out_dir itself, not merely its parent. The old
        # mv.mkdirs(mv.parentdir(out_dir)) left out_dir missing, breaking
        # both the documented contract ("If out_dir does not exist, ...
        # create it automatically") and the empty_dir(out_dir) call below.
        mv.mkdirs(out_dir)
    mv.empty_dir(out_dir)

    if classnames is None:
        classnames = mv.listdir(in_dir)

    for classname in classnames:
        class_dir = mv.joinpath(in_dir, classname)
        assert mv.isdir(class_dir)
        filenames = natsorted(mv.listdir(class_dir))
        # non_overwrite=True surfaces duplicate filenames as FileExistsError
        mv.copyfiles(filenames, out_dir, class_dir, non_overwrite=True)
def __init__(self,
             mode,
             model,
             batch_processor,
             train_dataloader=None,
             val_dataloader=None,
             optimizer=None,
             work_dir=None,
             max_epochs=10000):
    """ A training helper for PyTorch.

    Args:
        model (`torch.nn.Module`): The model to be run.
        mode ('ModeKey'): running mode.
        batch_processor (callable): A callable method that process a data
            batch. The interface of this method should be
            `batch_processor(model, data, train_mode) -> dict`
        train_dataloader ('DataLoader'): train data loader.
        val_dataloader ('DataLoader'): validation data loader.
        optimizer (dict or `Optimizer`, optional): If it is a dict, runner
            will construct an optimizer according to it.
        work_dir (str, optional): The working directory to save checkpoints,
            logs and other outputs.
        max_epochs (int): Total training epochs.
    """
    assert isinstance(mode, mv.ModeKey)
    assert isinstance(model, torch.nn.Module)
    assert callable(batch_processor)
    # BUGFIX: accept what the docstring promises (dict or Optimizer) and the
    # parameter's own default (None). The old check
    # `isinstance(optimizer, (str, torch.optim.Optimizer))` rejected both
    # dict configs and the default None. `str` is kept for compatibility.
    assert (optimizer is None
            or isinstance(optimizer, (str, dict, torch.optim.Optimizer)))
    assert isinstance(work_dir, str) or work_dir is None
    assert isinstance(max_epochs, int)

    self.mode = mode
    self.epoch_runner = getattr(self, mode.value)
    self.model = model
    self.batch_processor = batch_processor
    self.train_dataloader = train_dataloader
    self.val_dataloader = val_dataloader
    self.optimizer = self.build_optimizer(optimizer)

    # create work_dir (defaults to the current directory)
    self.work_dir = mv.abspath(work_dir if work_dir is not None else '.')
    mv.mkdirs(self.work_dir)

    # init TensorboardX visualizer and dataloader; visualization only
    # applies to training runs
    if mode == mv.ModeKey.TRAIN:
        experiment = mv.basename(self.work_dir)
        self.visualizer = mv.TensorboardVisualizer(experiment)
        self.dataloader = self.train_dataloader
    else:
        self.visualizer = None
        self.dataloader = self.val_dataloader

    # init hooks and average meter
    self._hooks = []
    self.average_meter = AverageMeter()

    # init loop parameters; non-train modes run a single epoch
    self._epoch = 0
    self._max_epochs = max_epochs if mode == mv.ModeKey.TRAIN else 1
    self._inner_iter = 0
    self._iter = 0
    self._max_iters = 0

    # get model name from model class (unwrap nn.DataParallel if present)
    if hasattr(self.model, 'module'):
        self._model_name = self.model.module.__class__.__name__
    else:
        self._model_name = self.model.__class__.__name__