def process_components( model: Model, criterion: Criterion = None, optimizer: Optimizer = None, scheduler: Scheduler = None, distributed_params: Dict = None, device: Device = None, ) -> Tuple[Model, Criterion, Optimizer, Scheduler, Device]: """ Returns the processed model, criterion, optimizer, scheduler and device Args: model (Model): torch model criterion (Criterion): criterion function optimizer (Optimizer): optimizer scheduler (Scheduler): scheduler distributed_params (dict, optional): dict with the parameters for distributed and FP16 methond device (Device, optional): device """ distributed_params = distributed_params or {} distributed_params = copy.deepcopy(distributed_params) if device is None: device = utils.get_device() model: Model = maybe_recursive_call(model, "to", device=device) if utils.is_wrapped_with_ddp(model): pass elif len(distributed_params) > 0: assert isinstance(model, nn.Module) distributed_rank = distributed_params.pop("rank", -1) syncbn = distributed_params.pop("syncbn", False) if distributed_rank > -1: torch.cuda.set_device(distributed_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") if "opt_level" in distributed_params: utils.assert_fp16_available() from apex import amp amp_result = amp.initialize(model, optimizer, **distributed_params) if optimizer is not None: model, optimizer = amp_result else: model = amp_result if distributed_rank > -1: from apex.parallel import DistributedDataParallel model = DistributedDataParallel(model) if syncbn: from apex.parallel import convert_syncbn_model model = convert_syncbn_model(model) if distributed_rank <= -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) elif torch.cuda.device_count() > 1: if isinstance(model, nn.Module): model = torch.nn.DataParallel(model) elif isinstance(model, dict): model = {k: torch.nn.DataParallel(v) for k, v in model.items()} model = maybe_recursive_call(model, "to", device=device) return model, criterion, optimizer, scheduler, device
def _get_optimizer(self, stage: str, model: Union[Model, Dict[str, Model]], **params) -> Optimizer: # @TODO 1: refactoring; this method is too long # @TODO 2: load state dicts for schedulers & criterion layerwise_params = params.pop("layerwise_params", OrderedDict()) no_bias_weight_decay = params.pop("no_bias_weight_decay", True) # linear scaling rule from https://arxiv.org/pdf/1706.02677.pdf lr_scaling_params = params.pop("lr_linear_scaling", None) if lr_scaling_params: data_params = dict(self.stages_config[stage]["data_params"]) batch_size = data_params.get("batch_size") per_gpu_scaling = data_params.get("per_gpu_scaling", False) distributed_rank = utils.get_rank() distributed = distributed_rank > -1 if per_gpu_scaling and not distributed: num_gpus = max(1, torch.cuda.device_count()) batch_size *= num_gpus base_lr = lr_scaling_params.get("lr") base_batch_size = lr_scaling_params.get("base_batch_size", 256) lr_scaling = batch_size / base_batch_size params["lr"] = base_lr * lr_scaling # scale default lr else: lr_scaling = 1.0 # getting model parameters model_key = params.pop("_model", None) if model_key is None: assert isinstance( model, nn.Module ), "model is key-value, but optimizer has no specified model" model_params = utils.process_model_params(model, layerwise_params, no_bias_weight_decay, lr_scaling) elif isinstance(model_key, str): model_params = utils.process_model_params( model[model_key], layerwise_params, no_bias_weight_decay, lr_scaling, ) elif isinstance(model_key, (list, tuple)): model_params = [] for model_key_el in model_key: model_params_el = utils.process_model_params( model[model_key_el], layerwise_params, no_bias_weight_decay, lr_scaling, ) model_params.extend(model_params_el) else: raise ValueError("unknown type of model_params") load_from_previous_stage = params.pop("load_from_previous_stage", False) optimizer_key = params.pop("optimizer_key", None) optimizer = OPTIMIZERS.get_from_params(**params, params=model_params) if load_from_previous_stage and self.stages.index(stage) != 0: checkpoint_path = f"{self.logdir}/checkpoints/best_full.pth" checkpoint = utils.load_checkpoint(checkpoint_path) dict2load = optimizer if optimizer_key is not None: dict2load = {optimizer_key: optimizer} utils.unpack_checkpoint(checkpoint, optimizer=dict2load) # move optimizer to device device = utils.get_device() for param in model_params: param = param["params"][0] optimizer_state = optimizer.state[param] for state_key, state_value in optimizer_state.items(): optimizer_state[state_key] = utils.any2device( state_value, device) # update optimizer params for key, value in params.items(): for optimizer_param_group in optimizer.param_groups: optimizer_param_group[key] = value return optimizer
def trace( self, model: Model = None, batch=None, logdir: str = None, loader: DataLoader = None, method_name: str = "forward", mode: str = "eval", requires_grad: bool = False, fp16: Union[Dict, bool] = None, device: Device = "cpu", predict_params: dict = None, ) -> ScriptModule: """ Traces model using Torch Jit Args: model (Model): model to trace batch: batch to forward through the model to trace logdir (str, optional): If specified, the result will be written to the directory loader (DataLoader, optional): if batch is not specified, the batch will be ``next(iter(loader))`` method_name (str): model's method name that will be traced mode (str): ``train`` or ``eval`` requires_grad (bool): flag to trace with gradients fp16 (Union[Dict, bool]): If not None, then sets tracing params to FP16 deivice (Device): Torch deivice or a string predict_params (dict): additional parameters for model forward """ if batch is None: if loader is None: raise ValueError( "If batch is not provided the loader must be specified") batch = next(iter(loader)) if model is not None: self.model = model if isinstance(fp16, bool) and fp16: opt_level = "O1" elif isinstance(fp16, bool) and not fp16: opt_level = None elif isinstance(fp16, dict): opt_level = fp16["opt_level"] else: opt_level = fp16 if opt_level is not None: device = "cuda" elif device is None: if self.device is None: self.device = utils.get_device() device = self.device result = trace.trace_model(model=self.model, runner=self, batch=batch, method_name=method_name, mode=mode, requires_grad=requires_grad, opt_level=opt_level, device=device, predict_params=predict_params) if logdir is not None: filename = trace.get_trace_name(method_name=method_name, mode=mode, requires_grad=requires_grad, opt_level=opt_level) logdir = Path(logdir) output: Path = logdir / "trace" output.mkdir(exist_ok=True, parents=True) out_model = str(output / filename) torch.jit.save(result, out_model) return result
valid_transforms_fn=valid_data_transforms, batch_size=config.batch_size, config=config) model = plant.get_model(config.model_name, config.num_classes) criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=config.lr) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, patience=config.patience, verbose=True, mode="min", factor=0.3) device = utils.get_device() if config.is_fp16_used: fp16_params = dict(opt_level="O1") # params for FP16 else: fp16_params = None runner = SupervisedRunner(device=device) runner.train( model=model, criterion=criterion, optimizer=optimizer, scheduler=scheduler, loaders=loaders, # We can specify the callbacks list for the experiment;
def trace( self, *, model: Model = None, batch: Any = None, logdir: str = None, loader: DataLoader = None, method_name: str = "forward", mode: str = "eval", requires_grad: bool = False, fp16: Union[Dict, bool] = None, device: Device = "cpu", predict_params: dict = None, ) -> ScriptModule: """ Traces model using Torch Jit. Args: model (Model): model to trace batch: batch to forward through the model to trace logdir (str, optional): If specified, the result will be written to the directory loader (DataLoader, optional): if batch is not specified, the batch will be ``next(iter(loader))`` method_name (str): model's method name that will be traced mode (str): ``train`` or ``eval`` requires_grad (bool): flag to trace with gradients fp16 (Union[Dict, bool]): If not None, then sets tracing params to FP16 device (Device): Torch device or a string predict_params (dict): additional parameters for model forward """ if batch is None: if loader is None: raise ValueError( "If batch is not provided the loader must be specified" ) batch = next(iter(loader)) if model is not None: self.model = model assert self.model is not None if isinstance(fp16, bool) and fp16: opt_level = "O1" elif isinstance(fp16, bool) and not fp16: opt_level = None elif isinstance(fp16, dict): opt_level = fp16["opt_level"] else: opt_level = fp16 if opt_level is not None: device = "cuda" elif device is None: if self.device is None: self.device = utils.get_device() device = self.device # Dumping previous state of the model, we will need it to restore _device, _is_training, _requires_grad = ( self.device, self.model.training, utils.get_requires_grad(self.model), ) self.model.to(device) # function to run prediction on batch def predict_fn(model, inputs, **kwargs): _model = self.model self.model = model result = self.predict_batch(inputs, **kwargs) self.model = _model return result traced_model = utils.trace_model( model=self.model, predict_fn=predict_fn, batch=batch, method_name=method_name, mode=mode, requires_grad=requires_grad, opt_level=opt_level, device=device, predict_params=predict_params, ) if logdir is not None: utils.save_traced_model( model=traced_model, logdir=logdir, method_name=method_name, mode=mode, requires_grad=requires_grad, opt_level=opt_level, ) # Restore previous state of the model getattr(self.model, "train" if _is_training else "eval")() utils.set_requires_grad(self.model, _requires_grad) self.model.to(_device) return traced_model
def main(): args = get_args() os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus SEED = 42 utils.set_global_seed(SEED) utils.prepare_cudnn(deterministic=True) num_classes = 14 #define datasets train_dataset = ChestXrayDataSet( data_dir=args.path_to_images, image_list_file=args.train_list, transform=transforms_train, ) val_dataset = ChestXrayDataSet( data_dir=args.path_to_images, image_list_file=args.val_list, transform=transforms_val, ) loaders = { 'train': DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers), 'valid': DataLoader(val_dataset, batch_size=2, shuffle=False, num_workers=args.num_workers) } logdir = args.log_dir #where model weights and logs are stored #define model model = DenseNet121(num_classes) if len(args.gpus) > 1: model = nn.DataParallel(model) device = utils.get_device() runner = SupervisedRunner(device=device) optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=0.0003) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.25, patience=2) weights = torch.Tensor( [10, 100, 30, 8, 40, 40, 330, 140, 35, 155, 110, 250, 155, 200]).to(device) criterion = BCEWithLogitsLoss(pos_weight=weights) class_names = [ 'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema', 'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia' ] runner.train( model=model, logdir=logdir, criterion=criterion, optimizer=optimizer, scheduler=scheduler, loaders=loaders, num_epochs=args.epochs, # We can specify the callbacks list for the experiment; # For this task, we will check AUC and accuracy callbacks=[ AUCCallback( input_key="targets", output_key='logits', prefix='auc', class_names=class_names, num_classes=num_classes, activation='Sigmoid', ), AccuracyCallback( input_key="targets", output_key="logits", prefix="accuracy", accuracy_args=[1], num_classes=14, threshold=0.5, activation='Sigmoid', ), ], main_metric='auc/_mean', minimize_metric=False, verbose=True, )
def main_kaggle_smp(path_dataset='/dataset/kaggle/understanding_cloud_organization', ENCODER='resnet50', ENCODER_WEIGHTS='imagenet', num_workers=0, batch_size=8, epochs=19, debug=False, exec_catalyst=True, logdir="/src/logs/segmentation", pretrained=True ): # below line is potential input args # (name_dataset='eurosat', lr=0.0001, wd=0, ratio=0.9, batch_size=32, workers=4, epochs=15, num_gpus=1, # resume=None, dir_weights='./weights'): torch.backends.cudnn.benchmark = True # Dataset train, sub = get_meta_info_table(path_dataset) train_ids, valid_ids, test_ids = prepare_dataset(train, sub) preprocessing_fn = smp.encoders.get_preprocessing_fn(ENCODER, ENCODER_WEIGHTS) train_dataset = CloudDataset(df=train, datatype='train', img_ids=train_ids, transforms=get_training_augmentation(), preprocessing=get_preprocessing(preprocessing_fn), path=path_dataset) valid_dataset = CloudDataset(df=train, datatype='valid', img_ids=valid_ids, transforms=get_validation_augmentation(), preprocessing=get_preprocessing(preprocessing_fn), path=path_dataset) # DataLoader train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers) valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers) loaders = { "train": train_loader, "valid": valid_loader } # todo: check how to used device in this case DEVICE = 'cuda' if debug: device = 'cpu' else: device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") ACTIVATION = None model = smp.Unet( encoder_name=ENCODER, encoder_weights=ENCODER_WEIGHTS, classes=4, activation=ACTIVATION, ) images, labels = next(iter(train_loader)) model.to(device) print(model) print(summary(model, input_size=tuple(images.shape[1:]))) # use smp epoch # num_epochs = 19 # model, criterion, optimizer optimizer = torch.optim.Adam([ {'params': model.decoder.parameters(), 'lr': 1e-2}, {'params': model.encoder.parameters(), 'lr': 1e-3}, ]) scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=2) criterion = smp.utils.losses.DiceLoss(eps=1.) # smp.utils.losses.BCEDiceLoss(eps=1.) if not pretrained: # catalyst if exec_catalyst: device = utils.get_device() runner = SupervisedRunner(device=device) # train model runner.train( model=model, criterion=criterion, optimizer=optimizer, scheduler=scheduler, loaders=loaders, callbacks=[DiceCallback(), EarlyStoppingCallback(patience=5, min_delta=0.001)], logdir=logdir, num_epochs=epochs, verbose=True ) # # prediction # encoded_pixels = [] # loaders = {"infer": valid_loader} # runner.infer( # model=model, # loaders=loaders, # callbacks=[ # CheckpointCallback( # resume=f"{logdir}/checkpoints/best.pth"), # InferCallback() # ], # ) # valid_masks = [] # # # todo: where .pth? # # todo: from here # valid_num = valid_dataset.__len__() # probabilities = np.zeros((valid_num * 4, 350, 525)) # for i, (batch, output) in enumerate(tqdm(zip( # valid_dataset, runner.callbacks[0].predictions["logits"]))): # image, mask = batch # for m in mask: # if m.shape != (350, 525): # m = cv2.resize(m, dsize=(525, 350), interpolation=cv2.INTER_LINEAR) # valid_masks.append(m) # # for j, probability in enumerate(output): # if probability.shape != (350, 525): # probability = cv2.resize(probability, dsize=(525, 350), interpolation=cv2.INTER_LINEAR) # probabilities[valid_num * 4 + j, :, :] = probability # # # todo: from here # class_params = {} # for class_id in range(4): # print(class_id) # attempts = [] # for t in range(0, 100, 5): # t /= 100 # for ms in [0, 100, 1200, 5000, 10000]: # masks = [] # for i in range(class_id, len(probabilities), 4): # probability = probabilities[i] # predict, num_predict = post_process(sigmoid(probability), t, ms) # masks.append(predict) # # d = [] # for i, j in zip(masks, valid_masks[class_id::4]): # if (i.sum() == 0) & (j.sum() == 0): # d.append(1) # else: # d.append(dice(i, j)) # # attempts.append((t, ms, np.mean(d))) # # attempts_df = pd.DataFrame(attempts, columns=['threshold', 'size', 'dice']) # # attempts_df = attempts_df.sort_values('dice', ascending=False) # print(attempts_df.head()) # best_threshold = attempts_df['threshold'].values[0] # best_size = attempts_df['size'].values[0] # # class_params[class_id] = (best_threshold, best_size) else: for epoch in trange(epochs, desc="Epochs"): metrics_train = train_epoch(model, train_loader, criterion, optimizer, device) metrics_eval = eval_epoch(model, valid_loader, criterion, device) scheduler.step(metrics_eval['valid_loss']) print(f'epoch: {epoch} ', metrics_train, metrics_eval) else: if exec_catalyst: device = utils.get_device() checkpoint = utils.load_checkpoint(f'{logdir}/checkpoints/best_full.pth') utils.unpack_checkpoint(checkpoint, model=model) runner = SupervisedRunner(model=model) # prediction with infer encoded_pixels = [] loaders = {"infer": valid_loader} runner.infer( model=model, loaders=loaders, callbacks=[ CheckpointCallback( resume=f"{logdir}/checkpoints/best.pth"), InferCallback() ], ) # todo: jupyterで確認中 valid_masks = [] valid_num = valid_dataset.__len__() probabilities = np.zeros((valid_num * 4, 350, 525)) for i, (batch, output) in enumerate(tqdm(zip( valid_dataset, runner.callbacks[0].predictions["logits"]))): image, mask = batch for m in mask: if m.shape != (350, 525): m = cv2.resize(m, dsize=(525, 350), interpolation=cv2.INTER_LINEAR) valid_masks.append(m) for j, probability in enumerate(output): if probability.shape != (350, 525): probability = cv2.resize(probability, dsize=(525, 350), interpolation=cv2.INTER_LINEAR) probabilities[i * 4 + j, :, :] = probability class_params = {} for class_id in range(4): print(class_id) attempts = [] for t in range(0, 100, 5): t /= 100 for ms in [0, 100, 1200, 5000, 10000]: masks = [] for i in range(class_id, len(probabilities), 4): probability = probabilities[i] predict, num_predict = post_process(sigmoid(probability), t, ms) masks.append(predict) d = [] for i, j in zip(masks, valid_masks[class_id::4]): if (i.sum() == 0) & (j.sum() == 0): d.append(1) else: d.append(dice(i, j)) attempts.append((t, ms, np.mean(d))) attempts_df = pd.DataFrame(attempts, columns=['threshold', 'size', 'dice']) attempts_df = attempts_df.sort_values('dice', ascending=False) print(attempts_df.head()) best_threshold = attempts_df['threshold'].values[0] best_size = attempts_df['size'].values[0] class_params[class_id] = (best_threshold, best_size) # predictions torch.cuda.empty_cache() gc.collect() test_dataset = CloudDataset(df=sub, datatype='test', img_ids=test_ids, transforms=get_validation_augmentation(), preprocessing=get_preprocessing(preprocessing_fn)) test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=0) loaders = {"test": test_loader} encoded_pixels = [] image_id = 0 for i, test_batch in enumerate(tqdm(loaders['test'])): runner_out = runner.predict_batch({"features": test_batch[0].cuda()})['logits'] for i, batch in enumerate(runner_out): for probability in batch: probability = probability.cpu().detach().numpy() if probability.shape != (350, 525): probability = cv2.resize(probability, dsize=(525, 350), interpolation=cv2.INTER_LINEAR) predict, num_predict = post_process(sigmoid(probability), class_params[image_id % 4][0], class_params[image_id % 4][1]) if num_predict == 0: encoded_pixels.append('') else: r = mask2rle(predict) encoded_pixels.append(r) image_id += 1 sub['EncodedPixels'] = encoded_pixels sub.to_csv('data/kaggle_cloud_org/submission.csv', columns=['Image_Label', 'EncodedPixels'], index=False)
def main(): N_CLASSES = 14 CLASS_NAMES = ['Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema', 'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia'] # initialize model device = utils.get_device() model = DenseNet121(N_CLASSES).to(device) checkpoint = torch.load(args.checkpoint) model.load_state_dict(checkpoint['model_state_dict']) # initialize test loader test_dataset = ChestXrayDataSet(data_dir=args.path_to_images, image_list_file=args.test_list, transform=transforms_test) test_loader = DataLoader(dataset=test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, pin_memory=True) # initialize the ground truth and output tensor gt = torch.FloatTensor() gt = gt.cuda() pred = torch.FloatTensor() pred = pred.cuda() # switch to evaluate mode model.eval() with torch.no_grad(): for i, (inp, target) in enumerate(test_loader): target = target.cuda() gt = torch.cat((gt, target), 0) bs, c, h, w = inp.size() input_var = torch.autograd.Variable(inp.view(-1, c, h, w).cuda()) output = model(input_var) output_mean = output.view(bs, -1) pred = torch.cat((pred, output_mean.data), 0) gt_np = gt.cpu().numpy() pred_np = sigmoid(pred.cpu().numpy()) Y_t = [] #labels for each anomaly for i in range(N_CLASSES): Y_t.append([]) for x in gt_np: Y_t[i].append(x[i]) Y_pred = [] #preds for each anomaly for j in range(N_CLASSES): Y_pred.append([]) for y in pred_np: Y_pred[j].append(y[j]) AUCs = [] # AUCs for each for i in range(N_CLASSES): auc = roc_auc_score(Y_t[i], Y_pred[i]) AUCs.append(auc) matrices=[] #for each for i in range(14): matrix = confusion_matrix(Y_t[i], np.asarray(Y_pred[i])>0.6) matrices.append(matrix) class_names = ['no disease', 'disease'] fig = plt.figure(figsize = (20,20)) for i in range(14): plt.subplot(4,4,i+1) df_cm = pd.DataFrame( matrices[i], index=class_names, columns=class_names) heatmap = sns.heatmap(df_cm, annot=True, fmt="d").set_title(CLASS_NAMES[i]) plt.ylabel('True label') plt.xlabel('Predicted label') plt.show() fig.savefig(os.path.join(args.test_outdir,'confusion_matrix.pdf')) fig, axes2d = plt.subplots(nrows=2, ncols=7, sharex=True, sharey=True,figsize = (12, 4)) for i, row in enumerate(axes2d): for j, cell in enumerate(row): if i==0: x=i+j else: x=13-i*j fpr, tpr, threshold = roc_curve(Y_t[x], Y_pred[x]) roc_auc = auc(fpr, tpr) cell.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc) cell.legend(loc = 'lower right', handlelength=0,handletextpad=0,frameon=False, prop={'size': 8}) cell.plot([0, 1], [0, 1],'r--') plt.xlim([0, 1]) plt.ylim([0, 1]) cell.set_title(CLASS_NAMES[x],fontsize=10) if i == len(axes2d) - 1: cell.set_xlabel('False positive rate') if j == 0: cell.set_ylabel('True negative rate') fig.tight_layout(pad=1.0) plt.show() fig.savefig(os.path.join(args.test_outdir,'roc_auc.pdf'))