def objective(trial):
    config = toml.load(args.config)
    lr = 1e-3
    # config['block'][0]['stride'] = [trial.suggest_int('stride', 4, 6)]

    # C1
    config['block'][0]['kernel'] = [int(trial.suggest_discrete_uniform('c1_kernel', 1, 129, 2))]
    config['block'][0]['filters'] = trial.suggest_int('c1_filters', 1, 1024)

    # B1 - B5
    for i in range(1, 6):
        config['block'][i]['repeat'] = trial.suggest_int('b%s_repeat' % i, 1, 9)
        config['block'][i]['filters'] = trial.suggest_int('b%s_filters' % i, 1, 512)
        config['block'][i]['kernel'] = [int(trial.suggest_discrete_uniform('b%s_kernel' % i, 1, 129, 2))]

    # C2
    config['block'][-2]['kernel'] = [int(trial.suggest_discrete_uniform('c2_kernel', 1, 129, 2))]
    config['block'][-2]['filters'] = trial.suggest_int('c2_filters', 1, 1024)

    # C3
    config['block'][-1]['kernel'] = [int(trial.suggest_discrete_uniform('c3_kernel', 1, 129, 2))]
    config['block'][-1]['filters'] = trial.suggest_int('c3_filters', 1, 1024)

    model = load_symbol(config, 'Model')(config)
    num_params = sum(p.numel() for p in model.parameters())

    print("[trial %s]" % trial.number)
    if num_params > args.max_params:
        print("[pruned] network too large")
        raise optuna.exceptions.TrialPruned()

    model.to(args.device)
    model.train()

    os.makedirs(workdir, exist_ok=True)

    optimizer = AdamW(model.parameters(), amsgrad=True, lr=lr)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
    # note: this scheduler is constructed but never stepped in this snippet
    scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader))

    for epoch in range(1, args.epochs + 1):
        try:
            train_loss, duration = train(model, device, train_loader, optimizer, use_amp=True)
            val_loss, val_mean, val_median = test(model, device, test_loader)
            print("[epoch {}] directory={} loss={:.4f} mean_acc={:.3f}% median_acc={:.3f}%".format(
                epoch, workdir, val_loss, val_mean, val_median))
        except KeyboardInterrupt:
            exit()
        except Exception:
            print("[pruned] exception")
            raise optuna.exceptions.TrialPruned()

        if np.isnan(val_loss):
            val_loss = 9.9

        trial.report(val_loss, epoch)
        if trial.should_prune():
            print("[pruned] unpromising")
            raise optuna.exceptions.TrialPruned()

    trial.set_user_attr('seed', args.seed)
    trial.set_user_attr('val_loss', val_loss)
    trial.set_user_attr('val_mean', val_mean)
    trial.set_user_attr('val_median', val_median)
    trial.set_user_attr('train_loss', train_loss)
    trial.set_user_attr('batchsize', args.batch)
    trial.set_user_attr('model_params', num_params)

    torch.save(model.state_dict(), os.path.join(workdir, "weights_%s.tar" % trial.number))
    toml.dump(config, open(os.path.join(workdir, 'config_%s.toml' % trial.number), 'w'))

    print("[loss] %.4f" % val_loss)
    return val_loss
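# A minimal sketch: in recent Optuna releases (assumption: >= 3.0),
# trial.suggest_discrete_uniform() is deprecated, and an odd kernel size like
# the 1..129 step-2 range above is usually drawn with suggest_int and a step.
# The helper name is hypothetical.
def suggest_odd_kernel(trial, name):
    # returns an odd kernel size in [1, 129]
    return trial.suggest_int(name, 1, 129, step=2)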
def train(train_dataloader, query_dataloader, retrieval_dataloader, arch,
          code_length, device, lr, max_iter, topk, evaluate_interval,
          anchor_num, proportion):
    # print("using device")
    # print(torch.cuda.current_device())
    # print(torch.cuda.get_device_name(torch.cuda.current_device()))

    # Load model
    model = load_model(arch, code_length).to(device)
    model_mo = load_model_mo(arch).to(device)

    # Create criterion, optimizer, scheduler
    criterion = PrototypicalLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=5e-4)
    scheduler = CosineAnnealingLR(optimizer, max_iter, lr / 100)

    # Initialization
    running_loss = 0.
    best_map = 0.
    training_time = 0.

    # Training
    for it in range(max_iter):
        tic = time.time()

        # Harvest prototypes/anchors (sometimes killed; try another way)
        with torch.no_grad():
            output_mo = torch.tensor([]).to(device)
            for data, _, _ in train_dataloader:
                data = data.to(device)
                output_mo_temp = model_mo(data)
                output_mo = torch.cat((output_mo, output_mo_temp), 0)
                torch.cuda.empty_cache()
            anchor = get_anchor(output_mo, anchor_num, device)  # compute anchor

        # Self-supervised deep learning
        model.train()
        for data, targets, index in train_dataloader:
            data, targets, index = data.to(device), targets.to(device), index.to(device)
            optimizer.zero_grad()

            # Output
            output_B = model(data)
            output_mo_batch = model_mo(data)

            # Prototype/anchor based similarity
            # sample_anchor_distance = torch.sqrt(torch.sum((output_mo_batch[:, None, :] - anchor) ** 2, dim=2)).to(device)
            # sample_anchor_dist_normalize = F.normalize(sample_anchor_distance, p=2, dim=1).to(device)
            # S = sample_anchor_dist_normalize @ sample_anchor_dist_normalize.t()

            # Loss
            # loss = criterion(output_B, S)
            # running_loss = running_loss + loss.item()
            # loss.backward(retain_graph=True)
            with torch.no_grad():
                dist = torch.sum((output_mo_batch[:, None, :] - anchor.to(device)) ** 2, dim=2)
                k = dist.size(1)
                dist = torch.exp(-1 * dist / torch.max(dist)).to(device)
                Z_su = torch.ones(k, 1).to(device)
                Z_sum = torch.sqrt(dist.mm(Z_su)) + 1e-12
                Z_simi = torch.div(dist, Z_sum).to(device)
                S = Z_simi.mm(Z_simi.t())
                S = (2 / (torch.max(S) - torch.min(S))) * S - 1

            loss = criterion(output_B, S)
            running_loss += loss.item()
            loss.backward()
            optimizer.step()

            with torch.no_grad():
                # Momentum update of the key encoder
                for param_q, param_k in zip(model.parameters(), model_mo.parameters()):
                    param_k.data = param_k.data * proportion + param_q.data * (1. - proportion)
                    # proportion = 0.999 for update

        scheduler.step()
        training_time += time.time() - tic

        # Evaluate
        if it % evaluate_interval == evaluate_interval - 1:
            # Generate hash code
            query_code = generate_code(model, query_dataloader, code_length, device)
            retrieval_code = generate_code(model, retrieval_dataloader, code_length, device)
            query_targets = query_dataloader.dataset.get_onehot_targets()
            retrieval_targets = retrieval_dataloader.dataset.get_onehot_targets()

            # Compute mAP
            mAP = mean_average_precision(
                query_code.to(device),
                retrieval_code.to(device),
                query_targets.to(device),
                retrieval_targets.to(device),
                device,
                topk,
            )

            # Compute PR curve
            P, R = pr_curve(
                query_code.to(device),
                retrieval_code.to(device),
                query_targets.to(device),
                retrieval_targets.to(device),
                device,
            )

            # Log
            logger.info('[iter:{}/{}][loss:{:.2f}][map:{:.4f}][time:{:.2f}]'.format(
                it + 1, max_iter, running_loss / evaluate_interval, mAP, training_time))
            running_loss = 0.

            # Checkpoint
            if best_map < mAP:
                best_map = mAP
                checkpoint = {
                    'model': model.state_dict(),
                    'qB': query_code.cpu(),
                    'rB': retrieval_code.cpu(),
                    'qL': query_targets.cpu(),
                    'rL': retrieval_targets.cpu(),
                    'P': P,
                    'R': R,
                    'map': best_map,
                }

    return checkpoint
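# A small helper form of the MoCo-style momentum (EMA) update used inline
# above; a sketch, with `m` playing the role of `proportion`:
@torch.no_grad()
def momentum_update(model_q, model_k, m=0.999):
    # key encoder tracks the query encoder as an exponential moving average
    for param_q, param_k in zip(model_q.parameters(), model_k.parameters()):
        param_k.data.mul_(m).add_(param_q.data, alpha=1.0 - m)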
def cos_lr_scheduler(optimizer, t_max=3):
    return CosineAnnealingLR(optimizer, T_max=t_max)
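# A minimal usage sketch (model / num_epochs / train_one_epoch are
# placeholders, not from the original): T_max counts scheduler.step() calls
# per half-cosine, so the learning rate sweeps from its initial value toward
# eta_min (0 by default) over t_max steps.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = cos_lr_scheduler(optimizer, t_max=num_epochs)
for epoch in range(num_epochs):
    train_one_epoch(model, optimizer)  # hypothetical helper
    scheduler.step()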
def configure_optimizers(self):
    optimizer = Adam(self.parameters(), lr=self.hparams.learning_rate)
    # scheduler = StepLR(optimizer, step_size=300)
    scheduler = CosineAnnealingLR(optimizer, self.trainer.max_epochs, 10e-6)  # eta_min = 10e-6, i.e. 1e-5
    return {"optimizer": optimizer, "lr_scheduler": scheduler}
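# A sketch of the more explicit PyTorch Lightning return form (assuming a
# recent Lightning version; the eta_min value is illustrative): the nested
# dict states how often the scheduler steps instead of relying on the
# per-epoch default.
def configure_optimizers(self):
    optimizer = Adam(self.parameters(), lr=self.hparams.learning_rate)
    scheduler = CosineAnnealingLR(optimizer, T_max=self.trainer.max_epochs, eta_min=1e-6)
    return {
        "optimizer": optimizer,
        "lr_scheduler": {"scheduler": scheduler, "interval": "epoch"},
    }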
def train(args, config, io):
    train_loader, validation_loader, unlabelled_loader = get_loader(args, config)
    device = torch.device("cuda" if args.cuda else "cpu")

    # Try to load models
    model = DNN(args).to(device)
    ema_model = DNN(args).to(device)
    for param in ema_model.parameters():
        param.detach_()

    if device == torch.device("cuda"):
        model = nn.DataParallel(model)
        ema_model = nn.DataParallel(ema_model)

    if args.model_path != "":
        model.load_state_dict(torch.load(args.model_path))
        ema_model.load_state_dict(torch.load(args.model_path))

    if args.use_sgd:
        print("Use SGD")
        opt = optim.SGD(model.parameters(), lr=args.lr * 100,
                        momentum=args.momentum, weight_decay=1e-4)
    else:
        print("Use Adam")
        opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

    # With SGD this anneals from 100 * args.lr down to args.lr; with Adam the
    # initial lr already equals eta_min, so the schedule keeps the lr constant.
    scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=args.lr)

    criterion = nn.MSELoss()
    consistency_criterion = nn.MSELoss()

    best_test_loss = 9999999.
    global_step = 0
    for epoch in range(args.epochs):
        startTime = time.time()

        ####################
        # Train
        ####################
        train_loss = 0.0
        count = 0.0
        model.train()
        ema_model.train()
        i = -1
        for (data, label), (u, _) in zip(cycle(train_loader), unlabelled_loader):
            i = i + 1
            if data.shape[0] != u.shape[0]:
                bt_size = np.minimum(data.shape[0], u.shape[0])
                data = data[0:bt_size]
                label = label[0:bt_size]
                u = u[0:bt_size]
            data, label, u = data.to(device), label.to(device), u.to(device)

            batch_size = data.shape[0]
            logits = model(data)
            class_loss = criterion(logits, label)

            u_student = jitter(u, device)
            u_teacher = jitter(u, device)
            logits_unlabeled = model(u_student)
            ema_logits_unlabeled = ema_model(u_teacher)
            ema_logits_unlabeled = Variable(ema_logits_unlabeled.detach().data, requires_grad=False)
            consistency_loss = consistency_criterion(logits_unlabeled, ema_logits_unlabeled)
            if epoch < args.consistency_rampup_starts:
                consistency_weight = 0.0
            else:
                consistency_weight = get_current_consistency_weight(
                    args, args.final_consistency, epoch, i, len(unlabelled_loader))

            consistency_loss = consistency_weight * consistency_loss
            loss = class_loss + consistency_loss

            opt.zero_grad()
            loss.backward()
            opt.step()
            global_step += 1
            # print(global_step)
            update_ema_variables(model, ema_model, args.ema_decay, global_step)

            count += batch_size
            train_loss += loss.item() * batch_size

        scheduler.step()
        outstr = 'Train %d, loss: %.6f' % (epoch, train_loss * 1.0 / count)
        io.cprint(outstr)

        ####################
        # Evaluation
        ####################
        test_loss = 0.0
        count = 0.0
        model.eval()
        ema_model.eval()
        for data, label in validation_loader:
            data, label = data.to(device), label.to(device)
            batch_size = data.shape[0]
            logits = ema_model(data)
            loss = criterion(logits, label)
            count += batch_size
            test_loss += loss.item() * batch_size

        outstr = 'Test %d, loss: %.6f' % (epoch, test_loss * 1.0 / count)
        io.cprint(outstr)
        if test_loss <= best_test_loss:
            best_test_loss = test_loss
            torch.save(ema_model.state_dict(), 'checkpoints/%s/models/model.t7' % args.exp_name)
            torch.save(ema_model, (config.root + config.model_path))
        io.cprint('Time: %.3f sec' % (time.time() - startTime))
class SGHMC(_Inference):
    def __init__(self, hyperparameters, model=None, train_loader=None,
                 model_loss='multi_class_linear_output',
                 device=torch.device('cpu')):
        '''
        :param hyperparameters: Hyperparameters include {'lr', 'prior_std', 'num_samples'}
        :param model: Pytorch model to run SGLD on.
        :param train_loader: DataLoader for train data
        :param model_loss: Loss function to use for the model. (e.g.: 'multi_class_linear_output')
        :param device: Device on which model is present (e.g.: torch.device('cpu'))
        '''
        if hyperparameters is None:
            # Initialise with some default values
            hyperparameters = {
                'lr': 0.001,
                'prior_std': 10,
                'num_samples': 2,
                'alpha': 0.1,
                'burn_in_epochs': 10
            }
        super(SGHMC, self).__init__(hyperparameters, model, train_loader, device)
        self.lr = hyperparameters['lr']
        self.prior_std = hyperparameters['prior_std']
        self.num_samples = hyperparameters['num_samples']
        self.alpha = hyperparameters['alpha']
        self.burn_in_epochs = hyperparameters['burn_in_epochs']
        self.model_loss = model_loss
        self.model = model
        self.train_loader = train_loader
        self.device = device
        self.dataset_size = len(train_loader.dataset)
        self.optimizer = optimSGHMC(params=self.model.parameters(),
                                    lr=self.lr,
                                    momentum=1 - self.alpha,
                                    num_training_samples=self.dataset_size,
                                    weight_decay=1 / (self.prior_std ** 2))
        self.loss_criterion = get_loss_criterion(loss=model_loss)
        self.burnt_in = False
        self.epochs_run = 0
        self.lr_final = self.lr / 2
        # self.optimizer_scheduler = CosineAnnealingLR(
        #     optimizer=self.optimizer,
        #     T_max=(self.burn_in_epochs + self.num_samples),
        #     eta_min=self.lr_final)
        self.optimizer_scheduler = OneCycleLR(
            optimizer=self.optimizer,
            max_lr=self.lr * 5,
            steps_per_epoch=len(self.train_loader),
            epochs=self.burn_in_epochs + self.num_samples)

    def update_hyp(self, hyperparameters):
        self.lr = hyperparameters['lr']
        self.prior_std = hyperparameters['prior_std']
        self.num_samples = hyperparameters['num_samples']
        self.alpha = hyperparameters['alpha']
        self.burn_in_epochs = hyperparameters['burn_in_epochs']
        self.model = reset_model(self.model)
        self.burnt_in = False
        self.epochs_run = 0
        self.optimizer = optimSGHMC(params=self.model.parameters(),
                                    lr=self.lr,
                                    momentum=1 - self.alpha,
                                    num_training_samples=self.dataset_size,
                                    weight_decay=1 / (self.prior_std ** 2))
        self.lr_final = self.lr / 2
        # Caution: this scheduler is stepped once per batch in
        # sample_iterative(), while T_max here is counted in epochs, so the
        # cosine cycle finishes after T_max batches rather than T_max epochs.
        self.optimizer_scheduler = CosineAnnealingLR(
            optimizer=self.optimizer,
            T_max=self.burn_in_epochs + self.num_samples,
            eta_min=self.lr_final)

    def sample_iterative(self, val_loader=None, debug_val_loss=False, wandb_debug=False):
        if issubclass(self.model.__class__, torch.nn.Module):
            if self.burnt_in is False:
                epochs = self.burn_in_epochs + 1
                self.burnt_in = True
            else:
                epochs = 1
            for epoch in range(epochs):
                self.model.train()
                total_epoch_train_loss = 0.
                for batch_idx, (batch_data, batch_labels) in enumerate(self.train_loader):
                    batch_data = batch_data.to(self.device)
                    batch_labels = batch_labels.to(self.device)
                    batch_data_logits = self.model(batch_data)
                    self.optimizer.zero_grad()
                    loss = self.loss_criterion(batch_data_logits, batch_labels)
                    loss.backward()
                    total_epoch_train_loss += loss.item() * len(batch_data)
                    self.optimizer.step(add_langevin_noise=True)
                    self.optimizer_scheduler.step()
                if debug_val_loss:
                    avg_val_loss = self.compute_val_loss(val_loader)
                    avg_train_loss = total_epoch_train_loss / self.dataset_size
                    metrics = {
                        'train_loss': avg_train_loss,
                        'val_loss': avg_val_loss,
                        'lr': self.optimizer_scheduler.get_lr()
                    }
                    print(metrics)
                    if wandb_debug:
                        wandb.log(metrics)
            output_model = deepcopy(self.model.cpu())
            self.model.to(self.device)
            return output_model
        else:
            raise NotImplementedError

    def sample(self, num_samples=None, val_loader=None, debug_val_loss=False, wandb_debug=False):
        output_list = []
        if num_samples is None:
            num_samples = self.num_samples
        if issubclass(self.model.__class__, torch.nn.Module):
            for i in range(num_samples):
                output_list.append(
                    self.sample_iterative(val_loader=val_loader,
                                          debug_val_loss=debug_val_loss,
                                          wandb_debug=wandb_debug))
            return output_list
        else:
            raise NotImplementedError
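# A minimal sketch of sizing the commented-out cosine alternative for
# per-batch stepping, as sample_iterative() above assumes (names reused from
# the class for illustration; this is an assumption about intent, not the
# original code): T_max is given in optimizer steps so the anneal spans the
# whole run.
total_steps = (burn_in_epochs + num_samples) * len(train_loader)
scheduler = CosineAnnealingLR(optimizer, T_max=total_steps, eta_min=lr_final)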
def train(args, io):
    # ============= Model ===================
    num_part = 50
    device = torch.device("cuda" if args.cuda else "cpu")

    MODELClass = importlib.import_module(args.model)
    model = MODELClass.get_model(num_part).to(device)
    io.cprint(str(model))
    model.apply(weight_init)
    model = nn.DataParallel(model)
    print("Let's use", torch.cuda.device_count(), "GPUs!")

    '''Use Pretrain or not'''
    try:
        state_dict = torch.load("checkpoints/%s/best_insiou_model.pth" % args.exp_name,
                                map_location=torch.device('cpu'))['model']
        for k in state_dict.keys():
            if 'module' not in k:
                from collections import OrderedDict
                new_state_dict = OrderedDict()
                for k in state_dict:
                    new_state_dict['module.' + k] = state_dict[k]
                state_dict = new_state_dict
            break
        model.load_state_dict(state_dict)
        print("Using pretrained model...")
        print(torch.load("checkpoints/%s/best_insiou_model.pth" % args.exp_name).keys())
    except Exception:
        print("Training from scratch...")

    # =========== Dataloader =================
    train_data = PartNormalDataset(npoints=2048, split='trainval', normalize=False)
    print("The number of training data is: %d" % len(train_data))
    test_data = PartNormalDataset(npoints=2048, split='test', normalize=False)
    print("The number of test data is: %d" % len(test_data))
    train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True,
                              num_workers=8, drop_last=True)
    test_loader = DataLoader(test_data, batch_size=args.test_batch_size, shuffle=False,
                             num_workers=8, drop_last=False)

    # ============= Optimizer ================
    if args.use_sgd:
        print("Use SGD")
        opt = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                        weight_decay=args.weight_decay)
    else:
        print("Use Adam")
        opt = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999),
                         eps=1e-08, weight_decay=args.weight_decay)

    if args.scheduler == 'cos':
        print("Use CosLR")
        scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=args.lr / 100)
    else:
        print("Use StepLR")
        scheduler = StepLR(opt, step_size=args.step, gamma=0.5)

    # ============= Training =================
    best_acc = 0
    best_class_iou = 0
    best_instance_iou = 0
    num_part = 50
    num_classes = 16

    for epoch in range(args.epochs):
        train_epoch(train_loader, model, opt, scheduler, epoch, num_part, num_classes, io)
        test_metrics, total_per_cat_iou = test_epoch(test_loader, model, epoch,
                                                     num_part, num_classes, io)

        # 1. when we get the best accuracy, save the model:
        if test_metrics['accuracy'] > best_acc:
            best_acc = test_metrics['accuracy']
            io.cprint('Max Acc:%.5f' % best_acc)
            state = {
                'model': model.module.state_dict() if torch.cuda.device_count() > 1 else model.state_dict(),
                'optimizer': opt.state_dict(),
                'epoch': epoch,
                'test_acc': best_acc
            }
            torch.save(state, 'checkpoints/%s/best_acc_model.pth' % args.exp_name)

        # 2. when we get the best instance_iou, save the model:
        if test_metrics['shape_avg_iou'] > best_instance_iou:
            best_instance_iou = test_metrics['shape_avg_iou']
            io.cprint('Max instance iou:%.5f' % best_instance_iou)
            state = {
                'model': model.module.state_dict() if torch.cuda.device_count() > 1 else model.state_dict(),
                'optimizer': opt.state_dict(),
                'epoch': epoch,
                'test_instance_iou': best_instance_iou
            }
            torch.save(state, 'checkpoints/%s/best_insiou_model.pth' % args.exp_name)

        # 3. when we get the best class_iou, save the model:
        # first we need to calculate the average per-class iou
        class_iou = 0
        for cat_idx in range(16):
            class_iou += total_per_cat_iou[cat_idx]
        avg_class_iou = class_iou / 16
        if avg_class_iou > best_class_iou:
            best_class_iou = avg_class_iou
            # print the iou of each class:
            for cat_idx in range(16):
                io.cprint(classes_str[cat_idx] + ' iou: ' + str(total_per_cat_iou[cat_idx]))
            io.cprint('Max class iou:%.5f' % best_class_iou)
            state = {
                'model': model.module.state_dict() if torch.cuda.device_count() > 1 else model.state_dict(),
                'optimizer': opt.state_dict(),
                'epoch': epoch,
                'test_class_iou': best_class_iou
            }
            torch.save(state, 'checkpoints/%s/best_clsiou_model.pth' % args.exp_name)

    # report best acc, ins_iou, cls_iou
    io.cprint('Final Max Acc:%.5f' % best_acc)
    io.cprint('Final Max instance iou:%.5f' % best_instance_iou)
    io.cprint('Final Max class iou:%.5f' % best_class_iou)

    # save last model
    state = {
        'model': model.module.state_dict() if torch.cuda.device_count() > 1 else model.state_dict(),
        'optimizer': opt.state_dict(),
        'epoch': args.epochs - 1,
        'test_iou': best_instance_iou
    }
    torch.save(state, 'checkpoints/%s/model_ep%d.pth' % (args.exp_name, args.epochs))
def train_fold(args, device, save_dir, log, tbx, cross_val=False, fold_idx=None):
    """Perform training and evaluation for the current fold."""
    # Define loss function
    class_weights = torch.FloatTensor(CLASS_W)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights).to(device)  # CrossEntropyLoss includes softmax

    # Get model
    log.info('Building model...')
    if args.model_name == 'SeizureNet':
        model = SeizureNet(args)
    model = nn.DataParallel(model, args.gpu_ids)

    step = 0
    model = model.to(device)

    # To train mode
    model.train()

    # Get saver
    saver = utils.CheckpointSaver(save_dir,
                                  max_checkpoints=args.max_checkpoints,
                                  metric_name=args.metric_name,
                                  maximize_metric=args.maximize_metric,
                                  log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adam(params=model.parameters(), lr=args.lr_init, weight_decay=args.l2_wd)
    scheduler = CosineAnnealingLR(optimizer, T_max=args.num_epochs)

    # Get data loader
    log.info('Building dataset...')
    if cross_val:
        seizure_file = os.path.join('data', 'fold' + str(fold_idx) + '_trainSet_seizure_files.txt')
    else:
        seizure_file = TRAIN_SEIZURE_FILE
    train_dataset = SeizureDataset(seizure_file,
                                   num_folds=args.num_folds,
                                   fold_idx=fold_idx,
                                   cross_val=cross_val,
                                   split='train')
    train_loader = data.DataLoader(dataset=train_dataset,
                                   shuffle=True,
                                   batch_size=args.train_batch_size,
                                   num_workers=args.num_workers)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for features, y, _ in train_loader:
                batch_size = features.shape[0]

                # Setup for forward
                features = features.view(-1, 3, 224, 224)  # merge number of dense samples with batch size
                features = features.to(device)
                y = y.view(-1)  # merge number of dense samples with batch size
                y = y.to(device)

                # Zero out optimizer first
                optimizer.zero_grad()

                # Forward
                logits = model(features)
                loss = loss_fn(logits, y)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         loss=loss_val,
                                         lr=optimizer.param_groups[0]['lr'])
                if cross_val:
                    tbx.add_scalar('fold{}/train/Loss'.format(fold_idx), loss_val, step)
                    tbx.add_scalar('fold{}/train/LR'.format(fold_idx),
                                   optimizer.param_groups[0]['lr'], step)
                else:
                    tbx.add_scalar('train/Loss', loss_val, step)
                    tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    eval_results = evaluate_fold(model, args, save_dir, device,
                                                 cross_val=cross_val,
                                                 fold_idx=fold_idx,
                                                 is_test=False,
                                                 write_outputs=False)
                    best_path = saver.save(step, model, eval_results[args.metric_name],
                                           device, eval_results)

                    # Back to train mode
                    model.train()

                    # Log to console
                    results_str = ', '.join('{}: {}'.format(k, v)
                                            for k, v in eval_results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in eval_results.items():
                        if cross_val:
                            tbx.add_scalar('fold{}/eval/{}'.format(fold_idx, k), v, step)
                        else:
                            tbx.add_scalar('eval/{}'.format(k), v, step)

        # Step lr scheduler
        scheduler.step()

    return best_path
class HotDogTrainer(object):
    def __init__(self):
        super(HotDogTrainer, self).__init__()
        self.model = NaiveDLClassifier()
        self.epoch = epoch
        self.data = HotDogDataSetLoader()
        self.gpu_ids = GPUS_LIST
        self.load_model_path = model_path
        self.stat_cache = None
        self.global_step = 9
        self.writer = SummaryWriter()
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.0001)
        # Caution: T_max here is the number of batches per epoch, but the
        # scheduler is stepped once per epoch at the end of train().
        self.scheduler = CosineAnnealingLR(self.optimizer, len(self.data.train()))
        self.device = torch.device("cuda:0" if GPUS_LIST else "cpu")
        self.loss = torch.nn.CrossEntropyLoss()

    def initialize(self):
        if GPUS_LIST:
            self.model.to(self.device)
            self.model = torch.nn.DataParallel(self.model, device_ids=self.gpu_ids)
        if self.load_model_path:
            if os.path.exists(self.load_model_path):
                self.load_old_best()

    def savemodel(self, metrics):
        import json
        with open(metrics_path, 'w') as f:
            json.dump(metrics, f)
        if GPUS_LIST:
            torch.save(self.model.module.state_dict(), self.load_model_path)
        else:
            torch.save(self.model.state_dict(), self.load_model_path)

    def train(self, nb_epoch):
        trainstream = tqdm(self.data.train())
        self.avg_loss = AverageMeter()
        self.avg_acc = AverageMeter()
        self.model.train()
        for i, data in enumerate(trainstream):
            self.global_step += 1
            trainstream.set_description("TRAINING")
            x = data['image'].to(self.device)
            y = data['label'].to(self.device)
            with torch.set_grad_enabled(True):
                y_ = self.model(x)
                out_labels = torch.max(y_, 1)[1]
                loss = self.loss(y_, y)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                acc = 100. * ((y.int().flatten() == out_labels.int().flatten()).sum() / y.size(0))
                self.avg_acc.update(acc.item())
                self.avg_loss.update(loss.item())
                self.writer.add_scalar('Loss/Train', self.avg_loss.avg, self.global_step)
                self.writer.add_scalar('Accuracy/Train', self.avg_acc.avg, self.global_step)
                trainstream.set_postfix({
                    'epoch': nb_epoch,
                    'loss': self.avg_loss.avg,
                    'accuracy': self.avg_acc.avg
                })
        self.scheduler.step()
        trainstream.close()
        self.test(nb_epoch)

    def test(self, nb_epoch):
        self.model.eval()
        teststream = tqdm(self.data.test())
        self.avg_loss = AverageMeter()
        self.avg_acc = AverageMeter()
        teststream.set_description('TESTING')
        with torch.no_grad():
            for i, data in enumerate(teststream):
                x = data['image'].to(self.device)
                y = data['label'].to(self.device)
                y_ = self.model(x)
                loss = self.loss(y_, y)
                out_labels = torch.max(y_, 1)[1]
                acc = 100. * ((y.int().flatten() == out_labels.int().flatten()).sum() / y.size(0))
                self.avg_acc.update(acc.item())
                self.avg_loss.update(loss.item())
                teststream.set_postfix({
                    'epoch': nb_epoch,
                    'loss': self.avg_loss.avg,
                    'accuracy': self.avg_acc.avg
                })
        self.writer.add_scalar('Loss/Test', self.avg_loss.avg, nb_epoch)
        self.writer.add_scalar('Accuracy/Test', self.avg_acc.avg, nb_epoch)
        if not self.stat_cache:
            self.stat_cache = {'best': self.avg_acc.avg}
            print('SAVING MODEL')
            self.savemodel({'best': self.avg_acc.avg})
        else:
            if self.stat_cache['best'] < self.avg_acc.avg:
                # new best accuracy: persist weights and metrics
                self.stat_cache = {'best': self.avg_acc.avg}
                print('SAVING MODEL')
                self.savemodel({'best': self.avg_acc.avg})

    def load_old_best(self):
        import json
        with open(metrics_path, 'r') as f:
            self.stat_cache = json.load(f)
        if GPUS_LIST:
            self.model.module.load_state_dict(torch.load(self.load_model_path))
        else:
            self.model.load_state_dict(torch.load(self.load_model_path))

    def run(self):
        self.initialize()
        for i in range(self.epoch):
            self.train(i)
        self.writer.close()
def setup_and_start_training(self):
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    DEVICE = torch.device("cuda:0" if torch.cuda.is_available()
                          and self.system_dict["params"]["use_cuda"] else "cpu")
    if self.system_dict["params"]["use_cuda"] and torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        logging.info("Using gpu.")
    else:
        logging.info("Using cpu.")

    timer = Timer()
    logging.info(self.system_dict)

    if self.system_dict["params"]["net"] == 'vgg16-ssd':
        create_net = create_vgg_ssd
        config = vgg_ssd_config
    elif self.system_dict["params"]["net"] == 'mb1-ssd':
        create_net = create_mobilenetv1_ssd
        config = mobilenetv1_ssd_config
    elif self.system_dict["params"]["net"] == 'mb1-ssd-lite':
        create_net = create_mobilenetv1_ssd_lite
        config = mobilenetv1_ssd_config
    elif self.system_dict["params"]["net"] == 'sq-ssd-lite':
        create_net = create_squeezenet_ssd_lite
        config = squeezenet_ssd_config
    elif self.system_dict["params"]["net"] == 'mb2-ssd-lite':
        create_net = lambda num: create_mobilenetv2_ssd_lite(
            num, width_mult=self.system_dict["params"]["mb2_width_mult"])
        config = mobilenetv1_ssd_config
    else:
        logging.fatal("The net type is wrong.")
        sys.exit(1)

    train_transform = TrainAugmentation(config.image_size, config.image_mean, config.image_std)
    target_transform = MatchPrior(config.priors, config.center_variance,
                                  config.size_variance, 0.5)
    test_transform = TestTransform(config.image_size, config.image_mean, config.image_std)

    logging.info("Prepare training datasets.")
    datasets = []
    dataset = VOCDataset(self.system_dict["dataset"]["val"]["img_dir"],
                         self.system_dict["dataset"]["val"]["label_dir"],
                         transform=train_transform,
                         target_transform=target_transform,
                         label_file=self.system_dict["params"]["label_file"])
    label_file = self.system_dict["params"]["label_file"]
    # store_labels(label_file, dataset.class_names)
    num_classes = len(dataset.class_names)
    datasets.append(dataset)
    logging.info(f"Stored labels into file {label_file}.")

    train_dataset = ConcatDataset(datasets)
    logging.info("Train dataset size: {}".format(len(train_dataset)))
    train_loader = DataLoader(train_dataset,
                              self.system_dict["params"]["batch_size"],
                              num_workers=self.system_dict["params"]["num_workers"],
                              shuffle=True)

    if self.system_dict["dataset"]["val"]["status"]:
        val_dataset = VOCDataset(self.system_dict["dataset"]["val"]["img_dir"],
                                 self.system_dict["dataset"]["val"]["label_dir"],
                                 transform=test_transform,
                                 target_transform=target_transform,
                                 is_test=True,
                                 label_file=self.system_dict["params"]["label_file"])
        logging.info("validation dataset size: {}".format(len(val_dataset)))
        val_loader = DataLoader(val_dataset,
                                self.system_dict["params"]["batch_size"],
                                num_workers=self.system_dict["params"]["num_workers"],
                                shuffle=False)

    logging.info("Build network.")
    net = create_net(num_classes)
    min_loss = -10000.0
    last_epoch = -1

    base_net_lr = (self.system_dict["params"]["base_net_lr"]
                   if self.system_dict["params"]["base_net_lr"] is not None
                   else self.system_dict["params"]["lr"])
    extra_layers_lr = (self.system_dict["params"]["extra_layers_lr"]
                       if self.system_dict["params"]["extra_layers_lr"] is not None
                       else self.system_dict["params"]["lr"])

    if self.system_dict["params"]["freeze_base_net"]:
        logging.info("Freeze base net.")
        freeze_net_layers(net.base_net)
        params = itertools.chain(net.source_layer_add_ons.parameters(),
                                 net.extras.parameters(),
                                 net.regression_headers.parameters(),
                                 net.classification_headers.parameters())
        params = [{
            'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                      net.extras.parameters()),
            'lr': extra_layers_lr
        }, {
            'params': itertools.chain(net.regression_headers.parameters(),
                                      net.classification_headers.parameters())
        }]
    elif self.system_dict["params"]["freeze_net"]:
        freeze_net_layers(net.base_net)
        freeze_net_layers(net.source_layer_add_ons)
        freeze_net_layers(net.extras)
        params = itertools.chain(net.regression_headers.parameters(),
                                 net.classification_headers.parameters())
        logging.info("Freeze all the layers except prediction heads.")
    else:
        params = [{
            'params': net.base_net.parameters(),
            'lr': base_net_lr
        }, {
            'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                      net.extras.parameters()),
            'lr': extra_layers_lr
        }, {
            'params': itertools.chain(net.regression_headers.parameters(),
                                      net.classification_headers.parameters())
        }]

    timer.start("Load Model")
    resume = self.system_dict["params"]["resume"]
    base_net = self.system_dict["params"]["base_net"]
    pretrained_ssd = self.system_dict["params"]["pretrained_ssd"]
    if self.system_dict["params"]["resume"]:
        logging.info(f"Resume from the model {resume}")
        net.load(self.system_dict["params"]["resume"])
    elif self.system_dict["params"]["base_net"]:
        logging.info(f"Init from base net {base_net}")
        net.init_from_base_net(self.system_dict["params"]["base_net"])
    elif self.system_dict["params"]["pretrained_ssd"]:
        logging.info(f"Init from pretrained ssd {pretrained_ssd}")
        net.init_from_pretrained_ssd(self.system_dict["params"]["pretrained_ssd"])
    logging.info(f'Took {timer.end("Load Model"):.2f} seconds to load the model.')

    net.to(DEVICE)

    criterion = MultiboxLoss(config.priors, iou_threshold=0.5, neg_pos_ratio=3,
                             center_variance=0.1, size_variance=0.2, device=DEVICE)
    optimizer = torch.optim.SGD(params,
                                lr=self.system_dict["params"]["lr"],
                                momentum=self.system_dict["params"]["momentum"],
                                weight_decay=self.system_dict["params"]["weight_decay"])
    lr = self.system_dict["params"]["lr"]
    logging.info(f"Learning rate: {lr}, Base net learning rate: {base_net_lr}, "
                 + f"Extra Layers learning rate: {extra_layers_lr}.")

    if not self.system_dict["params"]["milestones"]:
        self.system_dict["params"]["milestones"] = ""
        self.system_dict["params"]["milestones"] += str(
            int(self.system_dict["params"]["num_epochs"] / 3)) + ","
        self.system_dict["params"]["milestones"] += str(
            int(2 * self.system_dict["params"]["num_epochs"] / 3))

    if self.system_dict["params"]["scheduler"] == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [int(v.strip())
                      for v in self.system_dict["params"]["milestones"].split(",")]
        scheduler = MultiStepLR(optimizer, milestones=milestones,
                                gamma=0.1, last_epoch=last_epoch)
    elif self.system_dict["params"]["scheduler"] == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(optimizer,
                                      self.system_dict["params"]["t_max"],
                                      last_epoch=last_epoch)

    logging.info(f"Start training from epoch {last_epoch + 1}.")
    for epoch in range(last_epoch + 1, self.system_dict["params"]["num_epochs"]):
        # Pre-1.1 PyTorch ordering: the scheduler is stepped before the
        # optimizer updates; on PyTorch >= 1.1 it should follow base_train().
        scheduler.step()
        self.base_train(train_loader, net, criterion, optimizer,
                        device=DEVICE,
                        debug_steps=self.system_dict["params"]["debug_steps"],
                        epoch=epoch)

        net_name = self.system_dict["params"]["net"]
        if (self.system_dict["dataset"]["val"]["status"]
                and (epoch % self.system_dict["params"]["validation_epochs"] == 0
                     or epoch == self.system_dict["params"]["num_epochs"] - 1)):
            val_loss, val_regression_loss, val_classification_loss = self.base_test(
                val_loader, net, criterion, DEVICE)
            logging.info(f"Epoch: {epoch}, "
                         + f"Validation Loss: {val_loss:.4f}, "
                         + f"Validation Regression Loss {val_regression_loss:.4f}, "
                         + f"Validation Classification Loss: {val_classification_loss:.4f}")
            model_path = os.path.join(self.system_dict["params"]["checkpoint_folder"],
                                      f"{net_name}-Epoch-{epoch}-Loss-{val_loss}.pth")
            net.save(model_path)
            logging.info(f"Saved model {model_path}")
        if not self.system_dict["dataset"]["val"]["status"]:
            model_path = os.path.join(self.system_dict["params"]["checkpoint_folder"],
                                      f"{net_name}-Epoch-{epoch}.pth")
            net.save(model_path)
            logging.info(f"Saved model {model_path}")
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    cel = nn.CrossEntropyLoss()
    # soft-target criterion: scatter lam into a zero tensor at the target index
    criterion = lambda pred, target, lam: (
        -F.log_softmax(pred, dim=1)
        * torch.zeros(pred.size()).cuda().scatter_(1, target.data.view(-1, 1), lam.view(-1, 1))
    ).sum(dim=1).mean()

    parameters_bias = [p[1] for p in model.named_parameters() if 'bias' in p[0]]
    parameters_scale = [p[1] for p in model.named_parameters() if 'scale' in p[0]]
    parameters_others = [p[1] for p in model.named_parameters()
                         if not ('bias' in p[0] or 'scale' in p[0])]
    optimizer = torch.optim.SGD(
        [{'params': parameters_bias, 'lr': args.base_lr / 10.},
         {'params': parameters_scale, 'lr': args.base_lr / 10.},
         {'params': parameters_others}],
        lr=base_learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    # note: sgdr is constructed but never stepped in this snippet;
    # adjust_learning_rate() drives the LR instead
    sgdr = CosineAnnealingLR(optimizer, args.epochs, eta_min=0, last_epoch=-1)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, cel, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # save checkpoint for every epoch
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_acc1': best_acc1,
            'optimizer': optimizer.state_dict(),
        }, is_best, '{0}-checkpoint-{1}.pth.tar'.format(args.arch, epoch + 1))

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
        xargs.data_path + f'/{xargs.dataset}-split.txt',
        xargs.batch_size, xargs.workers)
    search_model = NASNetworkGDAS(xargs.channel, xargs.num_cells, xargs.steps,
                                  xargs.multiplier, xargs.stem_multiplier,
                                  xargs.num_classes, xargs.space, xargs.affine,
                                  xargs.track_running_stats, xargs.fix_reduction,
                                  xargs.paper_arch, xargs.no_gumbel)
    criterion = torch.nn.CrossEntropyLoss()
    w_optimizer = torch.optim.SGD(search_model.get_weights(), xargs.LR,
                                  momentum=xargs.momentum, weight_decay=xargs.decay,
                                  nesterov=xargs.nesterov)
    w_scheduler = CosineAnnealingLR(w_optimizer, T_max=xargs.epochs, eta_min=xargs.eta_min)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(), lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999), weight_decay=xargs.arch_weight_decay)
    network, criterion = search_model.cuda(), criterion.cuda()

    # save directories
    os.mkdir(xargs.save_dir + '/checkpoint/')
    model_base_path = xargs.save_dir + f'/checkpoint/seed-{xargs.rand_seed}-basic.pth'
    model_best_path = xargs.save_dir + f'/checkpoint/seed-{xargs.rand_seed}-best.pth'

    best_val_acc = -1
    genotypes = {-1: search_model.genotype()}
    if xargs.mixed_prec:
        network, [w_optimizer,
def train(args, io):
    train_loader = DataLoader(ModelNet40(partition='train', num_points=args.num_points),
                              num_workers=8, batch_size=args.batch_size,
                              shuffle=True, drop_last=True, pin_memory=False)
    test_loader = DataLoader(ModelNet40(partition='test', num_points=args.num_points),
                             num_workers=8, batch_size=args.test_batch_size,
                             shuffle=True, drop_last=False, pin_memory=False)

    device = torch.device("cuda:0" if use_cuda else "cpu")  # torch.device("cuda" if args.cuda else "cpu")

    # Try to load models
    if args.model == 'pointnet':
        model = PointNet(args).to(device)
    elif args.model == 'dgcnn':
        model = DGCNN(args).to(device)
    else:
        raise Exception("Not implemented")
    print(str(model))

    model = nn.DataParallel(model)
    print("Let's use", torch.cuda.device_count(), "GPUs!")

    if args.use_sgd:
        print("Use SGD")
        opt = optim.SGD(model.parameters(), lr=args.lr * 100,
                        momentum=args.momentum, weight_decay=1e-4)
    else:
        print("Use Adam")
        opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

    scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=args.lr)
    criterion = cal_loss

    best_test_acc = 0
    for epoch in range(args.epochs):
        # Pre-1.1 PyTorch convention: scheduler stepped at the top of the
        # epoch; on PyTorch >= 1.1 this should follow the optimizer updates.
        scheduler.step()

        ####################
        # Train
        ####################
        train_loss = 0.0
        count = 0.0
        model.train()
        train_pred = []
        train_true = []
        for data, label in train_loader:
            data, label = data.to(device), label.to(device).squeeze()
            data = data.permute(0, 2, 1)
            batch_size = data.size()[0]
            opt.zero_grad()
            logits = model(data)
            loss = criterion(logits, label)
            loss.backward()
            opt.step()
            preds = logits.max(dim=1)[1]
            count += batch_size
            train_loss += loss.item() * batch_size
            train_true.append(label.cpu().numpy())
            train_pred.append(preds.detach().cpu().numpy())
        train_true = np.concatenate(train_true)
        train_pred = np.concatenate(train_pred)
        outstr = 'Train %d, loss: %.6f, train acc: %.6f, train avg acc: %.6f' % (
            epoch,
            train_loss * 1.0 / count,
            metrics.accuracy_score(train_true, train_pred),
            metrics.balanced_accuracy_score(train_true, train_pred))
        io.cprint(outstr)

        ####################
        # Test
        ####################
        test_loss = 0.0
        count = 0.0
        model.eval()
        test_pred = []
        test_true = []
        for data, label in test_loader:
            data, label = data.to(device), label.to(device).squeeze()
            data = data.permute(0, 2, 1)
            batch_size = data.size()[0]
            logits = model(data)
            loss = criterion(logits, label)
            preds = logits.max(dim=1)[1]
            count += batch_size
            test_loss += loss.item() * batch_size
            test_true.append(label.cpu().numpy())
            test_pred.append(preds.detach().cpu().numpy())
        test_true = np.concatenate(test_true)
        test_pred = np.concatenate(test_pred)
        test_acc = metrics.accuracy_score(test_true, test_pred)
        avg_per_class_acc = metrics.balanced_accuracy_score(test_true, test_pred)
        outstr = 'Test %d, loss: %.6f, test acc: %.6f, test avg acc: %.6f' % (
            epoch, test_loss * 1.0 / count, test_acc, avg_per_class_acc)
        io.cprint(outstr)
        if test_acc >= best_test_acc:
            best_test_acc = test_acc
            torch.save(model.state_dict(), 'checkpoints/%s/models/model.t7' % args.exp_name)
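# A minimal sketch of the ordering recommended since PyTorch 1.1 (loader,
# model, and criterion names are placeholders): step the scheduler once per
# epoch, after all optimizer updates for that epoch.
scheduler = CosineAnnealingLR(opt, T_max=num_epochs, eta_min=1e-4)
for epoch in range(num_epochs):
    for data, label in train_loader:
        opt.zero_grad()
        loss = criterion(model(data), label)
        loss.backward()
        opt.step()
    scheduler.step()  # after the epoch's optimizer.step() calls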
# optimizer = Adas(model.parameters())
lr_start = 1e-4
lr_end = 1e-6
weight_decay = 0
epoch_num = 20

wandb.config.lr_start = lr_start
wandb.config.lr_end = lr_end
wandb.config.weight_decay = weight_decay
wandb.config.epoch_num = epoch_num
wandb.config.optimizer = 'adam'
wandb.config.scheduler = 'CosineAnnealingLR'

# optimizer = Adam(group_weight(model, weight_decay=weight_decay), lr=lr_start, weight_decay=0)
optimizer = Adam(model.parameters(), lr=lr_start, weight_decay=0)
scheduler = CosineAnnealingLR(optimizer, T_max=epoch_num, eta_min=lr_end, last_epoch=-1)

model = model.to(device)

max_val_auc = 0
for epoch in range(epoch_num):
    train_loss, train_avg_auc, train_auc, train_rocs, train_data_pr, train_duration = one_epoch_train(
        model, train_loader, optimizer, criterion, device, scaler,
        iters_to_accumulate=accumulation_step, clip_grads=False)
    val_loss, val_avg_auc, val_auc, val_rocs, val_data_pr, val_duration = eval_model(
def main():
    with timer('load data'):
        df = pd.read_csv(FOLD_PATH)

    with timer('preprocessing'):
        train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID]

        train_augmentation = Compose([
            Flip(p=0.5),
            OneOf([
                # ElasticTransform(p=0.5, alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03),
                GridDistortion(p=0.5),
                OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5)
            ], p=0.5),
            # OneOf([
            #     ShiftScaleRotate(p=0.5),
            #     RandomRotate90(p=0.5),
            #     Rotate(p=0.5)
            # ], p=0.5),
            OneOf([
                Blur(blur_limit=8, p=0.5),
                MotionBlur(blur_limit=8, p=0.5),
                MedianBlur(blur_limit=8, p=0.5),
                GaussianBlur(blur_limit=8, p=0.5)
            ], p=0.5),
            OneOf([
                # CLAHE(clip_limit=4, tile_grid_size=(4, 4), p=0.5),
                RandomGamma(gamma_limit=(100, 140), p=0.5),
                RandomBrightnessContrast(p=0.5),
                RandomBrightness(p=0.5),
                RandomContrast(p=0.5)
            ], p=0.5),
            OneOf([
                GaussNoise(p=0.5),
                Cutout(num_holes=10, max_h_size=10, max_w_size=20, p=0.5)
            ], p=0.5)
        ])
        # Overrides the heavier pipeline above with flip-only augmentation.
        train_augmentation = Compose([
            Flip(p=0.5)
        ])
        val_augmentation = None

        train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES,
                                     id_colname=ID_COLUMNS, transforms=train_augmentation)
        val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES,
                                   id_colname=ID_COLUMNS, transforms=val_augmentation)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

        del train_df, val_df, df, train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        model = smp.Linknet('se_resnext101_32x4d', encoder_weights='imagenet',
                            classes=N_CLASSES, encoder_se_module=True,
                            decoder_semodule=True, h_columns=False)
        model.load_state_dict(torch.load(model_path))
        model.to(device)

        # criterion = torch.nn.BCEWithLogitsLoss()
        criterion = FocalLovaszLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
        scheduler = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)
        # scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1,
        #     total_epoch=CLR_CYCLE * 2, after_scheduler=scheduler_cosine)
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    with timer('train'):
        train_losses = []
        valid_losses = []

        best_model_loss = 999
        best_model_ep = 0
        checkpoint = 0

        for epoch in range(1, EPOCHS + 1):
            if epoch % (CLR_CYCLE * 2) == 0:
                if epoch != 0:
                    y_val = y_val.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1])
                    best_pred = best_pred.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1])
                    for i in range(N_CLASSES):
                        th, score, _, _ = search_threshold(y_val[:, i, :, :], best_pred[:, i, :, :])
                        LOGGER.info('Best loss: {} Best Dice: {} on epoch {} th {} class {}'.format(
                            round(best_model_loss, 5), round(score, 5), best_model_ep, th, i))
                checkpoint += 1
                best_model_loss = 999

            LOGGER.info("Starting {} epoch...".format(epoch))
            tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
            train_losses.append(tr_loss)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))

            valid_loss, val_pred, y_val = validate(model, val_loader, criterion, device)
            valid_losses.append(valid_loss)
            LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5)))

            scheduler.step()

            if valid_loss < best_model_loss:
                torch.save(model.state_dict(),
                           '{}_fold{}_ckpt{}.pth'.format(EXP_ID, FOLD_ID, checkpoint))
                best_model_loss = valid_loss
                best_model_ep = epoch
                best_pred = val_pred

            del val_pred
            gc.collect()

    with timer('eval'):
        y_val = y_val.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1])
        best_pred = best_pred.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1])
        for i in range(N_CLASSES):
            th, score, _, _ = search_threshold(y_val[:, i, :, :], best_pred[:, i, :, :])
            LOGGER.info('Best loss: {} Best Dice: {} on epoch {} th {} class {}'.format(
                round(best_model_loss, 5), round(score, 5), best_model_ep, th, i))

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.plot(xs, valid_losses, label='Val loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Epochs')
    plt.savefig("loss.png")
def main():
    global best_prec1, args

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.sync_bn:
        import apex
        print("using apex synced BN")
        model = apex.parallel.convert_syncbn_model(model)

    model = model.cuda()

    # Scale learning rate based on global batch size
    args.lr = args.lr * float(args.batch_size * args.world_size) / 256.
    print('learning rate: ', args.lr)
    param = model.parameters()

    optimizer = torch.optim.SGD(param, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.lr_adjust_type == 'step':
        scheduler = MultiStepLR(optimizer, milestones=args.lr_adjust_step, gamma=0.1)
    elif args.lr_adjust_type == 'cosine':
        scheduler = CosineAnnealingLR(optimizer, args.epochs)
    elif args.lr_adjust_type == 'exp':
        scheduler = ExponentialLR(optimizer, args.gamma)

    # Initialize Amp. Amp accepts either values or strings for the optional override arguments,
    # for convenient interoperation with argparse.
    model, optimizer = amp.initialize(model, optimizer,
                                      opt_level=args.opt_level,
                                      keep_batchnorm_fp32=args.keep_batchnorm_fp32,
                                      loss_scale=args.loss_scale)

    # For distributed training, wrap the model with apex.parallel.DistributedDataParallel.
    # This must be done AFTER the call to amp.initialize. If model = DDP(model) is called
    # before model, ... = amp.initialize(model, ...), the call to amp.initialize may alter
    # the types of model's parameters in a way that disrupts or destroys DDP's allreduce hooks.
    if args.distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps communication with
        # computation in the backward pass.
        # model = DDP(model)
        # delay_allreduce delays all communication to the end of the backward pass.
        model = DDP(model, delay_allreduce=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    # Optionally resume from a checkpoint
    if args.resume:
        # Use a local scope to avoid dangling references
        def resume():
            if os.path.isfile(args.resume):
                print("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(
                    args.resume,
                    map_location=lambda storage, loc: storage.cuda(args.gpu))
                args.start_epoch = checkpoint['epoch']
                best_prec1 = checkpoint['best_prec1']
                model.load_state_dict(checkpoint['state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(args.resume))
        resume()

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    if args.arch == "inception_v3":
        raise RuntimeError("Currently, inception_v3 is not supported by this example.")
        # crop_size = 299
        # val_size = 320  # I chose this value arbitrarily, we can adjust.
    else:
        crop_size = 224
        val_size = 256

    trans = transforms.Compose([
        transforms.RandomResizedCrop(crop_size),
        transforms.RandomHorizontalFlip(),
        # transforms.ToTensor(),  Too slow
        # normalize,
    ])
    train_dataset = datasets.ImageFolder(traindir, trans)
    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(val_size),
            transforms.CenterCrop(crop_size),
        ]))

    train_sampler = None
    val_sampler = None
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               collate_fn=fast_collate)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             collate_fn=fast_collate)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    st_time = time.time()
    prec1 = 0.
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        if args.grid:
            grid.set_prob(epoch, args.st_epochs)

        # train for one epoch
        # adjust_learning_rate(scheduler, optimizer, epoch, 1, 1)
        train(train_loader, model, criterion, optimizer, epoch, scheduler)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            print(epoch)
            print('Learning rate:', optimizer.param_groups[0]['lr'])
            print('Total Time: ' + format_time(time.time() - st_time))
            print('Remaining Time: ' + format_time(
                (time.time() - st_time) / (epoch - args.start_epoch + 1)
                * (args.epochs - epoch - 1)))
            print('Best Acc: ' + str(best_prec1))
            save_checkpoint(args, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
def main(): args = parse_arg() set_seed(1217) print(args.model) print('%s fold-%d...' % (args.mode, args.fold)) args.run_root = args.run_root + '/' + args.model #'0822_efficientnetb0_LB804' run_root = Path(args.run_root) if run_root.exists() and args.clean: shutil.rmtree(run_root) run_root.mkdir(exist_ok=True, parents=True) train_root = DATA_ROOT / ('images_%d' % SIZE[0]) valid_root = train_root test_root = train_root sample_sub = pd.read_csv(DATA_ROOT / 'sample_submission.csv') ss = pd.DataFrame() ss['Image_Label'] = sample_sub['Image_Label'].apply( lambda x: x.split('_')[0]).unique() ss['EncodedPixels'] = '1 1' fold_df = pd.read_csv('./files/5-folds_%d.csv' % (SIZE[0])) train_fold = fold_df[fold_df['fold'] != args.fold].reset_index(drop=True) valid_fold = fold_df[fold_df['fold'] == args.fold].reset_index(drop=True) if args.pos_only and args.cls_label in [0, 1, 2, 3]: train_fold['flag'] = train_fold['Labels'].apply( lambda x: 1 if str(args.cls_label) in x else 0) valid_fold['flag'] = valid_fold['Labels'].apply( lambda x: 1 if str(args.cls_label) in x else 0) train_fold = train_fold[train_fold['flag'] == 1].reset_index(drop=True) valid_fold = valid_fold[valid_fold['flag'] == 1].reset_index(drop=True) PIXEL_THRESHOLDS = args.pixel AREA_SIZES = args.area if args.pl == 1: # add puesdo label df_pl = pd.read_csv('./files/df_pl.csv') for col in train_fold.columns: if col not in df_pl.columns: df_pl[col] = 0 train_fold = train_fold.append(df_pl) train_fold.fillna('', inplace=True) if args.limit: train_fold = train_fold[:args.limit] valid_fold = valid_fold[:args.limit] if args.sliding: train_transform = transform_train_al((256, 256)) else: train_transform = transform_train_al(SIZE) test_transform = transform_test_al(SIZE) # model_name = args.model if '-' not in args.model else args.model.split('-')[0] # if model_name.startswith('effi'): # model_name = model_name[:-2] + '-' + model_name[-2:] # #model = model_steel(model_name, pretrained=True, down=False) # if model_name == 'resnext101_32x16d': # encoder_weights = 'instagram' # else: # encoder_weights = 'imagenet' # if args.framework == 'Unet': # model = smp.Unet(model_name, classes=args.n_classes, encoder_weights=encoder_weights, activation=None) # elif args.framework == 'FPN': # model = smp.FPN(model_name, classes=args.n_classes, encoder_weights=encoder_weights, activation=None) # elif args.framework == 'JPU': # model = model_cloud_JPU(model_name, classes=args.n_classes, encoder_weights=encoder_weights, activation=None) # elif '_' in args.framework: # framework = args.framework.split('_')[0] # model = model_cloud_smp(framework, model_name, classes=args.n_classes, pretrained=True) # else: # raise RuntimeError('Framework %s not implemented.' 
% (args.framework)) model = get_model(args) if args.mode == 'train': (run_root / 'params.json').write_text( json.dumps(vars(args), indent=4, sort_keys=True)) training_set = Dataset_cloud(train_root, df=train_fold, transform=train_transform, mode='train', cls_label=args.cls_label) #sampler = EmptySampler(data_source=training_set, positive_ratio_range=sampler_ratio, epochs=args.n_epochs) validation_set = Dataset_cloud(train_root, df=valid_fold, transform=test_transform, mode='train', cls_label=args.cls_label) print(f'{len(training_set):,} items in train, ', f'{len(validation_set):,} in valid') train_loader = DataLoader( training_set, batch_size=args.batch_size, num_workers=args.workers, sampler=None, drop_last=False, shuffle=True, ) valid_loader = DataLoader( validation_set, shuffle=False, batch_size=args.batch_size, #collate_fn=null_collate, num_workers=args.workers) model = model.cuda() #optimizer = Adam([{'params': model.encoder.parameters(), 'lr': args.lr}, # {'params': model.decoder.parameters(), 'lr': args.lr*10}]) optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=0, betas=(0.9, 0.999), eps=1e-08) if args.lrc == 'reduceLR': scheduler = ReduceLROnPlateau(optimizer, patience=args.patience, factor=args.gamma, verbose=True, mode='max') elif args.lrc == 'cos': scheduler = CosineAnnealingLR(optimizer, args.patience, eta_min=args.lr * args.gamma) elif args.lrc == 'warmRestart': scheduler = WarmRestart(optimizer, T_max=args.patience, T_mult=1, eta_min=1e-6) elif args.lrc == '': scheduler = None # scheduler = StepLR(optimizer, step_size=args.patience, gamma=args.gamma) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) # train_kwargs = dict( args=args, model=model, optimizer=optimizer, scheduler=scheduler, train_loader=train_loader, valid_loader=valid_loader, # use_cuda=use_cuda, epoch_length=len(training_set), ) train(n_epochs=args.n_epochs, **train_kwargs) file = '%s/train-%d.log' % (args.run_root, args.fold) df = pd.read_csv(file, sep='|') cols = df.columns df.columns = [x.strip() for x in cols] fig, ax = plt.subplots(2, 2, figsize=(12, 12)) #loss profile ax[0, 0].plot(df.epoch, df.loss, label='train-loss', marker='o') ax[0, 0].plot(df.epoch, df['val loss'], label='val-loss', marker='x') ax[0, 0].set_xlabel('epoch') ax[0, 0].set_ylabel('loss') ax[0, 0].legend() #lr profile ax[0, 1].plot(df.epoch, df.lr, label='lr', marker='o') ax[0, 1].set_xlabel('epoch') ax[0, 1].set_ylabel('lr') ax[0, 1].legend() if 'AUC-mean' in df.columns: #cls ax[1, 0].plot(df.epoch, df['AUC-mean'], '-ro', label='AUC-mean') ax[1, 0].set_xlabel('epoch') ax[1, 0].set_ylabel('AUC-mean') ax[1, 0].legend() for k in range(4): ax[1, 1].plot(df.epoch, df['class%d' % (k + 1)], '-o', label=CLASS_NAMES[k]) ax[1, 1].set_xlabel('epoch') ax[1, 1].set_ylabel('AUC') ax[1, 1].legend() else: ax[1, 0].plot(df.epoch, df['val dice'], '-ro', label='dice') ax[1, 0].set_xlabel('epoch') ax[1, 0].set_ylabel('val-dice') ax[1, 0].legend() fig.savefig(Path(args.run_root) / ('train_%d.png' % (args.fold))) elif args.mode.startswith('predict'): if (run_root / ('best-dice-%d.pt' % args.fold)).exists(): load_model(model, run_root / ('best-dice-%d.pt' % args.fold), multi2single=False) else: load_model(model, run_root / ('best-model-%d.pt' % args.fold), multi2single=False) model = model.cuda() if args.mode == 'predict_valid': valid_set = Dataset_cloud(valid_root, df=valid_fold, transform=test_transform, mode='test', cls_label=args.cls_label) valid_loader = DataLoader(valid_set, 
shuffle=False, batch_size=args.batch_size, num_workers=args.workers) predict(model, args.mode, loader=valid_loader, out_path=run_root, fold=args.fold, tta=args.tta, args=args) elif args.mode == 'predict_test': if args.limit: ss = ss[:args.limit] test_set = Dataset_cloud(test_root, df=ss, transform=test_transform, mode='test', cls_label=args.cls_label) test_loader = DataLoader(test_set, shuffle=False, batch_size=args.batch_size, num_workers=args.workers) predict(model, args.mode, loader=test_loader, out_path=run_root, fold=args.fold, tta=args.tta, args=args) elif args.mode == 'predict_5fold': if args.limit: ss = ss[:args.limit] test_set = Dataset_cloud(test_root, df=ss, transform=test_transform, mode='test') test_loader = DataLoader(test_set, shuffle=False, batch_size=args.batch_size, num_workers=args.workers) predict_5fold(test_loader, out_path=run_root, args=args, pixel_thresholds=PIXEL_THRESHOLDS, area_size=AREA_SIZES) else: raise RuntimeError('%s mode not implemented' % (args.mode)) elif args.mode == 'opt': # create save folder if (run_root / ('opt')).exists(): pass else: output_root = Path(run_root / ('opt')) output_root.mkdir(exist_ok=True, parents=True) # Load model if (run_root / ('best-dice-%d.pt' % args.fold)).exists(): load_model(model, run_root / ('best-dice-%d.pt' % args.fold), multi2single=False) else: load_model(model, run_root / ('best-model-%d.pt' % args.fold), multi2single=False) model = model.cuda() valid_set = Dataset_cloud(valid_root, df=valid_fold, transform=test_transform, mode='test') valid_loader = DataLoader(valid_set, shuffle=False, batch_size=args.batch_size, num_workers=args.workers) area_ts_list = [0, 0, 0, 0] for pixel_ts in range(0, 80, 5): pixel_ts /= 100 pixel_ts_list = [pixel_ts] * 4 print('Processing: pixel-[%s]' % (str(pixel_ts))) predict( model, args.mode, loader=valid_loader, out_path=run_root / ('opt'), fold=args.fold, tta=args.tta, args=args, pixel_thresholds=pixel_ts_list, area_size=area_ts_list, ) else: print('%s mode not implemented' % (args.mode))
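# A minimal, self-contained sketch (standard PyTorch only) of the stepping
# convention behind the scheduler choices above: ReduceLROnPlateau consumes
# the monitored metric, while CosineAnnealingLR steps unconditionally.
# All names below are illustrative, not part of the original script.
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingLR

def _demo_scheduler_step(lrc='cos', epochs=5):
    params = [torch.nn.Parameter(torch.zeros(1))]
    optimizer = Adam(params, lr=1e-3)
    if lrc == 'reduceLR':
        scheduler = ReduceLROnPlateau(optimizer, patience=2, factor=0.5, mode='max')
    else:
        scheduler = CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-5)
    for epoch in range(epochs):
        val_metric = 0.5  # placeholder for the epoch's validation score
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(val_metric)  # plateau scheduler needs the metric
        else:
            scheduler.step()  # cosine schedule ignores metrics
        print(epoch, optimizer.param_groups[0]['lr'])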
def main(): args = parse_args() os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" torch.manual_seed(args.seed) if torch.cuda.is_available(): device = 'cuda' torch.cuda.manual_seed(args.seed) else: device = 'cpu' print(f"==> Using device: {device}") if args.checkpoint is None: time_stamp = str(datetime.datetime.now().strftime('-%Y%m%d%H%M%S')) args.checkpoint = args.model + time_stamp args.checkpoint = 'checkpoints/' + args.checkpoint if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) save_args(args) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title="ModelNet" + args.model) logger.set_names([ "Epoch-Num", 'Learning-Rate', 'Train-Loss', 'Train-acc-B', 'Train-acc', 'Valid-Loss', 'Valid-acc-B', 'Valid-acc' ]) print('==> Preparing data..') train_loader = DataLoader(ModelNet40(partition='train', num_points=args.num_points, rotation=args.aug_rotate, scale=args.aug_scale), num_workers=8, batch_size=args.batch_size, shuffle=True, drop_last=True) test_loader = DataLoader(ModelNet40(partition='test', num_points=args.num_points), num_workers=8, batch_size=args.batch_size, shuffle=True, drop_last=False) # Model print('==> Building model..') net = models.__dict__[args.model]() criterion = cal_loss net = net.to(device) # criterion = criterion.to(device) if device == 'cuda': net = torch.nn.DataParallel(net) cudnn.benchmark = True optimizer = torch.optim.SGD(net.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=args.weight_decay) scheduler = CosineAnnealingLR(optimizer, args.epoch, eta_min=args.learning_rate / 100) best_test_acc = 0. # best test accuracy best_train_acc = 0. best_test_acc_avg = 0. best_train_acc_avg = 0. best_test_loss = float("inf") best_train_loss = float("inf") start_epoch = 0 # start from epoch 0 or last checkpoint epoch for epoch in range(start_epoch, args.epoch): print('Epoch(%d/%s) Learning Rate %s:' % (epoch + 1, args.epoch, optimizer.param_groups[0]['lr'])) train_out = train(net, train_loader, optimizer, criterion, device) # {"loss", "acc", "acc_avg", "time"} test_out = validate(net, test_loader, criterion, device) scheduler.step() if test_out["acc"] > best_test_acc: best_test_acc = test_out["acc"] is_best = True else: is_best = False best_test_acc = test_out["acc"] if ( test_out["acc"] > best_test_acc) else best_test_acc best_train_acc = train_out["acc"] if ( train_out["acc"] > best_train_acc) else best_train_acc best_test_acc_avg = test_out["acc_avg"] if ( test_out["acc_avg"] > best_test_acc_avg) else best_test_acc_avg best_train_acc_avg = train_out["acc_avg"] if ( train_out["acc_avg"] > best_train_acc_avg) else best_train_acc_avg best_test_loss = test_out["loss"] if ( test_out["loss"] < best_test_loss) else best_test_loss best_train_loss = train_out["loss"] if ( train_out["loss"] < best_train_loss) else best_train_loss save_model(net, epoch, path=args.checkpoint, acc=test_out["acc"], is_best=is_best) logger.append([ epoch, optimizer.param_groups[0]['lr'], train_out["loss"], train_out["acc_avg"], train_out["acc"], test_out["loss"], test_out["acc_avg"], test_out["acc"] ]) print( f"Training loss:{train_out['loss']} acc_avg:{train_out['acc_avg']} acc:{train_out['acc']} time:{train_out['time']}s)" ) print( f"Testing loss:{test_out['loss']} acc_avg:{test_out['acc_avg']} acc:{test_out['acc']}% time:{test_out['time']}s) \n\n" ) logger.close() print(f"++++++++" * 2 + "Final results" + "++++++++" * 2) print( f"++ Last Train time: {train_out['time']} | Last Test time: {test_out['time']} ++" ) print( f"++ Best Train loss: {best_train_loss} | Best Test 
loss: {best_test_loss} ++" ) print( f"++ Best Train acc_B: {best_train_acc_avg} | Best Test acc_B: {best_test_acc_avg} ++" ) print( f"++ Best Train acc: {best_train_acc} | Best Test acc: {best_test_acc} ++" ) print(f"++++++++" * 5)
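# The epoch loop above tracks six "best" statistics with repeated
# conditional assignments (and updates best_test_acc twice). A compact
# alternative, sketched with the dict keys train_out/test_out already use:
def _update_best(best, out, prefix):
    best[prefix + '_acc'] = max(best.get(prefix + '_acc', 0.0), out['acc'])
    best[prefix + '_acc_avg'] = max(best.get(prefix + '_acc_avg', 0.0), out['acc_avg'])
    best[prefix + '_loss'] = min(best.get(prefix + '_loss', float('inf')), out['loss'])

# usage sketch:
# best = {}
# _update_best(best, train_out, 'train')
# _update_best(best, test_out, 'test')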
def main(seed): with timer('load data'): df = pd.read_csv(FOLD_PATH) with timer('preprocessing'): train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID] train_augmentation = Compose([ Flip(p=0.5), OneOf([ GridDistortion(p=0.5), OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5) ], p=0.5), OneOf([ RandomGamma(gamma_limit=(100, 140), p=0.5), RandomBrightnessContrast(p=0.5), RandomBrightness(p=0.5), RandomContrast(p=0.5) ], p=0.5), OneOf([ GaussNoise(p=0.5), Cutout(num_holes=10, max_h_size=10, max_w_size=20, p=0.5) ], p=0.5), ]) val_augmentation = None train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS, transforms=train_augmentation, crop_rate=1.0) val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS, transforms=val_augmentation) train_sampler = MaskProbSampler(train_df, demand_non_empty_proba=0.6) train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, num_workers=8) val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) del train_df, val_df, df, train_dataset, val_dataset gc.collect() with timer('create model'): model = smp.Unet('resnet34', encoder_weights="imagenet", classes=N_CLASSES, encoder_se_module=True, decoder_semodule=True, h_columns=False, skip=True, act="swish") model = convert_model(model) if base_model is not None: model.load_state_dict(torch.load(base_model)) model.to(device) criterion = torch.nn.BCEWithLogitsLoss() optimizer = torch.optim.Adam(model.parameters(), lr=3e-4) if base_model is None: scheduler_cosine = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5) scheduler = GradualWarmupScheduler( optimizer, multiplier=1.1, total_epoch=CLR_CYCLE * 2, after_scheduler=scheduler_cosine) else: scheduler = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) model = torch.nn.DataParallel(model) with timer('train'): train_losses = [] valid_losses = [] best_model_loss = 999 best_model_ep = 0 checkpoint = base_ckpt + 1 for epoch in range(1, EPOCHS + 1): seed = seed + epoch seed_torch(seed) if epoch % (CLR_CYCLE * 2) == 0: LOGGER.info('Best valid loss: {} on epoch={}'.format( round(best_model_loss, 5), best_model_ep)) checkpoint += 1 best_model_loss = 999 LOGGER.info("Starting {} epoch...".format(epoch)) tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device, cutmix_prob=0.0) train_losses.append(tr_loss) LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5))) valid_loss = validate(model, val_loader, criterion, device) valid_losses.append(valid_loss) LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5))) scheduler.step() if valid_loss < best_model_loss: torch.save( model.module.state_dict(), 'models/{}_fold{}_ckpt{}.pth'.format( EXP_ID, FOLD_ID, checkpoint)) best_model_loss = valid_loss best_model_ep = epoch #np.save("val_pred.npy", val_pred) torch.save(model.module.state_dict(), 'models/{}_fold{}_latest.pth'.format(EXP_ID, FOLD_ID)) #del val_pred gc.collect() LOGGER.info('Best valid loss: {} on epoch={}'.format( round(best_model_loss, 5), best_model_ep)) xs = list(range(1, len(train_losses) + 1)) plt.plot(xs, train_losses, label='Train loss') plt.plot(xs, valid_losses, label='Val loss') plt.legend() plt.xticks(xs) plt.xlabel('Epochs') plt.savefig("loss.png")
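# The warmup above comes from the external GradualWarmupScheduler package.
# An equivalent warmup-then-cosine schedule can be sketched with only
# torch.optim.lr_scheduler.LambdaLR; warmup_epochs/total_epochs are
# illustrative placeholders, not values from the script.
import math
from torch.optim.lr_scheduler import LambdaLR

def warmup_cosine_lambda(warmup_epochs, total_epochs, min_ratio=0.1):
    def fn(epoch):
        if epoch < warmup_epochs:
            return (epoch + 1) / warmup_epochs  # linear warmup
        t = (epoch - warmup_epochs) / max(1, total_epochs - warmup_epochs)
        return min_ratio + (1 - min_ratio) * 0.5 * (1 + math.cos(math.pi * t))
    return fn

# usage sketch: scheduler = LambdaLR(optimizer, lr_lambda=warmup_cosine_lambda(5, EPOCHS))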
class FlowTrainer(object): def __init__(self): super(FlowTrainer, self).__init__() # not the best model... self.model = FlowEstimator(shape=(256, 256), use_l2=False, channel_in=3, stride=1, kernel_size=2, use_cst=True) self.optimizer = None self.lr_scheduler = None self.save_dir = None self.epoch = 1000 self.train_loader = SintelLoader( batch_size=1, pin_memory=True, num_workers=8, ) self.val_loader = None self.test_loader = SintelLoader( sintel_root="/data/keshav/sintel/test/final", batch_size=1, pin_memory=True, num_workers=8) self.sample_test = [ *SintelLoader(sintel_root="/data/keshav/sintel/test/final", test=True, nsample=10, visualize=True).load() ][0] self.sample_train = [*SintelLoader(nsample=10, visualize=True).load() ][0] self.sample_val = None self.save_model_path = './best/' self.load_model_path = None self.best_metrics = {'train_loss': None, 'val_loss': None} self.gpu_ids = [0, 1, 2, 3, 4, 5, 6, 7] self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=0.0001) # self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01) # self.scheduler = ReduceLROnPlateau(self.optimizer) self.scheduler = CosineAnnealingLR(self.optimizer, len(self.train_loader.load())) self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.photoloss = torch.nn.MSELoss() self.writer = SummaryWriter() self.global_step = 0 def resetsample(self): self.sample_test = [ *SintelLoader(sintel_root="/data/keshav/sintel/test/final", test=True, nsample=10, visualize=True).load() ][0] self.sample_train = [*SintelLoader(nsample=10, visualize=True).load() ][0] def initialize(self): self.model.to(self.device) self.model = torch.nn.DataParallel(self.model, device_ids=self.gpu_ids) if self.load_model_path: # LOAD MODEL WEIGHTS HERE pass self.initialized = True def savemodel(self, metrics, compare='val_loss'): # Save model in save_model_path if self.best_metrics.get('val_loss') > metrics.get('val_loss'): # save only if new metrics are low self.best_metrics.update(metrics) pass else: # Load from the best saved pass def train_epoch_end(self, metrics): self.resetsample() self.model.eval() with torch.no_grad(): frame1 = self.sample_train['frame1'].to(self.device) frame2 = self.sample_train['frame2'].to(self.device) frame1Unet = self.sample_train['frame1Unet'].to(self.device) frame2Unet = self.sample_train['frame2Unet'].to(self.device) frame1Unet_ = self.sample_train['frame1Unet_'].to(self.device) flow, occ = self.model(frame1, frame2) flow = flow * 256. 
frame1_ = warper(flow, frame2Unet) occ = replicatechannel(occ) #without unet # sampocc = replicatechannel(self.sample_train['occlusion'].cuda()) #with unet sampocc = replicatechannel( self.sample_train['occlusionUnet'].cuda()) occs = torch.cat([sampocc, occ]) occs = make_grid(occs, nrow=10).unsqueeze(0) #without unet # frames = torch.cat([frame1_, frame1, frame2]) # with unet frames = torch.cat([frame1_, frame1Unet_, frame1Unet, frame2Unet]) frames = make_grid(frames, nrow=10).unsqueeze(0) #without unet # flows = torch.cat([flow2rgb(flow.cpu()).cuda(), self.sample_train['flow'].cuda()]) # with unet flows = torch.cat([ flow2rgb(flow.cpu(), scaled=True).cuda(), self.sample_train['flowUnet'].cuda() ]) flows = make_grid(flows, nrow=10).unsqueeze(0) self.writer.add_images('TRAIN/Frames', frames, metrics.get('nb_batch')) self.writer.add_images('TRAIN/Flows', flows, metrics.get('nb_batch')) self.writer.add_images('TRAIN/Occlusions', occs, metrics.get('nb_batch')) return self.val(metrics) def val_end(self, metrics): return metrics def test_end(self, metrics): with torch.no_grad(): frame1 = self.sample_test['frame1'].to(self.device) frame2 = self.sample_test['frame2'].to(self.device) frame1Unet = self.sample_test['frame1Unet'].to(self.device) frame2Unet = self.sample_test['frame2Unet'].to(self.device) flow, occ = self.model(frame1, frame2) frame1_ = warper(flow, frame2Unet) occ = replicatechannel(occ) frames = torch.cat([ frame1_, frame1Unet, frame2Unet, flow2rgb(flow.cpu(), scaled=True).cuda(), occ ]) frames = make_grid(frames, nrow=10).unsqueeze(0) self.writer.add_images('TEST/Frames', frames, metrics.get('nb_batch')) return metrics def train(self, nb_epoch): trainstream = tqdm(self.train_loader.load()) self.avg_loss = AverageMeter() self.model.train() for i, data in enumerate(trainstream): self.global_step += 1 trainstream.set_description('TRAINING') # GET X and Frame 2 # wdt = data['displacement'].to(self.device) frame2 = data['frame2'].to(self.device) frame1 = data['frame1'].to(self.device) frame1Unet = data['frame1Unet'].to(self.device) frame2Unet = data['frame2Unet'].to(self.device) # frame1Unet1 = data['frame1Unet1'].to(self.device) # frame2Unet1 = data['frame2Unet1'].to(self.device) # # frame1Unet2 = data['frame1Unet2'].to(self.device) # frame2Unet2 = data['frame2Unet2'].to(self.device) # # frame1Unet3 = data['frame1Unet3'].to(self.device) # frame2Unet3 = data['frame2Unet3'].to(self.device) # frame1Unet4 = data['frame1Unet4'].to(self.device) # frame2Unet4 = data['frame2Unet4'].to(self.device) # frame1Unet5 = data['frame1Unet5'].to(self.device) # frame2Unet5 = data['frame2Unet5'].to(self.device) # # frame1Unet6 = data['frame1Unet6'].to(self.device) # frame2Unet6 = data['frame2Unet6'].to(self.device) self.optimizer.zero_grad() # forward with torch.set_grad_enabled(True): # flow1, flow2, flow3, flow4, flow5, flow6, flow, occ1, occ2, occ3, occ4, occ5, occ6, occ = self.model(frame1, frame2) flow, occ = self.model(frame1, frame2) print(flow.shape) print(frame2Unet.shape) frame1_ = warper(flow, frame2Unet) # frame1_1 = warper(flow1, frame2Unet1) # frame1_2 = warper(flow2, frame2Unet2) # frame1_3 = warper(flow3, frame2Unet3) # frame1_4 = warper(flow4, frame2Unet4) # frame1_5 = warper(flow5, frame2Unet5) # frame1_6 = warper(flow6, frame2Unet6) loss = comboloss(frame1Unet, frame2Unet, frame1_, occ) # loss1_1 = comboloss(frame1Unet1, frame2Unet1, frame1_1, occ1) # loss1_2 = comboloss(frame1Unet2, frame2Unet2, frame1_2, occ2) # loss1_3 = comboloss(frame1Unet3, frame2Unet3, frame1_3, occ3) # loss1_4 = 
comboloss(frame1Unet4, frame2Unet4, frame1_4, occ4) # loss1_5 = comboloss(frame1Unet5, frame2Unet5, frame1_5, occ5) # loss1_6 = comboloss(frame1Unet6, frame2Unet6, frame1_6, occ6) # loss = (loss1_ + loss1_4)/2. # loss = (loss1_ + loss1_4 + loss1_5 + loss1_6) / 4. # loss = (loss1_ + loss1_1 + loss1_2 + loss1_3 + loss1_4 + loss1_5 + loss1_6) / 7. #WITHOUT UNET # loss = photometricloss(frame1, frame1_, occ) #WITH UNET # loss = photometricloss(frame1Unet, frame1_,frame2Unet, occ) # loss = comboloss(frame1Unet,frame2Unet,frame1_,occ) self.avg_loss.update(loss.item(), i + 1) loss.backward() self.optimizer.step() self.writer.add_scalar('Loss/train', self.avg_loss.avg, self.global_step) trainstream.set_postfix({ 'epoch': nb_epoch, 'loss': self.avg_loss.avg }) self.scheduler.step(loss) trainstream.close() return self.train_epoch_end({ 'TRloss': self.avg_loss.avg, 'epoch': nb_epoch, }) def val(self, metrics): if self.val_loader is None: return self.test(metrics) # DO VAL STUFF HERE valstream = tqdm(self.val_loader.load()) for data in valstream: pass return self.val_end(metrics) def test(self, metrics={}): teststream = tqdm(self.test_loader.load()) self.avg_loss = AverageMeter() with torch.no_grad(): for i, data in enumerate(teststream): teststream.set_description('TESTING') frame2 = data['frame2'].to(self.device) frame1 = data['frame1'].to(self.device) frame2Unet = data['frame2Unet'].to(self.device) frame1Unet = data['frame1Unet'].to(self.device) flow, occ = self.model(frame1, frame2) frame1_ = warper(flow, frame2Unet) # loss = photometricloss(frame1Unet, frame1_,frame2Unet, occ) loss = comboloss(frame1Unet, frame2Unet, frame1_, occ) self.avg_loss.update(loss.item(), i + 1) metrics.update({'TSloss': self.avg_loss.avg}) teststream.set_postfix(metrics) self.writer.add_scalar('Loss/test', self.avg_loss.avg, metrics.get('epoch')) teststream.close() return self.test_end(metrics) def loggings(self, **metrics): pass def run(self): self.initialize() for i in range(self.epoch): metrics = self.train(i) self.test(metrics) self.writer.close()
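# Two details in FlowTrainer are worth flagging. CosineAnnealingLR.step()
# takes no metric, so self.scheduler.step(loss) passes the loss tensor as
# an epoch index (deprecated at best, an error on recent PyTorch); and
# savemodel() compares new metrics against best_metrics values initialized
# to None, which raises a TypeError on the first call. Hedged sketches:
def safe_scheduler_step(scheduler):
    scheduler.step()  # per-iteration cosine step; no metric argument

def is_improvement(best_metrics, metrics, key='val_loss'):
    best = best_metrics.get(key)
    return best is None or metrics.get(key, float('inf')) < best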
print(datainfo) print(datainfo.datasets['train'][0]) model = DPCNN(init_embed=embedding, num_cls=len(datainfo.vocabs[C.TARGET]), embed_dropout=ops.embed_dropout, cls_dropout=ops.cls_dropout) print(model) # 3. Declare the loss, metric, and optimizer loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET) metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET) optimizer = SGD([param for param in model.parameters() if param.requires_grad], lr=ops.lr, momentum=0.9, weight_decay=ops.weight_decay) callbacks = [] callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5))) # callbacks.append( # LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch < # ops.train_epoch * 0.8 else ops.lr * 0.1)) # ) # callbacks.append( # FitlogCallback(data=datainfo.datasets, verbose=1) # ) device = 'cuda:0' if torch.cuda.is_available() else 'cpu' print(device) # 4. Define the train procedure trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss,
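# Note on the commented-out LambdaLR above: lr_lambda must return a
# *multiplier* on the base lr, not an absolute lr, so returning ops.lr
# would effectively square the learning rate. A corrected sketch:
# callbacks.append(LRScheduler(LambdaLR(
#     optimizer, lambda epoch: 1.0 if epoch < ops.train_epoch * 0.8 else 0.1)))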
def DelayedCosineAnnealingLR(optimizer, delay_epochs, cosine_annealing_epochs): base_scheduler = CosineAnnealingLR(optimizer, cosine_annealing_epochs) return DelayerScheduler(optimizer, delay_epochs, base_scheduler)
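# Usage sketch for the helper above (DelayerScheduler is assumed to hold
# the lr flat for delay_epochs and then hand off to the wrapped scheduler):
# optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# scheduler = DelayedCosineAnnealingLR(optimizer, delay_epochs=10,
#                                      cosine_annealing_epochs=90)
# for epoch in range(100):
#     train_one_epoch(...)  # hypothetical training step
#     scheduler.step()      # flat lr for 10 epochs, then cosine decay for 90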
def main(args): root = 'runs_' + args.dataset exp = Experiment(args, root=root, main='model', ignore=('cuda', 'device', 'epochs', 'resume')) print(exp) if os.path.exists(exp.path_to('log')) and not args.resume: print('Skipping ...') sys.exit(0) train_data, test_data, in_ch, out = load_dataset(args) train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True) test_loader = DataLoader(test_data, batch_size=args.batch_size, shuffle=False) if args.model == 'odenet': model = ODENet(in_ch, out=out, n_filters=args.filters, downsample=args.downsample, method=args.method, tol=args.tol, adjoint=args.adjoint, dropout=args.dropout) else: model = ResNet(in_ch, out=out, n_filters=args.filters, downsample=args.downsample, dropout=args.dropout) model = model.to(args.device) if args.optim == 'sgd': optimizer = SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.wd) elif args.optim == 'adam': optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) # print(train_data) # print(test_data) # print(model) # print(optimizer) if args.resume: ckpt = torch.load(exp.ckpt('last')) print('Loaded: {}'.format(exp.ckpt('last'))) model.load_state_dict(ckpt['model']) optimizer.load_state_dict(ckpt['optim']) start_epoch = ckpt['epoch'] + 1 best_accuracy = exp.log['test_acc'].max() print('Resuming from epoch {}: {}'.format(start_epoch, exp.name)) else: metrics = evaluate(test_loader, model, args) best_accuracy = metrics['test_acc'] start_epoch = 1 if args.lrschedule == 'fixed': scheduler = LambdaLR( optimizer, lr_lambda=lambda x: 1) # no-op scheduler, just for cleaner code elif args.lrschedule == 'plateau': scheduler = ReduceLROnPlateau(optimizer, mode='max', patience=args.patience) elif args.lrschedule == 'cosine': scheduler = CosineAnnealingLR(optimizer, args.lrcycle, last_epoch=start_epoch - 2) progress = trange(start_epoch, args.epochs + 1, initial=start_epoch, total=args.epochs) for epoch in progress: metrics = {'epoch': epoch} progress.set_postfix({'Best ACC': f'{best_accuracy:.2%}'}) progress.set_description('TRAIN') train_metrics = train(train_loader, model, optimizer, args) progress.set_description('EVAL') test_metrics = evaluate(test_loader, model, args) is_best = test_metrics['test_acc'] > best_accuracy best_accuracy = max(test_metrics['test_acc'], best_accuracy) metrics.update(train_metrics) metrics.update(test_metrics) save_checkpoint( exp, { 'epoch': epoch, 'params': vars(args), 'model': model.state_dict(), 'optim': optimizer.state_dict(), 'metrics': metrics }, is_best) exp.push_log(metrics) sched_args = metrics[ 'test_acc'] if args.lrschedule == 'plateau' else None scheduler.step(sched_args)
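# Two hedged notes on the scheduler handling above. Creating
# CosineAnnealingLR with last_epoch=start_epoch-2 on a fresh optimizer
# raises a KeyError about 'initial_lr' unless the optimizer state was
# restored first, so the cosine branch only works cleanly on resume; and
# scheduler.step(None) is legal but an explicit guard reads more clearly:
from torch.optim.lr_scheduler import ReduceLROnPlateau

def step_scheduler(scheduler, metric=None):
    if isinstance(scheduler, ReduceLROnPlateau):
        scheduler.step(metric)
    else:
        scheduler.step()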
def init_scheduler(self): if self.scheduler_name == "cosine": self.scheduler = CosineAnnealingLR(self.optimizer, T_max=10, eta_min=1e-5) else: self.scheduler = None
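# A sketch of how the branch above could grow into a registry instead of
# an if/else chain (names and kwargs here are illustrative assumptions):
from torch.optim.lr_scheduler import CosineAnnealingLR

SCHEDULER_FACTORIES = {
    'cosine': lambda opt: CosineAnnealingLR(opt, T_max=10, eta_min=1e-5),
}

def build_scheduler(name, optimizer):
    factory = SCHEDULER_FACTORIES.get(name)
    return factory(optimizer) if factory else None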
def train(args, io): BASE_DIR = os.path.dirname(os.path.abspath(__file__)) # sample_rate=1.5 to make sure some overlap train_loader = DataLoader(S3DISDataset(split='train', data_root=args.data_dir, num_point=args.num_points, test_area=args.test_area, block_size=args.block_size, sample_rate=1.5, num_class=args.num_classes), num_workers=8, batch_size=args.batch_size, shuffle=True, drop_last=True) test_loader = DataLoader(S3DISDataset(split='test', data_root=args.data_dir, num_point=args.num_points, test_area=args.test_area, block_size=args.block_size, sample_rate=1.5, num_class=args.num_classes), num_workers=8, batch_size=args.test_batch_size, shuffle=True, drop_last=True) device = torch.device("cuda" if args.cuda else "cpu") # Try to load models if args.model == 'dgcnn': model = DGCNN_semseg(args).to(device) else: raise Exception("Not implemented") print(str(model)) model = nn.DataParallel(model) print("Let's use", torch.cuda.device_count(), "GPUs!") if args.use_sgd: print("Use SGD") opt = optim.SGD(model.parameters(), lr=args.lr * 100, momentum=args.momentum, weight_decay=1e-4) else: print("Use Adam") opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4) if args.scheduler == 'cos': scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=1e-3) elif args.scheduler == 'step': scheduler = StepLR(opt, 20, 0.5, args.epochs) try: checkpoint = torch.load( os.path.join(args.model_root, 'model_%s.t7' % args.test_area)) start_epoch = checkpoint['epoch'] + 1 model.load_state_dict(checkpoint['model_state_dict']) opt.load_state_dict(checkpoint['optimizer_state_dict']) scheduler.load_state_dict(checkpoint['scheduler_state_dict']) best_test_iou = checkpoint['mIOU'] io.cprint('Use pretrained model') except: io.cprint('No existing model, starting training from scratch...') start_epoch = 0 best_test_iou = 0 criterion = cal_loss log_dir = os.path.join(BASE_DIR, args.tb_dir) if not os.path.exists(log_dir): os.mkdir(log_dir) writer_train_loss = SummaryWriter(os.path.join(log_dir, 'train_loss')) writer_train_accuracy = SummaryWriter(os.path.join(log_dir)) writer_train_iou = SummaryWriter(os.path.join(log_dir)) writer_test_accuracy = SummaryWriter(os.path.join(log_dir)) writer_test_iou = SummaryWriter(os.path.join(log_dir)) for epoch in range(start_epoch, args.epochs): #################### # Train #################### train_loss = 0.0 count = 0.0 niter = epoch * len(train_loader) * args.batch_size model.train() train_true_cls = [] train_pred_cls = [] train_true_seg = [] train_pred_seg = [] train_label_seg = [] io.cprint('Start training for Epoch %d ...' 
% epoch) for data, seg in tqdm(train_loader): data, seg = data.to(device), seg.to(device) data = data.permute(0, 2, 1).float() batch_size = data.size()[0] opt.zero_grad() seg_pred = model(data) seg_pred = seg_pred.permute(0, 2, 1).contiguous() loss = criterion(seg_pred.view(-1, args.num_classes), seg.view(-1, 1).squeeze().long()) loss.backward() opt.step() pred = seg_pred.max(dim=2)[1] # (batch_size, num_points) count += batch_size train_loss += loss.item() * batch_size niter += batch_size writer_train_loss.add_scalar('Train/loss', loss.item(), niter) seg_np = seg.cpu().numpy() # (batch_size, num_points) pred_np = pred.detach().cpu().numpy() # (batch_size, num_points) train_true_cls.append( seg_np.reshape(-1)) # (batch_size * num_points) train_pred_cls.append( pred_np.reshape(-1)) # (batch_size * num_points) train_true_seg.append(seg_np) train_pred_seg.append(pred_np) if args.scheduler == 'cos': scheduler.step() elif args.scheduler == 'step': if opt.param_groups[0]['lr'] > 1e-5: scheduler.step() if opt.param_groups[0]['lr'] < 1e-5: for param_group in opt.param_groups: param_group['lr'] = 1e-5 train_true_cls = np.concatenate(train_true_cls) train_pred_cls = np.concatenate(train_pred_cls) train_acc = metrics.accuracy_score(train_true_cls, train_pred_cls) avg_per_class_acc = metrics.balanced_accuracy_score( train_true_cls, train_pred_cls) train_true_seg = np.concatenate(train_true_seg, axis=0) train_pred_seg = np.concatenate(train_pred_seg, axis=0) train_ious = calculate_sem_IoU(train_pred_seg, train_true_seg, args.num_classes) outstr = 'Train %d, loss: %.6f, train acc: %.6f, train avg acc: %.6f, train iou: %.6f' % ( epoch, train_loss * 1.0 / count, train_acc, avg_per_class_acc, np.mean(train_ious)) io.cprint(outstr) writer_train_accuracy.add_scalar('Train/accuracy', train_acc, epoch) writer_train_iou.add_scalar('Train/mIOU', np.mean(train_ious), epoch) #################### # Test #################### test_loss = 0.0 count = 0.0 model.eval() test_true_cls = [] test_pred_cls = [] test_true_seg = [] test_pred_seg = [] io.cprint('Start evaluation for Epoch %d ...' 
% epoch) for data, seg in tqdm(test_loader): data, seg = data.to(device), seg.to(device) data = data.permute(0, 2, 1).float() batch_size = data.size()[0] seg_pred = model(data) seg_pred = seg_pred.permute(0, 2, 1).contiguous() loss = criterion(seg_pred.view(-1, args.num_classes), seg.view(-1, 1).squeeze().long()) pred = seg_pred.max(dim=2)[1] count += batch_size test_loss += loss.item() * batch_size seg_np = seg.cpu().numpy() pred_np = pred.detach().cpu().numpy() test_true_cls.append(seg_np.reshape(-1)) test_pred_cls.append(pred_np.reshape(-1)) test_true_seg.append(seg_np) test_pred_seg.append(pred_np) test_true_cls = np.concatenate(test_true_cls) test_pred_cls = np.concatenate(test_pred_cls) test_acc = metrics.accuracy_score(test_true_cls, test_pred_cls) avg_per_class_acc = metrics.balanced_accuracy_score( test_true_cls, test_pred_cls) test_true_seg = np.concatenate(test_true_seg, axis=0) test_pred_seg = np.concatenate(test_pred_seg, axis=0) test_ious = calculate_sem_IoU(test_pred_seg, test_true_seg, args.num_classes) outstr = 'Test %d, loss: %.6f, test acc: %.6f, test avg acc: %.6f, test iou: %.6f' % ( epoch, test_loss * 1.0 / count, test_acc, avg_per_class_acc, np.mean(test_ious)) io.cprint(outstr) writer_test_accuracy.add_scalar('Test/accuracy', test_acc, epoch) writer_test_iou.add_scalar('Test/mIOU', np.mean(test_ious), epoch) if np.mean(test_ious) >= best_test_iou: best_test_iou = np.mean(test_ious) savepath = 'checkpoints/%s/models/model_%s.t7' % (args.exp_name, args.test_area) io.cprint('Saving the best model at %s' % savepath) state = { 'epoch': epoch, 'mIOU': best_test_iou, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': opt.state_dict(), 'scheduler_state_dict': scheduler.state_dict(), } torch.save(state, savepath) writer_train_loss.close() writer_train_accuracy.close() writer_train_iou.close() writer_test_accuracy.close() writer_test_iou.close()
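# Two hedged observations on the block above. StepLR(opt, 20, 0.5,
# args.epochs) passes args.epochs as the fourth positional parameter,
# last_epoch, which expects restored optimizer state and starts the decay
# schedule at its end; keyword arguments avoid the trap. And the five
# SummaryWriters all point at the same log_dir, so one writer with
# distinct tags produces the same TensorBoard layout:
# scheduler = StepLR(opt, step_size=20, gamma=0.5)
# writer = SummaryWriter(log_dir)
# writer.add_scalar('Train/loss', loss.item(), niter)
# writer.add_scalar('Train/accuracy', train_acc, epoch)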
def train(args, io): train_dataset = ShapeNetPart(partition='trainval', num_points=args.num_points, class_choice=args.class_choice) if (len(train_dataset) < 100): drop_last = False else: drop_last = True train_loader = DataLoader(train_dataset, num_workers=8, batch_size=args.batch_size, shuffle=True, drop_last=drop_last) test_loader = DataLoader(ShapeNetPart(partition='test', num_points=args.num_points, class_choice=args.class_choice), num_workers=8, batch_size=args.test_batch_size, shuffle=True, drop_last=False) device = torch.device("cuda:0" if args.cuda else "cpu") #Try to load models seg_num_all = train_loader.dataset.seg_num_all seg_start_index = train_loader.dataset.seg_start_index if args.model == 'dgcnn': model = DGCNN_partseg(args, seg_num_all).to(device) else: raise Exception("Not implemented") #print(str(model)) #model = nn.DataParallel(model) print("Let's use", str(1), "GPUs!") if args.use_sgd: print("Use SGD") opt = optim.SGD(model.parameters(), lr=args.lr * 100, momentum=args.momentum, weight_decay=1e-4) else: print("Use Adam") opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4) if args.scheduler == 'cos': scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=1e-3) elif args.scheduler == 'step': scheduler = StepLR(opt, step_size=20, gamma=0.5) criterion = cal_loss best_test_iou = 0 for epoch in range(args.epochs): #################### # Train #################### train_loss = 0.0 count = 0.0 model.train() train_true_cls = [] train_pred_cls = [] train_true_seg = [] train_pred_seg = [] train_label_seg = [] for data, label, seg in train_loader: seg = seg - seg_start_index label_one_hot = np.zeros((label.shape[0], 16)) for idx in range(label.shape[0]): label_one_hot[idx, label[idx]] = 1 label_one_hot = torch.from_numpy(label_one_hot.astype(np.float32)) data, label_one_hot, seg = data.to(device), label_one_hot.to( device), seg.to(device) data = data.permute(0, 2, 1) batch_size = data.size()[0] opt.zero_grad() seg_pred = model(data, label_one_hot) seg_pred = seg_pred.permute(0, 2, 1).contiguous() loss = criterion(seg_pred.view(-1, seg_num_all), seg.view(-1, 1).squeeze()) loss.backward() opt.step() pred = seg_pred.max(dim=2)[1] # (batch_size, num_points) count += batch_size train_loss += loss.item() * batch_size seg_np = seg.cpu().numpy() # (batch_size, num_points) pred_np = pred.detach().cpu().numpy() # (batch_size, num_points) train_true_cls.append( seg_np.reshape(-1)) # (batch_size * num_points) train_pred_cls.append( pred_np.reshape(-1)) # (batch_size * num_points) train_true_seg.append(seg_np) train_pred_seg.append(pred_np) train_label_seg.append(label.reshape(-1)) if args.scheduler == 'cos': scheduler.step() elif args.scheduler == 'step': if opt.param_groups[0]['lr'] > 1e-5: scheduler.step() if opt.param_groups[0]['lr'] < 1e-5: for param_group in opt.param_groups: param_group['lr'] = 1e-5 train_true_cls = np.concatenate(train_true_cls) train_pred_cls = np.concatenate(train_pred_cls) train_acc = metrics.accuracy_score(train_true_cls, train_pred_cls) avg_per_class_acc = metrics.balanced_accuracy_score( train_true_cls, train_pred_cls) train_true_seg = np.concatenate(train_true_seg, axis=0) train_pred_seg = np.concatenate(train_pred_seg, axis=0) train_label_seg = np.concatenate(train_label_seg) train_ious = calculate_shape_IoU(train_pred_seg, train_true_seg, train_label_seg, args.class_choice) outstr = 'Train %d, loss: %.6f, train acc: %.6f, train avg acc: %.6f, train iou: %.6f' % ( epoch, train_loss * 1.0 / count, train_acc, avg_per_class_acc, 
np.mean(train_ious)) io.cprint(outstr) #################### # Test #################### test_loss = 0.0 count = 0.0 model.eval() test_true_cls = [] test_pred_cls = [] test_true_seg = [] test_pred_seg = [] test_label_seg = [] for data, label, seg in test_loader: seg = seg - seg_start_index label_one_hot = np.zeros((label.shape[0], 16)) for idx in range(label.shape[0]): label_one_hot[idx, label[idx]] = 1 label_one_hot = torch.from_numpy(label_one_hot.astype(np.float32)) data, label_one_hot, seg = data.to(device), label_one_hot.to( device), seg.to(device) data = data.permute(0, 2, 1) batch_size = data.size()[0] seg_pred = model(data, label_one_hot) seg_pred = seg_pred.permute(0, 2, 1).contiguous() loss = criterion(seg_pred.view(-1, seg_num_all), seg.view(-1, 1).squeeze()) pred = seg_pred.max(dim=2)[1] count += batch_size test_loss += loss.item() * batch_size seg_np = seg.cpu().numpy() pred_np = pred.detach().cpu().numpy() test_true_cls.append(seg_np.reshape(-1)) test_pred_cls.append(pred_np.reshape(-1)) test_true_seg.append(seg_np) test_pred_seg.append(pred_np) test_label_seg.append(label.reshape(-1)) test_true_cls = np.concatenate(test_true_cls) test_pred_cls = np.concatenate(test_pred_cls) test_acc = metrics.accuracy_score(test_true_cls, test_pred_cls) avg_per_class_acc = metrics.balanced_accuracy_score( test_true_cls, test_pred_cls) test_true_seg = np.concatenate(test_true_seg, axis=0) test_pred_seg = np.concatenate(test_pred_seg, axis=0) test_label_seg = np.concatenate(test_label_seg) test_ious = calculate_shape_IoU(test_pred_seg, test_true_seg, test_label_seg, args.class_choice) outstr = 'Test %d, loss: %.6f, test acc: %.6f, test avg acc: %.6f, test iou: %.6f' % ( epoch, test_loss * 1.0 / count, test_acc, avg_per_class_acc, np.mean(test_ious)) io.cprint(outstr) if np.mean(test_ious) >= best_test_iou: best_test_iou = np.mean(test_ious) torch.save(model.state_dict(), 'checkpoints/%s/models/model.t7' % args.exp_name)
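# The per-sample one-hot loop above can be vectorized with PyTorch's
# built-in helper (equivalent output; sketch assumes label fits in int64):
# import torch.nn.functional as F
# label_one_hot = F.one_hot(label.view(-1).long(), num_classes=16).float()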
print(net.parameters) criterion = MultiboxLoss(config.priors, iou_threshold=0.5, neg_pos_ratio=3, center_variance=0.1, size_variance=0.2, device=DEVICE) optimizer = torch.optim.RMSprop(params, lr=0.003, weight_decay=args.weight_decay, momentum=args.momentum) # note: lr is hardcoded at 0.003 even though args.lr is logged below logging.info(f"Learning rate: {args.lr}, Base net learning rate: {base_net_lr}, " + f"Extra Layers learning rate: {extra_layers_lr}.") if args.scheduler == 'multi-step': logging.info("Uses MultiStepLR scheduler.") milestones = [int(v.strip()) for v in args.milestones.split(",")] scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1, last_epoch=last_epoch) elif args.scheduler == 'cosine': logging.info("Uses CosineAnnealingLR scheduler.") scheduler = CosineAnnealingLR( optimizer, args.t_max, last_epoch=last_epoch) else: logging.fatal(f"Unsupported Scheduler: {args.scheduler}.") parser.print_help(sys.stderr) sys.exit(1) logging.info(f"Start training from epoch {last_epoch + 1}.") for epoch in range(last_epoch + 1, args.num_epochs): train(train_loader, net, criterion, optimizer, device=DEVICE, debug_steps=args.debug_steps, epoch=epoch) scheduler.step() if epoch % 10 == 0: # val_loss, val_regression_loss, val_classification_loss = test(val_loader, net, criterion, DEVICE) # logging.info(
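# Hedged note: constructing MultiStepLR/CosineAnnealingLR with
# last_epoch=last_epoch >= 0 requires each optimizer param group to carry
# an 'initial_lr' entry, which normally comes from loading the optimizer
# state dict. When resuming without that state, one workaround is:
# for group in optimizer.param_groups:
#     group.setdefault('initial_lr', group['lr'])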
def main(seed): with timer('load data'): df = pd.read_csv(FOLD_PATH) y1 = (df.EncodedPixels_1 != "-1").astype("float32").values.reshape( -1, 1) y2 = (df.EncodedPixels_2 != "-1").astype("float32").values.reshape( -1, 1) y3 = (df.EncodedPixels_3 != "-1").astype("float32").values.reshape( -1, 1) y4 = (df.EncodedPixels_4 != "-1").astype("float32").values.reshape( -1, 1) y = np.concatenate([y1, y2, y3, y4], axis=1) with timer('preprocessing'): train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID] y_train, y_val = y[df.fold_id != FOLD_ID], y[df.fold_id == FOLD_ID] train_augmentation = Compose([ Flip(p=0.5), OneOf([ GridDistortion(p=0.5), OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5) ], p=0.5), OneOf([ RandomGamma(gamma_limit=(100, 140), p=0.5), RandomBrightnessContrast(p=0.5), ], p=0.5), OneOf([GaussNoise(p=0.5)], p=0.5), ShiftScaleRotate(rotate_limit=20, p=0.5), ]) val_augmentation = None train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS, transforms=train_augmentation, crop_rate=1.0, class_y=y_train) val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS, transforms=val_augmentation) train_sampler = MaskProbSampler(train_df, demand_non_empty_proba=0.6) train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, num_workers=8, pin_memory=True) val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=8, pin_memory=True) del train_df, val_df, df, train_dataset, val_dataset gc.collect() with timer('create model'): model = smp.Unet('se_resnext50_32x4d', encoder_weights="imagenet", classes=N_CLASSES, encoder_se_module=True, decoder_semodule=True, h_columns=False, skip=True, act="swish", freeze_bn=True, classification=CLASSIFICATION, attention_type="cbam", center=True, mode="train") #model = convert_model(model) if base_model is not None: model.load_state_dict(torch.load(base_model)) model.to(device) criterion = torch.nn.BCEWithLogitsLoss() optimizer = torch.optim.Adam([ { 'params': model.decoder.parameters(), 'lr': 3e-3 }, { 'params': model.encoder.parameters(), 'lr': 3e-4 }, ], eps=1e-4) if base_model is None: scheduler_cosine = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5) scheduler = GradualWarmupScheduler( optimizer, multiplier=1.1, total_epoch=CLR_CYCLE * 2, after_scheduler=scheduler_cosine) else: scheduler = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) if EMA: ema_model = copy.deepcopy(model) if base_model_ema is not None: ema_model.load_state_dict(torch.load(base_model_ema)) ema_model.to(device) ema_model = torch.nn.DataParallel(ema_model) else: ema_model = None model = torch.nn.DataParallel(model) with timer('train'): train_losses = [] valid_losses = [] best_model_loss = 999 best_model_ema_loss = 999 best_model_ep = 0 ema_decay = 0 checkpoint = base_ckpt + 1 for epoch in range(84, EPOCHS + 1): seed = seed + epoch seed_torch(seed) if epoch >= EMA_START: ema_decay = 0.99 LOGGER.info("Starting {} epoch...".format(epoch)) tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device, cutmix_prob=0.0, classification=CLASSIFICATION, ema_model=ema_model, ema_decay=ema_decay) train_losses.append(tr_loss) LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5))) valid_loss = validate(model, val_loader, criterion, device, classification=CLASSIFICATION) valid_losses.append(valid_loss) LOGGER.info('Mean valid loss: 
{}'.format(round(valid_loss, 5))) if EMA and epoch >= EMA_START: ema_valid_loss = validate(ema_model, val_loader, criterion, device, classification=CLASSIFICATION) LOGGER.info('Mean EMA valid loss: {}'.format( round(ema_valid_loss, 5))) if ema_valid_loss < best_model_ema_loss: torch.save( ema_model.module.state_dict(), 'models/{}_fold{}_ckpt{}_ema.pth'.format( EXP_ID, FOLD_ID, checkpoint)) best_model_ema_loss = ema_valid_loss scheduler.step() if valid_loss < best_model_loss: torch.save( model.module.state_dict(), 'models/{}_fold{}_ckpt{}.pth'.format( EXP_ID, FOLD_ID, checkpoint)) best_model_loss = valid_loss best_model_ep = epoch #np.save("val_pred.npy", val_pred) if epoch % (CLR_CYCLE * 2) == CLR_CYCLE * 2 - 1: torch.save( model.module.state_dict(), 'models/{}_fold{}_latest.pth'.format(EXP_ID, FOLD_ID)) LOGGER.info('Best valid loss: {} on epoch={}'.format( round(best_model_loss, 5), best_model_ep)) if EMA: torch.save( ema_model.module.state_dict(), 'models/{}_fold{}_latest_ema.pth'.format( EXP_ID, FOLD_ID)) LOGGER.info('Best ema valid loss: {}'.format( round(best_model_ema_loss, 5))) checkpoint += 1 best_model_loss = 999 #del val_pred gc.collect() LOGGER.info('Best valid loss: {} on epoch={}'.format( round(best_model_loss, 5), best_model_ep)) xs = list(range(1, len(train_losses) + 1)) plt.plot(xs, train_losses, label='Train loss') plt.plot(xs, valid_losses, label='Val loss') plt.legend() plt.xticks(xs) plt.xlabel('Epochs') plt.savefig("loss.png")
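# A minimal sketch of the exponential moving average that train_one_epoch
# is assumed to apply with ema_model/ema_decay above (decay=0 before
# EMA_START makes the EMA a plain copy of the online weights, 0.99 after):
import torch

@torch.no_grad()
def update_ema(model, ema_model, decay):
    for p, ema_p in zip(model.parameters(), ema_model.parameters()):
        ema_p.mul_(decay).add_(p, alpha=1.0 - decay)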
def train(self): device = self.device print('Running on device: {}'.format(device), 'start training...') print( f'Setting - Epochs: {self.num_epochs}, Learning rate: {self.learning_rate} ' ) train_loader = self.train_loader valid_loader = self.valid_loader model = self.model.to(device) if self.optimizer == 0: optimizer = torch.optim.Adam(model.parameters(), lr=self.learning_rate, weight_decay=1e-5) elif self.optimizer == 1: optimizer = torch.optim.AdamW(model.parameters(), lr=self.learning_rate, weight_decay=1e-5) elif self.optimizer == 2: optimizer = MADGRAD(model.parameters(), lr=self.learning_rate, weight_decay=1e-5) elif self.optimizer == 3: optimizer = AdamP(model.parameters(), lr=self.learning_rate, weight_decay=1e-5) criterion = torch.nn.CrossEntropyLoss().to(device) if self.use_swa: optimizer = SWA(optimizer, swa_start=2, swa_freq=2, swa_lr=1e-5) # scheduler # scheduler_dct = { 0: None, 1: StepLR(optimizer, 10, gamma=0.5), 2: ReduceLROnPlateau(optimizer, 'min', factor=0.4, patience=int(0.3 * self.early_stopping_patience)), 3: CosineAnnealingLR(optimizer, T_max=5, eta_min=0.) } scheduler = scheduler_dct[self.scheduler] # early stopping early_stopping = EarlyStopping(patience=self.early_stopping_patience, verbose=True, path=f'checkpoint_{self.job}.pt') # training self.train_loss_lst = list() self.train_acc_lst = list() self.val_loss_lst = list() self.val_acc_lst = list() for epoch in range(1, self.num_epochs + 1): with tqdm(train_loader, unit='batch') as tepoch: avg_val_loss, avg_val_acc = None, None for idx, (img, label) in enumerate(tepoch): tepoch.set_description(f"Epoch {epoch}") model.train() optimizer.zero_grad() img, label = img.float().to(device), label.long().to( device) output = model(img) loss = criterion(output, label) predictions = output.argmax(dim=1, keepdim=True).squeeze() correct = (predictions == label).sum().item() accuracy = correct / len(img) loss.backward() optimizer.step() if idx == len(train_loader) - 1: val_loss_lst, val_acc_lst = list(), list() model.eval() with torch.no_grad(): for val_img, val_label in valid_loader: val_img, val_label = val_img.float().to( device), val_label.long().to(device) val_out = model(val_img) val_loss = criterion(val_out, val_label) val_pred = val_out.argmax( dim=1, keepdim=True).squeeze() val_acc = (val_pred == val_label ).sum().item() / len(val_img) val_loss_lst.append(val_loss.item()) val_acc_lst.append(val_acc) avg_val_loss = np.mean(val_loss_lst) avg_val_acc = np.mean(val_acc_lst) * 100. self.train_loss_lst.append(loss) self.train_acc_lst.append(accuracy) self.val_loss_lst.append(avg_val_loss) self.val_acc_lst.append(avg_val_acc) if scheduler is not None: current_lr = optimizer.param_groups[0]['lr'] else: current_lr = self.learning_rate # log tepoch.set_postfix(loss=loss.item(), accuracy=100. * accuracy, val_loss=avg_val_loss, val_acc=avg_val_acc, current_lr=current_lr) # early stopping check early_stopping(avg_val_loss, model) if early_stopping.early_stop: print("Early stopping") break # scheduler update if scheduler is not None: if self.scheduler == 2: scheduler.step(avg_val_loss) else: scheduler.step() if self.use_swa: optimizer.swap_swa_sgd() self.model.load_state_dict(torch.load(f'checkpoint_{self.job}.pt'))
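# Two hedged notes on the loop above. train_loss_lst.append(loss) stores
# the graph-attached tensor, so loss.item() is cheaper and avoids keeping
# autograd state alive; and with torchcontrib's SWA, BatchNorm running
# statistics are usually recomputed for the averaged weights after
# swap_swa_sgd(), e.g.:
# self.train_loss_lst.append(loss.item())
# if self.use_swa:
#     optimizer.swap_swa_sgd()
#     optimizer.bn_update(train_loader, model)  # torchcontrib helper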
def main(cfg): """Runs main training procedure.""" # fix random seeds for reproducibility seed_everything(seed=cfg['seed']) # neptune logging neptune.init(project_qualified_name=cfg['neptune_project_name'], api_token=cfg['neptune_api_token']) neptune.create_experiment(name=cfg['neptune_experiment'], params=cfg) print('Preparing model and data...') print('Using SMP version:', smp.__version__) num_classes = 1 if len(cfg['classes']) == 1 else (len(cfg['classes']) + 1) activation = 'sigmoid' if num_classes == 1 else 'softmax2d' background = False if cfg['ignore_channels'] else True binary = True if num_classes == 1 else False softmax = False if num_classes == 1 else True sigmoid = True if num_classes == 1 else False aux_params = dict( pooling=cfg['pooling'], # one of 'avg', 'max' dropout=cfg['dropout'], # dropout ratio, default is None activation='sigmoid', # activation function, default is None classes=num_classes) # define number of output labels # configure model models = { 'unet': Unet(encoder_name=cfg['encoder_name'], encoder_weights=cfg['encoder_weights'], decoder_use_batchnorm=cfg['use_batchnorm'], classes=num_classes, activation=activation, aux_params=aux_params), 'pspnet': PSPNet(encoder_name=cfg['encoder_name'], encoder_weights=cfg['encoder_weights'], classes=num_classes, activation=activation, aux_params=aux_params), 'pan': PAN(encoder_name=cfg['encoder_name'], encoder_weights=cfg['encoder_weights'], classes=num_classes, activation=activation, aux_params=aux_params), 'deeplabv3plus': DeepLabV3Plus(encoder_name=cfg['encoder_name'], encoder_weights=cfg['encoder_weights'], classes=num_classes, activation=activation, aux_params=aux_params) } assert cfg['architecture'] in models.keys() model = models[cfg['architecture']] # configure loss losses = { 'dice_loss': DiceLoss(include_background=background, softmax=softmax, sigmoid=sigmoid, batch=cfg['combine']), 'generalized_dice': GeneralizedDiceLoss(include_background=background, softmax=softmax, sigmoid=sigmoid, batch=cfg['combine']) } assert cfg['loss'] in losses.keys() loss = losses[cfg['loss']] # configure optimizer optimizers = { 'adam': Adam([dict(params=model.parameters(), lr=cfg['lr'])]), 'adamw': AdamW([dict(params=model.parameters(), lr=cfg['lr'])]), 'rmsprop': RMSprop([dict(params=model.parameters(), lr=cfg['lr'])]) } assert cfg['optimizer'] in optimizers.keys() optimizer = optimizers[cfg['optimizer']] # configure metrics metrics = { 'dice_score': DiceMetric(include_background=background, reduction='mean'), 'dice_smp': Fscore(threshold=cfg['rounding'], ignore_channels=cfg['ignore_channels']), 'iou_smp': IoU(threshold=cfg['rounding'], ignore_channels=cfg['ignore_channels']), 'generalized_dice': GeneralizedDiceLoss(include_background=background, softmax=softmax, sigmoid=sigmoid, batch=cfg['combine']), 'dice_loss': DiceLoss(include_background=background, softmax=softmax, sigmoid=sigmoid, batch=cfg['combine']), 'cross_entropy': BCELoss(reduction='mean'), 'accuracy': Accuracy(ignore_channels=cfg['ignore_channels']) } assert all(m['name'] in metrics.keys() for m in cfg['metrics']) metrics = [(metrics[m['name']], m['name'], m['type']) for m in cfg['metrics']] # tuple of (metric, name, type) # TODO: Fix metric names # configure scheduler schedulers = { 'steplr': StepLR(optimizer, step_size=cfg['step_size'], gamma=0.5), 'cosine': CosineAnnealingLR(optimizer, cfg['epochs'], eta_min=cfg['eta_min'], last_epoch=-1) } assert cfg['scheduler'] in schedulers.keys() scheduler = schedulers[cfg['scheduler']] # configure augmentations train_transform 
= load_train_transform(transform_type=cfg['transform'], patch_size=cfg['patch_size_train']) valid_transform = load_valid_transform( patch_size=cfg['patch_size_valid']) # manually selected patch size train_dataset = ArtifactDataset(df_path=cfg['train_data'], classes=cfg['classes'], transform=train_transform, normalize=cfg['normalize'], ink_filters=cfg['ink_filters']) valid_dataset = ArtifactDataset(df_path=cfg['valid_data'], classes=cfg['classes'], transform=valid_transform, normalize=cfg['normalize'], ink_filters=cfg['ink_filters']) test_dataset = ArtifactDataset(df_path=cfg['test_data'], classes=cfg['classes'], transform=valid_transform, normalize=cfg['normalize'], ink_filters=cfg['ink_filters']) # load pre-sampled patch arrays train_image, train_mask = train_dataset[0] valid_image, valid_mask = valid_dataset[0] print('Shape of image patch', train_image.shape) print('Shape of mask patch', train_mask.shape) print('Train dataset shape:', len(train_dataset)) print('Valid dataset shape:', len(valid_dataset)) assert train_image.shape[1] == cfg[ 'patch_size_train'] and train_image.shape[2] == cfg['patch_size_train'] assert valid_image.shape[1] == cfg[ 'patch_size_valid'] and valid_image.shape[2] == cfg['patch_size_valid'] # save intermediate augmentations if cfg['eval_dir']: default_dataset = ArtifactDataset(df_path=cfg['train_data'], classes=cfg['classes'], transform=None, normalize=None, ink_filters=cfg['ink_filters']) transform_dataset = ArtifactDataset(df_path=cfg['train_data'], classes=cfg['classes'], transform=train_transform, normalize=None, ink_filters=cfg['ink_filters']) for idx in range(0, min(500, len(train_dataset)), 10): image_input, image_mask = default_dataset[idx] image_input = image_input.transpose((1, 2, 0)).astype(np.uint8) image_mask = image_mask.transpose(1, 2, 0) image_mask = np.argmax( image_mask, axis=2) if not binary else image_mask.squeeze() image_mask = image_mask.astype(np.uint8) image_transform, _ = transform_dataset[idx] image_transform = image_transform.transpose( (1, 2, 0)).astype(np.uint8) idx_str = str(idx).zfill(3) skimage.io.imsave(os.path.join(cfg['eval_dir'], f'{idx_str}a_image_input.png'), image_input, check_contrast=False) plt.imsave(os.path.join(cfg['eval_dir'], f'{idx_str}b_image_mask.png'), image_mask, vmin=0, vmax=6, cmap='Spectral') skimage.io.imsave(os.path.join(cfg['eval_dir'], f'{idx_str}c_image_transform.png'), image_transform, check_contrast=False) del transform_dataset # update process print('Starting training...') print('Available GPUs for training:', torch.cuda.device_count()) # pytorch module wrapper class DataParallelModule(torch.nn.DataParallel): def __getattr__(self, name): try: return super().__getattr__(name) except AttributeError: return getattr(self.module, name) # data parallel training if torch.cuda.device_count() > 1: model = DataParallelModule(model) train_loader = DataLoader(train_dataset, batch_size=cfg['batch_size'], num_workers=cfg['workers'], shuffle=True) valid_loader = DataLoader(valid_dataset, batch_size=int(cfg['batch_size'] / 4), num_workers=cfg['workers'], shuffle=False) test_loader = DataLoader(test_dataset, batch_size=int(cfg['batch_size'] / 4), num_workers=cfg['workers'], shuffle=False) trainer = Trainer(model=model, device=cfg['device'], save_checkpoints=cfg['save_checkpoints'], checkpoint_dir=cfg['checkpoint_dir'], checkpoint_name=cfg['checkpoint_name']) trainer.compile(optimizer=optimizer, loss=loss, metrics=metrics, num_classes=num_classes) trainer.fit(train_loader, valid_loader, epochs=cfg['epochs'], 
scheduler=scheduler, verbose=cfg['verbose'], loss_weight=cfg['loss_weight'], test_loader=test_loader, binary=binary) # validation inference model.load_state_dict( torch.load(os.path.join(cfg['checkpoint_dir'], cfg['checkpoint_name']))) model.to(cfg['device']) model.eval() # save best checkpoint to neptune neptune.log_artifact( os.path.join(cfg['checkpoint_dir'], cfg['checkpoint_name'])) # setup directory to save plots if os.path.isdir(cfg['plot_dir_valid']): shutil.rmtree(cfg['plot_dir_valid']) os.makedirs(cfg['plot_dir_valid'], exist_ok=True) # valid dataset without transformations and normalization for image visualization valid_dataset_vis = ArtifactDataset(df_path=cfg['valid_data'], classes=cfg['classes'], ink_filters=cfg['ink_filters']) # keep track of valid masks valid_preds = [] valid_masks = [] if cfg['save_checkpoints']: print('Predicting valid patches...') for n in range(len(valid_dataset)): image_vis = valid_dataset_vis[n][0].astype('uint8') image_vis = image_vis.transpose(1, 2, 0) image, gt_mask = valid_dataset[n] gt_mask = gt_mask.transpose(1, 2, 0) gt_mask = np.argmax(gt_mask, axis=2) if not binary else gt_mask.squeeze() gt_mask = gt_mask.astype(np.uint8) valid_masks.append(gt_mask) x_tensor = torch.from_numpy(image).to(cfg['device']).unsqueeze(0) pr_mask, _ = model.predict(x_tensor) pr_mask = pr_mask.squeeze(axis=0).cpu().numpy().round() pr_mask = pr_mask.transpose(1, 2, 0) pr_mask = np.argmax(pr_mask, axis=2) if not binary else pr_mask.squeeze() pr_mask = pr_mask.astype(np.uint8) valid_preds.append(pr_mask) save_predictions(out_path=cfg['plot_dir_valid'], index=n + 1, image=image_vis, ground_truth_mask=gt_mask, predicted_mask=pr_mask) del train_dataset, valid_dataset del train_loader, valid_loader # calculate dice per class valid_masks = np.stack(valid_masks, axis=0) valid_masks = valid_masks.flatten() valid_preds = np.stack(valid_preds, axis=0) valid_preds = valid_preds.flatten() dice_score = f1_score(y_true=valid_masks, y_pred=valid_preds, average=None) neptune.log_text('valid_dice_class', str(dice_score)) print('Valid dice score (class):', str(dice_score)) if cfg['evaluate_test_set']: print('Predicting test patches...') # setup directory to save plots if os.path.isdir(cfg['plot_dir_test']): shutil.rmtree(cfg['plot_dir_test']) os.makedirs(cfg['plot_dir_test'], exist_ok=True) # test dataset without transformations and normalization for image visualization test_dataset_vis = ArtifactDataset(df_path=cfg['test_data'], classes=cfg['classes'], ink_filters=cfg['ink_filters']) # keep track of test masks test_masks = [] test_preds = [] for n in range(len(test_dataset)): image_vis = test_dataset_vis[n][0].astype('uint8') image_vis = image_vis.transpose(1, 2, 0) image, gt_mask = test_dataset[n] gt_mask = gt_mask.transpose(1, 2, 0) gt_mask = np.argmax(gt_mask, axis=2) if not binary else gt_mask.squeeze() gt_mask = gt_mask.astype(np.uint8) test_masks.append(gt_mask) x_tensor = torch.from_numpy(image).to(cfg['device']).unsqueeze(0) pr_mask, _ = model.predict(x_tensor) pr_mask = pr_mask.squeeze(axis=0).cpu().numpy().round() pr_mask = pr_mask.transpose(1, 2, 0) pr_mask = np.argmax(pr_mask, axis=2) if not binary else pr_mask.squeeze() pr_mask = pr_mask.astype(np.uint8) test_preds.append(pr_mask) save_predictions(out_path=cfg['plot_dir_test'], index=n + 1, image=image_vis, ground_truth_mask=gt_mask, predicted_mask=pr_mask) # calculate dice per class test_masks = np.stack(test_masks, axis=0) test_masks = test_masks.flatten() test_preds = np.stack(test_preds, axis=0) test_preds = 
test_preds.flatten() dice_score = f1_score(y_true=test_masks, y_pred=test_preds, average=None) neptune.log_text('test_dice_class', str(dice_score)) print('Test dice score (class):', str(dice_score)) # end of training process print('Finished training!')
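# For reference, per-class IoU can be read off the same flattened arrays;
# sklearn's jaccard_score is the IoU counterpart of f1_score's dice:
# from sklearn.metrics import jaccard_score
# iou_score = jaccard_score(y_true=test_masks, y_pred=test_preds, average=None)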