import os

import numpy as np
import torch
from tqdm import tqdm

# Project-local imports, assuming the usual pytorch-deeplab-xception layout.
from mypath import Path
from dataloaders import make_data_loader
from modeling.deeplab import DeepLab
from modeling.sync_batchnorm.replicate import patch_replication_callback
from utils.calculate_weights import calculate_weigths_labels
from utils.loss import SegmentationLosses
from utils.lr_scheduler import LR_Scheduler
from utils.metrics import Evaluator
from utils.saver import Saver
from utils.summaries import TensorboardSummary


class Trainer(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()

        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = \
            make_data_loader(args, **kwargs)

        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)

        # Backbone parameters train at the base lr, the decoder head at 10x.
        train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
                        {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]

        # Define Optimizer
        optimizer = torch.optim.SGD(train_params,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)

        # Define Criterion: whether to use class-balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)

        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                      args.epochs, len(self.train_loader))

        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError(
                    "=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        train_loss = 0.0
        self.model.train()
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(self.train_loader):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            # For the AICUP regression task the output must lie in [0, 1]
            if self.args.dataset == 'aicup' and self.args.task == 'regression':
                output = torch.sigmoid(output)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)

            # Show three batches of inference results per epoch
            # (max(1, ...) guards against loaders shorter than 3 batches).
            if i % max(1, num_img_tr // 3) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output,
                                             global_step)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d], Loss: %.3f' %
              (epoch, i * self.args.batch_size + image.data.shape[0],
               train_loss))

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            state_dict = (self.model.module.state_dict()
                          if hasattr(self.model, 'module')
                          else self.model.state_dict())
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': state_dict,
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def validation(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        # This variant selects the best model by class-mean accuracy
        # rather than mIoU.
        new_pred = Acc_class
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            state_dict = (self.model.module.state_dict()
                          if hasattr(self.model, 'module')
                          else self.model.state_dict())
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': state_dict,
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)
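
# --- Usage sketch (illustration only, not part of the original file) ---
# A minimal driver for the Trainer above. The `args` namespace is assumed to
# carry the fields the class reads (start_epoch, epochs, no_val, ...); in the
# real project it would come from argparse.

def run_training(args):
    """Standard epoch loop over Trainer.training / Trainer.validation."""
    trainer = Trainer(args)
    for epoch in range(args.start_epoch, args.epochs):
        trainer.training(epoch)
        if not args.no_val:
            trainer.validation(epoch)
    trainer.writer.close()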
# Variant: configuration-driven Trainer (a nested `config` dict instead of
# argparse args). Assumes the same project-local imports as above, plus
# initialize_data_loader from the project's dataloaders package.


class Trainer(object):
    def __init__(self, config):
        self.config = config
        self.best_pred = 0.0

        # Define Saver
        self.saver = Saver(config)
        self.saver.save_experiment_config()

        # Define Tensorboard Summary
        self.summary = TensorboardSummary(
            self.config['training']['tensorboard']['log_dir'])
        self.writer = self.summary.create_summary()

        self.train_loader, self.val_loader, self.test_loader, self.nclass = \
            initialize_data_loader(config)

        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=self.config['network']['backbone'],
                        output_stride=self.config['image']['out_stride'],
                        sync_bn=self.config['network']['sync_bn'],
                        freeze_bn=self.config['network']['freeze_bn'])

        train_params = [{'params': model.get_1x_lr_params(),
                         'lr': self.config['training']['lr']},
                        {'params': model.get_10x_lr_params(),
                         'lr': self.config['training']['lr'] * 10}]

        # Define Optimizer
        optimizer = torch.optim.SGD(
            train_params,
            momentum=self.config['training']['momentum'],
            weight_decay=self.config['training']['weight_decay'],
            nesterov=self.config['training']['nesterov'])

        # Define Criterion: whether to use class-balanced weights
        if self.config['training']['use_balanced_weights']:
            classes_weights_path = os.path.join(
                self.config['dataset']['base_path'],
                self.config['dataset']['dataset_name'] +
                '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(
                    self.config, self.config['dataset']['dataset_name'],
                    self.train_loader, self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight,
            cuda=self.config['network']['use_cuda']).build_loss(
                mode=self.config['training']['loss_type'])
        self.model, self.optimizer = model, optimizer

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)

        # Define lr scheduler
        self.scheduler = LR_Scheduler(self.config['training']['lr_scheduler'],
                                      self.config['training']['lr'],
                                      self.config['training']['epochs'],
                                      len(self.train_loader))

        # Using cuda
        if self.config['network']['use_cuda']:
            self.model = torch.nn.DataParallel(self.model)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()

        # Resuming checkpoint
        if self.config['training']['weights_initialization']['use_pretrained_weights']:
            restore_from = \
                self.config['training']['weights_initialization']['restore_from']
            if not os.path.isfile(restore_from):
                raise RuntimeError(
                    "=> no checkpoint found at '{}'".format(restore_from))
            if self.config['network']['use_cuda']:
                checkpoint = torch.load(restore_from)
            else:
                checkpoint = torch.load(restore_from,
                                        map_location={'cuda:0': 'cpu'})
            self.config['training']['start_epoch'] = checkpoint['epoch']
            self.model.load_state_dict(checkpoint['state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                restore_from, checkpoint['epoch']))

    def training(self, epoch):
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.config['network']['use_cuda']:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)

            # Show ten batches of inference results per epoch
            if i % max(1, num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(
                    self.writer, self.config['dataset']['dataset_name'],
                    image, target, output, global_step)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.config['training']['batch_size'] +
               image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        # Save the last checkpoint every epoch
        self.saver.save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'best_pred': self.best_pred,
        }, is_best=False, filename='checkpoint_last.pth.tar')

        # If training on a subset, reshuffle the data
        if self.config['training']['train_on_subset']['enabled']:
            self.train_loader.dataset.shuffle_dataset()

    def validation(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.config['network']['use_cuda']:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Val loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.config['training']['batch_size'] +
               image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best=True, filename='checkpoint_best.pth.tar')
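
# --- Illustration (assumption): the 'poly' policy behind LR_Scheduler ---
# LR_Scheduler is imported from the project's utilities. This sketch assumes
# it implements the common DeepLab "poly" schedule,
#     lr(t) = base_lr * (1 - t / T) ** 0.9,
# and is only meant to clarify the (epoch, iteration) -> learning-rate
# mapping used by the per-iteration scheduler calls above. The 10x head
# multiplier mirrors the two parameter groups built in __init__.

def poly_lr(base_lr, epoch, i, iters_per_epoch, num_epochs, power=0.9):
    """Polynomially decayed learning rate at global step (epoch, i)."""
    t = epoch * iters_per_epoch + i
    T = num_epochs * iters_per_epoch
    return base_lr * (1 - t / T) ** power

def apply_poly_lr(optimizer, base_lr, epoch, i, iters_per_epoch, num_epochs):
    lr = poly_lr(base_lr, epoch, i, iters_per_epoch, num_epochs)
    # backbone group at lr, decoder-head group at 10 * lr (as in train_params)
    optimizer.param_groups[0]['lr'] = lr
    if len(optimizer.param_groups) > 1:
        optimizer.param_groups[1]['lr'] = lr * 10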
# Variant: cfg-module-driven Trainer. Assumes a project config module `cfg`,
# helpers init_seeds / init_dirs / gpu.select_device, torch.optim as optim,
# and the same DeepLab / losses / scheduler / evaluator imports as above.


class Trainer(object):
    def __init__(self, weight_path, resume, gpu_id):
        init_seeds(1)
        init_dirs("result")
        self.device = gpu.select_device(gpu_id)
        self.start_epoch = 0
        self.best_mIoU = 0.
        self.epochs = cfg.TRAIN["EPOCHS"]
        self.weight_path = weight_path
        self.train_loader, self.val_loader, _, self.num_class = \
            make_data_loader()
        self.model = DeepLab(num_classes=self.num_class,
                             backbone="resnet",
                             output_stride=16,
                             sync_bn=False,
                             freeze_bn=False).to(self.device)
        train_params = [{'params': self.model.get_1x_lr_params(),
                         'lr': cfg.TRAIN["LR_INIT"]},
                        {'params': self.model.get_10x_lr_params(),
                         'lr': cfg.TRAIN["LR_INIT"] * 10}]
        self.optimizer = optim.SGD(train_params,
                                   momentum=cfg.TRAIN["MOMENTUM"],
                                   weight_decay=cfg.TRAIN["WEIGHT_DECAY"])
        self.criterion = SegmentationLosses().build_loss(
            mode=cfg.TRAIN["LOSS_TYPE"])
        self.scheduler = LR_Scheduler(mode=cfg.TRAIN["LR_SCHEDULER"],
                                      base_lr=cfg.TRAIN["LR_INIT"],
                                      num_epochs=self.epochs,
                                      iters_per_epoch=len(self.train_loader))
        self.evaluator = Evaluator(self.num_class)
        self.saver = Saver()
        self.summary = TensorboardSummary(os.path.join("result", "run"))

        if resume:
            self.__resume_model_weights()

    def __resume_model_weights(self):
        last_weight = os.path.join("result", "weights", "last.pt")
        chkpt = torch.load(last_weight, map_location=self.device)
        self.model.load_state_dict(chkpt['model'])
        self.start_epoch = chkpt['epoch'] + 1
        if chkpt['optimizer'] is not None:
            self.optimizer.load_state_dict(chkpt['optimizer'])
            self.best_mIoU = chkpt['best_mIoU']
        del chkpt
        print("resumed model weights from: {}".format(last_weight))

    def __training(self, epoch):
        self.model.train()
        train_loss = 0.0
        for i, sample in enumerate(self.train_loader):
            image, target = sample["image"], sample["label"]
            image = image.to(self.device)
            target = target.to(self.device)

            self.scheduler(self.optimizer, i, epoch, self.best_mIoU)
            self.optimizer.zero_grad()

            out = self.model(image)
            loss = self.criterion(logit=out, target=target)
            loss.backward()
            self.optimizer.step()

            # Update running mean of the tracked loss
            train_loss = (train_loss * i + loss.item()) / (i + 1)

            # Print or log
            if i % 20 == 0:
                s = ('Epoch:[ {:d} | {:d} ] Batch:[ {:d} | {:d} ] '
                     'loss: {:.4f} lr: {:.6f}').format(
                         epoch, self.epochs - 1, i,
                         len(self.train_loader) - 1, train_loss,
                         self.optimizer.param_groups[0]['lr'])
                print(s)

            # Write
            global_step = i + len(self.train_loader) * epoch
            self.summary.writer.add_scalar('train/total_loss_iter',
                                           loss.item(), global_step)
            self.summary.writer.add_scalar('train/total_loss_epoch',
                                           train_loss, epoch)
            if i % max(1, len(self.train_loader) // 10) == 0:
                self.summary.visualize_image(cfg.DATA["TYPE"], image, target,
                                             out, global_step)

        # Save last.pt (this variant only does so for the first 20 epochs).
        # The checkpoint key is 'best_mIoU' so that __resume_model_weights
        # can read it back.
        if epoch <= 20:
            self.saver.save_checkpoint(state={
                'epoch': epoch,
                'best_mIoU': self.best_mIoU,
                'model': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict()
            }, is_best=False)

    def __validating(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        test_loss = 0.0
        for i, sample in enumerate(self.val_loader):
            image, target = sample["image"], sample["label"]
            image = image.to(self.device)
            target = target.to(self.device)
            with torch.no_grad():
                out = self.model(image)
            loss = self.criterion(logit=out, target=target)
            test_loss += loss.item()
            pred = out.data.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            target = target.cpu().numpy()
            self.evaluator.add_batch(target, pred)

        # Calculate the metrics
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()

        # Write
        self.summary.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.summary.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.summary.writer.add_scalar('val/Acc', Acc, epoch)
        self.summary.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.summary.writer.add_scalar('val/fwIoU', FWIoU, epoch)

        # Save
        is_best = False
        if mIoU > self.best_mIoU:
            self.best_mIoU = mIoU
            is_best = True
        self.saver.save_checkpoint(state={
            'epoch': epoch,
            'best_mIoU': self.best_mIoU,
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }, is_best=is_best)

        # Print
        print('*' * 20 + "Validate" + '*' * 20)
        print("Acc: {}\nAcc_class: {}\nmIoU: {}\nfwIoU: {}\n"
              "Loss: {:.3f}\nbest_mIoU: {}".format(
                  Acc, Acc_class, mIoU, FWIoU, test_loss, self.best_mIoU))

    def train(self):
        print(self.model)
        for epoch in range(self.start_epoch, self.epochs):
            self.__training(epoch)
            self.__validating(epoch)
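
# The running-mean update used in __training above,
#     train_loss = (train_loss * i + loss.item()) / (i + 1),
# is the standard incremental average: after processing i + 1 batches it
# equals the arithmetic mean of the losses seen so far. A small
# self-contained check:

def running_mean_demo(losses):
    mean = 0.0
    for i, x in enumerate(losses):
        mean = (mean * i + x) / (i + 1)   # same update as in __training
    return mean

assert abs(running_mean_demo([1.0, 2.0, 6.0]) - 3.0) < 1e-12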
# Variant: domain-adaptation Trainer (adversarial training against a
# Discriminator on ADVENT-style entropy maps). Assumes project-local imports
# for Discriminator, MinimizeEntropyLoss, BottleneckLoss, InstanceLoss,
# prob_2_entropy, bce_loss, make_target_data_loader, plus json, visdom and
# torch.nn.functional as F.


class Trainer(object):
    def __init__(self, config, args):
        self.args = args
        self.config = config
        self.visdom = args.visdom
        if args.visdom:
            self.vis = visdom.Visdom(env=os.getcwd().split('/')[-1],
                                     port=8888)

        # Define Dataloader
        self.train_loader, self.val_loader, self.test_loader, self.nclass = \
            make_data_loader(config)
        self.target_train_loader, self.target_val_loader, \
            self.target_test_loader, _ = make_target_data_loader(config)

        # Define network
        self.model = DeepLab(num_classes=self.nclass,
                             backbone=config.backbone,
                             output_stride=config.out_stride,
                             sync_bn=config.sync_bn,
                             freeze_bn=config.freeze_bn)
        self.D = Discriminator(num_classes=self.nclass, ndf=16)

        train_params = [{'params': self.model.get_1x_lr_params(),
                         'lr': config.lr},
                        {'params': self.model.get_10x_lr_params(),
                         'lr': config.lr * config.lr_ratio}]

        # Define Optimizer
        self.optimizer = torch.optim.SGD(train_params,
                                         momentum=config.momentum,
                                         weight_decay=config.weight_decay)
        self.D_optimizer = torch.optim.Adam(self.D.parameters(),
                                            lr=config.lr,
                                            betas=(0.9, 0.99))

        # Define Criterion (no class-balanced weights in this variant)
        self.criterion = SegmentationLosses(
            weight=None, cuda=args.cuda).build_loss(mode=config.loss)
        self.entropy_mini_loss = MinimizeEntropyLoss()
        self.bottleneck_loss = BottleneckLoss()
        self.instance_loss = InstanceLoss()

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)

        # Define lr scheduler
        self.scheduler = LR_Scheduler(config.lr_scheduler, config.lr,
                                      config.epochs, len(self.train_loader),
                                      config.lr_step, config.warmup_epochs)
        self.summary = TensorboardSummary('./train_log')

        # Labels for adversarial training
        self.source_label = 0
        self.target_label = 1

        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()
            self.D = torch.nn.DataParallel(self.D)
            patch_replication_callback(self.D)
            self.D = self.D.cuda()

        self.best_pred_source = 0.0
        self.best_pred_target = 0.0

        # Resuming checkpoint
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError(
                    "=> no checkpoint found at '{}'".format(args.resume))
            if args.cuda:
                checkpoint = torch.load(args.resume)
                self.model.module.load_state_dict(checkpoint)
            else:
                # map_location belongs to torch.load, not load_state_dict
                checkpoint = torch.load(args.resume,
                                        map_location=torch.device('cpu'))
                self.model.load_state_dict(checkpoint)
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, args.start_epoch))

    def training(self, epoch):
        # Only seg/adv/discriminator losses are populated in this variant;
        # the remaining sums are kept for the disabled auxiliary losses.
        train_loss, seg_loss_sum, bn_loss_sum, entropy_loss_sum, \
            adv_loss_sum, d_loss_sum, ins_loss_sum = \
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
        self.model.train()
        if self.config.freeze_bn:
            self.model.module.freeze_bn()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        target_train_iterator = iter(self.target_train_loader)
        for i, sample in enumerate(tbar):
            itr = epoch * len(self.train_loader) + i
            self.summary.writer.add_scalar(
                'Train/lr', self.optimizer.param_groups[0]['lr'], itr)
            A_image, A_target = sample['image'], sample['label']

            # Get one batch from the target domain, restarting the iterator
            # when it is exhausted.
            try:
                target_sample = next(target_train_iterator)
            except StopIteration:
                target_train_iterator = iter(self.target_train_loader)
                target_sample = next(target_train_iterator)
            B_image, B_target, B_image_pair = (target_sample['image'],
                                               target_sample['label'],
                                               target_sample['image_pair'])
            if self.args.cuda:
                A_image, A_target = A_image.cuda(), A_target.cuda()
                B_image, B_target, B_image_pair = (B_image.cuda(),
                                                   B_target.cuda(),
                                                   B_image_pair.cuda())

            self.scheduler(self.optimizer, i, epoch, self.best_pred_source,
                           self.best_pred_target, self.config.lr_ratio)
            self.scheduler(self.D_optimizer, i, epoch, self.best_pred_source,
                           self.best_pred_target, self.config.lr_ratio)

            # B_image_pair feeds an optional unsupervised instance loss that
            # is disabled in this variant.
            A_output, A_feat, A_low_feat = self.model(A_image)
            B_output, B_feat, B_low_feat = self.model(B_image)

            self.optimizer.zero_grad()
            self.D_optimizer.zero_grad()

            # Train the seg network: freeze the discriminator
            for param in self.D.parameters():
                param.requires_grad = False

            # Supervised loss on the source domain
            seg_loss = self.criterion(A_output, A_target)
            main_loss = seg_loss

            # Adversarial loss: push the discriminator to label the
            # target-domain entropy map as source.
            D_out = self.D(prob_2_entropy(F.softmax(B_output, dim=1)))
            adv_loss = bce_loss(D_out, self.source_label)
            main_loss += self.config.lambda_adv * adv_loss
            main_loss.backward()

            # Train the discriminator on detached predictions
            for param in self.D.parameters():
                param.requires_grad = True
            A_output_detach = A_output.detach()
            B_output_detach = B_output.detach()
            # source
            D_source = self.D(prob_2_entropy(F.softmax(A_output_detach,
                                                       dim=1)))
            source_loss = bce_loss(D_source, self.source_label) / 2
            # target
            D_target = self.D(prob_2_entropy(F.softmax(B_output_detach,
                                                       dim=1)))
            target_loss = bce_loss(D_target, self.target_label) / 2
            d_loss = source_loss + target_loss
            d_loss.backward()

            self.optimizer.step()
            self.D_optimizer.step()

            seg_loss_sum += seg_loss.item()
            adv_loss_sum += self.config.lambda_adv * adv_loss.item()
            d_loss_sum += d_loss.item()
            train_loss += seg_loss.item()

            self.summary.writer.add_scalar('Train/SegLoss', seg_loss.item(),
                                           itr)
            self.summary.writer.add_scalar('Train/AdvLoss', adv_loss.item(),
                                           itr)
            self.summary.writer.add_scalar('Train/DiscriminatorLoss',
                                           d_loss.item(), itr)
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))

        # Show the results of the last iteration
        print("Add Train images at epoch " + str(epoch))
        self.summary.visualize_image('Train-Source', self.config.dataset,
                                     A_image, A_target, A_output, epoch, 5)
        self.summary.visualize_image('Train-Target', self.config.target,
                                     B_image, B_target, B_output, epoch, 5)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.config.batch_size + A_image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

    def validation(self, epoch):
        def get_metrics(tbar, if_source=False):
            self.evaluator.reset()
            test_loss = 0.0
            for i, sample in enumerate(tbar):
                image, target = sample['image'], sample['label']
                if self.args.cuda:
                    image, target = image.cuda(), target.cuda()
                with torch.no_grad():
                    output, low_feat, feat = self.model(image)
                loss = self.criterion(output, target)
                test_loss += loss.item()
                tbar.set_description('Test loss: %.3f' %
                                     (test_loss / (i + 1)))
                pred = output.data.cpu().numpy()
                target_ = target.cpu().numpy()
                pred = np.argmax(pred, axis=1)
                # Add batch sample into evaluator
                self.evaluator.add_batch(target_, pred)

            if if_source:
                print("Add Validation-Source images at epoch " + str(epoch))
                self.summary.visualize_image('Val-Source',
                                             self.config.dataset, image,
                                             target, output, epoch, 5)
            else:
                print("Add Validation-Target images at epoch " + str(epoch))
                self.summary.visualize_image('Val-Target', self.config.target,
                                             image, target, output, epoch, 5)

            # Fast test during the training
            Acc = self.evaluator.Building_Acc()
            IoU = self.evaluator.Building_IoU()
            mIoU = self.evaluator.Mean_Intersection_over_Union()

            if if_source:
                print('Validation on source:')
            else:
                print('Validation on target:')
            print('[Epoch: %d, numImages: %5d]' %
                  (epoch, i * self.config.batch_size + image.data.shape[0]))
            print("Acc:{}, IoU:{}, mIoU:{}".format(Acc, IoU, mIoU))
            print('Loss: %.3f' % test_loss)

            if if_source:
                self.summary.writer.add_scalar('Val/SourceAcc', Acc, epoch)
                self.summary.writer.add_scalar('Val/SourceIoU', IoU, epoch)
            else:
                self.summary.writer.add_scalar('Val/TargetAcc', Acc, epoch)
                self.summary.writer.add_scalar('Val/TargetIoU', IoU, epoch)

            return Acc, IoU, mIoU

        self.model.eval()
        tbar_source = tqdm(self.val_loader, desc='\r')
        tbar_target = tqdm(self.target_val_loader, desc='\r')
        s_acc, s_iou, s_miou = get_metrics(tbar_source, True)
        t_acc, t_iou, t_miou = get_metrics(tbar_target, False)

        new_pred_source = s_iou
        new_pred_target = t_iou
        if (new_pred_source > self.best_pred_source
                or new_pred_target > self.best_pred_target):
            is_best = True
            self.best_pred_source = max(new_pred_source,
                                        self.best_pred_source)
            self.best_pred_target = max(new_pred_target,
                                        self.best_pred_target)
        print('Saving state, epoch:', epoch)
        torch.save(self.model.module.state_dict(),
                   self.args.save_folder + 'models/' +
                   'epoch' + str(epoch) + '.pth')
        loss_file = {'s_Acc': s_acc, 's_IoU': s_iou, 's_mIoU': s_miou,
                     't_Acc': t_acc, 't_IoU': t_iou, 't_mIoU': t_miou}
        with open(os.path.join(self.args.save_folder, 'eval',
                               'epoch' + str(epoch) + '.json'), 'w') as f:
            json.dump(loss_file, f)
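
# prob_2_entropy and bce_loss above come from the project's utilities. The
# sketches below assume the ADVENT-style definitions (per-pixel weighted
# self-information, normalised by log2(C) so values fall in [0, 1], and BCE
# against a constant domain label); they illustrate the intent, not the
# project's exact code.

import numpy as np
import torch
import torch.nn.functional as F

def prob_2_entropy(prob):
    """Softmax probabilities (N, C, H, W) -> weighted self-information maps."""
    n, c, h, w = prob.size()
    return -torch.mul(prob, torch.log2(prob + 1e-30)) / np.log2(c)

def bce_loss(y_pred, y_label):
    """BCE of discriminator logits against a constant domain label (0/1)."""
    y_truth = torch.full_like(y_pred, y_label)
    return F.binary_cross_entropy_with_logits(y_pred, y_truth)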
# Active-learning driver. Assumes project-local imports for make_dataloader,
# Saver, TensorboardSummary, Trainer, EarlyStopChecker (a sketch follows
# main()), get_active_selection_class, get_max_subset_active_selector and
# constants.
import argparse
import math
import os
import random

import torch


def main():
    parser = argparse.ArgumentParser(
        description="PyTorch DeeplabV3Plus Training")
    parser.add_argument('--backbone', type=str, default='resnet',
                        choices=['resnet', 'xception', 'drn', 'mobilenet'],
                        help='backbone name (default: resnet)')
    parser.add_argument('--out-stride', type=int, default=16,
                        help='network output stride (default: 16)')
    parser.add_argument('--dataset', type=str,
                        default='active_cityscapes_image',
                        choices=['active_cityscapes_image',
                                 'active_cityscapes_region',
                                 'active_pascal_image',
                                 'active_pascal_region'],
                        help='dataset name (default: active_cityscapes)')
    parser.add_argument('--use-sbd', action='store_true', default=False,
                        help='whether to use SBD dataset (default: False)')
    parser.add_argument('--base-size', type=int, default=513,
                        help='base image size')
    parser.add_argument('--crop-size', type=int, default=513,
                        help='crop image size')
    parser.add_argument('--sync-bn', type=bool, default=None,
                        help='whether to use sync bn (default: auto)')
    parser.add_argument('--freeze-bn', type=bool, default=False,
                        help='whether to freeze bn parameters '
                             '(default: False)')
    parser.add_argument('--loss-type', type=str, default='ce',
                        choices=['ce', 'focal'],
                        help='loss func type (default: ce)')
    parser.add_argument('--workers', type=int, default=4, help='num workers')
    # training hyper params
    parser.add_argument('--epochs', type=int, default=None, metavar='N',
                        help='number of epochs to train (default: auto)')
    parser.add_argument('--start_epoch', type=int, default=0, metavar='N',
                        help='start epoch (default: 0)')
    parser.add_argument('--batch-size', type=int, default=None, metavar='N',
                        help='input batch size for training (default: auto)')
    parser.add_argument('--test-batch-size', type=int, default=None,
                        metavar='N',
                        help='input batch size for testing (default: auto)')
    parser.add_argument('--use-balanced-weights', action='store_true',
                        default=False,
                        help='whether to use balanced weights '
                             '(default: False)')
    # optimizer params
    parser.add_argument('--lr', type=float, default=None, metavar='LR',
                        help='learning rate (default: auto)')
    parser.add_argument('--lr-scheduler', type=str, default='poly',
                        choices=['poly', 'step', 'cos'],
                        help='lr scheduler mode (default: poly)')
    parser.add_argument('--use-lr-scheduler', action='store_true',
                        default=False, help='use learning rate scheduler')
    parser.add_argument('--optimizer', type=str, default='SGD',
                        choices=['SGD', 'Adam'])
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                        help='momentum (default: 0.9)')
    parser.add_argument('--weight-decay', type=float, default=5e-4,
                        metavar='M', help='w-decay (default: 5e-4)')
    parser.add_argument('--nesterov', action='store_true', default=False,
                        help='whether to use nesterov (default: False)')
    # cuda, seed and logging
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--gpu-ids', type=str, default='0',
                        help='which gpus to train on: a comma-separated '
                             'list of integers only (default: 0)')
    parser.add_argument('--seed', type=int, default=-1, metavar='S',
                        help='random seed (default: random)')
    # checkpointing
    parser.add_argument('--resume', type=int, default=0,
                        help='iteration to resume from')
    parser.add_argument('--checkname', type=str, default=None,
                        help='set the checkpoint name')
    parser.add_argument('--resume-selections', type=str, default=None,
                        help='resume selections file')
    # finetuning pre-trained models
    parser.add_argument('--ft', action='store_true', default=False,
                        help='finetuning on a different dataset')
    # evaluation options
    parser.add_argument('--eval-interval', type=int, default=1,
                        help='evaluation interval (default: 1)')
    parser.add_argument('--no-val', action='store_true', default=False,
                        help='skip validation during training')
    parser.add_argument('--overfit', action='store_true', default=False,
                        help='overfit to one sample')
    parser.add_argument('--seed_set', type=str, default='set_0.txt',
                        help='initial labeled set')
    parser.add_argument('--active-batch-size', type=int, default=50,
                        help='batch size queried from oracle')
    parser.add_argument('--active-selection-mode', type=str, default='random',
                        choices=['random', 'variance', 'coreset',
                                 'ceal_confidence', 'ceal_margin',
                                 'ceal_entropy', 'ceal_fusion',
                                 'ceal_entropy_weakly_labeled',
                                 'variance_representative', 'noise_image',
                                 'noise_feature', 'noise_variance',
                                 'accuracy_labels', 'accuracy_eval'],
                        help='method to select new samples')
    parser.add_argument('--active-region-size', type=int, default=129,
                        help='size of regions in case a region dataset '
                             'is used')
    parser.add_argument('--max-iterations', type=int, default=1000,
                        help='maximum active selection iterations')
    parser.add_argument('--min-improvement', type=float, default=0.01,
                        help='minimum improvement for early stopping')
    parser.add_argument('--weak-label-entropy-threshold', type=float,
                        default=0.80,
                        help='initial entropy threshold for weak labels')
    parser.add_argument('--weak-label-threshold-decay', type=float,
                        default=0.015,
                        help='decay for the threshold on weak labels')
    parser.add_argument('--monitor-directory', type=str, default=None)
    parser.add_argument('--memory-hog', action='store_true', default=False,
                        help='memory_hog mode')
    parser.add_argument('--no-early-stop', action='store_true', default=False,
                        help='no early stopping')
    parser.add_argument('--architecture', type=str, default='deeplab',
                        choices=['deeplab', 'enet', 'fastscnn'])

    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if args.cuda:
        try:
            args.gpu_ids = [int(s) for s in args.gpu_ids.split(',')]
        except ValueError:
            raise ValueError('Argument --gpu_ids must be a comma-separated '
                             'list of integers only')

    if args.sync_bn is None:
        args.sync_bn = args.cuda and len(args.gpu_ids) > 1

    # default settings for epochs, batch_size and lr
    if args.epochs is None:
        epochs = {'coco': 30, 'cityscapes': 200,
                  'active_cityscapes': 200, 'pascal': 50}
        args.epochs = epochs[args.dataset.lower()]
    if args.batch_size is None:
        args.batch_size = 4 * len(args.gpu_ids)
    if args.test_batch_size is None:
        args.test_batch_size = args.batch_size
    if args.lr is None:
        lrs = {'coco': 0.1, 'cityscapes': 0.01,
               'active_cityscapes': 0.01, 'pascal': 0.007}
        args.lr = (lrs[args.dataset.lower()] /
                   (4 * len(args.gpu_ids)) * args.batch_size)
    if args.checkname is None:
        args.checkname = 'deeplab-' + str(args.backbone)

    mc_dropout = args.active_selection_mode in (
        'variance', 'variance_representative', 'noise_variance')
    # representative selection pre-selects a larger candidate pool
    args.active_batch_size = (
        args.active_batch_size * 2
        if args.active_selection_mode == 'variance_representative'
        else args.active_batch_size)

    print()
    print(args)

    # manual seeding
    if args.seed == -1:
        args.seed = int(random.random() * 2000)
    print('Using random seed =', args.seed)
    torch.manual_seed(args.seed)

    kwargs = {'pin_memory': False, 'init_set': args.seed_set,
              'memory_hog': args.memory_hog}
    dataloaders = make_dataloader(args.dataset, args.base_size,
                                  args.crop_size, args.batch_size,
                                  args.workers, args.overfit, **kwargs)
    training_set = dataloaders[0]
    dataloaders = dataloaders[1:]

    saver = Saver(args, remove_existing=False)
    saver.save_experiment_config()
    summary = TensorboardSummary(saver.experiment_dir)
    writer = summary.create_summary()
    print()

    active_selector = get_active_selection_class(
        args.active_selection_mode, training_set.NUM_CLASSES,
        training_set.env, args.crop_size, args.batch_size)
    # used only for the representativeness cases
    max_subset_selector = get_max_subset_active_selector(
        training_set.env, args.crop_size, args.batch_size)

    total_active_selection_iterations = min(
        len(training_set.image_paths) // args.active_batch_size - 1,
        args.max_iterations)

    if args.resume != 0 and args.resume_selections is not None:
        seed_size = len(training_set)
        with open(os.path.join(saver.experiment_dir,
                               args.resume_selections), "r") as fptr:
            paths = [u'{}'.format(x.strip()).encode('ascii')
                     for x in fptr.readlines() if x != '']
        training_set.expand_training_set(paths[seed_size:])
        assert len(training_set) == (args.resume * args.active_batch_size +
                                     seed_size)

    assert (args.eval_interval <= args.epochs
            and args.epochs % args.eval_interval == 0)

    trainer = Trainer(args, dataloaders, mc_dropout)
    trainer.initialize()

    for selection_iter in range(args.resume,
                                total_active_selection_iterations):
        print(f'ActiveIteration-{selection_iter:03d}'
              f'/{total_active_selection_iterations:03d}')
        fraction_of_data_labeled = round(
            training_set.get_fraction_of_labeled_data() * 100)

        if args.dataset.endswith('_image'):
            trainer.setup_saver_and_summary(
                fraction_of_data_labeled, training_set.current_image_paths)
        elif args.dataset.endswith('_region'):
            trainer.setup_saver_and_summary(
                fraction_of_data_labeled, training_set.current_image_paths,
                regions=[training_set.current_paths_to_regions_map[x]
                         for x in training_set.current_image_paths])
        else:
            raise NotImplementedError

        len_dataset_before = len(training_set)
        training_set.make_dataset_multiple_of_batchsize(args.batch_size)
        print(f'\nExpanding training set with {len_dataset_before} images '
              f'to {len(training_set)} images')
        trainer.initialize()

        if not args.no_early_stop:
            early_stop = EarlyStopChecker(
                patience=5, min_improvement=args.min_improvement)

        best_mIoU = 0
        best_Acc = 0
        best_Acc_class = 0
        best_FWIoU = 0

        for outer_epoch in range(args.epochs // args.eval_interval):
            train_loss = 0
            for inner_epoch in range(args.eval_interval):
                train_loss += trainer.training(
                    outer_epoch * args.eval_interval + inner_epoch)
            test_loss, mIoU, Acc, Acc_class, FWIoU, visualizations = \
                trainer.validation(
                    outer_epoch * args.eval_interval + inner_epoch)
            if mIoU > best_mIoU:
                best_mIoU = mIoU
            if Acc > best_Acc:
                best_Acc = Acc
            if Acc_class > best_Acc_class:
                best_Acc_class = Acc_class
            if FWIoU > best_FWIoU:
                best_FWIoU = FWIoU
            # check for early stopping
            if not args.no_early_stop and early_stop(mIoU):
                print(f'Early stopping triggered after '
                      f'{outer_epoch * args.eval_interval + inner_epoch} '
                      f'epochs')
                break

        training_set.reset_dataset()
        writer.add_scalar('active_loop/train_loss',
                          train_loss / len(training_set),
                          fraction_of_data_labeled)
        writer.add_scalar('active_loop/val_loss', test_loss,
                          fraction_of_data_labeled)
        writer.add_scalar('active_loop/mIoU', best_mIoU,
                          fraction_of_data_labeled)
        writer.add_scalar('active_loop/Acc', best_Acc,
                          fraction_of_data_labeled)
        writer.add_scalar('active_loop/Acc_class', best_Acc_class,
                          fraction_of_data_labeled)
        writer.add_scalar('active_loop/fwIoU', best_FWIoU,
                          fraction_of_data_labeled)
        summary.visualize_image(writer, args.dataset, visualizations[0],
                                visualizations[1], visualizations[2],
                                len(training_set.current_image_paths))
        trainer.writer.close()

        if selection_iter == (total_active_selection_iterations - 1):
            break

        # Reload the best weights before scoring the unlabeled pool
        checkpoint = torch.load(
            os.path.join(trainer.saver.experiment_dir, 'best.pth.tar'))
        trainer.model.module.load_state_dict(checkpoint['state_dict'])
        trainer.model.eval()

        if args.active_selection_mode == 'random':
            training_set.expand_training_set(
                active_selector.get_random_uncertainity(
                    training_set.remaining_image_paths,
                    args.active_batch_size))
        elif args.active_selection_mode in ('variance',
                                            'variance_representative'):
            if args.dataset.endswith('_image'):
                print('Calculating entropies..')
                selected_images = active_selector.get_vote_entropy_for_images(
                    trainer.model, training_set.remaining_image_paths,
                    args.active_batch_size)
                if args.active_selection_mode == 'variance_representative':
                    selected_images = \
                        max_subset_selector.get_representative_images(
                            trainer.model, training_set.image_paths,
                            selected_images)
                training_set.expand_training_set(selected_images)
            elif args.dataset.endswith('_region'):
                print('Creating region maps..')
                regions, counts = active_selector.create_region_maps(
                    trainer.model, training_set.image_paths,
                    training_set.get_existing_region_maps(),
                    args.active_region_size, args.active_batch_size)
                if args.active_selection_mode == 'variance_representative':
                    regions, counts = \
                        max_subset_selector.get_representative_regions(
                            trainer.model, training_set.image_paths,
                            regions, args.active_region_size)
                print(f'Got {counts}/'
                      f'{math.ceil(args.active_batch_size * args.crop_size * args.crop_size / (args.active_region_size * args.active_region_size))}'
                      f' regions')
                training_set.expand_training_set(
                    regions,
                    counts * args.active_region_size *
                    args.active_region_size)
            else:
                raise NotImplementedError
        elif args.active_selection_mode == 'coreset':
            assert args.dataset.endswith('_image'), \
                'only images supported for coreset approach'
            training_set.expand_training_set(
                active_selector.get_k_center_greedy_selections(
                    args.active_batch_size, trainer.model,
                    training_set.remaining_image_paths,
                    training_set.current_image_paths))
        elif args.active_selection_mode == 'ceal_confidence':
            training_set.expand_training_set(
                active_selector.get_least_confident_samples(
                    trainer.model, training_set.remaining_image_paths,
                    args.active_batch_size))
        elif args.active_selection_mode == 'ceal_margin':
            training_set.expand_training_set(
                active_selector.get_least_margin_samples(
                    trainer.model, training_set.remaining_image_paths,
                    args.active_batch_size))
        elif args.active_selection_mode == 'ceal_entropy':
            training_set.expand_training_set(
                active_selector.get_maximum_entropy_samples(
                    trainer.model, training_set.remaining_image_paths,
                    args.active_batch_size)[0])
        elif args.active_selection_mode == 'ceal_fusion':
            training_set.expand_training_set(
                active_selector.get_fusion_of_confidence_margin_entropy_samples(
                    trainer.model, training_set.remaining_image_paths,
                    args.active_batch_size))
        elif args.active_selection_mode == 'ceal_entropy_weakly_labeled':
            selected_samples, entropies = \
                active_selector.get_maximum_entropy_samples(
                    trainer.model, training_set.remaining_image_paths,
                    args.active_batch_size)
            training_set.clear_weak_labels()
            weak_labels = active_selector.get_weakly_labeled_data(
                trainer.model, training_set.remaining_image_paths,
                args.weak_label_entropy_threshold -
                selection_iter * args.weak_label_threshold_decay, entropies)
            # Samples queried from the oracle must not stay weakly labeled
            for sample in selected_samples:
                if sample in weak_labels:
                    del weak_labels[sample]
            training_set.expand_training_set(selected_samples)
            training_set.add_weak_labels(weak_labels)
        elif args.active_selection_mode == 'noise_image':
            print('Calculating entropies..')
            selected_images = \
                active_selector.get_vote_entropy_for_images_with_input_noise(
                    trainer.model, training_set.remaining_image_paths,
                    args.active_batch_size)
            training_set.expand_training_set(selected_images)
        elif args.active_selection_mode == 'noise_feature':
            print('Calculating entropies..')
            selected_images = \
                active_selector.get_vote_entropy_for_images_with_feature_noise(
                    trainer.model, training_set.remaining_image_paths,
                    args.active_batch_size)
            training_set.expand_training_set(selected_images)
        elif args.active_selection_mode == 'noise_variance':
            if args.dataset.endswith('_image'):
                print('Calculating entropies..')
                selected_images = \
                    active_selector.get_vote_entropy_for_batch_with_noise_and_vote_entropy(
                        trainer.model, training_set.remaining_image_paths,
                        args.active_batch_size)
                training_set.expand_training_set(selected_images)
            elif args.dataset.endswith('_region'):
                print('Creating region maps..')
                regions, counts = active_selector.create_region_maps(
                    trainer.model, training_set.image_paths,
                    training_set.get_existing_region_maps(),
                    args.active_region_size, args.active_batch_size)
                print(f'Got {counts}/'
                      f'{math.ceil(args.active_batch_size * args.crop_size * args.crop_size / (args.active_region_size * args.active_region_size))}'
                      f' regions')
                training_set.expand_training_set(
                    regions,
                    counts * args.active_region_size *
                    args.active_region_size)
        elif args.active_selection_mode == 'accuracy_labels':
            print('Evaluating accuracies..')
            selected_images = \
                active_selector.get_least_accurate_sample_using_labels(
                    trainer.model, training_set.remaining_image_paths,
                    args.active_batch_size)
            training_set.expand_training_set(selected_images)
        elif args.active_selection_mode == 'accuracy_eval':
            full_monitor_directory = os.path.join(
                constants.RUNS, args.dataset, args.monitor_directory)
            selections_file = os.path.join(
                full_monitor_directory,
                f'run_{round(training_set.get_next_est_fraction_of_labeled_data(args.active_batch_size) * 100):04d}',
                "selections.txt")
            print('Waiting for the next folder to be available..',
                  selections_file)
            selected_images = active_selector.wait_for_selected_samples(
                selections_file, training_set.remaining_image_paths)
            training_set.expand_training_set(selected_images)
        else:
            raise NotImplementedError

    writer.close()
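
# EarlyStopChecker is imported from the project's utilities. The minimal
# sketch below is only an assumption about its semantics, consistent with
# how it is called in main(): calling the instance with the monitored metric
# returns True once the metric has failed to improve by at least
# `min_improvement` for `patience` consecutive checks.

class EarlyStopChecker:
    def __init__(self, patience=5, min_improvement=0.01):
        self.patience = patience
        self.min_improvement = min_improvement
        self.best = float('-inf')
        self.bad_checks = 0

    def __call__(self, metric):
        if metric > self.best + self.min_improvement:
            self.best = metric
            self.bad_checks = 0
        else:
            self.bad_checks += 1
        return self.bad_checks >= self.patience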
# Variant: Trainer with optional mixup augmentation; the optimizer is Adam
# over the decoder (10x-lr) parameters only, replacing the usual SGD setup
# (kept below as a comment). Assumes the same project-local imports as the
# first variant, plus `from torch.autograd import Variable`.


class Trainer(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()

        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = \
            make_data_loader(args, **kwargs)

        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)

        # The usual setup would be SGD over (1x backbone, 10x head) groups:
        #   train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
        #                   {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]
        #   optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
        #                               weight_decay=args.weight_decay,
        #                               nesterov=args.nesterov)
        train_params = [{'params': model.get_10x_lr_params(), 'lr': args.lr}]
        optimizer = torch.optim.Adam(train_params,
                                     weight_decay=self.args.weight_decay)

        # Define Criterion: whether to use class-balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)

        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                      args.epochs, len(self.train_loader))

        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError(
                    "=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def mixup_data(self, x, y, alpha=0.2, use_cuda=True):
        """Returns mixed inputs, pairs of targets, and lambda."""
        if alpha > 0:
            lam = np.random.beta(alpha, alpha)
        else:
            lam = 1
        batch_size = x.size()[0]
        if use_cuda:
            index = torch.randperm(batch_size).cuda()
        else:
            index = torch.randperm(batch_size)
        mixed_x = lam * x + (1 - lam) * x[index, :]
        y_a, y_b = y, y[index]
        return mixed_x, y_a, y_b, lam

    def mixup_criterion(self, criterion, pred, y_a, y_b, lam):
        return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)

    def training(self, epoch):
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            targets_a, targets_b, lam = None, None, None
            if self.args.use_mixup:
                image, targets_a, targets_b, lam = self.mixup_data(
                    image, target, self.args.mixup_alpha, self.args.cuda)
                image, targets_a, targets_b = map(
                    Variable, (image, targets_a, targets_b))
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            if self.args.use_mixup:
                loss = self.mixup_criterion(self.criterion, output,
                                            targets_a, targets_b, lam)
            else:
                loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)

            # Show ten batches of inference results per epoch
            if i % max(1, num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output,
                                             global_step)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def validation(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)
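
# A small self-contained check of the mixup bookkeeping above: per-pixel
# class targets cannot be linearly mixed, so the criterion is evaluated
# against both batch permutations and the two losses are blended with the
# same lam that mixed the inputs (torch-only toy shapes).

import torch
import torch.nn as nn

def mixup_loss_demo(lam=0.7):
    ce = nn.CrossEntropyLoss()
    pred = torch.randn(2, 3, 4, 4)           # (N, C, H, W) logits
    y_a = torch.randint(0, 3, (2, 4, 4))     # original targets
    y_b = y_a[torch.randperm(2)]             # batch-permuted targets
    # same decomposition as mixup_criterion above
    return lam * ce(pred, y_a) + (1 - lam) * ce(pred, y_b)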
class MyTrainer(object): def __init__(self, args): self.args = args # Define Saver self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() # Define Dataloader kwargs = {'num_workers': args.workers, 'pin_memory': True} if (args.dataset == "fashion_person"): train_set = fashion.FashionDataset(args, Path.db_root_dir("fashion_person"), mode='train',type = 'person') val_set = fashion.FashionDataset(args, Path.db_root_dir("fashion_person"), mode='test', type='person') self.nclass = train_set.nclass print("Train size {}, val size {}".format(len(train_set), len(val_set))) self.train_loader = DataLoader(dataset=train_set, batch_size=args.batch_size, shuffle=True, **kwargs) self.val_loader = DataLoader(dataset=val_set, batch_size=args.batch_size, shuffle=False, **kwargs) self.test_loader = None assert self.nclass == 2 elif (args.dataset == "fashion_clothes"): train_set = fashion.FashionDataset(args, Path.db_root_dir("fashion_clothes"), mode='train', type='clothes') val_set = fashion.FashionDataset(args, Path.db_root_dir("fashion_clothes"), mode='test', type='clothes') self.nclass = train_set.nclass print("Train size {}, val size {}".format(len(train_set), len(val_set))) self.train_loader = DataLoader(dataset=train_set, batch_size=args.batch_size, shuffle=True, **kwargs) self.val_loader = DataLoader(dataset=val_set, batch_size=args.batch_size, shuffle=False, **kwargs) self.test_loader = None assert self.nclass == 7 #self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs) # Define network # model = DeepLab(num_classes=self.nclass, # backbone=args.backbone, # output_stride=args.out_stride, # sync_bn=args.sync_bn, # freeze_bn=args.freeze_bn) # Using original network to load pretrained and do fine tuning self.best_pred = 0.0 if args.model == 'deeplabv3+': model = DeepLab(backbone=args.backbone, output_stride=args.out_stride, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn) # Loading pretrained VOC model if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume)) if args.cuda: checkpoint = torch.load(args.resume) else: checkpoint = torch.load(args.resume,map_location='cpu') args.start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch'])) #Freez the backbone if args.freeze_backbone: set_parameter_requires_grad(model.backbone, False) ######NEW DECODER###### #Different type of FT if args.ft_type == 'decoder': set_parameter_requires_grad(model, False) model.decoder = build_decoder(self.nclass, 'resnet', nn.BatchNorm2d) train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr}, {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}] elif args.ft_type == 'last_layer': set_parameter_requires_grad(model, False) model.decoder.last_conv[8] = nn.Conv2d(in_channels=256, out_channels=self.nclass, kernel_size=1) model.decoder.last_conv[8].reset_parameters() train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr}, {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}] if args.ft_type == 'all': #Reset last layer, to generate output we want model.decoder.last_conv[8] = nn.Conv2d(in_channels=256, out_channels=self.nclass, kernel_size=1) model.decoder.last_conv[8].reset_parameters() train_params = [{'params': 
model.get_1x_lr_params(), 'lr': args.lr}, {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}] # Define Optimizer optimizer = torch.optim.SGD(train_params, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) elif args.model == "unet": model = UNet(num_categories=self.nclass, num_filters=args.num_filters) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.model == 'mydeeplab': model = My_DeepLab(num_classes=self.nclass, in_channels=3) optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) # Define Criterion # whether to use class balanced weights if args.use_balanced_weights: weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass) weight = torch.from_numpy(weight.astype(np.float32)) print("weight is {}".format(weight)) else: weight = None self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type) self.model, self.optimizer = model, optimizer # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader)) # Using cuda if args.cuda: # TODO, ADD PARALLEL SUPPORT (NEED SYNC BATCH) # self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) # patch_replication_callback(self.model) self.model = self.model.cuda() args.start_epoch = 0 def training(self, epoch): train_loss = 0.0 self.model.train() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() output = self.model(image) loss = self.criterion(output, target) loss.backward() self.optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step) self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': self.model.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def validation(self, epoch): self.model.eval() self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output = self.model(image) loss = self.criterion(output, target) test_loss += loss.item() tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1))) pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = 
self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss) new_pred = mIoU if new_pred > self.best_pred: is_best = True self.best_pred = new_pred self.saver.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': self.model.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def visulize_validation(self): self.model.eval() self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 for i, sample in enumerate(tbar): #current_index_val_set image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output = self.model(image) #we have image, target, output on GPU #j, index of image in batch self.summary.visualize_pregt(self.writer, self.args.dataset, image, target, output, i) loss = self.criterion(output, target) test_loss += loss.item() tbar.set_description('Visualizing:') pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() print('Final Validation:') print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss) def output_validation(self): self.model.eval() self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 for i, sample in enumerate(tbar): #current_index_val_set image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output = self.model(image) #we have image, target, output on GPU #j, index of image in batch #image save self.summary.save_pred(self.args.dataset, output, i) loss = self.criterion(output, target) test_loss += loss.item() tbar.set_description('Visualizing:') pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() print('Final Validation:') print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss) def _load_model(self, path): if self.args.cuda: checkpoint = torch.load(path) else: checkpoint = torch.load(path, map_location='cpu') self.model.load_state_dict(checkpoint['state_dict']) def train_loop(self): try: for epoch in range(self.args.start_epoch, self.args.epochs): self.training(epoch) if not self.args.no_val and epoch % self.args.eval_interval == (self.args.eval_interval - 1): 
self.validation(epoch) except KeyboardInterrupt: print('Early Stopping') finally: self.visulize_validation() self.writer.close()
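# MyTrainer's fine-tuning modes above all rely on the same freeze-then-replace-head
# pattern: set requires_grad=False everywhere, then swap in a fresh classifier so
# only it trains. A minimal self-contained sketch of that pattern follows; the toy
# model and the helper below are illustrative, and the helper assumes (as the usage
# above suggests) that its second argument is the requires_grad value to set.
import torch
import torch.nn as nn

def set_parameter_requires_grad(module, requires_grad):
    # Toggle gradient computation for every parameter in `module`.
    for param in module.parameters():
        param.requires_grad = requires_grad

toy = nn.Sequential(nn.Conv2d(3, 256, 3, padding=1), nn.ReLU(),
                    nn.Conv2d(256, 21, kernel_size=1))  # 21 classes, as if VOC-pretrained
set_parameter_requires_grad(toy, False)           # freeze everything
toy[2] = nn.Conv2d(256, 7, kernel_size=1)         # fresh head for 7 new classes
# Only the new head has requires_grad=True, so only it will receive updates:
print([n for n, p in toy.named_parameters() if p.requires_grad])  # ['2.weight', '2.bias']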
class Trainer(object): def __init__(self, config, args): self.args = args self.config = config # Define Dataloader self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader( config) # Define network self.model = DeepLab(num_classes=self.nclass, backbone=config.backbone, output_stride=config.out_stride, sync_bn=config.sync_bn, freeze_bn=config.freeze_bn) train_params = [{ 'params': self.model.get_1x_lr_params(), 'lr': config.lr }, { 'params': self.model.get_10x_lr_params(), 'lr': config.lr * config.lr_ratio }] # Define Optimizer self.optimizer = torch.optim.SGD(train_params, momentum=config.momentum, weight_decay=config.weight_decay) # Define Criterion # whether to use class balanced weights self.criterion = SegmentationLosses( weight=None, cuda=args.cuda).build_loss(mode=config.loss) # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler self.scheduler = LR_Scheduler(config.lr_scheduler, config.lr, config.epochs, len(self.train_loader), config.lr_step, config.warmup_epochs) self.summary = TensorboardSummary('./train_log') # Using cuda if args.cuda: self.model = torch.nn.DataParallel(self.model) patch_replication_callback(self.model) # cudnn.benchmark = True self.model = self.model.cuda() self.best_pred_source = 0.0 # Resuming checkpoint if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format( args.resume)) if args.cuda: checkpoint = torch.load(args.resume) self.model.module.load_state_dict(checkpoint) else: # map_location is an argument of torch.load, not of load_state_dict checkpoint = torch.load(args.resume, map_location=torch.device('cpu')) self.model.load_state_dict(checkpoint) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, args.start_epoch)) def training(self, epoch): train_loss, seg_loss_sum, bn_loss_sum, entropy_loss_sum, adv_loss_sum, d_loss_sum, ins_loss_sum = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 self.model.train() if self.config.freeze_bn: self.model.module.freeze_bn() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) # NOTE: self.target_train_loader is never defined in __init__ and the iterator is unused # target_train_iterator = iter(self.target_train_loader) for i, sample in enumerate(tbar): itr = epoch * len(self.train_loader) + i self.summary.writer.add_scalar( 'Train/lr', self.optimizer.param_groups[0]['lr'], itr) A_image, A_target = sample['image'], sample['label'] if self.args.cuda: A_image, A_target = A_image.cuda(), A_target.cuda() self.scheduler(self.optimizer, i, epoch, self.best_pred_source, 0., self.config.lr_ratio) A_output, A_feat, A_low_feat = self.model(A_image) self.optimizer.zero_grad() # Train seg network # Supervised loss seg_loss = self.criterion(A_output, A_target) main_loss = seg_loss main_loss.backward() self.optimizer.step() seg_loss_sum += seg_loss.item() train_loss += seg_loss.item() self.summary.writer.add_scalar('Train/SegLoss', seg_loss.item(), itr) tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) # Show the results of the last iteration #if i == len(self.train_loader)-1: print("Add Train images at epoch " + str(epoch)) self.summary.visualize_image('Train-Source', self.config.dataset, A_image, A_target, A_output, epoch, 5) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.config.batch_size + A_image.data.shape[0])) print('Loss: %.3f' % train_loss) def validation(self, epoch): def get_metrics(tbar, if_source=False): self.evaluator.reset() test_loss = 0.0 for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output, low_feat, feat = self.model(image) loss = 
self.criterion(output, target) test_loss += loss.item() tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1))) pred = output.data.cpu().numpy() target_ = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target_, pred) if if_source: print("Add Validation-Source images at epoch" + str(epoch)) self.summary.visualize_image('Val-Source', self.config.dataset, image, target, output, epoch, 5) else: print("Add Validation-Target images at epoch" + str(epoch)) self.summary.visualize_image('Val-Target', self.config.target, image, target, output, epoch, 5) # Fast test during the training Acc = self.evaluator.Building_Acc() IoU = self.evaluator.Building_IoU() mIoU = self.evaluator.Mean_Intersection_over_Union() if if_source: print('Validation on source:') else: print('Validation on target:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.config.batch_size + image.data.shape[0])) print("Acc:{}, IoU:{}, mIoU:{}".format(Acc, IoU, mIoU)) print('Loss: %.3f' % test_loss) if if_source: names = ['source', 'source_acc', 'source_IoU', 'source_mIoU'] self.summary.writer.add_scalar('Val/SourceAcc', Acc, epoch) self.summary.writer.add_scalar('Val/SourceIoU', IoU, epoch) else: names = ['target', 'target_acc', 'target_IoU', 'target_mIoU'] self.summary.writer.add_scalar('Val/TargetAcc', Acc, epoch) self.summary.writer.add_scalar('Val/TargetIoU', IoU, epoch) return Acc, IoU, mIoU self.model.eval() tbar_source = tqdm(self.val_loader, desc='\r') s_acc, s_iou, s_miou = get_metrics(tbar_source, True) new_pred_source = s_iou if new_pred_source > self.best_pred_source: is_best = True self.best_pred_source = max(new_pred_source, self.best_pred_source) print('Saving state, epoch:', epoch) torch.save( self.model.module.state_dict(), self.args.save_folder + 'models/' + 'epoch' + str(epoch) + '.pth') loss_file = {'s_Acc': s_acc, 's_IoU': s_iou, 's_mIoU': s_miou} with open( os.path.join(self.args.save_folder, 'eval', 'epoch' + str(epoch) + '.json'), 'w') as f: json.dump(loss_file, f)
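# The config-driven Trainer above builds its optimizer from two parameter groups:
# backbone parameters at the base lr and head parameters at lr * lr_ratio. A
# runnable, self-contained sketch of that pattern with stand-in modules (the real
# groups come from get_1x_lr_params / get_10x_lr_params):
import torch
import torch.nn as nn

backbone, head = nn.Linear(8, 8), nn.Linear(8, 2)  # illustrative stand-ins
base_lr, lr_ratio = 0.007, 10
optimizer = torch.optim.SGD(
    [{'params': backbone.parameters(), 'lr': base_lr},
     {'params': head.parameters(), 'lr': base_lr * lr_ratio}],
    momentum=0.9, weight_decay=5e-4)  # no top-level lr needed: every group sets one
for group in optimizer.param_groups:
    print(group['lr'])  # 0.007, then 0.07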
class operater(object): def __init__(self, args, student_model, teacher_model, src_loader, trg_loader, val_loader, optimizer, teacher_optimizer): self.args = args self.student_model = student_model self.teacher_model = teacher_model self.src_loader = src_loader self.trg_loader = trg_loader self.val_loader = val_loader self.optimizer = optimizer self.teacher_optimizer = teacher_optimizer # Define Evaluator self.evaluator = Evaluator(args.nclass) # Define lr scheduler # self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, # args.epochs, len(trn_loader)) #self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer, milestones=[3, 6, 9, 12], gamma=0.5) #ft self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer, milestones=[20], gamma=0.5) self.best_pred = 0 self.init_weight = 0.98 # Define Saver self.saver = Saver(self.args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() def training(self, epoch, args): train_loss = 0.0 self.student_model.train() self.teacher_model.train() num_src = len(self.src_loader) num_trg = len(self.trg_loader) num_itr = np.maximum(num_src, num_trg) tbar = tqdm(range(1, num_itr + 1)) #w1 = 0.2 + 0.5 * (self.init_weight - 0.2) * (1 + np.cos(epoch * np.pi / args.epochs)) print('Learning rate:', self.optimizer.param_groups[0]['lr']) iter_src = iter(self.src_loader) iter_trg = iter(self.trg_loader) for i in tbar: src_x1, src_x2, src_y, src_idx = next(iter_src) trg_x1, trg_x2, trg_y, trg_idx = next(iter_trg) if i % num_src == 0: # refresh the exhausted source iterator; iter_trg is never refreshed, so the target loader is assumed to be the longer of the two iter_src = iter(self.src_loader) if self.args.cuda: src_x1, src_x2 = src_x1.cuda(), src_x2.cuda() trg_x1, trg_x2 = trg_x1.cuda(), trg_x2.cuda() self.optimizer.zero_grad() # train with source _, _, src_output = self.student_model(src_x1, src_x2) src_output = F.softmax(src_output, dim=1) # CE loss of supervised data #loss_ce=CELossLayer(src_output,src_y) # #print('ce loss', loss_ce) # Focal loss of supervised data loss_focal = FocalLossLayer(src_output, src_y) #print('focal loss', loss_focal) loss_val_lovasz = LovaszLossLayer(src_output, src_y) #print('lovasz loss', loss_lovasz) loss_su = loss_val_lovasz + loss_focal # train with target trg_x1_s = trg_x1 + torch.randn( trg_x1.size()).cuda() * self.args.noise trg_x1_t = trg_x1 + torch.randn( trg_x1.size()).cuda() * self.args.noise trg_x2_s = trg_x2 + torch.randn( trg_x2.size()).cuda() * self.args.noise trg_x2_t = trg_x2 + torch.randn( trg_x2.size()).cuda() * self.args.noise _, _, trg_predict_s = self.student_model(trg_x1_s, trg_x2_s) _, spatial_mask_prob, trg_predict_t = self.teacher_model( trg_x1_t, trg_x2_t) trg_predict_s = F.softmax(trg_predict_s, dim=1) trg_predict_t = F.softmax(trg_predict_t, dim=1) loss_tes_lovasz = LovaszLossLayer(trg_predict_s, trg_y) # spatial mask #channel_mask = channel_mask_prob > args.attention_threshold spatial_mask = spatial_mask_prob > args.attention_threshold spatial_mask = spatial_mask.float() #spatial_mask = spatial_mask.permute(0,2,3,1)# N,H,W,C #channel_mask = channel_mask.float() #spatial_mask = spatial_mask.view(-1) num_pixel = spatial_mask.shape[0] * spatial_mask.shape[ -2] * spatial_mask.shape[-1] mask_num_rate = torch.sum(spatial_mask).float() / num_pixel # trg_output_s = trg_output_s.permute(0, 2, 3, 1)#N,H,W,C # trg_output_t = trg_output_t.permute(0, 2, 3, 1) #trg_output_s = trg_output_s * channel_mask 
trg_predict_s = trg_predict_s * spatial_mask #trg_output_t = trg_output_t * channel_mask trg_predict_t = trg_predict_t * spatial_mask # trg_output_s = trg_output_s.contiguous().view(-1, self.args.nclass) # trg_output_s = trg_output_s[spatial_mask] # # trg_output_t = trg_output_t.contiguous().view(-1, self.args.nclass) # trg_output_t = trg_output_t[spatial_mask] # consistency loss loss_con = ConsistencyLossLayer(trg_predict_s, trg_predict_t) if mask_num_rate == 0.: loss_con = torch.tensor(0.).float().cuda() loss = loss_su + self.args.con_weight * loss_con + self.args.teslab_weight * loss_tes_lovasz #self.writer.add_scalar('train/ce_loss_iter', loss_ce.item(), i + num_itr * epoch) self.writer.add_scalar('train/focal_loss_iter', loss_focal.item(), i + num_itr * epoch) self.writer.add_scalar('train/supervised_loss_iter', loss_su.item(), i + num_itr * epoch) self.writer.add_scalar('train/consistency_loss_iter', loss_con.item(), i + num_itr * epoch) self.writer.add_scalar('train/teslab_loss_iter', loss_tes_lovasz.item(), i + num_itr * epoch) #loss = w1 * loss_ce + (0.5 - 0.5 * w1) * loss_focal + (0.5 - 0.5 * w1) * loss_lovasz loss.backward() self.optimizer.step() self.teacher_optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_itr * epoch) #Show 10 * 3 inference results each epoch if i % 10 == 0: global_step = i + num_itr * epoch if self.args.oly_s1 and not self.args.oly_s2: self.summary.visualize_image( self.writer, self.args.dataset, src_x1[:, [0], :, :], trg_x1[:, [0], :, :], src_y, src_output, trg_predict_s, trg_predict_t, trg_y, global_step) elif not self.args.oly_s1: if self.args.rgb: self.summary.visualize_image(self.writer, self.args.dataset, src_x2, trg_x2, src_y, src_output, trg_predict_s, trg_predict_t, trg_y, global_step) else: self.summary.visualize_image( self.writer, self.args.dataset, src_x2[:, [2, 1, 0], :, :], trg_x2[:, [2, 1, 0], :, :], src_y, src_output, trg_predict_s, trg_predict_t, trg_y, global_step) else: raise NotImplementedError self.scheduler.step() self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + src_y.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint( { 'epoch': epoch + 1, 'student_state_dict': self.student_model.module.state_dict(), 'teacher_state_dict': self.teacher_model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), }, is_best) def validation(self, epoch, args): self.teacher_model.eval() self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') for i, (x1, x2, y, index) in enumerate(tbar): if self.args.cuda: x1, x2 = x1.cuda(), x2.cuda() with torch.no_grad(): _, _, output = self.teacher_model(x1, x2) output = F.softmax(output, dim=1) pred = output.data.cpu().numpy() #pred[:,[2,7],:,:]=0 target = y[:, 0, :, :].cpu().numpy() # batch_size * 256 * 256 pred = np.argmax(pred, axis=1) # batch_size * 256 * 256 # Add batch sample into evaluator self.evaluator.add_batch(target, pred) OA = self.evaluator.Pixel_Accuracy() AA = self.evaluator.val_Pixel_Accuracy_Class() self.writer.add_scalar('val/OA', OA, epoch) self.writer.add_scalar('val/AA', AA, epoch) print('AVERAGE ACCURACY:', AA) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + y.data.shape[0])) new_pred = AA if new_pred > self.best_pred: is_best = 
True self.best_pred = new_pred self.saver.save_checkpoint( { 'epoch': epoch + 1, 'student_state_dict': self.student_model.module.state_dict(), 'teacher_state_dict': self.teacher_model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best)
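# The student/teacher class above calls self.teacher_optimizer.step() right after
# the student's gradient step, the usual mean-teacher arrangement in which the
# "teacher optimizer" copies an exponential moving average of the student weights.
# The actual teacher_optimizer is not defined in this file, so the sketch below is
# an assumption about its shape, not the repository's implementation; the class
# name WeightEMA is hypothetical.
import torch

class WeightEMA:
    def __init__(self, teacher_params, student_params, alpha=0.99):
        self.teacher_params = list(teacher_params)
        self.student_params = list(student_params)
        self.alpha = alpha

    @torch.no_grad()
    def step(self):
        for t, s in zip(self.teacher_params, self.student_params):
            # teacher <- alpha * teacher + (1 - alpha) * student
            t.mul_(self.alpha).add_(s, alpha=1.0 - self.alpha)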
class Trainer(object): def __init__(self, args): self.args = args # Define Saver self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() # Define Dataloader kwargs = {'num_workers': args.workers, 'pin_memory': False} if args.dataset == 'click': extract_hard_example(args, batch_size=32, recal=False) self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader( args, **kwargs) # Define network sbox = DeepLabX(pretrain=False) sbox.load_state_dict( torch.load('run/sbox_513_8925.pth.tar', map_location=torch.device('cuda:0'))['state_dict']) click = ClickNet() model = FusionNet(sbox=sbox, click=click, pos_limit=2, neg_limit=2) model.sbox_net.eval() for para in model.sbox_net.parameters(): para.requires_grad = False train_params = [ { 'params': model.click_net.parameters(), 'lr': args.lr }, # {'params': model.sbox_net.get_1x_lr_params(), 'lr': args.lr*0.001} # {'params': model.sbox_net.get_train_click_params(), 'lr': args.lr*0.001} ] # Define Optimizer optimizer = torch.optim.SGD(train_params, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) # Define Criterion # whether to use class balanced weights if args.use_balanced_weights: classes_weights_path = os.path.join( Path.db_root_dir(args.dataset), args.dataset + '_classes_weights.npy') if os.path.isfile(classes_weights_path): weight = np.load(classes_weights_path) else: weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass) weight = torch.from_numpy(weight.astype(np.float32)) else: weight = None self.criterion = SegmentationLosses( weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type) self.model, self.optimizer = model, optimizer # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader)) # Using cuda if args.cuda: # self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) # patch_replication_callback(self.model) self.model = self.model.cuda() # Resuming checkpoint self.best_pred = 0.0 if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format( args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] # DataParallel is disabled above, so the model is never wrapped in .module self.model.load_state_dict(checkpoint['state_dict']) if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) # Clear start epoch if fine-tuning if args.ft: args.start_epoch = 0 def training(self, epoch): train_loss = 0.0 self.model.train() self.model.sbox_net.eval() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) for i, sample in enumerate(tbar): image, gt = sample['crop_image'], sample['crop_gt'] if self.args.cuda: image, gt = image.cuda(), gt.cuda() self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() sbox_pred, click_pred, sum_pred = self.model(image, crop_gt=gt) sum_pred = F.interpolate(sum_pred, size=gt.size()[-2:], align_corners=True, mode='bilinear') sbox_pred = F.interpolate(sbox_pred, size=gt.size()[-2:], align_corners=True, mode='bilinear') loss1 = self.criterion(sum_pred, gt) # + self.criterion(sbox_pred, gt) 
loss1.backward() self.optimizer.step() total_loss = loss1.item() train_loss += total_loss tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) self.writer.add_scalar('train/total_steps', total_loss, i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch grid_image = make_grid(decode_seg_map_sequence( torch.max(sbox_pred[:3], 1)[1].detach().cpu().numpy(), dataset=self.args.dataset), 3, normalize=False, range=(0, 255)) self.summary.visualize_image(self.writer, self.args.dataset, image, sample['crop_gt'], sum_pred, global_step) self.writer.add_image('sbox_pred', grid_image, global_step) self.writer.add_scalar('train/total_epochs', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def validation(self, epoch): self.model.eval() self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 total_clicks = 0 for i, sample in enumerate(tbar): image, gt = sample['crop_image'], sample['crop_gt'] if self.args.cuda: image, gt = image.cuda(), gt.cuda() with torch.no_grad(): sbox_pred, click_pred, sum_pred = self.model(image, crop_gt=gt) # sum_pred, clicks = self.model.click_eval(image, gt) # total_clicks += clicks sum_pred = F.interpolate(sum_pred, size=gt.size()[-2:], align_corners=True, mode='bilinear') loss1 = self.criterion(sum_pred, gt) total_loss = loss1.item() test_loss += total_loss pred = sum_pred.data.cpu().numpy() target = gt.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() self.writer.add_scalar('val/total_epochs', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format( Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss) # print('total clicks:' , total_clicks) new_pred = mIoU if new_pred > self.best_pred: is_best = True self.best_pred = new_pred self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best, prefix='click')
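# The click trainer above upsamples its low-resolution logits to the ground-truth
# size with bilinear interpolation before computing the loss. A minimal runnable
# sketch of that one step with dummy tensors:
import torch
import torch.nn.functional as F

logits = torch.randn(2, 5, 64, 64)        # N x C x h x w network output
gt = torch.randint(0, 5, (2, 256, 256))   # N x H x W integer labels
logits_up = F.interpolate(logits, size=gt.shape[-2:],
                          mode='bilinear', align_corners=True)
print(logits_up.shape)  # torch.Size([2, 5, 256, 256])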
class Trainer(object): def __init__(self, args): self.args = args self.saver = PassiveSaver(args) self.saver.save_experiment_config() self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() kwargs = {'pin_memory': False, 'memory_hog': args.memory_hog} self.train_set, self.train_loader, self.val_loader, self.test_loader, self.nclass = make_dataloader( args.dataset, args.base_size, args.crop_size, args.batch_size, args.workers, args.overfit, **kwargs) self.train_set.make_dataset_multiple_of_batchsize(args.batch_size) if args.architecture == 'deeplab': print('Using Deeplab') model = DeepLab(num_classes=self.nclass, backbone=args.backbone, output_stride=args.out_stride, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn) train_params = [{ 'params': model.get_1x_lr_params(), 'lr': args.lr }, { 'params': model.get_10x_lr_params(), 'lr': args.lr * 10 }] elif args.architecture == 'enet': print('Using ENet') model = ENet(num_classes=self.nclass, encoder_relu=True, decoder_relu=True) train_params = [{'params': model.parameters(), 'lr': args.lr}] elif args.architecture == 'fastscnn': print('Using FastSCNN') model = FastSCNN(3, self.nclass) train_params = [{'params': model.parameters(), 'lr': args.lr}] if args.optimizer == 'SGD': optimizer = torch.optim.SGD(train_params, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) elif args.optimizer == 'Adam': optimizer = torch.optim.Adam(train_params, weight_decay=args.weight_decay) else: raise NotImplementedError if args.use_balanced_weights: weight = calculate_weights_labels(args.dataset, self.train_loader, self.nclass) weight = torch.from_numpy(weight.astype(np.float32)) else: weight = None self.criterion = SegmentationLosses( weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type) self.model, self.optimizer = model, optimizer self.evaluator = Evaluator(self.nclass) if args.use_lr_scheduler: self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader)) else: self.scheduler = None if args.cuda: self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) patch_replication_callback(self.model) self.model = self.model.cuda() self.best_pred = 0.0 if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError(f"=> no checkpoint found at {args.resume}") checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] if args.cuda: self.model.module.load_state_dict(checkpoint['state_dict']) else: self.model.load_state_dict(checkpoint['state_dict']) if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] print( f'=> loaded checkpoint {args.resume} (epoch {checkpoint["epoch"]})' ) def training(self, epoch): train_loss = 0.0 self.model.train() num_img_tr = len(self.train_loader) tbar = tqdm(self.train_loader, desc='\r') # pick a random training batch to visualize this epoch visualization_index = int(random.random() * len(self.train_loader)) vis_img = None vis_tgt = None vis_out = None for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() if self.scheduler: self.scheduler(self.optimizer, i, epoch, self.best_pred) self.writer.add_scalar('train/learning_rate', self.scheduler.current_lr, i + num_img_tr * epoch) self.optimizer.zero_grad() output = self.model(image) loss = self.criterion(output, target) loss.backward() self.optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i 
+ 1))) self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) if i == visualization_index: vis_img = image vis_tgt = target vis_out = output self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) self.summary.visualize_image(self.writer, self.args.dataset, vis_img, vis_tgt, vis_out, epoch, prefix='train') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) print('BestPred: %.3f' % self.best_pred) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def validation(self, epoch): self.model.eval() self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 visualization_index = int(random.random() * len(self.val_loader)) vis_img = None vis_tgt = None vis_out = None for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output = self.model(image) if i == visualization_index: vis_img = image vis_tgt = target vis_out = output loss = self.criterion(output, target) test_loss += loss.item() tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1))) pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) self.evaluator.add_batch(target, pred) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) self.summary.visualize_image(self.writer, self.args.dataset, vis_img, vis_tgt, vis_out, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format( Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss) new_pred = mIoU if new_pred > self.best_pred: is_best = True self.best_pred = new_pred self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best)
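# Every validation loop in these scripts turns logits into hard labels with an
# argmax over the class axis, then feeds (target, pred) pairs to the evaluator.
# A minimal numpy sketch of that conversion plus the bincount-based confusion
# matrix these Evaluator classes typically build internally (rows = ground truth,
# cols = prediction); the shapes are illustrative.
import numpy as np

nclass = 3
logits = np.random.randn(2, nclass, 4, 4)    # N x C x H x W
pred = np.argmax(logits, axis=1)             # N x H x W class ids
target = np.random.randint(0, nclass, pred.shape)
idx = nclass * target.reshape(-1) + pred.reshape(-1)
confusion = np.bincount(idx, minlength=nclass ** 2).reshape(nclass, nclass)
print(confusion)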
class Trainer(object): def __init__(self, args): self.args = args # Define Saver if not args.inference and not args.eval: self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() # Define Dataloader kwargs = {'num_workers': args.workers, 'pin_memory': True} self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs) # Define network model = DeepLab(num_classes=self.nclass, backbone=args.backbone, output_stride=args.out_stride, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn) ''' model.cuda() summary(model, input_size=(3, 720, 1280)) exit() ''' train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr}, {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}] # Define Optimizer optimizer = torch.optim.SGD(train_params, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) # Define Criterion # whether to use class balanced weights if args.use_balanced_weights: classes_weights_path = os.path.join(Path.db_root_dir(args.dataset), args.dataset+'_classes_weights.npy') if os.path.isfile(classes_weights_path): weight = np.load(classes_weights_path) else: weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass) weight = torch.from_numpy(weight.astype(np.float32)) else: weight = None self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type) self.model, self.optimizer = model, optimizer # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader)) # Using cuda if args.cuda: self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) patch_replication_callback(self.model) self.model = self.model.cuda() # Resuming checkpoint self.best_pred = 0.0 if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'" .format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] if args.cuda: self.model.module.load_state_dict(checkpoint['state_dict']) else: self.model.load_state_dict(checkpoint['state_dict']) if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch'])) # Clear start epoch if fine-tuning if args.ft: args.start_epoch = 0 # Palette for inferencing self.palette = np.asarray([ [0,0,0], [217,83,79], [91, 192, 222]], dtype=np.uint8) def training(self, epoch): train_loss = 0.0 self.model.train() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() output = self.model(image) loss = self.criterion(output, target) loss.backward() self.optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step) 
self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def id(self, pred, filename): save_dir = './prd' saveas = os.path.join(save_dir, filename) pred = pred[0] result = Image.fromarray(pred.astype('uint8')) result.save(saveas) def combined(self, pred, origin, filename): save_dir = './prd' saveas = os.path.join(save_dir, filename) origin = np.asarray(origin) origin2 = origin[0].swapaxes(0, 1).swapaxes(1, 2) origin2 = origin2 * np.array([0.197, 0.198, 0.201]) + np.array([0.279, 0.293, 0.290]) origin2 = origin2 * 255 pred = pred[0] img = np.array(self.palette[pred.squeeze()]) result = np.array(np.zeros(img.shape)) result[pred==0] = origin2[pred==0] result[pred!=0] = origin2[pred!=0] /2 + img[pred!=0] / 2 result = Image.fromarray(result.astype('uint8'), 'RGB') result.save(saveas) def validation(self, epoch, inference=False): self.model.eval() self.evaluator.reset() if self.args.submit: tbar = tqdm(self.test_loader, desc='\r') else: tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 if self.args.submit: for i, sample in enumerate(tbar): image = sample['image'] name = sample['name'][0] tbar.set_description('%s' % name) if self.args.cuda: image = image.cuda() with torch.no_grad(): output = self.model(image) pred = output.data.cpu().numpy() pred = np.argmax(pred, axis=1) self.id(pred, name) print("All done clear, good luck!") return for i, sample in enumerate(tbar): if inference and not self.args.submit and i >= 100: break image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output = self.model(image) loss = self.criterion(output, target) test_loss += loss.item() tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1))) target = target.cpu().numpy() image = image.cpu().numpy() pred = output.data.cpu().numpy() pred = np.argmax(pred, axis=1) if inference: self.combined(pred, image, str(i) + '.png') # Add batch sample into evaluator self.evaluator.add_batch(target, pred) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() if not self.args.inference and not self.args.eval: self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss) if not self.args.inference and not self.args.eval: new_pred = mIoU if new_pred > self.best_pred: is_best = True self.best_pred = new_pred self.saver.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best)
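# Sketch of the overlay trick in combined() above: undo the input normalization,
# colour the predicted classes with the palette, and blend the colour over the
# image at 50% wherever the prediction is non-background. The palette and the
# mean/std constants are the ones this trainer uses; the random arrays are dummies.
import numpy as np
from PIL import Image

palette = np.asarray([[0, 0, 0], [217, 83, 79], [91, 192, 222]], dtype=np.uint8)
img = np.random.rand(8, 8, 3)                         # normalized H x W x 3 input
img = (img * np.array([0.197, 0.198, 0.201]) +
       np.array([0.279, 0.293, 0.290])) * 255         # de-normalize to 0..255
pred = np.random.randint(0, 3, (8, 8))                # H x W class ids
color = palette[pred]                                 # H x W x 3 class colours
out = np.where((pred == 0)[..., None], img, img / 2 + color / 2)
Image.fromarray(out.astype('uint8'), 'RGB')           # ready to .save(...)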
class Trainer(object): def __init__(self, args): self.args = args # Define Saver self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() # Define Dataloader kwargs = {'num_workers': args.workers, 'pin_memory': True} self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader( args, **kwargs) # Define network model = DeepLab(num_classes=self.nclass, backbone=args.backbone, output_stride=args.out_stride, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn) model.load_state_dict(torch.load( r"C:\Users\Jayant\Documents\segPipieline\pytorch-deeplab-xception-master\run\marsh\deeplab-resnet\model_best.pth.tar" )['state_dict'], strict=False) #r"./run/marsh/deeplab-resnet/model_best.pth.tar" train_params = [{ 'params': model.get_1x_lr_params(), 'lr': args.lr }, { 'params': model.get_10x_lr_params(), 'lr': args.lr * 10 }] # Define Optimizer optimizer = torch.optim.SGD(train_params, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) # Define Criterion # whether to use class balanced weights if args.use_balanced_weights: classes_weights_path = os.path.join( Path.db_root_dir(args.dataset), args.dataset + '_classes_weights.npy') if os.path.isfile(classes_weights_path): weight = np.load(classes_weights_path) else: weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass) weight = torch.from_numpy(weight.astype(np.float32)) else: weight = None self.criterion = SegmentationLosses( weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type) self.model, self.optimizer = model, optimizer # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader)) # Using cuda if args.cuda: self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) patch_replication_callback(self.model) self.model = self.model.cuda() # Resuming checkpoint self.best_pred = 0.0 if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format( args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] if args.cuda: self.model.module.load_state_dict(checkpoint['state_dict']) else: self.model.load_state_dict(checkpoint['state_dict']) if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) # Clear start epoch if fine-tuning if args.ft: args.start_epoch = 0 def training(self, epoch): train_loss = 0.0 self.model.train() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() output = self.model(image) loss = self.criterion(output, target) loss.backward() self.optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch #print("I was here!!") self.summary.visualize_image(self.writer, self.args.dataset, 
image, target, output, global_step) self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def validation(self, epoch, dataloader='test'): self.model.eval() self.evaluator.reset() if (dataloader == 'test'): tbar = tqdm(self.test_loader, desc='\r') elif (dataloader == 'val'): tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output = self.model(image) loss = self.criterion(output, target) test_loss += loss.item() tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1))) pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU, IoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format( Acc, Acc_class, mIoU, FWIoU)) print("Classwise_IoU:") print(IoU) print('Loss: %.3f' % test_loss) confusion = self.evaluator.confusion_matrix.tolist() # Remove the second row and column because of NaNs; don't do this once the model actually learns that class. del confusion[1] for i in range(len(confusion)): del confusion[i][1] cm = np.array(confusion) cm = cm.astype(int) print(cm) true_pos = np.diag(cm) false_pos = np.sum(cm, axis=0) - true_pos false_neg = np.sum(cm, axis=1) - true_pos precision = true_pos / (true_pos + false_pos) recall = true_pos / (true_pos + false_neg) print('Precision:', precision) print('Recall:', recall) precision_one = np.sum(true_pos) / (np.sum(true_pos) + np.sum(false_pos)) recall_one = np.sum(true_pos) / (np.sum(true_pos) + np.sum(false_neg)) print('Overall precision:', precision_one) print('Overall recall:', recall_one)
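# Worked check of the precision/recall formulas used above, with rows as ground
# truth and columns as predictions (the convention the confusion matrix follows
# here). The 2x2 matrix is made up purely for illustration.
import numpy as np

cm = np.array([[50, 2],
               [8, 40]])                   # rows = GT, cols = pred
true_pos = np.diag(cm)                     # [50, 40]
false_pos = np.sum(cm, axis=0) - true_pos  # column sums minus diagonal: [8, 2]
false_neg = np.sum(cm, axis=1) - true_pos  # row sums minus diagonal: [2, 8]
precision = true_pos / (true_pos + false_pos)  # [0.862, 0.952] approx
recall = true_pos / (true_pos + false_neg)     # [0.962, 0.833] approx
print(precision, recall)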
class Trainer(object): def __init__(self, args): self.args = args # Define Saver self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() # Define Dataloader kwargs = {'num_workers': args.workers, 'pin_memory': True} self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader( args, **kwargs) # Define network # model = DeepLab(num_classes=self.nclass, # backbone=args.backbone, # output_stride=args.out_stride, # sync_bn=args.sync_bn, # freeze_bn=args.freeze_bn) model = Resnet18_32s(num_classes=2) # train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr}, # {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}] # Define Optimizer optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) # Define Criterion # whether to use class balanced weights if args.use_balanced_weights: classes_weights_path = os.path.join( Path.db_root_dir(args.dataset), args.dataset + '_classes_weights.npy') if os.path.isfile(classes_weights_path): weight = np.load(classes_weights_path) else: weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass) weight = torch.from_numpy(weight.astype(np.float32)) else: weight = None self.criterion = SegmentationLosses( weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type) self.model, self.optimizer = model, optimizer # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader)) # Using cuda if args.cuda: self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) patch_replication_callback(self.model) self.model = self.model.cuda() # Resuming checkpoint self.best_pred = 0.0 if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format( args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] if args.cuda: pretrained_dict = { k: v for k, v in checkpoint['state_dict'].items() if k in model.state_dict() } #print(pretrained_dict.keys()) pretrained_dict['decoder.last_conv.8.weight'] = model.state_dict()['decoder.last_conv.8.weight'] pretrained_dict['decoder.last_conv.8.bias'] = model.state_dict()['decoder.last_conv.8.bias'] self.model.module.load_state_dict(pretrained_dict) else: self.model.load_state_dict(checkpoint['state_dict']) if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) # Clear start epoch if fine-tuning if args.ft: args.start_epoch = 0 def training(self, epoch): train_loss = 0.0 self.model.train() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] #print('target:',target.shape) if self.args.cuda: image, target = image.cuda(), target.cuda() self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() output, output_up = self.model(image) # Visualize the prediction before the final 4x upsampling # img = img[0,:,:,:] # img = img.data.cpu().numpy() # image_np = np.argmax(img, axis=0) # for m in range(len(image_np)): # for n in range(len(image_np[0])): # if image_np[m][n] == 1: # image_np[m][n] =255 # image_np = Image.fromarray(image_np.astype('uint8')) # name = i + num_img_tr * epoch # #print(name) # image_np.save('/home/xupeihan/Code/pytorch-deeplab-xception/pic/'+ str(name) +'.png' ) #loss = self.criterion(output, target) loss = L.xloss(output_up, target.long(), ignore=255) loss.backward() self.optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch # if i % (num_img_tr // 10) == 0: # global_step = i + num_img_tr * epoch # self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step) self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def validation(self, epoch): self.model.eval() self.evaluator.reset() num_img_tr = len(self.train_loader) tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 F1 = 0.0 index = 0 FF = FT = TF = TT = 0 for i, sample in enumerate(tbar): image, target = sample[0]['image'], sample[0]['label'] w = sample[1] h = sample[2] name = sample[3] #label2 = sample[4] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output, output_up = self.model(image) #loss = self.criterion(output, target) loss = L.xloss(output_up, target.long(), ignore=255) test_loss += loss.item() tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1))) pred = output.data.cpu().numpy() #pred = img.data.cpu().numpy() #summary global_step = i + num_img_tr * epoch self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step) h = h.numpy().tolist() w = w.numpy().tolist() index += len(h) #target = label2 if target.size()[0] == 1: target = target.cpu().numpy().astype(np.uint8) else: target = target.cpu().numpy().squeeze().astype(np.uint8) pred = np.argmax(pred, axis=1) for j in range(len(h)): target_ = target[j] pred_ = pred[j] tar_img = Image.fromarray(target_) pre_img = Image.fromarray(pred_.squeeze().astype(np.uint8)) tar_img = Resize((h[j], w[j]), interpolation=2)(tar_img) pred_ = Resize((h[j], w[j]), interpolation=2)(pre_img) target_ = np.array(tar_img) pred_ = np.array(pred_) pred_[pred_ != 0] = 1 target_[target_ != 0] = 1 pred_ = pred_.astype(int) target_ = target_.astype(int) ff, ft, tf, tt = np.bincount((target_ * 2 + pred_).reshape(-1), minlength=4) #print(ff,ft,tf,tt) FF += ff FT += ft TF += tf TT += tt # F1 score #F1 += self.evaluator.F1_score(target_, pred_) # Add batch sample into evaluator self.evaluator.add_batch(target_, pred_) # image_np = image[0].cpu().numpy() # image_np = np.array((image_np*128+128).transpose((1,2,0)),dtype=np.uint8) # self.writer.add_image('Input', image_np) P = TT / float(TT + FT) # precision: TT = TP, FT = FP R = TT / float(TT + TF) # recall: TF = FN F1 = (2 * R * P) / (R + P) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() desire = (F1 + mIoU) * 0.5 self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) 
self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) self.writer.add_scalar('val/F1_score', F1, epoch) self.writer.add_scalar('val/desire', desire, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print( "Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}, F1_score: {}, desire: {}" .format(Acc, Acc_class, mIoU, FWIoU, F1, desire)) print('Loss: %.3f' % test_loss) new_pred = mIoU if new_pred > self.best_pred: is_best = True self.best_pred = new_pred self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best)
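# The validation loop above counts binary outcomes with a single bincount of
# target*2 + pred: index 0 = TN (FF), 1 = FP (FT), 2 = FN (TF), 3 = TP (TT).
# Worked example with made-up labels, deriving precision/recall/F1 the same way:
import numpy as np

target = np.array([0, 0, 1, 1, 1, 0])
pred = np.array([0, 1, 1, 0, 1, 0])
tn, fp, fn, tp = np.bincount(target * 2 + pred, minlength=4)
precision = tp / (tp + fp)   # 2 / 3
recall = tp / (tp + fn)      # 2 / 3
f1 = 2 * precision * recall / (precision + recall)
print(tn, fp, fn, tp, f1)    # 2 1 1 2 0.666...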
class Trainer(object): def __init__(self, args): self.args = args # Define Saver # self.saver = Saver(args) # Recoder the running processing self.saver = Saver(args) sys.stdout = Logger( os.path.join( self.saver.experiment_dir, 'log_train-%s.txt' % time.strftime("%Y-%m-%d-%H-%M-%S"))) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() # Define Dataloader kwargs = {'num_workers': args.workers, 'pin_memory': True} self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader( args, **kwargs) if args.dataset == 'pairwise_lits': proxy_nclasses = self.nclass = 3 elif args.dataset == 'pairwise_chaos': proxy_nclasses = 2 * self.nclass else: raise NotImplementedError # Define network model = ConsistentDeepLab(in_channels=3, num_classes=proxy_nclasses, pretrained=args.pretrained, backbone=args.backbone, output_stride=args.out_stride, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn) train_params = [{ 'params': model.get_1x_lr_params(), 'lr': args.lr }, { 'params': model.get_10x_lr_params(), 'lr': args.lr * 10 }] # Define Optimizer # optimizer = torch.optim.SGD(train_params, momentum=args.momentum, # weight_decay=args.weight_decay, nesterov=args.nesterov) optimizer = torch.optim.Adam(train_params, weight_decay=args.weight_decay) # Define Criterion # whether to use class balanced weights if args.use_balanced_weights: weights = calculate_weigths_labels(args.dataset, self.train_loader, proxy_nclasses) else: weights = None # Initializing loss print("Initializing loss: {}".format(args.loss_type)) self.criterion = losses.init_loss(args.loss_type, weights=weights) self.model, self.optimizer = model, optimizer # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader)) # Using cuda if args.cuda: self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) patch_replication_callback(self.model) self.model = self.model.cuda() # Resuming checkpoint self.best_pred = 0.0 if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format( args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] if args.cuda: self.model.module.load_state_dict(checkpoint['state_dict']) else: self.model.load_state_dict(checkpoint['state_dict']) if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) # Clear start epoch if fine-tuning if args.ft: args.start_epoch = 0 def training(self, epoch): train_loss = 0.0 self.model.train() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) for i, (sample1, sample2, proxy_label, sample_indices) in enumerate(tbar): image1, target1 = sample1['image'], sample1['label'] image2, target2 = sample2['image'], sample2['label'] if self.args.cuda: image1, target1 = image1.cuda(), target1.cuda() image2, target2 = image2.cuda(), target2.cuda() proxy_label = proxy_label.cuda() self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() output = self.model(image1, image2) loss = self.criterion(output, proxy_label) loss.backward() self.optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) 
self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch image = torch.cat((image1, image2), dim=-2) if len(proxy_label.shape) > 3: output = output[:, 0:self.nclass] proxy_label = torch.argmax(proxy_label[:, 0:self.nclass], dim=1) self.summary.visualize_image(self.writer, self.args.dataset, image, proxy_label, output, global_step) self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image1.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def validation(self, epoch): self.model.eval() self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 val_time = 0 for i, (sample1, sample2, proxy_label, sample_indices) in enumerate(tbar): image1, target1 = sample1['image'], sample1['label'] image2, target2 = sample2['image'], sample2['label'] if self.args.cuda: image1, target1 = image1.cuda(), target1.cuda() image2, target2 = image2.cuda(), target2.cuda() proxy_label = proxy_label.cuda() with torch.no_grad(): start = time.time() output = self.model(image1, image2, is_val=True) end = time.time() val_time += end - start loss = self.criterion(output, proxy_label) test_loss += loss.item() tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1))) pred = output.data.cpu().numpy() proxy_label = proxy_label.cpu().numpy() # Add batch sample into evaluator if len(proxy_label.shape) > 3: pred = np.argmax(pred[:, 0:self.nclass], axis=1) proxy_label = np.argmax(proxy_label[:, 0:self.nclass], axis=1) else: pred = np.argmax(pred, axis=1) self.evaluator.add_batch(proxy_label, pred) if self.args.save_predict: self.saver.save_predict_mask( pred, sample_indices, self.val_loader.dataset.data1_files) print("Val time: {}".format(val_time)) print("Total paramerters: {}".format( sum(x.numel() for x in self.model.parameters()))) if self.args.save_predict: namelist = [] for fname in self.val_loader.dataset.data1_files: # namelist.append(fname.split('/')[-1].split('.')[0]) _, name = os.path.split(fname) name = name.split('.')[0] namelist.append(name) file = gzip.open( os.path.join(self.saver.save_dir, 'namelist.pkl.gz'), 'wb') pickle.dump(namelist, file, protocol=-1) file.close() # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image1.data.shape[0])) print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format( Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss) dice = self.evaluator.Dice() # self.writer.add_scalar('val/Dice_1', dice[1], epoch) self.writer.add_scalar('val/Dice_2', dice[2], epoch) print("Dice:{}".format(dice)) new_pred = mIoU if new_pred > self.best_pred: is_best = True 
self.best_pred = new_pred self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best)
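# ---------------------------------------------------------------------------
# Usage sketch (not part of the original sources): the Trainer classes in this
# collection are typically driven by a small main() in the pytorch-deeplab-
# xception style, looping training() over epochs and calling validation()
# periodically. The fields read here (start_epoch, epochs, no_val,
# eval_interval) match the args these trainers already use; treat anything
# else as illustrative.
def main(args):
    trainer = Trainer(args)
    print('Starting Epoch:', trainer.args.start_epoch)
    print('Total Epoches:', trainer.args.epochs)
    for epoch in range(trainer.args.start_epoch, trainer.args.epochs):
        trainer.training(epoch)
        if not trainer.args.no_val and epoch % args.eval_interval == (args.eval_interval - 1):
            trainer.validation(epoch)
    trainer.writer.close()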
class Trainer(object): def __init__(self, args): self.args = args # Define Saver self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() kwargs = {'num_workers': args.workers, 'pin_memory': True} self.train_loader1, self.train_loader2, self.val_loader, self.test_loader, self.nclass = make_data_loader( args, **kwargs) if args.use_balanced_weights: classes_weights_path = os.path.join( Path.db_root_dir(args.dataset), args.dataset + '_classes_weights.npy') if os.path.isfile(classes_weights_path): weight = np.load(classes_weights_path) else: # compute the weights from train_loader1, the split used to train the network weights weight = calculate_weigths_labels(args.dataset, self.train_loader1, self.nclass) weight = torch.from_numpy(weight.astype(np.float32)) else: weight = None self.criterion = SegmentationLosses( weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type) # Define network model = AutoDeeplab(num_classes=self.nclass, num_layers=12, criterion=self.criterion) optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) self.model, self.optimizer = model, optimizer # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader1)) self.architect = Architect(self.model, args) # Using cuda if args.cuda: self.model = torch.nn.DataParallel(self.model.cuda()) #patch_replication_callback(self.model) self.model = self.model.cuda() print('cuda finished') # Resuming checkpoint self.best_pred = 0.0 if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format( args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] if args.cuda: self.model.module.load_state_dict(checkpoint['state_dict']) else: self.model.load_state_dict(checkpoint['state_dict']) if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) # Clear start epoch if fine-tuning if args.ft: args.start_epoch = 0 def training(self, epoch): train_loss = 0.0 self.model.train() tbar = tqdm(self.train_loader1) num_img_tr = len(self.train_loader1) for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() if epoch > self.args.ab_epoch: # NOTE: next(iter(...)) rebuilds the loader iterator every step; see the fetcher sketch after this class search = next(iter(self.train_loader2)) image_search, target_search = search['image'], search['label'] if self.args.cuda: image_search, target_search = image_search.cuda( ), target_search.cuda() self.architect.step(image_search, target_search) self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() output = self.model(image) loss = self.criterion(output, target) loss.backward() self.optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) #self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step) #torch.cuda.empty_cache() self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch,
i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def validation(self, epoch): self.model.eval() self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output = self.model(image) loss = self.criterion(output, target) test_loss += loss.item() tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1))) pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format( Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss) new_pred = mIoU if new_pred > self.best_pred: is_best = True self.best_pred = new_pred self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best)
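# The search step in the class above fetches its batch with
# next(iter(self.train_loader2)), which builds a brand-new DataLoader
# iterator (and respawns worker processes) on every training iteration. A
# minimal sketch of the usual fix: keep one iterator alive and refresh it
# only when exhausted (the meta-weighting trainer further below uses the same
# try/except StopIteration pattern on its val_loader).
class InfiniteBatchFetcher:
    def __init__(self, loader):
        self.loader = loader
        self.it = iter(loader)

    def next(self):
        try:
            return next(self.it)
        except StopIteration:          # split exhausted: start a new pass
            self.it = iter(self.loader)
            return next(self.it)

# e.g. in __init__:   self.search_fetcher = InfiniteBatchFetcher(self.train_loader2)
#      in training(): search = self.search_fetcher.next()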
class Trainer(object): def __init__(self, args): self.args = args # Define Saver self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() # Define Dataloader kwargs = {'num_workers': args.workers, 'pin_memory': True} self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs) if args.loss_type == 'depth_loss_two_distributions': self.nclass = args.num_class + args.num_class2 + 1 if args.loss_type == 'depth_avg_sigmoid_class': self.nclass = args.num_class + args.num_class2 # Define network model = DeepLab(num_classes=self.nclass, backbone=args.backbone, output_stride=args.out_stride, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn) print("\nDefine models...\n") self.model_aprox_depth = DeepLab(num_classes=1, backbone=args.backbone, output_stride=args.out_stride, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn) self.input_conv = nn.Conv2d(4, 3, 3, padding=1) # Using cuda if args.cuda: self.model_aprox_depth = torch.nn.DataParallel(self.model_aprox_depth, device_ids=self.args.gpu_ids) patch_replication_callback(self.model_aprox_depth) self.model_aprox_depth = self.model_aprox_depth.cuda() self.input_conv = self.input_conv.cuda() print("\nLoad checkpoints...\n") if not args.cuda: ckpt_aprox_depth = torch.load(args.ckpt, map_location='cpu') self.model_aprox_depth.load_state_dict(ckpt_aprox_depth['state_dict']) else: ckpt_aprox_depth = torch.load(args.ckpt) self.model_aprox_depth.module.load_state_dict(ckpt_aprox_depth['state_dict']) train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr}, {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}] # Define Optimizer # set optimizer optimizer = torch.optim.Adam(train_params, args.lr) # Define Criterion # whether to use class balanced weights if args.use_balanced_weights: classes_weights_path = os.path.join(Path.db_root_dir(args.dataset), args.dataset + '_classes_weights.npy') if os.path.isfile(classes_weights_path): weight = np.load(classes_weights_path) else: weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass) weight = torch.from_numpy(weight.astype(np.float32)) else: weight = None if 'depth' in args.loss_type: self.criterion = DepthLosses(weight=weight, cuda=args.cuda, min_depth=args.min_depth, max_depth=args.max_depth, num_class=args.num_class, cut_point=args.cut_point, num_class2=args.num_class2).build_loss(mode=args.loss_type) self.infer = DepthLosses(weight=weight, cuda=args.cuda, min_depth=args.min_depth, max_depth=args.max_depth, num_class=args.num_class, cut_point=args.cut_point, num_class2=args.num_class2) self.evaluator_depth = EvaluatorDepth(args.batch_size) else: self.criterion = SegmentationLosses(cuda=args.cuda, weight=weight).build_loss(mode=args.loss_type) self.evaluator = Evaluator(self.nclass) self.model, self.optimizer = model, optimizer # Define lr scheduler self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader)) # Using cuda if args.cuda: self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) patch_replication_callback(self.model) self.model = self.model.cuda() # Resuming checkpoint if 'depth' in args.loss_type: self.best_pred = 1e6 if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume)) if not args.cuda: checkpoint = torch.load(args.resume, map_location='cpu') else: 
checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] if args.cuda: state_dict = checkpoint['state_dict'] state_dict.popitem(last=True) state_dict.popitem(last=True) self.model.module.load_state_dict(state_dict, strict=False) else: state_dict = checkpoint['state_dict'] state_dict.popitem(last=True) state_dict.popitem(last=True) self.model.load_state_dict(state_dict, strict=False) if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] if 'depth' in args.loss_type: self.best_pred = 1e6 print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch'])) # Clear start epoch if fine-tuning if args.ft: args.start_epoch = 0 # add input layer to the model self.model = nn.Sequential( self.input_conv, self.model ) if args.cuda: self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) patch_replication_callback(self.model) self.model = self.model.cuda() def training(self, epoch): train_loss = 0.0 self.model.train() self.model_aprox_depth.eval() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.dataset == 'apollo_seg' or self.args.dataset == 'farsight_seg': target[target <= self.args.cut_point] = 0 target[target > self.args.cut_point] = 1 if image.shape[0] == 1: target = torch.cat([target, target], dim=0) image = torch.cat([image, image], dim=0) if self.args.cuda: image, target = image.cuda(), target.cuda() self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() aprox_depth = self.model_aprox_depth(image) aprox_depth = self.infer.sigmoid(aprox_depth) input = torch.cat([image, aprox_depth], dim=1) output = self.model(input) if self.args.loss_type == 'depth_sigmoid_loss_inverse': loss = self.criterion(output, target, inverse=True) else: loss = self.criterion(output, target) loss.backward() self.optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch target[ torch.isnan(target)] = 0 # change nan values to zero for display (handle warning from tensorboard) self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step, n_class=self.args.num_class) self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': self.model.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def validation(self, epoch): self.model.eval() self.model_aprox_depth.eval() if 'depth' in self.args.loss_type: self.evaluator_depth.reset() else: softmax = nn.Softmax(1) self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): aprox_depth = self.model_aprox_depth(image) aprox_depth = self.infer.sigmoid(aprox_depth) input = torch.cat([image, aprox_depth], dim=1) output = self.model(input) loss = 
self.criterion(output, target) test_loss += loss.item() tbar.set_description('Val loss: %.3f' % (test_loss / (i + 1))) if 'depth' in self.args.loss_type: if self.args.loss_type == 'depth_loss': pred = self.infer.pred_to_continous_depth(output) elif self.args.loss_type == 'depth_avg_sigmoid_class': pred = self.infer.pred_to_continous_depth_avg(output) elif self.args.loss_type == 'depth_loss_combination': pred = self.infer.pred_to_continous_combination(output) elif self.args.loss_type == 'depth_loss_two_distributions': pred = self.infer.pred_to_continous_depth_two_distributions(output) elif 'depth_sigmoid_loss' in self.args.loss_type: output = self.infer.sigmoid(output.squeeze(1)) pred = self.infer.depth01_to_depth(output) # Add batch sample into evaluator self.evaluator_depth.evaluateError(pred, target) else: output = softmax(output) pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) if 'depth' in self.args.loss_type: # Fast test during the training MSE = self.evaluator_depth.averageError['MSE'] RMSE = self.evaluator_depth.averageError['RMSE'] ABS_REL = self.evaluator_depth.averageError['ABS_REL'] LG10 = self.evaluator_depth.averageError['LG10'] MAE = self.evaluator_depth.averageError['MAE'] DELTA1 = self.evaluator_depth.averageError['DELTA1'] DELTA2 = self.evaluator_depth.averageError['DELTA2'] DELTA3 = self.evaluator_depth.averageError['DELTA3'] self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/MSE', MSE, epoch) self.writer.add_scalar('val/RMSE', RMSE, epoch) self.writer.add_scalar('val/ABS_REL', ABS_REL, epoch) self.writer.add_scalar('val/LG10', LG10, epoch) self.writer.add_scalar('val/MAE', MAE, epoch) self.writer.add_scalar('val/DELTA1', DELTA1, epoch) self.writer.add_scalar('val/DELTA2', DELTA2, epoch) self.writer.add_scalar('val/DELTA3', DELTA3, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print( "MSE:{}, RMSE:{}, ABS_REL:{}, LG10: {}\nMAE:{}, DELTA1:{}, DELTA2:{}, DELTA3: {}".format(MSE, RMSE, ABS_REL, LG10, MAE, DELTA1, DELTA2, DELTA3)) new_pred = RMSE if new_pred < self.best_pred: is_best = True self.best_pred = new_pred self.saver.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) else: # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss) new_pred = mIoU if new_pred > self.best_pred: is_best = True self.best_pred = new_pred self.saver.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': self.model.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) print('Loss: %.3f' % test_loss)
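# The pred_to_continous_* helpers used above live in this project's
# DepthLosses and are not shown here. For orientation only, a minimal sketch
# of one common way to decode a depth-classification head into continuous
# depth: the softmax expectation over bin centers, assuming uniform bins in
# [min_depth, max_depth]. The project's own decoding may well differ.
import torch

def expected_depth(logits, min_depth, max_depth):
    # logits: (N, num_class, H, W), one channel per depth bin
    num_class = logits.shape[1]
    centers = torch.linspace(min_depth, max_depth, num_class, device=logits.device)
    probs = torch.softmax(logits, dim=1)                   # per-pixel bin probabilities
    return (probs * centers.view(1, -1, 1, 1)).sum(dim=1)  # (N, H, W) continuous depth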
class Trainer(object): def __init__(self, args): self.args = args # Generate .npy file for dataloader self.img_process(args) # Define Saver self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() # Define Dataloader kwargs = {'num_workers': args.workers, 'pin_memory': True} self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader( args, **kwargs) # Define network model = getattr(modeling, args.model_name)(pretrained=args.pretrained) # Define Optimizer optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) # train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr}, # {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}] # Define Criterion self.criterion = SegmentationLosses( weight=None, cuda=args.cuda).build_loss(mode=args.loss_type) self.model, self.optimizer = model, optimizer # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader)) # Using cuda if args.cuda: self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) patch_replication_callback(self.model) self.model = self.model.cuda() # Resuming checkpoint self.best_pred = 0.0 if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format( args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] if args.cuda: self.model.module.load_state_dict(checkpoint['state_dict']) else: self.model.load_state_dict(checkpoint['state_dict']) if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) # Clear start epoch if fine-tuning if args.ft: args.start_epoch = 0 # Crop the large images into unit_size tiles, moving by stride each step, and store the resulting train/val splits as .npy arrays in save_dir, # so they can be reused on later runs and reduce memory usage. Change the paths to your own. (A factored-out version of this crop loop is sketched after this class.) def img_process(self, args): unit_size = args.base_size stride = unit_size # int(unit_size/2) save_dir = os.path.join( '/data/dingyifeng/pytorch-jingwei-master/npy_process', str(unit_size)) # npy_process if not os.path.exists(save_dir): Image.MAX_IMAGE_PIXELS = 100000000000 # load train image 1 img = Image.open( '/data/dingyifeng/jingwei/jingwei_round1_train_20190619/image_1.png' ) img = np.asarray(img) #(50141, 47161, 4) anno_map = Image.open( '/data/dingyifeng/jingwei/jingwei_round1_train_20190619/image_1_label.png' ) anno_map = np.asarray(anno_map) #(50141, 47161) length, width = img.shape[0], img.shape[1] x1, x2, y1, y2 = 0, unit_size, 0, unit_size Img1 = [] # list that collects the cropped tiles Label1 = [] # list that collects the matching labels while (x1 < length): # stop once the window has walked past the border if x2 > length: x2, x1 = length, length - unit_size while (y1 < width): if y2 > width: y2, y1 = width, width - unit_size im = img[x1:x2, y1:y2, :] if 255 in im[:, :, -1]: # keep only tiles that contain valid pixels (alpha == 255) Img1.append(im[:, :, 0:3]) # append the tile Label1.append(anno_map[x1:x2, y1:y2]) # append the label if y2 == width: break y1 += stride y2 += stride if x2 == length: break y1, y2 = 0, unit_size x1 += stride x2 += stride Img1 = np.array(Img1) #(4123, 448, 448, 3) Label1 = np.array(Label1) #(4123, 448, 448) # load train image 2 img = Image.open( '/data/dingyifeng/jingwei/jingwei_round1_train_20190619/image_2.png' ) img = np.asarray(img) #(50141, 47161, 4) anno_map =
Image.open( '/data/dingyifeng/jingwei/jingwei_round1_train_20190619/image_2_label.png' ) anno_map = np.asarray(anno_map) #(50141, 47161) length, width = img.shape[0], img.shape[1] x1, x2, y1, y2 = 0, unit_size, 0, unit_size Img2 = [] # list that collects the cropped tiles Label2 = [] # list that collects the matching labels while (x1 < length): # stop once the window has walked past the border if x2 > length: x2, x1 = length, length - unit_size while (y1 < width): if y2 > width: y2, y1 = width, width - unit_size im = img[x1:x2, y1:y2, :] if 255 in im[:, :, -1]: # keep only tiles that contain valid pixels (alpha == 255) Img2.append(im[:, :, 0:3]) # append the tile Label2.append(anno_map[x1:x2, y1:y2]) # append the label if y2 == width: break y1 += stride y2 += stride if x2 == length: break y1, y2 = 0, unit_size x1 += stride x2 += stride Img2 = np.array(Img2) #(5072, 448, 448, 3) Label2 = np.array(Label2) #(5072, 448, 448) Img = np.concatenate((Img1, Img2), axis=0) cat = np.concatenate((Label1, Label2), axis=0) # shuffle np.random.seed(1) assert (Img.shape[0] == cat.shape[0]) shuffle_id = np.arange(Img.shape[0]) np.random.shuffle(shuffle_id) Img = Img[shuffle_id] cat = cat[shuffle_id] os.mkdir(save_dir) print("=> generate {}".format(unit_size)) # split train dataset images_train = Img #[:int(Img.shape[0]*0.8)] categories_train = cat #[:int(cat.shape[0]*0.8)] assert (len(images_train) == len(categories_train)) np.save(os.path.join(save_dir, 'train_img.npy'), images_train) np.save(os.path.join(save_dir, 'train_label.npy'), categories_train) # split val dataset images_val = Img[int(Img.shape[0] * 0.8):] categories_val = cat[int(cat.shape[0] * 0.8):] assert (len(images_val) == len(categories_val)) np.save(os.path.join(save_dir, 'val_img.npy'), images_val) np.save(os.path.join(save_dir, 'val_label.npy'), categories_val) print("=> img_process finished!") else: print("{} file already exists!".format(unit_size)) # free memory before training starts; deleting names via locals() inside a function is a no-op, so just let the arrays fall out of scope and collect import gc gc.collect() def training(self, epoch): train_loss = 0.0 self.model.train() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() output = self.model(image) loss = self.criterion(output, target) loss.backward() self.optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step) self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def validation(self, epoch): self.model.eval() self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output = self.model(image) loss = self.criterion(output, target) test_loss += loss.item() tbar.set_description('Test 
loss: %.3f' % (test_loss / (i + 1))) pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format( Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss) new_pred = mIoU if new_pred > self.best_pred: is_best = True self.best_pred = new_pred self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best)
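# The crop loop duplicated for image_1 and image_2 in img_process above can
# be factored into one helper. A sketch with the same semantics: a unit_size
# window moved by stride, clamped to the border on the last row/column, and
# tiles kept only where the alpha channel marks valid pixels.
import numpy as np

def tile_image(img, anno_map, unit_size, stride):
    tiles, labels = [], []
    length, width = img.shape[0], img.shape[1]
    x1 = 0
    while x1 < length:
        x2 = min(x1 + unit_size, length)
        x1 = x2 - unit_size                    # clamp the last row to the border
        y1 = 0
        while y1 < width:
            y2 = min(y1 + unit_size, width)
            y1 = y2 - unit_size                # clamp the last column to the border
            im = img[x1:x2, y1:y2, :]
            if 255 in im[:, :, -1]:            # tile contains valid pixels
                tiles.append(im[:, :, 0:3])
                labels.append(anno_map[x1:x2, y1:y2])
            if y2 == width:
                break
            y1 += stride
        if x2 == length:
            break
        x1 += stride
    return np.array(tiles), np.array(labels)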
class Trainer(object): def __init__(self, args): self.args = args # Define Saver self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() # Define Dataloader kwargs = {'num_workers': args.workers, 'pin_memory': True} self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs) # Define network model = DeepLab(num_classes=self.nclass, backbone=args.backbone, output_stride=args.out_stride, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn) train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr}, {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}] # Define Optimizer optimizer = torch.optim.SGD(train_params, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) # Define Criterion # whether to use class balanced weights if args.use_balanced_weights: classes_weights_path = os.path.join(Path.db_root_dir(args.dataset), args.dataset+'_classes_weights.npy') if os.path.isfile(classes_weights_path): weight = np.load(classes_weights_path) else: weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass) weight = torch.from_numpy(weight.astype(np.float32)) else: weight = None self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type) self.model, self.optimizer = model, optimizer # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader)) # Using cuda if args.cuda: self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) patch_replication_callback(self.model) self.model = self.model.cuda() # Resuming checkpoint self.best_pred = 0.0 if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'" .format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] if args.cuda: self.model.module.load_state_dict(checkpoint['state_dict']) else: self.model.load_state_dict(checkpoint['state_dict']) if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch'])) # Clear start epoch if fine-tuning if args.ft: args.start_epoch = 0 def training(self, epoch): train_loss = 0.0 self.model.train() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() output = self.model(image) loss = self.criterion(output, target) loss.backward() self.optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step) self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save 
checkpoint every epoch is_best = False self.saver.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def validation(self, epoch): self.model.eval() self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output = self.model(image) loss = self.criterion(output, target) test_loss += loss.item() pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss) new_pred = mIoU if new_pred > self.best_pred: is_best = True self.best_pred = new_pred self.saver.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best)
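# For reference, a sketch of what LR_Scheduler's 'poly' mode computes on each
# call in this codebase family: a power-0.9 decay over the global iteration
# count, with the second parameter group (the 10x head built in __init__)
# scaled accordingly. utils/lr_scheduler.py remains the authoritative version.
def poly_lr(base_lr, i, epoch, iters_per_epoch, num_epochs, power=0.9):
    T = epoch * iters_per_epoch + i            # global iteration index
    total = num_epochs * iters_per_epoch
    return base_lr * (1 - T / total) ** power

# applied to the two groups defined in __init__:
#   optimizer.param_groups[0]['lr'] = lr        # backbone (1x)
#   optimizer.param_groups[1]['lr'] = lr * 10   # decoder/head (10x)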
class Trainer(object): def __init__(self, args): self.args = args # Define Saver self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() # Define Dataloader kwargs = {'num_workers': args.workers, 'pin_memory': True} self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs) # Define network model = DeepLab(num_classes=self.nclass, backbone=args.backbone, output_stride=args.out_stride, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn) train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr}, {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}] # Define Optimizer optimizer = torch.optim.SGD(train_params, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) # Define Criterion # whether to use class balanced weights if args.use_balanced_weights: classes_weights_path = os.path.join(Path.db_root_dir(args.dataset), args.dataset+'_classes_weights.npy') if os.path.isfile(classes_weights_path): weight = np.load(classes_weights_path) else: weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass) weight = torch.from_numpy(weight.astype(np.float32)) else: weight = None self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type) self.model, self.optimizer = model, optimizer if args.densecrfloss >0: self.densecrflosslayer = DenseCRFLoss(weight=args.densecrfloss, sigma_rgb=args.sigma_rgb, sigma_xy=args.sigma_xy, scale_factor=args.rloss_scale) print(self.densecrflosslayer) # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader)) # Using cuda if args.cuda: self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) patch_replication_callback(self.model) self.model = self.model.cuda() # Resuming checkpoint self.best_pred = 0.0 if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'" .format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] if args.cuda: self.model.module.load_state_dict(checkpoint['state_dict']) else: self.model.load_state_dict(checkpoint['state_dict']) if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch'])) # Clear start epoch if fine-tuning if args.ft: args.start_epoch = 0 def training(self, epoch): train_loss = 0.0 train_celoss = 0.0 train_crfloss = 0.0 self.model.train() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) softmax = nn.Softmax(dim=1) for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] croppings = (target!=254).float() target[target==254]=255 # Pixels labeled 255 are those unlabeled pixels. Padded region are labeled 254. 
# see function RandomScaleCrop in dataloaders/custom_transforms.py for the detail in data preprocessing if self.args.cuda: image, target = image.cuda(), target.cuda() self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() output = self.model(image) celoss = self.criterion(output, target) if self.args.densecrfloss ==0: loss = celoss else: max_output = (max(torch.abs(torch.max(output)), torch.abs(torch.min(output)))) mean_output = torch.mean(torch.abs(output)).item() # std_output = torch.std(output).item() probs = softmax(output) # /max_output*4 denormalized_image = denormalizeimage(sample['image'], mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) densecrfloss = self.densecrflosslayer(denormalized_image,probs,croppings) if self.args.cuda: densecrfloss = densecrfloss.cuda() loss = celoss + densecrfloss train_crfloss += densecrfloss.item() logits_copy = output.detach().clone().requires_grad_(True) max_output_copy = (max(torch.abs(torch.max(logits_copy)), torch.abs(torch.min(logits_copy)))) probs_copy = softmax(logits_copy) # /max_output_copy*4 denormalized_image_copy = denormalized_image.detach().clone() croppings_copy = croppings.detach().clone() densecrfloss_copy = self.densecrflosslayer(denormalized_image_copy, probs_copy, croppings) @torch.no_grad() def add_grad_map(grad, plot_name): if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch batch_grads = torch.max(torch.abs(grad), dim=1)[0].detach().cpu().numpy() color_imgs = [] for grad_img in batch_grads: grad_img[0,0]=0 img = colorize(grad_img)[:,:,:3] color_imgs.append(img) color_imgs = torch.from_numpy(np.array(color_imgs).transpose([0, 3, 1, 2])) grid_image = make_grid(color_imgs[:3], 3, normalize=False, range=(0, 255)) self.writer.add_image(plot_name, grid_image, global_step) output.register_hook(lambda grad: add_grad_map(grad, 'Grad Logits')) probs.register_hook(lambda grad: add_grad_map(grad, 'Grad Probs')) logits_copy.register_hook(lambda grad: add_grad_map(grad, 'Grad Logits Rloss')) densecrfloss_copy.backward() if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch img_entropy = torch.sum(-probs*torch.log(probs+1e-9), dim=1).detach().cpu().numpy() color_imgs = [] for e in img_entropy: e[0,0] = 0 img = colorize(e)[:,:,:3] color_imgs.append(img) color_imgs = torch.from_numpy(np.array(color_imgs).transpose([0, 3, 1, 2])) grid_image = make_grid(color_imgs[:3], 3, normalize=False, range=(0, 255)) self.writer.add_image('Entropy', grid_image, global_step) self.writer.add_histogram('train/total_loss_iter/logit_histogram', output, i + num_img_tr * epoch) self.writer.add_histogram('train/total_loss_iter/probs_histogram', probs, i + num_img_tr * epoch) self.writer.add_scalar('train/total_loss_iter/rloss', densecrfloss.item(), i + num_img_tr * epoch) self.writer.add_scalar('train/total_loss_iter/max_output', max_output.item(), i + num_img_tr * epoch) self.writer.add_scalar('train/total_loss_iter/mean_output', mean_output, i + num_img_tr * epoch) loss.backward() self.optimizer.step() train_loss += loss.item() train_celoss += celoss.item() tbar.set_description('Train loss: %.3f = CE loss %.3f + CRF loss: %.3f' % (train_loss / (i + 1),train_celoss / (i + 1),train_crfloss / (i + 1))) self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) self.writer.add_scalar('train/total_loss_iter/ce', celoss.item(), i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch 
self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step) self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) #if self.args.no_val: if self.args.save_interval: # save checkpoint every interval epoch is_best = False if (epoch + 1) % self.args.save_interval == 0: self.saver.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best, filename='checkpoint_epoch_{}.pth.tar'.format(str(epoch+1))) def validation(self, epoch): self.model.eval() self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] target[target==254]=255 if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output = self.model(image) loss = self.criterion(output, target) test_loss += loss.item() tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1))) pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss) new_pred = mIoU if new_pred > self.best_pred: is_best = True self.best_pred = new_pred self.saver.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best)
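# Standalone illustration of the tensor.register_hook mechanism the rloss
# trainer above relies on for its gradient visualizations: the hook fires
# during backward() with dLoss/dTensor, which can then be summarized
# off-graph (here into a plain dict instead of TensorBoard).
import torch

x = torch.randn(2, 3, requires_grad=True)
y = (x ** 2).sum()

grads = {}

def capture(g):
    grads['x'] = g.abs().max().item()   # summarize; returning None keeps the gradient unchanged

x.register_hook(capture)
y.backward()
print(grads['x'])                       # largest |dy/dx| entry, available after backward()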
class Trainer(object): def __init__(self, args): self.args = args # Define Saver self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() # Define Dataloader kwargs = {'num_workers': args.workers, 'pin_memory': True} self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader( args, **kwargs) # for VOC self.test_loader is None # Define network model = DeepLab(num_classes=self.nclass, backbone=args.backbone, output_stride=args.out_stride, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn) v_model = v_DeepLab(num_classes=self.nclass, backbone=args.backbone, output_stride=args.out_stride, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn) self.vnet = VNet(1, 100, 1).cuda() train_params = [{ 'params': model.get_1x_lr_params(), 'lr': args.lr }, { 'params': model.get_10x_lr_params(), 'lr': args.lr * 10 }] v_model_train_params = [{ 'params': v_model.get_1x_lr_params(), 'lr': args.lr }, { 'params': v_model.get_10x_lr_params(), 'lr': args.lr * 10 }] # Define Optimizer optimizer = torch.optim.SGD(train_params, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) self.optimizer_v_model = torch.optim.SGD( v_model_train_params, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) self.optimizer_vnet = torch.optim.Adam(self.vnet.params(), 1e-3, weight_decay=1e-4) # Define Criterion # whether to use class balanced weights if args.use_balanced_weights: classes_weights_path = os.path.join( Path.db_root_dir(args.dataset), args.dataset + '_classes_weights.npy') if os.path.isfile(classes_weights_path): weight = np.load(classes_weights_path) else: weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass) weight = torch.from_numpy(weight.astype(np.float32)) else: weight = None self.criterion = SegmentationLosses( weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type) self.valcriterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss('ce') self.model, self.v_model, self.optimizer = model, v_model, optimizer # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader)) # Using cuda if args.cuda: self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) patch_replication_callback(self.model) self.model = self.model.cuda() # Resuming checkpoint self.best_pred = 0.0 if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format( args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] if args.cuda: self.model.module.load_state_dict(checkpoint['state_dict']) else: self.model.load_state_dict(checkpoint['state_dict']) if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) # Clear start epoch if fine-tuning if args.ft: args.start_epoch = 0 def training(self, epoch): train_loss = 0.0 val_loader_iter = iter(self.val_loader) self.model.train() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() # copy current network weights into the virtual model (unwrap DataParallel when cuda) self.v_model.load_state_dict(self.model.module.state_dict() if self.args.cuda else self.model.state_dict()) output
= self.v_model(image) cost = self.criterion(output, target) cost_v = torch.reshape(cost, (-1, 1)) v_lambda = self.vnet(cost_v.data) l_f_v = torch.sum(cost_v * v_lambda) / len(cost_v) self.v_model.zero_grad() grads = torch.autograd.grad(l_f_v, (self.v_model.params()), create_graph=True) v_lr = self.args.lr * ((0.1**int(epoch >= 80)) * (0.1**int(epoch >= 100))) # For ResNet32 self.v_model.update_params(lr_inner=v_lr, source_params=grads) del grads # phase 2. pixel weights step try: sample_val = next(val_loader_iter) except StopIteration: val_loader_iter = iter(self.val_loader) sample_val = next(val_loader_iter) inputs_val, targets_val = sample_val['image'], sample_val['label'] if self.args.cuda: inputs_val, targets_val = inputs_val.cuda(), targets_val.cuda() y_g_hat = self.v_model(inputs_val) l_g_meta = self.valcriterion(y_g_hat, targets_val) self.optimizer_vnet.zero_grad() l_g_meta.backward() self.optimizer_vnet.step() # phase 1. network weight step (w) output = self.model(image) cost = self.criterion(output, target) cost_v = torch.reshape(cost, (-1, 1)) with torch.no_grad(): v_new = self.vnet(cost_v) loss = torch.sum(cost_v * v_new) / len(cost_v) self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() loss.backward() self.optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step) self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def validation(self, epoch): self.model.eval() self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 for i, sample in enumerate(tbar): image, target = sample['image'], sample['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output = self.model(image) loss = self.valcriterion(output, target) test_loss += loss.item() tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1))) pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format( Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss) new_pred = mIoU if new_pred > self.best_pred: is_best = True self.best_pred = new_pred
self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best)
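# The training loop above interleaves three coupled updates per batch in the
# Meta-Weight-Net style. A self-contained toy of the same control flow on
# scalars: w plays the model, v the weighting net; all names are
# illustrative, and update_params in the real code is a functional SGD step
# applied to the model copy.
import torch

w = torch.tensor(1.0, requires_grad=True)   # "model" weight
v = torch.tensor(0.5, requires_grad=True)   # "weighting net" parameter
lr = 0.1

# virtual step: weighted train loss, differentiable update of a copy of w
train_loss = v * (w - 2.0) ** 2
(g,) = torch.autograd.grad(train_loss, w, create_graph=True)
w_virtual = w - lr * g                      # keeps the graph through v

# meta step: clean validation loss of the virtual model updates v
val_loss = (w_virtual - 1.0) ** 2
(v_grad,) = torch.autograd.grad(val_loss, v)
with torch.no_grad():
    v -= 1e-3 * v_grad

# real step: re-weight with the frozen, updated v and update w as usual
loss = v.detach() * (w - 2.0) ** 2
loss.backward()
with torch.no_grad():
    w -= lr * w.grad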
class Trainer(object): def __init__(self, args): self.args = args # Define Saver self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() # Define Dataloader kwargs = {'num_workers': args.workers, 'pin_memory': True} self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader( args, **kwargs) # Define network model = DeepLab(num_classes=self.nclass, backbone=args.backbone, output_stride=args.out_stride, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn) train_params = [{ 'params': model.get_1x_lr_params(), 'lr': args.lr }, { 'params': model.get_10x_lr_params(), 'lr': args.lr * 10 }] # Define Optimizer optimizer = torch.optim.SGD(train_params, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) # Define Criterion # whether to use class balanced weights if args.use_balanced_weights: classes_weights_path = os.path.join( Path.db_root_dir(args.dataset), args.dataset + '_classes_weights.npy') if os.path.isfile(classes_weights_path): weight = np.load(classes_weights_path) else: weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass) weight = torch.from_numpy(weight.astype(np.float32)) else: weight = None self.criterion = SegmentationLosses( weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type) self.model, self.optimizer = model, optimizer # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader)) # Using cuda if args.cuda: self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) patch_replication_callback(self.model) self.model = self.model.cuda() # Resuming checkpoint self.best_pred = 0.0 self.best_thresh = 0.0 if args.resume is not None: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format( args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] if args.cuda: self.model.module.load_state_dict(checkpoint['state_dict']) else: self.model.load_state_dict(checkpoint['state_dict']) if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] self.best_thresh = checkpoint['best_thresh'] print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) # Clear start epoch if fine-tuning if args.ft: args.start_epoch = 0 def training(self, epoch): train_loss = 0.0 self.model.train() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) for i, sample in enumerate(tbar): image, target = sample['image'], sample['mask'] if self.args.cuda: image, target = image.cuda(), target.cuda() self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() output = self.model(image) if self.args.dataset == 'augsiim': target = target.squeeze(1) # print('target min: {}, target max: {}'.format(target.min(), target.max())) loss = self.criterion(output, target) loss.backward() self.optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step) 
self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, 'best_thresh': self.best_thresh, }, is_best) def validation(self, epoch): self.model.eval() self.evaluator.reset() dice_data = [] tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 for i, sample in enumerate(tbar): image, target = sample['image'], sample['mask'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output = self.model(image) if self.args.dataset == 'augsiim': target = target.squeeze(1) if self.args.use_sigmoid: prob = torch.sigmoid(output[:, 1, :, :]) else: prob = torch.softmax(output, dim=1)[:, 1, ...] # Dice data dice_data.append((prob.cpu().numpy(), target.cpu().numpy())) # Loss loss = self.criterion(output, target) test_loss += loss.item() tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1))) pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() mIoU = self.evaluator.Mean_Intersection_over_Union() FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() mDice = self.evaluator.Mean_Dice_Coefficient() self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) self.writer.add_scalar('val/mDice', mDice, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format( Acc, Acc_class, mIoU, FWIoU)) print("mDice:{}".format(mDice)) if self.nclass == 2: Dice = self.evaluator.Dice_Coefficient() self.writer.add_scalar('val/Dice', Dice, epoch) print('Dice:{}'.format(Dice)) print('Loss: %.3f' % test_loss) # Best Dice and Best Threshold (a sketch of the grid search follows this class) print('Compute Best Dice Score and Best Threshold') dsc = DiceCoefficient(thresh_step=self.args.thresh_step) PREDS = np.concatenate([p for p, _ in dice_data]) # concatenate per-batch arrays directly; the last batch may be smaller TARGS = np.concatenate([t for _, t in dice_data]) best_dice, best_threshold = dsc.compute_best_threshold(PREDS, TARGS) print('Best Dice:{}, Best Threshold:{}'.format(best_dice, best_threshold)) # Free memory dsc = None del PREDS del TARGS del dice_data new_pred = mIoU if new_pred > self.best_pred: is_best = True self.best_pred = new_pred self.best_thresh = best_threshold self.saver.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, 'best_thresh': self.best_thresh, }, is_best)
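# DiceCoefficient.compute_best_threshold above is project code; a minimal
# sketch of the grid search it implies: binarize the foreground probabilities
# at each candidate threshold and keep the threshold with the best mean Dice.
import numpy as np

def best_dice_threshold(probs, targets, thresh_step=0.05, eps=1e-7):
    # probs, targets: (N, H, W) foreground probabilities / {0,1} masks
    best_dice, best_t = 0.0, 0.0
    for t in np.arange(thresh_step, 1.0, thresh_step):
        preds = (probs > t).astype(np.float32)
        inter = (preds * targets).sum(axis=(1, 2))
        denom = preds.sum(axis=(1, 2)) + targets.sum(axis=(1, 2))
        dice = ((2 * inter + eps) / (denom + eps)).mean()
        if dice > best_dice:
            best_dice, best_t = dice, t
    return best_dice, best_t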
class Trainer(object): def __init__(self, args): self.args = args # Define Saver if args.distributed: if args.local_rank ==0: self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() else: self.saver = Saver(args) self.saver.save_experiment_config() # Define Tensorboard Summary self.summary = TensorboardSummary(self.saver.experiment_dir) self.writer = self.summary.create_summary() # PATH = args.path # Define Dataloader kwargs = {'num_workers': args.workers, 'pin_memory': True} self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs) # self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs) # Define network model = SCNN(nclass=self.nclass,backbone=args.backbone,output_stride=args.out_stride,cuda = args.cuda,extension=args.ext) # Define Optimizer # optimizer = torch.optim.SGD(model.parameters(),args.lr, momentum=args.momentum, # weight_decay=args.weight_decay, nesterov=args.nesterov) optimizer = torch.optim.Adam(model.parameters(), args.lr,weight_decay=args.weight_decay) # model, optimizer = amp.initialize(model,optimizer,opt_level="O1") # Define Criterion weight = None # criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type) # self.criterion = SegmentationCELosses(weight=weight, cuda=args.cuda) # self.criterion = FocalLoss(gamma=0, alpha=[0.2, 0.98], img_size=512*512) self.criterion1 = FocalLoss(gamma=5, alpha=[0.2, 0.98], img_size=512 * 512) self.criterion2 = disc_loss(delta_v=0.5, delta_d=3.0, param_var=1.0, param_dist=1.0, param_reg=0.001, EMBEDDING_FEATS_DIMS=21,image_shape=[512,512]) self.model, self.optimizer = model, optimizer # Define Evaluator self.evaluator = Evaluator(self.nclass) # Define lr scheduler; iters per epoch must match the loader iterated in training() self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader),local_rank=args.local_rank) # Using cuda if args.cuda: self.model = self.model.cuda() if args.distributed: self.model = DistributedDataParallel(self.model) # self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids) # patch_replication_callback(self.model) # Resuming checkpoint self.best_pred = 0.0 if args.resume is not None: filename = 'checkpoint.pth.tar' args.resume = os.path.join(self.saver.experiment_dir, filename) if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'" .format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] # if args.cuda: # self.model.module.load_state_dict(checkpoint['state_dict']) # else: self.model.load_state_dict(checkpoint['state_dict']) # if not args.ft: self.optimizer.load_state_dict(checkpoint['optimizer']) self.best_pred = checkpoint['best_pred'] print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch'])) def training(self, epoch): train_loss = 0.0 self.model.train() tbar = tqdm(self.train_loader) num_img_tr = len(self.train_loader) max_instances = 1 for i, sample in enumerate(tbar): # image, target = sample['image'], sample['label'] image, target, ins_target = sample['image'], sample['bin_label'], sample['label'] # _target = target.cpu().numpy() # if np.max(_target) > max_instances: # max_instances = np.max(_target) # print(max_instances) if self.args.cuda: image, target = image.cuda(), target.cuda()
self.scheduler(self.optimizer, i, epoch, self.best_pred) self.optimizer.zero_grad() output = self.model(image) # if i % 10==0: # misc.imsave('/mfc/user/1623600/.temp6/train_{:s}_epoch:{}_i:{}.png'.format(str(self.args.distributed),epoch,i),np.transpose(image[0].cpu().numpy(),(1,2,0))) # os.chmod('/mfc/user/1623600/.temp6/train_{:s}_epoch:{}_i:{}.png'.format(str(self.args.distributed),epoch,i),0o777) # self.criterion = DataParallelCriterion(self.criterion) loss1 = self.criterion1(output, target) loss2 = self.criterion2(output, ins_target) reg_lambda = 0.01 loss = loss1 + 10*loss2 # loss = loss1 output = output[1] # with amp.scale_loss(loss, self.optimizer) as scaled_loss: # scaled_loss.backward() loss.backward() self.optimizer.step() train_loss += loss.item() tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1))) if self.args.distributed: if self.args.local_rank == 0: self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) else: self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch) # Show 10 * 3 inference results each epoch if i % (num_img_tr // 10) == 0: global_step = i + num_img_tr * epoch if self.args.distributed: if self.args.local_rank == 0: self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step) else: self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step) if self.args.distributed: if self.args.local_rank == 0: self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) else: self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) if self.args.local_rank == 0: print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) # if self.args.distributed: # if self.args.local_rank == 0: # if self.args.no_val: # # save checkpoint every epoch # is_best = False # self.saver.save_checkpoint({ # 'epoch': epoch + 1, # 'state_dict': self.model.module.state_dict(), # 'optimizer': self.optimizer.state_dict(), # 'best_pred': self.best_pred, # }, is_best) # else: # if self.args.no_val: # # save checkpoint every epoch # is_best = False # self.saver.save_checkpoint({ # 'epoch': epoch + 1, # 'state_dict': self.model.module.state_dict(), # 'optimizer': self.optimizer.state_dict(), # 'best_pred': self.best_pred, # }, is_best) def validation(self, epoch): self.model.eval() self.evaluator.reset() tbar = tqdm(self.val_loader, desc='\r') test_loss = 0.0 for i, sample in enumerate(tbar): # image, target = sample['image'], sample['label'] image, target = sample['image'], sample['bin_label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): output = self.model(image) loss = self.criterion1(output, target) test_loss += loss.item() instance_seg = output[0].data.cpu().numpy() instance_seg = np.squeeze(instance_seg[0]) instance_seg = np.transpose(instance_seg, (1, 2, 0)) output = output[1] pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) if i % 30==0: misc.imsave('/mfc/user/1623600/.temp6/{:s}_val_epoch:{}_i:{}.png' .format(str(self.args.distributed),epoch,i), np.transpose(image[0].cpu().numpy(),(1,2,0))+3*np.asarray(np.stack((pred[0],pred[0],pred[0]),axis=-1),dtype=np.uint8)) os.chmod('/mfc/user/1623600/.temp6/{:s}_val_epoch:{}_i:{}.png'.format(str(self.args.distributed),epoch,i),0o777)
                temp_instance_seg = np.zeros_like(np.transpose(image[0].cpu().numpy(), (1, 2, 0)))
                # Collapse the 21 embedding channels into three colour groups.
                for j in range(21):
                    if j < 7:
                        temp_instance_seg[:, :, 0] += instance_seg[:, :, j]
                    elif j < 14:
                        temp_instance_seg[:, :, 1] += instance_seg[:, :, j]
                    else:
                        temp_instance_seg[:, :, 2] += instance_seg[:, :, j]
                for k in range(3):
                    temp_instance_seg[:, :, k] = self.minmax_scale(temp_instance_seg[:, :, k])
                instance_seg = np.array(temp_instance_seg, np.uint8)
                misc.imsave('/mfc/user/1623600/.temp6/emb_{:s}_val_epoch:{}_i:{}.png'
                            .format(str(self.args.distributed), epoch, i), instance_seg[..., :3])
                os.chmod('/mfc/user/1623600/.temp6/emb_{:s}_val_epoch:{}_i:{}.png'
                         .format(str(self.args.distributed), epoch, i), 0o777)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        F0 = self.evaluator.F0()
        if (not self.args.distributed) or self.args.local_rank == 0:
            self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
            self.writer.add_scalar('val/mIoU', mIoU, epoch)
            self.writer.add_scalar('val/Acc', Acc, epoch)
            self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        if self.args.local_rank == 0:
            print('Validation:')
            print('[Epoch: %d, numImages: %5d]' %
                  (epoch, i * self.args.batch_size + image.data.shape[0]))
            print("Acc:{}, Acc_class:{}, mIoU:{}".format(Acc, Acc_class, mIoU))
            print('Loss: %.3f' % test_loss)
        if (not self.args.distributed) or self.args.local_rank == 0:
            new_pred = F0
            if new_pred > self.best_pred:
                is_best = True
                self.best_pred = new_pred
                self.saver.save_checkpoint({
                    'epoch': epoch + 1,
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def minmax_scale(self, input_arr):
        """Linearly rescale an array to the [0, 255] range."""
        min_val = np.min(input_arr)
        max_val = np.max(input_arr)
        if max_val == min_val:
            # Guard against division by zero on constant channels.
            return np.zeros_like(input_arr)
        return (input_arr - min_val) * 255.0 / (max_val - min_val)
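# Hedged usage sketch (an assumption, not part of the original file): the
# Trainer classes in this file are typically driven by an epoch loop of this
# shape, with `args` coming from argparse. `start_epoch`, `epochs` and
# `no_val` are the attributes the trainers above already rely on.
def run_training(trainer):
    """Standard train/validate epoch loop for one of the Trainer classes above."""
    for epoch in range(trainer.args.start_epoch, trainer.args.epochs):
        trainer.training(epoch)
        if not trainer.args.no_val:
            trainer.validation(epoch)
    trainer.writer.close()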
class Trainer(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        # Approach 1: standard dataloader
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)

        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)
        train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
                        {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]

        # Define Optimizer
        optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
                                    weight_decay=args.weight_decay, nesterov=args.nesterov)
        # optimizer = torch.optim.Adam(train_params, weight_decay=args.weight_decay)

        # Define Criterion
        self.criterion = SegmentationLosses(weight=None, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.criterion1 = SegmentationLosses(weight=None, cuda=args.cuda).build_loss(mode='ce')
        self.criterion2 = SegmentationLosses(weight=None, cuda=args.cuda).build_loss(mode='dice')
        self.model, self.optimizer = model, optimizer

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                      args.epochs, len(self.train_loader))

        # Using cuda
        if args.cuda:
            # multi-GPU
            self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        train_loss = 0.0
        prev_time = time.time()
        self.model.train()
        self.evaluator.reset()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            print(image.shape)
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            if self.args.loss_type == 'diceplusce':
                loss1 = self.criterion1(output, target)
                loss2 = self.criterion2(output, make_one_hot(target.long(), num_classes=self.nclass))
                loss = loss1 + loss2
            else:
                loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()

            # Show 10 * 3 inference results each epoch
            if i % (num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output, global_step)
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
            if self.args.loss_type == 'diceplusce':
                end_time = time.time()
                tbar.set_description('train loss: %.3f, celoss: %.4f, diceloss: %.4f, time cost: %.3f s'
                                     % (train_loss / (i + 1), loss1.item(), loss2.item(), end_time - prev_time))
            else:
                tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))

        # train evaluate
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        IoU = self.evaluator.Mean_Intersection_over_Union()
        mIoU = np.nanmean(IoU)
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        print("Acc_tr:{}, Acc_class_tr:{}, IoU_tr:{}, mIoU_tr:{}, fwIoU_tr: {}".format(Acc, Acc_class, IoU, mIoU, FWIoU))
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def validation(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        val_loss = 0.0
        prev_time = time.time()
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)  # was commented out, but `output` is needed below
                if self.args.loss_type == 'diceplusce':
                    loss1 = self.criterion1(output, target)
                    loss2 = self.criterion2(output, make_one_hot(target.long(), num_classes=self.nclass))
                    loss = loss1 + loss2
                else:
                    loss = self.criterion(output, target)
            val_loss += loss.item()
            end_time = time.time()
            if self.args.loss_type == 'diceplusce':
                tbar.set_description('val loss: %.3f, celoss: %.4f, diceloss: %.4f, time cost: %.3f s'
                                     % (val_loss / (i + 1), loss1.item(), loss2.item(), end_time - prev_time))
            else:
                tbar.set_description('val loss: %.3f' % (val_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        IoU = self.evaluator.Mean_Intersection_over_Union()
        mIoU = np.nanmean(IoU)
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', val_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % val_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)
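# `make_one_hot` above is imported from elsewhere in the repo. The dice branch
# only assumes it maps (N, H, W) integer labels to (N, C, H, W) one-hot floats.
# A minimal sketch under that assumption (labels must lie in [0, num_classes)):
def make_one_hot_sketch(labels, num_classes):
    """Convert (N, H, W) integer labels to a (N, C, H, W) one-hot float tensor."""
    one_hot = torch.zeros(labels.size(0), num_classes, labels.size(1),
                          labels.size(2), device=labels.device)
    return one_hot.scatter_(1, labels.unsqueeze(1), 1.0)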
class trainNew(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        self.use_amp = True if (APEX_AVAILABLE and args.use_amp) else False
        self.opt_level = args.opt_level

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)

        cell_path_d = os.path.join(args.saved_arch_path, 'genotype_device.npy')
        cell_path_c = os.path.join(args.saved_arch_path, 'genotype_cloud.npy')
        network_path_space = os.path.join(args.saved_arch_path, 'network_path_space.npy')

        new_cell_arch_d = np.load(cell_path_d)
        new_cell_arch_c = np.load(cell_path_c)
        new_network_arch = np.load(network_path_space)
        # Override the loaded path with a hand-picked architecture.
        new_network_arch = [1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2]

        # Define network
        model = new_cloud_Model(network_arch=new_network_arch,
                                cell_arch_d=new_cell_arch_d,
                                cell_arch_c=new_cell_arch_c,
                                num_classes=self.nclass,
                                device_num_layers=6)

        # TODO: look into these
        # TODO: ALSO look into different param groups as done in deeplab below
        # train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
        #                 {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]
        # A single param group is needed for the optimizer below to work:
        train_params = [{'params': model.parameters(), 'lr': args.lr}]

        # Define Optimizer
        optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
                                    weight_decay=args.weight_decay, nesterov=args.nesterov)

        # Define Criterion
        # whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(Path.db_root_dir(args.dataset),
                                                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer

        # Define Evaluator
        self.evaluator_device = Evaluator(self.nclass)
        self.evaluator_cloud = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                      args.epochs, len(self.train_loader))  # TODO: use min_lr ?
        # Using cuda
        if args.cuda:
            # amp and DataParallel both expect the model on the GPU first.
            self.model = self.model.cuda()
        if self.use_amp and args.cuda:
            keep_batchnorm_fp32 = True if (self.opt_level == 'O2' or self.opt_level == 'O3') else None

            # fix for current pytorch version with opt_level 'O1'
            if self.opt_level == 'O1' and torch.__version__ < '1.3':
                for module in self.model.modules():
                    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
                        # Hack to fix BN fprop without affine transformation
                        if module.weight is None:
                            module.weight = torch.nn.Parameter(
                                torch.ones(module.running_var.shape,
                                           dtype=module.running_var.dtype,
                                           device=module.running_var.device),
                                requires_grad=False)
                        if module.bias is None:
                            module.bias = torch.nn.Parameter(
                                torch.zeros(module.running_var.shape,
                                            dtype=module.running_var.dtype,
                                            device=module.running_var.device),
                                requires_grad=False)

            # This class has only one optimizer, so only it is handed to amp.
            self.model, self.optimizer = amp.initialize(
                self.model, self.optimizer,
                opt_level=self.opt_level,
                keep_batchnorm_fp32=keep_batchnorm_fp32,
                loss_scale="dynamic")
            print('cuda finished')

        # Using data parallel
        if args.cuda and len(self.args.gpu_ids) > 1:
            if self.opt_level == 'O2' or self.opt_level == 'O3':
                print('currently cannot run with nn.DataParallel and optimization level', self.opt_level)
            self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            print('training on multiple-GPUs')

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']

            # if the weights are wrapped in a module object we have to clean it
            if args.clean_module:
                state_dict = checkpoint['state_dict']
                new_state_dict = OrderedDict()
                for k, v in state_dict.items():
                    name = k[7:]  # strip the 'module.' prefix added by DataParallel
                    new_state_dict[name] = v
                self.model.load_state_dict(new_state_dict)
            else:
                if (torch.cuda.device_count() > 1 or args.load_parallel):
                    self.model.module.load_state_dict(checkpoint['state_dict'])
                else:
                    self.model.load_state_dict(checkpoint['state_dict'])

            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            device_output, cloud_output = self.model(image)
            device_loss = self.criterion(device_output, target)
            cloud_loss = self.criterion(cloud_output, target)
            loss = device_loss + cloud_loss
            if self.use_amp:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            if i % 50 == 0:
                self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch)

            # Show 10 * 3 inference results each epoch
            if i % 100 == 0:
                output = (device_output + cloud_output) / 2
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output, global_step)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def validation(self, epoch):
        self.model.eval()
        self.evaluator_device.reset()
        self.evaluator_cloud.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                device_output, cloud_output = self.model(image)
            device_loss = self.criterion(device_output, target)
            cloud_loss = self.criterion(cloud_output, target)
            loss = device_loss + cloud_loss
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred_d = device_output.data.cpu().numpy()
            pred_c = cloud_output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred_d = np.argmax(pred_d, axis=1)
            pred_c = np.argmax(pred_c, axis=1)
            # Add batch sample into evaluator
            self.evaluator_device.add_batch(target, pred_d)
            self.evaluator_cloud.add_batch(target, pred_c)

        mIoU_d = self.evaluator_device.Mean_Intersection_over_Union()
        mIoU_c = self.evaluator_cloud.Mean_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/device/mIoU', mIoU_d, epoch)
        self.writer.add_scalar('val/cloud/mIoU', mIoU_c, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("device_mIoU:{}, cloud_mIoU: {}".format(mIoU_d, mIoU_c))
        print('Loss: %.3f' % test_loss)

        new_pred = (mIoU_d + mIoU_c) / 2
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)
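# Hedged sketch (an assumption, not repo code): the device/cloud trainer above
# scores both heads separately; at inference one simple option is to ensemble
# them by averaging per-class probabilities before taking the argmax.
def ensemble_prediction(device_output, cloud_output):
    """Average the two heads' softmax probabilities and return hard labels."""
    probs = (torch.nn.functional.softmax(device_output, dim=1) +
             torch.nn.functional.softmax(cloud_output, dim=1)) / 2
    return probs.argmax(dim=1)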
class Trainer(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        self.use_amp = True if (APEX_AVAILABLE and args.use_amp) else False
        self.opt_level = args.opt_level

        kwargs = {'num_workers': args.workers, 'pin_memory': True, 'drop_last': True}
        self.train_loaderA, self.train_loaderB, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)

        if args.use_balanced_weights:
            classes_weights_path = os.path.join(Path.db_root_dir(args.dataset),
                                                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                raise NotImplementedError
                # if so, which train loader to use?
                # weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)

        # Define network
        model = AutoDeeplab(self.nclass, 12, self.criterion,
                            self.args.filter_multiplier,
                            self.args.block_multiplier,
                            self.args.step)
        optimizer = torch.optim.SGD(model.weight_parameters(), args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        self.model, self.optimizer = model, optimizer

        self.architect_optimizer = torch.optim.Adam(self.model.arch_parameters(),
                                                    lr=args.arch_lr, betas=(0.9, 0.999),
                                                    weight_decay=args.arch_weight_decay)

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                      args.epochs, len(self.train_loaderA), min_lr=args.min_lr)
        # TODO: Figure out if len(self.train_loader) should be divided by two
        # in other modules as well
        # Using cuda
        if args.cuda:
            self.model = self.model.cuda()

        # mixed precision
        if self.use_amp and args.cuda:
            keep_batchnorm_fp32 = True if (self.opt_level == 'O2' or self.opt_level == 'O3') else None

            # fix for current pytorch version with opt_level 'O1'
            if self.opt_level == 'O1' and torch.__version__ < '1.3':
                for module in self.model.modules():
                    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
                        # Hack to fix BN fprop without affine transformation
                        if module.weight is None:
                            module.weight = torch.nn.Parameter(
                                torch.ones(module.running_var.shape,
                                           dtype=module.running_var.dtype,
                                           device=module.running_var.device),
                                requires_grad=False)
                        if module.bias is None:
                            module.bias = torch.nn.Parameter(
                                torch.zeros(module.running_var.shape,
                                            dtype=module.running_var.dtype,
                                            device=module.running_var.device),
                                requires_grad=False)

            # print(keep_batchnorm_fp32)
            self.model, [self.optimizer, self.architect_optimizer] = amp.initialize(
                self.model, [self.optimizer, self.architect_optimizer],
                opt_level=self.opt_level,
                keep_batchnorm_fp32=keep_batchnorm_fp32,
                loss_scale="dynamic")
            print('cuda finished')

        # Using data parallel
        if args.cuda and len(self.args.gpu_ids) > 1:
            if self.opt_level == 'O2' or self.opt_level == 'O3':
                print('currently cannot run with nn.DataParallel and optimization level', self.opt_level)
            self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            print('training on multiple-GPUs')

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']

            # if the weights are wrapped in a module object we have to clean it
            if args.clean_module:
                state_dict = checkpoint['state_dict']
                new_state_dict = OrderedDict()
                for k, v in state_dict.items():
                    name = k[7:]  # strip the 'module.' prefix added by DataParallel
                    new_state_dict[name] = v
                # self.model.load_state_dict(new_state_dict)
                copy_state_dict(self.model.state_dict(), new_state_dict)
            else:
                if torch.cuda.device_count() > 1 or args.load_parallel:
                    # self.model.module.load_state_dict(checkpoint['state_dict'])
                    copy_state_dict(self.model.module.state_dict(), checkpoint['state_dict'])
                else:
                    # self.model.load_state_dict(checkpoint['state_dict'])
                    copy_state_dict(self.model.state_dict(), checkpoint['state_dict'])

            if not args.ft:
                # self.optimizer.load_state_dict(checkpoint['optimizer'])
                copy_state_dict(self.optimizer.state_dict(), checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loaderA)
        num_img_tr = len(self.train_loaderA)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            loss = self.criterion(output, target)
            if self.use_amp:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            self.optimizer.step()

            if epoch >= self.args.alpha_epoch:
                # NOTE: next(iter(loader)) builds a fresh iterator every step,
                # which is wasteful; it yields one (shuffled) batch for the
                # architecture update.
                search = next(iter(self.train_loaderB))
                image_search, target_search = search['image'], search['label']
                if self.args.cuda:
                    image_search, target_search = image_search.cuda(), target_search.cuda()
                self.architect_optimizer.zero_grad()
                output_search = self.model(image_search)
                arch_loss = self.criterion(output_search, target_search)
                if self.use_amp:
                    with amp.scale_loss(arch_loss, self.architect_optimizer) as arch_scaled_loss:
                        arch_scaled_loss.backward()
                else:
                    arch_loss.backward()
                self.architect_optimizer.step()

            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            # self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch)

            # Show 10 * 3 inference results each epoch
            if i % (num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output, global_step)
            # torch.cuda.empty_cache()

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            if torch.cuda.device_count() > 1:
                state_dict = self.model.module.state_dict()
            else:
                state_dict = self.model.state_dict()
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': state_dict,
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def validation(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            if torch.cuda.device_count() > 1:
                state_dict = self.model.module.state_dict()
            else:
                state_dict = self.model.state_dict()
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': state_dict,
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)
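# `copy_state_dict` above is imported from elsewhere in the repo. The resume
# logic only assumes it copies matching entries from a checkpoint into an
# existing model state dict while tolerating missing or renamed keys. A rough
# sketch of that assumed behaviour, not the repo's implementation:
def copy_state_dict_sketch(target_state, source_state):
    """Copy tensors whose name and shape match from source_state into target_state."""
    for name, tensor in source_state.items():
        if name in target_state and target_state[name].shape == tensor.shape:
            target_state[name].copy_(tensor)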
class Trainer(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        # kwargs = {'num_workers': args.workers, 'pin_memory': True}
        kwargs = {'num_workers': 0, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)

        if args.nir:
            input_channels = 4
        else:
            input_channels = 3

        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        in_channels=input_channels,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)
        train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
                        {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]

        # Define Optimizer
        optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
                                    weight_decay=args.weight_decay, nesterov=args.nesterov)

        # Define Criterion
        # whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(Path.db_root_dir(args.dataset),
                                                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
                # Manual per-class reweighting override.
                weight[1] = 4
                weight[2] = 2
                weight[0] = 1
            else:
                weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                      args.epochs, len(self.train_loader))

        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch)

            # Show 10 * 3 inference results each epoch
            if i % (num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                # place_holder_target = target
                # place_holder_output = output
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output, global_step)
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def pred_single_image(self, path):
        self.model.eval()
        img_path = path
        lbl_path = os.path.join(os.path.split(os.path.split(path)[0])[0], 'lbl',
                                os.path.split(path)[1])
        activations = collections.defaultdict(list)

        def save_activation(name, mod, input, output):
            activations[name].append(output.cpu())

        # NOTE: these hooks are never removed, so repeated calls keep
        # accumulating activations.
        for name, m in self.model.named_modules():
            if type(m) == nn.ReLU:
                m.register_forward_hook(partial(save_activation, name))

        input = cv2.imread(path)
        label = cv2.imread(lbl_path)
        input = cv2.resize(input, (513, 513), interpolation=cv2.INTER_CUBIC)

        image = Image.open(img_path).convert('RGB')  # width x height x 3
        # _tmp = np.array(Image.open(lbl_path), dtype=np.uint8)
        _tmp = np.array(Image.open(img_path), dtype=np.uint8)
        _tmp[_tmp == 255] = 1
        _tmp[_tmp == 0] = 0
        _tmp[_tmp == 128] = 2
        _tmp = Image.fromarray(_tmp)

        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
        composed_transforms = transforms.Compose([
            tr.FixedResize(size=513),
            tr.Normalize(mean=mean, std=std),
            tr.ToTensor()
        ])
        sample = {'image': image, 'label': _tmp}
        sample = composed_transforms(sample)
        image, target = sample['image'], sample['label']
        image = torch.unsqueeze(image, dim=0)
        if self.args.cuda:
            image, target = image.cuda(), target.cuda()
        with torch.no_grad():
            output = self.model(image)

        output = output.data.cpu().numpy()
        prediction = np.argmax(output, axis=1)
        prediction = np.squeeze(prediction, axis=0)
        # Map class ids to display intensities (map 2 before 1 so the
        # remapped values never collide).
        prediction[prediction == 2] = 128
        prediction[prediction == 1] = 255
        print(np.unique(prediction))

        see = Analysis(activations, label=1, path=self.saver.experiment_dir)
        see.backtrace(output)
        cv2.imwrite(os.path.join(self.saver.experiment_dir, 'rgb.png'), input)
        cv2.imwrite(os.path.join(self.saver.experiment_dir, 'lbl.png'), label)
        cv2.imwrite(os.path.join(self.saver.experiment_dir, 'prediction.png'), prediction)
    def validation(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def testing(self):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.test_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
            prediction = np.append(target, pred, axis=2)
            print(pred.shape)
            input = image[0, 0:3, :, :].cpu().numpy().transpose([1, 2, 0])

        # Fast test during the testing
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        print('[INFO] Network performance measures on the test dataset are as follows: \n '
              'mIOU: {} \n FWIOU: {} \n Class accuracy: {} \n Pixel Accuracy: {}'
              .format(mIoU, FWIoU, Acc_class, Acc))
        self.evaluator.per_class_accuracy()

    def explain_image(self, path, counter):
        self.model.eval()
        img_path = path
        lbl_path = os.path.join(os.path.split(os.path.split(path)[0])[0], 'lbl',
                                os.path.split(path)[1])
        image = Image.open(img_path).convert('RGB')  # width x height x 3
        _tmp = np.array(Image.open(lbl_path), dtype=np.uint8)
        _tmp[_tmp == 255] = 1
        _tmp[_tmp == 0] = 0
        _tmp[_tmp == 128] = 2
        _tmp = Image.fromarray(_tmp)

        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
        composed_transforms = transforms.Compose([
            tr.FixedResize(size=513),
            tr.Normalize(mean=mean, std=std),
            tr.ToTensor()
        ])
        sample = {'image': image, 'label': _tmp}
        sample = composed_transforms(sample)
        image, target = sample['image'], sample['label']
        image = torch.unsqueeze(image, dim=0)
        # The LRP / guided-backprop investigation below is currently disabled.
        # if self.args.cuda:
        #     image, target = image.cuda(), target.cuda()
        # with torch.no_grad():
        #     output = self.model(image)
        # inn_model = InnvestigateModel(self.model, lrp_exponent=1,
        #                               method="b-rule",
        #                               beta=0, epsilon=1e-6)
        # inn_model.eval()
        # model_prediction, heatmap = inn_model.innvestigate(in_tensor=image)
        # model_prediction = np.argmax(model_prediction, axis=1)
        # def run_guided_backprop(net, image_tensor):
        #     return interpretation.guided_backprop(net, image_tensor, cuda=True,
        #                                           verbose=False, apply_softmax=False)
        # def run_LRP(net, image_tensor):
        #     return inn_model.innvestigate(in_tensor=image_tensor, rel_for_class=1)
        print('hold')
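# pred_single_image above registers a forward hook on every ReLU but never
# removes it, so repeated calls keep accumulating hooks. A hedged sketch of
# the usual remedy (an assumed helper, not repo code): keep the handles and
# detach them once the forward pass has run.
def capture_relu_activations(model, image):
    """Run one forward pass and return {name: activation}, removing hooks afterwards."""
    activations, handles = {}, []
    for name, m in model.named_modules():
        if isinstance(m, nn.ReLU):
            handles.append(m.register_forward_hook(
                lambda mod, inp, out, name=name: activations.setdefault(name, out.detach().cpu())))
    try:
        with torch.no_grad():
            model(image)
    finally:
        for h in handles:
            h.remove()
    return activations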
class trainNew(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)

        cell_path = os.path.join(args.saved_arch_path, 'genotype.npy')
        network_path_space = os.path.join(args.saved_arch_path, 'network_path_space.npy')
        new_cell_arch = np.load(cell_path)
        new_network_arch = np.load(network_path_space)

        # Define network
        model = newModel(network_arch=new_network_arch,
                         cell_arch=new_cell_arch,
                         num_classes=self.nclass,
                         num_layers=12)
        # output_stride=args.out_stride,
        # sync_bn=args.sync_bn,
        # freeze_bn=args.freeze_bn)
        self.decoder = Decoder(self.nclass, 'autodeeplab', args, False)

        # TODO: look into these
        # TODO: ALSO look into different param groups as done in deeplab below
        # train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
        #                 {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]
        # A single param group is needed for the optimizer below to work.
        # NOTE: the decoder's parameters are not included, so this optimizer
        # never updates them.
        train_params = [{'params': model.parameters(), 'lr': args.lr}]

        # Define Optimizer
        optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
                                    weight_decay=args.weight_decay, nesterov=args.nesterov)

        # Define Criterion
        # whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(Path.db_root_dir(args.dataset),
                                                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                      args.epochs, len(self.train_loader))  # TODO: use min_lr ?
        # TODO: Figure out if len(self.train_loader) should be divided by two in other modules as well

        # Using cuda
        if args.cuda:
            if (torch.cuda.device_count() > 1 or args.load_parallel):
                self.model = torch.nn.DataParallel(self.model.cuda())
                patch_replication_callback(self.model)
            self.model = self.model.cuda()
            print('cuda finished')

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']

            # if the weights are wrapped in a module object we have to clean it
            if args.clean_module:
                state_dict = checkpoint['state_dict']
                new_state_dict = OrderedDict()
                for k, v in state_dict.items():
                    name = k[7:]  # strip the 'module.' prefix added by DataParallel
                    new_state_dict[name] = v
                self.model.load_state_dict(new_state_dict)
            else:
                if (torch.cuda.device_count() > 1 or args.load_parallel):
                    self.model.module.load_state_dict(checkpoint['state_dict'])
                else:
                    self.model.load_state_dict(checkpoint['state_dict'])

            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            encoder_output, low_level_feature = self.model(image)
            output = self.decoder(encoder_output, low_level_feature)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch)

            # Show 10 * 3 inference results each epoch
            if i % (num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output, global_step)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def validation(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                encoder_output, low_level_feature = self.model(image)
                output = self.decoder(encoder_output, low_level_feature)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)
class Trainer(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)

        # Define network
        if args.sync_bn:
            BN = SynchronizedBatchNorm2d
        else:
            BN = nn.BatchNorm2d
        ### deeplabV3 start ###
        self.backbone_model = MobileNetV2(output_stride=args.out_stride, BatchNorm=BN)
        self.assp_model = ASPP(backbone=args.backbone, output_stride=args.out_stride, BatchNorm=BN)
        self.y_model = Decoder(num_classes=self.nclass, backbone=args.backbone, BatchNorm=BN)
        ### deeplabV3 end ###
        self.d_model = DomainClassifer(backbone=args.backbone, BatchNorm=BN)
        f_params = list(self.backbone_model.parameters()) + list(self.assp_model.parameters())
        y_params = list(self.y_model.parameters())
        d_params = list(self.d_model.parameters())

        # Define Optimizer
        if args.optimizer == 'SGD':
            self.task_optimizer = torch.optim.SGD(f_params + y_params, lr=args.lr,
                                                  momentum=args.momentum,
                                                  weight_decay=args.weight_decay,
                                                  nesterov=args.nesterov)
            self.d_optimizer = torch.optim.SGD(d_params, lr=args.lr,
                                               momentum=args.momentum,
                                               weight_decay=args.weight_decay,
                                               nesterov=args.nesterov)
            self.d_inv_optimizer = torch.optim.SGD(f_params, lr=args.lr,
                                                   momentum=args.momentum,
                                                   weight_decay=args.weight_decay,
                                                   nesterov=args.nesterov)
            self.c_optimizer = torch.optim.SGD(f_params + y_params, lr=args.lr,
                                               momentum=args.momentum,
                                               weight_decay=args.weight_decay,
                                               nesterov=args.nesterov)
        elif args.optimizer == 'Adam':
            self.task_optimizer = torch.optim.Adam(f_params + y_params, lr=args.lr)
            self.d_optimizer = torch.optim.Adam(d_params, lr=args.lr)
            self.d_inv_optimizer = torch.optim.Adam(f_params, lr=args.lr)
            self.c_optimizer = torch.optim.Adam(f_params + y_params, lr=args.lr)
        else:
            raise NotImplementedError

        # Define Criterion
        # whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = 'dataloders\\datasets\\' + args.dataset + '_classes_weights.npy'
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(self.train_loader, self.nclass,
                                                  classes_weights_path, self.args.dataset)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.task_loss = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.domain_loss = DomainLosses(cuda=args.cuda).build_loss()
        self.ca_loss = ''

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                      args.epochs, len(self.train_loader))

        # Using cuda
        if args.cuda:
            self.backbone_model = torch.nn.DataParallel(self.backbone_model, device_ids=self.args.gpu_ids)
            self.assp_model = torch.nn.DataParallel(self.assp_model, device_ids=self.args.gpu_ids)
            self.y_model = torch.nn.DataParallel(self.y_model, device_ids=self.args.gpu_ids)
            self.d_model = torch.nn.DataParallel(self.d_model, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.backbone_model)
            patch_replication_callback(self.assp_model)
            patch_replication_callback(self.y_model)
            patch_replication_callback(self.d_model)
            self.backbone_model = self.backbone_model.cuda()
            self.assp_model = self.assp_model.cuda()
            self.y_model = self.y_model.cuda()
            self.d_model = self.d_model.cuda()

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.backbone_model.module.load_state_dict(checkpoint['backbone_model_state_dict'])
                self.assp_model.module.load_state_dict(checkpoint['assp_model_state_dict'])
                self.y_model.module.load_state_dict(checkpoint['y_model_state_dict'])
                self.d_model.module.load_state_dict(checkpoint['d_model_state_dict'])
            else:
                self.backbone_model.load_state_dict(checkpoint['backbone_model_state_dict'])
                self.assp_model.load_state_dict(checkpoint['assp_model_state_dict'])
                self.y_model.load_state_dict(checkpoint['y_model_state_dict'])
                self.d_model.load_state_dict(checkpoint['d_model_state_dict'])
            if not args.ft:
                self.task_optimizer.load_state_dict(checkpoint['task_optimizer'])
                self.d_optimizer.load_state_dict(checkpoint['d_optimizer'])
                self.d_inv_optimizer.load_state_dict(checkpoint['d_inv_optimizer'])
                self.c_optimizer.load_state_dict(checkpoint['c_optimizer'])
            if self.args.dataset == 'gtav':
                self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        train_loss = 0.0
        train_task_loss = 0.0
        train_d_loss = 0.0
        train_d_inv_loss = 0.0
        self.backbone_model.train()
        self.assp_model.train()
        self.y_model.train()
        self.d_model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            if self.args.dataset == 'gtav':
                src_image, src_label = sample['image'], sample['label']
            else:
                src_image, src_label, tgt_image = sample['src_image'], sample['src_label'], sample['tgt_image']
            if self.args.cuda:
                if self.args.dataset != 'gtav':
                    src_image, src_label, tgt_image = src_image.cuda(), src_label.cuda(), tgt_image.cuda()
                else:
                    src_image, src_label = src_image.cuda(), src_label.cuda()
            self.scheduler(self.task_optimizer, i, epoch, self.best_pred)
            self.scheduler(self.d_optimizer, i, epoch, self.best_pred)
            self.scheduler(self.d_inv_optimizer, i, epoch, self.best_pred)
            self.scheduler(self.c_optimizer, i, epoch, self.best_pred)
            self.task_optimizer.zero_grad()
            self.d_optimizer.zero_grad()
            self.d_inv_optimizer.zero_grad()
            self.c_optimizer.zero_grad()

            # source image feature
            src_high_feature_0, src_low_feature = self.backbone_model(src_image)
            src_high_feature = self.assp_model(src_high_feature_0)
            src_output = F.interpolate(self.y_model(src_high_feature, src_low_feature),
                                       src_image.size()[2:], mode='bilinear', align_corners=True)
            src_d_pred = self.d_model(src_high_feature)
            task_loss = self.task_loss(src_output, src_label)

            if self.args.dataset != 'gtav':
                # target image feature
                tgt_high_feature_0, tgt_low_feature = self.backbone_model(tgt_image)
                tgt_high_feature = self.assp_model(tgt_high_feature_0)
                tgt_output = F.interpolate(self.y_model(tgt_high_feature, tgt_low_feature),
                                           tgt_image.size()[2:], mode='bilinear', align_corners=True)
                tgt_d_pred = self.d_model(tgt_high_feature)

                d_loss, d_acc = self.domain_loss(src_d_pred, tgt_d_pred)
                d_inv_loss, _ = self.domain_loss(tgt_d_pred, src_d_pred)
                loss = task_loss + d_loss + d_inv_loss
                loss.backward()
                self.task_optimizer.step()
                self.d_optimizer.step()
                self.d_inv_optimizer.step()
            else:
                d_acc = 0
                d_loss = torch.tensor(0.0)
                d_inv_loss = torch.tensor(0.0)
                loss = task_loss
                loss.backward()
                self.task_optimizer.step()

            train_task_loss += task_loss.item()
            train_d_loss += d_loss.item()
            train_d_inv_loss += d_inv_loss.item()
            train_loss += task_loss.item() + d_loss.item() + d_inv_loss.item()
            tbar.set_description('Train loss: %.3f t_loss: %.3f d_loss: %.3f , d_inv_loss: %.3f d_acc: %.2f'
                                 % (train_loss / (i + 1), train_task_loss / (i + 1),
                                    train_d_loss / (i + 1), train_d_inv_loss / (i + 1), d_acc * 100))
            self.writer.add_scalar('train/task_loss_iter', task_loss.item(), i + num_img_tr * epoch)

            # Show 10 * 3 inference results each epoch
            if i % (num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                if self.args.dataset != 'gtav':
                    image = torch.cat([src_image, tgt_image], dim=0)
                    output = torch.cat([src_output, tgt_output], dim=0)
                else:
                    image = src_image
                    output = src_output
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, src_label, output, global_step)

        self.writer.add_scalar('train/task_loss_epoch', train_task_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + src_image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'backbone_model_state_dict': self.backbone_model.module.state_dict(),
                'assp_model_state_dict': self.assp_model.module.state_dict(),
                'y_model_state_dict': self.y_model.module.state_dict(),
                'd_model_state_dict': self.d_model.module.state_dict(),
                'task_optimizer': self.task_optimizer.state_dict(),
                'd_optimizer': self.d_optimizer.state_dict(),
                'd_inv_optimizer': self.d_inv_optimizer.state_dict(),
                'c_optimizer': self.c_optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def validation(self, epoch):
        self.backbone_model.eval()
        self.assp_model.eval()
        self.y_model.eval()
        self.d_model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                high_feature, low_feature = self.backbone_model(image)
                high_feature = self.assp_model(high_feature)
                output = F.interpolate(self.y_model(high_feature, low_feature),
                                       image.size()[2:], mode='bilinear', align_corners=True)
            task_loss = self.task_loss(output, target)
            test_loss += task_loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU, IoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'backbone_model_state_dict': self.backbone_model.module.state_dict(),
                'assp_model_state_dict': self.assp_model.module.state_dict(),
                'y_model_state_dict': self.y_model.module.state_dict(),
                'd_model_state_dict': self.d_model.module.state_dict(),
                'task_optimizer': self.task_optimizer.state_dict(),
                'd_optimizer': self.d_optimizer.state_dict(),
                'd_inv_optimizer': self.d_inv_optimizer.state_dict(),
                'c_optimizer': self.c_optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)
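# `DomainLosses` above is defined elsewhere in the repo; the trainer only
# assumes its loss maps (source, target) domain-classifier logits to a scalar
# loss plus a domain-classification accuracy, and that swapping the arguments
# yields the inverted (confusion) loss. A hedged sketch of one common
# instantiation, binary cross-entropy with source labelled 1 and target 0:
def domain_bce_loss_sketch(src_d_pred, tgt_d_pred):
    """Return (loss, accuracy) for a binary source-vs-target domain classifier."""
    logits = torch.cat([src_d_pred, tgt_d_pred])
    labels = torch.cat([torch.ones_like(src_d_pred), torch.zeros_like(tgt_d_pred)])
    loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, labels)
    acc = ((logits > 0).float() == labels).float().mean()
    return loss, acc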
class Trainer(object):
    def __init__(self, args):
        self.args = args
        self.train_dir = './data_list/train_lite.csv'
        self.train_list = pd.read_csv(self.train_dir)
        self.val_dir = './data_list/val_lite.csv'
        self.val_list = pd.read_csv(self.val_dir)
        self.train_length = len(self.train_list)
        self.val_length = len(self.val_list)

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Approach 2: generator-based loaders
        self.train_gen, self.val_gen, self.test_gen, self.nclass = make_data_loader2(args)

        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)
        train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
                        {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]

        # Define Optimizer
        optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
                                    weight_decay=args.weight_decay, nesterov=args.nesterov)
        # optimizer = torch.optim.Adam(train_params, weight_decay=args.weight_decay)

        # Define Criterion
        # self.criterion = SegmentationLosses(weight=None, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.criterion1 = SegmentationLosses(weight=None, cuda=args.cuda).build_loss(mode='ce')
        self.criterion2 = SegmentationLosses(weight=None, cuda=args.cuda).build_loss(mode='dice')
        self.model, self.optimizer = model, optimizer

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                      args.epochs, self.train_length)

        # Using cuda
        if args.cuda:
            self.model = self.model.cuda()

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                # self.model.module.load_state_dict(checkpoint['state_dict'])
                self.model.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        train_loss = 0.0
        prev_time = time.time()
        self.model.train()
        self.evaluator.reset()
        num_img_tr = self.train_length / self.args.batch_size
        for iteration in range(int(num_img_tr)):
            samples = next(self.train_gen)
            image, target = samples['image'], samples['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, iteration, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            loss1 = self.criterion1(output, target)
            loss2 = self.criterion2(output, make_one_hot(target.long(), num_classes=self.nclass))
            loss = loss1 + loss2
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   iteration + num_img_tr * epoch)

            # print a log line every 4 iterations (default log_iters = 4)
            if iteration % 4 == 0:
                end_time = time.time()
                print("Iter - %d: train loss: %.3f, celoss: %.4f, diceloss: %.4f, time cost: %.3f s"
                      % (iteration, loss.item(), loss1.item(), loss2.item(), end_time - prev_time))
                prev_time = time.time()

            # Show 10 * 3 inference results each epoch
            if iteration % (num_img_tr // 10) == 0:
                global_step = iteration + num_img_tr * epoch
num_img_tr * epoch self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step) pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) print("input image shape/iter:", image.shape) # train evaluate Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() IoU = self.evaluator.Mean_Intersection_over_Union() mIoU = np.nanmean(IoU) FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() print("Acc_tr:{}, Acc_class_tr:{}, IoU_tr:{}, mIoU_tr:{}, fwIoU_tr: {}".format(Acc, Acc_class, IoU, mIoU, FWIoU)) self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch) print('[Epoch: %d, numImages: %5d]' % (epoch, iteration * self.args.batch_size + image.data.shape[0])) print('Loss: %.3f' % train_loss) if self.args.no_val: # save checkpoint every epoch is_best = False self.saver.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': self.model.module.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best) def validation(self, epoch): self.model.eval() self.evaluator.reset() val_loss = 0.0 prev_time = time.time() num_img_val = self.val_length / self.args.batch_size print("Validation:","epoch ", epoch) print(num_img_val) for iteration in range(int(num_img_val)): samples = next(self.val_gen) image, target = samples['image'], samples['label'] if self.args.cuda: image, target = image.cuda(), target.cuda() with torch.no_grad(): # output = self.model(image) loss1 = self.criterion1(output, target) loss2 = self.criterion2(output, make_one_hot(target.long(), num_classes=self.nclass)) loss = loss1 + loss2 val_loss += loss.item() self.writer.add_scalar('val/total_loss_iter', loss.item(), iteration + num_img_val * epoch) val_loss += loss.item() # print log 默认log_iters = 4 if iteration % 4 == 0: end_time = time.time() print("Iter - %d: validation loss: %.3f, celoss: %.4f, diceloss: %.4f, time cost: %.3f s" \ % (iteration, loss.item(), loss1.item(), loss2.item(), end_time - prev_time)) prev_time = time.time() pred = output.data.cpu().numpy() target = target.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator self.evaluator.add_batch(target, pred) print(image.shape) # Fast test during the training Acc = self.evaluator.Pixel_Accuracy() Acc_class = self.evaluator.Pixel_Accuracy_Class() IoU = self.evaluator.Mean_Intersection_over_Union() mIoU = np.nanmean(IoU) FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union() self.writer.add_scalar('val/total_loss_epoch', val_loss, epoch) self.writer.add_scalar('val/mIoU', mIoU, epoch) self.writer.add_scalar('val/Acc', Acc, epoch) self.writer.add_scalar('val/Acc_class', Acc_class, epoch) self.writer.add_scalar('val/fwIoU', FWIoU, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, iteration * self.args.batch_size + image.data.shape[0])) print("Acc_val:{}, Acc_class_val:{}, IoU:val:{}, mIoU_val:{}, fwIoU_val: {}".format(Acc, Acc_class, IoU, mIoU, FWIoU)) print('Loss: %.3f' % val_loss) new_pred = mIoU if new_pred > self.best_pred: is_best = True self.best_pred = new_pred self.saver.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': self.model.state_dict(), 'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred, }, is_best)
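# The Dice criterion above consumes one-hot targets via make_one_hot, whose
# definition is not included in this file. Below is a minimal sketch of what
# such a helper is assumed to do, matching the call site
# make_one_hot(target.long(), num_classes=self.nclass); the author's actual
# implementation may differ.
import torch

def make_one_hot(labels, num_classes):
    """Convert an index tensor of shape (N, H, W) or (N, 1, H, W) into a
    one-hot float tensor of shape (N, num_classes, H, W)."""
    if labels.dim() == 3:
        labels = labels.unsqueeze(1)  # (N, H, W) -> (N, 1, H, W)
    one_hot = torch.zeros(labels.size(0), num_classes,
                          labels.size(2), labels.size(3),
                          dtype=torch.float32, device=labels.device)
    # Write a 1.0 at each pixel's class index along the channel dimension
    return one_hot.scatter_(1, labels.long(), 1.0)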
# Trainer variant: model and optimizer are built by a segModel wrapper, and
# both the model output and the loss may be tuples (e.g. CE + triplet terms).
class Trainer(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()

        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(
            args, **kwargs)

        # Define network
        net = segModel(self.args, self.nclass)
        net.build_model()
        model = net.model
        optimizer = net.optimizer

        # Whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset, self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
            print('weight', weight)
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)

        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader))

        # Using cuda
        if args.cuda:
            # self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
            self.model = torch.nn.DataParallel(self.model)
            # Patch DataParallel so SyncBN replicas can exchange statistics
            patch_replication_callback(self.model)
            self.model = self.model.cuda()

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume,
                                                                checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            if isinstance(output, tuple):
                # The model also returned features (used e.g. by a triplet term);
                # keep only the class scores for visualization and evaluation.
                loss = self.criterion(output, target, epoch)
                output = output[0]
            else:
                loss = self.criterion(output, target)
            if isinstance(loss, tuple):
                # The criterion returned several values: (total, ce, triplet)
                loss_sum, loss1, loss2 = loss
                self.writer.add_scalars('train/indi_loss_iter',
                                        {'ce': loss1, 'triplet': loss2},
                                        i + num_img_tr * epoch)
                self.writer.add_scalar('train/total_loss_iter', loss_sum.item(),
                                       i + num_img_tr * epoch)
                loss_sum.backward()
                self.optimizer.step()
                train_loss += loss_sum.item()
            else:
                loss.backward()
                self.optimizer.step()
                train_loss += loss.item()
                self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                       i + num_img_tr * epoch)
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))

            # Show 10 * 3 inference results each epoch
            if i % (num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                if self.args.dataset != 'brain':
                    self.summary.visualize_image(self.writer, self.args.dataset,
                                                 image, target, output, global_step)
                else:
                    self.summary.visualize_image_four(self.writer, self.args.dataset,
                                                      image, target, output,
                                                      global_step, True)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        self.writer.add_scalar('train/lr', self.optimizer.param_groups[0]['lr'], epoch)
        print('[Epoch: %d, numImages: %5d]' % (
            epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def validation(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        num_img_val = len(self.val_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            if isinstance(output, tuple):
                # tuple output -> criterion that also takes the epoch (triplet loss)
                loss = self.criterion(output, target, epoch)
                output = output[0]
            else:
                loss = self.criterion(output, target)
            if isinstance(loss, tuple):
                # tuple loss (e.g. ce_dice) -> keep only the total term
                loss = loss[0]
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))

            # Show inference results
            if i % (num_img_val // 10) == 0:
                global_step = i + num_img_val * epoch
                if self.args.dataset != 'brain':
                    self.summary.visualize_image(self.writer, self.args.dataset,
                                                 image, target, output, global_step)
                else:
                    self.summary.visualize_image_four(self.writer, self.args.dataset,
                                                      image, target, output, global_step)

            # Add batch sample into evaluator
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' % (
            epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        is_best = new_pred > self.best_pred
        # Only consider saving every 10 epochs (epochs 9, 19, 29, ...)
        if (epoch + 1) % 10 == 0 and is_best:
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)
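# For context, a minimal sketch of the driver loop these Trainer variants are
# typically run from; argument parsing is elided, and only fields already
# referenced above (start_epoch, epochs, no_val) are assumed to exist on args.
def main(args):
    trainer = Trainer(args)
    print('Starting Epoch:', trainer.args.start_epoch)
    print('Total Epoches:', trainer.args.epochs)
    for epoch in range(trainer.args.start_epoch, trainer.args.epochs):
        trainer.training(epoch)
        if not trainer.args.no_val:
            trainer.validation(epoch)
    trainer.writer.close()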