def plot_lr(optim, args, hyper):
    lr_hist = []
    batch_size = hyper.batch_size
    # dataset_size is assumed to be defined at module level
    n_per_epoch = int(dataset_size / batch_size)
    print("number of iterations per epoch: {}".format(n_per_epoch))
    start_epoch = args.start_epoch - 1
    end_epoch = start_epoch + args.epochs
    for epoch in range(start_epoch, end_epoch):
        for i in range(n_per_epoch):
            niter = epoch * n_per_epoch + i
            lr = adjust_learning_rate(optim, niter, hyper)
            lr_hist.append(lr)
    index = list(range(n_per_epoch * args.epochs))
    plt.plot(index, lr_hist)
    plt.show()
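The call above relies on an `adjust_learning_rate` helper that is not shown here. A minimal sketch, assuming `hyper` exposes `base_lr`, `warmup_iters`, `decay_iters`, and `decay_rate` (all hypothetical attribute names), of an iteration-indexed schedule compatible with `adjust_learning_rate(optim, niter, hyper)`:

def adjust_learning_rate(optim, niter, hyper):
    # Hypothetical sketch: linear warmup followed by step decay.
    # hyper.base_lr, hyper.warmup_iters, hyper.decay_iters and hyper.decay_rate
    # are assumed attributes, not taken from the original code.
    if niter < hyper.warmup_iters:
        lr = hyper.base_lr * (niter + 1) / hyper.warmup_iters
    else:
        steps = (niter - hyper.warmup_iters) // hyper.decay_iters
        lr = hyper.base_lr * (hyper.decay_rate ** steps)
    for group in optim.param_groups:
        group["lr"] = lr
    return lr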
if N_batches is not None:
    train_loader = data.DataLoader(train_ds, batch_size=args.bs,
                                   num_workers=args.n_threads,
                                   sampler=LimitedRandomSampler(train_ds, N_batches, args.bs))
else:
    train_loader = data.DataLoader(train_ds, batch_size=args.bs,
                                   num_workers=args.n_threads, shuffle=True)

print(colored('==> ', 'blue') + 'Epoch:', epoch + 1, cur_snapshot)
# Adjusting learning rate using the scheduler
optimizer, cur_lr = adjust_learning_rate(optimizer, epoch + 1, args)
print(colored('==> ', 'red') + 'LR:', cur_lr)
# Train one epoch and measure the time
start = time.time()
train_loss = train_epoch(epoch, net, optimizer, train_loader, criterion, args.n_epoch)
epoch_time = np.round(time.time() - start, 4)
print(colored('==> ', 'green') + 'Epoch training time: {} s.'.format(epoch_time))
# If it is time to start the validation, we will do it.
# args.start_val can be used to avoid time-consuming validation
# at the beginning of training.
if epoch >= args.start_val:
    start = time.time()
    val_loss, probs, truth, _ = validate_epoch(net, val_loader,
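`LimitedRandomSampler` is not defined in this fragment. A possible sketch, assuming it simply caps the number of randomly drawn indices per epoch at `n_batches * batch_size`:

import torch
from torch.utils.data import Sampler

class LimitedRandomSampler(Sampler):
    # Hypothetical sketch: draws only n_batches * batch_size random indices
    # per epoch instead of iterating over the whole dataset.
    def __init__(self, data_source, n_batches, batch_size):
        self.data_source = data_source
        self.n_samples = n_batches * batch_size

    def __iter__(self):
        perm = torch.randperm(len(self.data_source))[:self.n_samples]
        return iter(perm.tolist())

    def __len__(self):
        return self.n_samples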
try:
    print("Training for %d epochs..." % NUM_EPOCHS)
    log_parameters = train_utils.log_init()
    for epoch in range(1, NUM_EPOCHS + 1):
        # perform training and validation
        train_loss, train_r_sq, train_accu, val_loss, val_r_sq, val_accu = train_utils.train_and_validate(
            perf_model, criterion, perf_optimizer, training_data, validation_data, METRIC
        )
        # adjust learning rate
        train_utils.adjust_learning_rate(perf_optimizer, epoch, ADJUST_EVERY)
        # log data for visualization later
        log_parameters = train_utils.log_epoch_stats(
            log_parameters, epoch, train_loss, train_r_sq, train_accu,
            val_loss, val_r_sq, val_accu
        )
        # print loss
        if epoch % PRINT_EVERY == 0:
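`train_utils.adjust_learning_rate(perf_optimizer, epoch, ADJUST_EVERY)` is only called here, not defined. A plausible sketch, assuming a fixed multiplicative decay applied once every `ADJUST_EVERY` epochs; the `decay` factor is an assumed default, not taken from the original code:

def adjust_learning_rate(optimizer, epoch, adjust_every, decay=0.5):
    # Hypothetical sketch: multiply the LR by `decay` every `adjust_every` epochs.
    if epoch % adjust_every == 0:
        for group in optimizer.param_groups:
            group['lr'] *= decay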
def train_or_eval(train, gpu, loader, model, criterion, optimizer, args, hyper, epoch):
    phase = "train" if train else "test"
    if train:
        model.train()
    else:
        model.eval()

    losses = AverageMeter("Loss", ":.4e")
    top1 = AverageMeter("Accuracy1", ":6.2f")
    top5 = AverageMeter("Accuracy5", ":6.2f")
    prefix = "Epoch:[{}]".format(epoch + 1) if train else "Test: "
    progress = ProgressMeter(len(loader), [losses, top1, top5], prefix=prefix)

    if args.prof:
        print("Profiling started")
        torch.cuda.cudart().cudaProfilerStart()

    t_init = time.time()
    prefetcher = data_prefetcher(loader)
    with torch.set_grad_enabled(mode=train):
        for i, (images, target) in enumerate(prefetcher):
            niter = epoch * len(loader) + i
            if args.prof:
                torch.cuda.nvtx.range_push("Prof start iteration {}".format(i))

            if args.prof:
                torch.cuda.nvtx.range_push("forward")
            output = model(images)
            if args.prof:
                torch.cuda.nvtx.range_pop()
            loss = criterion(output, target)

            if train:
                lr = adjust_learning_rate(optimizer, niter, hyper, len(loader))
                optimizer.zero_grad()

                if args.prof:
                    torch.cuda.nvtx.range_push("backward")
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                if args.prof:
                    torch.cuda.nvtx.range_pop()

                if args.prof:
                    torch.cuda.nvtx.range_push("optimizer step")
                optimizer.step()
                if args.prof:
                    torch.cuda.nvtx.range_pop()

            distributed = args.gpu is None
            publish_stats = (not distributed or gpu == 0) and i % 100 == 0
            if not train or publish_stats:
                acc1, acc5 = accuracy(output.detach(), target, topk=(1, 5))
                losses.update(loss.item(), images.size(0))
                top1.update(acc1[0], images.size(0))
                top5.update(acc5[0], images.size(0))
                if publish_stats:
                    progress.display(i)

            if train and publish_stats:
                args.writer.add_scalar("Loss/{}".format(phase), loss.item(), niter)
                args.writer.add_scalar("Accuracy/{}".format(phase), acc1, niter)
                args.writer.add_scalar("Loss/Accuracy", acc1, lr * 10000)

            if args.prof:
                torch.cuda.nvtx.range_pop()
            if args.prof and i == 20:
                break

    if args.prof:
        print("Profiling stopped")
        torch.cuda.cudart().cudaProfilerStop()

    print("Total {} epoch time: {}".format(phase, HTIME(time.time() - t_init)))
    return top1.avg
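`accuracy(output, target, topk=(1, 5))` is assumed to follow the standard top-k helper from the PyTorch ImageNet example; a sketch for reference:

import torch

def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the given values of k."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res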
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--DATASET_PATH', type=str, default='/home/zhangdong/database/DUTS/')
    parser.add_argument('--WEIGHTS_PATH', type=str, default='/home/yangle/DAVIS/result/models/')
    parser.add_argument('--EXPERIMENT', type=str, default='/home/yangle/DAVIS/result/TrainNet/')
    parser.add_argument('--N_EPOCHS', type=int, default=200)
    parser.add_argument('--MAX_PATIENCE', type=int, default=30)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--N_CLASSES', type=int, default=2)
    parser.add_argument('--LEARNING_RATE', type=float, default=1e-4)
    parser.add_argument('--LR_DECAY', type=float, default=0.995)
    parser.add_argument('--DECAY_LR_EVERY_N_EPOCHS', type=int, default=1)
    parser.add_argument('--WEIGHT_DECAY', type=float, default=0.0001)
    parser.add_argument('--CUDNN', type=bool, default=True)
    args = parser.parse_args()

    torch.cuda.manual_seed(args.seed)
    cudnn.benchmark = args.CUDNN

    normalize = transforms.Normalize(mean=saliency.mean, std=saliency.std)
    train_joint_transformer_img = transforms.Compose([joint_transforms.JointResize(224)])
    mask_size_list = [14, 28, 56, 112, 224]

    train_dset = saliency.Saliency(
        args.DATASET_PATH, 'train', train_joint_transformer_img, mask_size_list,
        transform=transforms.Compose([transforms.ToTensor(), normalize]))
    train_loader = torch.utils.data.DataLoader(
        train_dset, batch_size=args.batch_size, shuffle=True)

    test_joint_transforms_img = transforms.Compose([joint_transforms.JointResize(224)])
    val_dset = saliency.Saliency(
        args.DATASET_PATH, 'val', test_joint_transforms_img, mask_size_list,
        transform=transforms.Compose([transforms.ToTensor(), normalize]))
    val_loader = torch.utils.data.DataLoader(
        val_dset, batch_size=args.batch_size, shuffle=False)

    print("TrainImages: %d" % len(train_loader.dataset.imgs))
    print("ValImages: %d" % len(val_loader.dataset.imgs))

    # example_inputs, example_targets = next(iter(train_loader))
    # print("InputsBatchSize: ", example_inputs.size())
    # print("TargetsBatchSize: ", len(example_targets))
    # print("\nInput (size, max, min) ---")
    # # input
    # i = example_inputs[0]
    # print(i.size())
    # print(i.max())
    # print(i.min())
    # print("Target (size, max, min) ---")
    # # target
    # for mask in example_targets:
    #     print(mask.size())
    #     print(mask.max())
    #     print(mask.min())

    # initialize ResNet from the pre-trained classification model
    resnet = torchvision.models.resnet50(pretrained=True)
    pre_trained_dict = resnet.state_dict()
    model = SegNet.resnet50()
    model_dict = model.state_dict()
    # 1. filter out unnecessary keys
    pre_trained_dict = {k: v for k, v in pre_trained_dict.items() if k in model_dict}
    # 2. overwrite entries in the existing state dict
    model_dict.update(pre_trained_dict)
    # 3. load the new state dict
    model.load_state_dict(model_dict)

    model = model.cuda()
    # model = torch.nn.DataParallel(model).cuda()
    print(' + Number of params: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))
    # model.apply(utils.weights_init)
    optimizer = optim.RMSprop(model.parameters(), lr=args.LEARNING_RATE,
                              weight_decay=args.WEIGHT_DECAY, eps=1e-12)
    criterion = nn.NLLLoss2d().cuda()

    exp_dir = args.EXPERIMENT + 'test'
    if os.path.exists(exp_dir):
        shutil.rmtree(exp_dir)
    exp = experiment.Experiment('test', args.EXPERIMENT)
    exp.init()

    START_EPOCH = exp.epoch
    END_EPOCH = START_EPOCH + args.N_EPOCHS

    for epoch in range(START_EPOCH, END_EPOCH):
        since = time.time()

        ### Train ###
        trn_loss, trn_err = utils.train(model, train_loader, optimizer, criterion, epoch)
        print('Epoch {:d}: Train - Loss: {:.4f}\tErr: {:.4f}'.format(epoch, trn_loss, trn_err))
        time_elapsed = time.time() - since
        print('Train Time {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

        ### Test ###
        val_loss, val_err = utils.test(model, val_loader, criterion, epoch)
        print('Val - Loss: {:.4f}, Error: {:.4f}'.format(val_loss, val_err))
        time_elapsed = time.time() - since
        print('Total Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))

        ### Save Metrics ###
        exp.save_history('train', trn_loss, trn_err)
        exp.save_history('val', val_loss, val_err)

        ### Checkpoint ###
        exp.save_weights(model, trn_loss, val_loss, trn_err, val_err)
        exp.save_optimizer(optimizer, val_loss)

        ## Early Stopping ##
        if (epoch - exp.best_val_loss_epoch) > args.MAX_PATIENCE:
            print("Early stopping at epoch %d since no better loss found since epoch %d"
                  % (epoch, exp.best_val_loss_epoch))
            break

        # Adjust Lr ###--old method
        utils.adjust_learning_rate(args.LEARNING_RATE, args.LR_DECAY, optimizer,
                                   epoch, args.DECAY_LR_EVERY_N_EPOCHS)

        exp.epoch += 1
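The call `utils.adjust_learning_rate(args.LEARNING_RATE, args.LR_DECAY, optimizer, epoch, args.DECAY_LR_EVERY_N_EPOCHS)` is consistent with an exponential decay applied every N epochs. A sketch matching that call signature (an assumption about the helper, not its verified implementation):

def adjust_learning_rate(lr, decay, optimizer, cur_epoch, every_n_epochs):
    """Set the learning rate to `lr` decayed by `decay` every `every_n_epochs` epochs."""
    new_lr = lr * (decay ** (cur_epoch // every_n_epochs))
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr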
def main():
    global args

    ## create models and optimizers
    print("=> creating models...")
    classifier = archs.resnet50shared(pretrained=True).to(device)
    decoder = archs.decoder(final_upsample_mode=args.upsample).to(device)

    optimizer = {}
    optimizer['classifier'] = torch.optim.SGD(classifier.parameters(), args.lr,
                                              momentum=args.momentum,
                                              weight_decay=args.weight_decay)
    optimizer['decoder'] = torch.optim.Adam(decoder.parameters(), args.lr_casme,
                                            weight_decay=args.weight_decay)

    cudnn.benchmark = True

    ## data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(traindir, transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=False, sampler=None)
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=False)

    ## training loop
    for epoch in range(args.epochs):
        epoch_start_time = time.time()
        adjust_learning_rate(optimizer, epoch, args)

        ## train for one epoch
        tr_s = train_or_eval(train_loader, classifier, decoder, True, optimizer, epoch)

        ## evaluate on validation set
        val_s = train_or_eval(val_loader, classifier, decoder)

        ## save checkpoint
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict_classifier': classifier.state_dict(),
            'state_dict_decoder': decoder.state_dict(),
            'optimizer_classifier': optimizer['classifier'].state_dict(),
            'optimizer_decoder': optimizer['decoder'].state_dict(),
            'args': args,
        }, args)

        ## log
        with open(args.log_path, 'a') as f:
            f.write(str(epoch + 1) + ' ' + str(time.time() - epoch_start_time) + ' ' +
                    tr_s['acc'] + ' ' + val_s['acc'] + ' ' +
                    tr_s['acc_m'] + ' ' + val_s['acc_m'] + ' ' +
                    tr_s['avg_mask'] + ' ' + val_s['avg_mask'] + ' ' +
                    tr_s['std_mask'] + ' ' + val_s['std_mask'] + ' ' +
                    tr_s['entropy'] + ' ' + val_s['entropy'] + ' ' +
                    tr_s['tv'] + ' ' + val_s['tv'] + '\n')
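Here `adjust_learning_rate(optimizer, epoch, args)` receives a dict holding two optimizers. A hedged sketch, assuming a standard step decay (x0.1 every 30 epochs) applied uniformly to both entries; the real code may treat the classifier and decoder schedules differently:

def adjust_learning_rate(optimizer, epoch, args):
    # Hypothetical: apply the same step decay to every optimizer in the dict.
    lr = args.lr * (0.1 ** (epoch // 30))
    for opt in optimizer.values():
        for param_group in opt.param_groups:
            param_group['lr'] = lr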
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # Define training directory in case the number of classes is required by the model instance
    main_file = args.root / args.main_file
    num_classes = len(
        [cur_dir.name for cur_dir in main_file.iterdir()
         if len(list(cur_dir.iterdir())) >= args.min_allowed_imgs]
    )
    if num_classes != 1000:
        print('[INFO]: Using {} classes instead of 1000 ImageNet classes'.format(num_classes))

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.model))
        model = models.__dict__[args.model](num_classes=num_classes)

    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    if args.loss_func in ['cross', 'cross_entropy', 'entropy']:
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    elif args.loss_func in ['l2', 'l2_squared', 'squared', 'MSE']:
        print('[INFO] Using MSE loss function instead of Cross Entropy.')
        args.loss_func = 'l2'
        criterion = nn.MSELoss().cuda(args.gpu)

    if args.opt.lower() == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.opt.lower() == 'adam':
        print('[INFO] Using Adam optimizer instead of SGD.')
        optimizer = torch.optim.Adam(model.parameters(), args.lr,
                                     weight_decay=args.weight_decay)
    elif args.opt.lower() == 'lbfgs':
        print('[INFO] Using LBFGS optimizer instead of SGD.')
        optimizer = torch.optim.LBFGS(model.parameters(), args.lr, history_size=20)
    else:
        raise ValueError('Incorrect optimizer selection {}'.format(args.opt))

    if args.initial_lr:
        param_setup = [{'params': cur_lay.parameters()}
                       for i, cur_lay in enumerate(model) if 'weight' in dir(cur_lay)]
        optimizer = torch.optim.SGD(param_setup, args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

    if args.schedule_lr:
        scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, args.lr / 100, args.lr)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    test_file = args.root / args.test_file
    if args.sub_file:
        sub_file = args.root / args.sub_file
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_trans_list = []
    if not args.norandomcrop:
        train_trans_list.append(transforms.RandomResizedCrop(224))
    if not args.norandomflip:
        train_trans_list.append(transforms.RandomHorizontalFlip())
    train_trans_list = train_trans_list + [transforms.ToTensor(), normalize]

    train_dataset = datasets.ImageFolder(main_file, transforms.Compose(train_trans_list))
    test_dataset = datasets.ImageFolder(test_file, transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ]), train=False)
    if args.sub_file:
        sub_dataset = datasets.ImageFolder(sub_file, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]), train=False)

    if args.train_size or args.select_class_list:
        if not args.select_class_list:
            args.select_class_list = list(range(args.num_classes))
        sel_idx = []
        for lbl in args.select_class_list:
            lbl_idx = [i for i, t in enumerate(train_dataset.targets) if t == lbl]
            sel_idx += random.sample(lbl_idx,
                                     (args.train_size if args.train_size else len(lbl_idx)))
        train_dataset.samples = train_dataset.samples[sel_idx]
        train_dataset.targets = train_dataset.targets[sel_idx]
        for cur_idx, cur_cls in enumerate(args.select_class_list):
            train_dataset.targets[train_dataset.targets == cur_cls] = cur_idx

        sel_idx = []
        for lbl in args.select_class_list:
            lbl_idx = [i for i, t in enumerate(test_dataset.targets) if t == lbl]
            sel_idx += lbl_idx
        test_dataset.samples = test_dataset.samples[sel_idx]
        test_dataset.targets = test_dataset.targets[sel_idx]
        for cur_idx, cur_cls in enumerate(args.select_class_list):
            test_dataset.targets[test_dataset.targets == cur_cls] = cur_idx

    # Inject symmetric noise into the training set
    if args.inject_noise:
        im_per_class = int(len(train_dataset) / args.num_classes)
        noisy_labels = np.zeros((len(train_dataset),), dtype=int)
        num_shuffle = int(im_per_class * args.inject_noise)
        for i in range(args.num_classes):
            noisy_idx = []
            cur_idx = [idx for idx, label in enumerate(train_dataset.targets) if label == i]
            shuffled_idx = random.sample(cur_idx, len(cur_idx))
            for r in range(args.num_classes):
                noisy_idx += [r for idx in
                              shuffled_idx[im_per_class - (r + 1) * num_shuffle:
                                           im_per_class - r * num_shuffle]]
            noisy_idx += [i for idx in shuffled_idx[:im_per_class - args.num_classes * num_shuffle]]
            noisy_labels[cur_idx] = np.array(noisy_idx)
        train_dataset.targets = noisy_labels

    # TODO: Replace a fraction of one training set randomly with another.
    train_sampler = None

    if args.mix_cifar:
        assert args.mix_rate, "mix_rate should be given when mix_cifar is set"
        assert args.traindir2, "traindir2 must be given when mix_cifar is set"
        assert not args.inject_noise, "inject_noise should not be given when mix_cifar is set"
        assert not args.testdir2, "only one testdir can be set when mix_cifar is set"

        traindir2 = os.path.join(args.root, args.traindir2)
        clean_dataset = datasets.ImageFolder(traindir2, transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

        im_per_class = int(len(train_dataset) / len(train_dataset.classes))
        num_shuffle = int(im_per_class * args.mix_rate)
        shuffled_samples = []
        clean_samples = []
        for i in range(len(train_dataset.classes)):
            cur_imgs = [s[0] for s in train_dataset.samples if s[1] == i]
            cur_imgs = random.sample(cur_imgs, im_per_class - num_shuffle)
            mix_imgs = [s[0] for s in clean_dataset.samples if s[1] == i]
            mix_imgs = random.sample(mix_imgs, num_shuffle)
            clean_samples += [(img, i) for img in mix_imgs]
            shuffled_samples += [(img, i) for img in cur_imgs + mix_imgs]
        train_dataset.samples = shuffled_samples
        clean_dataset.samples = clean_samples
        val_loader2 = torch.utils.data.DataLoader(
            clean_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
            num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)
    if args.sub_file:
        val_loader2 = torch.utils.data.DataLoader(
            sub_dataset, batch_size=args.batch_size, shuffle=False,
            num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    if args.compute_jacobian:
        gvec = (torch.randn((1, args.num_classes)) / len(train_dataset)).cuda(args.gpu, non_blocking=True)

    # TODO: tracking weights of the model
    if args.track_weights:
        layer_idx = [i for i, cl in enumerate(model) if 'weight' in dir(cl)]
        cur_weights = get_weights(model, layer_idx)
        if args.track_weights == 'filters':
            filter_w_file = args.outpath / 'filter_weights.pickle'
            filter_w_dict = {('layer_' + str(l)): [] for i, l in enumerate(layer_idx)
                             if cur_weights[i].ndim > 2}
        if args.track_weights == 'norm':
            w_norm_dict = {('layer_' + str(l)): 0 for i, l in enumerate(layer_idx)
                           if cur_weights[i].ndim > 1}

    # TODO: scaling the weights of the model manually
    if args.scale_weights:
        scale_dict = {}
        for cur_l, cur_w in enumerate(cur_weights):
            if not (cur_w.ndim > 2):
                continue
            scale_dict['layer_' + str(layer_idx[cur_l])] = np.linalg.norm(cur_w.flatten()).item()
        rescale_weights(model, scale_dict)

    save_config(args)
    train_log = []
    log_file = args.outpath / 'log.json'

    for epoch in range(args.start_epoch, args.epochs):
        if (epoch < args.max_lr_adjusting_epoch) and (not args.schedule_lr):
            adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        epoch_log = {'epoch': epoch}

        # update learning rate with scheduler
        if args.schedule_lr:
            scheduler.step()

        # evaluate on the training and validation sets
        dum_acc1, dum_acc5 = validate(train_loader, model, criterion, args)
        epoch_log.update({'train': {'acc1': dum_acc1.cpu().numpy().item(),
                                    'acc5': dum_acc5.cpu().numpy().item()}})
        acc1, acc5 = validate(val_loader, model, criterion, args)
        epoch_log.update({'test': {'acc1': acc1.cpu().numpy().item(),
                                   'acc5': acc5.cpu().numpy().item()}})
        if args.sub_file or args.mix_cifar:
            dum_acc1, dum_acc5 = validate(val_loader2, model, criterion, args)
            epoch_log.update({'subset': {'acc1': dum_acc1.cpu().numpy().item(),
                                         'acc5': dum_acc5.cpu().numpy().item()}})

        # compute the jacobian of the network
        if args.compute_jacobian:
            jTg = get_jacobian_prod(train_loader, model, criterion, gvec, args)
            epoch_log.update({'J_norm': {str(k): v.item() for k, v in enumerate(jTg)}})

        # TODO: tracking the weights of the layers
        if args.track_weights:
            w_change_dict = {('layer_' + str(l)): 0 for l in layer_idx}
            new_weights = get_weights(model, layer_idx)
            if args.track_weights == 'norm':
                for cur_l, cur_w in enumerate(new_weights):
                    if not (cur_w.ndim > 1):
                        continue
                    w_norm_dict['layer_' + str(layer_idx[cur_l])] = np.linalg.norm(cur_w.flatten()).item()
                epoch_log.update({'w_norm': {k: v for k, v in w_norm_dict.items()}})
            else:
                for cur_l in range(len(layer_idx)):
                    cur_change = new_weights[cur_l] - cur_weights[cur_l]
                    if args.track_weights == 'filters':
                        if cur_change.ndim > 2:
                            cur_change = np.mean(cur_change, axis=(2, 3))
                        filter_w_dict['layer_' + str(layer_idx[cur_l])].append(np.absolute(cur_change))
                    chng = np.absolute(np.mean(cur_change))
                    w_change_dict['layer_' + str(layer_idx[cur_l])] = chng.item()
                epoch_log.update({'weight_change': {k: v for k, v in w_change_dict.items()}})
                if args.track_weights == 'filters':
                    with open(filter_w_file, 'wb') as fn:
                        pickle.dump({k: np.stack(v) for k, v in filter_w_dict.items()}, fn)
            cur_weights = [wh for wh in new_weights]
            new_weight = None

        train_log.append(epoch_log)
        with open(log_file, 'w') as fn:
            json.dump(train_log, fn, indent=2)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_acc1': best_acc1,
            'optimizer': optimizer.state_dict(),
        }, is_best)
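`save_checkpoint(state, is_best)` is assumed to mirror the helper from the standard PyTorch ImageNet example; the filenames below are assumed defaults, not taken from this code:

import shutil
import torch

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    # Save the latest state, and copy it aside when it is the best so far.
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')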
def train(args):
    args.print_freq = 100
    args.gpu = None
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # load data
    train_dl, valid_dl, test_dl = load_imagenet(args)

    # define model
    model = torchvision.models.resnet50(pretrained=False)
    # multiple gpus
    model = torch.nn.DataParallel(model).cuda()

    loss_fn = torch.nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=0.9, weight_decay=args.weight_decay)

    model_dir = gen_model_dir(args)
    model_dir.mkdir(parents=True, exist_ok=True)

    torch.backends.cudnn.benchmark = True

    best_acc1 = 0
    for epoch in range(args.n_epochs):
        adjust_learning_rate(optimizer, epoch, args)

        batch_time = AverageMeter('Time', ':6.3f')
        data_time = AverageMeter('Data', ':6.3f')
        losses = AverageMeter('Loss', ':.4e')
        top1 = AverageMeter('Acc@1', ':6.2f')
        top5 = AverageMeter('Acc@5', ':6.2f')
        progress = ProgressMeter(len(train_dl),
                                 [batch_time, data_time, losses, top1, top5],
                                 prefix="Epoch: [{}]".format(epoch))

        # switch to train mode
        model.train()

        end = time.time()
        for batch_idx, (images, target) in enumerate(train_dl):
            # measure data loading time
            data_time.update(time.time() - end)

            # if args.gpu is not None:
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(images)
            loss = loss_fn(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % args.print_freq == 0:
                progress.display(batch_idx)

        # evaluate on validation set
        acc1 = validate(valid_dl, model, loss_fn, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        torch.save(
            {
                'epoch': epoch + 1,
                'model_weight': model.state_dict(),
                'heldout_best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            },
            Path(model_dir, 'model'))
        if is_best:
            shutil.copyfile(Path(model_dir, 'model'), Path(model_dir, 'model_best'))

    # load best model
    with open(Path(model_dir, "model_best"), 'rb') as f:
        params = torch.load(f)
    model.load_state_dict(params['model_weight'])

    # test
    model.eval()
    # evaluate on test set
    acc_test = validate(test_dl, model, loss_fn, args)
    print('epoch: {}, val acc: {:.4f}, test acc: {:.4f}'.format(
        params["epoch"], params["heldout_best_acc1"], acc_test))

    with open(Path(model_dir, "res.json"), 'w') as fp:
        json.dump(
            {
                'epoch': params["epoch"],
                'heldout_best_acc1': params["heldout_best_acc1"].item(),
                'test_best_acc1': acc_test.item(),
            }, fp)
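`AverageMeter` and `ProgressMeter` are assumed to follow the helpers from the standard PyTorch ImageNet example; a minimal `AverageMeter` sketch for reference:

class AverageMeter(object):
    """Tracks the current value, running sum, count, and average of a metric."""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)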
def train(self, trainloader, epoch):
    # criterion
    # print('\nEpoch: %d/%d Epoch in filtering step: %d/%d[wait: %d]'
    #       % (epoch+1, self.max_total_epochs, epoch_in_filtering+1,
    #          self.max_epochs_per_filtering, self.wait))
    # print('Filtering step: %d Seed: %d' % (iter_filtering, self.seed))
    class_criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=-1)
    if self.consistency_type == 'mse':
        consistency_criterion = losses.softmax_mse_loss
    elif self.consistency_type == 'kl':
        consistency_criterion = losses.softmax_kl_loss
    else:
        assert False, self.consistency_type
    residual_logit_criterion = losses.symmetric_mse_loss

    self.model.train()
    self.ema_model.train()

    running_class_loss = 0
    running_consistency_loss = 0
    running_res_loss = 0
    running_loss = 0
    correct = 0
    total = 0

    for batch_idx, ((inputs, ema_inputs), targets) in enumerate(trainloader):
        adjust_learning_rate(self.optimizer, epoch, batch_idx, len(trainloader))
        inputs, ema_inputs, targets = inputs.cuda(), ema_inputs.cuda(), targets.cuda()

        outputs = self.model(inputs)
        ema_outputs = self.ema_model(ema_inputs)

        minibatch_size = len(targets)
        labeled_minibatch_size = torch.sum(targets != -1).item()

        logit1, logit2 = outputs
        ema_logit, _ = ema_outputs
        if self.logit_distance_cost >= 0:
            class_logit, cons_logit = logit1, logit2
            res_loss = self.logit_distance_cost * residual_logit_criterion(
                class_logit, cons_logit) / minibatch_size
        else:
            class_logit, cons_logit = logit1, logit1
            res_loss = 0

        class_loss = class_criterion(class_logit, targets) / minibatch_size
        consistency_weight = get_current_consistency_weight(epoch)
        consistency_loss = consistency_weight * consistency_criterion(
            cons_logit, ema_logit) / minibatch_size

        _, predicted = torch.max(class_logit, 1)
        total += labeled_minibatch_size
        correct += predicted.eq(targets).cpu().sum().item()

        loss = class_loss + consistency_loss + res_loss

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.global_step += 1
        update_ema_variables(self.model, self.ema_model, self.ema_decay, self.global_step)

        running_res_loss += res_loss.item()
        running_class_loss += class_loss.item()
        running_consistency_loss += consistency_loss.item()
        running_loss += loss.item()

        progress_bar(
            batch_idx, len(trainloader),
            'Loss: %.3f | ClassLoss = %.3f | ConsLoss: %.3f | ResLoss: %.3f | Acc: %.3f%% (%d/%d) | lr: %.6f'
            % (running_loss / (batch_idx + 1),
               running_class_loss / (batch_idx + 1),
               running_consistency_loss / (batch_idx + 1),
               running_res_loss / (batch_idx + 1),
               100. * correct / total, correct, total,
               self.optimizer.param_groups[-1]['lr']))

    loss = {
        'loss': running_loss / (batch_idx + 1),
        'class_loss': running_class_loss / (batch_idx + 1),
        'consistency_loss': running_consistency_loss / (batch_idx + 1),
        'res_loss': running_res_loss / (batch_idx + 1)
    }
    acc = 100. * correct / total
    return loss['loss'], acc
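In this Mean Teacher style trainer, `adjust_learning_rate(self.optimizer, epoch, batch_idx, len(trainloader))` is called per batch. A hypothetical sketch of a per-batch schedule with linear warmup; `base_lr` and `rampup_epochs` are assumed hyperparameters, not taken from the original code:

def adjust_learning_rate(optimizer, epoch, batch_idx, n_batches_per_epoch,
                         base_lr=0.1, rampup_epochs=5):
    # Hypothetical: linear warmup over the first `rampup_epochs` epochs,
    # measured in fractional epochs, then a constant learning rate.
    t = epoch + batch_idx / n_batches_per_epoch
    lr = base_lr * min(1.0, t / rampup_epochs)
    for group in optimizer.param_groups:
        group['lr'] = lr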
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--DATASET_PATH', type=str, default='/disk5/yangle/PAMI/dataset/fc-resnet/')
    parser.add_argument('--EXPERIMENT', type=str, default='/disk5/yangle/PAMI/result/LearnModel/')
    # parser.add_argument('--DATASET_PATH', type=str, default='/disk1/hpl/segmentation/dataset/')
    # parser.add_argument('--EXPERIMENT', type=str, default='/disk1/hpl/segmentation/model/model_baselinexin/')
    parser.add_argument('--N_EPOCHS', type=int, default=200)
    parser.add_argument('--MAX_PATIENCE', type=int, default=30)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--N_CLASSES', type=int, default=2)
    parser.add_argument('--LEARNING_RATE', type=float, default=1e-4)
    parser.add_argument('--LR_DECAY', type=float, default=0.995)
    parser.add_argument('--DECAY_LR_EVERY_N_EPOCHS', type=int, default=1)
    parser.add_argument('--WEIGHT_DECAY', type=float, default=0.0001)
    parser.add_argument('--CUDNN', type=bool, default=True)
    args = parser.parse_args()

    torch.cuda.manual_seed(args.seed)
    cudnn.benchmark = args.CUDNN

    normalize = transforms.Normalize(mean=dataset.mean, std=dataset.std)
    train_joint_transformer = transforms.Compose([
        joint_transforms.JointResize(256),
        joint_transforms.JointRandomCrop(224),
        joint_transforms.JointRandomHorizontalFlip(),
    ])
    mask_size_list = [28, 28, 28, 56, 112]

    train_dset = dataset.Saliency(
        args.DATASET_PATH, 'TRain', train_joint_transformer, mask_size_list,
        transform=transforms.Compose([
            joint_transforms.RandomErasing_random(probability=0.5, sh=0.4, r1=0.3),
            transforms.ToTensor(),
            normalize,
        ]))
    train_loader = torch.utils.data.DataLoader(
        train_dset, batch_size=args.batch_size, shuffle=True, num_workers=args.batch_size)

    test_joint_transforms_img = transforms.Compose([joint_transforms.JointResize(224)])
    val_dset = dataset.TestData(
        args.DATASET_PATH, 'VAl', test_joint_transforms_img,
        transform=transforms.Compose([transforms.ToTensor(), normalize]),
        target_transform=transforms.Compose([transforms.ToTensor()]))
    val_loader = torch.utils.data.DataLoader(
        val_dset, batch_size=args.batch_size, shuffle=False)

    print("TrainImages: %d" % len(train_loader.dataset.imgs))
    print("ValImages: %d" % len(val_loader.dataset.imgs))

    example_inputs, example_targets = next(iter(train_loader))
    print("InputsBatchSize: ", example_inputs.size())
    print("TargetsBatchSize: ", len(example_targets))
    print("\nInput (size, max, min) ---")
    # input
    i = example_inputs[0]
    print(i.size())
    print(i.max())
    print(i.min())
    print("Target (size, max, min) ---")
    # target
    for mask in example_targets:
        print(mask.size())
        print(mask.max())
        print(mask.min())

    resnet34 = torchvision.models.resnet34(pretrained=True)
    dict_resnet34 = resnet34.state_dict()
    model = SegNet.resnet34()
    # initialize
    model.apply(utils.weights_init)
    SegNet_dict = model.state_dict()
    pretrained_dict = {k: v for k, v in dict_resnet34.items() if k in SegNet_dict}
    # for k in pretrained_dict:
    #     print(k)
    SegNet_dict.update(pretrained_dict)
    model.load_state_dict(SegNet_dict)

    # separate layers, to set different lr
    param_exist = []
    param_add = []
    for k, (name, module) in enumerate(model.named_children()):
        # existing layers including: conv1 bn1 relu maxpool
        # layer1 layer2 layer3 layer4
        if k < 8:
            for param in module.parameters():
                param_exist.append(param)
        # adding layers including: bottleneck skip3 skip2 skip1 skip0
        # conv_end_1 bn_end_1 salmap Sigmoid mask0 mask4 mask3 mask2 mask1
        else:
            for param in module.parameters():
                param_add.append(param)

    model = model.cuda()
    # model = torch.nn.DataParallel(model).cuda()
    print(' + Number of params: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    optimizer = optim.RMSprop([{'params': param_exist, 'lr': args.LEARNING_RATE * 0.1},
                               {'params': param_add}],
                              lr=args.LEARNING_RATE,
                              weight_decay=args.WEIGHT_DECAY, eps=1e-12)
    criterion = nn.NLLLoss().cuda()

    exp_dir = args.EXPERIMENT + 'test'
    if os.path.exists(exp_dir):
        shutil.rmtree(exp_dir)
    exp = experiment.Experiment('test', args.EXPERIMENT)
    exp.init()

    START_EPOCH = exp.epoch
    END_EPOCH = START_EPOCH + args.N_EPOCHS

    for epoch in range(START_EPOCH, END_EPOCH):
        since = time.time()

        ### Train ###
        trn_loss, trn_err = utils.train(model, train_loader, optimizer, criterion, epoch)
        print('Epoch {:d}: Train - Loss: {:.4f}\tErr: {:.4f}'.format(epoch, trn_loss, trn_err))
        time_elapsed = time.time() - since
        print('Train Time {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

        ### Test ###
        val_loss, val_err = utils.test_score(model, val_loader)
        print('Val - Loss: {:.4f}, Error: {:.4f}'.format(val_loss, val_err))
        time_elapsed = time.time() - since
        print('Total Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))

        ### Save Metrics ###
        exp.save_history('train', trn_loss, trn_err)
        exp.save_history('val', val_loss, val_err)

        ### Checkpoint ###
        exp.save_weights(model, trn_loss, val_loss, trn_err, val_err)
        exp.save_optimizer(optimizer, val_loss)

        ## Early Stopping ##
        if (epoch - exp.best_val_loss_epoch) > args.MAX_PATIENCE:
            print("Early stopping at epoch %d since no better loss found since epoch %d"
                  % (epoch, exp.best_val_loss_epoch))
            break

        # Adjust Lr ###--old method
        utils.adjust_learning_rate(args.LEARNING_RATE, args.LR_DECAY, optimizer,
                                   epoch, args.DECAY_LR_EVERY_N_EPOCHS)

        exp.epoch += 1
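The optimizer above keeps the pre-trained backbone parameters at one tenth of the base learning rate, while a plain per-optimizer decay would overwrite both parameter groups with the same value. A hypothetical variant that preserves that ratio when decaying; `adjust_learning_rate_grouped` and `exist_scale` are illustrative names, with `exist_scale` mirroring the 0.1 factor used when building the optimizer:

def adjust_learning_rate_grouped(lr, decay, optimizer, cur_epoch, every_n_epochs, exist_scale=0.1):
    # Hypothetical sketch: exponential decay that keeps the pre-trained
    # ('exist') group at a fraction of the LR used for the newly added layers.
    new_lr = lr * (decay ** (cur_epoch // every_n_epochs))
    optimizer.param_groups[0]['lr'] = new_lr * exist_scale  # param_exist group
    optimizer.param_groups[1]['lr'] = new_lr                # param_add group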