def __init__(self, model: Module, train_loader: DataLoader, test_loader: DataLoader, device=DEFAULT_DEVICE, lr=DEFAULT_LR, momentum=DEFAULT_MOMENTUM, epochs=DEFAULT_EPOCHS, batch_size=DEFAULT_BATCH_SIZE, parallelism=DEFAULT_PARALLELISM, milestones=MILESTONES, gamma=0.2, warm_phases=WARM_PHASES, criterion=loss.CrossEntropyLoss()):
    """Set up model placement, SGD optimizer, LR schedulers and hyper-parameters."""
    print("initialize trainer")
    # keep both loaders around for the training loop / evaluation
    self.train_loader = train_loader
    self.test_loader = test_loader

    # wrap in DataParallel only when several GPUs are present AND requested
    if torch.cuda.device_count() > 1 and parallelism:
        print(f"using {torch.cuda.device_count()} GPUs")
        self.model = nn.DataParallel(model)
    else:
        self.model = model
    self.model.to(device)

    # optimize only the parameters that are trainable (requires_grad=True)
    sgd = optim.SGD(
        filter(lambda p: p.requires_grad, self.model.parameters()),
        lr=lr,
        momentum=momentum,
        weight_decay=5e-4,
    )
    # stepwise LR decay at the given milestone epochs
    step_scheduler = optim.lr_scheduler.MultiStepLR(sgd, milestones=milestones, gamma=gamma)

    # linear warm-up over the first `warm_phases` epochs (one step per batch)
    self.warm_phases = warm_phases
    self.warmup_scheduler = WarmUpLR(sgd, len(train_loader) * self.warm_phases)

    # bundle everything the training loop needs into one object
    self.hp = HyperParameter(scheduler=step_scheduler, optimizer=sgd, criterion=criterion, batch_size=batch_size, epochs=epochs, device=device)
    print("initialize finished")
    print(f"hyper parameter: {self.hp}")
# Build the CIFAR-100 test loader, loss, optimizer and LR schedulers, then
# resolve the checkpoint directory (resume from the most recent run or start new).
# NOTE(review): shuffle=True on a *test* loader is unusual — evaluation order
# should not matter, but confirm this is intentional.
cifar100_test_loader = get_test_dataloader(settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, num_workers=4, batch_size=args.b, shuffle=True)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
# stepwise learning rate decay at the milestone epochs
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=settings.MILESTONES, gamma=0.2)
# warm-up runs for args.warm epochs, one scheduler step per batch
iter_per_epoch = len(cifar100_training_loader)
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)

if args.resume:
    # resume into the most recent checkpoint folder for this network
    recent_folder = most_recent_folder(os.path.join(settings.CHECKPOINT_PATH, args.net), fmt=settings.DATE_FORMAT)
    if not recent_folder:
        raise Exception('no recent folder were found')
    checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder)
else:
    # fresh run: timestamped checkpoint folder
    checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, settings.TIME_NOW)
# Training driver: create a timestamped checkpoint dir, set up loss/optimizer/
# schedulers, then train and keep the best-accuracy weights.
checkpoints_path = os.path.join(conf.CHECKPOINTS_PATH, args.model, datetime.now().isoformat())
if not os.path.exists(checkpoints_path):
    os.makedirs(checkpoints_path)
# template filled in per-save with model name, epoch and checkpoint type
checkpoints_path = os.path.join(checkpoints_path, '{model}-{epoch}-{type}.pth')

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=conf.LEARNING_RATE, momentum=conf.MOMENTUM, weight_decay=conf.WEIGHT_DECAY)
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=conf.MILESTONES, gamma=conf.GAMMA)
iter_per_epoch = len(train_loader)
# warm-up scheduler is stepped per batch inside train()
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * conf.WARM_EPOCH)

best_acc = 0.0
# NOTE(review): range(1, conf.EPOCH) runs EPOCH-1 epochs; sibling fragments use
# EPOCH + 1 — confirm whether the last epoch is intentionally skipped.
for epoch in range(1, conf.EPOCH):
    if epoch > conf.WARM_EPOCH:
        # after warm-up, step the milestone scheduler once per epoch
        train_scheduler.step(epoch)
    train(model, epoch, train_loader, loss_function, optimizer, warmup_scheduler, args.gpu)
    acc = eval(model, epoch, val_loader, loss_function, args.gpu)
    if best_acc < acc:
        # save whenever validation accuracy improves
        # NOTE(review): statement truncated at this chunk boundary — the
        # remaining format arguments continue in the original file.
        torch.save(
            model.state_dict(),
            checkpoints_path.format(model=args.model, epoch=epoch,
# NOTE(review): fragment starts mid-call — `shuffle=True)` closes a training
# dataloader call whose beginning lies outside this chunk.
shuffle=True)
# NOTE(review): shuffle=True on a test loader is unusual; confirm intentional.
cell_train_test_loader = get_test_dataloader(path=trainpath, mean=cell_train_mean, std=cell_train_std, num_workers=4, batch_size=args.b, shuffle=True)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=settings.MILESTONES, gamma=0.2)
iter_per_epoch = len(cell_training_loader)
# NOTE(review): warm-up length is hard-coded to 10000 iterations per warm epoch
# instead of iter_per_epoch (computed just above) — looks like a leftover;
# confirm against the actual dataset size.
warmup_scheduler = WarmUpLR(optimizer, 10000 * args.warm)

checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, '3_4', settings.TIME_NOW)
# create checkpoint folder to save model
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)
# template filled in per-save with net/epoch/type/accuracy
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}-{accuracy}.pth')

# best_acc starts at 0.7 so only models above that threshold are saved as best
best_acc = 0.7
for epoch in range(1, settings.EPOCH + 1):
    if epoch > args.warm:  # =1
        train_scheduler.step(epoch)
    train(epoch)
    acc_train = eval_training(epoch)
def main():
    """Train a CIFAR-10/100 model with warm-up + milestone LR decay.

    Reads configuration from the module-level ``args``; logs per-epoch metrics
    with ``Logger`` and checkpoints every epoch (best tracked via ``best_acc``).
    """
    global best_acc
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    print('==> Preparing dataset %s' % args.dataset)
    if args.dataset == 'cifar100':
        training_loader = get_training_dataloader(settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, num_workers=4, batch_size=args.train_batch, shuffle=True)
        test_loader = get_test_dataloader(settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, num_workers=4, batch_size=args.test_batch, shuffle=False)
        num_classes = 100
    else:
        # any other value falls through to CIFAR-10
        training_loader = get_training_dataloader_10(settings.CIFAR10_TRAIN_MEAN, settings.CIFAR10_TRAIN_STD, num_workers=4, batch_size=args.train_batch, shuffle=True)
        test_loader = get_test_dataloader_10(settings.CIFAR10_TRAIN_MEAN, settings.CIFAR10_TRAIN_STD, num_workers=4, batch_size=args.test_batch, shuffle=False)
        num_classes = 10

    # data preprocessing:
    print("==> creating model '{}'".format(args.arch))
    model = get_network(args, num_classes=num_classes)
    model = torch.nn.DataParallel(model).cuda()
    cudnn.benchmark = True
    print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))

    # two losses: AM-Softmax (margin-based) plus plain cross-entropy
    criterion1 = am_softmax.AMSoftmax()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)

    title = 'cifar-10-' + args.arch
    if args.resume:
        # Load checkpoint and continue logging into the existing log file.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!'
        args.checkpoint = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names([
            'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.',
            'Valid Acc.'
        ])

    # learning rate decay at the scheduled epochs
    train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.schedule, gamma=0.2)
    iter_per_epoch = len(training_loader)
    # per-batch warm-up for the first args.warm epochs (stepped inside train())
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)

    for epoch in range(start_epoch, args.epochs):
        if epoch > args.warm:
            train_scheduler.step(epoch)

        train_loss, train_acc = train(training_loader, model, warmup_scheduler, criterion, criterion1, optimizer, epoch, use_cuda)
        test_loss, test_acc = eval_training(test_loader, model, criterion, epoch, use_cuda)

        logger.append([
            optimizer.param_groups[0]['lr'], train_loss, test_loss, train_acc,
            test_acc
        ])

        # save model (checkpoint every epoch; is_best flags the record holder)
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'acc': test_acc,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint=args.checkpoint)

    logger.close()
    # logger.plot()
    # savefig(os.path.join(args.checkpoint, 'log.eps'))

    print('Best acc:')
    print(best_acc)
# NOTE(review): fragment starts mid-call — these arguments close a dataloader
# call whose beginning lies outside this chunk.
settings.CIFAR100_TRAIN_STD, num_workers=args.w, batch_size=args.b, shuffle=args.s)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
# learning rate decay at the milestone epochs
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=settings.MILESTONES, gamma=0.2)
iter_per_epoch = len(cifar100_train_loader)
# warm-up covers args.warm full epochs, one step per batch
total_iters = iter_per_epoch * args.warm
warmup_scheduler = WarmUpLR(optimizer, total_iters)

checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, settings.TIME_NOW)
# create checkpoint folder to save model
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')

best_acc = 0.0
# NOTE(review): range(1, settings.EPOCH) runs EPOCH-1 epochs; sibling fragments
# use EPOCH + 1 — confirm whether the last epoch is intentionally skipped.
for epoch in range(1, settings.EPOCH):
    if epoch > args.warm:
        train_scheduler.step(epoch)
    train(epoch)
    # NOTE(review): `eval` here shadows the builtin — presumably a project-level
    # evaluation helper; verify against its definition.
    acc = eval(epoch)
def main():
    """End-to-end driver for TemporalModel video-classification training.

    Parses CLI args, builds the model/data pipelines, optionally prunes,
    resumes or fine-tunes from a checkpoint, then either evaluates once
    (``--evaluate``) or runs the full training loop with periodic validation
    and checkpointing.

    Fixes vs. original: removed the unreachable ``return`` that followed
    ``exit()`` in the evaluate branch, and close ``log_training`` after the
    training loop (the file handle was previously leaked).
    """
    global args, best_prec1
    args = parser.parse_args()
    print(args)
    check_rootfolders()

    categories, train_list, val_list, root_path, prefix = datasets_video.return_dataset(args.dataset, args.root_path)
    num_class = len(categories)

    global store_name
    store_name = '_'.join([
        args.type, args.dataset, args.arch,
        'segment%d' % args.num_segments, args.store_name
    ])
    print(('storing name: ' + store_name))

    if args.dataset == 'somethingv1' or args.dataset == 'somethingv2':
        # label transformation for left/right categories
        # please refer to labels.json file in sometingv2 for detail.
        target_transforms = {
            86: 87,
            87: 86,
            93: 94,
            94: 93,
            166: 167,
            167: 166
        }
    else:
        target_transforms = None

    if args.conv_config in conv_configs:
        conv_config = conv_configs[args.conv_config]
        conv_index = None  # conv_indexs[args.conv_config]
    else:
        conv_config = None

    model = TemporalModel(num_class,
                          args.num_segments,
                          model=args.type,
                          backbone=args.arch,
                          alpha=args.alpha,
                          beta=args.beta,
                          dropout=args.dropout,
                          target_transforms=target_transforms,
                          search=args.search,
                          op_code=args.op_code,
                          conv_config=conv_config)

    # capture model-specific preprocessing parameters before DataParallel wrap
    crop_size = model.crop_size
    scale_size = model.scale_size
    input_mean = model.input_mean
    input_std = model.input_std
    policies = get_optim_policies(model)
    train_augmentation = model.get_augmentation()

    if torch.cuda.is_available():
        model = torch.nn.DataParallel(model).cuda()

    if args.prune:
        # pruning mode: write the new conv config and stop — no training
        prune(model, args.prune_model_path, './conv_config.txt')
        # prune_select(model, args.prune_model_path, './conv_config.txt')
        exit(0)

    if args.resume:
        if os.path.isfile(args.resume):
            print(("=> loading checkpoint '{}'".format(args.resume)))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            # model.module.load_state_dict(checkpoint['state_dict'])
            model.load_state_dict(checkpoint['state_dict'])
            # print(("=> loaded checkpoint '{}' (epoch {})"
            #        .format(args.evaluate, checkpoint['epoch'])))
        else:
            print(("=> no checkpoint found at '{}'".format(args.resume)))

    if args.finetune:
        if os.path.isfile(args.finetune):
            print(("=> loading checkpoint '{}'".format(args.finetune)))
            checkpoint = torch.load(args.finetune)
            from I3D import load_state_dict_supernet
            model = load_state_dict_supernet(model, checkpoint['state_dict'], conv_index)
            #args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            #model.module.load_state_dict(checkpoint['state_dict'])
            print(("=> loaded checkpoint '{}' (epoch {})".format(
                args.evaluate, checkpoint['epoch'])))
        else:
            print(("=> no checkpoint found at '{}'".format(args.finetune)))
            exit(0)

    cudnn.benchmark = True

    # Data loading code
    normalize = GroupNormalize(input_mean, input_std)
    # BNInception/InceptionV3 backbones expect BGR uint8-style input, hence
    # the roll/div switches below.
    train_loader = torch.utils.data.DataLoader(VideoDataSet(
        root_path,
        train_list,
        num_segments=args.num_segments,
        image_tmpl=prefix,
        transform=torchvision.transforms.Compose([
            train_augmentation,
            Stack(roll=(args.arch in ['BNInception', 'InceptionV3'])),
            ToTorchFormatTensor(
                div=(args.arch not in ['BNInception', 'InceptionV3'])),
            normalize,
        ])),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               drop_last=True,
                                               num_workers=args.workers,
                                               pin_memory=True)

    val_loader = torch.utils.data.DataLoader(VideoDataSet(
        root_path,
        val_list,
        num_segments=args.num_segments,
        image_tmpl=prefix,
        random_shift=False,
        transform=torchvision.transforms.Compose([
            GroupScale(int(scale_size)),
            GroupCenterCrop(crop_size),
            Stack(roll=(args.arch in ['BNInception', 'InceptionV3'])),
            ToTorchFormatTensor(
                div=(args.arch not in ['BNInception', 'InceptionV3'])),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss().cuda()

    for group in policies:
        print(('group: {} has {} params, lr_mult: {}, decay_mult: {}'.format(
            group['name'], len(group['params']), group['lr_mult'],
            group['decay_mult'])))

    optimizer = torch.optim.SGD(policies,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.evaluate:
        # single validation pass then terminate the process
        prec1 = validate(val_loader, model, criterion, 0)
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        exit()  # dead `return` that followed this call has been removed

    log_training = open(
        os.path.join(args.checkpoint_dir, 'log', '%s.csv' % store_name), 'w')
    warmup_scheduler = WarmUpLR(optimizer, len(train_loader) * args.warm)

    for epoch in range(args.start_epoch, args.epochs):
        # adjust learning rate (warm-up is handled per-batch inside train())
        if epoch > args.warm:
            adjust_learning_rate(optimizer, epoch, args.lr_steps)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, log_training,
              warmup_scheduler, args)

        # evaluate on validation set
        if ((epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1):
            if 'dropout' not in args.op_code:
                prec1 = validate(val_loader, model, criterion,
                                 (epoch + 1) * len(train_loader), log_training)
            else:
                prec1 = 0.0

            # remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                }, is_best)

    # release the CSV log handle (previously leaked)
    log_training.close()
def f(self, x, return_acc=False):
    """Objective function for layer-freezing search on incremental CIFAR-100.

    ``x`` encodes [target layer index, alpha]; freezes the first ``target``
    conv/BN modules of a pretrained ResNet-50, retrains the remainder on the
    incremental split, evaluates combined base+incremental accuracy, and
    returns |(max_acc - best_acc) - threshold| (or the accuracy itself when
    ``return_acc`` is True).

    NOTE(review): reconstructed from collapsed source — the nesting of the
    evaluation block relative to the batch loop was inferred (evaluation is
    placed once per epoch, after the batch loop); confirm against the original.
    """
    #x, layer number to calculate
    if x.size == 1:
        # scalar input: pad with a default alpha of 0.32 and reshape to (1, 2)
        x = np.append(x, 0.32)
        x = x.reshape(1, 2)
    target = int(x[:, 0])  # number of leading modules to freeze
    print("Start run ", target)
    start_time = default_timer()

    # fresh 60-class ResNet-50, loaded from the module-level `checkpoint` path
    self.net = resnet50(60).cuda()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if device == 'cuda':
        self.net = torch.nn.DataParallel(self.net)
        cudnn.benchmark = True
    self.net.load_state_dict(torch.load(checkpoint), True)

    # replace the head for the incremental task (30 or 10 new classes)
    if self.inc_index == 1:
        self.net.module.fc = nn.Linear(512 * 4, 30).cuda()
    else:
        self.net.module.fc = nn.Linear(512 * 4, 10).cuda()
    self.net.train()

    # freeze the first `target` Conv2d/BatchNorm2d modules; cur_wc counts the
    # frozen conv weights (used for the memory-efficiency metric below)
    cur_wc = 0
    count = 0
    for m in self.net.modules():
        if target == count:
            break
        elif isinstance(m, nn.Conv2d):
            for param in m.parameters():
                cur_wc += param.numel()
                param.requires_grad = False
        elif isinstance(m, nn.BatchNorm2d):
            for param in m.parameters():
                param.requires_grad = False
        count += 1

    # hard-coded dataset roots for the 60/30/10 incremental CIFAR-100 split
    BASE_DATA_ROOT = '/home/bbboming/HDD/Paper/datasets_object/ICIFAR100_60_30_10/BASE/'
    DATA_ROOT = '/home/bbboming/HDD/Paper/datasets_object/ICIFAR100_60_30_10/INC%d/' % self.inc_index

    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD),
    ])
    trainset = datasets.ImageFolder(os.path.join(DATA_ROOT, 'train'), train_transform)
    cifar100_training_loader = torch.utils.data.DataLoader(
        trainset, batch_size=self.batch_size, pin_memory=True, num_workers=4, shuffle=self.shuffle)

    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD),
    ])
    testset = datasets.ImageFolder(os.path.join(DATA_ROOT, 'test'), test_transform)
    cifar100_test_loader = torch.utils.data.DataLoader(
        testset, batch_size=self.batch_size, pin_memory=True, num_workers=4, shuffle=False)

    base_testset = datasets.ImageFolder(os.path.join(BASE_DATA_ROOT, 'test'), test_transform)
    cifar100_base_test_loader = torch.utils.data.DataLoader(
        base_testset, batch_size=self.batch_size, pin_memory=True, num_workers=4, shuffle=False)

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(self.net.parameters(), lr=self.lr, momentum=0.9, weight_decay=5e-4)
    # plateau scheduler stepped on validation loss (see step(avg_loss) below)
    train_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    iter_per_epoch = len(cifar100_training_loader)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * self.warm)

    checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, 'resnet50_inc%d' % self.inc_index, settings.TIME_NOW)
    #create checkpoint folder to save model
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    checkpoint_path = os.path.join(checkpoint_path, '{net}-{target}-{type}.pth')

    best_acc = 0.0
    best_base_acc = 0.0
    best_inc_acc = 0.0
    for epoch in range(1, settings.EPOCH):
        self.net.train()
        # train(epoch)
        for batch_index, (images, labels) in enumerate(cifar100_training_loader):
            images = Variable(images)
            labels = Variable(labels)
            labels = labels.cuda()
            images = images.cuda()

            optimizer.zero_grad()
            outputs = self.net(images)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()

            # per-batch warm-up during the first `self.warm` epochs
            if epoch <= self.warm:
                warmup_scheduler.step()

            n_iter = (epoch - 1) * len(cifar100_training_loader) + batch_index + 1
            #print('[Target {target}] [Training Epoch: {epoch}/{total_epoch}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format(
            #    loss.item(),
            #    optimizer.param_groups[0]['lr'],
            #    target=target,
            #    epoch=epoch,
            #    total_epoch=settings.EPOCH
            #))

        #Evaluation Accuracy
        self.net.eval()
        self.basenet.eval()
        test_loss = 0.0  # cost function error
        correct = 0.0

        #INC Testset: combined 60+N-way prediction via concatenated softmaxes
        for (images, labels) in cifar100_test_loader:
            images = Variable(images)
            labels = Variable(labels)
            images = images.cuda()
            labels = labels.cuda()

            soft_layer = nn.Softmax(dim=1).cuda()
            base_outputs = self.basenet(images)
            outputs = self.net(images)
            loss = loss_function(outputs, labels)
            test_loss += loss.item()

            soft_base = soft_layer(base_outputs)
            soft_inc = soft_layer(outputs)
            softmax = torch.cat([soft_base, soft_inc], dim=1)
            # incremental labels are offset by the 60 base classes
            labels_all = labels + 60
            _, preds = softmax.max(1)
            correct += preds.eq(labels_all).sum()

        #Base Testset: same combined prediction, labels unshifted
        correct_base = 0.0
        for (images, labels) in cifar100_base_test_loader:
            images = Variable(images)
            labels = Variable(labels)
            images = images.cuda()
            labels = labels.cuda()

            soft_layer = nn.Softmax(dim=1).cuda()
            base_outputs = self.basenet(images)
            outputs = self.net(images)

            soft_base = soft_layer(base_outputs)
            soft_inc = soft_layer(outputs)
            softmax = torch.cat([soft_base, soft_inc], dim=1)
            labels_all = labels
            _, preds = softmax.max(1)
            correct_base += preds.eq(labels_all).sum()

        avg_loss = test_loss / len(cifar100_test_loader.dataset)
        base_acc = correct_base.float() / len(cifar100_base_test_loader.dataset)
        inc_acc = correct.float() / len(cifar100_test_loader.dataset)
        acc = (correct.float() + correct_base.float()) / (
            len(cifar100_test_loader.dataset) + len(cifar100_base_test_loader.dataset))
        print(
            'Test set: Average loss: {:.4f}, Accuracy: {:.4f} (BaseAcc {:.4f} IncAcc {:.4f})'
            .format(avg_loss, acc, base_acc, inc_acc))

        # plateau scheduler keyed on the incremental test loss
        train_scheduler.step(avg_loss)

        #start to save best performance model after learning rate decay to 0.01
        if epoch > 10 and best_acc < acc:
            torch.save(
                self.net.state_dict(),
                checkpoint_path.format(target=target, net='resnet50', type='best'))
            best_acc = acc
            best_inc_acc = inc_acc
            best_base_acc = base_acc

    # share_ratio = target / self.count
    # record results into module-level best_dict / history log / csv handle
    best_dict[str(target)] = best_acc.detach().cpu().item()
    memory_efficiency = cur_wc / self.total_wc
    obj_acc = best_acc.detach().cpu().item()
    alpha = x[:, 1].item()
    threshold = 0.02
    target_mem_eff = 0.70

    #Objective Function: distance of accuracy drop from the allowed threshold
    obj_f = np.abs((self.max_acc - obj_acc) - threshold)

    print_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " x= {x}, alpha= {alpha} Memory_Efficiency= {memory_efficiency}, combined_classification_acc= {best_acc}, obj_acc= {obj_acc}, OBJ_F= {obj_f}" \
        .format(x=target, alpha=alpha,best_acc=best_acc, obj_acc=obj_acc, memory_efficiency=memory_efficiency, obj_f=obj_f)

    with open("history.log", "a") as f_hist:
        f_hist.write(print_str + "\n")
    print(print_str)

    # NOTE(review): `csv` here is used as a writable file handle, which shadows
    # the stdlib module name — presumably opened at module level; verify.
    if self.min_acc != 0:
        csv.write("%d, %d, %f, %f, %f, %f, %f\n" %
                  (self.iteration, target, obj_acc, threshold, obj_f, self.min_acc, self.max_acc))
    self.iteration += 1

    end_time = default_timer()
    print("operation time: ", (end_time - start_time))
    if return_acc:
        return (best_acc.detach().cpu().item())
    return (obj_f)
def main(args):
    """Train ResNet-18 (optionally with permute-AdaIN) on CIFAR-100.

    Sets the CUDA device from ``args.gpu``, trains with warm-up + milestone LR
    decay, logs to TensorBoard, and checkpoints the best / periodic weights.
    """
    print(f'started: {args}')
    torch.cuda.set_device(args.gpu)

    cifar100_training_loader = get_training_dataloader(settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, num_workers=4, batch_size=args.b, shuffle=True)
    # NOTE(review): shuffle=True on the *test* loader is unusual — confirm.
    cifar100_test_loader = get_test_dataloader(settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, num_workers=4, batch_size=args.b, shuffle=True)

    # permute-AdaIN is enabled whenever args.padain > 0
    net = resnet18(with_permute_adain=(args.padain > 0), p_adain=args.padain)
    net = net.cuda()
    print(net)

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
    # learning rate decay at the milestone epochs
    train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=settings.MILESTONES, gamma=0.2)
    iter_per_epoch = len(cifar100_training_loader)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)

    checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, 'cifar100', args.net, str(args.padain), settings.TIME_NOW)

    # use tensorboard
    if not os.path.exists(settings.LOG_DIR):
        os.mkdir(settings.LOG_DIR)
    writer = SummaryWriter(
        log_dir=os.path.join(settings.LOG_DIR,
                             args.net + '_padain' + str(args.padain),
                             settings.TIME_NOW))
    # dummy CIFAR-sized input just to record the model graph
    input_tensor = torch.Tensor(1, 3, 32, 32).cuda()
    writer.add_graph(net, input_tensor)

    # create checkpoint folder to save model
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    checkpoint_path = os.path.join(checkpoint_path, '{net}-{type}.pth')

    best_acc = 0.0
    # NOTE(review): range(1, settings.EPOCH) runs EPOCH-1 epochs — confirm the
    # last epoch is intentionally skipped.
    for epoch in range(1, settings.EPOCH):
        if epoch > args.warm:
            train_scheduler.step(epoch)

        train(net, cifar100_training_loader, warmup_scheduler, optimizer, loss_function, writer, epoch, args.warm, args.b)
        acc = eval_training(net, cifar100_test_loader, loss_function, writer, epoch)

        # start to save best performance model after learning rate decay to 0.01
        if epoch > settings.MILESTONES[1] and best_acc < acc:
            torch.save(net.state_dict(), checkpoint_path.format(net=args.net, type='best'))
            best_acc = acc
            continue
        # periodic snapshot every SAVE_EPOCH epochs
        if not epoch % settings.SAVE_EPOCH:
            torch.save(net.state_dict(), checkpoint_path.format(net=args.net, type='other'))

    writer.close()
def train_variant(conv, fcl, args):
    """Build and train one VGG variant (conv/FC widths given by ``conv``/``fcl``).

    Supports resuming: reloads the best and most recent weights from the most
    recent checkpoint folder and skips already-completed epochs. Logs to
    TensorBoard and saves best/periodic checkpoints.

    NOTE(review): reconstructed from collapsed source — relies on module-level
    ``cifar100_training_loader`` and the project ``train``/``eval_training``
    helpers; nesting of the resume print was inferred.
    """
    net, arch_name = construct_vgg_variant(conv_variant=conv, fcl_variant=fcl, batch_norm=True, progress=True, pretrained=False)
    # expose the generated architecture name to the rest of the pipeline
    args.net = arch_name
    if args.gpu:  #use_gpu
        net = net.cuda()

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
    # learning rate decay at the milestone epochs
    train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=settings.MILESTONES, gamma=0.2)
    iter_per_epoch = len(cifar100_training_loader)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)

    if args.resume:
        recent_folder = most_recent_folder(os.path.join(settings.CHECKPOINT_PATH, args.net), fmt=settings.DATE_FORMAT)
        if not recent_folder:
            raise Exception('no recent folder were found')
        checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder)
    else:
        checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, settings.TIME_NOW)

    #use tensorboard
    if not os.path.exists(settings.LOG_DIR):
        os.mkdir(settings.LOG_DIR)
    #since tensorboard can't overwrite old values
    #so the only way is to create a new tensorboard log
    writer = SummaryWriter(log_dir=os.path.join(settings.LOG_DIR, args.net, settings.TIME_NOW))
    if args.gpu:
        input_tensor = torch.Tensor(1, 3, 32, 32).cuda()
    else:
        input_tensor = torch.Tensor(1, 3, 32, 32)
    writer.add_graph(net, input_tensor)

    #create checkpoint folder to save model
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')

    best_acc = 0.0
    if args.resume:
        # seed best_acc from the previously saved best weights, if any
        best_weights = best_acc_weights(os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
        if best_weights:
            weights_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder, best_weights)
            print('found best acc weights file:{}'.format(weights_path))
            print('load best training file to test acc...')
            net.load_state_dict(torch.load(weights_path))
            best_acc = eval_training(tb=False)
            print('best acc is {:0.2f}'.format(best_acc))

        # then load the most recent weights to continue training from them
        recent_weights_file = most_recent_weights(os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
        if not recent_weights_file:
            raise Exception('no recent weights file were found')
        weights_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder, recent_weights_file)
        print('loading weights file {} to resume training.....'.format(weights_path))
        net.load_state_dict(torch.load(weights_path))
        resume_epoch = last_epoch(os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))

    train_params = {
        'net': net,
        'warmup_scheduler': warmup_scheduler,
        'loss_function': loss_function,
        'optimizer': optimizer,
        'writer': writer
    }
    # NOTE(review): range(1, settings.EPOCH) runs EPOCH-1 epochs — confirm.
    for epoch in range(1, settings.EPOCH):
        # for epoch in [1]:# range(1, 2):
        if epoch > args.warm:
            train_scheduler.step(epoch)
        if args.resume:
            # skip epochs already completed before the resume point
            if epoch <= resume_epoch:
                continue

        train(epoch=epoch, **train_params)
        acc = eval_training(epoch=epoch, **train_params)

        #start to save best performance model after learning rate decay to 0.01
        if epoch > settings.MILESTONES[1] and best_acc < acc:
            torch.save(
                net.state_dict(),
                checkpoint_path.format(net=args.net, epoch=epoch, type='best'))
            best_acc = acc
            continue
        if not epoch % settings.SAVE_EPOCH:
            torch.save(
                net.state_dict(),
                checkpoint_path.format(net=args.net, epoch=epoch, type='regular'))

    writer.close()
class Trainer:
    """Model training harness: plain or adversarially-robust training,
    evaluation, best-checkpoint saving, and a transfer-learning entry point.

    NOTE(review): reconstructed from collapsed source — statement nesting inside
    ``train`` (warm-up step vs. LR-change detection) was inferred; confirm
    against the original file.
    """

    def __init__(self, model: Module, train_loader: DataLoader, test_loader: DataLoader, device=DEFAULT_DEVICE, lr=DEFAULT_LR, momentum=DEFAULT_MOMENTUM, epochs=DEFAULT_EPOCHS, batch_size=DEFAULT_BATCH_SIZE, parallelism=DEFAULT_PARALLELISM, milestones=MILESTONES, gamma=0.2, warm_phases=WARM_PHASES, criterion=loss.CrossEntropyLoss()):
        """Place the model, build SGD + MultiStepLR + warm-up schedulers,
        and bundle the hyper-parameters into ``self.hp``."""
        print("initialize trainer")
        # parameter pre-processing
        self.test_loader = test_loader
        # multi-GPU only when available AND requested
        if torch.cuda.device_count() > 1 and parallelism:
            print(f"using {torch.cuda.device_count()} GPUs")
            self.model = nn.DataParallel(model)
        else:
            self.model = model
        self.model.to(device)
        optimizer = optim.SGD(
            # choose whether train or not: only parameters with requires_grad
            filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=lr,
            momentum=momentum,
            weight_decay=5e-4)
        train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=gamma)
        # warm phases
        self.warm_phases = warm_phases
        # warmup learning rate: one step per batch for warm_phases epochs
        self.warmup_scheduler = WarmUpLR(optimizer, len(train_loader) * self.warm_phases)
        self.hp = HyperParameter(scheduler=train_scheduler, optimizer=optimizer, criterion=criterion, batch_size=batch_size, epochs=epochs, device=device)
        self.train_loader = train_loader
        print("initialize finished")
        print(f"hyper parameter: {self.hp}")

    def train(self, save_path, attack=False, attacker=None, params: Dict = None):
        """Run the full training loop; saves the best model to ``save_path``.

        When ``attack`` is True the loss combines clean and adversarial
        examples produced by ``attacker`` (PGD-style robust training).
        """
        self._init_attacker(attack, attacker, params)
        batch_number = len(self.train_loader)

        # get current learning rate (tracked so changes can be reported)
        now_lr = self.hp.optimizer.state_dict().get("param_groups")[0].get("lr")
        # record best accuracy
        best_acc = 0

        for ep in range(1, self.hp.epochs + 1):
            training_acc, running_loss = 0, .0
            start_time = time.process_time()

            for index, data in enumerate(self.train_loader):
                inputs, labels = data[0].to(self.hp.device), data[1].to(self.hp.device)

                self.hp.optimizer.zero_grad()

                if attack:
                    # calculate this first, for this will zero the grad
                    adv_inputs = self.attacker.calc_perturbation(inputs, labels)
                    # zero the grad again after the attacker touched it
                    self.hp.optimizer.zero_grad()
                    outputs = self.model(inputs)
                    adv_outputs = self.model(adv_inputs)
                    # sum of clean and adversarial losses
                    _loss = self.hp.criterion(outputs, labels) + self.hp.criterion(adv_outputs, labels)
                else:
                    outputs = self.model(inputs)
                    _loss = self.hp.criterion(outputs, labels)

                _loss.backward()
                self.hp.optimizer.step()

                outputs: torch.Tensor
                # running mean of per-batch accuracy
                training_acc += (outputs.argmax(dim=1) == labels).float().mean().item()

                # warm up learning rate (per-batch) during warm phases
                if ep <= self.warm_phases:
                    self.warmup_scheduler.step()

                # detect learning rate change
                new_lr = self.hp.optimizer.state_dict().get("param_groups")[0].get("lr")
                if new_lr != now_lr:
                    now_lr = new_lr
                    print(f"learning rate changes to {now_lr:.6f}")

                running_loss += _loss.item()

                # end of epoch: evaluate on the test set and report
                if index % batch_number == batch_number - 1:
                    end_time = time.process_time()
                    acc = self.test(self.model, test_loader=self.test_loader, device=self.hp.device)
                    print(
                        f"epoch: {ep} loss: {(running_loss / batch_number):.6f} train accuracy: {training_acc / batch_number} "
                        f"test accuracy: {acc} time: {end_time - start_time:.2f}s"
                    )
                    if best_acc < acc:
                        best_acc = acc
                        self._save_best_model(save_path, ep, acc)

            # change learning rate by step (milestone scheduler, per epoch)
            self.hp.scheduler.step(ep)

        torch.save(self.model.state_dict(), f"{save_path}-latest")
        print("finished training")
        print(f"best accuracy on test set: {best_acc}")

    @staticmethod
    def test(model: Module, test_loader, device, debug=False):
        """Return top-1 accuracy of ``model`` over ``test_loader`` (no grad)."""
        correct = 0
        with torch.no_grad():
            for data in test_loader:
                inputs, labels = data[0].to(device), data[1].to(device)
                _, y_hats = model(inputs).max(1)
                match = (y_hats == labels)
                correct += len(match.nonzero())
        if debug:
            print(f"Testing: {len(test_loader.dataset)}")
            print(f"correct: {correct}")
            print(f"accuracy: {100*correct/len(test_loader.dataset):.3f}%")
        return correct / len(test_loader.dataset)

    def _init_attacker(self, attack, attacker, params):
        """Instantiate the attacker (robust training) or log normal mode."""
        self.attack = attack
        if attack:
            print(f"robustness training with {attacker.__name__}")
            self.attacker = attacker(self.model, **params)
            self.attacker.print_params()
        else:
            print("normal training")

    def _save_best_model(self, save_path, current_epochs, accuracy):
        """save best model with current info"""
        info = {
            "current_epochs": current_epochs,
            "total_epochs": self.hp.epochs,
            "accuracy": accuracy
        }
        if self.attack:
            # record attack metadata alongside the checkpoint
            info.update({
                "attack": self.attack,
                "attacker": type(self.attacker).__name__,
                "epsilons": self.attacker.epsilon,
            })
        with open(os.path.join(os.path.dirname(save_path), "info.json"), "w", encoding="utf8") as f:
            json.dump(info, f)
        torch.save(self.model.state_dict(), f"{save_path}-best")

    @staticmethod
    def train_tl(origin_model_path, save_path, train_loader, test_loader, device, choice="resnet50"):
        """Transfer-learning entry point: load a pretrained TLResNet and train it."""
        print(f"transform learning on model: {origin_model_path}")
        model = TLResNet.create_model(choice)
        model.load_model(origin_model_path)
        trainer = Trainer(model=model, train_loader=train_loader, test_loader=test_loader, device=device)
        trainer.train(save_path)