def optimize_epoch(self, optimizer, loader, epoch, validation=False):
    logger.warning(f"Starting epoch {epoch}, validation: {validation} " + "=" * 30)
    loss_value = util.AverageMeter()

    # housekeeping
    self.model.train()
    if self.lr_schedule(epoch + 1) != self.lr_schedule(epoch):
        files.save_checkpoint_all(self.checkpoint_dir, self.model, args.arch,
                                  optimizer, self.L, epoch, lowest=False,
                                  save_str='pre-lr-drop')
    lr = self.lr_schedule(epoch)
    for pg in optimizer.param_groups:
        pg['lr'] = lr

    XE = torch.nn.CrossEntropyLoss()
    for iter, (data, label, selected) in tqdm(enumerate(loader),
                                              desc="epoch={}/{}".format(epoch, args.epochs)):
        now = time.time()
        niter = epoch * len(loader) + iter

        if niter * args.batch_size >= self.optimize_times[-1]:
            # optimize labels #########################################
            self.model.headcount = 1
            logger.warning('Optimization starting')
            with torch.no_grad():
                _ = self.optimize_times.pop()
                self.optimize_labels(niter)

        data = data.to(self.dev)
        mass = data.size(0)
        final = self.model(data)

        # train CNN ###################################################
        if self.hc == 1:
            loss = XE(final, self.L[0, selected])
        else:
            loss = torch.mean(torch.stack(
                [XE(final[h], self.L[h, selected]) for h in range(self.hc)]))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_value.update(loss.item(), mass)
        data = 0

        # some logging stuff ##########################################
        if iter % args.log_iter == 0:
            if self.writer:
                self.writer.add_scalar('lr', self.lr_schedule(epoch), niter)
            logger.info("{} Loss: {:.3f}".format(niter, loss.item()))
            logger.info("{} Freq: {:.2f}".format(niter, mass / (time.time() - now)))
            if self.writer:
                self.writer.add_scalar('Loss', loss.item(), niter)
                if iter > 0:
                    self.writer.add_scalar('Freq(Hz)', mass / (time.time() - now), niter)

    # end of epoch logging ############################################
    if self.writer and (epoch % args.log_intv == 0):
        util.write_conv(self.writer, self.model, epoch=epoch)
    files.save_checkpoint_all(self.checkpoint_dir, self.model, args.arch,
                              optimizer, self.L, epoch, lowest=False)

    return {'loss': loss_value.avg}
def optimize(self):
    """Perform full optimization."""
    first_epoch = 0
    self.model = self.model.to(self.dev)
    N = len(self.pseudo_loader.dataset)

    # Optimization times (spread quadratically, denser early in training);
    # in practice this could also just be linear (i.e. every n-th epoch).
    self.optimize_times = [(self.num_epochs + 2) * N] + \
        ((self.num_epochs + 1.01) * N * (np.linspace(0, 1, args.nopts) ** 2)[::-1]).tolist()

    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, self.model.parameters()),
                                weight_decay=self.weight_decay,
                                momentum=self.momentum,
                                lr=self.lr)

    if self.checkpoint_dir is not None and self.resume:
        self.L, first_epoch = files.load_checkpoint_all(self.checkpoint_dir, self.model, optimizer)
        logger.warning('found first epoch to be {}'.format(first_epoch))
        include = [(qq / N >= first_epoch) for qq in self.optimize_times]
        self.optimize_times = (np.array(self.optimize_times)[include]).tolist()
    logger.warning('We will optimize L at epochs: {}'.format(
        [np.round(1.0 * t / N, 2) for t in self.optimize_times]))

    if first_epoch == 0:
        # initialize labels as a shuffled, balanced assignment.
        self.L = np.zeros((self.hc, N), dtype=np.int32)
        for nh in range(self.hc):
            for _i in range(N):
                self.L[nh, _i] = _i % self.outs[nh]
            self.L[nh] = np.random.permutation(self.L[nh])
        self.L = torch.LongTensor(self.L).to(self.dev)

    # Perform optimization ###############################################
    lowest_loss = 1e9
    epoch = first_epoch
    while epoch < (self.num_epochs + 1):
        m = self.optimize_epoch(optimizer, self.train_loader, epoch, validation=False)
        if m['loss'] < lowest_loss:
            lowest_loss = m['loss']
            files.save_checkpoint_all(self.checkpoint_dir, self.model, args.arch,
                                      optimizer, self.L, epoch, lowest=True)
        epoch += 1
    logger.info(f"optimization completed. Saving model to "
                f"{os.path.join(self.checkpoint_dir, 'model_final.pth.tar')}")
    torch.save(self.model, os.path.join(self.checkpoint_dir, 'model_final.pth.tar'))
    return self.model
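# Illustrative sketch (not part of the original class; the helper name is hypothetical,
# and numpy is assumed to be imported as np as elsewhere in this module): the schedule
# built above spreads the pseudo-label optimization points quadratically, so label
# updates happen frequently early in training and rarely towards the end. The training
# loop pops thresholds from the end of this list and re-optimizes L whenever the number
# of processed samples crosses the next threshold.
def label_optimization_schedule(num_epochs, dataset_size, nopts):
    """Return descending sample-count thresholds at which pseudo-labels are re-optimized."""
    return [(num_epochs + 2) * dataset_size] + \
        ((num_epochs + 1.01) * dataset_size * (np.linspace(0, 1, nopts) ** 2)[::-1]).tolist()

# e.g. with num_epochs=10, dataset_size=50000, nopts=5, dividing each threshold by the
# dataset size gives roughly [12.0, 11.0, 6.2, 2.8, 0.7, 0.0] epochs, i.e. the gaps
# between consecutive label updates grow as training progresses.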
def eval_epoch(self, d_loader):
    self.model.eval()

    eval_dict = {}
    total_loss = 0.0
    count = 1.0
    for i, data in tqdm.tqdm(enumerate(d_loader, 0), total=len(d_loader),
                             leave=False, desc='val'):
        self.optimizer.zero_grad()

        _, loss, eval_res = self.model_fn(self.model, data, eval=True)

        total_loss += loss.item()
        count += 1
        for k, v in eval_res.items():
            if v is not None:
                eval_dict[k] = eval_dict.get(k, []) + [v]

    logger.warning("evaluation loss={}, result={}".format(total_loss / count, eval_dict))
    return total_loss / count, eval_dict
def model_summary(model_list):
    if not isinstance(model_list, list):
        model_list = [model_list]

    from functools import reduce
    from operator import mul

    for model in model_list:
        data = []
        trainable_param_num = 0
        all_param_num = 0
        for key, value in model.named_parameters():
            data.append([key, list(value.size()), value.requires_grad, value.dtype,
                         value.device, value.is_leaf, str(value.grad_fn)])
            _num = reduce(mul, list(value.size()), 1)
            all_param_num += _num
            if value.requires_grad:
                trainable_param_num += _num
        table = tabulate(data, headers=["name", "shape", "requires_grad", "dtype",
                                        "device", "is_leaf", "grad_fn"])
        logger.warning("Arg Parameters: #param={}, #param(trainable)={}".format(
            all_param_num, trainable_param_num))
        logger.info(colored("Model Summary", "cyan"))
        logger.info("\n\n" + table)
        logger.info(model)
    return all_param_num, trainable_param_num
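# Illustrative usage (the model below is hypothetical): model_summary tabulates every
# named parameter with its shape, dtype, device and grad status, and returns the total
# and trainable parameter counts.
#
#   total, trainable = model_summary(torchvision.models.resnet18())
#   logger.info("trainable ratio: {:.2%}".format(trainable / total))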
def load_checkpoint(model=None, optimizer=None, filename='checkpoint'):
    filename = "{}.pth.tar".format(filename)
    if os.path.isfile(filename):
        print("==> Loading from checkpoint '{}'".format(filename))
        checkpoint = torch.load(filename)
        epoch = checkpoint['epoch']
        it = checkpoint.get('it', 0.0)
        best_prec = checkpoint['best_prec']
        logger.warning("checkpoint it: {}, best_prec: {}".format(it, best_prec))
        if model is not None and checkpoint['model_state'] is not None:
            logger.warning("load model_state")
            model.load_state_dict(checkpoint['model_state'])
        if optimizer is not None and checkpoint['optimizer_state'] is not None:
            logger.warning("load optimizer_state")
            optimizer.load_state_dict(checkpoint['optimizer_state'])
        print("==> Done")
        return it, epoch, best_prec
    else:
        logger.warning("==> Checkpoint '{}' not found".format(filename))
        raise FileNotFoundError(filename)
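# Illustrative resume sketch (assumed calling convention, mirroring how this helper is
# consumed further below; the filename is hypothetical): unpack the stored iteration,
# epoch and best metric, then let the LR scheduler continue from the restored iteration.
#
#   it, start_epoch, best_prec = load_checkpoint(model, optimizer, filename="checkpoints/ckpt_best")
#   lr_scheduler = lr_sched.LambdaLR(optimizer, lr_lambda=lr_lbmd, last_epoch=it)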
        pytorchgo_args.get_args().epochs))
    # optimizer_summary(optimizer)
    cpu_prototype = model.prototype_N2K.detach().cpu().numpy()
    return cpu_prototype


optimizer_summary(optimizer)
model_summary(model)
pytorchgo_args.get_args().step = 0

for epoch in range(start_epoch, start_epoch + args.epochs):
    if args.debug and epoch >= 2:
        break
    prototype = train(epoch)
    feature_return_switch(model, True)
    logger.warning(logger.get_logger_dir())
    logger.warning("doing KNN evaluation.")
    acc = kNN(model, trainloader, testloader, K=10, sigma=0.1, dim=knn_dim)
    logger.warning("finished KNN evaluation.")
    feature_return_switch(model, False)
    if acc > best_acc:
        logger.info('got a better result, saving..')
        state = {
            'net': model.state_dict(),
            'acc': acc,
            'epoch': epoch,
            'opt': optimizer.state_dict(),
            'prototype': prototype,
        }
        torch.save(state, os.path.join(logger.get_logger_dir(), 'best_ckpt.t7'))
def kNN(net, trainloader, testloader, K, sigma=0.1, dim=128, use_pca=False):
    net.eval()

    # This part is ugly but kept for backwards compatibility: the CIFAR dataset's
    # structure changed between torchvision versions, so the train labels can live
    # in several different attributes.
    if hasattr(trainloader.dataset, 'imgs'):
        trainLabels = torch.LongTensor(
            [y for (p, y) in trainloader.dataset.imgs])  # .cuda()
    elif hasattr(trainloader.dataset, 'indices'):
        trainLabels = torch.LongTensor([
            k for path, k in trainloader.dataset.dataset.dt.imgs
        ])[trainloader.dataset.indices]
    elif hasattr(trainloader.dataset, 'train_labels'):
        trainLabels = torch.LongTensor(
            trainloader.dataset.train_labels)  # .cuda()
    elif hasattr(trainloader.dataset, 'dt'):
        if hasattr(trainloader.dataset.dt, 'targets'):
            trainLabels = torch.LongTensor(
                trainloader.dataset.dt.targets)  # .cuda()
        else:  # hasattr(trainloader.dataset.dt, 'imgs')
            trainLabels = torch.LongTensor(
                [k for path, k in trainloader.dataset.dt.imgs])  # .cuda()
    else:
        trainLabels = torch.LongTensor(trainloader.dataset.targets)  # .cuda()
    C = trainLabels.max() + 1

    # Temporarily compute train features with the test-time transform.
    if hasattr(trainloader.dataset, 'transform'):
        transform_bak = trainloader.dataset.transform
        trainloader.dataset.transform = testloader.dataset.transform
    elif hasattr(trainloader.dataset.dataset.dt, 'transform'):
        transform_bak = trainloader.dataset.dataset.dt.transform
        trainloader.dataset.dataset.dt.transform = testloader.dataset.dt.transform
    else:
        transform_bak = trainloader.dataset.dt.transform
        trainloader.dataset.dt.transform = testloader.dataset.dt.transform

    temploader = torch.utils.data.DataLoader(trainloader.dataset, batch_size=64, num_workers=1)

    if hasattr(trainloader.dataset, 'indices'):
        LEN = len(trainloader.dataset.indices)
    else:
        LEN = len(trainloader.dataset)

    trainFeatures = torch.zeros((dim, LEN))  # , device='cuda:0')
    normalize = Normalize()
    for batch_idx, (inputs, targets, _) in enumerate(temploader):
        if pytorchgo_args.get_args().debug and batch_idx > 1:
            break
        batchSize = inputs.size(0)
        inputs = inputs.cuda()
        features = net(inputs)
        if not use_pca:
            features = normalize(features)
        trainFeatures[:, batch_idx * batchSize:batch_idx * batchSize + batchSize] = \
            features.data.t().cpu()

    if hasattr(temploader.dataset, 'imgs'):
        trainLabels = torch.LongTensor(
            temploader.dataset.train_labels)  # .cuda()
    elif hasattr(temploader.dataset, 'indices'):
        trainLabels = torch.LongTensor([
            k for path, k in temploader.dataset.dataset.dt.imgs
        ])[temploader.dataset.indices]
    elif hasattr(temploader.dataset, 'train_labels'):
        trainLabels = torch.LongTensor(
            temploader.dataset.train_labels)  # .cuda()
    elif hasattr(temploader.dataset, 'targets'):
        trainLabels = torch.LongTensor(temploader.dataset.targets)  # .cuda()
    elif hasattr(temploader.dataset.dt, 'imgs'):
        trainLabels = torch.LongTensor(
            [k for path, k in temploader.dataset.dt.imgs])  # .cuda()
    elif hasattr(temploader.dataset.dt, 'targets'):
        trainLabels = torch.LongTensor(temploader.dataset.dt.targets)  # .cuda()
    else:
        trainLabels = torch.LongTensor(temploader.dataset.labels)  # .cuda()
    trainLabels = trainLabels.cpu()

    # Restore the original train-time transform.
    if hasattr(trainloader.dataset, 'transform'):
        trainloader.dataset.transform = transform_bak
    elif hasattr(trainloader.dataset, 'indices'):
        trainloader.dataset.dataset.dt.transform = transform_bak
    else:
        trainloader.dataset.dt.transform = transform_bak

    if use_pca:
        comps = 128
        logger.warning('doing PCA with {} components'.format(comps))
        from sklearn.decomposition import PCA
        pca = PCA(n_components=comps, whiten=False)
        trainFeatures = pca.fit_transform(trainFeatures.numpy().T)
        trainFeatures = torch.Tensor(trainFeatures)
        trainFeatures = normalize(trainFeatures).t()
    logger.warning('..done')

    def eval_k_s(K_, sigma_):
        total = 0
        top1 = 0.
        top5 = 0.
        with torch.no_grad():
            retrieval_one_hot = torch.zeros(K_, C)  # .cuda()
            for batch_idx, (inputs, targets, _) in enumerate(testloader):
                targets = targets  # .cuda(async=True)  # or without async for py3.7
                inputs = inputs.cuda()
                batchSize = inputs.size(0)
                features = net(inputs)
                if use_pca:
                    features = pca.transform(features.cpu().numpy())
                    features = torch.Tensor(features).cuda()
                features = normalize(features).cpu()

                dist = torch.mm(features, trainFeatures)
                yd, yi = dist.topk(K_, dim=1, largest=True, sorted=True)
                candidates = trainLabels.view(1, -1).expand(batchSize, -1)
                retrieval = torch.gather(candidates, 1, yi)
                retrieval_one_hot.resize_(batchSize * K_, C).zero_()
                retrieval_one_hot.scatter_(1, retrieval.view(-1, 1), 1)
                yd_transform = yd.clone().div_(sigma_).exp_()
                probs = torch.sum(
                    torch.mul(retrieval_one_hot.view(batchSize, -1, C),
                              yd_transform.view(batchSize, -1, 1)), 1)
                _, predictions = probs.sort(1, True)
                # Find which predictions match the target
                correct = predictions.eq(targets.data.view(-1, 1))
                top1 = top1 + correct.narrow(1, 0, 1).sum().item()
                top5 = top5 + correct.narrow(1, 0, 5).sum().item()
                total += targets.size(0)
        logger.warning(f"{K_}-NN, s={sigma_}: TOP1: {top1 * 100. / total}")
        return top1 / total

    if isinstance(K, list):
        res = []
        for K_ in K:
            for sigma_ in sigma:
                res.append(eval_k_s(K_, sigma_))
        return res
    else:
        res = eval_k_s(K, sigma)
        return res
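# Minimal single-query sketch of the weighted k-NN vote that eval_k_s performs in
# batched form above (illustrative helper, not called by the evaluation code; assumes
# torch is already imported in this module): each of the K nearest training neighbours
# votes for its class with weight exp(similarity / sigma), and the class with the
# largest total weight wins.
def weighted_knn_vote(query_feat, train_feats, train_labels, num_classes, K=10, sigma=0.1):
    """query_feat: (D,), train_feats: (D, N) L2-normalized, train_labels: (N,) LongTensor."""
    sims = query_feat @ train_feats                   # cosine similarities, shape (N,)
    topk_sim, topk_idx = sims.topk(K, largest=True)   # K most similar training samples
    weights = (topk_sim / sigma).exp()                # temperature-scaled vote weights
    scores = torch.zeros(num_classes)
    scores.index_add_(0, train_labels[topk_idx], weights)
    return scores.argmax().item()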
try:
    args.device = [int(item) for item in args.device.split(',')]
except AttributeError:
    args.device = [int(args.device)]
args.modeldevice = args.device
util.setup_runtime(seed=42, cuda_dev_id=list(np.unique(args.modeldevice + args.device)))
logger.info(args)
logger.info(name)
time.sleep(5)

writer = SummaryWriter('./runs/%s' % name)
writer.add_text('args', " \n".join(['%s %s' % (arg, getattr(args, arg)) for arg in vars(args)]))

# Setup model and train_loader
model, train_loader = return_model_loader(args)
logger.warning("dataset len={}".format(len(train_loader.dataset)))

model.to('cuda:0')
if torch.cuda.device_count() > 1:
    logger.info("Let's use {} GPUs for the model".format(len(args.modeldevice)))
    if len(args.modeldevice) == 1:
        logger.warning('single GPU model')
    else:
        model.features = nn.DataParallel(model.features,
                                         device_ids=list(range(len(args.modeldevice))))

# Setup optimizer
o = Optimizer(m=model, hc=args.hc, ncl=args.ncl, t_loader=train_loader,
              n_epochs=args.epochs, lr=args.lr, weight_decay=10**args.wd,
              ckpt_dir=os.path.join(args.exp, 'checkpoints'))
o.writer = writer

# Optimize
o.optimize()
    lr_clip / args.lr,
)
bn_lbmd = lambda it: max(
    args.bn_momentum * args.bnm_decay ** (int(it * args.batch_size / args.decay_step)),
    bnm_clip,
)

# default values
it = -1  # for the initial value of `LambdaLR` and `BNMomentumScheduler`
best_loss = 1e10
start_epoch = 1

# load status from checkpoint
if args.checkpoint is not None:
    logger.warning("loading checkpoint weight file")
    checkpoint_status = pt_utils.load_checkpoint(
        model, optimizer, filename=args.checkpoint
    )
    if checkpoint_status is not None:
        it, start_epoch, best_loss = checkpoint_status

lr_scheduler = lr_sched.LambdaLR(optimizer, lr_lambda=lr_lbmd, last_epoch=it)
bnm_scheduler = pt_utils.BNMomentumScheduler(
    model, bn_lambda=bn_lbmd, last_epoch=it
)

it = max(it, 0)  # for the initial value of `trainer.train`

if args.pointmixup:
    model_fn = model_fn_decorator_mix(cross_entropy_with_probs, nn.CrossEntropyLoss(),
                                      num_class=num_class)
else:
    model_fn = model_fn_decorator(nn.CrossEntropyLoss())
def train(self, start_it, start_epoch, n_epochs, train_loader, test_loader=None,
          best_loss=0.0, writer=None):
    r"""
    Call to begin training the model

    Parameters
    ----------
    start_it : int
        Iteration to start at
    start_epoch : int
        Epoch to start at
    n_epochs : int
        Number of epochs to train for
    train_loader : torch.utils.data.DataLoader
        DataLoader of training data
    test_loader : torch.utils.data.DataLoader
        DataLoader of the test data
    best_loss : float
        Testing loss of the best model
    writer : SummaryWriter, optional
        Writer for validation metrics
    """
    eval_frequency = (self.eval_frequency
                      if self.eval_frequency > 0 else len(train_loader))

    it = start_it
    best_acc = -1
    with tqdm.trange(start_epoch, n_epochs + 1, desc='epochs') as tbar, \
            tqdm.tqdm(total=eval_frequency, leave=False, desc='train') as pbar:
        for epoch in tbar:
            for batch in train_loader:
                # defaults used in the no-mixup case
                idx_minor = None
                mixrates = None
                strategy = None
                manilayer_batch = 0

                if self.n_strategies > 0:
                    strategy_idx = np.random.randint(self.n_strategies)
                    strategy = self.strategies[strategy_idx]
                    if self.manimixup:
                        manilayer_batch = np.random.randint(self.manilayer_all)
                    else:
                        manilayer_batch = 0
                    B, N, C = batch[0].shape
                    idx_minor = torch.randperm(B)
                    mixrates = (0.5 - np.abs(np.random.beta(self.alpha, self.alpha, B) - 0.5))
                    label_main = batch[1]
                    label_minor = batch[1][idx_minor]
                    label = torch.zeros(B, self.n_class)
                    for i in range(B):
                        if label_main[i] == label_minor[i]:
                            # same label: plain one-hot target
                            label[i][label_main[i]] = 1.0
                        else:
                            label[i][label_main[i]] = 1 - mixrates[i]
                            label[i][label_minor[i]] = mixrates[i]
                    batch[1] = label

                res = self._train_it(it, batch, idx_minor, mixrates, strategy, manilayer_batch)
                it += 1

                pbar.update()
                pbar.set_postfix(dict(total_it=it))
                tbar.refresh()

                if (it % eval_frequency) == 0:
                    pbar.close()

                    if test_loader is not None:
                        val_loss, res = self.eval_epoch(test_loader)
                        if writer is not None:
                            writer.add_scalar('{}/valacc'.format(self.savename),
                                              np.mean(res['acc']), epoch)
                            writer.add_scalar('{}/valloss'.format(self.savename),
                                              np.mean(res['loss']), epoch)
                        # is_best = val_loss < best_loss
                        best_loss = min(best_loss, val_loss)
                        is_best = np.mean(res['acc']) > best_acc
                        if is_best:
                            best_acc = np.mean(res['acc'])
                        logger.warning("eval_loss={}, eval_acc={}, eval_best_acc={}".format(
                            np.mean(res['loss']), np.mean(res['acc']), best_acc))
                        save_checkpoint(
                            checkpoint_state(self.model, self.optimizer, val_loss, epoch, it),
                            is_best,
                            filename=self.checkpoint_name,
                            bestname=self.best_name)

                    pbar = tqdm.tqdm(total=eval_frequency, leave=False, desc='train')
                    pbar.set_postfix(dict(total_it=it))

    return best_loss
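# Minimal per-sample sketch of the soft mixup target built in the loop above
# (illustrative helper, not called by the trainer; assumes torch is already imported):
# the main label keeps weight 1 - r and the label of the randomly paired sample gets
# weight r; when both labels agree the target collapses to a plain one-hot vector.
def soft_mixup_target(label_main, label_minor, mixrate, num_classes):
    target = torch.zeros(num_classes)
    if label_main == label_minor:
        target[label_main] = 1.0
    else:
        target[label_main] = 1.0 - mixrate
        target[label_minor] = mixrate
    return target

# e.g. soft_mixup_target(3, 7, 0.2, num_classes=10) puts 0.8 on class 3 and 0.2 on class 7.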
testset = CIFAR100Instance(root=args.datadir, train=False, download=True,
                           transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=100,
                                         shuffle=False, num_workers=2)

logger.info('==> Building model..')  ##########################################
numc = [args.ncl] * args.hc
model = models.__dict__[args.arch](num_classes=numc, return_features=False)
knn_dim = 4096

N = len(trainloader.dataset)
optimize_times = ((args.epochs + 1.0001) * N * (np.linspace(0, 1, args.nopts))[::-1]).tolist()
optimize_times = [(args.epochs + 10) * N] + optimize_times
logger.warning('We will optimize L at epochs: {}'.format(
    [np.round(1.0 * t / N, 2) for t in optimize_times]))

# init selflabels randomly
if args.hc == 1:
    selflabels = np.zeros(N, dtype=np.int32)
    for qq in range(N):
        selflabels[qq] = qq % args.ncl
    selflabels = np.random.permutation(selflabels)
    selflabels = torch.LongTensor(selflabels).cuda()
else:
    selflabels = np.zeros((args.hc, N), dtype=np.int32)
    for nh in range(args.hc):
        for _i in range(N):
            selflabels[nh, _i] = _i % numc[nh]
        selflabels[nh] = np.random.permutation(selflabels[nh])
    selflabels = torch.LongTensor(selflabels).cuda()