def step(self, trn_X, trn_y, val_X, val_y, xi, w_optim, a_optim):
    """Compute the architecture loss on the validation batch and backpropagate
    its gradients to the architecture parameters.

    Args:
        xi: learning rate for the virtual gradient step (same as net lr)
        w_optim: weights optimizer, used for the virtual step
    """
    a_optim.zero_grad()
    # sample k candidate ops per mixed module
    if self.sample:
        NASModule.param_module_call('sample_ops', n_samples=self.n_samples)
    loss = self.net.loss(val_X, val_y)
    # gather the mixed-op outputs recorded on each device, differentiate the
    # loss w.r.t. them, then push the gradients back to the arch parameters
    m_out_dev = []
    for dev_id in NASModule.get_device():
        m_out = [m.get_state('m_out' + dev_id) for m in NASModule.modules()]
        m_len = len(m_out)
        m_out_dev.extend(m_out)
    m_grad = torch.autograd.grad(loss, m_out_dev)
    for i, dev_id in enumerate(NASModule.get_device()):
        NASModule.param_backward_from_grad(
            m_grad[i * m_len:(i + 1) * m_len], dev_id)
    if not self.renorm:
        a_optim.step()
    else:
        # renormalization: keep the softmax mass ratio between sampled and
        # unsampled ops unchanged across the alpha update
        prev_pw = []
        for p, m in NASModule.param_modules():
            s_op = m.get_state('s_op')
            pdt = p.detach()
            pp = pdt.index_select(-1, s_op)
            if pp.size() == pdt.size():
                # all ops were sampled: nothing to renormalize for this module
                prev_pw.append(None)
                continue
            k = torch.sum(torch.exp(pdt)) / torch.sum(torch.exp(pp)) - 1
            prev_pw.append(k)
        a_optim.step()
        for kprev, (p, m) in zip(prev_pw, NASModule.param_modules()):
            if kprev is None:
                continue
            s_op = m.get_state('s_op')
            pdt = p.detach()
            pp = pdt.index_select(-1, s_op)
            k = torch.sum(torch.exp(pdt)) / torch.sum(torch.exp(pp)) - 1
            # shift the sampled logits so the unsampled/sampled mass ratio
            # returns to its pre-update value
            for i in s_op:
                p[i] += (torch.log(k) - torch.log(kprev))
    NASModule.module_call('reset_ops')
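# Illustrative sketch, not part of the training pipeline: a minimal numeric
# check of the renormalization performed in step() above. Adding
# log(k_new) - log(k_prev) to the sampled logits restores the pre-update ratio
# of unsampled-to-sampled softmax mass. The op count, sampled indices and the
# perturbation below are made-up placeholders.
def _renorm_sanity_check():
    import torch
    p = torch.randn(8)                     # logits over 8 hypothetical candidate ops
    s_op = torch.tensor([1, 4, 6])         # hypothetical sampled subset
    mass = lambda x: torch.sum(torch.exp(x))

    kprev = mass(p) / mass(p[s_op]) - 1    # unsampled / sampled mass before the update
    p[s_op] += 0.1 * torch.randn(3)        # stand-in for the a_optim update on sampled ops

    k = mass(p) / mass(p[s_op]) - 1
    p[s_op] += torch.log(k) - torch.log(kprev)   # same correction as in step()

    k_after = mass(p) / mass(p[s_op]) - 1
    assert torch.allclose(k_after, kprev, atol=1e-5)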
def augment(out_dir, chkpt_path, train_loader, valid_loader, model, writer,
            logger, device, config):
    w_optim = utils.get_optim(model.weights(), config.w_optim)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        w_optim, config.epochs, eta_min=config.w_optim.lr_min)

    init_epoch = -1
    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path)
        model.load_state_dict(checkpoint['model'])
        w_optim.load_state_dict(checkpoint['w_optim'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        init_epoch = checkpoint['epoch']
    else:
        logger.info("Starting new training run")

    logger.info("Model params count: {:.3f} M, size: {:.3f} MB".format(
        utils.param_count(model), utils.param_size(model)))

    # training loop
    logger.info('begin training')
    best_top1 = 0.
    tot_epochs = config.epochs
    for epoch in itertools.count(init_epoch + 1):
        if epoch >= tot_epochs:
            break
        # linearly ramp up drop-path probability over training
        drop_prob = config.drop_path_prob * epoch / tot_epochs
        model.drop_path_prob(drop_prob)
        lr = lr_scheduler.get_lr()[0]
        # training
        train(train_loader, None, model, writer, logger, None, w_optim, None,
              lr, epoch, tot_epochs, device, config)
        lr_scheduler.step()
        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(valid_loader, model, writer, logger, epoch, tot_epochs,
                        cur_step, device, config)
        # save
        if best_top1 < top1:
            best_top1 = top1
            is_best = True
        else:
            is_best = False
        if config.save_freq != 0 and epoch % config.save_freq == 0:
            save_checkpoint(out_dir, model, w_optim, None, lr_scheduler, epoch,
                            logger)
        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
    tprof.stat_acc('model_' + NASModule.get_device()[0])
def search(out_dir, chkpt_path, w_train_loader, a_train_loader, model, arch,
           writer, logger, device, config):
    valid_loader = a_train_loader
    w_optim = utils.get_optim(model.weights(), config.w_optim)
    a_optim = utils.get_optim(model.alphas(), config.a_optim)

    init_epoch = -1
    lr_scheduler_state = None
    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path)
        model.load_state_dict(checkpoint['model'])
        NASModule.nasmod_load_state_dict(checkpoint['arch'])
        w_optim.load_state_dict(checkpoint['w_optim'])
        a_optim.load_state_dict(checkpoint['a_optim'])
        # defer restoring the lr scheduler until it is constructed below
        lr_scheduler_state = checkpoint['lr_scheduler']
        init_epoch = checkpoint['epoch']
    else:
        logger.info("Starting new training run")

    architect = arch(config, model)

    # warmup training loop (weights only)
    logger.info('begin warmup training')
    try:
        if config.warmup_epochs > 0:
            warmup_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                w_optim, config.warmup_epochs, eta_min=config.w_optim.lr_min)
            last_epoch = 0
        else:
            last_epoch = -1
        tot_epochs = config.warmup_epochs
        for epoch in itertools.count(init_epoch + 1):
            if epoch >= tot_epochs:
                break
            lr = warmup_lr_scheduler.get_lr()[0]
            # training
            train(w_train_loader, None, model, writer, logger, architect,
                  w_optim, a_optim, lr, epoch, tot_epochs, device, config)
            # validation
            cur_step = (epoch + 1) * len(w_train_loader)
            top1 = validate(valid_loader, model, writer, logger, epoch,
                            tot_epochs, cur_step, device, config)
            warmup_lr_scheduler.step()
            print("")
    except KeyboardInterrupt:
        print('skipped')

    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        w_optim, config.epochs, eta_min=config.w_optim.lr_min,
        last_epoch=last_epoch)
    if lr_scheduler_state is not None:
        lr_scheduler.load_state_dict(lr_scheduler_state)

    save_checkpoint(out_dir, model, w_optim, a_optim, lr_scheduler, init_epoch,
                    logger)
    save_genotype(out_dir, model.genotype(), init_epoch, logger)

    # training loop (weights and alphas)
    logger.info('begin w/a training')
    best_top1 = 0.
    best_genotype = model.genotype()
    tot_epochs = config.epochs
    for epoch in itertools.count(init_epoch + 1):
        if epoch >= tot_epochs:
            break
        lr = lr_scheduler.get_lr()[0]
        model.print_alphas(logger)
        # training
        train(w_train_loader, a_train_loader, model, writer, logger, architect,
              w_optim, a_optim, lr, epoch, tot_epochs, device, config)
        # validation
        cur_step = (epoch + 1) * len(w_train_loader)
        top1 = validate(valid_loader, model, writer, logger, epoch, tot_epochs,
                        cur_step, device, config)
        # genotype
        genotype = model.genotype()
        save_genotype(out_dir, genotype, epoch, logger)
        # genotype as image
        if config.plot:
            for i, dag in enumerate(model.dags()):
                plot_path = os.path.join(config.plot_path,
                                         "EP{:02d}".format(epoch + 1))
                caption = "Epoch {} - DAG {}".format(epoch + 1, i)
                plot(genotype.dag[i], dag, plot_path + "-dag_{}".format(i),
                     caption)
        if best_top1 < top1:
            best_top1 = top1
            best_genotype = genotype
        if config.save_freq != 0 and epoch % config.save_freq == 0:
            save_checkpoint(out_dir, model, w_optim, a_optim, lr_scheduler,
                            epoch, logger)
        lr_scheduler.step()
        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
    logger.info("Best Genotype = {}".format(best_genotype))
    tprof.stat_acc('model_' + NASModule.get_device()[0])
    gt.to_file(best_genotype, os.path.join(out_dir, 'best.gt'))
def train(train_loader, valid_loader, model, writer, logger, architect,
          w_optim, a_optim, lr, epoch, tot_epochs, device, config):
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    losses = utils.AverageMeter()

    cur_step = epoch * len(train_loader)
    writer.add_scalar('train/lr', lr, cur_step)

    model.train()

    if valid_loader is not None:
        # number of weight steps per architecture step (at least 1)
        tr_ratio = max(1, len(train_loader) // len(valid_loader))
        val_iter = iter(valid_loader)

    eta_m = utils.ETAMeter(tot_epochs, epoch, len(train_loader))
    eta_m.start()
    for step, (trn_X, trn_y) in enumerate(train_loader):
        trn_X, trn_y = trn_X.to(device, non_blocking=True), trn_y.to(
            device, non_blocking=True)
        N = trn_X.size(0)

        # phase 1. child network step (w)
        w_optim.zero_grad()
        tprof.timer_start('train')
        loss, logits = model.loss_logits(trn_X, trn_y, config.aux_weight)
        tprof.timer_stop('train')
        loss.backward()
        # gradient clipping
        if config.w_grad_clip > 0:
            nn.utils.clip_grad_norm_(model.weights(), config.w_grad_clip)
        w_optim.step()

        # phase 2. architect step (alpha)
        if valid_loader is not None and step % tr_ratio == 0:
            try:
                val_X, val_y = next(val_iter)
            except StopIteration:
                # restart the validation iterator when it is exhausted
                val_iter = iter(valid_loader)
                val_X, val_y = next(val_iter)
            val_X, val_y = val_X.to(device, non_blocking=True), val_y.to(
                device, non_blocking=True)
            tprof.timer_start('arch')
            architect.step(trn_X, trn_y, val_X, val_y, lr, w_optim, a_optim)
            tprof.timer_stop('arch')

        prec1, prec5 = utils.accuracy(logits, trn_y, topk=(1, 5))
        losses.update(loss.item(), N)
        top1.update(prec1.item(), N)
        top5.update(prec5.item(), N)

        if (step != 0 and step % config.print_freq == 0) or \
                step == len(train_loader) - 1:
            eta = eta_m.step(step)
            logger.info(
                "Train: [{:2d}/{}] Step {:03d}/{:03d} LR {:.3f} Loss {losses.avg:.3f} "
                "Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%}) | ETA: {eta}".format(
                    epoch + 1, tot_epochs, step, len(train_loader) - 1, lr,
                    losses=losses, top1=top1, top5=top5,
                    eta=utils.format_time(eta)))

        writer.add_scalar('train/loss', loss.item(), cur_step)
        writer.add_scalar('train/top1', prec1.item(), cur_step)
        writer.add_scalar('train/top5', prec5.item(), cur_step)
        cur_step += 1

    logger.info("Train: [{:2d}/{}] Final Prec@1 {:.4%}".format(
        epoch + 1, tot_epochs, top1.avg))
    tprof.stat_acc('model_' + NASModule.get_device()[0])
    tprof.print_stat('train')
    tprof.print_stat('arch')
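# Note (assumption, inferred from usage above, not a definitive spec): the
# `config` object consumed by search()/augment()/train() is expected to expose
# at least the following fields:
#   config.w_optim          - weight-optimizer settings passed to utils.get_optim
#                             (includes .lr_min for the cosine schedules)
#   config.a_optim          - architecture-optimizer settings for the alphas
#   config.epochs           - number of main training epochs
#   config.warmup_epochs    - weight-only warmup epochs before w/a search
#   config.drop_path_prob   - final drop-path probability (ramped by epoch in augment())
#   config.aux_weight       - auxiliary-head loss weight used in model.loss_logits
#   config.w_grad_clip      - gradient-norm clip for the weights (0 disables)
#   config.print_freq       - logging interval in steps
#   config.save_freq        - checkpoint interval in epochs (0 disables)
#   config.plot, config.plot_path - whether/where to plot genotype DAGs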