def valid_func(xloader, network, criterion):
    data_time, batch_time = AverageMeter(), AverageMeter()
    arch_losses, arch_top1, arch_top5 = AverageMeter(), AverageMeter(), AverageMeter()
    end = time.time()
    with torch.no_grad():
        network.eval()
        for step, (arch_inputs, arch_targets) in enumerate(xloader):
            arch_targets = arch_targets.cuda(non_blocking=True)
            # measure data loading time
            data_time.update(time.time() - end)
            # prediction
            _, logits = network(arch_inputs)
            arch_loss = criterion(logits, arch_targets)
            # record
            arch_prec1, arch_prec5 = obtain_accuracy(
                logits.data, arch_targets.data, topk=(1, 5)
            )
            arch_losses.update(arch_loss.item(), arch_inputs.size(0))
            arch_top1.update(arch_prec1.item(), arch_inputs.size(0))
            arch_top5.update(arch_prec5.item(), arch_inputs.size(0))
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
    return arch_losses.avg, arch_top1.avg, arch_top5.avg

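# `AverageMeter`, `obtain_accuracy`, and `time_string` are not defined in this
# section and are presumably imported at the top of the file (together with time,
# numpy as np, torch, and torch.nn as nn). The sketch below shows minimal, assumed
# implementations of the two helpers every loop here depends on -- a running-average
# tracker and a top-k accuracy routine; the repo's actual versions may differ in
# detail.
class AverageMeter:
    """Track the latest value, the sum, and the running average."""

    def __init__(self):
        self.val, self.sum, self.count, self.avg = 0.0, 0.0, 0, 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def obtain_accuracy(output, target, topk=(1,)):
    """Return the top-k accuracies (in percent) of `output` against `target`."""
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
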
def get_best_arch(controller, shared_cnn, xloader, n_samples=10):
    with torch.no_grad():
        controller.eval()
        shared_cnn.eval()
        archs, valid_accs = [], []
        loader_iter = iter(xloader)
        for i in range(n_samples):
            try:
                inputs, targets = next(loader_iter)
            except StopIteration:  # restart the loader once it is exhausted
                loader_iter = iter(xloader)
                inputs, targets = next(loader_iter)
            _, _, sampled_arch = controller()
            arch = shared_cnn.module.update_arch(sampled_arch)
            _, logits = shared_cnn(inputs)
            val_top1, val_top5 = obtain_accuracy(
                logits.cpu().data, targets.data, topk=(1, 5)
            )
            archs.append(arch)
            valid_accs.append(val_top1.item())
        best_idx = np.argmax(valid_accs)
        best_arch, best_valid_acc = archs[best_idx], valid_accs[best_idx]
        return best_arch, best_valid_acc

def search_find_best(xloader, network, n_samples):
    with torch.no_grad():
        network.eval()
        archs, valid_accs = [], []
        # print ('obtain the top-{:} architectures'.format(n_samples))
        loader_iter = iter(xloader)
        for i in range(n_samples):
            arch = network.module.random_genotype(True)
            try:
                inputs, targets = next(loader_iter)
            except StopIteration:  # restart the loader once it is exhausted
                loader_iter = iter(xloader)
                inputs, targets = next(loader_iter)
            _, logits = network(inputs)
            val_top1, val_top5 = obtain_accuracy(
                logits.cpu().data, targets.data, topk=(1, 5)
            )
            archs.append(arch)
            valid_accs.append(val_top1.item())
        best_idx = np.argmax(valid_accs)
        best_arch, best_valid_acc = archs[best_idx], valid_accs[best_idx]
        return best_arch, best_valid_acc

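# The try/except-StopIteration rewind above recurs throughout this file
# (get_best_arch, search_find_best, train_controller). A small generator could
# factor it out; `infinite_loader` is a hypothetical helper for illustration, not
# something this repo defines.
def infinite_loader(xloader):
    """Yield batches forever, restarting the DataLoader whenever it is exhausted."""
    while True:
        for batch in xloader:
            yield batch


# Usage sketch, replacing the explicit iter()/next() dance:
#     batches = infinite_loader(xloader)
#     for i in range(n_samples):
#         inputs, targets = next(batches)
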
def test_func(xloader, network, criterion):
    base_losses, base_top1, base_top5 = AverageMeter(), AverageMeter(), AverageMeter()
    network.eval()
    with torch.no_grad():  # evaluation only; no gradients are needed
        for step, (base_inputs, base_targets) in enumerate(xloader):
            base_targets = base_targets.cuda(non_blocking=True)
            _, logits = network(base_inputs.cuda())
            base_loss = criterion(logits, base_targets)
            # record
            base_prec1, base_prec5 = obtain_accuracy(
                logits.data, base_targets.data, topk=(1, 5)
            )
            base_losses.update(base_loss.item(), base_inputs.size(0))
            base_top1.update(base_prec1.item(), base_inputs.size(0))
            base_top5.update(base_prec5.item(), base_inputs.size(0))
    return base_losses.avg, base_top1.avg, base_top5.avg

def train_shared_cnn(
    xloader,
    shared_cnn,
    controller,
    criterion,
    scheduler,
    optimizer,
    epoch_str,
    print_freq,
    logger,
):
    data_time, batch_time = AverageMeter(), AverageMeter()
    losses, top1s, top5s, xend = (
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
        time.time(),
    )

    shared_cnn.train()
    controller.eval()

    for step, (inputs, targets) in enumerate(xloader):
        scheduler.update(None, 1.0 * step / len(xloader))
        targets = targets.cuda(non_blocking=True)
        # measure data loading time
        data_time.update(time.time() - xend)

        with torch.no_grad():
            _, _, sampled_arch = controller()

        optimizer.zero_grad()
        shared_cnn.module.update_arch(sampled_arch)
        _, logits = shared_cnn(inputs)
        loss = criterion(logits, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(shared_cnn.parameters(), 5)
        optimizer.step()
        # record
        prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1s.update(prec1.item(), inputs.size(0))
        top5s.update(prec5.item(), inputs.size(0))

        # measure elapsed time
        batch_time.update(time.time() - xend)
        xend = time.time()

        if step % print_freq == 0 or step + 1 == len(xloader):
            Sstr = (
                "*Train-Shared-CNN* "
                + time_string()
                + " [{:}][{:03d}/{:03d}]".format(epoch_str, step, len(xloader))
            )
            Tstr = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                batch_time=batch_time, data_time=data_time
            )
            Wstr = "[Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                loss=losses, top1=top1s, top5=top5s
            )
            logger.log(Sstr + " " + Tstr + " " + Wstr)
    return losses.avg, top1s.avg, top5s.avg

def search_func(
    xloader, network, criterion, scheduler, w_optimizer, epoch_str, print_freq, logger
):
    data_time, batch_time = AverageMeter(), AverageMeter()
    base_losses, base_top1, base_top5 = AverageMeter(), AverageMeter(), AverageMeter()
    network.train()
    end = time.time()
    for step, (base_inputs, base_targets, arch_inputs, arch_targets) in enumerate(
        xloader
    ):
        scheduler.update(None, 1.0 * step / len(xloader))
        base_targets = base_targets.cuda(non_blocking=True)
        arch_targets = arch_targets.cuda(non_blocking=True)
        # measure data loading time
        data_time.update(time.time() - end)

        # update the weights
        network.module.random_genotype(True)
        w_optimizer.zero_grad()
        _, logits = network(base_inputs)
        base_loss = criterion(logits, base_targets)
        base_loss.backward()
        nn.utils.clip_grad_norm_(network.parameters(), 5)
        w_optimizer.step()
        # record
        base_prec1, base_prec5 = obtain_accuracy(
            logits.data, base_targets.data, topk=(1, 5)
        )
        base_losses.update(base_loss.item(), base_inputs.size(0))
        base_top1.update(base_prec1.item(), base_inputs.size(0))
        base_top5.update(base_prec5.item(), base_inputs.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if step % print_freq == 0 or step + 1 == len(xloader):
            Sstr = (
                "*SEARCH* "
                + time_string()
                + " [{:}][{:03d}/{:03d}]".format(epoch_str, step, len(xloader))
            )
            Tstr = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                batch_time=batch_time, data_time=data_time
            )
            Wstr = "Base [Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                loss=base_losses, top1=base_top1, top5=base_top5
            )
            logger.log(Sstr + " " + Tstr + " " + Wstr)
    return base_losses.avg, base_top1.avg, base_top5.avg

def get_best_arch(xloader, network, n_samples):
    with torch.no_grad():
        network.eval()
        archs, valid_accs = network.module.return_topK(n_samples), []
        # print ('obtain the top-{:} architectures'.format(n_samples))
        loader_iter = iter(xloader)
        for i, sampled_arch in enumerate(archs):
            network.module.set_cal_mode("dynamic", sampled_arch)
            try:
                inputs, targets = next(loader_iter)
            except StopIteration:  # restart the loader once it is exhausted
                loader_iter = iter(xloader)
                inputs, targets = next(loader_iter)
            _, logits = network(inputs)
            val_top1, val_top5 = obtain_accuracy(
                logits.cpu().data, targets.data, topk=(1, 5)
            )
            valid_accs.append(val_top1.item())
        best_idx = np.argmax(valid_accs)
        best_arch, best_valid_acc = archs[best_idx], valid_accs[best_idx]
        return best_arch, best_valid_acc

def search_func(
    xloader,
    network,
    criterion,
    scheduler,
    w_optimizer,
    a_optimizer,
    epoch_str,
    print_freq,
    logger,
):
    data_time, batch_time = AverageMeter(), AverageMeter()
    base_losses, base_top1, base_top5 = AverageMeter(), AverageMeter(), AverageMeter()
    arch_losses, arch_top1, arch_top5 = AverageMeter(), AverageMeter(), AverageMeter()
    end = time.time()
    network.train()
    for step, (base_inputs, base_targets, arch_inputs, arch_targets) in enumerate(
        xloader
    ):
        scheduler.update(None, 1.0 * step / len(xloader))
        base_targets = base_targets.cuda(non_blocking=True)
        arch_targets = arch_targets.cuda(non_blocking=True)
        # measure data loading time
        data_time.update(time.time() - end)

        # update the weights
        sampled_arch = network.module.dync_genotype(True)
        network.module.set_cal_mode("dynamic", sampled_arch)
        # network.module.set_cal_mode('urs')
        network.zero_grad()
        _, logits = network(base_inputs)
        base_loss = criterion(logits, base_targets)
        base_loss.backward()
        w_optimizer.step()
        # record
        base_prec1, base_prec5 = obtain_accuracy(
            logits.data, base_targets.data, topk=(1, 5)
        )
        base_losses.update(base_loss.item(), base_inputs.size(0))
        base_top1.update(base_prec1.item(), base_inputs.size(0))
        base_top5.update(base_prec5.item(), base_inputs.size(0))

        # update the architecture-weight
        network.module.set_cal_mode("joint")
        network.zero_grad()
        _, logits = network(arch_inputs)
        arch_loss = criterion(logits, arch_targets)
        arch_loss.backward()
        a_optimizer.step()
        # record
        arch_prec1, arch_prec5 = obtain_accuracy(
            logits.data, arch_targets.data, topk=(1, 5)
        )
        arch_losses.update(arch_loss.item(), arch_inputs.size(0))
        arch_top1.update(arch_prec1.item(), arch_inputs.size(0))
        arch_top5.update(arch_prec5.item(), arch_inputs.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if step % print_freq == 0 or step + 1 == len(xloader):
            Sstr = (
                "*SEARCH* "
                + time_string()
                + " [{:}][{:03d}/{:03d}]".format(epoch_str, step, len(xloader))
            )
            Tstr = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                batch_time=batch_time, data_time=data_time
            )
            Wstr = "Base [Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                loss=base_losses, top1=base_top1, top5=base_top5
            )
            Astr = "Arch [Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                loss=arch_losses, top1=arch_top1, top5=arch_top5
            )
            logger.log(Sstr + " " + Tstr + " " + Wstr + " " + Astr)
            # print (nn.functional.softmax(network.module.arch_parameters, dim=-1))
            # print (network.module.arch_parameters)
    return (
        base_losses.avg,
        base_top1.avg,
        base_top5.avg,
        arch_losses.avg,
        arch_top1.avg,
        arch_top5.avg,
    )

def train_controller(
    xloader,
    shared_cnn,
    controller,
    criterion,
    optimizer,
    config,
    epoch_str,
    print_freq,
    logger,
):
    # config carries the controller hyper-parameters;
    # config.baseline is the baseline score (i.e., the average val_acc) from the previous epoch
    data_time, batch_time = AverageMeter(), AverageMeter()
    (
        GradnormMeter,
        LossMeter,
        ValAccMeter,
        EntropyMeter,
        BaselineMeter,
        RewardMeter,
        xend,
    ) = (
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
        time.time(),
    )

    shared_cnn.eval()
    controller.train()
    controller.zero_grad()
    # for step, (inputs, targets) in enumerate(xloader):
    loader_iter = iter(xloader)
    for step in range(config.ctl_train_steps * config.ctl_num_aggre):
        try:
            inputs, targets = next(loader_iter)
        except StopIteration:  # restart the loader once it is exhausted
            loader_iter = iter(xloader)
            inputs, targets = next(loader_iter)
        targets = targets.cuda(non_blocking=True)
        # measure data loading time
        data_time.update(time.time() - xend)

        log_prob, entropy, sampled_arch = controller()
        with torch.no_grad():
            shared_cnn.module.update_arch(sampled_arch)
            _, logits = shared_cnn(inputs)
            val_top1, val_top5 = obtain_accuracy(
                logits.data, targets.data, topk=(1, 5)
            )
            val_top1 = val_top1.view(-1) / 100
        reward = val_top1 + config.ctl_entropy_w * entropy
        if config.baseline is None:
            baseline = val_top1
        else:
            baseline = config.baseline - (1 - config.ctl_bl_dec) * (
                config.baseline - reward
            )

        loss = -1 * log_prob * (reward - baseline)

        # account
        RewardMeter.update(reward.item())
        BaselineMeter.update(baseline.item())
        ValAccMeter.update(val_top1.item() * 100)
        LossMeter.update(loss.item())
        EntropyMeter.update(entropy.item())

        # average the gradient over ctl_num_aggre samples
        loss = loss / config.ctl_num_aggre
        loss.backward(retain_graph=True)

        # measure elapsed time
        batch_time.update(time.time() - xend)
        xend = time.time()
        if (step + 1) % config.ctl_num_aggre == 0:
            grad_norm = torch.nn.utils.clip_grad_norm_(controller.parameters(), 5.0)
            GradnormMeter.update(grad_norm)
            optimizer.step()
            controller.zero_grad()

        if step % print_freq == 0:
            Sstr = (
                "*Train-Controller* "
                + time_string()
                + " [{:}][{:03d}/{:03d}]".format(
                    epoch_str, step, config.ctl_train_steps * config.ctl_num_aggre
                )
            )
            Tstr = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                batch_time=batch_time, data_time=data_time
            )
            Wstr = "[Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Reward {reward.val:.2f} ({reward.avg:.2f})] Baseline {basel.val:.2f} ({basel.avg:.2f})".format(
                loss=LossMeter,
                top1=ValAccMeter,
                reward=RewardMeter,
                basel=BaselineMeter,
            )
            Estr = "Entropy={:.4f} ({:.4f})".format(EntropyMeter.val, EntropyMeter.avg)
            logger.log(Sstr + " " + Tstr + " " + Wstr + " " + Estr)

    return (
        LossMeter.avg,
        ValAccMeter.avg,
        BaselineMeter.avg,
        RewardMeter.avg,
        baseline.item(),
    )

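# The baseline update above is an exponential moving average of the reward: with
# decay d = config.ctl_bl_dec,
#     baseline <- baseline - (1 - d) * (baseline - reward)
#              =  d * baseline + (1 - d) * reward,
# which keeps the REINFORCE advantage (reward - baseline) centered and reduces the
# variance of the policy gradient. A minimal standalone sketch (the helper name and
# the numbers below are illustrative, not from this repo):
def ema_baseline(baseline, reward, decay):
    """One step of the moving-average baseline used by train_controller."""
    return decay * baseline + (1 - decay) * reward


# e.g. ema_baseline(0.70, reward=0.80, decay=0.99) -> 0.701 (up to float rounding)
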
def search_func(
    xloader,
    network,
    criterion,
    scheduler,
    w_optimizer,
    a_optimizer,
    enable_controller,
    algo,
    epoch_str,
    print_freq,
    logger,
):
    data_time, batch_time = AverageMeter(), AverageMeter()
    base_losses, base_top1, base_top5 = AverageMeter(), AverageMeter(), AverageMeter()
    arch_losses, arch_top1, arch_top5 = AverageMeter(), AverageMeter(), AverageMeter()
    end = time.time()
    network.train()
    for step, (base_inputs, base_targets, arch_inputs, arch_targets) in enumerate(
        xloader
    ):
        scheduler.update(None, 1.0 * step / len(xloader))
        base_inputs = base_inputs.cuda(non_blocking=True)
        arch_inputs = arch_inputs.cuda(non_blocking=True)
        base_targets = base_targets.cuda(non_blocking=True)
        arch_targets = arch_targets.cuda(non_blocking=True)
        # measure data loading time
        data_time.update(time.time() - end)

        # update the weights
        network.zero_grad()
        _, logits, _ = network(base_inputs)
        base_loss = criterion(logits, base_targets)
        base_loss.backward()
        w_optimizer.step()
        # record
        base_prec1, base_prec5 = obtain_accuracy(
            logits.data, base_targets.data, topk=(1, 5)
        )
        base_losses.update(base_loss.item(), base_inputs.size(0))
        base_top1.update(base_prec1.item(), base_inputs.size(0))
        base_top5.update(base_prec5.item(), base_inputs.size(0))

        # update the architecture weights
        network.zero_grad()
        a_optimizer.zero_grad()
        _, logits, log_probs = network(arch_inputs)
        arch_prec1, arch_prec5 = obtain_accuracy(
            logits.data, arch_targets.data, topk=(1, 5)
        )
        if algo == "mask_rl":
            with torch.no_grad():
                RL_BASELINE_EMA.update(arch_prec1.item())
                rl_advantage = arch_prec1 - RL_BASELINE_EMA.value
            rl_log_prob = sum(log_probs)
            arch_loss = -rl_advantage * rl_log_prob
        elif algo == "tas" or algo == "mask_gumbel":
            arch_loss = criterion(logits, arch_targets)
        else:
            raise ValueError("invalid algorithm name: {:}".format(algo))
        if enable_controller:
            arch_loss.backward()
            a_optimizer.step()
        # record
        arch_losses.update(arch_loss.item(), arch_inputs.size(0))
        arch_top1.update(arch_prec1.item(), arch_inputs.size(0))
        arch_top5.update(arch_prec5.item(), arch_inputs.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if step % print_freq == 0 or step + 1 == len(xloader):
            Sstr = (
                "*SEARCH* "
                + time_string()
                + " [{:}][{:03d}/{:03d}]".format(epoch_str, step, len(xloader))
            )
            Tstr = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                batch_time=batch_time, data_time=data_time
            )
            Wstr = "Base [Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                loss=base_losses, top1=base_top1, top5=base_top5
            )
            Astr = "Arch [Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                loss=arch_losses, top1=arch_top1, top5=arch_top5
            )
            logger.log(Sstr + " " + Tstr + " " + Wstr + " " + Astr)
    return (
        base_losses.avg,
        base_top1.avg,
        base_top5.avg,
        arch_losses.avg,
        arch_top1.avg,
        arch_top5.avg,
    )

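# `RL_BASELINE_EMA` above is a module-level object defined elsewhere in the repo;
# the `mask_rl` branch only needs an `.update(value)` method and a `.value`
# attribute. Below is a minimal assumed implementation of such an
# exponential-moving-average tracker, not the repo's actual definition.
class ExponentialMovingAverage:
    """Track v <- momentum * v + (1 - momentum) * x, seeded with the first sample."""

    def __init__(self, momentum=0.9):
        self.momentum = momentum
        self.value = 0.0
        self._initialized = False

    def update(self, x):
        if not self._initialized:
            self.value, self._initialized = x, True
        else:
            self.value = self.momentum * self.value + (1 - self.momentum) * x


# RL_BASELINE_EMA = ExponentialMovingAverage(momentum=0.9)
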
def search_func(
    xloader,
    network,
    global_network,
    criterion,
    scheduler,
    w_optimizer,
    a_optimizer,
    epoch_str,
    print_freq,
    logger,
    local_epoch,
):
    # network, criterion = torch.nn.DataParallel(search_model).cuda(), criterion.cuda()
    data_time, batch_time = AverageMeter(), AverageMeter()
    base_losses, base_top1, base_top5 = AverageMeter(), AverageMeter(), AverageMeter()
    arch_losses, arch_top1, arch_top5 = AverageMeter(), AverageMeter(), AverageMeter()
    network.train()
    end = time.time()
    for _ in range(local_epoch):
        for step, (base_inputs, base_targets, arch_inputs, arch_targets) in enumerate(
            xloader
        ):
            scheduler.update(None, 1.0 * step / len(xloader))
            base_targets = base_targets.cuda(non_blocking=True)
            arch_targets = arch_targets.cuda(non_blocking=True)
            # measure data loading time
            data_time.update(time.time() - end)

            # update the weights
            w_optimizer.zero_grad()
            _, logits = network(base_inputs.cuda())
            base_loss = criterion(logits, base_targets)
            base_loss.backward()
            torch.nn.utils.clip_grad_norm_(network.parameters(), 5)
            # `args` is a module-level namespace; for the 'dl' baseline the
            # optimizer step is anchored to the global model's weights
            if args.baseline == "dl":
                w_optimizer.step(global_network.get_weights())
            else:
                w_optimizer.step()
            # record
            base_prec1, base_prec5 = obtain_accuracy(
                logits.data, base_targets.data, topk=(1, 5)
            )
            base_losses.update(base_loss.item(), base_inputs.size(0))
            base_top1.update(base_prec1.item(), base_inputs.size(0))
            base_top5.update(base_prec5.item(), base_inputs.size(0))

            # update the architecture weights
            a_optimizer.zero_grad()
            _, logits = network(arch_inputs.cuda())
            arch_loss = criterion(logits, arch_targets)
            arch_loss.backward()
            a_optimizer.step()
            # record
            arch_prec1, arch_prec5 = obtain_accuracy(
                logits.data, arch_targets.data, topk=(1, 5)
            )
            arch_losses.update(arch_loss.item(), arch_inputs.size(0))
            arch_top1.update(arch_prec1.item(), arch_inputs.size(0))
            arch_top5.update(arch_prec5.item(), arch_inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if step % print_freq == 0 or step + 1 == len(xloader):
                Sstr = (
                    "*SEARCH* "
                    + time_string()
                    + " [{:}][{:03d}/{:03d}]".format(epoch_str, step, len(xloader))
                )
                Tstr = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                    batch_time=batch_time, data_time=data_time
                )
                Wstr = "Base [Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                    loss=base_losses, top1=base_top1, top5=base_top5
                )
                Astr = "Arch [Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                    loss=arch_losses, top1=arch_top1, top5=arch_top5
                )
                logger.log(Sstr + " " + Tstr + " " + Wstr + " " + Astr)
    return (
        base_losses.avg,
        base_top1.avg,
        base_top5.avg,
        arch_losses.avg,
        arch_top1.avg,
        arch_top5.avg,
        network.state_dict(),
    )
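

# A sketch of how the pieces above are typically wired together for one search run.
# In the repo the `search_func` variants live in separate scripts; here they share
# a name, so this sketch assumes the random-search variant (weights-only, single
# w_optimizer) is the one in scope, paired with valid_func and search_find_best.
# All other names (search_loader, valid_loader, total_epochs, w_scheduler, ...) are
# illustrative assumptions, not the repo's actual driver script.
def run_search_sketch(
    search_loader,
    valid_loader,
    network,
    criterion,
    w_scheduler,
    w_optimizer,
    total_epochs,
    print_freq,
    logger,
):
    for epoch in range(total_epochs):
        epoch_str = "{:03d}-{:03d}".format(epoch, total_epochs)
        # one epoch of shared-weight training over randomly sampled architectures
        s_loss, s_top1, s_top5 = search_func(
            search_loader, network, criterion,
            w_scheduler, w_optimizer, epoch_str, print_freq, logger,
        )
        # evaluate the current shared weights on the validation split
        v_loss, v_top1, v_top5 = valid_func(valid_loader, network, criterion)
        logger.log(
            "epoch {:}: search top-1 {:.2f}%, valid top-1 {:.2f}%".format(
                epoch_str, s_top1, v_top1
            )
        )
    # after search, pick the best of N randomly sampled architectures
    best_arch, best_acc = search_find_best(valid_loader, network, n_samples=100)
    logger.log("best sampled arch: {:} ({:.2f}%)".format(best_arch, best_acc))
    return best_arch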