예제 #1
0
def evaluate_for_seed(arch_config, config, arch, train_loader, valid_loaders, seed, logger):

  prepare_seed(seed) # random seed
  net = get_cell_based_tiny_net(dict2config({'name': 'infer.tiny',
                                             'C': arch_config['channel'], 'N': arch_config['num_cells'],
                                             'genotype': arch, 'num_classes': config.class_num}
                                            , None)
                                 )
  #net = TinyNetwork(arch_config['channel'], arch_config['num_cells'], arch, config.class_num)
  flop, param  = get_model_infos(net, config.xshape)
  logger.log('Network : {:}'.format(net.get_message()), False)
  logger.log('{:} Seed-------------------------- {:} --------------------------'.format(time_string(), seed))
  logger.log('FLOP = {:} MB, Param = {:} MB'.format(flop, param))
  # train and valid
  optimizer, scheduler, criterion = get_optim_scheduler(net.parameters(), config)
  network, criterion = torch.nn.DataParallel(net).cuda(), criterion.cuda()
  # start training
  start_time, epoch_time, total_epoch = time.time(), AverageMeter(), config.epochs + config.warmup
  train_losses, train_acc1es, train_acc5es, valid_losses, valid_acc1es, valid_acc5es = {}, {}, {}, {}, {}, {}
  train_times , valid_times = {}, {}
  for epoch in range(total_epoch):
    scheduler.update(epoch, 0.0)

    train_loss, train_acc1, train_acc5, train_tm = procedure(train_loader, network, criterion, scheduler, optimizer, 'train')
    train_losses[epoch] = train_loss
    train_acc1es[epoch] = train_acc1 
    train_acc5es[epoch] = train_acc5
    train_times [epoch] = train_tm
    with torch.no_grad():
      for key, xloder in valid_loaders.items():
        valid_loss, valid_acc1, valid_acc5, valid_tm = procedure(xloder  , network, criterion,      None,      None, 'valid')
        valid_losses['{:}@{:}'.format(key,epoch)] = valid_loss
        valid_acc1es['{:}@{:}'.format(key,epoch)] = valid_acc1 
        valid_acc5es['{:}@{:}'.format(key,epoch)] = valid_acc5
        valid_times ['{:}@{:}'.format(key,epoch)] = valid_tm

    # measure elapsed time
    epoch_time.update(time.time() - start_time)
    start_time = time.time()
    need_time = 'Time Left: {:}'.format( convert_secs2time(epoch_time.avg * (total_epoch-epoch-1), True) )
    logger.log('{:} {:} epoch={:03d}/{:03d} :: Train [loss={:.5f}, acc@1={:.2f}%, acc@5={:.2f}%] Valid [loss={:.5f}, acc@1={:.2f}%, acc@5={:.2f}%]'.format(time_string(), need_time, epoch, total_epoch, train_loss, train_acc1, train_acc5, valid_loss, valid_acc1, valid_acc5))
  info_seed = {'flop' : flop,
               'param': param,
               'channel'     : arch_config['channel'],
               'num_cells'   : arch_config['num_cells'],
               'config'      : config._asdict(),
               'total_epoch' : total_epoch ,
               'train_losses': train_losses,
               'train_acc1es': train_acc1es,
               'train_acc5es': train_acc5es,
               'train_times' : train_times,
               'valid_losses': valid_losses,
               'valid_acc1es': valid_acc1es,
               'valid_acc5es': valid_acc5es,
               'valid_times' : valid_times,
               'net_state_dict': net.state_dict(),
               'net_string'  : '{:}'.format(net),
               'finish-train': True
              }
  return info_seed
예제 #2
0
def main(args):

    assert os.path.isdir(args.data_path), 'invalid data-path : {:}'.format(
        args.data_path)
    assert os.path.isfile(args.checkpoint), 'invalid checkpoint : {:}'.format(
        args.checkpoint)

    checkpoint = torch.load(args.checkpoint)
    xargs = checkpoint['args']
    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, args.data_path, xargs.cutout_length)
    valid_loader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=xargs.batch_size,
                                               shuffle=False,
                                               num_workers=xargs.workers,
                                               pin_memory=True)

    logger = PrintLogger()
    model_config = dict2config(checkpoint['model-config'], logger)
    base_model = obtain_model(model_config)
    flop, param = get_model_infos(base_model, xshape)
    logger.log('model ====>>>>:\n{:}'.format(base_model))
    logger.log('model information : {:}'.format(base_model.get_message()))
    logger.log('-' * 50)
    logger.log('Params={:.2f} MB, FLOPs={:.2f} M ... = {:.2f} G'.format(
        param, flop, flop / 1e3))
    logger.log('-' * 50)
    logger.log('valid_data : {:}'.format(valid_data))
    optim_config = dict2config(checkpoint['optim-config'], logger)
    _, _, criterion = get_optim_scheduler(base_model.parameters(),
                                          optim_config)
    logger.log('criterion  : {:}'.format(criterion))
    base_model.load_state_dict(checkpoint['base-model'])
    _, valid_func = get_procedures(xargs.procedure)
    logger.log(
        'initialize the CNN done, evaluate it using {:}'.format(valid_func))
    network = torch.nn.DataParallel(base_model).cuda()

    try:
        valid_loss, valid_acc1, valid_acc5 = valid_func(
            valid_loader, network, criterion, optim_config, 'pure-evaluation',
            xargs.print_freq_eval, logger)
    except:
        _, valid_func = get_procedures('basic')
        valid_loss, valid_acc1, valid_acc5 = valid_func(
            valid_loader, network, criterion, optim_config, 'pure-evaluation',
            xargs.print_freq_eval, logger)

    num_bytes = torch.cuda.max_memory_cached(
        next(network.parameters()).device) * 1.0
    logger.log(
        '***{:s}*** EVALUATION loss = {:.6f}, accuracy@1 = {:.2f}, accuracy@5 = {:.2f}, error@1 = {:.2f}, error@5 = {:.2f}'
        .format(time_string(), valid_loss, valid_acc1, valid_acc5,
                100 - valid_acc1, 100 - valid_acc5))
    logger.log(
        '[GPU-Memory-Usage on {:} is {:} bytes, {:.2f} KB, {:.2f} MB, {:.2f} GB.]'
        .format(
            next(network.parameters()).device, int(num_bytes), num_bytes / 1e3,
            num_bytes / 1e6, num_bytes / 1e9))
    logger.close()
예제 #3
0
def main(width_mult):
    # model = MobileNetV2(1001, width_mult, 32, 1280, 'InvertedResidual', 0.2)
    model = MobileNetV2(width_mult=width_mult)
    print(model)
    flops, params = get_model_infos(model, (2, 3, 224, 224))
    print("FLOPs : {:}".format(flops))
    print("Params : {:}".format(params))
    print("-" * 50)
예제 #4
0
def evaluate(args):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    print('The image is {:}'.format(args.image))
    print('The face bounding box is {:}'.format(args.face))
    assert len(args.face) == 4, 'Invalid face input : {:}'.format(args.face)

    # General Data Argumentation
    print('Prepare input data')
    [image, _, _, _, _, _,
     cropped_size], meta = dataset.prepare_input(args.image, args.face)
    # network forward
    with torch.no_grad():
        inputs = image.unsqueeze(0).cuda()
        batch_heatmaps, batch_locs, batch_scos = net(inputs)
        flops, params = get_model_infos(net, inputs.shape)
        print('IN-shape : {:}, FLOPs : {:} MB, Params : {:} MB'.format(
            list(inputs.shape), flops, params))
    # obtain the locations on the image in the orignial size
    cpu = torch.device('cpu')
    np_batch_locs, np_batch_scos, cropped_size = batch_locs.to(
        cpu).numpy(), batch_scos.to(cpu).numpy(), cropped_size.numpy()
    locations, scores = np_batch_locs[0, :-1, :], np.expand_dims(
        np_batch_scos[0, :-1], -1)

    scale_h, scale_w = cropped_size[0] * 1. / \
        inputs.size(-2), cropped_size[1] * 1. / inputs.size(-1)

    locations[:, 0], locations[:, 1] = locations[:, 0] * scale_w + \
        cropped_size[2], locations[:, 1] * scale_h + cropped_size[3]
    prediction = np.concatenate((locations, scores), axis=1).transpose(1, 0)

    print('the coordinates for {:} facial landmarks:'.format(param.num_pts))
    for i in range(param.num_pts):
        point = prediction[:, i]
        print('the {:02d}/{:02d}-th point : ({:.1f}, {:.1f}), score = {:.2f}'.
              format(i, param.num_pts, float(point[0]), float(point[1]),
                     float(point[2])))

    if args.save:
        resize = 0
        image = draw_image_by_points(args.image, prediction, 2, (255, 0, 0),
                                     args.face, resize)
        image.save(args.save)
        print('save the visualization results into {:}'.format(args.save))
    else:
        print('ignore the visualization procedure')
def main(xargs):
  assert torch.cuda.is_available(), 'CUDA is not available.'
  torch.backends.cudnn.enabled   = True
  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True
  torch.set_num_threads( xargs.workers )
  prepare_seed(xargs.rand_seed)
  logger = prepare_logger(args)

  train_data, valid_data, xshape, class_num = get_datasets(xargs.dataset, xargs.data_path, -1)
  config = load_config(xargs.config_path, {'class_num': class_num, 'xshape': xshape}, logger)
  search_loader, _, valid_loader = get_nas_search_loaders(train_data, valid_data, xargs.dataset, 'configs/nas-benchmark/', \
                                        (config.batch_size, config.test_batch_size), xargs.workers)
  logger.log('||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}'.format(xargs.dataset, len(search_loader), len(valid_loader), config.batch_size))
  logger.log('||||||| {:10s} ||||||| Config={:}'.format(xargs.dataset, config))

  search_space = get_search_spaces('cell', xargs.search_space_name)
  model_config = dict2config({'name': 'SPOS', 'C': xargs.channel, 'N': xargs.num_cells,
                              'max_nodes': xargs.max_nodes, 'num_classes': class_num,
                              'space'    : search_space,
                              'affine'   : False, 'track_running_stats': bool(xargs.track_running_stats)}, None)
  logger.log('search space : {:}'.format(search_space))
  model = get_cell_based_tiny_net(model_config)
  
  flop, param  = get_model_infos(model, xshape)
  logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
  logger.log('search-space : {:}'.format(search_space))
  if xargs.arch_nas_dataset is None:
    api = None
  else:
    api = API(xargs.arch_nas_dataset)
  logger.log('{:} create API = {:} done'.format(time_string(), api))

  checkpoint_path_template = '{}/checkpoint/seed-{}_epoch-{}.pth'
  logger.log("=> loading checkpoint from {}".format(checkpoint_path_template.format(args.save_dir, args.rand_seed, 0)))
  load(checkpoint_path_template.format(args.save_dir, args.rand_seed, 0), model)
  init_model = deepcopy(model)

  angles = []
  for epoch in range(xargs.epochs):
    genotype = load(checkpoint_path_template.format(args.save_dir, args.rand_seed, epoch), model)
    logger.log("=> loading checkpoint from {}".format(checkpoint_path_template.format(args.dataset, args.rand_seed, epoch)))
    cur_model = deepcopy(model)
    angle = get_arch_angle(init_model, cur_model, genotype, search_space)
    logger.log('[{:}] cal angle : angle={}'.format(epoch, angle))
    angle = round(angle,2)
    angles.append(angle)
  print(angles)
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)

    if os.path.isdir(xargs.save_dir):
        if click.confirm(
                '\nSave directory already exists in {}. Erase?'.format(
                    xargs.save_dir, default=False)):
            os.system('rm -r ' + xargs.save_dir)
            assert not os.path.exists(xargs.save_dir)
            os.mkdir(xargs.save_dir)

    logger = prepare_logger(args)
    writer = SummaryWriter(xargs.save_dir)
    perturb_alpha = None
    if xargs.perturb:
        perturb_alpha = random_alpha

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    # config_path = 'configs/nas-benchmark/algos/DARTS.config'
    config = load_config(xargs.config_path, {
        'class_num': class_num,
        'xshape': xshape
    }, logger)
    search_loader, _, valid_loader = get_nas_search_loaders(
        train_data, valid_data, xargs.dataset, 'configs/nas-benchmark/',
        config.batch_size, xargs.workers)
    logger.log(
        '||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}'
        .format(xargs.dataset, len(search_loader), len(valid_loader),
                config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(
        xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    if xargs.model_config is None:
        model_config = dict2config(
            {
                'name': xargs.model,
                'C': xargs.channel,
                'N': xargs.num_cells,
                'max_nodes': xargs.max_nodes,
                'num_classes': class_num,
                'space': search_space,
                'affine': bool(xargs.affine),
                'track_running_stats': bool(xargs.track_running_stats)
            }, None)
    else:
        model_config = load_config(
            xargs.model_config, {
                'num_classes': class_num,
                'space': search_space,
                'affine': bool(xargs.affine),
                'track_running_stats': bool(xargs.track_running_stats)
            }, None)
    search_model = get_cell_based_tiny_net(model_config)
    # logger.log('search-model :\n{:}'.format(search_model))

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        search_model.get_weights(), config, xargs.weight_learning_rate)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(),
                                   lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999),
                                   weight_decay=xargs.arch_weight_decay)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion   : {:}'.format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    # logger.log('{:}'.format(search_model))
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path(
        'info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch']
        checkpoint = torch.load(last_info['last_checkpoint'])
        genotypes = checkpoint['genotypes']
        valid_accuracies = checkpoint['valid_accuracies']
        search_model.load_state_dict(checkpoint['search_model'])
        w_scheduler.load_state_dict(checkpoint['w_scheduler'])
        w_optimizer.load_state_dict(checkpoint['w_optimizer'])
        a_optimizer.load_state_dict(checkpoint['a_optimizer'])
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, genotypes = 0, {
            'best': -1
        }, {
            -1: search_model.genotype()
        }

    # start training
    # start_time, search_time, epoch_time, total_epoch = time.time(), AverageMeter(), AverageMeter(), config.epochs + config.warmup
    start_time, search_time, epoch_time = time.time(), AverageMeter(
    ), AverageMeter()
    total_epoch = config.epochs + config.warmup
    assert 0 < xargs.early_stop_epoch <= total_epoch - 1
    for epoch in range(start_epoch, total_epoch):
        if epoch >= xargs.early_stop_epoch:
            logger.log(f"Early stop @ {epoch} epoch.")
            break
        if xargs.perturb:
            epsilon_alpha = 0.03 + (xargs.epsilon_alpha -
                                    0.03) * epoch / total_epoch
            logger.log(f'epoch {epoch} epsilon_alpha {epsilon_alpha}')
        else:
            epsilon_alpha = None

        w_scheduler.update(epoch, 0.0)
        need_time = 'Time Left: {:}'.format(
            convert_secs2time(epoch_time.val * (total_epoch - epoch), True))
        epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch)
        logger.log('\n[Search the {:}-th epoch] {:}, LR={:}'.format(
            epoch_str, need_time, min(w_scheduler.get_lr())))

        search_w_loss, search_w_top1, search_w_top5, search_a_loss, search_a_top1, search_a_top5 = search_func(
            search_loader, network, criterion, w_scheduler, w_optimizer,
            a_optimizer, epoch_str, xargs.print_freq, logger,
            xargs.gradient_clip, perturb_alpha, epsilon_alpha)
        search_time.update(time.time() - start_time)
        logger.log(
            '[{:}] searching : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s'
            .format(epoch_str, search_w_loss, search_w_top1, search_w_top5,
                    search_time.sum))
        valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(
            valid_loader, network, criterion)

        writer.add_scalar('search/weight_loss', search_w_loss, epoch)
        writer.add_scalar('search/weight_top1_acc', search_w_top1, epoch)
        writer.add_scalar('search/weight_top5_acc', search_w_top5, epoch)

        writer.add_scalar('search/arch_loss', search_a_loss, epoch)
        writer.add_scalar('search/arch_top1_acc', search_a_top1, epoch)
        writer.add_scalar('search/arch_top5_acc', search_a_top5, epoch)

        writer.add_scalar('evaluate/loss', valid_a_loss, epoch)
        writer.add_scalar('evaluate/top1_acc', valid_a_top1, epoch)
        writer.add_scalar('evaluate/top5_acc', valid_a_top5, epoch)
        logger.log(
            '[{:}] evaluate  : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'
            .format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        writer.add_scalar('entropy', search_model.entropy, epoch)
        per_edge_dict = get_per_egde_value_dict(search_model.arch_parameters)
        for edge_name, edge_val in per_edge_dict.items():
            writer.add_scalars(f"cell/{edge_name}", edge_val, epoch)
        # check the best accuracy
        valid_accuracies[epoch] = valid_a_top1
        if valid_a_top1 > valid_accuracies['best']:
            valid_accuracies['best'] = valid_a_top1
            genotypes['best'] = search_model.genotype()
            find_best = True
        else:
            find_best = False

        genotypes[epoch] = search_model.genotype()
        logger.log('<<<--->>> The {:}-th epoch : {:}'.format(
            epoch_str, genotypes[epoch]))
        # save checkpoint
        save_path = save_checkpoint(
            {
                'epoch': epoch + 1,
                'args': deepcopy(xargs),
                'search_model': search_model.state_dict(),
                'w_optimizer': w_optimizer.state_dict(),
                'a_optimizer': a_optimizer.state_dict(),
                'w_scheduler': w_scheduler.state_dict(),
                'genotypes': genotypes,
                'valid_accuracies': valid_accuracies
            }, model_base_path, logger)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'args': deepcopy(args),
                'last_checkpoint': save_path,
            }, logger.path('info'), logger)

        if xargs.snapshoot > 0 and epoch % xargs.snapshoot == 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'args': deepcopy(args),
                    'search_model': search_model.state_dict(),
                },
                os.path.join(str(logger.model_dir),
                             f"checkpoint_epoch{epoch}.pth"), logger)

        if find_best:
            logger.log(
                '<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%.'
                .format(epoch_str, valid_a_top1))
            copy_checkpoint(model_base_path, model_best_path, logger)
        with torch.no_grad():
            logger.log('{:}'.format(search_model.show_alphas()))
        if api is not None:
            logger.log('{:}'.format(api.query_by_arch(genotypes[epoch])))
            index = api.query_index_by_arch(genotypes[epoch])
            info = api.query_meta_info_by_index(
                index)  # This is an instance of `ArchResults`
            res_metrics = info.get_metrics(
                f'{xargs.dataset}',
                'ori-test')  # This is a dict with metric names as keys
            # cost_metrics = info.get_comput_costs('cifar10')
            writer.add_scalar(f'{xargs.dataset}_ground_acc_ori-test',
                              res_metrics['accuracy'], epoch)
            writer.add_scalar(f'{xargs.dataset}_search_acc', valid_a_top1,
                              epoch)
            if xargs.dataset.lower() != 'cifar10':
                writer.add_scalar(
                    f'{xargs.dataset}_ground_acc_x-test',
                    info.get_metrics(f'{xargs.dataset}', 'x-test')['accuracy'],
                    epoch)
            if find_best:
                valid_accuracies['best_gt'] = res_metrics['accuracy']
            writer.add_scalar(f"{xargs.dataset}_cur_best_gt_acc_ori-test",
                              valid_accuracies['best_gt'], epoch)

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log('\n' + '-' * 100)
    logger.log('{:} : run {:} epochs, cost {:.1f} s, last-geno is {:}.'.format(
        args.model, xargs.early_stop_epoch, search_time.sum,
        genotypes[xargs.early_stop_epoch - 1]))
    if api is not None:
        logger.log('{:}'.format(
            api.query_by_arch(genotypes[xargs.early_stop_epoch - 1])))
    logger.close()
예제 #7
0
 def num_flops(self):
     self.prepaer_input()
     flops1, params1 = get_model_infos(self.netG_A.model, None, self.real_A)
     fake_B = self.netG_A(self.real_A)
     flops2, params2 = get_model_infos(self.netD_A.model, None, fake_B)
     return flops1 + flops2
예제 #8
0
def main(xargs):
    assert torch.cuda.is_available(), "CUDA is not available."
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    config = load_config(xargs.config_path, {
        "class_num": class_num,
        "xshape": xshape
    }, logger)
    search_loader, _, valid_loader = get_nas_search_loaders(
        train_data,
        valid_data,
        xargs.dataset,
        "configs/nas-benchmark/",
        config.batch_size,
        xargs.workers,
    )
    logger.log(
        "||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}"
        .format(xargs.dataset, len(search_loader), len(valid_loader),
                config.batch_size))
    logger.log("||||||| {:10s} ||||||| Config={:}".format(
        xargs.dataset, config))

    search_space = get_search_spaces("cell", xargs.search_space_name)
    model_config = dict2config(
        {
            "name": "DARTS-V2",
            "C": xargs.channel,
            "N": xargs.num_cells,
            "max_nodes": xargs.max_nodes,
            "num_classes": class_num,
            "space": search_space,
            "affine": False,
            "track_running_stats": bool(xargs.track_running_stats),
        },
        None,
    )
    search_model = get_cell_based_tiny_net(model_config)
    logger.log("search-model :\n{:}".format(search_model))

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(
        search_model.get_alphas(),
        lr=xargs.arch_learning_rate,
        betas=(0.5, 0.999),
        weight_decay=xargs.arch_weight_decay,
    )
    logger.log("w-optimizer : {:}".format(w_optimizer))
    logger.log("a-optimizer : {:}".format(a_optimizer))
    logger.log("w-scheduler : {:}".format(w_scheduler))
    logger.log("criterion   : {:}".format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    # logger.log('{:}'.format(search_model))
    logger.log("FLOP = {:.2f} M, Params = {:.2f} MB".format(flop, param))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log("{:} create API = {:} done".format(time_string(), api))

    last_info, model_base_path, model_best_path = (
        logger.path("info"),
        logger.path("model"),
        logger.path("best"),
    )
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info["epoch"]
        checkpoint = torch.load(last_info["last_checkpoint"])
        genotypes = checkpoint["genotypes"]
        valid_accuracies = checkpoint["valid_accuracies"]
        search_model.load_state_dict(checkpoint["search_model"])
        w_scheduler.load_state_dict(checkpoint["w_scheduler"])
        w_optimizer.load_state_dict(checkpoint["w_optimizer"])
        a_optimizer.load_state_dict(checkpoint["a_optimizer"])
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, genotypes = (
            0,
            {
                "best": -1
            },
            {
                -1: search_model.genotype()
            },
        )

    # start training
    start_time, search_time, epoch_time, total_epoch = (
        time.time(),
        AverageMeter(),
        AverageMeter(),
        config.epochs + config.warmup,
    )
    for epoch in range(start_epoch, total_epoch):
        w_scheduler.update(epoch, 0.0)
        need_time = "Time Left: {:}".format(
            convert_secs2time(epoch_time.val * (total_epoch - epoch), True))
        epoch_str = "{:03d}-{:03d}".format(epoch, total_epoch)
        min_LR = min(w_scheduler.get_lr())
        logger.log("\n[Search the {:}-th epoch] {:}, LR={:}".format(
            epoch_str, need_time, min_LR))

        search_w_loss, search_w_top1, search_w_top5 = search_func(
            search_loader,
            network,
            criterion,
            w_scheduler,
            w_optimizer,
            a_optimizer,
            epoch_str,
            xargs.print_freq,
            logger,
        )
        search_time.update(time.time() - start_time)
        logger.log(
            "[{:}] searching : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s"
            .format(epoch_str, search_w_loss, search_w_top1, search_w_top5,
                    search_time.sum))
        valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(
            valid_loader, network, criterion)
        logger.log(
            "[{:}] evaluate  : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%"
            .format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        # check the best accuracy
        valid_accuracies[epoch] = valid_a_top1
        if valid_a_top1 > valid_accuracies["best"]:
            valid_accuracies["best"] = valid_a_top1
            genotypes["best"] = search_model.genotype()
            find_best = True
        else:
            find_best = False

        genotypes[epoch] = search_model.genotype()
        logger.log("<<<--->>> The {:}-th epoch : {:}".format(
            epoch_str, genotypes[epoch]))
        # save checkpoint
        save_path = save_checkpoint(
            {
                "epoch": epoch + 1,
                "args": deepcopy(xargs),
                "search_model": search_model.state_dict(),
                "w_optimizer": w_optimizer.state_dict(),
                "a_optimizer": a_optimizer.state_dict(),
                "w_scheduler": w_scheduler.state_dict(),
                "genotypes": genotypes,
                "valid_accuracies": valid_accuracies,
            },
            model_base_path,
            logger,
        )
        last_info = save_checkpoint(
            {
                "epoch": epoch + 1,
                "args": deepcopy(args),
                "last_checkpoint": save_path,
            },
            logger.path("info"),
            logger,
        )
        if find_best:
            logger.log(
                "<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%."
                .format(epoch_str, valid_a_top1))
            copy_checkpoint(model_base_path, model_best_path, logger)
        with torch.no_grad():
            logger.log("arch-parameters :\n{:}".format(
                nn.functional.softmax(search_model.arch_parameters,
                                      dim=-1).cpu()))
        if api is not None:
            logger.log("{:}".format(api.query_by_arch(genotypes[epoch],
                                                      "200")))
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log("\n" + "-" * 100)
    # check the performance from the architecture dataset
    logger.log(
        "DARTS-V2 : run {:} epochs, cost {:.1f} s, last-geno is {:}.".format(
            total_epoch, search_time.sum, genotypes[total_epoch - 1]))
    if api is not None:
        logger.log("{:}".format(api.query_by_arch(genotypes[total_epoch - 1]),
                                "200"))
    logger.close()
예제 #9
0
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    if xargs.dataset == 'cifar10' or xargs.dataset == 'cifar100':
        split_Fpath = 'configs/nas-benchmark/cifar-split.txt'
        cifar_split = load_config(split_Fpath, None, None)
        train_split, valid_split = cifar_split.train, cifar_split.valid
        logger.log('Load split file from {:}'.format(split_Fpath))
    elif xargs.dataset.startswith('ImageNet16'):
        split_Fpath = 'configs/nas-benchmark/{:}-split.txt'.format(
            xargs.dataset)
        imagenet16_split = load_config(split_Fpath, None, None)
        train_split, valid_split = imagenet16_split.train, imagenet16_split.valid
        logger.log('Load split file from {:}'.format(split_Fpath))
    else:
        raise ValueError('invalid dataset : {:}'.format(xargs.dataset))
    config_path = 'configs/nas-benchmark/algos/DARTS.config'
    config = load_config(config_path, {
        'class_num': class_num,
        'xshape': xshape
    }, logger)
    # To split data
    train_data_v2 = deepcopy(train_data)
    train_data_v2.transform = valid_data.transform
    valid_data = train_data_v2
    search_data = SearchDataset(xargs.dataset, train_data, train_split,
                                valid_split)
    # data loader
    search_loader = torch.utils.data.DataLoader(search_data,
                                                batch_size=config.batch_size,
                                                shuffle=True,
                                                num_workers=xargs.workers,
                                                pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=config.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(valid_split),
        num_workers=xargs.workers,
        pin_memory=True)
    logger.log(
        '||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}'
        .format(xargs.dataset, len(search_loader), len(valid_loader),
                config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(
        xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    model_config = dict2config(
        {
            'name': 'DARTS-V2',
            'C': xargs.channel,
            'N': xargs.num_cells,
            'max_nodes': xargs.max_nodes,
            'num_classes': class_num,
            'space': search_space
        }, None)
    search_model = get_cell_based_tiny_net(model_config)
    logger.log('search-model :\n{:}'.format(search_model))

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(),
                                   lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999),
                                   weight_decay=xargs.arch_weight_decay)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion   : {:}'.format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    #logger.log('{:}'.format(search_model))
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path(
        'info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()

    logger.close()
예제 #10
0
def main(args):

    save_folder = '%s_%s' % (args.dataset, args.affix)

    log_folder = os.path.join(args.log_root, save_folder)
    model_folder = os.path.join(args.model_root, save_folder)

    makedirs(log_folder)
    makedirs(model_folder)

    setattr(args, 'log_folder', log_folder)
    setattr(args, 'model_folder', model_folder)

    logger = create_logger(log_folder, args.todo, 'info')

    print_args(args, logger)

    # Using a WideResNet model
    model = WideResNet(depth=34, num_classes=10, widen_factor=1, dropRate=0.0)
    flop, param = get_model_infos(model, (1, 3, 32, 32))
    logger.info('Model Info: FLOP = {:.2f} M, Params = {:.2f} MB'.format(
        flop, param))

    # Configuring the train attack mode
    if args.adv_train_mode == 'FGSM':
        train_attack = FastGradientSignUntargeted(model,
                                                  args.epsilon,
                                                  args.alpha,
                                                  min_val=0,
                                                  max_val=1,
                                                  max_iters=args.k,
                                                  _type=args.perturbation_type,
                                                  logger=logger)
    elif args.adv_train_mode == 'CW':
        mean = [0]
        std = [1]
        inputs_box = (min((0 - m) / s for m, s in zip(mean, std)),
                      max((1 - m) / s for m, s in zip(mean, std)))
        train_attack = carlini_wagner_L2.L2Adversary(targeted=False,
                                                     confidence=0.0,
                                                     search_steps=10,
                                                     optimizer_lr=5e-4,
                                                     logger=logger)

    # Configuring the test attack mode
    if args.adv_test_mode == 'FGSM':
        test_attack = FastGradientSignUntargeted(model,
                                                 args.epsilon,
                                                 args.alpha,
                                                 min_val=0,
                                                 max_val=1,
                                                 max_iters=args.k,
                                                 _type=args.perturbation_type,
                                                 logger=logger)
    elif args.adv_test_mode == 'CW':
        mean = [0]
        std = [1]
        inputs_box = (min((0 - m) / s for m, s in zip(mean, std)),
                      max((1 - m) / s for m, s in zip(mean, std)))
        test_attack = carlini_wagner_L2.L2Adversary(targeted=False,
                                                    confidence=0.0,
                                                    search_steps=10,
                                                    optimizer_lr=5e-4,
                                                    logger=logger)

    if torch.cuda.is_available():
        model.cuda()

    trainer = Trainer(args, logger, train_attack, test_attack)

    if args.todo == 'train':
        transform_train = tv.transforms.Compose([
            tv.transforms.ToTensor(),
            tv.transforms.Lambda(lambda x: F.pad(
                x.unsqueeze(0),
                (4, 4, 4, 4), mode='constant', value=0).squeeze()),
            tv.transforms.ToPILImage(),
            tv.transforms.RandomCrop(32),
            tv.transforms.RandomHorizontalFlip(),
            tv.transforms.ToTensor(),
        ])
        tr_dataset = tv.datasets.CIFAR10(args.data_root,
                                         train=True,
                                         transform=transform_train,
                                         download=True)

        tr_loader = DataLoader(tr_dataset,
                               batch_size=args.batch_size,
                               shuffle=True,
                               num_workers=4)

        # evaluation during training
        te_dataset = tv.datasets.CIFAR10(args.data_root,
                                         train=False,
                                         transform=tv.transforms.ToTensor(),
                                         download=True)

        te_loader = DataLoader(te_dataset,
                               batch_size=args.batch_size,
                               shuffle=False,
                               num_workers=4)

        trainer.train(model, tr_loader, te_loader, args.adv_train)
    elif args.todo == 'test':
        pass
    else:
        raise NotImplementedError
def main(xargs):
  assert torch.cuda.is_available(), 'CUDA is not available.'
  torch.backends.cudnn.enabled   = True
  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True
  torch.set_num_threads( xargs.workers )
  prepare_seed(xargs.rand_seed)
  logger = prepare_logger(args)

  train_data, valid_data, xshape, class_num = get_datasets(xargs.dataset, xargs.data_path, -1)
  config = load_config(xargs.config_path, {'class_num': class_num, 'xshape': xshape}, logger)
  search_loader, _, valid_loader = get_nas_search_loaders(train_data, valid_data, xargs.dataset, 'configs/nas-benchmark/', \
                                        (config.batch_size, config.test_batch_size), xargs.workers)
  logger.log('||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}'.format(xargs.dataset, len(search_loader), len(valid_loader), config.batch_size))
  logger.log('||||||| {:10s} ||||||| Config={:}'.format(xargs.dataset, config))

  search_space = get_search_spaces('cell', xargs.search_space_name)
  model_config = dict2config({'name': 'SPOS', 'C': xargs.channel, 'N': xargs.num_cells,
                              'max_nodes': xargs.max_nodes, 'num_classes': class_num,
                              'space'    : search_space,
                              'affine'   : False, 'track_running_stats': bool(xargs.track_running_stats)}, None)
  logger.log('search space : {:}'.format(search_space))
  model = get_cell_based_tiny_net(model_config)
  
  flop, param  = get_model_infos(model, xshape)
  logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
  logger.log('search-space : {:}'.format(search_space))
  if xargs.arch_nas_dataset is None:
    api = None
  else:
    api = API(xargs.arch_nas_dataset)
  logger.log('{:} create API = {:} done'.format(time_string(), api))

  angles = {}
  for arch_idx in range(0, 10000, 200):
    checkpoint_path_template = 'output/search-cell-nas-bench-102/result-{}/standalone_arch-{}/checkpoint/seed-{}_epoch-{}.pth'
    logger.log("=> loading checkpoint from {}".format(checkpoint_path_template.format(args.dataset, arch_idx, args.rand_seed, 0)))
    epochs = config.epochs - 1
    load(checkpoint_path_template.format(args.dataset, arch_idx,  args.rand_seed, 0), model)
    init_model = deepcopy(model)
    genotype = load(checkpoint_path_template.format(args.dataset, arch_idx,  args.rand_seed, epochs), model)
    logger.log("=> loading checkpoint from {}".format(checkpoint_path_template.format(args.dataset, arch_idx,  args.rand_seed, epochs)))
    cur_model = deepcopy(model)
    angle = get_arch_angle(init_model, cur_model, genotype, search_space)
    logger.log('[{:}] cal angle : angle={} | {:}, acc: {}'.format(arch_idx, angle, genotype, get_arch_real_acc(api, genotype, args)))
    angles[genotype.tostr()] = angle

  real_acc = {}
  for key in angles.keys():
    real_acc[key] = get_arch_real_acc(api, key, args)
    assert(real_acc[key] is not None)

  real_acc = sorted(real_acc.items(), key=lambda d: d[1], reverse=True)
  angles = sorted(angles.items(), key=lambda d: d[1], reverse=True)
  angle_rank = {}
  rank = 1
  for value in angles:
    angle_rank[value[0]] = rank
    rank += 1

  angle_rank_list, real_rank_list = [],[]
  rank = 1
  for value in real_acc:
    angle_rank_list.append(angle_rank[value[0]])
    real_rank_list.append(rank)
    rank += 1

  logger.log('Real_rank_list={}'.format(real_rank_list))
  logger.log('Angle_rank_list={}'.format(angle_rank_list))
  logger.log('Tau={}'.format(scipy.stats.stats.kendalltau(real_rank_list, angle_rank_list)[0]))
예제 #12
0
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    config = load_config(xargs.config_path, {
        'class_num': class_num,
        'xshape': xshape
    }, logger)
    print(config)
    search_loader, _, valid_loader = get_nas_search_loaders(train_data, valid_data, xargs.dataset, 'configs/nas-benchmark/', \
                                          (config.batch_size, config.test_batch_size), xargs.workers)
    logger.log(
        '||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}'
        .format(xargs.dataset, len(search_loader), len(valid_loader),
                config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(
        xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    model_config = dict2config(
        {
            'name': 'SPOS',
            'C': xargs.channel,
            'N': xargs.num_cells,
            'max_nodes': xargs.max_nodes,
            'num_classes': class_num,
            'space': search_space,
            'affine': False,
            'track_running_stats': bool(xargs.track_running_stats)
        }, None)
    logger.log('search space : {:}'.format(search_space))
    model = get_cell_based_tiny_net(model_config)

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        model.get_weights(), config)
    a_optimizer = torch.optim.Adam(model.get_alphas(),
                                   lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999),
                                   weight_decay=xargs.arch_weight_decay)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion   : {:}'.format(criterion))
    flop, param = get_model_infos(model, xshape)
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    logger.log('search-space : {:}'.format(search_space))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path(
        'info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(model).cuda(), criterion.cuda()

    checkpoint_path = 'output/search-cell-nas-bench-102/result-{}/checkpoint/seed-{}_epoch-{}.pth'.format(
        xargs.dataset, xargs.rand_seed, xargs.epoch)
    if checkpoint_path is not None:  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint from {}".format(checkpoint_path))
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['search_model'])

    # start inference
    start_time, search_time, epoch_time, total_epoch = time.time(
    ), AverageMeter(), AverageMeter(), config.epochs + config.warmup
    all_archs = network.module.get_all_archs()
    random.shuffle(all_archs)

    valid_accuracies = {}
    process_start_time = time.time()
    for i, genotype in enumerate(all_archs):
        network.module.set_cal_mode('dynamic', genotype)
        recalculate_bn(network, search_loader)
        valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(
            valid_loader, network, criterion)
        logger.log(
            '[{:}] evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}% | {:}'
            .format(i, valid_a_loss, valid_a_top1, valid_a_top5, genotype))
        valid_accuracies[genotype.tostr()] = valid_a_top1
    process_end_time = time.time()
    logger.log('process time: {}'.format(process_end_time -
                                         process_start_time))

    torch.save(valid_accuracies, '{}/result.dat'.format(xargs.save_dir))
    logger.log('\n' + '-' * 100)
    # check the performance from the architecture dataset
    logger.log(
        'SPOS : run {:} epochs, cost {:.1f} s, last-geno is {:}.'.format(
            total_epoch, search_time.sum, genotype))
    if api is not None: logger.log('{:}'.format(api.query_by_arch(genotype)))
    logger.close()
예제 #13
0
def evaluate(image, model, face, save_path, cpu):
    org_image = image
    if not cpu:
        assert torch.cuda.is_available(), 'CUDA is not available.'
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

    print('The image is {:}'.format(image))
    print('The model is {:}'.format(model))
    snapshot = model
    assert os.path.exists(snapshot), 'The model path {:} does not exist'
    print('The face bounding box is {:}'.format(face))
    assert len(face) == 4, 'Invalid face input : {:}'.format(face)
    if cpu: snapshot = torch.load(snapshot, map_location='cpu')
    else: snapshot = torch.load(snapshot)
    mean_fill = tuple([int(x * 255) for x in [0.5, 0.5, 0.5]])
    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    param = snapshot['args']
    eval_transform = transforms.Compose([
        transforms.PreCrop(param.pre_crop_expand),
        transforms.TrainScale2WH((param.crop_width, param.crop_height)),
        transforms.ToTensor(), normalize
    ])

    net = models.__dict__[param.arch](param.modelconfig, None)

    if not cpu: net = net.cuda()
    weights = models.remove_module_dict(snapshot['state_dict'])
    net.load_state_dict(weights)

    dataset = datasets.GeneralDataset(eval_transform, param.sigma,
                                      param.downsample, param.heatmap_type,
                                      param.dataset_name)
    dataset.reset(param.num_pts)

    print('[{:}] prepare the input data'.format(time_string()))
    [image, _, _, _, _, _,
     cropped_size], meta = dataset.prepare_input(image, face)
    print('[{:}] prepare the input data done'.format(time_string()))
    print('Net : \n{:}'.format(net))
    # network forward
    with torch.no_grad():
        if cpu: inputs = image.unsqueeze(0)
        else: inputs = image.unsqueeze(0).cuda()
        batch_heatmaps, batch_locs, batch_scos, _ = net(inputs)
        #print ('input-shape : {:}'.format(inputs.shape))
        flops, params = get_model_infos(net, inputs.shape, None)
        print('\nIN-shape : {:}, FLOPs : {:} MB, Params : {:}.'.format(
            list(inputs.shape), flops, params))
        flops, params = get_model_infos(net, None, inputs)
        print('\nIN-shape : {:}, FLOPs : {:} MB, Params : {:}.'.format(
            list(inputs.shape), flops, params))
    print('[{:}] the network forward done'.format(time_string()))

    # obtain the locations on the image in the orignial size
    cpu = torch.device('cpu')
    np_batch_locs, np_batch_scos, cropped_size = batch_locs.to(
        cpu).numpy(), batch_scos.to(cpu).numpy(), cropped_size.numpy()
    locations, scores = np_batch_locs[0, :-1, :], np.expand_dims(
        np_batch_scos[0, :-1], -1)

    scale_h, scale_w = cropped_size[0] * 1. / inputs.size(
        -2), cropped_size[1] * 1. / inputs.size(-1)

    locations[:,
              0], locations[:, 1] = locations[:, 0] * scale_w + cropped_size[
                  2], locations[:, 1] * scale_h + cropped_size[3]
    prediction = np.concatenate((locations, scores), axis=1).transpose(1, 0)
    for i in range(param.num_pts):
        point = prediction[:, i]
        print(
            'The coordinate of {:02d}/{:02d}-th points : ({:.1f}, {:.1f}), score = {:.3f}'
            .format(i, param.num_pts, float(point[0]), float(point[1]),
                    float(point[2])))
    if save_path:
        image = draw_image_by_points(org_image, prediction, 1, (255, 0, 0),
                                     False, False)
        image.save(save_path)
        print('save image with landmarks into {:}'.format(save_path))
    print('finish san evaluation on a single image : {:}'.format(image))
    return locations
예제 #14
0
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    config = load_config(xargs.config_path, {
        'class_num': class_num,
        'xshape': xshape
    }, logger)
    search_loader, _, valid_loader = get_nas_search_loaders(train_data, valid_data, xargs.dataset, 'configs/nas-benchmark/', \
                                          (config.batch_size, config.test_batch_size), xargs.workers)
    logger.log(
        '||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}'
        .format(xargs.dataset, len(search_loader), len(valid_loader),
                config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(
        xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    model_config = dict2config(
        {
            'name': 'SPOS',
            'C': xargs.channel,
            'N': xargs.num_cells,
            'max_nodes': xargs.max_nodes,
            'num_classes': class_num,
            'space': search_space,
            'affine': False,
            'track_running_stats': bool(xargs.track_running_stats)
        }, None)
    logger.log('search space : {:}'.format(search_space))
    search_model = get_cell_based_tiny_net(model_config)

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(),
                                   lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999),
                                   weight_decay=xargs.arch_weight_decay)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion   : {:}'.format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    #logger.log('{:}'.format(search_model))
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    logger.log('search-space : {:}'.format(search_space))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path(
        'info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()
    start_epoch, valid_accuracies = 0, {'best': -1}

    # start training
    start_time, search_time, epoch_time, total_epoch = time.time(
    ), AverageMeter(), AverageMeter(), config.epochs + config.warmup
    for epoch in range(start_epoch, xargs.epochs):
        w_scheduler.update(epoch, 0.0)
        need_time = 'Time Left: {:}'.format(
            convert_secs2time(epoch_time.val * (total_epoch - epoch), True))
        epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch)
        logger.log('\n[Search the {:}-th epoch] {:}, LR={:}'.format(
            epoch_str, need_time, min(w_scheduler.get_lr())))

        search_w_loss, search_w_top1, search_w_top5, search_a_loss, search_a_top1, search_a_top5 \
                    = search_func(search_loader, network, criterion, w_scheduler, w_optimizer, a_optimizer, epoch_str, xargs.print_freq, logger)
        search_time.update(time.time() - start_time)
        logger.log(
            '[{:}] search [base] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s'
            .format(epoch_str, search_w_loss, search_w_top1, search_w_top5,
                    search_time.sum))
        logger.log(
            '[{:}] search [arch] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'
            .format(epoch_str, search_a_loss, search_a_top1, search_a_top5))

        # save checkpoint
        save_path = save_checkpoint(
            {
                'epoch': epoch + 1,
                'args': deepcopy(xargs),
                'search_model': search_model.state_dict(),
                'w_optimizer': w_optimizer.state_dict(),
                'a_optimizer': a_optimizer.state_dict(),
                'w_scheduler': w_scheduler.state_dict()
            }, "{}_epoch-{}.pth".format(model_base_path, epoch), logger)

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    # the final post procedure : count the time
    start_time = time.time()
    # genotype, temp_accuracy = get_best_arch(valid_loader, network, xargs.select_num)
    search_time.update(time.time() - start_time)

    logger.log('\n' + '-' * 100)
    logger.log('SPOS : run {:} epochs, cost {:.1f} s.'.format(
        total_epoch, search_time.sum))
    logger.close()
예제 #15
0
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled   = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads( xargs.workers )
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(xargs.dataset, xargs.data_path, -1)
    # class_num = 4
    # xshape = (1,3,88,88)
    # print(xshape)
    if xargs.dataset == 'cifar10' or xargs.dataset == 'cifar100':
        split_Fpath = '/home/city/Projects/NAS-Projects/configs/nas-benchmark/cifar-split.txt'
        cifar_split = load_config(split_Fpath, None, None)
        train_split, valid_split = cifar_split.train, cifar_split.valid
        logger.log('Load split file from {:}'.format(split_Fpath))
    elif xargs.dataset.startswith('ImageNet16'):
        split_Fpath = 'configs/nas-benchmark/{:}-split.txt'.format(xargs.dataset)
        imagenet16_split = load_config(split_Fpath, None, None)
        train_split, valid_split = imagenet16_split.train, imagenet16_split.valid
        logger.log('Load split file from {:}'.format(split_Fpath))
    else:
        raise ValueError('invalid dataset : {:}'.format(xargs.dataset))
    config_path = '/home/city/Projects/NAS-Projects/configs/nas-benchmark/algos/DARTS.config'
    config = load_config(config_path, {'class_num': class_num, 'xshape': xshape}, logger)
    print('config')
    print(config)
    # To split data
    train_data_v2 = deepcopy(train_data)
    train_data_v2.transform = valid_data.transform
    valid_data    = train_data_v2
    search_data   = SearchDataset(xargs.dataset, train_data, train_split, valid_split)
    # data loader
    search_loader = torch.utils.data.DataLoader(search_data, batch_size=config.batch_size, shuffle=True , num_workers=xargs.workers, pin_memory=True)
    valid_loader  = torch.utils.data.DataLoader(valid_data, batch_size=config.batch_size, sampler=torch.utils.data.sampler.SubsetRandomSampler(valid_split), num_workers=xargs.workers, pin_memory=True)
    logger.log('||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}'.format(xargs.dataset, len(search_loader), len(valid_loader), config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(xargs.dataset, config))

    # train_transform = transforms.Compose([
    #     transforms.RandomHorizontalFlip(),
    #     transforms.ToTensor()
    #     # transforms.Normalize(mean=[128, 128, 128], std=[50, 50, 50])
    # ])
    # val_transform = transforms.Compose([
    #     # transforms.RandomHorizontalFlip(),
    #     transforms.ToTensor()
    #     # transforms.Normalize(mean=[128, 128, 128], std=[50, 50, 50])
    # ])
    #
    # train_data = datasets.ImageFolder(root='/home/city/Projects/build_assessment/data/train',
    #                                   transform=train_transform)
    # valid_data = datasets.ImageFolder(root='/home/city/Projects/build_assessment/data/val',
    #                                   transform=val_transform)
    # print(len(train_data))
    # print('2333333333333333333333333333333')
    # train_split = []
    # valid_split = []
    #
    # for i in range(len(train_data)):
    #     if i%2==0:
    #         train_split.append(i)
    #     else:
    #         valid_split.append(i)
    # search_data = SearchDataset('builds', train_data, train_split, valid_split)
    #
    # search_loader = torch.utils.data.DataLoader(search_data,
    #                                              batch_size=32, shuffle=True,
    #                                              num_workers=4, pin_memory=True
    #                                              )
    # valid_loader = torch.utils.data.DataLoader(valid_data,
    #                                              batch_size=32, shuffle=True,
    #                                              num_workers=2, pin_memory=True
    #                                              )



    search_space = get_search_spaces('cell', xargs.search_space_name)
    model_config = dict2config({'name': 'DARTS-V2', 'C': xargs.channel, 'N': xargs.num_cells,
                                'max_nodes': xargs.max_nodes, 'num_classes': class_num,
                                'space'    : search_space}, None)
    search_model = get_cell_based_tiny_net(model_config)
    logger.log('search-model :\n{:}'.format(search_model))

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(), lr=xargs.arch_learning_rate, betas=(0.5, 0.999), weight_decay=xargs.arch_weight_decay)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion   : {:}'.format(criterion))
    flop, param  = get_model_infos(search_model, xshape)
    #logger.log('{:}'.format(search_model))
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path('info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(search_model).cuda(), criterion.cuda()

    if last_info.exists(): # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(last_info))
        last_info   = torch.load(last_info)
        start_epoch = last_info['epoch']
        checkpoint  = torch.load(last_info['last_checkpoint'])
        genotypes   = checkpoint['genotypes']
        valid_accuracies = checkpoint['valid_accuracies']
        search_model.load_state_dict( checkpoint['search_model'] )
        w_scheduler.load_state_dict ( checkpoint['w_scheduler'] )
        w_optimizer.load_state_dict ( checkpoint['w_optimizer'] )
        a_optimizer.load_state_dict ( checkpoint['a_optimizer'] )
        logger.log("=> loading checkpoint of the last-info '{:}' start with {:}-th epoch.".format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, genotypes = 0, {'best': -1}, {}

    # start training
    start_time, search_time, epoch_time, total_epoch = time.time(), AverageMeter(), AverageMeter(), config.epochs + config.warmup
    for epoch in range(start_epoch, total_epoch):
        w_scheduler.update(epoch, 0.0)
        need_time = 'Time Left: {:}'.format( convert_secs2time(epoch_time.val * (total_epoch-epoch), True) )
        epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch)
        min_LR    = min(w_scheduler.get_lr())
        logger.log('\n[Search the {:}-th epoch] {:}, LR={:}'.format(epoch_str, need_time, min_LR))

        search_w_loss, search_w_top1, search_w_top5 = search_func(search_loader, network, criterion, w_scheduler, w_optimizer, a_optimizer, epoch_str, xargs.print_freq, logger)
        search_time.update(time.time() - start_time)
        logger.log('[{:}] searching : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s'.format(epoch_str, search_w_loss, search_w_top1, search_w_top5, search_time.sum))
        valid_a_loss , valid_a_top1 , valid_a_top5  = valid_func(valid_loader, network, criterion)
        logger.log('[{:}] evaluate  : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        # check the best accuracy
        valid_accuracies[epoch] = valid_a_top1
        if valid_a_top1 > valid_accuracies['best']:
            valid_accuracies['best'] = valid_a_top1
            genotypes['best']        = search_model.genotype()
            op_list, _ = genotypes['best'].tolist(remove_str=None)
            find_best = True
            best_arch_nums = op_list2str(op_list)
            torch.save(search_model,'/home/city/disk/log/builds-darts/darts2_%04d_%s_%s_%.2f.pth' %(epoch,time_string_short(),best_arch_nums, valid_a_top1))
            print('/home/city/disk/log/builds-darts/darts2_%04d_%s_%s_%.2f.pth' %(epoch,time_string_short(),best_arch_nums, valid_a_top1))
        else: find_best = False

        genotypes[epoch] = search_model.genotype()
        logger.log('<<<--->>> The {:}-th epoch : {:}'.format(epoch_str, genotypes[epoch]))
        # save checkpoint
        save_path = save_checkpoint({'epoch' : epoch + 1,
                                     'args'  : deepcopy(xargs),
                                     'search_model': search_model.state_dict(),
                                     'w_optimizer' : w_optimizer.state_dict(),
                                     'a_optimizer' : a_optimizer.state_dict(),
                                     'w_scheduler' : w_scheduler.state_dict(),
                                     'genotypes'   : genotypes,
                                     'valid_accuracies' : valid_accuracies},
                                    model_base_path, logger)
        last_info = save_checkpoint({
            'epoch': epoch + 1,
            'args' : deepcopy(args),
            'last_checkpoint': save_path,
        }, logger.path('info'), logger)
        if find_best:
            logger.log('<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%.'.format(epoch_str, valid_a_top1))
            copy_checkpoint(model_base_path, model_best_path, logger)
        with torch.no_grad():
            logger.log('arch-parameters :\n{:}'.format( nn.functional.softmax(search_model.arch_parameters, dim=-1).cpu() ))
            logger.log('arch :\n{:}'.format(nn.functional.softmax(search_model.arch_parameters, dim=-1).cpu().argmax(dim=-1)))
        if api is not None: logger.log('{:}'.format(api.query_by_arch( genotypes[epoch] )))
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log('\n' + '-'*100)
    # check the performance from the architecture dataset
    logger.log('DARTS-V2 : run {:} epochs, cost {:.1f} s, last-geno is {:}.'.format(total_epoch, search_time.sum, genotypes[total_epoch-1]))
    if api is not None: logger.log('{:}'.format( api.query_by_arch(genotypes[total_epoch-1]) ))
    logger.close()
예제 #16
0
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    #config_path = 'configs/nas-benchmark/algos/GDAS.config'
    config = load_config(xargs.config_path, {
        'class_num': class_num,
        'xshape': xshape
    }, logger)
    search_loader, _, valid_loader = get_nas_search_loaders(
        train_data, valid_data, xargs.dataset, 'configs/nas-benchmark/',
        config.batch_size, xargs.workers)
    logger.log(
        '||||||| {:10s} ||||||| Search-Loader-Num={:}, batch size={:}'.format(
            xargs.dataset, len(search_loader), config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(
        xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    if xargs.model_config is None and not args.constrain:
        model_config = dict2config(
            {
                'name': 'GDAS',
                'C': xargs.channel,
                'N': xargs.num_cells,
                'max_nodes': xargs.max_nodes,
                'num_classes': class_num,
                'space': search_space,
                'inp_size': 0,
                'affine': False,
                'track_running_stats': bool(xargs.track_running_stats)
            }, None)
    elif xargs.model_config is None:
        model_config = dict2config(
            {
                'name': 'GDAS',
                'C': xargs.channel,
                'N': xargs.num_cells,
                'max_nodes': xargs.max_nodes,
                'num_classes': class_num,
                'space': search_space,
                'inp_size': 32,
                'affine': False,
                'track_running_stats': bool(xargs.track_running_stats)
            }, None)
    else:
        model_config = load_config(
            xargs.model_config, {
                'num_classes': class_num,
                'space': search_space,
                'affine': False,
                'track_running_stats': bool(xargs.track_running_stats)
            }, None)
    search_model = get_cell_based_tiny_net(model_config)
    #logger.log('search-model :\n{:}'.format(search_model))
    logger.log('model-config : {:}'.format(model_config))

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(),
                                   lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999),
                                   weight_decay=xargs.arch_weight_decay)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion   : {:}'.format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    #logger.log('{:}'.format(search_model))
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    logger.log('search-space [{:} ops] : {:}'.format(len(search_space),
                                                     search_space))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path(
        'info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()
    #network, criterion = search_model.cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch']
        checkpoint = torch.load(last_info['last_checkpoint'])
        genotypes = checkpoint['genotypes']
        valid_accuracies = checkpoint['valid_accuracies']
        search_model.load_state_dict(checkpoint['search_model'])
        w_scheduler.load_state_dict(checkpoint['w_scheduler'])
        w_optimizer.load_state_dict(checkpoint['w_optimizer'])
        a_optimizer.load_state_dict(checkpoint['a_optimizer'])
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, genotypes = 0, {
            'best': -1
        }, {
            -1: search_model.genotype()
        }

    # start training
    start_time, search_time, epoch_time, total_epoch = time.time(
    ), AverageMeter(), AverageMeter(), config.epochs + config.warmup
    sampled_weights = []
    for epoch in range(start_epoch, total_epoch + config.t_epochs):
        w_scheduler.update(epoch, 0.0)
        need_time = 'Time Left: {:}'.format(
            convert_secs2time(
                epoch_time.val * (total_epoch - epoch + config.t_epochs),
                True))
        epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch)
        search_model.set_tau(xargs.tau_max -
                             (xargs.tau_max - xargs.tau_min) * epoch /
                             (total_epoch - 1))
        logger.log('\n[Search the {:}-th epoch] {:}, tau={:}, LR={:}'.format(
            epoch_str, need_time, search_model.get_tau(),
            min(w_scheduler.get_lr())))
        if epoch < total_epoch:
            search_w_loss, search_w_top1, search_w_top5, valid_a_loss , valid_a_top1 , valid_a_top5 \
                      = search_func(search_loader, network, criterion, w_scheduler, w_optimizer, a_optimizer, epoch_str, xargs.print_freq, logger, xargs.bilevel)
        else:
            search_w_loss, search_w_top1, search_w_top5, valid_a_loss , valid_a_top1 , valid_a_top5, arch_iter \
                       = train_func(search_loader, network, criterion, w_scheduler, w_optimizer, epoch_str, xargs.print_freq, sampled_weights[0], arch_iter, logger)
        search_time.update(time.time() - start_time)
        logger.log(
            '[{:}] searching : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s'
            .format(epoch_str, search_w_loss, search_w_top1, search_w_top5,
                    search_time.sum))
        logger.log(
            '[{:}] evaluate  : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'
            .format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))

        if (epoch + 1) % 50 == 0 and not config.t_epochs:
            weights = search_model.sample_weights(100)
            sampled_weights.append(weights)
        elif (epoch + 1) == total_epoch and config.t_epochs:
            weights = search_model.sample_weights(100)
            sampled_weights.append(weights)
            arch_iter = iter(weights)
        # validate with single arch
        single_weight = search_model.sample_weights(1)[0]
        single_valid_acc = AverageMeter()
        network.eval()
        for i in range(10):
            try:
                val_input, val_target = next(valid_iter)
            except Exception as e:
                valid_iter = iter(valid_loader)
                val_input, val_target = next(valid_iter)
            n_val = val_input.size(0)
            with torch.no_grad():
                val_target = val_target.cuda(non_blocking=True)
                _, logits, _ = network(val_input, weights=single_weight)
                val_acc1, val_acc5 = obtain_accuracy(logits.data,
                                                     val_target.data,
                                                     topk=(1, 5))
                single_valid_acc.update(val_acc1.item(), n_val)
        logger.log('[{:}] valid : accuracy = {:.2f}'.format(
            epoch_str, single_valid_acc.avg))

        # check the best accuracy
        valid_accuracies[epoch] = valid_a_top1
        if valid_a_top1 > valid_accuracies['best']:
            valid_accuracies['best'] = valid_a_top1
            genotypes['best'] = search_model.genotype()
            find_best = True
        else:
            find_best = False

        if epoch < total_epoch:
            genotypes[epoch] = search_model.genotype()
            logger.log('<<<--->>> The {:}-th epoch : {:}'.format(
                epoch_str, genotypes[epoch]))
        # save checkpoint
        save_path = save_checkpoint(
            {
                'epoch': epoch + 1,
                'args': deepcopy(xargs),
                'search_model': search_model.state_dict(),
                'w_optimizer': w_optimizer.state_dict(),
                'a_optimizer': a_optimizer.state_dict(),
                'w_scheduler': w_scheduler.state_dict(),
                'genotypes': genotypes,
                'valid_accuracies': valid_accuracies
            }, model_base_path, logger)
        last_info = save_checkpoint(
            {
                'epoch': epoch + 1,
                'args': deepcopy(args),
                'last_checkpoint': save_path,
            }, logger.path('info'), logger)
        if find_best:
            logger.log(
                '<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%.'
                .format(epoch_str, valid_a_top1))
            copy_checkpoint(model_base_path, model_best_path, logger)
        with torch.no_grad():
            logger.log('{:}'.format(search_model.show_alphas()))
        if api is not None and epoch < total_epoch:
            logger.log('{:}'.format(api.query_by_arch(genotypes[epoch])))

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    network.eval()
    # Evaluate the architectures sampled throughout the search
    for i in range(len(sampled_weights) - 1):
        logger.log('Sample eval : epoch {}'.format((i + 1) * 50 - 1))
        for w in sampled_weights[i]:
            sample_valid_acc = AverageMeter()
            for i in range(10):
                try:
                    val_input, val_target = next(valid_iter)
                except Exception as e:
                    valid_iter = iter(valid_loader)
                    val_input, val_target = next(valid_iter)
                n_val = val_input.size(0)
                with torch.no_grad():
                    val_target = val_target.cuda(non_blocking=True)
                    _, logits, _ = network(val_input, weights=w)
                    val_acc1, val_acc5 = obtain_accuracy(logits.data,
                                                         val_target.data,
                                                         topk=(1, 5))
                    sample_valid_acc.update(val_acc1.item(), n_val)
            w_gene = search_model.genotype(w)
            if api is not None:
                ind = api.query_index_by_arch(w_gene)
                info = api.query_meta_info_by_index(ind)
                metrics = info.get_metrics('cifar10', 'ori-test')
                acc = metrics['accuracy']
            else:
                acc = 0.0
            logger.log(
                'sample valid : val_acc = {:.2f} test_acc = {:.2f}'.format(
                    sample_valid_acc.avg, acc))
    # Evaluate the final sampling separately to find the top 10 architectures
    logger.log('Final sample eval')
    final_archs = []
    for w in sampled_weights[-1]:
        sample_valid_acc = AverageMeter()
        for i in range(10):
            try:
                val_input, val_target = next(valid_iter)
            except Exception as e:
                valid_iter = iter(valid_loader)
                val_input, val_target = next(valid_iter)
            n_val = val_input.size(0)
            with torch.no_grad():
                val_target = val_target.cuda(non_blocking=True)
                _, logits, _ = network(val_input, weights=w)
                val_acc1, val_acc5 = obtain_accuracy(logits.data,
                                                     val_target.data,
                                                     topk=(1, 5))
                sample_valid_acc.update(val_acc1.item(), n_val)
        w_gene = search_model.genotype(w)
        if api is not None:
            ind = api.query_index_by_arch(w_gene)
            info = api.query_meta_info_by_index(ind)
            metrics = info.get_metrics('cifar10', 'ori-test')
            acc = metrics['accuracy']
        else:
            acc = 0.0
        logger.log('sample valid : val_acc = {:.2f} test_acc = {:.2f}'.format(
            sample_valid_acc.avg, acc))
        final_archs.append((w, sample_valid_acc.avg))
    top_10 = sorted(final_archs, key=lambda x: x[1], reverse=True)[:10]
    # Evaluate the top 10 architectures on the entire validation set
    logger.log('Evaluating top archs')
    for w, prev_acc in top_10:
        full_valid_acc = AverageMeter()
        for val_input, val_target in valid_loader:
            n_val = val_input.size(0)
            with torch.no_grad():
                val_target = val_target.cuda(non_blocking=True)
                _, logits, _ = network(val_input, weights=w)
                val_acc1, val_acc5 = obtain_accuracy(logits.data,
                                                     val_target.data,
                                                     topk=(1, 5))
                full_valid_acc.update(val_acc1.item(), n_val)
        w_gene = search_model.genotype(w)
        logger.log('genotype {}'.format(w_gene))
        if api is not None:
            ind = api.query_index_by_arch(w_gene)
            info = api.query_meta_info_by_index(ind)
            metrics = info.get_metrics('cifar10', 'ori-test')
            acc = metrics['accuracy']
        else:
            acc = 0.0
        logger.log(
            'full valid : val_acc = {:.2f} test_acc = {:.2f} pval_acc = {:.2f}'
            .format(full_valid_acc.avg, acc, prev_acc))

    logger.log('\n' + '-' * 100)
    # check the performance from the architecture dataset
    logger.log(
        'GDAS : run {:} epochs, cost {:.1f} s, last-geno is {:}.'.format(
            total_epoch, search_time.sum, genotypes[total_epoch - 1]))
    if api is not None:
        logger.log('{:}'.format(api.query_by_arch(genotypes[total_epoch - 1])))
    logger.close()
예제 #17
0
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    #torch.backends.cudnn.deterministic = True
    torch.set_num_threads(args.workers)

    prepare_seed(args.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(
        args.dataset, args.data_path, args.cutout_length)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=args.workers,
                                               pin_memory=True)
    # get configures
    model_config = load_config(args.model_config, {'class_num': class_num},
                               logger)
    optim_config = load_config(
        args.optim_config, {
            'class_num': class_num,
            'KD_alpha': args.KD_alpha,
            'KD_temperature': args.KD_temperature
        }, logger)

    # load checkpoint
    teacher_base = load_net_from_checkpoint(args.KD_checkpoint)
    teacher = torch.nn.DataParallel(teacher_base).cuda()

    base_model = obtain_model(model_config)
    flop, param = get_model_infos(base_model, xshape)
    logger.log('Student ====>>>>:\n{:}'.format(base_model))
    logger.log('Teacher ====>>>>:\n{:}'.format(teacher_base))
    logger.log('model information : {:}'.format(base_model.get_message()))
    logger.log('-' * 50)
    logger.log('Params={:.2f} MB, FLOPs={:.2f} M ... = {:.2f} G'.format(
        param, flop, flop / 1e3))
    logger.log('-' * 50)
    logger.log('train_data : {:}'.format(train_data))
    logger.log('valid_data : {:}'.format(valid_data))
    optimizer, scheduler, criterion = get_optim_scheduler(
        base_model.parameters(), optim_config)
    logger.log('optimizer  : {:}'.format(optimizer))
    logger.log('scheduler  : {:}'.format(scheduler))
    logger.log('criterion  : {:}'.format(criterion))

    last_info, model_base_path, model_best_path = logger.path(
        'info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(
        base_model).cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch'] + 1
        checkpoint = torch.load(last_info['last_checkpoint'])
        base_model.load_state_dict(checkpoint['base-model'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        valid_accuracies = checkpoint['valid_accuracies']
        max_bytes = checkpoint['max_bytes']
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(last_info, start_epoch))
    elif args.resume is not None:
        assert Path(
            args.resume).exists(), 'Can not find the resume file : {:}'.format(
                args.resume)
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch'] + 1
        base_model.load_state_dict(checkpoint['base-model'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        valid_accuracies = checkpoint['valid_accuracies']
        max_bytes = checkpoint['max_bytes']
        logger.log(
            "=> loading checkpoint from '{:}' start with {:}-th epoch.".format(
                args.resume, start_epoch))
    elif args.init_model is not None:
        assert Path(args.init_model).exists(
        ), 'Can not find the initialization file : {:}'.format(args.init_model)
        checkpoint = torch.load(args.init_model)
        base_model.load_state_dict(checkpoint['base-model'])
        start_epoch, valid_accuracies, max_bytes = 0, {'best': -1}, {}
        logger.log('=> initialize the model from {:}'.format(args.init_model))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, max_bytes = 0, {'best': -1}, {}

    train_func, valid_func = get_procedures(args.procedure)

    total_epoch = optim_config.epochs + optim_config.warmup
    # Main Training and Evaluation Loop
    start_time = time.time()
    epoch_time = AverageMeter()
    for epoch in range(start_epoch, total_epoch):
        scheduler.update(epoch, 0.0)
        need_time = 'Time Left: {:}'.format(
            convert_secs2time(epoch_time.avg * (total_epoch - epoch), True))
        epoch_str = 'epoch={:03d}/{:03d}'.format(epoch, total_epoch)
        LRs = scheduler.get_lr()
        find_best = False

        logger.log(
            '\n***{:s}*** start {:s} {:s}, LR=[{:.6f} ~ {:.6f}], scheduler={:}'
            .format(time_string(), epoch_str, need_time, min(LRs), max(LRs),
                    scheduler))

        # train for one epoch
        train_loss, train_acc1, train_acc5 = train_func(
            train_loader, teacher, network, criterion, scheduler, optimizer,
            optim_config, epoch_str, args.print_freq, logger)
        # log the results
        logger.log(
            '***{:s}*** TRAIN [{:}] loss = {:.6f}, accuracy-1 = {:.2f}, accuracy-5 = {:.2f}'
            .format(time_string(), epoch_str, train_loss, train_acc1,
                    train_acc5))

        # evaluate the performance
        if (epoch % args.eval_frequency == 0) or (epoch + 1 == total_epoch):
            logger.log('-' * 150)
            valid_loss, valid_acc1, valid_acc5 = valid_func(
                valid_loader, teacher, network, criterion, optim_config,
                epoch_str, args.print_freq_eval, logger)
            valid_accuracies[epoch] = valid_acc1
            logger.log(
                '***{:s}*** VALID [{:}] loss = {:.6f}, accuracy@1 = {:.2f}, accuracy@5 = {:.2f} | Best-Valid-Acc@1={:.2f}, Error@1={:.2f}'
                .format(time_string(), epoch_str, valid_loss, valid_acc1,
                        valid_acc5, valid_accuracies['best'],
                        100 - valid_accuracies['best']))
            if valid_acc1 > valid_accuracies['best']:
                valid_accuracies['best'] = valid_acc1
                find_best = True
                logger.log(
                    'Currently, the best validation accuracy found at {:03d}-epoch :: acc@1={:.2f}, acc@5={:.2f}, error@1={:.2f}, error@5={:.2f}, save into {:}.'
                    .format(epoch, valid_acc1, valid_acc5, 100 - valid_acc1,
                            100 - valid_acc5, model_best_path))
            num_bytes = torch.cuda.max_memory_cached(
                next(network.parameters()).device) * 1.0
            logger.log(
                '[GPU-Memory-Usage on {:} is {:} bytes, {:.2f} KB, {:.2f} MB, {:.2f} GB.]'
                .format(
                    next(network.parameters()).device, int(num_bytes),
                    num_bytes / 1e3, num_bytes / 1e6, num_bytes / 1e9))
            max_bytes[epoch] = num_bytes
        if epoch % 10 == 0: torch.cuda.empty_cache()

        # save checkpoint
        save_path = save_checkpoint(
            {
                'epoch': epoch,
                'args': deepcopy(args),
                'max_bytes': deepcopy(max_bytes),
                'FLOP': flop,
                'PARAM': param,
                'valid_accuracies': deepcopy(valid_accuracies),
                'model-config': model_config._asdict(),
                'optim-config': optim_config._asdict(),
                'base-model': base_model.state_dict(),
                'scheduler': scheduler.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, model_base_path, logger)
        if find_best: copy_checkpoint(model_base_path, model_best_path, logger)
        last_info = save_checkpoint(
            {
                'epoch': epoch,
                'args': deepcopy(args),
                'last_checkpoint': save_path,
            }, logger.path('info'), logger)

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log('\n' + '-' * 200)
    logger.log('||| Params={:.2f} MB, FLOPs={:.2f} M ... = {:.2f} G'.format(
        param, flop, flop / 1e3))
    logger.log(
        'Finish training/validation in {:} with Max-GPU-Memory of {:.2f} MB, and save final checkpoint into {:}'
        .format(convert_secs2time(epoch_time.sum, True),
                max(v for k, v in max_bytes.items()) / 1e6,
                logger.path('info')))
    logger.log('-' * 200 + '\n')
    logger.close()
예제 #18
0
def main(xargs, myargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(xargs)

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    config = load_config(xargs.config_path, {
        'class_num': class_num,
        'xshape': xshape
    }, logger)
    search_loader, _, valid_loader = get_nas_search_loaders(
        train_data, valid_data, xargs.dataset,
        'AutoDL-Projects/configs/nas-benchmark/',
        (config.batch_size, config.test_batch_size), xargs.num_worker)
    logger.log(
        '||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}'
        .format(xargs.dataset, len(search_loader), len(valid_loader),
                config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(
        xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    if not hasattr(xargs, 'model_config') or xargs.model_config is None:
        model_config = dict2config(
            dict(name='SETN',
                 C=xargs.channel,
                 N=xargs.num_cells,
                 max_nodes=xargs.max_nodes,
                 num_classes=class_num,
                 space=search_space,
                 affine=False,
                 track_running_stats=bool(xargs.track_running_stats)), None)
    else:
        model_config = load_config(
            xargs.model_config,
            dict(num_classes=class_num,
                 space=search_space,
                 affine=False,
                 track_running_stats=bool(xargs.track_running_stats)), None)
    logger.log('search space : {:}'.format(search_space))
    search_model = get_cell_based_tiny_net(model_config)

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(),
                                   lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999),
                                   weight_decay=xargs.arch_weight_decay)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion   : {:}'.format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    logger.log('search-space : {:}'.format(search_space))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path(
        'info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch']
        checkpoint = torch.load(last_info['last_checkpoint'])
        genotypes = checkpoint['genotypes']
        valid_accuracies = checkpoint['valid_accuracies']
        search_model.load_state_dict(checkpoint['search_model'])
        w_scheduler.load_state_dict(checkpoint['w_scheduler'])
        w_optimizer.load_state_dict(checkpoint['w_optimizer'])
        a_optimizer.load_state_dict(checkpoint['a_optimizer'])
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        init_genotype, _ = get_best_arch(valid_loader, network,
                                         xargs.select_num)
        start_epoch, valid_accuracies, genotypes = 0, {
            'best': -1
        }, {
            -1: init_genotype
        }

    # start training
    start_time, search_time, epoch_time, total_epoch = time.time(
    ), AverageMeter(), AverageMeter(), config.epochs + config.warmup
    for epoch in range(start_epoch, total_epoch):
        w_scheduler.update(epoch, 0.0)
        need_time = 'Time Left: {:}'.format(
            convert_secs2time(epoch_time.val * (total_epoch - epoch), True))
        epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch)
        logger.log('\n[Search the {:}-th epoch] {:}, LR={:}'.format(
            epoch_str, need_time, min(w_scheduler.get_lr())))

        search_w_loss, search_w_top1, search_w_top5, search_a_loss, search_a_top1, search_a_top5 \
                    = search_func(search_loader, network, criterion, w_scheduler, w_optimizer, a_optimizer, epoch_str, xargs.print_freq, logger)
        search_time.update(time.time() - start_time)
        logger.log(
            '[{:}] search [base] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s'
            .format(epoch_str, search_w_loss, search_w_top1, search_w_top5,
                    search_time.sum))
        logger.log(
            '[{:}] search [arch] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'
            .format(epoch_str, search_a_loss, search_a_top1, search_a_top5))

        genotype, temp_accuracy = get_best_arch(valid_loader, network,
                                                xargs.select_num)
        network.module.set_cal_mode('dynamic', genotype)
        valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(
            valid_loader, network, criterion)
        logger.log(
            '[{:}] evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}% | {:}'
            .format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5,
                    genotype))
        #search_model.set_cal_mode('urs')
        #valid_a_loss , valid_a_top1 , valid_a_top5  = valid_func(valid_loader, network, criterion)
        #logger.log('[{:}] URS---evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        #search_model.set_cal_mode('joint')
        #valid_a_loss , valid_a_top1 , valid_a_top5  = valid_func(valid_loader, network, criterion)
        #logger.log('[{:}] JOINT-evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        #search_model.set_cal_mode('select')
        #valid_a_loss , valid_a_top1 , valid_a_top5  = valid_func(valid_loader, network, criterion)
        #logger.log('[{:}] Selec-evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        # check the best accuracy
        valid_accuracies[epoch] = valid_a_top1

        genotypes[epoch] = genotype
        logger.log('<<<--->>> The {:}-th epoch : {:}'.format(
            epoch_str, genotypes[epoch]))
        # save checkpoint
        save_path = save_checkpoint(
            {
                'epoch': epoch + 1,
                'args': deepcopy(xargs),
                'search_model': search_model.state_dict(),
                'w_optimizer': w_optimizer.state_dict(),
                'a_optimizer': a_optimizer.state_dict(),
                'w_scheduler': w_scheduler.state_dict(),
                'genotypes': genotypes,
                'valid_accuracies': valid_accuracies
            }, model_base_path, logger)
        last_info = save_checkpoint(
            {
                'epoch': epoch + 1,
                'args': deepcopy(xargs),
                'last_checkpoint': save_path,
            }, logger.path('info'), logger)
        with torch.no_grad():
            logger.log('{:}'.format(search_model.show_alphas()))
        if api is not None:
            logger.log('{:}'.format(api.query_by_arch(genotypes[epoch],
                                                      '200')))
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    # the final post procedure : count the time
    start_time = time.time()
    genotype, temp_accuracy = get_best_arch(valid_loader, network,
                                            xargs.select_num)
    search_time.update(time.time() - start_time)
    network.module.set_cal_mode('dynamic', genotype)
    valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(
        valid_loader, network, criterion)
    logger.log(
        'Last : the gentotype is : {:}, with the validation accuracy of {:.3f}%.'
        .format(genotype, valid_a_top1))

    logger.log('\n' + '-' * 100)
    # check the performance from the architecture dataset
    logger.log(
        'SETN : run {:} epochs, cost {:.1f} s, last-geno is {:}.'.format(
            total_epoch, search_time.sum, genotype))
    if api is not None:
        logger.log('{:}'.format(api.query_by_arch(genotype, '200')))
    logger.close()
예제 #19
0
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    #torch.backends.cudnn.deterministic = True
    torch.set_num_threads(args.workers)

    prepare_seed(args.rand_seed)
    logger = prepare_logger(args)

    # prepare dataset
    train_data, valid_data, xshape, class_num = get_datasets(
        args.dataset, args.data_path, args.cutout_length)
    #train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True , num_workers=args.workers, pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=args.workers,
                                               pin_memory=True)

    split_file_path = Path(args.split_path)
    assert split_file_path.exists(), '{:} does not exist'.format(
        split_file_path)
    split_info = torch.load(split_file_path)

    train_split, valid_split = split_info['train'], split_info['valid']
    assert len(
        set(train_split).intersection(set(valid_split))
    ) == 0, 'There should be 0 element that belongs to both train and valid'
    assert len(train_split) + len(valid_split) == len(
        train_data), '{:} + {:} vs {:}'.format(len(train_split),
                                               len(valid_split),
                                               len(train_data))
    search_dataset = SearchDataset(args.dataset, train_data, train_split,
                                   valid_split)

    search_train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(train_split),
        pin_memory=True,
        num_workers=args.workers)
    search_valid_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(valid_split),
        pin_memory=True,
        num_workers=args.workers)
    search_loader = torch.utils.data.DataLoader(search_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=True,
                                                num_workers=args.workers,
                                                pin_memory=True,
                                                sampler=None)
    # get configures
    if args.ablation_num_select is None or args.ablation_num_select <= 0:
        model_config = load_config(args.model_config, {
            'class_num': class_num,
            'search_mode': 'shape'
        }, logger)
    else:
        model_config = load_config(
            args.model_config, {
                'class_num': class_num,
                'search_mode': 'ablation',
                'num_random_select': args.ablation_num_select
            }, logger)

    # obtain the model
    search_model = obtain_search_model(model_config)
    MAX_FLOP, param = get_model_infos(search_model, xshape)
    optim_config = load_config(args.optim_config, {
        'class_num': class_num,
        'FLOP': MAX_FLOP
    }, logger)
    logger.log('Model Information : {:}'.format(search_model.get_message()))
    logger.log('MAX_FLOP = {:} M'.format(MAX_FLOP))
    logger.log('Params   = {:} M'.format(param))
    logger.log('train_data : {:}'.format(train_data))
    logger.log('search-data: {:}'.format(search_dataset))
    logger.log('search_train_loader : {:} samples'.format(len(train_split)))
    logger.log('search_valid_loader : {:} samples'.format(len(valid_split)))
    base_optimizer, scheduler, criterion = get_optim_scheduler(
        search_model.base_parameters(), optim_config)
    arch_optimizer = torch.optim.Adam(search_model.arch_parameters(
        optim_config.arch_LR),
                                      lr=optim_config.arch_LR,
                                      betas=(0.5, 0.999),
                                      weight_decay=optim_config.arch_decay)
    logger.log('base-optimizer : {:}'.format(base_optimizer))
    logger.log('arch-optimizer : {:}'.format(arch_optimizer))
    logger.log('scheduler      : {:}'.format(scheduler))
    logger.log('criterion      : {:}'.format(criterion))

    last_info, model_base_path, model_best_path = logger.path(
        'info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()

    # load checkpoint
    if last_info.exists() or (args.resume is not None and osp.isfile(
            args.resume)):  # automatically resume from previous checkpoint
        if args.resume is not None and osp.isfile(args.resume):
            resume_path = Path(args.resume)
        elif last_info.exists():
            resume_path = last_info
        else:
            raise ValueError('Something is wrong.')
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            resume_path))
        checkpoint = torch.load(resume_path)
        if 'last_checkpoint' in checkpoint:
            last_checkpoint_path = checkpoint['last_checkpoint']
            if not last_checkpoint_path.exists():
                logger.log('Does not find {:}, try another path'.format(
                    last_checkpoint_path))
                last_checkpoint_path = resume_path.parent / last_checkpoint_path.parent.name / last_checkpoint_path.name
            assert last_checkpoint_path.exists(
            ), 'can not find the checkpoint from {:}'.format(
                last_checkpoint_path)
            checkpoint = torch.load(last_checkpoint_path)
        start_epoch = checkpoint['epoch'] + 1
        #for key, value in checkpoint['search_model'].items():
        #  print('K {:} = Shape={:}'.format(key, value.shape))
        search_model.load_state_dict(checkpoint['search_model'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        base_optimizer.load_state_dict(checkpoint['base_optimizer'])
        arch_optimizer.load_state_dict(checkpoint['arch_optimizer'])
        valid_accuracies = checkpoint['valid_accuracies']
        arch_genotypes = checkpoint['arch_genotypes']
        discrepancies = checkpoint['discrepancies']
        max_bytes = checkpoint['max_bytes']
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(resume_path, start_epoch))
    else:
        logger.log(
            "=> do not find the last-info file : {:} or resume : {:}".format(
                last_info, args.resume))
        start_epoch, valid_accuracies, arch_genotypes, discrepancies, max_bytes = 0, {
            'best': -1
        }, {}, {}, {}

    # main procedure
    train_func, valid_func = get_procedures(args.procedure)
    total_epoch = optim_config.epochs + optim_config.warmup
    start_time, epoch_time = time.time(), AverageMeter()
    for epoch in range(start_epoch, total_epoch):
        scheduler.update(epoch, 0.0)
        search_model.set_tau(args.gumbel_tau_max, args.gumbel_tau_min,
                             epoch * 1.0 / total_epoch)
        need_time = 'Time Left: {:}'.format(
            convert_secs2time(epoch_time.avg * (total_epoch - epoch), True))
        epoch_str = 'epoch={:03d}/{:03d}'.format(epoch, total_epoch)
        LRs = scheduler.get_lr()
        find_best = False

        logger.log(
            '\n***{:s}*** start {:s} {:s}, LR=[{:.6f} ~ {:.6f}], scheduler={:}, tau={:}, FLOP={:.2f}'
            .format(time_string(), epoch_str, need_time, min(LRs), max(LRs),
                    scheduler, search_model.tau, MAX_FLOP))

        # train for one epoch
        train_base_loss, train_arch_loss, train_acc1, train_acc5 = train_func(search_loader, network, criterion, scheduler, base_optimizer, arch_optimizer, optim_config, \
                                                                                    {'epoch-str'  : epoch_str,        'FLOP-exp': MAX_FLOP * args.FLOP_ratio,
                                                                                     'FLOP-weight': args.FLOP_weight, 'FLOP-tolerant': MAX_FLOP * args.FLOP_tolerant}, args.print_freq, logger)
        # log the results
        logger.log(
            '***{:s}*** TRAIN [{:}] base-loss = {:.6f}, arch-loss = {:.6f}, accuracy-1 = {:.2f}, accuracy-5 = {:.2f}'
            .format(time_string(), epoch_str, train_base_loss, train_arch_loss,
                    train_acc1, train_acc5))
        cur_FLOP, genotype = search_model.get_flop('genotype',
                                                   model_config._asdict(),
                                                   None)
        arch_genotypes[epoch] = genotype
        arch_genotypes['last'] = genotype
        logger.log('[{:}] genotype : {:}'.format(epoch_str, genotype))
        # save the configuration
        configure2str(
            genotype,
            str(
                logger.path('log') /
                'seed-{:}-temp.config'.format(args.rand_seed)))
        arch_info, discrepancy = search_model.get_arch_info()
        logger.log(arch_info)
        discrepancies[epoch] = discrepancy
        logger.log(
            '[{:}] FLOP : {:.2f} MB, ratio : {:.4f}, Expected-ratio : {:.4f}, Discrepancy : {:.3f}'
            .format(epoch_str, cur_FLOP, cur_FLOP / MAX_FLOP, args.FLOP_ratio,
                    np.mean(discrepancy)))

        #if cur_FLOP/MAX_FLOP > args.FLOP_ratio:
        #  init_flop_weight = init_flop_weight * args.FLOP_decay
        #else:
        #  init_flop_weight = init_flop_weight / args.FLOP_decay

        # evaluate the performance
        if (epoch % args.eval_frequency == 0) or (epoch + 1 == total_epoch):
            logger.log('-' * 150)
            valid_loss, valid_acc1, valid_acc5 = valid_func(
                search_valid_loader, network, criterion, epoch_str,
                args.print_freq_eval, logger)
            valid_accuracies[epoch] = valid_acc1
            logger.log(
                '***{:s}*** VALID [{:}] loss = {:.6f}, accuracy@1 = {:.2f}, accuracy@5 = {:.2f} | Best-Valid-Acc@1={:.2f}, Error@1={:.2f}'
                .format(time_string(), epoch_str, valid_loss, valid_acc1,
                        valid_acc5, valid_accuracies['best'],
                        100 - valid_accuracies['best']))
            if valid_acc1 > valid_accuracies['best']:
                valid_accuracies['best'] = valid_acc1
                arch_genotypes['best'] = genotype
                find_best = True
                logger.log(
                    'Currently, the best validation accuracy found at {:03d}-epoch :: acc@1={:.2f}, acc@5={:.2f}, error@1={:.2f}, error@5={:.2f}, save into {:}.'
                    .format(epoch, valid_acc1, valid_acc5, 100 - valid_acc1,
                            100 - valid_acc5, model_best_path))
            # log the GPU memory usage
            #num_bytes = torch.cuda.max_memory_allocated( next(network.parameters()).device ) * 1.0
            num_bytes = torch.cuda.max_memory_cached(
                next(network.parameters()).device) * 1.0
            logger.log(
                '[GPU-Memory-Usage on {:} is {:} bytes, {:.2f} KB, {:.2f} MB, {:.2f} GB.]'
                .format(
                    next(network.parameters()).device, int(num_bytes),
                    num_bytes / 1e3, num_bytes / 1e6, num_bytes / 1e9))
            max_bytes[epoch] = num_bytes

        # save checkpoint
        save_path = save_checkpoint(
            {
                'epoch': epoch,
                'args': deepcopy(args),
                'max_bytes': deepcopy(max_bytes),
                'valid_accuracies': deepcopy(valid_accuracies),
                'model-config': model_config._asdict(),
                'optim-config': optim_config._asdict(),
                'search_model': search_model.state_dict(),
                'scheduler': scheduler.state_dict(),
                'base_optimizer': base_optimizer.state_dict(),
                'arch_optimizer': arch_optimizer.state_dict(),
                'arch_genotypes': arch_genotypes,
                'discrepancies': discrepancies,
            }, model_base_path, logger)
        if find_best: copy_checkpoint(model_base_path, model_best_path, logger)
        last_info = save_checkpoint(
            {
                'epoch': epoch,
                'args': deepcopy(args),
                'last_checkpoint': save_path,
            }, logger.path('info'), logger)

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log('')
    logger.log('-' * 100)
    last_config_path = logger.path('log') / 'seed-{:}-last.config'.format(
        args.rand_seed)
    configure2str(arch_genotypes['last'], str(last_config_path))
    logger.log('save the last config int {:} :\n{:}'.format(
        last_config_path, arch_genotypes['last']))

    best_arch, valid_acc = arch_genotypes['best'], valid_accuracies['best']
    for key, config in arch_genotypes.items():
        if key == 'last': continue
        FLOP_ratio = config['estimated_FLOP'] / MAX_FLOP
        if abs(FLOP_ratio - args.FLOP_ratio) <= args.FLOP_tolerant:
            if valid_acc <= valid_accuracies[key]:
                best_arch, valid_acc = config, valid_accuracies[key]
    print('Best-Arch : {:}\nRatio={:}, Valid-ACC={:}'.format(
        best_arch, best_arch['estimated_FLOP'] / MAX_FLOP, valid_acc))
    best_config_path = logger.path('log') / 'seed-{:}-best.config'.format(
        args.rand_seed)
    configure2str(best_arch, str(best_config_path))
    logger.log('save the last config int {:} :\n{:}'.format(
        best_config_path, best_arch))
    logger.log('\n' + '-' * 200)
    logger.log(
        'Finish training/validation in {:} with Max-GPU-Memory of {:.2f} GB, and save final checkpoint into {:}'
        .format(convert_secs2time(epoch_time.sum, True),
                max(v for k, v in max_bytes.items()) / 1e9,
                logger.path('info')))
    logger.close()
예제 #20
0
def main(xargs):
    PID = os.getpid()
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    prepare_seed(xargs.rand_seed)

    if xargs.timestamp == 'none':
        xargs.timestamp = "{:}".format(
            time.strftime('%h-%d-%C_%H-%M-%s', time.gmtime(time.time())))

    train_data, valid_data, xshape, class_num = get_datasets(xargs, -1)

    ##### config & logging #####
    config = edict()
    config.class_num = class_num
    config.xshape = xshape
    config.batch_size = xargs.batch_size
    xargs.save_dir = xargs.save_dir + \
        "/repeat%d-prunNum%d-prec%d-%s-batch%d"%(
                xargs.repeat, xargs.prune_number, xargs.precision, xargs.init, config["batch_size"]) + \
        "/{:}/seed{:}".format(xargs.timestamp, xargs.rand_seed)
    config.save_dir = xargs.save_dir
    logger = prepare_logger(xargs)
    ###############

    if xargs.dataset in [
            'MiniImageNet', 'MetaMiniImageNet', 'TieredImageNet',
            'MetaTieredImageNet'
    ]:
        train_loader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=xargs.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
    elif xargs.dataset != 'imagenet-1k':
        search_loader, train_loader, valid_loader = get_nas_search_loaders(
            train_data, valid_data, xargs.dataset, 'configs/',
            config.batch_size, xargs.workers)
    else:
        train_loader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=xargs.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
    logger.log(
        '||||||| {:10s} ||||||| Train-Loader-Num={:}, batch size={:}'.format(
            xargs.dataset, len(train_loader), config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(
        xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    if xargs.search_space_name == 'nas-bench-201':
        model_config = edict({
            'name':
            'DARTS-V1',
            'C':
            3,
            'N':
            1,
            'depth':
            -1,
            'use_stem':
            True,
            'max_nodes':
            xargs.max_nodes,
            'num_classes':
            class_num,
            'space':
            search_space,
            'affine':
            True,
            'track_running_stats':
            bool(xargs.track_running_stats),
        })
        model_config_thin = edict({
            'name':
            'DARTS-V1',
            'C':
            1,
            'N':
            1,
            'depth':
            1,
            'use_stem':
            False,
            'max_nodes':
            xargs.max_nodes,
            'num_classes':
            class_num,
            'space':
            search_space,
            'affine':
            True,
            'track_running_stats':
            bool(xargs.track_running_stats),
        })
    elif xargs.search_space_name in ['darts', 'darts_fewshot']:
        model_config = edict({
            'name':
            'DARTS-V1',
            'C':
            1,
            'N':
            1,
            'depth':
            2,
            'use_stem':
            True,
            'stem_multiplier':
            1,
            'num_classes':
            class_num,
            'space':
            search_space,
            'affine':
            True,
            'track_running_stats':
            bool(xargs.track_running_stats),
            'super_type':
            xargs.super_type,
            'steps':
            xargs.max_nodes,
            'multiplier':
            xargs.max_nodes,
        })
        model_config_thin = edict({
            'name':
            'DARTS-V1',
            'C':
            1,
            'N':
            1,
            'depth':
            2,
            'use_stem':
            False,
            'stem_multiplier':
            1,
            'max_nodes':
            xargs.max_nodes,
            'num_classes':
            class_num,
            'space':
            search_space,
            'affine':
            True,
            'track_running_stats':
            bool(xargs.track_running_stats),
            'super_type':
            xargs.super_type,
            'steps':
            xargs.max_nodes,
            'multiplier':
            xargs.max_nodes,
        })
    network = get_cell_based_tiny_net(model_config)
    logger.log('model-config : {:}'.format(model_config))
    arch_parameters = [
        alpha.detach().clone() for alpha in network.get_alphas()
    ]
    for alpha in arch_parameters:
        alpha[:, :] = 0

    # TODO Linear_Region_Collector
    lrc_model = Linear_Region_Collector(xargs,
                                        input_size=(1000, 1, 3, 3),
                                        sample_batch=3,
                                        dataset=xargs.dataset,
                                        data_path=xargs.data_path,
                                        seed=xargs.rand_seed)

    # ### all params trainable (except train_bn) #########################
    flop, param = get_model_infos(network, xshape)
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    logger.log('search-space [{:} ops] : {:}'.format(len(search_space),
                                                     search_space))
    if xargs.arch_nas_dataset is None or xargs.search_space_name in [
            'darts', 'darts_fewshot'
    ]:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    network = network.cuda()

    genotypes = {}
    genotypes['arch'] = {
        -1: network.genotype()
    }

    arch_parameters_history = []
    arch_parameters_history_npy = []
    start_time = time.time()
    epoch = -1

    for alpha in arch_parameters:
        alpha[:, 0] = -INF
    arch_parameters_history.append(
        [alpha.detach().clone() for alpha in arch_parameters])
    arch_parameters_history_npy.append(
        [alpha.detach().clone().cpu().numpy() for alpha in arch_parameters])
    np.save(os.path.join(xargs.save_dir, "arch_parameters_history.npy"),
            arch_parameters_history_npy)
    while not is_single_path(network):
        epoch += 1
        torch.cuda.empty_cache()
        print("<< ============== JOB (PID = %d) %s ============== >>" %
              (PID, '/'.join(xargs.save_dir.split("/")[-6:])))

        arch_parameters, op_pruned = prune_func_rank(
            xargs,
            arch_parameters,
            model_config,
            model_config_thin,
            train_loader,
            lrc_model,
            search_space,
            precision=xargs.precision,
            prune_number=xargs.prune_number)
        # rebuild supernet
        network = get_cell_based_tiny_net(model_config)
        network = network.cuda()
        network.set_alphas(arch_parameters)

        arch_parameters_history.append(
            [alpha.detach().clone() for alpha in arch_parameters])
        arch_parameters_history_npy.append([
            alpha.detach().clone().cpu().numpy() for alpha in arch_parameters
        ])
        np.save(os.path.join(xargs.save_dir, "arch_parameters_history.npy"),
                arch_parameters_history_npy)
        genotypes['arch'][epoch] = network.genotype()

        logger.log('operators remaining (1s) and prunned (0s)\n{:}'.format(
            '\n'.join([
                str((alpha > -INF).int()) for alpha in network.get_alphas()
            ])))

    if xargs.search_space_name in ['darts', 'darts_fewshot']:
        print("===>>> Prune Edge Groups...")
        if xargs.max_nodes == 4:
            edge_groups = [(0, 2), (2, 5), (5, 9), (9, 14)]
        elif xargs.max_nodes == 3:
            edge_groups = [(0, 2), (2, 5), (5, 9)]
        arch_parameters = prune_func_rank_group(
            xargs,
            arch_parameters,
            model_config,
            model_config_thin,
            train_loader,
            lrc_model,
            search_space,
            edge_groups=edge_groups,
            num_per_group=2,
            precision=xargs.precision,
        )
        network = get_cell_based_tiny_net(model_config)
        network = network.cuda()
        network.set_alphas(arch_parameters)
        arch_parameters_history.append(
            [alpha.detach().clone() for alpha in arch_parameters])
        arch_parameters_history_npy.append([
            alpha.detach().clone().cpu().numpy() for alpha in arch_parameters
        ])
        np.save(os.path.join(xargs.save_dir, "arch_parameters_history.npy"),
                arch_parameters_history_npy)

    logger.log('<<<--->>> End: {:}'.format(network.genotype()))
    logger.log('operators remaining (1s) and prunned (0s)\n{:}'.format(
        '\n'.join(
            [str((alpha > -INF).int()) for alpha in network.get_alphas()])))

    end_time = time.time()
    logger.log('\n' + '-' * 100)
    logger.log("Time spent: %d s" % (end_time - start_time))
    # check the performance from the architecture dataset
    if api is not None:
        logger.log('{:}'.format(api.query_by_arch(genotypes['arch'][epoch])))

    logger.close()
예제 #21
0
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    if xargs.dataset == 'cifar10' or xargs.dataset == 'cifar100':
        split_Fpath = 'configs/nas-benchmark/cifar-split.txt'
        cifar_split = load_config(split_Fpath, None, None)
        train_split, valid_split = cifar_split.train, cifar_split.valid
        logger.log('Load split file from {:}'.format(split_Fpath))
    elif xargs.dataset.startswith('ImageNet16'):
        split_Fpath = 'configs/nas-benchmark/{:}-split.txt'.format(
            xargs.dataset)
        imagenet16_split = load_config(split_Fpath, None, None)
        train_split, valid_split = imagenet16_split.train, imagenet16_split.valid
        logger.log('Load split file from {:}'.format(split_Fpath))
    else:
        raise ValueError('invalid dataset : {:}'.format(xargs.dataset))
    config_path = 'configs/nas-benchmark/algos/DARTS.config'
    config = load_config(config_path, {
        'class_num': class_num,
        'xshape': xshape
    }, logger)
    # To split data
    train_data_v2 = deepcopy(train_data)
    train_data_v2.transform = valid_data.transform
    valid_data = train_data_v2
    search_data = SearchDataset(xargs.dataset, train_data, train_split,
                                valid_split)
    # data loader
    search_loader = torch.utils.data.DataLoader(search_data,
                                                batch_size=config.batch_size,
                                                shuffle=True,
                                                num_workers=xargs.workers,
                                                pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=config.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(valid_split),
        num_workers=xargs.workers,
        pin_memory=True)
    logger.log(
        '||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}'
        .format(xargs.dataset, len(search_loader), len(valid_loader),
                config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(
        xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    model_config = dict2config(
        {
            'name': 'DARTS-V1',
            'C': xargs.channel,
            'N': xargs.num_cells,
            'max_nodes': xargs.max_nodes,
            'num_classes': class_num,
            'space': search_space
        }, None)
    search_model = get_cell_based_tiny_net(model_config)

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(),
                                   lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999),
                                   weight_decay=xargs.arch_weight_decay)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion   : {:}'.format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    #logger.log('{:}'.format(search_model))
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))

    last_info, model_base_path, model_best_path = logger.path(
        'info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch']
        checkpoint = torch.load(last_info['last_checkpoint'])
        genotypes = checkpoint['genotypes']
        valid_accuracies = checkpoint['valid_accuracies']
        search_model.load_state_dict(checkpoint['search_model'])
        w_scheduler.load_state_dict(checkpoint['w_scheduler'])
        w_optimizer.load_state_dict(checkpoint['w_optimizer'])
        a_optimizer.load_state_dict(checkpoint['a_optimizer'])
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, genotypes = 0, {'best': -1}, {}

    # start training
    start_time, epoch_time, total_epoch = time.time(), AverageMeter(
    ), config.epochs + config.warmup
    for epoch in range(start_epoch, total_epoch):
        w_scheduler.update(epoch, 0.0)
        need_time = 'Time Left: {:}'.format(
            convert_secs2time(epoch_time.val * (total_epoch - epoch), True))
        epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch)
        logger.log('\n[Search the {:}-th epoch] {:}, LR={:}'.format(
            epoch_str, need_time, min(w_scheduler.get_lr())))

        search_w_loss, search_w_top1, search_w_top5 = search_func(
            search_loader, network, criterion, w_scheduler, w_optimizer,
            a_optimizer, epoch_str, xargs.print_freq, logger)
        logger.log(
            '[{:}] searching : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'
            .format(epoch_str, search_w_loss, search_w_top1, search_w_top5))
        valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(
            valid_loader, network, criterion)
        logger.log(
            '[{:}] evaluate  : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'
            .format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        # check the best accuracy
        valid_accuracies[epoch] = valid_a_top1
        if valid_a_top1 > valid_accuracies['best']:
            valid_accuracies['best'] = valid_a_top1
            genotypes['best'] = search_model.genotype()
            find_best = True
        else:
            find_best = False

        genotypes[epoch] = search_model.genotype()
        logger.log('<<<--->>> The {:}-th epoch : {:}'.format(
            epoch_str, genotypes[epoch]))
        # save checkpoint
        save_path = save_checkpoint(
            {
                'epoch': epoch + 1,
                'args': deepcopy(xargs),
                'search_model': search_model.state_dict(),
                'w_optimizer': w_optimizer.state_dict(),
                'a_optimizer': a_optimizer.state_dict(),
                'w_scheduler': w_scheduler.state_dict(),
                'genotypes': genotypes,
                'valid_accuracies': valid_accuracies
            }, model_base_path, logger)
        last_info = save_checkpoint(
            {
                'epoch': epoch + 1,
                'args': deepcopy(args),
                'last_checkpoint': save_path,
            }, logger.path('info'), logger)
        if find_best:
            logger.log(
                '<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%.'
                .format(epoch_str, valid_a_top1))
            copy_checkpoint(model_base_path, model_best_path, logger)
        with torch.no_grad():
            logger.log('arch-parameters :\n{:}'.format(
                nn.functional.softmax(search_model.arch_parameters,
                                      dim=-1).cpu()))
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log('\n' + '-' * 100)
    # check the performance from the architecture dataset
    #if xargs.arch_nas_dataset is None or not os.path.isfile(xargs.arch_nas_dataset):
    #  logger.log('Can not find the architecture dataset : {:}.'.format(xargs.arch_nas_dataset))
    #else:
    #  nas_bench = NASBenchmarkAPI(xargs.arch_nas_dataset)
    #  geno = genotypes[total_epoch-1]
    #  logger.log('The last model is {:}'.format(geno))
    #  info = nas_bench.query_by_arch( geno )
    #  if info is None: logger.log('Did not find this architecture : {:}.'.format(geno))
    #  else           : logger.log('{:}'.format(info))
    #  logger.log('-'*100)
    #  geno = genotypes['best']
    #  logger.log('The best model is {:}'.format(geno))
    #  info = nas_bench.query_by_arch( geno )
    #  if info is None: logger.log('Did not find this architecture : {:}.'.format(geno))
    #  else           : logger.log('{:}'.format(info))
    logger.close()
예제 #22
0
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    #config_path = 'configs/nas-benchmark/algos/GDAS.config'
    config = load_config(xargs.config_path, {
        'class_num': class_num,
        'xshape': xshape
    }, logger)
    search_loader, train_loader, valid_loader = get_nas_search_loaders(
        train_data, valid_data, xargs.dataset, 'configs/nas-benchmark/',
        config.batch_size, xargs.workers)
    logger.log(
        '||||||| {:10s} ||||||| Search-Loader-Num={:}, batch size={:}'.format(
            xargs.dataset, len(search_loader), config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(
        xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    if xargs.model_config is None:
        model_config = dict2config(
            {
                'name': 'GDAS',
                'C': xargs.channel,
                'N': xargs.num_cells,
                'max_nodes': xargs.max_nodes,
                'num_classes': class_num,
                'space': search_space,
                'affine': False,
                'track_running_stats': bool(xargs.track_running_stats)
            }, None)
    else:
        model_config = load_config(
            xargs.model_config, {
                'num_classes': class_num,
                'space': search_space,
                'affine': False,
                'track_running_stats': bool(xargs.track_running_stats)
            }, None)
    search_model = get_cell_based_tiny_net(model_config)
    logger.log('search-model :\n{:}'.format(search_model))
    logger.log('model-config : {:}'.format(model_config))

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        search_model.get_weights(), config)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion   : {:}'.format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    logger.log('search-space [{:} ops] : {:}'.format(len(search_space),
                                                     search_space))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path(
        'info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()

    if False:  #last_info.exists(): # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch']
        checkpoint = torch.load(last_info['last_checkpoint'])
        search_model.load_state_dict(checkpoint['search_model'])
        w_scheduler.load_state_dict(checkpoint['w_scheduler'])
        w_optimizer.load_state_dict(checkpoint['w_optimizer'])
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies = 0, {'best': -1}

    if len(xargs.supernet_path) > 0:
        saved_info = torch.load(xargs.supernet_path)
        assert saved_info[
            'epoch'] == 'finished', "Epoch is not finished in this file"
        search_model.load_state_dict(saved_info['search_model'])
    else:
        # start training supernet
        start_time = time.time()
        train_shared_cnn(train_loader, network, criterion, w_scheduler,
                         w_optimizer, xargs.print_freq, logger, config,
                         start_epoch)
        logger.log(
            'Supernet trained. Time-cost = {:.1f} s'.format(time.time() -
                                                            start_time))
        # save supernetweight
        save_path = save_checkpoint(
            {
                'epoch': 'finished',  #epoch + 1,
                'args': deepcopy(xargs),
                'search_model': search_model.state_dict(),
                'w_optimizer': w_optimizer.state_dict(),
                'w_scheduler': w_scheduler.state_dict()
            },
            model_base_path,
            logger)
        last_info = save_checkpoint(
            {
                'epoch': 'finished',  #epoch + 1,
                'args': deepcopy(args),
                'last_checkpoint': save_path,
            },
            logger.path('info'),
            logger)

    search_start_time = time.time()
    searcher = search_model.getSearcher(network, train_loader, valid_loader,
                                        logger, config)
    best_cands, performance_dict, performance_trace = searcher.search()
    logger.log(
        'Architect Searched. Time-cost = {:.1f} s'.format(time.time() -
                                                          search_start_time))
    search_result = save_checkpoint(
        {
            'epoch': 'finished',  #epoch + 1,
            'args': deepcopy(args),
            'genotypes': best_cands,
            'performance_dict': performance_dict,
            'performance_trace': performance_trace
        },
        model_best_path,
        logger)

    logger.close()
예제 #23
0
def evaluate(args):
    if not args.cpu:
        assert torch.cuda.is_available(), 'CUDA is not available.'
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

    print('The image is {:}'.format(args.image))
    print('The model is {:}'.format(args.model))
    snapshot = Path(args.model)
    assert snapshot.exists(), 'The model path {:} does not exist'
    if args.cpu: snapshot = torch.load(snapshot, map_location='cpu')
    else: snapshot = torch.load(snapshot)

    mean_fill = tuple([int(x * 255) for x in [0.5, 0.5, 0.5]])
    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    param = snapshot['args']
    eval_transform = transforms.Compose([
        transforms.PreCrop(param.pre_crop_expand),
        transforms.TrainScale2WH((param.crop_width, param.crop_height)),
        transforms.ToTensor(), normalize
    ])

    net = models.__dict__[param.arch](param.modelconfig, None)

    if not args.cpu: net = net.cuda()
    weights = models.remove_module_dict(snapshot['state_dict'])
    net.load_state_dict(weights)

    dataset = datasets.GeneralDataset(eval_transform, param.sigma,
                                      param.downsample, param.heatmap_type,
                                      param.dataset_name)
    dataset.reset(param.num_pts)

    print('[{:}] prepare the input data'.format(time_string()))

    print("Using MT-CNN face detector.")
    try:
        face = utils.detect_face_mtcnn(args.image)
    except utils.mtcnn_detector.BBoxNotFound:
        print("MT-CNN detector failed! Using default bbox instead.")
        face = [153.08, 462., 607.78, 1040.42]

    [image, _, _, _, _, _,
     cropped_size], meta = dataset.prepare_input(args.image, face)
    print('[{:}] prepare the input data done'.format(time_string()))
    print('Net : \n{:}'.format(net))
    # network forward
    with torch.no_grad():
        if args.cpu: inputs = image.unsqueeze(0)
        else: inputs = image.unsqueeze(0).cuda()
        gan_output = (net.netG_A(inputs) + net.netG_B(inputs)) / 2
        gan_output = (gan_output * 0.5 + 0.5).squeeze(0).cpu().permute(
            1, 2, 0).numpy()
        Image.fromarray((gan_output * 255).astype(np.uint8)).save(
            args.save_path.replace(".jpg", ".gan.jpg"))
        batch_heatmaps, batch_locs, batch_scos, _ = net(inputs)
        #print ('input-shape : {:}'.format(inputs.shape))
        flops, params = get_model_infos(net, inputs.shape, None)
        print('\nIN-shape : {:}, FLOPs : {:} MB, Params : {:}.'.format(
            list(inputs.shape), flops, params))
        flops, params = get_model_infos(net, None, inputs)
        print('\nIN-shape : {:}, FLOPs : {:} MB, Params : {:}.'.format(
            list(inputs.shape), flops, params))
    print('[{:}] the network forward done'.format(time_string()))

    # obtain the locations on the image in the orignial size
    cpu = torch.device('cpu')
    np_batch_locs, np_batch_scos, cropped_size = batch_locs.to(
        cpu).numpy(), batch_scos.to(cpu).numpy(), cropped_size.numpy()
    locations, scores = np_batch_locs[0, :-1, :], np.expand_dims(
        np_batch_scos[0, :-1], -1)

    scale_h, scale_w = cropped_size[0] * 1. / inputs.size(
        -2), cropped_size[1] * 1. / inputs.size(-1)

    locations[:,
              0], locations[:, 1] = locations[:, 0] * scale_w + cropped_size[
                  2], locations[:, 1] * scale_h + cropped_size[3]
    prediction = np.concatenate((locations, scores), axis=1).transpose(1, 0)
    for i in range(param.num_pts):
        point = prediction[:, i]
        print(
            'The coordinate of {:02d}/{:02d}-th points : ({:.1f}, {:.1f}), score = {:.3f}'
            .format(i, param.num_pts, float(point[0]), float(point[1]),
                    float(point[2])))

    if args.save_path:
        image = draw_image_by_points(args.image, prediction, 1, (255, 0, 0),
                                     False, False)
        image.save(args.save_path)
        print('save image with landmarks into {:}'.format(args.save_path))
    print('finish san evaluation on a single image : {:}'.format(args.image))
예제 #24
0
def evaluate(args):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    print('The image is {:}'.format(args.image))
    print('The model is {:}'.format(args.model))
    snapshot = Path(args.model)
    assert snapshot.exists(), 'The model path {:} does not exist'
    print('The face bounding box is {:}'.format(args.face))
    assert len(args.face) == 4, 'Invalid face input : {:}'.format(args.face)
    snapshot = torch.load(snapshot)

    # General Data Argumentation
    mean_fill = tuple([int(x * 255) for x in [0.485, 0.456, 0.406]])
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    param = snapshot['args']
    eval_transform = transforms.Compose([
        transforms.PreCrop(param.pre_crop_expand),
        transforms.TrainScale2WH((param.crop_width, param.crop_height)),
        transforms.ToTensor(), normalize
    ])
    model_config = load_configure(param.model_config, None)
    dataset = Dataset(eval_transform, param.sigma, model_config.downsample,
                      param.heatmap_type, param.data_indicator)
    dataset.reset(param.num_pts)

    net = obtain_model(model_config, param.num_pts + 1)
    net = net.cuda()
    #import pdb; pdb.set_trace()
    try:
        weights = remove_module_dict(snapshot['detector'])
    except:
        weights = remove_module_dict(snapshot['state_dict'])
    net.load_state_dict(weights)
    print('Prepare input data')
    [image, _, _, _, _, _,
     cropped_size], meta = dataset.prepare_input(args.image, args.face)
    # network forward
    with torch.no_grad():
        inputs = image.unsqueeze(0).cuda()
        batch_heatmaps, batch_locs, batch_scos = net(inputs)
        flops, params = get_model_infos(net, inputs.shape)
        print('IN-shape : {:}, FLOPs : {:} MB, Params : {:} MB'.format(
            list(inputs.shape), flops, params))
    # obtain the locations on the image in the orignial size
    cpu = torch.device('cpu')
    np_batch_locs, np_batch_scos, cropped_size = batch_locs.to(
        cpu).numpy(), batch_scos.to(cpu).numpy(), cropped_size.numpy()
    locations, scores = np_batch_locs[0, :-1, :], np.expand_dims(
        np_batch_scos[0, :-1], -1)

    scale_h, scale_w = cropped_size[0] * 1. / inputs.size(
        -2), cropped_size[1] * 1. / inputs.size(-1)

    locations[:,
              0], locations[:, 1] = locations[:, 0] * scale_w + cropped_size[
                  2], locations[:, 1] * scale_h + cropped_size[3]
    prediction = np.concatenate((locations, scores), axis=1).transpose(1, 0)

    print('the coordinates for {:} facial landmarks:'.format(param.num_pts))
    for i in range(param.num_pts):
        point = prediction[:, i]
        print('the {:02d}/{:02d}-th point : ({:.1f}, {:.1f}), score = {:.2f}'.
              format(i, param.num_pts, float(point[0]), float(point[1]),
                     float(point[2])))

    if args.save:
        resize = 512
        image = draw_image_by_points(args.image, prediction, 2, (255, 0, 0),
                                     args.face, resize)
        image.save(args.save)
        print('save the visualization results into {:}'.format(args.save))
    else:
        print('ignore the visualization procedure')
예제 #25
0
def main(xargs):
    assert torch.cuda.is_available(), "CUDA is not available."
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    config = load_config(xargs.config_path, {
        "class_num": class_num,
        "xshape": xshape
    }, logger)
    search_loader, _, valid_loader = get_nas_search_loaders(
        train_data,
        valid_data,
        xargs.dataset,
        "configs/nas-benchmark/",
        (config.batch_size, config.test_batch_size),
        xargs.workers,
    )
    logger.log(
        "||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}"
        .format(xargs.dataset, len(search_loader), len(valid_loader),
                config.batch_size))
    logger.log("||||||| {:10s} ||||||| Config={:}".format(
        xargs.dataset, config))

    search_space = get_search_spaces("cell", xargs.search_space_name)
    if xargs.model_config is None:
        model_config = dict2config(
            dict(
                name="SETN",
                C=xargs.channel,
                N=xargs.num_cells,
                max_nodes=xargs.max_nodes,
                num_classes=class_num,
                space=search_space,
                affine=False,
                track_running_stats=bool(xargs.track_running_stats),
            ),
            None,
        )
    else:
        model_config = load_config(
            xargs.model_config,
            dict(
                num_classes=class_num,
                space=search_space,
                affine=False,
                track_running_stats=bool(xargs.track_running_stats),
            ),
            None,
        )
    logger.log("search space : {:}".format(search_space))
    search_model = get_cell_based_tiny_net(model_config)

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(
        search_model.get_alphas(),
        lr=xargs.arch_learning_rate,
        betas=(0.5, 0.999),
        weight_decay=xargs.arch_weight_decay,
    )
    logger.log("w-optimizer : {:}".format(w_optimizer))
    logger.log("a-optimizer : {:}".format(a_optimizer))
    logger.log("w-scheduler : {:}".format(w_scheduler))
    logger.log("criterion   : {:}".format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    logger.log("FLOP = {:.2f} M, Params = {:.2f} MB".format(flop, param))
    logger.log("search-space : {:}".format(search_space))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log("{:} create API = {:} done".format(time_string(), api))

    last_info, model_base_path, model_best_path = (
        logger.path("info"),
        logger.path("model"),
        logger.path("best"),
    )
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info["epoch"]
        checkpoint = torch.load(last_info["last_checkpoint"])
        genotypes = checkpoint["genotypes"]
        valid_accuracies = checkpoint["valid_accuracies"]
        search_model.load_state_dict(checkpoint["search_model"])
        w_scheduler.load_state_dict(checkpoint["w_scheduler"])
        w_optimizer.load_state_dict(checkpoint["w_optimizer"])
        a_optimizer.load_state_dict(checkpoint["a_optimizer"])
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        init_genotype, _ = get_best_arch(valid_loader, network,
                                         xargs.select_num)
        start_epoch, valid_accuracies, genotypes = 0, {
            "best": -1
        }, {
            -1: init_genotype
        }

    # start training
    start_time, search_time, epoch_time, total_epoch = (
        time.time(),
        AverageMeter(),
        AverageMeter(),
        config.epochs + config.warmup,
    )
    for epoch in range(start_epoch, total_epoch):
        w_scheduler.update(epoch, 0.0)
        need_time = "Time Left: {:}".format(
            convert_secs2time(epoch_time.val * (total_epoch - epoch), True))
        epoch_str = "{:03d}-{:03d}".format(epoch, total_epoch)
        logger.log("\n[Search the {:}-th epoch] {:}, LR={:}".format(
            epoch_str, need_time, min(w_scheduler.get_lr())))

        (
            search_w_loss,
            search_w_top1,
            search_w_top5,
            search_a_loss,
            search_a_top1,
            search_a_top5,
        ) = search_func(
            search_loader,
            network,
            criterion,
            w_scheduler,
            w_optimizer,
            a_optimizer,
            epoch_str,
            xargs.print_freq,
            logger,
        )
        search_time.update(time.time() - start_time)
        logger.log(
            "[{:}] search [base] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s"
            .format(epoch_str, search_w_loss, search_w_top1, search_w_top5,
                    search_time.sum))
        logger.log(
            "[{:}] search [arch] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%"
            .format(epoch_str, search_a_loss, search_a_top1, search_a_top5))

        genotype, temp_accuracy = get_best_arch(valid_loader, network,
                                                xargs.select_num)
        network.module.set_cal_mode("dynamic", genotype)
        valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(
            valid_loader, network, criterion)
        logger.log(
            "[{:}] evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}% | {:}"
            .format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5,
                    genotype))
        # search_model.set_cal_mode('urs')
        # valid_a_loss , valid_a_top1 , valid_a_top5  = valid_func(valid_loader, network, criterion)
        # logger.log('[{:}] URS---evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        # search_model.set_cal_mode('joint')
        # valid_a_loss , valid_a_top1 , valid_a_top5  = valid_func(valid_loader, network, criterion)
        # logger.log('[{:}] JOINT-evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        # search_model.set_cal_mode('select')
        # valid_a_loss , valid_a_top1 , valid_a_top5  = valid_func(valid_loader, network, criterion)
        # logger.log('[{:}] Selec-evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        # check the best accuracy
        valid_accuracies[epoch] = valid_a_top1

        genotypes[epoch] = genotype
        logger.log("<<<--->>> The {:}-th epoch : {:}".format(
            epoch_str, genotypes[epoch]))
        # save checkpoint
        save_path = save_checkpoint(
            {
                "epoch": epoch + 1,
                "args": deepcopy(xargs),
                "search_model": search_model.state_dict(),
                "w_optimizer": w_optimizer.state_dict(),
                "a_optimizer": a_optimizer.state_dict(),
                "w_scheduler": w_scheduler.state_dict(),
                "genotypes": genotypes,
                "valid_accuracies": valid_accuracies,
            },
            model_base_path,
            logger,
        )
        last_info = save_checkpoint(
            {
                "epoch": epoch + 1,
                "args": deepcopy(args),
                "last_checkpoint": save_path,
            },
            logger.path("info"),
            logger,
        )
        with torch.no_grad():
            logger.log("{:}".format(search_model.show_alphas()))
        if api is not None:
            logger.log("{:}".format(api.query_by_arch(genotypes[epoch],
                                                      "200")))
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    # the final post procedure : count the time
    start_time = time.time()
    genotype, temp_accuracy = get_best_arch(valid_loader, network,
                                            xargs.select_num)
    search_time.update(time.time() - start_time)
    network.module.set_cal_mode("dynamic", genotype)
    valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(
        valid_loader, network, criterion)
    logger.log(
        "Last : the gentotype is : {:}, with the validation accuracy of {:.3f}%."
        .format(genotype, valid_a_top1))

    logger.log("\n" + "-" * 100)
    # check the performance from the architecture dataset
    logger.log(
        "SETN : run {:} epochs, cost {:.1f} s, last-geno is {:}.".format(
            total_epoch, search_time.sum, genotype))
    if api is not None:
        logger.log("{:}".format(api.query_by_arch(genotype, "200")))
    logger.close()
예제 #26
0
def main(args):
    assert torch.cuda.is_available(), "CUDA is not available."
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True
    torch.set_num_threads(args.workers)

    prepare_seed(args.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(
        args.dataset, args.data_path, args.cutout_length
    )
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True,
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True,
    )
    # get configures
    model_config = load_config(args.model_config, {"class_num": class_num}, logger)
    optim_config = load_config(args.optim_config, {"class_num": class_num}, logger)

    if args.model_source == "normal":
        base_model = obtain_model(model_config)
    elif args.model_source == "nas":
        base_model = obtain_nas_infer_model(model_config, args.extra_model_path)
    elif args.model_source == "autodl-searched":
        base_model = obtain_model(model_config, args.extra_model_path)
    else:
        raise ValueError("invalid model-source : {:}".format(args.model_source))
    flop, param = get_model_infos(base_model, xshape)
    logger.log("model ====>>>>:\n{:}".format(base_model))
    logger.log("model information : {:}".format(base_model.get_message()))
    logger.log("-" * 50)
    logger.log(
        "Params={:.2f} MB, FLOPs={:.2f} M ... = {:.2f} G".format(
            param, flop, flop / 1e3
        )
    )
    logger.log("-" * 50)
    logger.log("train_data : {:}".format(train_data))
    logger.log("valid_data : {:}".format(valid_data))
    optimizer, scheduler, criterion = get_optim_scheduler(
        base_model.parameters(), optim_config
    )
    logger.log("optimizer  : {:}".format(optimizer))
    logger.log("scheduler  : {:}".format(scheduler))
    logger.log("criterion  : {:}".format(criterion))

    last_info, model_base_path, model_best_path = (
        logger.path("info"),
        logger.path("model"),
        logger.path("best"),
    )
    network, criterion = torch.nn.DataParallel(base_model).cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start".format(last_info)
        )
        last_infox = torch.load(last_info)
        start_epoch = last_infox["epoch"] + 1
        last_checkpoint_path = last_infox["last_checkpoint"]
        if not last_checkpoint_path.exists():
            logger.log(
                "Does not find {:}, try another path".format(last_checkpoint_path)
            )
            last_checkpoint_path = (
                last_info.parent
                / last_checkpoint_path.parent.name
                / last_checkpoint_path.name
            )
        checkpoint = torch.load(last_checkpoint_path)
        base_model.load_state_dict(checkpoint["base-model"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        valid_accuracies = checkpoint["valid_accuracies"]
        max_bytes = checkpoint["max_bytes"]
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch.".format(
                last_info, start_epoch
            )
        )
    elif args.resume is not None:
        assert Path(args.resume).exists(), "Can not find the resume file : {:}".format(
            args.resume
        )
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint["epoch"] + 1
        base_model.load_state_dict(checkpoint["base-model"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        valid_accuracies = checkpoint["valid_accuracies"]
        max_bytes = checkpoint["max_bytes"]
        logger.log(
            "=> loading checkpoint from '{:}' start with {:}-th epoch.".format(
                args.resume, start_epoch
            )
        )
    elif args.init_model is not None:
        assert Path(
            args.init_model
        ).exists(), "Can not find the initialization file : {:}".format(args.init_model)
        checkpoint = torch.load(args.init_model)
        base_model.load_state_dict(checkpoint["base-model"])
        start_epoch, valid_accuracies, max_bytes = 0, {"best": -1}, {}
        logger.log("=> initialize the model from {:}".format(args.init_model))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, max_bytes = 0, {"best": -1}, {}

    train_func, valid_func = get_procedures(args.procedure)

    total_epoch = optim_config.epochs + optim_config.warmup
    # Main Training and Evaluation Loop
    start_time = time.time()
    epoch_time = AverageMeter()
    for epoch in range(start_epoch, total_epoch):
        scheduler.update(epoch, 0.0)
        need_time = "Time Left: {:}".format(
            convert_secs2time(epoch_time.avg * (total_epoch - epoch), True)
        )
        epoch_str = "epoch={:03d}/{:03d}".format(epoch, total_epoch)
        LRs = scheduler.get_lr()
        find_best = False
        # set-up drop-out ratio
        if hasattr(base_model, "update_drop_path"):
            base_model.update_drop_path(
                model_config.drop_path_prob * epoch / total_epoch
            )
        logger.log(
            "\n***{:s}*** start {:s} {:s}, LR=[{:.6f} ~ {:.6f}], scheduler={:}".format(
                time_string(), epoch_str, need_time, min(LRs), max(LRs), scheduler
            )
        )

        # train for one epoch
        train_loss, train_acc1, train_acc5 = train_func(
            train_loader,
            network,
            criterion,
            scheduler,
            optimizer,
            optim_config,
            epoch_str,
            args.print_freq,
            logger,
        )
        # log the results
        logger.log(
            "***{:s}*** TRAIN [{:}] loss = {:.6f}, accuracy-1 = {:.2f}, accuracy-5 = {:.2f}".format(
                time_string(), epoch_str, train_loss, train_acc1, train_acc5
            )
        )

        # evaluate the performance
        if (epoch % args.eval_frequency == 0) or (epoch + 1 == total_epoch):
            logger.log("-" * 150)
            valid_loss, valid_acc1, valid_acc5 = valid_func(
                valid_loader,
                network,
                criterion,
                optim_config,
                epoch_str,
                args.print_freq_eval,
                logger,
            )
            valid_accuracies[epoch] = valid_acc1
            logger.log(
                "***{:s}*** VALID [{:}] loss = {:.6f}, accuracy@1 = {:.2f}, accuracy@5 = {:.2f} | Best-Valid-Acc@1={:.2f}, Error@1={:.2f}".format(
                    time_string(),
                    epoch_str,
                    valid_loss,
                    valid_acc1,
                    valid_acc5,
                    valid_accuracies["best"],
                    100 - valid_accuracies["best"],
                )
            )
            if valid_acc1 > valid_accuracies["best"]:
                valid_accuracies["best"] = valid_acc1
                find_best = True
                logger.log(
                    "Currently, the best validation accuracy found at {:03d}-epoch :: acc@1={:.2f}, acc@5={:.2f}, error@1={:.2f}, error@5={:.2f}, save into {:}.".format(
                        epoch,
                        valid_acc1,
                        valid_acc5,
                        100 - valid_acc1,
                        100 - valid_acc5,
                        model_best_path,
                    )
                )
            num_bytes = (
                torch.cuda.max_memory_cached(next(network.parameters()).device) * 1.0
            )
            logger.log(
                "[GPU-Memory-Usage on {:} is {:} bytes, {:.2f} KB, {:.2f} MB, {:.2f} GB.]".format(
                    next(network.parameters()).device,
                    int(num_bytes),
                    num_bytes / 1e3,
                    num_bytes / 1e6,
                    num_bytes / 1e9,
                )
            )
            max_bytes[epoch] = num_bytes
        if epoch % 10 == 0:
            torch.cuda.empty_cache()

        # save checkpoint
        save_path = save_checkpoint(
            {
                "epoch": epoch,
                "args": deepcopy(args),
                "max_bytes": deepcopy(max_bytes),
                "FLOP": flop,
                "PARAM": param,
                "valid_accuracies": deepcopy(valid_accuracies),
                "model-config": model_config._asdict(),
                "optim-config": optim_config._asdict(),
                "base-model": base_model.state_dict(),
                "scheduler": scheduler.state_dict(),
                "optimizer": optimizer.state_dict(),
            },
            model_base_path,
            logger,
        )
        if find_best:
            copy_checkpoint(model_base_path, model_best_path, logger)
        last_info = save_checkpoint(
            {
                "epoch": epoch,
                "args": deepcopy(args),
                "last_checkpoint": save_path,
            },
            logger.path("info"),
            logger,
        )

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log("\n" + "-" * 200)
    logger.log(
        "Finish training/validation in {:} with Max-GPU-Memory of {:.2f} MB, and save final checkpoint into {:}".format(
            convert_secs2time(epoch_time.sum, True),
            max(v for k, v in max_bytes.items()) / 1e6,
            logger.path("info"),
        )
    )
    logger.log("-" * 200 + "\n")
    logger.close()
예제 #27
0
def main(args):
    assert torch.cuda.is_available(), "CUDA is not available."
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True
    torch.set_num_threads(args.workers)

    prepare_seed(args.rand_seed)
    logger = prepare_logger(args)

    # prepare dataset
    train_data, valid_data, xshape, class_num = get_datasets(
        args.dataset, args.data_path, args.cutout_length)
    # train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True , num_workers=args.workers, pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True,
    )

    split_file_path = Path(args.split_path)
    assert split_file_path.exists(), "{:} does not exist".format(
        split_file_path)
    split_info = torch.load(split_file_path)

    train_split, valid_split = split_info["train"], split_info["valid"]
    assert (len(set(train_split).intersection(set(valid_split))) == 0
            ), "There should be 0 element that belongs to both train and valid"
    assert len(train_split) + len(valid_split) == len(
        train_data), "{:} + {:} vs {:}".format(len(train_split),
                                               len(valid_split),
                                               len(train_data))
    search_dataset = SearchDataset(args.dataset, train_data, train_split,
                                   valid_split)

    search_train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(train_split),
        pin_memory=True,
        num_workers=args.workers,
    )
    search_valid_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(valid_split),
        pin_memory=True,
        num_workers=args.workers,
    )
    search_loader = torch.utils.data.DataLoader(
        search_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True,
        sampler=None,
    )
    # get configures
    model_config = load_config(
        args.model_config,
        {
            "class_num": class_num,
            "search_mode": args.search_shape
        },
        logger,
    )

    # obtain the model
    search_model = obtain_search_model(model_config)
    MAX_FLOP, param = get_model_infos(search_model, xshape)
    optim_config = load_config(args.optim_config, {
        "class_num": class_num,
        "FLOP": MAX_FLOP
    }, logger)
    logger.log("Model Information : {:}".format(search_model.get_message()))
    logger.log("MAX_FLOP = {:} M".format(MAX_FLOP))
    logger.log("Params   = {:} M".format(param))
    logger.log("train_data : {:}".format(train_data))
    logger.log("search-data: {:}".format(search_dataset))
    logger.log("search_train_loader : {:} samples".format(len(train_split)))
    logger.log("search_valid_loader : {:} samples".format(len(valid_split)))
    base_optimizer, scheduler, criterion = get_optim_scheduler(
        search_model.base_parameters(), optim_config)
    arch_optimizer = torch.optim.Adam(
        search_model.arch_parameters(),
        lr=optim_config.arch_LR,
        betas=(0.5, 0.999),
        weight_decay=optim_config.arch_decay,
    )
    logger.log("base-optimizer : {:}".format(base_optimizer))
    logger.log("arch-optimizer : {:}".format(arch_optimizer))
    logger.log("scheduler      : {:}".format(scheduler))
    logger.log("criterion      : {:}".format(criterion))

    last_info, model_base_path, model_best_path = (
        logger.path("info"),
        logger.path("model"),
        logger.path("best"),
    )
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()

    # load checkpoint
    if last_info.exists() or (args.resume is not None and osp.isfile(
            args.resume)):  # automatically resume from previous checkpoint
        if args.resume is not None and osp.isfile(args.resume):
            resume_path = Path(args.resume)
        elif last_info.exists():
            resume_path = last_info
        else:
            raise ValueError("Something is wrong.")
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            resume_path))
        checkpoint = torch.load(resume_path)
        if "last_checkpoint" in checkpoint:
            last_checkpoint_path = checkpoint["last_checkpoint"]
            if not last_checkpoint_path.exists():
                logger.log("Does not find {:}, try another path".format(
                    last_checkpoint_path))
                last_checkpoint_path = (resume_path.parent /
                                        last_checkpoint_path.parent.name /
                                        last_checkpoint_path.name)
            assert (last_checkpoint_path.exists()
                    ), "can not find the checkpoint from {:}".format(
                        last_checkpoint_path)
            checkpoint = torch.load(last_checkpoint_path)
        start_epoch = checkpoint["epoch"] + 1
        search_model.load_state_dict(checkpoint["search_model"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        base_optimizer.load_state_dict(checkpoint["base_optimizer"])
        arch_optimizer.load_state_dict(checkpoint["arch_optimizer"])
        valid_accuracies = checkpoint["valid_accuracies"]
        arch_genotypes = checkpoint["arch_genotypes"]
        discrepancies = checkpoint["discrepancies"]
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(resume_path, start_epoch))
    else:
        logger.log(
            "=> do not find the last-info file : {:} or resume : {:}".format(
                last_info, args.resume))
        start_epoch, valid_accuracies, arch_genotypes, discrepancies = (
            0,
            {
                "best": -1
            },
            {},
            {},
        )

    # main procedure
    train_func, valid_func = get_procedures(args.procedure)
    total_epoch = optim_config.epochs + optim_config.warmup
    start_time, epoch_time = time.time(), AverageMeter()
    for epoch in range(start_epoch, total_epoch):
        scheduler.update(epoch, 0.0)
        search_model.set_tau(args.gumbel_tau_max, args.gumbel_tau_min,
                             epoch * 1.0 / total_epoch)
        need_time = "Time Left: {:}".format(
            convert_secs2time(epoch_time.avg * (total_epoch - epoch), True))
        epoch_str = "epoch={:03d}/{:03d}".format(epoch, total_epoch)
        LRs = scheduler.get_lr()
        find_best = False

        logger.log(
            "\n***{:s}*** start {:s} {:s}, LR=[{:.6f} ~ {:.6f}], scheduler={:}, tau={:}, FLOP={:.2f}"
            .format(
                time_string(),
                epoch_str,
                need_time,
                min(LRs),
                max(LRs),
                scheduler,
                search_model.tau,
                MAX_FLOP,
            ))

        # train for one epoch
        train_base_loss, train_arch_loss, train_acc1, train_acc5 = train_func(
            search_loader,
            network,
            criterion,
            scheduler,
            base_optimizer,
            arch_optimizer,
            optim_config,
            {
                "epoch-str": epoch_str,
                "FLOP-exp": MAX_FLOP * args.FLOP_ratio,
                "FLOP-weight": args.FLOP_weight,
                "FLOP-tolerant": MAX_FLOP * args.FLOP_tolerant,
            },
            args.print_freq,
            logger,
        )
        # log the results
        logger.log(
            "***{:s}*** TRAIN [{:}] base-loss = {:.6f}, arch-loss = {:.6f}, accuracy-1 = {:.2f}, accuracy-5 = {:.2f}"
            .format(
                time_string(),
                epoch_str,
                train_base_loss,
                train_arch_loss,
                train_acc1,
                train_acc5,
            ))
        cur_FLOP, genotype = search_model.get_flop("genotype",
                                                   model_config._asdict(),
                                                   None)
        arch_genotypes[epoch] = genotype
        arch_genotypes["last"] = genotype
        logger.log("[{:}] genotype : {:}".format(epoch_str, genotype))
        arch_info, discrepancy = search_model.get_arch_info()
        logger.log(arch_info)
        discrepancies[epoch] = discrepancy
        logger.log(
            "[{:}] FLOP : {:.2f} MB, ratio : {:.4f}, Expected-ratio : {:.4f}, Discrepancy : {:.3f}"
            .format(
                epoch_str,
                cur_FLOP,
                cur_FLOP / MAX_FLOP,
                args.FLOP_ratio,
                np.mean(discrepancy),
            ))

        # if cur_FLOP/MAX_FLOP > args.FLOP_ratio:
        #  init_flop_weight = init_flop_weight * args.FLOP_decay
        # else:
        #  init_flop_weight = init_flop_weight / args.FLOP_decay

        # evaluate the performance
        if (epoch % args.eval_frequency == 0) or (epoch + 1 == total_epoch):
            logger.log("-" * 150)
            valid_loss, valid_acc1, valid_acc5 = valid_func(
                search_valid_loader,
                network,
                criterion,
                epoch_str,
                args.print_freq_eval,
                logger,
            )
            valid_accuracies[epoch] = valid_acc1
            logger.log(
                "***{:s}*** VALID [{:}] loss = {:.6f}, accuracy@1 = {:.2f}, accuracy@5 = {:.2f} | Best-Valid-Acc@1={:.2f}, Error@1={:.2f}"
                .format(
                    time_string(),
                    epoch_str,
                    valid_loss,
                    valid_acc1,
                    valid_acc5,
                    valid_accuracies["best"],
                    100 - valid_accuracies["best"],
                ))
            if valid_acc1 > valid_accuracies["best"]:
                valid_accuracies["best"] = valid_acc1
                arch_genotypes["best"] = genotype
                find_best = True
                logger.log(
                    "Currently, the best validation accuracy found at {:03d}-epoch :: acc@1={:.2f}, acc@5={:.2f}, error@1={:.2f}, error@5={:.2f}, save into {:}."
                    .format(
                        epoch,
                        valid_acc1,
                        valid_acc5,
                        100 - valid_acc1,
                        100 - valid_acc5,
                        model_best_path,
                    ))

        # save checkpoint
        save_path = save_checkpoint(
            {
                "epoch": epoch,
                "args": deepcopy(args),
                "valid_accuracies": deepcopy(valid_accuracies),
                "model-config": model_config._asdict(),
                "optim-config": optim_config._asdict(),
                "search_model": search_model.state_dict(),
                "scheduler": scheduler.state_dict(),
                "base_optimizer": base_optimizer.state_dict(),
                "arch_optimizer": arch_optimizer.state_dict(),
                "arch_genotypes": arch_genotypes,
                "discrepancies": discrepancies,
            },
            model_base_path,
            logger,
        )
        if find_best:
            copy_checkpoint(model_base_path, model_best_path, logger)
        last_info = save_checkpoint(
            {
                "epoch": epoch,
                "args": deepcopy(args),
                "last_checkpoint": save_path,
            },
            logger.path("info"),
            logger,
        )

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log("")
    logger.log("-" * 100)
    last_config_path = logger.path("log") / "seed-{:}-last.config".format(
        args.rand_seed)
    configure2str(arch_genotypes["last"], str(last_config_path))
    logger.log("save the last config int {:} :\n{:}".format(
        last_config_path, arch_genotypes["last"]))

    best_arch, valid_acc = arch_genotypes["best"], valid_accuracies["best"]
    for key, config in arch_genotypes.items():
        if key == "last":
            continue
        FLOP_ratio = config["estimated_FLOP"] / MAX_FLOP
        if abs(FLOP_ratio - args.FLOP_ratio) <= args.FLOP_tolerant:
            if valid_acc < valid_accuracies[key]:
                best_arch, valid_acc = config, valid_accuracies[key]
    print("Best-Arch : {:}\nRatio={:}, Valid-ACC={:}".format(
        best_arch, best_arch["estimated_FLOP"] / MAX_FLOP, valid_acc))
    best_config_path = logger.path("log") / "seed-{:}-best.config".format(
        args.rand_seed)
    configure2str(best_arch, str(best_config_path))
    logger.log("save the last config int {:} :\n{:}".format(
        best_config_path, best_arch))
    logger.log("\n" + "-" * 200)
    logger.log(
        "Finish training/validation in {:}, and save final checkpoint into {:}"
        .format(convert_secs2time(epoch_time.sum, True), logger.path("info")))
    logger.close()
예제 #28
0
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    assert xargs.dataset == 'cifar10', 'currently only support CIFAR-10'
    if xargs.dataset == 'cifar10' or xargs.dataset == 'cifar100':
        split_Fpath = 'configs/nas-benchmark/cifar-split.txt'
        cifar_split = load_config(split_Fpath, None, None)
        train_split, valid_split = cifar_split.train, cifar_split.valid
        logger.log('Load split file from {:}'.format(split_Fpath))
    elif xargs.dataset.startswith('ImageNet16'):
        split_Fpath = 'configs/nas-benchmark/{:}-split.txt'.format(
            xargs.dataset)
        imagenet16_split = load_config(split_Fpath, None, None)
        train_split, valid_split = imagenet16_split.train, imagenet16_split.valid
        logger.log('Load split file from {:}'.format(split_Fpath))
    else:
        raise ValueError('invalid dataset : {:}'.format(xargs.dataset))
    #config_path = 'configs/nas-benchmark/algos/SETN.config'
    config = load_config(xargs.config_path, {
        'class_num': class_num,
        'xshape': xshape
    }, logger)
    # To split data
    train_data_v2 = deepcopy(train_data)
    train_data_v2.transform = valid_data.transform
    valid_data = train_data_v2
    search_data = SearchDataset(xargs.dataset, train_data, train_split,
                                valid_split)
    # data loader
    search_loader = torch.utils.data.DataLoader(search_data,
                                                batch_size=config.batch_size,
                                                shuffle=True,
                                                num_workers=xargs.workers,
                                                pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=config.test_batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(valid_split),
        num_workers=xargs.workers,
        pin_memory=True)
    logger.log(
        '||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}'
        .format(xargs.dataset, len(search_loader), len(valid_loader),
                config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(
        xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    model_config = dict2config(
        {
            'name': 'SETN',
            'C': xargs.channel,
            'N': xargs.num_cells,
            'max_nodes': xargs.max_nodes,
            'num_classes': class_num,
            'space': search_space,
            'affine': False,
            'track_running_stats': bool(xargs.track_running_stats)
        }, None)
    logger.log('search space : {:}'.format(search_space))
    search_model = get_cell_based_tiny_net(model_config)

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(),
                                   lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999),
                                   weight_decay=xargs.arch_weight_decay)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion   : {:}'.format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    #logger.log('{:}'.format(search_model))
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    logger.log('search-space : {:}'.format(search_space))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path(
        'info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch']
        checkpoint = torch.load(last_info['last_checkpoint'])
        genotypes = checkpoint['genotypes']
        valid_accuracies = checkpoint['valid_accuracies']
        search_model.load_state_dict(checkpoint['search_model'])
        w_scheduler.load_state_dict(checkpoint['w_scheduler'])
        w_optimizer.load_state_dict(checkpoint['w_optimizer'])
        a_optimizer.load_state_dict(checkpoint['a_optimizer'])
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, genotypes = 0, {'best': -1}, {}

    # start training
    start_time, search_time, epoch_time, total_epoch = time.time(
    ), AverageMeter(), AverageMeter(), config.epochs + config.warmup
    for epoch in range(start_epoch, total_epoch):
        w_scheduler.update(epoch, 0.0)
        need_time = 'Time Left: {:}'.format(
            convert_secs2time(epoch_time.val * (total_epoch - epoch), True))
        epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch)
        logger.log('\n[Search the {:}-th epoch] {:}, LR={:}'.format(
            epoch_str, need_time, min(w_scheduler.get_lr())))

        search_w_loss, search_w_top1, search_w_top5, search_a_loss, search_a_top1, search_a_top5 \
                    = search_func(search_loader, network, criterion, w_scheduler, w_optimizer, a_optimizer, epoch_str, xargs.print_freq, logger)
        search_time.update(time.time() - start_time)
        logger.log(
            '[{:}] search [base] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s'
            .format(epoch_str, search_w_loss, search_w_top1, search_w_top5,
                    search_time.sum))
        logger.log(
            '[{:}] search [arch] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'
            .format(epoch_str, search_a_loss, search_a_top1, search_a_top5))

        genotype, temp_accuracy = get_best_arch(valid_loader, network,
                                                xargs.select_num)
        network.module.set_cal_mode('dynamic', genotype)
        valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(
            valid_loader, network, criterion)
        logger.log(
            '[{:}] evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}% | {:}'
            .format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5,
                    genotype))
        #search_model.set_cal_mode('urs')
        #valid_a_loss , valid_a_top1 , valid_a_top5  = valid_func(valid_loader, network, criterion)
        #logger.log('[{:}] URS---evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        #search_model.set_cal_mode('joint')
        #valid_a_loss , valid_a_top1 , valid_a_top5  = valid_func(valid_loader, network, criterion)
        #logger.log('[{:}] JOINT-evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        #search_model.set_cal_mode('select')
        #valid_a_loss , valid_a_top1 , valid_a_top5  = valid_func(valid_loader, network, criterion)
        #logger.log('[{:}] Selec-evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        # check the best accuracy
        valid_accuracies[epoch] = valid_a_top1

        genotypes[epoch] = genotype
        logger.log('<<<--->>> The {:}-th epoch : {:}'.format(
            epoch_str, genotypes[epoch]))
        # save checkpoint
        save_path = save_checkpoint(
            {
                'epoch': epoch + 1,
                'args': deepcopy(xargs),
                'search_model': search_model.state_dict(),
                'w_optimizer': w_optimizer.state_dict(),
                'a_optimizer': a_optimizer.state_dict(),
                'w_scheduler': w_scheduler.state_dict(),
                'genotypes': genotypes,
                'valid_accuracies': valid_accuracies
            }, model_base_path, logger)
        last_info = save_checkpoint(
            {
                'epoch': epoch + 1,
                'args': deepcopy(args),
                'last_checkpoint': save_path,
            }, logger.path('info'), logger)
        with torch.no_grad():
            logger.log('arch-parameters :\n{:}'.format(
                nn.functional.softmax(search_model.arch_parameters,
                                      dim=-1).cpu()))
        if api is not None:
            logger.log('{:}'.format(api.query_by_arch(genotypes[epoch])))
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    # the final post procedure : count the time
    start_time = time.time()
    genotype, temp_accuracy = get_best_arch(valid_loader, network,
                                            xargs.select_num)
    search_time.update(time.time() - start_time)
    network.module.set_cal_mode('dynamic', genotype)
    valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(
        valid_loader, network, criterion)
    logger.log(
        'Last : the gentotype is : {:}, with the validation accuracy of {:.3f}%.'
        .format(genotype, valid_a_top1))

    logger.log('\n' + '-' * 100)
    # check the performance from the architecture dataset
    logger.log(
        'SETN : run {:} epochs, cost {:.1f} s, last-geno is {:}.'.format(
            total_epoch, search_time.sum, genotype))
    if api is not None: logger.log('{:}'.format(api.query_by_arch(genotype)))
    logger.close()
예제 #29
0
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    #config_path = 'configs/nas-benchmark/algos/GDAS.config'
    config = load_config(xargs.config_path, {
        'class_num': class_num,
        'xshape': xshape
    }, logger)
    search_loader, _, valid_loader = get_nas_search_loaders(
        train_data, valid_data, xargs.dataset, 'configs/nas-benchmark/',
        config.batch_size, xargs.workers)
    logger.log(
        '||||||| {:10s} ||||||| Search-Loader-Num={:}, batch size={:}'.format(
            xargs.dataset, len(search_loader), config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(
        xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    if xargs.model_config is None:
        model_config = dict2config(
            {
                'name': 'GDAS',
                'C': xargs.channel,
                'N': xargs.num_cells,
                'max_nodes': xargs.max_nodes,
                'num_classes': class_num,
                'space': search_space,
                'affine': False,
                'track_running_stats': bool(xargs.track_running_stats)
            }, None)
    else:
        model_config = load_config(
            xargs.model_config, {
                'num_classes': class_num,
                'space': search_space,
                'affine': False,
                'track_running_stats': bool(xargs.track_running_stats)
            }, None)
    search_model = get_cell_based_tiny_net(model_config)
    logger.log('search-model :\n{:}'.format(search_model))
    logger.log('model-config : {:}'.format(model_config))

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(),
                                   lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999),
                                   weight_decay=xargs.arch_weight_decay)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion   : {:}'.format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    logger.log('search-space [{:} ops] : {:}'.format(len(search_space),
                                                     search_space))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path(
        'info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch']
        checkpoint = torch.load(last_info['last_checkpoint'])
        genotypes = checkpoint['genotypes']
        valid_accuracies = checkpoint['valid_accuracies']
        search_model.load_state_dict(checkpoint['search_model'])
        w_scheduler.load_state_dict(checkpoint['w_scheduler'])
        w_optimizer.load_state_dict(checkpoint['w_optimizer'])
        a_optimizer.load_state_dict(checkpoint['a_optimizer'])
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, genotypes = 0, {
            'best': -1
        }, {
            -1: search_model.genotype()
        }

    # start training
    start_time, search_time, epoch_time, total_epoch = time.time(
    ), AverageMeter(), AverageMeter(), config.epochs + config.warmup
    for epoch in range(start_epoch, total_epoch):
        w_scheduler.update(epoch, 0.0)
        need_time = 'Time Left: {:}'.format(
            convert_secs2time(epoch_time.val * (total_epoch - epoch), True))
        epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch)
        search_model.set_tau(xargs.tau_max -
                             (xargs.tau_max - xargs.tau_min) * epoch /
                             (total_epoch - 1))
        logger.log('\n[Search the {:}-th epoch] {:}, tau={:}, LR={:}'.format(
            epoch_str, need_time, search_model.get_tau(),
            min(w_scheduler.get_lr())))

        search_w_loss, search_w_top1, search_w_top5, valid_a_loss , valid_a_top1 , valid_a_top5 \
                  = search_func(search_loader, network, criterion, w_scheduler, w_optimizer, a_optimizer, epoch_str, xargs.print_freq, logger)
        search_time.update(time.time() - start_time)
        logger.log(
            '[{:}] searching : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s'
            .format(epoch_str, search_w_loss, search_w_top1, search_w_top5,
                    search_time.sum))
        logger.log(
            '[{:}] evaluate  : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'
            .format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))
        # check the best accuracy
        valid_accuracies[epoch] = valid_a_top1
        if valid_a_top1 > valid_accuracies['best']:
            valid_accuracies['best'] = valid_a_top1
            genotypes['best'] = search_model.genotype()
            find_best = True
        else:
            find_best = False

        genotypes[epoch] = search_model.genotype()
        logger.log('<<<--->>> The {:}-th epoch : {:}'.format(
            epoch_str, genotypes[epoch]))
        # save checkpoint
        save_path = save_checkpoint(
            {
                'epoch': epoch + 1,
                'args': deepcopy(xargs),
                'search_model': search_model.state_dict(),
                'w_optimizer': w_optimizer.state_dict(),
                'a_optimizer': a_optimizer.state_dict(),
                'w_scheduler': w_scheduler.state_dict(),
                'genotypes': genotypes,
                'valid_accuracies': valid_accuracies
            }, model_base_path, logger)
        last_info = save_checkpoint(
            {
                'epoch': epoch + 1,
                'args': deepcopy(args),
                'last_checkpoint': save_path,
            }, logger.path('info'), logger)
        if find_best:
            logger.log(
                '<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%.'
                .format(epoch_str, valid_a_top1))
            copy_checkpoint(model_base_path, model_best_path, logger)
        with torch.no_grad():
            logger.log('{:}'.format(search_model.show_alphas()))
        if api is not None:
            logger.log('{:}'.format(api.query_by_arch(genotypes[epoch],
                                                      '200')))
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log('\n' + '-' * 100)
    # check the performance from the architecture dataset
    logger.log(
        'GDAS : run {:} epochs, cost {:.1f} s, last-geno is {:}.'.format(
            total_epoch, search_time.sum, genotypes[total_epoch - 1]))
    if api is not None:
        logger.log('{:}'.format(
            api.query_by_arch(genotypes[total_epoch - 1], '200')))
    logger.close()
예제 #30
0
def evaluate_for_seed(arch_config, opt_config, train_loader, valid_loaders,
                      seed: int, logger):

    prepare_seed(seed)  # random seed
    net = get_cell_based_tiny_net(arch_config)
    # net = TinyNetwork(arch_config['channel'], arch_config['num_cells'], arch, config.class_num)
    flop, param = get_model_infos(net, opt_config.xshape)
    logger.log("Network : {:}".format(net.get_message()), False)
    logger.log(
        "{:} Seed-------------------------- {:} --------------------------".
        format(time_string(), seed))
    logger.log("FLOP = {:} MB, Param = {:} MB".format(flop, param))
    # train and valid
    optimizer, scheduler, criterion = get_optim_scheduler(
        net.parameters(), opt_config)
    default_device = torch.cuda.current_device()
    network = torch.nn.DataParallel(net,
                                    device_ids=[default_device
                                                ]).cuda(device=default_device)
    criterion = criterion.cuda(device=default_device)
    # start training
    start_time, epoch_time, total_epoch = (
        time.time(),
        AverageMeter(),
        opt_config.epochs + opt_config.warmup,
    )
    (
        train_losses,
        train_acc1es,
        train_acc5es,
        valid_losses,
        valid_acc1es,
        valid_acc5es,
    ) = ({}, {}, {}, {}, {}, {})
    train_times, valid_times, lrs = {}, {}, {}
    for epoch in range(total_epoch):
        scheduler.update(epoch, 0.0)
        lr = min(scheduler.get_lr())
        train_loss, train_acc1, train_acc5, train_tm = procedure(
            train_loader, network, criterion, scheduler, optimizer, "train")
        train_losses[epoch] = train_loss
        train_acc1es[epoch] = train_acc1
        train_acc5es[epoch] = train_acc5
        train_times[epoch] = train_tm
        lrs[epoch] = lr
        with torch.no_grad():
            for key, xloder in valid_loaders.items():
                valid_loss, valid_acc1, valid_acc5, valid_tm = procedure(
                    xloder, network, criterion, None, None, "valid")
                valid_losses["{:}@{:}".format(key, epoch)] = valid_loss
                valid_acc1es["{:}@{:}".format(key, epoch)] = valid_acc1
                valid_acc5es["{:}@{:}".format(key, epoch)] = valid_acc5
                valid_times["{:}@{:}".format(key, epoch)] = valid_tm

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()
        need_time = "Time Left: {:}".format(
            convert_secs2time(epoch_time.avg * (total_epoch - epoch - 1),
                              True))
        logger.log(
            "{:} {:} epoch={:03d}/{:03d} :: Train [loss={:.5f}, acc@1={:.2f}%, acc@5={:.2f}%] Valid [loss={:.5f}, acc@1={:.2f}%, acc@5={:.2f}%], lr={:}"
            .format(
                time_string(),
                need_time,
                epoch,
                total_epoch,
                train_loss,
                train_acc1,
                train_acc5,
                valid_loss,
                valid_acc1,
                valid_acc5,
                lr,
            ))
    info_seed = {
        "flop": flop,
        "param": param,
        "arch_config": arch_config._asdict(),
        "opt_config": opt_config._asdict(),
        "total_epoch": total_epoch,
        "train_losses": train_losses,
        "train_acc1es": train_acc1es,
        "train_acc5es": train_acc5es,
        "train_times": train_times,
        "valid_losses": valid_losses,
        "valid_acc1es": valid_acc1es,
        "valid_acc5es": valid_acc5es,
        "valid_times": valid_times,
        "learning_rates": lrs,
        "net_state_dict": net.state_dict(),
        "net_string": "{:}".format(net),
        "finish-train": True,
    }
    return info_seed