Example #1
def main(args):

    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # if search_resume is True, resume configs and checkpoint from the existing files.
    if args.search_resume:
        # args.resume_file is the path to the existing .../EXP-time directory.
        # The resumed experiment is written into a new directory rather than the original one.
        # resume configs
        assert os.path.exists(
            args.resume_file
        ), 'cannot find the resume file {:}, please re-check'.format(
            args.resume_file)
        config_file_path = os.path.join(args.resume_file, 'search.config')
        assert os.path.exists(
            config_file_path
        ), "the config file path {:} does not exist".format(
            config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict, 'search')
        # initialize a new EXP directory
        resume_EXP_time = config_dict['path'].split('/')[-1]
        resume_exp_name = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(
            args.path, args.exp_name, EXP_time +
            '-resume-{:}'.format(resume_exp_name + '-' + resume_EXP_time))
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
        #save_configs(args.__dict__, args.path, 'search')
        #logger = prepare_logger(args)
        #logger.log("=> loading configs from the file '{:}' start.".format(args.resume_file), mode='info')
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)
    else:
        # training initialization
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')

    # weight optimizer config, related to network_weight_optimizer, scheduler, and criterion
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None
    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {
            'milestones': args.milestones,
            'gammas': args.gammas
        }
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None
    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None
    # weight_optimizer_config, used in run_manager to get weight_optimizer, scheduler, and criterion.
    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'warmup_epoch': args.warmup_epochs,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }
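    # Note (assumption): RunConfig/RunManager presumably consume this dict to build the actual
    # torch.optim optimizer, LR scheduler, and (label-smoothed) cross-entropy criterion;
    # that construction code is not shown in this listing.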
    # arch_optimizer_config
    if args.arch_optimizer_type == 'adam':
        args.arch_optimizer_params = {
            'betas': (args.arch_adam_beta1, args.arch_adam_beta2),
            'eps': args.arch_adam_eps
        }
    else:
        args.arch_optimizer_params = None
    # related to entropy constraint loss
    # TODO: note that separate lambdas are used for cell_entropy and network_entropy.
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2,
        }
    elif args.reg_loss_type == 'add#linear#linearschedule':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2,
        }
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta
        }
    else:
        args.reg_loss_params = None
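    # Note (assumption): for the 'add#...' variants the entropy regularizer is presumably added to
    # the task loss as lambda1 * cell_entropy + lambda2 * network_entropy, while 'mul#log' scales
    # the loss multiplicatively; the exact formula lives in the loss code, which is not shown here.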
    # perform config save, for run_configs and arch_search_configs
    save_configs(args.__dict__, args.path, 'search')
    logger = prepare_logger(args)
    logger.log("=> loading configs from the file '{:}' start.".format(
        args.resume_file) if args.search_resume else
               '=> train-search phase initialization done',
               mode='info')

    #print(args.optimizer_config)
    run_config = RunConfig(**args.__dict__)
    arch_search_config = ArchSearchConfig(**args.__dict__)

    # args.bn_momentum and args.bn_eps are not used

    super_network = GumbelAutoDeepLab(args.filter_multiplier,
                                      args.block_multiplier,
                                      args.steps,
                                      args.nb_classes,
                                      args.nb_layers,
                                      args.bn_momentum,
                                      args.bn_eps,
                                      args.search_space,
                                      logger,
                                      affine=False)

    # calculate init entropy
    _, network_index = super_network.get_network_arch_hardwts(
    )  # set self.hardwts again
    _, aspp_index = super_network.get_aspp_hardwts_index()
    single_path = super_network.sample_single_path(args.nb_layers, aspp_index,
                                                   network_index)
    cell_arch_entropy, network_arch_entropy, entropy = super_network.calculate_entropy(
        single_path)
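    # The lines above sample one discrete path through the supernetwork and measure the entropy
    # of the current architecture distribution along it; it is only logged as an initial statistic.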

    logger.log('=> entropy : {:}'.format(entropy), mode='info')

    vis_init_params = {
        'cell_entropy': cell_arch_entropy,
        'network_entropy': network_arch_entropy,
        'entropy': entropy,
    }
    #vis_elements = args.elements
    #vis_elements.extend(['cell_entropy', 'network_entropy', 'entropy'])
    #args.elements = vis_elements
    args.vis_init_params = vis_init_params
    if args.open_vis:
        vis = visdomer(args.port,
                       args.server,
                       args.exp_name,
                       args.compare_phase,
                       args.elements,
                       init_params=args.vis_init_params)
    else:
        vis = None
    '''
    from exp.autodeeplab.auto_deeplab import AutoDeeplab
    super_network = AutoDeeplab(args.filter_multiplier, args.block_multiplier, args.steps,
                                args.nb_classes, args.nb_layers, args.search_space, logger, affine=False)
    '''
    '''
    from exp.fixed_network_level.supernetwork import FixedNetwork
    super_network = FixedNetwork(args.filter_multiplier, args.block_multiplier, args.steps, args.nb_classes,
                                 args.nb_layers, args.search_space, logger, affine=False)
    '''
    arch_search_run_manager = ArchSearchRunManager(args.path, super_network,
                                                   run_config,
                                                   arch_search_config, logger,
                                                   vis)
    display_all_families_information(args, 'search', arch_search_run_manager,
                                     logger)
    '''
    # get_model_infos, perform inference
    # TODO: modify the way of forward into gdas_forward
    flop, param = get_model_infos(super_network, [1, 3, 512, 512])
    print('||||||| FLOPS & PARAMS |||||||')
    print('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    '''
    # 1. resume warmup phase
    # 2. resume search phase
    # 3. add last_info log -- not last_info: the saved file name is not consistent across runs,
    #    so resume_file has to be given explicitly

    # 1. given the EXP directory time   completed :: resume_file :: -> EXP-time
    # 2. load and resume configs        completed
    # 3. resume checkpoint              completed

    # TODO: there is an issue with the resume semantics: after resuming, more GPU memory is
    # allocated than in a normal run, which can raise OOM in the search phase.

    if args.search_resume:
        if os.path.exists(args.resume_file):  # resume_file :: path to EXP-time
            logger.log("=> loading checkpoint of the file '{:}' start".format(
                args.resume_file),
                       mode='info')
            warm_up_checkpoint = os.path.join(
                args.resume_file, 'checkpoints',
                'seed-{:}-warm.pth'.format(args.random_seed))
            search_checkpoint = os.path.join(
                args.resume_file, 'checkpoints',
                'seed-{:}-search.pth'.format(args.random_seed))
            if not args.resume_from_warmup:  # resume checkpoint in search phase
                checkpoint = torch.load(search_checkpoint)
                super_network.load_state_dict(checkpoint['state_dict'])
                arch_search_run_manager.run_manager.optimizer.load_state_dict(
                    checkpoint['weight_optimizer'])
                arch_search_run_manager.run_manager.scheduler.load_state_dict(
                    checkpoint['weight_scheduler'])
                arch_search_run_manager.arch_optimizer.load_state_dict(
                    checkpoint['arch_optimizer'])
                arch_search_run_manager.run_manager.monitor_metric = checkpoint[
                    'best_monitor'][0]
                arch_search_run_manager.run_manager.best_monitor = checkpoint[
                    'best_monitor'][1]
                arch_search_run_manager.warmup = checkpoint['warmup']
                arch_search_run_manager.start_epoch = checkpoint[
                    'start_epochs']  # note: start_epochs and warmup_epoch are handled in nas_manager
                logger.log(
                    "=> loaded checkpoint '{:}', starting from epoch {:} of the search phase"
                    .format(search_checkpoint, checkpoint['start_epochs']),
                    mode='info')
            else:  # resume checkpoint in warmup phase
                checkpoint = torch.load(warm_up_checkpoint)
                super_network.load_state_dict(checkpoint['state_dict'])
                arch_search_run_manager.run_manager.optimizer.load_state_dict(
                    checkpoint['weight_optimizer'])
                arch_search_run_manager.run_manager.scheduler.load_state_dict(
                    checkpoint['weight_scheduler'])
                arch_search_run_manager.warmup = checkpoint['warmup']
                arch_search_run_manager.warmup_epoch = checkpoint[
                    'warmup_epoch']
                logger.log(
                    "=> loaded checkpoint '{:}', starting from epoch {:} of the warmup phase"
                    .format(warm_up_checkpoint, checkpoint['warmup_epoch']),
                    mode='info')
        else:
            logger.log(
                "=> cannot find the file {:}, please re-check it\n"
                "=> starting warm-up and search from scratch ...".format(
                    args.resume_file),
                mode='info')
    else:
        logger.log("=> start warm-up and search from scratch... ...",
                   mode='info')

    # torch.autograd.set_detect_anomaly(True)
    # warm up phase
    if arch_search_run_manager.warmup:
        arch_search_run_manager.warm_up(warmup_epochs=args.warmup_epochs)
    # train search phase
    arch_search_run_manager.train()

    logger.close()
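
Note: configs_resume, save_configs, time_for_file, and create_exp_dir are project helpers that are not shown in this listing. As a rough, hypothetical sketch, configs_resume might simply copy the saved JSON entries back onto the argparse namespace while keeping the flags that describe the current run:

def configs_resume(args, config_dict, phase):
    # Hypothetical sketch -- the real helper lives in the Efficient_AutoDeeplab project.
    # `phase` ('search' / 'retrain') is accepted for parity with the calls above but unused here.
    # Keys describing the *current* run must not be overwritten by the old config.
    keep = {'path', 'resume_file', 'search_resume', 'retrain_resume', 'evaluation'}
    for key, value in config_dict.items():
        if key not in keep and hasattr(args, key):
            setattr(args, key, value)  # restore the setting saved for this phase
    return args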
Example #2
File: train.py  Project: VITA-Group/ASG
def main():
    global args, best_prec1
    PID = os.getpid()
    args = parser.parse_args()
    prepare_seed(args.rand_seed)

    if args.timestamp == 'none':
        args.timestamp = "{:}".format(time.strftime('%h-%d-%C_%H-%M-%s', time.gmtime(time.time())))

    # Log outputs
    if args.evaluate:
        args.save_dir = args.save_dir + "/Visda17-Res101-evaluate" + \
            "%s/%s"%('/'+args.resume if args.resume != 'none' else '', args.timestamp)
    else:
        args.save_dir = args.save_dir + \
            "/Visda17-Res101-%s-train.%s-LR%.2E-epoch%d-batch%d-seed%d"%(
                   "LWF%.2f"%args.lwf if args.lwf > 0 else "XE", args.train_blocks, args.lr, args.epochs, args.batch_size, args.rand_seed) + \
            "%s/%s"%('/'+args.resume if args.resume != 'none' else '', args.timestamp)
    logger = prepare_logger(args)

    data_transforms = {
        'train': transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ]),
        'val': transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ]),
    }

    kwargs = {'num_workers': 20, 'pin_memory': True}
    trainset = VisDA17(txt_file=os.path.join(args.data, "train/image_list.txt"), root_dir=os.path.join(args.data, "train"), transform=data_transforms['train'])
    valset = VisDA17(txt_file=os.path.join(args.data, "validation/image_list.txt"), root_dir=os.path.join(args.data, "validation"), transform=data_transforms['val'], label_one_hot=True)
    train_loader = DataLoader(trainset, batch_size=args.batch_size, shuffle=True, **kwargs)
    val_loader = DataLoader(valset, batch_size=args.batch_size, shuffle=False, **kwargs)

    model = resnet101(pretrained=True)
    num_ftrs = model.fc.in_features
    fc_layers = nn.Sequential(
        nn.Linear(num_ftrs, 512),
        nn.ReLU(inplace=True),
        nn.Linear(512, args.num_class),
    )
    model.fc_new = fc_layers
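    # Note: this assumes the resnet101 variant used here routes its forward pass through fc_new;
    # the original fc is frozen below and kept mainly so that pretrained weights still load.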

    train_blocks = args.train_blocks.split('.')
    # by default, freeze the original fc and train fc_new
    for param in model.fc.parameters():
        param.requires_grad = False
    ##### Freeze several bottom layers (Optional) #####
    non_train_blocks = ['conv1', 'bn1', 'layer1', 'layer2', 'layer3', 'layer4', 'fc']
    for name in train_blocks:
        try:
            non_train_blocks.remove(name)
        except Exception:
            print("cannot find block name %s\nAvailable blocks are: conv1, bn1, layer1, layer2, layer3, layer4, fc"%name)
    for name in non_train_blocks:
        for param in getattr(model, name).parameters():
            param.requires_grad = False

    # Setup optimizer
    factor = 0.1
    sgd_in = []
    for name in train_blocks:
        if name != 'fc':
            sgd_in.append({'params': get_params(model, [name]), 'lr': factor*args.lr})
        else:
            sgd_in.append({'params': get_params(model, ["fc_new"]), 'lr': args.lr})
    base_lrs = [ group['lr'] for group in sgd_in ]
    optimizer = SGD(sgd_in, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
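    # Each entry in sgd_in becomes one SGD parameter group: backbone blocks train at factor*lr
    # while the new classifier head uses the full lr; base_lrs keeps these per-group values so
    # the training loop can presumably rescale them when adjust_lr=True.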

    # Optionally resume from a checkpoint
    if args.resume != 'none':
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=ImageClassdata> no checkpoint found at '{}'".format(args.resume))

    model = model.cuda()

    model_old = None
    if args.lwf > 0:
        # create a frozen copy of the pretrained model for LwF (learning without forgetting)
        model_old = resnet101(pretrained=True)
        for param in model_old.parameters():
            param.requires_grad = False
        model_old.eval()
        model_old.cuda()

    if args.evaluate:
        prec1 = validate(val_loader, model)
        print(prec1)
        exit(0)

    # Main training loop
    iter_max = args.epochs * len(train_loader)
    iter_stat = IterNums(iter_max)
    for epoch in range(args.start_epoch, args.epochs):
        print("<< ============== JOB (PID = %d) %s ============== >>"%(PID, args.save_dir))
        logger.log("Epoch: %d"%(epoch+1))
        # train for one epoch
        train(train_loader, model, optimizer, base_lrs, iter_stat, epoch, logger.writer, model_old=model_old, adjust_lr=True)

        # evaluate on validation set
        prec1 = validate(val_loader, model)
        logger.writer.add_scalar("prec", prec1, epoch)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(args.save_dir, {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        }, is_best)
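        # save_checkpoint (project helper, not shown) presumably writes the state dict under
        # save_dir and keeps a separate copy of the best-performing model when is_best is True.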

    logging.info('Best accuracy: {prec1:.3f}'.format(prec1=best_prec1))
Example #3
File: l2o_train.py  Project: NVlabs/ASG
def main():
    args = get_args()
    PID = os.getpid()
    print("<< ============== JOB (PID = %d) %s ============== >>" %
          (PID, args.save_dir))
    prepare_seed(args.rand_seed)

    if args.timestamp == 'none':
        args.timestamp = "{:}".format(
            time.strftime('%h-%d-%C_%H-%M-%s', time.gmtime(time.time())))

    torch.set_num_threads(1)

    # Log outputs
    args.save_dir = args.save_dir + \
        "/Visda17-L2O.train.Res101-%s-train.%s-LR%.2E-epoch%d-batch%d-seed%d"%(
               "LWF" if args.lwf > 0 else "XE", args.train_blocks, args.lr, args.epochs, args.batch_size, args.rand_seed) + \
        "%s/%s"%('/'+args.resume if args.resume != 'none' else '', args.timestamp)
    logger = prepare_logger(args)

    best_prec1 = 0

    #### preparation ###########################################
    data_transforms = {
        'train':
        transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ]),
        'val':
        transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ]),
    }

    kwargs = {'num_workers': 20, 'pin_memory': True}
    trainset = VisDA17(txt_file=os.path.join(args.data,
                                             "train/image_list.txt"),
                       root_dir=os.path.join(args.data, "train"),
                       transform=data_transforms['train'])
    valset = VisDA17(txt_file=os.path.join(args.data,
                                           "validation/image_list.txt"),
                     root_dir=os.path.join(args.data, "validation"),
                     transform=data_transforms['val'],
                     label_one_hot=True)
    train_loader = DataLoader(trainset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              **kwargs)
    val_loader = DataLoader(valset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            **kwargs)
    train_loader_iter = iter(train_loader)
    current_optimizee_step, prev_optimizee_step = 0, 0

    model_old = None
    if args.lwf > 0:
        # create a frozen copy of the pretrained model for LwF (learning without forgetting)
        model_old = resnet101(pretrained=True)
        for param in model_old.parameters():
            param.requires_grad = False
        model_old.eval()
        model_old.cuda()
    ############################################################

    ### Agent Settings ########################################
    RANDOM = False  # False | True | 'init'
    action_space = np.arange(0, 1.1, 0.1)
    obs_avg = True
    _window_size = 1
    window_size = 1 if obs_avg else _window_size
    window_shrink_size = 20  # larger: controller will be updated more frequently
    sgd_in_names = [
        "conv1", "bn1", "layer1", "layer2", "layer3", "layer4", "fc_new"
    ]
    coord_size = len(sgd_in_names)
    ob_name_lstm = ["loss", "loss_kl", "step", "fc_mean", "fc_std"]
    ob_name_scalar = []
    obs_shape = (len(ob_name_lstm) * window_size + len(ob_name_scalar) +
                 coord_size, )
    _hidden_size = 20
    hidden_size = _hidden_size * len(ob_name_lstm)
    actor_critic = Policy(coord_size,
                          input_size=(len(ob_name_lstm), len(ob_name_scalar)),
                          action_space=len(action_space),
                          hidden_size=_hidden_size,
                          window_size=window_size)
    actor_critic.cuda()
    actor_critic.eval()

    partial = torch.load(args.agent_load_dir,
                         map_location=lambda storage, loc: storage)
    state = actor_critic.state_dict()
    pretrained_dict = {k: v for k, v in partial.items()}
    state.update(pretrained_dict)
    actor_critic.load_state_dict(state)
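    # Merging the pretrained L2O agent weights into a copy of the current state_dict (above)
    # lets the checkpoint load even if it covers only part of the policy's parameters.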

    ################################################################

    _min_iter = 10
    # reset optimizee
    model, optimizer, current_optimizee_step, prev_optimizee_step = prepare_optimizee(
        args, sgd_in_names, obs_shape, hidden_size, actor_critic,
        current_optimizee_step, prev_optimizee_step)
    epoch_size = len(train_loader)
    total_steps = epoch_size * args.epochs
    bar_format = '{desc}[{elapsed}<{remaining},{rate_fmt}]'
    pbar = tqdm(range(int(epoch_size * args.epochs)),
                file=sys.stdout,
                bar_format=bar_format,
                ncols=100)
    _window_size = max(
        _min_iter,
        current_optimizee_step + prev_optimizee_step // window_shrink_size)
    train_loader_iter, obs, loss, loss_kl, fc_mean, fc_std = train_step(
        args,
        _window_size,
        train_loader_iter,
        train_loader,
        model,
        optimizer,
        obs_avg,
        args.lr,
        pbar,
        current_optimizee_step + prev_optimizee_step,
        total_steps,
        model_old=model_old)
    logger.writer.add_scalar("loss/ce", loss,
                             current_optimizee_step + prev_optimizee_step)
    logger.writer.add_scalar("loss/kl", loss_kl,
                             current_optimizee_step + prev_optimizee_step)
    logger.writer.add_scalar("loss/total", loss + loss_kl,
                             current_optimizee_step + prev_optimizee_step)
    logger.writer.add_scalar("fc/mean", fc_mean,
                             current_optimizee_step + prev_optimizee_step)
    logger.writer.add_scalar("fc/std", fc_std,
                             current_optimizee_step + prev_optimizee_step)
    current_optimizee_step += _window_size
    pbar.update(_window_size)
    prev_obs = obs.unsqueeze(0)
    prev_hidden = torch.zeros(actor_critic.net.num_recurrent_layers, 1,
                              hidden_size).cuda()
    for epoch in range(args.epochs):
        print("\n===== Epoch %d / %d =====" % (epoch + 1, args.epochs))
        print("<< ============== JOB (PID = %d) %s ============== >>" %
              (PID, args.save_dir))
        while current_optimizee_step < epoch_size:
            # Sample actions
            with torch.no_grad():
                if not RANDOM:
                    value, action, action_log_prob, recurrent_hidden_states, distribution = actor_critic.act(
                        prev_obs, prev_hidden, deterministic=False)
                    action = action.squeeze()
                    action_log_prob = action_log_prob.squeeze()
                    value = value.squeeze()
                    for idx in range(len(action)):
                        logger.writer.add_scalar(
                            "action/%s" % sgd_in_names[idx], action[idx],
                            current_optimizee_step + prev_optimizee_step)
                        logger.writer.add_scalar(
                            "entropy/%s" % sgd_in_names[idx],
                            distribution.distributions[idx].entropy(),
                            current_optimizee_step + prev_optimizee_step)
                        optimizer.param_groups[idx]['lr'] = float(
                            action_space[action[idx]]) * args.lr
                        logger.writer.add_scalar(
                            "LR/%s" % sgd_in_names[idx],
                            optimizer.param_groups[idx]['lr'],
                            current_optimizee_step + prev_optimizee_step)
                else:
                    if RANDOM is True or RANDOM == 'init':
                        for idx in range(coord_size):
                            optimizer.param_groups[idx]['lr'] = float(
                                choice(action_space)) * args.lr
                    if RANDOM == 'init':
                        RANDOM = 'done'
                    for idx in range(coord_size):
                        logger.writer.add_scalar(
                            "LR/%s" % sgd_in_names[idx],
                            optimizer.param_groups[idx]['lr'],
                            current_optimizee_step + prev_optimizee_step)

            # Observe reward and next obs
            _window_size = max(
                _min_iter, current_optimizee_step +
                prev_optimizee_step // window_shrink_size)
            _window_size = min(_window_size,
                               epoch_size - current_optimizee_step)
            train_loader_iter, obs, loss, loss_kl, fc_mean, fc_std = train_step(
                args,
                _window_size,
                train_loader_iter,
                train_loader,
                model,
                optimizer,
                obs_avg,
                args.lr,
                pbar,
                current_optimizee_step + prev_optimizee_step,
                total_steps,
                model_old=model_old)
            logger.writer.add_scalar(
                "loss/ce", loss, current_optimizee_step + prev_optimizee_step)
            logger.writer.add_scalar(
                "loss/kl", loss_kl,
                current_optimizee_step + prev_optimizee_step)
            logger.writer.add_scalar(
                "loss/total", loss + loss_kl,
                current_optimizee_step + prev_optimizee_step)
            logger.writer.add_scalar(
                "fc/mean", fc_mean,
                current_optimizee_step + prev_optimizee_step)
            logger.writer.add_scalar(
                "fc/std", fc_std, current_optimizee_step + prev_optimizee_step)
            current_optimizee_step += _window_size
            pbar.update(_window_size)
            prev_obs = obs.unsqueeze(0)
            if not RANDOM: prev_hidden = recurrent_hidden_states
        prev_optimizee_step += current_optimizee_step
        current_optimizee_step = 0

        # evaluate on validation set
        prec1 = validate(val_loader, model, args)
        logger.writer.add_scalar("prec", prec1, epoch)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            args.save_dir, {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
            }, is_best)

        logging.info('Best accuracy: {prec1:.3f}'.format(prec1=best_prec1))
Example #4
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    if args.retrain_resume and not args.evaluation:  # resume from the last retrain
        config_file_path = os.path.join(args.resume_file, 'retrain.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the last retrain phase'.format(
            config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict,
                       'retrain')  # config resume from the last retrain
        # get EXP_time in last_retrain for flag
        EXP_time_last_retrain = config_dict['path'].split('/')[-1]
        Exp_name_last_retrain = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)  # from the last retrain.
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
    elif not args.retrain_resume and args.evaluation:
        config_file_path = os.path.join(args.evaluation_ckpt, 'retrain.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the best checkpoint'.format(
            config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict, 'retrain')
        EXP_time_best_checkpoint = config_dict['path'].split('/')[-1]
        EXP_name_best_checkpoint = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')

    elif not args.retrain_resume and not args.evaluation:
        # resume from the search phase.
        config_file_path = os.path.join(args.checkpoint_file, 'search.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the search phase'.format(
            config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        args.random_seed = config_dict['random_seed']  # get random_seed
        # get EXP_time in search phase, for flag
        EXP_time_search = config_dict['path'].split('/')[-1]
        EXP_name_search = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        torch.set_num_threads(args.workers)
        set_manual_seed(
            args.random_seed)  # from the last retrain phase or search phase.
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
    else:
        raise NotImplementedError(
            'invalid mode retrain_resume {:} evaluation {:}'.format(
                args.retrain_resume, args.evaluation))
    # optimizer params
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None
    # scheduler params
    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {
            'milestones': args.milestones,
            'gammas': args.gammas
        }
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None
    # criterion params
    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None

    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }
    if args.search_space == 'autodeeplab':
        conv_candidates = autodeeplab
    elif args.search_space == 'proxyless':
        conv_candidates = proxyless
    elif args.search_space == 'counter':
        conv_candidates = counter
    elif args.search_space == 'my_search_space':
        conv_candidates = my_search_space
    else:
        raise ValueError('search_space : {:} is not supported'.format(
            args.search_space))

    # related to entropy constraint loss
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2
        }
    elif args.reg_loss_type == 'add#linear#linearschedule':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2,
        }
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta
        }
    else:
        args.reg_loss_params = None

    logger = prepare_logger(args)
    if args.retrain_resume and not args.evaluation:
        logger.log(
            '=> loading configs {:} from the last retrain phase.'.format(
                config_file_path), 'info')
    elif not args.retrain_resume and args.evaluation:
        logger.log(
            '=> loading configs {:} from the best retrain phase.'.format(
                config_file_path), 'info')
    elif not args.retrain_resume and not args.evaluation:
        logger.log(
            '=> loading configs {:} from the search phase.'.format(
                config_file_path), 'info')

    # save new config, and create logger.
    save_configs(args.__dict__, args.path, 'retrain')
    # create run_config
    run_config = RunConfig(**args.__dict__)

    # only open_vis in the retrain phase
    if args.open_vis:
        assert not args.evaluation, 'invalid mode open_vis {:} and evaluation {:}'.format(
            args.open_vis, args.evaluation)
        vis = visdomer(args.port,
                       args.server,
                       args.exp_name,
                       args.compare_phase,
                       args.elements,
                       init_params=None)
    else:
        vis = None

    #print(args.evaluation)

    if args.evaluation:
        assert os.path.exists(args.evaluation_ckpt
                              ), 'cannot find the best checkpoint {:}'.format(
                                  args.evaluation_ckpt)
        checkpoint_path = os.path.join(
            args.evaluation_ckpt, 'checkpoints',
            'seed-{:}-retrain-best.pth'.format(args.random_seed))
        checkpoint = torch.load(checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint[
            'cell_genotypes']
        #print(actual_path)
        #print(cell_genotypes)
        '''
        my_search_space = [
                '3x3_SepFacConv1', '5x5_SepFacConv1',
                '3x3_SepFacConv2', '5x5_SepFacConv2',
                '3x3_SepFacConv4', '5x5_SepFacConv4',]
        '''

        # 0:4 1:4 2:5 3:5 4:4 5:2
        actual_path = [0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 2]
        cell_genotypes = [(0, [[('2<-1', 0), ('2<-0', 3)]]),
                          (2, [[('2<-1', 4), ('2<-0', 1)]]),
                          (7, [[('2<-1', 3), ('2<-0', 0)]]),
                          (15, [[('2<-1', 1), ('2<-0', 2)]]),
                          (27, [[('2<-1', 4), ('2<-0', 3)]]),
                          (38, [[('2<-1', 4), ('2<-0', 0)]]),
                          (48, [[('2<-1', 2), ('2<-0', 5)]]),
                          (60, [[('2<-1', 0), ('2<-0', 1)]]),
                          (73, [[('2<-0', 3), ('2<-1', 3)]]),
                          (84, [[('2<-1', 2), ('2<-0', 1)]]),
                          (94, [[('2<-1', 4), ('2<-0', 2)]]),
                          (102, [[('2<-1', 2), ('2<-0', 5)]])]
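        # Note: these hard-coded values override actual_path / cell_genotypes loaded from the
        # checkpoint above, presumably pinning evaluation to one specific searched architecture.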
        '''
        actual_path = [0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 2, 1]
        cell_genotypes = [(0, [[('2<-1', 4), ('2<-0', 5)]]),
                          (2, [[('2<-1', 3), ('2<-0', 1)]]),
                          (7, [[('2<-1', 2), ('2<-0', 5)]]),
                          (17, [[('2<-0', 1), ('2<-1', 1)]]),
                          (28, [[('2<-1', 4), ('2<-0', 1)]]),
                          (38, [[('2<-1', 4), ('2<-0', 2)]]),
                          (50, [[('2<-1', 5), ('2<-0', 1)]]),
                          (63, [[('2<-1', 4), ('2<-0', 2)]]),
                          (74, [[('2<-1', 1), ('2<-0', 0)]]),
                          (84, [[('2<-1', 3), ('2<-0', 1)]]),
                          (92, [[('2<-1', 4), ('2<-0', 5)]]),
                          (99, [[('2<-1', 0), ('2<-0', 3)]])]
        '''

        normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                              args.filter_multiplier,
                                              args.block_multiplier,
                                              args.steps,
                                              args.nb_classes,
                                              actual_path,
                                              cell_genotypes,
                                              args.search_space,
                                              affine=True)

        # save new config, and create logger.
        #save_configs(args.__dict__, args.path, 'retrain')
        # create run_config
        #run_config = RunConfig(**args.__dict__)

        evaluation_run_manager = RunManager(args.path,
                                            normal_network,
                                            logger,
                                            run_config,
                                            vis,
                                            out_log=True)
        normal_network.load_state_dict(checkpoint['state_dict'])
        display_all_families_information(args, 'retrain',
                                         evaluation_run_manager, logger)
        logger.log(
            '=> loaded the best checkpoint from {:}, start evaluation'.format(
                checkpoint_path), 'info')

        evaluation_run_manager.validate(is_test=True, use_train_mode=False)

    else:
        # resume from the last retrain
        if args.retrain_resume:
            logger.log(
                '=> Loading checkpoint from {:} of the last retrain phase'.
                format(args.resume_file),
                mode='info')
            # checkpoint_file from the last retrain phase.
            checkpoint_path = os.path.join(
                args.resume_file, 'checkpoints',
                'seed-{:}-retrain.pth'.format(args.random_seed))
            assert os.path.exists(
                checkpoint_path
            ), 'cannot find retrain checkpoint file {:}'.format(
                checkpoint_path)
            checkpoint = torch.load(checkpoint_path)
            actual_path, cell_genotypes = checkpoint[
                'actual_path'], checkpoint['cell_genotypes']
            args.actual_path = actual_path
            args.cell_genotypes = cell_genotypes
            normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                                  args.filter_multiplier,
                                                  args.block_multiplier,
                                                  args.steps,
                                                  args.nb_classes,
                                                  actual_path,
                                                  cell_genotypes,
                                                  args.search_space,
                                                  affine=True)
            flop, param = get_model_infos(normal_network, [1, 3, 512, 512])
            logger.log(
                '|#################### Network Info ####################|\n'
                'FLOPs:{:.2f} M,     Params:{:.2f} MB'.format(flop, param),
                mode='info')

            # save new config, and create logger.
            #save_configs(args.__dict__, args.path, 'retrain')
            # create run_config
            #run_config = RunConfig(**args.__dict__)

            retrain_run_manager = RunManager(args.path,
                                             normal_network,
                                             logger,
                                             run_config,
                                             vis,
                                             out_log=True)
            normal_network.load_state_dict(checkpoint['state_dict'])
            display_all_families_information(args, 'retrain',
                                             retrain_run_manager, logger)
            retrain_run_manager.optimizer.load_state_dict(
                checkpoint['weight_optimizer'])
            retrain_run_manager.scheduler.load_state_dict(
                checkpoint['scheduler'])
            retrain_run_manager.monitor_metric = checkpoint['best_monitor'][0]
            retrain_run_manager.best_monitor = checkpoint['best_monitor'][1]
            retrain_run_manager.start_epoch = checkpoint[
                'start_epoch']  # already incremented by 1 when saved
            logger.log(
                '=> loaded checkpoint {:} from the last retrain phase, starting from epoch {:}'
                .format(checkpoint_path, checkpoint['start_epoch']),
                mode='info')
        else:
            # from the search phase, load the optimal architecture and retrain.
            arch_checkpoint_path = os.path.join(
                args.checkpoint_file, 'checkpoints',
                'seed-{:}-arch-best.pth'.format(args.random_seed))

            # TODO: the best epoch has already been obtained in advance.
            #checkpoint_path = os.path.join(args.checkpoint_file, 'checkpoints', 'seed-{:}-search-best.pth'.format(args.random_seed))
            #tmp_checkpoint = torch.load(checkpoint_path)
            #best_epoch = tmp_checkpoint['start_epochs'] - 1
            #logger.log('=> best epochs: {:}'.format(best_epoch), mode='info') # get the best_epoch

            assert os.path.exists(
                arch_checkpoint_path
            ), 'cannot find arch_checkpoint file {:} from search phase'.format(
                arch_checkpoint_path)
            checkpoint = torch.load(arch_checkpoint_path)
            actual_path, cell_genotypes = checkpoint[
                'actual_path'], checkpoint['cell_genotypes']
            new_genotypes = []
            for _index, genotype in cell_genotypes:
                xlist = []
                for edge_genotype in genotype:
                    for (node_str, select_index) in edge_genotype:
                        xlist.append((node_str, conv_candidates[select_index]))
                new_genotypes.append((_index, xlist))
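            # Each cell genotype stores (edge, op_index) pairs; mapping op_index through
            # conv_candidates turns it into a human-readable operator name for logging.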
            log_str = 'Obtained actual_path and cell_genotypes:\n' \
                      'Actual_path: {:}\n' \
                      'Genotype:\n'.format(actual_path)
            for _index, genotype in new_genotypes:
                log_str += 'index: {:} arch: {:}\n'.format(_index, genotype)
            logger.log(log_str, mode='info')
            args.actual_path = actual_path
            args.cell_genotypes = cell_genotypes
            normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                                  args.filter_multiplier,
                                                  args.block_multiplier,
                                                  args.steps,
                                                  args.nb_classes,
                                                  actual_path,
                                                  cell_genotypes,
                                                  args.search_space,
                                                  affine=True)

            flop, param = get_model_infos(normal_network, [1, 3, 512, 512])
            logger.log(
                '|#################### Network Info ####################|\n'
                'FLOPs:{:.2f} M,     Params:{:.2f} MB'.format(flop, param),
                mode='info')

            # save new config, and create logger.
            #save_configs(args.__dict__, args.path, 'retrain')
            # create run_config
            #run_config = RunConfig(**args.__dict__)

            retrain_run_manager = RunManager(args.path,
                                             normal_network,
                                             logger,
                                             run_config,
                                             vis,
                                             out_log=True)
            #normal_network.load_state_dict(checkpoint['state_dict'])
            display_all_families_information(args, 'retrain',
                                             retrain_run_manager, logger)
            logger.log(
                '=> Construct NewGumbelAutoDeeplab according to the last-arch obtained from search phase',
                mode='info')

        # perform train and validation in train() method
        retrain_run_manager.train()

    logger.close()
Example #5
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    if args.retrain_resume:
        config_file_path = os.path.join(args.resume_file, 'retrain.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the last retrain phase'.format(
            config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict, 'retrain')
        # get EXP_time in last_retrain for flag
        EXP_time_last_retrain = config_dict['path'].split('/')[-1]
        EXP_time = time_for_file()
        args.path = os.path.join(
            args.path, args.exp_name,
            EXP_time + '-resume-{:}'.format(EXP_time_last_retrain))
        torch.set_num_threads(args.workers)
        set_manual_seed(
            args.random_seed)  # from the last retrain phase or search phase.
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py'))
        save_configs(args.__dict__, args.path, 'retrain')
        logger = prepare_logger(args)
        logger.log(
            '=> loading configs {:} from the last retrain phase.'.format(
                config_file_path),
            mode='info')
        if args.search_space == 'autodeeplab':
            conv_candidates = autodeeplab
        elif args.search_space == 'proxyless':
            conv_candidates = proxyless
        elif args.search_space == 'my_search_space':
            conv_candidates = my_search_space
        else:
            raise ValueError('search space {:} is not supported'.format(
                args.search_space))
    else:
        # resume partial config settings and the arch checkpoint from the search phase by default.
        config_file_path = os.path.join(args.checkpoint_file, 'search.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the search phase'.format(
            config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        args.random_seed = config_dict['random_seed']
        # get EXP_time in search phase, for flag
        EXP_time_search = config_dict['path'].split('/')[-1]
        EXP_time = time_for_file()
        args.path = os.path.join(
            args.path, args.exp_name,
            EXP_time + '-resume-{:}'.format(EXP_time_search))
        torch.set_num_threads(args.workers)
        set_manual_seed(
            args.random_seed)  # from the last retrain phase or search phase.
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py'))
        save_configs(args.__dict__, args.path, 'retrain')
        logger = prepare_logger(args)
        logger.log(
            '=> starting retrain from the search phase config {:}.'.format(
                config_file_path),
            mode='info')

        # optimizer params
        if args.weight_optimizer_type == 'SGD':
            weight_optimizer_params = {
                'momentum': args.momentum,
                'nesterov': args.nesterov,
                'weight_decay': args.weight_decay,
            }
        elif args.weight_optimizer_type == 'RMSprop':
            weight_optimizer_params = {
                'momentum': args.momentum,
                'weight_decay': args.weight_decay,
            }
        else:
            weight_optimizer_params = None
        # scheduler params
        if args.scheduler == 'cosine':
            scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
        elif args.scheduler == 'multistep':
            scheduler_params = {
                'milestones': args.milestones,
                'gammas': args.gammas
            }
        elif args.scheduler == 'exponential':
            scheduler_params = {'gamma': args.gamma}
        elif args.scheduler == 'linear':
            scheduler_params = {'min_lr': args.min_lr}
        else:
            scheduler_params = None
        # criterion params
        if args.criterion == 'SmoothSoftmax':
            criterion_params = {'label_smooth': args.label_smoothing}
        else:
            criterion_params = None

        args.optimizer_config = {
            'optimizer_type': args.weight_optimizer_type,
            'optimizer_params': weight_optimizer_params,
            'scheduler': args.scheduler,
            'scheduler_params': scheduler_params,
            'criterion': args.criterion,
            'criterion_params': criterion_params,
            'init_lr': args.init_lr,
            'epochs': args.epochs,
            'class_num': args.nb_classes,
        }
        if args.search_space == 'autodeeplab':
            conv_candidates = autodeeplab
        elif args.search_space == 'proxyless':
            conv_candidates = proxyless
        elif args.search_space == 'counter':
            conv_candidates = counter
        elif args.search_space == 'my_search_space':
            conv_candidates = my_search_space
        else:
            raise ValueError('search_space : {:} is not supported'.format(
                args.search_space))

        # related to entropy constraint loss
        if args.reg_loss_type == 'add#linear':
            args.reg_loss_params = {'lambda': args.reg_loss_lambda}
        elif args.reg_loss_type == 'mul#log':
            args.reg_loss_params = {
                'alpha': args.reg_loss_alpha,
                'beta': args.reg_loss_beta
            }
        else:
            args.reg_loss_params = None

    # create run_config
    run_config = RunConfig(**args.__dict__)

    #if args.open_test == False: # retrain and validate
    if args.open_vis:  # only open_vis in re-train phase, rather than both re-train and test.
        vis = visdomer(args.port,
                       args.server,
                       args.exp_name,
                       args.compare_phase,
                       args.elements,
                       init_params=None)
    else:
        vis = None
    if args.retrain_resume:
        logger.log(
            '=> Loading checkpoint from {:} of the last retrain phase'.format(
                args.resume_file),
            mode='info')
        # checkpoint_file from the last retrain phase.
        checkpoint_path = os.path.join(
            args.resume_file, 'checkpoints',
            'seed-{:}-retrain.pth'.format(args.random_seed))
        assert os.path.exists(
            checkpoint_path), 'cannot find retrain checkpoint file {:}'.format(
                checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint[
            'cell_genotypes']
        args.actual_path = actual_path
        args.cell_genotypes = cell_genotypes
        normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                              args.filter_multiplier,
                                              args.block_multiplier,
                                              args.steps,
                                              args.nb_classes,
                                              actual_path,
                                              cell_genotypes,
                                              args.search_space,
                                              affine=True)
        retrain_run_manager = RunManager(args.path,
                                         normal_network,
                                         logger,
                                         run_config,
                                         vis,
                                         out_log=True)
        normal_network.load_state_dict(checkpoint['state_dict'])
        display_all_families_information(args, 'retrain', retrain_run_manager,
                                         logger)
        retrain_run_manager.optimizer.load_state_dict(
            checkpoint['weight_optimizer'])
        retrain_run_manager.scheduler.load_state_dict(checkpoint['scheduler'])
        retrain_run_manager.monitor_metric = checkpoint['best_monitor'][0]
        retrain_run_manager.best_monitor = checkpoint['best_monitor'][1]
        retrain_run_manager.start_epoch = checkpoint['start_epoch']
        logger.log(
            '=> loaded checkpoint {:} from the last retrain phase, starting from epoch {:}'
            .format(checkpoint_path, checkpoint['start_epoch']),
            mode='info')
    else:
        # TODO: from the search phase, read the last arch_checkpoint rather than the best one.
        arch_checkpoint_path = os.path.join(
            args.checkpoint_file, 'checkpoints',
            'seed-{:}-arch.pth'.format(args.random_seed))
        assert os.path.exists(
            arch_checkpoint_path
        ), 'cannot find arch_checkpoint file {:} from search phase'.format(
            arch_checkpoint_path)
        checkpoint = torch.load(arch_checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint[
            'cell_genotypes']
        new_genotypes = []
        for _index, genotype in cell_genotypes:
            xlist = []
            for edge_genotype in genotype:
                for (node_str, select_index) in edge_genotype:
                    xlist.append((node_str, conv_candidates[select_index]))
            new_genotypes.append((_index, xlist))
        log_str = 'Obtained actual_path and cell_genotypes:\n' \
                  'Actual_path: {:}\n' \
                  'Genotype:\n'.format(actual_path)
        for _index, genotype in new_genotypes:
            log_str += 'index: {:} arch: {:}\n'.format(_index, genotype)
        logger.log(log_str, mode='info')
        args.actual_path = actual_path
        args.cell_genotypes = cell_genotypes
        normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                              args.filter_multiplier,
                                              args.block_multiplier,
                                              args.steps,
                                              args.nb_classes,
                                              actual_path,
                                              cell_genotypes,
                                              args.search_space,
                                              affine=True)
        retrain_run_manager = RunManager(args.path,
                                         normal_network,
                                         logger,
                                         run_config,
                                         vis,
                                         out_log=True)
        #normal_network.load_state_dict(checkpoint['state_dict'])
        display_all_families_information(args, 'retrain', retrain_run_manager,
                                         logger)
        logger.log(
            '=> Constructed NewGumbelAutoDeeplab according to the last arch obtained from the search phase',
            mode='info')
    # perform train and validation in train() method
    retrain_run_manager.train()
    '''
    else: # test phase
        checkpoint_path = os.path.join(args.resume_file, 'checkpoints', 'seed-{:}-retrain-best.pth'.format(args.random_seed))
        assert os.path.exists(checkpoint_path), 'cannot find best checkpoint {:} from the retrain phase'.format(checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint['cell_genotypes']
        normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier, args.block_multiplier,
                                              args.steps, args.nb_classes, actual_path, cell_genotypes, args.search_space, affine=True)
        normal_network.load_state_dict(checkpoint['state_dict'])
        test_manager = RunManager(args.path, normal_network, logger, run_config, vis=None, out_log=True)
        display_all_families_information(args, 'retrain', test_manager, logger)

        # save testing configs
        save_configs(args.__dict__, args.path, 'test')
        test_manager.validate(epoch=None, is_test=True, use_train_mode=False)
    '''
    logger.close()
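The retrain branch above rebuilds the searched architecture by mapping each selected operation index in the cell genotypes back to its name in conv_candidates. Below is a minimal, self-contained sketch of just that decoding step; the candidate list matches the one used later in these examples, while the node-string format and indices are made-up values for illustration.

conv_candidates = ['3x3_MBConv3', '3x3_MBConv6', '5x5_MBConv3',
                   '5x5_MBConv6', '7x7_MBConv3', '7x7_MBConv6', 'Zero']

# assumed layout: each cell is (cell_index, [[(node_str, op_index), ...], ...])
cell_genotypes = [(0, [[('0<-1', 1)], [('1<-0', 4)]])]

new_genotypes = []
for cell_index, genotype in cell_genotypes:
    decoded = []
    for edge_genotype in genotype:
        for node_str, select_index in edge_genotype:
            # replace the raw index with the human-readable op name
            decoded.append((node_str, conv_candidates[select_index]))
    new_genotypes.append((cell_index, decoded))

print(new_genotypes)
# [(0, [('0<-1', '3x3_MBConv6'), ('1<-0', '7x7_MBConv3')])]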
Example #6
def main():
    global args, best_mIoU
    PID = os.getpid()
    args = parser.parse_args()
    prepare_seed(args.rand_seed)
    device = torch.device("cuda:" + str(args.gpus))

    if args.timestamp == 'none':
        args.timestamp = "{:}".format(
            time.strftime('%h-%d-%C_%H-%M-%s', time.gmtime(time.time())))

    switch_model = args.switch_model
    assert switch_model in ["deeplab50", "deeplab101"]

    # Log outputs
    if args.evaluate:
        args.save_dir = args.save_dir + "/GTA5-%s-evaluate"%switch_model + \
            "%s/%s"%('/'+args.resume if args.resume != 'none' else '', args.timestamp)
    else:
        args.save_dir = args.save_dir + \
            "/GTA5_512x512-{model}-LWF.stg{csg_stages}.w{csg_weight}-APool.{apool}-Aug.{augment}-chunk{chunks}-mlp{mlp}.K{csg_k}-LR{lr}.bone{factor}-epoch{epochs}-batch{batch_size}-seed{seed}".format(
                    model=switch_model,
                    csg_stages=args.csg_stages,
                    mlp=args.mlp,
                    csg_weight=args.csg,
                    apool=args.apool,
                    augment=args.augment,
                    chunks=args.chunks,
                    csg_k=args.csg_k,
                    lr="%.2E"%args.lr,
                    factor="%.1f"%args.factor,
                    epochs=args.epochs,
                    batch_size=args.batch_size,
                    seed=args.rand_seed
                    ) + \
            "%s/%s"%('/'+args.resume if args.resume != 'none' else '', args.timestamp)
    logger = prepare_logger(args)

    from config_seg import config as data_setting
    data_setting.batch_size = args.batch_size
    train_loader = get_train_loader(data_setting,
                                    GTA5,
                                    test=False,
                                    augment=args.augment)

    args.stages = [int(stage) for stage in args.csg_stages.split('.')
                   ] if len(args.csg_stages) > 0 else []
    chunks = [int(chunk) for chunk in args.chunks.split('.')
              ] if len(args.chunks) > 0 else []
    assert len(chunks) == 1 or len(chunks) == len(args.stages)
    if len(chunks) < len(args.stages):
        chunks = [chunks[0]] * len(args.stages)

    if switch_model == 'deeplab50':
        layers = [3, 4, 6, 3]
    elif switch_model == 'deeplab101':
        layers = [3, 4, 23, 3]
    model = csg_builder.CSG(deeplab,
                            get_head=None,
                            K=args.csg_k,
                            stages=args.stages,
                            chunks=chunks,
                            task='new-seg',
                            apool=args.apool,
                            mlp=args.mlp,
                            base_encoder_kwargs={
                                'num_seg_classes': args.num_classes,
                                'layers': layers
                            })

    threds = 3
    evaluator = SegEvaluator(
        Cityscapes(data_setting, 'val', None),
        args.num_classes,
        np.array([0.485, 0.456, 0.406]),
        np.array([0.229, 0.224, 0.225]),
        model.encoder_q, [
            1,
        ],
        False,
        devices=args.gpus,
        config=data_setting,
        threds=threds,
        verbose=False,
        save_path=None,
        show_image=False
    )  # just calculate mIoU, no prediction file is generated
    # verbose=False, save_path="./prediction_files", show_image=True, show_prediction=True)  # generate prediction files

    # Setup optimizer
    factor = args.factor
    sgd_in = [
        {
            'params': get_params(model.encoder_q, ["conv1"]),
            'lr': factor * args.lr
        },
        {
            'params': get_params(model.encoder_q, ["bn1"]),
            'lr': factor * args.lr
        },
        {
            'params': get_params(model.encoder_q, ["layer1"]),
            'lr': factor * args.lr
        },
        {
            'params': get_params(model.encoder_q, ["layer2"]),
            'lr': factor * args.lr
        },
        {
            'params': get_params(model.encoder_q, ["layer3"]),
            'lr': factor * args.lr
        },
        {
            'params': get_params(model.encoder_q, ["layer4"]),
            'lr': factor * args.lr
        },
        {
            'params': get_params(model.encoder_q, ["fc_new"]),
            'lr': args.lr
        },
    ]
    base_lrs = [group['lr'] for group in sgd_in]
    optimizer = SGD(sgd_in,
                    lr=args.lr,
                    momentum=args.momentum,
                    weight_decay=args.weight_decay)

    # Optionally resume from a checkpoint
    if args.resume != 'none':
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume,
                                    map_location=lambda storage, loc: storage)
            args.start_epoch = checkpoint['epoch']
            best_mIoU = checkpoint['best_mIoU']
            msg = model.load_state_dict(checkpoint['state_dict'])
            print("resume weights: ", msg)
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=ImageClassdata> no checkpoint found at '{}'".format(
                args.resume))

    model = model.to(device)

    if args.evaluate:
        mIoU = validate(evaluator, model, -1)
        print(mIoU)
        exit(0)

    # Main training loop
    iter_max = args.epochs * len(train_loader)
    iter_stat = IterNums(iter_max)
    for epoch in range(args.start_epoch, args.epochs):
        print("<< ============== JOB (PID = %d) %s ============== >>" %
              (PID, args.save_dir))
        logger.log("Epoch: %d" % (epoch + 1))
        # train for one epoch
        train(args,
              train_loader,
              model,
              optimizer,
              base_lrs,
              iter_stat,
              epoch,
              logger,
              device,
              adjust_lr=epoch < args.epochs)

        # evaluate on validation set
        torch.cuda.empty_cache()
        mIoU = validate(evaluator, model, epoch)
        logger.writer.add_scalar("mIoU", mIoU, epoch + 1)
        logger.log("mIoU: %f" % mIoU)

        # remember best mIoU and save checkpoint
        is_best = mIoU > best_mIoU
        best_mIoU = max(mIoU, best_mIoU)
        save_checkpoint(
            args.save_dir, {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_mIoU': best_mIoU,
            }, is_best)

    logging.info('Best mIoU: {mIoU:.3f}'.format(mIoU=best_mIoU))
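This example (and the next one) scales the backbone learning rate by factor while keeping the newly added head at the full rate, using per-parameter-group options of torch.optim.SGD. Here is a minimal sketch of that pattern; the toy model, factor value, and the halving step are assumptions for illustration only.

import torch.nn as nn
from torch.optim import SGD

# toy stand-in for a pretrained backbone plus a newly added head
model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 2))
backbone, head = model[0], model[1]

lr, factor = 1e-3, 0.1  # assumed values
sgd_in = [
    {'params': backbone.parameters(), 'lr': factor * lr},  # backbone: reduced LR
    {'params': head.parameters(), 'lr': lr},                # new head: full LR
]
base_lrs = [group['lr'] for group in sgd_in]
optimizer = SGD(sgd_in, lr=lr, momentum=0.9, weight_decay=5e-4)

# a manual schedule can later rescale every group relative to its base LR,
# preserving the backbone/head ratio:
for group, base_lr in zip(optimizer.param_groups, base_lrs):
    group['lr'] = base_lr * 0.5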
Example #7
def main():
    global args, best_prec1
    PID = os.getpid()
    args = parser.parse_args()
    prepare_seed(args.rand_seed)

    if args.timestamp == 'none':
        args.timestamp = "{:}".format(
            time.strftime('%h-%d-%C_%H-%M-%s', time.gmtime(time.time())))

    # Log outputs
    if args.evaluate:
        args.save_dir = args.save_dir + "/Visda17-Res101-evaluate" + \
            "%s/%s"%('/'+args.resume.replace('/', '+') if args.resume != 'none' else '', args.timestamp)
    else:
        args.save_dir = args.save_dir + \
            "/VisDA-Res101-CSG.stg{csg_stages}.w{csg_weight}-APool.{apool}-Aug.{augment}-chunk{chunks}-mlp{mlp}.K{csg_k}-LR{lr}.bone{factor}-epoch{epochs}-batch{batch_size}-seed{seed}".format(
                    csg_stages=args.csg_stages,
                    mlp=args.mlp,
                    csg_weight=args.csg,
                    apool=args.apool,
                    augment=args.augment,
                    chunks=args.chunks,
                    csg_k=args.csg_k,
                    lr="%.2E"%args.lr,
                    factor="%.1f"%args.factor,
                    epochs=args.epochs,
                    batch_size=args.batch_size,
                    seed=args.rand_seed
                    ) + \
            "%s/%s"%('/'+args.resume.replace('/', '+') if args.resume != 'none' else '', args.timestamp)
    logger = prepare_logger(args)

    data_transforms = {
        'val':
        transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ]),
    }
    if args.augment:
        data_transforms['train'] = transforms.Compose([
            RandAugment(1, 6., augment_list),
            transforms.Resize(224),
            transforms.RandomCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
    else:
        data_transforms['train'] = transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    kwargs = {'num_workers': 20, 'pin_memory': True}
    if args.augment:
        # two source
        trainset = VisDA17(txt_file=os.path.join(args.data,
                                                 "train/image_list.txt"),
                           root_dir=os.path.join(args.data, "train"),
                           transform=TwoCropsTransform(
                               data_transforms['train'],
                               data_transforms['train']))
    else:
        # one source
        trainset = VisDA17(txt_file=os.path.join(args.data,
                                                 "train/image_list.txt"),
                           root_dir=os.path.join(args.data, "train"),
                           transform=data_transforms['train'])
    train_loader = DataLoader(trainset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              drop_last=True,
                              **kwargs)
    valset = VisDA17(txt_file=os.path.join(args.data,
                                           "validation/image_list.txt"),
                     root_dir=os.path.join(args.data, "validation"),
                     transform=data_transforms['val'],
                     label_one_hot=True)
    val_loader = DataLoader(valset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            **kwargs)

    args.stages = [int(stage) for stage in args.csg_stages.split('.')
                   ] if len(args.csg_stages) > 0 else []
    chunks = [int(chunk) for chunk in args.chunks.split('.')
              ] if len(args.chunks) > 0 else []
    assert len(chunks) == 1 or len(chunks) == len(args.stages)
    if len(chunks) < len(args.stages):
        chunks = [chunks[0]] * len(args.stages)

    def get_head(num_ftrs, num_classes):
        _dim = 512
        return nn.Sequential(
            nn.Linear(num_ftrs, _dim),
            nn.ReLU(inplace=False),
            nn.Linear(_dim, num_classes),
        )

    model = csg_builder.CSG(
        resnet101,
        get_head=get_head,
        K=args.csg_k,
        stages=args.stages,
        chunks=chunks,
        apool=args.apool,
        mlp=args.mlp,
    )

    train_blocks = "conv1.bn1.layer1.layer2.layer3.layer4.fc"
    train_blocks = train_blocks.split('.')
    # Setup optimizer
    factor = args.factor
    sgd_in = []
    for name in train_blocks:
        if name != 'fc':
            sgd_in.append({
                'params': get_params(model.encoder_q, [name]),
                'lr': factor * args.lr
            })
        else:
            # no update to fc but to fc_new
            sgd_in.append({
                'params': get_params(model.encoder_q, ["fc_new"]),
                'lr': args.lr
            })
            if model.mlp:
                sgd_in.append({
                    'params': get_params(model.encoder_q, ["fc_csg"]),
                    'lr': args.lr
                })
    base_lrs = [group['lr'] for group in sgd_in]
    optimizer = SGD(sgd_in,
                    lr=args.lr,
                    momentum=args.momentum,
                    weight_decay=args.weight_decay)

    # Optionally resume from a checkpoint
    if args.resume != 'none':
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume,
                                    map_location=lambda storage, loc: storage)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            msg = model.load_state_dict(checkpoint['state_dict'], strict=False)
            print("resume weights: ", msg)
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=ImageClassdata> no checkpoint found at '{}'".format(
                args.resume))

    model = model.cuda()

    if args.evaluate:
        prec1 = validate(val_loader, model, args, 0)
        print(prec1)
        exit(0)

    # Main training loop
    iter_max = args.epochs * len(train_loader)
    iter_stat = IterNums(iter_max)
    for epoch in range(args.start_epoch, args.epochs):
        print("<< ============== JOB (PID = %d) %s ============== >>" %
              (PID, args.save_dir))
        logger.log("Epoch: %d" % (epoch + 1))
        train(train_loader,
              model,
              optimizer,
              base_lrs,
              iter_stat,
              epoch,
              logger,
              args,
              adjust_lr=epoch < args.epochs)

        prec1 = validate(val_loader, model, args, epoch)
        logger.writer.add_scalar("prec", prec1, epoch + 1)
        logger.log("prec: %f" % prec1)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(args.save_dir, {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        },
                        is_best,
                        keep_last=1)

    logging.info('Best accuracy: {prec1:.3f}'.format(prec1=best_prec1))
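When --augment is set, the VisDA-17 training set above wraps its transform in TwoCropsTransform so each image yields two independently augmented views. A minimal sketch of such a wrapper, following the common MoCo-style two-crop convention, is shown below; the class body is an illustration of the pattern, not the repository's own implementation.

from torchvision import transforms

class TwoCropsTransform:
    """Apply two transforms to one image and return both views (illustrative sketch)."""
    def __init__(self, transform_q, transform_k):
        self.transform_q = transform_q
        self.transform_k = transform_k

    def __call__(self, img):
        return [self.transform_q(img), self.transform_k(img)]

# usage: both views share the same augmentation pipeline, as in the snippet above
aug = transforms.Compose([
    transforms.Resize(224),
    transforms.RandomCrop(224),
    transforms.ToTensor(),
])
two_crop = TwoCropsTransform(aug, aug)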
Example #8
def main(args):

    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(args.workers)
    set_manual_seed(args.random_seed)
    #print_experiment_environment()
    EXP_time = time_for_file()
    args.path = os.path.join(args.path, args.exp_name, EXP_time)
    os.makedirs(args.path, exist_ok=True)
    create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')

    # weight optimizer config, related to network_weight_optimizer, scheduler, and criterion
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None
    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {
            'milestones': args.milestones,
            'gammas': args.gammas
        }
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None
    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None
    # weight_optimizer_config, used in run_manager to get weight_optimizer, scheduler, and criterion.
    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'warmup_epoch': args.warmup_epochs,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }
    # TODO: needs modification; conv_candidates are not needed for the counter_network
    args.conv_candidates = [
        '3x3_MBConv3',
        '3x3_MBConv6',
        '5x5_MBConv3',
        '5x5_MBConv6',
        '7x7_MBConv3',
        '7x7_MBConv6',
        'Zero',  #'Identity'
    ]
    run_config = RunConfig(**args.__dict__)
    # arch_optimizer_config
    if args.arch_optimizer_type == 'adam':
        args.arch_optimizer_params = {
            'betas': (args.arch_adam_beta1, args.arch_adam_beta2),
            'eps': args.arch_adam_eps
        }
    else:
        args.arch_optimizer_params = None

    # related to hardware constraint
    # TODO: get rid of the hardware-constraint regularization settings below
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {'lambda': args.reg_loss_lambda}
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta
        }
    else:
        args.reg_loss_params = None

    arch_search_config = ArchSearchConfig(**args.__dict__)
    # perform config save, for run_configs and arch_search_configs
    save_configs(run_config.config, arch_search_config.config, args.path,
                 'search')
    logger = prepare_logger(args)
    if args.open_vis:
        vis = visdomer(args.port,
                       args.server,
                       args.exp_name,
                       args.compare_phase,
                       args.elements,
                       init_params=None)
    else:
        vis = None
    '''
    super_network = GumbelAutoDeepLab(
        args.filter_multiplier, args.block_multiplier, args.steps,
        args.nb_classes, args.nb_layers, args.bn_momentum, args.bn_eps, args.conv_candidates, logger
    )
    '''
    super_network = CounterMBConvNet(2, search_space=args.search_space)
    train_manager = RunManager(args.path,
                               super_network,
                               logger,
                               run_config,
                               vis=vis,
                               out_log=True)
    # train search phase
    train_manager.train()
    logger.close()
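The optimizer_config dictionary assembled in this example is later consumed by the run manager to build the weight optimizer, scheduler, and criterion. A minimal sketch of how such a dispatch could look is given below; the helper name build_from_optimizer_config, the fixed CrossEntropyLoss criterion, and the exact keys consumed are assumptions based on the dictionary built above, not the repository's actual API.

import torch
import torch.nn as nn

def build_from_optimizer_config(params, config):
    # hypothetical helper: turn the config dict from this example into concrete objects
    if config['optimizer_type'] == 'SGD':
        optimizer = torch.optim.SGD(params, lr=config['init_lr'],
                                    **config['optimizer_params'])
    elif config['optimizer_type'] == 'RMSprop':
        optimizer = torch.optim.RMSprop(params, lr=config['init_lr'],
                                        **config['optimizer_params'])
    else:
        raise ValueError('unsupported optimizer: {:}'.format(config['optimizer_type']))

    if config['scheduler'] == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, **config['scheduler_params'])
    else:
        scheduler = None

    # criterion dispatch omitted for brevity; plain cross-entropy assumed here
    criterion = nn.CrossEntropyLoss()
    return optimizer, scheduler, criterion

config = {
    'optimizer_type': 'SGD', 'init_lr': 0.05,
    'optimizer_params': {'momentum': 0.9, 'nesterov': True, 'weight_decay': 4e-5},
    'scheduler': 'cosine', 'scheduler_params': {'T_max': 100, 'eta_min': 0.0},
}
model = nn.Linear(8, 2)
optimizer, scheduler, criterion = build_from_optimizer_config(model.parameters(), config)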