def load_state_dict(model: "Network", state_dict, strict=True):
    """Copy parameters and buffers from ``state_dict`` into ``model``.

    Mirrors :meth:`torch.nn.Module.load_state_dict`, but tolerates missing /
    unexpected keys when ``strict`` is ``False`` and wraps copy failures in a
    more descriptive error.

    Arguments:
        model: Module whose parameters/buffers are overwritten in place.
        state_dict (dict): Parameters and persistent buffers, keyed by the
            names returned by ``model.state_dict()``.
        strict (bool): When ``True``, the keys of ``state_dict`` must exactly
            match the keys of ``model.state_dict()``.

    Raises:
        KeyError: ``strict`` is ``True`` and a key is unexpected or missing.
        RuntimeError: A tensor could not be copied (e.g. shape mismatch);
            the original error is chained as ``__cause__``.
    """
    own_state = model.state_dict()
    for name, param in state_dict.items():
        if name in own_state:
            if isinstance(param, torch.nn.Parameter):
                # backwards compatibility for serialized parameters:
                # copy from the underlying tensor, not the Parameter wrapper
                param = param.detach()
            try:
                own_state[name].copy_(param)
            except Exception as err:
                # Chain the original exception so the root cause (usually a
                # size mismatch) stays visible in the traceback.
                raise RuntimeError(
                    'While copying the parameter named {}, '
                    'whose dimensions in the model are {} and '
                    'whose dimensions in the checkpoint are {}.'.format(
                        name, own_state[name].size(), param.size())) from err
        elif strict:
            raise KeyError(
                'unexpected key "{}" in state_dict'.format(name))
    if strict:
        missing = set(own_state.keys()) - set(state_dict.keys())
        if missing:
            raise KeyError(
                'missing keys in state_dict: "{}"'.format(missing))
def main():
    """Run architecture search on CIFAR-10 and save the resulting genotype.

    Relies on module-level globals that are not visible in this chunk:
    ``args`` (parsed CLI options), ``device``, ``use_DataParallel``,
    ``utils``, ``Network``, ``Arch``, ``train`` and ``infer`` —
    TODO(review): confirm against the rest of the file.
    """
    # Per-run experiment directory: <exp_path>/<gpu>_<timestamp>; scripts are
    # snapshotted into it for reproducibility.
    args.exp_path /= f'{args.gpu}_{time.strftime("%Y%m%d-%H%M%S")}'
    utils.create_exp_dir(Path(args.exp_path), scripts_to_save=glob.glob('*.py'))

    # Log to stdout and to <exp_path>/log.txt with the same format.
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(args.exp_path / 'log.txt')
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    # A seed is mandatory: search results are meaningless without one.
    if args.seed is None:
        raise Exception('designate seed.')
    np.random.seed(args.seed)
    cudnn.benchmark = True
    cudnn.enabled = True
    torch.manual_seed(args.seed)
    # ================================================
    # Disabled: pre-allocates ~85% of free GPU memory to reserve the card.
    # total, used = os.popen(
    #     'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader'
    # ).read().split('\n')[args.gpu].split(',')
    # total = int(total)
    # used = int(used)
    # print('Total GPU mem:', total, 'used:', used)
    # try:
    #     block_mem = 0.85 * (total - used)
    #     print(block_mem)
    #     x = torch.empty((256, 1024, int(block_mem))).cuda()
    #     del x
    # except RuntimeError as err:
    #     print(err)
    #     block_mem = 0.8 * (total - used)
    #     print(block_mem)
    #     x = torch.empty((256, 1024, int(block_mem))).cuda()
    #     del x
    #
    # print('reuse mem now ...')
    # ================================================
    logging.info(f'GPU device = {args.gpu}')
    logging.info(f'args = {args}')

    criterion = nn.CrossEntropyLoss().to(device)
    setting = args.location
    # 10 = number of CIFAR-10 classes.
    model = Network(args.init_ch, 10, args.layers, criterion, setting)

    # Optional resume: restore weights non-strictly and shrink the remaining
    # epoch budget by what was already trained.
    checkpoint = None
    previous_epochs = 0
    if args.checkpoint_path:
        checkpoint = torch.load(args.checkpoint_path)
        utils.load(model, checkpoint['state_dict'], False)
        previous_epochs = checkpoint['epoch']
        args.epochs -= previous_epochs
        if args.epochs <= 0:
            raise Exception('args.epochs is too small.')

    # `module` always refers to the raw Network, with or without the
    # DataParallel wrapper around `model`.
    if use_DataParallel:
        print('use Data Parallel')
        model = nn.parallel.DataParallel(model)
        model = model.cuda()
        module = model.module
        torch.cuda.manual_seed_all(args.seed)
    else:
        model = model.to(device)
        module = model

    param_size = utils.count_parameters_in_MB(model)
    logging.info(f'param size = {param_size}MB')

    # Split parameters: architecture/attention parameters are handled by the
    # Arch optimizer below; SGD only updates the remaining network weights.
    arch_and_attn_params = list(
        map(
            id,
            module.arch_and_attn_parameters()
            if use_DataParallel else model.arch_and_attn_parameters()))
    weight_params = filter(
        lambda p: id(p) not in arch_and_attn_params,
        module.parameters() if use_DataParallel else model.parameters())

    optimizer = optim.SGD(weight_params,
                          args.lr,
                          momentum=args.momentum,
                          weight_decay=args.wd)
    if checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])

    train_transform, valid_transform = utils._data_transforms_cifar10(args)
    train_data = dset.CIFAR10(root=args.data,
                              train=True,
                              download=True,
                              transform=train_transform)

    num_train = len(train_data)  # 50000
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))  # 25000

    # Both queues draw from the training set: the first `split` indices train
    # the weights, the rest act as the validation split for the architecture.
    train_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batchsz,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True,
        num_workers=8)  # from 2
    valid_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batchsz,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:]),
        pin_memory=True,
        num_workers=8)

    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                     args.epochs,
                                                     eta_min=args.lr_min)
    if checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler'])

    arch = Arch(model, criterion, args)
    if checkpoint:
        arch.optimizer.load_state_dict(checkpoint['arch_optimizer'])

    for epoch in tqdm(range(args.epochs), desc='Total Progress'):
        # NOTE(review): scheduler.step() before the epoch's training follows
        # the pre-1.1 PyTorch convention; newer versions expect it after
        # optimizer.step() — confirm against the pinned torch version.
        scheduler.step()
        lr = scheduler.get_lr()[0]
        logging.info(f'\nEpoch: {epoch} lr: {lr}')

        # Log the current genotype and the softmax-normalised architecture
        # weights (betas/gammas may be absent depending on `setting`).
        gen = module.genotype()
        logging.info(f'Genotype: {gen}')
        print(F.softmax(module.alphas_normal, dim=-1))
        print(F.softmax(module.alphas_reduce, dim=-1))
        if module.betas_normal is not None:
            print(F.softmax(module.betas_normal, dim=-1))
            print(F.softmax(module.betas_reduce, dim=-1))
        if module.gammas_normal is not None:
            print(F.softmax(module.gammas_normal, dim=-1))
            print(F.softmax(module.gammas_reduce, dim=-1))

        # training
        train_acc, train_obj = train(train_queue, valid_queue, model, arch,
                                     criterion, optimizer, lr, epoch + 1)
        logging.info(f'train acc: {train_acc}')

        # validation
        valid_acc, valid_obj = infer(valid_queue, model, criterion, epoch + 1)
        logging.info(f'valid acc: {valid_acc}')

        # Persist weights plus everything needed to resume this epoch later.
        utils.save(model, args.exp_path / 'search.pt')
        utils.save_checkpoint(
            {
                'epoch': epoch + 1 + previous_epochs,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'arch_optimizer': arch.optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            }, False, args.exp_path)

    # Final searched genotype is written as JSON next to the logs.
    gen = module.genotype()
    gen_path = args.exp_path / 'genotype.json'
    utils.save_genotype(gen, gen_path)
    logging.info(f'Result genotype: {gen}')
arch_optimizer = torch.optim.Adam(model.arch_parameters(), lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay) # Restore from a previous checkpoint, if initial_epoch is specified. # Horovod: restore on the first worker which will broadcast weights to other workers. if resume_from_epoch > 0 and hvd.rank() == 0: filepath = args.checkpoint_format.format(exp=args.save, epoch=resume_from_epoch) checkpoint = torch.load(filepath) model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, float(args.epochs), eta_min=args.learning_rate_min) architect = Architect(model, args) # model_path = "./search-EXP-final/weights.pt" # model.load_state_dict(torch.load(model_path)) start_time = time.time() if hvd.rank() == 0: logging.info("param size = %fMB", utils.count_parameters_in_MB(model)) for epoch in range(resume_from_epoch, args.epochs):
def main():
    """Latency-aware architecture search on ImageNet (1000 classes).

    Depends on module-level globals not visible here: ``config``, ``device``,
    ``logger``, ``writer``, ``SEARCH_SPACE``, ``ref_values``, ``Network``,
    ``LatencyLoss``, ``get_imagenet_torch``, ``load_model``, ``save_model``,
    ``warm_up``, ``train`` and ``infer`` — TODO(review): confirm.
    """
    start = time.time()
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    # Seed everything for reproducibility.
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.benchmark = True

    CLASSES = 1000  # ImageNet class count
    channels = SEARCH_SPACE['channel_size']
    strides = SEARCH_SPACE['strides']

    # Model
    model = Network(channels, strides, CLASSES)
    model = model.to(device)
    model = nn.DataParallel(model, device_ids=config.gpus)
    logger.info("param size = %fMB", utils.count_parameters_in_MB(model))
    config.world_size = 0

    # Reference latency for the target hardware, looked up by width
    # multiplier; None disables the latency reference.
    if config.target_hardware is None:
        config.ref_value = None
    else:
        config.ref_value = ref_values[config.target_hardware][
            '%.2f' % config.width_mult]

    # Loss: latency-augmented loss for architecture updates, plain CE for
    # weight updates.
    criterion = LatencyLoss(config, channels, strides).cuda(config.gpus)
    normal_critersion = nn.CrossEntropyLoss()

    # Disjoint parameter groups: architecture (alpha) vs. network weights.
    alpha_weight = model.module.arch_parameters()
    # weight = [param for param in model.parameters() if not utils.check_tensor_in_list(param, alpha_weight)]
    weight = model.module.weight_parameters()

    # Optimizer: SGD for weights, Adam for architecture parameters.
    w_optimizer = torch.optim.SGD(weight,
                                  config.w_lr,
                                  momentum=config.w_momentum,
                                  weight_decay=config.w_weight_decay)
    alpha_optimizer = torch.optim.Adam(alpha_weight,
                                       lr=config.alpha_lr,
                                       betas=(config.arch_adam_beta1,
                                              config.arch_adam_beta2),
                                       eps=config.arch_adam_eps,
                                       weight_decay=config.alpha_weight_decay)

    train_data = get_imagenet_torch(
        type='train',
        # image_dir="/googol/atlas/public/cv/ILSVRC/Data/"
        # use soft link `mkdir ./data/imagenet && ln -s /googol/atlas/public/cv/ILSVRC/Data/CLS-LOC/* ./data/imagenet/`
        image_dir=config.data_path + "/" + config.dataset.lower(),
        batch_size=config.batch_size,
        num_threads=config.workers,
        world_size=config.world_size,
        crop=224,
        device_id=0,
        num_gpus=len(config.gpus),
        portion=config.train_portion)
    valid_data = get_imagenet_torch(
        type='val',
        # image_dir="/googol/atlas/public/cv/ILSVRC/Data/"
        # use soft link `mkdir ./data/imagenet && ln -s /googol/atlas/public/cv/ILSVRC/Data/CLS-LOC/* ./data/imagenet/`
        image_dir=config.data_path + "/" + config.dataset.lower(),
        batch_size=config.batch_size,
        num_threads=config.workers,
        world_size=config.world_size,
        crop=224,
        device_id=0,
        num_gpus=len(config.gpus),
        portion=config.val_portion)

    best_top1 = 0.
    best_genotype = list()
    lr = 0
    config.start_epoch = -1
    config.warmup_epoch = 0
    config.warmup = True

    ### Resume form warmup model or train model ###
    # Try the full checkpoint first; on failure fall back to warmup weights.
    if config.resume:
        try:
            model_path = config.path + '/checkpoint.pth.tar'
            model, w_optimizer, alpha_optimizer = load_model(
                model,
                model_fname=model_path,
                optimizer=w_optimizer,
                arch_optimizer=alpha_optimizer)
        except Exception:
            warmup_path = config.path + '/warmup.pth.tar'
            if os.path.exists(warmup_path):
                print('load warmup weights')
                model, w_optimizer, alpha_optimizer = load_model(
                    model,
                    model_fname=warmup_path,
                    optimizer=w_optimizer,
                    arch_optimizer=alpha_optimizer)
            else:
                print('fail to load models')

    w_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        w_optimizer, float(config.epochs), eta_min=config.w_lr_min)

    # Weight-only warmup phase before the alternating search begins.
    if config.start_epoch < 0 and config.warm_up:
        for epoch in range(config.warmup_epoch, config.warmup_epochs):
            # warmup
            train_top1, train_loss = warm_up(train_data, valid_data, model,
                                             normal_critersion, criterion,
                                             w_optimizer, epoch, writer)
            config.start_epoch = epoch
    # NOTE(review): original indentation was lost in this chunk — the two
    # statements above/below are placed per apparent intent; confirm.

    update_schedule = utils.get_update_schedule_grad(len(train_data), config)

    for epoch in range(config.start_epoch + 1, config.epochs):
        # Cosine schedule only kicks in after the warmup epochs.
        if epoch > config.warmup_epochs:
            w_scheduler.step()
        lr = w_scheduler.get_lr()[0]
        logger.info('epoch %d lr %e', epoch, lr)

        # training
        train_top1, train_loss = train(train_data, valid_data, model,
                                       normal_critersion, criterion,
                                       w_optimizer, alpha_optimizer, lr,
                                       epoch, writer, update_schedule)
        logger.info('Train top1 %f', train_top1)

        # validation: full inference only every 10 epochs; otherwise reuse
        # the training top-1 as a cheap proxy.
        top1 = train_top1
        if epoch % 10 == 0:
            top1, loss = infer(valid_data, model, epoch, criterion,
                               normal_critersion, writer)
            logger.info('valid top1 %f', top1)

        genotype = model.module.genotype()
        logger.info("genotype = {}".format(genotype))

        # save
        if best_top1 < top1:
            best_top1 = top1
            best_genotype = genotype
            is_best = True
        else:
            is_best = False
        save_model(model, {
            'warmup': False,
            'epoch': epoch,
            'w_optimizer': w_optimizer.state_dict(),
            'alpha_optimizer': alpha_optimizer.state_dict(),
            'state_dict': model.state_dict()
        },
                   is_best=is_best)

    utils.time(time.time() - start)
    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
    logger.info("Best Genotype = {}".format(best_genotype))
def main():
    """Binary-weight architecture search on CIFAR-10.

    Uses module-level globals not visible here: ``args``, ``CIFAR_CLASSES``,
    ``Network``, ``Architect``, ``bin_utils_search``, ``utils``, ``train``
    and ``infer`` — TODO(review): confirm against the rest of the file.
    Appends the best genotypes to ./genotypes.py on completion.
    """
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    # Seed numpy and torch (CPU + CUDA) for reproducibility.
    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info("args = %s", args)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion)
    model = model.cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    train_transform, valid_transform = utils._data_transforms_cifar10(args)
    train_data = dset.CIFAR10(root=args.data,
                              train=True,
                              download=True,
                              transform=train_transform)

    # Split the training set: first portion trains weights, the remainder is
    # the validation split used for architecture updates.
    num_train = len(train_data)
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))

    train_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True,
        num_workers=2)
    valid_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(
            indices[split:num_train]),
        pin_memory=True,
        num_workers=2)

    # Held-out CIFAR-10 test set (built here but consumed elsewhere, if at
    # all, within this function's visible body).
    test_data = dset.CIFAR10(root=args.data,
                             train=False,
                             download=True,
                             transform=valid_transform)
    test_queue = torch.utils.data.DataLoader(test_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=2)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    architect = Architect(model, args)
    # Binarization helper for the model's weights.
    bin_op = bin_utils_search.BinOp(model, args)

    best_acc = 0.
    best_genotypes = []
    for epoch in range(args.epochs):
        # NOTE(review): scheduler.step() before training is the pre-1.1
        # PyTorch ordering — confirm against the pinned torch version.
        scheduler.step()
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)

        # Two genotypes: the default one and one derived with args.gamma
        # (presumably an image-specific variant — TODO confirm).
        genotype = model.genotype()
        genotype_img = model.genotype(args.gamma)
        logging.info('genotype = %s', genotype)
        logging.info(F.softmax(model.alphas_normal, dim=-1))
        logging.info(F.softmax(model.alphas_reduce, dim=-1))

        # training
        train_acc, train_obj = train(train_queue, valid_queue, model,
                                     architect, criterion, optimizer, lr,
                                     bin_op, epoch)
        logging.info('train_acc %f', train_acc)

        # validation
        valid_acc, valid_obj = infer(valid_queue, model, criterion, bin_op)
        logging.info('valid_acc %f', valid_acc)

        # Track the genotype pair of the best validation epoch so far.
        if best_acc < valid_acc:
            best_acc = valid_acc
            if len(best_genotypes) > 0:
                best_genotypes[0] = genotype
                best_genotypes[1] = genotype_img
            else:
                best_genotypes.append(genotype)
                best_genotypes.append(genotype_img)

        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'arch_param': model.arch_parameters(),
                'val_acc': valid_acc,
                'optimizer': optimizer.state_dict(),
            }, False, args.save)

    # Append the winning genotypes as named constants to genotypes.py.
    with open('./genotypes.py', 'a') as f:
        f.write(args.geno_name + ' = ' + str(best_genotypes[0]) + '\n')
        f.write(args.geno_name + '_img' + ' = ' + str(best_genotypes[1]) +
                '\n')
#scheduler.step() lr = scheduler.get_lr()[0] # STAGE 1 start = time.time() for epoch in range(args.epochs): ## Training the whole population logging.info("[INFO] Generation {} training with learning rate {}".format( epoch + 1, scheduler.get_lr()[0])) start_time = time.time() train(model, train_queue, criterion, optimizer, epoch + 1) logging.info("[INFO] Training finished in {} minutes".format( (time.time() - start_time) / 60)) torch.save(model.state_dict(), "model.pt") #lr = scheduler.get_lr()[0] scheduler.step() logging.info("[INFO] Evaluating Generation {} ".format(epoch + 1)) validation(model, valid_queue, criterion, epoch + 1) population.pop_sort() for i, p in enumerate(population.get_population()): writer.add_scalar("pop_top1_{}".format(i + 1), p.get_fitness(), epoch + 1) writer.add_scalar("pop_top5_{}".format(i + 1), p.top5.avg, epoch + 1) writer.add_scalar("pop_obj_valid_{}".format(i + 1), p.objs.avg, epoch + 1) with open(os.path.join(DIR, "population_{}.pickle".format(epoch + 1)),
def main():
    """Train a fixed Network on CIFAR-10/100 with DataParallel.

    Depends on module-level globals not visible here: ``args``,
    ``CIFAR_CLASSES``, ``Network``, ``Logger``, ``utils``, ``PATH``,
    ``train`` and ``infer`` — TODO(review): confirm; in particular ``PATH``
    (checkpoint destination) is used below but not defined in this view.
    """
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    # Seed and device setup for reproducibility on a single chosen GPU.
    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    start_epoch = 1
    model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion)
    # utils.print_model_param_nums(model)
    model = model.cuda()
    model = nn.DataParallel(model)
    logger = Logger('./logs')
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Dataset selection: CIFAR-100 when requested, CIFAR-10 otherwise.
    train_transform, valid_transform = utils._data_transforms_cifar10(args)
    if args.set == 'cifar100':
        train_data = dset.CIFAR100(root=args.data,
                                   train=True,
                                   download=True,
                                   transform=train_transform)
        valid_data = dset.CIFAR100(root=args.data,
                                   train=False,
                                   download=True,
                                   transform=valid_transform)
    else:
        train_data = dset.CIFAR10(root=args.data,
                                  train=True,
                                  download=True,
                                  transform=train_transform)
        valid_data = dset.CIFAR10(root=args.data,
                                  train=False,
                                  download=True,
                                  transform=valid_transform)

    # num_train = len(train_data)
    # indices = list(range(num_train))
    # split = int(np.floor(args.train_portion * num_train))

    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              pin_memory=True,
                                              num_workers=2)
    valid_queue = torch.utils.data.DataLoader(valid_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=2)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)
    # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,0.9)
    # architect = Architect(model, args)

    for epoch in range(args.epochs):
        # NOTE(review): scheduler.step() before training is the pre-1.1
        # PyTorch ordering — confirm against the pinned torch version.
        scheduler.step()
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)
        # genotype = model.genotype()
        # logging.info('genotype = %s', genotype)
        #print(F.softmax(model.alphas_normal, dim=-1))
        #print(F.softmax(model.alphas_reduce, dim=-1))

        # training
        train_acc, train_obj = train(train_queue, model, criterion, optimizer,
                                     logger)
        logging.info('train_acc %f', train_acc)

        # validation: only evaluated on the final epoch.
        if args.epochs - epoch <= 1:
            valid_acc, valid_obj = infer(valid_queue, model, criterion, epoch,
                                         logger)
            logging.info('valid_acc %f', valid_acc)

        # Checkpoint model + optimizer state every epoch.
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(checkpoint, PATH)
        utils.save(model, os.path.join(args.save, 'weights.pt'))
def train():
    """Search a person re-identification network (CSNet/CDNet-style).

    Alternates weight updates (train split) with architecture updates
    (val split) each batch, periodically evaluates Rank-1/mAP on the test
    split, checkpoints the model plus its parsed genotype, and appends the
    run's metrics to an ``*.xlsx`` sheet.

    Depends on module-level globals not visible here: ``cfg``,
    ``darts_make_data_loader``, ``Network``, ``make_optimizer``,
    ``make_lr_scheduler``, ``darts_make_loss``, ``compute_loss_acc``,
    ``count_parameters``, ``RunningAverageMeter``, ``AverageMeter``,
    ``R1_mAP`` and ``xl`` (openpyxl) — TODO(review): confirm.
    """
    use_gpu = cfg.MODEL.DEVICE == "cuda"

    # 1. make dataloader
    train_loader, val_loader, test_loader, num_query, num_class = darts_make_data_loader(
        cfg)
    # print(num_query, num_class)

    # 2. make model
    model = Network(num_class, cfg)
    # tensor = torch.randn(2, 3, 256, 128)
    # res = model(tensor)
    # print(res[0].size()) [2, 751]

    # 3. make optimizer — separate optimizers for weights and architecture.
    optimizer = make_optimizer(cfg, model)
    arch_optimizer = torch.optim.Adam(
        model._arch_parameters(),
        lr=cfg.SOLVER.ARCH_LR,
        betas=(0.5, 0.999),
        weight_decay=cfg.SOLVER.ARCH_WEIGHT_DECAY)

    # 4. make lr scheduler
    lr_scheduler = make_lr_scheduler(cfg, optimizer)
    # Architecture LR decays 10x at epochs 80 and 160.
    arch_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        arch_optimizer, [80, 160], 0.1)

    # 5. make loss
    loss_fn = darts_make_loss(cfg)
    # model._set_loss(loss_fn, compute_loss_acc)

    # 6. make architect
    # architect = Architect(model, cfg)

    # get parameters from the config.
    device = cfg.MODEL.DEVICE
    use_gpu = device == "cuda"
    pretrained = cfg.MODEL.PRETRAINED != ""
    log_period = cfg.OUTPUT.LOG_PERIOD
    ckpt_period = cfg.OUTPUT.CKPT_PERIOD
    eval_period = cfg.OUTPUT.EVAL_PERIOD
    output_dir = cfg.OUTPUT.DIRS
    ckpt_save_path = output_dir + cfg.OUTPUT.CKPT_DIRS
    epochs = cfg.SOLVER.MAX_EPOCHS
    batch_size = cfg.SOLVER.BATCH_SIZE
    grad_clip = cfg.SOLVER.GRAD_CLIP

    batch_num = len(train_loader)
    log_iters = batch_num // log_period  # log this many times per epoch

    if not os.path.exists(ckpt_save_path):
        os.makedirs(ckpt_save_path)

    # create *_result.xlsx
    # save the result for analyze
    name = (cfg.OUTPUT.LOG_NAME).split(".")[0] + ".xlsx"
    result_path = cfg.OUTPUT.DIRS + name
    wb = xl.Workbook()
    sheet = wb.worksheets[0]
    # Metric columns repeat once per check epoch that gets recorded.
    titles = [
        'size/M', 'speed/ms', 'final_planes', 'acc', 'mAP', 'r1', 'r5', 'r10',
        'loss', 'acc', 'mAP', 'r1', 'r5', 'r10', 'loss', 'acc', 'mAP', 'r1',
        'r5', 'r10', 'loss'
    ]
    sheet.append(titles)
    check_epochs = [40, 80, 120, 160, 200, 240, 280, 320, 360, epochs]
    values = []

    logger = logging.getLogger("CSNet_Search.train")
    size = count_parameters(model)
    values.append(format(size, '.2f'))
    values.append(model.final_planes)

    logger.info("the param number of the model is {:.2f} M".format(size))
    logger.info("Starting Search CDNetwork")

    best_mAP, best_r1 = 0., 0.
    is_best = False
    avg_loss, avg_acc = RunningAverageMeter(), RunningAverageMeter()
    avg_time, global_avg_time = AverageMeter(), AverageMeter()

    if use_gpu:
        model = model.to(device)

    # Weight init: resume from a self-pretrained checkpoint, else Kaiming.
    if pretrained:
        logger.info("load self pretrained chekpoint to init")
        model.load_pretrained_model(cfg.MODEL.PRETRAINED)
    else:
        logger.info("use kaiming init to init the model")
        model.kaiming_init_()
    # exit(1)

    for epoch in range(epochs):
        # Anneal the Gumbel temperature linearly from TAU_MAX to TAU_MIN.
        model.set_tau(cfg.MODEL.TAU_MAX - (cfg.MODEL.TAU_MAX -
                                           cfg.MODEL.TAU_MIN) * epoch /
                      (epochs - 1))
        lr_scheduler.step()
        lr = lr_scheduler.get_lr()[0]
        # architect lr.step
        arch_lr_scheduler.step()

        # if save epoch_num k, then run k+1 epoch next
        # (skip epochs already covered by the loaded checkpoint)
        if pretrained and epoch < model.start_epoch:
            continue
        # print(epoch)
        # exit(1)

        model.train()
        avg_loss.reset()
        avg_acc.reset()
        avg_time.reset()

        for i, batch in enumerate(train_loader):
            t0 = time.time()
            imgs, labels = batch
            # Draw a validation batch for the architecture update.
            # NOTE(review): next(iter(val_loader)) rebuilds the iterator each
            # step, so it may resample the same leading batches — confirm
            # this is intended.
            val_imgs, val_labels = next(iter(val_loader))
            if use_gpu:
                imgs = imgs.to(device)
                labels = labels.to(device)
                val_imgs = val_imgs.to(device)
                val_labels = val_labels.to(device)

            # 1. update the weights on the training batch
            optimizer.zero_grad()
            res = model(imgs)
            # loss = loss_fn(scores, feats, labels)
            loss, acc = compute_loss_acc(res, labels, loss_fn)
            loss.backward()
            if grad_clip != 0:
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()

            # 2. update the alpha (architecture) on the validation batch
            arch_optimizer.zero_grad()
            res = model(val_imgs)
            val_loss, val_acc = compute_loss_acc(res, val_labels, loss_fn)
            val_loss.backward()
            arch_optimizer.step()

            # compute the acc
            # acc = (scores.max(1)[1] == labels).float().mean()
            t1 = time.time()
            avg_time.update((t1 - t0) / batch_size)
            avg_loss.update(loss)
            avg_acc.update(acc)

            # log info
            if (i + 1) % log_iters == 0:
                logger.info(
                    "epoch {}: {}/{} with loss is {:.5f} and acc is {:.3f}".
                    format(epoch + 1, i + 1, batch_num, avg_loss.avg,
                           avg_acc.avg))

        logger.info(
            "end epochs {}/{} with lr: {:.5f} and avg_time is: {:.3f} ms".
            format(epoch + 1, epochs, lr, avg_time.avg * 1000))
        global_avg_time.update(avg_time.avg)

        # test the model: full retrieval evaluation at eval/check epochs.
        if (epoch + 1) % eval_period == 0 or (epoch + 1) in check_epochs:
            model.eval()
            metrics = R1_mAP(num_query, use_gpu=use_gpu)

            with torch.no_grad():
                for vi, batch in enumerate(test_loader):
                    # break
                    # print(len(batch))
                    imgs, labels, camids = batch
                    if use_gpu:
                        imgs = imgs.to(device)
                    feats = model(imgs)
                    metrics.update((feats, labels, camids))

                #compute cmc and mAP
                cmc, mAP = metrics.compute()
                logger.info("validation results at epoch {}".format(epoch +
                                                                    1))
                logger.info("mAP:{:2%}".format(mAP))
                for r in [1, 5, 10]:
                    logger.info("CMC curve, Rank-{:<3}:{:.2%}".format(
                        r, cmc[r - 1]))

                # determine whether current model is the best
                if mAP > best_mAP:
                    is_best = True
                    best_mAP = mAP
                    logger.info("Get a new best mAP")
                if cmc[0] > best_r1:
                    is_best = True
                    best_r1 = cmc[0]
                    logger.info("Get a new best r1")

                # add the result to sheet
                if (epoch + 1) in check_epochs:
                    val = [avg_acc.avg, mAP, cmc[0], cmc[4], cmc[9]]
                    change = [format(v * 100, '.2f') for v in val]
                    change.append(format(avg_loss.avg, '.3f'))
                    values.extend(change)

        # whether to save the model (periodic checkpoint, plus best snapshot)
        if (epoch + 1) % ckpt_period == 0 or is_best:
            torch.save(model.state_dict(),
                       ckpt_save_path + "checkpoint_{}.pth".format(epoch + 1))
            model._parse_genotype(file=ckpt_save_path +
                                  "genotype_{}.json".format(epoch + 1))
            logger.info("checkpoint {} was saved".format(epoch + 1))

            if is_best:
                torch.save(model.state_dict(),
                           ckpt_save_path + "best_ckpt.pth")
                model._parse_genotype(file=ckpt_save_path +
                                      "best_genotype.json")
                logger.info("best_checkpoint was saved")
            is_best = False
        # exit(1)

    # Prepend timing, flush the metrics row and persist the workbook.
    values.insert(1, format(global_avg_time.avg * 1000, '.2f'))
    sheet.append(values)
    wb.save(result_path)
    logger.info("Ending Search GDAS_Search")
####MAIN 함수 val_acc_top5 = [] val_acc_top1 = [] for epoch in range(opt.epochs): np.random.seed(2) torch.cuda.manual_seed(2) # training train_acc_top1, train_acc_top5 , train_valoss,train_poloss = train(train_queue, valid_queue, model,criterion, optimizer_arch,optimizer_model,opt.arch_learning_rate,opt.lr_model) # validation valid_acc_top1,valid_acc_top5, valid_valoss = infer(valid_queue, model, criterion) f.write("%5.5f "% train_acc_top1) f.write("%5.5f "% train_acc_top5) f.write("%5.5f "% train_valoss) f.write("%5.5f "% train_poloss ) f.write("%5.5f "% valid_acc_top1 ) f.write("%5.5f "% valid_acc_top5 ) f.write("%5.5f "% valid_valoss ) f.write("\n") print("epoch : " , epoch , "Train_Acc_Top1 : " , train_acc_top1 , "Train_value_loss : ",train_valoss,"Train_policy : " , train_poloss ) print("epoch : " , epoch, "Val_Acc_Top1 : " , valid_acc_top1 , "Val_Acc_Top5 : " , valid_acc_top5,"Loss : " , valid_valoss) torch.save(model.state_dict(),'weights.pt') f.close()