def eval_tta(config, augment, reporter):
    """Evaluate one candidate augmentation policy on a held-out split.

    Loads a pretrained point-cloud classifier, applies the policy decoded
    from ``augment``, and scores it with a combination of an EMD
    reconstruction loss and a cross-entropy loss against an all-ones label.

    Args:
        config: configuration dict installed into the global ``C`` singleton.
        augment: search-space sample; must contain 'cv_ratio_test', 'cv_fold',
            'save_path', 'num_policy', 'num_op', 'dataroot' and the policy
            parameters consumed by ``policy_decoder``.
        reporter: callback receiving the final metrics (e.g. a ray-tune reporter).

    Returns:
        The accumulated ``minus_loss`` metric (higher is better).
    """
    C.get()
    C.get().conf = config
    cv_ratio_test, cv_fold, save_path = augment['cv_ratio_test'], augment[
        'cv_fold'], augment['save_path']

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'], augment['num_op'])

    # eval: restore the pretrained checkpoint (either wrapped in a dict or raw state_dict)
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path + '.pth')
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    model = nn.DataParallel(model).cuda()
    model.eval()

    src_loaders = []  # NOTE(review): unused — likely leftover from the commented-out multi-policy loop below
    # for _ in range(augment['num_policy']):
    # NOTE(review): ``cv_num`` is not defined in this function — presumably a
    # module-level global; confirm it exists at import scope or this raises NameError.
    _, src_tl, src_validloader, src_ttl = get_dataloaders(
        C.get()['dataset'], C.get()['batch'], augment['dataroot'],
        cv_ratio_test, cv_num, split_idx=cv_fold, target=False,
        random_range=C.get()['args'].random_range)
    del src_tl, src_ttl

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    emd_loss = nn.DataParallel(emdModule()).cuda()
    losses = []
    corrects = []
    for data in src_validloader:
        with torch.no_grad():
            point_cloud = data['point_cloud'].cuda()
            # labels forced to 1 — presumably a fixed "target domain" class; verify against caller
            label = torch.ones_like(data['label'], dtype=torch.int64).cuda()
            trans_pc = data['transformed']
            pred = model(trans_pc)
            if C.get()['args'].use_emd_false:
                # Earth Mover's Distance between original and transformed clouds,
                # scaled by emd_coeff (emdModule expects (B, N, 3) inputs — TODO confirm)
                loss_emd = (torch.mean(emd_loss(point_cloud.permute(0, 2, 1),
                                                trans_pc.permute(0, 2, 1), 0.05, 3000)[0])).unsqueeze(0) \
                    * C.get()['args'].emd_coeff
            else:
                # NOTE(review): this tensor lives on CPU; adding it to the CUDA
                # cross-entropy below would normally error — confirm the flag combinations used.
                loss_emd = torch.tensor([0.0])
            if C.get()['args'].no_dc:
                loss = loss_emd
            else:
                loss = loss_emd + loss_fn(pred, label)
            # print(loss)
            losses.append(loss.detach().cpu().numpy())
            pred = pred.max(dim=1)[1]
            pred = pred.t()
            correct = float(
                torch.sum(pred == label).item()) / pred.size(0) * 100
            corrects.append(correct)
            del loss, correct, pred, data, label, loss_emd

    losses = np.concatenate(losses)
    losses_min = np.min(losses, axis=0).squeeze()
    corrects_max = max(corrects)
    metrics.add_dict({
        'minus_loss': -1 * np.sum(losses_min),
        'correct': np.sum(corrects_max),
        # 'cnt': len(corrects_max)
    })
    del corrects, corrects_max

    del model
    # metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    # print(metrics)
    reporter(minus_loss=metrics['minus_loss'], top1_valid=metrics['correct'], elapsed_time=gpu_secs, done=True)
    return metrics['minus_loss']
def train_and_eval(tag, dataroot, metric='last', save_path=None, only_eval=False, unsupervised=False, mode=None):
    """Train a classifier (optionally with an unsupervised/UDA loss) and evaluate it.

    Args:
        tag: experiment tag used for the tensorboard log directory; empty
            string disables tensorboard.
        dataroot: dataset root directory.
        metric: 'last' keeps the final epoch's results; otherwise results are
            kept from the epoch minimizing ``rs[metric]['loss']``.
        save_path: checkpoint path; if it exists, training resumes from it.
        only_eval: run a single test pass and return (no training).
        unsupervised: enable the unsupervised loss in ``run_epoch``.
        mode: forwarded to ``get_dataloaders``.

    Returns:
        OrderedDict with 'loss_*', 'top1_*', 'top5_*' per split and 'epoch'.
    """
    max_epoch = C.get()['epoch']
    trainloader, unsuploader, testloader = get_dataloaders(
        C.get()['dataset'], C.get()['batch'], C.get()['batch_unsup'],
        dataroot, mode=mode, n_labeled=args.n_labeled)

    # create a model & an optimizer
    model = get_model(C.get()['model'], num_class(C.get()['dataset']), data_parallel=True)
    criterion = nn.CrossEntropyLoss()
    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=C.get()['lr'],
                              momentum=C.get()['optimizer'].get(
                                  'momentum', 0.9),
                              weight_decay=C.get()['optimizer']['decay'],
                              nesterov=C.get()['optimizer']['nesterov'])
    else:
        raise ValueError('invalid optimizer type=%s' % C.get()['optimizer']['type'])

    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        # shorten the cosine horizon by the warmup epochs so warmup + cosine spans max_epoch
        t_max = C.get()['epoch']
        if C.get()['lr_schedule'].get('warmup', None):
            t_max -= C.get()['lr_schedule']['warmup']['epoch']
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max, eta_min=0.)
    else:
        raise ValueError('invalid lr_schduler=%s' % lr_scheduler_type)
    if C.get()['lr_schedule'].get('warmup', None):
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler)

    # pick a real or dummy SummaryWriter depending on whether a tag was given
    if not tag.strip():
        from metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [
        SummaryWriter(logdir='./logs/%s/%s' % (tag, x))
        for x in ['train', 'test']
    ]

    result = OrderedDict()
    epoch_start = 1
    if save_path and os.path.exists(save_path):
        data = torch.load(save_path)
        model.load_state_dict(data['model'])
        optimizer.load_state_dict(data['optimizer'])
        epoch_start = data['epoch']

    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['test'] = run_epoch(model, testloader, unsuploader, criterion, None,
                               desc_default='*test', epoch=epoch_start,
                               writer=writers[1])
        # NOTE(review): only 'test' is populated here but the loop below also
        # reads rs['train'] — this branch looks like it would raise KeyError; confirm.
        for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'test']):
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    global best_valid_top1  # module-level running best, shared across calls
    best_valid_loss = 10e10
    for epoch in range(epoch_start, max_epoch + 1):
        model.train()
        rs = dict()
        if args.train_mode == 'small':
            # 'small' mode: supervised-only training regardless of the unsupervised flag
            print(f'only small')
            rs['train'] = run_epoch(model, trainloader, unsuploader, criterion, optimizer,
                                    desc_default='train', epoch=epoch, writer=writers[0],
                                    verbose=True, unsupervised=False, scheduler=scheduler)
        else:
            rs['train'] = run_epoch(model, trainloader, unsuploader, criterion, optimizer,
                                    desc_default='train', epoch=epoch, writer=writers[0],
                                    verbose=True, unsupervised=unsupervised, scheduler=scheduler)
        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        model.eval()
        # evaluate every 10 epochs on cifar-like datasets, every 30 otherwise, plus the final epoch
        if epoch % (10 if 'cifar' in C.get()['dataset'] else 30) == 0 or epoch == max_epoch:
            rs['test'] = run_epoch(model, testloader, unsuploader, criterion, None,
                                   desc_default='*test', epoch=epoch, writer=writers[1],
                                   verbose=True)
            if best_valid_top1 < rs['test']['top1']:
                best_valid_top1 = rs['test']['top1']

            if metric == 'last' or rs[metric]['loss'] < best_valid_loss:  # TODO
                if metric != 'last':
                    best_valid_loss = rs[metric]['loss']
                for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch
            writers[1].add_scalar('test_top1/best', rs['test']['top1'], epoch)

            # save checkpoint
            if save_path:
                logger.info('save model@%d to %s' % (epoch, save_path))
                torch.save(
                    {
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'model': model.state_dict()
                    }, save_path)

    del model
    return result
def train_and_eval(config, tag, dataroot, test_ratio=0.0, cv_fold=0, reporter=None, metric='last',
                   save_path=None, only_eval=False, local_rank=0, evaluation_interval=5):
    """Train a classifier (optionally multi-node via NCCL) and evaluate it.

    Args:
        config: configuration dict ('batch', 'epoch', 'lr', 'model', 'dataset',
            'optimizer', 'lr_schedule', optional 'mixup'/'lb_smooth').
        tag: experiment tag for tensorboard log dirs; falsy disables tensorboard.
        dataroot: dataset root directory.
        test_ratio: validation split ratio forwarded to ``get_dataloaders``.
        cv_fold: cross-validation fold index.
        reporter: optional callback receiving valid/test metrics each eval.
        metric: 'last' keeps final results; otherwise keep the epoch maximizing
            ``rs[metric]['top1']``.
        save_path: checkpoint path; loaded if it exists (unless 'test.pth'),
            bootstrapped from the torchvision ResNet-50 weights if it doesn't.
        only_eval: run evaluation passes and return without training.
        local_rank: distributed local rank; negative means single-process.
        evaluation_interval: evaluate every N epochs (master only).

    Returns:
        OrderedDict with per-split 'loss_*', 'top1_*', 'top5_*', 'epoch' and
        'top1_test' set to the best test top-1 seen.
    """
    # ckpt = torch.load(save_path)
    total_batch = config["batch"]
    if local_rank >= 0:
        dist.init_process_group(backend='nccl', init_method='env://',
                                world_size=int(os.environ['WORLD_SIZE']))
        device = torch.device('cuda', local_rank)
        torch.cuda.set_device(device)

        # BUG FIX: original read ``config()['lr']`` — ``config`` is a dict and is
        # not callable, so the multi-node path raised TypeError before scaling LR.
        config['lr'] *= dist.get_world_size()
        logger.info(f'local batch={config["batch"]} world_size={dist.get_world_size()} ----> total batch={config["batch"] * dist.get_world_size()}')
        total_batch = config["batch"] * dist.get_world_size()

    is_master = local_rank < 0 or dist.get_rank() == 0
    if is_master:
        add_filehandler(logger, 'master' + '.log')

    if not reporter:
        reporter = lambda **kwargs: 0

    max_epoch = config['epoch']
    trainsampler, trainloader, validloader, testloader_ = get_dataloaders(
        config['dataset'], config['batch'], dataroot, test_ratio,
        split_idx=cv_fold, multinode=(local_rank >= 0))

    # create a model & an optimizer (plus an EMA shadow copy kept on a single device)
    model = get_model(config['model'], num_class(config['dataset']), local_rank=local_rank)
    model_ema = get_model(config['model'], num_class(config['dataset']), local_rank=-1)
    model_ema.eval()

    criterion_ce = criterion = CrossEntropyLabelSmooth(num_class(config['dataset']), config.get('lb_smooth', 0))
    if config.get('mixup', 0.0) > 0.0:
        criterion = CrossEntropyMixUpLabelSmooth(num_class(config['dataset']), config.get('lb_smooth', 0))
    if config['optimizer']['type'] == 'sgd':
        # weight decay is applied manually inside run_epoch (wd= below), hence 0.0 here
        optimizer = optim.SGD(
            model.parameters(),
            lr=config['lr'],
            momentum=config['optimizer'].get('momentum', 0.9),
            weight_decay=0.0,
            nesterov=config['optimizer'].get('nesterov', True)
        )
    elif config['optimizer']['type'] == 'rmsprop':
        optimizer = RMSpropTF(
            model.parameters(),
            lr=config['lr'],
            weight_decay=0.0,
            alpha=0.9, momentum=0.9,
            eps=0.001
        )
    else:
        raise ValueError('invalid optimizer type=%s' % config['optimizer']['type'])

    lr_scheduler_type = config['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config['epoch'], eta_min=0.)
    elif lr_scheduler_type == 'resnet':
        scheduler = adjust_learning_rate_resnet(optimizer)
    elif lr_scheduler_type == 'efficientnet':
        # EfficientNet schedule: decay by 0.97 every 2.4 epochs (offset by warmup)
        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lr_lambda=lambda x: 0.97 ** int((x + config['lr_schedule']['warmup']['epoch']) / 2.4))
    else:
        raise ValueError('invalid lr_schduler=%s' % lr_scheduler_type)

    if config['lr_schedule'].get('warmup', None) and config['lr_schedule']['warmup']['epoch'] > 0:
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=config['lr_schedule']['warmup']['multiplier'],
            total_epoch=config['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler
        )

    # non-master ranks always get the dummy writer so only rank 0 logs to tensorboard
    if not tag or not is_master:
        from FastAutoAugment.metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [SummaryWriter(log_dir='./logs/%s/%s' % (tag, x)) for x in ['train', 'valid', 'test']]

    if config['optimizer']['ema'] > 0.0 and is_master:
        # https://discuss.pytorch.org/t/how-to-apply-exponential-moving-average-decay-for-variables/10856/4?u=ildoonet
        ema = EMA(config['optimizer']['ema'])
    else:
        ema = None

    result = OrderedDict()
    epoch_start = 1
    #TODO: change only eval=False when without save_path ??
    if save_path != 'test.pth':     # and is_master: --> should load all data(not able to be broadcasted)
        if save_path and not os.path.exists(save_path):
            # bootstrap the checkpoint from torchvision's ImageNet-pretrained ResNet-50
            import torch.utils.model_zoo as model_zoo
            data = model_zoo.load_url('https://download.pytorch.org/models/resnet50-19c8e357.pth',
                                      model_dir=os.path.join(os.getcwd(), 'FastAutoAugment/models'))
            if config['dataset'] == 'cifar10':
                # classifier head has a different number of classes — drop it
                data.pop('fc.weight')
                data.pop('fc.bias')
            model_dict = model.state_dict()
            model_dict.update(data)
            model.load_state_dict(model_dict)
            torch.save(model_dict, save_path)
        logger.info('%s file found. loading...' % save_path)
        data = torch.load(save_path)
        key = 'model' if 'model' in data else 'state_dict'

        if 'epoch' not in data:
            # raw state_dict checkpoint
            model.load_state_dict(data)
        else:
            logger.info('checkpoint epoch@%d' % data['epoch'])
            # reconcile 'module.' prefixes between (non-)DataParallel checkpoints and models
            if not isinstance(model, (DataParallel, DistributedDataParallel)):
                model.load_state_dict({k.replace('module.', ''): v for k, v in data[key].items()})
            else:
                model.load_state_dict({k if 'module.' in k else 'module.' + k: v for k, v in data[key].items()})
            logger.info('optimizer.load_state_dict+')
            optimizer.load_state_dict(data['optimizer'])
            if data['epoch'] < config['epoch']:
                epoch_start = data['epoch']
            else:
                only_eval = True  # checkpoint already finished training
            if ema is not None:
                ema.shadow = data.get('ema', {}) if isinstance(data.get('ema', {}), dict) else data['ema'].state_dict()
        del data

    if local_rank >= 0:
        # sync initial weights from rank 0 to all workers
        for name, x in model.state_dict().items():
            dist.broadcast(x, 0)
        logger.info(f'multinode init. local_rank={dist.get_rank()} is_master={is_master}')
        torch.cuda.synchronize()

    tqdm_disabled = bool(os.environ.get('TASK_NAME', '')) and local_rank != 0  # KakaoBrain Environment

    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['train'] = run_epoch(config, model, trainloader, criterion, None,
                                desc_default='train', epoch=0, writer=writers[0], is_master=is_master)

        with torch.no_grad():
            rs['valid'] = run_epoch(config, model, validloader, criterion, None,
                                    desc_default='valid', epoch=0, writer=writers[1], is_master=is_master)
            rs['test'] = run_epoch(config, model, testloader_, criterion, None,
                                   desc_default='*test', epoch=0, writer=writers[2], is_master=is_master)
            if ema is not None and len(ema) > 0:
                # EMA results overwrite the plain-model valid/test results
                model_ema.load_state_dict({k.replace('module.', ''): v for k, v in ema.state_dict().items()})
                rs['valid'] = run_epoch(config, model_ema, validloader, criterion_ce, None,
                                        desc_default='valid(EMA)', epoch=0, writer=writers[1],
                                        verbose=is_master, tqdm_disabled=tqdm_disabled)
                rs['test'] = run_epoch(config, model_ema, testloader_, criterion_ce, None,
                                       desc_default='*test(EMA)', epoch=0, writer=writers[2],
                                       verbose=is_master, tqdm_disabled=tqdm_disabled)
        for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
            if setname not in rs:
                continue
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    best_top1 = 0
    for epoch in range(epoch_start, max_epoch + 1):
        if local_rank >= 0:
            trainsampler.set_epoch(epoch)

        model.train()
        rs = dict()
        rs['train'] = run_epoch(config, model, trainloader, criterion, optimizer,
                                desc_default='train', epoch=epoch, writer=writers[0],
                                verbose=(is_master and local_rank <= 0), scheduler=scheduler,
                                ema=ema, wd=config['optimizer']['decay'], tqdm_disabled=tqdm_disabled)
        model.eval()

        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        if ema is not None and config['optimizer']['ema_interval'] > 0 and epoch % config['optimizer']['ema_interval'] == 0:
            # periodically push EMA weights into the live model and re-broadcast
            logger.info(f'ema synced+ rank={dist.get_rank()}')
            if ema is not None:
                model.load_state_dict(ema.state_dict())
            for name, x in model.state_dict().items():
                # print(name)
                dist.broadcast(x, 0)
            torch.cuda.synchronize()
            logger.info(f'ema synced- rank={dist.get_rank()}')

        if is_master and (epoch % evaluation_interval == 0 or epoch == max_epoch):
            with torch.no_grad():
                rs['valid'] = run_epoch(config, model, validloader, criterion_ce, None,
                                        desc_default='valid', epoch=epoch, writer=writers[1],
                                        verbose=is_master, tqdm_disabled=tqdm_disabled)
                rs['test'] = run_epoch(config, model, testloader_, criterion_ce, None,
                                       desc_default='*test', epoch=epoch, writer=writers[2],
                                       verbose=is_master, tqdm_disabled=tqdm_disabled)

                if ema is not None:
                    model_ema.load_state_dict({k.replace('module.', ''): v for k, v in ema.state_dict().items()})
                    rs['valid'] = run_epoch(config, model_ema, validloader, criterion_ce, None,
                                            desc_default='valid(EMA)', epoch=epoch, writer=writers[1],
                                            verbose=is_master, tqdm_disabled=tqdm_disabled)
                    rs['test'] = run_epoch(config, model_ema, testloader_, criterion_ce, None,
                                           desc_default='*test(EMA)', epoch=epoch, writer=writers[2],
                                           verbose=is_master, tqdm_disabled=tqdm_disabled)

            logger.info(
                f'epoch={epoch} '
                f'[train] loss={rs["train"]["loss"]:.4f} top1={rs["train"]["top1"]:.4f} '
                f'[valid] loss={rs["valid"]["loss"]:.4f} top1={rs["valid"]["top1"]:.4f} '
                f'[test] loss={rs["test"]["loss"]:.4f} top1={rs["test"]["top1"]:.4f} '
            )

            if metric == 'last' or rs[metric]['top1'] > best_top1:
                if metric != 'last':
                    best_top1 = rs[metric]['top1']
                for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writers[1].add_scalar('valid_top1/best', rs['valid']['top1'], epoch)
                writers[2].add_scalar('test_top1/best', rs['test']['top1'], epoch)

                reporter(
                    loss_valid=rs['valid']['loss'], top1_valid=rs['valid']['top1'],
                    loss_test=rs['test']['loss'], top1_test=rs['test']['top1']
                )

                # save checkpoint
                if is_master and save_path:
                    logger.info('save model@%d to %s, err=%.4f' % (epoch, save_path, 1 - best_top1))
                    torch.save({
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'valid': rs['valid'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'model': model.state_dict(),
                        'ema': ema.state_dict() if ema is not None else None,
                    }, save_path)

    del model

    result['top1_test'] = best_top1
    return result
def train_and_eval(
    tag,
    dataroot,
    metric="last",
    save_path=None,
    only_eval=False,
    unsupervised=False,
    labeled_sample_num=4000,
):
    """Train a classifier on a partially-labeled dataset and evaluate it.

    Args:
        tag: experiment tag for the tensorboard log dir; empty disables it.
        dataroot: dataset root directory.
        metric: 'last' keeps final results; otherwise keep the epoch
            minimizing ``rs[metric]['loss']``.
        save_path: checkpoint path; resumed from when it exists and
            ``args.resume`` is set.
        only_eval: run a single test pass and return.
        unsupervised: enable the unsupervised loss in ``run_epoch``.
        labeled_sample_num: number of labeled samples for the loaders.

    Returns:
        OrderedDict with per-split 'loss_*', 'top1_*', 'top5_*' and 'epoch'.
    """
    max_epoch = C.get()["epoch"]
    trainloader, unsuploader, testloader = get_dataloaders(
        C.get()["dataset"], C.get()["batch"], C.get()["batch_unsup"], dataroot, labeled_sample_num
    )

    # create a model & an optimizer
    model = get_model(
        C.get()["model"], num_class(C.get()["dataset"]), data_parallel=True
    )
    criterion = nn.CrossEntropyLoss()
    if C.get()["optimizer"]["type"] == "sgd":
        optimizer = optim.SGD(
            model.parameters(),
            lr=C.get()["lr"],
            momentum=C.get()["optimizer"].get("momentum", 0.9),
            weight_decay=C.get()["optimizer"]["decay"],
            nesterov=C.get()["optimizer"]["nesterov"],
        )
    else:
        raise ValueError("invalid optimizer type=%s" % C.get()["optimizer"]["type"])

    lr_scheduler_type = C.get()["lr_schedule"].get("type", "cosine")
    if lr_scheduler_type == "cosine":
        # shorten the cosine horizon by the warmup epochs
        t_max = C.get()["epoch"]
        if C.get()["lr_schedule"].get("warmup", None):
            t_max -= C.get()["lr_schedule"]["warmup"]["epoch"]
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=t_max, eta_min=0.0
        )
    else:
        raise ValueError("invalid lr_schduler=%s" % lr_scheduler_type)
    if C.get()["lr_schedule"].get("warmup", None):
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()["lr_schedule"]["warmup"]["multiplier"],
            total_epoch=C.get()["lr_schedule"]["warmup"]["epoch"],
            after_scheduler=scheduler,
        )

    # pick a real or dummy SummaryWriter depending on whether a tag was given
    if not tag.strip():
        from metrics import SummaryWriterDummy as SummaryWriter
        logger.warning("tag not provided, no tensorboard log.")
    else:
        from tensorboardX import SummaryWriter
    writers = [
        SummaryWriter(logdir="./logs/%s/%s" % (tag, x)) for x in ["train", "test"]
    ]

    result = OrderedDict()
    epoch_start = 1
    if save_path and os.path.exists(save_path) and (args.resume == True):
        data = torch.load(save_path)
        model.load_state_dict(data["model"])
        optimizer.load_state_dict(data["optimizer"])
        epoch_start = data["epoch"]
        print("load sucessfully")

    if only_eval:
        logger.info("evaluation only+")
        model.eval()
        rs = dict()
        rs["test"] = run_epoch(
            model,
            testloader,
            unsuploader,
            criterion,
            None,
            desc_default="*test",
            epoch=epoch_start,
            writer=writers[1],
            method=args.method,
        )
        # NOTE(review): only 'test' is populated but the loop also reads
        # rs['train'] — looks like this branch would raise KeyError; confirm.
        for key, setname in itertools.product(
            ["loss", "top1", "top5"], ["train", "test"]
        ):
            result["%s_%s" % (key, setname)] = rs[setname][key]
        result["epoch"] = 0
        return result

    # train loop
    global best_valid_top1  # module-level running best, shared across calls
    best_valid_loss = 10e10
    for epoch in range(epoch_start, max_epoch + 1):
        model.train()
        rs = dict()
        rs["train"] = run_epoch(
            model,
            trainloader,
            unsuploader,
            criterion,
            optimizer,
            desc_default="train",
            epoch=epoch,
            writer=writers[0],
            verbose=True,
            unsupervised=unsupervised,
            scheduler=scheduler,
            method=args.method,
        )
        if math.isnan(rs["train"]["loss"]):
            raise Exception("train loss is NaN.")

        model.eval()
        # evaluate every 10 epochs on cifar-like datasets, every 30 otherwise, plus the final epoch
        if (
            epoch % (10 if "cifar" in C.get()["dataset"] else 30) == 0
            or epoch == max_epoch
        ):
            rs["test"] = run_epoch(
                model,
                testloader,
                unsuploader,
                criterion,
                None,
                desc_default="*test",
                epoch=epoch,
                writer=writers[1],
                verbose=True,
                method=args.method
            )
            if best_valid_top1 < rs["test"]["top1"]:
                best_valid_top1 = rs["test"]["top1"]

            if metric == "last" or rs[metric]["loss"] < best_valid_loss:  # TODO
                if metric != "last":
                    best_valid_loss = rs[metric]["loss"]
                for key, setname in itertools.product(
                    ["loss", "top1", "top5"], ["train", "test"]
                ):
                    result["%s_%s" % (key, setname)] = rs[setname][key]
                result["epoch"] = epoch
            writers[1].add_scalar("test_top1/best", rs["test"]["top1"], epoch)

            # save checkpoint
            if save_path:
                logger.info("save model@%d to %s" % (epoch, save_path))
                torch.save(
                    {
                        "epoch": epoch,
                        "log": {
                            "train": rs["train"].get_dict(),
                            "test": rs["test"].get_dict(),
                        },
                        "optimizer": optimizer.state_dict(),
                        "model": model.state_dict(),
                    },
                    save_path,
                )

    del model
    return result
def train_and_eval(tag, dataroot, test_ratio=0.0, cv_fold=0, reporter=None, metric='last', \
                   save_path=None, pretrained=None, only_eval=False):
    """Train a classifier with label smoothing and evaluate it every epoch.

    Args:
        tag: experiment tag for the tensorboard log dirs.
        dataroot: dataset root directory.
        test_ratio: validation split ratio forwarded to ``get_dataloaders``.
        cv_fold: cross-validation fold index.
        reporter: optional callback receiving valid/test metrics.
        metric: 'last' keeps final results; otherwise keep the epoch
            minimizing ``rs[metric]['loss']``.
        save_path: checkpoint path to resume from / save to.
        pretrained: path of pretrained weights used when no checkpoint exists.
        only_eval: run a single test pass and return.

    Returns:
        OrderedDict with per-split metrics and 'epoch'.
    """
    if not reporter:
        reporter = lambda **kwargs: 0

    max_epoch = C.get()['epoch']
    trainsampler, trainloader, validloader, testloader_ = get_dataloaders(C.get()['dataset'], C.get()['batch'], \
                                                                          dataroot, test_ratio, split_idx=cv_fold)

    # create a model & an optimizer
    model = get_model(C.get()['model'], num_class(C.get()['dataset']), data_parallel=True)
    # criterion = nn.CrossEntropyLoss()
    criterion = LabelSmoothSoftmaxCE()
    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=C.get()['lr'],
                              momentum=C.get()['optimizer'].get(
                                  'momentum', 0.9),
                              weight_decay=C.get()['optimizer']['decay'],
                              nesterov=C.get()['optimizer']['nesterov'])
    else:
        raise ValueError('invalid optimizer type=%s' % C.get()['optimizer']['type'])

    is_master = True  # single-process variant: always acts as master
    logger.debug('is_master=%s' % is_master)

    # set schedulers
    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        logger.debug('cosine learn decay.')
        # shorten the cosine horizon by the warmup epochs
        t_max = C.get()['epoch']
        if C.get()['lr_schedule'].get('warmup', None):
            t_max -= C.get()['lr_schedule']['warmup']['epoch']
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max, eta_min=0.)
    elif lr_scheduler_type == 'mixnet_l':
        scheduler = adjust_learning_rate_mixnet(optimizer)
    else:
        raise ValueError('invalid lr_schduler=%s' % lr_scheduler_type)
    if C.get()['lr_schedule'].get('warmup', None):
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler)

    from tensorboardX import SummaryWriter
    writers = [
        SummaryWriter(log_dir='./logs/%s/%s' % (tag, x))
        for x in ['train', 'valid', 'test']
    ]

    result = OrderedDict()
    epoch_start = 1

    ## load model for training or evaluation
    if save_path and os.path.exists(save_path):
        data = torch.load(save_path)
        if 'model' in data:
            # reconcile missing 'module.' prefixes for the DataParallel-wrapped model
            new_state_dict = {}
            for k, v in data['model'].items():
                if 'module.' not in k:
                    new_state_dict['module.' + k] = v
                else:
                    new_state_dict[k] = v
            model.load_state_dict(new_state_dict)
            optimizer.load_state_dict(data['optimizer'])
            logger.info('ckpt epoch@%d' % data['epoch'])
            if data['epoch'] < C.get()['epoch']:
                epoch_start = data['epoch']
            else:
                only_eval = True  # checkpoint already finished training
            logger.info('epoch=%d' % data['epoch'])
        else:
            model.load_state_dict(data)
        del data
    elif pretrained:
        assert os.path.exists(pretrained)
        ckt = torch.load(pretrained)
        model_dict = model.state_dict()
        if 'model' in ckt:
            new_state_dict = {}
            for k, v in ckt['model'].items():
                if 'module.' not in k:
                    new_state_dict['module.' + k] = v
                else:
                    new_state_dict[k] = v
            model_dict.update(new_state_dict)
            model.load_state_dict(model_dict)
        else:
            model_dict.update(ckt)
            model.load_state_dict(model_dict)

    ## Evaluate the model
    if only_eval:
        print('Eval model')
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test', epoch=0, writer=writers[2])
        for key, setname in itertools.product(['loss', 'top1', 'top5'], ['test']):
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    best_valid_loss = 10e10
    best_accuracy = 0.0
    for epoch in range(epoch_start, max_epoch + 1):
        # NOTE(review): scheduler.step() before the epoch predates the
        # PyTorch >=1.1 convention (step after optimizer.step()) — presumably
        # intentional here; confirm before changing the LR trajectory.
        scheduler.step()
        model.train()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, optimizer, desc_default='train', \
                                epoch=epoch, writer=writers[0], verbose=is_master, scheduler=scheduler, is_train=True)
        model.eval()

        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        # evaluate every epoch
        if (epoch % 1) == 0 or epoch == max_epoch:
            rs['valid'] = run_epoch(model, validloader, criterion, None, desc_default='valid', \
                                    epoch=epoch, writer=writers[1], verbose=is_master)
            rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test', \
                                   epoch=epoch, writer=writers[2], verbose=is_master)

            if metric == 'last' or rs[metric]['loss'] < best_valid_loss:
                if metric != 'last':
                    best_valid_loss = rs[metric]['loss']
                for key, setname in itertools.product(
                        ['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writers[1].add_scalar('valid_top1/best', rs['valid']['top1'], epoch)
                writers[2].add_scalar('test_top1/best', rs['test']['top1'], epoch)

                reporter(loss_valid=rs['valid']['loss'], top1_valid=rs['valid']['top1'],
                         loss_test=rs['test']['loss'], top1_test=rs['test']['top1'])

            # save checkpoint only when test accuracy improves
            if is_master and save_path:
                if rs['test']['top1'] > best_accuracy:
                    best_accuracy = rs['test']['top1']
                    logger.info('save model@%d to %s' % (epoch, save_path))
                    torch.save(
                        {
                            'epoch': epoch,
                            'log': {
                                'train': rs['train'].get_dict(),
                                'valid': rs['valid'].get_dict(),
                                'test': rs['test'].get_dict(),
                            },
                            'optimizer': optimizer.state_dict(),
                            'model': model.state_dict()
                        }, save_path)

    del model
    return result
def train_and_eval(tag, dataroot, metric='last', resume=False, save_path=None, only_eval=False, unsupervised=False, devices=None):
    """Train a classifier (optionally with noisy/unsupervised indices) and evaluate.

    Args:
        tag: experiment tag for tensorboard log dirs; empty disables it.
        dataroot: dataset root directory.
        metric: 'last' keeps final results; otherwise keep the epoch
            minimizing ``rs[metric]['loss']``.
        resume: resume from ``save_path`` when it exists.
        save_path: checkpoint path to resume from / save to.
        only_eval: run a single test pass and return.
        unsupervised: enable the unsupervised loss in ``run_epoch``.
        devices: device list forwarded to ``get_model``.

    Returns:
        OrderedDict with per-split 'loss_*', 'top1_*', 'top5_*' and 'epoch'.
    """
    max_epoch = C.get()['epoch']
    unsup_idx = C.get()['unsup_idx']
    if os.path.exists(unsup_idx):
        # a file of precomputed unsupervised sample indices exists — train with noise
        unsup_idx = np.load(unsup_idx).tolist()
        print('Unsup idx:', len(unsup_idx))
        trainloader, unsuploader, testloader = get_dataloaders(
            C.get()['dataset'], C.get()['batch'], C.get()['batch_unsup'],
            dataroot, with_noise=True, random_state=C.get()['random_state'],
            unsup_idx=unsup_idx)
    else:
        trainloader, unsuploader, testloader = get_dataloaders(
            C.get()['dataset'], C.get()['batch'], C.get()['batch_unsup'],
            dataroot, with_noise=False, random_state=C.get()['random_state'])

    # create a model & an optimizer
    model = get_model(C.get()['model'], num_class(C.get()['dataset']), data_parallel=True, devices=devices)
    criterion = nn.CrossEntropyLoss()
    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=C.get()['lr'],
                              momentum=C.get()['optimizer'].get(
                                  'momentum', 0.9),
                              weight_decay=C.get()['optimizer']['decay'],
                              nesterov=C.get()['optimizer']['nesterov'])
    else:
        raise ValueError('invalid optimizer type=%s' % C.get()['optimizer']['type'])

    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        #t_max = 600
        #print('Temp Fix for AnnealingCosine, Tmax=',t_max)
        # shorten the cosine horizon by the warmup epochs
        t_max = C.get()['epoch']
        if C.get()['lr_schedule'].get('warmup', None):
            t_max -= C.get()['lr_schedule']['warmup']['epoch']
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max, eta_min=0.)
    else:
        raise ValueError('invalid lr_schduler=%s' % lr_scheduler_type)
    if C.get()['lr_schedule'].get('warmup', None):
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler)

    # pick a real or dummy SummaryWriter depending on whether a tag was given
    if not tag.strip():
        from metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [
        SummaryWriter(logdir='./logs/%s/%s' % (tag, x))
        for x in ['train', 'test']
    ]

    result = OrderedDict()
    epoch_start = 1
    if (resume or only_eval) and save_path and os.path.exists(save_path):
        print('Resuming from last epoch:', save_path)
        ckpt = torch.load(save_path)
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        epoch_start = ckpt['epoch']
    elif os.path.exists(C.get()['pretrain']):
        print('Loading pretrain from:', C.get()['pretrain'])
        ckpt = torch.load(C.get()['pretrain'])
        try:
            model.load_state_dict(ckpt['state_dict'])
        except RuntimeError:
            # checkpoint saved without DataParallel — add the 'module.' prefix
            new_state_dict = {
                'module.' + k: v
                for k, v in ckpt['state_dict'].items()
            }
            model.load_state_dict(new_state_dict)
    else:
        print('DEBUG: No pretrain available')

    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['test'] = run_epoch(model, testloader, unsuploader, criterion, None,
                               desc_default='*test', epoch=epoch_start,
                               writer=writers[1])
        # NOTE(review): only 'test' is populated but the loop also reads
        # rs['train'] — looks like this branch would raise KeyError; confirm.
        for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'test']):
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    global best_valid_top1  # module-level running best, shared across calls
    best_valid_loss = 10e10
    for epoch in range(epoch_start, max_epoch + 1):
        model.train()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, unsuploader, criterion, optimizer,
                                desc_default='train', epoch=epoch, writer=writers[0],
                                verbose=False, unsupervised=unsupervised, scheduler=scheduler)
        print('Train At Epoch {}'.format(epoch), rs['train'])
        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        model.eval()
        # evaluate every 10 epochs on cifar-like datasets, every 30 otherwise, plus the final epoch
        if epoch % (10 if 'cifar' in C.get()['dataset'] else 30) == 0 or epoch == max_epoch:
            rs['test'] = run_epoch(model, testloader, unsuploader, criterion, None,
                                   desc_default='*test', epoch=epoch, writer=writers[1],
                                   verbose=False)
            print('Test At Epoch {}'.format(epoch), rs['test'])
            if best_valid_top1 < rs['test']['top1']:
                best_valid_top1 = rs['test']['top1']

            if metric == 'last' or rs[metric]['loss'] < best_valid_loss:  # TODO
                if metric != 'last':
                    best_valid_loss = rs[metric]['loss']
                for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch
            writers[1].add_scalar('test_top1/best', rs['test']['top1'], epoch)

            # save checkpoint
            if save_path:
                logger.info('save model@%d to %s' % (epoch, save_path))
                torch.save(
                    {
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'state_dict': model.state_dict()
                    }, save_path)

    del model
    return result
def eval_tta(config, augment):
    """Evaluate a candidate augmentation policy via test-time augmentation.

    One validation loader per policy is iterated in lockstep; per-sample loss
    is minimized and correctness maximized across policies, then averaged and
    reported to ray-tune.

    Args:
        config: configuration dict installed into the global ``C`` singleton.
        augment: search-space sample with 'cv_ratio_test', 'cv_fold',
            'save_path', 'dataroot', 'num_policy', 'num_op' and the policy
            parameters consumed by ``policy_decoder``.

    Returns:
        Mean top-1 correctness ('correct') over the validation split.
    """
    augment['num_policy'] = 1  # TODO remove
    C.get()
    C.get().conf = config
    cv_ratio_test, cv_fold, save_path = augment['cv_ratio_test'], augment[
        'cv_fold'], augment['save_path']
    print(augment)

    # setup - provided augmentation rules
    C.get().aug = policy_decoder(augment, augment['num_policy'], augment['num_op'])

    # eval: restore the pretrained checkpoint (either wrapped in a dict or raw state_dict)
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    # NOTE(review): model is never explicitly moved to CUDA here while the
    # inputs below are — presumably get_model already places it on GPU; confirm.
    model.eval()

    loaders = []
    for _ in range(augment['num_policy']):  # TODO
        _, tl, validloader, tl2 = get_dataloaders(C.get()['dataset'],
                                                  C.get()['batch'],
                                                  augment['dataroot'],
                                                  cv_ratio_test,
                                                  split_idx=cv_fold)
        loaders.append(iter(validloader))
        del tl, tl2

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    try:
        while True:
            losses = []
            corrects = []
            for loader in loaders:
                data, label = next(loader)  # StopIteration here ends the whole pass
                data = data.cuda()
                label = label.cuda()

                pred = model(data)
                loss = loss_fn(pred, label)
                losses.append(loss.detach().cpu().numpy())

                _, pred = pred.topk(1, 1, True, True)
                pred = pred.t()
                correct = pred.eq(label.view(
                    1, -1).expand_as(pred)).detach().cpu().numpy()
                corrects.append(correct)
                del loss, correct, pred, data, label

            # best-case aggregation across policies: min loss / max correctness per sample
            losses = np.concatenate(losses)
            losses_min = np.min(losses, axis=0).squeeze()
            corrects = np.concatenate(corrects)
            corrects_max = np.max(corrects, axis=0).squeeze()
            metrics.add_dict({
                'minus_loss': -1 * np.sum(losses_min),
                'correct': np.sum(corrects_max),
                'cnt': len(corrects_max)
            })
            del corrects, corrects_max
    except StopIteration:
        pass

    del model
    metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    # reporter(minus_loss=metrics['minus_loss'], top1_valid=metrics['correct'], elapsed_time=gpu_secs, done=True)
    # track.log(minus_loss=metrics['minus_loss'], top1_valid=metrics['correct'], elapsed_time=gpu_secs, done=True)
    tune.report(minus_loss=metrics['minus_loss'], top1_valid=metrics['correct'], elapsed_time=gpu_secs, done=True)
    return metrics['correct']