def simple_fit(model, loss_function, dataset, optimizer, epochs, lr=0.01,
               weight_decay=0, print_interval=1, **opt_kwargs):
    optimizer = get_optimizer(optimizer, model, lr=lr, weight_decay=weight_decay, **opt_kwargs)
    iterations = 1
    model.train()
    for epoch_index in range(1, 1 + epochs):
        for data_index, data in enumerate(dataset):
            optimizer.zero_grad()
            loss, monitors = loss_function(model, data)
            loss.backward()
            optimizer.step()
            if iterations % print_interval == 0:
                logger.info(
                    f'Epoch {epoch_index} Index {data_index} (Iteration {iterations}): '
                    f'loss = {loss.item():.4f}, monitors={monitors}.'
                )
            iterations += 1
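
# Illustrative usage sketch for simple_fit (added; not part of the original code).
# It follows the contract simple_fit expects: loss_function(model, data) returns a
# (loss, monitors) pair, and the optimizer argument is a spec accepted by
# get_optimizer ('sgd' here is an assumption). The toy model and dataset are
# hypothetical stand-ins.
def _example_simple_fit():
    import torch
    import torch.nn as nn

    toy_model = nn.Linear(4, 1)
    toy_dataset = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(10)]

    def mse_loss_function(model, batch):
        inputs, targets = batch
        loss = nn.functional.mse_loss(model(inputs), targets)
        return loss, {'mse': loss.item()}

    simple_fit(toy_model, mse_loss_function, toy_dataset, 'sgd', epochs=2, lr=0.1)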
def main(run_id):
    if args.dump_dir is not None:
        if args.runs > 1:
            args.current_dump_dir = os.path.join(args.dump_dir, 'run_{}'.format(run_id))
            io.mkdir(args.current_dump_dir)
        else:
            args.current_dump_dir = args.dump_dir

        args.checkpoints_dir = os.path.join(args.current_dump_dir, 'checkpoints')
        io.mkdir(args.checkpoints_dir)
        args.summary_file = os.path.join(args.current_dump_dir, 'summary.json')

    logger.info(format_args(args))

    model = Model()
    optimizer = get_optimizer(args.optimizer, model, args.lr)
    if args.accum_grad > 1:
        optimizer = AccumGrad(optimizer, args.accum_grad)
    trainer = MyTrainer.from_args(model, optimizer, args)

    if args.load_checkpoint is not None:
        trainer.load_checkpoint(args.load_checkpoint)

    if args.test_only:
        trainer.current_epoch = 0
        return None, trainer.test()

    graduated = trainer.train()
    trainer.save_checkpoint('last')

    test_meters = trainer.test() if graduated or args.test_not_graduated else None
    return graduated, test_meters
def main(run_id):
    if args.dump_dir is not None:
        if args.runs > 1:
            args.current_dump_dir = os.path.join(args.dump_dir, 'run_{}'.format(run_id))
            io.mkdir(args.current_dump_dir)
        else:
            args.current_dump_dir = args.dump_dir

        args.summary_file = os.path.join(args.current_dump_dir, 'summary.json')
        args.checkpoints_dir = os.path.join(args.current_dump_dir, 'checkpoints')
        io.mkdir(args.checkpoints_dir)

    logger.info(format_args(args))

    model = Model()
    if args.use_gpu:
        model.cuda()
    optimizer = get_optimizer(args.optimizer, model, args.lr)
    if args.accum_grad > 1:
        optimizer = AccumGrad(optimizer, args.accum_grad)
    trainer = MyTrainer.from_args(model, optimizer, args)

    if args.load_checkpoint is not None:
        trainer.load_checkpoint(args.load_checkpoint)

    if args.test_only:
        return None, trainer.test()

    final_meters = trainer.train()
    trainer.save_checkpoint('last')
    return trainer.early_stopped, trainer.test()
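
# Sketch of a gradient-accumulation wrapper in the spirit of the AccumGrad used
# above (added for illustration; the actual AccumGrad implementation may differ).
# It delegates to the wrapped optimizer but only applies a parameter update every
# `nr_acc` calls to step(), averaging the accumulated gradients first.
class AccumGradSketch:
    def __init__(self, base_optimizer, nr_acc):
        self._base = base_optimizer
        self._nr_acc = nr_acc
        self._counter = 0

    def zero_grad(self):
        # Clear gradients only right after a real update, so that successive
        # backward() calls keep accumulating into .grad across iterations.
        if self._counter == 0:
            self._base.zero_grad()

    def step(self):
        self._counter += 1
        if self._counter < self._nr_acc:
            return  # keep accumulating; no parameter update yet
        # Average the accumulated gradients, then apply one optimizer step.
        for group in self._base.param_groups:
            for p in group['params']:
                if p.grad is not None:
                    p.grad.div_(self._nr_acc)
        self._base.step()
        self._counter = 0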
def __init__(self, model, optimizer, lr=0.01, weight_decay=0, **opt_kwargs):
    optimizer = get_optimizer(optimizer, model, lr=lr, weight_decay=weight_decay, **opt_kwargs)
    self._model = model
    self._optimizer = optimizer
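
# Sketch of how an optimizer-spec resolver like get_optimizer could behave (added
# for illustration; the actual get_optimizer used throughout this code may differ).
# It accepts either an optimizer class or a name such as 'adam'/'sgd' (the name
# mapping below is an assumption) and builds the optimizer over the model's
# trainable parameters.
import torch.optim as optim


def get_optimizer_sketch(spec, model, lr, weight_decay=0, **opt_kwargs):
    if isinstance(spec, str):
        name = {'adam': 'Adam', 'adamw': 'AdamW', 'sgd': 'SGD', 'rmsprop': 'RMSprop'}.get(spec.lower(), spec)
        spec = getattr(optim, name)
    params = [p for p in model.parameters() if p.requires_grad]
    return spec(params, lr=lr, weight_decay=weight_decay, **opt_kwargs)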
def main():
    if args.dump_dir is not None:
        args.current_dump_dir = args.dump_dir
        args.summary_file = os.path.join(args.current_dump_dir, 'summary.json')
        args.checkpoints_dir = os.path.join(args.current_dump_dir, 'checkpoints')
        io.mkdir(args.checkpoints_dir)

        # Append the launch command to exp.sh so the run can be reproduced.
        exp_fh = open(os.path.join(args.current_dump_dir, 'exp.sh'), 'a')
        print('jac-run {}'.format(' '.join(sys.argv)), file=exp_fh)
        exp_fh.close()

    logger.info('jac-run {}'.format(' '.join(sys.argv)))
    logger.info(format_args(args))
    print(args.solution_count)

    model = models.get_model(args)
    if args.use_gpu:
        model.cuda()
    optimizer = get_optimizer(args.optimizer, model, args.lr, weight_decay=args.wt_decay)
    trainer = MyTrainer.from_args(model, optimizer, args)

    # Book-keeping state used by the trainer during the warmup phase.
    trainer.num_iters = 0
    trainer.num_bad_updates = 0
    trainer.test_batch_size = args.test_batch_size
    trainer.mode = 'warmup'
    trainer.checkpoint_mode = 'warmup'
    trainer._latent_model = None
    trainer._static_model = None

    skip_warmup = False
    if args.load_checkpoint is not None:
        extra = trainer.load_checkpoint(args.load_checkpoint)
        # skip_warmup = extra is not None and (extra['name'] == 'last_warmup')
        skip_warmup = args.skip_warmup

    my_lr_scheduler = scheduler.CustomReduceLROnPlateau(
        trainer._optimizer, {
            'mode': 'min',
            'factor': 0.2,
            'patience': math.ceil(7 / args.test_interval),
            'verbose': True,
            'threshold': 0.0001,
            'threshold_mode': 'rel',
            'cooldown': 0,
            'min_lr': 0.01 * args.lr,
            'eps': 0.0000001
        },
        maxPatienceToStopTraining=math.ceil(20 / args.test_interval))
    trainer.my_lr_scheduler = my_lr_scheduler

    if args.test_only:
        # trainer.load_latent_samples(os.path.join(
        #     args.current_dump_dir, "latent_z_samples.pkl"))
        trainer.pred_dump = []
        trainer.reset_test()
        rv = trainer.test()
        # with open(os.path.join(args.current_dump_dir, "pred_dump.pkl"), "wb") as f:
        #     pickle.dump(trainer.pred_dump, f)
        trainer.dump_errors(force=True)
        with open(os.path.join(args.current_dump_dir, 'results.out'), 'w') as f:
            print(rv[0].avg['corrected accuracy'], file=f)
        test_at_end(trainer)
        return None, rv

    if not skip_warmup:
        warmup_meters, warmup_test_meters = trainer.train(1, args.warmup_epochs)
        trainer.save_checkpoint('last_warmup')
    else:
        logger.info('Skipping warmup')

    if args.epochs > 0:
        # Define the latent model, clone the main model and set up the
        # optimizers for the "hot" training phase.
        if skip_warmup:
            trainer._prepare_dataset(args.epoch_size, 'train')
        # trainer.checkpoint_mode = "hot"
        trainer.best_accuracy = -1
        args.min_loss = 0

        trainer._latent_model = models.get_latent_model(args, trainer.model)
        trainer._latent_model.train()
        if not args.no_static:
            trainer._static_model = copy.deepcopy(trainer._model)
        trainer._latent_optimizer = get_optimizer(
            args.optimizer, trainer._latent_model, args.lr_latent,
            weight_decay=args.latent_wt_decay)
        trainer.mode = 'hot'

        # Switch off training mode only after pretraining phi,
        # since pretraining phi requires training statistics.
        if not args.no_static:
            trainer._static_model.eval()
            # trainer._static_model.training = True

        # if skip_warmup:
        #     extra = trainer.load_checkpoint(args.load_checkpoint)

        trainer.datasets['train'].reset_sampler(args.hot_data_sampling)
        # trainer.datasets["train"].data_sampling = args.hot_data_sampling
        if not args.no_static:
            trainer._static_model.train()

        if args.pretrain_phi > 0:
            # Pretrain the latent model (phi) alone: freeze the main model by
            # zeroing its learning rate, and effectively disable early stopping.
            my_lr_scheduler.maxPatienceToStopTraining = 10000
            for x in trainer._optimizer.param_groups:
                x['lr'] = 0.0
            _ = trainer.train(args.warmup_epochs + 1, args.pretrain_phi)

        trainer.best_accuracy = -1
        trainer._optimizer = get_optimizer(
            args.optimizer, trainer.model, args.lr_hot, weight_decay=args.wt_decay)
        my_lr_scheduler = scheduler.CustomReduceLROnPlateau(
            trainer._optimizer, {
                'mode': 'min',
                'factor': 0.2,
                'patience': math.ceil(7 / args.test_interval),
                'verbose': True,
                'threshold': 0.01,
                'threshold_mode': 'rel',
                'cooldown': 0,
                'min_lr': 0.01 * args.lr_hot,
                'eps': 0.0000001
            },
            maxPatienceToStopTraining=math.ceil(25 / args.test_interval))
        trainer.my_lr_scheduler = my_lr_scheduler

        final_meters = trainer.train(args.warmup_epochs + args.pretrain_phi + 1, args.epochs)
        trainer.save_checkpoint('last')

    # Evaluate the best checkpoint on the test set and dump the results.
    trainer.load_checkpoint(os.path.join(args.checkpoints_dir, 'checkpoint_best.pth'))
    logger.info('Best Dev Accuracy: {}'.format(trainer.best_accuracy))
    trainer.reset_test()
    ret = trainer.test()
    trainer.dump_errors(force=True)
    with open(os.path.join(args.current_dump_dir, 'results.out'), 'w') as f:
        print(trainer.best_accuracy, ret[0].avg['corrected accuracy'], file=f)
    test_at_end(trainer)
    return ret
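
# Sketch of a ReduceLROnPlateau variant with an early-stopping budget, in the
# spirit of the scheduler.CustomReduceLROnPlateau configured above (added for
# illustration; the actual class and its interface may differ). It unpacks the
# kwargs dict into torch's ReduceLROnPlateau and additionally tracks how many
# consecutive checks have passed without improvement, assuming mode='min'.
import torch.optim.lr_scheduler as lr_sched


class CustomReduceLROnPlateauSketch(lr_sched.ReduceLROnPlateau):
    def __init__(self, optimizer, kwargs, maxPatienceToStopTraining):
        super().__init__(optimizer, **kwargs)
        self.maxPatienceToStopTraining = maxPatienceToStopTraining
        self._num_bad_checks = 0
        self._best_metric = None

    def step(self, metrics, epoch=None):
        # Maintain our own "no improvement" counter (lower is better, as in
        # mode='min'), then delegate LR decay decisions to the parent class.
        if self._best_metric is None or metrics < self._best_metric:
            self._best_metric = metrics
            self._num_bad_checks = 0
        else:
            self._num_bad_checks += 1
        super().step(metrics, epoch=epoch)

    def should_stop_training(self):
        return self._num_bad_checks >= self.maxPatienceToStopTraining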