def generate(generations, population, nn_param_choices, dataset):
    """Generate a network with the genetic algorithm.

    Args:
        generations (int): Number of times to evolve the population
        population (int): Number of networks in each generation
        nn_param_choices (dict): Parameter choices for networks
        dataset (str): Dataset to use for training/evaluating
    """
    optimizer = Optimizer(nn_param_choices)
    networks = optimizer.create_population(population)

    # Evolve the generation.
    for i in range(generations):
        logging.info("***Doing generation %d of %d***" % (i + 1, generations))

        # Train and get accuracy for networks.
        train_networks(networks, dataset)

        # Get the average accuracy for this generation.
        average_accuracy = get_average_accuracy(networks)

        # Print out the average accuracy each generation.
        logging.info("Generation average: %.2f%%" % (average_accuracy * 100))
        logging.info('-' * 80)

        # Evolve, except on the last iteration.
        if i != generations - 1:
            # Do the evolution.
            networks = optimizer.evolve(networks)

    # Sort our final population.
    networks = sorted(networks, key=lambda x: x.accuracy, reverse=True)

    # Print out the top 5 networks.
    print_networks(networks[:5])
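# A minimal usage sketch (not from the original project): the search space and
# dataset name below are illustrative assumptions for how generate() might be
# called with a dict of per-hyperparameter choices.
nn_param_choices = {
    'nb_neurons': [64, 128, 256],   # assumed hyperparameter options
    'nb_layers': [1, 2, 3],
    'activation': ['relu', 'tanh'],
    'optimizer': ['adam', 'sgd'],
}
generate(generations=10, population=20,
         nn_param_choices=nn_param_choices, dataset='mnist')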
def train(self, model, data, num_epochs=5,
          resume=False, dev_data=None,
          optimizer=None, teacher_forcing_ratio=0):
    """ Run training for a given model.

    Args:
        model (seq2seq.models): model to run training on, if `resume=True`, it would be
            overwritten by the model loaded from the latest checkpoint.
        data (seq2seq.dataset.dataset.Dataset): dataset object to train on
        num_epochs (int, optional): number of epochs to run (default 5)
        resume (bool, optional): resume training with the latest checkpoint (default False)
        dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset (default None)
        optimizer (seq2seq.optim.Optimizer, optional): optimizer for training
            (default: Optimizer(pytorch.optim.Adam, max_grad_norm=5))
        teacher_forcing_ratio (float, optional): teacher forcing ratio (default 0)
    Returns:
        model (seq2seq.models): trained model.
    """
    # If training is set to resume
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.expt_dir)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        model = resume_checkpoint.model
        self.optimizer = resume_checkpoint.optimizer

        # A workaround to set optimizing parameters properly
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        defaults.pop('initial_lr', None)
        self.optimizer.optimizer = resume_optim.__class__(model.parameters(), **defaults)

        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
    else:
        start_epoch = 1
        step = 0
        if optimizer is None:
            optimizer = Optimizer(optim.Adam(model.parameters()), max_grad_norm=5)
        self.optimizer = optimizer

    self.logger.info("Optimizer: %s, Scheduler: %s" %
                     (self.optimizer.optimizer, self.optimizer.scheduler))

    self._train_epoches(data, model, num_epochs, start_epoch, step,
                        dev_data=dev_data,
                        teacher_forcing_ratio=teacher_forcing_ratio)
    return model
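# Illustrative call of the train() method above with an explicitly constructed
# Optimizer, mirroring the documented default; `trainer`, `seq2seq_model`,
# `train_set` and `dev_set` are assumed to exist in the surrounding script.
optimizer = Optimizer(optim.Adam(seq2seq_model.parameters()), max_grad_norm=5)
seq2seq_model = trainer.train(seq2seq_model, train_set,
                              num_epochs=10,
                              dev_data=dev_set,
                              optimizer=optimizer,
                              teacher_forcing_ratio=0.5)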
def __init__(self, db):
    self.db = db
    self.cfg = db.cfg
    self.net = PuzzleModel(db)
    if self.cfg.cuda:
        if self.cfg.parallel and torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            self.net = nn.DataParallel(self.net)
        self.net = self.net.cuda()
    if self.cfg.cuda and self.cfg.parallel:
        net = self.net.module
    else:
        net = self.net

    image_encoder_trainable_paras = \
        filter(lambda p: p.requires_grad, net.image_encoder.parameters())
    raw_optimizer = optim.Adam([
        {'params': image_encoder_trainable_paras},
        {'params': net.text_encoder.embedding.parameters(), 'lr': self.cfg.finetune_lr},
        {'params': net.text_encoder.rnn.parameters()},
        {'params': net.what_decoder.parameters()},
        {'params': net.where_decoder.parameters()},
        {'params': net.shape_encoder.parameters()},
    ], lr=self.cfg.lr)
    optimizer = Optimizer(raw_optimizer, max_grad_norm=self.cfg.grad_norm_clipping)
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer.optimizer, factor=0.8, patience=3)
    # scheduler = optim.lr_scheduler.StepLR(optimizer.optimizer, step_size=3, gamma=0.8)
    # optimizer.set_scheduler(scheduler)
    self.optimizer = optimizer
    self.epoch = 0

    if self.cfg.pretrained is not None:
        self.load_pretrained_net(self.cfg.pretrained)
def get_model(model_file_path=None, eval=False):
    model = Summarizer()
    optimizer = Optimizer(config.optim,
                          config.lr_coverage if config.cov else config.lr,
                          acc=config.adagrad_init_acc,
                          max_grad_norm=config.max_grad_norm)
    optimizer.set_parameters(model.parameters())
    step, loss = 1, 0

    if model_file_path is not None:
        checkpoint = torch.load(model_file_path)
        step = checkpoint['step']
        loss = checkpoint['loss']

        model_state_dict = dict([(k, v) for k, v in checkpoint['model'].items()])
        model.load_state_dict(model_state_dict, strict=False)

        if not config.cov and not eval:
            optimizer.optim.load_state_dict(checkpoint['optimizer'])
            if config.cuda:
                # Move the restored optimizer state tensors to the GPU.
                for state in optimizer.optim.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()

    if config.cuda:
        model = model.cuda()
        optimizer.set_parameters(model.parameters())

    return model, optimizer, step, loss
def __init__(self, config):
    self.cfg = config
    self.net = RegionGroundingModel(config)
    if self.cfg.cuda:
        self.net = self.net.cuda()
    params = filter(lambda p: p.requires_grad, self.net.parameters())
    raw_optimizer = optim.Adam(params, lr=self.cfg.lr)
    optimizer = Optimizer(raw_optimizer, max_grad_norm=self.cfg.grad_norm_clipping)
    if self.cfg.coco_mode >= 0:
        scheduler = optim.lr_scheduler.StepLR(optimizer.optimizer, step_size=75, gamma=0.1)
        optimizer.set_scheduler(scheduler)
    self.optimizer = optimizer
    self.epoch = 0

    if self.cfg.pretrained is not None:
        self.load_pretrained_net(self.cfg.pretrained)

    print('-------------------')
    print('All parameters')
    for name, param in self.net.named_parameters():
        print(name, param.size())
    print('-------------------')
    print('Trainable parameters')
    for name, param in self.net.named_parameters():
        if param.requires_grad:
            print(name, param.size())
def main_worker(rank, args):
    args.rank = rank
    args = setup(args)
    loaders = Data(args).get_loader()
    model = Model(args)
    optimizer = Optimizer(args, model)
    if args.amp:
        model = optimizer.set_amp(model)
    model.parallelize()

    criterion = Loss(args, model=model, optimizer=optimizer)

    trainer = Trainer(args, model, criterion, optimizer, loaders)

    if args.stay:
        interact(local=locals())
        exit()

    if args.demo:
        trainer.evaluate(epoch=args.startEpoch, mode='demo')
        exit()

    for epoch in range(1, args.startEpoch):
        if args.do_validate:
            if epoch % args.validate_every == 0:
                trainer.fill_evaluation(epoch, 'val')
        if args.do_test:
            if epoch % args.test_every == 0:
                trainer.fill_evaluation(epoch, 'test')

    for epoch in range(args.startEpoch, args.endEpoch + 1):
        if args.do_train:
            trainer.train(epoch)
        if args.do_validate:
            if epoch % args.validate_every == 0:
                if trainer.epoch != epoch:
                    trainer.load(epoch)
                trainer.validate(epoch)
        if args.do_test:
            if epoch % args.test_every == 0:
                if trainer.epoch != epoch:
                    trainer.load(epoch)
                trainer.test(epoch)

    if args.rank == 0 or not args.launched:
        print('')

    trainer.imsaver.join_background()

    cleanup(args)
def __init__(self, dset, conf, save=False):
    # Set batches
    train, val, test = dset.build_batches("relevant")
    # Build model, optimizer, loss and scheduler
    model = self.build_model(conf["model"], dset.vocab, dset.char_vocab)
    opt = Optimizer(model.parameters(), conf["optim"])
    loss = BCE()
    lr_sch = LR_scheduler(opt.opt, conf["optim"])
    # To track early stopping
    self.best = {"val": {"f1": 0}, "test": {}}
    step, stop = 0, 0
    # For max epochs
    for ep in range(conf["train"]["max_epochs"]):
        print("\n\tEpoch %d" % ep)
        for batch in train:
            # set the model in training mode.
            model.train()
            # advance step
            step += 1
            # forward pass
            x, y, mask = self.fw_pass(model, batch)
            # measure error
            fw_loss = loss(x, y)
            # backward pass
            opt.train_op(fw_loss)
            # validation
            if step % conf["train"]["val_steps"] == 0:
                # Set the model in testing mode
                model.eval()
                # Eval on val set
                val_metrics = utils.bin_fw_eval(model, self.fw_pass, val, step)
                if val_metrics["f1"] > self.best["val"]["f1"]:
                    # reset early-stop counter
                    stop = 0
                    # Eval on test set
                    test_metrics = utils.bin_fw_eval(model, self.fw_pass, test, step)
                    self.best = {"val": val_metrics, "test": test_metrics}
                    if save:
                        model.save(step, conf, self.best, opt, lr_sch, "bin")
                else:
                    if stop == conf["train"]["patience"]:
                        return
                    stop += 1
        # maybe update lr
        lr_sch.step()
def train(sourceVocabClass, targetVocabClass):
    """Train the Equilid model from characters to language-tagged-token sample data."""
    # Ensure we have a directory to write to
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    max_len = 1000

    seq2seqModel, loss, srcField, tgtField = create_model(sourceVocabClass, targetVocabClass)

    logger.debug("char itos length:{} desc:{}".format(len(srcField.vocab.itos), srcField.vocab.itos))
    logger.debug("char stoi length:{} desc:{}".format(len(srcField.vocab.stoi), srcField.vocab.stoi))
    logger.debug("lang itos length:{} desc:{}".format(len(tgtField.vocab.itos), tgtField.vocab.itos))
    logger.debug("lang stoi length:{} desc:{}".format(len(tgtField.vocab.stoi), tgtField.vocab.stoi))

    # get a generator for files in the data directory
    train_dev_pairs = get_file_paths(FLAGS.data_dir, 'train:dev')

    if torch.cuda.is_available():
        loss.cuda()

    print("Training model")
    t = SupervisedTrainer(loss=loss,
                          batch_size=int(FLAGS.batch_size),
                          checkpoint_every=FLAGS.checkpoint_interval,
                          print_every=20,
                          expt_dir=FLAGS.expt_dir)

    adamOptimizer = torch.optim.Adam(seq2seqModel.parameters(), lr=float(FLAGS.learning_rate))
    optimizer = Optimizer(adamOptimizer, max_grad_norm=FLAGS.max_gradient_norm)

    for train_path, dev_path in train_dev_pairs:
        train_dataset = load_tabular_dataset(train_path, srcField, tgtField, max_len)
        dev_dataset = load_tabular_dataset(dev_path, srcField, tgtField, max_len)
        logger.debug("Using Dataset files train:{} dev:{}".format(train_path, dev_path))

        seq2seqModel = t.train(seq2seqModel,
                               train_dataset,
                               num_epochs=int(FLAGS.num_epochs),
                               dev_data=dev_dataset,
                               optimizer=optimizer,
                               teacher_forcing_ratio=1,
                               resume=FLAGS.resume)

    print("training completed!")
def __init__(self, config):
    self.cfg = config
    self.net = SynthesisModel(config)
    if self.cfg.cuda:
        if self.cfg.parallel and torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            self.net = nn.DataParallel(self.net)
        self.net = self.net.cuda()
    if self.cfg.cuda and self.cfg.parallel:
        net = self.net.module
    else:
        net = self.net

    raw_optimizer = optim.Adam([
        {'params': net.encoder.parameters()},
        {'params': net.decoder.parameters()}
    ], lr=self.cfg.lr)
    optimizer = Optimizer(raw_optimizer)
    self.optimizer = optimizer
    self.epoch = 0

    if self.cfg.pretrained is not None:
        self.load_pretrained_net(self.cfg.pretrained)
    )
else:
    args.lr = 0.00035
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=args.lr,
                                 weight_decay=5e-4)
    lr_scheduler = WarmupMultiStepLR(
        optimizer,
        milestones=[200, 400],
        gamma=0.1,
        warmup_epochs=100,
    )
optimizer = Optimizer(optimizer=optimizer,
                      lr_scheduler=lr_scheduler,
                      max_epochs=800)

args.results_dir = os.path.join(
    args.results_dir,
    dataset,
    "{}_pooling_{}_loss_{}".format(args.optim, args.pooling_type, args.loss_type),
)
if args.non_local:
    args.results_dir = args.results_dir + "_nonlocal"

# run
solver = Engine(
    results_dir=args.results_dir,
def train(self, train_db, val_db, test_db):
    ##################################################################
    ## Optimizer
    ##################################################################
    image_encoder_trainable_paras = \
        filter(lambda p: p.requires_grad, self.net.image_encoder.parameters())
    raw_optimizer = optim.Adam([
        {'params': self.net.text_encoder.embedding.parameters(), 'lr': self.cfg.finetune_lr},
        {'params': image_encoder_trainable_paras, 'lr': self.cfg.finetune_lr},
        {'params': self.net.text_encoder.rnn.parameters()},
        {'params': self.net.what_decoder.parameters()},
        {'params': self.net.where_decoder.parameters()}
    ], lr=self.cfg.lr)
    optimizer = Optimizer(raw_optimizer, max_grad_norm=self.cfg.grad_norm_clipping)
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer.optimizer, factor=0.8, patience=3)
    scheduler = optim.lr_scheduler.StepLR(optimizer.optimizer, step_size=3, gamma=0.8)
    optimizer.set_scheduler(scheduler)

    ##################################################################
    ## LOG
    ##################################################################
    logz.configure_output_dir(self.cfg.model_dir)
    logz.save_config(self.cfg)

    ##################################################################
    ## Main loop
    ##################################################################
    start = time()

    for epoch in range(self.cfg.n_epochs):
        ##################################################################
        ## Training
        ##################################################################
        torch.cuda.empty_cache()
        train_pred_loss, train_attn_loss, train_eos_loss, train_accu = \
            self.train_epoch(train_db, optimizer, epoch)

        ##################################################################
        ## Validation
        ##################################################################
        torch.cuda.empty_cache()
        val_loss, val_accu, val_infos = self.validate_epoch(val_db)

        ##################################################################
        ## Sample
        ##################################################################
        torch.cuda.empty_cache()
        self.sample(epoch, test_db, self.cfg.n_samples)
        torch.cuda.empty_cache()

        ##################################################################
        ## Logging
        ##################################################################
        # update optim scheduler
        optimizer.update(np.mean(val_loss), epoch)

        logz.log_tabular("Time", time() - start)
        logz.log_tabular("Iteration", epoch)
        logz.log_tabular("TrainAverageError", np.mean(train_pred_loss))
        logz.log_tabular("TrainStdError", np.std(train_pred_loss))
        logz.log_tabular("TrainMaxError", np.max(train_pred_loss))
        logz.log_tabular("TrainMinError", np.min(train_pred_loss))
        logz.log_tabular("TrainAverageAccu", np.mean(train_accu))
        logz.log_tabular("TrainStdAccu", np.std(train_accu))
        logz.log_tabular("TrainMaxAccu", np.max(train_accu))
        logz.log_tabular("TrainMinAccu", np.min(train_accu))
        logz.log_tabular("ValAverageError", np.mean(val_loss))
        logz.log_tabular("ValStdError", np.std(val_loss))
        logz.log_tabular("ValMaxError", np.max(val_loss))
        logz.log_tabular("ValMinError", np.min(val_loss))
        logz.log_tabular("ValAverageAccu", np.mean(val_accu))
        logz.log_tabular("ValStdAccu", np.std(val_accu))
        logz.log_tabular("ValMaxAccu", np.max(val_accu))
        logz.log_tabular("ValMinAccu", np.min(val_accu))
        logz.log_tabular("ValAverageObjAccu", np.mean(val_accu[:, 0]))
        logz.log_tabular("ValStdObjAccu", np.std(val_accu[:, 0]))
        logz.log_tabular("ValMaxObjAccu", np.max(val_accu[:, 0]))
        logz.log_tabular("ValMinObjAccu", np.min(val_accu[:, 0]))
        logz.log_tabular("ValAveragePoseAccu", np.mean(val_accu[:, 1]))
        logz.log_tabular("ValStdPoseAccu", np.std(val_accu[:, 1]))
        logz.log_tabular("ValMaxPoseAccu", np.max(val_accu[:, 1]))
        logz.log_tabular("ValMinPoseAccu", np.min(val_accu[:, 1]))
        logz.log_tabular("ValAverageExprAccu", np.mean(val_accu[:, 2]))
        logz.log_tabular("ValStdExprAccu", np.std(val_accu[:, 2]))
        logz.log_tabular("ValMaxExprAccu", np.max(val_accu[:, 2]))
        logz.log_tabular("ValMinExprAccu", np.min(val_accu[:, 2]))
        logz.log_tabular("ValAverageCoordAccu", np.mean(val_accu[:, 3]))
        logz.log_tabular("ValStdCoordAccu", np.std(val_accu[:, 3]))
        logz.log_tabular("ValMaxCoordAccu", np.max(val_accu[:, 3]))
        logz.log_tabular("ValMinCoordAccu", np.min(val_accu[:, 3]))
        logz.log_tabular("ValAverageScaleAccu", np.mean(val_accu[:, 4]))
        logz.log_tabular("ValStdScaleAccu", np.std(val_accu[:, 4]))
        logz.log_tabular("ValMaxScaleAccu", np.max(val_accu[:, 4]))
        logz.log_tabular("ValMinScaleAccu", np.min(val_accu[:, 4]))
        logz.log_tabular("ValAverageFlipAccu", np.mean(val_accu[:, 5]))
        logz.log_tabular("ValStdFlipAccu", np.std(val_accu[:, 5]))
        logz.log_tabular("ValMaxFlipAccu", np.max(val_accu[:, 5]))
        logz.log_tabular("ValMinFlipAccu", np.min(val_accu[:, 5]))
        logz.log_tabular("ValUnigramF3", np.mean(val_infos.unigram_F3()))
        logz.log_tabular("ValBigramF3", np.mean(val_infos.bigram_F3()))
        logz.log_tabular("ValUnigramP", np.mean(val_infos.unigram_P()))
        logz.log_tabular("ValUnigramR", np.mean(val_infos.unigram_R()))
        logz.log_tabular("ValBigramP", val_infos.mean_bigram_P())
        logz.log_tabular("ValBigramR", val_infos.mean_bigram_R())
        logz.log_tabular("ValUnigramPose", np.mean(val_infos.pose()))
        logz.log_tabular("ValUnigramExpr", np.mean(val_infos.expr()))
        logz.log_tabular("ValUnigramScale", np.mean(val_infos.scale()))
        logz.log_tabular("ValUnigramFlip", np.mean(val_infos.flip()))
        logz.log_tabular("ValUnigramSim", np.mean(val_infos.unigram_coord()))
        logz.log_tabular("ValBigramSim", val_infos.mean_bigram_coord())
        logz.dump_tabular()

        ##################################################################
        ## Checkpoint
        ##################################################################
        log_info = [np.mean(val_loss), np.mean(val_accu)]
        self.save_checkpoint(epoch, log_info)
        torch.cuda.empty_cache()
def train(args):
    """Run model training."""
    print("Start Training ...")

    # Get nested namespaces.
    model_args = args.model_args
    logger_args = args.logger_args
    optim_args = args.optim_args
    data_args = args.data_args
    transform_args = args.transform_args

    # Get logger.
    print('Getting logger... log to path: {}'.format(logger_args.log_path))
    logger = Logger(logger_args.log_path, logger_args.save_dir)

    # For conaug, point to the MOCO pretrained weights.
    if model_args.ckpt_path and model_args.ckpt_path != 'None':
        print("pretrained checkpoint specified : {}".format(model_args.ckpt_path))
        # CL-specified args are used to load the model, rather than the
        # ones saved to args.json.
        model_args.pretrained = False
        ckpt_path = model_args.ckpt_path
        model, ckpt_info = ModelSaver.load_model(ckpt_path=ckpt_path,
                                                 gpu_ids=args.gpu_ids,
                                                 model_args=model_args,
                                                 is_training=True)
        if not model_args.moco:
            optim_args.start_epoch = ckpt_info['epoch'] + 1
        else:
            optim_args.start_epoch = 1
    else:
        print('Starting without pretrained training checkpoint, random initialization.')
        # If no ckpt_path is provided, instantiate a new randomly
        # initialized model.
        model_fn = models.__dict__[model_args.model]
        if data_args.custom_tasks is not None:
            tasks = NamedTasks[data_args.custom_tasks]
        else:
            tasks = model_args.__dict__[TASKS]  # TASKS = "tasks"
        print("Tasks: {}".format(tasks))
        model = model_fn(tasks, model_args)
        model = nn.DataParallel(model, args.gpu_ids)

    # Put model on gpu or cpu and put into training mode.
    model = model.to(args.device)
    model.train()
    print("========= MODEL ==========")
    print(model)

    # Get train and valid loader objects.
    train_loader = get_loader(phase="train",
                              data_args=data_args,
                              transform_args=transform_args,
                              is_training=True,
                              return_info_dict=False,
                              logger=logger)
    valid_loader = get_loader(phase="valid",
                              data_args=data_args,
                              transform_args=transform_args,
                              is_training=False,
                              return_info_dict=False,
                              logger=logger)

    # Instantiate the predictor class for obtaining model predictions.
    predictor = Predictor(model, args.device)
    # Instantiate the evaluator class for evaluating models.
    evaluator = Evaluator(logger)

    # Get the set of tasks which will be used for saving models
    # and annealing learning rate.
    eval_tasks = EVAL_METRIC2TASKS[optim_args.metric_name]

    # Instantiate the saver class for saving model checkpoints.
    saver = ModelSaver(save_dir=logger_args.save_dir,
                       iters_per_save=logger_args.iters_per_save,
                       max_ckpts=logger_args.max_ckpts,
                       metric_name=optim_args.metric_name,
                       maximize_metric=optim_args.maximize_metric,
                       keep_topk=logger_args.keep_topk)

    # TODO: JBY: handle threshold for fine tuning
    if model_args.fine_tuning == 'full':
        # Fine tune all layers.
        pass
    else:
        # Freeze other layers.
        models.PretrainedModel.set_require_grad_for_fine_tuning(
            model, model_args.fine_tuning.split(','))

    # Instantiate the optimizer class for guiding model training.
    optimizer = Optimizer(parameters=model.parameters(),
                          optim_args=optim_args,
                          batch_size=data_args.batch_size,
                          iters_per_print=logger_args.iters_per_print,
                          iters_per_visual=logger_args.iters_per_visual,
                          iters_per_eval=logger_args.iters_per_eval,
                          dataset_len=len(train_loader.dataset),
                          logger=logger)

    if model_args.ckpt_path and not model_args.moco:
        # Load the same optimizer as used in the original training.
        optimizer.load_optimizer(ckpt_path=model_args.ckpt_path,
                                 gpu_ids=args.gpu_ids)

    model_uncertainty = model_args.model_uncertainty
    loss_fn = evaluator.get_loss_fn(loss_fn_name=optim_args.loss_fn,
                                    model_uncertainty=model_args.model_uncertainty,
                                    mask_uncertain=True,
                                    device=args.device)

    # Run training
    while not optimizer.is_finished_training():
        optimizer.start_epoch()

        # TODO: JBY, HACK WARNING  # What is the hack?
        metrics = None

        for inputs, targets in train_loader:
            optimizer.start_iter()
            if (optimizer.global_step and
                    optimizer.global_step % optimizer.iters_per_eval == 0 or
                    len(train_loader.dataset) - optimizer.iter < optimizer.batch_size):
                # Only evaluate every iters_per_eval examples.
                predictions, groundtruth = predictor.predict(valid_loader)
                # print("predictions: {}".format(predictions))
                metrics, curves = evaluator.evaluate_tasks(groundtruth, predictions)
                # Log metrics to stdout.
                logger.log_metrics(metrics)
                # Add logger for all the metrics for valid_loader
                logger.log_scalars(metrics, optimizer.global_step)

                # Get the metric used to save model checkpoints.
                average_metric = evaluator.evaluate_average_metric(
                    metrics, eval_tasks, optim_args.metric_name)

                if optimizer.global_step % logger_args.iters_per_save == 0:
                    # Only save every iters_per_save examples directly
                    # after evaluation.
                    print("Save global step: {}".format(optimizer.global_step))
                    saver.save(iteration=optimizer.global_step,
                               epoch=optimizer.epoch,
                               model=model,
                               optimizer=optimizer,
                               device=args.device,
                               metric_val=average_metric)

                # Step learning rate scheduler.
                optimizer.step_scheduler(average_metric)

            with torch.set_grad_enabled(True):
                logits, embedding = model(inputs.to(args.device))
                loss = loss_fn(logits, targets.to(args.device))
                optimizer.log_iter(inputs, logits, targets, loss)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            optimizer.end_iter()

        optimizer.end_epoch(metrics)

    logger.log('=== Training Complete ===')
def train(self, train_db, val_db, test_db):
    ##################################################################
    ## Optimizer
    ##################################################################
    if self.cfg.cuda and self.cfg.parallel:
        net = self.net.module
    else:
        net = self.net

    image_encoder_trainable_paras = \
        filter(lambda p: p.requires_grad, net.image_encoder.parameters())
    raw_optimizer = optim.Adam([
        {'params': image_encoder_trainable_paras},
        {'params': net.text_encoder.embedding.parameters(), 'lr': self.cfg.finetune_lr},
        {'params': net.text_encoder.rnn.parameters()},
        {'params': net.what_decoder.parameters()},
        {'params': net.where_decoder.parameters()},
        {'params': net.shape_encoder.parameters()},
    ], lr=self.cfg.lr)
    optimizer = Optimizer(raw_optimizer, max_grad_norm=self.cfg.grad_norm_clipping)
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer.optimizer, factor=0.8, patience=3)
    # scheduler = optim.lr_scheduler.StepLR(optimizer.optimizer, step_size=3, gamma=0.8)
    # optimizer.set_scheduler(scheduler)

    ##################################################################
    ## LOG
    ##################################################################
    logz.configure_output_dir(self.cfg.model_dir)
    logz.save_config(self.cfg)

    ##################################################################
    ## Main loop
    ##################################################################
    start = time()
    min_val_loss = 100000000

    for epoch in range(self.cfg.n_epochs):
        ##################################################################
        ## Training
        ##################################################################
        torch.cuda.empty_cache()
        train_loss = self.train_epoch(train_db, optimizer, epoch)

        ##################################################################
        ## Validation
        ##################################################################
        torch.cuda.empty_cache()
        val_loss = self.validate_epoch(val_db, epoch)

        ##################################################################
        ## Logging
        ##################################################################
        # update optim scheduler
        current_val_loss = np.mean(val_loss[:, 0])
        # optimizer.update(current_val_loss, epoch)
        logz.log_tabular("Time", time() - start)
        logz.log_tabular("Iteration", epoch)
        logz.log_tabular("AverageLoss", np.mean(train_loss[:, 0]))
        logz.log_tabular("AverageEmbedLoss", np.mean(train_loss[:, 1]))
        logz.log_tabular("AverageAttnLoss", np.mean(train_loss[:, 2]))
        logz.log_tabular("ValAverageLoss", np.mean(val_loss[:, 0]))
        logz.log_tabular("ValAverageEmbedLoss", np.mean(val_loss[:, 1]))
        logz.log_tabular("ValAverageAttnLoss", np.mean(val_loss[:, 2]))
        logz.dump_tabular()

        ##################################################################
        ## Checkpoint
        ##################################################################
        if min_val_loss > current_val_loss:
            min_val_loss = current_val_loss
            # log_info = [np.mean(val_loss), np.mean(val_accu)]
            # self.save_checkpoint(epoch, log_info)
            self.save_best_checkpoint()
        torch.cuda.empty_cache()
def train(self, train_db, val_db, test_db):
    ##################################################################
    ## Optimizer
    ##################################################################
    if self.cfg.cuda and self.cfg.parallel:
        net = self.net.module
    else:
        net = self.net

    image_encoder_trainable_paras = \
        filter(lambda p: p.requires_grad, net.image_encoder.parameters())
    # raw_optimizer = optim.Adam([
    #     {'params': net.text_encoder.parameters(), 'lr': self.cfg.finetune_lr},
    #     {'params': image_encoder_trainable_paras},
    #     {'params': net.what_decoder.parameters()},
    #     {'params': net.where_decoder.parameters()}
    # ], lr=self.cfg.lr)
    raw_optimizer = optim.Adam([
        {'params': image_encoder_trainable_paras, 'initial_lr': self.cfg.lr},
        {'params': net.what_decoder.parameters(), 'initial_lr': self.cfg.lr},
        {'params': net.where_decoder.parameters(), 'initial_lr': self.cfg.lr}
    ], lr=self.cfg.lr)
    self.optimizer = Optimizer(raw_optimizer, max_grad_norm=self.cfg.grad_norm_clipping)
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer.optimizer, factor=0.8, patience=3)
    scheduler = optim.lr_scheduler.StepLR(self.optimizer.optimizer,
                                          step_size=3,
                                          gamma=0.8,
                                          last_epoch=self.start_epoch - 1)
    self.optimizer.set_scheduler(scheduler)

    num_train_steps = int(len(train_db) / self.cfg.accumulation_steps * self.cfg.n_epochs)
    num_warmup_steps = int(num_train_steps * self.cfg.warmup)
    self.bert_optimizer = AdamW([
        {'params': net.text_encoder.parameters(), 'initial_lr': self.cfg.finetune_lr}
    ], lr=self.cfg.finetune_lr)
    self.bert_scheduler = get_linear_schedule_with_warmup(
        self.bert_optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps,
        last_epoch=self.start_epoch - 1)

    bucket_boundaries = [4, 8, 12, 16, 22]  # [4,8,12,16,22]
    print('preparing training bucket sampler')
    self.train_bucket_sampler = BucketSampler(train_db,
                                              bucket_boundaries,
                                              batch_size=self.cfg.batch_size)
    print('preparing validation bucket sampler')
    self.val_bucket_sampler = BucketSampler(val_db,
                                            bucket_boundaries,
                                            batch_size=4)

    ##################################################################
    ## LOG
    ##################################################################
    logz.configure_output_dir(self.cfg.model_dir)
    logz.save_config(self.cfg)

    ##################################################################
    ## Main loop
    ##################################################################
    start = time()

    for epoch in range(self.start_epoch, self.cfg.n_epochs):
        ##################################################################
        ## Training
        ##################################################################
        print('Training...')
        torch.cuda.empty_cache()
        train_pred_loss, train_attn_loss, train_eos_loss, train_accu, train_mse = \
            self.train_epoch(train_db, self.optimizer, epoch)

        ##################################################################
        ## Validation
        ##################################################################
        print('Validation...')
        val_loss, val_accu, val_mse, val_infos = self.validate_epoch(val_db)

        ##################################################################
        ## Sample
        ##################################################################
        if self.cfg.if_sample:
            print('Sample...')
            torch.cuda.empty_cache()
            self.sample(epoch, test_db, self.cfg.n_samples)
        torch.cuda.empty_cache()

        ##################################################################
        ## Logging
        ##################################################################
        # update optim scheduler
        print('Logging...')
        self.optimizer.update(np.mean(val_loss), epoch)

        logz.log_tabular("Time", time() - start)
        logz.log_tabular("Iteration", epoch)
        logz.log_tabular("TrainAverageError", np.mean(train_pred_loss))
        logz.log_tabular("TrainAverageAccu", np.mean(train_accu))
        logz.log_tabular("TrainAverageMse", np.mean(train_mse))
        logz.log_tabular("ValAverageError", np.mean(val_loss))
        logz.log_tabular("ValAverageAccu", np.mean(val_accu))
        logz.log_tabular("ValAverageObjAccu", np.mean(val_accu[:, 0]))
        logz.log_tabular("ValAverageCoordAccu", np.mean(val_accu[:, 1]))
        logz.log_tabular("ValAverageScaleAccu", np.mean(val_accu[:, 2]))
        logz.log_tabular("ValAverageRatioAccu", np.mean(val_accu[:, 3]))
        logz.log_tabular("ValAverageMse", np.mean(val_mse))
        logz.log_tabular("ValAverageXMse", np.mean(val_mse[:, 0]))
        logz.log_tabular("ValAverageYMse", np.mean(val_mse[:, 1]))
        logz.log_tabular("ValAverageWMse", np.mean(val_mse[:, 2]))
        logz.log_tabular("ValAverageHMse", np.mean(val_mse[:, 3]))
        logz.log_tabular("ValUnigramF3", np.mean(val_infos.unigram_F3()))
        logz.log_tabular("ValBigramF3", np.mean(val_infos.bigram_F3()))
        logz.log_tabular("ValUnigramP", np.mean(val_infos.unigram_P()))
        logz.log_tabular("ValUnigramR", np.mean(val_infos.unigram_R()))
        logz.log_tabular("ValBigramP", val_infos.mean_bigram_P())
        logz.log_tabular("ValBigramR", val_infos.mean_bigram_R())
        logz.log_tabular("ValUnigramScale", np.mean(val_infos.scale()))
        logz.log_tabular("ValUnigramRatio", np.mean(val_infos.ratio()))
        logz.log_tabular("ValUnigramSim", np.mean(val_infos.unigram_coord()))
        logz.log_tabular("ValBigramSim", val_infos.mean_bigram_coord())
        logz.dump_tabular()

        ##################################################################
        ## Checkpoint
        ##################################################################
        print('Saving checkpoint...')
        log_info = [np.mean(val_loss), np.mean(val_accu)]
        self.save_checkpoint(epoch, log_info)
        torch.cuda.empty_cache()
seq2seq_m = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
    seq2seq_m.cuda()

# initialize parameters with random uniform values
for param in seq2seq_m.parameters():
    param.data.uniform_(-0.08, 0.08)

t = SupervisedTrainer(loss=loss,
                      batch_size=batch_size,
                      checkpoint_every=50,
                      print_every=10,
                      expt_dir=expt_dir)

optimizer = Optimizer(
    torch.optim.Adam(seq2seq_m.parameters(), lr=0.001, betas=(0.9, 0.999)))
# scheduler = StepLR(optimizer.optimizer, 1)
# optimizer.set_scheduler(scheduler)
################################

seq2seq_m = t.train(seq2seq_m,
                    train,
                    num_epochs=num_epochs,
                    dev_data=dev,
                    optimizer=optimizer,
                    teacher_forcing_ratio=0.5,
                    resume=resume)

e = int(time.time() - start_time)
print('ELAPSED TIME TRAINING ~> {:02d}:{:02d}:{:02d}'.format(
    e // 3600, (e % 3600 // 60), e % 60))
                     dropout_p=0.2,
                     use_attention=True,
                     bidirectional=bidirectional,
                     eos_id=tgt.eos_id,
                     sos_id=tgt.sos_id)
seq2seq = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
    seq2seq.cuda()

for param in seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)

# Optimizer and learning rate scheduler can be customized by
# explicitly constructing the objects and passing them to the trainer.
#
optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
scheduler = StepLR(optimizer.optimizer, 1)
optimizer.set_scheduler(scheduler)

# train
t = SupervisedTrainer(loss=loss,
                      batch_size=32,
                      checkpoint_every=50,
                      print_every=10,
                      expt_dir=opt.expt_dir)

seq2seq = t.train(seq2seq,
                  train,
                  num_epochs=20000,
                  dev_data=dev,
                  optimizer=optimizer,
# y = y / y.max(axis=0)

# Add baseline feature in column 0
X = np.hstack((np.ones(X.shape[0]).reshape(-1, 1), X))

# %% [markdown]
# ## Experiment 1: Numerical Approximation

# %%
cost = LinearCostFunction(X, y)

step_size = 0.1
max_iter = 5000
tol = 1e-8
delta = 1e-5
optimizer = Optimizer(step_size, max_iter, tol, delta)

initial_params = np.zeros(X.shape[1])
optimized_params, iters = optimizer.optimize(cost, initial_params)
print(
    f'Found min at {optimized_params} starting at {initial_params} in {iters} iterations of optimization algorithm.'
)

y_pred = np.sum(X * optimized_params, axis=1)
# y_true = np.sum(X, axis=1)

plt.scatter(y_pred, y)
plt.plot([0, 25], [0, 25])
plt.show()

# %% [markdown]
# ## Experiment 2: Normal Solutions

# %%
def __init__(self, dset, conf, save=False):
    # Set batches
    train, val, test = dset.build_batches("tok_tags")
    # Build model
    model = self.build_model(conf["model"], dset.vocab, dset.char_vocab, dset.tag_vocab)
    opt = Optimizer(model.parameters(), conf["optim"])
    if conf["model"]["use_crf"]:
        loss = CustomLoss(model.crf)
    else:
        loss = WeightedCEL(dset.tag_vocab)
    lr_sch = LR_scheduler(opt.opt, conf["optim"])
    # To track early stopping
    self.best = {"val": {"f1": 0}, "test": {}}
    step, stop = 0, 0
    # Tags to ignore in metrics
    ign_tok = [
        dset.tag_vocab["<p>"], dset.tag_vocab["<s>"], dset.tag_vocab["</s>"]
    ]
    for ep in range(conf["train"]["max_epochs"]):
        print("\n\tEpoch %d" % ep)
        for batch in train:
            # set the model in training mode.
            model.train()
            # advance step
            step += 1
            # forward pass
            x, y, mask = self.fw_pass(model, batch)
            # measure error
            fw_loss = loss(x, y, mask)
            # backward pass
            opt.train_op(fw_loss)
            # validation
            if step % conf["train"]["val_steps"] == 0:
                # Set the model in testing mode
                model.eval()
                # Eval on val set
                val_metrics = utils.ner_fw_eval(model, self.fw_pass, val, step, ign_tok)
                if val_metrics["f1"] > self.best["val"]["f1"]:
                    # reset early-stop counter
                    stop = 0
                    # Eval on test set
                    test_metrics = utils.ner_fw_eval(model, self.fw_pass, test, step, ign_tok)
                    self.best = {"val": val_metrics, "test": test_metrics}
                    if save:
                        model.save(step, conf, self.best, opt, lr_sch, "ner")
                else:
                    if stop == conf["train"]["patience"]:
                        return
                    stop += 1
        # maybe update lr
        lr_sch.step()
def train(args):
    """Run model training."""
    # Get nested namespaces.
    model_args = args.model_args
    logger_args = args.logger_args
    optim_args = args.optim_args
    data_args = args.data_args

    # Get logger.
    logger = Logger(logger_args)

    if model_args.ckpt_path:
        # CL-specified args are used to load the model, rather than the
        # ones saved to args.json.
        model_args.pretrained = False
        ckpt_path = model_args.ckpt_path
        assert False
        model, ckpt_info = ModelSaver.load_model(ckpt_path=ckpt_path,
                                                 gpu_ids=args.gpu_ids,
                                                 model_args=model_args,
                                                 is_training=True)
        optim_args.start_epoch = ckpt_info['epoch'] + 1
    else:
        # If no ckpt_path is provided, instantiate a new randomly
        # initialized model.
        model_fn = models.__dict__[model_args.model]
        model = model_fn(model_args)
        model = nn.DataParallel(model, args.gpu_ids)

    # Put model on gpu or cpu and put into training mode.
    model = model.to(args.device)
    model.train()

    # Get train and valid loader objects.
    train_loader = get_loader(phase="train",
                              data_args=data_args,
                              is_training=True,
                              logger=logger)
    valid_loader = get_loader(phase="valid",
                              data_args=data_args,
                              is_training=False,
                              logger=logger)
    dense_valid_loader = get_loader(phase="dense_valid",
                                    data_args=data_args,
                                    is_training=False,
                                    logger=logger)

    # Instantiate the predictor class for obtaining model predictions.
    predictor = Predictor(model, args.device)

    # Instantiate the evaluator class for evaluating models.
    # By default, get best performance on validation set.
    evaluator = Evaluator(logger=logger, tune_threshold=True)

    # Instantiate the saver class for saving model checkpoints.
    saver = ModelSaver(save_dir=logger_args.save_dir,
                       iters_per_save=logger_args.iters_per_save,
                       max_ckpts=logger_args.max_ckpts,
                       metric_name=optim_args.metric_name,
                       maximize_metric=optim_args.maximize_metric,
                       keep_topk=True,
                       logger=logger)

    # Instantiate the optimizer class for guiding model training.
    optimizer = Optimizer(parameters=model.parameters(),
                          optim_args=optim_args,
                          batch_size=data_args.batch_size,
                          iters_per_print=logger_args.iters_per_print,
                          iters_per_visual=logger_args.iters_per_visual,
                          iters_per_eval=logger_args.iters_per_eval,
                          dataset_len=len(train_loader.dataset),
                          logger=logger)

    if model_args.ckpt_path:
        # Load the same optimizer as used in the original training.
        optimizer.load_optimizer(ckpt_path=model_args.ckpt_path,
                                 gpu_ids=args.gpu_ids)

    loss_fn = evaluator.get_loss_fn(loss_fn_name=optim_args.loss_fn)

    # Run training
    while not optimizer.is_finished_training():
        optimizer.start_epoch()

        for inputs, targets in train_loader:
            optimizer.start_iter()

            if optimizer.global_step % optimizer.iters_per_eval == 0:
                # Only evaluate every iters_per_eval examples.
                predictions, groundtruth = predictor.predict(valid_loader)
                metrics = evaluator.evaluate(groundtruth, predictions)

                # Evaluate on dense dataset
                dense_predictions, dense_groundtruth = predictor.predict(dense_valid_loader)
                dense_metrics = evaluator.dense_evaluate(dense_groundtruth, dense_predictions)

                # Merge the metrics dicts together
                metrics = {**metrics, **dense_metrics}

                # Log metrics to stdout.
                logger.log_metrics(metrics, phase='valid')

                # Log to tb
                logger.log_scalars(metrics, optimizer.global_step, phase='valid')

                if optimizer.global_step % logger_args.iters_per_save == 0:
                    # Only save every iters_per_save examples directly
                    # after evaluation.
                    saver.save(iteration=optimizer.global_step,
                               epoch=optimizer.epoch,
                               model=model,
                               optimizer=optimizer,
                               device=args.device,
                               metric_val=metrics[optim_args.metric_name])

                # Step learning rate scheduler.
                optimizer.step_scheduler(metrics[optim_args.metric_name])

            with torch.set_grad_enabled(True):
                # Run the minibatch through the model.
                logits = model(inputs.to(args.device))

                # Compute the minibatch loss.
                loss = loss_fn(logits, targets.to(args.device))

                # Log the data from this iteration.
                optimizer.log_iter(inputs, logits, targets, loss)

                # Perform a backward pass.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            optimizer.end_iter()

        optimizer.end_epoch(metrics)

    # Save the most recent model.
    saver.save(iteration=optimizer.global_step,
               epoch=optimizer.epoch,
               model=model,
               optimizer=optimizer,
               device=args.device,
               metric_val=metrics[optim_args.metric_name])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--batchsize", "-b", type=int, default=64)
    parser.add_argument("--seq-length", "-l", type=int, default=35)
    parser.add_argument("--total-epochs", "-e", type=int, default=300)
    parser.add_argument("--gpu-device", "-g", type=int, default=0)
    parser.add_argument("--grad-clip", "-gc", type=float, default=5)
    parser.add_argument("--learning-rate", "-lr", type=float, default=1)
    parser.add_argument("--weight-decay", "-wd", type=float, default=0.000001)
    parser.add_argument("--dropout-embedding-softmax", "-dos", type=float, default=0.5)
    parser.add_argument("--dropout-rnn", "-dor", type=float, default=0.2)
    parser.add_argument("--variational-dropout", "-vdo",
                        dest="variational_dropout", action="store_true", default=False)
    parser.add_argument("--use-tanh", "-tanh",
                        dest="use_tanh", action="store_true", default=True)
    parser.add_argument("--use-identity", "-identity",
                        dest="use_tanh", action="store_false")
    parser.add_argument("--momentum", "-mo", type=float, default=0.9)
    parser.add_argument("--optimizer", "-opt", type=str, default="msgd")
    parser.add_argument("--ndim-feature", "-nf", type=int, default=640)
    parser.add_argument("--num-layers", "-nl", type=int, default=2)
    parser.add_argument("--lr-decay-epoch", "-lrd", type=int, default=20)
    parser.add_argument("--model-filename", "-m", type=str, default="model.hdf5")
    args = parser.parse_args()

    print("#layers={}".format(args.num_layers))
    print("d={}".format(args.ndim_feature))
    print("dropout={}".format("Variational" if args.variational_dropout else "Standard"))
    print("g={}".format("tanh" if args.use_tanh else "identity"))

    assert args.num_layers > 0
    assert args.ndim_feature > 0

    dataset_train, dataset_dev, dataset_test = chainer.datasets.get_ptb_words()
    dataset_dev = np.asarray(dataset_dev, dtype=np.int32)

    vocab_size = max(dataset_train) + 1

    rnn = RNN(vocab_size,
              ndim_feature=args.ndim_feature,
              num_layers=args.num_layers,
              use_tanh=args.use_tanh,
              dropout_embedding_softmax=args.dropout_embedding_softmax,
              dropout_rnn=args.dropout_rnn,
              variational_dropout=args.variational_dropout)
    rnn.load(args.model_filename)

    total_iterations_train = len(dataset_train) // (args.seq_length * args.batchsize)

    optimizer = Optimizer(args.optimizer, args.learning_rate, args.momentum)
    optimizer.setup(rnn.model)
    if args.grad_clip > 0:
        optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))
    if args.weight_decay > 0:
        optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))

    using_gpu = False
    if args.gpu_device >= 0:
        cuda.get_device(args.gpu_device).use()
        rnn.model.to_gpu()
        using_gpu = True
    xp = rnn.model.xp

    training_start_time = time.time()
    for epoch in range(args.total_epochs):

        sum_loss = 0
        epoch_start_time = time.time()

        # training
        for itr in range(total_iterations_train):
            # sample minibatch
            batch_offsets = np.random.randint(0, len(dataset_train) - args.seq_length - 1,
                                              size=args.batchsize)
            x_batch = np.empty((args.batchsize, args.seq_length), dtype=np.int32)
            t_batch = np.empty((args.batchsize, args.seq_length), dtype=np.int32)
            for batch_index, offset in enumerate(batch_offsets):
                sequence = dataset_train[offset:offset + args.seq_length]
                teacher = dataset_train[offset + 1:offset + args.seq_length + 1]
                x_batch[batch_index] = sequence
                t_batch[batch_index] = teacher

            if using_gpu:
                x_batch = cuda.to_gpu(x_batch)
                t_batch = cuda.to_gpu(t_batch)
            t_batch = flatten(t_batch)

            # update model parameters
            with chainer.using_config("train", True):
                rnn.reset_state()
                y_batch = rnn(x_batch, flatten=True)
                loss = functions.softmax_cross_entropy(y_batch, t_batch)

                rnn.model.cleargrads()
                loss.backward()
                optimizer.update()

                sum_loss += float(loss.data)
                assert sum_loss == sum_loss, "Encountered NaN!"

            printr("Training ... {:3.0f}% ({}/{})".format(
                (itr + 1) / total_iterations_train * 100, itr + 1, total_iterations_train))

        rnn.save(args.model_filename)

        # evaluation
        x_sequence = dataset_dev[:-1]
        t_sequence = dataset_dev[1:]
        rnn.reset_state()
        total_iterations_dev = math.ceil(len(x_sequence) / args.seq_length)
        offset = 0
        negative_log_likelihood = 0
        for itr in range(total_iterations_dev):
            seq_length = min(offset + args.seq_length, len(x_sequence)) - offset
            x_batch = x_sequence[None, offset:offset + seq_length]
            t_batch = flatten(t_sequence[None, offset:offset + seq_length])

            if using_gpu:
                x_batch = cuda.to_gpu(x_batch)
                t_batch = cuda.to_gpu(t_batch)

            with chainer.no_backprop_mode() and chainer.using_config("train", False):
                y_batch = rnn(x_batch, flatten=True)
                negative_log_likelihood += float(
                    functions.softmax_cross_entropy(y_batch, t_batch).data) * seq_length

            printr("Computing perplexity ...{:3.0f}% ({}/{})".format(
                (itr + 1) / total_iterations_dev * 100, itr + 1, total_iterations_dev))
            offset += seq_length

        assert negative_log_likelihood == negative_log_likelihood, "Encountered NaN!"
        perplexity = math.exp(negative_log_likelihood / len(dataset_dev))

        clear_console()
        print("Epoch {} done in {} sec - loss: {:.6f} - log_likelihood: {} - ppl: {} - lr: {:.3g} - total {} min"
              .format(epoch + 1,
                      int(time.time() - epoch_start_time),
                      sum_loss / total_iterations_train,
                      int(-negative_log_likelihood),
                      int(perplexity),
                      optimizer.get_learning_rate(),
                      int((time.time() - training_start_time) // 60)))

        if epoch >= args.lr_decay_epoch:
            optimizer.decrease_learning_rate(0.98, final_value=1e-5)
if opt.resume and not opt.load_checkpoint:
    last_checkpoint = get_last_checkpoint(opt.best_model_dir)
    if last_checkpoint:
        opt.load_checkpoint = os.path.join(opt.model_dir, last_checkpoint)
        opt.skip_steps = int(last_checkpoint.strip('.pt').split('/')[-1])

if opt.load_checkpoint:
    model.load_state_dict(torch.load(opt.load_checkpoint))
    opt.skip_steps = int(opt.load_checkpoint.strip('.pt').split('/')[-1])
    logger.info(f"\nLoad from {opt.load_checkpoint}\n")
else:
    for param in model.parameters():
        param.data.uniform_(-opt.init_weight, opt.init_weight)

optimizer = optim.Adam(model.parameters())
optimizer = Optimizer(optimizer, max_grad_norm=opt.clip_grad)
loss = nn.CrossEntropyLoss()

model = model.to(device)
loss = loss.to(device)

if opt.phase == 'train':
    trans_data = TranslateData()
    train_set = Short_text_Dataset(opt.train_path,
                                   trans_data.translate_data,
                                   src_vocab,
                                   max_src_length=opt.max_src_length)
    train = DataLoader(train_set,
                       batch_size=opt.batch_size,
                       shuffle=False,
                       drop_last=True,
                       collate_fn=trans_data.collate_fn)
model = VideoModel(config.model)

if config.training.num_gpu > 0:
    model = model.cuda()
    if config.training.num_gpu > 1:
        device_ids = list(range(config.training.num_gpu))
        model = torch.nn.DataParallel(model, device_ids=device_ids)
    logger.info('Loaded the model to %d GPUs' % config.training.num_gpu)

n_params, enc, dec, fir_enc = count_parameters(model)
logger.info('# the number of parameters in the whole model: %d' % n_params)
logger.info('# the number of parameters in the Encoder: %d' % enc)
logger.info('# the number of parameters in the Decoder: %d' % (n_params - enc))

optimizer = Optimizer(model.parameters(), config.optim)
logger.info('Created a %s optimizer.' % config.optim.type)

start_epoch = 0

# create a visualizer
if config.training.visualization:
    visualizer = SummaryWriter(os.path.join(exp_name, 'log'))
    logger.info('Created a visualizer.')
else:
    visualizer = None

for epoch in range(start_epoch, config.training.epochs):
    # eval_ctc_model(epoch, config, model, val_data, logger, visualizer)
    train_ctc_model(epoch, config, model, train_data, optimizer, logger, visualizer)
                     n_layers=opt.rnn_layers,
                     rnn_cell='lstm',
                     dropout_p=0.2,
                     use_attention=True,
                     bidirectional=bidirectional,
                     eos_id=tgt.eos_id,
                     sos_id=tgt.sos_id)
seq2seq = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
    seq2seq.cuda()

for param in seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)

optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters(),
                                       lr=opt.lr,
                                       betas=(0.9, 0.995)),
                      max_grad_norm=opt.grad_norm)
# scheduler = StepLR(optimizer.optimizer, 1)
# optimizer.set_scheduler(scheduler)

weight = torch.ones(len(tgt.vocab))
pad = tgt.vocab.stoi[tgt.pad_token]
loss = CrossEntropyLoss(weight, pad)
if torch.cuda.is_available():
    loss.cuda()

t = SupervisedTrainer(loss=loss,
                      batch_size=batch_size,
                      checkpoint_every=100,
                      print_every=10,
                      expt_dir=opt.expt_dir)