'vocab_size': len(vocab), 'embedding_size': opts.word_embedding_size, 'hidden_size': opts.rnn_hidden_size, 'padding_idx': padding_idx, 'dropout_ratio': opts.rnn_dropout, 'bidirectional': opts.bidirectional == 1, 'num_layers': opts.rnn_num_layers } # Model setup torch.no_grad() model = SelfMonitoring(**policy_model_kwargs).cuda() encoder = EncoderRNN(**encoder_kwargs).cuda() params = list(encoder.parameters()) + list(model.parameters()) optimizer = torch.optim.Adam(params, lr=opts.learning_rate) resume_training(opts, model, encoder, optimizer) model.eval() # model.device = torch.device("cpu") encoder.eval() # encoder.device = torch.device("cpu") resnet = models.resnet152(pretrained=True) resnet.eval() resnet.cuda() # Gibson setup config = parse_config('ped.yaml') def transform_img(im): ''' Prep gibson rgb input for pytorch model ''' # RGB pixel mean - from feature precomputing script im = im[60:540, :, :3].copy()
def main(opts):
    """Train or evaluate a panoramic R2R VLN agent (instruction encoder + policy).

    Flow: seed/vocab setup -> encoder -> policy model -> Adam optimizer ->
    optional checkpoint resume -> one of: test-submission eval, eval-only over
    the validation splits, or the full train/eval/checkpoint loop.
    """
    # set manual_seed and build vocab
    print(opts, flush=True)
    setup(opts, opts.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Usando {device} :)")

    # create a batch training environment that will also preprocess text
    vocab = read_vocab(opts.train_vocab)
    tok = Tokenizer(opts.remove_punctuation == 1, opts.reversed == 1,
                    vocab=vocab, encoding_length=opts.max_cap_length)

    # create language instruction encoder
    encoder_kwargs = {
        'opts': opts,
        'vocab_size': len(vocab),
        'embedding_size': opts.word_embedding_size,
        'hidden_size': opts.rnn_hidden_size,
        'padding_idx': padding_idx,  # NOTE(review): module-level name defined outside this function
        'dropout_ratio': opts.rnn_dropout,
        'bidirectional': opts.bidirectional == 1,  # int flag (0/1) -> bool
        'num_layers': opts.rnn_num_layers
    }
    print('Using {} as encoder ...'.format(opts.lang_embed))
    if 'lstm' in opts.lang_embed:
        encoder = EncoderRNN(**encoder_kwargs)
    else:
        raise ValueError('Unknown {} language embedding'.format(opts.lang_embed))
    print(encoder)

    # create policy model
    policy_model_kwargs = {
        'opts':opts,
        'img_fc_dim': opts.img_fc_dim,
        'img_fc_use_batchnorm': opts.img_fc_use_batchnorm == 1,
        'img_dropout': opts.img_dropout,
        'img_feat_input_dim': opts.img_feat_input_dim,
        'rnn_hidden_size': opts.rnn_hidden_size,
        'rnn_dropout': opts.rnn_dropout,
        'max_len': opts.max_cap_length,
        'max_navigable': opts.max_navigable
    }

    # select the agent architecture by name
    if opts.arch == 'regretful':
        model = Regretful(**policy_model_kwargs)
    elif opts.arch == 'self-monitoring':
        model = SelfMonitoring(**policy_model_kwargs)
    elif opts.arch == 'speaker-baseline':
        model = SpeakerFollowerBaseline(**policy_model_kwargs)
    else:
        raise ValueError('Unknown {} model for seq2seq agent'.format(opts.arch))
    print(model)

    encoder = encoder.to(device)
    model = model.to(device)

    # a single optimizer drives both the encoder and the policy model
    params = list(encoder.parameters()) + list(model.parameters())
    optimizer = torch.optim.Adam(params, lr=opts.learning_rate)

    # optionally resume from a checkpoint
    if opts.resume:
        model, encoder, optimizer, best_success_rate = resume_training(opts, model, encoder, optimizer)

    # if a secondary exp name is specified, this is useful when resuming from a previous saved
    # experiment and save to another experiment, e.g., pre-trained on synthetic data and fine-tune on real data
    if opts.exp_name_secondary:
        opts.exp_name += opts.exp_name_secondary

    feature, img_spec = load_features(opts.img_feat_dir, opts.blind)

    # one-off evaluation on the hidden test split, then exit
    if opts.test_submission:
        assert opts.resume, 'The model was not resumed before running for submission.'
        test_env = ('test', (R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                          splits=['test'], tokenizer=tok),
                             Evaluation(['test'], opts)))
        agent_kwargs = {
            'opts': opts,
            'env': test_env[1][0],
            'results_path': "",
            'encoder': encoder,
            'model': model,
            'feedback': opts.feedback
        }
        agent = PanoSeq2SeqAgent(**agent_kwargs)
        # setup trainer
        trainer = PanoSeq2SeqTrainer(opts, agent, optimizer)
        epoch = opts.start_epoch - 1
        trainer.eval(epoch, test_env)
        return

    # set up R2R environments (synthetic split is used when data augmentation is on)
    if not opts.train_data_augmentation:
        train_env = R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                 seed=opts.seed, splits=['train'], tokenizer=tok)
    else:
        train_env = R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                 seed=opts.seed, splits=['synthetic'], tokenizer=tok)

    val_craft_splits = ['craft_seen', 'craft_unseen']
    val_splits = ['val_seen', 'val_unseen']
    if opts.craft_eval:
        val_splits += val_craft_splits
    # one (env, evaluator) pair per validation split
    val_envs = {split: (R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                     splits=[split], tokenizer=tok),
                        Evaluation([split], opts))
                for split in val_splits}

    # create agent
    agent_kwargs = {
        'opts': opts,
        'env': train_env,
        'results_path': "",
        'encoder': encoder,
        'model': model,
        'feedback': opts.feedback
    }
    agent = PanoSeq2SeqAgent(**agent_kwargs)

    # setup trainer
    trainer = PanoSeq2SeqTrainer(opts, agent, optimizer, opts.train_iters_epoch)

    # evaluate all val splits once and exit
    if opts.eval_only:
        success_rate = []
        for val_env in val_envs.items():
            success_rate.append(trainer.eval(opts.start_epoch - 1, val_env, tb_logger=None))
        return

    # set up tensorboard logger
    tb_logger = set_tb_logger(opts.log_dir, opts.exp_name, opts.resume)
    sys.stdout.flush()

    # when not resuming, the conditional short-circuits and 0.0 is used,
    # so the unbound name on the left of `if` is never evaluated
    best_success_rate = best_success_rate if opts.resume else 0.0
    for epoch in range(opts.start_epoch, opts.max_num_epochs + 1):
        trainer.train(epoch, train_env, tb_logger)

        if epoch % opts.eval_every_epochs == 0:
            success_rate = []
            for val_env in val_envs.items():
                success_rate.append(trainer.eval(epoch, val_env, tb_logger))

            # index 1 corresponds to the second val split ('val_unseen' per the
            # print below) — relies on dict insertion order of val_envs
            success_rate_compare = success_rate[1]

            if is_experiment():
                # remember best val_seen success rate and save checkpoint
                is_best = success_rate_compare >= best_success_rate
                best_success_rate = max(success_rate_compare, best_success_rate)
                print("--> Highest val_unseen success rate: {}".format(best_success_rate))
                sys.stdout.flush()

                # save the model if it is the best so far
                save_checkpoint({
                    'opts': opts,
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'encoder_state_dict': encoder.state_dict(),
                    'best_success_rate': best_success_rate,
                    'optimizer': optimizer.state_dict(),
                    'max_episode_len': opts.max_episode_len,
                }, is_best, checkpoint_dir=opts.checkpoint_dir, name=opts.exp_name)

        # after the scheduled number of augmentation epochs, switch from the
        # synthetic split back to the real training split
        if opts.train_data_augmentation and epoch == opts.epochs_data_augmentation:
            train_env = R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                     seed=opts.seed, splits=['train'], tokenizer=tok)

    print("--> Finished training")
def main(opts):
    """Train or evaluate a panoramic R2R VLN agent (variant without the
    'regretful' architecture and without craft-eval splits).

    Flow: seed/vocab setup -> encoder -> policy model -> Adam optimizer ->
    optional checkpoint resume -> one of: test-submission eval, beam/eval-only
    over val splits, or the full train/eval/checkpoint loop.
    """
    # set manual_seed and build vocab
    setup(opts, opts.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # create a batch training environment that will also preprocess text
    vocab = read_vocab(opts.train_vocab)
    tok = Tokenizer(
        opts.remove_punctuation == 1,
        opts.reversed == 1,
        vocab=vocab,
        encoding_length=opts.max_cap_length,
    )
    # create language instruction encoder
    encoder_kwargs = {
        "opts": opts,
        "vocab_size": len(vocab),
        "embedding_size": opts.word_embedding_size,
        "hidden_size": opts.rnn_hidden_size,
        "padding_idx": padding_idx,  # NOTE(review): module-level name defined outside this function
        "dropout_ratio": opts.rnn_dropout,
        "bidirectional": opts.bidirectional == 1,  # int flag (0/1) -> bool
        "num_layers": opts.rnn_num_layers,
    }
    print("Using {} as encoder ...".format(opts.lang_embed))
    if "lstm" in opts.lang_embed:
        encoder = EncoderRNN(**encoder_kwargs)
    else:
        raise ValueError("Unknown {} language embedding".format(
            opts.lang_embed))
    print(encoder)
    # create policy model
    policy_model_kwargs = {
        "opts": opts,
        "img_fc_dim": opts.img_fc_dim,
        "img_fc_use_batchnorm": opts.img_fc_use_batchnorm == 1,
        "img_dropout": opts.img_dropout,
        "img_feat_input_dim": opts.img_feat_input_dim,
        "rnn_hidden_size": opts.rnn_hidden_size,
        "rnn_dropout": opts.rnn_dropout,
        "max_len": opts.max_cap_length,
        "max_navigable": opts.max_navigable,
    }
    # select the agent architecture by name
    if opts.arch == "self-monitoring":
        model = SelfMonitoring(**policy_model_kwargs)
    elif opts.arch == "speaker-baseline":
        model = SpeakerFollowerBaseline(**policy_model_kwargs)
    else:
        raise ValueError("Unknown {} model for seq2seq agent".format(
            opts.arch))
    print(model)

    encoder = encoder.to(device)
    model = model.to(device)

    # a single optimizer drives both the encoder and the policy model
    params = list(encoder.parameters()) + list(model.parameters())
    optimizer = torch.optim.Adam(params, lr=opts.learning_rate)

    # optionally resume from a checkpoint
    if opts.resume:
        model, encoder, optimizer, best_success_rate = resume_training(
            opts, model, encoder, optimizer)

    # if a secondary exp name is specified, this is useful when resuming from a previous saved
    # experiment and save to another experiment, e.g., pre-trained on synthetic data and fine-tune on real data
    if opts.exp_name_secondary:
        opts.exp_name += opts.exp_name_secondary

    feature, img_spec = load_features(opts.img_feat_dir)

    # one-off evaluation on the hidden test split, then exit
    if opts.test_submission:
        assert (opts.resume
                ), "The model was not resumed before running for submission."
        test_env = (
            "test",
            (
                R2RPanoBatch(
                    opts,
                    feature,
                    img_spec,
                    batch_size=opts.batch_size,
                    splits=["test"],
                    tokenizer=tok,
                ),
                Evaluation(["test"]),
            ),
        )
        agent_kwargs = {
            "opts": opts,
            "env": test_env[1][0],
            "results_path": "",
            "encoder": encoder,
            "model": model,
            "feedback": opts.feedback,
        }
        agent = PanoSeq2SeqAgent(**agent_kwargs)
        # setup trainer
        trainer = PanoSeq2SeqTrainer(opts, agent, optimizer)
        epoch = opts.start_epoch - 1
        trainer.eval(epoch, test_env)
        return

    # set up R2R environments (synthetic split is used when data augmentation is on)
    if not opts.train_data_augmentation:
        train_env = R2RPanoBatch(
            opts,
            feature,
            img_spec,
            batch_size=opts.batch_size,
            seed=opts.seed,
            splits=["train"],
            tokenizer=tok,
        )
    else:
        train_env = R2RPanoBatch(
            opts,
            feature,
            img_spec,
            batch_size=opts.batch_size,
            seed=opts.seed,
            splits=["synthetic"],
            tokenizer=tok,
        )

    # one (env, evaluator) pair per validation split
    val_envs = {
        split: (
            R2RPanoBatch(
                opts,
                feature,
                img_spec,
                batch_size=opts.batch_size,
                splits=[split],
                tokenizer=tok,
            ),
            Evaluation([split]),
        )
        for split in ["val_seen", "val_unseen"]
    }

    # create agent
    agent_kwargs = {
        "opts": opts,
        "env": train_env,
        "results_path": "",
        "encoder": encoder,
        "model": model,
        "feedback": opts.feedback,
    }
    agent = PanoSeq2SeqAgent(**agent_kwargs)

    # setup trainer
    trainer = PanoSeq2SeqTrainer(opts, agent, optimizer,
                                 opts.train_iters_epoch)

    # evaluate all val splits once and exit
    if opts.eval_beam or opts.eval_only:
        success_rate = []
        for val_env in val_envs.items():
            success_rate.append(
                trainer.eval(opts.start_epoch - 1, val_env, tb_logger=None))
        return

    # set up tensorboard logger
    tb_logger = set_tb_logger(opts.log_dir, opts.exp_name, opts.resume)

    # when not resuming, the conditional short-circuits and 0.0 is used,
    # so the unbound name on the left of `if` is never evaluated
    best_success_rate = best_success_rate if opts.resume else 0.0
    for epoch in range(opts.start_epoch, opts.max_num_epochs + 1):
        trainer.train(epoch, train_env, tb_logger)

        if epoch % opts.eval_every_epochs == 0:
            success_rate = []
            for val_env in val_envs.items():
                success_rate.append(trainer.eval(epoch, val_env, tb_logger))

            # index 1 is the second split ('val_unseen' per the print below) —
            # relies on dict insertion order of val_envs
            success_rate_compare = success_rate[1]

            if is_experiment():
                # remember best val_seen success rate and save checkpoint
                is_best = success_rate_compare >= best_success_rate
                best_success_rate = max(success_rate_compare,
                                        best_success_rate)
                print("--> Highest val_unseen success rate: {}".format(
                    best_success_rate))

                # save the model if it is the best so far
                save_checkpoint(
                    {
                        "opts": opts,
                        "epoch": epoch + 1,
                        "state_dict": model.state_dict(),
                        "encoder_state_dict": encoder.state_dict(),
                        "best_success_rate": best_success_rate,
                        "optimizer": optimizer.state_dict(),
                        "max_episode_len": opts.max_episode_len,
                    },
                    is_best,
                    checkpoint_dir=opts.checkpoint_dir,
                    name=opts.exp_name,
                )

        # after the scheduled number of augmentation epochs, switch from the
        # synthetic split back to the real training split
        if (opts.train_data_augmentation
                and epoch == opts.epochs_data_augmentation):
            train_env = R2RPanoBatch(
                opts,
                feature,
                img_spec,
                batch_size=opts.batch_size,
                seed=opts.seed,
                splits=["train"],
                tokenizer=tok,
            )

    print("--> Finished training")
def main(opts):
    """Train or evaluate a Touchdown navigation agent.

    Supports three model families selected by ``opts.model``: 'vlntrans'
    (BERT instruction encoder + transformer policy, two optimizers) or
    'rconcat'/'ga' (RNN instruction embedding, single optimizer).
    Tracks best dev SPD (lower is better) and best dev TC (higher is better).
    """
    if opts.exp_number is not None:
        opts.exp_name = opts.exp_name + '_' + opts.exp_number
    opts.dataset = 'datasets/%s' % opts.dataset
    tb_logger = set_tb_logger('{}/{}'.format(opts.log_dir, opts.model),
                              opts.exp_name, opts.resume)
    # SPD is a distance (minimize), TC a completion rate (maximize)
    best_SPD, best_TC = float("inf"), 0.0

    # Load data
    if opts.model == 'vlntrans':
        opts.max_instr_len = 512
        vocab_file = "%s/vocab/vlntrans_vocab.txt" % opts.dataset
        tokenizer = tx.data.BERTTokenizer(pretrained_model_name='bert-base-uncased',
                                          hparams={'vocab_file': vocab_file})
    else:
        vocab_file = "%s/vocab/nobert_vocab.txt" % opts.dataset
        vocab = read_vocab(vocab_file)
        tokenizer = Tokenizer(vocab=vocab, encoding_length=opts.max_instr_len)
    features, img_size = load_features(opts.img_feat_dir)
    train_env = TouchdownBatch(opts, features, img_size, batch_size=opts.batch_size,
                               seed=opts.seed, splits=['train'], tokenizer=tokenizer,
                               name="train")
    val_env = TouchdownBatch(opts, features, img_size, batch_size=opts.batch_size,
                             seed=opts.seed, splits=['dev'], tokenizer=tokenizer,
                             name="eval")

    # Build model, optimizers, agent and trainer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if opts.model == 'vlntrans':
        instr_encoder = tx.modules.BERTEncoder(pretrained_model_name='bert-base-uncased').to(device)
        model = VLNTransformer().to(device)
        text_linear = nn.Linear(768, opts.hidden_dim).to(device)  # 768 = BERT hidden size
        pano_encoder = Conv_net(opts).to(device)

        # BERT gets its own optimizer/learning rate, separate from the rest
        bert_params = list(instr_encoder.parameters())
        bert_optimizer = torch.optim.Adam(bert_params, lr=opts.bert_lr,
                                          weight_decay=opts.weight_decay)
        other_params = list(pano_encoder.parameters()) + list(model.parameters()) + list(text_linear.parameters())
        optimizer = torch.optim.Adam(other_params, lr=opts.lr,
                                     weight_decay=opts.weight_decay)

        # wrap AFTER creating the optimizers so parameter lists stay valid
        instr_encoder = nn.DataParallel(instr_encoder)
        text_linear = nn.DataParallel(text_linear)
        pano_encoder = nn.DataParallel(pano_encoder)
        model = nn.DataParallel(model)

        if opts.resume:
            model, instr_encoder, text_linear, pano_encoder, optimizer, bert_optimizer, best_SPD, best_TC = \
                resume_training(opts, model, instr_encoder, text_linear=text_linear,
                                pano_encoder=pano_encoder, optimizer=optimizer,
                                bert_optimizer=bert_optimizer)
        agent = TouchdownVLNTransformer(opts, train_env, instr_encoder,
                                        pano_encoder, text_linear, model)
        trainer = TouchdownTrainer(opts, agent, optimizer, bert_optimizer)
    else:
        # RConcat / GA share the same RNN instruction embedding
        instr_encoder = Embed_RNN(len(vocab)).to(device)
        model = RConcat(opts).to(device) if opts.model == 'rconcat' else GA(opts).to(device)
        params = list(instr_encoder.parameters()) + list(model.parameters())
        optimizer = torch.optim.Adam(params, lr=opts.lr,
                                     weight_decay=opts.weight_decay)
        if opts.resume:
            model, instr_encoder, optimizer, best_SPD, best_TC = \
                resume_training(opts, model, instr_encoder, optimizer=optimizer)
        agent = TouchdownRConcat(opts, train_env, instr_encoder, model) if opts.model == 'rconcat' \
            else TouchdownGA(opts, train_env, instr_encoder, model)
        trainer = TouchdownTrainer(opts, agent, optimizer)

    # Evaluation on dev set and test set
    if opts.test:
        assert opts.resume, 'The model was not resumed.'
        test_env = TouchdownBatch(opts, features, img_size, batch_size=opts.batch_size,
                                  seed=opts.seed, splits=['test'], tokenizer=tokenizer,
                                  name='test')
        epoch = opts.start_epoch - 1
        trainer.eval_(epoch, val_env)
        trainer.eval_(epoch, test_env)
        return

    # Training loop: evaluate on dev every eval_every_epochs and checkpoint
    for epoch in range(opts.start_epoch, opts.max_num_epochs + 1):
        trainer.train(epoch, train_env, tb_logger)
        if epoch % opts.eval_every_epochs == 0:
            TC, SPD = trainer.eval_(epoch, val_env, tb_logger=tb_logger)
            is_best_SPD = SPD <= best_SPD
            best_SPD = min(SPD, best_SPD)
            is_best_TC = TC >= best_TC
            best_TC = max(TC, best_TC)
            print("--> Best dev SPD: {}, best dev TC: {}".format(best_SPD, best_TC))
            ckpt = {
                'opts': opts,
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'instr_encoder_state_dict': instr_encoder.state_dict(),
                'best_SPD': best_SPD,
                'best_TC': best_TC,
                'optimizer': optimizer.state_dict()
            }
            # vlntrans has extra modules and a second optimizer to persist
            if opts.model == 'vlntrans':
                ckpt['pano_encoder_state_dict'] = pano_encoder.state_dict()
                ckpt['text_linear_state_dict'] = text_linear.state_dict()
                ckpt['bert_optimizer'] = bert_optimizer.state_dict()
            save_checkpoint(ckpt, is_best_SPD, is_best_TC, epoch=epoch)
    print("--> Finished training")
# Parsed CLI namespace; `parser` is built earlier in this module.
args = parser.parse_args()

if __name__ == "__main__":
    # Pull the individual CLI values into readable locals.
    data_dir = args.data_dir
    checkpoint_path = args.checkpoint_path
    additional_epochs = args.epochs
    arch = args.arch
    gpu = args.gpu

    print("These are the arguments supplied! :\n {}".format(args))

    # Build dataset/dataloader pairs from the data directory.
    image_datasets, dataloaders = utils.create_dataloaders(data_dir)

    # Only proceed when the requested architecture is one we can build.
    supported_archs = utils.get_supported_archs()
    if arch in supported_archs:
        # Resume training from the checkpoint, then persist the result.
        model = utils.resume_training(
            arch, checkpoint_path, additional_epochs, gpu, dataloaders)
        total_epochs = model.epochs + additional_epochs
        # NOTE(review): `models.learning_rate` reads an attribute off the
        # `models` module — looks like it may have been meant to be
        # `model.learning_rate`; confirm before changing.
        utils.save_checkpoint(
            model,
            image_datasets,
            model.hidden_units,
            model.dropout,
            total_epochs,
            models.learning_rate,
            checkpoint_path,
        )
    else:
        print(
            "'{}' not supported. Please choose either vgg16 or alexnet".format(
                arch))