def test_submission(path_type, max_episode_len, history, MAX_INPUT_LENGTH, feedback_method,
                    n_iters, model_prefix, blind):
    ''' Train on combined training and validation sets, and generate test submission. '''
    setup()

    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAINVAL_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=MAX_INPUT_LENGTH)
    train_env = R2RBatch(features, batch_size=batch_size, splits=['train', 'val_seen', 'val_unseen'],
                         tokenizer=tok, path_type=path_type, history=history, blind=blind)

    # Build models and train
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size, padding_idx,
                          dropout_ratio, bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(action_embedding_size, hidden_size, dropout_ratio).cuda()
    train(train_env, encoder, decoder, n_iters, path_type, history, feedback_method,
          max_episode_len, MAX_INPUT_LENGTH, model_prefix)

    # Generate test submission
    test_env = R2RBatch(features, batch_size=batch_size, splits=['test'], tokenizer=tok,
                        path_type=path_type, history=history, blind=blind)
    agent = Seq2SeqAgent(test_env, "", encoder, decoder, max_episode_len)
    agent.results_path = '%s%s_%s_iter_%d.json' % (RESULT_DIR, model_prefix, 'test', 5000)
    agent.test(use_dropout=False, feedback='argmax')
    agent.write_results()
def train_val():
    ''' Train on the training set, and validate on seen and unseen splits. '''
    setup()

    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=MAX_INPUT_LENGTH)
    train_env = R2RBatch(features, batch_size=batch_size, splits=['train'], tokenizer=tok)

    # Create validation environments
    val_envs = {split: (R2RBatch(features, batch_size=batch_size, splits=[split], tokenizer=tok),
                        Evaluation([split])) for split in ['val_seen', 'val_unseen']}

    # Build models and train
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size, padding_idx,
                          dropout_ratio, bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(Seq2SeqAgent.n_inputs(), Seq2SeqAgent.n_outputs(),
                              action_embedding_size, hidden_size, dropout_ratio).cuda()
    train(train_env, encoder, decoder, n_iters, val_envs=val_envs)
def make_env_and_models(args, train_vocab_path, train_splits, test_splits, batch_size=BATCH_SIZE):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size, splits=train_splits,
                         tokenizer=tok)

    enc_hidden_size = hidden_size // 2 if args.bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=args.bidirectional, glove=glove))
    decoder = try_cuda(AttnDecoderLSTM(
        action_embedding_size, hidden_size, dropout_ratio, feature_size=feature_size))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size, splits=[split],
                         tokenizer=tok),
                eval.Evaluation([split]))
        for split in test_splits}

    return train_env, test_envs, encoder, decoder
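# --- Usage sketch (added for illustration, not from the original code) ---
# Assumes an `args` namespace carrying the fields referenced above (image-feature
# settings and `bidirectional`) and a `train` routine like the ones defined
# elsewhere in this file; the split names below are just examples.
if __name__ == '__main__':
    train_env, test_envs, encoder, decoder = make_env_and_models(
        args, TRAIN_VOCAB, train_splits=['train'], test_splits=['val_seen', 'val_unseen'])
    train(train_env, encoder, decoder, n_iters, val_envs=test_envs)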
def train_all(eval_type, seed, max_episode_len, max_input_length, feedback, n_iters, prefix, blind,
              debug, train_vocab, trainval_vocab, batch_size, action_embedding_size,
              target_embedding_size, bidirectional, dropout_ratio, weight_decay, feature_size,
              hidden_size, word_embedding_size, lr, result_dir, snapshot_dir, plot_dir,
              train_splits, test_splits):
    ''' Train on the training set, and validate on the test split. '''
    setup(seed, train_vocab, trainval_vocab)

    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(train_vocab if eval_type == 'val' else trainval_vocab)
    tok = Tokenizer(vocab=vocab, encoding_length=max_input_length)
    train_env = R2RBatch(batch_size=batch_size, splits=train_splits, tokenizer=tok, seed=seed,
                         blind=blind)

    # Create validation environments
    val_envs = {split: (R2RBatch(batch_size=batch_size, splits=[split], tokenizer=tok, seed=seed,
                                 blind=blind),
                        Evaluation([split], seed=seed)) for split in test_splits}

    # Build models and train
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size, padding_idx,
                          dropout_ratio, bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(Seq2SeqAgent.n_inputs(), Seq2SeqAgent.n_outputs(),
                              action_embedding_size, hidden_size, dropout_ratio, feature_size).cuda()
    train(eval_type, train_env, encoder, decoder, n_iters, seed, feedback, max_episode_len,
          max_input_length, prefix, blind, lr, weight_decay, result_dir, snapshot_dir, plot_dir,
          val_envs=val_envs, debug=debug)
def train_test(path_type, max_episode_len, history, MAX_INPUT_LENGTH, feedback_method, n_iters,
               model_prefix, blind):
    ''' Train on the training set, and validate on the test split. '''
    setup()

    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAINVAL_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=MAX_INPUT_LENGTH)
    train_env = R2RBatch(features, batch_size=batch_size, splits=['train', 'val_seen', 'val_unseen'],
                         tokenizer=tok, path_type=path_type, history=history, blind=blind)

    # Create validation environments
    val_envs = {split: (R2RBatch(features, batch_size=batch_size, splits=[split], tokenizer=tok,
                                 path_type=path_type, history=history, blind=blind),
                        Evaluation([split], path_type=path_type)) for split in ['test']}

    # Build models and train
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size, padding_idx,
                          dropout_ratio, bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(Seq2SeqAgent.n_inputs(), Seq2SeqAgent.n_outputs(),
                              action_embedding_size, hidden_size, dropout_ratio).cuda()
    train(train_env, encoder, decoder, n_iters, path_type, history, feedback_method,
          max_episode_len, MAX_INPUT_LENGTH, model_prefix, val_envs=val_envs)
def train_val(eval_type, seed, max_episode_len, history, max_input_length, feedback_method,
              n_iters, model_prefix, blind, debug):
    ''' Train on the training set, and validate on seen and unseen splits. '''
    setup(seed)

    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=max_input_length)
    train_env = R2RBatch(batch_size=batch_size, splits=['train'], tokenizer=tok, seed=seed,
                         history=history, blind=blind)

    # Create validation environments
    val_envs = {split: (R2RBatch(batch_size=batch_size, splits=[split], tokenizer=tok, seed=seed,
                                 history=history, blind=blind),
                        Evaluation([split], seed=seed)) for split in ['val_seen']}

    # Build models and train
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size, padding_idx,
                          dropout_ratio, bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(Seq2SeqAgent.n_inputs(), Seq2SeqAgent.n_outputs(),
                              action_embedding_size, hidden_size, dropout_ratio, feature_size).cuda()
    train(eval_type, train_env, encoder, decoder, n_iters, seed, history, feedback_method,
          max_episode_len, max_input_length, model_prefix, val_envs=val_envs, debug=debug)
def test_submission():
    ''' Train on combined training and validation sets, and generate test submission. '''
    setup()

    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=MAX_INPUT_LENGTH)
    # train_env = R2RBatch(features, batch_size=batch_size, splits=['train', 'val_seen', 'val_unseen'], tokenizer=tok)

    # Build models and train
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size, padding_idx,
                          dropout_ratio, bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(Seq2SeqAgent.n_inputs(), Seq2SeqAgent.n_outputs(),
                              action_embedding_size, hidden_size, dropout_ratio).cuda()
    # train(train_env, encoder, decoder, n_iters)
    encoder.load_state_dict(torch.load('%s/seq2seq_enc.pt' % (SNAPSHOT_DIR)))
    decoder.load_state_dict(torch.load('%s/seq2seq_dec.pt' % (SNAPSHOT_DIR)))

    # Generate test submission
    test_env = R2RBatch(features, batch_size=batch_size, splits=['test1'], tokenizer=tok)
    agent = Seq2SeqAgent(test_env, "", encoder, decoder, max_episode_len)
    agent.results_path = '%s%s_%s_iter_%d.json' % (RESULT_DIR, 'seq2seq', 'test1', 20000)
    agent.test(use_dropout=False, feedback='argmax')
    agent.write_results()
class ActorCriticAgent(BaseAgent):
    model_actions = ['left', 'right', 'up', 'down', 'forward', '<end>', '<start>', '<ignore>']
    env_actions = [
        (0, -1, 0),  # left
        (0, 1, 0),   # right
        (0, 0, 1),   # up
        (0, 0, -1),  # down
        (1, 0, 0),   # forward
        (0, 0, 0),   # <end>
        (0, 0, 0),   # <start>
        (0, 0, 0)    # <ignore>
    ]
    SavedAction = namedtuple('SavedAction', ['log_prob', 'value', 'step'])
    eps = np.finfo(np.float32).eps.item()

    def __init__(self, env, vocab_size, results_path, batch_size, episode_len=20):
        super(ActorCriticAgent, self).__init__(env, results_path)

        # For evaluation
        self.ev = Evaluation(['train'])

        # For navigation
        self.episode_len = episode_len
        self.losses = []

        ''' Define instruction encoder '''
        word_embedding_size = 256
        hidden_size = 512
        bidirectional = False
        dropout_ratio = 0.5
        enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
        self.encoder = EncoderLSTM(vocab_size, word_embedding_size, enc_hidden_size, padding_idx,
                                   dropout_ratio, bidirectional=bidirectional).cuda()

        context_size = 1024
        self.hist_encoder = EncoderHistory(len(self.model_actions), 32, 2048, context_size).cuda()
        self.a2c_agent = A2CAgent(enc_hidden_size, context_size, len(self.model_actions) - 2).cuda()
        self.saved_actions = []

        params = list(self.encoder.parameters()) + list(self.hist_encoder.parameters()) \
            + list(self.a2c_agent.parameters())
        self.optimizer = torch.optim.Adam(params, lr=0.001, weight_decay=1e-5)

    def _sort_batch(self, obs):
        seq_tensor = np.array([ob['instr_encoding'] for ob in obs])
        seq_lengths = np.argmax(seq_tensor == padding_idx, axis=1)
        seq_lengths[seq_lengths == 0] = seq_tensor.shape[1]  # Full length
        seq_tensor = torch.from_numpy(seq_tensor)
        seq_lengths = torch.from_numpy(seq_lengths)
        # Sort sequences by lengths
        seq_lengths, perm_idx = seq_lengths.sort(0, True)
        sorted_tensor = seq_tensor[perm_idx]
        mask = (sorted_tensor == padding_idx)[:, :seq_lengths[0]]
        return Variable(sorted_tensor, requires_grad=False).long().cuda(), \
            mask.byte().cuda(), \
            list(seq_lengths), list(perm_idx)

    def _feature_variable(self, obs):
        feature_size = obs[0]['feature'].shape[0]
        features = np.empty((len(obs), feature_size), dtype=np.float32)
        for i, ob in enumerate(obs):
            features[i, :] = ob['feature']
        return Variable(torch.from_numpy(features), requires_grad=False).cuda()

    def _teacher_action(self, obs, ended):
        a = torch.LongTensor(len(obs))
        for i, ob in enumerate(obs):
            # Supervised teacher only moves one axis at a time
            ix, heading_chg, elevation_chg = ob['teacher']
            if heading_chg > 0:
                a[i] = self.model_actions.index('right')
            elif heading_chg < 0:
                a[i] = self.model_actions.index('left')
            elif elevation_chg > 0:
                a[i] = self.model_actions.index('up')
            elif elevation_chg < 0:
                a[i] = self.model_actions.index('down')
            elif ix > 0:
                a[i] = self.model_actions.index('forward')
            elif ended[i]:
                a[i] = self.model_actions.index('<ignore>')
            else:
                a[i] = self.model_actions.index('<end>')
        return Variable(a, requires_grad=False).cuda()

    def rollout(self, guide_prob):
        # For navigation
        obs = np.array(self.env.reset())
        batch_size = len(obs)
        seq, seq_mask, seq_lengths, perm_idx = self._sort_batch(obs)
        perm_obs = obs[perm_idx]
        traj = [{
            'instr_id': ob['instr_id'],
            'path': [(ob['viewpoint'], ob['heading'], ob['elevation'])]
        } for ob in perm_obs]
        ctx, h_t, c_t = self.encoder(seq, seq_lengths)
        a_t = Variable(torch.ones(batch_size).long() * self.model_actions.index('<start>'),
                       requires_grad=False).cuda()
        ended = np.array([False] * len(obs))
        env_action = [None] * batch_size
        h_n, c_n = self.hist_encoder.init_hidden(batch_size)

        for t in range(self.episode_len):
            f_t = self._feature_variable(perm_obs)
            enc_data, h_n, c_n = self.hist_encoder(a_t, f_t, h_n, c_n)
            action_prob, critic_value = self.a2c_agent(ctx, seq_lengths, enc_data)
            guided = np.random.choice(2, batch_size, p=[1.0 - guide_prob, guide_prob])
            demo = self._teacher_action(perm_obs, ended)
            if guided[0] == 1:
                a_t = demo
            else:
                if len(perm_obs[0]['navigableLocations']) <= 1:
                    action_prob[0, self.model_actions.index('forward')] = -float('inf')
                action_prob = F.softmax(action_prob, dim=1)
                m = Categorical(action_prob)
                a_t = m.sample()
                if not ended[0]:
                    self.saved_actions.append(self.SavedAction(m.log_prob(a_t), critic_value, t))

            for i, (idx, ob) in enumerate(zip(perm_idx, perm_obs)):
                action_idx = a_t[i]
                if action_idx == self.model_actions.index('<end>'):
                    ended[i] = True
                env_action[idx] = self.env_actions[action_idx]

            obs = np.array(self.env.step(env_action))
            perm_obs = obs[perm_idx]
            for i, ob in enumerate(perm_obs):
                if not ended[i]:
                    traj[i]['path'].append((ob['viewpoint'], ob['heading'], ob['elevation']))

            if ended.all():
                break

        return traj

    def clear_saved_actions(self):
        del self.saved_actions[:]

    def test(self, guide_prob):
        self.encoder.eval()
        self.hist_encoder.eval()
        self.a2c_agent.eval()
        self.env.reset_epoch()
        self.losses = []
        self.results = {}
        # We rely on env showing the entire batch before repeating anything
        # print 'Testing %s' % self.__class__.__name__
        looped = False
        while True:
            for traj in self.rollout(guide_prob):
                if traj['instr_id'] in self.results:
                    looped = True
                else:
                    self.results[traj['instr_id']] = traj['path']
            if looped:
                break
        self.clear_saved_actions()

    def train(self, n_iters, guide_prob):
        self.encoder.train()
        self.hist_encoder.train()
        self.a2c_agent.train()
        policy_losses = []
        value_losses = []
        self.losses = []
        total_num = 0
        success_num = 0
        for iter in range(1, n_iters + 1):
            traj = self.rollout(guide_prob)
            for i, t in enumerate(traj):
                nav_error, oracle_error, trajectory_step, trajectory_length = \
                    self.ev._score_item(t['instr_id'], t['path'])
                reward = 1.0 if nav_error < 3.0 else 0.0
                total_num += 1.0
                success_num += reward
                for log_prob, value, step in self.saved_actions:
                    discounted_reward = pow(0.99, trajectory_step - step) * reward
                    advantage = discounted_reward - value.item()
                    policy_losses.append(-log_prob * advantage)
                    value_losses.append(F.smooth_l1_loss(
                        value,
                        Variable(torch.tensor([[discounted_reward]]).cuda(), requires_grad=False)))

            data_len = len(policy_losses)
            if data_len > 64:
                self.optimizer.zero_grad()
                value_loss = torch.stack(value_losses).sum()
                policy_loss = torch.stack(policy_losses).sum()
                loss = value_loss + policy_loss
                self.losses.append(value_loss.item() / data_len)
                # print('sub iter [%d/%d], Average Value Loss: %.4f' % (iter, n_iters, value_loss.item() / data_len))
                loss.backward()
                self.optimizer.step()
                self.clear_saved_actions()
                policy_losses = []
                value_losses = []

        data_len = len(policy_losses)
        if data_len > 0:
            self.optimizer.zero_grad()
            loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
            self.losses.append(loss.item() / data_len)
            loss.backward()
            self.optimizer.step()
            self.clear_saved_actions()

        print('guide prob: %.2f, train value loss: %.4f, success: %.2f'
              % (guide_prob, np.average(np.array(self.losses)), (success_num / total_num)))
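# --- Usage sketch (added for illustration, not from the original code) ---
# Assumes `train_env` (an R2RBatch over the 'train' split), `vocab`, `batch_size`
# and `RESULT_DIR` are defined as in the surrounding snippets; the annealing
# schedule for `guide_prob` below is a hypothetical choice, not the authors'.
if __name__ == '__main__':
    agent = ActorCriticAgent(train_env, len(vocab), RESULT_DIR + 'a2c_train.json', batch_size)
    for guide_prob in (0.9, 0.7, 0.5, 0.3):   # gradually rely less on the teacher actions
        agent.train(n_iters=200, guide_prob=guide_prob)
    agent.test(guide_prob=0.0)                # roll out the learned policy only
    agent.write_results()                     # assumes BaseAgent provides write_results()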
n_iters = 5000
log_every = 100
save_every = 100

vocab = read_vocab(TRAIN_VOCAB)
tok = Tokenizer(vocab=vocab)
glove = np.load(glove_path)
enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
feature_size = FEATURE_SIZE

visEncoder = try_cuda(SpeakerEncoderLSTM(
    action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
    bidirectional=bidirectional))
lanEncoder = try_cuda(EncoderLSTM(
    len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
    dropout_ratio, bidirectional=False, glove=glove))
dotSim = try_cuda(dotSimilarity(batch_size, enc_hidden_size))

agent = compatModel(None, "", visEncoder, lanEncoder, dotSim)
# agent.load('tasks/R2R/snapshots/release/speaker_final_release', map_location='cpu')
agent.load('tasks/R2R/compat/trained_1/compat_sample_imagenet_mean_pooled_train_iter_1000',
           map_location='cpu')

if __name__ == "__main__":
    traj = {'scan': '5q7pvUzZiYa',
            'path': ["7dc12a67ddfc4a4a849ce620db5b777b", "0e84cf4dec784bc28b78a80bee35c550",
                     "a77784b955454209857d745976a1676d", "67971a17c26f4e2ca117b4fca73507fe",
                     "8db06d3a0dd44508b3c078d60126ce19", "43ac37dfa1db4a13a8a9df4e454eb016",
                     "4bd82c990a6548a994daa97c8f52db06", "6d11ca4d41e04bb1a725c2223c36b2aa",
                     "29fb3c58b29348558d36a9f9440a1379", "c23f26401359426982d11ca494ee739b",
                     "397403366d784caf804d741f32fd68b9", "3c6a35e15ada4b649990d6568cce8bd9",
                     "55e4436f528c4bf09e4550079c572f7b", "69fad7dd177847dbabf69e8fb7c00ddf",
                     "c629c7f1cf6f47a78c45a8ae9ff82247", "21fca0d6192940e580587fe317440f56",
                     "4b85d61dd3a94e8a812affe78f3a322d", "3c025b8e3d2040969cd00dd0e9f29b09"][:2],
            'heading': 0.0,
            'elevation_init': 0.0}
    encoded_instructions, _ = tok.encode_sentence('')
    encoded_instructions = torch.tensor([encoded_instructions], device='cpu')
    rdv_test = rdv(traj)
    path_obs, path_actions = rdv_test.obs_and_acts()

    # predicted score
    score = agent.predict(path_obs, path_actions, encoded_instructions)
from model import AlignModel, Seq2Seq, EncoderLSTM, Decoder
from data_gen import dev_iter, vocab
from torch.utils.tensorboard import SummaryWriter
import torch
import torch.nn as nn

attn = AlignModel()
enc = EncoderLSTM()
dec = Decoder()
special_ids = [
    vocab.stoi['<sos>'],
    vocab.stoi['<eos>'],
    vocab.stoi['<unk>'],
    vocab.stoi['<pad>']
]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Seq2Seq(enc, dec, device).to(device)
model.load_state_dict(torch.load('./checkpoint/Seq2Seq_2020-05-28 16:21%.pth'))
criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi['<pad>'])


# Convert model output ids back to vocabulary words and print them
def id2doc(data, my_vocab, special_ids=special_ids):
    # data = data.numpy()
    res = []
    for text in data:
        tmp = []
        for num in text:
            if num not in special_ids:
                tmp.append(my_vocab.itos[num])
        res.append(tmp)
    for sen in res:
        print(sen)
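# --- Illustrative call of id2doc (added, not part of the original script) ---
# `example_ids` is a hypothetical batch of token-id sequences; any ids in
# `special_ids` ('<sos>', '<eos>', '<unk>', '<pad>') are dropped before printing.
if __name__ == '__main__':
    w1 = min(4, len(vocab.itos) - 1)  # arbitrary in-vocabulary indices for the demo
    w2 = min(5, len(vocab.itos) - 1)
    example_ids = [[vocab.stoi['<sos>'], w1, w2, vocab.stoi['<eos>']]]
    id2doc(example_ids, vocab)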
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size, splits=train_splits,
                         tokenizer=tok)
    train_env.data.extend(hardNeg_train)  # extend train data and shuffle
    random.shuffle(train_env.data)

    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    # =============================================================================
    # visEncoder = try_cuda(CompatVisEncoderLSTM(
    #     action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
    #     bidirectional=bidirectional))
    # =============================================================================
    visEncoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    # =============================================================================
    # lanEncoder = try_cuda(CompatLanEncoderLSTM(
    #     len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
    #     dropout_ratio, bidirectional=True, glove=glove))
    # =============================================================================
    lanEncoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=False, glove=glove))
    dotSim = try_cuda(dotSimilarity(batch_size, enc_hidden_size))
    # visEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/speaker_final_release_enc'))
    # lanEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/follower_final_release_enc'))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size, splits=[split],
                         tokenizer=tok, instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits
    }
    # test_envs['val_seen'][0].data.extend(hardNeg_val_seen)
    test_envs['val_unseen'][0].data.extend(hardNeg_val_unseen)
    test_envs['val_unseen'][0].data = test_envs['val_unseen'][0].data[3000:4000]

    return train_env, test_envs, visEncoder, lanEncoder, dotSim
def train_val(path_type, max_episode_len, history, MAX_INPUT_LENGTH, feedback_method, n_iters,
              model_prefix, blind, args):
    ''' Train on the training set, and validate on seen and unseen splits. '''
    nav_graphs = setup(args.action_space, args.navigable_locs_path)

    # Create a batch training environment that will also preprocess text
    use_bert = (args.encoder_type in ['bert', 'vlbert'])  # for tokenizer and dataloader
    if use_bert:
        tok = BTokenizer(MAX_INPUT_LENGTH)
    else:
        vocab = read_vocab(TRAIN_VOCAB)
        tok = Tokenizer(vocab=vocab, encoding_length=MAX_INPUT_LENGTH)

    # train_env = R2RBatch(features, batch_size=batch_size, splits=['train'], tokenizer=tok,
    #                      path_type=path_type, history=history, blind=blind)
    feature_store = Feature(features, args.panoramic)
    train_env = R2RBatch(feature_store, nav_graphs, args.panoramic, args.action_space,
                         batch_size=args.batch_size, splits=['train'], tokenizer=tok,
                         path_type=path_type, history=history, blind=blind)

    # Create validation environments
    # val_envs = {split: (R2RBatch(features, batch_size=batch_size, splits=[split],
    #                              tokenizer=tok, path_type=path_type, history=history, blind=blind),
    #                     Evaluation([split], path_type=path_type)) for split in ['val_seen', 'val_unseen']}
    val_envs = {split: (R2RBatch(feature_store, nav_graphs, args.panoramic, args.action_space,
                                 batch_size=args.batch_size, splits=[split], tokenizer=tok,
                                 path_type=path_type, history=history, blind=blind),
                        Evaluation([split], path_type=path_type))
                for split in ['val_seen', 'val_unseen']}

    # Build models and train
    # enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
    if args.encoder_type == 'vlbert':
        if args.pretrain_model_name is not None:
            print("Using the pretrained lm model from %s" % (args.pretrain_model_name))
            encoder = DicEncoder(FEATURE_ALL_SIZE, args.enc_hidden_size, args.hidden_size,
                                 args.dropout_ratio, args.bidirectional, args.transformer_update,
                                 args.bert_n_layers, args.reverse_input, args.top_lstm,
                                 args.vl_layers, args.la_layers, args.bert_type)
            premodel = DicAddActionPreTrain.from_pretrained(args.pretrain_model_name)
            encoder.bert = premodel.bert
            encoder.drop = nn.Dropout(p=args.dropout_ratio)
            encoder.bert._resize_token_embeddings(len(tok))  # remember to resize tok embedding size
            encoder.bert.update_lang_bert, encoder.bert.config.update_lang_bert = \
                args.transformer_update, args.transformer_update
            encoder.bert.update_add_layer, encoder.bert.config.update_add_layer = \
                args.update_add_layer, args.update_add_layer
            encoder = encoder.cuda()
        else:
            encoder = DicEncoder(FEATURE_ALL_SIZE, args.enc_hidden_size, args.hidden_size,
                                 args.dropout_ratio, args.bidirectional, args.transformer_update,
                                 args.bert_n_layers, args.reverse_input, args.top_lstm,
                                 args.vl_layers, args.la_layers, args.bert_type).cuda()
            encoder.bert._resize_token_embeddings(len(tok))  # remember to resize tok embedding size
    elif args.encoder_type == 'bert':
        if args.pretrain_model_name is not None:
            print("Using the pretrained lm model from %s" % (args.pretrain_model_name))
            encoder = BertEncoder(args.enc_hidden_size, args.hidden_size, args.dropout_ratio,
                                  args.bidirectional, args.transformer_update, args.bert_n_layers,
                                  args.reverse_input, args.top_lstm, args.bert_type)
            premodel = BertForMaskedLM.from_pretrained(args.pretrain_model_name)
            encoder.bert = premodel.bert
            encoder.drop = nn.Dropout(p=args.dropout_ratio)
            encoder.bert._resize_token_embeddings(len(tok))  # remember to resize tok embedding size
            # encoder.bert.update_lang_bert, encoder.bert.config.update_lang_bert = args.transformer_update, args.transformer_update
            # encoder.bert.update_add_layer, encoder.bert.config.update_add_layer = args.update_add_layer, args.update_add_layer
            encoder = encoder.cuda()
        else:
            encoder = BertEncoder(args.enc_hidden_size, args.hidden_size, args.dropout_ratio,
                                  args.bidirectional, args.transformer_update, args.bert_n_layers,
                                  args.reverse_input, args.top_lstm, args.bert_type).cuda()
            encoder.bert._resize_token_embeddings(len(tok))
    else:
        enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
        encoder = EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size, padding_idx,
                              dropout_ratio, bidirectional=bidirectional).cuda()

    # decoder = AttnDecoderLSTM(Seq2SeqAgent.n_inputs(), Seq2SeqAgent.n_outputs(),
    #                           action_embedding_size, args.hidden_size, args.dropout_ratio).cuda()
    ctx_hidden_size = args.enc_hidden_size * (2 if args.bidirectional else 1)
    if use_bert and not args.top_lstm:
        ctx_hidden_size = 768
    decoder = R2RAttnDecoderLSTM(Seq2SeqAgent.n_inputs(), Seq2SeqAgent.n_outputs(),
                                 action_embedding_size, ctx_hidden_size, args.hidden_size,
                                 args.dropout_ratio, FEATURE_SIZE, args.panoramic,
                                 args.action_space, args.dec_h_type).cuda()

    train(train_env, encoder, decoder, n_iters, path_type, history, feedback_method,
          max_episode_len, MAX_INPUT_LENGTH, model_prefix, val_envs=val_envs, args=args)
from vocab import SUBTRAIN_VOCAB, TRAIN_VOCAB, TRAINVAL_VOCAB

MAX_INPUT_LENGTH = 80
feature_size = 2048 + 128
max_episode_len = 10
word_embedding_size = 300
glove_path = 'tasks/R2R/data/train_glove.npy'
action_embedding_size = 2048 + 128
hidden_size = 512
dropout_ratio = 0.5

vocab = read_vocab(TRAIN_VOCAB)
tok = Tokenizer(vocab=vocab)
glove = np.load(glove_path)
encoder = try_cuda(EncoderLSTM(
    len(vocab), word_embedding_size, hidden_size, vocab_pad_idx,
    dropout_ratio, glove=glove))
decoder = try_cuda(AttnDecoderLSTM(
    action_embedding_size, hidden_size, dropout_ratio, feature_size=feature_size))
agent = Seq2SeqAgent(
    None, "", encoder, decoder, max_episode_len,
    max_instruction_length=MAX_INPUT_LENGTH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent.load('tasks/R2R/snapshots/release/follower_final_release', map_location=device)
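# --- Usage sketch (added for illustration, not from the original code) ---
# Evaluates the loaded follower and writes a results file, mirroring the
# test_submission() snippets above. `test_env` is assumed to be an R2RBatch
# over the desired split (built as in make_env_and_models) and RESULT_DIR a
# results directory; both are assumptions, not defined in this snippet.
agent.env = test_env
agent.results_path = '%s%s_%s.json' % (RESULT_DIR, 'follower_final_release', 'val_unseen')
agent.test(use_dropout=False, feedback='argmax')
agent.write_results()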