import json
from collections import defaultdict, namedtuple

import numpy as np
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical

# Project-local imports, assuming the R2R baseline layout; EncoderHistory and
# A2CAgent are assumed to live alongside EncoderLSTM in model.py.
from agent import BaseAgent
from eval import Evaluation
from model import EncoderLSTM, EncoderHistory, A2CAgent
from utils import padding_idx


def get_scores(output_file, split):
    ''' Score the trajectories saved in output_file against the given split,
        returning the instruction ids that were found and the per-metric scores. '''
    output_ids = []
    evaluator = Evaluation([split], 'lstm')
    evaluator.scores = defaultdict(list)
    instr_ids = set(evaluator.instr_ids)
    with open(output_file) as f:
        for item in json.load(f):
            if item['instr_id'] in instr_ids:
                output_ids.append(item['instr_id'])
                instr_ids.remove(item['instr_id'])
                evaluator._score_item(item['instr_id'], item['trajectory'])
    return output_ids, evaluator.scores
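# A minimal usage sketch (not in the original file). The results path is
# hypothetical, and the 'nav_errors' key assumes the evaluator accumulates
# per-metric lists the way the R2R baseline's Evaluation does.
#
#   output_ids, scores = get_scores('results/val_seen_lstm.json', 'val_seen')
#   print('scored %d trajectories, mean nav error: %.2f'
#         % (len(output_ids), np.average(scores['nav_errors'])))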
class ActorCriticAgent(BaseAgent):
    ''' An agent that navigates with an advantage actor-critic (A2C) policy,
        optionally guided by the supervised teacher during rollouts. '''

    model_actions = ['left', 'right', 'up', 'down', 'forward', '<end>', '<start>', '<ignore>']
    env_actions = [
        (0, -1,  0),  # left
        (0,  1,  0),  # right
        (0,  0,  1),  # up
        (0,  0, -1),  # down
        (1,  0,  0),  # forward
        (0,  0,  0),  # <end>
        (0,  0,  0),  # <start>
        (0,  0,  0)   # <ignore>
    ]
    SavedAction = namedtuple('SavedAction', ['log_prob', 'value', 'step'])
    eps = np.finfo(np.float32).eps.item()

    def __init__(self, env, vocab_size, results_path, batch_size, episode_len=20):
        super(ActorCriticAgent, self).__init__(env, results_path)
        # For evaluation
        self.ev = Evaluation(['train'])
        # For navigation
        self.episode_len = episode_len
        self.losses = []

        # Instruction encoder
        word_embedding_size = 256
        hidden_size = 512
        bidirectional = False
        dropout_ratio = 0.5
        enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
        self.encoder = EncoderLSTM(vocab_size, word_embedding_size, enc_hidden_size,
                                   padding_idx, dropout_ratio,
                                   bidirectional=bidirectional).cuda()

        # History encoder (past action + image features) and actor-critic head.
        # The head outputs len(model_actions) - 2 logits: <start> and <ignore>
        # are never predicted.
        context_size = 1024
        self.hist_encoder = EncoderHistory(len(self.model_actions), 32, 2048,
                                           context_size).cuda()
        self.a2c_agent = A2CAgent(enc_hidden_size, context_size,
                                  len(self.model_actions) - 2).cuda()

        self.saved_actions = []
        params = list(self.encoder.parameters()) \
               + list(self.hist_encoder.parameters()) \
               + list(self.a2c_agent.parameters())
        self.optimizer = torch.optim.Adam(params, lr=0.001, weight_decay=1e-5)

    def _sort_batch(self, obs):
        ''' Extract instruction encodings from a batch of observations and sort
            by descending sequence length (required for packed RNN input). '''
        seq_tensor = np.array([ob['instr_encoding'] for ob in obs])
        seq_lengths = np.argmax(seq_tensor == padding_idx, axis=1)
        seq_lengths[seq_lengths == 0] = seq_tensor.shape[1]  # Full length
        seq_tensor = torch.from_numpy(seq_tensor)
        seq_lengths = torch.from_numpy(seq_lengths)
        # Sort sequences by length
        seq_lengths, perm_idx = seq_lengths.sort(0, True)
        sorted_tensor = seq_tensor[perm_idx]
        mask = (sorted_tensor == padding_idx)[:, :seq_lengths[0]]
        return Variable(sorted_tensor, requires_grad=False).long().cuda(), \
               mask.byte().cuda(), \
               list(seq_lengths), list(perm_idx)

    def _feature_variable(self, obs):
        ''' Stack the precomputed image features of a batch into one tensor. '''
        feature_size = obs[0]['feature'].shape[0]
        features = np.empty((len(obs), feature_size), dtype=np.float32)
        for i, ob in enumerate(obs):
            features[i, :] = ob['feature']
        return Variable(torch.from_numpy(features), requires_grad=False).cuda()

    def _teacher_action(self, obs, ended):
        ''' Map the environment's shortest-path hint onto a model action.
            The supervised teacher only moves one axis at a time. '''
        a = torch.LongTensor(len(obs))
        for i, ob in enumerate(obs):
            ix, heading_chg, elevation_chg = ob['teacher']
            if heading_chg > 0:
                a[i] = self.model_actions.index('right')
            elif heading_chg < 0:
                a[i] = self.model_actions.index('left')
            elif elevation_chg > 0:
                a[i] = self.model_actions.index('up')
            elif elevation_chg < 0:
                a[i] = self.model_actions.index('down')
            elif ix > 0:
                a[i] = self.model_actions.index('forward')
            elif ended[i]:
                a[i] = self.model_actions.index('<ignore>')
            else:
                a[i] = self.model_actions.index('<end>')
        return Variable(a, requires_grad=False).cuda()

    def rollout(self, guide_prob):
        ''' Run one episode. At each step the batch either follows the teacher
            (with probability guide_prob) or samples from the policy.
            NOTE: the guided/ended checks index element 0 only, so this rollout
            effectively assumes batch_size == 1. '''
        obs = np.array(self.env.reset())
        batch_size = len(obs)
        seq, seq_mask, seq_lengths, perm_idx = self._sort_batch(obs)
        perm_obs = obs[perm_idx]

        # Record the starting point
        traj = [{
            'instr_id': ob['instr_id'],
            'path': [(ob['viewpoint'], ob['heading'], ob['elevation'])]
        } for ob in perm_obs]

        ctx, h_t, c_t = self.encoder(seq, seq_lengths)
        a_t = Variable(torch.ones(batch_size).long() * self.model_actions.index('<start>'),
                       requires_grad=False).cuda()
        ended = np.array([False] * len(obs))
        env_action = [None] * batch_size
        h_n, c_n = self.hist_encoder.init_hidden(batch_size)

        for t in range(self.episode_len):
            f_t = self._feature_variable(perm_obs)
            enc_data, h_n, c_n = self.hist_encoder(a_t, f_t, h_n, c_n)
            action_prob, critic_value = self.a2c_agent(ctx, seq_lengths, enc_data)

            # Choose teacher guidance vs. policy sampling for this step
            guided = np.random.choice(2, batch_size, p=[1.0 - guide_prob, guide_prob])
            demo = self._teacher_action(perm_obs, ended)
            if guided[0] == 1:
                # Teacher-guided steps contribute no policy gradient
                a_t = demo
            else:
                # action_prob holds logits here; mask 'forward' if the agent
                # cannot move ahead, then sample from the softmax
                if len(perm_obs[0]['navigableLocations']) <= 1:
                    action_prob[0, self.model_actions.index('forward')] = -float('inf')
                action_prob = F.softmax(action_prob, dim=1)
                m = Categorical(action_prob)
                a_t = m.sample()
                if not ended[0]:
                    self.saved_actions.append(self.SavedAction(m.log_prob(a_t), critic_value, t))

            # Translate model actions into environment actions
            for i, (idx, ob) in enumerate(zip(perm_idx, perm_obs)):
                action_idx = a_t[i]
                if action_idx == self.model_actions.index('<end>'):
                    ended[i] = True
                env_action[idx] = self.env_actions[action_idx]

            obs = np.array(self.env.step(env_action))
            perm_obs = obs[perm_idx]

            # Save trajectory output
            for i, ob in enumerate(perm_obs):
                if not ended[i]:
                    traj[i]['path'].append((ob['viewpoint'], ob['heading'], ob['elevation']))

            if ended.all():
                break
        return traj

    def clear_saved_actions(self):
        del self.saved_actions[:]

    def test(self, guide_prob):
        ''' Run inference over every instruction in the env exactly once. '''
        self.encoder.eval()
        self.hist_encoder.eval()
        self.a2c_agent.eval()
        self.env.reset_epoch()
        self.losses = []
        self.results = {}
        # We rely on env showing the entire batch before repeating anything
        looped = False
        while True:
            for traj in self.rollout(guide_prob):
                if traj['instr_id'] in self.results:
                    looped = True
                else:
                    self.results[traj['instr_id']] = traj['path']
            if looped:
                break
        self.clear_saved_actions()

    def train(self, n_iters, guide_prob):
        ''' Run n_iters rollouts; turn the terminal success signal into
            discounted per-step returns and update once more than 64
            transitions have accumulated. '''
        self.encoder.train()
        self.hist_encoder.train()
        self.a2c_agent.train()
        policy_losses = []
        value_losses = []
        self.losses = []
        total_num = 0
        success_num = 0
        for iter in range(1, n_iters + 1):
            traj = self.rollout(guide_prob)
            for i, t in enumerate(traj):
                nav_error, oracle_error, trajectory_step, trajectory_length = \
                    self.ev._score_item(t['instr_id'], t['path'])
                # Binary terminal reward: success if within 3m of the goal
                reward = 1.0 if nav_error < 3.0 else 0.0
                total_num += 1.0
                success_num += reward
                for log_prob, value, step in self.saved_actions:
                    # Discount the terminal reward back to each visited step
                    discounted_reward = pow(0.99, trajectory_step - step) * reward
                    advantage = discounted_reward - value.item()
                    policy_losses.append(-log_prob * advantage)
                    value_losses.append(F.smooth_l1_loss(
                        value,
                        Variable(torch.tensor([[discounted_reward]]).cuda(),
                                 requires_grad=False)))
            data_len = len(policy_losses)
            if data_len > 64:
                self.optimizer.zero_grad()
                value_loss = torch.stack(value_losses).sum()
                policy_loss = torch.stack(policy_losses).sum()
                loss = value_loss + policy_loss
                self.losses.append(value_loss.item() / data_len)
                loss.backward()
                self.optimizer.step()
                self.clear_saved_actions()
                policy_losses = []
                value_losses = []
        # Flush whatever is left in the buffers
        data_len = len(policy_losses)
        if data_len > 0:
            self.optimizer.zero_grad()
            loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
            self.losses.append(loss.item() / data_len)
            loss.backward()
            self.optimizer.step()
            self.clear_saved_actions()
        print('guide prob: %.2f, train value loss: %.4f, success: %.2f'
              % (guide_prob, np.average(np.array(self.losses)), (success_num / total_num)))
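# A minimal driving sketch (not in the original file): anneal guide_prob from
# mostly teacher-guided rollouts toward pure policy sampling. R2RBatch, the
# tokenizer setup, the feature store, and the paths follow the R2R baseline
# layout and are assumptions here.
#
#   from env import R2RBatch
#   from utils import read_vocab, Tokenizer
#
#   vocab = read_vocab('tasks/R2R/data/train_vocab.txt')
#   tok = Tokenizer(vocab=vocab)
#   env = R2RBatch(feature_store, batch_size=1, splits=['train'], tokenizer=tok)
#   agent = ActorCriticAgent(env, len(vocab), 'tasks/R2R/results/', batch_size=1)
#   for epoch in range(100):
#       guide_prob = max(0.0, 0.5 * (1.0 - epoch / 100.0))  # decay teacher guidance
#       agent.train(n_iters=100, guide_prob=guide_prob)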