def teacher_forcing(self, train=True, target=None):
    if train:
        self.bidaf.train()
    else:
        self.bidaf.eval()

    # Get input
    obs = self.env._get_obs()
    (img_feats, can_feats), feat_len = self.from_shortest_path()   # Feature from the shortest path
    insts, inst_len = self.gt_words(obs)                           # Get ground-truth label

    if target is None:   # Label from the env
        target = np.array([ob['label'] for ob in obs], np.float32)
        target = torch.from_numpy(target).cuda()
    else:
        target = torch.FloatTensor([target] * self.env.batch_size).cuda()

    feat_mask = utils.length2mask(feat_len)
    inst_mask = utils.length2mask(inst_len, 80)
    logits = self.bidaf(img_feats, can_feats, feat_mask, insts, inst_mask)
    # print("TRUE:", torch.sigmoid(logits).mean())
    loss = self.bce_loss(input=logits, target=target)

    if train:
        return loss
    else:
        return loss.item()
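# ---------------------------------------------------------------------------
# NOTE: Nearly every function in this file depends on utils.length2mask. A
# minimal sketch of its assumed behavior follows, inferred only from the call
# sites here (masked_fill_(mask, -inf) and the optional max-size argument);
# the real utils.length2mask may differ in detail.
def _length2mask_sketch(length, size=None):
    """Sketch: (batch, size) BoolTensor that is True at PADDED positions."""
    import torch
    length = torch.as_tensor(length, dtype=torch.long)
    size = int(length.max().item()) if size is None else size
    # position index >= sequence length  ->  padding
    return (torch.arange(size).unsqueeze(0) >= length.unsqueeze(1)).cuda()
# e.g. _length2mask_sketch([2, 3], size=4)
#      -> [[False, False, True, True], [False, False, False, True]]
# ---------------------------------------------------------------------------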
def infer_batch(self, batch=None, insts=None, train=False):
    """
    :param insts: numpy array with [batch_size, length]. It should be PADDED.
    :return: The prob numpy array with [batch_size]
    """
    if train:
        self.bidaf.train()
    else:
        self.bidaf.eval()

    # Get visual input
    if batch is not None:
        self.env.reset(batch)
    obs = self.env._get_obs()
    (img_feats, can_feats), feat_len = self.from_shortest_path()   # Feature from the shortest path

    # Get language input
    if insts is None:   # Use the default inst in the dataset if the argument **insts** is not given
        insts, inst_len = self.gt_words(obs)
    else:               # Pad the insts (if given as a list) and move them to cuda
        if type(insts) is list:
            max_length = max([len(inst) for inst in insts])
            insts = [
                inst + ([self.tok.word_to_index['<PAD>']] * (max_length - len(inst)))
                for inst in insts
            ]
            insts = np.array(insts)
        # print("G infer", self.tok.decode_sentence(insts[0]))
        inst_len = (insts != self.tok.word_to_index['<PAD>']).sum(1)
        # print("len", inst_len[0])
        insts = torch.LongTensor(insts).cuda()

    # Create masks
    feat_mask = utils.length2mask(feat_len)
    inst_mask = utils.length2mask(inst_len, insts.size(1))

    # input --> logit --> probs --> cpu_probs
    logits = self.bidaf(img_feats, can_feats, feat_mask, insts, inst_mask)
    # print("FALSE:", torch.sigmoid(logits).mean())

    if train:
        target = torch.FloatTensor([0.] * self.env.batch_size).cuda()
        loss = self.bce_loss(input=logits, target=target)
        return loss
    else:
        probs = torch.sigmoid(logits)
        answer = probs.cpu().detach().numpy()
        return answer
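# Hypothetical usage of infer_batch (the `matcher` and `padded_insts` names
# are illustrative, not from the repo):
#
#     probs = matcher.infer_batch(batch=batch, insts=padded_insts)
#     well_matched = probs > 0.5   # probs: np.ndarray of shape (batch_size,)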
def teacher_forcing(self, train=True, features=None, insts=None, for_listener=False):
    if train:
        self.encoder.train()
        self.decoder.train()
    else:
        self.encoder.eval()
        self.decoder.eval()

    # Get image input & encode
    if features is not None:
        # It is used in calculating the speaker score in beam-search
        assert insts is not None
        (img_feats, can_feats), lengths = features
        ctx = self.encoder(can_feats, img_feats, lengths)
        batch_size = len(lengths)
    else:
        obs = self.env._get_obs()
        batch_size = len(obs)
        (img_feats, can_feats), lengths = self.from_shortest_path()   # Image feature (from the shortest path)
        ctx = self.encoder(can_feats, img_feats, lengths)

    h_t = torch.zeros(1, batch_size, args.rnn_dim).cuda()
    c_t = torch.zeros(1, batch_size, args.rnn_dim).cuda()
    ctx_mask = utils.length2mask(lengths)

    # Get language input
    if insts is None:
        insts = self.gt_words(obs)   # Language feature

    # Decode
    logits, _, _ = self.decoder(insts, ctx, ctx_mask, h_t, c_t)

    # Because the softmax_loss only allows dim-1 to be the logit,
    # permute the output (batch_size, length, logit) --> (batch_size, logit, length)
    logits = logits.permute(0, 2, 1).contiguous()
    loss = self.softmax_loss(
        input=logits[:, :, :-1],   # -1 for aligning
        target=insts[:, 1:]        # "1:" to ignore the word <BOS>
    )

    if for_listener:
        return self.nonreduced_softmax_loss(
            input=logits[:, :, :-1],   # -1 for aligning
            target=insts[:, 1:]        # "1:" to ignore the word <BOS>
        )

    if train:
        return loss
    else:
        # Evaluation
        _, predict = logits.max(dim=1)   # BATCH, LENGTH
        gt_mask = (insts != self.tok.word_to_index['<PAD>'])
        correct = (predict[:, :-1] == insts[:, 1:]) * gt_mask[:, 1:]   # Not pad and equal to gt
        correct, gt_mask = correct.type(torch.LongTensor), gt_mask.type(torch.LongTensor)
        word_accu = correct.sum().item() / gt_mask[:, 1:].sum().item()                            # Exclude <BOS>
        sent_accu = (correct.sum(dim=1) == gt_mask[:, 1:].sum(dim=1)).sum().item() / batch_size   # Exclude <BOS>
        return loss.item(), word_accu, sent_accu
def loss(self, data):
    # Forward RNN
    input, target, length = data
    rnn_output = self(input, length)

    # Discard the pad positions
    # (this assumes the length2mask in scope here marks the positions to KEEP,
    # i.e. the opposite convention of utils.length2mask used elsewhere, since
    # masked_select retains the True positions)
    mask = length2mask(length)
    rnn_output = rnn_output.masked_select(
        mask.unsqueeze(dim=2).expand_as(rnn_output)).view(-1, self.nhid)
    target = target.masked_select(mask)

    # Forward the decoder and calculate the loss
    decoder_loss = self.decoder.forward_with_loss(rnn_output, target)
    return decoder_loss
def forward(self, ctx, length=None):
    """
    Output the fixed-length memory.
    :param ctx: (batch_size, max_len, input_dim)
    :param length: (batch_size)
    :return:
    """
    attn = self.f(ctx)   # (batch_size, max_len, mem_size)
    if length is not None:
        mask = utils.length2mask(length).unsqueeze(-1).expand(-1, -1, self.mem_size)
        # print(attn.size())
        # print(mask.size())
        attn.masked_fill_(mask, float('-inf'))
    attn = F.softmax(attn, 1, _stacklevel=5)   # (batch_size, max_len, mem_size)
    attn = attn.transpose(1, 2)                # (batch_size, mem_size, max_len)
    memory = torch.bmm(attn, ctx)              # (batch_size, mem_size, rnn_dim)
    memory = self.drop(memory)
    return memory
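# Quick shape check for the attention-pooled memory above (`MemoryPool` is a
# stand-in name for whatever module owns this forward(); sizes illustrative):
#
#     pool = MemoryPool(input_dim=512, mem_size=8)
#     ctx = torch.randn(4, 17, 512)               # (batch, max_len, input_dim)
#     memory = pool(ctx, torch.tensor([17, 12, 9, 5]))
#     assert memory.shape == (4, 8, 512)          # (batch, mem_size, input_dim)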
def infer_batch_bunch(self, sampling=False, train=False, featdropmask=None):
    """
    :param sampling: if not, use argmax; else use softmax_multinomial
    :param train: whether in the train mode
    :return: if sampling: return insts (np, [batch, max_len]),
                 log_probs (torch, requires_grad, [batch, max_len]),
                 hiddens (torch, requires_grad, [batch, max_len, dim]);
                 and if not train, the log_probs and hiddens are detached
             if not sampling: return insts (np, [batch, max_len])
    """
    if train:
        self.encoder.train()
        self.decoder.train()
    else:
        self.encoder.eval()
        self.decoder.eval()

    # Image input for the encoder
    obs = self.env._get_obs()
    batch_size = len(obs)
    viewpoints_list = [list() for _ in range(batch_size)]

    # Get feature
    (img_feats, can_feats), lengths = self.from_shortest_path(
        viewpoints=viewpoints_list)   # Image feature (from the shortest path)

    # This code block is only used for the featdrop.
    if featdropmask is not None:
        img_feats[..., :-args.angle_feat_size] *= featdropmask
        can_feats[..., :-args.angle_feat_size] *= featdropmask

    # Encoder
    ctx = self.encoder(can_feats, img_feats, lengths,
                       already_dropfeat=(featdropmask is not None))
    ctx_mask = utils.length2mask(lengths)

    # Decoder
    words = []
    log_probs = []
    hidden_states = []
    entropies = []
    h_t = torch.zeros(1, batch_size, args.rnn_dim).cuda()
    c_t = torch.zeros(1, batch_size, args.rnn_dim).cuda()
    ended = np.zeros(len(obs), bool)   # np.bool is removed in recent numpy
    word = np.ones(len(obs), np.int64) * self.tok.word_to_index['<BOS>']   # First word is <BOS>
    word = torch.from_numpy(word).view(-1, 1).cuda()
    for i in range(args.maxDecode):
        # Decode step
        logits, h_t, c_t = self.decoder(word, ctx, ctx_mask, h_t, c_t)   # logits: (b, 1, vocab_size)

        # Select the word
        logits = logits.squeeze()                                    # logits: (b, vocab_size)
        logits[:, self.tok.word_to_index['<UNK>']] = -float("inf")   # No <UNK> in infer
        if sampling:
            probs = F.softmax(logits, -1)
            m = torch.distributions.Categorical(probs)
            word = m.sample()
            log_prob = m.log_prob(word)
            if train:
                log_probs.append(log_prob)
                hidden_states.append(h_t.squeeze())
                entropies.append(m.entropy())
            else:
                log_probs.append(log_prob.detach())
                hidden_states.append(h_t.squeeze().detach())
                entropies.append(m.entropy().detach())
        else:
            values, word = logits.max(1)

        # Append the word
        cpu_word = word.cpu().numpy()
        cpu_word[ended] = self.tok.word_to_index['<PAD>']
        words.append(cpu_word)

        # Prepare the shape for the next step
        word = word.view(-1, 1)

        # End?
        ended = np.logical_or(ended, cpu_word == self.tok.word_to_index['<EOS>'])
        if ended.all():
            break

    if train and sampling:
        return np.stack(words, 1), torch.stack(log_probs, 1), \
               torch.stack(hidden_states, 1), torch.stack(entropies, 1)
    else:
        return np.stack(words, 1)   # [(b), (b), (b), ...] --> [b, l]
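# Hypothetical greedy decode plus detokenization (`speaker` is a stand-in
# name for the object owning infer_batch_bunch):
#
#     insts = speaker.infer_batch_bunch(sampling=False)   # np [batch, max_len]
#     sentences = [speaker.tok.decode_sentence(inst) for inst in insts]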
def _dijkstra(self):
    """
    The dijkstra algorithm.
    Was called beam search to be consistent with existing work.
    But it actually finds the exact K paths with smallest listener log_prob cost.
    :return:
    [{
        "scan": XXX,
        "instr_id": XXX,
        "instr_encoding": XXX,
        "dijk_path": [v1, v2, ..., vn]   (the path used for finding all the candidates),
        "paths": {
            "trajectory": [viewpoint_id1, viewpoint_id2, ...],
            "action": [act_1, act_2, ...],
            "listener_scores": [log_prob_act1, log_prob_act2, ...],
            "visual_feature": [(f1_step1, f2_step2, ...), (f1_step2, f2_step2, ...)]
        }
    }]
    """
    def make_state_id(viewpoint, action):     # Make state id
        return "%s_%s" % (viewpoint, str(action))

    def decompose_state_id(state_id):         # Decompose state id
        viewpoint, action = state_id.split("_")
        action = int(action)
        return viewpoint, action

    # Get first obs
    obs = self.env._get_obs()

    # Prepare the state id
    batch_size = len(obs)
    results = [{
        "scan": ob['scan'],
        "instr_id": ob['instr_id'],
        "instr_encoding": ob["instr_encoding"],
        "dijk_path": [ob['viewpoint']],
        "paths": []
    } for ob in obs]

    # Encoder
    seq, seq_mask, seq_lengths, perm_idx = self._sort_batch(obs)
    recover_idx = np.zeros_like(perm_idx)
    for i, idx in enumerate(perm_idx):
        recover_idx[idx] = i
    ctx, h_t, c_t = self.encoder(seq, seq_lengths)
    ctx, h_t, c_t, ctx_mask = ctx[recover_idx], h_t[recover_idx], \
        c_t[recover_idx], seq_mask[recover_idx]   # Recover the original order

    # Dijk graph states:
    id2state = [{
        make_state_id(ob['viewpoint'], -95): {
            "next_viewpoint": ob['viewpoint'],
            "running_state": (h_t[i], h_t[i], c_t[i]),
            "location": (ob['viewpoint'], ob['heading'], ob['elevation']),
            "feature": None,
            "from_state_id": None,
            "score": 0,
            "scores": [],
            "actions": [],
        }
    } for i, ob in enumerate(obs)]   # -95 is the start point

    visited = [set() for _ in range(batch_size)]
    finished = [set() for _ in range(batch_size)]
    graphs = [utils.FloydGraph() for _ in range(batch_size)]   # For the navigation path
    ended = np.array([False] * batch_size)

    # Dijk algorithm
    for _ in range(300):
        # Get the best state for each batch entry:
        # if the entry is not ended, take the unvisited state with the largest
        # log-prob score (i.e. the smallest cost);
        # else use an arbitrary state from the dict (it always exists)
        smallest_idXstate = [
            max(((state_id, state)
                 for state_id, state in id2state[i].items()
                 if state_id not in visited[i]),
                key=lambda item: item[1]['score'])
            if not ended[i]
            else next(iter(id2state[i].items()))
            for i in range(batch_size)
        ]

        # Set the visited and the end seqs
        for i, (state_id, state) in enumerate(smallest_idXstate):
            assert (ended[i]) or (state_id not in visited[i])
            if not ended[i]:
                viewpoint, action = decompose_state_id(state_id)
                visited[i].add(state_id)
                if action == -1:
                    finished[i].add(state_id)
                    if len(finished[i]) >= args.candidates:   # Got enough candidates
                        ended[i] = True

        # Gather the running state in the batch
        h_ts, h1s, c_ts = zip(*(idXstate[1]['running_state'] for idXstate in smallest_idXstate))
        h_t, h1, c_t = torch.stack(h_ts), torch.stack(h1s), torch.stack(c_ts)

        # Recover the env and gather the feature
        for i, (state_id, state) in enumerate(smallest_idXstate):
            next_viewpoint = state['next_viewpoint']
            scan = results[i]['scan']
            from_viewpoint, heading, elevation = state['location']
            self.env.env.sims[i].newEpisode(scan, next_viewpoint, heading, elevation)   # Heading, elevation is not used in panoramic
        obs = self.env._get_obs()

        # Update the floyd graph
        # Only used to shorten the navigation length
        # Will not affect the result
        for i, ob in enumerate(obs):
            viewpoint = ob['viewpoint']
            if not graphs[i].visited(viewpoint):   # Update the graph
                for c in ob['candidate']:
                    next_viewpoint = c['viewpointId']
                    dis = self.env.distances[ob['scan']][viewpoint][next_viewpoint]
                    graphs[i].add_edge(viewpoint, next_viewpoint, dis)
                graphs[i].update(viewpoint)
                results[i]['dijk_path'].extend(
                    graphs[i].path(results[i]['dijk_path'][-1], viewpoint))

        input_a_t, f_t, candidate_feat, candidate_leng = self.get_input_feat(obs)

        # Run one decoding step
        h_t, c_t, alpha, logit, h1 = self.decoder(input_a_t, f_t, candidate_feat,
                                                  h_t, h1, c_t,
                                                  ctx, ctx_mask,
                                                  False)

        # Update the dijk graph's states with the newly visited viewpoint
        candidate_mask = utils.length2mask(candidate_leng)
        logit.masked_fill_(candidate_mask, -float('inf'))
        log_probs = F.log_softmax(logit, 1)   # Calculate the log_prob here
        _, max_act = log_probs.max(1)

        for i, ob in enumerate(obs):
            current_viewpoint = ob['viewpoint']
            candidate = ob['candidate']
            current_state_id, current_state = smallest_idXstate[i]
            old_viewpoint, from_action = decompose_state_id(current_state_id)
            assert ob['viewpoint'] == current_state['next_viewpoint']
            if from_action == -1 or ended[i]:   # If the action is <end> or the batch is ended, skip it
                continue
            for j in range(len(ob['candidate']) + 1):   # +1 to include the <end> action
                # score + log_prob[action]
                modified_log_prob = log_probs[i][j].detach().cpu().item()
                new_score = current_state['score'] + modified_log_prob
                if j < len(candidate):   # A normal action
                    next_id = make_state_id(current_viewpoint, j)
                    next_viewpoint = candidate[j]['viewpointId']
                    trg_point = candidate[j]['pointId']
                    heading = (trg_point % 12) * math.pi / 6
                    elevation = (trg_point // 12 - 1) * math.pi / 6
                    location = (next_viewpoint, heading, elevation)
                else:   # The end action
                    next_id = make_state_id(current_viewpoint, -1)   # action is -1
                    next_viewpoint = current_viewpoint               # next viewpoint is still here
                    location = (current_viewpoint, ob['heading'], ob['elevation'])

                if next_id not in id2state[i] or new_score > id2state[i][next_id]['score']:
                    id2state[i][next_id] = {
                        "next_viewpoint": next_viewpoint,
                        "location": location,
                        "running_state": (h_t[i], h1[i], c_t[i]),
                        "from_state_id": current_state_id,
                        "feature": (f_t[i].detach().cpu(), candidate_feat[i][j].detach().cpu()),
                        "score": new_score,
                        "scores": current_state['scores'] + [modified_log_prob],
                        "actions": current_state['actions'] + [len(candidate) + 1],
                    }

        # If no active state remains after the update, set the entry to ended
        for i in range(batch_size):
            if len(visited[i]) == len(id2state[i]):   # It was the last active state
                ended[i] = True

        # End?
        if ended.all():
            break

    # Move back to the start point
    for i in range(batch_size):
        results[i]['dijk_path'].extend(
            graphs[i].path(results[i]['dijk_path'][-1], results[i]['dijk_path'][0]))

    """
    "paths": {
        "trajectory": [viewpoint_id1, viewpoint_id2, ...],
        "action": [act_1, act_2, ...],
        "listener_scores": [log_prob_act1, log_prob_act2, ...],
        "visual_feature": [(f1_step1, f2_step2, ...), (f1_step2, f2_step2, ...)]
    }
    """
    # Gather the path
    for i, result in enumerate(results):
        assert len(finished[i]) <= args.candidates
        for state_id in finished[i]:
            path_info = {
                "trajectory": [],
                "action": [],
                "listener_scores": id2state[i][state_id]['scores'],
                "listener_actions": id2state[i][state_id]['actions'],
                "visual_feature": []
            }
            viewpoint, action = decompose_state_id(state_id)
            while action != -95:
                state = id2state[i][state_id]
                path_info['trajectory'].append(state['location'])
                path_info['action'].append(action)
                path_info['visual_feature'].append(state['feature'])
                state_id = id2state[i][state_id]['from_state_id']
                viewpoint, action = decompose_state_id(state_id)
            state = id2state[i][state_id]
            path_info['trajectory'].append(state['location'])
            for need_reverse_key in ["trajectory", "action", "visual_feature"]:
                path_info[need_reverse_key] = path_info[need_reverse_key][::-1]
            result['paths'].append(path_info)

    return results
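# The state ids used above pack (viewpoint, action) into one string key:
# -95 marks the start state and -1 the <end> action. A round-trip check
# (split('_') is safe here because Matterport viewpoint ids are
# underscore-free hex strings):
#
#     sid = make_state_id("5b9b2794954e4694a45fc424a8643081", -95)
#     viewpoint, action = decompose_state_id(sid)
#     assert action == -95   # start sentinel; -1 would mean <end>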
def rollout(self, train_ml=None, train_rl=True, reset=True, speaker=None):
    """
    :param train_ml: The weight to train with maximum likelihood
    :param train_rl: whether to use RL in training
    :param reset: Reset the environment
    :param speaker: Speaker used in back translation.
                    If the speaker is not None, use back translation.
                    O.w., normal training
    :return:
    """
    if self.feedback == 'teacher' or self.feedback == 'argmax':
        train_rl = False

    if reset:
        # Reset env
        obs = np.array(self.env.reset())
    else:
        obs = np.array(self.env._get_obs())

    batch_size = len(obs)

    if speaker is not None:   # Trigger the self_train mode!
        noise = self.decoder.drop_env(torch.ones(self.feature_size).cuda())
        batch = self.env.batch.copy()
        speaker.env = self.env
        insts = speaker.infer_batch(featdropmask=noise)   # Use the same drop mask in speaker

        # Create fake environments with the generated instruction
        boss = np.ones((batch_size, 1), np.int64) * self.tok.word_to_index['<BOS>']   # First word is <BOS>
        insts = np.concatenate((boss, insts), 1)
        for i, (datum, inst) in enumerate(zip(batch, insts)):
            if inst[-1] != self.tok.word_to_index['<PAD>']:   # The inst is not ended!
                inst[-1] = self.tok.word_to_index['<EOS>']
            datum.pop('instructions')
            datum.pop('instr_encoding')
            datum['instructions'] = self.tok.decode_sentence(inst)
            datum['instr_encoding'] = inst
        obs = np.array(self.env.reset(batch))

    # Reorder the language input for the encoder (do not ruin the original code)
    seq, seq_mask, seq_lengths, perm_idx = self._sort_batch(obs)
    perm_obs = obs[perm_idx]

    ctx, h_t, c_t = self.encoder(seq, seq_lengths)
    ctx_mask = seq_mask

    # Init the reward shaping
    last_dist = np.zeros(batch_size, np.float32)
    for i, ob in enumerate(perm_obs):   # The initial distance from the viewpoint to the target
        last_dist[i] = ob['distance']

    # Record starting point
    traj = [{
        'instr_id': ob['instr_id'],
        'path': [(ob['viewpoint'], ob['heading'], ob['elevation'])]
    } for ob in perm_obs]

    # For test result submission
    visited = [set() for _ in perm_obs]

    # Initialize the tracking state
    ended = np.array([False] * batch_size)   # Indices match permutation of the model, not env

    # Init the logs
    rewards = []
    hidden_states = []
    policy_log_probs = []
    masks = []
    entropys = []
    ml_loss = 0.

    h1 = h_t
    for t in range(self.episode_len):
        input_a_t, f_t, candidate_feat, candidate_leng = self.get_input_feat(perm_obs)
        if speaker is not None:   # Apply the env drop mask to the feat
            candidate_feat[..., :-args.angle_feat_size] *= noise
            f_t[..., :-args.angle_feat_size] *= noise

        h_t, c_t, logit, h1 = self.decoder(input_a_t, f_t, candidate_feat,
                                           h_t, h1, c_t, ctx, ctx_mask,
                                           already_dropfeat=(speaker is not None))
        hidden_states.append(h_t)

        # Mask outputs where agent can't move forward
        # Here the logit is [b, max_candidate]
        candidate_mask = utils.length2mask(candidate_leng)
        if args.submit:   # Avoiding cyclic paths
            for ob_id, ob in enumerate(perm_obs):
                visited[ob_id].add(ob['viewpoint'])
                for c_id, c in enumerate(ob['candidate']):
                    if c['viewpointId'] in visited[ob_id]:
                        candidate_mask[ob_id][c_id] = 1
        logit.masked_fill_(candidate_mask.bool(), -float('inf'))

        # Supervised training
        target = self._teacher_action(perm_obs, ended)
        ml_loss += self.criterion(logit, target)

        # Determine next model inputs
        if self.feedback == 'teacher':
            a_t = target                # teacher forcing
        elif self.feedback == 'argmax':
            _, a_t = logit.max(1)       # student forcing - argmax
            a_t = a_t.detach()
            log_probs = F.log_softmax(logit, 1)                              # Calculate the log_prob here
            policy_log_probs.append(log_probs.gather(1, a_t.unsqueeze(1)))   # Gather the log_prob for each batch
        elif self.feedback == 'sample':
            probs = F.softmax(logit, 1)   # sampling an action from the model
            c = torch.distributions.Categorical(probs)
            self.logs['entropy'].append(c.entropy().sum().item())   # For log
            entropys.append(c.entropy())                            # For optimization
            a_t = c.sample().detach()
            policy_log_probs.append(c.log_prob(a_t))
        else:
            print(self.feedback)
            sys.exit('Invalid feedback option')

        # Prepare environment action
        # NOTE: Env action is in the perm_obs space
        cpu_a_t = a_t.cpu().numpy()
        for i, next_id in enumerate(cpu_a_t):
            if next_id == (candidate_leng[i] - 1) or next_id == args.ignoreid:   # The last action is <end>
                cpu_a_t[i] = -1   # Change the <end> and ignore action to -1

        # Make action and get the new state
        self.make_equiv_action(cpu_a_t, perm_obs, perm_idx, traj)
        obs = np.array(self.env._get_obs())
        perm_obs = obs[perm_idx]   # Perm the obs for the result

        # Calculate the mask and reward
        dist = np.zeros(batch_size, np.float32)
        reward = np.zeros(batch_size, np.float32)
        mask = np.ones(batch_size, np.float32)
        for i, ob in enumerate(perm_obs):
            dist[i] = ob['distance']
            if ended[i]:   # If the rollout was already finished BEFORE THIS ACTION
                reward[i] = 0.
                mask[i] = 0.
            else:          # Calculate the reward
                action_idx = cpu_a_t[i]
                if action_idx == -1:   # If the action now is end
                    if dist[i] < 3:    # Correct
                        reward[i] = 2.
                    else:              # Incorrect
                        reward[i] = -2.
                else:                  # The action is not end
                    reward[i] = -(dist[i] - last_dist[i])   # Change of distance
                    if reward[i] > 0:                       # Quantification
                        reward[i] = 1
                    elif reward[i] < 0:
                        reward[i] = -1
                    else:
                        raise NameError("The action doesn't change the move")
        rewards.append(reward)
        masks.append(mask)
        last_dist[:] = dist

        # Update the finished actions
        # -1 means ended or ignored (already ended)
        ended[:] = np.logical_or(ended, (cpu_a_t == -1))

        # Early exit if all ended
        if ended.all():
            break

    if train_rl:
        # Last action in A2C
        input_a_t, f_t, candidate_feat, candidate_leng = self.get_input_feat(perm_obs)
        if speaker is not None:
            candidate_feat[..., :-args.angle_feat_size] *= noise
            f_t[..., :-args.angle_feat_size] *= noise
        last_h_, _, _, _ = self.decoder(input_a_t, f_t, candidate_feat,
                                        h_t, h1, c_t, ctx, ctx_mask,
                                        speaker is not None)
        rl_loss = 0.

        # NOW, A2C!!!
        # Calculate the final discounted reward
        last_value__ = self.critic(last_h_).detach()   # The value estimate of the last state; remove the grad for safety
        discount_reward = np.zeros(batch_size, np.float32)   # The initial reward is zero
        for i in range(batch_size):
            if not ended[i]:   # If the rollout is not ended, use the value function as the last reward
                discount_reward[i] = last_value__[i]

        length = len(rewards)
        total = 0
        for t in range(length - 1, -1, -1):
            discount_reward = discount_reward * args.gamma + rewards[t]   # If it ended, the reward will be 0
            mask_ = Variable(torch.from_numpy(masks[t]), requires_grad=False).cuda()
            clip_reward = discount_reward.copy()
            r_ = Variable(torch.from_numpy(clip_reward), requires_grad=False).cuda()
            v_ = self.critic(hidden_states[t])
            a_ = (r_ - v_).detach()

            # r_: the higher, the better. -ln(p(action)) * (discount_reward - value)
            rl_loss += (-policy_log_probs[t] * a_ * mask_).sum()
            rl_loss += (((r_ - v_) ** 2) * mask_).sum() * 0.5   # 1/2 L2 loss
            if self.feedback == 'sample':
                rl_loss += (-0.01 * entropys[t] * mask_).sum()
            self.logs['critic_loss'].append((((r_ - v_) ** 2) * mask_).sum().item())

            total = total + np.sum(masks[t])
        self.logs['total'].append(total)

        # Normalize the loss function
        if args.normalize_loss == 'total':
            rl_loss /= total
        elif args.normalize_loss == 'batch':
            rl_loss /= batch_size
        else:
            assert args.normalize_loss == 'none'

        self.loss += rl_loss

    if train_ml is not None:
        self.loss += ml_loss * train_ml / batch_size

    if type(self.loss) is int:   # For safety; it is activated when no losses are added
        self.losses.append(0.)
    else:
        self.losses.append(self.loss.item() / self.episode_len)   # This argument is useless.

    return traj
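# The backward loop above computes the bootstrapped n-step return
# R_t = r_t + gamma * R_{t+1}. The same recursion in isolation, with toy
# numbers (a sketch, not repo code):
def _discounted_return_sketch():
    import numpy as np
    gamma = 0.9
    rewards = [np.array([1.0]), np.array([-1.0]), np.array([2.0])]   # r_0, r_1, r_2
    R = np.array([0.5])   # bootstrap from the critic if the episode is still alive
    for t in range(len(rewards) - 1, -1, -1):
        R = R * gamma + rewards[t]   # R_t = r_t + gamma * R_{t+1}
    return R   # 1.0 + 0.9*(-1.0 + 0.9*(2.0 + 0.9*0.5)) == 2.0845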
def rollout(self, train_ml=None, train_rl=True, reset=True):
    """
    :param train_ml: The weight to train with maximum likelihood
    :param train_rl: whether to use RL in training
    :param reset: Reset the environment
    :return:
    """
    if self.feedback == 'teacher' or self.feedback == 'argmax':
        train_rl = False

    if reset:
        # Reset env
        obs = np.array(self.env.reset())
    else:
        obs = np.array(self.env._get_obs())

    batch_size = len(obs)

    # Language input
    sentence, language_attention_mask, token_type_ids, \
        seq_lengths, perm_idx = self._sort_batch(obs)
    perm_obs = obs[perm_idx]

    ''' Language BERT '''
    language_inputs = {
        'mode': 'language',
        'sentence': sentence,
        'attention_mask': language_attention_mask,
        'lang_mask': language_attention_mask,
        'token_type_ids': token_type_ids
    }
    if args.vlnbert == 'oscar':
        language_features = self.vln_bert(**language_inputs)
    elif args.vlnbert == 'prevalent':
        h_t, language_features = self.vln_bert(**language_inputs)

    # Record starting point
    traj = [{
        'instr_id': ob['instr_id'],
        'path': [(ob['viewpoint'], ob['heading'], ob['elevation'])],
    } for ob in perm_obs]

    # Init the reward shaping
    last_dist = np.zeros(batch_size, np.float32)
    last_ndtw = np.zeros(batch_size, np.float32)
    for i, ob in enumerate(perm_obs):   # The initial distance from the viewpoint to the target
        last_dist[i] = ob['distance']
        path_act = [vp[0] for vp in traj[i]['path']]
        last_ndtw[i] = self.ndtw_criterion[ob['scan']](path_act, ob['gt_path'], metric='ndtw')

    # Initialize the tracking state
    ended = np.array([False] * batch_size)   # Indices match permutation of the model, not env

    # Init the logs
    rewards = []
    hidden_states = []
    policy_log_probs = []
    masks = []
    entropys = []
    ml_loss = 0.

    for t in range(self.episode_len):
        input_a_t, candidate_feat, candidate_leng = self.get_input_feat(perm_obs)

        # The first [CLS] token, initialized by the language BERT, serves
        # as the agent's state passing through time steps
        if (t >= 1) or (args.vlnbert == 'prevalent'):
            language_features = torch.cat(
                (h_t.unsqueeze(1), language_features[:, 1:, :]), dim=1)

        visual_temp_mask = (utils.length2mask(candidate_leng) == 0).long()
        visual_attention_mask = torch.cat(
            (language_attention_mask, visual_temp_mask), dim=-1)

        self.vln_bert.vln_bert.config.directions = max(candidate_leng)
        ''' Visual BERT '''
        visual_inputs = {
            'mode': 'visual',
            'sentence': language_features,
            'attention_mask': visual_attention_mask,
            'lang_mask': language_attention_mask,
            'vis_mask': visual_temp_mask,
            'token_type_ids': token_type_ids,
            'action_feats': input_a_t,
            # 'pano_feats': f_t,
            'cand_feats': candidate_feat
        }
        h_t, logit = self.vln_bert(**visual_inputs)
        hidden_states.append(h_t)

        # Mask outputs where agent can't move forward
        # Here the logit is [b, max_candidate]
        candidate_mask = utils.length2mask(candidate_leng)
        logit.masked_fill_(candidate_mask, -float('inf'))

        # Supervised training
        target = self._teacher_action(perm_obs, ended)
        ml_loss += self.criterion(logit, target)

        # Determine next model inputs
        if self.feedback == 'teacher':
            a_t = target                # teacher forcing
        elif self.feedback == 'argmax':
            _, a_t = logit.max(1)       # student forcing - argmax
            a_t = a_t.detach()
            log_probs = F.log_softmax(logit, 1)                              # Calculate the log_prob here
            policy_log_probs.append(log_probs.gather(1, a_t.unsqueeze(1)))   # Gather the log_prob for each batch
        elif self.feedback == 'sample':
            probs = F.softmax(logit, 1)   # sampling an action from the model
            c = torch.distributions.Categorical(probs)
            self.logs['entropy'].append(c.entropy().sum().item())   # For log
            entropys.append(c.entropy())                            # For optimization
            a_t = c.sample().detach()
            policy_log_probs.append(c.log_prob(a_t))
        else:
            print(self.feedback)
            sys.exit('Invalid feedback option')

        # Prepare environment action
        # NOTE: Env action is in the perm_obs space
        cpu_a_t = a_t.cpu().numpy()
        for i, next_id in enumerate(cpu_a_t):
            if next_id == (candidate_leng[i] - 1) or next_id == args.ignoreid or ended[i]:   # The last action is <end>
                cpu_a_t[i] = -1   # Change the <end> and ignore action to -1

        # Make action and get the new state
        self.make_equiv_action(cpu_a_t, perm_obs, perm_idx, traj)
        obs = np.array(self.env._get_obs())
        perm_obs = obs[perm_idx]   # Perm the obs for the result

        if train_rl:
            # Calculate the mask and reward
            dist = np.zeros(batch_size, np.float32)
            ndtw_score = np.zeros(batch_size, np.float32)
            reward = np.zeros(batch_size, np.float32)
            mask = np.ones(batch_size, np.float32)
            for i, ob in enumerate(perm_obs):
                dist[i] = ob['distance']
                path_act = [vp[0] for vp in traj[i]['path']]
                ndtw_score[i] = self.ndtw_criterion[ob['scan']](path_act, ob['gt_path'], metric='ndtw')

                if ended[i]:
                    reward[i] = 0.0
                    mask[i] = 0.0
                else:
                    action_idx = cpu_a_t[i]
                    # Target reward
                    if action_idx == -1:     # If the action now is end
                        if dist[i] < 3.0:    # Correct
                            reward[i] = 2.0 + ndtw_score[i] * 2.0
                        else:                # Incorrect
                            reward[i] = -2.0
                    else:                    # The action is not end
                        # Path fidelity rewards (distance & nDTW)
                        reward[i] = -(dist[i] - last_dist[i])
                        ndtw_reward = ndtw_score[i] - last_ndtw[i]
                        if reward[i] > 0.0:   # Quantification
                            reward[i] = 1.0 + ndtw_reward
                        elif reward[i] < 0.0:
                            reward[i] = -1.0 + ndtw_reward
                        else:
                            raise NameError("The action doesn't change the move")
                        # Miss-the-target penalty
                        if (last_dist[i] <= 1.0) and (dist[i] - last_dist[i] > 0.0):
                            reward[i] -= (1.0 - last_dist[i]) * 2.0
            rewards.append(reward)
            masks.append(mask)
            last_dist[:] = dist
            last_ndtw[:] = ndtw_score

        # Update the finished actions
        # -1 means ended or ignored (already ended)
        ended[:] = np.logical_or(ended, (cpu_a_t == -1))

        # Early exit if all ended
        if ended.all():
            break

    if train_rl:
        # Last action in A2C
        input_a_t, candidate_feat, candidate_leng = self.get_input_feat(perm_obs)

        language_features = torch.cat(
            (h_t.unsqueeze(1), language_features[:, 1:, :]), dim=1)

        visual_temp_mask = (utils.length2mask(candidate_leng) == 0).long()
        visual_attention_mask = torch.cat(
            (language_attention_mask, visual_temp_mask), dim=-1)

        self.vln_bert.vln_bert.config.directions = max(candidate_leng)
        ''' Visual BERT '''
        visual_inputs = {
            'mode': 'visual',
            'sentence': language_features,
            'attention_mask': visual_attention_mask,
            'lang_mask': language_attention_mask,
            'vis_mask': visual_temp_mask,
            'token_type_ids': token_type_ids,
            'action_feats': input_a_t,
            # 'pano_feats': f_t,
            'cand_feats': candidate_feat
        }
        last_h_, _ = self.vln_bert(**visual_inputs)

        rl_loss = 0.

        # NOW, A2C!!!
        # Calculate the final discounted reward
        last_value__ = self.critic(last_h_).detach()   # The value estimate of the last state; remove the grad for safety
        discount_reward = np.zeros(batch_size, np.float32)   # The initial reward is zero
        for i in range(batch_size):
            if not ended[i]:   # If the rollout is not ended, use the value function as the last reward
                discount_reward[i] = last_value__[i]

        length = len(rewards)
        total = 0
        for t in range(length - 1, -1, -1):
            discount_reward = discount_reward * args.gamma + rewards[t]   # If it ended, the reward will be 0
            mask_ = Variable(torch.from_numpy(masks[t]), requires_grad=False).cuda()
            clip_reward = discount_reward.copy()
            r_ = Variable(torch.from_numpy(clip_reward), requires_grad=False).cuda()
            v_ = self.critic(hidden_states[t])
            a_ = (r_ - v_).detach()

            rl_loss += (-policy_log_probs[t] * a_ * mask_).sum()
            rl_loss += (((r_ - v_) ** 2) * mask_).sum() * 0.5   # 1/2 L2 loss
            if self.feedback == 'sample':
                rl_loss += (-0.01 * entropys[t] * mask_).sum()
            self.logs['critic_loss'].append((((r_ - v_) ** 2) * mask_).sum().item())

            total = total + np.sum(masks[t])
        self.logs['total'].append(total)

        # Normalize the loss function
        if args.normalize_loss == 'total':
            rl_loss /= total
        elif args.normalize_loss == 'batch':
            rl_loss /= batch_size
        else:
            assert args.normalize_loss == 'none'

        self.loss += rl_loss
        self.logs['RL_loss'].append(rl_loss.item())

    if train_ml is not None:
        self.loss += ml_loss * train_ml / batch_size
        self.logs['IL_loss'].append((ml_loss * train_ml / batch_size).item())

    if type(self.loss) is int:   # For safety; it is activated when no losses are added
        self.losses.append(0.)
    else:
        self.losses.append(self.loss.item() / self.episode_len)   # This argument is useless.

    return traj
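# Toy numbers for the shaped reward above (a sketch): stopping within 3m of
# the goal pays 2 + 2*nDTW; a distance-reducing step pays 1 + delta-nDTW.
#
#     dist, last_dist = 2.4, 4.0
#     ndtw, last_ndtw = 0.8, 0.6
#     reward_step = 1.0 + (ndtw - last_ndtw)   # moving closer: 1.2
#     reward_stop = 2.0 + ndtw * 2.0           # stopping within 3m: 3.6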
def rl_train(self, reward_func, iters, ml_weight=0., policy_weight=1.,
             baseline_weight=.5, entropy_weight=0., self_critical=False, ml_env=None):
    """
    :param reward_func: A function that takes the [(path, inst)] list as input and
                        returns the reward for each inst
    :param iters: how many iters to train
    :param ml_weight: weight for maximum likelihood
    :param policy_weight: weight for policy loss
    :param baseline_weight: weight for critic loss (baseline loss)
    :param entropy_weight: weight for the entropy
    :param self_critical: use the self-critical baseline
    :param ml_env: specific env for ml (in case the train_env is aug_env)
    :return:
    """
    from collections import defaultdict
    log_dict = defaultdict(lambda: 0)
    for i in range(iters):
        joint_loss = 0.
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        # Reset Env
        if args.same_in_batch:
            self.env.reset(tile_one=True)
        else:
            self.env.reset()
        rl_batch = self.env.batch

        # RL training
        insts, log_probs, hiddens, entropies = self.infer_batch(
            sampling=True, train=True)   # Sample a batch

        # Get the reward (and the length, mask)
        path_ids = [ob['path_id'] for ob in self.env._get_obs()]   # Gather the path ids
        pathXinst = [(path_id, self.tok.shrink(inst))
                     for path_id, inst in zip(path_ids, insts)]
        reward = reward_func(rl_batch, pathXinst)   # The reward func will evaluate the instruction
        reward = torch.FloatTensor(reward).cuda()
        length = np.argmax(np.array(insts) == self.tok.word_to_index['<EOS>'], 1) + 1   # Get length (pos of EOS) + 1
        length[length == 1] = insts.shape[1]   # If there is no EOS, change the length to max length.
        mask = 1. - utils.length2mask(length).float()

        # Get the baseline
        if args.normalize_reward:
            baseline = reward.mean()
        else:
            if self_critical:
                self.env.reset(rl_batch)
                insts = self.infer_batch(sampling=False, train=False)   # Argmax decoding
                pathXinst = [(path_id, self.tok.shrink(inst))
                             for path_id, inst in zip(path_ids, insts)]
                baseline = reward_func(rl_batch, pathXinst)   # The reward func will evaluate the instruction
                baseline = torch.FloatTensor(baseline).cuda().unsqueeze(1)
            else:
                baseline_hiddens = hiddens if args.grad_baseline else hiddens.detach()
                baseline = self.decoder.baseline_projection(baseline_hiddens).squeeze()

        # print("Reward Mean %0.4f, std %0.4f" % (reward.mean().detach().cpu().item(), reward.std().detach().cpu().item()))
        # print("Baseline Mean %0.4f, std %0.4f" % (baseline.mean().detach().cpu().item(), baseline.std().detach().cpu().item()))
        # print("Avg abs(Reward - Baseline): %0.4f" % (torch.abs(reward - baseline).mean().detach().cpu().item()))

        # Calculating the loss
        reward = reward.unsqueeze(1)   # (batch_size,) --> (batch_size, 1)
        if args.normalize_reward:      # Normalize the reward to mean 0, std 1
            advantage = (reward - baseline) / reward.std() * 0.2
        else:
            advantage = reward - baseline
        policy_loss = (advantage.detach() * (-log_probs) * mask).sum() / self.env.batch_size   # Normalized by the batch_size
        baseline_loss = (advantage ** 2 * mask).sum() / self.env.batch_size
        avg_entropy = (entropies * mask).sum() / self.env.batch_size

        # Add the losses to the joint_loss
        if baseline_weight != 0.:   # To support the pretrain phase
            joint_loss += baseline_loss * baseline_weight
        if policy_weight != 0.:     # To support the finetune phase
            joint_loss += policy_loss * policy_weight
        if entropy_weight != 0.:    # Note that the negative entropy is added to encourage exploration
            joint_loss += -avg_entropy * entropy_weight

        # ML env preparation
        if ml_env is not None:   # Get the env from ml_env
            old_env = self.env
            self.env = ml_env
            self.env.reset()
        else:                    # Else reset the same env as RL
            self.env.reset(batch=rl_batch)

        # ML training
        assert ml_weight != 0   # Because the ml_weight is always logged, it should always exist!
        if ml_weight != 0.:
            ml_loss = self.teacher_forcing(train=True)
            joint_loss += ml_loss * ml_weight
        else:
            ml_loss = 0.

        if ml_env is not None:
            self.env = old_env

        # Log:
        for name, loss in (('baseline_loss', baseline_loss.detach().item()),
                           ('policy_loss', policy_loss.detach().item()),
                           ('ml_loss', ml_loss.detach().item()),
                           ('baseline', baseline.mean().detach().item()),
                           ('reward', reward.mean().detach().item()),
                           ('baseline_std', baseline.std().detach().item()),
                           ('reward_std', reward.std().detach().item()),
                           ('reward_diff', torch.abs(reward - baseline).mean().detach().item()),
                           ('entropy', avg_entropy.item())):
            log_dict[name] += loss

        # Backward
        joint_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), 40.)   # clip_grad_norm (no underscore) is deprecated
        torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), 40.)
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()

    return log_dict
def beam_infer_batch(self, beam_size=5, seq_num=20, candidates=20):
    """
    :param beam_size: the size of the beam-search
    :param seq_num: the maximum number of returned sequences
    :param candidates: the maximum number of candidate sequences
    :return: [[seq 1, seq 2, ... (seq_num in total)] (for batch 1),
              [seq 1, seq 2, ... (seq_num in total)] (for batch 2),
              ...,
              [seq 1, seq 2, ... (seq_num in total)] (for batch n)]
    """
    # Eval Model
    self.encoder.eval()
    self.decoder.eval()

    # Input for the Encoder
    obs = self.env._get_obs()
    batch_size = len(obs)
    (img_feats, can_feats), lengths = self.from_shortest_path()   # Feature from the shortest path

    # Encoder
    ctx = self.encoder(can_feats, img_feats, lengths)   # Encode
    ctx_mask = utils.length2mask(lengths)

    # Init of the decoder
    results = []
    h_t = torch.zeros(1, batch_size, args.rnn_dim).cuda()
    c_t = torch.zeros(1, batch_size, args.rnn_dim).cuda()
    ended = np.zeros(len(obs), np.int64)   # np.int is removed in recent numpy
    word = np.ones(len(obs), np.int64) * self.tok.word_to_index['<BOS>']   # First word is <BOS>
    word = torch.from_numpy(word).view(-1, 1).cuda()

    # Beam-search initialization
    bs_now = 1
    pre_scores = torch.zeros((batch_size, bs_now))
    vocab_size = self.tok.vocab_size()

    for t in range(args.maxDecode):
        logits, h_t, c_t = self.decoder(word, ctx, ctx_mask, h_t, c_t)   # Decode, logits: (b, 1, vocab_size)
        logits = logits.view(batch_size, bs_now, -1)      # logits: (b, beam_size, vocab_size)
        log_prob = F.log_softmax(logits, dim=2).cpu()     # logit --> log_softmax --> log_prob
        scores = pre_scores.unsqueeze(-1) + log_prob      # scores: (batch, beam, vocab_size)

        # Select the top beam_size words and save them
        scores, word = scores.view(batch_size, -1).topk(beam_size, dim=1)
        beam = word // vocab_size   # beam: (batch, beam) [[0,1,1], [0,1,2]; floor division ('/' on int tensors is float in newer torch)
        word = word % vocab_size

        # Log the result
        for i in range(batch_size):
            if ended[i] >= candidates:   # If the maximum number of seqs is exceeded, don't add it
                word[i] = self.tok.word_to_index['<PAD>']
        results.append({
            "beam": beam,
            "word": word,
            "scores": scores.detach().clone()
        })   # Save it before changing the scores

        # For the next step
        beam = beam + torch.arange(batch_size, dtype=torch.int64).view(-1, 1) * bs_now   # [[0,1,1], [3,4,5], ...

        def gather_beam(state):
            # State: (batch * beam, rnn_dim)
            return state[:, beam.view(-1)]

        h_t, c_t = (gather_beam(state) for state in (h_t, c_t))
        pre_scores = scores
        bs_now = beam_size
        assert bs_now == beam.size(1)

        # Handle the ended beams by setting the pre_scores to a very small value
        for i in range(word.size(0)):
            flag = True
            for j in range(word.size(1)):
                if word[i][j] == self.tok.word_to_index['<EOS>']:
                    pre_scores[i][j] = -float('inf')   # Set the score to -inf (so it will not appear in the next step)
                    ended[i] += 1                      # One more <end> seq for batch i
                else:
                    flag = False
            if flag:   # If all beams ended, set the count to the maximum
                ended[i] = candidates
            # assert not flag   # If all the beams want to end, just stop here.

        # At last, change the input
        word = word.view(-1, 1).cuda()

        # Should it stop now?
        if (ended >= candidates).all():
            break

    seqs = self.get_all_ends(results, batch_size)

    results = []
    for i in range(batch_size):
        # sorted_seq = sorted(seqs[i], key=lambda x: x['score'] / len(x['inst']), reverse=True)
        # sorted_seq = sorted(seqs[i], key=lambda x: x['score'] - 0.5 * abs(29 - len(x['inst'])), reverse=True)
        sorted_seq = sorted(seqs[i], key=lambda x: x['score'], reverse=True)
        # print(sorted_seq)
        results.append([list(seq['inst']) for seq in sorted_seq[:seq_num]])
        # print()
        # for seq in results[0]:
        #     print(self.tok.decode_sentence(seq))
    return results   # [[inst_1, inst_2, ..., inst_{seq_num}], ...]
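# Hypothetical call (`speaker` is a stand-in name): decode up to 20 candidate
# instructions per item with a beam of 5, then detokenize the survivors:
#
#     seqs = speaker.beam_infer_batch(beam_size=5, seq_num=20, candidates=20)
#     texts = [[speaker.tok.decode_sentence(s) for s in per_item] for per_item in seqs]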
def rollout(self):
    obs = np.array(self.env.reset())
    batch_size = len(obs)

    # Reorder the language input for the encoder
    seq, seq_mask, seq_lengths, perm_idx = self._sort_batch(obs)
    perm_obs = obs[perm_idx]

    # Record starting point
    traj = [{
        'inst_idx': ob['inst_idx'],
        'path': [(ob['viewpoint'], ob['heading'], ob['elevation'])]
    } for ob in perm_obs]

    # Forward through encoder, giving initial hidden state and memory cell for decoder
    Last_QA_tensor = np.array([obb['Last_QA_enc'] for obb in perm_obs])
    Last_QA_lengths = np.argmax(Last_QA_tensor == padding_idx, axis=1)
    Last_QA_lengths[Last_QA_lengths == 0] = Last_QA_tensor.shape[1]   # Full length
    Last_QA_lengths = torch.from_numpy(Last_QA_lengths)
    Last_QA_lengths = Last_QA_lengths.long().cuda()
    Last_QA_tensor = torch.from_numpy(Last_QA_tensor)
    Last_QA = Variable(Last_QA_tensor, requires_grad=False).long().cuda()

    H = []
    H_l = []
    for i, obbb in enumerate(perm_obs):
        H.append([])
        H_l.append([])
        for j in range(15):
            H[i].append(obbb['hist_enc'][j])
            h = np.array([obbb['hist_enc'][j]])
            h_l = int(np.argmax(h == padding_idx, axis=1)[0])   # plain int keeps np.array(H_l) homogeneous
            if h_l == 0:
                H_l[i].append(1)
            else:
                H_l[i].append(h_l)
    hist_tensor = np.array(H)
    hist_tensor = torch.from_numpy(hist_tensor)
    hist = Variable(hist_tensor, requires_grad=False).long().cuda()
    hist_lengths = np.array(H_l)
    hist_lengths = torch.from_numpy(hist_lengths)
    hist_lengths = hist_lengths.long().cuda()

    tar_tensor = np.array([obbbb['tar_enc'] for obbbb in perm_obs])
    tar_lengths = np.ones(batch_size)
    tar_lengths = torch.from_numpy(tar_lengths)
    tar_lengths = tar_lengths.long().cuda()
    tar_tensor = torch.from_numpy(tar_tensor)
    tar = Variable(tar_tensor, requires_grad=False).long().cuda()

    ctx, h_t, c_t = self.encoder(seq, seq_lengths, Last_QA, Last_QA_lengths,
                                 hist, hist_lengths, tar, tar_lengths)

    # Initial action
    # Last_QA_mask = utils.length2mask(Last_QA_lengths.cpu())
    a_t = Variable(torch.ones(batch_size).long() * self.model_actions.index('<start>'),
                   requires_grad=False).cuda()
    ended = np.array([False] * batch_size)   # Indices match permutation of the model, not env
    visited = [set() for _ in perm_obs]

    # Do a sequence rollout and calculate the loss
    self.loss = 0
    env_action = [None] * batch_size

    h1 = h_t
    for t in range(self.episode_len):
        input_a_t, f_t, candidate_feat, candidate_leng = self.get_input_feat(perm_obs)
        candidate_mask = utils.length2mask(candidate_leng)
        f_t = self._feature_variable(perm_obs)   # Image features from obs
        h_t, c_t, logit, h1 = self.decoder(input_a_t, f_t, candidate_feat,
                                           h_t, h1, c_t, ctx, None)
        if 'test' in self.env.splits:
            for ob_id, ob in enumerate(perm_obs):
                visited[ob_id].add(ob['viewpoint'])
                for c_id, c in enumerate(ob['candidate']):
                    if c['viewpointId'] in visited[ob_id]:
                        candidate_mask[ob_id][c_id] = 1
        logit.masked_fill_(candidate_mask, -float('inf'))

        # Supervised training
        if 'test' not in self.env.splits:
            target = self._teacher_action(perm_obs, ended)
            # self.loss += self.criterion(logit, target)
            tmp_loss = self.criterion(logit, target)
            if not math.isinf(tmp_loss):
                self.loss += tmp_loss

        # Determine next model inputs
        if self.feedback == 'teacher':
            a_t = target            # teacher forcing
        elif self.feedback == 'argmax':
            _, a_t = logit.max(1)   # student forcing - argmax
            a_t = a_t.detach()
        elif self.feedback == 'sample':
            probs = F.softmax(logit, dim=1)
            m = D.Categorical(probs)
            a_t = m.sample()        # sampling an action from the model
        else:
            sys.exit('Invalid feedback option')

        cpu_a_t = a_t.cpu().numpy()
        for i, next_id in enumerate(cpu_a_t):
            if next_id == (candidate_leng[i] - 1) or next_id == args.ignoreid:   # The last action is <end>
                cpu_a_t[i] = -1

        self.make_equiv_action(cpu_a_t, perm_obs, perm_idx, traj)
        obs = np.array(self.env._get_obs())
        perm_obs = obs[perm_idx]

        ended[:] = np.logical_or(ended, (cpu_a_t == -1))

        # Early exit if all ended
        if ended.all():
            break

    if 'test' not in self.env.splits:
        self.losses.append(self.loss.item() / self.episode_len)

    return traj
def from_shortest_path(self, viewpoints=None, get_first_feat=False):
    """
    :param viewpoints: [[], [], ....(batch_size)]. Only for dropout viewpoint
    :param get_first_feat: whether to output the first feat
    :return:
    """
    obs = self.env._get_obs()
    ended = np.array([False] * len(obs))   # Indices match permutation of the model, not env
    length = np.zeros(len(obs), np.int64)
    img_feats = []
    can_feats = []
    teacher_actions = []
    teacher_actions_1h = []
    candidate_feats = []
    candidate_masks = []
    first_feat = np.zeros((len(obs), self.obs_dim), np.float32)
    for i, ob in enumerate(obs):
        first_feat[i, -args.angle_feat_size:] = utils.angle_feature(ob['heading'], ob['elevation'])
    first_feat = torch.from_numpy(first_feat).cuda()

    while not ended.all():
        if viewpoints is not None:
            for i, ob in enumerate(obs):
                viewpoints[i].append(ob['viewpoint'])
        teacher_action = self._teacher_action(obs, ended)
        teacher_action = teacher_action.cpu().numpy()
        # TODO: why last teacher action not -1
        teacher_actions.append(teacher_action.copy())

        candidate_length = [len(ob['candidate']) + 1 for ob in obs]   # +1 is for the end
        candidate_feat = np.zeros((len(obs), max(candidate_length), self.obs_dim))
        # NOTE: The candidate_feat at len(ob['candidate']) is the feature for the END,
        # which is zero in my implementation
        for i, ob in enumerate(obs):
            for j, c in enumerate(ob['candidate']):
                candidate_feat[i, j, :] = c['feature']
        candidate_feats.append(torch.Tensor(candidate_feat).cuda())
        candidate_masks.append(utils.length2mask(candidate_length))

        img_feats.append(self._feature_variable(obs))
        for i, act in enumerate(teacher_action):
            if act < 0 or act == len(obs[i]['candidate']):   # Ignore or Stop
                teacher_action[i] = -1                       # Stop Action
        can_feats.append(self._candidate_variable(obs, teacher_action))
        self.make_equiv_action(teacher_action, obs)

        length += ~ended   # '1 - ended' raises a TypeError on bool arrays in recent numpy
        ended[:] = np.logical_or(ended, (teacher_action == -1))
        obs = self.env._get_obs()

    # TODO: heading random ?
    # TODO: policy decoder behavior clone
    # TODO: state decoder mse
    # TODO: state decoder weight = 0 ?
    assert len(teacher_actions) == len(candidate_feats) == len(candidate_masks)
    _max = 0
    for i in range(len(candidate_feats)):
        _max = max(_max, candidate_feats[i].shape[1])
    shape_list = np.array(candidate_feats[0].shape)
    shape_list[1] = 1
    feat_pad_vec = torch.zeros(tuple(shape_list)).cuda()
    shape_list = np.array(candidate_masks[0].shape)
    shape_list[1] = 1
    mask_pad_vec = torch.ones(tuple(shape_list)).bool().cuda()
    for i in range(len(candidate_feats)):
        diff = _max - candidate_feats[i].shape[1]
        diff2 = _max - candidate_masks[i].shape[1]
        assert diff == diff2
        if diff > 0:
            candidate_feats[i] = torch.cat(
                [candidate_feats[i], feat_pad_vec.repeat(1, diff, 1)], dim=1)
            candidate_masks[i] = torch.cat(
                [candidate_masks[i], mask_pad_vec.repeat(1, diff)], dim=1)
        # Convert teacher actions to one-hot vectors
        teacher_actions_1h.append(
            torch.nn.functional.one_hot(torch.LongTensor(teacher_actions[i]),
                                        num_classes=_max).cuda())

    img_feats = torch.stack(img_feats, 1).contiguous()   # batch_size, max_len, 36, 2052
    can_feats = torch.stack(can_feats, 1).contiguous()   # batch_size, max_len, 2052
    teacher_actions_1h = torch.stack(teacher_actions_1h, 1).contiguous()
    candidate_feats = torch.stack(candidate_feats, 1).contiguous()
    candidate_masks = torch.stack(candidate_masks, 1).contiguous()
    if get_first_feat:
        return (img_feats, can_feats, first_feat), length
    else:
        return (img_feats, can_feats, teacher_actions_1h, candidate_feats, candidate_masks), length
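# Hypothetical usage (`speaker` is a stand-in name): roll the teacher policy
# along the shortest path and collect the per-step supervision tensors:
#
#     (img_feats, can_feats, acts_1h, cand_feats, cand_masks), length = \
#         speaker.from_shortest_path()
#     # img_feats: (batch, max_len, 36, obs_dim); acts_1h: one-hot teacher actions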
def rollout(self, train_ml=None, train_rl=True, reset=True, speaker=None):
    """
    :param train_ml: The weight to train with maximum likelihood
    :param train_rl: whether to use RL in training
    :param reset: Reset the environment
    :param speaker: Speaker used in back translation.
                    If the speaker is not None, use back translation.
                    O.w., normal training
    :return:
    """
    if self.feedback == 'teacher' or self.feedback == 'argmax':
        train_rl = False

    if reset:
        # Reset env
        obs = np.array(self.env.reset())
    else:
        obs = np.array(self.env._get_obs())

    batch_size = len(obs)

    if speaker is not None:   # Trigger the self_train mode!
        noise = self.decoder.drop_env(torch.ones(self.feature_size).cuda())
        batch = self.env.batch.copy()
        speaker.env = self.env
        insts = speaker.infer_batch(featdropmask=noise)   # Use the same drop mask in speaker

        # Create fake environments with the generated instruction
        boss = np.ones((batch_size, 1), np.int64) * self.tok.word_to_index['<BOS>']   # First word is <BOS>
        insts = np.concatenate((boss, insts), 1)
        for i, (datum, inst) in enumerate(zip(batch, insts)):
            if inst[-1] != self.tok.word_to_index['<PAD>']:   # The inst is not ended!
                inst[-1] = self.tok.word_to_index['<EOS>']
            datum.pop('instructions')
            datum.pop('instr_encoding')
            datum['instructions'] = self.tok.decode_sentence(inst)
            datum['instr_encoding'] = inst
        obs = np.array(self.env.reset(batch))

    # Reorder the language input for the encoder (do not ruin the original code)
    seq, seq_mask, seq_lengths, perm_idx = self._sort_batch(obs)
    perm_obs = obs[perm_idx]

    ctx, h_t, c_t = self.encoder(seq, seq_lengths)
    ctx_mask = seq_mask

    # Init the reward shaping
    last_dist = np.zeros(batch_size, np.float32)
    for i, ob in enumerate(perm_obs):   # The initial distance from the viewpoint to the target
        last_dist[i] = ob['distance']

    # Record starting point
    traj = [{
        'instr_id': ob['instr_id'],
        'path': [(ob['viewpoint'], ob['heading'], ob['elevation'])]
    } for ob in perm_obs]

    # For test result submission
    visited = [set() for _ in perm_obs]

    # Initialize the tracking state
    ended = np.array([False] * batch_size)   # Indices match permutation of the model, not env

    # Init the logs
    rewards = []
    hidden_states = []
    policy_log_probs = []
    masks = []
    entropys = []
    ml_loss = 0.

    h1 = h_t
    for t in range(self.episode_len):
        input_a_t, f_t, candidate_feat, candidate_leng = self.get_input_feat(perm_obs)
        if speaker is not None:   # Apply the env drop mask to the feat
            candidate_feat[..., :-args.angle_feat_size] *= noise
            f_t[..., :-args.angle_feat_size] *= noise

        h_t, c_t, logit, h1 = self.decoder(input_a_t, f_t, candidate_feat,
                                           h_t, h1, c_t, ctx, ctx_mask,
                                           already_dropfeat=(speaker is not None))
        hidden_states.append(h_t)

        # Mask outputs where agent can't move forward
        # Here the logit is [b, max_candidate]
        candidate_mask = utils.length2mask(candidate_leng)
        if args.submit:   # Avoiding cyclic paths
            for ob_id, ob in enumerate(perm_obs):
                visited[ob_id].add(ob['viewpoint'])
                for c_id, c in enumerate(ob['candidate']):
                    if c['viewpointId'] in visited[ob_id]:
                        candidate_mask[ob_id][c_id] = 1
        logit.masked_fill_(candidate_mask, -float('inf'))   # fill the masked True (not candidate) with -inf
def beam_search(self, beam_size=3, train=False):
    if train:
        self.encoder.train()
        self.decoder.train()
    else:
        self.encoder.eval()
        self.decoder.eval()

    # Image input for the encoder
    obs = self.env._get_obs()
    batch_size = len(obs)
    viewpoints_list = [list() for _ in range(batch_size)]

    # Get feature
    (img_feats, can_feats), lengths = self.from_shortest_path(
        viewpoints=viewpoints_list)   # Image feature (from the shortest path)

    # Encoder
    ctx = self.encoder(can_feats, img_feats, lengths, already_dropfeat=True)
    ctx_mask = utils.length2mask(lengths)
    h_t = torch.zeros(1, batch_size, self.args.rnn_dim).cuda()
    c_t = torch.zeros(1, batch_size, self.args.rnn_dim).cuda()

    completed = []
    for _ in range(batch_size):
        completed.append([])

    beams = [[
        InferenceState(prev_inference_state=None,
                       flat_index=i,
                       last_word=self.tok.word_to_index['<BOS>'],
                       word_count=0,
                       score=0.0)
    ] for i in range(batch_size)]

    for t in range(self.args.maxDecode):
        flat_indices = []
        beam_indices = []
        w_t_list = []
        for beam_index, beam in enumerate(beams):
            for inf_state in beam:
                beam_indices.append(beam_index)
                flat_indices.append(inf_state.flat_index)
                w_t_list.append(inf_state.last_word)
        # w_t = try_cuda(Variable(torch.LongTensor(w_t_list), requires_grad=False))
        w_t = torch.from_numpy(np.array(w_t_list)).long().cuda()
        # if len(w_t.shape) == 1:
        #     w_t = w_t.unsqueeze(0)

        logit, h_t, c_t = self.decoder(w_t.view(-1, 1),
                                       ctx[beam_indices],
                                       ctx_mask[beam_indices],
                                       h_t[:, flat_indices],
                                       c_t[:, flat_indices])
        logit = logit.squeeze(1)
        logit[:, self.tok.word_to_index['<UNK>']] = -float("inf")   # No <UNK> in infer
        # h_t, c_t, alpha, logit = self.decoder(w_t.view(-1, 1), h_t[flat_indices], c_t[flat_indices], ctx[beam_indices], path_mask[beam_indices])

        log_probs = F.log_softmax(logit, dim=1).data   # num x dim
        _, word_indices = logit.data.topk(min(beam_size, logit.size()[1]), dim=1)   # num x beam_size
        word_scores = log_probs.gather(1, word_indices)
        assert word_scores.size() == word_indices.size()

        start_index = 0
        new_beams = []
        all_successors = []
        for beam_index, beam in enumerate(beams):
            successors = []
            end_index = start_index + len(beam)
            if beam:
                for inf_index, (inf_state, word_score_row, word_index_row) in \
                        enumerate(zip(beam,
                                      word_scores[start_index:end_index],
                                      word_indices[start_index:end_index])):
                    for word_score, word_index in zip(word_score_row, word_index_row):
                        flat_index = start_index + inf_index
                        successors.append(
                            InferenceState(prev_inference_state=inf_state,
                                           flat_index=flat_index,
                                           last_word=word_index.item(),
                                           word_count=inf_state.word_count + 1,
                                           score=inf_state.score + word_score.item()))
            start_index = end_index
            successors = sorted(successors, key=lambda t: t.score, reverse=True)[:beam_size]
            all_successors.append(successors)

        for beam_index, successors in enumerate(all_successors):
            new_beam = []
            for successor in successors:
                if successor.last_word == self.tok.word_to_index['<EOS>'] or t == self.args.maxDecode - 1:
                    completed[beam_index].append(successor)
                else:
                    new_beam.append(successor)
            if len(completed[beam_index]) >= beam_size:
                new_beam = []
            new_beams.append(new_beam)

        beams = new_beams

        if not any(beam for beam in beams):
            break

    words_batch = {}
    # max_len = 0
    for i in range(batch_size):
        path_id = obs[i]['path_id']
        if path_id not in words_batch:
            words_batch[path_id] = []
        this_completed = completed[i]
        this_completed = sorted(this_completed, key=lambda t: t.score, reverse=True)[:beam_size]
        for inf_state in this_completed:
            word_indices = backchain_inference_states(inf_state)
            words_batch[path_id].append(word_indices)
            # max_len = max(max_len, len(word_indices))
    # res = np.ones([batch_size, max_len]).astype(np.int32) * self.tok.word_to_index['<PAD>']
    # for i, words in enumerate(words_batch):
    #     for j, w in enumerate(words):
    #         res[i, j] = w
    return words_batch
def teacher_forcing(self, train=True, features=None, insts=None,
                    for_listener=False, perm_idx=None, creator=None):
    if train:
        self.encoder.train()
        self.decoder.train()
    else:
        self.encoder.eval()
        self.decoder.eval()

    # Get image input & encode
    if features is not None:
        # It is used in calculating the speaker score in beam-search
        # assert insts is not None
        obs = np.array(self.env._get_obs())
        if perm_idx is not None:
            obs = obs[perm_idx]
        (img_feats, can_feats), lengths = features
        ctx = self.encoder(can_feats, img_feats, lengths, already_dropfeat=True)
        batch_size = len(lengths)
    else:
        obs = self.env._get_obs()
        batch_size = len(obs)
        if creator is not None:
            (img_feats, can_feats), lengths, weights_reg = self.from_shortest_path(
                creator=creator)   # Image feature (from the shortest path)
        else:
            (img_feats, can_feats), lengths = self.from_shortest_path()
        ctx = self.encoder(can_feats, img_feats, lengths)

    h_t = torch.zeros(1, batch_size, self.args.rnn_dim).cuda()
    c_t = torch.zeros(1, batch_size, self.args.rnn_dim).cuda()
    ctx_mask = utils.length2mask(lengths)

    # Get language input
    if insts is None:
        insts = self.gt_words(obs)   # Language feature

    # Decode
    logits, _, _ = self.decoder(insts, ctx, ctx_mask, h_t, c_t)

    # Because the softmax_loss only allows dim-1 to be the logit,
    # permute the output (batch_size, length, logit) --> (batch_size, logit, length)
    logits = logits.permute(0, 2, 1).contiguous()
    loss = self.softmax_loss(
        input=logits[:, :, :-1],   # -1 for aligning
        target=insts[:, 1:]        # "1:" to ignore the word <BOS>
    )

    if check(loss):   # Debug: hunt down NaN losses case by case
        print('lengths', lengths)
        print('loss is nan', loss)
        # print('logits', logits)
        for i, t in enumerate(insts):
            l = self.softmax_loss(input=logits[i, :, :-1].unsqueeze(0),
                                  target=t[1:].unsqueeze(0))
            if check(l):
                print('case', i)
                print('inst', t[1:])
                print('ctx', check(ctx[i]))
                print('length', lengths[i])
                # for j, label in enumerate(t[1:]):
                #     label = label.item()
                #     if label != self.tok.word_to_index['<PAD>']:
                #         print('pos %d, word %s, logit' % (j, self.tok.index_to_word[label]), logits[i, j])
        assert False

    if for_listener:
        inst_mask = insts[:, 1:] != self.tok.word_to_index['<PAD>']
        return self.nonreduced_softmax_loss(
            input=logits[:, :, :-1],   # -1 for aligning
            target=insts[:, 1:]        # "1:" to ignore the word <BOS>
        ), inst_mask

    if train:
        if creator is not None:
            return loss, weights_reg
        return loss
    else:
        # Evaluation
        _, predict = logits.max(dim=1)   # BATCH, LENGTH
        gt_mask = (insts != self.tok.word_to_index['<PAD>'])
        correct = (predict[:, :-1] == insts[:, 1:]) * gt_mask[:, 1:]   # Not pad and equal to gt
        correct, gt_mask = correct.type(torch.LongTensor), gt_mask.type(torch.LongTensor)
        word_accu = correct.sum().item() / gt_mask[:, 1:].sum().item()                            # Exclude <BOS>
        sent_accu = (correct.sum(dim=1) == gt_mask[:, 1:].sum(dim=1)).sum().item() / batch_size   # Exclude <BOS>
        return loss.item(), word_accu, sent_accu
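# Hypothetical evaluation call (`speaker` is a stand-in name). In eval mode
# the method returns the scalar loss plus teacher-forcing word/sentence accuracy:
#
#     loss, word_accu, sent_accu = speaker.teacher_forcing(train=False)
#     print("xent %.3f | word acc %.3f | sent acc %.3f" % (loss, word_accu, sent_accu))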