def choose_command(word_ranks, word_masks_np, use_cuda, epsilon=0.0):
    batch_size = word_ranks[0].size(0)
    word_qvalues, word_indices_maxq = _choose_maxQ_command(word_ranks, word_masks_np, use_cuda)
    if epsilon > 0.0:
        _, word_indices_random = _choose_random_command(word_ranks, word_masks_np, use_cuda)
        # random number for epsilon greedy
        rand_num = np.random.uniform(low=0.0, high=1.0, size=(batch_size, 1))
        less_than_epsilon = (rand_num < epsilon).astype("float32")  # batch
        greater_than_epsilon = 1.0 - less_than_epsilon
        less_than_epsilon = to_pt(less_than_epsilon, use_cuda, type='float')
        greater_than_epsilon = to_pt(greater_than_epsilon, use_cuda, type='float')
        less_than_epsilon, greater_than_epsilon = less_than_epsilon.long(), greater_than_epsilon.long()
        chosen_indices = [
            less_than_epsilon * idx_random + greater_than_epsilon * idx_maxq
            for idx_random, idx_maxq in zip(word_indices_random, word_indices_maxq)
        ]
    else:
        chosen_indices = word_indices_maxq
    chosen_indices = [item.detach() for item in chosen_indices]
    return word_qvalues, chosen_indices
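# A minimal, self-contained sketch of the epsilon-greedy mixing used above: a per-row
# 0/1 indicator selects either the random index or the argmax index. The helper name
# mix_epsilon_greedy and the toy shapes are assumptions for illustration, not part of
# this repo's API.
import numpy as np
import torch

def mix_epsilon_greedy(idx_maxq, idx_random, epsilon):
    # idx_maxq, idx_random: LongTensors of shape (batch, 1)
    batch_size = idx_maxq.size(0)
    rand_num = np.random.uniform(0.0, 1.0, size=(batch_size, 1))
    less = torch.tensor((rand_num < epsilon).astype("int64"))   # 1 -> take the random index
    greater = 1 - less                                          # 1 -> take the argmax index
    return less * idx_random + greater * idx_maxq

# e.g., with epsilon=0.3, roughly 30% of the rows come from idx_random.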
def compute_reward(self):
    """
    Compute rewards by agent. Note this is different from what the training/evaluation
    scripts do. The agent keeps track of scores and other game information for training purposes.
    """
    # mask = 1 if game is not finished or just finished at current step
    if len(self.dones) == 1:
        # it's not possible to finish a game at the 0th step
        mask = [1.0 for _ in self.dones[-1]]
    else:
        assert len(self.dones) > 1
        mask = [1.0 if not self.dones[-2][i] else 0.0 for i in range(len(self.dones[-1]))]
    mask = np.array(mask, dtype='float32')
    mask_pt = to_pt(mask, self.use_cuda, type='float')
    # rewards returned by the game engine are always the accumulated value the agent
    # has received. so the reward it gets at the current game step is the new value
    # minus the value at the previous step.
    rewards = np.array(self.scores[-1], dtype='float32')  # batch
    if len(self.scores) > 1:
        prev_rewards = np.array(self.scores[-2], dtype='float32')
        rewards = rewards - prev_rewards
    rewards_pt = to_pt(rewards, self.use_cuda, type='float')
    return rewards, rewards_pt, mask, mask_pt
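# A tiny worked example (plain numpy, hypothetical values) of the score-delta reward and
# the "still running or just finished" mask computed above.
import numpy as np

scores = [[0, 1], [2, 1]]                # cumulative scores at steps t-1 and t, batch of 2
dones = [[False, False], [False, True]]
rewards = np.array(scores[-1], dtype="float32") - np.array(scores[-2], dtype="float32")
# -> [2., 0.]: the per-step gain is the difference of cumulative scores
mask = np.array([0.0 if dones[-2][i] else 1.0 for i in range(2)], dtype="float32")
# -> [1., 1.]: both games were still running at the previous step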
def get_agent_inputs(self, string_list):
    sentence_token_list = [item.split() for item in string_list]
    sentence_id_list = [_words_to_ids(tokens, self.word2id) for tokens in sentence_token_list]
    input_sentence_char = list_of_token_list_to_char_input(sentence_token_list, self.char2id)
    input_sentence = pad_sequences(sentence_id_list, maxlen=max_len(sentence_id_list)).astype('int32')
    input_sentence = to_pt(input_sentence, self.use_cuda)
    input_sentence_char = to_pt(input_sentence_char, self.use_cuda)
    return input_sentence, input_sentence_char, sentence_id_list
def update(self):
    """
    Update the neural model in the agent. In this example we follow the DQN algorithm:
    the model is updated with transitions sampled from replay memory.
    """
    if len(self.replay_memory) < self.replay_batch_size:
        return None
    transitions = self.replay_memory.sample(self.replay_batch_size)
    batch = Transition(*zip(*transitions))

    observation_id_list = pad_sequences(
        batch.observation_id_list, maxlen=max_len(batch.observation_id_list)).astype('int32')
    input_observation = to_pt(observation_id_list, self.use_cuda)
    next_observation_id_list = pad_sequences(
        batch.next_observation_id_list, maxlen=max_len(batch.next_observation_id_list)).astype('int32')
    next_input_observation = to_pt(next_observation_id_list, self.use_cuda)
    chosen_indices = list(list(zip(*batch.word_indices)))
    chosen_indices = [torch.stack(item, 0) for item in chosen_indices]  # list of batch x 1

    word_ranks = self.infer_word_ranks(input_observation)  # list of batch x vocab, len=5 (one per potential output word)
    word_qvalues = [w_rank.gather(1, idx).squeeze(-1)
                    for w_rank, idx in zip(word_ranks, chosen_indices)]  # list of batch
    q_value = torch.mean(torch.stack(word_qvalues, -1), -1)  # batch

    next_word_ranks = self.infer_word_ranks(next_input_observation)  # batch x n_verb, batch x n_noun, batch x n_second_noun
    next_word_masks = list(list(zip(*batch.next_word_masks)))
    next_word_masks = [np.stack(item, 0) for item in next_word_masks]
    next_word_qvalues, _ = _choose_maxQ_command(next_word_ranks, next_word_masks, self.use_cuda)
    next_q_value = torch.mean(torch.stack(next_word_qvalues, -1), -1)  # batch
    next_q_value = next_q_value.detach()

    rewards = torch.stack(batch.reward)  # batch
    not_done = 1.0 - np.array(batch.done, dtype='float32')  # batch
    not_done = to_pt(not_done, self.use_cuda, type='float')
    rewards = rewards + not_done * next_q_value * self.discount_gamma  # batch
    mask = torch.stack(batch.mask)  # batch
    loss = F.smooth_l1_loss(q_value * mask, rewards * mask)
    return loss
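# A hedged sketch of the one-step DQN target used in update(), on assumed toy tensors
# rather than the agent's replay batch: target = r + gamma * (1 - done) * max_a' Q'(s', a'),
# with the TD error penalized by a smooth L1 (Huber) loss on masked entries.
import torch
import torch.nn.functional as F

q_value = torch.tensor([0.5, 1.2])        # Q(s, a) for a batch of 2
next_q_value = torch.tensor([0.8, 0.0])   # max_a' Q(s', a'), detached in practice
reward = torch.tensor([1.0, 0.0])
not_done = torch.tensor([1.0, 0.0])       # 0 where the episode ended
mask = torch.tensor([1.0, 1.0])
discount_gamma = 0.9

target = reward + not_done * next_q_value * discount_gamma
loss = F.smooth_l1_loss(q_value * mask, target * mask)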
def get_word_mask(self, list_of_query_id_list, list_of_observation_id_list):
    batch_size = len(list_of_query_id_list)
    if self.generate_or_point == "generate":
        sw_ids = set()
        for sw in self.stopwords:
            if sw in self.word2id:
                sw_ids.add(self.word2id[sw])
        word_mask = np.ones((batch_size, len(self.word_vocab)), dtype="float32")
        for _id in sw_ids:
            word_mask[:, _id] = 0.0
        word_mask = to_pt(word_mask, enable_cuda=self.use_cuda, type="float")
        mask_word_id_list = []
        all_word_ids = set(np.arange(len(self.word_vocab)).tolist())
        m = list(all_word_ids - sw_ids)
        for i in range(batch_size):
            mask_word_id_list.append(m)
        return word_mask, mask_word_id_list

    word_mask_np = np.zeros((batch_size, len(self.word_vocab)), dtype="float32")
    mask_word_id_list = []
    for i in range(batch_size):
        mask_word_id_list.append(set())
        for w_idx in list_of_query_id_list[i]:
            if self.word_vocab[w_idx] in self.stopwords:
                continue
            word_mask_np[i][w_idx] = 1.0
            mask_word_id_list[i].add(w_idx)
        if self.generate_or_point == "qmpoint":
            for w_idx in list_of_observation_id_list[i]:
                if self.word_vocab[w_idx] in self.stopwords:
                    continue
                word_mask_np[i][w_idx] = 1.0
                mask_word_id_list[i].add(w_idx)
    mask_word_id_list = [list(item) for item in mask_word_id_list]
    for i in range(len(mask_word_id_list)):
        if len(mask_word_id_list[i]) == 0:
            # just in case this list is empty
            mask_word_id_list[i].append(self.word2id[","])
            word_mask_np[i][self.word2id[","]] = 1.0
            continue
    word_mask = to_pt(word_mask_np, enable_cuda=self.use_cuda, type="float")
    return word_mask, mask_word_id_list
def choose_random_command(self, word_ranks, word_masks_np):
    """
    Generate a command randomly, for epsilon greedy.

    Arguments:
        word_ranks: Q values for each word by model.action_scorer.
        word_masks_np: Vocabulary masks for words depending on their type (verb, adj, noun).
    """
    batch_size = word_ranks[0].size(0)
    word_ranks_np = [to_np(item) for item in word_ranks]  # list of batch x n_vocab
    word_ranks_np = [r * m for r, m in zip(word_ranks_np, word_masks_np)]  # list of batch x n_vocab
    word_indices = []
    for i in range(len(word_ranks_np)):
        indices = []
        for j in range(batch_size):
            msk = word_masks_np[i][j]  # vocab
            indices.append(np.random.choice(len(msk), p=msk / np.sum(msk, -1)))
        word_indices.append(np.array(indices))
    # word_indices: list of batch

    word_qvalues = [[] for _ in word_masks_np]
    for i in range(batch_size):
        for j in range(len(word_qvalues)):
            word_qvalues[j].append(word_ranks[j][i][word_indices[j][i]])
    word_qvalues = [torch.stack(item) for item in word_qvalues]
    word_indices = [to_pt(item, self.use_cuda) for item in word_indices]
    word_indices = [item.unsqueeze(-1) for item in word_indices]  # list of batch x 1
    return word_qvalues, word_indices
def get_game_step_info(self, obs: List[str], infos: Dict[str, List[Any]]):
    """
    Get all the available information and concatenate it into a tensor for the neural model.
    We use post-padding here; all information is tokenized in this function.

    Arguments:
        obs: Previous command's feedback for each game.
        infos: Additional information for each game.
    """
    inventory_token_list = [preproc(item, tokenizer=self.nlp) for item in infos["inventory"]]
    inventory_id_list = [_words_to_ids(tokens, self.word2id) for tokens in inventory_token_list]
    feedback_token_list = [preproc(item, str_type='feedback', tokenizer=self.nlp) for item in obs]
    feedback_id_list = [_words_to_ids(tokens, self.word2id) for tokens in feedback_token_list]
    quest_token_list = [preproc(item, tokenizer=self.nlp) for item in infos["extra.recipe"]]
    quest_id_list = [_words_to_ids(tokens, self.word2id) for tokens in quest_token_list]
    prev_action_token_list = [preproc(item, tokenizer=self.nlp) for item in self.prev_actions]
    prev_action_id_list = [_words_to_ids(tokens, self.word2id) for tokens in prev_action_token_list]
    description_token_list = [preproc(item, tokenizer=self.nlp) for item in infos["description"]]
    for i, d in enumerate(description_token_list):
        if len(d) == 0:
            description_token_list[i] = ["end"]  # if the description is empty, insert the word "end"
    description_id_list = [_words_to_ids(tokens, self.word2id) for tokens in description_token_list]
    description_id_list = [_d + _i + _q + _f + _pa for (_d, _i, _q, _f, _pa) in zip(
        description_id_list, inventory_id_list, quest_id_list, feedback_id_list, prev_action_id_list)]

    input_description = pad_sequences(description_id_list, maxlen=max_len(description_id_list)).astype('int32')
    input_description = to_pt(input_description, self.use_cuda)
    return input_description, description_id_list
def choose_maxQ_command(self, word_ranks, word_masks_np):
    """
    Generate a command by maximum q values, for epsilon greedy.

    Arguments:
        word_ranks: Q values for each word by model.action_scorer.
        word_masks_np: Vocabulary masks for words depending on their type (verb, adj, noun).
    """
    batch_size = word_ranks[0].size(0)
    word_ranks_np = [to_np(item) for item in word_ranks]  # list of batch x n_vocab
    word_ranks_np = [r - np.min(r) for r in word_ranks_np]  # subtract the min value, so that all values are non-negative
    word_ranks_np = [r * m for r, m in zip(word_ranks_np, word_masks_np)]  # list of batch x n_vocab
    word_indices = [np.argmax(item, -1) for item in word_ranks_np]  # list of batch

    word_qvalues = [[] for _ in word_masks_np]
    for i in range(batch_size):
        for j in range(len(word_qvalues)):
            word_qvalues[j].append(word_ranks[j][i][word_indices[j][i]])
    word_qvalues = [torch.stack(item) for item in word_qvalues]
    word_indices = [to_pt(item, self.use_cuda) for item in word_indices]
    word_indices = [item.unsqueeze(-1) for item in word_indices]  # list of batch x 1
    return word_qvalues, word_indices
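# A small numpy illustration (made-up numbers) of why the Q values are shifted by their
# minimum before masking: with negative Q values, multiplying by a 0/1 mask could
# otherwise make a masked-out word (value 0) look better than any legal word.
import numpy as np

q = np.array([[-2.0, -0.5, -1.0]])               # batch of 1, vocab of 3
mask = np.array([[0.0, 1.0, 1.0]])               # word 0 is not allowed
naive = np.argmax(q * mask, -1)                  # -> 0, the masked-out word wins
shifted = np.argmax((q - np.min(q)) * mask, -1)  # -> 1, the best legal word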
def get_qa_loss(self):
    """
    Compute the question answering loss on a batch sampled from the QA replay memory.
    """
    if len(self.qa_replay_memory) < self.replay_batch_size:
        return None
    transitions = self.qa_replay_memory.sample(self.replay_batch_size)
    batch = qa_Transition(*zip(*transitions))

    answer_distribution, obs_mask = self.answer_question(
        batch.observation_list, batch.quest_list, use_model="online")  # answer_distribution is batch x time x 2
    answer_distribution = masked_softmax(answer_distribution, obs_mask.unsqueeze(-1), axis=1)

    answer_strings = [item[0] for item in batch.answer_strings]
    groundtruth_answer_positions = get_answer_position(batch.observation_list, answer_strings)  # list: batch x 2
    groundtruth = pad_sequences(groundtruth_answer_positions).astype('int32')
    groundtruth = to_pt(groundtruth, self.use_cuda)  # batch x 2
    batch_loss = NegativeLogLoss(answer_distribution * obs_mask.unsqueeze(-1), groundtruth)
    return torch.mean(batch_loss)
def act_random(self, obs, infos, input_observation, input_observation_char,
               input_quest, input_quest_char, possible_words):
    with torch.no_grad():
        batch_size = len(obs)
        word_indices_random = self.choose_random_command(batch_size, len(self.word_vocab), possible_words)
        chosen_indices = word_indices_random
        chosen_strings = self.get_chosen_strings(chosen_indices)

        for i in range(batch_size):
            if chosen_strings[i] == "wait":
                self.not_finished_yet[i] = 0.0

        # info for replay memory
        for i in range(batch_size):
            if self.prev_actions[-1][i] == "wait":
                self.prev_step_is_still_interacting[i] = 0.0
        # whether the previous step is still interacting; this is because DQN requires one extra step of computation
        replay_info = [chosen_indices,
                       to_pt(self.prev_step_is_still_interacting, self.use_cuda, "float")]

        # cache new info in the current game step into caches
        self.prev_actions.append(chosen_strings)
        return chosen_strings, replay_info
def get_qa_loss(self):
    """
    Compute the question answering loss on a batch sampled from the QA replay memory.
    """
    if len(self.qa_replay_memory) < self.replay_batch_size:
        return None
    transitions = self.qa_replay_memory.sample(self.replay_batch_size)
    batch = qa_memory.qa_Transition(*zip(*transitions))

    observation_list = batch.observation_list
    quest_list = batch.quest_list
    answer_strings = batch.answer_strings
    answer_position = np.array(_words_to_ids(answer_strings, self.word2id))
    groundtruth = to_pt(answer_position, self.use_cuda)  # batch

    input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list)
    input_observation, input_observation_char, observation_id_list = self.get_agent_inputs(observation_list)

    answer_distribution, _, _ = self.answer_question(
        input_observation, input_observation_char, observation_id_list,
        input_quest, input_quest_char, use_model="online")  # batch x vocab

    batch_loss = NegativeLogLoss(answer_distribution, groundtruth)  # batch
    return torch.mean(batch_loss)
def get_graph_relation_representations(self, relation_names_word_ids):
    # relation_names_word_ids: num_relation x num_word
    relation_name_embeddings, _mask = self.embed(relation_names_word_ids)  # num_relation x num_word x emb
    _mask = torch.sum(_mask, -1)  # num_relation
    relation_name_embeddings = torch.sum(relation_name_embeddings, 1)  # num_relation x hid
    tmp = torch.eq(_mask, 0).float()
    if relation_name_embeddings.is_cuda:
        tmp = tmp.cuda()
    _mask = _mask + tmp
    relation_name_embeddings = relation_name_embeddings / _mask.unsqueeze(-1)
    relation_name_embeddings = relation_name_embeddings.unsqueeze(0)  # 1 x num_relation x emb

    relation_ids = np.arange(self.relation_vocab_size)  # num_relation
    relation_ids = to_pt(relation_ids, enable_cuda=relation_names_word_ids.is_cuda, type='long').unsqueeze(0)  # 1 x num_relation
    relation_embeddings, _ = self.relation_embedding(relation_ids)  # 1 x num_relation x emb
    relation_embeddings = torch.cat([relation_name_embeddings, relation_embeddings], dim=-1)  # 1 x num_relation x emb+emb
    return relation_embeddings
def act_greedy(self, obs, infos, input_observation, input_observation_char,
               input_quest, input_quest_char, possible_words):
    """
    Acts upon the current list of observations.
    One text command must be returned for each observation.
    """
    with torch.no_grad():
        batch_size = len(obs)
        local_word_masks_np = self.get_local_word_masks(possible_words)
        local_word_masks = [to_pt(item, self.use_cuda, type="float") for item in local_word_masks_np]

        # generate commands for one game step; greedy, i.e., always pick the max-Q command
        action_ranks = self.get_ranks(
            input_observation, input_observation_char, input_quest,
            input_quest_char, local_word_masks, use_model="online")  # list of batch x vocab
        word_indices_maxq = self.choose_maxQ_command(action_ranks, local_word_masks)
        chosen_indices = word_indices_maxq
        chosen_strings = self.get_chosen_strings(chosen_indices)

        for i in range(batch_size):
            if chosen_strings[i] == "wait":
                self.not_finished_yet[i] = 0.0

        # info for replay memory
        for i in range(batch_size):
            if self.prev_actions[-1][i] == "wait":
                self.prev_step_is_still_interacting[i] = 0.0
        # whether the previous step is still interacting; this is because DQN requires one extra step of computation
        replay_info = [chosen_indices,
                       to_pt(self.prev_step_is_still_interacting, self.use_cuda, "float")]

        # cache new info in the current game step into caches
        self.prev_actions.append(chosen_strings)
        return chosen_strings, replay_info
def act_random(self, obs, infos, input_quest, input_quest_char, quest_id_list):
    with torch.no_grad():
        batch_size = len(obs)

        # update inputs for the answerer
        if self.not_finished_yet is None:
            self.not_finished_yet = np.ones((len(obs),), dtype="float32")
            self.naozi.push_batch(copy.copy(obs))
        else:
            for i in range(batch_size):
                if self.not_finished_yet[i] == 1.0:
                    self.naozi.push_one(i, copy.copy(obs[i]))

        description_list = self.naozi.get()
        input_description, input_description_char, description_id_list = self.get_agent_inputs(description_list)
        ctrlf_word_mask, ctrlf_word_ids = self.get_word_mask(quest_id_list, description_id_list)

        # generate commands for one game step; here both the action and the ctrl+f word are chosen randomly
        action_rank, ctrlf_rank = self.get_ranks(
            input_description, input_description_char, input_quest,
            input_quest_char, ctrlf_word_mask, use_model="online")  # list of batch x vocab
        action_indices = self.choose_random_command(action_rank)
        ctrlf_indices = self.choose_random_command(ctrlf_rank, ctrlf_word_ids)
        chosen_strings = self.generate_commands(action_indices, ctrlf_indices)

        for i in range(batch_size):
            if chosen_strings[i] == "stop":
                self.not_finished_yet[i] = 0.0

        # info for replay memory
        for i in range(batch_size):
            if self.prev_actions[-1][i] == "stop":
                self.prev_step_is_still_interacting[i] = 0.0
        # whether the previous step is still interacting; this is because DQN requires one extra step of computation
        replay_info = [description_list, action_indices, ctrlf_indices,
                       to_pt(self.prev_step_is_still_interacting, self.use_cuda, "float")]

        # cache new info in the current game step into caches
        self.prev_actions.append(chosen_strings)
        return chosen_strings, replay_info
def choose_random_command(self, action_rank, mask_word_ids=None):
    """
    Generate a command randomly, for epsilon greedy.
    """
    batch_size = action_rank.size(0)
    action_space_size = action_rank.size(-1)
    if mask_word_ids is None:
        indices = np.random.choice(action_space_size, batch_size)
    else:
        indices = []
        for j in range(batch_size):
            indices.append(np.random.choice(mask_word_ids[j]))
        indices = np.array(indices)
    action_indices = to_pt(indices, self.use_cuda).unsqueeze(-1)  # batch x 1
    return action_indices
def point_random_position(self, point_distribution, mask):
    """
    Generate a command randomly, for epsilon greedy.

    Arguments:
        point_distribution: Q values for each position, batch x time x 2.
        mask: position masks.
    """
    batch_size = point_distribution.size(0)
    mask_np = to_np(mask)  # batch x time
    indices = []
    for i in range(batch_size):
        msk = mask_np[i]  # time
        indices.append(np.random.choice(len(msk), 2, p=msk / np.sum(msk, -1)))
    indices = to_pt(np.stack(indices, 0), self.use_cuda)  # batch x 2
    return indices
def get_graph_node_representations(self, node_names_word_ids):
    # node_names_word_ids: num_node x num_word
    node_name_embeddings, _mask = self.embed(node_names_word_ids)  # num_node x num_word x emb
    _mask = torch.sum(_mask, -1)  # num_node
    node_name_embeddings = torch.sum(node_name_embeddings, 1)  # num_node x hid
    tmp = torch.eq(_mask, 0).float()
    if node_name_embeddings.is_cuda:
        tmp = tmp.cuda()
    _mask = _mask + tmp
    node_name_embeddings = node_name_embeddings / _mask.unsqueeze(-1)
    node_name_embeddings = node_name_embeddings.unsqueeze(0)  # 1 x num_node x emb

    node_ids = np.arange(self.node_vocab_size)  # num_node
    node_ids = to_pt(node_ids, enable_cuda=node_names_word_ids.is_cuda, type='long').unsqueeze(0)  # 1 x num_node
    node_embeddings, _ = self.node_embedding(node_ids)  # 1 x num_node x emb
    node_embeddings = torch.cat([node_name_embeddings, node_embeddings], dim=-1)  # 1 x num_node x emb+emb
    return node_embeddings
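# A sketch (toy tensors, assuming mask value 0 marks PAD words) of the averaging trick
# used in the two functions above: name embeddings are summed over words and divided by
# the word count; adding 1 to rows whose count is 0 avoids a division by zero while
# leaving the all-zero sum unchanged.
import torch

emb = torch.randn(3, 4, 8)                      # 3 names, up to 4 words, embedding dim 8
word_mask = torch.tensor([[1., 1., 0., 0.],     # 2 real words
                          [1., 0., 0., 0.],     # 1 real word
                          [0., 0., 0., 0.]])    # empty name (all PAD)
emb = emb * word_mask.unsqueeze(-1)             # zero out PAD positions
counts = word_mask.sum(-1)                      # [2., 1., 0.]
counts = counts + torch.eq(counts, 0).float()   # [2., 1., 1.]  (guard against divide-by-zero)
mean_emb = emb.sum(1) / counts.unsqueeze(-1)    # 3 x 8 mean word embedding per name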
def point_maxq_position(self, point_distribution, mask):
    """
    Generate a command by maximum q values, for epsilon greedy.

    Arguments:
        point_distribution: Q values for each position, batch x time x 2.
        mask: position masks.
    """
    point_distribution_np = to_np(point_distribution)  # batch x time
    mask_np = to_np(mask)  # batch x time
    # subtract the min value, so that all values are non-negative
    point_distribution_np = point_distribution_np - np.min(point_distribution_np) + 1e-2
    point_distribution_np = point_distribution_np * np.expand_dims(mask_np, -1)  # batch x time x 2
    indices = np.argmax(point_distribution_np, 1)  # batch x 2
    indices = to_pt(np.array(indices), self.use_cuda)  # batch x 2
    return indices
def get_game_step_info(self, obs: List[str], infos: Dict[str, List[Any]]):
    """
    Get all the available information and concatenate it into a tensor for the neural model.
    We use post-padding here; all information is tokenized in this function.

    Arguments:
        obs: Previous command's feedback for each game.
        infos: Additional information for each game.
    """
    word2id = self.vocab.word2id
    inventory_id_list = get_token_ids_for_items(infos["inventory"], word2id, tokenizer=self.nlp)
    feedback_id_list = get_token_ids_for_items(obs, word2id, tokenizer=self.nlp)
    quest_id_list = get_token_ids_for_items(infos["extra.recipe"], word2id, tokenizer=self.nlp)
    prev_action_id_list = get_token_ids_for_items(self.prev_actions, word2id, tokenizer=self.nlp)
    description_id_list = get_token_ids_for_items(infos["description"], word2id,
                                                  tokenizer=self.nlp, subst_if_empty=['end'])

    description_id_list = [_d + _i + _q + _f + _pa for (_d, _i, _q, _f, _pa) in zip(
        description_id_list, inventory_id_list, quest_id_list, feedback_id_list, prev_action_id_list)]

    input_description = pad_sequences(description_id_list, maxlen=max_len(description_id_list)).astype('int32')
    input_description = to_pt(input_description, self.use_cuda)
    return input_description, description_id_list
def _choose_random_command(word_ranks, word_masks_np, use_cuda):
    """
    Generate a command randomly, for epsilon greedy.

    Arguments:
        word_ranks: Q values for each word by model.action_scorer.
        word_masks_np: Vocabulary masks for words depending on their type (verb, adj, noun, adj2, noun2).
    """
    batch_size = word_ranks[0].size(0)
    # print("batch_size=", batch_size, len(word_masks_np))
    assert len(word_ranks) == len(word_masks_np)
    word_ranks_np = [to_np(item) for item in word_ranks]  # list of (batch x n_vocab) arrays, len=5 (5 word output phrases)
    # word_ranks_np = [r - np.min(r) for r in word_ranks_np]  # minus the min value, so that all values are non-negative
    word_ranks_np = [r * m for r, m in zip(word_ranks_np, word_masks_np)]  # list of batch x n_vocab

    word_indices = []
    for i in range(len(word_ranks_np)):  # len=5 (verb, adj1, noun1, adj2, noun2)
        indices = []
        for j in range(batch_size):
            msk = word_masks_np[i][j]  # msk is of len = vocab, j is index into batch
            indices.append(np.random.choice(len(msk), p=msk / np.sum(msk, -1)))  # choose from non-zero entries of msk
        word_indices.append(np.array(indices))
    # word_indices: list of batch

    word_qvalues = [[] for _ in word_masks_np]
    for i in range(batch_size):
        for j in range(len(word_qvalues)):
            word_qvalues[j].append(word_ranks[j][i][word_indices[j][i]])
    word_qvalues = [torch.stack(item) for item in word_qvalues]
    word_indices = [to_pt(item, use_cuda) for item in word_indices]
    word_indices = [item.unsqueeze(-1) for item in word_indices]  # list of batch x 1
    return word_qvalues, word_indices
def choose_random_command(self, batch_size, action_space_size, possible_words=None):
    """
    Generate a command randomly, for epsilon greedy.
    """
    action_indices = []
    for i in range(3):
        if possible_words is None:
            indices = np.random.choice(action_space_size, batch_size)
        else:
            indices = []
            for j in range(batch_size):
                mask_ids = []
                for w in possible_words[i][j]:
                    if w in self.word2id:
                        mask_ids.append(self.word2id[w])
                indices.append(np.random.choice(mask_ids))
            indices = np.array(indices)
        action_indices.append(to_pt(indices, self.use_cuda))  # batch
    return action_indices
def act(self, obs: List[str], scores: List[int], dones: List[bool], infos: Dict[str, List[Any]]) -> List[str]:
    """
    Acts upon the current list of observations.
    One text command must be returned for each observation.

    Arguments:
        obs: Previous command's feedback for each game.
        scores: The score obtained so far for each game (at the previous step).
        dones: Whether a game is finished (at the previous step).
        infos: Additional information for each game.

    Returns:
        Text commands to be performed (one per observation).

    Notes:
        Commands returned for games marked as `done` have no effect.
        The states for finished games are simply copied over until all games are done,
        in which case `CustomAgent.finish()` is called instead.
    """
    if not self._epsiode_has_started:
        self._start_episode(obs, infos)

    if self.mode == "eval":
        return self.act_eval(obs, scores, dones, infos)

    if self.current_step > 0:
        # append scores / dones from the previous step into memory
        self.scores.append(scores)
        self.dones.append(dones)
        # compute the previous step's rewards and masks
        rewards_np, rewards, mask_np, mask = self.compute_reward()

    input_description, description_id_list = self.get_game_step_info(obs, infos)
    # generate commands for one game step, epsilon greedy is applied, i.e.,
    # there is a chance of epsilon to generate random commands
    word_ranks = self.get_ranks(input_description)  # list of batch x vocab
    _, word_indices_maxq = self.choose_maxQ_command(word_ranks, self.word_masks_np)
    _, word_indices_random = self.choose_random_command(word_ranks, self.word_masks_np)

    # random number for epsilon greedy
    rand_num = np.random.uniform(low=0.0, high=1.0, size=(input_description.size(0), 1))
    less_than_epsilon = (rand_num < self.epsilon).astype("float32")  # batch
    greater_than_epsilon = 1.0 - less_than_epsilon
    less_than_epsilon = to_pt(less_than_epsilon, self.use_cuda, type='float')
    greater_than_epsilon = to_pt(greater_than_epsilon, self.use_cuda, type='float')
    less_than_epsilon, greater_than_epsilon = less_than_epsilon.long(), greater_than_epsilon.long()
    chosen_indices = [less_than_epsilon * idx_random + greater_than_epsilon * idx_maxq
                      for idx_random, idx_maxq in zip(word_indices_random, word_indices_maxq)]
    chosen_indices = [item.detach() for item in chosen_indices]
    chosen_strings = self.get_chosen_strings(chosen_indices)
    self.prev_actions = chosen_strings

    # push info from the previous game step into replay memory
    if self.current_step > 0:
        for b in range(len(obs)):
            if mask_np[b] == 0:
                continue
            is_prior = rewards_np[b] > 0.0
            self.replay_memory.push(is_prior, self.cache_description_id_list[b],
                                    [item[b] for item in self.cache_chosen_indices],
                                    rewards[b], mask[b], dones[b], description_id_list[b],
                                    [item[b] for item in self.word_masks_np])

    # cache new info in the current game step into caches
    self.cache_description_id_list = description_id_list
    self.cache_chosen_indices = chosen_indices

    # update the neural model by replaying snapshots in replay memory
    if self.current_step > 0 and self.current_step % self.update_per_k_game_steps == 0:
        loss = self.update()
        if loss is not None:
            # Backpropagate
            self.optimizer.zero_grad()
            loss.backward(retain_graph=True)
            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_grad_norm)
            self.optimizer.step()  # apply gradients

    self.current_step += 1

    if all(dones):
        self._end_episode(obs, scores, infos)
        return  # Nothing to return.
    return chosen_strings
def train():
    time_1 = datetime.datetime.now()
    config = generic.load_config()
    agent = Agent(config)
    output_dir = "."
    data_dir = "."

    # make game environments
    requested_infos = agent.select_additional_infos_lite()
    requested_infos_eval = agent.select_additional_infos()
    games_dir = "./"

    # training game env
    env, _ = reinforcement_learning_dataset.get_training_game_env(
        games_dir + config['rl']['data_path'],
        config['rl']['difficulty_level'],
        config['rl']['training_size'],
        requested_infos,
        agent.max_nb_steps_per_episode,
        agent.batch_size)

    if agent.run_eval:
        # evaluation game env
        eval_env, num_eval_game = reinforcement_learning_dataset.get_evaluation_game_env(
            games_dir + config['rl']['data_path'],
            config['rl']['difficulty_level'],
            requested_infos_eval,
            agent.eval_max_nb_steps_per_episode,
            agent.eval_batch_size,
            valid_or_test="valid")
    else:
        eval_env, num_eval_game = None, None

    # visdom
    if config["general"]["visdom"]:
        import visdom
        viz = visdom.Visdom()
        reward_win, step_win = None, None
        dqn_loss_win = None
        eval_game_points_win, eval_step_win = None, None
        viz_game_rewards, viz_game_points, viz_game_points_normalized, viz_graph_rewards, viz_count_rewards, viz_step = [], [], [], [], [], []
        viz_dqn_loss = []
        viz_eval_game_points, viz_eval_game_points_normalized, viz_eval_step = [], [], []

    step_in_total = 0
    episode_no = 0
    running_avg_game_points = HistoryScoreCache(capacity=500)
    running_avg_game_points_normalized = HistoryScoreCache(capacity=500)
    running_avg_graph_rewards = HistoryScoreCache(capacity=500)
    running_avg_count_rewards = HistoryScoreCache(capacity=500)
    running_avg_game_steps = HistoryScoreCache(capacity=500)
    running_avg_dqn_loss = HistoryScoreCache(capacity=500)
    running_avg_game_rewards = HistoryScoreCache(capacity=500)

    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_train_performance_so_far, best_eval_performance_so_far = 0.0, 0.0
    prev_performance = 0.0

    if os.path.exists(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt"):
        agent.load_pretrained_graph_generation_model(
            data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt")
    else:
        print("No graph updater module detected... Please check ",
              data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt")

    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
            agent.update_target_net()
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag + ".pt")
            agent.update_target_net()

    i_have_seen_these_states = EpisodicCountingMemory()  # episodic counting based memory
    i_am_patient = 0
    perfect_training = 0
    while (True):
        if episode_no > agent.max_episode:
            break
        np.random.seed(episode_no)
        env.seed(episode_no)
        obs, infos = env.reset()
        # filter look and examine actions
        for commands_ in infos["admissible_commands"]:
            for cmd_ in [cmd for cmd in commands_
                         if cmd != "examine cookbook" and cmd.split()[0] in ["examine", "look"]]:
                commands_.remove(cmd_)
        batch_size = len(obs)

        agent.train()
        agent.init()

        game_name_list = [game.metadata["uuid"].split("-")[-1] for game in infos["game"]]
        game_max_score_list = [game.max_score for game in infos["game"]]
        i_have_seen_these_states.reset()  # reset episodic counting based memory
        prev_triplets, chosen_actions = [], []
        prev_step_dones, prev_rewards = [], []
        for _ in range(batch_size):
            prev_triplets.append([])
            chosen_actions.append("restart")
            prev_step_dones.append(0.0)
            prev_rewards.append(0.0)
        prev_h, prev_c = None, None

        observation_strings, action_candidate_list = agent.get_game_info_at_certain_step_lite(obs, infos)
        observation_for_counting = copy.copy(observation_strings)
        observation_strings = [item + " <sep> " + a for item, a in zip(observation_strings, chosen_actions)]

        # generate g_belief begins
        generated_commands = agent.command_generation_greedy_generation(observation_strings, prev_triplets)
        current_triplets = agent.update_knowledge_graph_triplets(prev_triplets, generated_commands)
        # generate g_belief ends
        i_have_seen_these_states.push(current_triplets)  # update init triplets into memory

        if agent.count_reward_lambda > 0:
            agent.reset_binarized_counter(batch_size)
            _ = agent.get_binarized_count(observation_for_counting)

        # transitions need to be stored into memory in order, so we use a cache to keep
        # what the agent returns and push everything into memory at the end of the game.
        transition_cache = []
        still_running_mask = []
        game_rewards, game_points, graph_rewards, count_rewards = [], [], [], []
        print_actions = []

        act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode
        for step_no in range(agent.max_nb_steps_per_episode):
            if agent.noisy_net:
                agent.reset_noise()  # Draw a new set of noisy weights

            new_chosen_actions, chosen_indices, prev_h, prev_c = agent.act(
                observation_strings, current_triplets, action_candidate_list,
                previous_h=prev_h, previous_c=prev_c, random=act_randomly)
            replay_info = [observation_strings, action_candidate_list, chosen_indices,
                           current_triplets, chosen_actions]
            transition_cache.append(replay_info)
            chosen_actions = new_chosen_actions
            chosen_actions_before_parsing = [item[idx] for item, idx in zip(infos["admissible_commands"], chosen_indices)]
            obs, scores, dones, infos = env.step(chosen_actions_before_parsing)
            # filter look and examine actions
            for commands_ in infos["admissible_commands"]:
                for cmd_ in [cmd for cmd in commands_
                             if cmd != "examine cookbook" and cmd.split()[0] in ["examine", "look"]]:
                    commands_.remove(cmd_)

            prev_triplets = current_triplets
            observation_strings, action_candidate_list = agent.get_game_info_at_certain_step_lite(obs, infos)
            observation_for_counting = copy.copy(observation_strings)
            observation_strings = [item + " <sep> " + a for item, a in zip(observation_strings, chosen_actions)]
            # generate g_belief begins
            generated_commands = agent.command_generation_greedy_generation(observation_strings, prev_triplets)
            current_triplets = agent.update_knowledge_graph_triplets(prev_triplets, generated_commands)
            # generate g_belief ends
            has_not_seen = i_have_seen_these_states.has_not_seen(current_triplets)
            i_have_seen_these_states.push(current_triplets)  # update new triplets into memory

            if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0:
                agent.reset_noise()  # Draw a new set of noisy weights
            if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0:
                dqn_loss, _ = agent.update_dqn(episode_no)
                if dqn_loss is not None:
                    running_avg_dqn_loss.push(dqn_loss)

            if step_no == agent.max_nb_steps_per_episode - 1:
                # terminate the game because DQN requires one extra step
                dones = [True for _ in dones]

            step_in_total += 1
            still_running = [1.0 - float(item) for item in prev_step_dones]  # list of float
            prev_step_dones = dones
            step_rewards = [float(curr) - float(prev) for curr, prev in zip(scores, prev_rewards)]  # list of float
            game_points.append(copy.copy(step_rewards))
            if agent.use_negative_reward:
                step_rewards = [-1.0 if _lost else r for r, _lost in zip(step_rewards, infos["has_lost"])]  # list of float
                step_rewards = [5.0 if _won else r for r, _won in zip(step_rewards, infos["has_won"])]  # list of float
            prev_rewards = scores
            if agent.fully_observable_graph:
                step_graph_rewards = [0.0 for _ in range(batch_size)]
            else:
                step_graph_rewards = agent.get_graph_rewards(prev_triplets, current_triplets)  # list of float
                step_graph_rewards = [r * float(m) for r, m in zip(step_graph_rewards, has_not_seen)]
            # counting bonus
            if agent.count_reward_lambda > 0:
                step_revisit_counting_rewards = agent.get_binarized_count(observation_for_counting, update=True)
                step_revisit_counting_rewards = [r * agent.count_reward_lambda for r in step_revisit_counting_rewards]
            else:
                step_revisit_counting_rewards = [0.0 for _ in range(batch_size)]

            still_running_mask.append(still_running)
            game_rewards.append(step_rewards)
            graph_rewards.append(step_graph_rewards)
            count_rewards.append(step_revisit_counting_rewards)
            print_actions.append(chosen_actions_before_parsing[0] if still_running[0] else "--")

            # if all ended, break
            if np.sum(still_running) == 0:
                break

        still_running_mask_np = np.array(still_running_mask)
        game_rewards_np = np.array(game_rewards) * still_running_mask_np  # step x batch
        game_points_np = np.array(game_points) * still_running_mask_np  # step x batch
        graph_rewards_np = np.array(graph_rewards) * still_running_mask_np  # step x batch
        count_rewards_np = np.array(count_rewards) * still_running_mask_np  # step x batch
        if agent.graph_reward_lambda > 0.0:
            graph_rewards_pt = generic.to_pt(graph_rewards_np, enable_cuda=agent.use_cuda, type='float')  # step x batch
        else:
            graph_rewards_pt = generic.to_pt(np.zeros_like(graph_rewards_np), enable_cuda=agent.use_cuda, type='float')  # step x batch
        if agent.count_reward_lambda > 0.0:
            count_rewards_pt = generic.to_pt(count_rewards_np, enable_cuda=agent.use_cuda, type='float')  # step x batch
        else:
            count_rewards_pt = generic.to_pt(np.zeros_like(count_rewards_np), enable_cuda=agent.use_cuda, type='float')  # step x batch
        command_rewards_pt = generic.to_pt(game_rewards_np, enable_cuda=agent.use_cuda, type='float')  # step x batch

        # push experience into replay buffer (dqn)
        avg_rewards_in_buffer = agent.dqn_memory.avg_rewards()
        for b in range(game_rewards_np.shape[1]):
            if still_running_mask_np.shape[0] == agent.max_nb_steps_per_episode and still_running_mask_np[-1][b] != 0:
                # need to pad one transition
                _need_pad = True
                tmp_game_rewards = game_rewards_np[:, b].tolist() + [0.0]
            else:
                _need_pad = False
                tmp_game_rewards = game_rewards_np[:, b]
            if np.mean(tmp_game_rewards) < avg_rewards_in_buffer * agent.buffer_reward_threshold:
                continue
            for i in range(game_rewards_np.shape[0]):
                observation_strings, action_candidate_list, chosen_indices, _triplets, prev_action_strings = transition_cache[i]
                is_final = True
                if still_running_mask_np[i][b] != 0:
                    is_final = False
                agent.dqn_memory.add(observation_strings[b], prev_action_strings[b], action_candidate_list[b],
                                     chosen_indices[b], _triplets[b], command_rewards_pt[i][b],
                                     graph_rewards_pt[i][b], count_rewards_pt[i][b], is_final)
                if still_running_mask_np[i][b] == 0:
                    break
            if _need_pad:
                observation_strings, action_candidate_list, chosen_indices, _triplets, prev_action_strings = transition_cache[-1]
                agent.dqn_memory.add(observation_strings[b], prev_action_strings[b], action_candidate_list[b],
                                     chosen_indices[b], _triplets[b], command_rewards_pt[-1][b] * 0.0,
                                     graph_rewards_pt[-1][b] * 0.0, count_rewards_pt[-1][b] * 0.0, True)

        for b in range(batch_size):
            running_avg_game_points.push(np.sum(game_points_np, 0)[b])
            game_max_score_np = np.array(game_max_score_list, dtype="float32")
            running_avg_game_points_normalized.push((np.sum(game_points_np, 0) / game_max_score_np)[b])
            running_avg_game_steps.push(np.sum(still_running_mask_np, 0)[b])
            running_avg_game_rewards.push(np.sum(game_rewards_np, 0)[b])
            running_avg_graph_rewards.push(np.sum(graph_rewards_np, 0)[b])
            running_avg_count_rewards.push(np.sum(count_rewards_np, 0)[b])

        # finish game
        agent.finish_of_episode(episode_no, batch_size)
        episode_no += batch_size

        if episode_no < agent.learn_start_from_this_episode:
            continue
        if agent.report_frequency == 0 or (episode_no % agent.report_frequency > (episode_no - batch_size) % agent.report_frequency):
            continue

        time_2 = datetime.datetime.now()
        print("Episode: {:3d} | time spent: {:s} | dqn loss: {:2.3f} | game points: {:2.3f} | "
              "normalized game points: {:2.3f} | game rewards: {:2.3f} | graph rewards: {:2.3f} | "
              "count rewards: {:2.3f} | used steps: {:2.3f}".format(
                  episode_no, str(time_2 - time_1).rsplit(".")[0],
                  running_avg_dqn_loss.get_avg(),
                  running_avg_game_points.get_avg(),
                  running_avg_game_points_normalized.get_avg(),
                  running_avg_game_rewards.get_avg(),
                  running_avg_graph_rewards.get_avg(),
                  running_avg_count_rewards.get_avg(),
                  running_avg_game_steps.get_avg()))
        print(game_name_list[0] + ": " + " | ".join(print_actions))

        # evaluate
        curr_train_performance = running_avg_game_points_normalized.get_avg()
        eval_game_points, eval_game_points_normalized, eval_game_step = 0.0, 0.0, 0.0
        eval_command_generation_f1 = 0.0
        if agent.run_eval:
            eval_game_points, eval_game_points_normalized, eval_game_step, eval_command_generation_f1, detailed_scores = evaluate.evaluate_belief_mode(
                eval_env, agent, num_eval_game)
            curr_eval_performance = eval_game_points_normalized
            curr_performance = curr_eval_performance
            if curr_eval_performance > best_eval_performance_so_far:
                best_eval_performance_so_far = curr_eval_performance
                agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
            elif curr_eval_performance == best_eval_performance_so_far:
                if curr_eval_performance > 0.0:
                    agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
                else:
                    if curr_train_performance >= best_train_performance_so_far:
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
        else:
            curr_eval_performance = 0.0
            detailed_scores = ""
            curr_performance = curr_train_performance
            if curr_train_performance >= best_train_performance_so_far:
                agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")

        # update best train performance
        if curr_train_performance >= best_train_performance_so_far:
            best_train_performance_so_far = curr_train_performance

        if prev_performance <= curr_performance:
            i_am_patient = 0
        else:
            i_am_patient += 1
        prev_performance = curr_performance

        # if patience has run out, resume from the saved checkpoint
        if agent.patience > 0 and i_am_patient >= agent.patience:
            if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
                print('reload from a good checkpoint...')
                agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt",
                                            load_partial_graph=False)
                agent.update_target_net()
                i_am_patient = 0

        if running_avg_game_points_normalized.get_avg() >= 0.95:
            perfect_training += 1
        else:
            perfect_training = 0

        # plot using visdom
        if config["general"]["visdom"]:
            viz_game_rewards.append(running_avg_game_rewards.get_avg())
            viz_game_points.append(running_avg_game_points.get_avg())
            viz_game_points_normalized.append(running_avg_game_points_normalized.get_avg())
            viz_graph_rewards.append(running_avg_graph_rewards.get_avg())
            viz_count_rewards.append(running_avg_count_rewards.get_avg())
            viz_step.append(running_avg_game_steps.get_avg())
            viz_dqn_loss.append(running_avg_dqn_loss.get_avg())
            viz_eval_game_points.append(eval_game_points)
            viz_eval_game_points_normalized.append(eval_game_points_normalized)
            viz_eval_step.append(eval_game_step)
            viz_x = np.arange(len(viz_game_rewards)).tolist()

            if reward_win is None:
                reward_win = viz.line(X=viz_x, Y=viz_game_rewards,
                                      opts=dict(title=agent.experiment_tag + "_game_rewards"),
                                      name="game_rewards")
                viz.line(X=viz_x, Y=viz_graph_rewards,
                         opts=dict(title=agent.experiment_tag + "_graph_rewards"),
                         win=reward_win, update='append', name="graph_rewards")
                viz.line(X=viz_x, Y=viz_count_rewards,
                         opts=dict(title=agent.experiment_tag + "_count_rewards"),
                         win=reward_win, update='append', name="count_rewards")
                viz.line(X=viz_x, Y=viz_game_points,
                         opts=dict(title=agent.experiment_tag + "_game_points"),
                         win=reward_win, update='append', name="game_points")
                viz.line(X=viz_x, Y=viz_game_points_normalized,
                         opts=dict(title=agent.experiment_tag + "_game_points_normalized"),
                         win=reward_win, update='append', name="game_points_normalized")
            else:
                viz.line(X=[len(viz_game_rewards) - 1], Y=[viz_game_rewards[-1]],
                         opts=dict(title=agent.experiment_tag + "_game_rewards"),
                         win=reward_win, update='append', name="game_rewards")
                viz.line(X=[len(viz_graph_rewards) - 1], Y=[viz_graph_rewards[-1]],
                         opts=dict(title=agent.experiment_tag + "_graph_rewards"),
                         win=reward_win, update='append', name="graph_rewards")
                viz.line(X=[len(viz_count_rewards) - 1], Y=[viz_count_rewards[-1]],
                         opts=dict(title=agent.experiment_tag + "_count_rewards"),
                         win=reward_win, update='append', name="count_rewards")
                viz.line(X=[len(viz_game_points) - 1], Y=[viz_game_points[-1]],
                         opts=dict(title=agent.experiment_tag + "_game_points"),
                         win=reward_win, update='append', name="game_points")
                viz.line(X=[len(viz_game_points_normalized) - 1], Y=[viz_game_points_normalized[-1]],
                         opts=dict(title=agent.experiment_tag + "_game_points_normalized"),
                         win=reward_win, update='append', name="game_points_normalized")

            if step_win is None:
                step_win = viz.line(X=viz_x, Y=viz_step,
                                    opts=dict(title=agent.experiment_tag + "_step"),
                                    name="step")
            else:
                viz.line(X=[len(viz_step) - 1], Y=[viz_step[-1]],
                         opts=dict(title=agent.experiment_tag + "_step"),
                         win=step_win, update='append', name="step")

            if dqn_loss_win is None:
                dqn_loss_win = viz.line(X=viz_x, Y=viz_dqn_loss,
                                        opts=dict(title=agent.experiment_tag + "_dqn_loss"),
                                        name="dqn loss")
            else:
                viz.line(X=[len(viz_dqn_loss) - 1], Y=[viz_dqn_loss[-1]],
                         opts=dict(title=agent.experiment_tag + "_dqn_loss"),
                         win=dqn_loss_win, update='append', name="dqn loss")

            if eval_game_points_win is None:
                eval_game_points_win = viz.line(X=viz_x, Y=viz_eval_game_points,
                                                opts=dict(title=agent.experiment_tag + "_eval_game_points"),
                                                name="eval game points")
                viz.line(X=viz_x, Y=viz_eval_game_points_normalized,
                         opts=dict(title=agent.experiment_tag + "_eval_game_points_normalized"),
                         win=eval_game_points_win, update='append', name="eval_game_points_normalized")
            else:
                viz.line(X=[len(viz_eval_game_points) - 1], Y=[viz_eval_game_points[-1]],
                         opts=dict(title=agent.experiment_tag + "_eval_game_points"),
                         win=eval_game_points_win, update='append', name="eval game_points")
                viz.line(X=[len(viz_eval_game_points_normalized) - 1], Y=[viz_eval_game_points_normalized[-1]],
                         opts=dict(title=agent.experiment_tag + "_eval_game_points_normalized"),
                         win=eval_game_points_win, update='append', name="eval_game_points_normalized")

            if eval_step_win is None:
                eval_step_win = viz.line(X=viz_x, Y=viz_eval_step,
                                         opts=dict(title=agent.experiment_tag + "_eval_step"),
                                         name="eval step")
            else:
                viz.line(X=[len(viz_eval_step) - 1], Y=[viz_eval_step[-1]],
                         opts=dict(title=agent.experiment_tag + "_eval_step"),
                         win=eval_step_win, update='append', name="eval step")

        # write accuracies down into file
        _s = json.dumps({
            "time spent": str(time_2 - time_1).rsplit(".")[0],
            "dqn loss": str(running_avg_dqn_loss.get_avg()),
            "train game points": str(running_avg_game_points.get_avg()),
            "train normalized game points": str(running_avg_game_points_normalized.get_avg()),
            "train game rewards": str(running_avg_game_rewards.get_avg()),
            "train graph rewards": str(running_avg_graph_rewards.get_avg()),
            "train count rewards": str(running_avg_count_rewards.get_avg()),
            "train steps": str(running_avg_game_steps.get_avg()),
            "eval game points": str(eval_game_points),
            "eval normalized game points": str(eval_game_points_normalized),
            "eval command generation f1": str(eval_command_generation_f1),
            "eval steps": str(eval_game_step),
            "detailed scores": detailed_scores
        })
        with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
            outfile.write(_s + '\n')
            outfile.flush()

        if curr_performance == 1.0 and curr_train_performance >= 0.95:
            break
        if perfect_training >= 3:
            break
def train():
    time_1 = datetime.datetime.now()
    with open("config.yaml") as reader:
        config = yaml.safe_load(reader)
    if config['general']['dataset'] == "squad":
        env = GamifiedSquad(config)
    else:
        env = GamifiedNewsQA(config)
    env.split_reset("train")
    agent = Agent()

    # visdom
    viz = visdom.Visdom()
    plt_win = None
    eval_plt_win = None
    plt_q_value_win = None
    plt_steps_win = None
    eval_plt_steps_win = None
    viz_avg_correct_state_acc, viz_avg_qa_acc = [], []
    viz_avg_correct_state_q_value = []
    viz_eval_correct_state_acc, viz_eval_qa_acc, viz_eval_steps = [], [], []
    viz_avg_steps = []

    step_in_total = 0
    episode_no = 0
    running_avg_qa_acc = HistoryScoreCache(capacity=50)
    running_avg_correct_state_acc = HistoryScoreCache(capacity=50)
    running_avg_qa_loss = HistoryScoreCache(capacity=50)
    running_avg_correct_state_loss = HistoryScoreCache(capacity=50)
    running_avg_correct_state_q_value = HistoryScoreCache(capacity=50)
    running_avg_steps = HistoryScoreCache(capacity=50)

    output_dir, data_dir = ".", "."
    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_qa_acc_so_far = 0.0

    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt")
            agent.update_target_net()
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag + ".pt")
            agent.update_target_net()

    while (True):
        if episode_no > agent.max_episode:
            break
        np.random.seed(episode_no)
        env.seed(episode_no)
        obs, infos = env.reset()
        print("====================================================================================", episode_no)
        print("-- Q: %s" % (infos[0]["q"].encode('utf-8')))
        print("-- A: %s" % (infos[0]["a"][0].encode('utf-8')))

        agent.train()
        agent.init(obs, infos)
        quest_list = agent.get_game_quest_info(infos)
        input_quest, input_quest_char, quest_id_list = agent.get_agent_inputs(quest_list)
        tmp_replay_buffer = []
        print_cmds = []
        batch_size = len(obs)

        act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode
        for step_no in range(agent.max_nb_steps_per_episode):
            # generate commands
            if agent.noisy_net:
                agent.reset_noise()  # Draw a new set of noisy weights
            commands, replay_info = agent.act(obs, infos, input_quest, input_quest_char,
                                              quest_id_list, random=act_randomly)
            obs, infos = env.step(commands)

            if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0:
                agent.reset_noise()  # Draw a new set of noisy weights

            if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0:
                interaction_loss, interaction_q_value = agent.update_interaction()
                if interaction_loss is not None:
                    running_avg_correct_state_loss.push(interaction_loss)
                    running_avg_correct_state_q_value.push(interaction_q_value)
                qa_loss = agent.update_qa()
                if qa_loss is not None:
                    running_avg_qa_loss.push(qa_loss)

            step_in_total += 1
            still_running = generic.to_np(replay_info[-1])
            print_cmds.append(commands[0] if still_running[0] else "--")

            # force stopping
            if step_no == agent.max_nb_steps_per_episode - 1:
                replay_info[-1] = torch.zeros_like(replay_info[-1])
            tmp_replay_buffer.append(replay_info)
            if np.sum(still_running) == 0:
                break

        print(" / ".join(print_cmds).encode('utf-8'))

        # The agent has exhausted all steps, now answer the question.
        chosen_head_tails = agent.answer_question_act(agent.naozi.get(), quest_list)  # batch
        chosen_head_tails_np = generic.to_np(chosen_head_tails)
        chosen_answer_strings = generic.get_answer_strings(agent.naozi.get(), chosen_head_tails_np)
        answer_strings = [item["a"] for item in infos]

        qa_reward_np = generic.get_qa_reward(chosen_answer_strings, answer_strings)
        correct_state_reward_np = generic.get_sufficient_info_reward(agent.naozi.get(), answer_strings)
        correct_state_reward = generic.to_pt(correct_state_reward_np, enable_cuda=agent.use_cuda, type='float')  # batch

        # push qa experience into the qa replay buffer
        for b in range(batch_size):  # data points in batch
            is_prior = qa_reward_np[b] > agent.qa_reward_prior_threshold * agent.qa_replay_memory.avg_rewards()
            # if the agent is not in the correct state, do not push it into the replay buffer
            if np.mean(correct_state_reward_np[b]) == 0.0:
                continue
            agent.qa_replay_memory.push(is_prior, qa_reward_np[b], agent.naozi.get(b), quest_list[b], answer_strings[b])

        # small positive reward whenever it answers the question correctly
        masks_np = [generic.to_np(item[-1]) for item in tmp_replay_buffer]
        command_rewards_np = []
        for i in range(len(tmp_replay_buffer)):
            if i == len(tmp_replay_buffer) - 1:
                r = correct_state_reward * tmp_replay_buffer[i][-1]
                r_np = correct_state_reward_np * masks_np[i]
            else:
                # give the reward only at that one game step, not at all steps
                r = correct_state_reward * (tmp_replay_buffer[i][-1] - tmp_replay_buffer[i + 1][-1])
                r_np = correct_state_reward_np * (masks_np[i] - masks_np[i + 1])
            tmp_replay_buffer[i].append(r)
            command_rewards_np.append(r_np)
        command_rewards_np = np.array(command_rewards_np)
        print(command_rewards_np[:, 0])

        # push experience into the replay buffer
        for b in range(len(correct_state_reward_np)):
            is_prior = np.sum(command_rewards_np, 0)[b] > 0.0
            for i in range(len(tmp_replay_buffer)):
                batch_description_list, batch_chosen_indices, batch_chosen_ctrlf_indices, _, batch_rewards = tmp_replay_buffer[i]
                is_final = True
                if masks_np[i][b] != 0:
                    is_final = False
                agent.replay_memory.push(is_prior, batch_description_list[b], quest_list[b],
                                         batch_chosen_indices[b], batch_chosen_ctrlf_indices[b],
                                         batch_rewards[b], is_final)
                if masks_np[i][b] == 0.0:
                    break

        qa_acc = np.mean(qa_reward_np)
        correct_state_acc = np.mean(correct_state_reward_np)
        step_masks_np = np.sum(np.array(masks_np), 0)  # batch
        for i in range(len(qa_reward_np)):
            # if the answer is totally wrong, we assume it used all steps
            if qa_reward_np[i] == 0.0:
                step_masks_np[i] = agent.max_nb_steps_per_episode
        used_steps = np.mean(step_masks_np)

        running_avg_qa_acc.push(qa_acc)
        running_avg_correct_state_acc.push(correct_state_acc)
        running_avg_steps.push(used_steps)
        print_rewards = np.sum(np.mean(command_rewards_np, -1))

        obs_string = agent.naozi.get(0)
        print("-- OBS: %s" % (obs_string.encode('utf-8')))
        print("-- PRED: %s" % (chosen_answer_strings[0].encode('utf-8')))

        # finish game
        agent.finish_of_episode(episode_no, batch_size)
        episode_no += batch_size

        time_2 = datetime.datetime.now()
        print("Episode: {:3d} | time spent: {:s} | interaction loss: {:2.3f} | interaction qvalue: {:2.3f} | "
              "qa loss: {:2.3f} | rewards: {:2.3f} | qa acc: {:2.3f}/{:2.3f} | "
              "sufficient info: {:2.3f}/{:2.3f} | used steps: {:2.3f}".format(
                  episode_no, str(time_2 - time_1).rsplit(".")[0],
                  running_avg_correct_state_loss.get_avg(),
                  running_avg_correct_state_q_value.get_avg(),
                  running_avg_qa_loss.get_avg(), print_rewards,
                  qa_acc, running_avg_qa_acc.get_avg(),
                  correct_state_acc, running_avg_correct_state_acc.get_avg(),
                  running_avg_steps.get_avg()))

        if episode_no < agent.learn_start_from_this_episode:
            continue
        if agent.report_frequency == 0 or (episode_no % agent.report_frequency > (episode_no - batch_size) % agent.report_frequency):
            continue

        eval_qa_acc, eval_correct_state_acc, eval_used_steps = 0.0, 0.0, 0.0
        # evaluate
        if agent.run_eval:
            eval_qa_acc, eval_correct_state_acc, eval_used_steps = evaluate.evaluate(env, agent, "valid")
            env.split_reset("train")
            # if we run eval, then save the model by eval accuracy
            if agent.save_frequency > 0 and (episode_no % agent.report_frequency <= (episode_no - batch_size) % agent.report_frequency) and eval_qa_acc > best_qa_acc_so_far:
                best_qa_acc_so_far = eval_qa_acc
                agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
        # save model
        elif agent.save_frequency > 0 and (episode_no % agent.report_frequency <= (episode_no - batch_size) % agent.report_frequency):
            if running_avg_qa_acc.get_avg() > best_qa_acc_so_far:
                best_qa_acc_so_far = running_avg_qa_acc.get_avg()
                agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")

        # plot using visdom
        viz_avg_correct_state_acc.append(running_avg_correct_state_acc.get_avg())
        viz_avg_qa_acc.append(running_avg_qa_acc.get_avg())
        viz_avg_correct_state_q_value.append(running_avg_correct_state_q_value.get_avg())
        viz_eval_correct_state_acc.append(eval_correct_state_acc)
        viz_eval_qa_acc.append(eval_qa_acc)
        viz_eval_steps.append(eval_used_steps)
        viz_avg_steps.append(running_avg_steps.get_avg())
        viz_x = np.arange(len(viz_avg_correct_state_acc)).tolist()

        if plt_win is None:
            plt_win = viz.line(X=viz_x, Y=viz_avg_correct_state_acc,
                               opts=dict(title=agent.experiment_tag + "_train"),
                               name="sufficient info")
            viz.line(X=viz_x, Y=viz_avg_qa_acc,
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win, update='append', name="qa")
        else:
            viz.line(X=[len(viz_avg_correct_state_acc) - 1], Y=[viz_avg_correct_state_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win, update='append', name="sufficient info")
            viz.line(X=[len(viz_avg_qa_acc) - 1], Y=[viz_avg_qa_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win, update='append', name="qa")

        if plt_q_value_win is None:
            plt_q_value_win = viz.line(X=viz_x, Y=viz_avg_correct_state_q_value,
                                       opts=dict(title=agent.experiment_tag + "_train_q_value"),
                                       name="sufficient info")
        else:
            viz.line(X=[len(viz_avg_correct_state_q_value) - 1], Y=[viz_avg_correct_state_q_value[-1]],
                     opts=dict(title=agent.experiment_tag + "_train_q_value"),
                     win=plt_q_value_win, update='append', name="sufficient info")

        if plt_steps_win is None:
            plt_steps_win = viz.line(X=viz_x, Y=viz_avg_steps,
                                     opts=dict(title=agent.experiment_tag + "_train_step"),
                                     name="used steps")
        else:
            viz.line(X=[len(viz_avg_steps) - 1], Y=[viz_avg_steps[-1]],
                     opts=dict(title=agent.experiment_tag + "_train_step"),
                     win=plt_steps_win, update='append', name="used steps")

        if eval_plt_win is None:
            eval_plt_win = viz.line(X=viz_x, Y=viz_eval_correct_state_acc,
                                    opts=dict(title=agent.experiment_tag + "_eval"),
                                    name="sufficient info")
            viz.line(X=viz_x, Y=viz_eval_qa_acc,
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win, update='append', name="qa")
        else:
            viz.line(X=[len(viz_eval_correct_state_acc) - 1], Y=[viz_eval_correct_state_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win, update='append', name="sufficient info")
            viz.line(X=[len(viz_eval_qa_acc) - 1], Y=[viz_eval_qa_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win, update='append', name="qa")

        if eval_plt_steps_win is None:
            eval_plt_steps_win = viz.line(X=viz_x, Y=viz_eval_steps,
                                          opts=dict(title=agent.experiment_tag + "_eval_step"),
                                          name="used steps")
        else:
            viz.line(X=[len(viz_avg_steps) - 1], Y=[viz_eval_steps[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval_step"),
                     win=eval_plt_steps_win, update='append', name="used steps")

        # write accuracies down into file
        _s = json.dumps({
            "time spent": str(time_2 - time_1).rsplit(".")[0],
            "sufficient info": str(running_avg_correct_state_acc.get_avg()),
            "qa": str(running_avg_qa_acc.get_avg()),
            "sufficient qvalue": str(running_avg_correct_state_q_value.get_avg()),
            "eval sufficient info": str(eval_correct_state_acc),
            "eval qa": str(eval_qa_acc),
            "eval steps": str(eval_used_steps),
            "used steps": str(running_avg_steps.get_avg())
        })
        with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
            outfile.write(_s + '\n')
            outfile.flush()
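# A small numpy illustration (hypothetical masks) of the reward shaping used in the
# training loop above: multiplying the episode-level reward by mask[t] - mask[t+1]
# makes the per-step reward non-zero only at the single step where the agent stopped
# interacting, rather than at every step.
import numpy as np

masks = np.array([1.0, 1.0, 1.0, 0.0])   # the agent interacts for 3 steps, then stops
final_reward = 2.0
step_rewards = [final_reward * (masks[i] - masks[i + 1]) for i in range(len(masks) - 1)]
step_rewards.append(final_reward * masks[-1])
# -> [0.0, 0.0, 2.0, 0.0]: the full reward is credited to the last interacting step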
def act(self, obs, infos, input_quest, input_quest_char, quest_id_list, random=False): with torch.no_grad(): if self.mode == "eval": return self.act_greedy(obs, infos, input_quest, input_quest_char, quest_id_list) if random: return self.act_random(obs, infos, input_quest, input_quest_char, quest_id_list) batch_size = len(obs) # update inputs for answerer if self.not_finished_yet is None: self.not_finished_yet = np.ones((len(obs), ), dtype="float32") self.naozi.push_batch(copy.copy(obs)) else: for i in range(batch_size): if self.not_finished_yet[i] == 1.0: self.naozi.push_one(i, copy.copy(obs[i])) description_list = self.naozi.get() input_description, input_description_char, description_id_list = self.get_agent_inputs( description_list) ctrlf_word_mask, ctrlf_word_ids = self.get_word_mask( quest_id_list, description_id_list) # generate commands for one game step, epsilon greedy is applied, i.e., # there is epsilon of chance to generate random commands action_rank, ctrlf_rank = self.get_ranks( input_description, input_description_char, input_quest, input_quest_char, ctrlf_word_mask, use_model="online") # list of batch x vocab action_indices_maxq = self.choose_maxQ_command(action_rank) action_indices_random = self.choose_random_command(action_rank) ctrlf_indices_maxq = self.choose_maxQ_command( ctrlf_rank, ctrlf_word_mask) ctrlf_indices_random = self.choose_random_command( ctrlf_rank, ctrlf_word_ids) # random number for epsilon greedy rand_num = np.random.uniform(low=0.0, high=1.0, size=(input_description.size(0), 1)) less_than_epsilon = (rand_num < self.epsilon).astype( "float32") # batch greater_than_epsilon = 1.0 - less_than_epsilon less_than_epsilon = to_pt(less_than_epsilon, self.use_cuda, type='long') greater_than_epsilon = to_pt(greater_than_epsilon, self.use_cuda, type='long') chosen_indices = less_than_epsilon * action_indices_random + greater_than_epsilon * action_indices_maxq chosen_ctrlf_indices = less_than_epsilon * ctrlf_indices_random + greater_than_epsilon * ctrlf_indices_maxq chosen_strings = self.generate_commands(chosen_indices, chosen_ctrlf_indices) for i in range(batch_size): if chosen_strings[i] == "stop": self.not_finished_yet[i] = 0.0 # info for replay memory for i in range(batch_size): if self.prev_actions[-1][i] == "stop": self.prev_step_is_still_interacting[i] = 0.0 # previous step is still interacting, this is because DQN requires one step extra computation replay_info = [ description_list, chosen_indices, chosen_ctrlf_indices, to_pt(self.prev_step_is_still_interacting, self.use_cuda, "float") ] # cache new info in current game step into caches self.prev_actions.append(chosen_strings) return chosen_strings, replay_info
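The epsilon-greedy step in act() reduces to a per-example Bernoulli mask over the batch: each game independently keeps either its random index or its argmax index. A minimal CPU-only sketch of that arithmetic (the helper name is mine; the real code routes the mask through to_pt and applies it to the action and ctrl+f indices separately):

import torch

def epsilon_greedy_mix(maxq_indices, random_indices, epsilon):
    # maxq_indices / random_indices: LongTensors of shape (batch, 1).
    # With probability epsilon keep the random index, otherwise the argmax one.
    take_random = (torch.rand(maxq_indices.size(0), 1) < epsilon).long()
    return take_random * random_indices + (1 - take_random) * maxq_indices

maxq = torch.tensor([[3], [7], [1]])
rand = torch.tensor([[5], [2], [9]])
print(epsilon_greedy_mix(maxq, rand, epsilon=0.0))  # always the argmax indices
print(epsilon_greedy_mix(maxq, rand, epsilon=1.0))  # always the random indices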
def answer_question(self, input_observation, input_observation_char, observation_id_list, input_quest, input_quest_char, use_model="online"): # first pad answerer_input, and get the mask model = self.online_net if use_model == "online" else self.target_net batch_size = len(observation_id_list) max_length = input_observation.size(1) mask = compute_mask(input_observation) # batch x obs_len # noun mask for location question if self.question_type in ["location"]: location_mask = [] for i in range(batch_size): m = [1 for item in observation_id_list[i]] location_mask.append(m) location_mask = pad_sequences(location_mask, maxlen=max_length, dtype="float32") location_mask = to_pt(location_mask, enable_cuda=self.use_cuda, type='float') assert mask.size() == location_mask.size() mask = mask * location_mask match_representation_sequence = self.get_match_representations( input_observation, input_observation_char, input_quest, input_quest_char, use_model=use_model) pred = model.answer_question(match_representation_sequence, mask) # batch x vocab or batch x 2 # attention sum: # sometimes certain word appears multiple times in the observation, # thus we need to merge them together before doing further computations # ------- but # if answer type is not pointing, we just use a pre-defined mapping # that maps 0/1 to their positions in vocab if self.answer_type == "2 way": observation_id_list = [] max_length = 2 for i in range(batch_size): observation_id_list.append( [self.word2id["0"], self.word2id["1"]]) observation = to_pt( pad_sequences(observation_id_list, maxlen=max_length).astype('int32'), self.use_cuda) vocab_distribution = np.zeros( (batch_size, len(self.word_vocab))) # batch x vocab vocab_distribution = to_pt(vocab_distribution, self.use_cuda, type='float') vocab_distribution = vocab_distribution.scatter_add_( 1, observation, pred) # batch x vocab non_zero_words = [] for i in range(batch_size): non_zero_words.append(list(set(observation_id_list[i]))) vocab_mask = torch.ne(vocab_distribution, 0).float() return vocab_distribution, non_zero_words, vocab_mask
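The "attention sum" step above merges the scores of a word that appears several times in the observation into a single vocabulary entry via scatter_add_. A small standalone illustration of just that operation, using toy ids and scores rather than the model's actual outputs:

import torch

vocab_size = 10
token_ids = torch.tensor([[3, 5, 3, 7]])             # batch x obs_len; word 3 occurs twice
token_scores = torch.tensor([[0.1, 0.4, 0.2, 0.3]])  # batch x obs_len; pointer distribution
vocab_distribution = torch.zeros(1, vocab_size)      # batch x vocab
vocab_distribution.scatter_add_(1, token_ids, token_scores)
print(vocab_distribution[0, 3])                      # tensor(0.3000): 0.1 + 0.2 merged
vocab_mask = torch.ne(vocab_distribution, 0).float() # 1 for words actually present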
def get_dqn_loss(self): """ Update neural model in agent. In this example we follow algorithm of updating model in dqn with replay memory. """ if len(self.command_generation_replay_memory) < self.replay_batch_size: return None data = self.command_generation_replay_memory.get_batch( self.replay_batch_size, self.multi_step) if data is None: return None obs_list, quest_list, possible_words_list, chosen_indices, rewards, next_obs_list, next_possible_words_list, actual_n_list = data batch_size = len(actual_n_list) input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list) input_observation, input_observation_char, _ = self.get_agent_inputs( obs_list) next_input_observation, next_input_observation_char, _ = self.get_agent_inputs( next_obs_list) possible_words, next_possible_words = [], [] for i in range(3): possible_words.append([item[i] for item in possible_words_list]) next_possible_words.append( [item[i] for item in next_possible_words_list]) local_word_masks = [ to_pt(item, self.use_cuda, type="float") for item in self.get_local_word_masks(possible_words) ] next_local_word_masks = [ to_pt(item, self.use_cuda, type="float") for item in self.get_local_word_masks(next_possible_words) ] action_ranks = self.get_ranks( input_observation, input_observation_char, input_quest, input_quest_char, local_word_masks, use_model="online" ) # list of batch x vocab or list of batch x vocab x atoms # ps_a word_qvalues = [ ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for w_rank, idx in zip(action_ranks, chosen_indices) ] # list of batch or list of batch x atoms q_value = torch.mean(torch.stack(word_qvalues, -1), -1) # batch or batch x atoms # log_ps_a log_q_value = torch.log(q_value) # batch or batch x atoms with torch.no_grad(): if self.noisy_net: self.target_net.reset_noise() # Sample new target net noise if self.double_dqn: # pns Probabilities p(s_t+n, ·; θonline) next_action_ranks = self.get_ranks(next_input_observation, next_input_observation_char, input_quest, input_quest_char, next_local_word_masks, use_model="online") # list of batch x vocab or list of batch x vocab x atoms # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))] next_word_indices = self.choose_maxQ_command( next_action_ranks, next_local_word_masks) # list of batch x 1 # pns # Probabilities p(s_t+n, ·; θtarget) next_action_ranks = self.get_ranks( next_input_observation, next_input_observation_char, input_quest, input_quest_char, next_local_word_masks, use_model="target" ) # batch x vocab or list of batch x vocab x atoms # pns_a # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget) next_word_qvalues = [ ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for w_rank, idx in zip(next_action_ranks, next_word_indices) ] # list of batch or list of batch x atoms else: # pns Probabilities p(s_t+n, ·; θonline) next_action_ranks = self.get_ranks(next_input_observation, next_input_observation_char, input_quest, input_quest_char, next_local_word_masks, use_model="target") # list of batch x vocab or list of batch x vocab x atoms next_word_indices = self.choose_maxQ_command( next_action_ranks, next_local_word_masks) # list of batch x 1 next_word_qvalues = [ ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for w_rank, idx in zip(next_action_ranks, next_word_indices) ] # list of batch or list of batch x atoms next_q_value = torch.mean(torch.stack(next_word_qvalues, -1), -1) # batch or batch x atoms # Compute Tz (Bellman operator T applied to z) discount = 
to_pt((np.ones_like(actual_n_list) * self.discount_gamma)**actual_n_list, self.use_cuda, type="float") if not self.use_distributional: rewards = rewards + next_q_value * discount # batch loss = F.smooth_l1_loss(q_value, rewards) return loss with torch.no_grad(): Tz = rewards.unsqueeze( -1) + discount.unsqueeze(-1) * self.support.unsqueeze( 0) # Tz = R^n + (γ^n)z (accounting for terminal states) Tz = Tz.clamp(min=self.v_min, max=self.v_max) # Clamp between supported values # Compute L2 projection of Tz onto fixed support z b = (Tz - self.v_min) / self.delta_z # b = (Tz - Vmin) / Δz l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64) # Fix disappearing probability mass when l = b = u (b is int) l[(u > 0) * (l == u)] -= 1 u[(l < (self.atoms - 1)) * (l == u)] += 1 # Distribute probability of Tz m = torch.zeros(batch_size, self.atoms).float() if self.use_cuda: m = m.cuda() offset = torch.linspace(0, ((batch_size - 1) * self.atoms), batch_size).unsqueeze(1).expand( batch_size, self.atoms).long() if self.use_cuda: offset = offset.cuda() m.view(-1).index_add_( 0, (l + offset).view(-1), (next_q_value * (u.float() - b)).view(-1)) # m_l = m_l + p(s_t+n, a*)(u - b) m.view(-1).index_add_( 0, (u + offset).view(-1), (next_q_value * (b - l.float())).view(-1)) # m_u = m_u + p(s_t+n, a*)(b - l) loss = -torch.sum( m * log_q_value, 1) # Cross-entropy loss (minimises DKL(m||p(s_t, a_t))) loss = torch.mean(loss) return loss
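The distributional branch above is the standard categorical (C51-style) projection of Tz = r + gamma^n * z onto the fixed support. A self-contained restatement of that projection as a function, assuming next_probs are the target-network probabilities of the chosen next actions and support is the fixed atom grid (self.support above); the function name is mine:

import torch

def project_distribution(next_probs, rewards, discount, support, v_min, v_max):
    # next_probs: (batch, atoms), rewards/discount: (batch,), support: (atoms,)
    batch_size, atoms = next_probs.size()
    delta_z = (v_max - v_min) / (atoms - 1)
    # Tz = R^n + (gamma^n) z, clamped to the supported value range
    Tz = rewards.unsqueeze(-1) + discount.unsqueeze(-1) * support.unsqueeze(0)
    Tz = Tz.clamp(min=v_min, max=v_max)
    # position of each projected atom on the fixed support
    b = (Tz - v_min) / delta_z
    l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
    # keep probability mass when b lands exactly on an atom (l == u)
    l[(u > 0) * (l == u)] -= 1
    u[(l < (atoms - 1)) * (l == u)] += 1
    # distribute probability of Tz onto the neighbouring atoms l and u
    m = torch.zeros(batch_size, atoms)
    offset = torch.linspace(0, (batch_size - 1) * atoms,
                            batch_size).long().unsqueeze(1).expand(batch_size, atoms)
    m.view(-1).index_add_(0, (l + offset).view(-1),
                          (next_probs * (u.float() - b)).view(-1))
    m.view(-1).index_add_(0, (u + offset).view(-1),
                          (next_probs * (b - l.float())).view(-1))
    return m

# the cross-entropy loss then follows as: loss = -(m * log_probs).sum(1).mean()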
def act(self, obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words, random=False): """ Acts upon the current list of observations. One text command must be returned for each observation. """ with torch.no_grad(): if self.mode == "eval": return self.act_greedy(obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words) if random: return self.act_random(obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words) batch_size = len(obs) local_word_masks_np = self.get_local_word_masks(possible_words) local_word_masks = [ to_pt(item, self.use_cuda, type="float") for item in local_word_masks_np ] # generate commands for one game step, epsilon greedy is applied, i.e., # there is epsilon of chance to generate random commands action_ranks = self.get_ranks( input_observation, input_observation_char, input_quest, input_quest_char, local_word_masks, use_model="online") # list of batch x vocab word_indices_maxq = self.choose_maxQ_command( action_ranks, local_word_masks) word_indices_random = self.choose_random_command( batch_size, len(self.word_vocab), possible_words) # random number for epsilon greedy rand_num = np.random.uniform(low=0.0, high=1.0, size=(batch_size, )) less_than_epsilon = (rand_num < self.epsilon).astype( "float32") # batch greater_than_epsilon = 1.0 - less_than_epsilon less_than_epsilon = to_pt(less_than_epsilon, self.use_cuda, type='long') greater_than_epsilon = to_pt(greater_than_epsilon, self.use_cuda, type='long') chosen_indices = [ less_than_epsilon * idx_random + greater_than_epsilon * idx_maxq for idx_random, idx_maxq in zip(word_indices_random, word_indices_maxq) ] chosen_strings = self.get_chosen_strings(chosen_indices) for i in range(batch_size): if chosen_strings[i] == "wait": self.not_finished_yet[i] = 0.0 # info for replay memory for i in range(batch_size): if self.prev_actions[-1][i] == "wait": self.prev_step_is_still_interacting[i] = 0.0 # previous step is still interacting, this is because DQN requires one step extra computation replay_info = [ chosen_indices, to_pt(self.prev_step_is_still_interacting, self.use_cuda, "float") ] # cache new info in current game step into caches self.prev_actions.append(chosen_strings) return chosen_strings, replay_info
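choose_maxQ_command above has to take an argmax restricted to the words allowed at each slot. One common way to implement such a masked argmax (a sketch of the idea, not necessarily the repository's exact implementation) is to shift the Q-values so every allowed score is strictly positive and then multiply by the mask:

import torch

def masked_argmax(word_ranks, word_mask):
    # word_ranks: (batch, vocab) Q-values; word_mask: (batch, vocab) with 1.0
    # for words that may be generated at this slot, 0.0 otherwise.
    # After shifting, every allowed entry is > 0 and every disallowed entry is
    # exactly 0, so the argmax is always taken among allowed words.
    shifted = word_ranks - word_ranks.min(dim=-1, keepdim=True)[0] + 1e-2
    masked = shifted * word_mask
    return masked.argmax(dim=-1, keepdim=True)  # (batch, 1)

ranks = torch.tensor([[-3.0, 5.0, 2.0]])
mask = torch.tensor([[1.0, 0.0, 1.0]])          # word 1 not allowed here
print(masked_argmax(ranks, mask))               # tensor([[2]])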
def get_dqn_loss(self): """ Update neural model in agent. In this example we follow algorithm of updating model in dqn with replay memory. """ if len(self.replay_memory) < self.replay_batch_size: return None, None data = self.replay_memory.get_batch(self.replay_batch_size, self.multi_step) if data is None: return None, None obs_list, quest_list, action_indices, ctrlf_indices, rewards, next_obs_list, actual_ns = data input_observation, input_observation_char, observation_id_list = self.get_agent_inputs( obs_list) input_quest, input_quest_char, quest_id_list = self.get_agent_inputs( quest_list) next_input_observation, next_input_observation_char, next_observation_id_list = self.get_agent_inputs( next_obs_list) ctrlf_word_mask, _ = self.get_word_mask(quest_id_list, observation_id_list) next_ctrlf_word_mask, _ = self.get_word_mask(quest_id_list, next_observation_id_list) action_rank, ctrlf_rank = self.get_ranks( input_observation, input_observation_char, input_quest, input_quest_char, ctrlf_word_mask, use_model="online") # batch x vocab # ps_a q_value_action = ez_gather_dim_1(action_rank, action_indices).squeeze(1) # batch q_value_ctrlf = ez_gather_dim_1(ctrlf_rank, ctrlf_indices).squeeze(1) # batch is_ctrlf = torch.eq(action_indices, float(self.action2id["ctrl+f"]) ).float() # when the action is ctrl+f, batch q_value = (q_value_action + q_value_ctrlf * is_ctrlf) / ( is_ctrlf + 1) # masked average # q_value = torch.mean(torch.stack([q_value_action, q_value_ctrlf], -1), -1) with torch.no_grad(): if self.noisy_net: self.target_net.reset_noise() # Sample new target net noise # pns Probabilities p(s_t+n, ·; θonline) next_action_rank, next_ctrlf_rank = self.get_ranks( next_input_observation, next_input_observation_char, input_quest, input_quest_char, next_ctrlf_word_mask, use_model="online") # batch x vocab # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))] next_action_indices = self.choose_maxQ_command( next_action_rank) # batch x 1 next_ctrlf_indices = self.choose_maxQ_command( next_ctrlf_rank, next_ctrlf_word_mask) # batch x 1 # pns # Probabilities p(s_t+n, ·; θtarget) next_action_rank, next_ctrlf_rank = self.get_ranks( next_input_observation, next_input_observation_char, input_quest, input_quest_char, next_ctrlf_word_mask, use_model="target") # batch x vocab # pns_a # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget) next_q_value_action = ez_gather_dim_1(next_action_rank, next_action_indices).squeeze( 1) # batch next_q_value_ctrlf = ez_gather_dim_1(next_ctrlf_rank, next_ctrlf_indices).squeeze( 1) # batch next_is_ctrlf = torch.eq(next_action_indices, float(self.action2id["ctrl+f"])).float( ) # when the action is ctrl+f, batch next_q_value = (next_q_value_action + next_q_value_ctrlf * next_is_ctrlf) / ( next_is_ctrlf + 1) # masked average discount = to_pt( (np.ones_like(actual_ns) * self.discount_gamma)**actual_ns, self.use_cuda, type="float") rewards = rewards + next_q_value * discount # batch loss = F.smooth_l1_loss(q_value, rewards) return loss, q_value
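The Q-value fed into the loss above is a masked average: the ctrl+f Q-value is only blended in on examples whose chosen action actually was "ctrl+f", otherwise the plain action Q-value passes through unchanged. A two-example numeric check of that expression:

import torch

q_action = torch.tensor([1.0, 2.0])
q_ctrlf = torch.tensor([3.0, 4.0])
is_ctrlf = torch.tensor([0.0, 1.0])   # only the second example issued ctrl+f
q_value = (q_action + q_ctrlf * is_ctrlf) / (is_ctrlf + 1)
print(q_value)                        # tensor([1., 3.]): plain Q, then mean(2, 4)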
def train(data_path): time_1 = datetime.datetime.now() agent = Agent() # visdom viz = visdom.Visdom() plt_win = None eval_plt_win = None viz_avg_correct_state_acc, viz_avg_qa_acc = [], [] viz_eval_sufficient_info_reward, viz_eval_qa_reward = [], [] step_in_total = 0 running_avg_qa_reward = generic.HistoryScoreCache(capacity=500) running_avg_sufficient_info_reward = generic.HistoryScoreCache( capacity=500) running_avg_qa_loss = generic.HistoryScoreCache(capacity=500) running_avg_correct_state_loss = generic.HistoryScoreCache(capacity=500) output_dir, data_dir = ".", "." json_file_name = agent.experiment_tag.replace(" ", "_") best_sum_reward_so_far = 0.0 # load model from checkpoint if agent.load_pretrained: if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"): agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt") agent.update_target_net() elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"): agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag + ".pt") agent.update_target_net() else: print( "Failed to load pretrained model... couldn't find the checkpoint file..." ) # Create temporary folder for the generated games. games_dir = tempfile.TemporaryDirectory( prefix="tw_games" ) # This is not deleted upon error. It would be better to use a with statement. games_dir = pjoin(games_dir.name, "") # So path ends with '/'. # copy grammar files into tmp folder so that it works smoothly assert os.path.exists( "./textworld_data"), "Oh no! textworld_data folder is not there..." os.mkdir(games_dir) os.mkdir(pjoin(games_dir, "textworld_data")) copy_tree("textworld_data", games_dir + "textworld_data") if agent.run_eval: assert os.path.exists(pjoin( data_path, agent.testset_path)), "Oh no! test_set folder is not there..." 
os.mkdir(pjoin(games_dir, agent.testset_path)) copy_tree(pjoin(data_path, agent.testset_path), pjoin(games_dir, agent.testset_path)) if agent.train_data_size == -1: game_queue_size = agent.batch_size * 5 game_queue = [] episode_no = 0 if agent.train_data_size == -1: # endless mode game_generator_queue = game_generator.game_generator_queue( path=games_dir, random_map=agent.random_map, question_type=agent.question_type, max_q_size=agent.batch_size * 2, nb_worker=8) else: # generate the training set all_training_games = game_generator.game_generator( path=games_dir, random_map=agent.random_map, question_type=agent.question_type, train_data_size=agent.train_data_size) all_training_games.sort() all_env_ids = None while (True): if episode_no > agent.max_episode: break np.random.seed(episode_no) if agent.train_data_size == -1: # endless mode for _ in range(agent.batch_size): if not game_generator_queue.empty(): tmp_game = game_generator_queue.get() if os.path.exists(tmp_game): game_queue.append(tmp_game) if len(game_queue) == 0: time.sleep(0.1) continue can_delete_these = [] if len(game_queue) > game_queue_size: can_delete_these = game_queue[:-game_queue_size] game_queue = game_queue[-game_queue_size:] sampled_games = np.random.choice(game_queue, agent.batch_size).tolist() env_ids = [ register_game(gamefile, request_infos=request_infos) for gamefile in sampled_games ] else: if all_env_ids is None: all_env_ids = [ register_game(gamefile, request_infos=request_infos) for gamefile in all_training_games ] env_ids = np.random.choice(all_env_ids, agent.batch_size).tolist() if len(env_ids ) != agent.batch_size: # either less than or greater than env_ids = np.random.choice(env_ids, agent.batch_size).tolist() env_id = make_batch2(env_ids, parallel=True) env = gym.make(env_id) env.seed(episode_no) obs, infos = env.reset() batch_size = len(obs) # generate question-answer pairs here questions, answers, reward_helper_info = game_generator.generate_qa_pairs( infos, question_type=agent.question_type, seed=episode_no) print( "====================================================================================", episode_no) print(questions[0], answers[0]) agent.train() agent.init(obs, infos) commands, last_facts, init_facts = [], [], [] commands_per_step, game_facts_cache = [], [] for i in range(batch_size): commands.append("restart") last_facts.append(None) init_facts.append(None) game_facts_cache.append([]) commands_per_step.append(["restart"]) observation_strings, possible_words = agent.get_game_info_at_certain_step( obs, infos) observation_strings = [ a + " <|> " + item for a, item in zip(commands, observation_strings) ] input_quest, input_quest_char, _ = agent.get_agent_inputs(questions) transition_cache = [] print_cmds = [] counting_rewards_np = [] valid_command_rewards_np = [] act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode # push init state into counting reward dict state_strings = agent.get_state_strings(infos) _ = agent.get_binarized_count(state_strings, update=True) for step_no in range(agent.max_nb_steps_per_episode): # update answerer input for i in range(batch_size): if agent.not_finished_yet[i] == 1: agent.naozi.push_one(i, copy.copy(observation_strings[i])) if agent.prev_step_is_still_interacting[i] == 1: new_facts = process_facts(last_facts[i], infos["game"][i], infos["facts"][i], infos["last_action"][i], commands[i]) game_facts_cache[i].append( new_facts ) # info used in reward computing of existence question last_facts[i] = new_facts if step_no == 
0: init_facts[i] = copy.copy(new_facts) # generate commands if agent.noisy_net: agent.reset_noise() # Draw a new set of noisy weights observation_strings_w_history = agent.naozi.get() input_observation, input_observation_char, _ = agent.get_agent_inputs( observation_strings_w_history) commands, replay_info = agent.act(obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words, random=act_randomly) for i in range(batch_size): commands_per_step[i].append(commands[i]) replay_info = [ observation_strings_w_history, questions, possible_words ] + replay_info admissible_commands = [ set(item) - set(["look", "wait", "inventory"]) for item in infos["admissible_commands"] ] vc_rewards = [ float(c in ac) for c, ac in zip(commands, admissible_commands) ] valid_command_rewards_np.append(np.array(vc_rewards)) # pass commands into env obs, _, _, infos = env.step(commands) # possible words do not depend on history, because one can only interact with what is currently accessible observation_strings, possible_words = agent.get_game_info_at_certain_step( obs, infos) observation_strings = [ a + " <|> " + item for a, item in zip(commands, observation_strings) ] # counting rewards state_strings = agent.get_state_strings(infos) c_rewards = agent.get_binarized_count(state_strings, update=True) counting_rewards_np.append(np.array(c_rewards)) if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0: agent.reset_noise() # Draw a new set of noisy weights if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0: interaction_loss = agent.update_interaction() if interaction_loss is not None: running_avg_correct_state_loss.push(interaction_loss) qa_loss = agent.update_qa() if qa_loss is not None: running_avg_qa_loss.push(qa_loss) print_cmds.append(commands[0] if agent.prev_step_is_still_interacting[0] else "--") # force stopping if step_no == agent.max_nb_steps_per_episode - 1: replay_info[-1] = torch.zeros_like(replay_info[-1]) transition_cache.append(replay_info) step_in_total += 1 if (step_no == agent.max_nb_steps_per_episode - 1) or (step_no > 0 and np.sum(generic.to_np(replay_info[-1])) == 0): break print(" / ".join(print_cmds)) # The agent has exhausted all steps, now answer question.
answerer_input = agent.naozi.get() answerer_input_observation, answerer_input_observation_char, answerer_observation_ids = agent.get_agent_inputs( answerer_input) chosen_word_indices = agent.answer_question_act_greedy( answerer_input_observation, answerer_input_observation_char, answerer_observation_ids, input_quest, input_quest_char) # batch chosen_word_indices_np = generic.to_np(chosen_word_indices) chosen_answers = [ agent.word_vocab[item] for item in chosen_word_indices_np ] # rewards # qa reward qa_reward_np = reward_helper.get_qa_reward(answers, chosen_answers) # sufficient info rewards masks = [item[-1] for item in transition_cache] masks_np = [generic.to_np(item) for item in masks] # 1 1 0 0 0 --> 1 1 0 0 0 0 game_finishing_mask = np.stack(masks_np + [np.zeros((batch_size, ))], 0) # game step+1 x batch size # 1 1 0 0 0 0 --> 0 1 0 0 0 game_finishing_mask = game_finishing_mask[:-1, :] - game_finishing_mask[ 1:, :] # game step x batch size game_running_mask = np.stack(masks_np, 0) # game step x batch size if agent.question_type == "location": # sufficient info reward: location question reward_helper_info["observation_before_finish"] = answerer_input reward_helper_info["game_finishing_mask"] = game_finishing_mask sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_location( reward_helper_info) elif agent.question_type == "existence": # sufficient info reward: existence question reward_helper_info["observation_before_finish"] = answerer_input reward_helper_info[ "game_facts_per_step"] = game_facts_cache # facts before issuing command (we want to stop at correct state) reward_helper_info["init_game_facts"] = init_facts reward_helper_info["full_facts"] = infos["facts"] reward_helper_info["answers"] = answers reward_helper_info["game_finishing_mask"] = game_finishing_mask sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_existence( reward_helper_info) elif agent.question_type == "attribute": # sufficient info reward: attribute question reward_helper_info["answers"] = answers reward_helper_info[ "game_facts_per_step"] = game_facts_cache # facts before and after issuing commands (we want to compare the difference) reward_helper_info["init_game_facts"] = init_facts reward_helper_info["full_facts"] = infos["facts"] reward_helper_info[ "commands_per_step"] = commands_per_step # commands before and after issuing commands (we want to compare the difference) reward_helper_info["game_finishing_mask"] = game_finishing_mask sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_attribute( reward_helper_info) else: raise NotImplementedError # push qa experience into qa replay buffer for b in range(batch_size): # data points in batch # if the agent is not in the correct state, do not push it into replay buffer if np.sum(sufficient_info_reward_np[b]) == 0.0: continue agent.qa_replay_memory.push(False, qa_reward_np[b], answerer_input[b], questions[b], answers[b]) # assign sufficient info reward and counting reward to the corresponding steps counting_rewards_np = np.stack(counting_rewards_np, 1) # batch x game step valid_command_rewards_np = np.stack(valid_command_rewards_np, 1) # batch x game step command_rewards_np = sufficient_info_reward_np + counting_rewards_np * game_running_mask.T * agent.revisit_counting_lambda + valid_command_rewards_np * game_running_mask.T * agent.valid_command_bonus_lambda # batch x game step command_rewards = generic.to_pt(command_rewards_np, enable_cuda=agent.use_cuda, type="float") # batch x game step for i in
range(command_rewards_np.shape[1]): transition_cache[i].append(command_rewards[:, i]) print(command_rewards_np[0]) # push command generation experience into replay buffer for b in range(batch_size): is_prior = np.sum(command_rewards_np[b], 0) > 0.0 for i in range(len(transition_cache)): batch_observation_strings, batch_question_strings, batch_possible_words, batch_chosen_indices, _, batch_rewards = transition_cache[ i] is_final = True if masks_np[i][b] != 0: is_final = False agent.command_generation_replay_memory.push( is_prior, batch_observation_strings[b], batch_question_strings[b], [item[b] for item in batch_possible_words], [item[b] for item in batch_chosen_indices], batch_rewards[b], is_final) if masks_np[i][b] == 0.0: break # for printing r_qa = np.mean(qa_reward_np) r_sufficient_info = np.mean(np.sum(sufficient_info_reward_np, -1)) running_avg_qa_reward.push(r_qa) running_avg_sufficient_info_reward.push(r_sufficient_info) print_rewards = np.mean(np.sum(command_rewards_np, -1)) obs_string = answerer_input[0] print(obs_string) # finish game agent.finish_of_episode(episode_no, batch_size) # close env env.close() if agent.train_data_size == -1: # when games are generated on the fly, # remove all files (including .json and .ni) that have been used files_to_delete = [] for gamefile in can_delete_these: if not gamefile.endswith(".ulx"): continue files_to_delete.append(gamefile) files_to_delete.append(gamefile.replace(".ulx", ".json")) files_to_delete.append(gamefile.replace(".ulx", ".ni")) # print("rm -f {}".format(" ".join(files_to_delete))) os.system("rm -f {}".format(" ".join(files_to_delete))) episode_no += batch_size time_2 = datetime.datetime.now() print( "Episode: {:3d} | time spent: {:s} | interaction loss: {:2.3f} | qa loss: {:2.3f} | rewards: {:2.3f} | qa acc: {:2.3f}/{:2.3f} | correct state: {:2.3f}/{:2.3f}" .format(episode_no, str(time_2 - time_1).rsplit(".")[0], running_avg_correct_state_loss.get_avg(), running_avg_qa_loss.get_avg(), print_rewards, r_qa, running_avg_qa_reward.get_avg(), r_sufficient_info, running_avg_sufficient_info_reward.get_avg())) if episode_no < agent.learn_start_from_this_episode: continue if episode_no == 0 or ( episode_no % agent.save_frequency > (episode_no - batch_size) % agent.save_frequency): continue eval_qa_reward, eval_sufficient_info_reward = 0.0, 0.0 # evaluate if agent.run_eval: eval_qa_reward, eval_sufficient_info_reward = evaluate.evaluate( data_dir, agent) # if run eval, then save model by eval accucacy if eval_qa_reward + eval_sufficient_info_reward > best_sum_reward_so_far: best_sum_reward_so_far = eval_qa_reward + eval_sufficient_info_reward agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt") # save model elif agent.save_checkpoint: if running_avg_qa_reward.get_avg( ) + running_avg_sufficient_info_reward.get_avg( ) > best_sum_reward_so_far: best_sum_reward_so_far = running_avg_qa_reward.get_avg( ) + running_avg_sufficient_info_reward.get_avg() agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt") # plot using visdom viz_avg_correct_state_acc.append( running_avg_sufficient_info_reward.get_avg()) viz_avg_qa_acc.append(running_avg_qa_reward.get_avg()) viz_eval_sufficient_info_reward.append(eval_sufficient_info_reward) viz_eval_qa_reward.append(eval_qa_reward) viz_x = np.arange(len(viz_avg_correct_state_acc)).tolist() if plt_win is None: plt_win = viz.line(X=viz_x, Y=viz_avg_correct_state_acc, opts=dict(title=agent.experiment_tag + "_train"), name="correct state") viz.line(X=viz_x, 
Y=viz_avg_qa_acc, opts=dict(title=agent.experiment_tag + "_train"), win=plt_win, update='append', name="qa") else: viz.line(X=[len(viz_avg_correct_state_acc) - 1], Y=[viz_avg_correct_state_acc[-1]], opts=dict(title=agent.experiment_tag + "_train"), win=plt_win, update='append', name="correct state") viz.line(X=[len(viz_avg_qa_acc) - 1], Y=[viz_avg_qa_acc[-1]], opts=dict(title=agent.experiment_tag + "_train"), win=plt_win, update='append', name="qa") if eval_plt_win is None: eval_plt_win = viz.line(X=viz_x, Y=viz_eval_sufficient_info_reward, opts=dict(title=agent.experiment_tag + "_eval"), name="correct state") viz.line(X=viz_x, Y=viz_eval_qa_reward, opts=dict(title=agent.experiment_tag + "_eval"), win=eval_plt_win, update='append', name="qa") else: viz.line(X=[len(viz_eval_sufficient_info_reward) - 1], Y=[viz_eval_sufficient_info_reward[-1]], opts=dict(title=agent.experiment_tag + "_eval"), win=eval_plt_win, update='append', name="correct state") viz.line(X=[len(viz_eval_qa_reward) - 1], Y=[viz_eval_qa_reward[-1]], opts=dict(title=agent.experiment_tag + "_eval"), win=eval_plt_win, update='append', name="qa") # write accuracies down into file _s = json.dumps({ "time spent": str(time_2 - time_1).rsplit(".")[0], "sufficient info": running_avg_sufficient_info_reward.get_avg(), "qa": running_avg_qa_reward.get_avg(), "eval sufficient info": eval_sufficient_info_reward, "eval qa": eval_qa_reward }) with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile: outfile.write(_s + '\n') outfile.flush()
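Both training loops append one json.dumps(...) record per report interval to the experiment's log file, one JSON object per line. A small sketch for reading that JSON-lines file back, e.g. to re-plot the curves without a running visdom server; the path argument is whatever output_dir + "/" + json_file_name + '.json' resolved to:

import json

def load_training_log(path):
    # One JSON object per line, as written by the training loops above.
    records = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records

# usage sketch (hypothetical file name):
# log = load_training_log("./my_experiment_tag.json")
# qa_curve = [float(r["qa"]) for r in log]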