Example #1
def choose_command(word_ranks, word_masks_np, use_cuda, epsilon=0.0):
    batch_size = word_ranks[0].size(0)
    word_qvalues, word_indices_maxq = _choose_maxQ_command(
        word_ranks, word_masks_np, use_cuda)
    if epsilon > 0.0:
        _, word_indices_random = _choose_random_command(
            word_ranks, word_masks_np, use_cuda)
        # random number for epsilon greedy
        rand_num = np.random.uniform(low=0.0, high=1.0, size=(batch_size, 1))
        less_than_epsilon = (rand_num < epsilon).astype("float32")  # batch
        greater_than_epsilon = 1.0 - less_than_epsilon
        less_than_epsilon = to_pt(less_than_epsilon, use_cuda, type='float')
        greater_than_epsilon = to_pt(greater_than_epsilon,
                                     use_cuda,
                                     type='float')
        less_than_epsilon, greater_than_epsilon = less_than_epsilon.long(
        ), greater_than_epsilon.long()
        chosen_indices = [
            less_than_epsilon * idx_random + greater_than_epsilon * idx_maxq
            for idx_random, idx_maxq in zip(word_indices_random,
                                            word_indices_maxq)
        ]
    else:
        chosen_indices = word_indices_maxq
    chosen_indices = [item.detach() for item in chosen_indices]
    return word_qvalues, chosen_indices
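
A note on the helpers: every example on this page calls to_pt (and often to_np) without showing them. Below is a minimal sketch of what such helpers presumably do, assuming they only convert between numpy arrays and (optionally CUDA) torch tensors; this is an assumption for illustration, not the project's actual implementation.

import numpy as np
import torch

def to_pt(np_array, enable_cuda=False, type='long'):
    # Convert a numpy array into a torch tensor of the requested dtype,
    # optionally moving it onto the GPU (sketch only).
    tensor = torch.from_numpy(np.asarray(np_array))
    tensor = tensor.long() if type == 'long' else tensor.float()
    return tensor.cuda() if enable_cuda else tensor

def to_np(x):
    # Move a torch tensor back to a numpy array on the CPU (sketch only).
    return x.detach().cpu().numpy()
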
Example #2
    def compute_reward(self):
        """
        Compute rewards by agent. Note this is different from what the training/evaluation
        scripts do. The agent keeps track of scores and other game information for training purposes.

        """
        # mask = 1 if game is not finished or just finished at current step
        if len(self.dones) == 1:
            # it's not possible to finish a game at 0th step
            mask = [1.0 for _ in self.dones[-1]]
        else:
            assert len(self.dones) > 1
            mask = [1.0 if not self.dones[-2][i] else 0.0 for i in range(len(self.dones[-1]))]
        mask = np.array(mask, dtype='float32')
        mask_pt = to_pt(mask, self.use_cuda, type='float')
        # rewards returned by the game engine are always the accumulated value the
        # agent has received, so the reward it gets at the current game step
        # is the new value minus the value at the previous step.
        rewards = np.array(self.scores[-1], dtype='float32')  # batch
        if len(self.scores) > 1:
            prev_rewards = np.array(self.scores[-2], dtype='float32')
            rewards = rewards - prev_rewards
        rewards_pt = to_pt(rewards, self.use_cuda, type='float')

        return rewards, rewards_pt, mask, mask_pt
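
A quick standalone illustration of the logic above, with made-up numbers: the game engine reports accumulated scores, so the per-step reward is a difference, and the mask zeroes out games that had already finished at the previous step.

import numpy as np

scores = [[1.0, 2.0],          # accumulated scores at step t-1 (batch of 2 games)
          [3.0, 2.0]]          # accumulated scores at step t
dones = [[False, False],       # finished flags at step t-1
         [False, True]]        # finished flags at step t

rewards = np.array(scores[-1], dtype='float32') - np.array(scores[-2], dtype='float32')
mask = np.array([0.0 if d else 1.0 for d in dones[-2]], dtype='float32')
print(rewards, mask)           # [2. 0.] [1. 1.]
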
Example #3
File: agent.py Project: yyht/qait_public
    def get_agent_inputs(self, string_list):
        sentence_token_list = [item.split() for item in string_list]
        sentence_id_list = [
            _words_to_ids(tokens, self.word2id)
            for tokens in sentence_token_list
        ]
        input_sentence_char = list_of_token_list_to_char_input(
            sentence_token_list, self.char2id)
        input_sentence = pad_sequences(
            sentence_id_list, maxlen=max_len(sentence_id_list)).astype('int32')
        input_sentence = to_pt(input_sentence, self.use_cuda)
        input_sentence_char = to_pt(input_sentence_char, self.use_cuda)
        return input_sentence, input_sentence_char, sentence_id_list
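
pad_sequences and max_len are likewise assumed helpers in these snippets. Below is a rough sketch of post-padding behaviour that is consistent with how they are called here; it is an assumption, not the project's code.

import numpy as np

def max_len(list_of_id_lists):
    # Length of the longest sequence in the batch.
    return max(len(ids) for ids in list_of_id_lists)

def pad_sequences(list_of_id_lists, maxlen=None, value=0):
    # Post-pad (pad at the end) every sequence with `value` up to `maxlen`.
    maxlen = maxlen if maxlen is not None else max_len(list_of_id_lists)
    padded = np.full((len(list_of_id_lists), maxlen), value, dtype='int64')
    for i, ids in enumerate(list_of_id_lists):
        padded[i, :min(len(ids), maxlen)] = ids[:maxlen]
    return padded
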
Example #4
    def update(self):
        """
        Update the neural model in the agent. In this example we follow the
        standard algorithm of updating a DQN model with replay memory.

        """
        if len(self.replay_memory) < self.replay_batch_size:
            return None
        transitions = self.replay_memory.sample(self.replay_batch_size)
        batch = Transition(*zip(*transitions))

        observation_id_list = pad_sequences(
            batch.observation_id_list,
            maxlen=max_len(batch.observation_id_list)).astype('int32')
        input_observation = to_pt(observation_id_list, self.use_cuda)
        next_observation_id_list = pad_sequences(
            batch.next_observation_id_list,
            maxlen=max_len(batch.next_observation_id_list)).astype('int32')
        next_input_observation = to_pt(next_observation_id_list, self.use_cuda)
        chosen_indices = list(list(zip(*batch.word_indices)))
        chosen_indices = [torch.stack(item, 0)
                          for item in chosen_indices]  # list of batch x 1

        word_ranks = self.infer_word_ranks(
            input_observation
        )  # list of batch x vocab, len=5 (one per potential output word)
        word_qvalues = [
            w_rank.gather(1, idx).squeeze(-1)
            for w_rank, idx in zip(word_ranks, chosen_indices)
        ]  # list of batch
        q_value = torch.mean(torch.stack(word_qvalues, -1), -1)  # batch

        next_word_ranks = self.infer_word_ranks(
            next_input_observation
        )  # batch x n_verb, batch x n_noun, batch x n_second_noun
        next_word_masks = list(list(zip(*batch.next_word_masks)))
        next_word_masks = [np.stack(item, 0) for item in next_word_masks]
        next_word_qvalues, _ = _choose_maxQ_command(next_word_ranks,
                                                    next_word_masks,
                                                    self.use_cuda)
        next_q_value = torch.mean(torch.stack(next_word_qvalues, -1),
                                  -1)  # batch
        next_q_value = next_q_value.detach()

        rewards = torch.stack(batch.reward)  # batch
        not_done = 1.0 - np.array(batch.done, dtype='float32')  # batch
        not_done = to_pt(not_done, self.use_cuda, type='float')
        rewards = rewards + not_done * next_q_value * self.discount_gamma  # batch
        mask = torch.stack(batch.mask)  # batch
        loss = F.smooth_l1_loss(q_value * mask, rewards * mask)
        return loss
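
The target built at the end of update above is the usual one-step Q-learning backup: reward plus gamma times the detached max-Q of the next state, for games that are not done. With toy numbers (illustration only):

import torch
import torch.nn.functional as F

discount_gamma = 0.9
rewards = torch.tensor([1.0, 0.0])        # batch of 2
next_q_value = torch.tensor([2.0, 3.0])   # detached max-Q at the next step
not_done = torch.tensor([1.0, 0.0])       # 0.0 where the episode has ended
mask = torch.tensor([1.0, 1.0])

target = rewards + not_done * next_q_value * discount_gamma   # tensor([2.8, 0.0])
q_value = torch.tensor([2.5, 0.5])                            # current Q estimates
loss = F.smooth_l1_loss(q_value * mask, target * mask)
print(loss.item())
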
Example #5
    def get_word_mask(self, list_of_query_id_list,
                      list_of_observation_id_list):
        batch_size = len(list_of_query_id_list)
        if self.generate_or_point == "generate":
            sw_ids = set()
            for sw in self.stopwords:
                if sw in self.word2id:
                    sw_ids.add(self.word2id[sw])
            word_mask = np.ones((batch_size, len(self.word_vocab)),
                                dtype="float32")
            for _id in sw_ids:
                word_mask[:, _id] = 0.0
            word_mask = to_pt(word_mask,
                              enable_cuda=self.use_cuda,
                              type="float")
            mask_word_id_list = []
            all_word_ids = set(np.arange(len(self.word_vocab)).tolist())
            m = list(all_word_ids - sw_ids)
            for i in range(batch_size):
                mask_word_id_list.append(m)
            return word_mask, mask_word_id_list

        word_mask_np = np.zeros((batch_size, len(self.word_vocab)),
                                dtype="float32")
        mask_word_id_list = []
        for i in range(batch_size):
            mask_word_id_list.append(set())
            for w_idx in list_of_query_id_list[i]:
                if self.word_vocab[w_idx] in self.stopwords:
                    continue
                word_mask_np[i][w_idx] = 1.0
                mask_word_id_list[i].add(w_idx)
            if self.generate_or_point == "qmpoint":
                for w_idx in list_of_observation_id_list[i]:
                    if self.word_vocab[w_idx] in self.stopwords:
                        continue
                    word_mask_np[i][w_idx] = 1.0
                    mask_word_id_list[i].add(w_idx)
        mask_word_id_list = [list(item) for item in mask_word_id_list]
        for i in range(len(mask_word_id_list)):
            if len(mask_word_id_list[i]) == 0:
                mask_word_id_list[i].append(
                    self.word2id[","])  # just in case this list is empty
                word_mask_np[i][self.word2id[","]] = 1.0
                continue
        word_mask = to_pt(word_mask_np,
                          enable_cuda=self.use_cuda,
                          type="float")
        return word_mask, mask_word_id_list
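
A standalone illustration of the pointing branch above, with a tiny made-up vocabulary: only non-stopword tokens that occur in the query get unmasked.

import numpy as np

word_vocab = ["<pad>", ",", "take", "the", "red", "apple"]   # toy vocabulary
word2id = {w: i for i, w in enumerate(word_vocab)}
stopwords = {"the"}

query_ids = [word2id[w] for w in "take the apple".split()]
word_mask_np = np.zeros((1, len(word_vocab)), dtype="float32")
for w_idx in query_ids:
    if word_vocab[w_idx] in stopwords:
        continue
    word_mask_np[0, w_idx] = 1.0
print(word_mask_np)   # only the "take" and "apple" positions are 1.0
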
Example #6
    def choose_random_command(self, word_ranks, word_masks_np):
        """
        Generate a command randomly, for epsilon greedy.

        Arguments:
            word_ranks: Q values for each word by model.action_scorer.
            word_masks_np: Vocabulary masks for words depending on their type (verb, adj, noun).
        """
        batch_size = word_ranks[0].size(0)
        word_ranks_np = [to_np(item) for item in word_ranks]  # list of batch x n_vocab
        word_ranks_np = [r * m for r, m in zip(word_ranks_np, word_masks_np)]  # list of batch x n_vocab
        word_indices = []
        for i in range(len(word_ranks_np)):
            indices = []
            for j in range(batch_size):
                msk = word_masks_np[i][j]  # vocab
                indices.append(np.random.choice(len(msk), p=msk / np.sum(msk, -1)))
            word_indices.append(np.array(indices))
        # word_indices: list of batch
        word_qvalues = [[] for _ in word_masks_np]
        for i in range(batch_size):
            for j in range(len(word_qvalues)):
                word_qvalues[j].append(word_ranks[j][i][word_indices[j][i]])
        word_qvalues = [torch.stack(item) for item in word_qvalues]
        word_indices = [to_pt(item, self.use_cuda) for item in word_indices]
        word_indices = [item.unsqueeze(-1) for item in word_indices]  # list of batch x 1
        return word_qvalues, word_indices
Example #7
    def get_game_step_info(self, obs: List[str], infos: Dict[str, List[Any]]):
        """
        Get all the available information and concatenate it into a tensor for
        the neural model. We use post padding here; all information is tokenized here.

        Arguments:
            obs: Previous command's feedback for each game.
            infos: Additional information for each game.
        """
        inventory_token_list = [preproc(item, tokenizer=self.nlp) for item in infos["inventory"]]
        inventory_id_list = [_words_to_ids(tokens, self.word2id) for tokens in inventory_token_list]

        feedback_token_list = [preproc(item, str_type='feedback', tokenizer=self.nlp) for item in obs]
        feedback_id_list = [_words_to_ids(tokens, self.word2id) for tokens in feedback_token_list]

        quest_token_list = [preproc(item, tokenizer=self.nlp) for item in infos["extra.recipe"]]
        quest_id_list = [_words_to_ids(tokens, self.word2id) for tokens in quest_token_list]

        prev_action_token_list = [preproc(item, tokenizer=self.nlp) for item in self.prev_actions]
        prev_action_id_list = [_words_to_ids(tokens, self.word2id) for tokens in prev_action_token_list]

        description_token_list = [preproc(item, tokenizer=self.nlp) for item in infos["description"]]
        for i, d in enumerate(description_token_list):
            if len(d) == 0:
                description_token_list[i] = ["end"]  # if empty description, insert word "end"
        description_id_list = [_words_to_ids(tokens, self.word2id) for tokens in description_token_list]
        description_id_list = [_d + _i + _q + _f + _pa for (_d, _i, _q, _f, _pa) in zip(description_id_list, inventory_id_list, quest_id_list, feedback_id_list, prev_action_id_list)]

        input_description = pad_sequences(description_id_list, maxlen=max_len(description_id_list)).astype('int32')
        input_description = to_pt(input_description, self.use_cuda)

        return input_description, description_id_list
Example #8
    def choose_maxQ_command(self, word_ranks, word_masks_np):
        """
        Generate a command by maximum q values, for epsilon greedy.

        Arguments:
            word_ranks: Q values for each word by model.action_scorer.
            word_masks_np: Vocabulary masks for words depending on their type (verb, adj, noun).
        """
        batch_size = word_ranks[0].size(0)
        word_ranks_np = [to_np(item)
                         for item in word_ranks]  # list of batch x n_vocab
        word_ranks_np = [
            r - np.min(r) for r in word_ranks_np
        ]  # minus the min value, so that all values are non-negative
        word_ranks_np = [r * m for r, m in zip(word_ranks_np, word_masks_np)
                         ]  # list of batch x n_vocab
        word_indices = [np.argmax(item, -1)
                        for item in word_ranks_np]  # list of batch
        word_qvalues = [[] for _ in word_masks_np]
        for i in range(batch_size):
            for j in range(len(word_qvalues)):
                word_qvalues[j].append(word_ranks[j][i][word_indices[j][i]])
        word_qvalues = [torch.stack(item) for item in word_qvalues]
        word_indices = [to_pt(item, self.use_cuda) for item in word_indices]
        word_indices = [item.unsqueeze(-1)
                        for item in word_indices]  # list of batch x 1
        return word_qvalues, word_indices
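
A small numeric check of why choose_maxQ_command subtracts the minimum before multiplying by the mask: with all-negative Q values, a plain mask-by-multiplication would let a masked-out zero win the argmax.

import numpy as np

q = np.array([[-3.0, -1.0, -2.0]])      # batch x vocab, all Q values negative
mask = np.array([[1.0, 0.0, 1.0]])      # the word at index 1 is not allowed

naive = q * mask                        # argmax picks index 1, the masked word
shifted = (q - np.min(q)) * mask        # argmax picks index 2, the best allowed word
print(np.argmax(naive, -1), np.argmax(shifted, -1))   # [1] [2]
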
Example #9
    def get_qa_loss(self):
        """
        Compute the question answering loss. In this example we follow the
        DQN-style approach of sampling transitions from a replay memory.
        """
        if len(self.qa_replay_memory) < self.replay_batch_size:
            return None
        transitions = self.qa_replay_memory.sample(self.replay_batch_size)
        batch = qa_Transition(*zip(*transitions))

        answer_distribution, obs_mask = self.answer_question(
            batch.observation_list, batch.quest_list,
            use_model="online")  # answer_distribution is batch x time x 2
        answer_distribution = masked_softmax(answer_distribution,
                                             obs_mask.unsqueeze(-1),
                                             axis=1)

        answer_strings = [item[0] for item in batch.answer_strings]
        groundtruth_answer_positions = get_answer_position(
            batch.observation_list, answer_strings)  # list: batch x 2
        groundtruth = pad_sequences(groundtruth_answer_positions).astype(
            'int32')
        groundtruth = to_pt(groundtruth, self.use_cuda)  # batch x 2
        batch_loss = NegativeLogLoss(
            answer_distribution * obs_mask.unsqueeze(-1), groundtruth)

        return torch.mean(batch_loss)
Example #10
File: agent.py Project: yyht/qait_public
    def act_random(self, obs, infos, input_observation, input_observation_char,
                   input_quest, input_quest_char, possible_words):
        with torch.no_grad():
            batch_size = len(obs)
            word_indices_random = self.choose_random_command(
                batch_size, len(self.word_vocab), possible_words)
            chosen_indices = word_indices_random
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step is still interacting; DQN requires one extra step of computation
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info
Example #11
File: agent.py Project: yyht/qait_public
    def get_qa_loss(self):
        """
        Compute the question answering loss. In this example we follow the
        DQN-style approach of sampling transitions from a replay memory.
        """
        if len(self.qa_replay_memory) < self.replay_batch_size:
            return None
        transitions = self.qa_replay_memory.sample(self.replay_batch_size)
        batch = qa_memory.qa_Transition(*zip(*transitions))

        observation_list = batch.observation_list
        quest_list = batch.quest_list
        answer_strings = batch.answer_strings
        answer_position = np.array(_words_to_ids(answer_strings, self.word2id))
        groundtruth = to_pt(answer_position, self.use_cuda)  # batch

        input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list)
        input_observation, input_observation_char, observation_id_list = self.get_agent_inputs(
            observation_list)

        answer_distribution, _, _ = self.answer_question(
            input_observation,
            input_observation_char,
            observation_id_list,
            input_quest,
            input_quest_char,
            use_model="online")  # batch x vocab

        batch_loss = NegativeLogLoss(answer_distribution, groundtruth)  # batch
        return torch.mean(batch_loss)
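
NegativeLogLoss is not shown in these snippets. For the (batch x vocab) distribution and integer targets used here, a hedged sketch of such a loss might look as follows; the name negative_log_loss_sketch is hypothetical.

import torch

def negative_log_loss_sketch(distribution, target_indices, eps=1e-8):
    # -log p(target) per batch element, for a (batch x vocab) probability
    # distribution and a (batch,) long tensor of target indices (sketch only).
    picked = distribution.gather(1, target_indices.unsqueeze(-1)).squeeze(-1)
    return -torch.log(picked + eps)
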
Example #12
    def get_graph_relation_representations(self, relation_names_word_ids):
        # relation_names_word_ids: num_relation x num_word
        relation_name_embeddings, _mask = self.embed(
            relation_names_word_ids)  # num_relation x num_word x emb
        _mask = torch.sum(_mask, -1)  # num_relation
        relation_name_embeddings = torch.sum(relation_name_embeddings,
                                             1)  # num_relation x hid
        tmp = torch.eq(_mask, 0).float()
        if relation_name_embeddings.is_cuda:
            tmp = tmp.cuda()
        _mask = _mask + tmp
        relation_name_embeddings = relation_name_embeddings / _mask.unsqueeze(
            -1)
        relation_name_embeddings = relation_name_embeddings.unsqueeze(
            0)  # 1 x num_relation x emb

        relation_ids = np.arange(self.relation_vocab_size)  # num_relation
        relation_ids = to_pt(relation_ids,
                             enable_cuda=relation_names_word_ids.is_cuda,
                             type='long').unsqueeze(0)  # 1 x num_relation
        relation_embeddings, _ = self.relation_embedding(
            relation_ids)  # 1 x num_relation x emb
        relation_embeddings = torch.cat(
            [relation_name_embeddings, relation_embeddings],
            dim=-1)  # 1 x num_relation x emb+emb
        return relation_embeddings
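
The torch.eq(_mask, 0) trick above guards the mean pooling against empty rows: any row whose token count is zero gets a denominator of 1 instead of 0. In isolation:

import torch

_mask = torch.tensor([3.0, 0.0, 2.0])        # number of real tokens per row
_mask = _mask + torch.eq(_mask, 0).float()   # tensor([3., 1., 2.]), no division by zero
print(_mask)
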
Example #13
File: agent.py Project: yyht/qait_public
    def act_greedy(self, obs, infos, input_observation, input_observation_char,
                   input_quest, input_quest_char, possible_words):
        """
        Acts upon the current list of observations.
        One text command must be returned for each observation.
        """
        with torch.no_grad():
            batch_size = len(obs)
            local_word_masks_np = self.get_local_word_masks(possible_words)
            local_word_masks = [
                to_pt(item, self.use_cuda, type="float")
                for item in local_word_masks_np
            ]

            # generate commands for one game step; epsilon greedy is applied, i.e.,
            # there is an epsilon chance of generating random commands
            action_ranks = self.get_ranks(
                input_observation,
                input_observation_char,
                input_quest,
                input_quest_char,
                local_word_masks,
                use_model="online")  # list of batch x vocab
            word_indices_maxq = self.choose_maxQ_command(
                action_ranks, local_word_masks)
            chosen_indices = word_indices_maxq
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step is still interacting; DQN requires one extra step of computation
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info
Example #14
    def act_random(self, obs, infos, input_quest, input_quest_char,
                   quest_id_list):
        with torch.no_grad():
            batch_size = len(obs)

            # update inputs for answerer
            if self.not_finished_yet is None:
                self.not_finished_yet = np.ones((len(obs), ), dtype="float32")
                self.naozi.push_batch(copy.copy(obs))
            else:
                for i in range(batch_size):
                    if self.not_finished_yet[i] == 1.0:
                        self.naozi.push_one(i, copy.copy(obs[i]))

            description_list = self.naozi.get()
            input_description, input_description_char, description_id_list = self.get_agent_inputs(
                description_list)
            ctrlf_word_mask, ctrlf_word_ids = self.get_word_mask(
                quest_id_list, description_id_list)
            # generate commands for one game step; epsilon greedy is applied, i.e.,
            # there is an epsilon chance of generating random commands
            action_rank, ctrlf_rank = self.get_ranks(
                input_description,
                input_description_char,
                input_quest,
                input_quest_char,
                ctrlf_word_mask,
                use_model="online")  # list of batch x vocab
            action_indices = self.choose_random_command(action_rank)
            ctrlf_indices = self.choose_random_command(ctrlf_rank,
                                                       ctrlf_word_ids)
            chosen_strings = self.generate_commands(action_indices,
                                                    ctrlf_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "stop":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "stop":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step is still interacting; DQN requires one extra step of computation
            replay_info = [
                description_list, action_indices, ctrlf_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info
Example #15
    def choose_random_command(self, action_rank, mask_word_ids=None):
        """
        Generate a command randomly, for epsilon greedy.
        """
        batch_size = action_rank.size(0)
        action_space_size = action_rank.size(-1)
        if mask_word_ids is None:
            indices = np.random.choice(action_space_size, batch_size)
        else:
            indices = []
            for j in range(batch_size):
                indices.append(np.random.choice(mask_word_ids[j]))
            indices = np.array(indices)
        action_indices = to_pt(indices,
                               self.use_cuda).unsqueeze(-1)  # batch x 1
        return action_indices
Example #16
    def point_random_position(self, point_distribution, mask):
        """
        Generate a command by random, for epsilon greedy.

        Arguments:
            point_distribution: Q values for each position batch x time x 2.
            mask: position masks.
        """
        batch_size = point_distribution.size(0)
        mask_np = to_np(mask)  # batch x time
        indices = []
        for i in range(batch_size):
            msk = mask_np[i]  # time
            indices.append(
                np.random.choice(len(msk), 2, p=msk / np.sum(msk, -1)))
        indices = to_pt(np.stack(indices, 0), self.use_cuda)  # batch x 2
        return indices
Example #17
    def get_graph_node_representations(self, node_names_word_ids):
        # node_names_word_ids: num_node x num_word
        node_name_embeddings, _mask = self.embed(node_names_word_ids)  # num_node x num_word x emb
        _mask = torch.sum(_mask, -1)  # num_node
        node_name_embeddings = torch.sum(node_name_embeddings, 1)  # num_node x hid
        tmp = torch.eq(_mask, 0).float()
        if node_name_embeddings.is_cuda:
            tmp = tmp.cuda()
        _mask = _mask + tmp
        node_name_embeddings = node_name_embeddings / _mask.unsqueeze(-1)
        node_name_embeddings = node_name_embeddings.unsqueeze(0)  # 1 x num_node x emb

        node_ids = np.arange(self.node_vocab_size)  # num_node
        node_ids = to_pt(node_ids, enable_cuda=node_names_word_ids.is_cuda, type='long').unsqueeze(0)  # 1 x num_node
        node_embeddings, _ = self.node_embedding(node_ids)  # 1 x num_node x emb
        node_embeddings = torch.cat([node_name_embeddings, node_embeddings], dim=-1)  # 1 x num_node x emb+emb
        return node_embeddings
Example #18
    def point_maxq_position(self, point_distribution, mask):
        """
        Generate a command by maximum q values, for epsilon greedy.

        Arguments:
            point_distribution: Q values for each position batch x time x 2.
            mask: position masks.
        """
        point_distribution_np = to_np(point_distribution)  # batch x time x 2
        mask_np = to_np(mask)  # batch x time
        point_distribution_np = point_distribution_np - np.min(
            point_distribution_np
        ) + 1e-2  # minus the min value, so that all values are non-negative
        point_distribution_np = point_distribution_np * np.expand_dims(
            mask_np, -1)  # batch x time x 2
        indices = np.argmax(point_distribution_np, 1)  # batch x 2
        indices = to_pt(np.array(indices), self.use_cuda)  # batch x 2
        return indices
Example #19
    def get_game_step_info(self, obs: List[str], infos: Dict[str, List[Any]]):
        """
        Get all the available information and concatenate it into a tensor for
        the neural model. We use post padding here; all information is tokenized here.

        Arguments:
            obs: Previous command's feedback for each game.
            infos: Additional information for each game.
        """
        word2id = self.vocab.word2id
        inventory_id_list = get_token_ids_for_items(infos["inventory"],
                                                    word2id,
                                                    tokenizer=self.nlp)

        feedback_id_list = get_token_ids_for_items(obs,
                                                   word2id,
                                                   tokenizer=self.nlp)

        quest_id_list = get_token_ids_for_items(infos["extra.recipe"],
                                                word2id,
                                                tokenizer=self.nlp)

        prev_action_id_list = get_token_ids_for_items(self.prev_actions,
                                                      word2id,
                                                      tokenizer=self.nlp)

        description_id_list = get_token_ids_for_items(infos["description"],
                                                      word2id,
                                                      tokenizer=self.nlp,
                                                      subst_if_empty=['end'])

        description_id_list = [
            _d + _i + _q + _f + _pa
            for (_d, _i, _q, _f, _pa
                 ) in zip(description_id_list, inventory_id_list,
                          quest_id_list, feedback_id_list, prev_action_id_list)
        ]

        input_description = pad_sequences(
            description_id_list,
            maxlen=max_len(description_id_list)).astype('int32')
        input_description = to_pt(input_description, self.use_cuda)

        return input_description, description_id_list
Example #20
def _choose_random_command(word_ranks, word_masks_np, use_cuda):
    """
    Generate a command randomly, for epsilon greedy.

    Arguments:
        word_ranks: Q values for each word by model.action_scorer.
        word_masks_np: Vocabulary masks for words depending on their type (verb, adj, noun, adj2, noun2).
    """

    batch_size = word_ranks[0].size(0)
    # print("batch_size=", batch_size, len(word_masks_np))
    assert len(word_ranks) == len(word_masks_np)

    word_ranks_np = [
        to_np(item) for item in word_ranks
    ]  # list of (batch x n_vocab) arrays, len=5 (5 word output phrases)
    # word_ranks_np = [r - np.min(r) for r in word_ranks_np]  # minus the min value, so that all values are non-negative
    word_ranks_np = [r * m for r, m in zip(word_ranks_np, word_masks_np)
                     ]  # list of batch x n_vocab

    word_indices = []
    for i in range(
            len(word_ranks_np)):  # len=5 (verb, adj1, noun1, adj2, noun2)
        indices = []
        for j in range(batch_size):
            msk = word_masks_np[i][
                j]  # msk is of len = vocab, j is index into batch
            indices.append(np.random.choice(
                len(msk), p=msk /
                np.sum(msk, -1)))  # choose from non-zero entries of msk
        word_indices.append(np.array(indices))
    # word_indices: list of batch

    word_qvalues = [[] for _ in word_masks_np]
    for i in range(batch_size):
        for j in range(len(word_qvalues)):
            word_qvalues[j].append(word_ranks[j][i][word_indices[j][i]])
    word_qvalues = [torch.stack(item) for item in word_qvalues]
    word_indices = [to_pt(item, use_cuda) for item in word_indices]
    word_indices = [item.unsqueeze(-1)
                    for item in word_indices]  # list of batch x 1
    return word_qvalues, word_indices
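
The masked random choice above normalizes the mask into a probability vector, so a masked-out word can never be drawn. In isolation:

import numpy as np

msk = np.array([1.0, 0.0, 0.0, 1.0], dtype="float32")   # vocab of 4; indices 1 and 2 disallowed
idx = np.random.choice(len(msk), p=msk / np.sum(msk, -1))
assert msk[idx] == 1.0   # a masked-out index is never drawn
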
Example #21
File: agent.py Project: yyht/qait_public
    def choose_random_command(self,
                              batch_size,
                              action_space_size,
                              possible_words=None):
        """
        Generate a command randomly, for epsilon greedy.
        """
        action_indices = []
        for i in range(3):
            if possible_words is None:
                indices = np.random.choice(action_space_size, batch_size)
            else:
                indices = []
                for j in range(batch_size):
                    mask_ids = []
                    for w in possible_words[i][j]:
                        if w in self.word2id:
                            mask_ids.append(self.word2id[w])
                    indices.append(np.random.choice(mask_ids))
                indices = np.array(indices)
            action_indices.append(to_pt(indices, self.use_cuda))  # batch
        return action_indices
Example #22
    def act(self, obs: List[str], scores: List[int], dones: List[bool], infos: Dict[str, List[Any]]) -> List[str]:
        """
        Acts upon the current list of observations.

        One text command must be returned for each observation.

        Arguments:
            obs: Previous command's feedback for each game.
            scores: The score obtained so far for each game (at previous step).
            dones: Whether a game is finished (at previous step).
            infos: Additional information for each game.

        Returns:
            Text commands to be performed (one per observation).

        Notes:
            Commands returned for games marked as `done` have no effect.
            The states for finished games are simply copied over until all
            games are done, in which case `CustomAgent.finish()` is called
            instead.
        """
        if not self._epsiode_has_started:
            self._start_episode(obs, infos)

        if self.mode == "eval":
            return self.act_eval(obs, scores, dones, infos)

        if self.current_step > 0:
            # append scores / dones from previous step into memory
            self.scores.append(scores)
            self.dones.append(dones)
            # compute previous step's rewards and masks
            rewards_np, rewards, mask_np, mask = self.compute_reward()

        input_description, description_id_list = self.get_game_step_info(obs, infos)
        # generate commands for one game step; epsilon greedy is applied, i.e.,
        # there is an epsilon chance of generating random commands
        word_ranks = self.get_ranks(input_description)  # list of batch x vocab
        _, word_indices_maxq = self.choose_maxQ_command(word_ranks, self.word_masks_np)
        _, word_indices_random = self.choose_random_command(word_ranks, self.word_masks_np)
        # random number for epsilon greedy
        rand_num = np.random.uniform(low=0.0, high=1.0, size=(input_description.size(0), 1))
        less_than_epsilon = (rand_num < self.epsilon).astype("float32")  # batch
        greater_than_epsilon = 1.0 - less_than_epsilon
        less_than_epsilon = to_pt(less_than_epsilon, self.use_cuda, type='float')
        greater_than_epsilon = to_pt(greater_than_epsilon, self.use_cuda, type='float')
        less_than_epsilon, greater_than_epsilon = less_than_epsilon.long(), greater_than_epsilon.long()

        chosen_indices = [less_than_epsilon * idx_random + greater_than_epsilon * idx_maxq for idx_random, idx_maxq in zip(word_indices_random, word_indices_maxq)]
        chosen_indices = [item.detach() for item in chosen_indices]
        chosen_strings = self.get_chosen_strings(chosen_indices)
        self.prev_actions = chosen_strings

        # push info from previous game step into replay memory
        if self.current_step > 0:
            for b in range(len(obs)):
                if mask_np[b] == 0:
                    continue
                is_prior = rewards_np[b] > 0.0
                self.replay_memory.push(is_prior, self.cache_description_id_list[b], [item[b] for item in self.cache_chosen_indices], rewards[b], mask[b], dones[b], description_id_list[b], [item[b] for item in self.word_masks_np])

        # cache new info in current game step into caches
        self.cache_description_id_list = description_id_list
        self.cache_chosen_indices = chosen_indices

        # update neural model by replaying snapshots in replay memory
        if self.current_step > 0 and self.current_step % self.update_per_k_game_steps == 0:
            loss = self.update()
            if loss is not None:
                # Backpropagate
                self.optimizer.zero_grad()
                loss.backward(retain_graph=True)
                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_grad_norm)
                self.optimizer.step()  # apply gradients

        self.current_step += 1

        if all(dones):
            self._end_episode(obs, scores, infos)
            return  # Nothing to return.
        return chosen_strings
def train():

    time_1 = datetime.datetime.now()
    config = generic.load_config()
    agent = Agent(config)
    output_dir = "."
    data_dir = "."

    # make game environments
    requested_infos = agent.select_additional_infos_lite()
    requested_infos_eval = agent.select_additional_infos()
    games_dir = "./"

    # training game env
    env, _ = reinforcement_learning_dataset.get_training_game_env(
        games_dir + config['rl']['data_path'],
        config['rl']['difficulty_level'], config['rl']['training_size'],
        requested_infos, agent.max_nb_steps_per_episode, agent.batch_size)

    if agent.run_eval:
        # evaluation game env
        eval_env, num_eval_game = reinforcement_learning_dataset.get_evaluation_game_env(
            games_dir + config['rl']['data_path'],
            config['rl']['difficulty_level'],
            requested_infos_eval,
            agent.eval_max_nb_steps_per_episode,
            agent.eval_batch_size,
            valid_or_test="valid")
    else:
        eval_env, num_eval_game = None, None

    # visdom
    if config["general"]["visdom"]:
        import visdom
        viz = visdom.Visdom()
        reward_win, step_win = None, None
        dqn_loss_win = None
        eval_game_points_win, eval_step_win = None, None
        viz_game_rewards, viz_game_points, viz_game_points_normalized, viz_graph_rewards, viz_count_rewards, viz_step = [], [], [], [], [], []
        viz_dqn_loss = []
        viz_eval_game_points, viz_eval_game_points_normalized, viz_eval_step = [], [], []

    step_in_total = 0
    episode_no = 0
    running_avg_game_points = HistoryScoreCache(capacity=500)
    running_avg_game_points_normalized = HistoryScoreCache(capacity=500)
    running_avg_graph_rewards = HistoryScoreCache(capacity=500)
    running_avg_count_rewards = HistoryScoreCache(capacity=500)
    running_avg_game_steps = HistoryScoreCache(capacity=500)
    running_avg_dqn_loss = HistoryScoreCache(capacity=500)
    running_avg_game_rewards = HistoryScoreCache(capacity=500)

    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_train_performance_so_far, best_eval_performance_so_far = 0.0, 0.0
    prev_performance = 0.0

    if os.path.exists(data_dir + "/" +
                      agent.load_graph_generation_model_from_tag + ".pt"):
        agent.load_pretrained_graph_generation_model(
            data_dir + "/" + agent.load_graph_generation_model_from_tag +
            ".pt")
    else:
        print(
            "No graph updater module detected... Please check ", data_dir +
            "/" + agent.load_graph_generation_model_from_tag + ".pt")

    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag +
                          "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" +
                                        agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
            agent.update_target_net()
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag +
                                        ".pt")
            agent.update_target_net()

    i_have_seen_these_states = EpisodicCountingMemory(
    )  # episodic counting based memory
    i_am_patient = 0
    perfect_training = 0
    while (True):
        if episode_no > agent.max_episode:
            break
        np.random.seed(episode_no)
        env.seed(episode_no)
        obs, infos = env.reset()
        # filter look and examine actions
        for commands_ in infos["admissible_commands"]:
            for cmd_ in [
                    cmd for cmd in commands_ if cmd != "examine cookbook"
                    and cmd.split()[0] in ["examine", "look"]
            ]:
                commands_.remove(cmd_)
        batch_size = len(obs)

        agent.train()
        agent.init()

        game_name_list = [
            game.metadata["uuid"].split("-")[-1] for game in infos["game"]
        ]
        game_max_score_list = [game.max_score for game in infos["game"]]
        i_have_seen_these_states.reset(
        )  # reset episodic counting based memory
        prev_triplets, chosen_actions = [], []
        prev_step_dones, prev_rewards = [], []
        for _ in range(batch_size):
            prev_triplets.append([])
            chosen_actions.append("restart")
            prev_step_dones.append(0.0)
            prev_rewards.append(0.0)

        prev_h, prev_c = None, None

        observation_strings, action_candidate_list = agent.get_game_info_at_certain_step_lite(
            obs, infos)
        observation_for_counting = copy.copy(observation_strings)
        observation_strings = [
            item + " <sep> " + a
            for item, a in zip(observation_strings, chosen_actions)
        ]
        # generate g_belief begins
        generated_commands = agent.command_generation_greedy_generation(
            observation_strings, prev_triplets)
        current_triplets = agent.update_knowledge_graph_triplets(
            prev_triplets, generated_commands)
        # generate g_belief ends
        i_have_seen_these_states.push(
            current_triplets)  # update init triplets into memory

        if agent.count_reward_lambda > 0:
            agent.reset_binarized_counter(batch_size)
            _ = agent.get_binarized_count(observation_for_counting)

        # we need to store sequences of transitions into memory in order,
        # so we use a cache to keep what the agent returns and push everything
        # into memory together at the end of the game.
        transition_cache = []
        still_running_mask = []
        game_rewards, game_points, graph_rewards, count_rewards = [], [], [], []
        print_actions = []

        act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode
        for step_no in range(agent.max_nb_steps_per_episode):
            if agent.noisy_net:
                agent.reset_noise()  # Draw a new set of noisy weights

            new_chosen_actions, chosen_indices, prev_h, prev_c = agent.act(
                observation_strings,
                current_triplets,
                action_candidate_list,
                previous_h=prev_h,
                previous_c=prev_c,
                random=act_randomly)
            replay_info = [
                observation_strings, action_candidate_list, chosen_indices,
                current_triplets, chosen_actions
            ]
            transition_cache.append(replay_info)
            chosen_actions = new_chosen_actions
            chosen_actions_before_parsing = [
                item[idx] for item, idx in zip(infos["admissible_commands"],
                                               chosen_indices)
            ]
            obs, scores, dones, infos = env.step(chosen_actions_before_parsing)
            # filter look and examine actions
            for commands_ in infos["admissible_commands"]:
                for cmd_ in [
                        cmd for cmd in commands_ if cmd != "examine cookbook"
                        and cmd.split()[0] in ["examine", "look"]
                ]:
                    commands_.remove(cmd_)
            prev_triplets = current_triplets
            observation_strings, action_candidate_list = agent.get_game_info_at_certain_step_lite(
                obs, infos)
            observation_for_counting = copy.copy(observation_strings)
            observation_strings = [
                item + " <sep> " + a
                for item, a in zip(observation_strings, chosen_actions)
            ]
            # generate g_belief begins
            generated_commands = agent.command_generation_greedy_generation(
                observation_strings, prev_triplets)
            current_triplets = agent.update_knowledge_graph_triplets(
                prev_triplets, generated_commands)
            # generate g_belief ends
            has_not_seen = i_have_seen_these_states.has_not_seen(
                current_triplets)
            i_have_seen_these_states.push(
                current_triplets)  # update the current triplets into memory

            if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0:
                agent.reset_noise()  # Draw a new set of noisy weights

            if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0:
                dqn_loss, _ = agent.update_dqn(episode_no)
                if dqn_loss is not None:
                    running_avg_dqn_loss.push(dqn_loss)

            if step_no == agent.max_nb_steps_per_episode - 1:
                # terminate the game because DQN requires one extra step
                dones = [True for _ in dones]

            step_in_total += 1
            still_running = [1.0 - float(item)
                             for item in prev_step_dones]  # list of float
            prev_step_dones = dones
            step_rewards = [
                float(curr) - float(prev)
                for curr, prev in zip(scores, prev_rewards)
            ]  # list of float
            game_points.append(copy.copy(step_rewards))
            if agent.use_negative_reward:
                step_rewards = [
                    -1.0 if _lost else r
                    for r, _lost in zip(step_rewards, infos["has_lost"])
                ]  # list of float
                step_rewards = [
                    5.0 if _won else r
                    for r, _won in zip(step_rewards, infos["has_won"])
                ]  # list of float
            prev_rewards = scores
            if agent.fully_observable_graph:
                step_graph_rewards = [0.0 for _ in range(batch_size)]
            else:
                step_graph_rewards = agent.get_graph_rewards(
                    prev_triplets, current_triplets)  # list of float
                step_graph_rewards = [
                    r * float(m)
                    for r, m in zip(step_graph_rewards, has_not_seen)
                ]
            # counting bonus
            if agent.count_reward_lambda > 0:
                step_revisit_counting_rewards = agent.get_binarized_count(
                    observation_for_counting, update=True)
                step_revisit_counting_rewards = [
                    r * agent.count_reward_lambda
                    for r in step_revisit_counting_rewards
                ]
            else:
                step_revisit_counting_rewards = [
                    0.0 for _ in range(batch_size)
                ]
            still_running_mask.append(still_running)
            game_rewards.append(step_rewards)
            graph_rewards.append(step_graph_rewards)
            count_rewards.append(step_revisit_counting_rewards)
            print_actions.append(
                chosen_actions_before_parsing[0] if still_running[0] else "--")

            # if all ended, break
            if np.sum(still_running) == 0:
                break

        still_running_mask_np = np.array(still_running_mask)
        game_rewards_np = np.array(
            game_rewards) * still_running_mask_np  # step x batch
        game_points_np = np.array(
            game_points) * still_running_mask_np  # step x batch
        graph_rewards_np = np.array(
            graph_rewards) * still_running_mask_np  # step x batch
        count_rewards_np = np.array(
            count_rewards) * still_running_mask_np  # step x batch
        if agent.graph_reward_lambda > 0.0:
            graph_rewards_pt = generic.to_pt(graph_rewards_np,
                                             enable_cuda=agent.use_cuda,
                                             type='float')  # step x batch
        else:
            graph_rewards_pt = generic.to_pt(np.zeros_like(graph_rewards_np),
                                             enable_cuda=agent.use_cuda,
                                             type='float')  # step x batch
        if agent.count_reward_lambda > 0.0:
            count_rewards_pt = generic.to_pt(count_rewards_np,
                                             enable_cuda=agent.use_cuda,
                                             type='float')  # step x batch
        else:
            count_rewards_pt = generic.to_pt(np.zeros_like(count_rewards_np),
                                             enable_cuda=agent.use_cuda,
                                             type='float')  # step x batch
        command_rewards_pt = generic.to_pt(game_rewards_np,
                                           enable_cuda=agent.use_cuda,
                                           type='float')  # step x batch

        # push experience into replay buffer (dqn)
        avg_rewards_in_buffer = agent.dqn_memory.avg_rewards()
        for b in range(game_rewards_np.shape[1]):
            if still_running_mask_np.shape[
                    0] == agent.max_nb_steps_per_episode and still_running_mask_np[
                        -1][b] != 0:
                # need to pad one transition
                _need_pad = True
                tmp_game_rewards = game_rewards_np[:, b].tolist() + [0.0]
            else:
                _need_pad = False
                tmp_game_rewards = game_rewards_np[:, b]
            if np.mean(
                    tmp_game_rewards
            ) < avg_rewards_in_buffer * agent.buffer_reward_threshold:
                continue
            for i in range(game_rewards_np.shape[0]):
                observation_strings, action_candidate_list, chosen_indices, _triplets, prev_action_strings = transition_cache[
                    i]
                is_final = True
                if still_running_mask_np[i][b] != 0:
                    is_final = False
                agent.dqn_memory.add(
                    observation_strings[b], prev_action_strings[b],
                    action_candidate_list[b], chosen_indices[b], _triplets[b],
                    command_rewards_pt[i][b], graph_rewards_pt[i][b],
                    count_rewards_pt[i][b], is_final)
                if still_running_mask_np[i][b] == 0:
                    break
            if _need_pad:
                observation_strings, action_candidate_list, chosen_indices, _triplets, prev_action_strings = transition_cache[
                    -1]
                agent.dqn_memory.add(observation_strings[b],
                                     prev_action_strings[b],
                                     action_candidate_list[b],
                                     chosen_indices[b], _triplets[b],
                                     command_rewards_pt[-1][b] * 0.0,
                                     graph_rewards_pt[-1][b] * 0.0,
                                     count_rewards_pt[-1][b] * 0.0, True)

        for b in range(batch_size):
            running_avg_game_points.push(np.sum(game_points_np, 0)[b])
            game_max_score_np = np.array(game_max_score_list, dtype="float32")
            running_avg_game_points_normalized.push(
                (np.sum(game_points_np, 0) / game_max_score_np)[b])
            running_avg_game_steps.push(np.sum(still_running_mask_np, 0)[b])
            running_avg_game_rewards.push(np.sum(game_rewards_np, 0)[b])
            running_avg_graph_rewards.push(np.sum(graph_rewards_np, 0)[b])
            running_avg_count_rewards.push(np.sum(count_rewards_np, 0)[b])

        # finish game
        agent.finish_of_episode(episode_no, batch_size)
        episode_no += batch_size

        if episode_no < agent.learn_start_from_this_episode:
            continue
        if agent.report_frequency == 0 or (
                episode_no % agent.report_frequency >
            (episode_no - batch_size) % agent.report_frequency):
            continue
        time_2 = datetime.datetime.now()
        print(
            "Episode: {:3d} | time spent: {:s} | dqn loss: {:2.3f} | game points: {:2.3f} | normalized game points: {:2.3f} | game rewards: {:2.3f} | graph rewards: {:2.3f} | count rewards: {:2.3f} | used steps: {:2.3f}"
            .format(episode_no,
                    str(time_2 - time_1).rsplit(".")[0],
                    running_avg_dqn_loss.get_avg(),
                    running_avg_game_points.get_avg(),
                    running_avg_game_points_normalized.get_avg(),
                    running_avg_game_rewards.get_avg(),
                    running_avg_graph_rewards.get_avg(),
                    running_avg_count_rewards.get_avg(),
                    running_avg_game_steps.get_avg()))
        print(game_name_list[0] + ":    " + " | ".join(print_actions))

        # evaluate
        curr_train_performance = running_avg_game_points_normalized.get_avg()
        eval_game_points, eval_game_points_normalized, eval_game_step = 0.0, 0.0, 0.0
        eval_command_generation_f1 = 0.0
        if agent.run_eval:
            eval_game_points, eval_game_points_normalized, eval_game_step, eval_command_generation_f1, detailed_scores = evaluate.evaluate_belief_mode(
                eval_env, agent, num_eval_game)
            curr_eval_performance = eval_game_points_normalized
            curr_performance = curr_eval_performance
            if curr_eval_performance > best_eval_performance_so_far:
                best_eval_performance_so_far = curr_eval_performance
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")
            elif curr_eval_performance == best_eval_performance_so_far:
                if curr_eval_performance > 0.0:
                    agent.save_model_to_path(output_dir + "/" +
                                             agent.experiment_tag +
                                             "_model.pt")
                else:
                    if curr_train_performance >= best_train_performance_so_far:
                        agent.save_model_to_path(output_dir + "/" +
                                                 agent.experiment_tag +
                                                 "_model.pt")
        else:
            curr_eval_performance = 0.0
            detailed_scores = ""
            curr_performance = curr_train_performance
            if curr_train_performance >= best_train_performance_so_far:
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")
        # update best train performance
        if curr_train_performance >= best_train_performance_so_far:
            best_train_performance_so_far = curr_train_performance

        if prev_performance <= curr_performance:
            i_am_patient = 0
        else:
            i_am_patient += 1
        prev_performance = curr_performance

        # if i_am_patient >= agent.patience, reload from the last good checkpoint
        if agent.patience > 0 and i_am_patient >= agent.patience:
            if os.path.exists(output_dir + "/" + agent.experiment_tag +
                              "_model.pt"):
                print('reload from a good checkpoint...')
                agent.load_pretrained_model(output_dir + "/" +
                                            agent.experiment_tag + "_model.pt",
                                            load_partial_graph=False)
                agent.update_target_net()
                i_am_patient = 0

        if running_avg_game_points_normalized.get_avg() >= 0.95:
            perfect_training += 1
        else:
            perfect_training = 0

        # plot using visdom
        if config["general"]["visdom"]:
            viz_game_rewards.append(running_avg_game_rewards.get_avg())
            viz_game_points.append(running_avg_game_points.get_avg())
            viz_game_points_normalized.append(
                running_avg_game_points_normalized.get_avg())
            viz_graph_rewards.append(running_avg_graph_rewards.get_avg())
            viz_count_rewards.append(running_avg_count_rewards.get_avg())
            viz_step.append(running_avg_game_steps.get_avg())
            viz_dqn_loss.append(running_avg_dqn_loss.get_avg())
            viz_eval_game_points.append(eval_game_points)
            viz_eval_game_points_normalized.append(eval_game_points_normalized)
            viz_eval_step.append(eval_game_step)
            viz_x = np.arange(len(viz_game_rewards)).tolist()

            if reward_win is None:
                reward_win = viz.line(X=viz_x,
                                      Y=viz_game_rewards,
                                      opts=dict(title=agent.experiment_tag +
                                                "_game_rewards"),
                                      name="game_rewards")
                viz.line(X=viz_x,
                         Y=viz_graph_rewards,
                         opts=dict(title=agent.experiment_tag +
                                   "_graph_rewards"),
                         win=reward_win,
                         update='append',
                         name="graph_rewards")
                viz.line(X=viz_x,
                         Y=viz_count_rewards,
                         opts=dict(title=agent.experiment_tag +
                                   "_count_rewards"),
                         win=reward_win,
                         update='append',
                         name="count_rewards")
                viz.line(X=viz_x,
                         Y=viz_game_points,
                         opts=dict(title=agent.experiment_tag +
                                   "_game_points"),
                         win=reward_win,
                         update='append',
                         name="game_points")
                viz.line(X=viz_x,
                         Y=viz_game_points_normalized,
                         opts=dict(title=agent.experiment_tag +
                                   "_game_points_normalized"),
                         win=reward_win,
                         update='append',
                         name="game_points_normalized")
            else:
                viz.line(X=[len(viz_game_rewards) - 1],
                         Y=[viz_game_rewards[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_game_rewards"),
                         win=reward_win,
                         update='append',
                         name="game_rewards")
                viz.line(X=[len(viz_graph_rewards) - 1],
                         Y=[viz_graph_rewards[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_graph_rewards"),
                         win=reward_win,
                         update='append',
                         name="graph_rewards")
                viz.line(X=[len(viz_count_rewards) - 1],
                         Y=[viz_count_rewards[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_count_rewards"),
                         win=reward_win,
                         update='append',
                         name="count_rewards")
                viz.line(X=[len(viz_game_points) - 1],
                         Y=[viz_game_points[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_game_points"),
                         win=reward_win,
                         update='append',
                         name="game_points")
                viz.line(X=[len(viz_game_points_normalized) - 1],
                         Y=[viz_game_points_normalized[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_game_points_normalized"),
                         win=reward_win,
                         update='append',
                         name="game_points_normalized")

            if step_win is None:
                step_win = viz.line(X=viz_x,
                                    Y=viz_step,
                                    opts=dict(title=agent.experiment_tag +
                                              "_step"),
                                    name="step")
            else:
                viz.line(X=[len(viz_step) - 1],
                         Y=[viz_step[-1]],
                         opts=dict(title=agent.experiment_tag + "_step"),
                         win=step_win,
                         update='append',
                         name="step")

            if dqn_loss_win is None:
                dqn_loss_win = viz.line(X=viz_x,
                                        Y=viz_dqn_loss,
                                        opts=dict(title=agent.experiment_tag +
                                                  "_dqn_loss"),
                                        name="dqn loss")
            else:
                viz.line(X=[len(viz_dqn_loss) - 1],
                         Y=[viz_dqn_loss[-1]],
                         opts=dict(title=agent.experiment_tag + "_dqn_loss"),
                         win=dqn_loss_win,
                         update='append',
                         name="dqn loss")

            if eval_game_points_win is None:
                eval_game_points_win = viz.line(
                    X=viz_x,
                    Y=viz_eval_game_points,
                    opts=dict(title=agent.experiment_tag +
                              "_eval_game_points"),
                    name="eval game points")
                viz.line(X=viz_x,
                         Y=viz_eval_game_points_normalized,
                         opts=dict(title=agent.experiment_tag +
                                   "_eval_game_points_normalized"),
                         win=eval_game_points_win,
                         update='append',
                         name="eval_game_points_normalized")
            else:
                viz.line(X=[len(viz_eval_game_points) - 1],
                         Y=[viz_eval_game_points[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_eval_game_points"),
                         win=eval_game_points_win,
                         update='append',
                         name="eval game_points")
                viz.line(X=[len(viz_eval_game_points_normalized) - 1],
                         Y=[viz_eval_game_points_normalized[-1]],
                         opts=dict(title=agent.experiment_tag +
                                   "_eval_game_points_normalized"),
                         win=eval_game_points_win,
                         update='append',
                         name="eval_game_points_normalized")

            if eval_step_win is None:
                eval_step_win = viz.line(X=viz_x,
                                         Y=viz_eval_step,
                                         opts=dict(title=agent.experiment_tag +
                                                   "_eval_step"),
                                         name="eval step")
            else:
                viz.line(X=[len(viz_eval_step) - 1],
                         Y=[viz_eval_step[-1]],
                         opts=dict(title=agent.experiment_tag + "_eval_step"),
                         win=eval_step_win,
                         update='append',
                         name="eval step")

        # write accuracies down into file
        _s = json.dumps({
            "time spent":
            str(time_2 - time_1).rsplit(".")[0],
            "dqn loss":
            str(running_avg_dqn_loss.get_avg()),
            "train game points":
            str(running_avg_game_points.get_avg()),
            "train normalized game points":
            str(running_avg_game_points_normalized.get_avg()),
            "train game rewards":
            str(running_avg_game_rewards.get_avg()),
            "train graph rewards":
            str(running_avg_graph_rewards.get_avg()),
            "train count rewards":
            str(running_avg_count_rewards.get_avg()),
            "train steps":
            str(running_avg_game_steps.get_avg()),
            "eval game points":
            str(eval_game_points),
            "eval normalized game points":
            str(eval_game_points_normalized),
            "eval command generation f1":
            str(eval_command_generation_f1),
            "eval steps":
            str(eval_game_step),
            "detailed scores":
            detailed_scores
        })
        with open(output_dir + "/" + json_file_name + '.json',
                  'a+') as outfile:
            outfile.write(_s + '\n')
            outfile.flush()

        if curr_performance == 1.0 and curr_train_performance >= 0.95:
            break
        if perfect_training >= 3:
            break
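
The running averages above (running_avg_dqn_loss, running_avg_game_points, etc.) all follow the same push/get_avg pattern with a fixed capacity. HistoryScoreCache itself is not shown in this listing; the sketch below only assumes it behaves as a fixed-capacity running average consistent with how it is called here, and is an illustration rather than the project's implementation.

from collections import deque

import numpy as np


class HistoryScoreCacheSketch(object):
    """Keeps the last `capacity` scalar scores and exposes their mean."""

    def __init__(self, capacity=50):
        self.memory = deque(maxlen=capacity)

    def push(self, score):
        # a deque with maxlen silently drops the oldest item when full
        self.memory.append(float(score))

    def get_avg(self):
        if len(self.memory) == 0:
            return 0.0
        return float(np.mean(self.memory))


# usage mirrors running_avg_dqn_loss.push(...) / .get_avg() in the loop above
running_avg = HistoryScoreCacheSketch(capacity=500)
for loss in [0.8, 0.6, 0.5]:
    running_avg.push(loss)
print(running_avg.get_avg())  # 0.6333...
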
Example #24
0
def train():

    time_1 = datetime.datetime.now()

    with open("config.yaml") as reader:
        config = yaml.safe_load(reader)
    if config['general']['dataset'] == "squad":
        env = GamifiedSquad(config)
    else:
        env = GamifiedNewsQA(config)
    env.split_reset("train")
    agent = Agent()

    # visdom
    viz = visdom.Visdom()
    plt_win = None
    eval_plt_win = None
    plt_q_value_win = None
    plt_steps_win = None
    eval_plt_steps_win = None
    viz_avg_correct_state_acc, viz_avg_qa_acc = [], []
    viz_avg_correct_state_q_value = []
    viz_eval_correct_state_acc, viz_eval_qa_acc, viz_eval_steps = [], [], []
    viz_avg_steps = []

    step_in_total = 0
    episode_no = 0
    running_avg_qa_acc = HistoryScoreCache(capacity=50)
    running_avg_correct_state_acc = HistoryScoreCache(capacity=50)
    running_avg_qa_loss = HistoryScoreCache(capacity=50)
    running_avg_correct_state_loss = HistoryScoreCache(capacity=50)
    running_avg_correct_state_q_value = HistoryScoreCache(capacity=50)
    running_avg_steps = HistoryScoreCache(capacity=50)

    output_dir, data_dir = ".", "."
    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_qa_acc_so_far = 0.0
    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag +
                          "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" +
                                        agent.experiment_tag + "_model.pt")
            agent.update_target_net()
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag +
                                        ".pt")
            agent.update_target_net()

    while (True):
        if episode_no > agent.max_episode:
            break
        np.random.seed(episode_no)
        env.seed(episode_no)
        obs, infos = env.reset()
        print(
            "====================================================================================",
            episode_no)
        print("-- Q: %s" % (infos[0]["q"].encode('utf-8')))
        print("-- A: %s" % (infos[0]["a"][0].encode('utf-8')))

        agent.train()
        agent.init(obs, infos)
        quest_list = agent.get_game_quest_info(infos)
        input_quest, input_quest_char, quest_id_list = agent.get_agent_inputs(
            quest_list)
        tmp_replay_buffer = []
        print_cmds = []
        batch_size = len(obs)

        act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode
        for step_no in range(agent.max_nb_steps_per_episode):
            # generate commands
            if agent.noisy_net:
                agent.reset_noise()  # Draw a new set of noisy weights
            commands, replay_info = agent.act(obs,
                                              infos,
                                              input_quest,
                                              input_quest_char,
                                              quest_id_list,
                                              random=act_randomly)
            obs, infos = env.step(commands)

            if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0:
                agent.reset_noise()  # Draw a new set of noisy weights

            if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0:
                interaction_loss, interaction_q_value = agent.update_interaction()
                if interaction_loss is not None:
                    running_avg_correct_state_loss.push(interaction_loss)
                    running_avg_correct_state_q_value.push(interaction_q_value)
                qa_loss = agent.update_qa()
                if qa_loss is not None:
                    running_avg_qa_loss.push(qa_loss)

            step_in_total += 1
            still_running = generic.to_np(replay_info[-1])
            print_cmds.append(commands[0] if still_running[0] else "--")

            # force stopping
            if step_no == agent.max_nb_steps_per_episode - 1:
                replay_info[-1] = torch.zeros_like(replay_info[-1])
            tmp_replay_buffer.append(replay_info)
            if np.sum(still_running) == 0:
                break

        print(" / ".join(print_cmds).encode('utf-8'))
        # The agent has exhausted all steps, now answer question.
        chosen_head_tails = agent.answer_question_act(agent.naozi.get(),
                                                      quest_list)  # batch
        chosen_head_tails_np = generic.to_np(chosen_head_tails)
        chosen_answer_strings = generic.get_answer_strings(
            agent.naozi.get(), chosen_head_tails_np)
        answer_strings = [item["a"] for item in infos]

        qa_reward_np = generic.get_qa_reward(chosen_answer_strings,
                                             answer_strings)
        correct_state_reward_np = generic.get_sufficient_info_reward(
            agent.naozi.get(), answer_strings)
        correct_state_reward = generic.to_pt(correct_state_reward_np,
                                             enable_cuda=agent.use_cuda,
                                             type='float')  # batch

        # push qa experience into qa replay buffer
        for b in range(batch_size):  # data points in batch
            is_prior = (qa_reward_np[b] > agent.qa_reward_prior_threshold *
                        agent.qa_replay_memory.avg_rewards())
            # if the agent is not in the correct state, do not push it into replay buffer
            if np.mean(correct_state_reward_np[b]) == 0.0:
                continue
            agent.qa_replay_memory.push(is_prior, qa_reward_np[b],
                                        agent.naozi.get(b), quest_list[b],
                                        answer_strings[b])

        # small positive reward whenever it answers question correctly
        masks_np = [generic.to_np(item[-1]) for item in tmp_replay_buffer]
        command_rewards_np = []
        for i in range(len(tmp_replay_buffer)):
            if i == len(tmp_replay_buffer) - 1:
                r = correct_state_reward * tmp_replay_buffer[i][-1]
                r_np = correct_state_reward_np * masks_np[i]
            else:
                # give reward only at that one game step, not all
                r = correct_state_reward * (tmp_replay_buffer[i][-1] -
                                            tmp_replay_buffer[i + 1][-1])
                r_np = correct_state_reward_np * (masks_np[i] -
                                                  masks_np[i + 1])
            tmp_replay_buffer[i].append(r)
            command_rewards_np.append(r_np)
        command_rewards_np = np.array(command_rewards_np)
        print(command_rewards_np[:, 0])

        # push experience into replay buffer
        for b in range(len(correct_state_reward_np)):
            is_prior = np.sum(command_rewards_np, 0)[b] > 0.0
            for i in range(len(tmp_replay_buffer)):
                batch_description_list, batch_chosen_indices, batch_chosen_ctrlf_indices, _, batch_rewards = tmp_replay_buffer[
                    i]
                is_final = True
                if masks_np[i][b] != 0:
                    is_final = False
                agent.replay_memory.push(is_prior, batch_description_list[b],
                                         quest_list[b],
                                         batch_chosen_indices[b],
                                         batch_chosen_ctrlf_indices[b],
                                         batch_rewards[b], is_final)
                if masks_np[i][b] == 0.0:
                    break

        qa_acc = np.mean(qa_reward_np)
        correct_state_acc = np.mean(correct_state_reward_np)
        step_masks_np = np.sum(np.array(masks_np), 0)  # batch
        for i in range(len(qa_reward_np)):
            # if the answer is totally wrong, we assume it used all steps
            if qa_reward_np[i] == 0.0:
                step_masks_np[i] = agent.max_nb_steps_per_episode
        used_steps = np.mean(step_masks_np)

        running_avg_qa_acc.push(qa_acc)
        running_avg_correct_state_acc.push(correct_state_acc)
        running_avg_steps.push(used_steps)
        print_rewards = np.sum(np.mean(command_rewards_np, -1))

        obs_string = agent.naozi.get(0)
        print("-- OBS: %s" % (obs_string.encode('utf-8')))
        print("-- PRED: %s" % (chosen_answer_strings[0].encode('utf-8')))
        # finish game

        agent.finish_of_episode(episode_no, batch_size)
        episode_no += batch_size

        time_2 = datetime.datetime.now()
        print(
            "Episode: {:3d} | time spent: {:s} | interaction loss: {:2.3f} | interaction qvalue: {:2.3f} | qa loss: {:2.3f} | rewards: {:2.3f} | qa acc: {:2.3f}/{:2.3f} | sufficient info: {:2.3f}/{:2.3f} | used steps: {:2.3f}"
            .format(episode_no,
                    str(time_2 - time_1).rsplit(".")[0],
                    running_avg_correct_state_loss.get_avg(),
                    running_avg_correct_state_q_value.get_avg(),
                    running_avg_qa_loss.get_avg(), print_rewards, qa_acc,
                    running_avg_qa_acc.get_avg(), correct_state_acc,
                    running_avg_correct_state_acc.get_avg(),
                    running_avg_steps.get_avg()))

        if episode_no < agent.learn_start_from_this_episode:
            continue
        if agent.report_frequency == 0 or (
                episode_no % agent.report_frequency >
            (episode_no - batch_size) % agent.report_frequency):
            continue
        eval_qa_acc, eval_correct_state_acc, eval_used_steps = 0.0, 0.0, 0.0
        # evaluate
        if agent.run_eval:
            eval_qa_acc, eval_correct_state_acc, eval_used_steps = evaluate.evaluate(
                env, agent, "valid")
            env.split_reset("train")
            # if running eval, save the model by eval accuracy
            if agent.save_frequency > 0 and (
                    episode_no % agent.report_frequency <=
                (episode_no - batch_size) % agent.report_frequency
            ) and eval_qa_acc > best_qa_acc_so_far:
                best_qa_acc_so_far = eval_qa_acc
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")
        # save model
        elif agent.save_frequency > 0 and (
                episode_no % agent.report_frequency <=
            (episode_no - batch_size) % agent.report_frequency):
            if running_avg_qa_acc.get_avg() > best_qa_acc_so_far:
                best_qa_acc_so_far = running_avg_qa_acc.get_avg()
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")

        # plot using visdom
        viz_avg_correct_state_acc.append(
            running_avg_correct_state_acc.get_avg())
        viz_avg_qa_acc.append(running_avg_qa_acc.get_avg())
        viz_avg_correct_state_q_value.append(
            running_avg_correct_state_q_value.get_avg())
        viz_eval_correct_state_acc.append(eval_correct_state_acc)
        viz_eval_qa_acc.append(eval_qa_acc)
        viz_eval_steps.append(eval_used_steps)
        viz_avg_steps.append(running_avg_steps.get_avg())
        viz_x = np.arange(len(viz_avg_correct_state_acc)).tolist()

        if plt_win is None:
            plt_win = viz.line(X=viz_x,
                               Y=viz_avg_correct_state_acc,
                               opts=dict(title=agent.experiment_tag +
                                         "_train"),
                               name="sufficient info")
            viz.line(X=viz_x,
                     Y=viz_avg_qa_acc,
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="qa")
        else:
            viz.line(X=[len(viz_avg_correct_state_acc) - 1],
                     Y=[viz_avg_correct_state_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="sufficient info")
            viz.line(X=[len(viz_avg_qa_acc) - 1],
                     Y=[viz_avg_qa_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="qa")

        if plt_q_value_win is None:
            plt_q_value_win = viz.line(X=viz_x,
                                       Y=viz_avg_correct_state_q_value,
                                       opts=dict(title=agent.experiment_tag +
                                                 "_train_q_value"),
                                       name="sufficient info")
        else:
            viz.line(X=[len(viz_avg_correct_state_q_value) - 1],
                     Y=[viz_avg_correct_state_q_value[-1]],
                     opts=dict(title=agent.experiment_tag + "_train_q_value"),
                     win=plt_q_value_win,
                     update='append',
                     name="sufficient info")

        if plt_steps_win is None:
            plt_steps_win = viz.line(X=viz_x,
                                     Y=viz_avg_steps,
                                     opts=dict(title=agent.experiment_tag +
                                               "_train_step"),
                                     name="used steps")
        else:
            viz.line(X=[len(viz_avg_steps) - 1],
                     Y=[viz_avg_steps[-1]],
                     opts=dict(title=agent.experiment_tag + "_train_step"),
                     win=plt_steps_win,
                     update='append',
                     name="used steps")

        if eval_plt_win is None:
            eval_plt_win = viz.line(X=viz_x,
                                    Y=viz_eval_correct_state_acc,
                                    opts=dict(title=agent.experiment_tag +
                                              "_eval"),
                                    name="sufficient info")
            viz.line(X=viz_x,
                     Y=viz_eval_qa_acc,
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="qa")
        else:
            viz.line(X=[len(viz_eval_correct_state_acc) - 1],
                     Y=[viz_eval_correct_state_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="sufficient info")
            viz.line(X=[len(viz_eval_qa_acc) - 1],
                     Y=[viz_eval_qa_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="qa")

        if eval_plt_steps_win is None:
            eval_plt_steps_win = viz.line(
                X=viz_x,
                Y=viz_eval_steps,
                opts=dict(title=agent.experiment_tag + "_eval_step"),
                name="used steps")
        else:
            viz.line(X=[len(viz_eval_steps) - 1],
                     Y=[viz_eval_steps[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval_step"),
                     win=eval_plt_steps_win,
                     update='append',
                     name="used steps")

        # write accuracies down into file
        _s = json.dumps({
            "time spent":
            str(time_2 - time_1).rsplit(".")[0],
            "sufficient info":
            str(running_avg_correct_state_acc.get_avg()),
            "qa":
            str(running_avg_qa_acc.get_avg()),
            "sufficient qvalue":
            str(running_avg_correct_state_q_value.get_avg()),
            "eval sufficient info":
            str(eval_correct_state_acc),
            "eval qa":
            str(eval_qa_acc),
            "eval steps":
            str(eval_used_steps),
            "used steps":
            str(running_avg_steps.get_avg())
        })
        with open(output_dir + "/" + json_file_name + '.json',
                  'a+') as outfile:
            outfile.write(_s + '\n')
            outfile.flush()
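
The per-step command rewards in the loop above come from the "still running" masks: the terminal (sufficient-info) reward is attributed only to the step at which each batch element actually stopped, via the difference masks_np[i] - masks_np[i + 1]. Below is a minimal numpy illustration of that trick; all names and numbers are hypothetical.

import numpy as np

# masks[t][b] == 1.0 while batch element b is still interacting at step t
masks = np.array([[1.0, 1.0],
                  [1.0, 0.0],
                  [0.0, 0.0]], dtype="float32")   # 3 steps, batch of 2
final_reward = np.array([1.0, 1.0], dtype="float32")

step_rewards = []
for i in range(len(masks)):
    if i == len(masks) - 1:
        r = final_reward * masks[i]                   # last step keeps its own mask
    else:
        r = final_reward * (masks[i] - masks[i + 1])  # 1 -> 0 transition gets the reward
    step_rewards.append(r)

print(np.array(step_rewards))
# [[0. 1.]
#  [1. 0.]
#  [0. 0.]]
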
Example #25
0
    def act(self,
            obs,
            infos,
            input_quest,
            input_quest_char,
            quest_id_list,
            random=False):

        with torch.no_grad():
            if self.mode == "eval":
                return self.act_greedy(obs, infos, input_quest,
                                       input_quest_char, quest_id_list)
            if random:
                return self.act_random(obs, infos, input_quest,
                                       input_quest_char, quest_id_list)
            batch_size = len(obs)

            # update inputs for answerer
            if self.not_finished_yet is None:
                self.not_finished_yet = np.ones((len(obs), ), dtype="float32")
                self.naozi.push_batch(copy.copy(obs))
            else:
                for i in range(batch_size):
                    if self.not_finished_yet[i] == 1.0:
                        self.naozi.push_one(i, copy.copy(obs[i]))

            description_list = self.naozi.get()
            input_description, input_description_char, description_id_list = self.get_agent_inputs(
                description_list)
            ctrlf_word_mask, ctrlf_word_ids = self.get_word_mask(
                quest_id_list, description_id_list)
            # generate commands for one game step; epsilon-greedy is applied,
            # i.e., with probability epsilon a random command is generated
            action_rank, ctrlf_rank = self.get_ranks(
                input_description,
                input_description_char,
                input_quest,
                input_quest_char,
                ctrlf_word_mask,
                use_model="online")  # list of batch x vocab
            action_indices_maxq = self.choose_maxQ_command(action_rank)
            action_indices_random = self.choose_random_command(action_rank)
            ctrlf_indices_maxq = self.choose_maxQ_command(
                ctrlf_rank, ctrlf_word_mask)
            ctrlf_indices_random = self.choose_random_command(
                ctrlf_rank, ctrlf_word_ids)
            # random number for epsilon greedy
            rand_num = np.random.uniform(low=0.0,
                                         high=1.0,
                                         size=(input_description.size(0), 1))
            less_than_epsilon = (rand_num < self.epsilon).astype(
                "float32")  # batch
            greater_than_epsilon = 1.0 - less_than_epsilon
            less_than_epsilon = to_pt(less_than_epsilon,
                                      self.use_cuda,
                                      type='long')
            greater_than_epsilon = to_pt(greater_than_epsilon,
                                         self.use_cuda,
                                         type='long')

            chosen_indices = less_than_epsilon * action_indices_random + greater_than_epsilon * action_indices_maxq
            chosen_ctrlf_indices = less_than_epsilon * ctrlf_indices_random + greater_than_epsilon * ctrlf_indices_maxq
            chosen_strings = self.generate_commands(chosen_indices,
                                                    chosen_ctrlf_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "stop":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "stop":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step was still interacting; kept because DQN needs one extra step of computation for the final transition
            replay_info = [
                description_list, chosen_indices, chosen_ctrlf_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info
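
The epsilon-greedy selection above avoids Python-side branching: the Bernoulli draw is converted into 0/1 long tensors, and the random and greedy indices are mixed arithmetically. A minimal torch sketch of the same pattern follows; shapes and values are illustrative only.

import numpy as np
import torch

batch_size, vocab_size, epsilon = 4, 10, 0.3
q_values = torch.randn(batch_size, vocab_size)

idx_maxq = q_values.argmax(dim=1, keepdim=True)             # batch x 1
idx_random = torch.randint(0, vocab_size, (batch_size, 1))  # batch x 1

rand_num = np.random.uniform(low=0.0, high=1.0, size=(batch_size, 1))
less_than_epsilon = torch.tensor((rand_num < epsilon).astype("int64"))
greater_than_epsilon = 1 - less_than_epsilon

# the 0/1 long tensors select, per batch element, either the random or the greedy index
chosen = less_than_epsilon * idx_random + greater_than_epsilon * idx_maxq
print(chosen.squeeze(1))
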
Example #26
0
File: agent.py Project: yyht/qait_public
    def answer_question(self,
                        input_observation,
                        input_observation_char,
                        observation_id_list,
                        input_quest,
                        input_quest_char,
                        use_model="online"):
        # first pad answerer_input, and get the mask
        model = self.online_net if use_model == "online" else self.target_net
        batch_size = len(observation_id_list)
        max_length = input_observation.size(1)
        mask = compute_mask(input_observation)  # batch x obs_len

        # noun mask for location question
        if self.question_type in ["location"]:
            location_mask = []
            for i in range(batch_size):
                m = [1 for item in observation_id_list[i]]
                location_mask.append(m)
            location_mask = pad_sequences(location_mask,
                                          maxlen=max_length,
                                          dtype="float32")
            location_mask = to_pt(location_mask,
                                  enable_cuda=self.use_cuda,
                                  type='float')
            assert mask.size() == location_mask.size()
            mask = mask * location_mask

        match_representation_sequence = self.get_match_representations(
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            use_model=use_model)
        pred = model.answer_question(match_representation_sequence,
                                     mask)  # batch x vocab or batch x 2

        # attention sum:
        # a word can appear multiple times in the observation, so we merge
        # its scores together before doing further computations.
        # However, if the answer type is not pointing, we just use a
        # pre-defined mapping that maps 0/1 to their positions in the vocab.
        if self.answer_type == "2 way":
            observation_id_list = []
            max_length = 2
            for i in range(batch_size):
                observation_id_list.append(
                    [self.word2id["0"], self.word2id["1"]])

        observation = to_pt(
            pad_sequences(observation_id_list,
                          maxlen=max_length).astype('int32'), self.use_cuda)
        vocab_distribution = np.zeros(
            (batch_size, len(self.word_vocab)))  # batch x vocab
        vocab_distribution = to_pt(vocab_distribution,
                                   self.use_cuda,
                                   type='float')
        vocab_distribution = vocab_distribution.scatter_add_(
            1, observation, pred)  # batch x vocab
        non_zero_words = []
        for i in range(batch_size):
            non_zero_words.append(list(set(observation_id_list[i])))
        vocab_mask = torch.ne(vocab_distribution, 0).float()
        return vocab_distribution, non_zero_words, vocab_mask
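
The attention-sum step above merges the scores of repeated words by scattering per-token probabilities into a vocabulary-sized distribution with scatter_add_. A self-contained toy example of that operation (hypothetical vocabulary and scores):

import torch

vocab_size, batch_size = 6, 1
# token ids of the (padded) observation; word id 2 appears twice
observation = torch.tensor([[2, 5, 2, 3]])    # batch x obs_len
# per-token scores predicted by the answerer
pred = torch.tensor([[0.1, 0.4, 0.2, 0.3]])   # batch x obs_len

vocab_distribution = torch.zeros(batch_size, vocab_size)
vocab_distribution = vocab_distribution.scatter_add_(1, observation, pred)
print(vocab_distribution)
# tensor([[0.0000, 0.0000, 0.3000, 0.3000, 0.0000, 0.4000]])
# word id 2 accumulates 0.1 + 0.2; ids not in the observation stay at zero
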
Example #27
0
File: agent.py Project: yyht/qait_public
    def get_dqn_loss(self):
        """
        Update neural model in agent. In this example we follow algorithm
        of updating model in dqn with replay memory.
        """
        if len(self.command_generation_replay_memory) < self.replay_batch_size:
            return None

        data = self.command_generation_replay_memory.get_batch(
            self.replay_batch_size, self.multi_step)
        if data is None:
            return None

        obs_list, quest_list, possible_words_list, chosen_indices, rewards, next_obs_list, next_possible_words_list, actual_n_list = data
        batch_size = len(actual_n_list)

        input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list)
        input_observation, input_observation_char, _ = self.get_agent_inputs(
            obs_list)
        next_input_observation, next_input_observation_char, _ = self.get_agent_inputs(
            next_obs_list)

        possible_words, next_possible_words = [], []
        for i in range(3):
            possible_words.append([item[i] for item in possible_words_list])
            next_possible_words.append(
                [item[i] for item in next_possible_words_list])

        local_word_masks = [
            to_pt(item, self.use_cuda, type="float")
            for item in self.get_local_word_masks(possible_words)
        ]
        next_local_word_masks = [
            to_pt(item, self.use_cuda, type="float")
            for item in self.get_local_word_masks(next_possible_words)
        ]

        action_ranks = self.get_ranks(
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            local_word_masks,
            use_model="online"
        )  # list of batch x vocab or list of batch x vocab x atoms
        # ps_a
        word_qvalues = [
            ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1)
            for w_rank, idx in zip(action_ranks, chosen_indices)
        ]  # list of batch or list of batch x atoms
        q_value = torch.mean(torch.stack(word_qvalues, -1),
                             -1)  # batch or batch x atoms
        # log_ps_a
        log_q_value = torch.log(q_value)  # batch or batch x atoms

        with torch.no_grad():
            if self.noisy_net:
                self.target_net.reset_noise()  # Sample new target net noise
            if self.double_dqn:
                # pns Probabilities p(s_t+n, ·; θonline)
                next_action_ranks = self.get_ranks(next_input_observation,
                                                   next_input_observation_char,
                                                   input_quest,
                                                   input_quest_char,
                                                   next_local_word_masks,
                                                   use_model="online")
                # list of batch x vocab or list of batch x vocab x atoms
                # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
                next_word_indices = self.choose_maxQ_command(
                    next_action_ranks,
                    next_local_word_masks)  # list of batch x 1
                # pns # Probabilities p(s_t+n, ·; θtarget)
                next_action_ranks = self.get_ranks(
                    next_input_observation,
                    next_input_observation_char,
                    input_quest,
                    input_quest_char,
                    next_local_word_masks,
                    use_model="target"
                )  # batch x vocab or list of batch x vocab x atoms
                # pns_a # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)
                next_word_qvalues = [
                    ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for
                    w_rank, idx in zip(next_action_ranks, next_word_indices)
                ]  # list of batch or list of batch x atoms
            else:
                # pns Probabilities p(s_t+n, ·; θonline)
                next_action_ranks = self.get_ranks(next_input_observation,
                                                   next_input_observation_char,
                                                   input_quest,
                                                   input_quest_char,
                                                   next_local_word_masks,
                                                   use_model="target")
                # list of batch x vocab or list of batch x vocab x atoms
                next_word_indices = self.choose_maxQ_command(
                    next_action_ranks,
                    next_local_word_masks)  # list of batch x 1
                next_word_qvalues = [
                    ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for
                    w_rank, idx in zip(next_action_ranks, next_word_indices)
                ]  # list of batch or list of batch x atoms

            next_q_value = torch.mean(torch.stack(next_word_qvalues, -1),
                                      -1)  # batch or batch x atoms
            # Compute Tz (Bellman operator T applied to z)
            discount = to_pt((np.ones_like(actual_n_list) *
                              self.discount_gamma)**actual_n_list,
                             self.use_cuda,
                             type="float")
        if not self.use_distributional:
            rewards = rewards + next_q_value * discount  # batch
            loss = F.smooth_l1_loss(q_value, rewards)
            return loss

        with torch.no_grad():
            Tz = rewards.unsqueeze(
                -1) + discount.unsqueeze(-1) * self.support.unsqueeze(
                    0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.v_min,
                          max=self.v_max)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.v_min) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = torch.zeros(batch_size, self.atoms).float()
            if self.use_cuda:
                m = m.cuda()
            offset = torch.linspace(0, ((batch_size - 1) * self.atoms),
                                    batch_size).unsqueeze(1).expand(
                                        batch_size, self.atoms).long()
            if self.use_cuda:
                offset = offset.cuda()
            m.view(-1).index_add_(
                0, (l + offset).view(-1),
                (next_q_value *
                 (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(
                0, (u + offset).view(-1),
                (next_q_value *
                 (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(
            m * log_q_value,
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        loss = torch.mean(loss)
        return loss
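
In the non-distributional branch, the target above is the standard multi-step double-DQN target r + gamma^n * Q_target(s_{t+n}, argmax_a Q_online(s_{t+n}, a)), optimized with a smooth L1 loss. Below is a compact, self-contained sketch of that computation on toy tensors; the function and argument names are illustrative, not part of the project.

import numpy as np
import torch
import torch.nn.functional as F


def n_step_double_dqn_loss(q_online_s, q_online_next, q_target_next,
                           action_indices, rewards, actual_n, gamma=0.9):
    # Q(s_t, a_t) from the online network
    q_value = q_online_s.gather(1, action_indices).squeeze(1)            # batch
    with torch.no_grad():
        # argmax over the online network, evaluated with the target network
        next_actions = q_online_next.argmax(dim=1, keepdim=True)         # batch x 1
        next_q_value = q_target_next.gather(1, next_actions).squeeze(1)  # batch
        discount = torch.tensor((np.ones_like(actual_n) * gamma) ** actual_n,
                                dtype=torch.float32)
        target = rewards + next_q_value * discount                       # batch
    return F.smooth_l1_loss(q_value, target)


# toy usage: batch of 2, 3 actions, 2-step and 3-step returns
loss = n_step_double_dqn_loss(
    q_online_s=torch.randn(2, 3),
    q_online_next=torch.randn(2, 3),
    q_target_next=torch.randn(2, 3),
    action_indices=torch.tensor([[0], [2]]),
    rewards=torch.tensor([1.0, 0.0]),
    actual_n=np.array([2, 3]),
    gamma=0.9)
print(loss)
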
Example #28
0
File: agent.py Project: yyht/qait_public
    def act(self,
            obs,
            infos,
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            possible_words,
            random=False):
        """
        Acts upon the current list of observations.
        One text command must be returned for each observation.
        """
        with torch.no_grad():
            if self.mode == "eval":
                return self.act_greedy(obs, infos, input_observation,
                                       input_observation_char, input_quest,
                                       input_quest_char, possible_words)
            if random:
                return self.act_random(obs, infos, input_observation,
                                       input_observation_char, input_quest,
                                       input_quest_char, possible_words)
            batch_size = len(obs)

            local_word_masks_np = self.get_local_word_masks(possible_words)
            local_word_masks = [
                to_pt(item, self.use_cuda, type="float")
                for item in local_word_masks_np
            ]

            # generate commands for one game step; epsilon-greedy is applied,
            # i.e., with probability epsilon a random command is generated
            action_ranks = self.get_ranks(
                input_observation,
                input_observation_char,
                input_quest,
                input_quest_char,
                local_word_masks,
                use_model="online")  # list of batch x vocab
            word_indices_maxq = self.choose_maxQ_command(
                action_ranks, local_word_masks)
            word_indices_random = self.choose_random_command(
                batch_size, len(self.word_vocab), possible_words)

            # random number for epsilon greedy
            rand_num = np.random.uniform(low=0.0,
                                         high=1.0,
                                         size=(batch_size, ))
            less_than_epsilon = (rand_num < self.epsilon).astype(
                "float32")  # batch
            greater_than_epsilon = 1.0 - less_than_epsilon
            less_than_epsilon = to_pt(less_than_epsilon,
                                      self.use_cuda,
                                      type='long')
            greater_than_epsilon = to_pt(greater_than_epsilon,
                                         self.use_cuda,
                                         type='long')
            chosen_indices = [
                less_than_epsilon * idx_random +
                greater_than_epsilon * idx_maxq
                for idx_random, idx_maxq in zip(word_indices_random,
                                                word_indices_maxq)
            ]
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step was still interacting; kept because DQN needs one extra step of computation for the final transition
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info
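
Both act() methods shown above maintain a prev_step_is_still_interacting mask: an element is marked as finished only one step after it issues the stopping command ("wait" here, "stop" in Example #25), so the replay buffer still records the transition produced by that final command. A small numpy illustration of how the mask evolves; the command strings and variable names below are hypothetical.

import numpy as np

batch_commands = [
    ["go east", "wait"],      # step 0: element 1 issues "wait"
    ["take apple", "wait"],   # step 1
]

prev_actions = [["restart", "restart"]]
still_interacting = np.ones((2,), dtype="float32")

masks_per_step = []
for commands in batch_commands:
    # an element stops interacting one step AFTER it issued "wait",
    # so the transition produced by the "wait" action is still recorded
    for i, prev in enumerate(prev_actions[-1]):
        if prev == "wait":
            still_interacting[i] = 0.0
    masks_per_step.append(still_interacting.copy())
    prev_actions.append(commands)

print(masks_per_step)   # step 0 -> [1., 1.], step 1 -> [1., 0.]
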
Example #29
0
    def get_dqn_loss(self):
        """
        Update neural model in agent. In this example we follow algorithm
        of updating model in dqn with replay memory.
        """
        if len(self.replay_memory) < self.replay_batch_size:
            return None, None

        data = self.replay_memory.get_batch(self.replay_batch_size,
                                            self.multi_step)
        if data is None:
            return None, None
        obs_list, quest_list, action_indices, ctrlf_indices, rewards, next_obs_list, actual_ns = data

        input_observation, input_observation_char, observation_id_list = self.get_agent_inputs(
            obs_list)
        input_quest, input_quest_char, quest_id_list = self.get_agent_inputs(
            quest_list)
        next_input_observation, next_input_observation_char, next_observation_id_list = self.get_agent_inputs(
            next_obs_list)

        ctrlf_word_mask, _ = self.get_word_mask(quest_id_list,
                                                observation_id_list)
        next_ctrlf_word_mask, _ = self.get_word_mask(quest_id_list,
                                                     next_observation_id_list)

        action_rank, ctrlf_rank = self.get_ranks(
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            ctrlf_word_mask,
            use_model="online")  # batch x vocab
        # ps_a
        q_value_action = ez_gather_dim_1(action_rank,
                                         action_indices).squeeze(1)  # batch
        q_value_ctrlf = ez_gather_dim_1(ctrlf_rank,
                                        ctrlf_indices).squeeze(1)  # batch
        is_ctrlf = torch.eq(action_indices, float(self.action2id["ctrl+f"])
                            ).float()  # when the action is ctrl+f, batch
        q_value = (q_value_action + q_value_ctrlf * is_ctrlf) / (
            is_ctrlf + 1)  # masked average
        # q_value = torch.mean(torch.stack([q_value_action, q_value_ctrlf], -1), -1)

        with torch.no_grad():
            if self.noisy_net:
                self.target_net.reset_noise()  # Sample new target net noise

            # pns Probabilities p(s_t+n, ·; θonline)
            next_action_rank, next_ctrlf_rank = self.get_ranks(
                next_input_observation,
                next_input_observation_char,
                input_quest,
                input_quest_char,
                next_ctrlf_word_mask,
                use_model="online")  # batch x vocab
            # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
            next_action_indices = self.choose_maxQ_command(
                next_action_rank)  # batch x 1
            next_ctrlf_indices = self.choose_maxQ_command(
                next_ctrlf_rank, next_ctrlf_word_mask)  # batch x 1
            # pns # Probabilities p(s_t+n, ·; θtarget)
            next_action_rank, next_ctrlf_rank = self.get_ranks(
                next_input_observation,
                next_input_observation_char,
                input_quest,
                input_quest_char,
                next_ctrlf_word_mask,
                use_model="target")  # batch x vocab
            # pns_a # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)
            next_q_value_action = ez_gather_dim_1(next_action_rank,
                                                  next_action_indices).squeeze(
                                                      1)  # batch
            next_q_value_ctrlf = ez_gather_dim_1(next_ctrlf_rank,
                                                 next_ctrlf_indices).squeeze(
                                                     1)  # batch
            next_is_ctrlf = torch.eq(next_action_indices,
                                     float(self.action2id["ctrl+f"])).float(
                                     )  # when the action is ctrl+f, batch
            next_q_value = (next_q_value_action +
                            next_q_value_ctrlf * next_is_ctrlf) / (
                                next_is_ctrlf + 1)  # masked average

            discount = to_pt(
                (np.ones_like(actual_ns) * self.discount_gamma)**actual_ns,
                self.use_cuda,
                type="float")

        rewards = rewards + next_q_value * discount  # batch
        loss = F.smooth_l1_loss(q_value, rewards)
        return loss, q_value
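
The masked average above lets the ctrl+f head contribute to the Q-value only when the chosen action is ctrl+f: (q_action + q_ctrlf * is_ctrlf) / (is_ctrlf + 1). A toy illustration of that combination; the action ids and numbers are hypothetical.

import torch

# hypothetical action vocabulary: 0 = go, 1 = ctrl+f, 2 = stop
CTRLF_ID = 1

q_value_action = torch.tensor([0.8, 0.4])   # batch of 2
q_value_ctrlf = torch.tensor([0.6, 0.2])
action_indices = torch.tensor([1, 2])       # only the first element chose ctrl+f

is_ctrlf = torch.eq(action_indices, CTRLF_ID).float()
q_value = (q_value_action + q_value_ctrlf * is_ctrlf) / (is_ctrlf + 1)
print(q_value)
# tensor([0.7000, 0.4000]) -> average of both heads for ctrl+f, action head only otherwise
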
Example #30
0
def train(data_path):

    time_1 = datetime.datetime.now()
    agent = Agent()

    # visdom
    viz = visdom.Visdom()
    plt_win = None
    eval_plt_win = None
    viz_avg_correct_state_acc, viz_avg_qa_acc = [], []
    viz_eval_sufficient_info_reward, viz_eval_qa_reward = [], []

    step_in_total = 0
    running_avg_qa_reward = generic.HistoryScoreCache(capacity=500)
    running_avg_sufficient_info_reward = generic.HistoryScoreCache(
        capacity=500)
    running_avg_qa_loss = generic.HistoryScoreCache(capacity=500)
    running_avg_correct_state_loss = generic.HistoryScoreCache(capacity=500)

    output_dir, data_dir = ".", "."
    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_sum_reward_so_far = 0.0
    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag +
                          "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" +
                                        agent.experiment_tag + "_model.pt")
            agent.update_target_net()
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag +
                                        ".pt")
            agent.update_target_net()
        else:
            print(
                "Failed to load pretrained model... couldn't find the checkpoint file..."
            )

    # Create temporary folder for the generated games.
    games_dir = tempfile.TemporaryDirectory(
        prefix="tw_games"
    )  # This is not deleted upon error. It would be better to use a with statement.
    games_dir = pjoin(games_dir.name, "")  # So path ends with '/'.
    # copy grammar files into tmp folder so that it works smoothly
    assert os.path.exists(
        "./textworld_data"), "Oh no! textworld_data folder is not there..."
    os.mkdir(games_dir)
    os.mkdir(pjoin(games_dir, "textworld_data"))
    copy_tree("textworld_data", games_dir + "textworld_data")
    if agent.run_eval:
        assert os.path.exists(pjoin(
            data_path,
            agent.testset_path)), "Oh no! test_set folder is not there..."
        os.mkdir(pjoin(games_dir, agent.testset_path))
        copy_tree(pjoin(data_path, agent.testset_path),
                  pjoin(games_dir, agent.testset_path))

    if agent.train_data_size == -1:
        game_queue_size = agent.batch_size * 5
        game_queue = []

    episode_no = 0
    if agent.train_data_size == -1:
        # endless mode
        game_generator_queue = game_generator.game_generator_queue(
            path=games_dir,
            random_map=agent.random_map,
            question_type=agent.question_type,
            max_q_size=agent.batch_size * 2,
            nb_worker=8)
    else:
        # generate the training set
        all_training_games = game_generator.game_generator(
            path=games_dir,
            random_map=agent.random_map,
            question_type=agent.question_type,
            train_data_size=agent.train_data_size)
        all_training_games.sort()
        all_env_ids = None
    while (True):
        if episode_no > agent.max_episode:
            break
        np.random.seed(episode_no)
        if agent.train_data_size == -1:
            # endless mode
            for _ in range(agent.batch_size):
                if not game_generator_queue.empty():
                    tmp_game = game_generator_queue.get()
                    if os.path.exists(tmp_game):
                        game_queue.append(tmp_game)
            if len(game_queue) == 0:
                time.sleep(0.1)
                continue
            can_delete_these = []
            if len(game_queue) > game_queue_size:
                can_delete_these = game_queue[:-game_queue_size]
                game_queue = game_queue[-game_queue_size:]
            sampled_games = np.random.choice(game_queue,
                                             agent.batch_size).tolist()
            env_ids = [
                register_game(gamefile, request_infos=request_infos)
                for gamefile in sampled_games
            ]
        else:
            if all_env_ids is None:
                all_env_ids = [
                    register_game(gamefile, request_infos=request_infos)
                    for gamefile in all_training_games
                ]
            env_ids = np.random.choice(all_env_ids, agent.batch_size).tolist()

        if len(env_ids) != agent.batch_size:  # either less than or greater than
            env_ids = np.random.choice(env_ids, agent.batch_size).tolist()
        env_id = make_batch2(env_ids, parallel=True)
        env = gym.make(env_id)
        env.seed(episode_no)

        obs, infos = env.reset()
        batch_size = len(obs)
        # generate question-answer pairs here
        questions, answers, reward_helper_info = game_generator.generate_qa_pairs(
            infos, question_type=agent.question_type, seed=episode_no)
        print(
            "====================================================================================",
            episode_no)
        print(questions[0], answers[0])

        agent.train()
        agent.init(obs, infos)

        commands, last_facts, init_facts = [], [], []
        commands_per_step, game_facts_cache = [], []
        for i in range(batch_size):
            commands.append("restart")
            last_facts.append(None)
            init_facts.append(None)
            game_facts_cache.append([])
            commands_per_step.append(["restart"])

        observation_strings, possible_words = agent.get_game_info_at_certain_step(
            obs, infos)
        observation_strings = [
            a + " <|> " + item
            for a, item in zip(commands, observation_strings)
        ]
        input_quest, input_quest_char, _ = agent.get_agent_inputs(questions)

        transition_cache = []
        print_cmds = []
        counting_rewards_np = []
        valid_command_rewards_np = []

        act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode
        # push init state into counting reward dict
        state_strings = agent.get_state_strings(infos)
        _ = agent.get_binarized_count(state_strings, update=True)
        for step_no in range(agent.max_nb_steps_per_episode):
            # update answerer input
            for i in range(batch_size):
                if agent.not_finished_yet[i] == 1:
                    agent.naozi.push_one(i, copy.copy(observation_strings[i]))
                if agent.prev_step_is_still_interacting[i] == 1:
                    new_facts = process_facts(last_facts[i], infos["game"][i],
                                              infos["facts"][i],
                                              infos["last_action"][i],
                                              commands[i])
                    game_facts_cache[i].append(
                        new_facts
                    )  # info used in reward computing of existence question
                    last_facts[i] = new_facts
                    if step_no == 0:
                        init_facts[i] = copy.copy(new_facts)

            # generate commands
            if agent.noisy_net:
                agent.reset_noise()  # Draw a new set of noisy weights

            observation_strings_w_history = agent.naozi.get()
            input_observation, input_observation_char, _ = agent.get_agent_inputs(
                observation_strings_w_history)
            commands, replay_info = agent.act(obs,
                                              infos,
                                              input_observation,
                                              input_observation_char,
                                              input_quest,
                                              input_quest_char,
                                              possible_words,
                                              random=act_randomly)
            for i in range(batch_size):
                commands_per_step[i].append(commands[i])

            replay_info = [
                observation_strings_w_history, questions, possible_words
            ] + replay_info
            admissible_commands = [
                set(item) - set(["look", "wait", "inventory"])
                for item in infos["admissible_commands"]
            ]
            vc_rewards = [
                float(c in ac) for c, ac in zip(commands, admissible_commands)
            ]
            valid_command_rewards_np.append(np.array(vc_rewards))

            # pass commands into env
            obs, _, _, infos = env.step(commands)
            # possible words do not depend on history, because one can only interact with what is currently accessible
            observation_strings, possible_words = agent.get_game_info_at_certain_step(
                obs, infos)
            observation_strings = [
                a + " <|> " + item
                for a, item in zip(commands, observation_strings)
            ]
            # counting rewards
            state_strings = agent.get_state_strings(infos)
            c_rewards = agent.get_binarized_count(state_strings, update=True)
            counting_rewards_np.append(np.array(c_rewards))

            if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0:
                agent.reset_noise()  # Draw a new set of noisy weights

            if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0:
                interaction_loss = agent.update_interaction()
                if interaction_loss is not None:
                    running_avg_correct_state_loss.push(interaction_loss)
                qa_loss = agent.update_qa()
                if qa_loss is not None:
                    running_avg_qa_loss.push(qa_loss)

            print_cmds.append(
                commands[0] if agent.prev_step_is_still_interacting[0] else "--")
            # force stopping
            if step_no == agent.max_nb_steps_per_episode - 1:
                replay_info[-1] = torch.zeros_like(replay_info[-1])
            transition_cache.append(replay_info)
            step_in_total += 1
            if (step_no == agent.max_nb_steps_per_episode -
                    1) or (step_no > 0
                           and np.sum(generic.to_np(replay_info[-1])) == 0):
                break

        print(" / ".join(print_cmds))
        # The agent has exhausted all steps, now answer question.
        answerer_input = agent.naozi.get()
        answerer_input_observation, answerer_input_observation_char, answerer_observation_ids = agent.get_agent_inputs(
            answerer_input)

        chosen_word_indices = agent.answer_question_act_greedy(
            answerer_input_observation, answerer_input_observation_char,
            answerer_observation_ids, input_quest, input_quest_char)  # batch
        chosen_word_indices_np = generic.to_np(chosen_word_indices)
        chosen_answers = [
            agent.word_vocab[item] for item in chosen_word_indices_np
        ]
        # rewards
        # qa reward
        qa_reward_np = reward_helper.get_qa_reward(answers, chosen_answers)
        # sufficient info rewards
        masks = [item[-1] for item in transition_cache]
        masks_np = [generic.to_np(item) for item in masks]
        # append a zero row so every game is finished after the last step: 1 1 0 0 0 --> 1 1 0 0 0 0
        game_finishing_mask = np.stack(masks_np + [np.zeros((batch_size, ))],
                                       0)  # game step+1 x batch size
        # first difference marks the step at which each game finishes: 1 1 0 0 0 0 --> 0 1 0 0 0
        game_finishing_mask = (game_finishing_mask[:-1, :] -
                               game_finishing_mask[1:, :])  # game step x batch size
        game_running_mask = np.stack(masks_np, 0)  # game step x batch size

        if agent.question_type == "location":
            # sufficient info reward: location question
            reward_helper_info["observation_before_finish"] = answerer_input
            reward_helper_info["game_finishing_mask"] = game_finishing_mask
            sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_location(
                reward_helper_info)
        elif agent.question_type == "existence":
            # sufficient info reward: existence question
            reward_helper_info["observation_before_finish"] = answerer_input
            # facts before issuing the command (we want to stop at the correct state)
            reward_helper_info["game_facts_per_step"] = game_facts_cache
            reward_helper_info["init_game_facts"] = init_facts
            reward_helper_info["full_facts"] = infos["facts"]
            reward_helper_info["answers"] = answers
            reward_helper_info["game_finishing_mask"] = game_finishing_mask
            sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_existence(
                reward_helper_info)
        elif agent.question_type == "attribute":
            # sufficient info reward: attribute question
            reward_helper_info["answers"] = answers
            # facts before and after issuing commands (we want to compare the difference)
            reward_helper_info["game_facts_per_step"] = game_facts_cache
            reward_helper_info["init_game_facts"] = init_facts
            reward_helper_info["full_facts"] = infos["facts"]
            # the command issued at each step
            reward_helper_info["commands_per_step"] = commands_per_step
            reward_helper_info["game_finishing_mask"] = game_finishing_mask
            sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_attribute(
                reward_helper_info)
        else:
            raise NotImplementedError

        # push qa experience into qa replay buffer
        for b in range(batch_size):  # data points in batch
            # if the agent is not in the correct state, do not push it into replay buffer
            if np.sum(sufficient_info_reward_np[b]) == 0.0:
                continue
            agent.qa_replay_memory.push(False, qa_reward_np[b],
                                        answerer_input[b], questions[b],
                                        answers[b])

        # assign sufficient info reward and counting reward to the corresponding steps
        counting_rewards_np = np.stack(counting_rewards_np, 1)  # batch x game step
        valid_command_rewards_np = np.stack(valid_command_rewards_np, 1)  # batch x game step
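        # total per-step reward: sufficient-info reward plus the count-based and
        # valid-command bonuses, with the bonuses only applied while the game is still running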
        command_rewards_np = (
            sufficient_info_reward_np +
            counting_rewards_np * game_running_mask.T * agent.revisit_counting_lambda +
            valid_command_rewards_np * game_running_mask.T * agent.valid_command_bonus_lambda
        )  # batch x game step
        command_rewards = generic.to_pt(command_rewards_np,
                                        enable_cuda=agent.use_cuda,
                                        type="float")  # batch x game step
        for i in range(command_rewards_np.shape[1]):
            transition_cache[i].append(command_rewards[:, i])
        print(command_rewards_np[0])

        # push command generation experience into replay buffer
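        # transitions from episodes that earned any reward are flagged as priority
        # samples (is_prior), presumably for prioritized sampling from the buffer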
        for b in range(batch_size):
            is_prior = np.sum(command_rewards_np[b], 0) > 0.0
            for i in range(len(transition_cache)):
                (batch_observation_strings, batch_question_strings,
                 batch_possible_words, batch_chosen_indices, _,
                 batch_rewards) = transition_cache[i]
                is_final = True
                if masks_np[i][b] != 0:
                    is_final = False
                agent.command_generation_replay_memory.push(
                    is_prior, batch_observation_strings[b],
                    batch_question_strings[b],
                    [item[b] for item in batch_possible_words],
                    [item[b] for item in batch_chosen_indices],
                    batch_rewards[b], is_final)
                if masks_np[i][b] == 0.0:
                    break

        # for printing
        r_qa = np.mean(qa_reward_np)
        r_sufficient_info = np.mean(np.sum(sufficient_info_reward_np, -1))
        running_avg_qa_reward.push(r_qa)
        running_avg_sufficient_info_reward.push(r_sufficient_info)
        print_rewards = np.mean(np.sum(command_rewards_np, -1))
        obs_string = answerer_input[0]
        print(obs_string)
        # finish game
        agent.finish_of_episode(episode_no, batch_size)
        # close env
        env.close()
        if agent.train_data_size == -1:
            # when games are generated on the fly,
            # remove all files (including .json and .ni) that have been used
            files_to_delete = []
            for gamefile in can_delete_these:
                if not gamefile.endswith(".ulx"):
                    continue
                files_to_delete.append(gamefile)
                files_to_delete.append(gamefile.replace(".ulx", ".json"))
                files_to_delete.append(gamefile.replace(".ulx", ".ni"))
            # print("rm -f {}".format(" ".join(files_to_delete)))
            os.system("rm -f {}".format(" ".join(files_to_delete)))
        episode_no += batch_size

        time_2 = datetime.datetime.now()
        print(
            "Episode: {:3d} | time spent: {:s} | interaction loss: {:2.3f} | qa loss: {:2.3f} | rewards: {:2.3f} | qa acc: {:2.3f}/{:2.3f} | correct state: {:2.3f}/{:2.3f}"
            .format(episode_no,
                    str(time_2 - time_1).rsplit(".")[0],
                    running_avg_correct_state_loss.get_avg(),
                    running_avg_qa_loss.get_avg(), print_rewards, r_qa,
                    running_avg_qa_reward.get_avg(), r_sufficient_info,
                    running_avg_sufficient_info_reward.get_avg()))

        if episode_no < agent.learn_start_from_this_episode:
            continue
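        # only evaluate / checkpoint when a multiple of save_frequency has just been crossed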
        if episode_no == 0 or (episode_no % agent.save_frequency >
                               (episode_no - batch_size) % agent.save_frequency):
            continue
        eval_qa_reward, eval_sufficient_info_reward = 0.0, 0.0
        # evaluate
        if agent.run_eval:
            eval_qa_reward, eval_sufficient_info_reward = evaluate.evaluate(
                data_dir, agent)
            # if eval is enabled, save the model based on eval accuracy
            if eval_qa_reward + eval_sufficient_info_reward > best_sum_reward_so_far:
                best_sum_reward_so_far = eval_qa_reward + eval_sufficient_info_reward
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")
        # save model
        elif agent.save_checkpoint:
            train_sum_reward = (running_avg_qa_reward.get_avg() +
                                running_avg_sufficient_info_reward.get_avg())
            if train_sum_reward > best_sum_reward_so_far:
                best_sum_reward_so_far = train_sum_reward
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")

        # plot using visdom
        viz_avg_correct_state_acc.append(
            running_avg_sufficient_info_reward.get_avg())
        viz_avg_qa_acc.append(running_avg_qa_reward.get_avg())
        viz_eval_sufficient_info_reward.append(eval_sufficient_info_reward)
        viz_eval_qa_reward.append(eval_qa_reward)
        viz_x = np.arange(len(viz_avg_correct_state_acc)).tolist()

        if plt_win is None:
            plt_win = viz.line(X=viz_x,
                               Y=viz_avg_correct_state_acc,
                               opts=dict(title=agent.experiment_tag +
                                         "_train"),
                               name="correct state")
            viz.line(X=viz_x,
                     Y=viz_avg_qa_acc,
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="qa")
        else:
            viz.line(X=[len(viz_avg_correct_state_acc) - 1],
                     Y=[viz_avg_correct_state_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="correct state")
            viz.line(X=[len(viz_avg_qa_acc) - 1],
                     Y=[viz_avg_qa_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="qa")

        if eval_plt_win is None:
            eval_plt_win = viz.line(X=viz_x,
                                    Y=viz_eval_sufficient_info_reward,
                                    opts=dict(title=agent.experiment_tag +
                                              "_eval"),
                                    name="correct state")
            viz.line(X=viz_x,
                     Y=viz_eval_qa_reward,
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="qa")
        else:
            viz.line(X=[len(viz_eval_sufficient_info_reward) - 1],
                     Y=[viz_eval_sufficient_info_reward[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="correct state")
            viz.line(X=[len(viz_eval_qa_reward) - 1],
                     Y=[viz_eval_qa_reward[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="qa")

        # write accuracies to file
        _s = json.dumps({
            "time spent": str(time_2 - time_1).rsplit(".")[0],
            "sufficient info": running_avg_sufficient_info_reward.get_avg(),
            "qa": running_avg_qa_reward.get_avg(),
            "eval sufficient info": eval_sufficient_info_reward,
            "eval qa": eval_qa_reward
        })
        with open(output_dir + "/" + json_file_name + '.json',
                  'a+') as outfile:
            outfile.write(_s + '\n')
            outfile.flush()