Example No. 1
def evaluate(env, agent, valid_test="valid"):

    env.seed(42)
    env.split_reset(valid_test)
    agent.eval()
    print_qa_acc, print_correct_state_acc, print_steps = [], [], []

    while (True):
        obs, infos = env.reset(random=False)

        agent.init(obs, infos)
        quest_list = agent.get_game_quest_info(infos)
        input_quest, input_quest_char, quest_id_list = agent.get_agent_inputs(
            quest_list)

        tmp_replay_buffer = []

        for step_no in range(agent.eval_max_nb_steps_per_episode):
            commands, replay_info = agent.act_greedy(obs, infos, input_quest,
                                                     input_quest_char,
                                                     quest_id_list)

            tmp_replay_buffer.append(replay_info)
            obs, infos = env.step(commands)

            still_running = generic.to_np(replay_info[-1])
            if np.sum(still_running) == 0:
                break

        # The agent has exhausted all steps, now answer the question.
        chosen_head_tails = agent.answer_question_act(agent.naozi.get(),
                                                      quest_list)  # batch
        chosen_head_tails_np = generic.to_np(chosen_head_tails)
        chosen_answer_strings = generic.get_answer_strings(
            agent.naozi.get(), chosen_head_tails_np)
        answer_strings = [item["a"] for item in infos]
        masks_np = [generic.to_np(item[-1]) for item in tmp_replay_buffer]

        qa_reward_np = generic.get_qa_reward(chosen_answer_strings,
                                             answer_strings)
        correct_state_reward_np = generic.get_sufficient_info_reward(
            agent.naozi.get(), answer_strings)
        step_masks_np = np.sum(np.array(masks_np), 0)
        for i in range(len(qa_reward_np)):
            # if the answer is totally wrong, we assume it used all steps
            if qa_reward_np[i] == 0.0:
                step_masks_np[i] = agent.eval_max_nb_steps_per_episode
        print_qa_acc += qa_reward_np.tolist()
        print_correct_state_acc += correct_state_reward_np.tolist()
        print_steps += step_masks_np.tolist()
        if env.batch_pointer == 0:
            break

    print(
        "===== Eval =====: qa acc: {:2.3f} | correct state: {:2.3f} | used steps: {:2.3f}"
        .format(np.mean(np.array(print_qa_acc)),
                np.mean(np.array(print_correct_state_acc)),
                np.mean(np.array(print_steps))))
    return np.mean(np.array(print_qa_acc)), np.mean(
        np.array(print_correct_state_acc)), np.mean(np.array(print_steps))
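
A minimal sketch (pure NumPy, with made-up mask and reward values) of the step accounting used above: the "still running" masks are 1 while an episode is in progress, so summing them over the step dimension gives the steps used, and episodes whose answer scored 0 are charged the full step budget.

import numpy as np

# Hypothetical values: 3 steps x 4 episodes of "still running" masks.
masks_np = [np.array([1., 1., 1., 1.]),
            np.array([1., 0., 1., 1.]),
            np.array([0., 0., 1., 0.])]
qa_reward_np = np.array([1.0, 0.0, 1.0, 1.0])
eval_max_nb_steps_per_episode = 20

step_masks_np = np.sum(np.array(masks_np), 0)  # steps actually used per episode
# a totally wrong answer is counted as having used all steps
step_masks_np[qa_reward_np == 0.0] = eval_max_nb_steps_per_episode
print(step_masks_np)  # [ 2. 20.  3.  2.]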
Example No. 2
    def update_interaction(self):
        # update neural model by replaying snapshots in replay memory
        interaction_loss, q_value = self.get_dqn_loss()
        if interaction_loss is None:
            return None, None
        loss = interaction_loss * self.interaction_loss_lambda
        # Backpropagate
        self.online_net.zero_grad()
        self.optimizer.zero_grad()
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(self.online_net.parameters(),
                                       self.clip_grad_norm)
        self.optimizer.step()  # apply gradients
        return to_np(torch.mean(interaction_loss)), to_np(torch.mean(q_value))
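
For context, the same update pattern on a toy model (the network, optimizer, and hyperparameters below are invented stand-ins, not the repository's classes): zero the gradients, backpropagate the scaled loss, clip the global gradient norm, then step the optimizer.

import torch

online_net = torch.nn.Linear(8, 1)                 # stand-in for the DQN
optimizer = torch.optim.Adam(online_net.parameters(), lr=1e-3)
clip_grad_norm = 5.0
interaction_loss_lambda = 1.0

interaction_loss = online_net(torch.randn(4, 8)).pow(2).mean()  # pretend TD loss
loss = interaction_loss * interaction_loss_lambda

online_net.zero_grad()
optimizer.zero_grad()
loss.backward()
# bound the global gradient norm before applying the update
torch.nn.utils.clip_grad_norm_(online_net.parameters(), clip_grad_norm)
optimizer.step()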
Example No. 3
def evaluate_observation_generation_loss(env, agent, valid_test="valid"):
    env.split_reset(valid_test)
    agent.eval()
    ave_loss = []

    while(True):
        observation_strings, prev_action_strings = env.get_batch()
        batch_size = len(observation_strings)
        lens = [len(elem) for elem in observation_strings]
        max_len = max(lens)
        padded_observation_strings = [elem + ["<pad>"]*(max_len - len(elem)) for elem in observation_strings]
        padded_prev_action_strings = [elem + ["<pad>"]*(max_len - len(elem)) for elem in prev_action_strings]
        eps_masks = torch.zeros((batch_size, max_len), dtype=torch.float).cuda() if agent.use_cuda else torch.zeros((batch_size, max_len), dtype=torch.float)
        for i in range(batch_size):
            eps_masks[i, :lens[i]] = 1

        prev_h = None
        for j in range(max_len):
            batch_obs_string = [elem[j] for elem in padded_observation_strings]
            batch_prev_action_string = [elem[j] for elem in padded_prev_action_strings]
            with torch.no_grad():
                loss, _, prev_h = agent.observation_generation_teacher_force(batch_obs_string, batch_prev_action_string, eps_masks[:, j], prev_h)
            ave_loss.append(to_np(loss))
        if env.batch_pointer == 0:
            break
    return np.mean(np.array(ave_loss))
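
The padding-and-mask construction above, isolated into a small sketch with a dummy batch (the "<pad>" token and the mask shape follow the function; the strings themselves are invented).

import torch

observation_strings = [["you are in the kitchen", "you see a knife"],
                       ["you are in the garden"]]
lens = [len(elem) for elem in observation_strings]
max_len = max(lens)
padded = [elem + ["<pad>"] * (max_len - len(elem)) for elem in observation_strings]
masks = torch.zeros((len(observation_strings), max_len), dtype=torch.float)
for i in range(len(observation_strings)):
    masks[i, :lens[i]] = 1
print(padded)  # second episode padded to length 2
print(masks)   # tensor([[1., 1.], [1., 0.]])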
Example No. 4
    def generate_commands(self, action_indices, ctrlf_indices):

        action_indices_np = to_np(action_indices)
        ctrlf_indices_np = to_np(ctrlf_indices)
        res_str = []
        batch_size = action_indices_np.shape[0]
        for i in range(batch_size):
            which = action_indices_np[i][0]
            if which == self.action2id["ctrl+f"]:
                which_word = ctrlf_indices_np[i][0]
                res_str.append("ctrl+f " + self.word_vocab[which_word])
            elif which < len(self.id2action):
                res_str.append(self.id2action[which])
            else:
                raise NotImplementedError
        return res_str
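
A rough illustration of the index-to-command mapping with toy vocabularies (the action2id / id2action / word_vocab contents here are invented; only the "ctrl+f <word>" special case mirrors the method above).

import numpy as np

action2id = {"previous": 0, "next": 1, "ctrl+f": 2, "stop": 3}
id2action = ["previous", "next", "ctrl+f", "stop"]
word_vocab = ["apple", "kitchen", "knife"]

action_indices_np = np.array([[2], [1]])  # batch of 2 chosen actions
ctrlf_indices_np = np.array([[1], [0]])   # word to search for, if ctrl+f

res_str = []
for i in range(action_indices_np.shape[0]):
    which = action_indices_np[i][0]
    if which == action2id["ctrl+f"]:
        res_str.append("ctrl+f " + word_vocab[ctrlf_indices_np[i][0]])
    else:
        res_str.append(id2action[which])
print(res_str)  # ['ctrl+f kitchen', 'next']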
Example No. 5
    def choose_random_command(self, word_ranks, word_masks_np):
        """
        Generate a command randomly, for epsilon greedy.

        Arguments:
            word_ranks: Q values for each word by model.action_scorer.
            word_masks_np: Vocabulary masks for words depending on their type (verb, adj, noun).
        """
        batch_size = word_ranks[0].size(0)
        word_ranks_np = [to_np(item) for item in word_ranks]  # list of batch x n_vocab
        word_ranks_np = [r * m for r, m in zip(word_ranks_np, word_masks_np)]  # list of batch x n_vocab
        word_indices = []
        for i in range(len(word_ranks_np)):
            indices = []
            for j in range(batch_size):
                msk = word_masks_np[i][j]  # vocab
                indices.append(np.random.choice(len(msk), p=msk / np.sum(msk, -1)))
            word_indices.append(np.array(indices))
        # word_indices: list of batch
        word_qvalues = [[] for _ in word_masks_np]
        for i in range(batch_size):
            for j in range(len(word_qvalues)):
                word_qvalues[j].append(word_ranks[j][i][word_indices[j][i]])
        word_qvalues = [torch.stack(item) for item in word_qvalues]
        word_indices = [to_pt(item, self.use_cuda) for item in word_indices]
        word_indices = [item.unsqueeze(-1) for item in word_indices]  # list of batch x 1
        return word_qvalues, word_indices
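
The core sampling trick in isolation: a vocabulary mask is renormalized into a probability vector so that np.random.choice only draws words of the allowed type (toy mask below; sketch only).

import numpy as np

np.random.seed(0)
msk = np.array([0., 1., 1., 0., 1.])  # allowed word positions
p = msk / np.sum(msk, -1)             # uniform over the allowed words
idx = np.random.choice(len(msk), p=p)
print(idx)  # always one of 1, 2, 4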
Example No. 6
def evaluate_deep_graph_infomax(env, agent, valid_test="valid", verbose=False):
    env.split_reset(valid_test)
    agent.eval()
    list_eval_acc, list_eval_loss = [], []
    # counter = 0
    # to_print = []

    while (True):
        triplets = env.get_batch()
        with torch.no_grad():
            loss, labels, dgi_discriminator_logits, batch_nonzero_idx = agent.get_deep_graph_infomax_logits(
                triplets)
        # sigmoid
        dgi_discriminator_logits = 1.0 / (1.0 +
                                          np.exp(-dgi_discriminator_logits))

        for i in range(len(triplets)):

            gt = labels[i]  # num_node*2
            pred_idx = (dgi_discriminator_logits[i] >= 0.5).astype(
                "float32")  # num_node*2
            nonzeros = np.array(batch_nonzero_idx[i].tolist() +
                                (batch_nonzero_idx[i] +
                                 len(agent.node_vocab)).tolist())
            gt = gt[nonzeros]  # num_nonzero
            pred_idx = pred_idx[nonzeros]  # num_nonzero
            correct = (pred_idx == gt).astype("float32").tolist()
            list_eval_acc += correct

        loss = to_np(loss)
        list_eval_loss.append(loss)

        if env.batch_pointer == 0:
            break
    return np.mean(list_eval_loss), np.mean(list_eval_acc)
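
A small NumPy sketch of the accuracy computation above: the discriminator logits are squashed with a sigmoid, thresholded at 0.5, and compared against binary labels (values below are made up).

import numpy as np

dgi_logits = np.array([2.1, -0.3, 0.8, -1.7])
labels = np.array([1.0, 0.0, 0.0, 0.0])

probs = 1.0 / (1.0 + np.exp(-dgi_logits))  # sigmoid
pred = (probs >= 0.5).astype("float32")    # [1., 0., 1., 0.]
correct = (pred == labels).astype("float32")
print(correct.mean())  # 0.75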
Example No. 7
    def choose_maxQ_command(self, word_ranks, word_masks_np):
        """
        Generate a command by maximum q values, for epsilon greedy.

        Arguments:
            word_ranks: Q values for each word by model.action_scorer.
            word_masks_np: Vocabulary masks for words depending on their type (verb, adj, noun).
        """
        batch_size = word_ranks[0].size(0)
        word_ranks_np = [to_np(item)
                         for item in word_ranks]  # list of batch x n_vocab
        word_ranks_np = [
            r - np.min(r) for r in word_ranks_np
        ]  # minus the min value, so that all values are non-negative
        word_ranks_np = [r * m for r, m in zip(word_ranks_np, word_masks_np)
                         ]  # list of batch x n_vocab
        word_indices = [np.argmax(item, -1)
                        for item in word_ranks_np]  # list of batch
        word_qvalues = [[] for _ in word_masks_np]
        for i in range(batch_size):
            for j in range(len(word_qvalues)):
                word_qvalues[j].append(word_ranks[j][i][word_indices[j][i]])
        word_qvalues = [torch.stack(item) for item in word_qvalues]
        word_indices = [to_pt(item, self.use_cuda) for item in word_indices]
        word_indices = [item.unsqueeze(-1)
                        for item in word_indices]  # list of batch x 1
        return word_qvalues, word_indices
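
Why the minimum is subtracted before masking: with all-negative Q-values, masked (zeroed) entries could otherwise win the argmax. A tiny NumPy check with toy numbers:

import numpy as np

q = np.array([-3.0, -1.0, -2.0])  # all Q-values negative
m = np.array([0.0, 0.0, 1.0])     # only the last word is allowed

naive = np.argmax(q * m)                   # 0 -> picks a masked-out word
shifted = np.argmax((q - np.min(q)) * m)   # 2 -> picks the allowed word
print(naive, shifted)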
Example No. 8
    def point_maxq_position(self, point_distribution, mask):
        """
        Generate a command by maximum q values, for epsilon greedy.

        Arguments:
            point_distribution: Q values for each position batch x time x 2.
            mask: position masks.
        """
        point_distribution_np = to_np(point_distribution)  # batch x time
        mask_np = to_np(mask)  # batch x time
        point_distribution_np = point_distribution_np - np.min(
            point_distribution_np
        ) + 1e-2  # minus the min value, so that all values are non-negative
        point_distribution_np = point_distribution_np * np.expand_dims(
            mask_np, -1)  # batch x time x 2
        indices = np.argmax(point_distribution_np, 1)  # batch x 2
        indices = to_pt(np.array(indices), self.use_cuda)  # batch x 2
        return indices
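
The same shift-then-mask idea applied to pointer positions, sketched with a toy tensor (batch=1, time=4): taking the argmax over the time axis yields one head and one tail position per example.

import numpy as np

point_distribution_np = np.array([[[0.2, -0.5],
                                   [1.3,  0.1],
                                   [0.4,  2.0],
                                   [0.9,  0.3]]])  # batch x time x 2
mask_np = np.array([[1., 1., 1., 0.]])             # last position is padding

shifted = point_distribution_np - np.min(point_distribution_np) + 1e-2
masked = shifted * np.expand_dims(mask_np, -1)     # zero out padded steps
indices = np.argmax(masked, 1)                     # batch x 2
print(indices)  # [[1 2]]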
Example No. 9
    def get_chosen_strings(self, chosen_indices):
        """
        Turns list of word indices into actual command strings.
        chosen_indices: Word indices chosen by model.
        """
        chosen_indices_np = [to_np(item) for item in chosen_indices]
        res_str = []
        batch_size = chosen_indices_np[0].shape[0]
        for i in range(batch_size):
            verb, adj, noun = (chosen_indices_np[0][i], chosen_indices_np[1][i],
                               chosen_indices_np[2][i])
            res_str.append(self.word_ids_to_commands(verb, adj, noun))
        return res_str
Example No. 10
def evaluate_state_prediction(env, agent, valid_test="valid", verbose=False):
    env.split_reset(valid_test)
    agent.eval()
    list_eval_acc, list_eval_loss = [], []
    counter = 0
    to_print = []

    while (True):
        target_graph, previous_graph, action, admissible_graphs = env.get_batch()
        with torch.no_grad():
            loss, sp_ret, np_labels, admissible_graphs = agent.get_state_prediction_logits(
                previous_graph, action, target_graph, admissible_graphs)
        loss = to_np(loss)
        pred = np.argmax(sp_ret, -1)  # batch
        gt = np.argmax(np_labels, -1)  # batch
        correct = (pred == gt).astype("float32").tolist()
        list_eval_acc += correct
        list_eval_loss += [loss]

        if verbose:
            for i in range(len(previous_graph)):
                to_print.append(
                    str(counter) +
                    " -------------------------------------------- acc: " +
                    str(correct[i]))
                trips = []
                for t in previous_graph[i]:
                    trips.append(t[0] + "-" + t[2] + "-" + t[1])
                to_print.append("PREV TRIPLETS: %s " % (" | ".join(trips)))
                to_print.append("ACTION: %s " % (action[i]))
                trips = []
                for t in admissible_graphs[i][pred[i]]:
                    trips.append(t[0] + "-" + t[2] + "-" + t[1])
                to_print.append("PRED TRIPLETS: %s " % (" | ".join(trips)))
                trips = []
                for t in target_graph[i]:
                    trips.append(t[0] + "-" + t[2] + "-" + t[1])
                to_print.append("GT TRIPLETS: %s " % (" | ".join(trips)))
                to_print.append("")
                counter += 1

        if env.batch_pointer == 0:
            break
    with open(agent.experiment_tag + "_output.txt", "w") as f:
        f.write("\n".join(to_print))
    print("Eval Loss: {:2.3f}, Eval accuracy: {:2.3f}".format(
        np.mean(list_eval_loss), np.mean(list_eval_acc)))
    return np.mean(list_eval_loss), np.mean(list_eval_acc)
Example No. 11
    def point_random_position(self, point_distribution, mask):
        """
        Generate a command at random, for epsilon greedy.

        Arguments:
            point_distribution: Q values for each position batch x time x 2.
            mask: position masks.
        """
        batch_size = point_distribution.size(0)
        mask_np = to_np(mask)  # batch x time
        indices = []
        for i in range(batch_size):
            msk = mask_np[i]  # time
            indices.append(
                np.random.choice(len(msk), 2, p=msk / np.sum(msk, -1)))
        indices = to_pt(np.stack(indices, 0), self.use_cuda)  # batch x 2
        return indices
Example No. 12
def _choose_random_command(word_ranks, word_masks_np, use_cuda):
    """
    Generate a command randomly, for epsilon greedy.

    Arguments:
        word_ranks: Q values for each word by model.action_scorer.
        word_masks_np: Vocabulary masks for words depending on their type (verb, adj, noun, adj2, noun2).
    """

    batch_size = word_ranks[0].size(0)
    # print("batch_size=", batch_size, len(word_masks_np))
    assert len(word_ranks) == len(word_masks_np)

    word_ranks_np = [
        to_np(item) for item in word_ranks
    ]  # list of (batch x n_vocab) arrays, len=5 (5 word output phrases)
    # word_ranks_np = [r - np.min(r) for r in word_ranks_np]  # minus the min value, so that all values are non-negative
    word_ranks_np = [r * m for r, m in zip(word_ranks_np, word_masks_np)
                     ]  # list of batch x n_vocab

    word_indices = []
    for i in range(
            len(word_ranks_np)):  # len=5 (verb, adj1, noun1, adj2, noun2)
        indices = []
        for j in range(batch_size):
            msk = word_masks_np[i][j]  # msk has length n_vocab; j indexes the batch
            # choose from the non-zero entries of msk
            indices.append(np.random.choice(len(msk), p=msk / np.sum(msk, -1)))
        word_indices.append(np.array(indices))
    # word_indices: list of batch

    word_qvalues = [[] for _ in word_masks_np]
    for i in range(batch_size):
        for j in range(len(word_qvalues)):
            word_qvalues[j].append(word_ranks[j][i][word_indices[j][i]])
    word_qvalues = [torch.stack(item) for item in word_qvalues]
    word_indices = [to_pt(item, use_cuda) for item in word_indices]
    word_indices = [item.unsqueeze(-1)
                    for item in word_indices]  # list of batch x 1
    return word_qvalues, word_indices
Example No. 13
def train():

    time_1 = datetime.datetime.now()

    with open("config.yaml") as reader:
        config = yaml.safe_load(reader)
    if config['general']['dataset'] == "squad":
        env = GamifiedSquad(config)
    else:
        env = GamifiedNewsQA(config)
    env.split_reset("train")
    agent = Agent()

    # visdom
    viz = visdom.Visdom()
    plt_win = None
    eval_plt_win = None
    plt_q_value_win = None
    plt_steps_win = None
    eval_plt_steps_win = None
    viz_avg_correct_state_acc, viz_avg_qa_acc = [], []
    viz_avg_correct_state_q_value = []
    viz_eval_correct_state_acc, viz_eval_qa_acc, viz_eval_steps = [], [], []
    viz_avg_steps = []

    step_in_total = 0
    episode_no = 0
    running_avg_qa_acc = HistoryScoreCache(capacity=50)
    running_avg_correct_state_acc = HistoryScoreCache(capacity=50)
    running_avg_qa_loss = HistoryScoreCache(capacity=50)
    running_avg_correct_state_loss = HistoryScoreCache(capacity=50)
    running_avg_correct_state_q_value = HistoryScoreCache(capacity=50)
    running_avg_steps = HistoryScoreCache(capacity=50)

    output_dir, data_dir = ".", "."
    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_qa_acc_so_far = 0.0
    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag +
                          "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" +
                                        agent.experiment_tag + "_model.pt")
            agent.update_target_net()
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag +
                                        ".pt")
            agent.update_target_net()

    while (True):
        if episode_no > agent.max_episode:
            break
        np.random.seed(episode_no)
        env.seed(episode_no)
        obs, infos = env.reset()
        print(
            "====================================================================================",
            episode_no)
        print("-- Q: %s" % (infos[0]["q"].encode('utf-8')))
        print("-- A: %s" % (infos[0]["a"][0].encode('utf-8')))

        agent.train()
        agent.init(obs, infos)
        quest_list = agent.get_game_quest_info(infos)
        input_quest, input_quest_char, quest_id_list = agent.get_agent_inputs(
            quest_list)
        tmp_replay_buffer = []
        print_cmds = []
        batch_size = len(obs)

        act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode
        for step_no in range(agent.max_nb_steps_per_episode):
            # generate commands
            if agent.noisy_net:
                agent.reset_noise()  # Draw a new set of noisy weights
            commands, replay_info = agent.act(obs,
                                              infos,
                                              input_quest,
                                              input_quest_char,
                                              quest_id_list,
                                              random=act_randomly)
            obs, infos = env.step(commands)

            if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0:
                agent.reset_noise()  # Draw a new set of noisy weights

            if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0:
                interaction_loss, interaction_q_value = agent.update_interaction(
                )
                if interaction_loss is not None:
                    running_avg_correct_state_loss.push(interaction_loss)
                    running_avg_correct_state_q_value.push(interaction_q_value)
                qa_loss = agent.update_qa()
                if qa_loss is not None:
                    running_avg_qa_loss.push(qa_loss)

            step_in_total += 1
            still_running = generic.to_np(replay_info[-1])
            print_cmds.append(commands[0] if still_running[0] else "--")

            # force stopping
            if step_no == agent.max_nb_steps_per_episode - 1:
                replay_info[-1] = torch.zeros_like(replay_info[-1])
            tmp_replay_buffer.append(replay_info)
            if np.sum(still_running) == 0:
                break

        print(" / ".join(print_cmds).encode('utf-8'))
        # The agent has exhausted all steps, now answer the question.
        chosen_head_tails = agent.answer_question_act(agent.naozi.get(),
                                                      quest_list)  # batch
        chosen_head_tails_np = generic.to_np(chosen_head_tails)
        chosen_answer_strings = generic.get_answer_strings(
            agent.naozi.get(), chosen_head_tails_np)
        answer_strings = [item["a"] for item in infos]

        qa_reward_np = generic.get_qa_reward(chosen_answer_strings,
                                             answer_strings)
        correct_state_reward_np = generic.get_sufficient_info_reward(
            agent.naozi.get(), answer_strings)
        correct_state_reward = generic.to_pt(correct_state_reward_np,
                                             enable_cuda=agent.use_cuda,
                                             type='float')  # batch

        # push qa experience into qa replay buffer
        for b in range(batch_size):  # data points in batch
            is_prior = (qa_reward_np[b] > agent.qa_reward_prior_threshold *
                        agent.qa_replay_memory.avg_rewards())
            # if the agent is not in the correct state, do not push it into replay buffer
            if np.mean(correct_state_reward_np[b]) == 0.0:
                continue
            agent.qa_replay_memory.push(is_prior, qa_reward_np[b],
                                        agent.naozi.get(b), quest_list[b],
                                        answer_strings[b])

        # small positive reward whenever it answers question correctly
        masks_np = [generic.to_np(item[-1]) for item in tmp_replay_buffer]
        command_rewards_np = []
        for i in range(len(tmp_replay_buffer)):
            if i == len(tmp_replay_buffer) - 1:
                r = correct_state_reward * tmp_replay_buffer[i][-1]
                r_np = correct_state_reward_np * masks_np[i]
            else:
                # give reward only at that one game step, not all
                r = correct_state_reward * (tmp_replay_buffer[i][-1] -
                                            tmp_replay_buffer[i + 1][-1])
                r_np = correct_state_reward_np * (masks_np[i] -
                                                  masks_np[i + 1])
            tmp_replay_buffer[i].append(r)
            command_rewards_np.append(r_np)
        command_rewards_np = np.array(command_rewards_np)
        print(command_rewards_np[:, 0])

        # push experience into replay buffer
        for b in range(len(correct_state_reward_np)):
            is_prior = np.sum(command_rewards_np, 0)[b] > 0.0
            for i in range(len(tmp_replay_buffer)):
                (batch_description_list, batch_chosen_indices, batch_chosen_ctrlf_indices,
                 _, batch_rewards) = tmp_replay_buffer[i]
                is_final = True
                if masks_np[i][b] != 0:
                    is_final = False
                agent.replay_memory.push(is_prior, batch_description_list[b],
                                         quest_list[b],
                                         batch_chosen_indices[b],
                                         batch_chosen_ctrlf_indices[b],
                                         batch_rewards[b], is_final)
                if masks_np[i][b] == 0.0:
                    break

        qa_acc = np.mean(qa_reward_np)
        correct_state_acc = np.mean(correct_state_reward_np)
        step_masks_np = np.sum(np.array(masks_np), 0)  # batch
        for i in range(len(qa_reward_np)):
            # if the answer is totally wrong, we assume it used all steps
            if qa_reward_np[i] == 0.0:
                step_masks_np[i] = agent.max_nb_steps_per_episode
        used_steps = np.mean(step_masks_np)

        running_avg_qa_acc.push(qa_acc)
        running_avg_correct_state_acc.push(correct_state_acc)
        running_avg_steps.push(used_steps)
        print_rewards = np.sum(np.mean(command_rewards_np, -1))

        obs_string = agent.naozi.get(0)
        print("-- OBS: %s" % (obs_string.encode('utf-8')))
        print("-- PRED: %s" % (chosen_answer_strings[0].encode('utf-8')))
        # finish game

        agent.finish_of_episode(episode_no, batch_size)
        episode_no += batch_size

        time_2 = datetime.datetime.now()
        print(
            "Episode: {:3d} | time spent: {:s} | interaction loss: {:2.3f} | interaction qvalue: {:2.3f} | qa loss: {:2.3f} | rewards: {:2.3f} | qa acc: {:2.3f}/{:2.3f} | sufficient info: {:2.3f}/{:2.3f} | used steps: {:2.3f}"
            .format(episode_no,
                    str(time_2 - time_1).rsplit(".")[0],
                    running_avg_correct_state_loss.get_avg(),
                    running_avg_correct_state_q_value.get_avg(),
                    running_avg_qa_loss.get_avg(), print_rewards, qa_acc,
                    running_avg_qa_acc.get_avg(), correct_state_acc,
                    running_avg_correct_state_acc.get_avg(),
                    running_avg_steps.get_avg()))

        if episode_no < agent.learn_start_from_this_episode:
            continue
        if agent.report_frequency == 0 or (
                episode_no % agent.report_frequency >
            (episode_no - batch_size) % agent.report_frequency):
            continue
        eval_qa_acc, eval_correct_state_acc, eval_used_steps = 0.0, 0.0, 0.0
        # evaluate
        if agent.run_eval:
            eval_qa_acc, eval_correct_state_acc, eval_used_steps = evaluate.evaluate(
                env, agent, "valid")
            env.split_reset("train")
            # if run eval, then save model by eval accuracy
            if agent.save_frequency > 0 and (
                    episode_no % agent.report_frequency <=
                (episode_no - batch_size) % agent.report_frequency
            ) and eval_qa_acc > best_qa_acc_so_far:
                best_qa_acc_so_far = eval_qa_acc
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")
        # save model
        elif agent.save_frequency > 0 and (
                episode_no % agent.report_frequency <=
            (episode_no - batch_size) % agent.report_frequency):
            if running_avg_qa_acc.get_avg() > best_qa_acc_so_far:
                best_qa_acc_so_far = running_avg_qa_acc.get_avg()
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")

        # plot using visdom
        viz_avg_correct_state_acc.append(
            running_avg_correct_state_acc.get_avg())
        viz_avg_qa_acc.append(running_avg_qa_acc.get_avg())
        viz_avg_correct_state_q_value.append(
            running_avg_correct_state_q_value.get_avg())
        viz_eval_correct_state_acc.append(eval_correct_state_acc)
        viz_eval_qa_acc.append(eval_qa_acc)
        viz_eval_steps.append(eval_used_steps)
        viz_avg_steps.append(running_avg_steps.get_avg())
        viz_x = np.arange(len(viz_avg_correct_state_acc)).tolist()

        if plt_win is None:
            plt_win = viz.line(X=viz_x,
                               Y=viz_avg_correct_state_acc,
                               opts=dict(title=agent.experiment_tag +
                                         "_train"),
                               name="sufficient info")
            viz.line(X=viz_x,
                     Y=viz_avg_qa_acc,
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="qa")
        else:
            viz.line(X=[len(viz_avg_correct_state_acc) - 1],
                     Y=[viz_avg_correct_state_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="sufficient info")
            viz.line(X=[len(viz_avg_qa_acc) - 1],
                     Y=[viz_avg_qa_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="qa")

        if plt_q_value_win is None:
            plt_q_value_win = viz.line(X=viz_x,
                                       Y=viz_avg_correct_state_q_value,
                                       opts=dict(title=agent.experiment_tag +
                                                 "_train_q_value"),
                                       name="sufficient info")
        else:
            viz.line(X=[len(viz_avg_correct_state_q_value) - 1],
                     Y=[viz_avg_correct_state_q_value[-1]],
                     opts=dict(title=agent.experiment_tag + "_train_q_value"),
                     win=plt_q_value_win,
                     update='append',
                     name="sufficient info")

        if plt_steps_win is None:
            plt_steps_win = viz.line(X=viz_x,
                                     Y=viz_avg_steps,
                                     opts=dict(title=agent.experiment_tag +
                                               "_train_step"),
                                     name="used steps")
        else:
            viz.line(X=[len(viz_avg_steps) - 1],
                     Y=[viz_avg_steps[-1]],
                     opts=dict(title=agent.experiment_tag + "_train_step"),
                     win=plt_steps_win,
                     update='append',
                     name="used steps")

        if eval_plt_win is None:
            eval_plt_win = viz.line(X=viz_x,
                                    Y=viz_eval_correct_state_acc,
                                    opts=dict(title=agent.experiment_tag +
                                              "_eval"),
                                    name="sufficient info")
            viz.line(X=viz_x,
                     Y=viz_eval_qa_acc,
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="qa")
        else:
            viz.line(X=[len(viz_eval_correct_state_acc) - 1],
                     Y=[viz_eval_correct_state_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="sufficient info")
            viz.line(X=[len(viz_eval_qa_acc) - 1],
                     Y=[viz_eval_qa_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="qa")

        if eval_plt_steps_win is None:
            eval_plt_steps_win = viz.line(
                X=viz_x,
                Y=viz_eval_steps,
                opts=dict(title=agent.experiment_tag + "_eval_step"),
                name="used steps")
        else:
            viz.line(X=[len(viz_eval_steps) - 1],
                     Y=[viz_eval_steps[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval_step"),
                     win=eval_plt_steps_win,
                     update='append',
                     name="used steps")

        # write accuracies down into file
        _s = json.dumps({
            "time spent": str(time_2 - time_1).rsplit(".")[0],
            "sufficient info": str(running_avg_correct_state_acc.get_avg()),
            "qa": str(running_avg_qa_acc.get_avg()),
            "sufficient qvalue": str(running_avg_correct_state_q_value.get_avg()),
            "eval sufficient info": str(eval_correct_state_acc),
            "eval qa": str(eval_qa_acc),
            "eval steps": str(eval_used_steps),
            "used steps": str(running_avg_steps.get_avg())
        })
        with open(output_dir + "/" + json_file_name + '.json',
                  'a+') as outfile:
            outfile.write(_s + '\n')
            outfile.flush()
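
The step-wise reward assignment above hands out the sufficient-information reward only at the step where an episode stops interacting: the mask difference masks[i] - masks[i + 1] is 1 exactly at that step, and the last step keeps its own mask. A toy NumPy check (invented masks and rewards):

import numpy as np

# "still running" masks for 3 steps x 2 episodes: episode 0's last interacting
# step is step 1, episode 1 runs until the final step.
masks_np = [np.array([1., 1.]), np.array([1., 1.]), np.array([0., 1.])]
correct_state_reward_np = np.array([1.0, 1.0])

command_rewards_np = []
for i in range(len(masks_np)):
    if i == len(masks_np) - 1:
        r_np = correct_state_reward_np * masks_np[i]
    else:
        r_np = correct_state_reward_np * (masks_np[i] - masks_np[i + 1])
    command_rewards_np.append(r_np)
print(np.array(command_rewards_np))
# [[0. 0.]
#  [1. 0.]
#  [0. 1.]]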
Example No. 14
def train():

    time_1 = datetime.datetime.now()
    config = generic.load_config()
    env = DGIData(config)
    env.split_reset("train")
    agent = Agent(config)
    agent.zero_noise()
    ave_train_loss = generic.HistoryScoreCache(capacity=500)

    # visdom
    if config["general"]["visdom"]:
        import visdom
        viz = visdom.Visdom()
        loss_win = None
        eval_acc_win = None
        viz_loss, viz_eval_loss, viz_eval_acc = [], [], []

    episode_no = 0
    batch_no = 0

    output_dir = "."
    data_dir = "."
    json_file_name = agent.experiment_tag.replace(" ", "_")
    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag +
                          "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" +
                                        agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag +
                                        ".pt",
                                        load_partial_graph=False)

    best_eval_acc, best_training_loss_so_far = 0.0, 10000.0

    try:
        while (True):
            if episode_no > agent.max_episode:
                break
            agent.train()
            triplets = env.get_batch()
            curr_batch_size = len(triplets)
            loss, _, _, _ = agent.get_deep_graph_infomax_logits(triplets)
            # Update Model
            agent.online_net.zero_grad()
            agent.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(agent.online_net.parameters(),
                                           agent.clip_grad_norm)
            agent.optimizer.step()
            loss = generic.to_np(loss)
            ave_train_loss.push(loss)

            # lr schedule
            if batch_no < agent.learning_rate_warmup_until:
                cr = agent.init_learning_rate / math.log2(
                    agent.learning_rate_warmup_until)
                learning_rate = cr * math.log2(batch_no + 1)
            else:
                learning_rate = agent.init_learning_rate
            for param_group in agent.optimizer.param_groups:
                param_group['lr'] = learning_rate

            episode_no += curr_batch_size
            batch_no += 1

            if agent.report_frequency == 0 or (
                    episode_no % agent.report_frequency >
                (episode_no - curr_batch_size) % agent.report_frequency):
                continue

            eval_acc, eval_loss = 0.0, 0.0
            if episode_no % agent.report_frequency <= (
                    episode_no - curr_batch_size) % agent.report_frequency:
                if agent.run_eval:
                    eval_loss, eval_acc = evaluate.evaluate_deep_graph_infomax(
                        env, agent, "valid")
                    if eval_acc > best_eval_acc:
                        best_eval_acc = eval_acc
                        agent.save_model_to_path(output_dir + "/" +
                                                 agent.experiment_tag +
                                                 "_model.pt")
                        print(
                            "Saving best model so far, with eval acc: {:2.3f}"
                            .format(best_eval_acc))
                    env.split_reset("train")
                else:
                    if loss < best_training_loss_so_far:
                        best_training_loss_so_far = loss
                        agent.save_model_to_path(output_dir + "/" +
                                                 agent.experiment_tag +
                                                 "_model.pt")

            time_2 = datetime.datetime.now()
            print(
                "Episode: {:3d} | time spent: {:s} | sliding window loss: {:2.3f} | Eval Acc: {:2.3f} | Eval Loss: {:2.3f}"
                .format(episode_no,
                        str(time_2 - time_1).rsplit(".")[0],
                        ave_train_loss.get_avg(), eval_acc, eval_loss))

            # plot using visdom
            if config["general"]["visdom"]:
                viz_loss.append(ave_train_loss.get_avg())
                viz_eval_acc.append(eval_acc)
                viz_eval_loss.append(eval_loss)
                viz_x = np.arange(len(viz_loss)).tolist()
                viz_eval_x = np.arange(len(viz_eval_acc)).tolist()

                if loss_win is None:
                    loss_win = viz.line(X=viz_x,
                                        Y=viz_loss,
                                        opts=dict(title=agent.experiment_tag +
                                                  "_loss"),
                                        name="training loss")
                    viz.line(X=viz_eval_x,
                             Y=viz_eval_loss,
                             opts=dict(title=agent.experiment_tag +
                                       "_eval_loss"),
                             win=loss_win,
                             update='append',
                             name="eval loss")
                else:
                    viz.line(X=[len(viz_loss) - 1],
                             Y=[viz_loss[-1]],
                             opts=dict(title=agent.experiment_tag + "_loss"),
                             win=loss_win,
                             update='append',
                             name="training loss")
                    viz.line(X=[len(viz_eval_loss) - 1],
                             Y=[viz_eval_loss[-1]],
                             opts=dict(title=agent.experiment_tag +
                                       "_eval_loss"),
                             win=loss_win,
                             update='append',
                             name="eval loss")

                if eval_acc_win is None:
                    eval_acc_win = viz.line(
                        X=viz_eval_x,
                        Y=viz_eval_acc,
                        opts=dict(title=agent.experiment_tag + "_eval_acc"),
                        name="eval accuracy")
                else:
                    viz.line(X=[len(viz_eval_acc) - 1],
                             Y=[viz_eval_acc[-1]],
                             opts=dict(title=agent.experiment_tag +
                                       "_eval_acc"),
                             win=eval_acc_win,
                             update='append',
                             name="eval accuracy")

            # write accuracies down into file
            _s = json.dumps({
                "time spent": str(time_2 - time_1).rsplit(".")[0],
                "loss": str(ave_train_loss.get_avg()),
                "eval loss": str(eval_loss),
                "eval accuracy": str(eval_acc)
            })
            with open(output_dir + "/" + json_file_name + '.json',
                      'a+') as outfile:
                outfile.write(_s + '\n')
                outfile.flush()

    # At any point you can hit Ctrl + C to break out of training early.
    except KeyboardInterrupt:
        print('--------------------------------------------')
        print('Exiting from training early...')
    if agent.run_eval:
        if os.path.exists(output_dir + "/" + agent.experiment_tag +
                          "_model.pt"):
            print('Evaluating on test set and saving log...')
            agent.load_pretrained_model(output_dir + "/" +
                                        agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
        _, _ = evaluate.evaluate_deep_graph_infomax(env,
                                                    agent,
                                                    "test",
                                                    verbose=True)
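
The warmup schedule used in this training loop scales the learning rate by log2(batch_no + 1) / log2(warmup_until) until the warmup horizon, then holds it constant. A short standalone sketch (the hyperparameter values are invented):

import math

init_learning_rate = 1e-3
learning_rate_warmup_until = 1000

def lr_at(batch_no):
    if batch_no < learning_rate_warmup_until:
        cr = init_learning_rate / math.log2(learning_rate_warmup_until)
        return cr * math.log2(batch_no + 1)
    return init_learning_rate

print([round(lr_at(b), 6) for b in (0, 1, 10, 100, 999, 1000)])
# climbs from 0.0 up to 1e-3, then stays at 1e-3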
Example No. 15
def train():

    time_1 = datetime.datetime.now()
    config = generic.load_config()
    env = ObservationGenerationData(config)
    env.split_reset("train")
    agent = Agent(config)
    agent.zero_noise()
    ave_train_loss = generic.HistoryScoreCache(capacity=500)

    # visdom
    if config["general"]["visdom"]:
        import visdom
        viz = visdom.Visdom()
        plt_win = None
        eval_plt_win = None
        viz_loss, viz_eval_loss, viz_eval_f1 = [], [], []

    episode_no = 0
    batch_no = 0

    output_dir = "."
    data_dir = "."
    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_eval_loss_so_far, best_training_loss_so_far = 10000.0, 10000.0
    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt", load_partial_graph=False)
        elif os.path.exists(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt", load_partial_graph=False)

    try:
        while(True):
            if episode_no > agent.max_episode:
                break
            agent.train()
            observation_strings, prev_action_strings = env.get_batch()
            curr_batch_size = len(observation_strings)
            lens = [len(elem) for elem in observation_strings]
            max_len = max(lens)
            padded_observation_strings = [elem + ["<pad>"]*(max_len - len(elem)) for elem in observation_strings]
            padded_prev_action_strings = [elem + ["<pad>"]*(max_len - len(elem)) for elem in prev_action_strings]
            masks = torch.zeros((curr_batch_size, max_len), dtype=torch.float).cuda() if agent.use_cuda else torch.zeros((curr_batch_size, max_len), dtype=torch.float)
            for i in range(curr_batch_size):
                masks[i, :lens[i]] = 1
            preds_last_batch = []
            last_k_batches_loss = []
            prev_h = None
            for i in range(max_len):
                batch_obs_string = [elem[i] for elem in padded_observation_strings]
                batch_prev_action_string = [elem[i] for elem in padded_prev_action_strings]
                loss, pred, prev_h = agent.observation_generation_teacher_force(batch_obs_string, batch_prev_action_string, masks[:, i], prev_h)
                last_k_batches_loss.append(loss)
                ave_train_loss.push(generic.to_np(loss))
                preds_last_batch.append(pred[-1])
                if ((i + 1) % agent.backprop_frequency == 0 or i == max_len - 1):  # and i > 0:
                    agent.optimizer.zero_grad()
                    ave_k_loss = torch.mean(torch.stack(last_k_batches_loss))
                    ave_k_loss.backward()
                    agent.optimizer.step()
                    last_k_batches_loss = []
                    prev_h = prev_h.detach()

            k = 0
            ep_string = []
            while(masks[-1][k] > 0):
                step_string = []
                regen_strings = preds_last_batch[k].argmax(-1)
                for l in range(len(regen_strings)):
                    step_string.append(agent.word_vocab[regen_strings[l]])
                ep_string.append((' '.join(step_string).split("<eos>")[0]))
                k += 1
                if k == len(masks[-1]):
                    break
            if len(ep_string) >= 3:
                print(' | '.join(ep_string[:3]))
            #####

            # lr schedule
            # learning_rate = 1.0 * (generic.power(agent.model.block_hidden_dim, -0.5) * min(generic.power(batch_no, -0.5), batch_no * generic.power(agent.learning_rate_warmup_until, -1.5)))
            if batch_no < agent.learning_rate_warmup_until:
                cr = agent.init_learning_rate / math.log2(agent.learning_rate_warmup_until)
                learning_rate = cr * math.log2(batch_no + 1)
            else:
                learning_rate = agent.init_learning_rate
            for param_group in agent.optimizer.param_groups:
                param_group['lr'] = learning_rate

            episode_no += curr_batch_size
            batch_no += 1

            time_2 = datetime.datetime.now()
            print("Episode: {:3d} | time spent: {:s} | loss: {:2.3f}".format(episode_no, str(time_2 - time_1).rsplit(".")[0], ave_train_loss.get_avg()))

            if agent.report_frequency == 0 or (episode_no % agent.report_frequency > (episode_no - curr_batch_size) % agent.report_frequency):
                continue

            eval_loss, eval_f1 = 0.0, 0.0
            if episode_no % agent.report_frequency <= (episode_no - curr_batch_size) % agent.report_frequency:
                if agent.run_eval:
                    eval_loss = evaluate.evaluate_observation_generation_loss(env, agent, "valid")
                    eval_f1 = evaluate.evaluate_observation_generation_free_generation(env, agent, "valid")
                    env.split_reset("train")
                    # if run eval, then save model by eval accuracy
                    if eval_loss < best_eval_loss_so_far:
                        best_eval_loss_so_far = eval_loss
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
                else:
                    if loss < best_training_loss_so_far:
                        best_training_loss_so_far = loss
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")


            time_2 = datetime.datetime.now()
            print("Episode: {:3d} | time spent: {:s} | loss: {:2.3f} | valid loss: {:2.3f} | valid f1: {:2.3f}".format(episode_no, str(time_2 - time_1).rsplit(".")[0], loss, eval_loss, eval_f1))

            # plot using visdom
            if config["general"]["visdom"]:
                viz_loss.append(ave_train_loss.get_avg())
                viz_eval_loss.append(eval_loss)
                viz_eval_f1.append(eval_f1)
                viz_x = np.arange(len(viz_loss)).tolist()

                if plt_win is None:
                    plt_win = viz.line(X=viz_x, Y=viz_loss,
                                    opts=dict(title=agent.experiment_tag + "_loss"),
                                    name="training loss")

                    viz.line(X=viz_x, Y=viz_eval_loss,
                            opts=dict(title=agent.experiment_tag + "_eval_loss"),
                            win=plt_win,
                            update='append', name="eval loss")
                else:
                    viz.line(X=[len(viz_loss) - 1], Y=[viz_loss[-1]],
                            opts=dict(title=agent.experiment_tag + "_loss"),
                            win=plt_win,
                            update='append', name="training loss")

                    viz.line(X=[len(viz_eval_loss) - 1], Y=[viz_eval_loss[-1]],
                            opts=dict(title=agent.experiment_tag + "_eval_loss"),
                            win=plt_win,
                            update='append', name="eval loss")


                if eval_plt_win is None:
                    eval_plt_win = viz.line(X=viz_x, Y=viz_eval_f1,
                                   opts=dict(title=agent.experiment_tag + "_eval_f1"),
                                   name="eval f1")
                else:
                    viz.line(X=[len(viz_eval_f1) - 1], Y=[viz_eval_f1[-1]],
                            opts=dict(title=agent.experiment_tag + "_eval_f1"),
                            win=eval_plt_win,
                            update='append', name="eval f1")

            # write accuracies down into file
            _s = json.dumps({"time spent": str(time_2 - time_1).rsplit(".")[0],
                            "loss": str(ave_train_loss.get_avg()),
                            "eval loss": str(eval_loss),
                            "eval f1": str(eval_f1)})
            with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
                outfile.write(_s + '\n')
                outfile.flush()
    
    # At any point you can hit Ctrl + C to break out of training early.
    except KeyboardInterrupt:
        print('--------------------------------------------')
        print('Exiting from training early...')
    if agent.run_eval:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            print('Evaluating on test set and saving log...')
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt", load_partial_graph=False)
        test_loss = evaluate.evaluate_observation_generation_loss(env, agent, "test")
        test_f1 = evaluate.evaluate_observation_generation_free_generation(env, agent, "test")
        print(test_loss, test_f1)
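
The backprop pattern in the inner loop above is truncated backpropagation through time: losses are accumulated for backprop_frequency steps, averaged, backpropagated, and the recurrent state is detached so gradients do not flow into earlier chunks. A minimal sketch with a toy GRUCell (the model, data, and sizes are invented stand-ins for the agent's decoder):

import torch

rnn = torch.nn.GRUCell(4, 8)
head = torch.nn.Linear(8, 1)
optimizer = torch.optim.Adam(list(rnn.parameters()) + list(head.parameters()), lr=1e-3)
backprop_frequency = 5
inputs = torch.randn(20, 3, 4)  # time x batch x features
prev_h = torch.zeros(3, 8)

last_k_losses = []
for t in range(inputs.size(0)):
    prev_h = rnn(inputs[t], prev_h)
    loss = head(prev_h).pow(2).mean()  # placeholder per-step loss
    last_k_losses.append(loss)
    if (t + 1) % backprop_frequency == 0 or t == inputs.size(0) - 1:
        optimizer.zero_grad()
        torch.mean(torch.stack(last_k_losses)).backward()
        optimizer.step()
        last_k_losses = []
        prev_h = prev_h.detach()  # cut the graph so the next chunk starts fresh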
Example No. 16
def train(data_path):

    time_1 = datetime.datetime.now()
    agent = Agent()

    # visdom
    viz = visdom.Visdom()
    plt_win = None
    eval_plt_win = None
    viz_avg_correct_state_acc, viz_avg_qa_acc = [], []
    viz_eval_sufficient_info_reward, viz_eval_qa_reward = [], []

    step_in_total = 0
    running_avg_qa_reward = generic.HistoryScoreCache(capacity=500)
    running_avg_sufficient_info_reward = generic.HistoryScoreCache(
        capacity=500)
    running_avg_qa_loss = generic.HistoryScoreCache(capacity=500)
    running_avg_correct_state_loss = generic.HistoryScoreCache(capacity=500)

    output_dir, data_dir = ".", "."
    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_sum_reward_so_far = 0.0
    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag +
                          "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" +
                                        agent.experiment_tag + "_model.pt")
            agent.update_target_net()
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag +
                                        ".pt")
            agent.update_target_net()
        else:
            print(
                "Failed to load pretrained model... couldn't find the checkpoint file..."
            )

    # Create temporary folder for the generated games.
    games_dir = tempfile.TemporaryDirectory(
        prefix="tw_games"
    )  # This is not deleted upon error. It would be better to use a with statement.
    games_dir = pjoin(games_dir.name, "")  # So path ends with '/'.
    # copy grammar files into tmp folder so that it works smoothly
    assert os.path.exists(
        "./textworld_data"), "Oh no! textworld_data folder is not there..."
    os.mkdir(games_dir)
    os.mkdir(pjoin(games_dir, "textworld_data"))
    copy_tree("textworld_data", games_dir + "textworld_data")
    if agent.run_eval:
        assert os.path.exists(pjoin(
            data_path,
            agent.testset_path)), "Oh no! test_set folder is not there..."
        os.mkdir(pjoin(games_dir, agent.testset_path))
        copy_tree(pjoin(data_path, agent.testset_path),
                  pjoin(games_dir, agent.testset_path))

    if agent.train_data_size == -1:
        game_queue_size = agent.batch_size * 5
        game_queue = []

    episode_no = 0
    if agent.train_data_size == -1:
        # endless mode
        game_generator_queue = game_generator.game_generator_queue(
            path=games_dir,
            random_map=agent.random_map,
            question_type=agent.question_type,
            max_q_size=agent.batch_size * 2,
            nb_worker=8)
    else:
        # generate the training set
        all_training_games = game_generator.game_generator(
            path=games_dir,
            random_map=agent.random_map,
            question_type=agent.question_type,
            train_data_size=agent.train_data_size)
        all_training_games.sort()
        all_env_ids = None
    while (True):
        if episode_no > agent.max_episode:
            break
        np.random.seed(episode_no)
        if agent.train_data_size == -1:
            # endless mode
            for _ in range(agent.batch_size):
                if not game_generator_queue.empty():
                    tmp_game = game_generator_queue.get()
                    if os.path.exists(tmp_game):
                        game_queue.append(tmp_game)
            if len(game_queue) == 0:
                time.sleep(0.1)
                continue
            can_delete_these = []
            if len(game_queue) > game_queue_size:
                can_delete_these = game_queue[:-game_queue_size]
                game_queue = game_queue[-game_queue_size:]
            sampled_games = np.random.choice(game_queue,
                                             agent.batch_size).tolist()
            env_ids = [
                register_game(gamefile, request_infos=request_infos)
                for gamefile in sampled_games
            ]
        else:
            if all_env_ids is None:
                all_env_ids = [
                    register_game(gamefile, request_infos=request_infos)
                    for gamefile in all_training_games
                ]
            env_ids = np.random.choice(all_env_ids, agent.batch_size).tolist()

        if len(env_ids) != agent.batch_size:  # could be fewer or more than batch_size
            env_ids = np.random.choice(env_ids, agent.batch_size).tolist()
        env_id = make_batch2(env_ids, parallel=True)
        env = gym.make(env_id)
        env.seed(episode_no)

        obs, infos = env.reset()
        batch_size = len(obs)
        # generate question-answer pairs here
        questions, answers, reward_helper_info = game_generator.generate_qa_pairs(
            infos, question_type=agent.question_type, seed=episode_no)
        print(
            "====================================================================================",
            episode_no)
        print(questions[0], answers[0])

        agent.train()
        agent.init(obs, infos)

        commands, last_facts, init_facts = [], [], []
        commands_per_step, game_facts_cache = [], []
        for i in range(batch_size):
            commands.append("restart")
            last_facts.append(None)
            init_facts.append(None)
            game_facts_cache.append([])
            commands_per_step.append(["restart"])

        observation_strings, possible_words = agent.get_game_info_at_certain_step(
            obs, infos)
        observation_strings = [
            a + " <|> " + item
            for a, item in zip(commands, observation_strings)
        ]
        input_quest, input_quest_char, _ = agent.get_agent_inputs(questions)

        transition_cache = []
        print_cmds = []
        counting_rewards_np = []
        valid_command_rewards_np = []

        act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode
        # push init state into counting reward dict
        state_strings = agent.get_state_strings(infos)
        _ = agent.get_binarized_count(state_strings, update=True)
        for step_no in range(agent.max_nb_steps_per_episode):
            # update answerer input
            for i in range(batch_size):
                if agent.not_finished_yet[i] == 1:
                    agent.naozi.push_one(i, copy.copy(observation_strings[i]))
                if agent.prev_step_is_still_interacting[i] == 1:
                    new_facts = process_facts(last_facts[i], infos["game"][i],
                                              infos["facts"][i],
                                              infos["last_action"][i],
                                              commands[i])
                    game_facts_cache[i].append(
                        new_facts
                    )  # info used in reward computing of existence question
                    last_facts[i] = new_facts
                    if step_no == 0:
                        init_facts[i] = copy.copy(new_facts)

            # generate commands
            if agent.noisy_net:
                agent.reset_noise()  # Draw a new set of noisy weights

            observation_strings_w_history = agent.naozi.get()
            input_observation, input_observation_char, _ = agent.get_agent_inputs(
                observation_strings_w_history)
            commands, replay_info = agent.act(obs,
                                              infos,
                                              input_observation,
                                              input_observation_char,
                                              input_quest,
                                              input_quest_char,
                                              possible_words,
                                              random=act_randomly)
            for i in range(batch_size):
                commands_per_step[i].append(commands[i])

            replay_info = [
                observation_strings_w_history, questions, possible_words
            ] + replay_info
            admissible_commands = [
                set(item) - set(["look", "wait", "inventory"])
                for item in infos["admissible_commands"]
            ]
            vc_rewards = [
                float(c in ac) for c, ac in zip(commands, admissible_commands)
            ]
            valid_command_rewards_np.append(np.array(vc_rewards))

            # pass commands into env
            obs, _, _, infos = env.step(commands)
            # possible words do not depend on history, because one can only interact with what is currently accessible
            observation_strings, possible_words = agent.get_game_info_at_certain_step(
                obs, infos)
            observation_strings = [
                a + " <|> " + item
                for a, item in zip(commands, observation_strings)
            ]
            # counting rewards
            state_strings = agent.get_state_strings(infos)
            c_rewards = agent.get_binarized_count(state_strings, update=True)
            counting_rewards_np.append(np.array(c_rewards))

            if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0:
                agent.reset_noise()  # Draw a new set of noisy weights

            if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0:
                interaction_loss = agent.update_interaction()
                if interaction_loss is not None:
                    running_avg_correct_state_loss.push(interaction_loss)
                qa_loss = agent.update_qa()
                if qa_loss is not None:
                    running_avg_qa_loss.push(qa_loss)

            print_cmds.append(commands[0] if agent.prev_step_is_still_interacting[0] else "--")
            # force stopping
            if step_no == agent.max_nb_steps_per_episode - 1:
                replay_info[-1] = torch.zeros_like(replay_info[-1])
            transition_cache.append(replay_info)
            step_in_total += 1
            if (step_no == agent.max_nb_steps_per_episode - 1) or (step_no > 0 and np.sum(generic.to_np(replay_info[-1])) == 0):
                break

        print(" / ".join(print_cmds))
        # The agent has exhausted all steps, now answer question.
        answerer_input = agent.naozi.get()
        answerer_input_observation, answerer_input_observation_char, answerer_observation_ids = agent.get_agent_inputs(
            answerer_input)

        chosen_word_indices = agent.answer_question_act_greedy(
            answerer_input_observation, answerer_input_observation_char,
            answerer_observation_ids, input_quest, input_quest_char)  # batch
        chosen_word_indices_np = generic.to_np(chosen_word_indices)
        chosen_answers = [
            agent.word_vocab[item] for item in chosen_word_indices_np
        ]
        # rewards
        # qa reward
        qa_reward_np = reward_helper.get_qa_reward(answers, chosen_answers)
        # sufficient info rewards
        masks = [item[-1] for item in transition_cache]
        masks_np = [generic.to_np(item) for item in masks]
        # 1 1 0 0 0 --> 1 1 0 0 0 0
        game_finishing_mask = np.stack(masks_np + [np.zeros((batch_size, ))], 0)  # game step+1 x batch size
        # 1 1 0 0 0 0 --> 0 1 0 0 0
        game_finishing_mask = game_finishing_mask[:-1, :] - game_finishing_mask[1:, :]  # game step x batch size
        game_running_mask = np.stack(masks_np, 0)  # game step x batch size
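        # Worked example for one game that stops interacting after step 1:
        #   running mask per step:   1 1 0 0 0
        #   padded with a zero row:  1 1 0 0 0 0
        #   adjacent difference:     0 1 0 0 0
        # i.e. game_finishing_mask is 1 exactly at the last interactive step.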

        if agent.question_type == "location":
            # sufficient info reward: location question
            reward_helper_info["observation_before_finish"] = answerer_input
            reward_helper_info["game_finishing_mask"] = game_finishing_mask
            sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_location(
                reward_helper_info)
        elif agent.question_type == "existence":
            # sufficient info reward: existence question
            reward_helper_info["observation_before_finish"] = answerer_input
            reward_helper_info["game_facts_per_step"] = game_facts_cache  # facts before issuing command (we want to stop at correct state)
            reward_helper_info["init_game_facts"] = init_facts
            reward_helper_info["full_facts"] = infos["facts"]
            reward_helper_info["answers"] = answers
            reward_helper_info["game_finishing_mask"] = game_finishing_mask
            sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_existence(
                reward_helper_info)
        elif agent.question_type == "attribute":
            # sufficient info reward: attribute question
            reward_helper_info["answers"] = answers
            reward_helper_info["game_facts_per_step"] = game_facts_cache  # facts before and after issuing commands (we want to compare the difference)
            reward_helper_info["init_game_facts"] = init_facts
            reward_helper_info["full_facts"] = infos["facts"]
            reward_helper_info["commands_per_step"] = commands_per_step  # commands issued at each step (we want to compare the difference)
            reward_helper_info["game_finishing_mask"] = game_finishing_mask
            sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_attribute(
                reward_helper_info)
        else:
            raise NotImplementedError

        # push qa experience into qa replay buffer
        for b in range(batch_size):  # data points in batch
            # if the agent is not in the correct state, do not push it into replay buffer
            if np.sum(sufficient_info_reward_np[b]) == 0.0:
                continue
            agent.qa_replay_memory.push(False, qa_reward_np[b],
                                        answerer_input[b], questions[b],
                                        answers[b])

        # assign sufficient info reward and counting reward to the corresponding steps
        counting_rewards_np = np.stack(counting_rewards_np,
                                       1)  # batch x game step
        valid_command_rewards_np = np.stack(valid_command_rewards_np,
                                            1)  # batch x game step
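        # Per-step command reward = sufficient-info reward
        #   + revisit_counting_lambda * counting bonus
        #   + valid_command_bonus_lambda * valid-command bonus,
        # with both bonuses masked to steps where the game was still running.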
        command_rewards_np = (
            sufficient_info_reward_np
            + counting_rewards_np * game_running_mask.T * agent.revisit_counting_lambda
            + valid_command_rewards_np * game_running_mask.T * agent.valid_command_bonus_lambda
        )  # batch x game step
        command_rewards = generic.to_pt(command_rewards_np,
                                        enable_cuda=agent.use_cuda,
                                        type="float")  # batch x game step
        for i in range(command_rewards_np.shape[1]):
            transition_cache[i].append(command_rewards[:, i])
        print(command_rewards_np[0])

        # push command generation experience into replay buffer
        for b in range(batch_size):
            is_prior = np.sum(command_rewards_np[b], 0) > 0.0
            for i in range(len(transition_cache)):
                batch_observation_strings, batch_question_strings, batch_possible_words, batch_chosen_indices, _, batch_rewards = transition_cache[i]
                is_final = True
                if masks_np[i][b] != 0:
                    is_final = False
                agent.command_generation_replay_memory.push(
                    is_prior, batch_observation_strings[b],
                    batch_question_strings[b],
                    [item[b] for item in batch_possible_words],
                    [item[b] for item in batch_chosen_indices],
                    batch_rewards[b], is_final)
                if masks_np[i][b] == 0.0:
                    break

        # for printing
        r_qa = np.mean(qa_reward_np)
        r_sufficient_info = np.mean(np.sum(sufficient_info_reward_np, -1))
        running_avg_qa_reward.push(r_qa)
        running_avg_sufficient_info_reward.push(r_sufficient_info)
        print_rewards = np.mean(np.sum(command_rewards_np, -1))
        obs_string = answerer_input[0]
        print(obs_string)
        # finish game
        agent.finish_of_episode(episode_no, batch_size)
        # close env
        env.close()
        if agent.train_data_size == -1:
            # when games are generated on the fly,
            # remove all files (including .json and .ni) that have been used
            files_to_delete = []
            for gamefile in can_delete_these:
                if not gamefile.endswith(".ulx"):
                    continue
                files_to_delete.append(gamefile)
                files_to_delete.append(gamefile.replace(".ulx", ".json"))
                files_to_delete.append(gamefile.replace(".ulx", ".ni"))
            # print("rm -f {}".format(" ".join(files_to_delete)))
            os.system("rm -f {}".format(" ".join(files_to_delete)))
        episode_no += batch_size

        time_2 = datetime.datetime.now()
        print(
            "Episode: {:3d} | time spent: {:s} | interaction loss: {:2.3f} | qa loss: {:2.3f} | rewards: {:2.3f} | qa acc: {:2.3f}/{:2.3f} | correct state: {:2.3f}/{:2.3f}"
            .format(episode_no,
                    str(time_2 - time_1).rsplit(".")[0],
                    running_avg_correct_state_loss.get_avg(),
                    running_avg_qa_loss.get_avg(), print_rewards, r_qa,
                    running_avg_qa_reward.get_avg(), r_sufficient_info,
                    running_avg_sufficient_info_reward.get_avg()))

        if episode_no < agent.learn_start_from_this_episode:
            continue
        if episode_no == 0 or (episode_no % agent.save_frequency > (episode_no - batch_size) % agent.save_frequency):
            continue
        eval_qa_reward, eval_sufficient_info_reward = 0.0, 0.0
        # evaluate
        if agent.run_eval:
            eval_qa_reward, eval_sufficient_info_reward = evaluate.evaluate(
                data_dir, agent)
            # if run eval, then save model by eval accuracy
            if eval_qa_reward + eval_sufficient_info_reward > best_sum_reward_so_far:
                best_sum_reward_so_far = eval_qa_reward + eval_sufficient_info_reward
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")
        # save model
        elif agent.save_checkpoint:
            if running_avg_qa_reward.get_avg() + running_avg_sufficient_info_reward.get_avg() > best_sum_reward_so_far:
                best_sum_reward_so_far = running_avg_qa_reward.get_avg() + running_avg_sufficient_info_reward.get_avg()
                agent.save_model_to_path(output_dir + "/" +
                                         agent.experiment_tag + "_model.pt")

        # plot using visdom
        viz_avg_correct_state_acc.append(
            running_avg_sufficient_info_reward.get_avg())
        viz_avg_qa_acc.append(running_avg_qa_reward.get_avg())
        viz_eval_sufficient_info_reward.append(eval_sufficient_info_reward)
        viz_eval_qa_reward.append(eval_qa_reward)
        viz_x = np.arange(len(viz_avg_correct_state_acc)).tolist()

        if plt_win is None:
            plt_win = viz.line(X=viz_x,
                               Y=viz_avg_correct_state_acc,
                               opts=dict(title=agent.experiment_tag +
                                         "_train"),
                               name="correct state")
            viz.line(X=viz_x,
                     Y=viz_avg_qa_acc,
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="qa")
        else:
            viz.line(X=[len(viz_avg_correct_state_acc) - 1],
                     Y=[viz_avg_correct_state_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="correct state")
            viz.line(X=[len(viz_avg_qa_acc) - 1],
                     Y=[viz_avg_qa_acc[-1]],
                     opts=dict(title=agent.experiment_tag + "_train"),
                     win=plt_win,
                     update='append',
                     name="qa")

        if eval_plt_win is None:
            eval_plt_win = viz.line(X=viz_x,
                                    Y=viz_eval_sufficient_info_reward,
                                    opts=dict(title=agent.experiment_tag +
                                              "_eval"),
                                    name="correct state")
            viz.line(X=viz_x,
                     Y=viz_eval_qa_reward,
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="qa")
        else:
            viz.line(X=[len(viz_eval_sufficient_info_reward) - 1],
                     Y=[viz_eval_sufficient_info_reward[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="correct state")
            viz.line(X=[len(viz_eval_qa_reward) - 1],
                     Y=[viz_eval_qa_reward[-1]],
                     opts=dict(title=agent.experiment_tag + "_eval"),
                     win=eval_plt_win,
                     update='append',
                     name="qa")

        # write accuracies down into file
        _s = json.dumps({
            "time spent": str(time_2 - time_1).rsplit(".")[0],
            "sufficient info": running_avg_sufficient_info_reward.get_avg(),
            "qa": running_avg_qa_reward.get_avg(),
            "eval sufficient info": eval_sufficient_info_reward,
            "eval qa": eval_qa_reward
        })
        with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
            outfile.write(_s + '\n')
            outfile.flush()
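
The mask and reward bookkeeping above can be sanity-checked in isolation. The following is a minimal, self-contained sketch (toy mask and reward values; the lambdas are hypothetical, not the ones used in training) of how the running/finishing masks and the mixed per-step command reward come together for a single game that stops interacting after step 1:

import numpy as np

# Toy "still running" masks for a batch of one game, 5 steps.
masks_np = [np.array([1.0]), np.array([1.0]), np.array([0.0]),
            np.array([0.0]), np.array([0.0])]                    # game step x batch
batch_size = 1

padded = np.stack(masks_np + [np.zeros((batch_size,))], 0)       # 1 1 0 0 0 0
game_finishing_mask = padded[:-1, :] - padded[1:, :]             # 0 1 0 0 0
game_running_mask = np.stack(masks_np, 0)                        # 1 1 0 0 0

# Hypothetical per-step reward components (batch x game step).
sufficient_info = np.array([[0.0, 1.0, 0.0, 0.0, 0.0]])
counting = np.array([[1.0, 1.0, 1.0, 1.0, 1.0]])
valid_command = np.array([[1.0, 0.0, 1.0, 1.0, 0.0]])
revisit_counting_lambda, valid_command_bonus_lambda = 0.1, 0.0   # hypothetical values

command_rewards = (sufficient_info
                   + counting * game_running_mask.T * revisit_counting_lambda
                   + valid_command * game_running_mask.T * valid_command_bonus_lambda)
print(game_finishing_mask[:, 0])  # -> [0. 1. 0. 0. 0.]
print(command_rewards[0])         # -> [0.1 1.1 0.  0.  0. ]
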
Exemplo n.º 17
0
 def avg_rewards(self):
     if len(self._storage) == 0:
         return 0.0
     rewards = [self._storage[i].reward for i in range(len(self._storage))]
     return to_np(torch.mean(torch.stack(rewards)))
def train():

    time_1 = datetime.datetime.now()
    config = generic.load_config()
    agent = Agent(config)
    output_dir = "."
    data_dir = "."

    # make game environments
    requested_infos = agent.select_additional_infos_lite()
    requested_infos_eval = agent.select_additional_infos()
    games_dir = "./"

    # training game env
    env, _ = reinforcement_learning_dataset.get_training_game_env(games_dir + config['rl']['data_path'],
                                                                  config['rl']['difficulty_level'],
                                                                  config['rl']['training_size'],
                                                                  requested_infos,
                                                                  agent.max_nb_steps_per_episode,
                                                                  agent.batch_size)

    if agent.run_eval:
        # evaluation game env
        eval_env, num_eval_game = reinforcement_learning_dataset.get_evaluation_game_env(games_dir + config['rl']['data_path'],
                                                                                         config['rl']['difficulty_level'],
                                                                                         requested_infos_eval,
                                                                                         agent.eval_max_nb_steps_per_episode,
                                                                                         agent.eval_batch_size,
                                                                                         valid_or_test="valid")
    else:
        eval_env, num_eval_game = None, None

    # visdom
    if config["general"]["visdom"]:
        import visdom
        viz = visdom.Visdom()
        reward_win, step_win = None, None
        dqn_loss_win = None
        eval_game_points_win, eval_step_win = None, None
        viz_game_rewards, viz_game_points, viz_game_points_normalized, viz_graph_rewards, viz_count_rewards, viz_step = [], [], [], [], [], []
        viz_dqn_loss = []
        viz_eval_game_points, viz_eval_game_points_normalized, viz_eval_step = [], [], []

    step_in_total = 0
    episode_no = 0
    running_avg_game_points = HistoryScoreCache(capacity=500)
    running_avg_game_points_normalized = HistoryScoreCache(capacity=500)
    running_avg_graph_rewards = HistoryScoreCache(capacity=500)
    running_avg_count_rewards = HistoryScoreCache(capacity=500)
    running_avg_game_steps = HistoryScoreCache(capacity=500)
    running_avg_dqn_loss = HistoryScoreCache(capacity=500)
    running_avg_game_rewards = HistoryScoreCache(capacity=500)

    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_train_performance_so_far, best_eval_performance_so_far = 0.0, 0.0
    prev_performance = 0.0

    if os.path.exists(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt"):
        agent.load_pretrained_graph_generation_model(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt")
    else:
        print("No real-valued graph generation module detected... Please check ", data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt") 

    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            # this experiment itself (in case the experiment crashes for unknown reasons on server)
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt", load_partial_graph=False)
            agent.update_target_net()
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            # load from pre-trained graph encoder
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag + ".pt")
            agent.update_target_net()

    i_am_patient = 0
    perfect_training = 0
    while(True):
        if episode_no > agent.max_episode:
            break
        np.random.seed(episode_no)
        env.seed(episode_no)
        obs, infos = env.reset()
        # filter look and examine actions
        for commands_ in infos["admissible_commands"]:
            for cmd_ in [cmd for cmd in commands_ if cmd != "examine cookbook" and cmd.split()[0] in ["examine", "look"]]:
                commands_.remove(cmd_)
        batch_size = len(obs)

        agent.train()
        agent.init()

        game_name_list = [game.metadata["uuid"].split("-")[-1] for game in infos["game"]]
        game_max_score_list = [game.max_score for game in infos["game"]]
        chosen_actions = []
        prev_step_dones, prev_rewards = [], []
        prev_graph_hidden_state = torch.zeros(batch_size, agent.online_net.block_hidden_dim)
        if agent.use_cuda:
            prev_graph_hidden_state = prev_graph_hidden_state.cuda()
        for _ in range(batch_size):
            chosen_actions.append("restart")
            prev_step_dones.append(0.0)
            prev_rewards.append(0.0)

        prev_h, prev_c = None, None
        episodes_masks = 1 - torch.tensor(prev_step_dones) # inverse of `prev_step_dones`
        episodes_masks = episodes_masks.cuda() if agent.use_cuda else episodes_masks

        observation_strings, action_candidate_list = agent.get_game_info_at_certain_step_lite(obs, infos)
        observation_for_counting = copy.copy(observation_strings)

        if agent.count_reward_lambda > 0:
            agent.reset_binarized_counter(batch_size)
            _ = agent.get_binarized_count(observation_for_counting)

        # Transitions must be stored in memory as ordered sequences, so we cache
        # what the agent returns during the episode and push everything into
        # memory together at the end of the game.
        transition_cache = []
        still_running_mask = []
        game_rewards, game_points, graph_rewards, count_rewards = [], [], [], []
        print_actions = []

        act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode
        for step_no in range(agent.max_nb_steps_per_episode):
            if agent.noisy_net:
                agent.reset_noise()  # Draw a new set of noisy weights

            # generate adj_matrices
            new_adjacency_matrix, new_graph_hidden_state = agent.generate_adjacency_matrix_for_rl(observation_strings, chosen_actions, prev_graph_hidden_state)
            new_chosen_actions, chosen_indices, prev_h, prev_c = agent.act(observation_strings, new_adjacency_matrix, action_candidate_list, previous_h=prev_h, previous_c=prev_c, random=act_randomly)
            replay_info = [observation_strings, action_candidate_list, chosen_indices, generic.to_np(prev_graph_hidden_state), chosen_actions]
            transition_cache.append(replay_info)
            chosen_actions = new_chosen_actions
            chosen_actions_before_parsing = [item[idx] for item, idx in zip(infos["admissible_commands"], chosen_indices)]
            obs, scores, dones, infos = env.step(chosen_actions_before_parsing)
            # filter look and examine actions
            for commands_ in infos["admissible_commands"]:
                for cmd_ in [cmd for cmd in commands_ if cmd != "examine cookbook" and cmd.split()[0] in ["examine", "look"]]:
                    commands_.remove(cmd_)
            ## prev_triplets = current_triplets # commented for obs_gen
            prev_graph_hidden_state = new_graph_hidden_state
            observation_strings, action_candidate_list = agent.get_game_info_at_certain_step_lite(obs, infos)
            observation_for_counting = copy.copy(observation_strings)

            if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0:
                agent.reset_noise()  # Draw a new set of noisy weights
            if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0:
                dqn_loss, _ = agent.update_dqn(episode_no)
                if dqn_loss is not None:
                    running_avg_dqn_loss.push(dqn_loss)

            if step_no == agent.max_nb_steps_per_episode - 1:
                # terminate the game because DQN requires one extra step
                dones = [True for _ in dones]

            step_in_total += 1
            still_running = [1.0 - float(item) for item in prev_step_dones]  # list of float
            prev_step_dones = dones
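            # game points earned this step = current cumulative score minus the previous one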
            step_rewards = [float(curr) - float(prev) for curr, prev in zip(scores, prev_rewards)]  # list of float
            game_points.append(copy.copy(step_rewards))
            if agent.use_negative_reward:
                step_rewards = [-1.0 if _lost else r for r, _lost in zip(step_rewards, infos["has_lost"])]  # list of float
                step_rewards = [5.0 if _won else r for r, _won in zip(step_rewards, infos["has_won"])]  # list of float
            prev_rewards = scores
            step_graph_rewards = [0.0 for _ in range(batch_size)] ## adding for obs_gen
            # counting bonus
            if agent.count_reward_lambda > 0:
                step_revisit_counting_rewards = agent.get_binarized_count(observation_for_counting, update=True)
                step_revisit_counting_rewards = [r * agent.count_reward_lambda for r in step_revisit_counting_rewards]
            else:
                step_revisit_counting_rewards = [0.0 for _ in range(batch_size)]
            still_running_mask.append(still_running)
            game_rewards.append(step_rewards)
            graph_rewards.append(step_graph_rewards)
            count_rewards.append(step_revisit_counting_rewards)
            print_actions.append(chosen_actions_before_parsing[0] if still_running[0] else "--")

            # if all ended, break
            if np.sum(still_running) == 0:
                break

        still_running_mask_np = np.array(still_running_mask)
        game_rewards_np = np.array(game_rewards) * still_running_mask_np  # step x batch
        game_points_np = np.array(game_points) * still_running_mask_np  # step x batch
        graph_rewards_np = np.array(graph_rewards) * still_running_mask_np  # step x batch
        count_rewards_np = np.array(count_rewards) * still_running_mask_np  # step x batch
        if agent.graph_reward_lambda > 0.0:
            graph_rewards_pt = generic.to_pt(graph_rewards_np, enable_cuda=agent.use_cuda, type='float')  # step x batch
        else:
            graph_rewards_pt = generic.to_pt(np.zeros_like(graph_rewards_np), enable_cuda=agent.use_cuda, type='float')  # step x batch
        if agent.count_reward_lambda > 0.0:
            count_rewards_pt = generic.to_pt(count_rewards_np, enable_cuda=agent.use_cuda, type='float')  # step x batch
        else:
            count_rewards_pt = generic.to_pt(np.zeros_like(count_rewards_np), enable_cuda=agent.use_cuda, type='float')  # step x batch
        command_rewards_pt = generic.to_pt(game_rewards_np, enable_cuda=agent.use_cuda, type='float')  # step x batch

        # push experience into replay buffer (dqn)
        avg_rewards_in_buffer = agent.dqn_memory.avg_rewards()
        for b in range(game_rewards_np.shape[1]):
            if still_running_mask_np.shape[0] == agent.max_nb_steps_per_episode and still_running_mask_np[-1][b] != 0:
                # need to pad one transition
                _need_pad = True
                tmp_game_rewards = game_rewards_np[:, b].tolist() + [0.0]
            else:
                _need_pad = False
                tmp_game_rewards = game_rewards_np[:, b]
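            # Admission filter: only push episodes whose mean reward is at least
            # buffer_reward_threshold times the buffer's current average reward.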
            if np.mean(tmp_game_rewards) < avg_rewards_in_buffer * agent.buffer_reward_threshold:
                continue
            for i in range(game_rewards_np.shape[0]):
                observation_strings, action_candidate_list, chosen_indices, graph_hidden_state, prev_action_strings = transition_cache[i]
                is_final = True
                if still_running_mask_np[i][b] != 0:
                    is_final = False
                agent.dqn_memory.add(observation_strings[b], prev_action_strings[b], action_candidate_list[b], chosen_indices[b], graph_hidden_state[b], command_rewards_pt[i][b], graph_rewards_pt[i][b], count_rewards_pt[i][b], is_final)
                if still_running_mask_np[i][b] == 0:
                    break
            if _need_pad:
                observation_strings, action_candidate_list, chosen_indices, graph_hidden_state, prev_action_strings = transition_cache[-1]
                agent.dqn_memory.add(observation_strings[b], prev_action_strings[b], action_candidate_list[b], chosen_indices[b], graph_hidden_state[b], command_rewards_pt[-1][b] * 0.0, graph_rewards_pt[-1][b] * 0.0, count_rewards_pt[-1][b] * 0.0, True)

        for b in range(batch_size):
            running_avg_game_points.push(np.sum(game_points_np, 0)[b])
            game_max_score_np = np.array(game_max_score_list, dtype="float32")
            running_avg_game_points_normalized.push((np.sum(game_points_np, 0) / game_max_score_np)[b])
            running_avg_game_steps.push(np.sum(still_running_mask_np, 0)[b])
            running_avg_game_rewards.push(np.sum(game_rewards_np, 0)[b])
            running_avg_graph_rewards.push(np.sum(graph_rewards_np, 0)[b])
            running_avg_count_rewards.push(np.sum(count_rewards_np, 0)[b])

        # finish game
        agent.finish_of_episode(episode_no, batch_size)
        episode_no += batch_size

        if episode_no < agent.learn_start_from_this_episode:
            continue
        if agent.report_frequency == 0 or (episode_no % agent.report_frequency > (episode_no - batch_size) % agent.report_frequency):
            continue
        time_2 = datetime.datetime.now()
        print("Episode: {:3d} | time spent: {:s} | dqn loss: {:2.3f} | game points: {:2.3f} | normalized game points: {:2.3f} | game rewards: {:2.3f} | graph rewards: {:2.3f} | count rewards: {:2.3f} | used steps: {:2.3f}".format(episode_no, str(time_2 - time_1).rsplit(".")[0], running_avg_dqn_loss.get_avg(), running_avg_game_points.get_avg(), running_avg_game_points_normalized.get_avg(), running_avg_game_rewards.get_avg(), running_avg_graph_rewards.get_avg(), running_avg_count_rewards.get_avg(), running_avg_game_steps.get_avg()))
        print(game_name_list[0] + ":    " + " | ".join(print_actions))

        # evaluate
        curr_train_performance = running_avg_game_points_normalized.get_avg()
        eval_game_points, eval_game_points_normalized, eval_game_step = 0.0, 0.0, 0.0
        if agent.run_eval:
            eval_game_points, eval_game_points_normalized, eval_game_step, detailed_scores = evaluate.evaluate_rl_with_real_graphs(eval_env, agent, num_eval_game)
            curr_eval_performance = eval_game_points_normalized
            curr_performance = curr_eval_performance
            if curr_eval_performance > best_eval_performance_so_far:
                best_eval_performance_so_far = curr_eval_performance
                agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
            elif curr_eval_performance == best_eval_performance_so_far:
                if curr_eval_performance > 0.0:
                    agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
                else:
                    if curr_train_performance >= best_train_performance_so_far:
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
        else:
            curr_eval_performance = 0.0
            detailed_scores = ""
            curr_performance = curr_train_performance
            if curr_train_performance >= best_train_performance_so_far:
                agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
        # update best train performance
        if curr_train_performance >= best_train_performance_so_far:
            best_train_performance_so_far = curr_train_performance

        if prev_performance <= curr_performance:
            i_am_patient = 0
        else:
            i_am_patient += 1
        prev_performance = curr_performance

        # if the patience counter reaches the limit, reload the best checkpoint
        if agent.patience > 0 and i_am_patient >= agent.patience:
            if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
                print('reload from a good checkpoint...')
                agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt", load_partial_graph=False)
                agent.update_target_net()
                i_am_patient = 0

        if running_avg_game_points_normalized.get_avg() >= 0.95:
            perfect_training += 1
        else:
            perfect_training = 0

        # plot using visdom
        if config["general"]["visdom"]:
            viz_game_rewards.append(running_avg_game_rewards.get_avg())
            viz_game_points.append(running_avg_game_points.get_avg())
            viz_game_points_normalized.append(running_avg_game_points_normalized.get_avg())
            viz_graph_rewards.append(running_avg_graph_rewards.get_avg())
            viz_count_rewards.append(running_avg_count_rewards.get_avg())
            viz_step.append(running_avg_game_steps.get_avg())
            viz_dqn_loss.append(running_avg_dqn_loss.get_avg())
            viz_eval_game_points.append(eval_game_points)
            viz_eval_game_points_normalized.append(eval_game_points_normalized)
            viz_eval_step.append(eval_game_step)
            viz_x = np.arange(len(viz_game_rewards)).tolist()

            if reward_win is None:
                reward_win = viz.line(X=viz_x, Y=viz_game_rewards,
                                   opts=dict(title=agent.experiment_tag + "_game_rewards"),
                                   name="game_rewards")
                viz.line(X=viz_x, Y=viz_graph_rewards,
                         opts=dict(title=agent.experiment_tag + "_graph_rewards"),
                         win=reward_win, update='append', name="graph_rewards")
                viz.line(X=viz_x, Y=viz_count_rewards,
                         opts=dict(title=agent.experiment_tag + "_count_rewards"),
                         win=reward_win, update='append', name="count_rewards")
                viz.line(X=viz_x, Y=viz_game_points,
                         opts=dict(title=agent.experiment_tag + "_game_points"),
                         win=reward_win, update='append', name="game_points")
                viz.line(X=viz_x, Y=viz_game_points_normalized,
                         opts=dict(title=agent.experiment_tag + "_game_points_normalized"),
                         win=reward_win, update='append', name="game_points_normalized")
            else:
                viz.line(X=[len(viz_game_rewards) - 1], Y=[viz_game_rewards[-1]],
                         opts=dict(title=agent.experiment_tag + "_game_rewards"),
                         win=reward_win,
                         update='append', name="game_rewards")
                viz.line(X=[len(viz_graph_rewards) - 1], Y=[viz_graph_rewards[-1]],
                         opts=dict(title=agent.experiment_tag + "_graph_rewards"),
                         win=reward_win,
                         update='append', name="graph_rewards")
                viz.line(X=[len(viz_count_rewards) - 1], Y=[viz_count_rewards[-1]],
                         opts=dict(title=agent.experiment_tag + "_count_rewards"),
                         win=reward_win,
                         update='append', name="count_rewards")
                viz.line(X=[len(viz_game_points) - 1], Y=[viz_game_points[-1]],
                         opts=dict(title=agent.experiment_tag + "_game_points"),
                         win=reward_win,
                         update='append', name="game_points")
                viz.line(X=[len(viz_game_points_normalized) - 1], Y=[viz_game_points_normalized[-1]],
                         opts=dict(title=agent.experiment_tag + "_game_points_normalized"),
                         win=reward_win,
                         update='append', name="game_points_normalized")

            if step_win is None:
                step_win = viz.line(X=viz_x, Y=viz_step,
                                   opts=dict(title=agent.experiment_tag + "_step"),
                                   name="step")
            else:
                viz.line(X=[len(viz_step) - 1], Y=[viz_step[-1]],
                         opts=dict(title=agent.experiment_tag + "_step"),
                         win=step_win,
                         update='append', name="step")

            if dqn_loss_win is None:
                dqn_loss_win = viz.line(X=viz_x, Y=viz_dqn_loss,
                                   opts=dict(title=agent.experiment_tag + "_dqn_loss"),
                                   name="dqn loss")
            else:
                viz.line(X=[len(viz_dqn_loss) - 1], Y=[viz_dqn_loss[-1]],
                         opts=dict(title=agent.experiment_tag + "_dqn_loss"),
                         win=dqn_loss_win,
                         update='append', name="dqn loss")

            if eval_game_points_win is None:
                eval_game_points_win = viz.line(X=viz_x, Y=viz_eval_game_points,
                                   opts=dict(title=agent.experiment_tag + "_eval_game_points"),
                                   name="eval game points")
                viz.line(X=viz_x, Y=viz_eval_game_points_normalized,
                         opts=dict(title=agent.experiment_tag + "_eval_game_points_normalized"),
                         win=eval_game_points_win, update='append', name="eval_game_points_normalized")
            else:
                viz.line(X=[len(viz_eval_game_points) - 1], Y=[viz_eval_game_points[-1]],
                         opts=dict(title=agent.experiment_tag + "_eval_game_points"),
                         win=eval_game_points_win,
                         update='append', name="eval game_points")
                viz.line(X=[len(viz_eval_game_points_normalized) - 1], Y=[viz_eval_game_points_normalized[-1]],
                         opts=dict(title=agent.experiment_tag + "_eval_game_points_normalized"),
                         win=eval_game_points_win,
                         update='append', name="eval_game_points_normalized")

            if eval_step_win is None:
                eval_step_win = viz.line(X=viz_x, Y=viz_eval_step,
                                   opts=dict(title=agent.experiment_tag + "_eval_step"),
                                   name="eval step")
            else:
                viz.line(X=[len(viz_eval_step) - 1], Y=[viz_eval_step[-1]],
                         opts=dict(title=agent.experiment_tag + "_eval_step"),
                         win=eval_step_win,
                         update='append', name="eval step")

        # write accuracies down into file
        _s = json.dumps({"time spent": str(time_2 - time_1).rsplit(".")[0],
                         "dqn loss": str(running_avg_dqn_loss.get_avg()),
                         "train game points": str(running_avg_game_points.get_avg()),
                         "train normalized game points": str(running_avg_game_points_normalized.get_avg()),
                         "train game rewards": str(running_avg_game_rewards.get_avg()),
                         "train graph rewards": str(running_avg_graph_rewards.get_avg()),
                         "train count rewards": str(running_avg_count_rewards.get_avg()),
                         "train steps": str(running_avg_game_steps.get_avg()),
                         "eval game points": str(eval_game_points),
                         "eval normalized game points": str(eval_game_points_normalized),
                         "eval steps": str(eval_game_step),
                         "detailed scores": detailed_scores})
        with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
            outfile.write(_s + '\n')
            outfile.flush()

        if curr_performance == 1.0 and curr_train_performance >= 0.95:
            break
        if perfect_training >= 3:
            break
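
HistoryScoreCache is only used here as a bounded running average of recent scores. Its interface is assumed from the calls above (push() and get_avg()); a minimal stand-in, not the actual class from this codebase, might look like:

from collections import deque


class HistoryScoreCache:
    """Bounded running average of recent scores (sketch; interface assumed)."""

    def __init__(self, capacity=500):
        self.scores = deque(maxlen=capacity)  # oldest entries fall out automatically

    def push(self, score):
        self.scores.append(float(score))

    def get_avg(self):
        return sum(self.scores) / len(self.scores) if self.scores else 0.0
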
Exemplo n.º 19
0
def evaluate(data_path, agent):

    eval_data_path = pjoin(data_path, agent.eval_data_path)

    with open(eval_data_path) as f:
        data = json.load(f)
    data = data[agent.question_type]
    data = data["random_map"] if agent.random_map else data["fixed_map"]
    correct_answers = []
    predicted_answers = []

    print_qa_reward, print_sufficient_info_reward = [], []
    for game_path in tqdm(data):
        game_file_path = pjoin(data_path, game_path)
        assert os.path.exists(game_file_path), "Oh no! game path %s does not exist!" % game_file_path
        env_id = register_games([game_file_path], request_infos=request_infos)
        env_id = make_batch(env_id,
                            batch_size=agent.eval_batch_size,
                            parallel=True)
        env = gym.make(env_id)

        data_questions = [item["question"] for item in data[game_path]]
        data_answers = [item["answer"] for item in data[game_path]]
        data_entities = [item["entity"] for item in data[game_path]]
        if agent.question_type == "attribute":
            data_attributes = [item["attribute"] for item in data[game_path]]

        for q_no in range(len(data_questions)):
            questions = data_questions[q_no:q_no + 1]
            answers = data_answers[q_no:q_no + 1]
            reward_helper_info = {
                "_entities": data_entities[q_no:q_no + 1],
                "_answers": data_answers[q_no:q_no + 1]
            }
            if agent.question_type == "attribute":
                reward_helper_info["_attributes"] = data_attributes[q_no:q_no +
                                                                    1]

            obs, infos = env.reset()
            batch_size = len(obs)
            agent.eval()
            agent.init(obs, infos)
            # get inputs
            commands, last_facts, init_facts = [], [], []
            commands_per_step, game_facts_cache = [], []
            for i in range(batch_size):
                commands.append("restart")
                last_facts.append(None)
                init_facts.append(None)
                game_facts_cache.append([])
                commands_per_step.append(["restart"])

            observation_strings, possible_words = agent.get_game_info_at_certain_step(
                obs, infos)
            observation_strings = [
                a + " <|> " + item
                for a, item in zip(commands, observation_strings)
            ]
            input_quest, input_quest_char, _ = agent.get_agent_inputs(
                questions)

            transition_cache = []

            for step_no in range(agent.eval_max_nb_steps_per_episode):
                # update answerer input
                for i in range(batch_size):
                    if agent.not_finished_yet[i] == 1:
                        agent.naozi.push_one(i,
                                             copy.copy(observation_strings[i]))
                    if agent.prev_step_is_still_interacting[i] == 1:
                        new_facts = process_facts(last_facts[i],
                                                  infos["game"][i],
                                                  infos["facts"][i],
                                                  infos["last_action"][i],
                                                  commands[i])
                        game_facts_cache[i].append(new_facts)  # used when computing the reward for existence questions
                        last_facts[i] = new_facts
                        if step_no == 0:
                            init_facts[i] = copy.copy(new_facts)

                observation_strings_w_history = agent.naozi.get()
                input_observation, input_observation_char, _ = agent.get_agent_inputs(
                    observation_strings_w_history)
                commands, replay_info = agent.act(obs,
                                                  infos,
                                                  input_observation,
                                                  input_observation_char,
                                                  input_quest,
                                                  input_quest_char,
                                                  possible_words,
                                                  random=False)
                for i in range(batch_size):
                    commands_per_step[i].append(commands[i])

                replay_info = [
                    observation_strings_w_history, questions, possible_words
                ] + replay_info
                transition_cache.append(replay_info)

                obs, _, _, infos = env.step(commands)
                # possible words do not depend on history, because one can only interact with what is currently accessible
                observation_strings, possible_words = agent.get_game_info_at_certain_step(
                    obs, infos)
                observation_strings = [
                    a + " <|> " + item
                    for a, item in zip(commands, observation_strings)
                ]

                if (step_no == agent.eval_max_nb_steps_per_episode - 1) or (step_no > 0 and np.sum(generic.to_np(replay_info[-1])) == 0):
                    break

            # The agent has exhausted all steps, now answer question.
            answerer_input = agent.naozi.get()
            answerer_input_observation, answerer_input_observation_char, answerer_observation_ids = agent.get_agent_inputs(
                answerer_input)

            chosen_word_indices = agent.answer_question_act_greedy(
                answerer_input_observation, answerer_input_observation_char,
                answerer_observation_ids, input_quest,
                input_quest_char)  # batch
            chosen_word_indices_np = generic.to_np(chosen_word_indices)
            chosen_answers = [
                agent.word_vocab[item] for item in chosen_word_indices_np
            ]

            correct_answers.extend(answers)
            predicted_answers.extend(chosen_answers)
            # rewards
            # qa reward
            qa_reward_np = reward_helper.get_qa_reward(answers, chosen_answers)
            # sufficient info rewards
            masks = [item[-1] for item in transition_cache]
            masks_np = [generic.to_np(item) for item in masks]
            # 1 1 0 0 0 --> 1 1 0 0 0 0
            game_finishing_mask = np.stack(masks_np + [np.zeros((batch_size, ))], 0)  # game step+1 x batch size
            # 1 1 0 0 0 0 --> 0 1 0 0 0
            game_finishing_mask = game_finishing_mask[:-1, :] - game_finishing_mask[1:, :]  # game step x batch size

            if agent.question_type == "location":
                # sufficient info reward: location question
                reward_helper_info["observation_before_finish"] = answerer_input
                reward_helper_info["game_finishing_mask"] = game_finishing_mask
                sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_location(
                    reward_helper_info)
            elif agent.question_type == "existence":
                # sufficient info reward: existence question
                reward_helper_info["observation_before_finish"] = answerer_input
                reward_helper_info["game_facts_per_step"] = game_facts_cache  # facts before issuing command (we want to stop at correct state)
                reward_helper_info["init_game_facts"] = init_facts
                reward_helper_info["full_facts"] = infos["facts"]
                reward_helper_info["answers"] = answers
                reward_helper_info["game_finishing_mask"] = game_finishing_mask
                sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_existence(
                    reward_helper_info)
            elif agent.question_type == "attribute":
                # sufficient info reward: attribute question
                reward_helper_info["answers"] = answers
                reward_helper_info["game_facts_per_step"] = game_facts_cache  # facts before and after issuing commands (we want to compare the difference)
                reward_helper_info["init_game_facts"] = init_facts
                reward_helper_info["full_facts"] = infos["facts"]
                reward_helper_info["commands_per_step"] = commands_per_step  # commands issued at each step (we want to compare the difference)
                reward_helper_info["game_finishing_mask"] = game_finishing_mask
                sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_attribute(
                    reward_helper_info)
            else:
                raise NotImplementedError

            r_qa = np.mean(qa_reward_np)
            r_sufficient_info = np.mean(np.sum(sufficient_info_reward_np, -1))
            print_qa_reward.append(r_qa)
            print_sufficient_info_reward.append(r_sufficient_info)
        env.close()

    precision, recall, fscore, _ = precision_recall_fscore_support(
        correct_answers, predicted_answers, average='micro')
    print("\n\n---------- From evaluation --------\n")
    print("precision: %f, recall: %f, f1 score: %f" %
          (precision, recall, fscore))
    print("\n\n---------------------------------")

    print("===== Eval =====: qa acc: {:2.3f} | correct state: {:2.3f}".format(
        np.mean(print_qa_reward), np.mean(print_sufficient_info_reward)))
    return np.mean(print_qa_reward), np.mean(print_sufficient_info_reward)
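
Since each question gets exactly one predicted answer, the micro-averaged precision, recall and F1 reported here all collapse to plain accuracy (the fraction of exactly correct answers). A quick check, assuming scikit-learn is installed (the answer strings below are made up):

from sklearn.metrics import precision_recall_fscore_support

y_true = ["kitchen", "garden", "kitchen", "pantry"]
y_pred = ["kitchen", "kitchen", "kitchen", "pantry"]
p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average='micro')
print(p, r, f)  # 0.75 0.75 0.75 -- identical to exact-match accuracy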