q_func = np.zeros((NUM_ROOM_DESC, NUM_QUESTS, NUM_ACTIONS, NUM_OBJECTS))

    single_run_epoch_rewards_test = []
    pbar = tqdm(range(NUM_EPOCHS), ncols=80)
    for _ in pbar:
        single_run_epoch_rewards_test.append(run_epoch())
        pbar.set_description(
            "Avg reward: {:0.6f} | Ewma reward: {:0.6f}".format(
                np.mean(single_run_epoch_rewards_test),
                utils.ewma(single_run_epoch_rewards_test)))
    return single_run_epoch_rewards_test


if __name__ == '__main__':
    # Data loading and build the dictionaries that use unique index for each state
    (dict_room_desc, dict_quest_desc) = framework.make_all_states_index()
    NUM_ROOM_DESC = len(dict_room_desc)  # number of distinct room descriptions
    NUM_QUESTS = len(dict_quest_desc)    # number of distinct quest descriptions

    # set up the game
    framework.load_game_data()

    epoch_rewards_test = []  # shape NUM_RUNS * NUM_EPOCHS

    # NOTE(review): NUM_RUNS and run() are defined elsewhere in this file
    # (not in this chunk); each run() presumably trains from a fresh Q-table
    # and returns the per-epoch test rewards for that run — confirm.
    for _ in range(NUM_RUNS):
        epoch_rewards_test.append(run())

    # Stack into a (NUM_RUNS, NUM_EPOCHS) array for averaging/plotting.
    epoch_rewards_test = np.array(epoch_rewards_test)

    # Set up a reward-vs-epoch plot; the remaining plotting code
    # (labels, plot call, show) appears to continue past this chunk.
    x = np.arange(NUM_EPOCHS)
    fig, axis = plt.subplots()
def run_episode(for_training):
    """Run one episode of the text game.

    If for_training is True, updates the (global) Q function after each
    step. If for_training is False, computes and returns the cumulative
    discounted reward collected over the episode.

    Args:
        for_training (bool): True if for training.

    Returns:
        float: cumulative discounted reward when for_training is False;
        None when for_training is True.
    """
    # Exploration rate depends on train/test mode.
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # Maps from state-description strings to tabular indices.
    dict_room_desc, dict_quest_desc = framework.make_all_states_index()

    epi_reward = 0

    # Start a fresh episode.
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    t = 0  # step counter; used as the discount exponent GAMMA**t
    while not terminal:
        # Translate the current state descriptions into table indices.
        current_room = dict_room_desc[current_room_desc]
        current_quest = dict_quest_desc[current_quest_desc]

        # Choose the next (action, object) pair epsilon-greedily
        # from the (global) Q table.
        action_index, object_index = epsilon_greedy(current_room,
                                                    current_quest, q_func,
                                                    epsilon)

        # Advance the game one step, then translate the next state.
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)
        next_room = dict_room_desc[next_room_desc]
        next_quest = dict_quest_desc[next_quest_desc]

        if for_training:
            # In-place tabular Q-learning update on q_func.
            tabular_q_learning(q_func, current_room, current_quest,
                               action_index, object_index, reward, next_room,
                               next_quest, terminal)
        else:
            # Accumulate the discounted reward for evaluation.
            epi_reward += (GAMMA**t) * reward

        # Prepare the next step.
        t += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward