Example #1
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    # TODO Your code here
    epi_reward = 0
    t = 0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        # TODO Your code here
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        action_index, object_index = epsilon_greedy(current_state_vector,
                                                    theta, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)

        if for_training:
            # update Q-function
            next_state = next_room_desc + next_quest_desc
            next_state_vector = utils.extract_bow_feature_vector(
                next_state, dictionary)
            linear_q_learning(theta, current_state_vector, action_index,
                              object_index, reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + reward * (GAMMA**t)
            t = t + 1

        # prepare next step
        # TODO Your code here
        # update current_room_desc and current_quest_desc
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
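
Note on the linear examples (#1, #15, #18, #22, #25, #29): they assume a global weight matrix theta and a helper linear_q_learning whose body is not shown here. The sketch below is one plausible implementation consistent with the call linear_q_learning(theta, current_state_vector, action_index, object_index, reward, next_state_vector, terminal); the row layout of theta, the constants ALPHA / GAMMA / NUM_OBJECTS, and the helper tuple2index are assumptions for illustration, not the course's actual code.

import numpy as np

ALPHA = 0.01     # assumed learning rate
GAMMA = 0.5      # assumed discount factor
NUM_OBJECTS = 4  # assumed number of objects per action


def tuple2index(action_index, object_index):
    """Flatten an (action, object) command into a single row index of theta."""
    return action_index * NUM_OBJECTS + object_index


def linear_q_learning(theta, current_state_vector, action_index, object_index,
                      reward, next_state_vector, terminal):
    """One TD update of the linear Q-function Q(s, c) = theta[c] . phi(s)."""
    # Best attainable value from the next state; zero once the episode ends.
    max_next_q = 0.0 if terminal else np.max(theta @ next_state_vector)
    target = reward + GAMMA * max_next_q
    row = tuple2index(action_index, object_index)
    q_value = theta[row] @ current_state_vector
    # Gradient step on the squared TD error; theta is updated in place.
    theta[row] += ALPHA * (target - q_value) * current_state_vector
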
Example #2
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    # TODO Your code here

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    # initial value
    count = 0
    epi_reward = 0

    while not terminal:
        # Choose next action and execute
        # TODO Your code here

        # recall index from dictionary by "description" key
        current_state_1 = dict_room_desc[current_room_desc]
        current_state_2 = dict_quest_desc[current_quest_desc]

        (action_index, object_index) = epsilon_greedy(current_state_1,
                                                      current_state_2, q_func,
                                                      epsilon)

        (next_room_desc, next_quest_desc, reward, terminal) \
            = framework.step_game(current_room_desc, current_quest_desc, action_index, object_index)

        next_state_1 = dict_room_desc[next_room_desc]
        next_state_2 = dict_quest_desc[next_quest_desc]

        if for_training:
            # update Q-function.
            # TODO Your code here
            tabular_q_learning(q_func, current_state_1, current_state_2,
                               action_index, object_index, reward,
                               next_state_1, next_state_2, terminal)

        if not for_training:
            # update reward
            # TODO Your code here
            epi_reward += np.power(GAMMA, count) * reward

        # prepare next step
        # TODO Your code here
        count += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
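
Note on the tabular examples: q_func is a 4-D NumPy array indexed as q_func[room_index, quest_index, action_index, object_index] (see the commented-out initialization in Example #28), and epsilon_greedy(state_1, state_2, q_func, epsilon) returns an (action, object) pair. A minimal selector consistent with that layout might look as follows; the tie-breaking behaviour of the course implementation is not shown anywhere above and is an assumption.

import numpy as np


def epsilon_greedy(state_1, state_2, q_func, epsilon):
    """With probability epsilon pick a random (action, object) command,
    otherwise pick the command with the highest Q-value in this state."""
    q_values = q_func[state_1, state_2]  # shape (NUM_ACTIONS, NUM_OBJECTS)
    if np.random.rand() < epsilon:
        action_index = np.random.randint(q_values.shape[0])
        object_index = np.random.randint(q_values.shape[1])
    else:
        action_index, object_index = np.unravel_index(np.argmax(q_values),
                                                      q_values.shape)
    return int(action_index), int(object_index)
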
Example #3
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """


    if for_training:
        epsilon = TRAINING_EP
    else:
        epsilon = TESTING_EP
    epi_reward = 0
    current_room_desc, current_quest_desc, terminal = framework.newGame()
    step = 0
    while not terminal:
        state_1, state_2 = dict_room_desc[current_room_desc], dict_quest_desc[current_quest_desc]
        action_index, object_index = epsilon_greedy(state_1, state_2, q_func, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(current_room_desc, current_quest_desc, action_index, object_index)
        next_state_1, next_state_2 = dict_room_desc[next_room_desc], dict_quest_desc[next_quest_desc]
        if for_training:
            tabular_q_learning(q_func, state_1, state_2, action_index,
                               object_index, reward, next_state_1,
                               next_state_2, terminal)
        if not for_training:
            epi_reward += reward * (GAMMA**step)

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc
        step += 1

    if not for_training:
        return epi_reward
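
All tabular examples delegate the update itself to tabular_q_learning(q_func, current_state_1, current_state_2, action_index, object_index, reward, next_state_1, next_state_2, terminal). A sketch of the standard Q-learning rule behind that signature is given below; the learning rate ALPHA and the value of GAMMA are assumptions, and the assignment's real implementation may differ in detail.

import numpy as np

ALPHA = 0.1  # assumed learning rate
GAMMA = 0.5  # assumed discount factor


def tabular_q_learning(q_func, current_state_1, current_state_2, action_index,
                       object_index, reward, next_state_1, next_state_2,
                       terminal):
    """Q(s, c) <- (1 - ALPHA) * Q(s, c) + ALPHA * (r + GAMMA * max_c' Q(s', c'))."""
    # Value of the best command in the next state; zero if the episode ended.
    max_next_q = 0.0 if terminal else np.max(q_func[next_state_1, next_state_2])
    target = reward + GAMMA * max_next_q
    current_q = q_func[current_state_1, current_state_2,
                       action_index, object_index]
    q_func[current_state_1, current_state_2, action_index, object_index] = (
        (1 - ALPHA) * current_q + ALPHA * target)
    return None  # the table is modified in place
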
Example #4
def run_episode(for_training):
    """
        Runs one episode
        If for training, update Q function
        If for testing, computes and return cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = None

    # initialize for each episode

    epi_reward = 0
    gamma_t = 1
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        # choose the next action based on epsilon greedy policy
        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      epsilon)

        # go to the next state based on the action
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(current_room_desc, current_quest_desc,
                                         action_index, object_index)

        # extract the feature vector of the next state
        next_state = next_room_desc + next_quest_desc
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state, dictionary))

        if for_training:
            # update Q-function.
            # TODO Your code here
            #update the parameters of deep Q network
            deep_q_learning(current_state_vector, action_index, object_index,
                            reward, next_state_vector, terminal)
            #pass

        if not for_training:
            # update reward

            #update episodic reward with discount
            epi_reward += gamma_t * reward
            #pass

        # prepare next step

        #update current state to the next state
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc
        gamma_t = gamma_t * GAMMA

    if not for_training:
        return epi_reward
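
Note on the deep examples (#4, #6, #13, #19, #20, #23, #24): epsilon_greedy(state_vector, epsilon) and deep_q_learning(...) take no model argument, so both are assumed to close over a global network and optimizer. The self-contained sketch below shows one way those globals and helpers could fit together; the two-head architecture, the sizes, the learning rate, and the averaging of the two heads into a single Q-value are illustrative assumptions, not the course's reference implementation.

import numpy as np
import torch
import torch.nn as nn

GAMMA = 0.5                      # assumed discount factor
STATE_DIM, HIDDEN = 100, 64      # assumed bag-of-words and hidden sizes
NUM_ACTIONS, NUM_OBJECTS = 4, 4  # assumed command space


class DQN(nn.Module):
    """Tiny two-head network: one head scores actions, the other objects."""

    def __init__(self):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(STATE_DIM, HIDDEN), nn.ReLU())
        self.action_head = nn.Linear(HIDDEN, NUM_ACTIONS)
        self.object_head = nn.Linear(HIDDEN, NUM_OBJECTS)

    def forward(self, x):
        hidden = self.body(x)
        return self.action_head(hidden), self.object_head(hidden)


model = DQN()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)


def epsilon_greedy(state_vector, epsilon):
    """Random (action, object) with probability epsilon, greedy otherwise."""
    if np.random.rand() < epsilon:
        return np.random.randint(NUM_ACTIONS), np.random.randint(NUM_OBJECTS)
    with torch.no_grad():
        q_actions, q_objects = model(state_vector)
    return int(torch.argmax(q_actions)), int(torch.argmax(q_objects))


def deep_q_learning(current_state_vector, action_index, object_index, reward,
                    next_state_vector, terminal):
    """One gradient step on the squared TD error of the chosen command."""
    with torch.no_grad():
        next_q_actions, next_q_objects = model(next_state_vector)
        max_next_q = 0.0 if terminal else 0.5 * (
            next_q_actions.max() + next_q_objects.max())
    target = reward + GAMMA * max_next_q
    q_actions, q_objects = model(current_state_vector)
    q_value = 0.5 * (q_actions[action_index] + q_objects[object_index])
    loss = (q_value - target) ** 2
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
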
Example #5
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP


    # initialize for each episode
    epi_reward = 0.0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    
    while not terminal:
        # Choose next action and execute
        current_room_desc_index = dict_room_desc[current_room_desc]
        current_quest_desc_index = dict_quest_desc[current_quest_desc]   # Get room and quest indices
        
        next_action_index, next_object_index = epsilon_greedy(current_room_desc_index,
                                                              current_quest_desc_index,
                                                              q_func,
                                                              epsilon)  # Get next action/object
        
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
                                                            current_room_desc, 
                                                            current_quest_desc, 
                                                            next_action_index, 
                                                            next_object_index)  # Take a step
        
        # Only need room index; quest remains same during an episode
        next_room_desc_index = dict_room_desc[next_room_desc] 

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, current_room_desc_index, current_quest_desc_index, 
                               next_action_index, next_object_index, reward, 
                               next_room_desc_index, current_quest_desc_index,
                               terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**(framework.STEP_COUNT - 1))*reward    

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward
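
The examples accumulate the discounted return in three interchangeable ways: an explicit exponent GAMMA**t with a local step counter, a running product gamma_step that is multiplied by GAMMA each step, and the framework's own counter via GAMMA**(framework.STEP_COUNT - 1), as in Example #5 above. Assuming the exponent is 0 for the first reward, all of them compute sum_t GAMMA**t * r_t; a quick sanity check of the first two conventions:

GAMMA = 0.5
rewards = [0.0, -0.1, 1.0]  # hypothetical per-step rewards of one episode

# explicit exponent on a local step counter
r_exponent = sum((GAMMA ** t) * r for t, r in enumerate(rewards))

# running product, as in the gamma_step variants
r_product, gamma_step = 0.0, 1.0
for r in rewards:
    r_product += gamma_step * r
    gamma_step *= GAMMA

assert abs(r_exponent - r_product) < 1e-12
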
Example #6
def run_episode(for_training):
    """
        Runs one episode
        If for training, update Q function
        If for testing, computes and return cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    # TODO Your code here

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    count = 0
    epi_reward = 0

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        # TODO Your code here
        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      epsilon)

        (next_room_desc, next_quest_desc, reward, terminal) \
            = framework.step_game(current_room_desc, current_quest_desc, action_index, object_index)

        next_state = next_room_desc + next_quest_desc
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state, dictionary))

        if for_training:
            # update Q-function.
            # TODO Your code here
            deep_q_learning(current_state_vector, action_index, object_index,
                            reward, next_state_vector, terminal)

            pass

        if not for_training:
            # update reward
            # TODO Your code here
            epi_reward += np.power(GAMMA, count) * reward
            pass

        # prepare next step
        # TODO Your code here
        count += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
Example #7
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    epi_reward = 0
    gamma = 1

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        current_room_index = dict_room_desc[current_room_desc]
        current_quest_index = dict_quest_desc[current_quest_desc]

        # Choose next action and execute
        # TODO Your code here
        action_index, object_index = epsilon_greedy(current_room_index,
                                                    current_quest_index,
                                                    q_func, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)
        next_room_index = dict_room_desc[next_room_desc]
        next_quest_index = dict_quest_desc[next_quest_desc]

        if for_training:
            # update Q-function.
            # TODO Your code here
            tabular_q_learning(q_func, current_room_index, current_quest_index,
                               action_index, object_index, reward,
                               next_room_index, next_quest_index, terminal)
            pass

        if not for_training:
            # update reward
            # TODO Your code here
            epi_reward += reward * gamma
            gamma *= GAMMA
            pass

        # prepare next step
        # TODO Your code here
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
Example #8
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    epi_reward = 0.00
    t = 0
    # initialize for each episode
    # TODO Your code here

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        # TODO Your code here
        state_1 = dict_room_desc[current_room_desc]
        state_2 = dict_quest_desc[current_quest_desc]

        (action_index, object_index) = epsilon_greedy(state_1, state_2, q_func,
                                                      epsilon)
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(current_room_desc, current_quest_desc,
                                         action_index, object_index)
        (next_state_1, next_state_2) = (dict_room_desc[next_room_desc],
                                        dict_quest_desc[next_quest_desc])

        if for_training:
            # update Q-function.
            # TODO Your code here
            tabular_q_learning(q_func, state_1, state_2, action_index,
                               object_index, reward, next_state_1,
                               next_state_2, terminal)

        if not for_training:
            # update reward
            # TODO Your code here
            epi_reward = epi_reward + (GAMMA**t) * reward
            t += 1

        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

        # prepare next step
        # TODO Your code here

    if not for_training:
        return epi_reward
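
Every example leans on the same small framework API: newGame() -> (room description, quest description, terminal), step_game(room_desc, quest_desc, action_index, object_index) -> (next room, next quest, reward, terminal), plus make_all_states_index() and STEP_COUNT in some variants. The stub below is not the course's framework; it is only a hypothetical stand-in with random transitions that can be useful for smoke-testing run_episode in isolation.

import random


class _StubFramework:
    """Throwaway stand-in for the game framework; transitions are random."""
    STEP_COUNT = 0
    _ROOMS = ["a dark cave", "a sunny meadow"]
    _QUESTS = ["find the sword", "eat the apple"]

    def newGame(self):
        _StubFramework.STEP_COUNT = 0
        return random.choice(self._ROOMS), random.choice(self._QUESTS), False

    def step_game(self, room_desc, quest_desc, action_index, object_index):
        _StubFramework.STEP_COUNT += 1
        next_room = random.choice(self._ROOMS)
        reward = random.uniform(-0.1, 1.0)
        terminal = _StubFramework.STEP_COUNT >= 5  # end after a few steps
        return next_room, quest_desc, reward, terminal

    def make_all_states_index(self):
        return ({d: i for i, d in enumerate(self._ROOMS)},
                {d: i for i, d in enumerate(self._QUESTS)})


framework = _StubFramework()
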
Example #9
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    gamma_step = 1
    epi_reward = 0
    # initialize for each episode
    # TODO Your code here

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        # TODO Your code here
        cur_room_desc_id = dict_room_desc[current_room_desc]
        cur_quest_desc_id = dict_quest_desc[current_quest_desc]
        (action_index, object_index) = epsilon_greedy(cur_room_desc_id,
                                                      cur_quest_desc_id,
                                                      q_func, epsilon)
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(current_room_desc, current_quest_desc,
                                         action_index, object_index)

        if for_training:
            # update Q-function.
            # TODO Your code here
            next_room_desc_id = dict_room_desc[next_room_desc]
            next_quest_desc_id = dict_quest_desc[next_quest_desc]
            tabular_q_learning(q_func, cur_room_desc_id, cur_quest_desc_id,
                               action_index, object_index, reward,
                               next_room_desc_id, next_quest_desc_id, terminal)

        if not for_training:
            # update reward
            # TODO Your code here
            epi_reward = epi_reward + gamma_step * reward
            gamma_step = gamma_step * GAMMA

        # prepare next step
        # TODO Your code here
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
Example #10
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0.
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    current_state_1 = dict_room_desc.get(current_room_desc)
    current_state_2 = dict_quest_desc.get(current_quest_desc)
    steps = 0

    while not terminal:
        steps += 1
        # Choose next action and execute
        action_index, object_index = epsilon_greedy(current_state_1,
                                                    current_state_2, q_func,
                                                    epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)

        next_state_1 = dict_room_desc.get(next_room_desc)
        next_state_2 = dict_quest_desc.get(next_quest_desc)

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, current_state_1, current_state_2,
                               action_index, object_index, reward,
                               next_state_1, next_state_2, terminal)
            pass

        if not for_training:
            # update reward
            epi_reward += GAMMA**(steps - 1) * reward
            pass

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc
        current_state_1 = next_state_1
        current_state_2 = next_state_2

    if not for_training:
        return epi_reward
Example #11
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    epi_reward = 0
    # initialize for each episode
    # TODO Your code here
    num_steps = 0
    # dict_room_desc, dict_quest_desc = framework.make_all_states_index()

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        # TODO Your code here
        state_r, state_q = dict_room_desc[current_room_desc], dict_quest_desc[
            current_quest_desc]
        next_action, next_object = epsilon_greedy(state_r, state_q, q_func,
                                                  epsilon)
        next_room_desc, next_quest_desc, reward, terminal = (
            framework.step_game(current_room_desc, current_quest_desc,
                                next_action, next_object))

        if for_training:
            # update Q-function.
            # TODO Your code here
            next_state_r, next_state_q = dict_room_desc[
                next_room_desc], dict_quest_desc[next_quest_desc]
            tabular_q_learning(q_func, state_r, state_q, next_action,
                               next_object, reward, next_state_r, next_state_q,
                               terminal)

        if not for_training:
            # update reward
            # TODO Your code here
            epi_reward = np.power(GAMMA, num_steps) * reward + epi_reward

        # prepare next step
        # TODO Your code here
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc
        num_steps += 1

    if not for_training:
        return epi_reward
Example #12
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    epi_reward = 0
    t = 0  # step

    # initialize for each episode
    # TODO Your code here

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_room_desc_index = dict_room_desc[current_room_desc]
        current_quest_desc_index = dict_quest_desc[current_quest_desc]

        next_action_index, next_object_index = epsilon_greedy(
            current_room_desc_index, current_quest_desc_index, q_func, epsilon)

        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, next_action_index,
            next_object_index)

        next_room_desc_index = dict_room_desc[
            next_room_desc]  # quest remains same

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, current_room_desc_index,
                               current_quest_desc_index, next_action_index,
                               next_object_index, reward, next_room_desc_index,
                               current_quest_desc_index, terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**t) * reward

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc
        t += 1

    if not for_training:
        return epi_reward
Example #13
def run_episode(for_training):
    """
        Relies on globals: model, optimizer, and dictionary.
        Runs one episode
        If for training, update Q function
        If for testing, computes and return cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0.
    t = 0
    # initialize for each episode

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))
        action_index, object_index = epsilon_greedy(
            current_state_vector, epsilon
        )
        next_room_desc, next_quest_desc, reward, terminal= framework.step_game(
            current_room_desc,
            current_quest_desc,
            action_index, object_index
        )
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(
                next_room_desc + next_quest_desc, dictionary
            )
        )

        if for_training:
            # update Q-function.
            deep_q_learning(
                current_state_vector,
                action_index, object_index,
                reward,
                next_state_vector,
                terminal
            )
        else:
            # update reward
            epi_reward += (GAMMA**t)*reward

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc
        t += 1

    if not for_training:
        return epi_reward
Example #14
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    epi_reward = 0
    # initialize for each episode
    count = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    curr_state1 = dict_room_desc[current_room_desc]
    curr_state2 = dict_quest_desc[current_quest_desc]

    while not terminal:
        # Choose next action and execute
        (action_index, object_index) = epsilon_greedy(curr_state1, curr_state2,
                                                      q_func, epsilon)
        (next_room_desc, next_quest_desc, reward, terminal) = \
            framework.step_game(current_room_desc, current_quest_desc,
                                action_index, object_index)
        next_state1 = dict_room_desc[next_room_desc]
        next_state2 = dict_quest_desc[next_quest_desc]

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, curr_state1, curr_state2, action_index,
                               object_index, reward, next_state1, next_state2,
                               terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**count) * reward
            count += 1

        # prepare next step
        curr_state1 = next_state1
        curr_state2 = next_state2
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
Example #15
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode

    gamma_step = 1
    epi_reward = 0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      theta, epsilon)
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(current_room_desc, current_quest_desc,
                                         action_index, object_index)

        if for_training:
            # update Q-function.
            next_state = next_room_desc + next_quest_desc
            # Returns the bag-of-words vector representation of the state
            next_state_vector = utils.extract_bow_feature_vector(
                next_state, dictionary)
            linear_q_learning(theta, current_state_vector, action_index,
                              object_index, reward, next_state_vector,
                              terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + gamma_step * reward
            gamma_step = gamma_step * GAMMA

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
Example #16
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0
    t = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state_1, current_state_2 = dict_room_desc[
            current_room_desc], dict_quest_desc[current_quest_desc]
        # get the next action according to policy
        action_index, object_index = epsilon_greedy(current_state_1,
                                                    current_state_2, q_func,
                                                    epsilon)
        # take action and get the next state
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)
        next_state_1, next_state_2 = dict_room_desc[
            next_room_desc], dict_quest_desc[next_quest_desc]
        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, current_state_1, current_state_2,
                               action_index, object_index, reward,
                               next_state_1, next_state_2, terminal)
            pass

        if not for_training:
            # update reward
            epi_reward = epi_reward + GAMMA**t * reward
            t = t + 1
            pass

        # prepare next step
        #current_state_1, current_state_2 = next_state_1, next_state_2
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward
Example #17
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    epi_reward = 0
    # initialize for each episode
    # TODO Your code here

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        # TODO Your code here
        (action_index, object_index) = epsilon_greedy(
            dict_room_desc[current_room_desc],
            dict_quest_desc[current_quest_desc], q_func, epsilon)
        (next_room_desc, next_quest_desc, reward, terminal) = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)
        
        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, dict_room_desc[current_room_desc],
                               dict_quest_desc[current_quest_desc],
                               action_index, object_index, reward,
                               dict_room_desc[next_room_desc],
                               dict_quest_desc[next_quest_desc], terminal)
            # TODO Your code here
            pass

        if not for_training:
            # update reward
            epi_reward += reward * (GAMMA ** (framework.STEP_COUNT-1))
            
            # TODO Your code here
            pass
        # prepare next step
        # TODO Your code here
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc
    if not for_training:
        return epi_reward
Example #18
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    
    epi_reward = 0.0 # initialize for each episode

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        next_action_index, next_object_index = epsilon_greedy(current_state_vector, 
                                                              theta, 
                                                              epsilon) # Get next action, object
        
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
                                                            current_room_desc,
                                                            current_quest_desc,
                                                            next_action_index,
                                                            next_object_index) # Take a step
        
        next_state = next_room_desc + next_quest_desc   # Build next state vector
        next_state_vector = utils.extract_bow_feature_vector(next_state, dictionary)      

        if for_training:
            # update Q-function.
            linear_q_learning(theta, current_state_vector, next_action_index, 
                          next_object_index, reward, next_state_vector, terminal) # Update theta

        if not for_training:
            # update reward
            epi_reward += (GAMMA**(framework.STEP_COUNT - 1))*reward

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward
Example #19
def run_episode(for_training):
    """
        Runs one episode
        If for training, update Q function
        If for testing, computes and return cumulative discounted reward
    """
    # My solution:
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0

    # initialize for each episode
    current_room_desc, current_quest_desc, terminal = framework.newGame()

    t = 0
    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state,
                                             dictionary)).to(device)

        action_index, object_index = epsilon_greedy(current_state_vector,
                                                    epsilon)

        next_room_desc, next_quest_desc, reward, terminal = \
            framework.step_game(current_room_desc, current_quest_desc,
                                action_index, object_index)

        next_state = next_room_desc + next_quest_desc

        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state,
                                             dictionary)).to(device)

        if for_training:
            # update Q-function.
            deep_q_learning(current_state_vector, action_index, object_index,
                            reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward += GAMMA**t * reward
            t += 1

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward
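
Example #19 above moves both feature tensors to a global device; the assumption is that device was defined once at module level, for example:

import torch

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
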
Example #20
def run_episode(for_training):
    """
        Runs one episode
        If for training, update Q function
        If for testing, computes and return cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    
    # initialize for each episode
    epi_reward = 0.0
    step = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        # Get the next action and object
        next_action_i, next_object_i = epsilon_greedy(current_state_vector, epsilon)
        
        # Make a move
        step += 1
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
                                                            current_room_desc,
                                                            current_quest_desc,
                                                            next_action_i,
                                                            next_object_i) 

        # Next state vector
        next_state = next_room_desc + next_quest_desc 
        next_state_vector = torch.FloatTensor(utils.extract_bow_feature_vector(next_state, dictionary)) 

        if for_training:
            # update Q-function.
            deep_q_learning(current_state_vector, next_action_i, 
                            next_object_i, reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**(step - 1)) * reward
            
        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward
Example #21
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0 
    step = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_room_desc_i = dict_room_desc[current_room_desc]
        current_quest_desc_i = dict_quest_desc[current_quest_desc]

        # Get the next action and object
        next_action_i, next_object_i = epsilon_greedy(current_room_desc_i, current_quest_desc_i, q_func, epsilon)

        # Make a move
        step += 1
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(current_room_desc, current_quest_desc, next_action_i, next_object_i)
        next_room_desc_i = dict_room_desc[next_room_desc]

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, current_room_desc_i, current_quest_desc_i, 
                               next_action_i, next_object_i, reward, 
                               next_room_desc_i, current_quest_desc_i,
                               terminal)

        if not for_training:
            # update reward. sum(gamma^t * reward)
            epi_reward += (GAMMA**(step - 1)) * reward

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward
Example #22
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward
    Relies on globals: theta and dictionary.
    Args:
        for_training (bool): True if for training
    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0.
    t = 0
    # initialize for each episode
    # theta = ... # global

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)
        action_index, object_index = epsilon_greedy(current_state_vector,
                                                    theta, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)
        next_state_vector = utils.extract_bow_feature_vector(
            next_room_desc + next_quest_desc, dictionary)

        if for_training:
            # update Q-function.
            linear_q_learning(theta, current_state_vector, action_index,
                              object_index, reward, next_state_vector,
                              terminal)

        if not for_training:
            epi_reward += (GAMMA**t) * reward

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc
        t += 1

    if not for_training:
        return epi_reward
Example #23
def run_episode(for_training):
    """
        Runs one episode
        If for training, update Q function
        If for testing, computes and return cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    gamma_step = 1
    epi_reward = 0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      epsilon)
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(current_room_desc, current_quest_desc,
                                         action_index, object_index)

        if for_training:
            # update Q-function.
            next_state = next_room_desc + next_quest_desc
            # 32-bit floating point CPU tensor
            next_state_vector = torch.FloatTensor(
                utils.extract_bow_feature_vector(next_state, dictionary))
            deep_q_learning(current_state_vector, action_index, object_index,
                            reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + gamma_step * reward
            gamma_step = gamma_step * GAMMA

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
Example #24
def run_episode(for_training):
    """
        Runs one episode
        If for training, update Q function
        If for testing, computes and return cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0.0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        next_action_index, next_object_index = epsilon_greedy(
            current_state_vector, epsilon)

        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, next_action_index,
            next_object_index)
        next_state = next_room_desc + next_quest_desc
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state, dictionary))

        if for_training:
            deep_q_learning(current_state_vector, next_action_index,
                            next_object_index, reward, next_state_vector,
                            terminal)

        if not for_training:
            epi_reward += (GAMMA**(framework.STEP_COUNT - 1)) * reward

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
Example #25
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0

    # initialize for each episode
    # TODO Your code here

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    t = 0
    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)
        # TODO Your code here

        # Choose an action with epsilon_greedy
        action_index, object_index = epsilon_greedy(current_state_vector,
                                                    theta, epsilon)

        # Take a game step + translate the descriptions
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)
        # next_room  = dict_room_desc[ next_room_desc]
        # next_quest = dict_quest_desc[ next_quest_desc]

        next_state = next_room_desc + next_quest_desc
        next_state_vector = utils.extract_bow_feature_vector(
            next_state, dictionary)

        if for_training:
            # update Q-function.
            # TODO Your code here
            linear_q_learning(theta, current_state_vector, action_index,
                              object_index, reward, next_state_vector,
                              terminal)
            pass

        if not for_training:
            # update reward
            # TODO Your code here
            epi_reward += (GAMMA**t) * reward
            pass

        # prepare next step
        # TODO Your code here
        t += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
Example #26
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    """ My solution:
    epsilon = TRAINING_EP if for_training else TESTING_EP

    epi_reward = 0

    # initialize for each episode
    current_room_desc, current_quest_desc, terminal = framework.newGame()  # string descriptions

    t = 0
    while not terminal:

        # Choose next action and execute
        room_index = dict_room_desc[current_room_desc]
        quest_index = dict_quest_desc[current_quest_desc]

        action_index, object_index = epsilon_greedy(room_index, quest_index,
                                                    q_func, epsilon)

        next_room_desc, next_quest_desc, reward, terminal = \
            framework.step_game(current_room_desc, current_quest_desc,
                                action_index, object_index)

        next_room_index = dict_room_desc[next_room_desc]
        next_quest_index = dict_quest_desc[next_quest_desc]

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, room_index, quest_index,
                               action_index, object_index,
                               reward, next_room_index, next_quest_index,
                               terminal)

        if not for_training:
            # update reward
            epi_reward += GAMMA ** t * reward
            t += 1

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward
    """
    # Instructor's solution:
    #   Difference reward section:
    #       Uses epi_reward += gamma_step * reward, then gamma_step *= GAMMA
    epsilon = TRAINING_EP if for_training else TESTING_EP
    gamma_step = 1
    epi_reward = 0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    while not terminal:
        # Choose next action and execute
        cur_room_desc_id = dict_room_desc[current_room_desc]
        cur_quest_desc_id = dict_quest_desc[current_quest_desc]
        (action_index, object_index) = epsilon_greedy(cur_room_desc_id,
                                                      cur_quest_desc_id,
                                                      q_func, epsilon)
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(current_room_desc, current_quest_desc,
                                         action_index, object_index)

        if for_training:
            # update Q-function.
            next_room_desc_id = dict_room_desc[next_room_desc]
            next_quest_desc_id = dict_quest_desc[next_quest_desc]
            tabular_q_learning(q_func, cur_room_desc_id, cur_quest_desc_id,
                               action_index, object_index, reward,
                               next_room_desc_id, next_quest_desc_id, terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + gamma_step * reward
            gamma_step = gamma_step * GAMMA

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
Example #27
def run_episode(for_training, need_history=False):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    epi_reward = 0
    # initialize for each episode
    # TODO Your code here

    global q_func, dict_room_desc, dict_quest_desc

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    history = list()

    while not terminal:
        # Choose next action and execute

        # get the greedy action wrt the current_room_desc and current_quest_desc
        action_index, object_index = epsilon_greedy(
            state_1=dict_room_desc[current_room_desc],
            state_2=dict_quest_desc[current_quest_desc],
            q_func=q_func,
            epsilon=epsilon)

        # take the action in the environment and get the reward
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc=current_room_desc,
            current_quest_desc=current_quest_desc,
            action_index=action_index,
            object_index=object_index)

        history.append([
            current_room_desc, current_quest_desc, action_index, object_index,
            next_room_desc, next_quest_desc, reward, terminal
        ])

        if for_training:
            # update Q-function.
            tabular_q_learning(
                q_func=q_func,
                current_state_1=dict_room_desc[current_room_desc],
                current_state_2=dict_quest_desc[current_quest_desc],
                action_index=action_index,
                object_index=object_index,
                reward=reward,
                next_state_1=dict_room_desc[next_room_desc],
                next_state_2=dict_quest_desc[next_quest_desc],
                terminal=terminal)

        if not for_training:
            # update reward
            epi_reward += np.power(GAMMA, framework.STEP_COUNT - 1) * reward

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        if need_history:
            return epi_reward, history
        else:
            return epi_reward
Example #28
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    dict_room_desc, dict_quest_desc = framework.make_all_states_index()

    # q_func = np.zeros((NUM_ROOM_DESC, NUM_QUESTS, NUM_ACTIONS, NUM_OBJECTS))

    epi_reward = 0
    # initialize for each episode
    # TODO Your code here
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    t = 0
    while not terminal:
        # Choose next action and execute
        # TODO Your code here
        current_room = dict_room_desc[
            current_room_desc]  # Room index
        current_quest = dict_quest_desc[
            current_quest_desc]  # Quest index

        # Choose an action with epsilon-greedy
        action_index, object_index = epsilon_greedy(current_room,
                                                    current_quest, q_func,
                                                    epsilon)

        # Take a game step + translate the descriptions to indices
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)
        next_room = dict_room_desc[next_room_desc]
        next_quest = dict_quest_desc[next_quest_desc]

        if for_training:
            # update Q-function.
            # TODO Your code here
            tabular_q_learning(q_func, current_room, current_quest,
                               action_index, object_index, reward, next_room,
                               next_quest, terminal)
            pass

        if not for_training:
            # update reward
            # TODO Your code here
            epi_reward += (GAMMA**t) * reward
            pass

        # prepare next step
        # TODO Your code here
        t += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
Example #29
def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    # TODO Your code here

    # Look into framework.py file for hint
    # A tuple where the first element is a description of the initial room,
    # the second element is a description of the quest for this new game episode, and
    # the last element is a Boolean variable with value False implying that the game is not over.
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    # initial value
    count = 0
    epi_reward = 0

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(current_state, dictionary)
        # TODO Your code here
        (action_index, object_index) = epsilon_greedy(current_state_vector, theta, epsilon)

        # Shorter aliases
        a_idx = action_index  # action index
        o_idx = object_index  # object index
        crd = current_room_desc  # current room description
        cqd = current_quest_desc  # current quest description

        # the system next state when the selected command is applied at the current state
        (next_room_desc, next_quest_desc, reward, terminal) = framework.step_game(crd, cqd, a_idx, o_idx)

        next_state = next_room_desc + next_quest_desc
        # Look into utils.py for the bag-of-words vector representation of the state
        next_state_vector = utils.extract_bow_feature_vector(next_state, dictionary)

        if for_training:
            # update Q-function.
            # TODO Your code here
            linear_q_learning(theta, current_state_vector, a_idx, o_idx, reward, next_state_vector, terminal)
            pass

        if not for_training:
            # update reward
            # TODO Your code here
            epi_reward += np.power(GAMMA, count) * reward
            pass

        # prepare next step
        # TODO Your code here
        count += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
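
Finally, the linear and deep examples all build their state features with utils.extract_bow_feature_vector(state, dictionary), where state is the concatenated room and quest description. A rough sketch of a binary bag-of-words extractor with that shape is shown below; the actual tokenization and weighting used by utils may differ, so treat this only as an illustration.

import numpy as np


def extract_bow_feature_vector(state_desc, dictionary):
    """Binary bag-of-words vector over the assignment's word dictionary."""
    vector = np.zeros(len(dictionary))
    for word in state_desc.lower().split():
        word = word.strip(".,!?")           # crude punctuation stripping
        if word in dictionary:
            vector[dictionary[word]] = 1.0  # mark the word as present
    return vector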