Example no. 1
def play_game(grid, policy):
    '''
    returns a list of (state, action, return) triples; we don't use exploring
    starts here, the start state is fixed and exploration comes from the
    epsilon-soft random_action helper
    '''
    s = (2, 0)
    grid.set_state(s)
    a = random_action(policy[s])

    #be aware of timing: each triple is s(t), a(t), r(t)
    #but r(t) results from taking action a(t-1) from s(t-1) and landing in s(t)
    states_actions_rewards = [(s, a, 0)]
    while True:
        r = grid.move(a)
        s = grid.current_state()
        if grid.game_over():
            states_actions_rewards.append((s, None, r))
            break
        else:
            a = random_action(policy[s])
            states_actions_rewards.append((s, a, r))

    #calculate returns by working back from terminal state
    G = 0
    states_actions_returns = []
    first = True
    for s, a, r in reversed(states_actions_rewards):
        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + GAMMA * G
    states_actions_returns.reverse()
    return states_actions_returns
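Example no. 1 relies on a GAMMA constant and a random_action helper that are defined elsewhere in its module. A minimal sketch of what they might look like, assuming an epsilon-soft policy over a fixed action set; the specific GAMMA value, the action labels, and the 0.1 default eps are illustrative assumptions, not taken from the example:

import numpy as np

GAMMA = 0.9                                   # assumed discount factor
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')   # assumed action set

def random_action(a, eps=0.1):
    # epsilon-soft: follow the suggested action with probability 1 - eps,
    # otherwise pick uniformly among all actions
    if np.random.random() < (1 - eps):
        return a
    return np.random.choice(ALL_POSSIBLE_ACTIONS)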
Example no. 2
def play_game(grid, policy):
    '''
    reset game to start at random position
    we need to do this because given our current deterministic policy
    we would never end up at certain states, but we still want to measure them
    :param grid: the grid class object
    :param policy: dictionary containing policies
    :return: a list of states and corresponding returns
    '''

    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    s = grid.current_state()
    states_and_rewards = [(s, 0)]  # list of tuples of (state,reward)
    while not grid.game_over():
        a = policy[s]
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))
    #calculate returns by working backwards from terminal state
    G = 0
    states_and_returns = []
    first = True
    for s, r in reversed(states_and_rewards):
        #the value of the terminal state is 0 by definition
        #we should ignore the first state we encounter
        if first:
            first = False
        else:
            states_and_returns.append((s, G))
        G = r + GAMMA * G
    states_and_returns.reverse()  # we want it to be in order of state visited
    return states_and_returns
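Example no. 2 only generates the samples; the (state, return) pairs it yields would typically be averaged into a state-value estimate by a first-visit Monte Carlo loop along these lines. This is a sketch, assuming the grid exposes all_states() as in common gridworld implementations, and using an illustrative episode count:

import numpy as np

V = {s: 0 for s in grid.all_states()}      # terminal states keep value 0 by definition
returns = {s: [] for s in grid.actions}    # non-terminal states -> list of sampled returns

for _ in range(100):
    states_and_returns = play_game(grid, policy)
    seen_states = set()
    for s, G in states_and_returns:
        # first-visit Monte Carlo: only the first occurrence of s in an episode counts
        if s not in seen_states:
            returns[s].append(G)
            V[s] = np.mean(returns[s])
            seen_states.add(s)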
Example no. 3
def play_game(grid, policy):
    s = (2, 0)
    grid.set_state(s)
    states_and_rewards = [(s, 0)]
    while not grid.game_over():
        a = policy[s]
        a = random_action(a)
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))
    return states_and_rewards
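Unlike the other examples, Example no. 3 returns raw (state, reward) pairs rather than computed returns, which is the shape a TD(0) prediction loop expects. A rough sketch of such a consumer; the ALPHA and GAMMA values, the episode count, and grid.all_states() are assumptions for illustration:

ALPHA = 0.1   # assumed learning rate
GAMMA = 0.9   # assumed discount factor

V = {s: 0 for s in grid.all_states()}
for _ in range(1000):
    states_and_rewards = play_game(grid, policy)
    # TD(0): move V(s) toward r + GAMMA * V(s') for every observed transition
    for t in range(len(states_and_rewards) - 1):
        s, _ = states_and_rewards[t]
        s2, r = states_and_rewards[t + 1]
        V[s] = V[s] + ALPHA * (r + GAMMA * V[s2] - V[s])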
Example no. 4
    #repeat until convergence
    # we keep two counters so the learning rate and epsilon can decay at different rates
    t = 1.0
    t2 = 1.0
    deltas = []
    for i in range(20000):
        if i % 100 == 0:
            t += 10e-3
            t2 += 0.01
        if i % 1000 == 0:
            print('i: ', i)
        alpha = ALPHA / t2

        #we play instead of generating an episode
        s = (2, 0)
        grid.set_state(s)

        #get Q(s) to choose first action
        Qs = getQs(model, s)

        # the first (s, r) tuple is the state we start in and 0
        # (since we don't get a reward) for simply starting the game
        # the last (s, r) tuple is the terminal state and the final reward
        # the value for the terminal state is by definition 0, so we don't
        # care about updating it.
        a = max_dict(Qs)[0]
        a = random_action(a, eps=0.5 / t)
        biggest_change = 0
        while not grid.game_over():
            r = grid.move(a)
            s2 = grid.current_state()
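The loop above leans on a few names defined elsewhere: a model object with a predict(s, a) method, plus getQs and max_dict helpers. A plausible sketch of those two helpers, under the assumption that model.predict(s, a) returns the current approximate Q(s, a):

def max_dict(d):
    # return the (key, value) pair with the largest value in a dictionary
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_key, max_val = k, v
    return max_key, max_val

def getQs(model, s):
    # evaluate the approximate Q(s, a) for every action so we can act greedily on it
    return {a: model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS}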
Example no. 5
def play_game(grid, policy):
    '''
    reset game to start at random position
    we need to do this because given our current deterministic policy
    we would never end up at certain states, but we still want to measure them
    :param grid: the grid class object
    :param policy: dictionary containing policies
    :return: a list of states and corresponding returns
    '''

    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    s = grid.current_state()
    a = np.random.choice(ALL_POSSIBLE_ACTIONS) #first action is uniformly random

    #be aware of timing
    #each triple is s(t), a(t), r(t)
    #but r(t) results from taking action a(t-1) from s(t-1) and landing in s(t)
    states_actions_rewards = [(s, a, 0)]
    seen_states = set()
    seen_states.add(grid.current_state())
    num_steps = 0
    while True:
        r = grid.move(a)
        num_steps += 1
        s = grid.current_state()

        if s in seen_states:
            # hack so that we don't end up in an infinitely long episode
            # bumping into the wall repeatedly
            # if num_steps == 1 -> bumped into a wall and haven't moved anywhere
            #   reward = -10
            # else:
            #   reward falls off by 1 / num_steps
            reward = -10. / num_steps
            states_actions_rewards.append((s, None, reward))
            break
        elif grid.game_over():
            states_actions_rewards.append((s, None, r))
            break
        else:
            a = policy[s]
            states_actions_rewards.append((s, a, r))
        seen_states.add(s)

    # calculate the returns by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for s, a, r in reversed(states_actions_rewards):
        #the value of the terminal state is 0 by definition, so we ignore the first
        #state we encounter, and we ignore the last G which is meaningless since it
        #doesn't correspond to any action
        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + GAMMA * G
    states_actions_returns.reverse() # we want it to be in order of state visited
    return states_actions_returns
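Example no. 5 produces (state, action, return) triples suited to Monte Carlo control with exploring starts; a typical consumer averages them into Q(s, a) and then improves the policy greedily. A rough sketch of that outer loop; the episode count, the nested-dict layout for Q, and the use of Python's built-in max for the greedy step are assumptions for illustration:

import numpy as np

# initialize Q(s, a) and the sample lists for every non-terminal state/action pair
Q = {}
returns = {}
for s in grid.actions:
    Q[s] = {}
    for a in ALL_POSSIBLE_ACTIONS:
        Q[s][a] = 0
        returns[(s, a)] = []

for _ in range(2000):
    states_actions_returns = play_game(grid, policy)
    seen = set()
    for s, a, G in states_actions_returns:
        # first-visit Monte Carlo: only the first occurrence of (s, a) per episode counts
        if (s, a) not in seen:
            returns[(s, a)].append(G)
            Q[s][a] = np.mean(returns[(s, a)])
            seen.add((s, a))
    # policy improvement: act greedily with respect to the current Q estimates
    for s in policy:
        policy[s] = max(Q[s], key=Q[s].get)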