import os
import sys

import numpy as np

sys.path.append(os.path.join(sys.path[0], '..'))

from plotting import plotPiV
from gridworld import Gridworld

# V = ...
# Q = ...
V_converged = False
w = Gridworld()
gamma = .8
# init V at random
V = np.random.rand(Gridworld.states.shape[0])
V_old = 0
while not V_converged:

    # compute Q function: Q(s, a) = R(s, a) + gamma * V(s'),
    # where s' is the successor state of s under action a
    # ...
    Q = Gridworld.reward + gamma * V
    # compute V function: V(s) = max_a Q(s, a)
    # ...

    V_diff = V - V_old
    V_diff = np.sum(np.absolute(V_diff).flat)
    if V_diff < 0.01:
        V_converged = True
    # remember the current V for the next convergence check
    V_old = V.copy()
# convert policy for plot
pi = Gridworld.actions[0][Q.argmax(2).flat].reshape(
    (V.shape[0], V.shape[1], 2))
plotPiV(pi, V)
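
The loop above leaves the actual Q and V updates open. Below is a minimal, self-contained sketch of the same value-iteration scheme on a made-up deterministic MDP; the arrays R and next_state are hypothetical stand-ins for Gridworld.reward and the gridworld transitions.

import numpy as np

# hypothetical deterministic MDP: 5 states, 2 actions
n_states, n_actions = 5, 2
R = np.random.rand(n_states, n_actions)        # stand-in for Gridworld.reward
next_state = np.random.randint(n_states,       # stand-in for the transitions
                               size=(n_states, n_actions))
gamma = .8

V = np.random.rand(n_states)
V_old = np.zeros(n_states)
V_converged = False
while not V_converged:
    # Q(s, a) = R(s, a) + gamma * V(s')
    Q = R + gamma * V[next_state]
    # V(s) = max_a Q(s, a)
    V = Q.max(axis=1)
    if np.sum(np.abs(V - V_old)) < 0.01:
        V_converged = True
    V_old = V.copy()
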
Example #2
        # ...
        # a_t = ...

        # apply action and get reward
        # R = ...

        # next state
        # s_next = ...

        # next actions
        # ...
        # a_next = ...

        # perform sarsa update
        # Q[s_t[0], s_t[1], a_t] = ...
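        # one possible update (sketch; assumes beta is used as the learning rate):
        # Q[s_t[0], s_t[1], a_t] += beta * (R + gamma * Q[s_next[0], s_next[1], a_next]
        #                                   - Q[s_t[0], s_t[1], a_t])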

    # in the terminal states we don't want to consider any other action
    # besides staying where we are
    Q[s_next[0], s_next[1], 1:] = -1000
    # update the state action value function for the terminal state
    # Q[s_next[0], s_next[1], 0] = ...
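    # e.g. (sketch): the terminal state has no successor, so the target is just R
    # Q[s_next[0], s_next[1], 0] += beta * (R - Q[s_next[0], s_next[1], 0])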

    # decrease beta
    beta *= .999

# compute value function and policy for plot
V = Q.max(2)
pi = gridworld.Gridworld.actions[0][Q.argmax(2).flat].reshape(
    (V.shape[0], V.shape[1], 2))
plotting.plotPiV(pi, V)
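
Since the beginning of this example is cut off, here is a minimal, self-contained tabular SARSA loop on a made-up 1-D corridor (five states, terminal state 4 with reward 1). It shows the full pattern behind the placeholders above: epsilon-greedy action selection, environment step, and the SARSA update.

import numpy as np

# hypothetical corridor: states 0..4, actions 0 = left, 1 = right,
# state 4 is terminal and yields reward 1
n_states, n_actions = 5, 2
Q = np.zeros((n_states, n_actions))
gamma, beta, eps = .8, .5, .1


def eps_greedy(q_s):
    # epsilon-greedy action selection with random tie-breaking
    if np.random.rand() < eps:
        return np.random.randint(n_actions)
    return np.random.choice(np.flatnonzero(q_s == q_s.max()))


for episode in range(500):
    s = 0
    a = eps_greedy(Q[s])
    while s != 4:
        # apply action and get reward
        s_next = max(0, s - 1) if a == 0 else s + 1
        R = 1. if s_next == 4 else 0.
        # choose the next action before updating
        a_next = eps_greedy(Q[s_next])
        # perform the SARSA update
        Q[s, a] += beta * (R + gamma * Q[s_next, a_next] - Q[s, a])
        s, a = s_next, a_next
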
Example #3
import numpy as np
from scipy.spatial.distance import pdist

from plotting import plotPiV
from gridworld import Gridworld

# rbf, pol2cart, sampleData and policy are helpers provided by the
# accompanying exercise code


def lspi():
    gridworld = Gridworld()

    gamma = .8
    beta = 1
    numEpisodes = 10
    beta_factor = 1 - 5 * numEpisodes / 10000

    # Sample the state space with different grid sizes
    X_1, X_2 = np.meshgrid(np.arange(.5, 7, 1),
                           np.arange(.5, 7, 1),
                           indexing='ij')
    X_1m, X_2m = np.meshgrid(np.arange(.5, 7, 0.5),
                             np.arange(.5, 7, 0.5),
                             indexing='ij')

    # initialize the policy randomly: for each state in s (n x 2) it should
    # return a random action of the form (r, a), where r ~ Unif[0, 1] and
    # a ~ Unif[0, 2*pi]
    # pi = lambda s: ...
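    # one possible implementation of the random initial policy described above
    pi = lambda s: np.column_stack((np.random.rand(s.shape[0]),
                                    np.random.rand(s.shape[0]) * 2 * np.pi))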

    # sample n starting positions from the initial distribution (you can start
    # with a random initialization anywhere in the gridworld)
    # initialDistribution = lambda n: ...
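    # a possible choice, assuming the gridworld spans [0, 7] x [0, 7]
    initialDistribution = lambda n: np.random.rand(n, 2) * 7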

    converged = False

    # generate an ndgrid over the state space for the centers of the basis
    # functions
    X1, X2, A1, A2 = np.meshgrid(np.arange(.5, 7, 1), np.arange(.5, 7, 1),
                                 np.arange(-1, 2), np.arange(-1, 2))
    # NOTE: the policy returns the action in polar coordinates while the basis
    # functions use cartesian coordinates!!! You have to convert between these
    # representations.

    # matrix of the centers
    c = np.column_stack((X1.ravel(), X2.ravel(), A1.ravel(), A2.ravel()))

    # number of basis functions
    # k = ...

    # initialize weights
    # w = ...
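    # sketch: one basis function per center, weights initialized to zero
    k = c.shape[0]
    w = np.zeros(k)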

    # compute bandwidths with the median trick
    bw = np.zeros(4)
    for i in range(4):
        dist = pdist(c[:, [i]])
        bw[i] = np.sqrt(np.median(dist**2)) * .4

    # feature function (making use of rbf)
    # feature = lambda x_: ...
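    # sketch: RBF features centered at c with the bandwidths computed above
    # (matches the call rbf(..., c, bw) used for plotting below)
    feature = lambda x_: rbf(x_, c, bw)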

    # time step
    t = 0

    # initialize A and b
    # A = ...
    # b = ...
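    # sketch: the LSTD-Q statistics start at zero
    A = np.zeros((k, k))
    b = np.zeros(k)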

    while not converged:
        # Policy evaluation
        # sample data
        s1, a, r, s2 = sampleData(gridworld, pi, initialDistribution,
                                  numEpisodes, 50)

        # compute actions in cartesian space
        ac1, ac2 = pol2cart(a[:, 0, np.newaxis], a[:, 1, np.newaxis])

        # compute PHI
        # PHI = ...
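        # sketch: features of the sampled state-action pairs
        PHI = feature(np.hstack((s1, ac1, ac2)))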

        # compute PPI
        # PPI = ...
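        # sketch: features of the successor states paired with the actions the
        # current policy would take there (converted to cartesian coordinates)
        a_next = pi(s2)
        an1, an2 = pol2cart(a_next[:, 0, np.newaxis], a_next[:, 1, np.newaxis])
        PPI = feature(np.hstack((s2, an1, an2)))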

        # update A and b
        # A = ...
        # b = ...
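        # sketch: accumulate the LSTD-Q statistics
        # (np.ravel guards against r being a column vector)
        A = A + PHI.T.dot(PHI - gamma * PPI)
        b = b + PHI.T.dot(np.ravel(r))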

        # compute new w
        w_old = w
        # w = ...
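        # sketch: least-squares solution, with a small ridge term in case A is
        # not yet well conditioned
        w = np.linalg.solve(A + 1e-6 * np.eye(k), b)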

        # Policy improvement
        # pi = ...
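        # sketch: act on the new Q function; assumes the last argument of
        # policy() is the exploration parameter (0 = greedy, as in the plotting
        # call below)
        pi = lambda s, w=w, beta=beta: policy(s, feature, w, beta)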

        beta = beta_factor * beta
        t = t + 1

        # Check for convergence
        if np.abs(w - w_old).sum() / len(w) < 0.05:
            converged = True

        print(t, ' - ', beta, ' - ', np.abs(w - w_old).sum() / len(w))

        ### plotting
        a = policy(np.hstack((X_1m.reshape(-1, 1), X_2m.reshape(-1, 1))),
                   feature, w, 0)

        ax1, ax2 = pol2cart(a[:, 0].reshape(-1, 1), a[:, 1].reshape(-1, 1))
        phi = rbf(
            np.hstack((X_1m.reshape(-1, 1), X_2m.reshape(-1, 1), ax1, ax2)), c,
            bw)
        Q = phi.dot(w)
        n_plot = len(X_1m)

        plot_a = np.hstack((ax1, ax2)).reshape((n_plot, n_plot, 2))
        plot_V = Q.reshape((n_plot, n_plot))

        plotPiV(plot_a, plot_V, vmin=-5, vmax=5, block=False)

    plotPiV(plot_a, plot_V, vmin=-5, vmax=5)