import os
import sys

import numpy as np

sys.path.append(os.path.join(sys.path[0], '..'))
from plotting import plotPiV
from gridworld import Gridworld

# V = ...
# Q = ...

V_converged = False

w = Gridworld()
gamma = .8

# init V at random
V = np.random.rand(Gridworld.states.shape[0])
V_old = 0

while not V_converged:
    # compute Q function
    # ...
    Q = Gridworld.reward + gamma * V

    # compute V function
    # ...

    # check convergence: stop once the value function barely changes
    V_diff = V - V_old
    V_diff = np.sum(np.absolute(V_diff).flat)
    if V_diff < 0.01:
        V_converged = True
    # remember the current estimate for the next convergence check
    V_old = V

# convert policy for plot
pi = Gridworld.actions[0][Q.argmax(2).flat].reshape(
    (V.shape[0], V.shape[1], 2))
plotPiV(pi, V)
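# ---------------------------------------------------------------------------
# Hedged sketch (not part of the skeleton above and not its solution): tabular
# value iteration on a tiny hand-made MDP, to illustrate the Bellman backup the
# placeholders refer to. The names P, R, n_states and n_actions are assumptions
# made for this example only; the Gridworld class exposes its own reward and
# transition representation, which may differ.
import numpy as np

n_states, n_actions, gamma_demo = 3, 2, 0.8

# P[a, s, s2] = transition probability, R[s, a] = immediate reward (made-up numbers)
P = np.zeros((n_actions, n_states, n_states))
P[0] = [[1, 0, 0], [1, 0, 0], [0, 1, 0]]
P[1] = [[0, 1, 0], [0, 0, 1], [0, 0, 1]]
R = np.array([[0., 0.], [0., 1.], [0., 0.]])

V_demo = np.zeros(n_states)
for _ in range(1000):
    # Bellman backup: Q(s, a) = R(s, a) + gamma * sum_s2 P(s2 | s, a) * V(s2)
    Q_demo = R + gamma_demo * np.einsum('ast,t->sa', P, V_demo)
    V_new = Q_demo.max(axis=1)      # V(s) = max_a Q(s, a)
    if np.abs(V_new - V_demo).sum() < 1e-6:
        V_demo = V_new
        break
    V_demo = V_new

pi_demo = Q_demo.argmax(axis=1)     # greedy policy w.r.t. the converged Q
# ---------------------------------------------------------------------------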
# ...
# a_t = ...

# apply action and get reward
# R = ...

# next state
# s_next = ...

# next actions
# ...
# a_next = ...

# perform sarsa update
# Q[s_t[0], s_t[1], a_t] = ...

# in the terminal states we don't want to consider any other action
# besides staying where we are
Q[s_next[0], s_next[1], 1:] = -1000

# update the state action value function for the terminal state
# Q[s_next[0], s_next[1], 0] = ...

# decrease beta
beta *= .999

# compute value function and policy for plot
V = Q.max(2)
pi = gridworld.Gridworld.actions[0][Q.argmax(2).flat].reshape(
    (V.shape[0], V.shape[1], 2))
plotting.plotPiV(pi, V)
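# ---------------------------------------------------------------------------
# Hedged sketch (illustration only, not the exercise solution): the tabular
# SARSA update written for a Q-table indexed by a flat state id. The skeleton
# above indexes Q by grid coordinates (s[0], s[1]) instead, so the indexing and
# the epsilon_greedy helper here are assumptions for this standalone example.
import numpy as np

def sarsa_update(Q, s, a, r, s_next, a_next, beta, gamma):
    # SARSA: Q(s, a) <- Q(s, a) + beta * (r + gamma * Q(s', a') - Q(s, a))
    td_error = r + gamma * Q[s_next, a_next] - Q[s, a]
    Q[s, a] += beta * td_error
    return Q

def epsilon_greedy(Q, s, epsilon, rng):
    # with probability epsilon pick a random action, otherwise the greedy one
    if rng.random() < epsilon:
        return int(rng.integers(Q.shape[1]))
    return int(Q[s].argmax())

# toy usage: 5 states, 4 actions
rng = np.random.default_rng(0)
Q_demo = np.zeros((5, 4))
a_next_demo = epsilon_greedy(Q_demo, s=2, epsilon=0.1, rng=rng)
Q_demo = sarsa_update(Q_demo, s=0, a=1, r=1.0, s_next=2, a_next=a_next_demo,
                      beta=0.5, gamma=0.8)
# ---------------------------------------------------------------------------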
# assumes that numpy (as np), scipy.spatial.distance.pdist and the helpers
# Gridworld, sampleData, pol2cart, policy, rbf and plotPiV are imported or
# defined elsewhere in this file
def lspi():
    gridworld = Gridworld()
    gamma = .8
    beta = 1
    numEpisodes = 10
    beta_factor = 1 - 5 * numEpisodes / 10000

    # Sample the state space with different grid sizes
    X_1, X_2 = np.meshgrid(np.arange(.5, 7, 1), np.arange(.5, 7, 1),
                           indexing='ij')
    X_1m, X_2m = np.meshgrid(np.arange(.5, 7, 0.5), np.arange(.5, 7, 0.5),
                             indexing='ij')

    # initialize the policy randomly. should give for each state in s (nx2) a
    # random action of the form (r,a), where r ~ Unif[0,1] and a ~ Unif[0,2pi].
    # pi = lambda s: ...

    # samples from initial distribution n starting positions (you can start
    # with a random initialization in the entire gridworld)
    # initialDistribution = lambda n: ...

    converged = False

    # generate an ndgrid over the state space for the centers of the basis
    # functions
    X1, X2, A1, A2 = np.meshgrid(np.arange(.5, 7, 1), np.arange(.5, 7, 1),
                                 np.arange(-1, 2), np.arange(-1, 2))

    # NOTE: the policy returns the action in polar coordinates while the basis
    # functions use cartesian coordinates!!! You have to convert between these
    # representations.

    # matrix of the centers
    c = np.column_stack((X1.flatten(), X2.flatten(),
                         A1.flatten(), A2.flatten()))

    # number of basis functions
    # k = ...

    # initialize weights
    # w = ...

    # compute bandwidths with the median trick
    bw = np.zeros(4)
    for i in range(4):
        dist = pdist(c[:, [i]])
        bw[i] = np.sqrt(np.median(dist**2)) * .4

    # feature function (making use of rbf)
    # feature = lambda x_: ...

    # time step
    t = 0

    # initialize A and b
    # A = ...
    # b = ...

    while not converged:
        # Policy evaluation

        # sample data
        s1, a, r, s2 = sampleData(gridworld, pi, initialDistribution,
                                  numEpisodes, 50)

        # compute actions in cartesian space
        ac1, ac2 = pol2cart(a[:, 0, np.newaxis], a[:, 1, np.newaxis])

        # compute PHI
        # PHI = ...

        # compute PPI
        # PPI = ...

        # update A and b
        # A = ...
        # b = ...

        # compute new w
        w_old = w
        # w = ...

        # Policy improvement
        # pi = ...

        beta = beta_factor * beta
        t = t + 1

        # Check for convergence
        if np.abs(w - w_old).sum() / len(w) < 0.05:
            converged = True

        print(t, ' - ', beta, ' - ', np.abs(w - w_old).sum() / len(w))

        ### plotting
        a = policy(np.hstack((X_1m.reshape(-1, 1), X_2m.reshape(-1, 1))),
                   feature, w, 0)
        ax1, ax2 = pol2cart(a[:, 0].reshape(-1, 1), a[:, 1].reshape(-1, 1))
        phi = rbf(np.hstack((X_1m.reshape(-1, 1), X_2m.reshape(-1, 1),
                             ax1, ax2)), c, bw)
        Q = phi.dot(w)

        n_plot = len(X_1m)
        plot_a = np.hstack((ax1, ax2)).reshape((n_plot, n_plot, 2))
        plot_V = Q.reshape((n_plot, n_plot))
        plotPiV(plot_a, plot_V, vmin=-5, vmax=5, block=False)

    plotPiV(plot_a, plot_V, vmin=-5, vmax=5)
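# ---------------------------------------------------------------------------
# Hedged sketch (illustration only): the LSTD-Q solve that lspi() builds up
# towards, written for generic feature matrices. Here PHI is assumed to hold
# the features phi(s, a) of the visited state-action pairs and PPI the features
# phi(s', pi(s')) of the successor pairs; both are guesses at the intended
# meaning of the PHI / PPI placeholders above, and lstdq_weights is not part
# of the original code.
import numpy as np

def lstdq_weights(PHI, PPI, rewards, gamma, reg=1e-6):
    # A = sum_i phi_i (phi_i - gamma * phi'_i)^T,   b = sum_i phi_i * r_i
    A = PHI.T @ (PHI - gamma * PPI)
    b = PHI.T @ rewards
    # a small ridge term keeps A invertible when features are redundant
    return np.linalg.solve(A + reg * np.eye(A.shape[0]), b)

# toy usage with random data: 200 samples, 9 basis functions
rng = np.random.default_rng(0)
PHI_demo = rng.standard_normal((200, 9))
PPI_demo = rng.standard_normal((200, 9))
r_demo = rng.standard_normal(200)
w_demo = lstdq_weights(PHI_demo, PPI_demo, r_demo, gamma=0.8)
# ---------------------------------------------------------------------------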