def sarsa_lambda(max_episode, gamma, lbd, N0, optimal_Q=None):
    # Sarsa(lambda) with accumulating eligibility traces
    Q = np.zeros([21, 10, 2])
    N = np.zeros([21, 10, 2])
    mse = []
    for i in range(max_episode):
        # initialize eligibility traces
        E = np.zeros([21, 10, 2])
        # initialize a new episode
        episode = Easy21()
        x, y = episode.State()
        action = epsilon_greedy(N0, N, Q, x, y)
        # sample until terminal
        while not episode.is_game_end():
            N[x - 1, y - 1, action] += 1
            E[x - 1, y - 1, action] += 1
            # run one step
            ([xp, yp], reward) = episode.step(action)
            if episode.is_game_end():
                # if the episode reached a terminal state, Q[s', a'] is 0
                delta = reward - Q[x - 1, y - 1, action]
                actionp = 0
            else:
                actionp = epsilon_greedy(N0, N, Q, xp, yp)
                delta = reward + gamma * Q[xp - 1, yp - 1, actionp] - Q[x - 1, y - 1, action]
            # step size decays with the visit count of the current state-action pair
            alpha = 1.0 / N[x - 1, y - 1, action]
            Q += alpha * delta * E
            E *= gamma * lbd
            x, y, action = xp, yp, actionp
        # track mean-squared error against the reference Q every 1000 episodes
        if (i % 1000 == 0) and (optimal_Q is not None):
            mse.append(np.sum((Q - optimal_Q) ** 2))
    return (Q, mse)
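# `sarsa_lambda` above and `monte_carlo` below both call an `epsilon_greedy`
# helper that is not defined in this snippet. The sketch below is an
# assumption reconstructed from the call sites, using the assignment's
# epsilon = N0 / (N0 + N(s)) schedule; it is not the original helper.
def epsilon_greedy(N0, N, Q, x, y):
    epsilon = N0 / (N0 + np.sum(N[x - 1, y - 1]))
    if np.random.rand() < epsilon:
        # explore: pick an action uniformly at random
        return np.random.randint(Q.shape[2])
    # exploit: pick the greedy action for state (x, y)
    return int(np.argmax(Q[x - 1, y - 1]))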
def monte_carlo(max_episode, gamma, N0):
    # Monte Carlo control
    Q = np.zeros([21, 10, 2])
    N = np.zeros([21, 10, 2])
    for i in range(max_episode):
        # initialize a new episode
        episode = Easy21()
        # the initial state of the episode
        x, y = episode.State()
        # sample until terminal
        history = []
        while not episode.is_game_end():
            # choose an action epsilon-greedily
            action = epsilon_greedy(N0, N, Q, x, y)
            N[x - 1, y - 1, action] += 1
            # run one step
            state, reward = episode.step(action)
            history.append(([x, y], action, reward))
            x, y = state
        # walk the episode backwards, accumulating the return Gt for each state
        Gt = 0
        for (state, action, reward) in reversed(history):
            x, y = state
            alpha = 1.0 / N[x - 1, y - 1, action]
            Gt = gamma * Gt + reward
            Q[x - 1, y - 1, action] += alpha * (Gt - Q[x - 1, y - 1, action])
    return Q
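# A hedged usage sketch, not from the original code: estimate a reference Q
# with Monte Carlo control, then sweep lambda for Sarsa(lambda) and compare
# against it. The episode counts and the lambda grid are illustrative
# assumptions; gamma = 1.0 reflects that Easy21 is undiscounted.
if __name__ == '__main__':
    optimal_Q = monte_carlo(max_episode=1000000, gamma=1.0, N0=100)
    for lbd in np.arange(0.0, 1.1, 0.1):
        Q, mse = sarsa_lambda(max_episode=10000, gamma=1.0, lbd=lbd,
                              N0=100, optimal_Q=optimal_Q)
        print('lambda = %.1f, final MSE = %.2f' % (lbd, mse[-1]))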
import numpy as np
import dill as pickle
from environment import Easy21
import utils
import time

toc = time.time()

env = Easy21()
N0 = 100
actions = [0, 1]


def reset():
    Q = np.zeros((22, 11, len(actions)))
    NSA = np.zeros((22, 11, len(actions)))
    wins = 0
    return Q, NSA, wins


Q, NSA, wins = reset()
with open('Q.dill', 'rb') as f:
    trueQ = pickle.load(f)

# N(s): total visits to state (p, d); step size and epsilon decay with visit counts
NS = lambda p, d: np.sum(NSA[p, d])
alpha = lambda p, d, a: 1 / NSA[p, d, a]
eps = lambda p, d: N0 / (N0 + NS(p, d))

# policy improvement - by epsilon-greedy
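# A minimal sketch of the epsilon-greedy policy that the comment above
# announces, built from the `eps` schedule and `Q` defined here. The function
# name `policy` is an assumption for illustration, not from the original file.
def policy(p, d):
    if np.random.rand() < eps(p, d):
        # explore: pick an action uniformly at random from `actions`
        return np.random.choice(actions)
    # exploit: pick the greedy action under the current Q estimate
    return int(np.argmax(Q[p, d]))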