def linear_approx_sarsa(max_episode, alpha, e, gamma, lbd, optimal_Q=None):
    theta = np.random.randn(36) / 100
    mse = []
    for i in range(max_episode):
        # initialise eligibility traces
        E = np.zeros([36, ])
        # initialise a new episode
        episode = Easy21()
        x, y = episode.observe()
        action = cal_action([x, y], theta, e)
        # sample until terminal
        while not episode.is_terminal():
            # run one step
            ([xp, yp], reward) = episode.step(action)
            if episode.is_terminal():
                # if the episode has reached a terminal state, Q[s', a'] is 0
                q0 = cal_q([x, y], action, theta)
                delta = reward - q0
                actionp = 0
            else:
                actionp = cal_action([xp, yp], theta, e)
                q0 = cal_q([x, y], action, theta)
                q1 = cal_q([xp, yp], actionp, theta)
                delta = reward + gamma * q1 - q0
            E = E * (gamma * lbd) + q_gradient([x, y], action)
            theta += (alpha * delta * E)
            x, y, action = xp, yp, actionp
        if (i % 1000 == 0) and (optimal_Q is not None):
            mse.append(np.sum((cal_q_table(theta) - optimal_Q) ** 2))
    return (theta, mse)
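The function above leans on helpers (`cal_action`, `cal_q`, `q_gradient`, `cal_q_table`) that it does not define. Below is a minimal sketch of what they could look like, assuming the usual Easy21 coarse coding of 3 dealer intervals x 6 player intervals x 2 actions = 36 binary features; the interval boundaries and the 0/1 action encoding are assumptions, not taken from this snippet.

import numpy as np

# Assumed cuboid intervals for the coarse coding (inclusive bounds).
DEALER_CUBOIDS = [(1, 4), (4, 7), (7, 10)]
PLAYER_CUBOIDS = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]


def q_gradient(state, action):
    # Binary feature vector phi(s, a); for a linear approximator this is also
    # the gradient of q(s, a) with respect to theta.
    x, y = state
    phi = np.zeros(36)
    idx = 0
    for lo_d, hi_d in DEALER_CUBOIDS:
        for lo_p, hi_p in PLAYER_CUBOIDS:
            for a in (0, 1):
                if lo_d <= x <= hi_d and lo_p <= y <= hi_p and a == action:
                    phi[idx] = 1.0
                idx += 1
    return phi


def cal_q(state, action, theta):
    # q(s, a) = phi(s, a) . theta
    return q_gradient(state, action).dot(theta)


def cal_action(state, theta, e):
    # Epsilon-greedy over the two actions.
    if np.random.rand() < e:
        return np.random.randint(2)
    return int(np.argmax([cal_q(state, a, theta) for a in (0, 1)]))


def cal_q_table(theta):
    # Tabulate q(s, a) for dealer 1-10 and player 1-21, matching the
    # (10, 21, 2) shape of the tabular Q used for the MSE comparison.
    return np.array([[[cal_q([d, p], a, theta) for a in (0, 1)]
                      for p in range(1, 22)]
                     for d in range(1, 11)])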
def run(self):
    self.reset()
    game = Easy21()
    S = game.state()
    A = self.epsilon_greedy_action(S)
    while not game.isTerminal():
        Aprime = None
        game, R = game.step(A)
        Sprime = game.state()
        self.N[S][A] += 1
        self.E[S][A] += 1
        # Initialize Q to zero for "all" states.
        # Our lookup table is only for interesting states,
        # so we hack around by putting Q = 0.
        if game.isTerminal():
            Q = 0
        else:
            Aprime = self.epsilon_greedy_action(Sprime)
            Q = self.q[Sprime][Aprime]
        # This is our TD error
        alpha = self.alpha(S, A)
        delta = R + Sarsa.GAMMA * Q - self.q[S][A]
        self.update(alpha, delta)
        S = Sprime
        A = Aprime
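The `self.alpha` and `self.update` methods used above are not shown. One plausible shape for them, assuming a backward-view Sarsa(lambda) sweep over the eligibility table, a dict-of-dicts layout for `self.q`/`self.E`/`self.N`, and a `Sarsa.LAMBDA` constant (all assumptions):

def alpha(self, S, A):
    # Step size 1 / N(S, A), as in the Easy21 assignment.
    return 1.0 / self.N[S][A]


def update(self, alpha, delta):
    # Move every state-action value along its eligibility trace, then decay the traces.
    for s in self.q:
        for a in self.q[s]:
            self.q[s][a] += alpha * delta * self.E[s][a]
            self.E[s][a] *= Sarsa.GAMMA * Sarsa.LAMBDA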
def sarsa_lambda(max_episode, gamma, lbd, N0, optimal_Q=None):
    # Sarsa(lambda)
    Q = np.zeros([10, 21, 2])
    N = np.zeros([10, 21, 2])
    mse = []
    for i in range(max_episode):
        # initialise eligibility traces
        E = np.zeros([10, 21, 2])
        # initialise a new episode
        episode = Easy21()
        x, y = episode.observe()
        action = epsilon_greedy(N0, N, Q, x, y)
        # sample until terminal
        while not episode.is_terminal():
            N[x - 1, y - 1, action] += 1
            E[x - 1, y - 1, action] += 1
            # run one step
            ([xp, yp], reward) = episode.step(action)
            if episode.is_terminal():
                # if the episode has reached a terminal state, Q[s', a'] is 0
                delta = reward - Q[x - 1, y - 1, action]
                actionp = 0
            else:
                actionp = epsilon_greedy(N0, N, Q, xp, yp)
                delta = reward + gamma * Q[xp - 1, yp - 1, actionp] - Q[x - 1, y - 1, action]
            alpha = 1.0 / N[x - 1, y - 1, action]
            Q += (alpha * delta * E)
            E *= (gamma * lbd)
            x, y, action = xp, yp, actionp
        if (i % 1000 == 0) and (optimal_Q is not None):
            mse.append(np.sum((Q - optimal_Q) ** 2))
    return (Q, mse)
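`epsilon_greedy(N0, N, Q, x, y)` is assumed above but not defined here. A minimal sketch, assuming the assignment's schedule eps_t = N0 / (N0 + N(s_t)) and two actions encoded as 0/1:

def epsilon_greedy(N0, N, Q, x, y):
    # Epsilon decays with the visit count of the state: eps = N0 / (N0 + N(s)).
    epsilon = N0 / (N0 + np.sum(N[x - 1, y - 1, :]))
    if np.random.rand() < epsilon:
        # explore: uniformly random action
        return np.random.randint(2)
    # exploit: greedy action under the current Q
    return int(np.argmax(Q[x - 1, y - 1, :]))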
def monte_carlo(max_episode, gamma, N0):
    # Monte Carlo control
    Q = np.zeros([10, 21, 2])
    N = np.zeros([10, 21, 2])
    for i in range(max_episode):
        # initialise a new episode
        episode = Easy21()
        # the initial state of the episode
        x, y = episode.observe()
        # sample until terminal
        history = []
        while not episode.is_terminal():
            # decide on an action
            action = epsilon_greedy(N0, N, Q, x, y)
            N[x - 1, y - 1, action] += 1
            # run one step
            (state, reward) = episode.step(action)
            history.append(([x, y], action, reward))
            [x, y] = state
        # calculate the return Gt for each state in this episode
        Gt = 0
        for j, (state, action, reward) in enumerate(reversed(history)):
            [x, y] = state
            alpha = 1.0 / N[x - 1, y - 1, action]
            Gt = gamma * Gt + reward
            Q[x - 1, y - 1, action] += alpha * (Gt - Q[x - 1, y - 1, action])
    return Q
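A possible driver tying these pieces together: use a long Monte Carlo run as the reference Q when measuring the MSE of `sarsa_lambda` for different lambda values. The episode counts here are illustrative, not taken from the snippets above.

optimal_Q = monte_carlo(max_episode=1000000, gamma=1.0, N0=100)
for lbd in [l * 0.1 for l in range(11)]:
    Q, mse = sarsa_lambda(max_episode=10000, gamma=1.0, lbd=lbd,
                          N0=100, optimal_Q=optimal_Q)
    print("lambda %.1f, final MSE %.2f" % (lbd, np.sum((Q - optimal_Q) ** 2)))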
def test_initialization(self):
    for i in range(30):
        game = Easy21()
        state = game.state()  # [dealer's sum, player's sum]
        self.assertIsInstance(state, list)
        self.assertEqual(len(state), 2)
        self.assertTrue(state[0] in range(1, 11))
        self.assertTrue(state[1] in range(1, 22))
        state, reward = game.step(STICK)
        self.assertEqual(state, TERMINATED)
        self.assertTrue(reward in [-1, 0, 1])
def run(self):
    game = Easy21()
    G = None
    walk = []
    while True:
        s = game.state()
        action = self.epsilon_greedy_action(s)
        self.N[s][action] += 1
        game, G = game.step(action)
        walk.append((s, action))
        # We break if the game has ended
        if game.isTerminal():
            break
    for s, action in walk:
        self.q[s][action] = self.q[s][action] + self.alpha(s, action) * (G - self.q[s][action])
def main():
    # initialise
    env = Easy21()
    runs = 1000
    for t in range(runs):
        env.start()
        results = []
        total_reward = 0.0
        start_dealer, start_player = env.state()
        print("started at start state: dealer %d, player %d" % (start_dealer, start_player))
        # run our episode
        while not env.is_finished():
            current_state = env.state()
            dealer, player = current_state
            print("current state: dealer %d, player %d" % (dealer, player))
            action = ep_greedy(current_state)
            print("ep-greedy picked action %d" % action)
            next_state, reward = env.step(current_state, action)
            print("------------------------------------")
            print(next_state)
            print(reward)
            print("------------------------------------")
            # MC: record the current state; the reward is accumulated from this state onwards
            results.append((current_state, action))
            total_reward += reward
            if dealer <= 10 and player <= 21:
                current_state = next_state
        # after the end of the episode
        for (state, action) in results:
            # increment the state counts
            dealer, player = state
            possible_states[dealer - 1, player - 1] += 1
            possible_states_actions[dealer - 1, player - 1, action] += 1
            # step size
            alpha = 1.0 / possible_states_actions[dealer - 1, player - 1, action]
            # update our value function in the direction of the observed return
            q_states_actions[dealer - 1, player - 1, action] += alpha * (total_reward - q_states_actions[dealer - 1, player - 1, action])
    with open('q_true.txt', 'w+') as outfile:
        outfile.write('# Array shape: {0}\n'.format(q_states_actions.shape))
        # write one slice per dealer card, equivalent to array[i, :, :]
        for data_slice in q_states_actions:
            np.savetxt(outfile, data_slice, fmt='%-7.2f')
            outfile.write('# New slice \n')
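`ep_greedy` is referenced above but defined elsewhere. A plausible version that reads the same global tables, with an epsilon that decays with the state's visit count; the `N0 = 100` constant and the two-action 0/1 encoding are assumptions:

N0 = 100.0


def ep_greedy(state):
    dealer, player = state
    # epsilon = N0 / (N0 + N(s)), so exploration shrinks as a state is visited more often
    epsilon = N0 / (N0 + possible_states[dealer - 1, player - 1])
    if np.random.random() < epsilon:
        # explore: pick one of the two actions uniformly at random
        return np.random.randint(2)
    # exploit: greedy action under the current Q estimates
    return int(np.argmax(q_states_actions[dealer - 1, player - 1, :]))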
def run(self):
    self.reset()
    game = Easy21()
    S = game.state()
    A = self.epsilon_greedy_action(S)
    while not game.isTerminal():
        Aprime = None
        game, R = game.step(A)
        Sprime = game.state()
        # Initialize Q to zero for "all" states.
        # Our lookup table is only for interesting states,
        # so we hack around by putting Q = 0.
        if game.isTerminal():
            Q = 0
        else:
            Aprime = self.epsilon_greedy_action(Sprime)
            Q = self.Q(Sprime, Aprime)
        # This is our TD error
        delta = R + Sarsa2.GAMMA * Q - self.Q(S, A)
        features = FeatureExtractor(S, A).features()
        self.update(delta, features)
        S = Sprime
        A = Aprime
def setUp(self):
    self.env = Easy21()
    self.state = self.env.reset()
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import numpy as np


def game_state_to_control_state(game_state):
    return PLAYER_MAX * (game_state[0] - 1) + game_state[1] - 1


# start learning
control = TDControl(CARD_MAX * PLAYER_MAX, len(ACTIONS))
control_mc = MCControl(CARD_MAX * PLAYER_MAX, len(ACTIONS))
for i in range(50000):
    game = Easy21(verbose=False)
    episode = []
    state = game.state()
    while state != TERMINATED:
        action = control.action(game_state_to_control_state(state))
        next_state, reward = game.step(ACTIONS[action])
        control.experience(game_state_to_control_state(state), action, reward)
        control_mc.experience(game_state_to_control_state(state), action, reward)
        # input('')
        # print(control.Qsa[game_state_to_control_state(state)][action])
        # print(control.Esa[game_state_to_control_state(state)][action])
        state = next_state
def main():
    # initialise our env
    env = Easy21()
    # run 1000 episodes per lambda
    runs = 1000
    global q_states_actions
    global possible_states
    global possible_states_actions
    global GAMMA
    for x in [x * 0.1 for x in range(0, 10)]:
        lambda_x = x
        # run our episodes
        for t in range(runs):
            # initialise our records
            results = []
            # start the game
            env.start()
            # initialise the first state and action
            start_state = env.state()
            action = ep_greedy(start_state)
            # re-initialise eligibility traces each episode
            e_states_actions = np.zeros((DEALER_STATE, PLAYER_STATE, ACTIONS), dtype=np.float32)
            while not env.is_finished():
                current_state = env.state()
                # take action A, observe S' and the reward
                next_state, reward = env.step(current_state, action)
                next_dealer, next_player = next_state
                # print("next_dealer is %d, next_player is %d, immediate reward is %d" % (next_dealer, next_player, reward))
                current_dealer, current_player = current_state
                current_q_values = q_states_actions[current_dealer - 1, current_player - 1, action]
                # if we should still proceed
                if next_dealer <= 10 and next_player <= 21:
                    # choose A' from S' using epsilon-greedy
                    next_action = ep_greedy(next_state)
                    # calculate the TD error (delta)
                    next_q_values = q_states_actions[next_dealer - 1, next_player - 1, next_action]
                    td_error = reward + (GAMMA * next_q_values) - current_q_values
                else:
                    td_error = reward - current_q_values
                # add counts during the episode
                e_states_actions[current_dealer - 1, current_player - 1, action] += 1
                possible_states[current_dealer - 1, current_player - 1] += 1
                possible_states_actions[current_dealer - 1, current_player - 1, action] += 1
                # step size
                alpha = 1.0 / possible_states_actions[current_dealer - 1, current_player - 1, action]
                results.append((current_state, action))
                # update the action values and eligibility traces for every step visited so far in this episode
                for (state, action) in results:
                    dealer, player = state
                    q_states_actions[dealer - 1, player - 1, action] += alpha * td_error * e_states_actions[dealer - 1, player - 1, action]
                    e_states_actions[dealer - 1, player - 1, action] = GAMMA * lambda_x * e_states_actions[dealer - 1, player - 1, action]
                if next_dealer <= 10 and next_player <= 21:
                    current_state = next_state
                    action = next_action
        # read the q true (Monte Carlo reference) from disk and report the MSE for this lambda
        q_true_txt = np.loadtxt('q_true.txt')
        # restore the original shape of the array
        q_true = q_true_txt.reshape((DEALER_STATE, PLAYER_STATE, ACTIONS))
        q_diff = q_states_actions - q_true
        print(np.mean(np.square(q_diff)))
# epsilon-greedy policy (action) selection from the current value function
def eps_greedy(state, n0=100.0):
    s1, s2 = state
    epsilon = n0 / (n0 + n_state[s1, s2])
    if random.random() < epsilon:
        # random selection
        if random.random() < 0.5:
            return 0
        else:
            return 1
    else:
        # greedy selection
        return np.argmax(q_sa[s1, s2, :])


my_game = Easy21()
max_epoch = 100000
for epoch in range(max_epoch):
    curr_state = my_game.init_game()
    results_list = []
    result_sum = 0.0
    # start and finish one episode
    while curr_state is not None:
        curr_action = eps_greedy(curr_state)
        next_state, curr_result = my_game.step(curr_action)
        results_list.append((curr_state, curr_action))
        result_sum += curr_result
        curr_state = next_state
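The loop above collects `results_list` and `result_sum` but stops before the update that consumes them. A hedged sketch of how the episode could be wrapped up inside the `for epoch` loop; the per-(state, action) count table `n_sa` is an assumption, since only `n_state` and `q_sa` appear in the snippet.

    # (assumed continuation) every-visit Monte Carlo update towards the episode return
    for (s1, s2), a in results_list:
        n_state[s1, s2] += 1
        n_sa[s1, s2, a] += 1  # hypothetical per-(state, action) visit count
        q_sa[s1, s2, a] += (result_sum - q_sa[s1, s2, a]) / n_sa[s1, s2, a]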
                state = next_state
                action = next_action
            norms.append(np.linalg.norm(self.Q[1:22, 1:11, :]))
        if log:
            print("Completed the total of {} episodes".format(n_episodes))
            print("The total time taken is {}s".format(time.time() - start))
        return self.Q, norms


from easy21 import Easy21, Action

env = Easy21()
env_name = 'Easy21'
TD = TDLambda(env, 2, (22, 22))
Q = TD.train(n_episodes=100000)
V = np.max(Q, axis=-1)
relevant_V = V[1:22, 1:11]

import matplotlib.pyplot as plt

policy = np.argmax(Q, axis=-1)
plt.subplot(121)
plt.imshow(V)
plt.subplot(122)
plt.imshow(relevant_V)
def setUp(self):
    self.env = Easy21()
def main():
    # initialise our env
    env = Easy21()
    # run 1000 episodes per lambda
    runs = 1000
    global q_states_actions
    global DEALERS_FEATURE
    global PLAYERS_FEATURE
    global ACTIONS_FEATURE
    for x in [x * 0.1 for x in range(0, 10)]:
        lambda_x = x
        win = 0
        for t in range(runs):
            # initialise our records
            results = []
            # start the game
            env.start()
            # initialise the first state and action
            action = ep_greedy(env.state())
            # initialise our eligibility traces for each episode
            traces = np.zeros(
                (DEALERS_FEATURE * PLAYERS_FEATURE * ACTIONS_FEATURE),
                dtype=np.float32)
            # the weight vector has one entry per feature
            weights = np.random.uniform(0, 1, size=DEALERS_FEATURE * PLAYERS_FEATURE * ACTIONS_FEATURE)
            q_states_actions = np.zeros(
                (DEALERS_STATE, PLAYERS_STATE, ACTIONS_FEATURE),
                dtype=np.float32)
            while not env.is_finished():
                current_state = env.state()
                current_dealer, current_player = current_state
                # take action A, observe S' and the reward
                next_state, reward = env.step(current_state, action)
                next_dealer, next_player = next_state
                # approximate Q(S, A) linearly from the feature vector and weights
                # (see http://artint.info/html/ArtInt_272.html)
                current_q_values = Q_approx(current_dealer - 1, current_player - 1, action, weights)
                current_feature_vec = Features(current_dealer - 1, current_player - 1, action)
                # if we should still proceed
                if next_dealer <= 10 and next_player <= 21:
                    # choose A' from S' using epsilon-greedy
                    next_action = ep_greedy(next_state)
                    # calculate the TD error (delta) from Q(S', A')
                    next_q_values = Q_approx(next_dealer - 1, next_player - 1, next_action, weights)
                    td_error = reward + (GAMMA * next_q_values) - current_q_values
                else:
                    td_error = reward - current_q_values
                # backward-view Sarsa(lambda) update of the traces and weights
                traces = GAMMA * lambda_x * traces + current_feature_vec
                weights += ALPHA * td_error * traces
                if reward == 1:
                    win += 1
                if next_dealer <= 10 and next_player <= 21:
                    action = next_action
        # read the q true (Monte Carlo reference) from disk
        q_true_txt = np.loadtxt('q_true.txt')
        # restore the original shape of the array
        q_true = q_true_txt.reshape(
            (DEALERS_STATE, PLAYERS_STATE, ACTIONS_FEATURE))
        q_diff = q_states_actions - q_true
        # print(np.mean(np.square(q_diff)))
        print("lambda %f, won %d" % (lambda_x, win))