def runMC(size):
    q = np.zeros((2, 11, 22))           # Q(s, a), indexed (action, dealer card, player sum)
    counterSA = np.zeros((2, 11, 22))   # N(s, a)
    counterState = np.zeros((11, 22))   # N(s)
    for i in range(size):
        # sample one episode
        s0 = easy21.init()
        s = s0
        episodes = []
        while s[0] != 1:                # s[0] == 1 marks the terminal state
            counterState[s[1:]] += 1
            a = getAction(s, q, counterState)
            sa = (a, s[1], s[2])
            counterSA[sa] += 1
            episodes.append(sa)
            sprime = easy21.step(s, a)
            s = sprime
        # every-visit MC update towards the episode's final reward, stored in s[2]
        # at the terminal state
        for state in episodes:
            q[state] += (1 / counterSA[state]) * (s[2] - q[state])
    return q
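A minimal usage sketch for the routine above, assuming numpy is imported as np; the episode count is an arbitrary choice, not from the source. The saved file name matches the one the runTD variants further down load to compute their learning curves.

import numpy as np

# Train the Monte Carlo baseline and save it; the TD code below loads
# "montecarlo-qsa.npy" to compute its mean-squared-error learning curve.
q_mc = runMC(500000)            # 500,000 episodes is an assumed setting
np.save("montecarlo-qsa.npy", q_mc)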
def train(self):
    """Train the agent for SARSA_NUM_EPISODES episodes to learn the action-value function."""
    elapsed_episodes = 0
    while elapsed_episodes < const.SARSA_NUM_EPISODES:
        eligibility_trace = np.zeros(const.STATE_ACTION_SPACE)
        state = easy21.new_game()
        epsilon = const.N_0 / (const.N_0 + self.num_visits(state))
        action = epsilon_greedy(epsilon, self.Q, state)

        while state.terminal != const.TERMINAL:
            # Take 1 step forward from the current state and simulate the action
            next_state, reward = easy21.step(state, action)
            epsilon = const.N_0 / (const.N_0 + self.num_visits(next_state))
            next_action = epsilon_greedy(epsilon, self.Q, next_state)

            # Increment counts first so that alpha = 1 / N(s, a) is well defined
            # on the first visit, then compute the TD-error
            cur_idx = (state.player, state.dealer, action)
            next_idx = (next_state.player, next_state.dealer, next_action)
            eligibility_trace[cur_idx] += 1
            self.state_action_counts[cur_idx] += 1
            self.state_visits[state.player - 1, state.dealer - 1] += 1
            alpha = 1 / self.state_action_count(state, action)
            td_error = reward + self.Q[next_idx] - self.Q[cur_idx]

            # Update action-value function and decay eligibility traces
            self.Q += alpha * td_error * eligibility_trace
            eligibility_trace *= const.LAMBDA

            state = next_state
            action = next_action
        elapsed_episodes += 1
def run_episode(Q_sa, N_sa, lambd):
    E_sa = np.zeros((2, 11, 22))    # eligibility traces, indexed (action, dealer card, player sum)
    gamma = 1

    dealers_first_card = easy21.get_first_card()
    players_first_card = easy21.get_first_card()
    state = easy21.State(dealers_first_card, players_first_card)

    # initialize ACTION
    action = easy21.get_action(state, Q_sa, N_sa)
    assert action in [0, 1], str(action)

    while not state.is_terminal:
        old_players_sum = state.players_sum
        old_dealers_card = state.dealers_card
        old_action = action

        next_state, reward = easy21.step(state, action)

        if not next_state.is_terminal:
            # choose A' from S' and form the SARSA target
            next_action = easy21.get_action(next_state, Q_sa, N_sa)
            delta = reward + \
                gamma * Q_sa[next_action, next_state.dealers_card, next_state.players_sum] - \
                Q_sa[old_action, old_dealers_card, old_players_sum]
        else:
            next_action = 0
            # the terminal state has value 0
            delta = reward + gamma * 0 - Q_sa[old_action, old_dealers_card, old_players_sum]

        N_sa[old_action, old_dealers_card, old_players_sum] += 1
        alpha = 1 / N_sa[old_action, old_dealers_card, old_players_sum]
        E_sa[old_action, old_dealers_card, old_players_sum] += 1

        Q_sa = Q_sa + alpha * delta * E_sa
        E_sa = gamma * lambd * E_sa

        state = next_state
        action = next_action

    return (Q_sa, N_sa, reward)
def run_episode(theta, lambd, feature_map):
    E_sa = np.zeros(36)     # eligibility trace over the 36 binary features
    gamma = 1

    dealers_first_card = easy21.get_first_card()
    players_first_card = easy21.get_first_card()
    state = easy21.State(dealers_first_card, players_first_card)

    # initialize ACTION
    action = get_feature_action(state, theta, feature_map)
    assert action in [0, 1], str(action)

    while not state.is_terminal:
        old_players_sum = state.players_sum
        old_dealers_card = state.dealers_card
        old_action = action

        next_state, reward = easy21.step(state, action)

        if not next_state.is_terminal:
            # choose A' from S' and form the SARSA target with the linear
            # approximation q(s, a) = phi(s, a) . theta
            next_action = get_feature_action(next_state, theta, feature_map)
            delta = reward + \
                gamma * np.dot(feature_map[next_action, next_state.dealers_card, next_state.players_sum], theta) - \
                np.dot(feature_map[old_action, old_dealers_card, old_players_sum], theta)
        else:
            next_action = 0
            delta = reward + gamma * 0 - np.dot(
                feature_map[old_action, old_dealers_card, old_players_sum], theta)

        alpha = 0.01
        E_sa = gamma * lambd * E_sa + feature_map[old_action, old_dealers_card, old_players_sum]
        theta = theta + alpha * delta * E_sa

        state = next_state
        action = next_action

    return (theta, reward)
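The function above assumes a precomputed feature_map indexed by (action, dealer card, player sum) that returns a 36-dimensional binary feature vector. A possible construction is sketched below; the cuboid boundaries are the coarse coding commonly used for the Easy21 assignment and the flattening order of the 36 features is arbitrary, so treat both as assumptions rather than this implementation's exact scheme.

import numpy as np

# Assumed cuboids: 3 dealer intervals x 6 player intervals x 2 actions = 36 features.
DEALER_INTERVALS = [(1, 4), (4, 7), (7, 10)]
PLAYER_INTERVALS = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]

def build_feature_map():
    # feature_map[action, dealers_card, players_sum] -> 36-dim binary feature vector
    feature_map = np.zeros((2, 11, 22, 36))
    for action in range(2):
        for dealer in range(1, 11):
            for player in range(1, 22):
                features = np.zeros((3, 6, 2))
                for i, (d_lo, d_hi) in enumerate(DEALER_INTERVALS):
                    for j, (p_lo, p_hi) in enumerate(PLAYER_INTERVALS):
                        if d_lo <= dealer <= d_hi and p_lo <= player <= p_hi:
                            features[i, j, action] = 1.0
                feature_map[action, dealer, player] = features.reshape(36)
    return feature_map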
def trial():
    state = State.new()
    states = [state]
    while not state.terminated:
        action = next_action(state)
        N[state] += 1
        N[(state, action)] += 1
        state, reward = step(state, action)
    update(states, reward)
def play_from_state(state):
    is_terminal = False
    history = []
    while not is_terminal:
        action = get_action(state)
        history.append((state, action))
        reward, state, is_terminal = easy21.step(state, action)
    return reward, history
def run_episode(Q_sa, N_sa):
    dealers_first_card = easy21.get_first_card()
    players_first_card = easy21.get_first_card()
    state = easy21.State(dealers_first_card, players_first_card)

    reward = None
    history = []
    action = ""
    while not state.is_terminal:
        action = easy21.get_action(state, Q_sa, N_sa)
        old_state = copy.deepcopy(state)
        state, reward = easy21.step(state, action)
        history.append((old_state, action, reward))
    return history
def runTD(size, tdLambda):
    # init q
    q = np.zeros((2, 11, 22))           # indexed (action, dealer card, player sum)
    counterSA = np.zeros((2, 11, 22))
    counterState = np.zeros((11, 22))

    # learning curve against the saved Monte Carlo Q table
    sqrErr = []
    mcq = np.load("montecarlo-qsa.npy")

    # each episode
    for i in range(size):
        # eligibility traces
        e = np.zeros((2, 11, 22))

        # init one episode
        s0 = easy21.init()
        s = s0
        a = getAction(s, q, counterState)

        # each step
        while s[0] != 1:
            counterState[s[1:]] += 1

            # get s' and a'
            sprime = easy21.step(s, a)
            aprime = getAction(sprime, q, counterState) if sprime[0] != 1 else 0  # handle a' for terminal s'

            # TD error
            sa, saprime = (a, s[1], s[2]), (aprime, sprime[1], sprime[2])
            counterSA[sa] += 1
            alpha = 1 / counterSA[sa]
            r = 0 if sprime[0] != 1 else sprime[2]
            g = q[saprime] if sprime[0] != 1 else 0
            g += r
            err = g - q[sa]

            # update q matrix and decay traces
            e[sa] += 1
            q = q + alpha * err * e
            e = e * tdLambda

            s, a = sprime, aprime

        sqrErr.append(np.power((q - mcq), 2).mean())

    return q, sqrErr
def runTD(size, tdLambda):
    # init q
    qw = np.zeros((36, 1))      # weight w of q, one weight per binary feature

    # learning curve against the saved Monte Carlo Q table
    sqrErr = []
    mcq = np.load("montecarlo-qsa.npy")

    # each episode
    for i in range(size):
        # eligibility traces over the (action, dealer interval, player interval) features
        e = np.zeros((2, 3, 6))

        # init one episode
        s0 = easy21.init()
        s = s0
        a = getAction(s, qw)

        # each step
        while s[0] != 1:
            # get s' and a'
            sprime = easy21.step(s, a)
            aprime = getAction(sprime, qw) if sprime[0] != 1 else 0

            # TD error
            alpha = 0.01
            r = 0 if sprime[0] != 1 else sprime[2]
            g = getQValue(aprime, sprime[1], sprime[2], qw) if sprime[0] != 1 else 0
            g += r
            err = g - getQValue(a, s[1], s[2], qw)

            # accumulate traces for every active feature of (s, a)
            indList = convertSA(a, s[1], s[2])
            for ind in indList:
                e[ind[0], ind[1], ind[2]] += 1

            # update the weights and decay traces
            qw = qw + alpha * err * e.reshape(36, 1)
            e = e * tdLambda

            s, a = sprime, aprime

        sqrErr.append(np.power((getQMatrix(qw) - mcq), 2).mean())

    return qw, sqrErr
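Both runTD variants return the per-episode mean-squared error against the saved Monte Carlo Q table, so a short plotting sketch can turn that into a learning-curve figure. Matplotlib, the episode count, and the lambda values below are assumptions, not settings from the source.

import matplotlib.pyplot as plt

# Sketch: compare learning curves for two lambda values (assumed settings).
_, err_lambda_0 = runTD(10000, 0.0)
_, err_lambda_1 = runTD(10000, 1.0)

plt.plot(err_lambda_0, label="lambda = 0")
plt.plot(err_lambda_1, label="lambda = 1")
plt.xlabel("episode")
plt.ylabel("mean squared error vs. Monte Carlo Q")
plt.legend()
plt.show()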
def sarsa(lamb, all_errors=False):
    Q = defaultdict(int)
    N = defaultdict(int)
    errors = []
    for episode_i in range(EPISODES):
        if args.progress and episode_i % (0.1 * EPISODES) == 0:
            print('{:.2f}%...'.format((episode_i + EPISODES * lamb * 10) * 100 / (EPISODES * len(lambdas))),
                  end='\r', flush=True)

        E = defaultdict(int)    # eligibility traces
        state = easy21.init_state()
        action = get_action(Q, N, state)
        N[state, action] += 1
        is_terminal = False

        while not is_terminal:
            reward, new_state, is_terminal = easy21.step(state, action)
            new_action = get_action(Q, N, new_state)
            E[state, action] += 1
            N[new_state, new_action] += 1
            d = reward + DISCOUNT * Q[new_state, new_action] - Q[state, action]
            # update every eligible state-action pair, then decay its trace
            for (s, a), e in E.items():
                Q[s, a] += step_size(N, (s, a)) * d * e
                E[s, a] *= DISCOUNT * lamb
            state, action = new_state, new_action

        if all_errors:
            errors.append(calculate_error(Q))

    if all_errors:
        return errors, Q
    else:
        return calculate_error(Q), Q
def train(self):
    """Train the action-value function on MC_NUM_EPISODES games."""
    elapsed_episodes = 0
    while elapsed_episodes < const.MC_NUM_EPISODES:
        state = easy21.new_game()
        episode = []    # [ (state_0, action_0, reward_1, state_1), ... ]

        # Experience 1 episode
        while state.terminal == const.NON_TERMINAL:
            epsilon = const.N_0 / (const.N_0 + self.num_visits(state))
            action = epsilon_greedy(epsilon, self.Q, state)
            next_state, rwd = easy21.step(state, action)
            episode.append((state, action, rwd, next_state))
            state = next_state

        # Learn from the experienced episode; the only non-zero reward is the final one
        total_return = episode[-1][2]
        for (state, action, reward, next_state) in episode:
            self.increment_state_action_counts(state, action)
            self.increment_state_visits(state)
            alpha_t = 1 / self.state_action_count(state, action)
            self.update_q_function(state, action, alpha_t, total_return)

        elapsed_episodes += 1
def run_episode(Q_sa, N_sa):
    gamma = 1

    dealers_first_card = easy21.get_first_card()
    players_first_card = easy21.get_first_card()
    state = easy21.State(dealers_first_card, players_first_card)

    while not state.is_terminal:
        # choose the ACTION epsilon-greedily
        action = easy21.get_egreedy_action(state, Q_sa, N_sa)
        # action = easy21.get_random_action()  # we could also run this off-policy!

        old_state = copy.deepcopy(state)
        next_state, reward = easy21.step(state, action)

        # Q-learning target: bootstrap from the greedy value of the next state
        if next_state.is_terminal:
            target = reward
        else:
            target = reward + gamma * np.max(Q_sa[:, next_state.dealers_card, next_state.players_sum])

        N_sa[action, old_state.dealers_card, old_state.players_sum] += 1
        # alpha = 1 / N_sa[action, old_state.dealers_card, old_state.players_sum]
        alpha = 0.05
        Q_sa[action, old_state.dealers_card, old_state.players_sum] += alpha * (
            target - Q_sa[action, old_state.dealers_card, old_state.players_sum])

        state = next_state

    return (Q_sa, N_sa, reward)
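A possible driver for the episode function above, assuming the (2, 11, 22) table shapes used elsewhere in this section; the episode count and the running-average check are illustrative only.

import numpy as np

# Hypothetical driver: initialise the tables, run many episodes, and track the
# mean reward as a rough sanity check on convergence.
Q_sa = np.zeros((2, 11, 22))
N_sa = np.zeros((2, 11, 22))
rewards = []
for _ in range(100000):
    Q_sa, N_sa, reward = run_episode(Q_sa, N_sa)
    rewards.append(reward)

print("mean reward over the last 10,000 episodes:", np.mean(rewards[-10000:]))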
Z = defaultdict(float)      # eligibility traces
state = State()
e = epsilon(Nzero, Ns, state)
action = greedysoft(state, actionvalue, e)
episode = []

while state.gameover == 0:
    Nsa[(state.player, state.dealer, action)] += 1
    Ns[(state.player, state.dealer)] += 1
    episode += [(state.player, state.dealer, action)]
    startstate = copy(state)

    # Take action A, observe reward R and next state S'
    state, reward = step(state, action)
    e = epsilon(Nzero, Ns, state)

    # Choose A' from S' using the Q-derived policy
    ingameaction = greedysoft(state, actionvalue, e)

    # delta = R + gamma * Q(S', A') - Q(S, A);  Z(S, A) += 1
    d = reward + (discount * actionvalue[(state.player, state.dealer, ingameaction)]) \
        - actionvalue[(startstate.player, startstate.dealer, action)]
    Z[(startstate.player, startstate.dealer, action)] += 1

    # update every state-action pair visited so far and decay its trace
    for stateaction in episode:
        a = Nsa[stateaction] ** -1      # alpha = 1 / N(s, a)
        actionvalue[stateaction] += a * d * Z[stateaction]
        Z[stateaction] *= lamBda

    # A <- A' for the next step
    action = ingameaction
import colorama
import easy21

colorama.init(convert=True)

while True:
    state = easy21.init_state()
    print('state:', state)
    is_terminal = False
    while not is_terminal:
        # prompt renders as "action (hit/stick): " with the h and s highlighted
        print('action ({}it/{}tick): '.format(colorama.Fore.LIGHTGREEN_EX + 'h' + colorama.Style.RESET_ALL,
                                              colorama.Fore.LIGHTRED_EX + 's' + colorama.Style.RESET_ALL),
              end='')
        action = input()
        reward, state, is_terminal = easy21.step(state, action)
        print('state:', state)
        print('reward:', reward)
    print('\n')
# player and dealer each draw one black card
dealer = np.random.randint(1, 11)
player = np.random.randint(1, 11)
deal = ([dealer, player], 0)

# play one hand
history = []
reward = None
while reward is None:
    dealer = deal[0][0]
    player = deal[0][1]
    action = epsilon_greedy(N0, N, Q, dealer, player)
    N[dealer - 1, player - 1, action] += 1
    deal = step([dealer, player], action)
    reward = deal[1]
    history.append(([dealer, player], action, reward))

# update the Q matrix for this hand - based on https://github.com/xrz000/Easy21
# Backtrack from the end of the hand to the beginning, accumulating the return Gt.
Gt = 0
for [dealer, player], action, reward in reversed(history):
    # Update factor as per the assignment: the more visits to the state-action
    # pair, the smaller the update.
    alpha = 1.0 / N[dealer - 1, player - 1, action]
    if reward is None:
        reward = 0
    # Cumulative reward for each move in the hand. Since the reward is only given
    # at the end of the hand, this is the same for all states.
    Gt = gamma * Gt + reward
    Q[dealer - 1, player - 1, action] += alpha * (Gt - Q[dealer - 1, player - 1, action])