import sys

import numpy as np

# The Easy21 environment and the project helpers policy_epsilon_greedy, get_value,
# phi, calculate_Q and epsilon_greedy used below are assumed to be defined or
# imported elsewhere in the project.


def td_lambda_backward_view(iterations, N0, discount_factor, Lambda, value_star):
    """Backward-view SARSA(lambda) control with accumulating eligibility traces."""
    MSEs = []
    actions = ["Hit", "Stick"]
    action_value = np.array([[[0.0, 0.0] for i in range(22)] for j in range(10)])
    number_action_value = np.array([[[0.0, 0.0] for i in range(22)] for j in range(10)])
    for it in range(iterations):
        # Play one episode; eligibility traces are reset at the start of every episode.
        eligibility_traces = np.array([[[0.0, 0.0] for i in range(22)] for j in range(10)])
        game = Easy21()
        # First action is chosen epsilon-greedily.
        first_state = game.state
        index_action = policy_epsilon_greedy(N0, first_state["dealer"],
                                             first_state["player_sum"],
                                             action_value, number_action_value)
        while not game.isTerminal:
            last_state = game.state
            dealer, player_sum = last_state["dealer"], last_state["player_sum"]
            pick_action = actions[index_action]
            number_action_value[dealer - 1, player_sum, index_action] += 1
            alpha = 1 / number_action_value[dealer - 1, player_sum, index_action]
            _, reward = game.step(pick_action)
            eligibility_traces[dealer - 1, player_sum, index_action] += 1
            if not game.isTerminal:
                next_state = game.state
                next_dealer, next_player_sum = next_state["dealer"], next_state["player_sum"]
                next_index_action = policy_epsilon_greedy(N0, next_dealer, next_player_sum,
                                                          action_value, number_action_value)
                target = reward + discount_factor * action_value[
                    next_dealer - 1, next_player_sum, next_index_action]
            else:
                next_index_action = index_action
                target = reward
            # TD error delta = target - Q(S, A); it must use the action actually taken
            # in S, so it is computed before switching to the next action.
            delta = target - action_value[dealer - 1, player_sum, index_action]
            # Update all state-action pairs in proportion to their eligibility.
            action_value += eligibility_traces * delta * alpha
            eligibility_traces = discount_factor * Lambda * eligibility_traces
            index_action = next_index_action
        # Episode ended: record the mean squared error against the reference values.
        error_episode = np.linalg.norm(get_value(action_value) - value_star)**2 / (2 * 22 * 10)
        MSEs.append(error_episode)
    return MSEs, action_value
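# --- Hedged usage sketch (not part of the original code) ---
# Shows how td_lambda_backward_view could be run across several lambda values.
# Assumption: value_star is a reference ("true") value array of the same shape as
# the output of get_value, e.g. taken from a long Monte-Carlo run saved beforehand.
def compare_lambdas_backward_view(value_star, iterations=int(1e4), N0=100):
    """Return the final MSE of backward-view TD(lambda) for lambda in {0, 0.1, ..., 1}."""
    final_mses = {}
    for lam in [i / 10 for i in range(11)]:
        mses, _ = td_lambda_backward_view(iterations, N0, discount_factor=1.0,
                                          Lambda=lam, value_star=value_star)
        final_mses[lam] = mses[-1]
    return final_mses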
def lfa_control(num_episodes, td_lambda, Q_MC):
    """
    SARSA(lambda) control with linear function approximation (36 binary coarse-coding features).

    :param num_episodes: positive integer
    :param td_lambda: value between 0 and 1
    :param Q_MC: Q estimated by Monte-Carlo control, used as the reference
    :return: Q, V and the per-episode mean squared error
    """
    env = Easy21()  # instantiate the env
    gamma = 1
    epsilon = 0.05
    alpha = 0.01
    # Initialization
    Q = np.zeros((2, 11, 22))
    weight = np.zeros((36,))
    mean_square_error = []
    # Loop over episodes
    for i_episode in range(1, num_episodes + 1):
        bin_feat_vector = np.zeros((36,))  # eligibility trace over the 36 features
        state = env.reset()  # start from an initial state
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        while True:
            # Epsilon-greedy action w.r.t. the linear approximation of Q
            if np.random.rand() < epsilon:
                action = np.random.randint(0, 2)
            else:
                action = np.argmax([np.sum(phi(state, 0) * weight),
                                    np.sum(phi(state, 1) * weight)])
            next_state, reward, done = env.step(action)
            if np.random.rand() < epsilon:
                next_action = np.random.randint(0, 2)
            else:
                next_action = np.argmax([np.sum(phi(next_state, 0) * weight),
                                         np.sum(phi(next_state, 1) * weight)])
            cuboid = phi(state, action)
            cuboid_next = phi(next_state, next_action)
            Q_phi = np.sum(cuboid * weight)
            Q_phi_next = np.sum(cuboid_next * weight)
            # TD error; do not bootstrap from the terminal state.
            td_error = reward + (0 if done else gamma * Q_phi_next) - Q_phi
            # Accumulating eligibility trace in feature space
            bin_feat_vector = bin_feat_vector * td_lambda * gamma + cuboid
            weight += alpha * td_error * bin_feat_vector
            state = next_state
            if done:
                break
        Q = calculate_Q(weight)
        mean_square_error.append(np.mean((Q_MC - Q[:, 1:11, 1:22]) ** 2))
    return Q[:, 1:11, 1:22], np.max(Q[:, 1:11, 1:22], axis=0), mean_square_error
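# --- Hedged usage sketch (not part of the original code) ---
# Minimal wrapper around lfa_control. Assumption: Q_MC is the (2, 10, 21) action-value
# array returned by mc_control (defined further below), so the shapes line up in the
# mean-squared-error comparison.
def run_lfa(Q_MC, num_episodes=10000, td_lambda=0.5):
    Q, V, mse = lfa_control(num_episodes, td_lambda, Q_MC)
    print("\nLFA final MSE vs. Monte-Carlo Q:", mse[-1])
    return Q, V, mse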
def sarsa_control(N0, num_episodes, td_lambda, Q_MC):
    """
    Tabular SARSA(lambda) control with accumulating eligibility traces.

    :param N0: integer constant for the epsilon schedule
    :param num_episodes: positive integer
    :param td_lambda: value between 0 and 1
    :param Q_MC: Q estimated by Monte-Carlo control, used as the reference
    :return: Q, V and the per-episode mean squared error
    """
    env = Easy21()  # instantiate the env
    NS = np.zeros((11, 22))      # state visit counts
    NSA = np.zeros((2, 11, 22))  # state-action visit counts
    Q = np.zeros((2, 11, 22))
    gamma = 1
    mean_square_error = []
    # Loop over episodes
    for i_episode in range(0, num_episodes):
        episode_state_action = np.zeros((2, 11, 22))  # eligibility traces
        state = env.reset()               # initial state
        action = np.random.randint(0, 2)  # initial action
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        while True:
            next_state, reward, done = env.step(action)
            # Update epsilon
            NS[state[0], state[1]] += 1
            epsilon = N0 / (N0 + NS[state[0], state[1]])
            greedy_action = epsilon_greedy(Q, next_state, epsilon)
            # TD error: R + gamma * Q(S', A') - Q(S, A); do not bootstrap from a terminal state.
            next_q = 0 if done else Q[greedy_action, next_state[0], next_state[1]]
            td_error = reward + gamma * next_q - Q[action, state[0], state[1]]
            episode_state_action[action, state[0], state[1]] += 1
            # Update alpha
            NSA[action, state[0], state[1]] += 1
            alpha = 1 / NSA[action, state[0], state[1]]
            Q += alpha * td_error * episode_state_action
            episode_state_action *= td_lambda * gamma
            state, action = next_state, greedy_action
            if done:
                break
        mean_square_error.append(np.mean((Q_MC - Q[:, 1:11, 1:22])**2))
    return Q[:, 1:11, 1:22], np.max(Q[:, 1:11, 1:22], axis=0), mean_square_error
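# --- Hedged usage sketch (not part of the original code) ---
# Sweeps lambda for sarsa_control and records the final MSE against the Monte-Carlo Q.
# Assumption: Q_MC comes from mc_control (defined further below), shape (2, 10, 21).
def sweep_sarsa_lambdas(Q_MC, N0=100, num_episodes=1000):
    results = {}
    for lam in [i / 10 for i in range(11)]:
        _, _, mse = sarsa_control(N0, num_episodes, lam, Q_MC)
        results[lam] = mse[-1]
    return results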
def __init__(self, N0=100):
    self._env = Easy21()
    self._actions = self._env.actionSpace()
    # linear approx. function and parameters
    self._w = np.zeros([36, 1])
    self._Q = lambda d, p, a: np.dot(self._feature(d, p, a).T, self._w)
    self._eps = 0.05
    self._alpha = 0.01
def montecarlo(iterations, it_conf, N0, discount_factor, true_value):
    """Monte-Carlo control with an epsilon-greedy policy."""
    actions = ["Hit", "Stick"]
    action_value = np.array([[[0.0, 0.0] for i in range(22)] for j in range(10)])
    number_action_value = np.array([[[0, 0] for i in range(22)] for j in range(10)])
    deltas = []
    variance = []
    for it in range(iterations):
        if it % it_conf == 0:
            print("Iteration {}/{}".format(it, iterations))
        var = 0
        # Play one episode.
        game = Easy21()
        Gt = 0
        k = 0
        visits = []
        while not game.isTerminal:
            last_state = game.state
            dealer, player_sum = last_state["dealer"], last_state["player_sum"]
            # Pick an action epsilon-greedily.
            index_action = policy_epsilon_greedy(N0, dealer, player_sum,
                                                 action_value, number_action_value)
            pick_action = actions[index_action]
            state, reward = game.step(pick_action)
            visits.append([last_state, index_action])
            number_action_value[dealer - 1, player_sum, index_action] += 1
            Gt += reward * discount_factor**k
            k += 1
        # Episode ended: update every visited state-action pair towards the return Gt.
        for step in visits:
            state, action = step[0], step[1]
            dealer, player_sum = state["dealer"], state["player_sum"]
            delta_action_value = (Gt - action_value[dealer - 1, player_sum, action]) \
                / number_action_value[dealer - 1, player_sum, action]
            action_value[dealer - 1, player_sum, action] += delta_action_value
            var += abs(delta_action_value)
        variance.append(var)
        deltas.append(np.linalg.norm(true_value - get_value(action_value))**2 / (10 * 21))
    return action_value, deltas, variance
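# --- Hedged usage sketch (not part of the original code) ---
# Assumption: true_value is a reference value function of the same shape as the output
# of get_value (e.g. loaded from a previously saved long run) and is only used to track
# the error curve; the returned values do not depend on it.
def run_montecarlo(true_value, iterations=int(1e5), N0=100):
    action_value, deltas, variance = montecarlo(iterations, it_conf=int(1e4), N0=N0,
                                                discount_factor=1.0, true_value=true_value)
    return get_value(action_value), deltas, variance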
def __init__(self, N0=100):
    self._env = Easy21()
    self._actions = self._env.actionSpace()
    self._N0 = N0
    self._Q = np.zeros((11, 22, 2))    # action-value function, tabular
    self._Nsa = np.zeros((11, 22, 2))  # number of times (s, a) has been selected
    # number of times s has been visited
    self._Ns = lambda d, p: sum(self._Nsa[d, p])
    # ε of each state
    self._eps = lambda d, p: self._N0 / (self._N0 + self._Ns(d, p))
    # alpha of each (s, a) pair
    self._alpha = lambda d, p, a: 1 / self._Nsa[d, p, a]
def mc_control(N0=100, num_episodes=1000):
    """
    Monte-Carlo control with an epsilon-greedy policy.

    :param N0: integer constant for the epsilon schedule
    :param num_episodes: positive integer
    :return: Q and V
    """
    env = Easy21()  # instantiate the env
    # Visit counters for states and state-action pairs
    NS = np.zeros((11, 22))
    NSA = np.zeros((2, 11, 22))
    # State-action value function
    Q = np.zeros((2, 11, 22))
    # Loop over episodes
    for i_episode in range(1, num_episodes + 1):
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        episode = []
        state = env.reset()
        episode.append(state)
        Gt = 0
        while True:
            # Update epsilon
            NS[state[0], state[1]] += 1
            epsilon = N0 / (N0 + NS[state[0], state[1]])
            action = epsilon_greedy(Q, state, epsilon)
            episode.append(action)
            NSA[action, state[0], state[1]] += 1
            state, reward, done = env.step(action)
            # Accumulate the return
            Gt += reward
            if done:
                break
            else:
                # Append the next state to the episode
                episode.append(state)
        # Episode ended: the episode list alternates state, action, state, action, ...
        # Update every visited (state, action) pair towards the return Gt, using a
        # step size of 1 / N(s, a) for each pair.
        for index, event in enumerate(episode):
            if index % 2 == 0:
                state = event
            else:
                action = event
                alpha = 1 / NSA[action, state[0], state[1]]
                Q[action, state[0], state[1]] += alpha * (Gt - Q[action, state[0], state[1]])
    return Q[:, 1:11, 1:22], np.max(Q[:, 1:11, 1:22], axis=0)
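# --- Hedged usage sketch (not part of the original code) ---
# End-to-end pipeline: estimate Q with Monte-Carlo control, then use it as the reference
# for tabular SARSA(lambda) and for the linear-function-approximation variant. The episode
# counts below are illustrative, not the values used in the original experiments.
if __name__ == "__main__":
    Q_MC, V_MC = mc_control(N0=100, num_episodes=50000)
    Q_sarsa, V_sarsa, mse_sarsa = sarsa_control(N0=100, num_episodes=10000,
                                                td_lambda=0.5, Q_MC=Q_MC)
    Q_lfa, V_lfa, mse_lfa = lfa_control(num_episodes=10000, td_lambda=0.5, Q_MC=Q_MC)
    print("\nSARSA(0.5) final MSE:", mse_sarsa[-1])
    print("LFA(0.5) final MSE:", mse_lfa[-1])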