from collections import defaultdict
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
from progressbar import ProgressBar

# Environment helpers referenced below (State, step, HIT, STICK,
# epsilon_greedy_policy, LinearFunction, update_action_value_function,
# calculate_mse, compute_mse, action_value_to_value_function,
# plot_value_function) are assumed to be defined elsewhere in this repository.


def monte_carlo_control():
    """Monte-Carlo control with a GLIE epsilon-greedy policy (Question 2)."""
    action_value_function = defaultdict(float)
    n_s = defaultdict(int)    # visit counts N(s)
    n_s_a = defaultdict(int)  # visit counts N(s, a)
    n_zero = 1e5
    episodes = xrange(int(1e8))

    pbar = ProgressBar(maxval=len(episodes)).start()
    for episode in episodes:
        state = State()
        while not state.terminal:
            player = state.player
            dealer = state.dealer

            # GLIE exploration: epsilon = N0 / (N0 + N(s)) decays as the
            # state is visited more often
            epsilon = float(n_zero) / (n_zero + n_s[(dealer, player)])
            action = epsilon_greedy_policy(action_value_function, state, epsilon)

            n_s[(dealer, player)] += 1
            n_s_a[(dealer, player, action)] += 1

            reward = step(state, action)

            # incremental Monte-Carlo update of the action-value function
            # with per-pair step size alpha = 1 / N(s, a)
            alpha = 1.0 / n_s_a[(dealer, player, action)]
            current_estimate = action_value_function[(dealer, player, action)]
            action_value_function[(dealer, player, action)] += \
                alpha * (reward - current_estimate)
        pbar.update(episode)
    pbar.finish()

    value_function = action_value_to_value_function(action_value_function)
    plot_value_function(value_function, "Optimal Value Function: Question 2")
    return action_value_function
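# epsilon_greedy_policy is not shown in this file. A minimal tabular sketch
# is given below, assuming HIT/STICK action constants and the
# (dealer, player, action) keying used above; the function name and
# tie-breaking rule are assumptions. The linear-approximation sarsa() below
# additionally passes the current feature vector so Q-values can be read
# from the feature-keyed dictionary instead.
import random

def epsilon_greedy_policy_sketch(action_value_function, state, epsilon):
    # with probability epsilon explore uniformly, otherwise act greedily
    if random.random() < epsilon:
        return random.choice([HIT, STICK])
    q_hit = action_value_function[(state.dealer, state.player, HIT)]
    q_stick = action_value_function[(state.dealer, state.player, STICK)]
    return HIT if q_hit >= q_stick else STICK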
# SARSA(lambda) with a linear function approximator. Note: this function and
# the tabular sarsa() below share a name; the different savefig prefixes
# ("lapprox_mse_..." vs "mse_...") suggest they come from separate files, so
# rename one of them if both are kept in a single module.
def sarsa(lambd):
    """SARSA(lambda) with linear approximation over 18 binary features."""
    n_episodes = 1000
    epi_batch = 100
    episodes = xrange(n_episodes)

    action_value_function = defaultdict(float)
    linear_function = LinearFunction()
    params_hit = np.zeros(18)    # weights for Q(s, HIT)
    params_stick = np.zeros(18)  # weights for Q(s, STICK)
    epsilon = 0.05               # constant exploration rate
    alpha = 0.01                 # constant step size

    if lambd == 0.0 or lambd == 1.0:
        mses = []

    for episode in episodes:
        if episode % epi_batch == 0 and (lambd == 0.0 or lambd == 1.0):
            mses.append(calculate_mse(action_value_function))

        # initialize state, features, action, and eligibility traces
        state = State()
        linear_function.update(state)
        current_feats = linear_function.get_features()
        action = epsilon_greedy_policy(action_value_function, state, epsilon,
                                       current_feats)
        eligibility_hit = np.zeros(18)
        eligibility_stick = np.zeros(18)

        while not state.terminal:
            np_feats = np.array(current_feats)
            # accumulate the trace of the action actually taken
            if action == HIT:
                eligibility_hit += np_feats
            else:
                eligibility_stick += np_feats

            reward = step(state, action)
            linear_function.update(state)
            new_features = linear_function.get_features()

            # refresh the cached Q-values at the new features so the
            # epsilon-greedy policy and the MSE computation read
            # up-to-date estimates
            update_action_value_function(action_value_function,
                                         (new_features, HIT), params_hit)
            update_action_value_function(action_value_function,
                                         (new_features, STICK), params_stick)

            # choose the successor action, with Q(terminal, .) = 0
            if state.terminal:
                new_action = None
                new_value = 0.0
            else:
                new_action = epsilon_greedy_policy(action_value_function,
                                                   state, epsilon, new_features)
                new_value = action_value_function[(tuple(new_features), new_action)]

            # TD error: delta = r + Q(s', a') - Q(s, a)
            params = params_hit if action == HIT else params_stick
            delta = reward + new_value - np_feats.dot(params)

            # gradient step on the weights, then decay the traces
            params_hit += alpha * delta * eligibility_hit
            params_stick += alpha * delta * eligibility_stick
            eligibility_hit *= lambd
            eligibility_stick *= lambd

            # move on to the next state-action pair
            current_feats = new_features
            action = new_action

    if lambd == 0.0 or lambd == 1.0:
        mses.append(calculate_mse(action_value_function))
        # plot the learning curve of MSE against episode number
        print "Plotting learning curve for lambda =", lambd
        x = range(0, n_episodes + 1, epi_batch)
        plt.figure()
        plt.title(r'Learning curve of MSE against episodes @ $\lambda$ = '
                  + str(lambd))
        plt.xlabel("episode number")
        plt.xlim([0, n_episodes])
        plt.xticks(range(0, n_episodes + 1, epi_batch))
        plt.ylabel("Mean-Squared Error (MSE)")
        plt.plot(x, mses)
        fname = "lapprox_mse_lambda%f_%s.png" % (lambd, str(datetime.now()))
        plt.savefig(fname)
        # plt.show()

    return calculate_mse(action_value_function)
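# LinearFunction is not shown in this file. A minimal sketch of the feature
# construction it presumably performs is below, assuming the standard Easy21
# coarse coding: 3 overlapping dealer intervals x 6 overlapping player
# intervals = 18 binary indicator features, which matches the length of
# params_hit / params_stick above. The helper name and the exact interval
# boundaries are assumptions.
DEALER_INTERVALS = [(1, 4), (4, 7), (7, 10)]
PLAYER_INTERVALS = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]

def coarse_features_sketch(dealer, player):
    # one indicator per (dealer interval, player interval) pair
    features = []
    for d_lo, d_hi in DEALER_INTERVALS:
        for p_lo, p_hi in PLAYER_INTERVALS:
            inside = d_lo <= dealer <= d_hi and p_lo <= player <= p_hi
            features.append(1 if inside else 0)
    return features  # length 18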
# Tabular SARSA(lambda) with a GLIE epsilon-greedy policy.
def sarsa(lambd):
    """Tabular SARSA(lambda) with eligibility traces."""
    n_episodes = 1000
    epi_batch = 100
    episodes = xrange(n_episodes)

    action_value_function = defaultdict(float)
    n_zero = 100
    n_s = defaultdict(int)    # visit counts N(s)
    n_s_a = defaultdict(int)  # visit counts N(s, a)

    if lambd == 0.0 or lambd == 1.0:
        mses = []

    for episode in episodes:
        if episode % epi_batch == 0 and (lambd == 0.0 or lambd == 1.0):
            mses.append(compute_mse(action_value_function))

        # initialize state, action, epsilon, and eligibility trace
        state = State()
        current_dealer = state.dealer
        current_player = state.player
        epsilon = float(n_zero) / (n_zero + n_s[(current_dealer, current_player)])
        current_action = epsilon_greedy_policy(action_value_function, state, epsilon)
        eligibility_trace = defaultdict(float)

        while not state.terminal:
            n_s[(current_dealer, current_player)] += 1
            n_s_a[(current_dealer, current_player, current_action)] += 1

            reward = step(state, current_action)

            # choose the successor action, with Q(terminal, .) = 0
            if state.terminal:
                new_action = None
                new_action_value = 0.0
            else:
                new_dealer = state.dealer
                new_player = state.player
                epsilon = float(n_zero) / (n_zero + n_s[(new_dealer, new_player)])
                new_action = epsilon_greedy_policy(action_value_function, state, epsilon)
                new_action_value = action_value_function[(new_dealer, new_player, new_action)]

            # TD error and accumulating trace for the current pair
            alpha = 1.0 / n_s_a[(current_dealer, current_player, current_action)]
            prev_action_value = action_value_function[(current_dealer, current_player, current_action)]
            delta = reward + new_action_value - prev_action_value
            eligibility_trace[(current_dealer, current_player, current_action)] += 1

            # backward view: every pair with a non-zero trace moves toward
            # the TD target, then its trace decays by lambda
            for key in eligibility_trace.keys():
                action_value_function[key] += alpha * delta * eligibility_trace[key]
                eligibility_trace[key] *= lambd

            # move on to the next state-action pair
            if not state.terminal:
                current_dealer = new_dealer
                current_player = new_player
                current_action = new_action

    if lambd == 0.0 or lambd == 1.0:
        mses.append(compute_mse(action_value_function))
        # plot the learning curve of MSE against episode number
        print "Plotting learning curve for lambda =", lambd
        x = range(0, n_episodes + 1, epi_batch)
        plt.figure()
        plt.title(r'Learning curve of MSE against episode number: $\lambda$ = '
                  + str(lambd))
        plt.xlabel("episode number")
        plt.xlim([0, n_episodes])
        plt.xticks(range(0, n_episodes + 1, epi_batch))
        plt.ylabel("Mean-Squared Error (MSE)")
        plt.plot(x, mses)
        fname = "mse_lambda%f_%s.png" % (lambd, str(datetime.now()))
        plt.savefig(fname)
        # plt.show()

    return compute_mse(action_value_function)
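# Usage sketch (hypothetical driver, not part of the original code): sweep
# lambda over [0, 1] and plot the final MSE returned by sarsa() against
# lambda, the experiment the per-episode learning curves above feed into.
def sweep_lambda_sketch():
    lambdas = [i / 10.0 for i in range(11)]  # 0.0, 0.1, ..., 1.0
    final_mses = [sarsa(lambd) for lambd in lambdas]

    plt.figure()
    plt.title(r'Final MSE against $\lambda$')
    plt.xlabel(r'$\lambda$')
    plt.ylabel("Mean-Squared Error (MSE)")
    plt.plot(lambdas, final_mses, marker="o")
    plt.savefig("mse_vs_lambda.png")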