def monte_carlo(iterations=1000000, policy=policies.epsilon_greedy, n_zero=100):
    """
    Performs Monte Carlo control in the Easy21 game.

    :param iterations: number of Monte Carlo iterations
    :param policy: exploration strategy to use
    :param n_zero: epsilon-greedy constant (only applicable if the epsilon-greedy policy is used)
    :return: value function and the plot of the optimal value function
    """
    # (player, dealer, action) key
    value_function = defaultdict(float)
    # (player, dealer) key
    counter_state = defaultdict(int)
    # (player, dealer, action) key
    counter_state_action = defaultdict(int)
    # number of wins
    wins = 0

    print('Iterations completed:')
    for i in range(iterations):
        if (i % 500000) == 0:
            print(i)

        # create a new random starting state
        state = environment.State()
        # play a round
        observed_keys = []
        while not state.terminal:
            player = state.player_sum
            dealer = state.dealer_first_card

            # find an action defined by the policy
            epsilon = n_zero / float(n_zero + counter_state[(player, dealer)])
            action = policy(epsilon, value_function, state)
            observed_keys.append((player, dealer, action))

            # take a step
            state, reward = environment.step(state, action)

        # we have reached the end of the episode
        if reward is not None:
            # update over all keys
            for key in observed_keys:
                # update counts
                counter_state[key[:-1]] += 1
                counter_state_action[key] += 1

                # update value function
                alpha = 1.0 / counter_state_action[key]
                value_function[key] += alpha * (reward - value_function[key])
            if reward == 1:
                wins += 1

    print('Wins: %.4f%%' % ((float(wins) / iterations) * 100))
    # plot the optimal value function
    plotting.plot_value_function(value_function)
    return value_function

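# The call above assumes a policy with the signature policy(epsilon, value_function, state).
# A minimal sketch of such an epsilon-greedy policy is given below; it is an assumption for
# illustration, not the actual policies.epsilon_greedy implementation, and it presumes the
# two Easy21 actions are encoded as 0 (stick) and 1 (hit).
import random

def epsilon_greedy_sketch(epsilon, value_function, state, actions=(0, 1)):
    # explore: pick a uniformly random action with probability epsilon
    if random.random() < epsilon:
        return random.choice(actions)
    # exploit: pick the action with the highest estimated value for this state
    key = (state.player_sum, state.dealer_first_card)
    return max(actions, key=lambda a: value_function[key + (a,)])
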
def q_network_test():
    env = BlackjackEnv()
    estimator = Estimator(0.001)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        V = q_network(env, sess, estimator, episode_num=10000)
        plotting.plot_value_function(V, title='Optimal Value Function')

def q_learning_test():
    env = BlackjackEnv()
    Q = q_learning(env, episode_nums=10000)
    V = defaultdict(float)
    for state, actions in Q.items():
        max_q = np.max(actions)
        V[state] = max_q
    plotting.plot_value_function(V, title='Optimal Value Function')

def mc_control_with_epsilon_greedy_test():
    env = BlackjackEnv()
    Q = mc_control_with_epsilon_greedy(env, episode_nums=10000)
    V = defaultdict(float)
    for state, actions in Q.items():
        max_q = np.max(actions)
        V[state] = max_q
    plotting.plot_value_function(V, title='Optimal Value Function')

def main():
    env = BlackjackEnv()
    actor = Actor()
    estimator = Estimator()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        V = ac_test4debug(sess, env, actor, estimator, episode_num=10000)
        plotting.plot_value_function(V, title='Optimal Value Function')

def td_network_test():
    env = BlackjackEnv()
    estimator = Estimator(learning_rate=0.003)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        V = td_network(env, sess, estimator)
        # print(sess.run(estimator.w))
        # print(sess.run(estimator.b))
        plotting.plot_value_function(V, title='Optimal Value')

def main():
    env = gym.make('Blackjack-v0')
    env.seed(SEED)

    V = mc_policy_eval(sample_policy, env, 10000)
    plot_value_function(V, title="10,000 Episodes")

    V = mc_policy_eval(sample_policy, env, 500000)
    plot_value_function(V, title="500,000 Episodes")

    env.close()

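# sample_policy is not defined in this snippet; elsewhere in this collection it is the usual
# blackjack baseline that sticks on 20 or 21 and hits otherwise, e.g.:
def sample_policy(observation):
    score, dealer_score, usable_ace = observation
    return 0 if score >= 20 else 1  # 0 = stick, 1 = hit
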
def dyna_q_test():
    env = BlackjackEnv()
    estimator = Estimator(0.003)
    model = Model(0.003)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        V = dyna_q(env, sess, estimator, model, episode_num=3000,
                   train_model_times=3000, train_with_model_times=3)
        plotting.plot_value_function(V, title='Optimal Value Function')

def main():
    env = gym.make('Blackjack-v0')
    env.seed(SEED)

    policy, Q = mc_control_importance_sampling(env, 500000)

    # For plotting: create the value function from the action-value function
    # by picking the best action at each state
    V = defaultdict(float)
    for state, actions in Q.items():
        action_value = np.max(actions)
        V[state] = action_value
    plot_value_function(V, title="Optimal Value Function")

    env.close()

returns_sum = defaultdict(float)
returns_count = defaultdict(float)
# the final value function
V = defaultdict(float)

for i_episode in range(1, num_episodes + 1):
    episode = []
    state = env.reset()
    for t in range(200):
        action = policy(state)
        next_state, reward, done = env.step(action)
        episode.append((state, action, reward))
        if done:
            break
        state = next_state

    states_in_episode = set([tuple(x[0]) for x in episode])
    for state in states_in_episode:
        # Find the first occurrence of the state in the episode
        first_occurence_idx = next(i for i, x in enumerate(episode) if x[0] == state)
        G = sum([x[2] * (discount_factor ** i)
                 for i, x in enumerate(episode[first_occurence_idx:])])
        returns_sum[state] += G
        returns_count[state] += 1.0
        V[state] = returns_sum[state] / returns_count[state]

import plotting
plotting.plot_value_function(V, title="10000 steps")

    sum_env, n_sims, omega=omega, epsilon=epsilon, init_val=init_val,
    episode_file=path_fun("sum_state"), warmup=warmup)
time_to_completion_sum = time.time() - start_time_sum

print("Number of explored states (sum states): " + str(len(sumQ)))
print("Cumulative avg. reward = " + str(sum_avg_reward))
print("Training time: \n " +
      "Expanded state space: {} \n Sum state space: {}".format(
          time_to_completion_expanded, time_to_completion_sum))

# Convert Q (extended state) to sum-state representation and make 3D plots
Q_conv = ql.convert_to_sum_states(Q, env)
V_conv = ql.convert_to_value_function(Q_conv)
V_conv_filt = ql.fill_missing_sum_states(ql.filter_states(V_conv))
pl.plot_value_function(V_conv_filt, title="Expanded state, " + str(decks) + " decks",
                       directory=plot_dir,
                       file_name="3D_exp_" + str(decks) + "_decks.png")

# Likewise make 3D plots for sumQ
V_sum = ql.convert_to_value_function(sumQ)
V_sum_filt = ql.fill_missing_sum_states(ql.filter_states(V_sum))
pl.plot_value_function(V_sum_filt, title="Sum state, " + str(decks) + " decks",
                       directory=plot_dir,
                       file_name="3D_sum_" + str(decks) + "_decks.png")

# create line plots
env_types = ["hand", "sum"]
fig, lgd = pl.plot_avg_reward_episode(directory, env_types, [str(decks)])
fig.savefig("{}/avgReturnEp_ndeck{}.png".format(plot_dir, decks),
            bbox_extra_artists=(lgd,), bbox_inches='tight')
matplotlib.pyplot.close(fig)

        for i, x in enumerate(episode):
            if x[0] == s_eps:
                first_visit_pos = i
                break  # first-visit MC: stop at the first occurrence of the state
        G = sum([e[2] * discount ** idx
                 for idx, e in enumerate(episode[first_visit_pos:])])
        return_sum[s_eps] += G
        return_count[s_eps] += 1.0
        V[s_eps] = return_sum[s_eps] * 1.0 / return_count[s_eps]
    return V


env = Blackjack()
V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
plotting.plot_value_function(V_10k, title="10,000 Steps")
V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
plotting.plot_value_function(V_500k, title="500,000 Steps")

        sa_in_episode = set([(tuple(x[0]), x[1]) for x in episode])
        for state, action in sa_in_episode:
            sa_pair = (state, action)
            # Find the first occurrence of the (state, action) pair in the episode
            first_occurence_idx = next(i for i, x in enumerate(episode)
                                       if x[0] == state and x[1] == action)
            # Sum up all rewards since the first occurrence
            G = sum([x[2] * (discount_factor ** i)
                     for i, x in enumerate(episode[first_occurence_idx:])])
            # Calculate the average return for this (state, action) pair over all sampled episodes
            returns_sum[sa_pair] += G
            returns_count[sa_pair] += 1.0
            Q[state][action] = returns_sum[sa_pair] / returns_count[sa_pair]

        # The policy is improved implicitly by changing the Q dictionary

    return Q, policy


Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)

# For plotting: create the value function from the action-value function
# by picking the best action at each state
V = defaultdict(float)
for state, actions in Q.items():
    action_value = np.max(actions)
    V[state] = action_value
plotting.plot_value_function(V, title="Optimal Value Function")

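# The control loop above samples actions from an epsilon-greedy policy derived from Q, but the
# helper that builds that policy is not shown in this excerpt. A sketch consistent with how
# epsilon and Q are used here follows; the name make_epsilon_greedy_policy and its exact form
# are assumptions for illustration.
import numpy as np

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        # start from uniform exploration probabilities ...
        probs = np.ones(nA, dtype=float) * epsilon / nA
        # ... then put the remaining probability mass on the currently best action
        best_action = np.argmax(Q[observation])
        probs[best_action] += 1.0 - epsilon
        return probs
    return policy_fn
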
    done = False
    while not done:
        reward, next_state = env.step(convert_agent_action(action))
        next_action = agent.take_action(state)
        if next_state == 'terminal':
            done = True
        transition = Transition(state, action, reward, next_state, next_action, done)
        agent.step(transition)
        if not done:
            state = next_state
            action = next_action

    last_episodes_rewards.append(reward)
    if episode % last_episodes_rewards.maxlen == 0:
        success_rates.append(
            last_episodes_rewards.count(1) / last_episodes_rewards.maxlen)
        episodes_x.append(episode)

success_rates.pop(0)
episodes_x.pop(0)
plotting.plot_value_function(agent.q_table)
plt.show()

        # states.append(state)
        # rewards.append(reward)
        state = next_state

    # num_states = len(states)
    # for i, s in enumerate(states):
    #     G = np.sum(np.array(
    #         rewards[i:]) * np.array([gamma ** i for i in range(0, num_states - i)]))
    #     N[s] += 1.0
    #     V[s] = V[s] + 0.01 * (G - V[s])
    return V


def naive_policy(state):
    player_hand, _, _ = state
    if player_hand >= 20:
        return 0
    else:
        return 1


if __name__ == '__main__':
    env = BlackjackEnv()
    steps = 200000
    v = TDPolicyEvaluation(env, naive_policy, steps, 1.0, 0.5)
    # print(v)
    plotting.plot_value_function(v, title="{} Steps".format(steps))

print("\r{} @ {}/{} ({})".format(t, i + 1, n_episodes, episode_reward[i]), end="") if done: break state = next_state G = 0 for state, reward, action in episode[::-1]: G = reward + discount * G for state, reward, action in episode: N[state][action] += 1 Q[state][action] += (G - Q[state][action]) / N[state][action] G = (G - reward) / discount print() return Q, episode_reward, episode_length Q, rewards, lengths = mc(env, 800000) plt.plot(pd.Series(rewards).rolling(10000, min_periods=10000).mean()) plt.show() plotting.plot_value_function(np.amax(Q, 2)) plotting.plot_value_function(np.argmax(Q, 2), title="Policy function")
        W = 1
        prob_b = prob_b[::-1]
        for idx, eps in enumerate(episode[::-1]):
            state, action, reward = eps
            pair = (state, action)
            G = discount_factor * G + reward
            return_count[pair] += W
            Q[state][action] += W * 1.0 / return_count[pair] * (G - Q[state][action])
            target_policy[state] = np.argmax(Q[state])
            if target_policy[state] != action:
                break
            W = W * 1.0 / prob_b[idx]

    return Q


env = Blackjack()
Q = Off_policy_MC_Control(env, episode_nums=500000)
V = defaultdict(float)
for state, actions in Q.items():
    action_value = np.max(actions)
    V[state] = action_value
plotting.plot_value_function(V, title="Optimal Value Function")

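# The weighted importance-sampling update above consumes an episode together with prob_b,
# the probability the behaviour policy assigned to each action it actually took. A sketch of
# one way to generate them, assuming a uniformly random behaviour policy over nA actions and
# a gym-style env.step returning (next_state, reward, done, info), is shown below; the actual
# Blackjack environment used by this snippet may differ.
import numpy as np

def generate_behaviour_episode(env, nA=2):
    episode, prob_b = [], []
    state = env.reset()
    while True:
        action = np.random.randint(nA)            # uniform behaviour policy
        next_state, reward, done, _ = env.step(action)
        episode.append((state, action, reward))
        prob_b.append(1.0 / nA)                   # b(a|s) for the action taken
        if done:
            return episode, prob_b
        state = next_state
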
    def plot_action_graph(self):
        plotting.plot_value_function(self.state_action_map, title="100,000 Steps")

            Q[state][action] += (G - Q[state][action]) / N[state][action]
            G = (G - reward) / discount

    print()
    return Q, episode_reward, episode_length


Qtrue, _, _ = mc(env, 1000000)

sqerrs = []
lambdas = np.arange(0, 1.01, 0.1)
for lambda0 in lambdas:
    Q, err = sarsa(env, 1000, 1.0, 0.1, 0.05, lambda0, Qtrue)
    sqerrs.append(err)

plt.plot(sqerrs[0])
plt.plot(sqerrs[-1])
plt.title("Q mse over episodes")
plt.xlabel("episode")
plt.ylabel("Q mse")
plt.legend(["lambda=0", "lambda=1"])
plt.show()

plt.plot(lambdas, [err[-1] for err in sqerrs])
plt.title("Q mse for different lambda")
plt.xlabel("lambda")
plt.ylabel("Q mse")
plt.show()

plotting.plot_value_function(np.amax(Qtrue, axis=2))
plotting.plot_value_function(np.amax(Q, axis=2))

            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            state = next_state

        # Compute state values
        for state, action, reward in episode:
            firstOccurence = next(i for i, x in enumerate(episode) if x[0] == state)
            G = sum([x[2] * discount_factor ** i
                     for i, x in enumerate(episode[firstOccurence:])])
            returns_sum[state] += G
            returns_count[state] += 1.0
            V[state] = returns_sum[state] / returns_count[state]
            break

    return V


def sample_policy(observation):
    score, dealer_score, usable_ace = observation
    return 0 if score >= 20 else 1


matplotlib.style.use('ggplot')
env = BlackjackEnv()

V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
plotting.plot_value_function(V_10k, title="10,000 Steps")

V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
plotting.plot_value_function(V_500k, title="500,000 Steps")

N = 100000
M = 1000
x = (np.random.rand(N) - 0.5) * 8
w_x = p(x) / q(x)
w_x = w_x / sum(w_x)
w_xc = np.cumsum(w_x)  # used for uniform quantile inverse

# resample from x with replacement with probability w_x
X = np.array([])
for i in range(M):
    u = np.random.rand()
    X = np.hstack((X, x[w_xc > u][0]))

x = np.linspace(-4, 4, 500)
plt.plot(x, p(x))
plt.hist(X, bins=100, density=True)  # 'normed' was removed in recent matplotlib
plt.title('Sampling Importance Resampling')
plt.show()


if __name__ == '__main__':
    policy = sample_policy
    env = gym.make('Blackjack-v0')
    # V = mc_prediction(policy, env, num_episodes=80000)
    # plottig(V)
    Q, policy = mc_control_epsilon_greedy(env, num_episodes=100000, epsilon=0.1)
    V = defaultdict(float)
    for state, action in Q.items():
        action_value = np.max(action)
        V[state] = action_value
    plot_value_function(V)
    # importance_sampling()
    pass

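# The sampling-importance-resampling demo above uses p (target density) and q (proposal
# density) without defining them. One plausible pair, consistent with proposal samples drawn
# uniformly from [-4, 4], is a bimodal Gaussian-mixture target and the matching uniform
# proposal; both definitions below are assumptions for illustration only.
import numpy as np

def p(x):
    # equal mixture of two unit-variance Gaussians centred at -1.5 and +1.5
    norm = 1.0 / np.sqrt(2 * np.pi)
    return 0.5 * norm * np.exp(-0.5 * (x - 1.5) ** 2) + 0.5 * norm * np.exp(-0.5 * (x + 1.5) ** 2)

def q(x):
    # uniform proposal density on [-4, 4]
    return np.full_like(np.asarray(x, dtype=float), 1.0 / 8.0)
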