def main():
    env = gym.envs.make("MountainCar-v0")

    # Feature Preprocessing: Normalize to zero mean and unit variance
    # We use a few samples from the observation space to do this
    observation_examples = np.array(
        [env.observation_space.sample() for x in range(10000)])
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Used to convert a state to a featurized representation.
    # We use RBF kernels with different variances to cover different parts of the space
    featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
    ])
    featurizer.fit(scaler.transform(observation_examples))

    estimator = Estimator(env, scaler, featurizer)

    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic" which leads
    # to the exploration of all states.
    stats = q_learning(env, estimator, 100, epsilon=0.0)

    plotting.plot_cost_to_go_mountain_car(env, estimator)
    plotting.plot_episode_stats(stats, smoothing_window=25)
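# --- Illustrative sketch (not part of the original example) ---
# The main() above assumes an `Estimator` class that is not shown here. A minimal
# version, fitting one SGDRegressor per action on top of the scaler/featurizer
# pipeline, might look roughly like the following; treat it as a sketch under those
# assumptions, not the original implementation.
import numpy as np
from sklearn.linear_model import SGDRegressor


class Estimator(object):
    def __init__(self, env, scaler, featurizer):
        self.scaler = scaler
        self.featurizer = featurizer
        # One linear model per discrete action, trained incrementally with SGD.
        self.models = []
        for _ in range(env.action_space.n):
            model = SGDRegressor(learning_rate="constant")
            # Call partial_fit once so predict() works before the first real update.
            model.partial_fit([self.featurize_state(env.reset())], [0])
            self.models.append(model)

    def featurize_state(self, state):
        # Scale the raw observation, then project it onto the RBF features.
        scaled = self.scaler.transform([state])
        return self.featurizer.transform(scaled)[0]

    def predict(self, state, action=None):
        features = self.featurize_state(state)
        if action is None:
            return np.array([m.predict([features])[0] for m in self.models])
        return self.models[action].predict([features])[0]

    def update(self, state, action, target):
        # One SGD step towards the TD target for the chosen action.
        features = self.featurize_state(state)
        self.models[action].partial_fit([features], [target])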
def run_qlearning(self, max_number_of_episodes=100, interactive=False, display_frequency=1):

    if interactive:
        plt.ion()
        plt.show()
    else:
        plt.close()

    # repeat for each episode
    for episode_number in range(max_number_of_episodes):

        # initialize state
        state = self.env.reset()

        done = False  # used to indicate terminal state
        R = 0         # used to display accumulated rewards for an episode
        t = 0         # used to display accumulated steps for an episode, i.e. episode length

        # repeat for each step of episode, until state is terminal
        while not done:

            t += 1  # increase step counter - for display

            # choose action from state using policy derived from Q
            action = self.agent.act(state)

            # take action, observe reward and next state
            next_state, reward, done, _ = self.env.step(action)

            # agent learn (Q-Learning update)
            self.agent.learn(state, action, reward, next_state, done)

            # state <- next state
            state = next_state

            R += reward  # accumulate reward - for display

            # if interactive display, show update for each step
            if interactive:
                self.update_display_step()

        self.episode_length = np.append(self.episode_length, t)  # keep episode length - for display
        self.episode_reward = np.append(self.episode_reward, R)  # keep episode reward - for display

        # if interactive display, show update for the episode
        if interactive:
            self.update_display_episode()

    # if not interactive display, show graph at the end
    if not interactive:
        self.fig.clf()
        stats = plotting.EpisodeStats(
            episode_lengths=self.episode_length,
            episode_rewards=self.episode_reward,
            episode_running_variance=np.zeros(max_number_of_episodes))
        plotting.plot_episode_stats(stats, display_frequency)
def main():
    print "PolyRL Q Learning"

    # discretizing the action space
    action_space = np.linspace(env.min_action, env.max_action, num=10)
    w_param = np.random.normal(size=(400, action_space.shape[0] + 1))
    print "Action Space", action_space

    num_episodes = 200
    smoothing_window = 100

    stats_poly_q_learning = poly_rl_q_learning(env, w_param, num_episodes, epsilon=0.1)
    rewards_smoothed_stats_poly_q_learning = pd.Series(
        stats_poly_q_learning.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_smoothed_stats_poly_q_learning
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Persistence_Length_Exploration/Results/'
        + 'Trial_PolyRL' + '.npy', cum_rwd)

    plotting.plot_episode_stats(stats_poly_q_learning)
    env.close()
def main():
    print "Tree Backup (lambda)"

    theta = np.zeros(shape=(400, env.action_space.n))
    num_episodes = 1000
    smoothing_window = 1

    # stats_sarsa_tb_lambda, cumulative_errors = q_sigma_lambda_on_policy_static_sigma(env, theta, num_episodes)
    # rewards_smoothed_stats_tb_lambda = pd.Series(stats_sarsa_tb_lambda.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_smoothed_stats_tb_lambda
    # cum_err = cumulative_errors
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/WindyGrid_Results/' + 'Q_sigma_lambda_OnPolicy_Static_Sigma_RBF_Cum_Rwd_2' + '.npy', cum_rwd)
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/WindyGrid_Results/' + 'Q_sigma_lambda_OnPolicy_Static_Sigma_RBF_Cum_Err_2' + '.npy', cum_err)
    # plotting.plot_episode_stats(stats_sarsa_tb_lambda)
    # env.close()

    stats_sarsa_tb_lambda, cumulative_errors = q_sigma_lambda_on_policy_dynamic_sigma(
        env, theta, num_episodes)
    rewards_smoothed_stats_tb_lambda = pd.Series(
        stats_sarsa_tb_lambda.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_smoothed_stats_tb_lambda
    cum_err = cumulative_errors

    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/WindyGrid_Results/'
        + 'Q_sigma_lambda_OnPolicy_Dynamic_Sigma_RBF_Cum_Rwd_2' + '.npy', cum_rwd)
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/WindyGrid_Results/'
        + 'Q_sigma_lambda_OnPolicy_Dynamic_Sigma_RBF_Cum_Err_2' + '.npy', cum_err)

    plotting.plot_episode_stats(stats_sarsa_tb_lambda)
    env.close()
def test_td_control_method(env):
    """
    plot_episode_stats([test_expected_sarsa_method(env), test_n_setps_expected_sarsa_method(env),
                        test_off_policy_n_steps_sarsa(env), test_n_steps_sarsa_method(env),
                        test_qlearning_method(env), test_sarsa_lambda_method(env),
                        test_q_lambda_method(env)])
    """
    plot_episode_stats([
        test_qlearning_method(env),
        test_q_lambda_method(env),
        test_double_q_learning_method(env),
        test_dynaQ_method_trival(env),
        test_dynaQ_method_priority(env)
    ])
def main():
    env = gym.make('MountainCar-v0')
    outdir = './experiment-results'
    # env = wrappers.Monitor(env, directory=outdir, force=True)

    # Keeps track of useful statistics
    num_episodes = 300
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Feature Preprocessing: Normalize to zero mean and unit variance
    # We use a few samples from the observation space to do this
    observation_examples = np.array(
        [env.observation_space.sample() for x in range(10000)])
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Used to convert a state to a featurized representation.
    # We use RBF kernels with different variances to cover different parts of the space
    featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
    ])
    featurizer.fit(scaler.transform(observation_examples))

    agent = Agent(env.action_space.n,
                  scaler,
                  featurizer,
                  env.observation_space.sample(),
                  epsilon=0,
                  gamma=1)

    for i_episode in range(num_episodes):
        print("\rEpisode {}/{} ({})".format(
            i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]),
            end="")
        sys.stdout.flush()

        state = env.reset()
        action = agent.set_initial_state(state)
        for t in itertools.count():
            next_state, reward, done, info = env.step(action)
            action = agent.act(next_state, reward)

            # book-keeping
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break

    env.close()
    # gym.upload(outdir, api_key='sk_9YxUhFDaT5XSahcLut47w')
    plotting.plot_cost_to_go_mountain_car(env, agent.Q)
    plotting.plot_episode_stats(stats, smoothing_window=25)
def main():
    matplotlib.style.use('ggplot')
    env = gym.envs.make("MountainCar-v0")
    num_episodes = 100

    estimator_q_learning = tile_coding_estimator.Estimator(env)
    statistics_q_learning = plotting.EpisodeStats(
        "q_learning",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic" which leads
    # to the exploration of all states.
    q_learning_tile_coding.q_learning(env,
                                      estimator_q_learning,
                                      num_episodes,
                                      statistics_q_learning,
                                      epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator_q_learning)

    estimator_sarsa = tile_coding_estimator.Estimator(env)
    statistics_sarsa = plotting.EpisodeStats(
        "sarsa",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic" which leads
    # to the exploration of all states.
    sarsa_tile_coding.sarsa(env,
                            estimator_sarsa,
                            num_episodes,
                            statistics_sarsa,
                            epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator_sarsa)

    estimator_expected_sarsa = tile_coding_estimator.Estimator(env)
    statistics_expected_sarsa = plotting.EpisodeStats(
        "expected_sarsa",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic" which leads
    # to the exploration of all states.
    expected_sarsa_tile_coding.expected_sarsa(env,
                                              estimator_expected_sarsa,
                                              num_episodes,
                                              statistics_expected_sarsa,
                                              epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator_expected_sarsa)

    plotting.plot_episode_stats(
        [statistics_q_learning, statistics_sarsa, statistics_expected_sarsa],
        smoothing_window=25)
def main():
    estimator = Estimator()
    num_episodes = 5000

    stats = q_learning(env, estimator, num_episodes, epsilon=0.1)
    cum_rwd = save_cum_rwd(stats, smoothing_window=1)

    plotting.plot_episode_stats(stats)
    env.close()
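# --- Illustrative sketch (assumption, not shown in the original example) ---
# `save_cum_rwd` is not defined here. Other examples in this collection smooth the
# per-episode rewards with a rolling mean and persist them via np.save, so it is
# presumably a small helper along those lines; the output path below is a placeholder.
import numpy as np
import pandas as pd


def save_cum_rwd(stats, smoothing_window=1, path='cum_rwd.npy'):
    # Rolling-mean smoothing of the episode rewards, saved to disk for later plotting.
    cum_rwd = pd.Series(stats.episode_rewards).rolling(
        smoothing_window, min_periods=smoothing_window).mean()
    np.save(path, cum_rwd)
    return cum_rwd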
def test_approximation_control_method(env):
    episode_stats = [
        test_approximation_control_sarsa(env),
        test_approximation_control_expected_sarsa(env),
        test_approximation_control_q_learning(env)
    ]
    plotting.plot_episode_stats(episode_stats)
    plotting.plot_3d_q_value(env, episode_stats)
def main():
    Q, stats = sarsa(env, 50000)
    # plotting.plot_episode_stats(stats)
    # V = defaultdict(float)
    # for state, actions in Q.items():
    #     action_value = np.max(actions)
    #     V[state] = action_value
    # plotting.plot_value_function(V, title="Optimal Value Function")
    plotting.plot_episode_stats(stats)
def main():
    global num_of_load
    global num_of_dump
    global num_of_return
    global state
    global old_state
    global old_time
    global Mean_TD_Error
    global Iterations
    global nTrucks

    BucketA_capacity = 1.5
    BucketB_capacity = 1.0
    Truck1_capacity = 6
    Truck2_capacity = 3
    Truck1_speed = 15.0
    Truck2_speed = 20.0
    Truck1_speedRatio = Truck1_speed / (Truck1_speed + Truck2_speed)
    Truck2_speedRatio = Truck2_speed / (Truck1_speed + Truck2_speed)

    # run session (initialise tf global vars)
    sess.run(init)

    num_episodes = 200

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  episode_loss=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        # reset global vars
        num_of_load = 0
        num_of_dump = 0
        num_of_return = 0
        state = np.zeros(12)
        old_state = np.zeros((nTrucks, 12))
        old_time = np.zeros(nTrucks)
        Mean_TD_Error = 0
        Iterations = 0

        # Print out which episode we're on, useful for debugging.
        if i_episode % 1 == 0:
            print "\rEpisode: ", i_episode + 1, " / ", num_episodes

        # run simulation
        run_sim(nTrucks, BucketA_capacity, BucketB_capacity, Truck1_capacity,
                Truck2_capacity, Truck1_speedRatio, Truck2_speedRatio)

        stats.episode_lengths[i_episode] = Hrs[i_episode]
        stats.episode_rewards[i_episode] = ProdRate[i_episode]
        stats.episode_loss[i_episode] = abs(Mean_TD_Error)

    plotting.plot_episode_stats(stats, name='Linear_Qlearning', smoothing_window=20)
def main():
    theta = np.random.normal(size=(400, env.action_space.n))
    num_episodes = 200
    print "Running for Total Episodes", num_episodes

    smoothing_window = 1

    stats_q_sigma_off_policy = Q_Sigma_Off_Policy_2_Step(env, theta, num_episodes, epsilon=0.1)
    rewards_smoothed_stats_q_sigma_off_policy = pd.Series(
        stats_q_sigma_off_policy.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_smoothed_stats_q_sigma_off_policy
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/raw_results/'
        + 'Off_Policy_Q_Sigma_2_step' + '.npy', cum_rwd)

    plotting.plot_episode_stats(stats_q_sigma_off_policy)
    env.close()
def main():
    print "SARSA(lambda)"

    theta = np.zeros(shape=(total_states))
    num_episodes = 5000
    smoothing_window = 1

    stats_sarsa_lambda = sarsa_lambda(env, theta, num_episodes)
    rewards_smoothed_stats_sarsa_lambda = pd.Series(
        stats_sarsa_lambda.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_smoothed_stats_sarsa_lambda
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/Cliff_Walking_Results/'
        + 'sarsa_lambda_rbf' + '.npy', cum_rwd)

    plotting.plot_episode_stats(stats_sarsa_lambda)
    env.close()
def main():
    tf.reset_default_graph()

    global_step = tf.Variable(0, name="global_step", trainable=False)
    policy_estimator = PolicyEstimator()
    value_estimator = ValueEstimator()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Note, due to randomness in the policy the number of episodes you need to learn a good
        # policy may vary. ~300 seemed to work well for me.
        stats = actor_critic(env, policy_estimator, value_estimator, 300)

    plotting.plot_episode_stats(stats, smoothing_window=25)
def main():
    print "Q Learning"

    theta = np.random.normal(size=(400, env.action_space.n))
    num_episodes = 2000
    smoothing_window = 200

    stats_q_learning = q_learning(env, theta, num_episodes, epsilon=0.1)
    rewards_smoothed_stats_q_learning = pd.Series(
        stats_q_learning.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_smoothed_stats_q_learning
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Persistence_Length_Exploration/Results/'
        + 'Trial_Q_Learning' + '.npy', cum_rwd)

    plotting.plot_episode_stats(stats_q_learning)
    env.close()
def q_learning_cliff_walking(Q, env, eps=200, epsilon=0.05, alpha=0.5, discount_factor=1.0, timesteps=800):
    nS = env.nS
    nA = env.nA
    episode_reward = []
    episode_length = []
    init_epsilon = epsilon

    for ep in range(eps):
        epsilon -= (init_epsilon * 0.0005)
        epsilon = max(epsilon, 0)
        policy = fn_policy(Q, env, epsilon=epsilon)
        current_state = env.reset()
        current_action = np.random.choice(np.arange(nA), p=policy[current_state])
        total_reward = 0

        for ts in range(timesteps):
            next_state, reward, done, prob = env.step(current_action)
            total_reward += reward
            next_action = np.random.choice(np.arange(nA), p=policy[next_state])
            next_greedy_action = np.argmax(policy[next_state])
            Q[current_state][current_action] = Q[current_state][current_action] + alpha * (
                reward + (discount_factor * Q[next_state][next_greedy_action])
                - Q[current_state][current_action])
            next_action_array = np.ones(nA) * epsilon / nA
            next_action_array[np.argmax(Q[current_state])] += (1 - epsilon)
            policy[current_state] = next_action_array

            if done:
                print("Episode {} ended after {} timesteps with total reward of {}"
                      .format(ep, ts, total_reward))
                episode_length.append(ts)
                episode_reward.append(total_reward)
                break

            current_state = next_state
            current_action = next_action

    stats = plotting.EpisodeStats(episode_lengths=episode_length,
                                  episode_rewards=episode_reward)
    plotting.plot_episode_stats(stats)
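# --- Illustrative sketch (assumption, not shown in the original examples) ---
# `fn_policy` is used above (and in the SARSA grid-world example below, with the
# arguments in the opposite order) but never defined in this collection. From its
# usage it appears to build an epsilon-greedy policy table from Q, roughly like this:
import numpy as np


def fn_policy(Q, env, epsilon=0.05):
    # One probability row per state: epsilon spread uniformly over all actions,
    # with the remaining (1 - epsilon) mass on the current greedy action.
    policy = np.ones((env.nS, env.nA)) * epsilon / env.nA
    for s in range(env.nS):
        policy[s][np.argmax(Q[s])] += 1 - epsilon
    return policy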
def main():
    tf.reset_default_graph()

    global_step = tf.Variable(0, name="global_step", trainable=False)
    policy_estimator = PolicyEstimator()
    value_estimator = ValueEstimator(env)

    num_episodes = 1000

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        stats = actor_critic(env, policy_estimator, value_estimator, num_episodes)

    plotting.plot_episode_stats(stats, smoothing_window=25)
    env.close()
def main():
    print "True Online Tree Backup (lambda)"

    # theta = np.random.normal(size=(400))
    theta = np.zeros(shape=(400, env.action_space.n))
    num_episodes = 1000
    smoothing_window = 1

    stats_sarsa_tb_lambda, cumulative_errors = true_online_tree_backup_lambda(env, theta, num_episodes)
    rewards_smoothed_stats_tb_lambda = pd.Series(
        stats_sarsa_tb_lambda.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_smoothed_stats_tb_lambda
    cum_err = cumulative_errors

    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/WindyGrid_Results/'
        + 'True_Online_Tree_Backup_RBF_Cum_Rwd_2' + '.npy', cum_rwd)
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/WindyGrid_Results/'
        + 'True_Online_Tree_Backup_RBF_Cum_Err_2' + '.npy', cum_err)

    plotting.plot_episode_stats(stats_sarsa_tb_lambda)
    env.close()
def main():
    env = CliffWalkingEnv()
    num_episodes = 500

    tf.reset_default_graph()
    tf.Variable(0, name="global_step", trainable=False)

    policyEstimatorReinforce = policy_estimator.PolicyEstimator(
        env, scope="Policy_Estimator_Reinforce")
    valueEstimatorReinforce = value_estimator.ValueEstimator(
        env, scope="Value_Estimator_Reinforce")
    statistics_reinforce = plotting.EpisodeStats(
        "Reinforce",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    policyEstimatorAC = policy_estimator.PolicyEstimator(
        env, scope="Policy_Estimator_AC")
    valueEstimatorAC = value_estimator.ValueEstimator(
        env, scope="Value_Estimator_AC")
    statistics_ac = plotting.EpisodeStats(
        "AC",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        # Note, due to randomness in the policy the number of episodes you need to learn a good
        # policy may vary. ~2000-5000 seemed to work well for me.
        reinforce.reinforce(env,
                            statistics_reinforce,
                            policyEstimatorReinforce,
                            valueEstimatorReinforce,
                            num_episodes,
                            discount_factor=1.0)
        actor_critic.actor_critic(env, statistics_ac, policyEstimatorAC,
                                  valueEstimatorAC, num_episodes)

    plotting.plot_episode_stats([statistics_reinforce, statistics_ac],
                                smoothing_window=25)
def main():
    parser = argparse.ArgumentParser(
        description='Run Reinforcement Learning at an Office in Tsinghua University')
    parser.add_argument('--env', default='band_control-v0', help='Environment name')
    parser.add_argument('-o', '--output', default='office-QN-Rh', help='Directory to save data to')
    parser.add_argument('--num', default=500, help='Number of episodes')
    parser.add_argument('--gamma', default=0.95, help='Discount factor')
    parser.add_argument('--alpha', default=0.5, help='Constant step-size parameter')
    parser.add_argument('--epsilon', default=0.05, help='Epsilon-greedy policy')
    parser.add_argument('--epsilon_min', default=0.05, help='Smallest epsilon value allowed')
    parser.add_argument('--epsilon_decay', default=0.9, help='Epsilon decay applied per episode')
    parser.add_argument('--batch_size', default=32, help='Sampling batch size')
    parser.add_argument('--lr', default=0.001, help='Learning rate')

    args = parser.parse_args()
    output = get_output_folder(args.output, args.env)

    # create environment
    print(args.env)
    env = gym.make(args.env)

    ################# tabular Q learning ##########
    #### change the environment to _process_state_table before using it
    # Q, stats = QL.q_learning(env, int(args.num), float(args.gamma), float(args.alpha), float(args.epsilon),
    #                          float(args.epsilon_min), float(args.epsilon_decay), output)
    # plotting.plot_episode_stats(stats, smoothing_window=1)
    # print(Q)

    ############### Q learning with Neural network approximation and fixed target ################
    #### change the environment to _process_state_DDQN before using it
    state_size = env.nS
    action_size = env.nA
    agent = QN.QNAgent(state_size, action_size, float(args.gamma), float(args.lr))
    stats = QN.q_learning(env, agent, int(args.num), int(args.batch_size),
                          float(args.epsilon), float(args.epsilon_min),
                          float(args.epsilon_decay), output)
    plotting.plot_episode_stats(stats, smoothing_window=1)
def main():
    estimator = Estimator()
    number_of_episodes = 1000

    print('Two Step Sarsa')
    stats_sarsa_2_step = sarsa_2_step_TD(env, estimator, number_of_episodes,
                                         discount_factor=1.0, epsilon=0.015, epsilon_decay=1.0)
    plotting.plot_episode_stats(stats_sarsa_2_step, smoothing_window=25)

    print('Two Step Q Learning')
    stats_Q = Q_learning_2_step_TD(env, estimator, number_of_episodes,
                                   discount_factor=1, epsilon=0.015, epsilon_decay=1.0)

    print('Two Step Tree Backup')
    stats_tree = two_step_tree_backup(env, estimator, number_of_episodes,
                                      discount_factor=1.0, epsilon=0.1)

    print('SARSA')
    stats_sarsa = sarsa(env, estimator, number_of_episodes,
                        discount_factor=1.0, epsilon=0.015, epsilon_decay=1.0)

    print('Expected SARSA')
    stats_expected_sarsa = expected_sarsa(env, estimator, number_of_episodes,
                                          discount_factor=1.0, epsilon=0.015, epsilon_decay=1.0)

    plot_episode_stats(stats_sarsa_2_step, stats_Q, stats_tree, stats_sarsa, stats_expected_sarsa)
def sarsa_WindyGridWorld(Q, env, eps=200, discount_factor=1.0, alpha=0.5, epsilon=0.05):
    nS = env.nS
    nA = env.nA
    episode_lengths = []
    episode_rewards = []

    for ep in range(eps):
        timesteps = 700
        current_state = env.reset()
        policy = fn_policy(env, Q, epsilon=epsilon)
        action_arr = policy[current_state]
        action = np.random.choice(np.arange(nA), p=action_arr)
        total_reward = 0

        for ts in range(timesteps):
            next_state, reward, done, prob = env.step(action)
            total_reward += reward
            next_action_arr = policy[next_state]
            next_action = np.random.choice(np.arange(nA), p=next_action_arr)
            Q[current_state][action] = Q[current_state][action] + alpha * (
                reward + (discount_factor * Q[next_state][next_action])
                - Q[current_state][action])
            act_arr = np.ones(nA) * epsilon / nA
            act_arr[np.argmax(Q[current_state])] += (1 - epsilon)
            policy[current_state] = act_arr

            if done:
                print("Episode {} ended after {} timesteps".format(ep, ts))
                episode_lengths.append(ts)
                episode_rewards.append(total_reward)
                break

            current_state = next_state
            action = next_action

    stats = plotting.EpisodeStats(episode_lengths=np.array(episode_lengths),
                                  episode_rewards=np.array(episode_rewards))
    plotting.plot_episode_stats(stats)
def main():
    Q, stats = q_learning(env, 300)
    plotting.plot_episode_stats(stats)
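# --- Illustrative sketch (assumption, not from the original examples) ---
# Several examples above call a tabular q_learning(env, num_episodes) that returns
# (Q, stats). A minimal version consistent with that signature could look like the
# following; the hyper-parameter defaults and the `from lib import plotting` import
# mirror the conventions used elsewhere in this collection.
import itertools
from collections import defaultdict

import numpy as np

from lib import plotting


def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        state = env.reset()
        for t in itertools.count():
            # Epsilon-greedy behaviour policy derived from Q.
            action_probs = np.ones(env.action_space.n) * epsilon / env.action_space.n
            action_probs[np.argmax(Q[state])] += 1.0 - epsilon
            action = np.random.choice(len(action_probs), p=action_probs)

            next_state, reward, done, _ = env.step(action)

            # Off-policy TD update towards the greedy value of the next state.
            td_target = reward + discount_factor * np.max(Q[next_state])
            Q[state][action] += alpha * (td_target - Q[state][action])

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break
            state = next_state

    return Q, stats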
import sys
if "../" not in sys.path:
    sys.path.append("../")

from lib.envs.cliff_walking import CliffWalkingEnv
from lib import plotting
from agents import QLearningAgent
import numpy as np

env_shape = (4, 12)
start_position = (3, 0)
end_positions = [(3, 11)]
cliff = tuple((3, i + 1) for i in range(10))
env = CliffWalkingEnv(env_shape, start_position, end_positions, cliff)

n_actions = env.action_space.n
agent = QLearningAgent(alpha=0.5, epsilon=0.1, discount=0.99, n_actions=n_actions)
agent.train(env, n_episodes=1000, t_max=10**3, verbose=True, verbose_per_episode=500)

plotting.draw_policy(env, agent)
plotting.plot_episode_stats(agent)
            if update_time >= 0:
                action_state_update_time = env_list[update_time][1]
                evaluated_state_index = update_time + self.n - 1
                if evaluated_state_index < len(states):
                    state_update_time = states[evaluated_state_index]
                    action_state_update_time.update(
                        0, state_update_time.get_actions(), time_step=update_time)
                else:
                    action_state_update_time.update(0, None, time_step=update_time)

            if update_time == T - 1:
                a_ss = [a_s for _, a_s in env_list]
                for a_s in a_ss:
                    a_s.clear_reward_calculator()
                break

        return stats


if __name__ == '__main__':
    q_learning = NStepSarsa(CliffWalkingEnv(), 1)
    stats = q_learning.run(200, get_learning_rate=lambda x1, x2: 1)
    plotting.plot_episode_stats(stats)
    q_learning.show_one_episode()

    # q_learning = NStepSarsa(WindyGridworldEnv(), 8)
    # stats = q_learning.run(50000)
    # plotting.plot_episode_stats(stats)
    # q_learning.show_one_episode()
            # next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
            # td_target = reward + discount_factor * q_values_next[next_action]

            # Update the function approximator using our target
            estimator.update(state, action, td_target)

            # plt.figure()
            plt.clf()
            plt.imshow(env.render(mode='rgb_array'))

            print("\rStep {} @ Episode {}/{} ({})".format(
                t, i_episode + 1, num_episodes, last_reward)),
            sys.stdout.flush()

            if done:
                break

            state = next_state

    return stats


estimator = Estimator()

# Note: For the Mountain Car we don't actually need an epsilon > 0.0
# because our initial estimate for all states is too "optimistic" which leads
# to the exploration of all states.
stats = q_learning(env, estimator, 100, epsilon=0.0)

plotting.plot_cost_to_go_mountain_car(env, estimator)
plotting.plot_episode_stats(stats, smoothing_window=25)
                    target_estimator=target_estimator,
                    state_processor=state_processor,
                    experiment_dir=experiment_dir,
                    num_episodes=3000,
                    replay_memory_size=200000,
                    replay_memory_init_size=20000,
                    update_target_estimator_every=10000,
                    epsilon_start=1,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    discount_factor=0.99,
                    batch_size=32,
                    record_video_every=50):

    episode_reward_array.append(stats.episode_rewards[-1])

    if num_episode % 50 == 0:
        fig1, fig2, fig3, fig4 = plotting.plot_episode_stats(stats, smoothing_window=10, noshow=True)
        fig1.savefig(experiment_dir + '/episode_length.jpg')
        fig2.savefig(experiment_dir + '/reward.jpg')
        fig3.savefig(experiment_dir + '/episode_per_t.jpg')
        fig4.savefig(experiment_dir + '/episode_reward.jpg')
        np.savetxt(experiment_dir + '/episode_reward.txt', episode_reward_array, newline=" ")

    num_episode += 1
    avg_50_reward.append(np.average(stats.episode_rewards[max(0, num_episode - 50):]))

    print("\nEpisode Reward: {} , Last 50 average: {}".format(
        stats.episode_rewards[-1],
        np.average(stats.episode_rewards[max(0, num_episode - 50):])))


# In[ ]:
def plot(self, stats):
    plotting.plot_episode_stats(stats)
import sys
from collections import defaultdict

import matplotlib
import numpy as np

sys.path.append('/home/ornot/GymRL')
from algorithm import mc_online_policy_control, q_learning, sarsa, expected_sarsa
from env import windy_gridworld
from lib import plotting

matplotlib.style.use('ggplot')

env = windy_gridworld.WindyGridworldEnv()
num_episodes = 200

# TD online
statistics_sara = plotting.EpisodeStats("sara",
                                        episode_lengths=np.zeros(num_episodes),
                                        episode_rewards=np.zeros(num_episodes))
Q_sara = sarsa.sarsa(env, num_episodes, statistics_sara)

# TD offline
statistics_q_learning = plotting.EpisodeStats(
    "q_learning",
    episode_lengths=np.zeros(num_episodes),
    episode_rewards=np.zeros(num_episodes))
Q_learning = q_learning.q_learning(env, num_episodes, statistics_q_learning)

plotting.plot_episode_stats([statistics_sara, statistics_q_learning])
def main():
    Q, stats = sarsa(env, 500)
    plotting.plot_episode_stats(stats)
            # update current state
            state = new_state

            if terminated:
                break

    return stats


# In[122]:

estimator = Estimator()


# In[123]:

# Note: For the Mountain Car we don't actually need an epsilon > 0.0
# because our initial estimate for all states is too "optimistic" which leads
# to the exploration of all states.
stats = q_learning(env, estimator, 100, epsilon=0.0)


# In[124]:

plotting.plot_cost_to_go_mountain_car(env, estimator)
plotting.plot_episode_stats(stats, smoothing_window=25)