# Import paths follow the lib/envs layout used elsewhere in this repo;
# the SimpleRoomsEnv path is assumed and may need adjusting.
from lib.envs.cliff_walking import CliffWalkingEnv
from lib.envs.simple_rooms import SimpleRoomsEnv
from lib.envs.windy_gridworld import WindyGridworldEnv


def get_env(argument):
    """Map a (lower-cased) alias to a freshly constructed environment."""
    # Aliases map to classes rather than instances, so a single lookup does
    # not eagerly construct every environment in the table.
    switcher = {
        "cliffwalking": CliffWalkingEnv,
        "cliffwalkingenv": CliffWalkingEnv,
        "cliff": CliffWalkingEnv,
        "cliffs": CliffWalkingEnv,
        "windygridworld": WindyGridworldEnv,
        "windygridworldenv": WindyGridworldEnv,
        "windygrid": WindyGridworldEnv,
        "windy": WindyGridworldEnv,
        "simplemaze": SimpleRoomsEnv,
        "simplegrid": SimpleRoomsEnv,
        "simplegridworld": SimpleRoomsEnv,
        "simplegridworldenv": SimpleRoomsEnv,
        "simpleroomsenv": SimpleRoomsEnv,
        "simpleroom": SimpleRoomsEnv,
        "maze": SimpleRoomsEnv,
        "grid": SimpleRoomsEnv,
    }
    env_class = switcher.get(argument)
    return env_class() if env_class is not None else None
import gym

# Import paths follow the lib/envs layout used elsewhere in this repo.
from lib.envs.blackjack import BlackjackEnv
from lib.envs.cliff_walking import CliffWalkingEnv
from lib.envs.gridworld import GridworldEnv
from lib.envs.windy_gridworld import WindyGridworldEnv


def getEnv(domain):
    """Resolve a domain name to a local environment, else defer to gym.make."""
    if domain == "Blackjack":
        return BlackjackEnv()
    elif domain == "Gridworld":
        return GridworldEnv()
    elif domain == "CliffWalking":
        return CliffWalkingEnv()
    elif domain == "WindyGridworld":
        return WindyGridworldEnv()
    else:
        try:
            return gym.make(domain)
        except gym.error.Error:
            # A failing assert would vanish under `python -O`; raise instead.
            raise ValueError(
                "Domain must be a valid (and installed) Gym environment")
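# A quick usage sketch for the two factory helpers above (illustrative, not
# part of the original modules). The aliases and domain names are ones the
# functions already accept; the behavior on a miss is the only difference
# worth noting: get_env returns None, getEnv raises.
env = get_env("windy")            # alias lookup; returns None on a miss
assert env is not None, "unknown environment alias"

env = getEnv("CliffWalking")      # exact names; unknown ones go to gym.make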
import random
import sys
from collections import defaultdict, namedtuple

import numpy as np
import pandas as pd
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDRegressor

from lib import plotting
from lib.envs.cliff_walking import CliffWalkingEnv
from lib.envs.windy_gridworld import WindyGridworldEnv

# env = CliffWalkingEnv()
env = WindyGridworldEnv()

# With the MountainCar from OpenAI Gym:
# env = gym.envs.make("MountainCar-v0")

# Samples from the state space, used to fit the feature scaler.
# The original drew a single sample, which leaves StandardScaler with zero
# variance; a batch is drawn here instead. Discrete observations come back
# as scalars, hence the reshape to a 2-D column.
observation_examples = np.array(
    [env.observation_space.sample() for x in range(1000)]).reshape(-1, 1)
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(observation_examples)

# Convert states to a feature representation:
# an RBF sampler is used here for the feature map. (The original snippet is
# truncated after the first component; further RBF components with other
# bandwidths can be appended to the union.)
featurizer = sklearn.pipeline.FeatureUnion([
    ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
])
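# The snippet breaks off before the featurizer is fit or applied. Below is a
# minimal sketch of the conventional companion step; the featurize_state
# helper and its name are illustrative assumptions, not from the original.
featurizer.fit(scaler.transform(observation_examples))


def featurize_state(state):
    """Scale one raw observation and project it into RBF feature space."""
    scaled = scaler.transform([[state]])        # discrete state -> 2-D input
    return featurizer.transform(scaled)[0]      # 1-D feature vector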
                # (Fragment from inside the per-episode n-step update loop.)
                evaluated_state_index = update_time + self.n - 1
                if evaluated_state_index < len(states):
                    # The n-step window still ends inside the episode, so
                    # bootstrap from the state n-1 steps ahead.
                    log.debug('=n')
                    state_update_time = states[evaluated_state_index]
                    action_state_update_time.update(
                        0, state_update_time.get_actions(),
                        time_step=update_time)
                else:
                    # Past the end of the episode: no bootstrap state.
                    log.debug('<n')
                    action_state_update_time.update(
                        0, None, time_step=update_time)
                if update_time == T - 1:
                    a_ss = [a_s for _, a_s in env_list]
                    for a_s in a_ss:
                        a_s.clear_reward_calculator()
                    break
        return stats


if __name__ == '__main__':
    n_step_sarsa = OffNStepSarsa(WindyGridworldEnv(), 1)
    stats = n_step_sarsa.run(5000)
    plotting.plot_episode_stats(stats)
    n_step_sarsa.show_one_episode()

    # n_step_sarsa = NStepSarsa(WindyGridworldEnv(), 8)
    # stats = n_step_sarsa.run(50000)
    # plotting.plot_episode_stats(stats)
    # n_step_sarsa.show_one_episode()
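# For reference, the target such an update assembles is the n-step return.
# A minimal helper (illustrative only, not part of the class above) that
# mirrors the two branches: pass the bootstrap value Q(S_n, A_n) as q_tail
# when the window ends inside the episode, and 0.0 past the end.
def n_step_return(rewards, gamma, q_tail=0.0):
    """G = R[0] + g*R[1] + ... + g^(n-1)*R[n-1] + g^n * q_tail."""
    g = q_tail
    for r in reversed(rewards):
        g = r + gamma * g
    return g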
# coding: utf-8

# In[1]:

import gym
import numpy as np
import sys
if "../" not in sys.path:
    sys.path.append("../")
from lib.envs.windy_gridworld import WindyGridworldEnv

# In[2]:

env = WindyGridworldEnv()

print(env.reset())
env.render()

print(env.step(1))
env.render()

print(env.step(1))
env.render()

print(env.step(1))
env.render()

print(env.step(2))
env.render()
        for step in reversed(action_states):
            # Loop variable renamed: the original reused `action_state` both
            # for the (state, action, reward) tuple and its unpacked action.
            state, action_state, reward = step
            g = discount_factor * g + reward
            action_state.update_c(w)
            action_state.update_q(g, w)
            action = state.get_next_action_state(GreedyPolicy())
            if action != action_state:
                # The greedy target policy diverges from the behavior here,
                # so earlier steps get zero importance weight: stop.
                break
            # Assumption: `n` is the action count, i.e. 1/b(a|s) for a
            # uniform-random behavior policy.
            w = w * n
        return state

    def generate_one_episode_action_states_by_policy(self, policy):
        actions = []
        state = self.env.reset()
        for t in range(100):
            action = state.get_next_action_state(policy)
            next_state, reward, done, _ = self.env.step(
                action.get_gym_action())
            actions.append((state, action, reward))
            if done:
                break
            state = next_state
        return actions


if __name__ == '__main__':
    mc_control = McOfflinePolicy(WindyGridworldEnv())
    mc_control.run(5000)
    # plotting.plot_episode_stats(stats)
    mc_control.show_one_episode()
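# For comparison, a self-contained sketch of the same algorithm -- off-policy
# every-visit MC control with weighted importance sampling (Sutton & Barto,
# Sec. 5.7) -- over plain dicts and an old-Gym-style discrete env. All names
# here are illustrative; none come from the class above.
import random
from collections import defaultdict


def mc_off_policy_control(env, num_episodes, gamma=1.0):
    nA = env.action_space.n
    Q = defaultdict(lambda: [0.0] * nA)   # action values
    C = defaultdict(lambda: [0.0] * nA)   # cumulative importance weights
    for _ in range(num_episodes):
        # Behavior policy: uniform random, so b(a|s) = 1/nA everywhere.
        episode, state = [], env.reset()
        for _ in range(100):              # cap episode length, as above
            action = random.randrange(nA)
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state
        G, W = 0.0, 1.0
        for state, action, reward in reversed(episode):
            G = gamma * G + reward
            C[state][action] += W
            Q[state][action] += (W / C[state][action]) * (G - Q[state][action])
            if action != max(range(nA), key=lambda a: Q[state][a]):
                break                     # greedy target would act differently
            W *= nA                       # 1 / b(a|s)
    return Q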
            epsilon, action_type=DoubleQAction, learning_rate=learning_rate)
        stats = plotting.EpisodeStats(
            episode_lengths=np.zeros(num_episodes),
            episode_rewards=np.zeros(num_episodes))
        for i_episode in tqdm(range(num_episodes)):
            state_actions = set()   # collected but never read in this fragment
            state = self.env.reset()
            for t in itertools.count():
                action_state = state.get_next_action_state(
                    EGreedyPolicy(epsilon))
                next_state, reward, done, _ = self.env.step(
                    action_state.get_gym_action())
                if state not in state_actions:
                    state_actions.add(action_state)
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                action_state.update(reward, next_state.get_actions())
                if done:
                    break
                state = next_state
        return stats


if __name__ == '__main__':
    double_q = DoubleQLearning(WindyGridworldEnv())
    stats = double_q.run(50000)
    plotting.plot_episode_stats(stats)
    double_q.show_one_episode()
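# The double-Q rule itself is hidden inside DoubleQAction.update. As a
# reference point, a minimal tabular sketch of the standard rule
# (van Hasselt, 2010) -- illustrative names, not the repo's implementation.
import random


def double_q_update(Q1, Q2, s, a, r, s_next, nA, alpha=0.5, gamma=1.0):
    """Update one table chosen at random, evaluating its argmax with the other."""
    if random.random() < 0.5:
        best = max(range(nA), key=lambda x: Q1[s_next][x])
        Q1[s][a] += alpha * (r + gamma * Q2[s_next][best] - Q1[s][a])
    else:
        best = max(range(nA), key=lambda x: Q2[s_next][x])
        Q2[s][a] += alpha * (r + gamma * Q1[s_next][best] - Q2[s][a])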
            learning_rate=learning_rate)
        stats = plotting.EpisodeStats(
            episode_lengths=np.zeros(num_episodes),
            episode_rewards=np.zeros(num_episodes))
        for i_episode in tqdm(range(num_episodes)):
            state_actions = set()   # collected but never read in this fragment
            state = self.env.reset()
            for t in itertools.count():
                action_state = state.get_next_action_state(
                    EGreedyPolicy(epsilon))
                next_state, reward, done, _ = self.env.step(
                    action_state.get_gym_action())
                if state not in state_actions:
                    state_actions.add(action_state)
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                # On-policy: the update is given the same e-greedy policy
                # that selects actions.
                action_state.update(reward, next_state.get_actions(),
                                    policy=EGreedyPolicy(epsilon))
                if done:
                    break
                state = next_state
        return stats


if __name__ == '__main__':
    sarsa = Sarsa(WindyGridworldEnv())
    stats = sarsa.run(20000)
    plotting.plot_episode_stats(stats)
    sarsa.show_one_episode()
def main():
    # print("SARSA")
    # env = WindyGridworldEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_sarsa = sarsa(env, num_episodes)
    # rewards_sarsa = pd.Series(stats_sarsa.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_sarsa
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'SARSA' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_sarsa)
    # env.close()

    # print("Q Learning")
    # env = WindyGridworldEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_q_learning = q_learning(env, num_episodes)
    # rewards_q_learning = pd.Series(stats_q_learning.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_q_learning
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'Q_Learning' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_q_learning)
    # env.close()

    # print("Double Q Learning")
    # env = WindyGridworldEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_double_q_learning = double_q_learning(env, num_episodes)
    # rewards_double_q_learning = pd.Series(stats_double_q_learning.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_double_q_learning
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'Double_Q_Learning' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_double_q_learning)
    # env.close()

    # print("One Step Tree Backup (Expected SARSA)")
    # env = WindyGridworldEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_expected_sarsa = one_step_tree_backup(env, num_episodes)
    # rewards_expected_sarsa = pd.Series(stats_expected_sarsa.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_expected_sarsa
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'One_Step_Tree_Backup' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_expected_sarsa)
    # env.close()

    # print("Two Step Tree Backup")
    # env = WindyGridworldEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_two_step_tree_backup = two_step_tree_backup(env, num_episodes)
    # rewards_two_step_tree_backup = pd.Series(stats_two_step_tree_backup.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_two_step_tree_backup
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'Two_Step_Tree_Backup' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_two_step_tree_backup)
    # env.close()

    # print("Three Step Tree Backup")
    # env = WindyGridworldEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_three_step_tree_backup = three_step_tree_backup(env, num_episodes)
    # rewards_three_step_tree_backup = pd.Series(stats_three_step_tree_backup.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_three_step_tree_backup
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'Three_Step_Tree_Backup' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_three_step_tree_backup)
    # env.close()

    # print("Q(sigma) On Policy")
    # env = WindyGridworldEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_q_sigma_on_policy = q_sigma_on_policy(env, num_episodes)
    # rewards_stats_q_sigma_on_policy = pd.Series(stats_q_sigma_on_policy.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_stats_q_sigma_on_policy
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'Q_Sigma_On_Policy' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_q_sigma_on_policy)
    # env.close()

    # print("Q(sigma) Off Policy")
    # env = WindyGridworldEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_q_sigma_off_policy = Q_Sigma_Off_Policy(env, num_episodes)
    # rewards_stats_q_sigma_off_policy = pd.Series(stats_q_sigma_off_policy.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_stats_q_sigma_off_policy
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'Q_Sigma_Off_Policy' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_q_sigma_off_policy)
    # env.close()

    # print("Q(sigma) Off Policy 2 Step")
    # env = WindyGridworldEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_q_sigma_off_policy_2_step = Q_Sigma_Off_Policy_2_Step(env, num_episodes)
    # rewards_stats_q_sigma_off_policy_2 = pd.Series(stats_q_sigma_off_policy_2_step.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_stats_q_sigma_off_policy_2
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'Q_Sigma_Off_Policy_2_Step' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_q_sigma_off_policy_2_step)
    # env.close()

    # print("Q(sigma) Off Policy 3 Step")
    # env = WindyGridworldEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_q_sigma_off_policy_3_step = Q_Sigma_Off_Policy_3_Step(env, num_episodes)
    # rewards_stats_q_sigma_off_policy_3 = pd.Series(stats_q_sigma_off_policy_3_step.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_stats_q_sigma_off_policy_3
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'Q_Sigma_Off_Policy_3_Step' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_q_sigma_off_policy_3_step)
    # env.close()

    # print("SARSA(lambda)")
    # env = WindyGridworldEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_sarsa_lambda = sarsa_lambda(env, num_episodes)
    # rewards_stats_sarsa_lambda = pd.Series(stats_sarsa_lambda.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_stats_sarsa_lambda
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'Sarsa(lambda)' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_sarsa_lambda)
    # env.close()

    # print("Watkins Q(lambda)")
    # env = WindyGridworldEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_q_lambda = q_lambda_watkins(env, num_episodes)
    # rewards_stats_q_lambda = pd.Series(stats_q_lambda.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_stats_q_lambda
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'Watkins Q(lambda)' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_q_lambda)
    # env.close()

    # print("Naive Q(lambda)")
    # env = WindyGridworldEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_q_lambda_naive = q_lambda_naive(env, num_episodes)
    # rewards_stats_q_naive = pd.Series(stats_q_lambda_naive.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_stats_q_naive
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'Naive Q(lambda)' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_q_lambda_naive)
    # env.close()

    print("Tree Backup(lambda)")
    env = WindyGridworldEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_tree_lambda = tree_backup_lambda(env, num_episodes)
    rewards_stats_tree_lambda = pd.Series(stats_tree_lambda.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_tree_lambda
    np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/' + 'Tree Backup(lambda)' + '.npy', cum_rwd)
    # plotting.plot_episode_stats(stats_tree_lambda)
    env.close()
import sys
if "../" not in sys.path:
    sys.path.append("../")
from lib.envs.windy_gridworld import WindyGridworldEnv

UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3

env = WindyGridworldEnv()
print(env.reset())
print(env.nS)
print(env.nA)

for i in range(env.nS):
    print("state" + str(i) + ":")
    print(env.P[i])

env.render()
print(env.step(RIGHT))
env.render()
print(env.step(RIGHT))
env.render()
print(env.step(UP))
env.render()
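# The env.P dump above follows the standard discrete-env layout:
# state -> action -> [(prob, next_state, reward, done), ...]. A small
# illustration of reading it, assuming (as this gridworld's dynamics
# suggest) one deterministic transition per action:
state = env.reset()
for a in (UP, RIGHT, DOWN, LEFT):
    (prob, next_state, reward, done), = env.P[state][a]
    print(a, prob, next_state, reward, done)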
def main():
    print("Q(sigma) On Policy Eligibility Traces, Static Sigma")
    env = WindyGridworldEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_q_sigma_on_policy = q_sigma_on_policy_eligibility_traces_static_sigma(
        env, num_episodes)
    rewards_stats_q_sigma_on_policy = pd.Series(
        stats_q_sigma_on_policy.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_q_sigma_on_policy
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'
        + 'Q_Sigma_On_Policy_Eligibility_Traces_Static_Sigma' + '.npy',
        cum_rwd)
    # plotting.plot_episode_stats(stats_q_sigma_on_policy)
    env.close()

    print("Q(sigma) On Policy Eligibility Traces, Dynamic Sigma")
    env = WindyGridworldEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_q_sigma_on_policy = q_sigma_on_policy_eligibility_traces_dynamic_sigma(
        env, num_episodes)
    rewards_stats_q_sigma_on_policy = pd.Series(
        stats_q_sigma_on_policy.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_q_sigma_on_policy
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'
        + 'Q_Sigma_On_Policy_Eligibility_Traces_Dynamic_Sigma' + '.npy',
        cum_rwd)
    # plotting.plot_episode_stats(stats_q_sigma_on_policy)
    env.close()

    print("Q(sigma) Off Policy Eligibility Traces, Static Sigma")
    env = WindyGridworldEnv()
    num_episodes = 2000
    smoothing_window = 1
    # Renamed from stats_q_sigma_on_policy: these two blocks run the
    # off-policy variants.
    stats_q_sigma_off_policy = q_sigma_off_policy_eligibility_traces_static_sigma(
        env, num_episodes)
    rewards_stats_q_sigma_off_policy = pd.Series(
        stats_q_sigma_off_policy.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_q_sigma_off_policy
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'
        + 'Q_Sigma_Off_Policy_Eligibility_Traces_Static_Sigma' + '.npy',
        cum_rwd)
    # plotting.plot_episode_stats(stats_q_sigma_off_policy)
    env.close()

    print("Q(sigma) Off Policy Eligibility Traces, Dynamic Sigma")
    env = WindyGridworldEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_q_sigma_off_policy = q_sigma_off_policy_eligibility_traces_dynamic_sigma(
        env, num_episodes)
    rewards_stats_q_sigma_off_policy = pd.Series(
        stats_q_sigma_off_policy.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_q_sigma_off_policy
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'
        + 'Q_Sigma_Off_Policy_Eligibility_Traces_Dynamic_Sigma' + '.npy',
        cum_rwd)
    # plotting.plot_episode_stats(stats_q_sigma_off_policy)
    env.close()
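# Q(sigma) unifies the algorithms driven above: sigma = 1 recovers the SARSA
# backup, sigma = 0 the Expected SARSA / one-step tree backup. A one-step
# sketch of that target, with illustrative names only (pi_next is the target
# policy's distribution over actions at the next state):
def q_sigma_target(Q, s_next, a_next, pi_next, sigma, r, gamma=1.0):
    """One-step Q(sigma) target: sigma=1 is SARSA, sigma=0 is Expected SARSA."""
    expected = sum(pi_next[a] * Q[s_next][a] for a in range(len(pi_next)))
    return r + gamma * (sigma * Q[s_next][a_next] + (1.0 - sigma) * expected)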
            epsilon, action_type=QAction, learning_rate=learning_rate)
        stats = plotting.EpisodeStats(
            episode_lengths=np.zeros(num_episodes),
            episode_rewards=np.zeros(num_episodes))
        for i_episode in tqdm(range(num_episodes)):
            state_actions = set()   # collected but never read in this fragment
            state = self.env.reset()
            for t in itertools.count():
                action_state = state.get_next_action_state(
                    EGreedyPolicy(epsilon))
                next_state, reward, done, _ = self.env.step(
                    action_state.get_gym_action())
                if state not in state_actions:
                    state_actions.add(action_state)
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                # Off-policy: no behavior policy is passed, so the bootstrap
                # is presumably greedy over the next state's actions.
                action_state.update(reward, next_state.get_actions())
                if done:
                    break
                state = next_state
        return stats


if __name__ == '__main__':
    q_learning = QLearning(WindyGridworldEnv())
    stats = q_learning.run(200)
    plotting.plot_episode_stats(stats)
    q_learning.show_one_episode()
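# This runner and the SARSA one earlier differ only in whether a policy is
# handed to update. A minimal side-by-side sketch of the two tabular targets
# (illustrative names, not the repo's API):
def q_learning_update(Q, s, a, r, s_next, nA, alpha=0.5, gamma=1.0):
    """Q-learning bootstraps on the greedy value of the next state."""
    target = r + gamma * max(Q[s_next][x] for x in range(nA))
    Q[s][a] += alpha * (target - Q[s][a])


def sarsa_update(Q, s, a, r, s_next, a_next, alpha=0.5, gamma=1.0):
    """SARSA bootstraps on the action actually taken next."""
    Q[s][a] += alpha * (r + gamma * Q[s_next][a_next] - Q[s][a])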