Example #1
def get_env(argument):
    # Map user-supplied aliases to environment instances; note that every
    # environment below is constructed eagerly each time this function is called.
    switcher = {
        "cliffwalking": CliffWalkingEnv(),
        "cliffwalkingenv": CliffWalkingEnv(),
        "cliff": CliffWalkingEnv(),
        "cliffs": CliffWalkingEnv(),
        "windygridworld": WindyGridworldEnv(),
        "windygridworldenv": WindyGridworldEnv(),
        "windygrid": WindyGridworldEnv(),
        "windy": WindyGridworldEnv(),
        "simplemaze": SimpleRoomsEnv(),
        "simplegrid": SimpleRoomsEnv(),
        "simplegridworld": SimpleRoomsEnv(),
        "simplegridworldenv": SimpleRoomsEnv(),
        "simpleroomsenv": SimpleRoomsEnv(),
        "simpleroom": SimpleRoomsEnv(),
        "maze": SimpleRoomsEnv(),
        "grid": SimpleRoomsEnv()
    }
    return switcher.get(argument)
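A hypothetical lookup against this table; unrecognized names fall through to dict.get's default of None:

env = get_env("windy")               # a WindyGridworldEnv instance
assert get_env("not-an-env") is None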
Example #2
def getEnv(domain):
    if domain == "Blackjack":
        return BlackjackEnv()
    elif domain == "Gridworld":
        return GridworldEnv()
    elif domain == "CliffWalking":
        return CliffWalkingEnv()
    elif domain == "WindyGridworld":
        return WindyGridworldEnv()
    else:
        try:
            return gym.make(domain)
        except Exception:
            raise ValueError(
                "Domain must be a valid (and installed) Gym environment")
Example #3
import sys
import random
from collections import defaultdict, namedtuple

import numpy as np
import pandas as pd
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDRegressor

from lib import plotting
from lib.envs.cliff_walking import CliffWalkingEnv
from lib.envs.windy_gridworld import WindyGridworldEnv
#env = CliffWalkingEnv()
env = WindyGridworldEnv()

# With the MountainCar environment from OpenAI Gym instead:
# env = gym.envs.make("MountainCar-v0")

# Sample states from the state space to compute feature statistics.
# NOTE: a single sample is degenerate for fitting the scaler below;
# typically many (e.g. thousands of) observations are drawn here.
observation_examples = np.array(
    [env.observation_space.sample() for x in range(1)])

scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(observation_examples)

# Convert states to a feature representation;
# an RBF sampler is used here for the feature map.
featurizer = sklearn.pipeline.FeatureUnion([
    ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
Example #4
                    # State whose action-values are used for bootstrapping,
                    # if the episode is long enough to provide it.
                    evaluated_state_index = update_time + self.n - 1
                    if evaluated_state_index < len(states):
                        log.debug('=n')
                        state_update_time = states[evaluated_state_index]
                        action_state_update_time.update(
                            0,
                            state_update_time.get_actions(),
                            time_step=update_time)
                    else:
                        log.debug('<n')
                        action_state_update_time.update(0,
                                                        None,
                                                        time_step=update_time)
                if update_time == T - 1:
                    a_ss = [a_s for _, a_s in env_list]
                    for a_s in a_ss:
                        a_s.clear_reward_calculator()
                    break
        return stats


if __name__ == '__main__':
    q_learning = OffNStepSarsa(WindyGridworldEnv(), 1)
    stats = q_learning.run(5000)
    plotting.plot_episode_stats(stats)
    q_learning.show_one_episode()
    # q_learning = NStepSarsa(WindyGridworldEnv(), 8)
    # stats = q_learning.run(50000)
    # plotting.plot_episode_stats(stats)
    # q_learning.show_one_episode()
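For reference, the quantity this bookkeeping builds toward is the n-step return; a small illustrative helper (n_step_return, rewards, and q_bootstrap are names invented here, not repository API):

def n_step_return(rewards, q_bootstrap=0.0, gamma=1.0):
    # G = R_{t+1} + gamma*R_{t+2} + ... + gamma^(n-1)*R_{t+n} + gamma^n * Q(S_{t+n}, A_{t+n});
    # q_bootstrap is 0 when the episode terminates before n further steps have elapsed.
    g = q_bootstrap
    for r in reversed(rewards):
        g = r + gamma * g
    return g

print(n_step_return([-1, -1, -1], q_bootstrap=-10.0))   # -13.0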
Example #5
# coding: utf-8

# In[1]:

import gym
import numpy as np
import sys

if "../" not in sys.path:
    sys.path.append("../")

from lib.envs.windy_gridworld import WindyGridworldEnv

# In[2]:

env = WindyGridworldEnv()

print(env.reset())
env.render()

print(env.step(1))
env.render()

print(env.step(1))
env.render()

print(env.step(1))
env.render()

print(env.step(2))
env.render()
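The same step() interface supports a generic rollout loop; a minimal random-policy sketch, assuming the environment exposes the usual action_space and the (next_state, reward, done, info) return used above:

env = WindyGridworldEnv()
state = env.reset()
total_reward, steps, done = 0.0, 0, False
while not done and steps < 200:
    action = env.action_space.sample()      # uniform random action
    state, reward, done, _ = env.step(action)
    total_reward += reward
    steps += 1
print("steps:", steps, "return:", total_reward)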
Example #6
            for step in reversed(action_states):
                state, action_state, reward = step
                g = discount_factor * g + reward
                action_state.update_c(w)
                action_state.update_q(g, w)
                action = state.get_next_action_state(GreedyPolicy())
                if action != action_state:
                    break
                w = w * n
        return state

    def generate_one_episode_action_states_by_policy(self, policy):
        actions = []
        state = self.env.reset()
        for t in range(100):
            action = state.get_next_action_state(policy)
            next_state, reward, done, _ = self.env.step(action.get_gym_action())
            actions.append((state, action, reward))
            if done:
                break
            state = next_state
        return actions


if __name__ == '__main__':
    q_learning = McOfflinePolicy(WindyGridworldEnv())
    q_learning.run(5000)
    # plotting.plot_episode_stats(stats)
    q_learning.show_one_episode()
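The backward loop above follows off-policy Monte Carlo control with weighted importance sampling. A compact sketch of the underlying update over plain Q/C tables, assuming a uniform-random behavior policy (all names here are illustrative, not the repository's API):

import numpy as np

def mc_off_policy_backward_pass(episode, Q, C, n_actions, gamma=1.0):
    # episode: list of (state, action, reward); Q[s] and C[s] are arrays of length n_actions.
    g, w = 0.0, 1.0
    for state, action, reward in reversed(episode):
        g = gamma * g + reward
        C[state][action] += w
        Q[state][action] += (w / C[state][action]) * (g - Q[state][action])
        if action != int(np.argmax(Q[state])):
            break                  # the behavior action left the greedy target policy
        w *= n_actions             # importance ratio 1 / b(a|s) for a uniform behavior policy
    return Q, C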

Example #7
                       epsilon,
                       action_type=DoubleQAction,
                       learning_rate=learning_rate)
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes))
        for i_episode in tqdm(range(num_episodes)):
            state_actions = set()
            state = self.env.reset()
            for t in itertools.count():
                action_state = state.get_next_action_state(
                    EGreedyPolicy(epsilon))
                next_state, reward, done, _ = self.env.step(
                    action_state.get_gym_action())
                if state not in state_actions:
                    state_actions.add(action_state)
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                action_state.update(reward, next_state.get_actions())
                if done:
                    break
                state = next_state

        return stats


if __name__ == '__main__':
    q_learning = DoubleQLearning(WindyGridworldEnv())
    stats = q_learning.run(50000)
    plotting.plot_episode_stats(stats)
    q_learning.show_one_episode()
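The DoubleQAction abstraction above presumably maintains two action-value tables; for comparison, a sketch of the classic per-step Double Q-learning rule (table names and hyperparameters are illustrative):

import random
from collections import defaultdict

def double_q_step(Q1, Q2, state, action, reward, next_state, done,
                  alpha=0.5, gamma=1.0):
    # Randomly choose which table to update; select the argmax with it,
    # but evaluate that action with the other table.
    Q_upd, Q_eval = (Q1, Q2) if random.random() < 0.5 else (Q2, Q1)
    if done:
        target = reward
    else:
        best = max(range(len(Q_upd[next_state])), key=lambda a: Q_upd[next_state][a])
        target = reward + gamma * Q_eval[next_state][best]
    Q_upd[state][action] += alpha * (target - Q_upd[state][action])

# Example tables: one list of action values per state, four actions each.
Q1 = defaultdict(lambda: [0.0] * 4)
Q2 = defaultdict(lambda: [0.0] * 4)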
Example #8
                       learning_rate=learning_rate)
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes))
        for i_episode in tqdm(range(num_episodes)):
            state_actions = set()
            state = self.env.reset()
            for t in itertools.count():
                action_state = state.get_next_action_state(
                    EGreedyPolicy(epsilon))
                next_state, reward, done, _ = self.env.step(
                    action_state.get_gym_action())
                if state not in state_actions:
                    state_actions.add(action_state)
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                action_state.update(reward,
                                    next_state.get_actions(),
                                    policy=EGreedyPolicy(epsilon))
                if done:
                    break
                state = next_state

        return stats


if __name__ == '__main__':
    sarsa = Sarsa(WindyGridworldEnv())
    stats = sarsa.run(20000)
    plotting.plot_episode_stats(stats)
    sarsa.show_one_episode()
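Behind the action_state.update(..., policy=EGreedyPolicy(epsilon)) call is the on-policy Sarsa rule. A self-contained tabular sketch over the standard env interface (the function and its hyperparameters are illustrative, not the repository's API):

import itertools
import random
from collections import defaultdict

def sarsa_sketch(env, num_episodes=500, alpha=0.5, gamma=1.0, epsilon=0.1):
    # Tabular Sarsa: Q(s,a) += alpha * (r + gamma * Q(s',a') - Q(s,a)).
    Q = defaultdict(lambda: [0.0] * env.action_space.n)

    def epsilon_greedy(state):
        if random.random() < epsilon:
            return env.action_space.sample()
        return max(range(env.action_space.n), key=lambda a: Q[state][a])

    for _ in range(num_episodes):
        state = env.reset()
        action = epsilon_greedy(state)
        for _ in itertools.count():
            next_state, reward, done, _ = env.step(action)
            next_action = epsilon_greedy(next_state)
            target = reward if done else reward + gamma * Q[next_state][next_action]
            Q[state][action] += alpha * (target - Q[state][action])
            if done:
                break
            state, action = next_state, next_action
    return Q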
Example #9
def main():

	# print "SARSA"
	# env = WindyGridworldEnv()
	# num_episodes = 2000
	# smoothing_window = 1
	# stats_sarsa = sarsa(env, num_episodes)
	# rewards_sarsa = pd.Series(stats_sarsa.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	# cum_rwd = rewards_sarsa
	# np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'SARSA' + '.npy', cum_rwd)
	# # plotting.plot_episode_stats(stats_sarsa)
	# env.close()


	# print "Q Learning"
	# env = WindyGridworldEnv()
	# num_episodes = 2000
	# smoothing_window = 1
	# stats_q_learning = q_learning(env, num_episodes)
	# rewards_q_learning = pd.Series(stats_q_learning.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	# cum_rwd = rewards_q_learning
	# np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'Q_Learning' + '.npy', cum_rwd)
	# # plotting.plot_episode_stats(stats_q_learning)
	# env.close()

	# print "Double Q Learning"
	# env = WindyGridworldEnv()
	# num_episodes = 2000
	# smoothing_window = 1
	# stats_double_q_learning = double_q_learning(env, num_episodes)
	# rewards_double_q_learning = pd.Series(stats_double_q_learning.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	# cum_rwd = rewards_double_q_learning
	# np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'Double_Q_Learning' + '.npy', cum_rwd)
	# # plotting.plot_episode_stats(stats_double_q_learning)
	# env.close()


	# print "One Step Tree Backup (Expected SARSA)"
	# env = WindyGridworldEnv()
	# num_episodes = 2000
	# smoothing_window = 1
	# stats_expected_sarsa = one_step_tree_backup(env, num_episodes)
	# rewards_expected_sarsa = pd.Series(stats_expected_sarsa.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	# cum_rwd = rewards_expected_sarsa
	# np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'One_Step_Tree_Backup' + '.npy', cum_rwd)
	# # plotting.plot_episode_stats(stats_expected_sarsa)
	# env.close()

	# print "Two Step Tree Backup"
	# env = WindyGridworldEnv()
	# num_episodes = 2000
	# smoothing_window = 1
	# stats_two_step_tree_backup = two_step_tree_backup(env, num_episodes)
	# rewards_two_step_tree_backup = pd.Series(stats_two_step_tree_backup.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	# cum_rwd = rewards_two_step_tree_backup
	# np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'Two_Step_Tree_Backup' + '.npy', cum_rwd)
	# # plotting.plot_episode_stats(stats_two_step_tree_backup)
	# env.close()

	# print "Three Step Tree Backup"
	# env = WindyGridworldEnv()
	# num_episodes = 2000
	# smoothing_window = 1
	# stats_three_step_tree_backup = three_step_tree_backup(env, num_episodes)
	# rewards_three_step_tree_backup = pd.Series(stats_three_step_tree_backup.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	# cum_rwd = rewards_three_step_tree_backup
	# np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'Three_Step_Tree_Backup' + '.npy', cum_rwd)
	# # plotting.plot_episode_stats(stats_three_step_tree_backup)
	# env.close()


	# print "Q(sigma) On Policy"
	# env = WindyGridworldEnv()
	# num_episodes = 2000
	# smoothing_window = 1
	# stats_q_sigma_on_policy = q_sigma_on_policy(env, num_episodes)
	# rewards_stats_q_sigma_on_policy = pd.Series(stats_q_sigma_on_policy.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	# cum_rwd = rewards_stats_q_sigma_on_policy
	# np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'Q_Sigma_On_Policy' + '.npy', cum_rwd)
	# # plotting.plot_episode_stats(stats_q_sigma_on_policy)
	# env.close()


	# print "Q(sigma) Off Policy"
	# env = WindyGridworldEnv()
	# num_episodes = 2000
	# smoothing_window = 1
	# stats_q_sigma_off_policy = Q_Sigma_Off_Policy(env, num_episodes)
	# rewards_stats_q_sigma_off_policy = pd.Series(stats_q_sigma_off_policy.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	# cum_rwd = rewards_stats_q_sigma_off_policy
	# np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'Q_Sigma_Off_Policy' + '.npy', cum_rwd)
	# # plotting.plot_episode_stats(stats_q_sigma_off_policy)
	# env.close()


	# print "Q(sigma) Off Policy 2 Step"
	# env = WindyGridworldEnv()
	# num_episodes = 2000
	# smoothing_window = 1
	# stats_q_sigma_off_policy_2_step = Q_Sigma_Off_Policy_2_Step(env, num_episodes)
	# rewards_stats_q_sigma_off_policy_2 = pd.Series(stats_q_sigma_off_policy_2_step.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	# cum_rwd = rewards_stats_q_sigma_off_policy_2
	# np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'Q_Sigma_Off_Policy_2_Step' + '.npy', cum_rwd)
	# # plotting.plot_episode_stats(stats_q_sigma_off_policy_2_step)
	# env.close()


	# print "Q(sigma) Off Policy 3 Step"
	# env = WindyGridworldEnv()
	# num_episodes = 2000
	# smoothing_window = 1
	# stats_q_sigma_off_policy_3_step = Q_Sigma_Off_Policy_3_Step(env, num_episodes)
	# rewards_stats_q_sigma_off_policy_3 = pd.Series(stats_q_sigma_off_policy_3_step.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	# cum_rwd = rewards_stats_q_sigma_off_policy_3
	# np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'Q_Sigma_Off_Policy_3_Step' + '.npy', cum_rwd)
	# # plotting.plot_episode_stats(stats_q_sigma_off_policy_3_step)
	# env.close()


	# print "SARSA(lambda)"
	# env = WindyGridworldEnv()
	# num_episodes = 2000
	# smoothing_window = 1
	# stats_sarsa_lambda = sarsa_lambda(env, num_episodes)
	# rewards_stats_sarsa_lambda = pd.Series(stats_sarsa_lambda.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	# cum_rwd = rewards_stats_sarsa_lambda
	# np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'Sarsa(lambda)' + '.npy', cum_rwd)
	# # plotting.plot_episode_stats(stats_sarsa_lambda)
	# env.close()



	# print "Watkins Q(lambda)"
	# env = WindyGridworldEnv()
	# num_episodes = 2000
	# smoothing_window = 1
	# stats_q_lambda = q_lambda_watkins(env, num_episodes)
	# rewards_stats_q_lambda = pd.Series(stats_q_lambda.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	# cum_rwd = rewards_stats_q_lambda
	# np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'Watkins Q(lambda)' + '.npy', cum_rwd)
	# # plotting.plot_episode_stats(stats_q_lambda)
	# env.close()


	# print "Naive Q(lambda)"
	# env = WindyGridworldEnv()
	# num_episodes = 2000
	# smoothing_window = 1
	# stats_q_lambda_naive = q_lambda_naive(env, num_episodes)
	# rewards_stats_q_naive = pd.Series(stats_q_lambda_naive.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	# cum_rwd = rewards_stats_q_naive
	# np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'Naive Q(lambda)' + '.npy', cum_rwd)
	# # plotting.plot_episode_stats(stats_q_lambda_naive)
	# env.close()


	print "Tree Backup(lambda)"
	env = WindyGridworldEnv()
	num_episodes = 2000
	smoothing_window = 1
	stats_tree_lambda = tree_backup_lambda(env, num_episodes)
	rewards_stats_tree_lambda = pd.Series(stats_tree_lambda.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
	cum_rwd = rewards_stats_tree_lambda
	np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'  + 'Tree Backup(lambda)' + '.npy', cum_rwd)
	# plotting.plot_episode_stats(stats_tree_lambda)
	env.close()
Example #10
import sys

if "../" not in sys.path:
    sys.path.append("../")

from lib.envs.windy_gridworld import WindyGridworldEnv

UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3

env = WindyGridworldEnv()

print(env.reset())
print(env.nS)
print(env.nA)

for i in range(env.nS):
    print("state" + str(i) + ":")
    print(env.P[i])
env.render()

print(env.step(RIGHT))
env.render()

print(env.step(RIGHT))
env.render()

print(env.step(UP))
env.render()
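env.P above is assumed to follow the discrete-environment convention {state: {action: [(prob, next_state, reward, done), ...]}}; a small sketch that walks the table, continuing from the env created above:

# Report transitions that reach a terminal state.
for state, actions in env.P.items():
    for action, transitions in actions.items():
        for prob, next_state, reward, done in transitions:
            if done:
                print("state %d, action %d -> terminal state %d" % (state, action, next_state))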
Example #11
def main():

    print "Q(sigma) On Policy Eligiblity Traces, Static Sigma"
    env = WindyGridworldEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_q_sigma_on_policy = q_sigma_on_policy_eligibility_traces_static_sigma(
        env, num_episodes)
    rewards_stats_q_sigma_on_policy = pd.Series(
        stats_q_sigma_on_policy.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_q_sigma_on_policy
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'
        + 'Q_Sigma_On_Policy_Eligibility_Traces_Static_Sigma' + '.npy',
        cum_rwd)
    # plotting.plot_episode_stats(stats_q_sigma_on_policy)
    env.close()

    print "Q(sigma) On Policy Eligiblity Traces, Dynamic Sigma"
    env = WindyGridworldEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_q_sigma_on_policy = q_sigma_on_policy_eligibility_traces_dynamic_sigma(
        env, num_episodes)
    rewards_stats_q_sigma_on_policy = pd.Series(
        stats_q_sigma_on_policy.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_q_sigma_on_policy
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'
        + 'Q_Sigma_On_Policy_Eligibility_Traces_Dynamic_Sigma' + '.npy',
        cum_rwd)
    # plotting.plot_episode_stats(stats_q_sigma_on_policy)
    env.close()

    print "Q(sigma) Off Policy Eligiblity Traces, Static Sigma"
    env = WindyGridworldEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_q_sigma_on_policy = q_sigma_off_policy_eligibility_traces_static_sigma(
        env, num_episodes)
    rewards_stats_q_sigma_on_policy = pd.Series(
        stats_q_sigma_on_policy.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_q_sigma_on_policy
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'
        + 'Q_Sigma_Off_Policy_Eligibility_Traces_Static_Sigma' + '.npy',
        cum_rwd)
    # plotting.plot_episode_stats(stats_q_sigma_on_policy)
    env.close()

    print "Q(sigma) Off Policy Eligiblity Traces, Dynamic Sigma"
    env = WindyGridworldEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_q_sigma_on_policy = q_sigma_off_policy_eligibility_traces_dynamic_sigma(
        env, num_episodes)
    rewards_stats_q_sigma_on_policy = pd.Series(
        stats_q_sigma_on_policy.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_q_sigma_on_policy
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/WindyGridWorld_Results/'
        + 'Q_Sigma_Off_Policy_Eligibility_Traces_Dynamic_Sigma' + '.npy',
        cum_rwd)
    # plotting.plot_episode_stats(stats_q_sigma_on_policy)
    env.close()
Example #12
                       epsilon,
                       action_type=QAction,
                       learning_rate=learning_rate)
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes))
        for i_episode in tqdm(range(num_episodes)):
            state_actions = set()
            state = self.env.reset()
            for t in itertools.count():
                action_state = state.get_next_action_state(
                    EGreedyPolicy(epsilon))
                next_state, reward, done, _ = self.env.step(
                    action_state.get_gym_action())
                if state not in state_actions:
                    state_actions.add(action_state)
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                action_state.update(reward, next_state.get_actions())
                if done:
                    break
                state = next_state

        return stats


if __name__ == '__main__':
    q_learning = QLearning(WindyGridworldEnv())
    stats = q_learning.run(200)
    plotting.plot_episode_stats(stats)
    q_learning.show_one_episode()
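Compared with the Sarsa excerpt in Example #8, the update here passes no behavior policy, consistent with Q-learning bootstrapping from the greedy value of the next state. An illustrative one-step tabular update, mirroring the Sarsa sketch earlier (names are not the repository's API):

def q_learning_update(Q, state, action, reward, next_state, done, alpha=0.5, gamma=1.0):
    # Off-policy TD(0): bootstrap from max_a Q(s', a), not the next action actually taken.
    # Q maps each state to a list of per-action values.
    target = reward if done else reward + gamma * max(Q[next_state])
    Q[state][action] += alpha * (target - Q[state][action])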