def main():
    print "Tree Backup Two Step"
    env = CliffWalkingEnv()
    Total_num_experiments = 5
    num_episodes = 20

    alpha = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1])

    Averaged_All_Rwd_Alpha = np.zeros(shape=(num_episodes, len(alpha)))
    Averaged_All_Error_Alpha = np.zeros(shape=(num_episodes, len(alpha)))

    for e in range(Total_num_experiments):
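        # One independent run per experiment: accumulate its per-episode
        # reward and error curves, which are averaged after the loop.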
        All_Rwd_Alpha, All_Error_Alpha = tree_backup_two_step(
            env, num_episodes)

        Averaged_All_Rwd_Alpha = Averaged_All_Rwd_Alpha + All_Rwd_Alpha
        Averaged_All_Error_Alpha = Averaged_All_Error_Alpha + All_Error_Alpha

    Averaged_All_Rwd_Alpha = np.true_divide(Averaged_All_Rwd_Alpha,
                                            Total_num_experiments)
    Averaged_All_Error_Alpha = np.true_divide(Averaged_All_Error_Alpha,
                                              Total_num_experiments)

    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/Tree_Backup_Results/'
        + 'Tree Backup_Two_Step_' + 'Reward_Alpha_' + '.npy',
        Averaged_All_Rwd_Alpha)
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/Tree_Backup_Results/'
        + 'Tree Backup_Two_Step_' + 'Error_Alpha_' + '.npy',
        Averaged_All_Error_Alpha)

    env.close()
def main():
    print "Adaptive Q(sigma) On Policy"
    env = CliffWalkingEnv()
    Total_num_experiments = 10
    num_episodes = 2000

    alpha = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1])
    sigma_initialised = np.array([1, 0.75, 0.5, 0.25, 0])

    Averaged_All_Rwd_Sigma = np.zeros(shape=(num_episodes,
                                             len(sigma_initialised)))
    Averaged_All_Rwd_Sigma_Alpha = np.zeros(shape=(len(sigma_initialised),
                                                   len(alpha)))

    Averaged_All_Error_Sigma = np.zeros(shape=(num_episodes,
                                               len(sigma_initialised)))
    Averaged_All_Error_Sigma_Alpha = np.zeros(shape=(len(sigma_initialised),
                                                     len(alpha)))

    for e in range(Total_num_experiments):
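        # Accumulate per-episode curves indexed by sigma and the
        # sigma-by-alpha summary tables returned by each run.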
        All_Rwd_Sigma, All_Error_Sigma, All_Rwd_Sigma_Alpha, All_Error_Sigma_Alpha = adaptive_q_sigma_on_policy(
            env, num_episodes)

        Averaged_All_Rwd_Sigma = Averaged_All_Rwd_Sigma + All_Rwd_Sigma
        Averaged_All_Rwd_Sigma_Alpha = Averaged_All_Rwd_Sigma_Alpha + All_Rwd_Sigma_Alpha

        Averaged_All_Error_Sigma = Averaged_All_Error_Sigma + All_Error_Sigma
        Averaged_All_Error_Sigma_Alpha = Averaged_All_Error_Sigma_Alpha + All_Error_Sigma_Alpha

    Averaged_All_Rwd_Sigma = np.true_divide(Averaged_All_Rwd_Sigma,
                                            Total_num_experiments)
    Averaged_All_Rwd_Sigma_Alpha = np.true_divide(Averaged_All_Rwd_Sigma_Alpha,
                                                  Total_num_experiments)
    Averaged_All_Error_Sigma = np.true_divide(Averaged_All_Error_Sigma,
                                              Total_num_experiments)
    Averaged_All_Error_Sigma_Alpha = np.true_divide(
        Averaged_All_Error_Sigma_Alpha, Total_num_experiments)

    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/Adaptive_OnPolicy_Q_Sigma_Results/'
        + 'Adaptive_On_Policy_Q_sigma' + 'Reward_Sigma_' + '.npy',
        Averaged_All_Rwd_Sigma)
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/Adaptive_OnPolicy_Q_Sigma_Results/'
        + 'Adaptive_On_Policy_Q_sigma' + 'Sigma_Alpha' + '.npy',
        Averaged_All_Rwd_Sigma_Alpha)
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/Adaptive_OnPolicy_Q_Sigma_Results/'
        + 'Adaptive_On_Policy_Q_sigma' + 'Error_Sigma_' + '.npy',
        Averaged_All_Error_Sigma)
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/Adaptive_OnPolicy_Q_Sigma_Results/'
        + 'Adaptive_On_Policy_Q_sigma' + 'Error_Sigma_Alpha' + '.npy',
        Averaged_All_Error_Sigma_Alpha)

    # plotting.plot_episode_stats(stats_tree_lambda)
    env.close()
Example No. 3
def main():
    print "Tree Backup(lambda)"
    env = CliffWalkingEnv()
    Total_num_experiments = 10
    num_episodes = 1000

    lambda_param = np.array(
        [0, 0.1, 0.15, 0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.975, 0.99, 1])
    alpha = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1])

    Averaged_All_Rwd_Lambda = np.zeros(shape=(num_episodes, len(lambda_param)))
    Averaged_All_Lambda_Alpha = np.zeros(shape=(len(lambda_param), len(alpha)))
    Averaged_All_Error_Lambda = np.zeros(shape=(num_episodes,
                                                len(lambda_param)))
    Averaged_All_Error_Lambda_Alpha = np.zeros(shape=(len(lambda_param),
                                                      len(alpha)))

    for e in range(Total_num_experiments):
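        # Accumulate per-episode curves indexed by lambda and the
        # lambda-by-alpha summary tables returned by each run.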
        All_Rwd_Lambda, All_Lambda_Alpha, All_Error_Lambda, All_Error_Lambda_Alpha = tree_backup_lambda(
            env, num_episodes)

        Averaged_All_Rwd_Lambda = Averaged_All_Rwd_Lambda + All_Rwd_Lambda
        Averaged_All_Lambda_Alpha = Averaged_All_Lambda_Alpha + All_Lambda_Alpha
        Averaged_All_Error_Lambda = Averaged_All_Error_Lambda + All_Error_Lambda
        Averaged_All_Error_Lambda_Alpha = Averaged_All_Error_Lambda_Alpha + All_Error_Lambda_Alpha

    Averaged_All_Rwd_Lambda = np.true_divide(Averaged_All_Rwd_Lambda,
                                             Total_num_experiments)
    Averaged_All_Lambda_Alpha = np.true_divide(Averaged_All_Lambda_Alpha,
                                               Total_num_experiments)
    Averaged_All_Error_Lambda = np.true_divide(Averaged_All_Error_Lambda,
                                               Total_num_experiments)
    Averaged_All_Error_Lambda_Alpha = np.true_divide(
        Averaged_All_Error_Lambda_Alpha, Total_num_experiments)

    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/Tree_Backup_Results/'
        + 'Tree Backup(lambda)_' + 'Reward_Lambda_' + '.npy',
        Averaged_All_Rwd_Lambda)
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/Tree_Backup_Results/'
        + 'Tree Backup(lambda)_' + 'Lambda_Alpha' + '.npy',
        Averaged_All_Lambda_Alpha)
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/Tree_Backup_Results/'
        + 'Tree Backup(lambda)_' + 'Error_Lambda_' + '.npy',
        Averaged_All_Error_Lambda)
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/Tree_Backup_Results/'
        + 'Tree Backup(lambda)_' + 'Error_Lambda_Alpha' + '.npy',
        Averaged_All_Error_Lambda_Alpha)

    # plotting.plot_episode_stats(stats_tree_lambda)
    env.close()
Example No. 4
def get_env(argument):
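    # Map a lowercase name alias to an environment instance; unknown
    # names fall through switcher.get() and return None.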
    switcher = {
        "cliffwalking": CliffWalkingEnv(),
        "cliffwalkingenv": CliffWalkingEnv(),
        "cliff": CliffWalkingEnv(),
        "cliffs": CliffWalkingEnv(),
        "windygridworld": WindyGridworldEnv(),
        "windygridworldenv": WindyGridworldEnv(),
        "windygrid": WindyGridworldEnv(),
        "windy": WindyGridworldEnv(),
        "simplemaze": SimpleRoomsEnv(),
        "simplegrid": SimpleRoomsEnv(),
        "simplegridworld": SimpleRoomsEnv(),
        "simplegridworldenv": SimpleRoomsEnv(),
        "simpleroomsenv": SimpleRoomsEnv(),
        "simpleroom": SimpleRoomsEnv(),
        "maze": SimpleRoomsEnv(),
        "grid": SimpleRoomsEnv()
    }
    return switcher.get(argument)
Example No. 5
def main():

	print "Tree Backup(lambda)"
	env = CliffWalkingEnv()
	Total_num_experiments = 2
	num_episodes = 30

	theta = np.zeros(shape=(400, env.action_space.n))

	lambda_param = np.array([0.1, 0.15, 0.2, 0.4, 0.6, 0.8, 0.9, 1])
	alpha = np.array([0.1, 0.2, 0.4, 0.5])

	Averaged_All_Rwd_Lambda = np.zeros(shape=(num_episodes, len(lambda_param)))
	Averaged_All_Lambda_Alpha = np.zeros(shape=(len(lambda_param), len(alpha)))
	Averaged_All_Error_Lambda = np.zeros(shape=(num_episodes, len(lambda_param)))	
	Averaged_All_Error_Lambda_Alpha = np.zeros(shape=(len(lambda_param), len(alpha)))


	for e in range(Total_num_experiments):
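		# Unlike the tabular version above, this tree_backup_lambda also
		# takes the linear-approximation weights theta.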
		All_Rwd_Lambda, All_Lambda_Alpha, All_Error_Lambda, All_Error_Lambda_Alpha = tree_backup_lambda(env, theta, num_episodes)

		Averaged_All_Rwd_Lambda = Averaged_All_Rwd_Lambda + All_Rwd_Lambda
		Averaged_All_Lambda_Alpha = Averaged_All_Lambda_Alpha + All_Lambda_Alpha
		Averaged_All_Error_Lambda = Averaged_All_Error_Lambda + All_Error_Lambda
		Averaged_All_Error_Lambda_Alpha = Averaged_All_Error_Lambda_Alpha + All_Error_Lambda_Alpha		

	Averaged_All_Rwd_Lambda = np.true_divide(Averaged_All_Rwd_Lambda, Total_num_experiments)
	Averaged_All_Lambda_Alpha = np.true_divide(Averaged_All_Lambda_Alpha, Total_num_experiments)
	Averaged_All_Error_Lambda = np.true_divide(Averaged_All_Error_Lambda, Total_num_experiments)
	Averaged_All_Error_Lambda_Alpha = np.true_divide(Averaged_All_Error_Lambda_Alpha, Total_num_experiments)		


	np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/Cliff_Walking_Results/'  + 'Tree Backup(lambda)_RBF_' +  'Reward_Lambda_' + '.npy', Averaged_All_Rwd_Lambda)
	np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/Cliff_Walking_Results/'  + 'Tree Backup(lambda)_RBF_' +  'Lambda_Alpha' + '.npy', Averaged_All_Lambda_Alpha)
	np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/Cliff_Walking_Results/'  + 'Tree Backup(lambda)_RBF_' +  'Error_Lambda_' + '.npy', Averaged_All_Error_Lambda)
	np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/Cliff_Walking_Results/'  + 'Tree Backup(lambda)_RBF_' +  'Error_Lambda_Alpha' + '.npy', Averaged_All_Error_Lambda_Alpha)

	# plotting.plot_episode_stats(stats_tree_lambda)
	env.close()
Example No. 6
def getEnv(domain):
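    # Return one of the bundled environments for a known domain name,
    # otherwise fall back to gym.make() for any registered Gym id.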
    if domain == "Blackjack":
        return BlackjackEnv()
    elif domain == "Gridworld":
        return GridworldEnv()
    elif domain == "CliffWalking":
        return CliffWalkingEnv()
    elif domain == "WindyGridworld":
        return WindyGridworldEnv()
    else:
        try:
            return gym.make(domain)
        except Exception:
            raise ValueError(
                "Domain must be a valid (and installed) Gym environment")
Example No. 7
                if update_time >= 0:
                    action_state_update_time = env_list[update_time][1]
                    evaluated_state_index = update_time + self.n - 1
                    if evaluated_state_index < len(states):
                        state_update_time = states[evaluated_state_index]
                        action_state_update_time.update(
                            0,
                            state_update_time.get_actions(),
                            time_step=update_time)
                    else:
                        action_state_update_time.update(0,
                                                        None,
                                                        time_step=update_time)
                if update_time == T - 1:
                    a_ss = [a_s for _, a_s in env_list]
                    for a_s in a_ss:
                        a_s.clear_reward_calculator()
                    break
        return stats


if __name__ == '__main__':
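    # NStepSarsa with n=1 (i.e. one-step SARSA) on CliffWalking,
    # using a constant learning rate of 1.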
    q_learning = NStepSarsa(CliffWalkingEnv(), 1)
    stats = q_learning.run(200, get_learning_rate=lambda x1, x2: 1)
    plotting.plot_episode_stats(stats)
    q_learning.show_one_episode()
    # q_learning = NStepSarsa(WindyGridworldEnv(), 8)
    # stats = q_learning.run(50000)
    # plotting.plot_episode_stats(stats)
    # q_learning.show_one_episode()
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 27 10:16:40 2016
Cliff_Env_Playground.py
@author: guy
"""
import gym
import numpy as np
import sys

if "../" not in sys.path:
    sys.path.append("../")

from lib.envs.cliff_walking import CliffWalkingEnv

env = CliffWalkingEnv()
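# Action encoding: 0 = UP, 1 = RIGHT, 2 = DOWN, 3 = LEFT
# (matching the UP/RIGHT/DOWN/LEFT constants defined in a later example).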

#%%

print(env.reset())
env.render()

print(env.step(0))
env.render()

print(env.step(1))
env.render()

print(env.step(1))
env.render()
Example No. 9
                       epsilon,
                       action_type=QAction,
                       learning_rate=learning_rate)
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes))
        for i_episode in tqdm(range(num_episodes)):
            state_actions = set()
            state = self.env.reset()
            for t in itertools.count():
                action_state = state.get_next_action_state(
                    EGreedyPolicy(epsilon))
                next_state, reward, done, _ = self.env.step(
                    action_state.get_gym_action())
                if state not in state_actions:
                    state_actions.add(action_state)
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                action_state.update(reward, next_state.get_actions())
                if done:
                    break
                state = next_state

        return stats


if __name__ == '__main__':
    q_learning = QLearning(CliffWalkingEnv())
    stats = q_learning.run(200)
    plotting.plot_episode_stats(stats)
    q_learning.show_one_episode()
Example No. 10
        for _ in tqdm(range(num_episodes)):
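            # Generate one episode with an epsilon-greedy policy, caching each
            # reward so every visited action-state can be updated towards its
            # Monte Carlo return once the episode ends.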
            action_states = []
            state = self.env.reset()
            states = [state]
            for t in range(100):
                action_state = state.get_next_action_state(
                    EGreedyPolicy(epsilon))
                next_state, reward, done, _ = self.env.step(
                    action_state.get_gym_action())
                action_state.add_reward_calculator(t)
                if state not in states:
                    states.append(state)
                if action_state not in action_states:
                    action_states.append(action_state)
                for a_s in action_states:
                    a_s.cache_reward(reward, t)
                if done:
                    break
                state = next_state
            for i, s in enumerate(action_states):
                s.update(0, [], time_step=i)
            for a_s in action_states:
                a_s.clear_reward_calculator()


if __name__ == '__main__':
    q_learning = McOnline(CliffWalkingEnv())
    q_learning.run(500000, learning_rate=1)
    # plotting.plot_episode_stats(stats)
    q_learning.show_one_episode()
Example No. 11
    @classmethod
    def setUpClass(cls):
        np.random.seed(0)
        env = CliffWalkingEnv()
        cls.Q, cls.stats = q_learning(env, 500)
def main():
    env = CliffWalkingEnv()

    Q, stats = q_learning(env, 500)
    plotting.plot_episode_stats(stats)
Example No. 13
def main():
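    # Run each algorithm on CliffWalking for 2000 episodes, smooth the
    # per-episode rewards with a rolling mean, and save the curve to a .npy
    # file. Earlier baselines (SARSA, Q-Learning, Double Q-Learning) and the
    # eligibility-trace variants are left commented out.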

    # print "SARSA"
    # env = CliffWalkingEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_sarsa = sarsa(env, num_episodes)
    # rewards_sarsa = pd.Series(stats_sarsa.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_sarsa
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'  + 'SARSA' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_sarsa)
    # env.close()

    # print "Q Learning"
    # env = CliffWalkingEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_q_learning = q_learning(env, num_episodes)
    # rewards_q_learning = pd.Series(stats_q_learning.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_q_learning
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'  + 'Q_Learning' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_q_learning)
    # env.close()

    # print "Double Q Learning"
    # env = CliffWalkingEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_double_q_learning = double_q_learning(env, num_episodes)
    # rewards_double_q_learning = pd.Series(stats_double_q_learning.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_double_q_learning
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'  + 'Double_Q_Learning' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_double_q_learning)
    # env.close()

    print "One Step Tree Backup (Expected SARSA)"
    env = CliffWalkingEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_expected_sarsa = one_step_tree_backup(env, num_episodes)
    rewards_expected_sarsa = pd.Series(
        stats_expected_sarsa.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_expected_sarsa
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'
        + 'One_Step_Tree_Backup' + '.npy', cum_rwd)
    # plotting.plot_episode_stats(stats_expected_sarsa)
    env.close()

    print "Two Step Tree Backup"
    env = CliffWalkingEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_two_step_tree_backup = two_step_tree_backup(env, num_episodes)
    rewards_two_step_tree_backup = pd.Series(
        stats_two_step_tree_backup.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_two_step_tree_backup
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'
        + 'Two_Step_Tree_Backup' + '.npy', cum_rwd)
    # plotting.plot_episode_stats(stats_two_step_tree_backup)
    env.close()

    print "Three Step Tree Backup"
    env = CliffWalkingEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_three_step_tree_backup = three_step_tree_backup(env, num_episodes)
    rewards_three_step_tree_backup = pd.Series(
        stats_three_step_tree_backup.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_three_step_tree_backup
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'
        + 'Three_Step_Tree_Backup' + '.npy', cum_rwd)
    # plotting.plot_episode_stats(stats_three_step_tree_backup)
    env.close()

    print "Q(sigma) On Policy"
    env = CliffWalkingEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_q_sigma_on_policy = q_sigma_on_policy(env, num_episodes)
    rewards_stats_q_sigma_on_policy = pd.Series(
        stats_q_sigma_on_policy.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_q_sigma_on_policy
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'
        + 'Q_Sigma_On_Policy' + '.npy', cum_rwd)
    # plotting.plot_episode_stats(stats_q_sigma_on_policy)
    env.close()

    print "Q(sigma) Off Policy"
    env = CliffWalkingEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_q_sigma_off_policy = Q_Sigma_Off_Policy(env, num_episodes)
    rewards_stats_q_sigma_off_policy = pd.Series(
        stats_q_sigma_off_policy.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_q_sigma_off_policy
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'
        + 'Q_Sigma_Off_Policy' + '.npy', cum_rwd)
    # plotting.plot_episode_stats(stats_q_sigma_off_policy)
    env.close()

    print "Q(sigma) Off Policy 2 Step"
    env = CliffWalkingEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_q_sigma_off_policy_2_step = Q_Sigma_Off_Policy_2_Step(
        env, num_episodes)
    rewards_stats_q_sigma_off_policy_2 = pd.Series(
        stats_q_sigma_off_policy_2_step.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_q_sigma_off_policy_2
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'
        + 'Q_Sigma_Off_Policy_2_Step' + '.npy', cum_rwd)
    # plotting.plot_episode_stats(stats_q_sigma_off_policy_2_step)
    env.close()

    print "Q(sigma) Off Policy 3 Step"
    env = CliffWalkingEnv()
    num_episodes = 2000
    smoothing_window = 1
    stats_q_sigma_off_policy_3_step = Q_Sigma_Off_Policy_3_Step(
        env, num_episodes)
    rewards_stats_q_sigma_off_policy_3 = pd.Series(
        stats_q_sigma_off_policy_3_step.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_q_sigma_off_policy_3
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'
        + 'Q_Sigma_Off_Policy_3_Step' + '.npy', cum_rwd)
    # plotting.plot_episode_stats(stats_q_sigma_off_policy_3_step)
    env.close()

    # print "SARSA(lambda)"
    # env = CliffWalkingEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_sarsa_lambda = sarsa_lambda(env, num_episodes)
    # rewards_stats_sarsa_lambda = pd.Series(stats_sarsa_lambda.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_stats_sarsa_lambda
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'  + 'Sarsa(lambda)' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_sarsa_lambda)
    # env.close()

    # print "Watkins Q(lambda)"
    # env = CliffWalkingEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_q_lambda = q_lambda_watkins(env, num_episodes)
    # rewards_stats_q_lambda = pd.Series(stats_q_lambda.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_stats_q_lambda
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'  + 'Watkins Q(lambda)' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_q_lambda)
    # env.close()

    # print "Naive Q(lambda)"
    # env = CliffWalkingEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_q_lambda_naive = q_lambda_naive(env, num_episodes)
    # rewards_stats_q_naive = pd.Series(stats_q_lambda_naive.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_stats_q_naive
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'  + 'Naive Q(lambda)' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_q_lambda_naive)
    # env.close()

    # print "Tree Backup(lambda)"
    # env = CliffWalkingEnv()
    # num_episodes = 2000
    # smoothing_window = 1
    # stats_tree_lambda = tree_backup_lambda(env, num_episodes)
    # rewards_stats_tree_lambda = pd.Series(stats_tree_lambda.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_stats_tree_lambda
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'  + 'Tree Backup(lambda)' + '.npy', cum_rwd)
    # # plotting.plot_episode_stats(stats_tree_lambda)
    # env.close()
    """
	DOES NOT WORK FULLY YET
	"""
    print "Q(sigma)(lambda)"
    num_episodes = 2000
    smoothing_window = 1
    stats_q_sigma_lambda = q_sigma_lambda(env, num_episodes)
    rewards_stats_q_sigma_lambda = pd.Series(
        stats_q_sigma_lambda.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_stats_q_sigma_lambda
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Tabular/CliffWalking_Results/'
        + 'Q(sigma_lambda)' + '.npy', cum_rwd)
    plotting.plot_episode_stats(stats_q_sigma_lambda)
    env.close()
Example No. 14
            for action_state in reversed(action_states):
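                # Backward pass over the episode: update C and Q with the
                # importance weight w and stop once the taken action is no
                # longer the greedy one; w is rescaled by 1/0.5, apparently a
                # hard-coded behaviour-policy action probability.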
                state, action_state, reward = action_state
                g = discount_factor * g + reward
                action_state.update_c(w)
                action_state.update_q(g, w)
                action = state.get_next_action_state(GreedyPolicy())
                if action != action_state:
                    break
                w = w * (1 / 0.5)
        return state

    def generate_one_episode_action_states_by_policy(self, policy):
        actions = []
        state = self.env.reset()
        for t in range(100):
            action = state.get_next_action_state(policy)
            next_state, reward, done, _ = self.env.step(
                action.get_gym_action())
            actions.append((state, action, reward))
            if done:
                break
            state = next_state
        return actions


if __name__ == '__main__':
    q_learning = McOfflinePolicy(CliffWalkingEnv())
    q_learning.run(500000)
    # plotting.plot_episode_stats(stats)
    q_learning.show_one_episode()
Example No. 15
import sys

if "../" not in sys.path:
    sys.path.append("../")

from lib.envs.cliff_walking import CliffWalkingEnv

UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3

env = CliffWalkingEnv()

print(env.reset())
env.render()

print(env.step(UP))
env.render()

print(env.step(RIGHT))
env.render()

print(env.step(RIGHT))
env.render()

print(env.step(DOWN))
env.render()
Example No. 16
import sys

if "../" not in sys.path:
    sys.path.append("../")

from lib.envs.cliff_walking import CliffWalkingEnv
from lib import plotting
from agents import QLearningAgent
import numpy as np

env_shape = (4, 12)
start_position = (3, 0)
end_positions = [(3, 11)]
cliff = tuple((3, i + 1) for i in range(10))

env = CliffWalkingEnv(env_shape, start_position, end_positions, cliff)
n_actions = env.action_space.n
agent = QLearningAgent(alpha=0.5,
                       epsilon=0.1,
                       discount=0.99,
                       n_actions=n_actions)

agent.train(env,
            n_episodes=1000,
            t_max=10**3,
            verbose=True,
            verbose_per_episode=500)

plotting.draw_policy(env, agent)
plotting.plot_episode_stats(agent)
Example No. 17
import matplotlib
import tensorflow as tf
import collections

from lib.envs.cliff_walking import CliffWalkingEnv
from lib import plotting

matplotlib.style.use('ggplot')

# env = CliffWalkingEnv()

from collections import defaultdict
from lib.envs.windy_gridworld import WindyGridworldEnv

env = CliffWalkingEnv()


class PolicyEstimator:
    """
    Policy function approximator.
    """
    def __init__(self, learning_rate=0.01, scope="policy_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.int32, [], "state")
            self.action = tf.placeholder(dtype=tf.int32, name="action")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            # This is just a table-lookup estimator
            state_one_hot = tf.one_hot(self.state,
                                       int(env.observation_space.n))
Example No. 18
from lib.envs.cliff_walking import CliffWalkingEnv

shape = (4, 12)
start = (3, 0)
end = [(3, 11)]
cliff = tuple((3, i + 1) for i in range(11))

env = CliffWalkingEnv(shape, start, end, cliff)
env.render()