Example #1
env_info = {}  # the environment needs no extra settings here (assumed)
agent_info = {
    "iht_size": 4096,
    "num_tilings": 8,
    "num_tiles": 8,
    "actor_step_size": 1e-1,
    "critic_step_size": 1e-0,
    "avg_reward_step_size": 1e-2,
    "num_actions": 3,
    "seed": 99,
}

rl_glue = RLGlue(PendulumEnvironment, ActorCriticSoftmaxAgent)
rl_glue.rl_init(agent_info, env_info)

# start env/agent
rl_glue.rl_start()
rl_glue.rl_step()

# simple alias
agent = rl_glue.agent

print("agent next_action: {}".format(agent.last_action))
print("agent avg reward: {}\n".format(agent.avg_reward))

assert agent.last_action == 1
assert agent.avg_reward == -0.03139092653589793

print("agent first 10 values of actor weights[0]: \n{}\n".format(
    agent.actor_w[0][:10]))
print("agent first 10 values of actor weights[1]: \n{}\n".format(
    agent.actor_w[1][:10]))
print("agent first 10 values of actor weights[2]: \n{}\n".format(
Example #2
def run_experiment(environment, agent, environment_parameters,
                   agent_parameters, experiment_parameters):

    rl_glue = RLGlue(environment, agent)

    # sweep agent parameters
    for num_tilings in agent_parameters['num_tilings']:
        for num_tiles in agent_parameters["num_tiles"]:
            for actor_ss in agent_parameters["actor_step_size"]:
                for critic_ss in agent_parameters["critic_step_size"]:
                    for avg_reward_ss in agent_parameters[
                            "avg_reward_step_size"]:

                        env_info = {}
                        agent_info = {
                            "num_tilings": num_tilings,
                            "num_tiles": num_tiles,
                            "actor_step_size": actor_ss,
                            "critic_step_size": critic_ss,
                            "avg_reward_step_size": avg_reward_ss,
                            "num_actions": agent_parameters["num_actions"],
                            "iht_size": agent_parameters["iht_size"]
                        }

                        # results to save
                        return_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))
                        exp_avg_reward_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))

                        # using tqdm we visualize progress bars
                        for run in tqdm(
                                range(1,
                                      experiment_parameters["num_runs"] + 1)):
                            env_info["seed"] = run
                            agent_info["seed"] = run

                            rl_glue.rl_init(agent_info, env_info)
                            rl_glue.rl_start()

                            num_steps = 0
                            total_return = 0.
                            return_arr = []

                            # exponential average reward without initial bias
                            exp_avg_reward = 0.0
                            exp_avg_reward_ss = 0.01
                            exp_avg_reward_normalizer = 0

                            while num_steps < experiment_parameters[
                                    'max_steps']:
                                num_steps += 1

                                rl_step_result = rl_glue.rl_step()

                                reward = rl_step_result[0]
                                total_return += reward
                                return_arr.append(reward)
                                avg_reward = rl_glue.rl_agent_message(
                                    "get avg reward")

                                exp_avg_reward_normalizer = exp_avg_reward_normalizer + exp_avg_reward_ss * (
                                    1 - exp_avg_reward_normalizer)
                                ss = exp_avg_reward_ss / exp_avg_reward_normalizer
                                exp_avg_reward += ss * (reward -
                                                        exp_avg_reward)

                                return_per_step[run - 1][num_steps -
                                                         1] = total_return
                                exp_avg_reward_per_step[run -
                                                        1][num_steps -
                                                           1] = exp_avg_reward

                        if not os.path.exists('results'):
                            os.makedirs('results')

                        save_name = "ActorCriticSoftmax_tilings_{}_tiledim_{}_actor_ss_{}_critic_ss_{}_avg_reward_ss_{}".format(
                            num_tilings, num_tiles, actor_ss, critic_ss,
                            avg_reward_ss)
                        total_return_filename = "results/{}_total_return.npy".format(
                            save_name)
                        exp_avg_reward_filename = "results/{}_exp_avg_reward.npy".format(
                            save_name)

                        np.save(total_return_filename, return_per_step)
                        np.save(exp_avg_reward_filename,
                                exp_avg_reward_per_step)
Example #3
def run_experiment(environment, agent, environment_parameters,
                   agent_parameters, experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # sweep agent parameters
    for num_tilings in agent_parameters['num_tilings']:
        for num_tiles in agent_parameters["num_tiles"]:
            for update_ss in agent_parameters["update_step_size"]:
                for avg_reward_ss in agent_parameters["avg_reward_step_size"]:
                    for epsilon in agent_parameters["epsilon"]:
                        env_info = {}
                        agent_info = {
                            "num_tilings": num_tilings,
                            "num_tiles": num_tiles,
                            "alpha": update_ss,
                            "avg_reward_step_size": avg_reward_ss,
                            "epsilon": epsilon,
                            "num_actions": agent_parameters["num_actions"],
                            "iht_size": agent_parameters["iht_size"]
                        }
                        # results to save
                        print(agent_info)
                        return_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))
                        exp_avg_reward_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))
                        # using tqdm we visualize progress bars
                        avg_reward_list = []
                        avg_reward = -10000
                        for run in tqdm(
                                range(1,
                                      experiment_parameters["num_runs"] + 1)):
                            env_info["seed"] = run
                            agent_info["seed"] = run
                            rl_glue.rl_init(agent_info, env_info)
                            rl_glue.rl_start()
                            num_steps = 0
                            total_return = 0.
                            #return_arr = []
                            # exponential average reward without initial bias
                            exp_avg_reward = 0.0
                            exp_avg_reward_ss = 0.01
                            exp_avg_reward_normalizer = 0
                            while num_steps < experiment_parameters[
                                    'max_steps']:
                                num_steps += 1
                                if experiment_parameters[
                                        'max_steps'] - num_steps == 10000:
                                    rl_glue.change_agent()
                                rl_step_result = rl_glue.rl_step()
                                reward = rl_step_result[0]
                                total_return += reward
                                #return_arr.append(reward)
                                avg_reward = rl_glue.rl_agent_message(
                                    "get avg reward")
                                exp_avg_reward_normalizer = exp_avg_reward_normalizer + exp_avg_reward_ss * (
                                    1 - exp_avg_reward_normalizer)
                                ss = exp_avg_reward_ss / exp_avg_reward_normalizer
                                exp_avg_reward += ss * (reward -
                                                        exp_avg_reward)
                                return_per_step[run - 1][num_steps -
                                                         1] = total_return
                                exp_avg_reward_per_step[run -
                                                        1][num_steps -
                                                           1] = exp_avg_reward
                            avg_reward_list.append(avg_reward)
                        print(np.average(avg_reward_list))
                        if not os.path.exists('results_sarsa'):
                            os.makedirs('results_sarsa')

                        save_name = "semi-gradient_sarsa_tilings_{}_tiledim_{}_update_ss_{}_epsilon_ss_{}_avg_reward_ss_{}_max_steps_{}".format(
                            num_tilings, num_tiles, update_ss, epsilon,
                            avg_reward_ss, experiment_parameters['max_steps'])
                        total_return_filename = "results_sarsa/{}_total_return.npy".format(
                            save_name)
                        exp_avg_reward_filename = "results_sarsa/{}_exp_avg_reward.npy".format(
                            save_name)

                        np.save(total_return_filename, return_per_step)
                        np.save(exp_avg_reward_filename,
                                exp_avg_reward_per_step)
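For reference, run_experiment above expects a list for every swept parameter; a minimal call sketch (all values, and the SarsaAgent agent class name, are illustrative placeholders rather than taken from the original):

experiment_parameters = {"num_runs": 5, "max_steps": 20000}
environment_parameters = {}
agent_parameters = {
    "num_tilings": [32],
    "num_tiles": [8],
    "update_step_size": [0.5],
    "avg_reward_step_size": [0.0625],
    "epsilon": [0.1],
    "num_actions": 3,
    "iht_size": 4096,
}
run_experiment(PendulumEnvironment, SarsaAgent, environment_parameters,
               agent_parameters, experiment_parameters)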
Example #4
def experiment(num_runs, max_steps):

    agent = Agent()
    environment = Environment()
    rlg = RLGlue(environment, agent)

    optimal_actions_optimistic = np.zeros(max_steps)
    optimal_actions_realistic = np.zeros(max_steps)

    for run in range(num_runs):

        # initialize RL-Glue
        rlg.rl_init()
        _, last_action = rlg.rl_start()

        optimal = environment.env_optimal_action()

        if last_action == optimal:
            optimal_actions_optimistic[0] += 1

        for i in range(1, max_steps):
            _, _, last_action, _ = rlg.rl_step()

            if last_action == optimal:
                optimal_actions_optimistic[i] += 1

        print("\rCurrent: %i" % run, end="")

    for run in range(num_runs):

        # initialize RL-Glue
        rlg.rl_init()
        agent.set_epsilon(0.1)
        agent.set_q(0)
        _, last_action = rlg.rl_start()

        optimal = environment.env_optimal_action()

        if last_action == optimal:
            optimal_actions_realistic[0] += 1

        for i in range(1, max_steps):
            _, _, last_action, _ = rlg.rl_step()

            if last_action == optimal:
                optimal_actions_realistic[i] += 1

        print("\rCurrent: %i" % run, end="")

    optimal_actions_optimistic /= num_runs
    optimal_actions_realistic /= num_runs

    fig, ax = plt.subplots()
    ax.plot(np.arange(1, max_steps + 1),
            optimal_actions_optimistic,
            'r',
            label='optimistic, greedy, Q1 = 0.5, epsilon = 0')
    ax.plot(np.arange(1, max_steps + 1),
            optimal_actions_realistic,
            'b',
            label='realistic, epsilon-greedy, Q1 = 0, epsilon = 0.1')
    ax.legend()
    plt.xticks([1, 200, 400, 600, 800, 1000])
    plt.show()
Example #5
# assumed setup for this fragment (not in the original excerpt): `agents` maps the
# algorithm names below to agent classes, `env` is the environment class, and
# agent_info, env_info, num_runs and num_episodes are assumed to be defined beforehand
all_reward_sums = {}
all_state_visits = {}
for algorithm in ["Q-learning", "Expected Sarsa"]:
    all_reward_sums[algorithm] = []
    all_state_visits[algorithm] = []
    for run in tqdm(range(num_runs)):
        agent_info["seed"] = run
        rl_glue = RLGlue(env, agents[algorithm])
        rl_glue.rl_init(agent_info, env_info)

        reward_sums = []
        state_visits = np.zeros(48)
        #         last_episode_total_reward = 0
        for episode in range(num_episodes):
            if episode < num_episodes - 10:
                # Runs an episode
                rl_glue.rl_episode(0)
            else:
                # Runs an episode while keeping track of visited states
                state, action = rl_glue.rl_start()
                state_visits[state] += 1
                is_terminal = False
                while not is_terminal:
                    reward, state, action, is_terminal = rl_glue.rl_step()
                    state_visits[state] += 1

            reward_sums.append(rl_glue.rl_return())
#             last_episode_total_reward = rl_glue.rl_return()

        all_reward_sums[algorithm].append(reward_sums)
        all_state_visits[algorithm].append(state_visits)

# save results
import os
import shutil
os.makedirs('results', exist_ok=True)
np.save('results/q_learning.npy', all_reward_sums['Q-learning'])
np.save('results/expected_sarsa.npy', all_reward_sums['Expected Sarsa'])
shutil.make_archive('results', 'zip', '.', 'results')
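The arrays saved above can be read back for a quick side-by-side look; a minimal sketch assuming the same working directory:

import numpy as np
import matplotlib.pyplot as plt

for name, path in [("Q-learning", "results/q_learning.npy"),
                   ("Expected Sarsa", "results/expected_sarsa.npy")]:
    reward_sums = np.load(path)                       # shape: (runs, episodes)
    plt.plot(np.mean(reward_sums, axis=0), label=name)
plt.xlabel("Episode")
plt.ylabel("Sum of rewards (averaged over runs)")
plt.legend()
plt.show()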
Example #6
def main():
    # Seed rng's for consistent testing
    random.seed(0)
    np.random.seed(0)

    # Generate agent, environment and RLGlue
    env = Environment()
    agent = Agent(env.get_actions())
    rlglue = RLGlue(env, agent)
    del agent, env

    # Configure experiment
    num_eps = 100000
    # initialize rlglue
    rlglue.rl_init()

    avg_rewards = []
    avg_reward = 0
    max_reward = 0
    best_policy = None
    # Run through each episode
    #rlglue.rl_env_message('renderON')
    #for ep in range(num_eps):
    ep = 0
    while ep < num_eps:
        ep += 1
        #if ep % int(num_eps/10) == 0:
        #print('ep:', ep, 'bestpolicy', max_reward)
        # start episode
        rlglue.rl_start()
        rewards = 0
        steps = 1
        # Run episode to its completion
        terminal = False
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
            rewards += reward
            steps += 1

        avg_reward = rewards
        avg_rewards.append(avg_reward)

        if rewards > max_reward:
            max_reward = rewards
            best_policy = rlglue.rl_agent_message('policy')
            pickle.dump(best_policy, open("policy.pickle", "wb"))
            print('ep', ep, 'reward', avg_reward)
        #print('ep:',ep, 'avg reward:', avg_reward, 'steps:', steps)
        #print(rlglue.rl_agent_message('policy'))
        #input()

    plt.plot(avg_rewards)
    plt.plot(moving_average(avg_rewards, 10))
    plt.plot(moving_average(avg_rewards, 100))
    plt.savefig('results.png')

    # Get generated policy
    policy = rlglue.rl_agent_message('policy')

    # Test policy
    result = testPolicy(best_policy)
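moving_average is used in the plotting calls above but never defined in this excerpt; a minimal implementation consistent with how it is called (a flat-kernel trailing average):

import numpy as np

def moving_average(values, window):
    # returns len(values) - window + 1 points, each the mean of the previous `window` values
    kernel = np.ones(window) / window
    return np.convolve(values, kernel, mode="valid")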
Example #7
def main():
    # Seed rng's for consistent testing
    random.seed(0)
    np.random.seed(0)

    # Generate agent, environment and RLGlue
    env = Environment()
    env.env_init()
    agent = Agent(env.get_actions(), env.get_max_observation(),
                  env.get_min_observation())
    rlglue = RLGlue(env, agent)
    del agent, env

    # Configure experiment
    num_eps = 100000
    # initialize rlglue
    rlglue.rl_init()

    avg_rewards = []
    avg_reward = 0
    max_reward = 0
    best_policy = None
    # Run through each episode
    #rlglue.rl_env_message('renderON')
    #for ep in range(num_eps):
    ep = 0
    x = 10
    last_i = x
    last_n = np.zeros(x)
    best = (0, -1)
    while ep < num_eps:
        last_i += 1
        if last_i >= len(last_n):
            last_i = 0
        ep += 1
        #if ep % int(num_eps/10) == 0:
        #print('ep:', ep, 'bestpolicy', max_reward)
        # start episode
        rlglue.rl_start()
        rewards = 0
        steps = 1
        # Run episode to its completion
        terminal = False
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
            rewards += reward
            steps += 1

        if steps > best[0]:
            best = (steps, ep)

        avg_reward = steps
        avg_rewards.append(avg_reward)
        last_n[last_i] = steps

        #print('ep',ep, 'steps', steps)
        #print('ep:',ep, 'avg reward:', avg_reward, 'steps:', steps)
        #print(rlglue.rl_agent_message('policy'))
        #input()
        #if best[0] >= 500:
        print('ep', ep, 'mvg avg', np.average(last_n), steps, 'best', best)
        #if np.average(last_n) > 400:
        if ep > 2500:
            #rlglue.rl_env_message('renderON')

            break

    plt.plot(avg_rewards)
    plt.plot(moving_average(avg_rewards, 10))
    plt.plot(moving_average(avg_rewards, 100))
    plt.savefig('results.png')
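The last_n / last_i ring buffer above can also be written with collections.deque, which discards the oldest entry automatically; a small alternative sketch:

from collections import deque
import numpy as np

last_n = deque(maxlen=10)            # keeps only the 10 most recent episode lengths
for steps in [120, 250, 430, 500]:   # illustrative episode lengths
    last_n.append(steps)
    print('mvg avg', np.average(last_n))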
Example #8
from environment import Environment
from agent import MonteCarloAgent
from rl_glue import RLGlue  # assumed: the RLGlue class lives in rl_glue.py alongside this script
import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":
    max_steps = 8000
    count_episode = -1
    episode = np.zeros(8000)

    # Create and pass agent and environment objects to RLGlue
    environment = Environment()
    agent = MonteCarloAgent()
    rlglue = RLGlue(environment, agent)
    del agent, environment  # don't use these anymore

    rlglue.rl_init()
    terminal = True

    for step in range(max_steps):

        if terminal:
            rlglue.rl_start()
            count_episode += 1

        _, _, _, terminal = rlglue.rl_step()

        episode[step] = count_episode

    plt.plot(np.arange(8000), episode)
    plt.show()
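A small follow-on sketch, assuming it runs right after the loop above: the plotted episode array records which episode each step belonged to, so the number of steps spent in each episode can be recovered from it directly (the final episode may be cut short by max_steps):

steps_per_episode = np.bincount(episode.astype(int))
print(steps_per_episode[:10])  # steps taken in each of the first 10 episodes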
Example #9
File: greeedy.py  Project: kathypuiu/RL
num_steps = 1000  # steps per run
num_runs = 200    # number of runs (value assumed; the original snippet does not define it)
env = ten_arm_env.Environment
agent = GreedyAgent
agent_info = {"num_actions": 10}  # number of arms
env_info = {}
all_averages = []

for run in tqdm(range(num_runs)):  # tqdm -> progress bar
    rl_glue = RLGlue(env, agent)  # creates the experiment
    rl_glue.rl_init(agent_info, env_info)
    rl_glue.rl_start()
    scores = [0]
    averages = []

    for step in range(num_steps):
        # agent and environment take one step; rl_step returns (reward, state, action, terminal)
        reward, _, action, _ = rl_glue.rl_step()
        scores.append(scores[-1] + reward)
        averages.append(scores[-1] / (step + 1))
    all_averages.append(averages)

plt.figure(figsize=(15, 5), dpi= 80, facecolor='w', edgecolor='k')
plt.plot([1.55 for _ in range(num_steps)], linestyle="--")
plt.plot(np.mean(all_averages, axis=0))
plt.legend(["Best Possible", "Greedy"])
plt.title("Average Reward of Greedy Agent")
plt.xlabel("Steps")
plt.ylabel("Average reward")
plt.show()
greedy_scores = np.mean(all_averages, axis=0)
np.save("greedy_scores", greedy_scores)