"iht_size": 4096, "num_tilings": 8, "num_tiles": 8, "actor_step_size": 1e-1, "critic_step_size": 1e-0, "avg_reward_step_size": 1e-2, "num_actions": 3, "seed": 99, } rl_glue = RLGlue(PendulumEnvironment, ActorCriticSoftmaxAgent) rl_glue.rl_init(agent_info, env_info) # start env/agent rl_glue.rl_start() rl_glue.rl_step() # simple alias agent = rl_glue.agent print("agent next_action: {}".format(agent.last_action)) print("agent avg reward: {}\n".format(agent.avg_reward)) assert agent.last_action == 1 assert agent.avg_reward == -0.03139092653589793 print("agent first 10 values of actor weights[0]: \n{}\n".format( agent.actor_w[0][:10])) print("agent first 10 values of actor weights[1]: \n{}\n".format( agent.actor_w[1][:10])) print("agent first 10 values of actor weights[2]: \n{}\n".format(
def run_experiment(environment, agent, environment_parameters,
                   agent_parameters, experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # sweep agent parameters
    for num_tilings in agent_parameters['num_tilings']:
        for num_tiles in agent_parameters["num_tiles"]:
            for actor_ss in agent_parameters["actor_step_size"]:
                for critic_ss in agent_parameters["critic_step_size"]:
                    for avg_reward_ss in agent_parameters["avg_reward_step_size"]:

                        env_info = {}
                        agent_info = {
                            "num_tilings": num_tilings,
                            "num_tiles": num_tiles,
                            "actor_step_size": actor_ss,
                            "critic_step_size": critic_ss,
                            "avg_reward_step_size": avg_reward_ss,
                            "num_actions": agent_parameters["num_actions"],
                            "iht_size": agent_parameters["iht_size"]
                        }

                        # results to save
                        return_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))
                        exp_avg_reward_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))

                        # using tqdm we visualize progress bars
                        for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
                            env_info["seed"] = run
                            agent_info["seed"] = run

                            rl_glue.rl_init(agent_info, env_info)
                            rl_glue.rl_start()

                            num_steps = 0
                            total_return = 0.
                            return_arr = []

                            # exponential average reward without initial bias
                            exp_avg_reward = 0.0
                            exp_avg_reward_ss = 0.01
                            exp_avg_reward_normalizer = 0

                            while num_steps < experiment_parameters['max_steps']:
                                num_steps += 1

                                rl_step_result = rl_glue.rl_step()

                                reward = rl_step_result[0]
                                total_return += reward
                                return_arr.append(reward)
                                avg_reward = rl_glue.rl_agent_message("get avg reward")

                                exp_avg_reward_normalizer = exp_avg_reward_normalizer + \
                                    exp_avg_reward_ss * (1 - exp_avg_reward_normalizer)
                                ss = exp_avg_reward_ss / exp_avg_reward_normalizer
                                exp_avg_reward += ss * (reward - exp_avg_reward)

                                return_per_step[run - 1][num_steps - 1] = total_return
                                exp_avg_reward_per_step[run - 1][num_steps - 1] = exp_avg_reward

                        if not os.path.exists('results'):
                            os.makedirs('results')

                        save_name = "ActorCriticSoftmax_tilings_{}_tiledim_{}_actor_ss_{}_critic_ss_{}_avg_reward_ss_{}".format(
                            num_tilings, num_tiles, actor_ss, critic_ss, avg_reward_ss)
                        total_return_filename = "results/{}_total_return.npy".format(save_name)
                        exp_avg_reward_filename = "results/{}_exp_avg_reward.npy".format(save_name)

                        np.save(total_return_filename, return_per_step)
                        np.save(exp_avg_reward_filename, exp_avg_reward_per_step)
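# Usage sketch (assumption): run_experiment iterates over lists of sweep values,
# so each hyperparameter entry below is a list. The concrete values and run/step
# counts are illustrative, not taken from the original script; PendulumEnvironment
# and ActorCriticSoftmaxAgent are the classes used in the test snippet above.
experiment_parameters = {"num_runs": 50, "max_steps": 20000}
environment_parameters = {}
agent_parameters = {
    "num_tilings": [32],
    "num_tiles": [8],
    "actor_step_size": [2**-2],
    "critic_step_size": [2**1],
    "avg_reward_step_size": [2**-6],
    "num_actions": 3,
    "iht_size": 4096,
}
run_experiment(PendulumEnvironment, ActorCriticSoftmaxAgent,
               environment_parameters, agent_parameters, experiment_parameters)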
def run_experiment(environment, agent, environment_parameters,
                   agent_parameters, experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # sweep agent parameters
    for num_tilings in agent_parameters['num_tilings']:
        for num_tiles in agent_parameters["num_tiles"]:
            for update_ss in agent_parameters["update_step_size"]:
                for avg_reward_ss in agent_parameters["avg_reward_step_size"]:
                    for epsilon in agent_parameters["epsilon"]:

                        env_info = {}
                        agent_info = {
                            "num_tilings": num_tilings,
                            "num_tiles": num_tiles,
                            "alpha": update_ss,
                            "avg_reward_step_size": avg_reward_ss,
                            "epsilon": epsilon,
                            "num_actions": agent_parameters["num_actions"],
                            "iht_size": agent_parameters["iht_size"]
                        }

                        # results to save
                        print(agent_info)
                        return_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))
                        exp_avg_reward_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))

                        # using tqdm we visualize progress bars
                        avg_reward_list = []
                        avg_reward = -10000
                        for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
                            env_info["seed"] = run
                            agent_info["seed"] = run

                            rl_glue.rl_init(agent_info, env_info)
                            rl_glue.rl_start()

                            num_steps = 0
                            total_return = 0.
                            # return_arr = []

                            # exponential average reward without initial bias
                            exp_avg_reward = 0.0
                            exp_avg_reward_ss = 0.01
                            exp_avg_reward_normalizer = 0

                            while num_steps < experiment_parameters['max_steps']:
                                num_steps += 1

                                # switch agents 10000 steps before the end of the run
                                if experiment_parameters['max_steps'] - num_steps == 10000:
                                    rl_glue.change_agent()

                                rl_step_result = rl_glue.rl_step()

                                reward = rl_step_result[0]
                                total_return += reward
                                # return_arr.append(reward)
                                avg_reward = rl_glue.rl_agent_message("get avg reward")

                                exp_avg_reward_normalizer = exp_avg_reward_normalizer + \
                                    exp_avg_reward_ss * (1 - exp_avg_reward_normalizer)
                                ss = exp_avg_reward_ss / exp_avg_reward_normalizer
                                exp_avg_reward += ss * (reward - exp_avg_reward)

                                return_per_step[run - 1][num_steps - 1] = total_return
                                exp_avg_reward_per_step[run - 1][num_steps - 1] = exp_avg_reward

                            avg_reward_list.append(avg_reward)

                        print(np.average(avg_reward_list))

                        if not os.path.exists('results_sarsa'):
                            os.makedirs('results_sarsa')

                        save_name = "semi-gradient_sarsa_tilings_{}_tiledim_{}_update_ss_{}_epsilon_ss_{}_avg_reward_ss_{}_max_steps_{}".format(
                            num_tilings, num_tiles, update_ss, epsilon, avg_reward_ss,
                            experiment_parameters['max_steps'])
                        total_return_filename = "results_sarsa/{}_total_return.npy".format(save_name)
                        exp_avg_reward_filename = "results_sarsa/{}_exp_avg_reward.npy".format(save_name)

                        np.save(total_return_filename, return_per_step)
                        np.save(exp_avg_reward_filename, exp_avg_reward_per_step)
def experiment(num_runs, max_steps):
    agent = Agent()
    environment = Environment()
    rlg = RLGlue(environment, agent)

    optimal_actions_optimistic = np.zeros(max_steps)
    optimal_actions_realistic = np.zeros(max_steps)

    # optimistic initial values, greedy action selection
    for run in range(num_runs):
        # initialize RL-Glue
        rlg.rl_init()

        _, last_action = rlg.rl_start()
        optimal = environment.env_optimal_action()
        if last_action == optimal:
            optimal_actions_optimistic[0] += 1

        for i in range(1, max_steps):
            _, _, last_action, _ = rlg.rl_step()
            if last_action == optimal:
                optimal_actions_optimistic[i] += 1
        print("\rCurrent: %i" % run, end="")

    # realistic initial values, epsilon-greedy action selection
    for run in range(num_runs):
        # initialize RL-Glue
        rlg.rl_init()
        agent.set_epsilon(0.1)
        agent.set_q(0)

        _, last_action = rlg.rl_start()
        optimal = environment.env_optimal_action()
        if last_action == optimal:
            optimal_actions_realistic[0] += 1

        for i in range(1, max_steps):
            _, _, last_action, _ = rlg.rl_step()
            if last_action == optimal:
                optimal_actions_realistic[i] += 1
        print("\rCurrent: %i" % run, end="")

    optimal_actions_optimistic /= num_runs
    optimal_actions_realistic /= num_runs

    fig, ax = plt.subplots()
    ax.plot(np.arange(1, max_steps + 1), optimal_actions_optimistic, 'r',
            label='optimistic, greedy, Q1 = 0.5, epsilon = 0')
    ax.plot(np.arange(1, max_steps + 1), optimal_actions_realistic, 'b',
            label='realistic, epsilon-greedy, Q1 = 0, epsilon = 0.1')
    ax.legend()
    plt.xticks([1, 200, 400, 600, 800, 1000])
    plt.show()
rl_glue.rl_init(agent_info, env_info)

reward_sums = []
state_visits = np.zeros(48)
# last_episode_total_reward = 0
for episode in range(num_episodes):
    if episode < num_episodes - 10:
        # Runs an episode
        rl_glue.rl_episode(0)
    else:
        # Runs an episode while keeping track of visited states
        state, action = rl_glue.rl_start()
        state_visits[state] += 1
        is_terminal = False
        while not is_terminal:
            reward, state, action, is_terminal = rl_glue.rl_step()
            state_visits[state] += 1

    reward_sums.append(rl_glue.rl_return())
    # last_episode_total_reward = rl_glue.rl_return()

all_reward_sums[algorithm].append(reward_sums)
all_state_visits[algorithm].append(state_visits)

# save results
import os
import shutil
os.makedirs('results', exist_ok=True)
np.save('results/q_learning.npy', all_reward_sums['Q-learning'])
np.save('results/expected_sarsa.npy', all_reward_sums['Expected Sarsa'])
shutil.make_archive('results', 'zip', '.', 'results')
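# Usage sketch (assumption): the saved arrays can be reloaded to compare the two
# algorithms; the averaging over runs and the axis labels are illustrative and not
# part of the original snippet.
import numpy as np
import matplotlib.pyplot as plt

q_learning_sums = np.load('results/q_learning.npy')        # shape: (runs, episodes)
expected_sarsa_sums = np.load('results/expected_sarsa.npy')

plt.plot(np.mean(q_learning_sums, axis=0), label='Q-learning')
plt.plot(np.mean(expected_sarsa_sums, axis=0), label='Expected Sarsa')
plt.xlabel('Episodes')
plt.ylabel('Sum of rewards during episode')
plt.legend()
plt.show()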
def main():
    # Seed rng's for consistent testing
    random.seed(0)
    np.random.seed(0)

    # Generate agent, environment and RLGlue
    env = Environment()
    agent = Agent(env.get_actions())
    rlglue = RLGlue(env, agent)
    del agent, env

    # Configure experiment
    num_eps = 100000

    # initialize rlglue
    rlglue.rl_init()

    avg_rewards = []
    avg_reward = 0
    max_reward = 0
    best_policy = None

    # Run through each episode
    # rlglue.rl_env_message('renderON')
    # for ep in range(num_eps):
    ep = 0
    while ep < num_eps:
        ep += 1
        # if ep % int(num_eps/10) == 0:
        #     print('ep:', ep, 'bestpolicy', max_reward)

        # start episode
        rlglue.rl_start()

        rewards = 0
        steps = 1

        # Run episode to its completion
        terminal = False
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
            rewards += reward
            steps += 1

        avg_reward = rewards
        avg_rewards.append(avg_reward)
        if rewards > max_reward:
            max_reward = rewards
            best_policy = rlglue.rl_agent_message('policy')
            pickle.dump(best_policy, open("policy.pickle", "wb"))
            print('ep', ep, 'reward', avg_reward)
        # print('ep:', ep, 'avg reward:', avg_reward, 'steps:', steps)
        # print(rlglue.rl_agent_message('policy'))
        # input()

    plt.plot(avg_rewards)
    plt.plot(moving_average(avg_rewards, 10))
    plt.plot(moving_average(avg_rewards, 100))
    plt.savefig('results.png')

    # Get generated policy
    policy = rlglue.rl_agent_message('policy')

    # Test policy
    result = testPolicy(best_policy)
def main():
    # Seed rng's for consistent testing
    random.seed(0)
    np.random.seed(0)

    # Generate agent, environment and RLGlue
    env = Environment()
    env.env_init()
    agent = Agent(env.get_actions(), env.get_max_observation(),
                  env.get_min_observation())
    rlglue = RLGlue(env, agent)
    del agent, env

    # Configure experiment
    num_eps = 100000

    # initialize rlglue
    rlglue.rl_init()

    avg_rewards = []
    avg_reward = 0
    max_reward = 0
    best_policy = None

    # Run through each episode
    # rlglue.rl_env_message('renderON')
    # for ep in range(num_eps):
    ep = 0
    x = 10
    last_i = x
    last_n = np.zeros(x)
    best = (0, -1)
    while ep < num_eps:
        last_i += 1
        if last_i >= len(last_n):
            last_i = 0
        ep += 1
        # if ep % int(num_eps/10) == 0:
        #     print('ep:', ep, 'bestpolicy', max_reward)

        # start episode
        rlglue.rl_start()

        rewards = 0
        steps = 1

        # Run episode to its completion
        terminal = False
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
            rewards += reward
            steps += 1

        if steps > best[0]:
            best = (steps, ep)

        avg_reward = steps
        avg_rewards.append(avg_reward)
        last_n[last_i] = steps
        # print('ep', ep, 'steps', steps)
        # print('ep:', ep, 'avg reward:', avg_reward, 'steps:', steps)
        # print(rlglue.rl_agent_message('policy'))
        # input()
        # if best[0] >= 500:
        print('ep', ep, 'mvg avg', np.average(last_n), steps, 'best', best)
        # if np.average(last_n) > 400:
        if ep > 2500:
            # rlglue.rl_env_message('renderON')
            break

    plt.plot(avg_rewards)
    plt.plot(moving_average(avg_rewards, 10))
    plt.plot(moving_average(avg_rewards, 100))
    plt.savefig('results.png')
from environment import Environment
from agent import MonteCarloAgent
# assumed import: the RLGlue class used below is not imported in the original snippet
from rl_glue import RLGlue
import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":
    max_steps = 8000
    count_episode = -1
    episode = np.zeros(max_steps)

    # Create and pass agent and environment objects to RLGlue
    environment = Environment()
    agent = MonteCarloAgent()
    rlglue = RLGlue(environment, agent)
    del agent, environment  # don't use these anymore

    rlglue.rl_init()
    terminal = True
    for step in range(max_steps):
        if terminal:
            rlglue.rl_start()
            count_episode += 1
        _, _, _, terminal = rlglue.rl_step()
        episode[step] = count_episode

    plt.plot(np.arange(max_steps), episode)
    plt.show()
num_steps = 1000  # steps for each experiment run
num_runs = 200    # assumed: number of independent runs (not defined in the original snippet)

env = ten_arm_env.Environment     # environment class for the 10-armed testbed
agent = GreedyAgent               # agent class
agent_info = {"num_actions": 10}  # number of arms
env_info = {}

all_averages = []

for run in tqdm(range(num_runs)):  # tqdm -> progress bar
    rl_glue = RLGlue(env, agent)   # creates the experiment
    rl_glue.rl_init(agent_info, env_info)
    rl_glue.rl_start()

    scores = [0]
    averages = []
    for step in range(num_steps):
        # agent and env take a step and return the reward
        reward, _, action, _ = rl_glue.rl_step()
        scores.append(scores[-1] + reward)
        averages.append(scores[-1] / (step + 1))
    all_averages.append(averages)

plt.figure(figsize=(15, 5), dpi=80, facecolor='w', edgecolor='k')
plt.plot([1.55 for _ in range(num_steps)], linestyle="--")
plt.plot(np.mean(all_averages, axis=0))
plt.legend(["Best Possible", "Greedy"])
plt.title("Average Reward of Greedy Agent")
plt.xlabel("Steps")
plt.ylabel("Average reward")
plt.show()

greedy_scores = np.mean(all_averages, axis=0)
np.save("greedy_scores", greedy_scores)
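# Usage sketch (assumption): np.save appends the ".npy" suffix, so the scores saved
# above can be reloaded later, e.g. to overlay another agent's learning curve on the
# same axes for comparison.
greedy_scores = np.load("greedy_scores.npy")
plt.plot([1.55 for _ in range(num_steps)], linestyle="--")
plt.plot(greedy_scores)
plt.legend(["Best Possible", "Greedy"])
plt.show()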