def main(agent_info, agent_class, steps, filename):
    env_class = floating_horsetrack_environment.Environment
    rl_glue = RLGlue(env_class, agent_class)

    max_steps = steps
    step = 0
    episode_end = []
    cum_reward = 0

    agent_info.update({"actions": env_class.actions})
    rl_glue.rl_init(agent_info)

    while step < max_steps:
        rl_glue.rl_start()
        is_terminal = False
        while not is_terminal and step < max_steps:
            reward, state, action, is_terminal = rl_glue.rl_step()
            cum_reward += reward
            step += 1
        if is_terminal:
            episode_end.append(step)

    rl_glue.rl_cleanup()
    save_results(episode_end, len(episode_end), "data/{}".format(filename))

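# save_results is called by several of these scripts but is not defined in them;
# the sketch below is only an assumption of what it might do -- write one value
# per line to a text file -- matching three-argument calls like the one above.
def save_results(data, data_size, filename):
    with open(filename, "w") as data_file:
        for i in range(data_size):
            data_file.write("{}\n".format(data[i]))
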
def testPolicy(policy):
    agent = testAgent(policy)
    env = Environment()
    rlglue = RLGlue(env, agent)
    del env, agent
    rlglue.rl_init()

    # set up a list of summed rewards:
    # rewards[step] = sum of rewards across all runs for that step
    rewards = [0 for i in range(1000)]
    for run in range(1):
        rlglue.rl_init()
        # rlglue.rl_env_message('renderON')
        rlglue.rl_start()
        terminal = False
        for step in range(1000):
            if not terminal:
                r, s, a, terminal = rlglue.rl_step()
                rewards[step] += r

    # average rewards over runs (here a single run)
    rewards = [i / 1 for i in rewards]
    return rewards

def main(agent_info, agent_class, env_info, env_class, steps, param_info):
    # env_class = horsetrack_environment.Environment
    rl_glue = RLGlue(env_class, agent_class)

    max_steps = steps
    max_episodes = 5
    step = 0
    episodes = 0
    episode_end = np.ones(max_episodes) * max_steps
    cum_reward = 0
    # max_steps = 20000

    agent_info.update({"actions": env_class.actions})
    rl_glue.rl_init(agent_info, env_info)

    while step < max_steps and episodes < max_episodes:
        rl_glue.rl_start()
        is_terminal = False
        while not is_terminal and step < max_steps:
            reward, state, action, is_terminal = rl_glue.rl_step()
            cum_reward += reward
            step += 1
        if is_terminal:
            episode_end[episodes] = step
            episodes += 1

    rl_glue.rl_cleanup()
    save_results(episode_end, "{}".format(param_info))

def main():
    num_eps = 5000
    num_runs = 10
    random.seed(0)
    np.random.seed(0)

    agent = Agent()
    env = Environment()
    rlglue = RLGlue(env, agent)
    del agent, env

    for run in range(num_runs):
        rlglue.rl_init()
        performances = []
        for ep in range(num_eps):
            rlglue.rl_start()
            # rlglue.rl_env_message('renderON')
            terminal = False
            while not terminal:
                reward, state, action, terminal = rlglue.rl_step()

            # Find the first policy that performs at 100%
            performance = testPolicy(rlglue.rl_agent_message('policy')) * 100
            performances.append(performance)
            if performance >= 100:
                # print(rlglue.rl_agent_message('policy'))
                print('Episode: %d' % (ep + 1))
                break

    plt.plot(performances)
    plt.savefig('test.png')

def main():
    env_class = horsetrack_environment.Environment
    agent_class = random_agent.Agent
    rl_glue = RLGlue(env_class, agent_class)

    num_episodes = 1000
    max_steps = 100000

    print("\tPrinting one dot for every run: {}".format(num_episodes), end=' ')
    print("total runs to complete.")

    # one entry per episode, holding the number of steps that episode took
    total_steps = [0 for _ in range(num_episodes)]
    for i in range(num_episodes):
        rl_glue.rl_init(agent_info={"actions": env_class.actions})
        rl_glue.rl_start()
        is_terminal = False
        while rl_glue.num_steps < max_steps and not is_terminal:
            reward, state, action, is_terminal = rl_glue.rl_step()
            # optimal_action[num_steps] += 1 if "action is optimal" else 0
        total_steps[i] = rl_glue.num_steps
        rl_glue.rl_cleanup()
        print(".", end='')
        sys.stdout.flush()

    # prop_optimal = [num_optimal / num_episodes for num_optimal in optimal_action]
    save_results(total_steps, len(total_steps), "RL_EXP_OUT.dat")
    print("\nDone")

def main():
    num_eps = 200000

    agent = Agent()
    env = Environment()
    rlglue = RLGlue(env, agent)
    del agent, env

    solves = 0
    rlglue.rl_init()
    rewards = []
    for ep in range(num_eps):
        rlglue.rl_start()
        # rlglue.rl_env_message('renderON')
        terminal = False
        reward = 0
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
            if ep > 1000:
                rlglue.rl_env_message('renderON')
                print(state)
                time.sleep(0.1)
        rewards.append(reward)

        # consider the task solved once the 100-episode average reward exceeds 0.78
        if ep >= 99:
            if np.average(rewards[ep - 99:ep + 1]) > 0.78:
                print('solved at episode %d' % (ep + 1))
                break

def question_1():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    np.random.seed(0)
    num_episodes = 200
    see_eps = [157]
    num_runs = 1
    max_eps_steps = 100000

    # test with various step sizes (alphas) for the agent
    stepSizes = np.linspace(0.01, 1, 100)
    # best step size found so far (comment out to sweep many)
    stepSizes = [0.559184]

    # separate run for each step size
    for step in stepSizes:
        # initialize the agent and RL-Glue with the chosen step size
        rlglue.rl_init()
        rlglue.rl_agent_message('step:' + str(step))

        # keep track of the total reward for each episode
        total_rewards = []
        for ep in range(num_episodes):
            # render only selected episodes
            if ep in see_eps:
                rlglue.rl_env_message('rOFF')
            if ep + 1 in see_eps:
                rlglue.rl_env_message('rON')
                print("Episode %d" % (ep + 1))

            # initialize for the episode
            rlglue.rl_start()
            terminal = False
            total_reward = 0

            # run the episode and accumulate its total reward
            while not terminal:
                reward, state, action, terminal = rlglue.rl_step()
                total_reward += reward
            total_rewards.append(total_reward)

            # average reward of the last 100 episodes
            if ep >= 99:
                total = np.sum(total_rewards[ep - 99:ep + 1])
                avg = total / 100
                # check whether the results indicate the problem is solved
                if avg > -110:
                    print("Solved at episode %d, avg reward: %f" % (ep + 1, avg))
                    break

    # close environment
    environment.close()

def main(data_output_location="new_data"):
    env_class = horsetrack_environment.Environment
    agent_class = random_agent.Agent
    agent_name = agent_class.__module__[agent_class.__module__.find(".") + 1:]
    environment_name = env_class.__module__[env_class.__module__.find(".") + 1:]
    rl_glue = RLGlue(env_class, agent_class)

    # num_episodes = 2000
    # max_steps = 1000
    max_total_steps = 100_000

    for epsilon in [0.0, 0.1]:
        for alpha in [2, 1, 0.5, 0.25, 0.125, 0.0625]:
            print("Running Agent: {} on Environment: {}.".format(
                agent_name, environment_name))
            agent_init_info = {
                "actions": [-1, 1],
                "world_size": 100,
                "epsilon": epsilon,
                "alpha": alpha
            }

            termination_times = []
            rl_glue.rl_init(agent_init_info=agent_init_info)
            step_counter = 0

            while step_counter < max_total_steps:
                rl_glue.rl_start()
                is_terminal = False
                while step_counter < max_total_steps and not is_terminal:
                    reward, state, action, is_terminal = rl_glue.rl_step()
                    step_counter += 1
                rl_glue.rl_cleanup()
                # print(".", end='')
                sys.stdout.flush()
                if is_terminal:
                    termination_times.append(step_counter)

            epoch_datetime = int(
                (datetime.datetime.now()
                 - datetime.datetime.utcfromtimestamp(0)).total_seconds())
            save_results(
                termination_times, len(termination_times),
                "{}/{}_{}__{}__epsilon{}__alpha{}.dat".format(
                    data_output_location, epoch_datetime, agent_name,
                    environment_name, epsilon, alpha))

    print("\nDone")

def run_experiment():
    # specify hyper-parameters
    num_runs = 1
    max_episodes = 1000000
    max_steps_per_episode = 100
    num_states = 181
    num_actions = 2
    alpha = 0.01
    eps = 0.1
    Q1 = 0

    results = np.zeros(max_episodes)
    results_run = 0

    agent = RandomAgent(num_states, num_actions, alpha, eps, Q1)
    environment = BlackJack()
    rlglue = RLGlue(environment, agent)

    print("\nPrinting one dot for every run: {0} total runs to complete".format(
        num_runs))

    for run in range(num_runs):
        np.random.seed(run)
        results_run = 0.0
        rlglue.rl_init()
        for e in range(1, max_episodes + 1):
            rlglue.rl_start()
            for s in range(max_steps_per_episode):
                r, _, _, terminal = rlglue.rl_step()
                results_run += r
                results[e - 1] += r
                if terminal:
                    break
            if e % 10000 == 0:
                print("\nEpisode {}: average return so far is {}, and the policy is"
                      .format(e, results_run / e))
                print(rlglue.rl_agent_message("printPolicy"))
        print(".")

    print("Average return over experiment: {}".format(
        (results / num_runs).mean()))

    # save final policy to file -- change file name as necessary
    with open("policy.txt", 'w') as f:
        f.write(rlglue.rl_agent_message("printPolicy"))

    # save all the experiment data for analysis -- change file name as necessary
    save_results(results / num_runs, max_episodes, "RL_EXP_OUT.dat")

def testPolicy(policy):
    env = Environment()
    agent = testAgent(policy)
    rlglue = RLGlue(env, agent)
    rlglue.rl_init()
    # rlglue.rl_env_message('renderON')

    performance = 0
    for ep in range(100):
        rlglue.rl_start()
        terminal = False
        reward = None
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
        if reward > 0:
            performance += 1
    return performance / 100

def testPolicy(policy):
    agent = testAgent(policy)
    env = Environment()
    rlglue = RLGlue(env, agent)
    del env, agent
    rlglue.rl_init()

    for run in range(1):
        rlglue.rl_init()
        rlglue.rl_env_message('renderON')
        rlglue.rl_start()
        total_reward = 0
        terminal = False
        while not terminal:
            r, s, a, terminal = rlglue.rl_step()
            total_reward += r
    return total_reward

def experiment1():
    agent = RandomAgent()
    environment = Environment1D()
    rlg = RLGlue(environment, agent)

    max_steps = 1000   # max number of steps in an episode
    num_runs = 2000    # number of repetitions of the experiment

    optimal_action = np.zeros(max_steps)
    for k in range(num_runs):
        # initialize RL-Glue
        rlg.rl_init()  # env_init + agent_init
        rlg.rl_start()
        for i in range(max_steps):
            # step
            action = rlg.rl_step()[2]
            if action == environment.env_message():
                optimal_action[i] += 1

    ratio_optimal_action = optimal_action / num_runs
    return ratio_optimal_action

def main():
    env = drifter_distractor_env.Environment
    # the assignment below overrides the one above
    env = switched_drifter_distractor_env.Environment

    agents = [random_agent.Agent, weight_change_agent.Agent]
    agent_types = ["absolute_error", "squared_error", "weight_change"]

    for agent_type in agent_types:
        agent = agents[1]
        agent_info = {
            "num_actions": 4,
            "action_selection": "softmax",
            "agent_type": agent_type
        }
        env_info = {}
        num_runs = 1
        num_steps = 100000

        actions = [0 for _ in range(4)]
        errors = []

        for run in range(num_runs):
            rl_glue = RLGlue(env, agent)
            rl_glue.rl_init(agent_info, env_info)
            rl_glue.rl_start()
            for step in range(num_steps):
                reward, state, action, is_terminal = rl_glue.rl_step()
                actions[action] += 1

        # np.save("data/squared_error", rl_glue.agent.track_actions)
        np.save("data/{}".format(agent_type), rl_glue.agent.track_actions)
        # print(rl_glue.environment.arm_1)
        # print(rl_glue.environment.arm_2)
        # print(rl_glue.environment.arm_3)
        # print(rl_glue.environment.arm_4)
        print(actions)

def run_experiment(env, agent, agent_info, env_info, num_experiments=1,
                   num_steps=None, seeds=None):
    all_scores = []
    for _ in range(num_experiments):
        rl_glue = RLGlue(env, agent)
        rl_glue.rl_init(agent_info, env_info)
        rl_glue.rl_start()

        scores = [0]
        averages = []
        # i is the step index, needed below to compute the running average
        for i in range(num_steps):
            reward, state, action, is_terminal = rl_glue.rl_step()
            scores.append(scores[-1] + reward)
            averages.append(scores[-1] / (i + 1))
        all_scores.append(averages)
    return all_scores

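# Hypothetical call of run_experiment above; the Environment/Agent classes and
# the parameter values are placeholders, not taken from this code base.
#
#   all_scores = run_experiment(Environment, Agent,
#                               agent_info={"num_actions": 10, "epsilon": 0.1},
#                               env_info={}, num_experiments=200, num_steps=1000)
#   learning_curve = np.mean(all_scores, axis=0)  # average-reward curve over runs
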
for num_moves in possible_num_moves:
    episode = np.zeros(max_steps)
    for run in range(max_run):
        count_episode = -1
        rlglue.rl_init()
        agent.possibleMoves = num_moves
        terminal = True
        for step in range(max_steps):
            if terminal:
                rlglue.rl_start()
                count_episode += 1
            _, _, _, terminal = rlglue.rl_step()
            episode[step] += count_episode
    plt.plot(np.arange(max_steps), episode / max_run,
             label="Possible move: " + str(num_moves))

plt.legend()
plt.xlabel("Time steps")
plt.ylabel("Episodes")
plt.title("One-step Sarsa for Different Possible Moves")
plt.show()

def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # sweep agent parameters
    for num_tilings in agent_parameters['num_tilings']:
        for num_tiles in agent_parameters["num_tiles"]:
            for update_ss in agent_parameters["update_step_size"]:
                for avg_reward_ss in agent_parameters["avg_reward_step_size"]:
                    for epsilon in agent_parameters["epsilon"]:
                        env_info = {}
                        agent_info = {"num_tilings": num_tilings,
                                      "num_tiles": num_tiles,
                                      "alpha": update_ss,
                                      "avg_reward_step_size": avg_reward_ss,
                                      "epsilon": epsilon,
                                      "num_actions": agent_parameters["num_actions"],
                                      "iht_size": agent_parameters["iht_size"]}

                        # results to save
                        return_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))
                        exp_avg_reward_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))

                        # using tqdm we visualize progress bars
                        avg_reward_list = []
                        avg_reward = -10000
                        for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
                            env_info["seed"] = run
                            agent_info["seed"] = run

                            rl_glue.rl_init(agent_info, env_info)
                            rl_glue.rl_start()

                            num_steps = 0
                            total_return = 0.
                            # return_arr = []

                            # exponential average reward without initial bias
                            exp_avg_reward = 0.0
                            exp_avg_reward_ss = 0.01
                            exp_avg_reward_normalizer = 0

                            while num_steps < experiment_parameters['max_steps']:
                                num_steps += 1
                                rl_step_result = rl_glue.rl_step()

                                reward = rl_step_result[0]
                                total_return += reward
                                # return_arr.append(reward)
                                avg_reward = rl_glue.rl_agent_message("get avg reward")

                                exp_avg_reward_normalizer = (
                                    exp_avg_reward_normalizer
                                    + exp_avg_reward_ss * (1 - exp_avg_reward_normalizer))
                                ss = exp_avg_reward_ss / exp_avg_reward_normalizer
                                exp_avg_reward += ss * (reward - exp_avg_reward)

                                return_per_step[run - 1][num_steps - 1] = total_return
                                exp_avg_reward_per_step[run - 1][num_steps - 1] = exp_avg_reward

                            avg_reward_list.append(avg_reward)

                        print(np.average(avg_reward_list))

                        if not os.path.exists('results_sarsa'):
                            os.makedirs('results_sarsa')

                        save_name = "semi-gradient_sarsa_tilings_{}_tiledim_{}_update_ss_{}_epsilon_ss_{}_avg_reward_ss_{}_max_steps_{}".format(
                            num_tilings, num_tiles, update_ss, epsilon,
                            avg_reward_ss, experiment_parameters["max_steps"])
                        total_return_filename = "results_sarsa/{}_total_return.npy".format(save_name)
                        exp_avg_reward_filename = "results_sarsa/{}_exp_avg_reward.npy".format(save_name)

                        np.save(total_return_filename, return_per_step)
                        np.save(exp_avg_reward_filename, exp_avg_reward_per_step)

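# Hypothetical call of the sweep above; the environment/agent classes and every
# parameter value below are assumptions, chosen only to show the expected dict shapes.
#
#   experiment_parameters = {"num_runs": 50, "max_steps": 20000}
#   environment_parameters = {}
#   agent_parameters = {"num_tilings": [32], "num_tiles": [8],
#                       "update_step_size": [0.5], "avg_reward_step_size": [1e-2],
#                       "epsilon": [0.0], "num_actions": 3, "iht_size": 4096}
#   run_experiment(PendulumEnvironment, SemiGradientSarsaAgent,
#                  environment_parameters, agent_parameters, experiment_parameters)
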
# here it just needs the number of actions (number of arms).
env_info = {}  # Pass the environment the information it needs; in this case, it is nothing.

all_averages = []
# tqdm is what creates the progress bar below once the code is run
for i in tqdm(range(num_runs)):
    # Creates a new RLGlue experiment with the env and agent we chose above
    rl_glue = RLGlue(env, agent)
    # Pass RLGlue what it needs to initialize the agent and environment
    rl_glue.rl_init(agent_info, env_info)
    # Start the experiment
    rl_glue.rl_start()

    scores = [0]
    averages = []
    for i in range(num_steps):
        # The environment and agent take a step and return
        # the reward, and action taken.
        reward, _, action, _ = rl_glue.rl_step()
        scores.append(scores[-1] + reward)
        averages.append(scores[-1] / (i + 1))
    all_averages.append(averages)

plt.figure(figsize=(15, 5), dpi=80, facecolor='w', edgecolor='k')
plt.plot([1.55 for _ in range(num_steps)], linestyle="--")
plt.plot(np.mean(all_averages, axis=0))

for run in tqdm(range(num_runs)):
    agent_info["seed"] = run
    rl_glue = RLGlue(env, agents[algorithm])
    rl_glue.rl_init(agent_info, env_info)

    reward_sums = []
    state_visits = np.zeros(agent_info["num_states"])
    # last_episode_total_reward = 0
    for episode in range(num_episodes):
        # time.clock() was removed in Python 3.8; use perf_counter() instead
        start_time = time.perf_counter()
        if episode < num_episodes - 10:
            # Runs an episode
            rl_glue.rl_episode(0)
        else:
            # Runs an episode while keeping track of visited states
            state, action = rl_glue.rl_start()
            state_visits[state] += 1
            is_terminal = False
            while not is_terminal:
                # # stop the program
                # line = sys.stdin.readline()
                # print('line=', line)
                # if line == 'q':
                #     sys.exit()
                reward, state, action, is_terminal = rl_glue.rl_step()
                state_visits[state] += 1

        reward_sums.append(rl_glue.rl_return())
        # last_episode_total_reward = rl_glue.rl_return()
        end_time = time.perf_counter()
        print("The time of ", episode, " episode:", end_time - start_time)

def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # sweep agent parameters
    for num_tilings in agent_parameters['num_tilings']:
        for num_tiles in agent_parameters["num_tiles"]:
            for actor_ss in agent_parameters["actor_step_size"]:
                for critic_ss in agent_parameters["critic_step_size"]:
                    for avg_reward_ss in agent_parameters["avg_reward_step_size"]:
                        env_info = {}
                        agent_info = {
                            "num_tilings": num_tilings,
                            "num_tiles": num_tiles,
                            "actor_step_size": actor_ss,
                            "critic_step_size": critic_ss,
                            "avg_reward_step_size": avg_reward_ss,
                            "num_actions": agent_parameters["num_actions"],
                            "iht_size": agent_parameters["iht_size"]
                        }

                        # results to save
                        return_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))
                        exp_avg_reward_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))

                        # using tqdm we visualize progress bars
                        for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
                            env_info["seed"] = run
                            agent_info["seed"] = run

                            rl_glue.rl_init(agent_info, env_info)
                            rl_glue.rl_start()

                            num_steps = 0
                            total_return = 0.
                            return_arr = []

                            # exponential average reward without initial bias
                            exp_avg_reward = 0.0
                            exp_avg_reward_ss = 0.01
                            exp_avg_reward_normalizer = 0

                            while num_steps < experiment_parameters['max_steps']:
                                num_steps += 1

                                rl_step_result = rl_glue.rl_step()
                                reward = rl_step_result[0]
                                total_return += reward
                                return_arr.append(reward)
                                avg_reward = rl_glue.rl_agent_message("get avg reward")

                                exp_avg_reward_normalizer = (
                                    exp_avg_reward_normalizer
                                    + exp_avg_reward_ss * (1 - exp_avg_reward_normalizer))
                                ss = exp_avg_reward_ss / exp_avg_reward_normalizer
                                exp_avg_reward += ss * (reward - exp_avg_reward)

                                return_per_step[run - 1][num_steps - 1] = total_return
                                exp_avg_reward_per_step[run - 1][num_steps - 1] = exp_avg_reward

                        if not os.path.exists('results'):
                            os.makedirs('results')

                        save_name = "ActorCriticSoftmax_tilings_{}_tiledim_{}_actor_ss_{}_critic_ss_{}_avg_reward_ss_{}".format(
                            num_tilings, num_tiles, actor_ss, critic_ss, avg_reward_ss)
                        total_return_filename = "results/{}_total_return.npy".format(save_name)
                        exp_avg_reward_filename = "results/{}_exp_avg_reward.npy".format(save_name)

                        np.save(total_return_filename, return_per_step)
                        np.save(exp_avg_reward_filename, exp_avg_reward_per_step)

agent_info = {
    "iht_size": 4096,
    "num_tilings": 8,
    "num_tiles": 8,
    "actor_step_size": 1e-1,
    "critic_step_size": 1e-0,
    "avg_reward_step_size": 1e-2,
    "num_actions": 3,
    "seed": 99,
}

rl_glue = RLGlue(PendulumEnvironment, ActorCriticSoftmaxAgent)
rl_glue.rl_init(agent_info, env_info)

# start env/agent
rl_glue.rl_start()
rl_glue.rl_step()

# simple alias
agent = rl_glue.agent

print("agent next_action: {}".format(agent.last_action))
print("agent avg reward: {}\n".format(agent.avg_reward))

assert agent.last_action == 1
assert agent.avg_reward == -0.03139092653589793

print("agent first 10 values of actor weights[0]: \n{}\n".format(
    agent.actor_w[0][:10]))
print("agent first 10 values of actor weights[1]: \n{}\n".format(
    agent.actor_w[1][:10]))

def experiment(num_runs, max_steps):
    agent = Agent()
    environment = Environment()
    rlg = RLGlue(environment, agent)

    optimal_actions_optimistic = np.zeros(max_steps)
    optimal_actions_realistic = np.zeros(max_steps)

    for run in range(num_runs):
        # initialize RL-Glue
        rlg.rl_init()
        _, last_action = rlg.rl_start()
        optimal = environment.env_optimal_action()
        if last_action == optimal:
            optimal_actions_optimistic[0] += 1
        for i in range(1, max_steps):
            _, _, last_action, _ = rlg.rl_step()
            if last_action == optimal:
                optimal_actions_optimistic[i] += 1
        print("\rCurrent: %i" % run, end="")

    for run in range(num_runs):
        # initialize RL-Glue
        rlg.rl_init()
        agent.set_epsilon(0.1)
        agent.set_q(0)
        _, last_action = rlg.rl_start()
        optimal = environment.env_optimal_action()
        if last_action == optimal:
            optimal_actions_realistic[0] += 1
        for i in range(1, max_steps):
            _, _, last_action, _ = rlg.rl_step()
            if last_action == optimal:
                optimal_actions_realistic[i] += 1
        print("\rCurrent: %i" % run, end="")

    optimal_actions_optimistic /= num_runs
    optimal_actions_realistic /= num_runs

    fig, ax = plt.subplots()
    ax.plot(np.arange(1, 1001), optimal_actions_optimistic, 'r',
            label='optimistic, greedy, Q1 = 0.5, epsilon = 0')
    ax.plot(np.arange(1, 1001), optimal_actions_realistic, 'b',
            label='realistic, epsilon-greedy, Q1 = 0, epsilon = 0.1')
    ax.legend()
    plt.xticks([1, 200, 400, 600, 800, 1000])
    plt.show()

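# The plot above indexes steps 1..1000, so max_steps is presumably 1000; the
# number of runs below is only an illustrative assumption.
#
#   if __name__ == "__main__":
#       experiment(num_runs=2000, max_steps=1000)
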
def main():
    # Seed RNGs for consistent testing
    random.seed(0)
    np.random.seed(0)

    # Generate agent, environment and RLGlue
    env = Environment()
    env.env_init()
    agent = Agent(env.get_actions(), env.get_max_observation(),
                  env.get_min_observation())
    rlglue = RLGlue(env, agent)
    del agent, env

    # Configure experiment
    num_eps = 100000

    # initialize rlglue
    rlglue.rl_init()
    avg_rewards = []
    avg_reward = 0
    max_reward = 0
    best_policy = None

    # Run through each episode
    # rlglue.rl_env_message('renderON')
    # for ep in range(num_eps):
    ep = 0
    x = 10
    last_i = x
    last_n = np.zeros(x)
    best = (0, -1)
    while ep < num_eps:
        last_i += 1
        if last_i >= len(last_n):
            last_i = 0
        ep += 1
        # if ep % int(num_eps/10) == 0:
        #     print('ep:', ep, 'bestpolicy', max_reward)

        # start episode
        rlglue.rl_start()
        rewards = 0
        steps = 1

        # Run episode to its completion
        terminal = False
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
            rewards += reward
            steps += 1

        if steps > best[0]:
            best = (steps, ep)

        avg_reward = steps
        avg_rewards.append(avg_reward)
        last_n[last_i] = steps

        # print('ep', ep, 'steps', steps)
        # print('ep:', ep, 'avg reward:', avg_reward, 'steps:', steps)
        # print(rlglue.rl_agent_message('policy'))
        # input()
        # if best[0] >= 500:
        print('ep', ep, 'mvg avg', np.average(last_n), steps, 'best', best)
        # if np.average(last_n) > 400:
        if ep > 2500:
            # rlglue.rl_env_message('renderON')
            break

    plt.plot(avg_rewards)
    plt.plot(moving_average(avg_rewards, 10))
    plt.plot(moving_average(avg_rewards, 100))
    plt.savefig('results.png')

def main():
    # Seed RNGs for consistent testing
    random.seed(0)
    np.random.seed(0)

    # Generate agent, environment and RLGlue
    env = Environment()
    agent = Agent(env.get_actions())
    rlglue = RLGlue(env, agent)
    del agent, env

    # Configure experiment
    num_eps = 100000

    # initialize rlglue
    rlglue.rl_init()
    avg_rewards = []
    avg_reward = 0
    max_reward = 0
    best_policy = None

    # Run through each episode
    # rlglue.rl_env_message('renderON')
    # for ep in range(num_eps):
    ep = 0
    while ep < num_eps:
        ep += 1
        # if ep % int(num_eps/10) == 0:
        #     print('ep:', ep, 'bestpolicy', max_reward)

        # start episode
        rlglue.rl_start()
        rewards = 0
        steps = 1

        # Run episode to its completion
        terminal = False
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
            rewards += reward
            steps += 1

        avg_reward = rewards
        avg_rewards.append(avg_reward)
        if rewards > max_reward:
            max_reward = rewards
            best_policy = rlglue.rl_agent_message('policy')
            pickle.dump(best_policy, open("policy.pickle", "wb"))
            print('ep', ep, 'reward', avg_reward)
        # print('ep:', ep, 'avg reward:', avg_reward, 'steps:', steps)
        # print(rlglue.rl_agent_message('policy'))
        # input()

    plt.plot(avg_rewards)
    plt.plot(moving_average(avg_rewards, 10))
    plt.plot(moving_average(avg_rewards, 100))
    plt.savefig('results.png')

    # Get generated policy
    policy = rlglue.rl_agent_message('policy')

    # Test policy
    result = testPolicy(best_policy)

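# moving_average is used by the two scripts above but is not defined in them; a
# minimal sketch, assuming a simple trailing-window mean computed with numpy.
def moving_average(values, window):
    weights = np.ones(window) / window
    return np.convolve(values, weights, mode='valid')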