def main(agent_info, agent_class, steps, filename):
    env_class = floating_horsetrack_environment.Environment
    rl_glue = RLGlue(env_class, agent_class)

    max_steps = steps
    step = 0
    episode_end = []
    cum_reward = 0

    agent_info.update({"actions": env_class.actions})
    rl_glue.rl_init(agent_info)

    while step < max_steps:
        rl_glue.rl_start()

        is_terminal = False

        while not is_terminal and step < max_steps:
            reward, state, action, is_terminal = rl_glue.rl_step()
            cum_reward += reward

            step += 1

        if is_terminal:
            episode_end.append(step)
        rl_glue.rl_cleanup()

    save_results(episode_end, len(episode_end), "data/{}".format(filename))
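A minimal sketch of how this entry point might be invoked; the agent module and the agent_info values are illustrative assumptions, not part of the original code.

# Hypothetical invocation; random_agent and the agent_info keys it expects are assumptions.
if __name__ == "__main__":
    main(agent_info={"epsilon": 0.1, "alpha": 0.5},
         agent_class=random_agent.Agent,
         steps=100_000,
         filename="floating_horsetrack_random_agent.dat")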
Example #2
def testPolicy(policy):
    agent = testAgent(policy)
    env = Environment()
    rlglue = RLGlue(env, agent)
    del env, agent
    rlglue.rl_init()

    # set up a 1D list of per-step reward sums
    # rewards[step] = sum of rewards across all runs for that step
    rewards = [0 for i in range(1000)]
    for run in range(1):
        rlglue.rl_init()
        #rlglue.rl_env_message('renderON')
        rlglue.rl_start()

        terminal = False
        for step in range(1000):
            if not terminal:
                r, s, a, terminal = rlglue.rl_step()
                rewards[step] += r

    # average rewards
    rewards = [i / 1 for i in rewards]

    return rewards
Example #3
def main(agent_info, agent_class, env_info, env_class, steps, param_info):
    # env_class = horsetrack_environment.Environment
    rl_glue = RLGlue(env_class, agent_class)

    max_steps = steps
    max_episodes = 5
    step = 0
    episodes = 0
    episode_end = np.ones(max_episodes) * max_steps
    cum_reward = 0

    # max_steps = 20000

    agent_info.update({"actions": env_class.actions})
    rl_glue.rl_init(agent_info, env_info)

    while step < max_steps and episodes < max_episodes:
        rl_glue.rl_start()

        is_terminal = False

        while not is_terminal and step < max_steps:
            reward, state, action, is_terminal = rl_glue.rl_step()
            cum_reward += reward

            step += 1

        if is_terminal:
            episode_end[episodes] = step
            episodes += 1
        rl_glue.rl_cleanup()

    save_results(episode_end, "{}".format(param_info))
Example #4
def main():

    num_eps = 5000
    num_runs = 10
    random.seed(0)
    np.random.seed(0)
    agent = Agent()
    env = Environment()
    rlglue = RLGlue(env, agent)
    del agent, env
    for run in range(num_runs):
        rlglue.rl_init()
        performances = []
        for ep in range(num_eps):
            rlglue.rl_start()
            #rlglue.rl_env_message('renderON')
            terminal = False
            while not terminal:
                reward, state, action, terminal = rlglue.rl_step()

            # Find the first policy that performs at 100%
            performance = testPolicy(rlglue.rl_agent_message('policy')) * 100
            performances.append(performance)
            if performance >= 100:
                #print(rlglue.rl_agent_message('policy'))
                print('Episode: %d' % (ep + 1))
                break
        plt.plot(performances)
    plt.savefig('test.png')
Example #5
def main():
    env_class = horsetrack_environment.Environment
    agent_class = random_agent.Agent
    rl_glue = RLGlue(env_class, agent_class)

    num_episodes = 1000
    max_steps = 100000

    print("\tPrinting one dot for every run: {}".format(num_episodes),
          end=' ')
    print("total runs to complete.")

    total_steps = [0 for _ in range(num_episodes)]

    for i in range(num_episodes):
        rl_glue.rl_init(agent_info={"actions": env_class.actions})
        rl_glue.rl_start()

        is_terminal = False
        while rl_glue.num_steps < max_steps and not is_terminal:
            reward, state, action, is_terminal = rl_glue.rl_step()
            # optimal_action[num_steps] += 1 if "action is optimal" else 0

        total_steps[i] = rl_glue.num_steps

        rl_glue.rl_cleanup()
        print(".", end='')
        sys.stdout.flush()

    # prop_optimal = [num_optimal / num_episodes for num_optimal in optimal_action]
    save_results(total_steps, len(total_steps), "RL_EXP_OUT.dat")
    print("\nDone")
Example #6
def main():
	
	num_eps = 200000

	agent = Agent()
	env = Environment()
	rlglue = RLGlue(env, agent)
	del agent, env
	solves = 0
	rlglue.rl_init()
	rewards = []
	for ep in range(num_eps):
		rlglue.rl_start()
		#rlglue.rl_env_message('renderON')
		terminal = False
		reward = 0
		while not terminal:
		
			reward, state, action, terminal = rlglue.rl_step()
			if ep > 1000:
				rlglue.rl_env_message('renderON')
				print(state)
				time.sleep(0.1)
		rewards.append(reward)
		if ep >= 99:
			if np.average(rewards[ep - 99:ep + 1]) > 0.78:
				print('solved at episode %d' % (ep + 1))
				break
Example #7
def question_1():
    # Specify hyper-parameters

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    np.random.seed(0)
    num_episodes = 200
    see_eps = [157]
    num_runs = 1
    max_eps_steps = 100000

    # test with various stepsizes (alphas) for agent
    stepSizes = np.linspace(0.01, 1, 100)
    # best stepsize so far (comment out to test many)
    stepSizes = [0.559184]

    # separate run for each stepsize
    for step in stepSizes:

        # initialize agent and software, with chosen stepsize
        rlglue.rl_init()
        rlglue.rl_agent_message('step:' + str(step))

        # keep track of total rewards for each episode
        total_rewards = []

        for ep in range(num_episodes):
            # render only selected episodes
            if ep in see_eps:
                rlglue.rl_env_message('rOFF')
            if ep + 1 in see_eps:
                rlglue.rl_env_message('rON')
                print("Episode %d" % (ep + 1))

            # initialize for episode
            rlglue.rl_start()
            terminal = False
            total_reward = 0

            # run episode and calculate total reward
            while not terminal:
                reward, state, action, terminal = rlglue.rl_step()
                total_reward += reward
            total_rewards.append(total_reward)

            # calculate average reward of the last 100 episodes
            if ep >= 99:
                total = np.sum(total_rewards[ep - 99:ep + 1])
                avg = total / 100

                # check if results indicate the problem is solved
                if avg > -110:
                    print("Solved at episode %d, avg reward: %f" %
                          (ep + 1, avg))
                    break

    # close environment
    environment.close()
Example #8
def main(data_output_location="new_data"):

    env_class = horsetrack_environment.Environment
    agent_class = random_agent.Agent

    agent_name = agent_class.__module__[agent_class.__module__.find(".") + 1:]
    environment_name = env_class.__module__[env_class.__module__.find(".") +
                                            1:]

    rl_glue = RLGlue(env_class, agent_class)

    # num_episodes = 2000
    # max_steps = 1000
    max_total_steps = 100_000

    for epsilon in [0.0, 0.1]:
        for alpha in [2, 1, 0.5, 0.25, 0.125, 0.0625]:
            print("Running Agent: {} on Environment: {}.".format(
                agent_name, environment_name))
            agent_init_info = {
                "actions": [-1, 1],
                "world_size": 100,
                "epsilon": epsilon,
                "alpha": alpha
            }
            termination_times = []

            rl_glue.rl_init(agent_init_info=agent_init_info)

            step_counter = 0

            while step_counter < max_total_steps:
                rl_glue.rl_start()
                is_terminal = False

                while step_counter < max_total_steps and not is_terminal:
                    reward, state, action, is_terminal = rl_glue.rl_step()
                    step_counter += 1

                rl_glue.rl_cleanup()
                # print(".", end='')
                sys.stdout.flush()

                if is_terminal:
                    termination_times.append(step_counter)

            epoch_datetime = int(
                (datetime.datetime.now() -
                 datetime.datetime.utcfromtimestamp(0)).total_seconds())

            save_results(
                termination_times, len(termination_times),
                "{}/{}_{}__{}__epsilon{}__alpha{}.dat".format(
                    data_output_location, epoch_datetime, agent_name,
                    environment_name, epsilon, alpha))

    print("\nDone")
Example #9
def run_experiment():

    #specify hyper-parameters
    num_runs = 1
    max_episodes = 1000000
    max_steps_per_episode = 100
    num_states = 181
    num_actions = 2
    alpha = 0.01
    eps = 0.1
    Q1 = 0
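    # Q1 is presumably the agent's initial action-value estimate (0 = realistic initialization)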

    results = np.zeros(max_episodes)
    results_run = 0

    agent = RandomAgent(num_states, num_actions, alpha, eps, Q1)
    environment = BlackJack()
    rlglue = RLGlue(environment, agent)

    print(
        "\nPrinting one dot for every run: {0} total runs to complete".format(
            num_runs))

    for run in range(num_runs):
        np.random.seed(run)
        results_run = 0.0

        rlglue.rl_init()
        for e in range(1, max_episodes + 1):
            rlglue.rl_start()
            for s in range(max_steps_per_episode):
                r, _, _, terminal = rlglue.rl_step()
                results_run += r
                results[e - 1] += r

                if terminal:
                    break

            if e % 10000 == 0:
                print(
                    "\nEpisode {}: average return so far is {}, and the policy is"
                    .format(e, results_run / e))
                print(rlglue.rl_agent_message("printPolicy"))
        print(".")

    print("Average return over experiment: {}".format(
        (results / num_runs).mean()))

    #save final policy to file -- change file name as necessary
    with open("policy.txt", 'w') as f:
        f.write(rlglue.rl_agent_message("printPolicy"))

    #save all the experiment data for analysis -- change file name as necessary
    save_results(results / num_runs, max_episodes, "RL_EXP_OUT.dat")
Example #10
def testPolicy(policy):
    env = Environment()
    agent = testAgent(policy)
    rlglue = RLGlue(env, agent)
    rlglue.rl_init()
    #rlglue.rl_env_message('renderON')
    performance = 0
    for ep in range(100):
        rlglue.rl_start()
        terminal = False
        reward = None
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
        if reward > 0:
            performance += 1

    return performance / 100
Example #11
def testPolicy(policy):
    agent = testAgent(policy)
    env = Environment()
    rlglue = RLGlue(env, agent)
    del env, agent
    rlglue.rl_init()

    for run in range(1):
        rlglue.rl_init()
        rlglue.rl_env_message('renderON')
        rlglue.rl_start()

        total_reward = 0
        terminal = False
        while not terminal:
            r, s, a, terminal = rlglue.rl_step()
            total_reward += r

    return total_reward
Example #12
def experiment1():
    agent = RandomAgent()
    environment = Environment1D()
    rlg = RLGlue(environment, agent)

    max_steps = 1000  # max number of steps in an episode
    num_runs = 2000  # number of repetitions of the experiment
    optimal_action = np.zeros(max_steps)

    for k in range(num_runs):

        # initialize RL-Glue
        rlg.rl_init()  #env_init + agent_init
        rlg.rl_start()
        for i in range(max_steps):  #step
            action = rlg.rl_step()[2]
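            # env_message() is assumed here to return the environment's optimal action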
            if action == environment.env_message():
                optimal_action[i] += 1
    ratio_optimal_action = optimal_action / num_runs

    return ratio_optimal_action
Example #13
def main():
    # env = drifter_distractor_env.Environment
    env = switched_drifter_distractor_env.Environment

    agents = [random_agent.Agent, weight_change_agent.Agent]
    agent_types = ["absolute_error", "squared_error", "weight_change"]

    for agent_type in agent_types:
        agent = agents[1]
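        # note: agents[1] (weight_change_agent) is used for every agent_type; the
        # agent_type string presumably only selects which error measure it tracks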

        agent_info = {
            "num_actions": 4,
            "action_selection": "softmax",
            "agent_type": agent_type
        }
        env_info = {}

        num_runs = 1
        num_steps = 100000

        actions = [0 for _ in range(4)]

        errors = []

        for run in range(num_runs):
            rl_glue = RLGlue(env, agent)
            rl_glue.rl_init(agent_info, env_info)
            rl_glue.rl_start()

            for step in range(num_steps):
                reward, state, action, is_terminal = rl_glue.rl_step()
                actions[action] += 1

        # np.save("data/squared_error", rl_glue.agent.track_actions)
        np.save("data/{}".format(agent_type), rl_glue.agent.track_actions)
        # print(rl_glue.environment.arm_1)
        # print(rl_glue.environment.arm_2)
        # print(rl_glue.environment.arm_3)
        # print(rl_glue.environment.arm_4)
        print(actions)
Example #14
def run_experiment(env,
                   agent,
                   agent_info,
                   env_info,
                   num_experiments=1,
                   num_steps=None,
                   seeds=None):
    all_scores = []
    for _ in range(num_experiments):
        rl_glue = RLGlue(env, agent)
        rl_glue.rl_init(agent_info, env_info)
        rl_glue.rl_start()

        scores = [0]
        averages = []

        for i in range(num_steps):
            reward, state, action, is_terminal = rl_glue.rl_step()
            scores.append(scores[-1] + reward)
            averages.append(scores[-1] / (i + 1))

        all_scores.append(averages)

    return all_scores
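A minimal usage sketch for this helper, assuming a bandit-style environment/agent pair like the ones in the other examples; the module names and parameter values below are illustrative placeholders, not part of the original code.

# Hypothetical call; ten_arm_env.Environment and greedy_agent.Agent are placeholder names.
agent_info = {"num_actions": 10, "epsilon": 0.1}
env_info = {}
all_scores = run_experiment(ten_arm_env.Environment, greedy_agent.Agent,
                            agent_info, env_info,
                            num_experiments=5, num_steps=1000)
# all_scores[k][t] is run k's average reward per step after t + 1 steps.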
Example #15
    for num_moves in possible_num_moves:

        episode = np.zeros(max_steps)

        for run in range(max_run):
            count_episode = -1
            rlglue.rl_init()
            agent.possibleMoves = num_moves

            terminal = True
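            # start with terminal = True so the first loop iteration calls rl_start()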

            for step in range(max_steps):

                if terminal:
                    rlglue.rl_start()
                    count_episode += 1

                _, _, _, terminal = rlglue.rl_step()

                episode[step] += count_episode

        plt.plot(np.arange(max_steps),
                 episode / max_run,
                 label="Possible move: " + str(num_moves))

    plt.legend()
    plt.xlabel("Time steps")
    plt.ylabel("Episodes")
    plt.title("One-step Sarsa for Different Possible Moves")
    plt.show()
Example #16
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # sweep agent parameters
    for num_tilings in agent_parameters['num_tilings']:
        for num_tiles in agent_parameters["num_tiles"]:
            for update_ss in agent_parameters["update_step_size"]:
                for avg_reward_ss in agent_parameters["avg_reward_step_size"]:
                    for epsilon in agent_parameters["epsilon"]:
                        env_info = {}
                        agent_info = {"num_tilings": num_tilings,
                                      "num_tiles": num_tiles,
                                      "alpha": update_ss,
                                      "avg_reward_step_size": avg_reward_ss,
                                      "epsilon":epsilon,
                                      "num_actions": agent_parameters["num_actions"],
                                      "iht_size": agent_parameters["iht_size"]}
                        # results to save
                        return_per_step = np.zeros(
                            (experiment_parameters["num_runs"], experiment_parameters["max_steps"]))
                        exp_avg_reward_per_step = np.zeros(
                                (experiment_parameters["num_runs"], experiment_parameters["max_steps"]))
                        # using tqdm we visualize progress bars
                        avg_reward_list = []
                        avg_reward = -10000
                        for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)):
                            env_info["seed"] = run
                            agent_info["seed"] = run
                            rl_glue.rl_init(agent_info, env_info)
                            rl_glue.rl_start()
                            num_steps = 0
                            total_return = 0.
                            #return_arr = []
                            # exponential average reward without initial bias
                            exp_avg_reward = 0.0
                            exp_avg_reward_ss = 0.01
                            exp_avg_reward_normalizer = 0
                            while num_steps < experiment_parameters['max_steps']:
                                num_steps += 1
                                rl_step_result = rl_glue.rl_step()
                                reward = rl_step_result[0]
                                total_return += reward
                                #return_arr.append(reward)
                                avg_reward = rl_glue.rl_agent_message("get avg reward")
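                                # De-biased exponential average: the normalizer rises from 0
                                # toward 1, so the effective step size ss starts at 1 and
                                # decays to exp_avg_reward_ss, removing the bias toward the
                                # initial value of 0.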
                                exp_avg_reward_normalizer = exp_avg_reward_normalizer + exp_avg_reward_ss * (
                                                1 - exp_avg_reward_normalizer)
                                ss = exp_avg_reward_ss / exp_avg_reward_normalizer
                                exp_avg_reward += ss * (reward - exp_avg_reward)

                                return_per_step[run - 1][num_steps - 1] = total_return
                                exp_avg_reward_per_step[run - 1][num_steps - 1] = exp_avg_reward
                            avg_reward_list.append(avg_reward)
                        print(np.average(avg_reward_list))
                        if not os.path.exists('results_sarsa'):
                            os.makedirs('results_sarsa')

                        save_name = "semi-gradient_sarsa_tilings_{}_tiledim_{}_update_ss_{}_epsilon_ss_{}_avg_reward_ss_{}_max_steps_{}".format(
                            num_tilings, num_tiles, update_ss, epsilon, avg_reward_ss, experiment_parameters["max_steps"])
                        total_return_filename = "results_sarsa/{}_total_return.npy".format(save_name)
                        exp_avg_reward_filename = "results_sarsa/{}_exp_avg_reward.npy".format(save_name)

                        np.save(total_return_filename, return_per_step)
                        np.save(exp_avg_reward_filename, exp_avg_reward_per_step)
Example #17
# here it just needs the number of actions (number of arms).
env_info = {
}  # Pass the environment the information it needs; in this case, it is nothing.

all_averages = []

for run in tqdm(range(num_runs)):  # tqdm creates the progress bar shown once the code is run
    rl_glue = RLGlue(
        env, agent
    )  # Creates a new RLGlue experiment with the env and agent we chose above
    rl_glue.rl_init(
        agent_info, env_info
    )  # Pass RLGlue what it needs to initialize the agent and environment
    rl_glue.rl_start()  # Start the experiment

    scores = [0]
    averages = []

    for i in range(num_steps):
        reward, _, action, _ = rl_glue.rl_step(
        )  # The environment and agent take a step and return
        # the reward, and action taken.
        scores.append(scores[-1] + reward)
        averages.append(scores[-1] / (i + 1))
    all_averages.append(averages)

plt.figure(figsize=(15, 5), dpi=80, facecolor='w', edgecolor='k')
plt.plot([1.55 for _ in range(num_steps)], linestyle="--")
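# (the dashed line at 1.55 is presumably the best achievable average reward for this bandit problem)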
plt.plot(np.mean(all_averages, axis=0))
Example #18
        for run in tqdm(range(num_runs)):
            agent_info["seed"] = run
            rl_glue = RLGlue(env, agents[algorithm])
            rl_glue.rl_init(agent_info, env_info)

            reward_sums = []
            state_visits = np.zeros(agent_info["num_states"])
            #         last_episode_total_reward = 0
            for episode in range(num_episodes):
                start_time = time.perf_counter()
                if episode < num_episodes - 10:
                    # Runs an episode
                    rl_glue.rl_episode(0)
                else:
                    # Runs an episode while keeping track of visited states
                    state, action = rl_glue.rl_start()
                    state_visits[state] += 1
                    is_terminal = False
                    while not is_terminal:
                        # # stop the program
                        # line = sys.stdin.readline()
                        # print 'line=', line
                        # if line == 'q':
                        #     sys.exit()
                        reward, state, action, is_terminal = rl_glue.rl_step()
                        state_visits[state] += 1

                reward_sums.append(rl_glue.rl_return())
                #             last_episode_total_reward = rl_glue.rl_return()
                end_time = time.perf_counter()
                print("The time of episode", episode, ":", end_time - start_time)
Example #19
def run_experiment(environment, agent, environment_parameters,
                   agent_parameters, experiment_parameters):

    rl_glue = RLGlue(environment, agent)

    # sweep agent parameters
    for num_tilings in agent_parameters['num_tilings']:
        for num_tiles in agent_parameters["num_tiles"]:
            for actor_ss in agent_parameters["actor_step_size"]:
                for critic_ss in agent_parameters["critic_step_size"]:
                    for avg_reward_ss in agent_parameters[
                            "avg_reward_step_size"]:

                        env_info = {}
                        agent_info = {
                            "num_tilings": num_tilings,
                            "num_tiles": num_tiles,
                            "actor_step_size": actor_ss,
                            "critic_step_size": critic_ss,
                            "avg_reward_step_size": avg_reward_ss,
                            "num_actions": agent_parameters["num_actions"],
                            "iht_size": agent_parameters["iht_size"]
                        }

                        # results to save
                        return_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))
                        exp_avg_reward_per_step = np.zeros(
                            (experiment_parameters["num_runs"],
                             experiment_parameters["max_steps"]))

                        # using tqdm we visualize progress bars
                        for run in tqdm(
                                range(1,
                                      experiment_parameters["num_runs"] + 1)):
                            env_info["seed"] = run
                            agent_info["seed"] = run

                            rl_glue.rl_init(agent_info, env_info)
                            rl_glue.rl_start()

                            num_steps = 0
                            total_return = 0.
                            return_arr = []

                            # exponential average reward without initial bias
                            exp_avg_reward = 0.0
                            exp_avg_reward_ss = 0.01
                            exp_avg_reward_normalizer = 0

                            while num_steps < experiment_parameters[
                                    'max_steps']:
                                num_steps += 1

                                rl_step_result = rl_glue.rl_step()

                                reward = rl_step_result[0]
                                total_return += reward
                                return_arr.append(reward)
                                avg_reward = rl_glue.rl_agent_message(
                                    "get avg reward")

                                exp_avg_reward_normalizer = exp_avg_reward_normalizer + exp_avg_reward_ss * (
                                    1 - exp_avg_reward_normalizer)
                                ss = exp_avg_reward_ss / exp_avg_reward_normalizer
                                exp_avg_reward += ss * (reward -
                                                        exp_avg_reward)

                                return_per_step[run - 1][num_steps -
                                                         1] = total_return
                                exp_avg_reward_per_step[run -
                                                        1][num_steps -
                                                           1] = exp_avg_reward

                        if not os.path.exists('results'):
                            os.makedirs('results')

                        save_name = "ActorCriticSoftmax_tilings_{}_tiledim_{}_actor_ss_{}_critic_ss_{}_avg_reward_ss_{}".format(
                            num_tilings, num_tiles, actor_ss, critic_ss,
                            avg_reward_ss)
                        total_return_filename = "results/{}_total_return.npy".format(
                            save_name)
                        exp_avg_reward_filename = "results/{}_exp_avg_reward.npy".format(
                            save_name)

                        np.save(total_return_filename, return_per_step)
                        np.save(exp_avg_reward_filename,
                                exp_avg_reward_per_step)
Example #20
agent_info = {
    "iht_size": 4096,
    "num_tilings": 8,
    "num_tiles": 8,
    "actor_step_size": 1e-1,
    "critic_step_size": 1e-0,
    "avg_reward_step_size": 1e-2,
    "num_actions": 3,
    "seed": 99,
}

rl_glue = RLGlue(PendulumEnvironment, ActorCriticSoftmaxAgent)
rl_glue.rl_init(agent_info, env_info)

# start env/agent
rl_glue.rl_start()
rl_glue.rl_step()

# simple alias
agent = rl_glue.agent

print("agent next_action: {}".format(agent.last_action))
print("agent avg reward: {}\n".format(agent.avg_reward))

assert agent.last_action == 1
assert agent.avg_reward == -0.03139092653589793

print("agent first 10 values of actor weights[0]: \n{}\n".format(
    agent.actor_w[0][:10]))
print("agent first 10 values of actor weights[1]: \n{}\n".format(
    agent.actor_w[1][:10]))
Example #21
def experiment(num_runs, max_steps):

    agent = Agent()
    environment = Environment()
    rlg = RLGlue(environment, agent)

    optimal_actions_optimistic = np.zeros(max_steps)
    optimal_actions_realistic = np.zeros(max_steps)

    for run in range(num_runs):

        # initialize RL-Glue
        rlg.rl_init()
        _, last_action = rlg.rl_start()

        optimal = environment.env_optimal_action()

        if last_action == optimal:
            optimal_actions_optimistic[0] += 1

        for i in range(1, max_steps):
            _, _, last_action, _ = rlg.rl_step()

            if last_action == optimal:
                optimal_actions_optimistic[i] += 1

        print("\rCurrent: %i" % run, end="")

    for run in range(num_runs):

        # initialize RL-Glue
        rlg.rl_init()
        agent.set_epsilon(0.1)
        agent.set_q(0)
        _, last_action = rlg.rl_start()

        optimal = environment.env_optimal_action()

        if last_action == optimal:
            optimal_actions_realistic[0] += 1

        for i in range(1, max_steps):
            _, _, last_action, _ = rlg.rl_step()

            if last_action == optimal:
                optimal_actions_realistic[i] += 1

        print("\rCurrent: %i" % run, end="")

    optimal_actions_optimistic /= num_runs
    optimal_actions_realistic /= num_runs

    fig, ax = plt.subplots()
    ax.plot(np.arange(1, 1001),
            optimal_actions_optimistic,
            'r',
            label='optimistic,greedy,Q1 = 0.5, epsilon = 0')
    ax.plot(np.arange(1, 1001),
            optimal_actions_realistic,
            'b',
            label='realistic,epsilon-greedy,Q1 = 0, epsilon = 0.1')
    ax.legend()
    plt.xticks([1, 200, 400, 600, 800, 1000])
    plt.show()
Example #22
def main():
    # Seed rng's for consistent testing
    random.seed(0)
    np.random.seed(0)

    # Generate agent, environment and RLGlue
    env = Environment()
    env.env_init()
    agent = Agent(env.get_actions(), env.get_max_observation(),
                  env.get_min_observation())
    rlglue = RLGlue(env, agent)
    del agent, env

    # Configure experiment
    num_eps = 100000
    # initialize rlglue
    rlglue.rl_init()

    avg_rewards = []
    avg_reward = 0
    max_reward = 0
    best_policy = None
    # Run through each episode
    #rlglue.rl_env_message('renderON')
    #for ep in range(num_eps):
    ep = 0
    x = 10
    last_i = x
    last_n = np.zeros(x)
    best = (0, -1)
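    # last_n is a circular buffer of the last x episode lengths (last_i is the write
    # index); best tracks (longest episode length so far, episode in which it occurred)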
    while ep < num_eps:
        last_i += 1
        if last_i >= len(last_n):
            last_i = 0
        ep += 1
        #if ep % int(num_eps/10) == 0:
        #print('ep:', ep, 'bestpolicy', max_reward)
        # start episode
        rlglue.rl_start()
        rewards = 0
        steps = 1
        # Run episode to its completion
        terminal = False
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
            rewards += reward
            steps += 1

        if steps > best[0]:
            best = (steps, ep)

        avg_reward = steps
        avg_rewards.append(avg_reward)
        last_n[last_i] = steps

        #print('ep',ep, 'steps', steps)
        #print('ep:',ep, 'avg reward:', avg_reward, 'steps:', steps)
        #print(rlglue.rl_agent_message('policy'))
        #input()
        #if best[0] >= 500:
        print('ep', ep, 'mvg avg', np.average(last_n), steps, 'best', best)
        #if np.average(last_n) > 400:
        if ep > 2500:
            #rlglue.rl_env_message('renderON')

            break

    plt.plot(avg_rewards)
    plt.plot(moving_average(avg_rewards, 10))
    plt.plot(moving_average(avg_rewards, 100))
    plt.savefig('results.png')
Example #23
def main():
    # Seed rng's for consistent testing
    random.seed(0)
    np.random.seed(0)

    # Generate agent, environment and RLGlue
    env = Environment()
    agent = Agent(env.get_actions())
    rlglue = RLGlue(env, agent)
    del agent, env

    # Configure experiment
    num_eps = 100000
    # initialize rlglue
    rlglue.rl_init()

    avg_rewards = []
    avg_reward = 0
    max_reward = 0
    best_policy = None
    # Run through each episode
    #rlglue.rl_env_message('renderON')
    #for ep in range(num_eps):
    ep = 0
    while ep < num_eps:
        ep += 1
        #if ep % int(num_eps/10) == 0:
        #print('ep:', ep, 'bestpolicy', max_reward)
        # start episode
        rlglue.rl_start()
        rewards = 0
        steps = 1
        # Run episode to its completion
        terminal = False
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
            rewards += reward
            steps += 1

        avg_reward = rewards
        avg_rewards.append(avg_reward)

        if rewards > max_reward:
            max_reward = rewards
            best_policy = rlglue.rl_agent_message('policy')
            pickle.dump(best_policy, open("policy.pickle", "wb"))
            print('ep', ep, 'reward', avg_reward)
        #print('ep:',ep, 'avg reward:', avg_reward, 'steps:', steps)
        #print(rlglue.rl_agent_message('policy'))
        #input()

    plt.plot(avg_rewards)
    plt.plot(moving_average(avg_rewards, 10))
    plt.plot(moving_average(avg_rewards, 100))
    plt.savefig('results.png')

    # Get generated policy
    policy = rlglue.rl_agent_message('policy')

    # Test policy
    result = testPolicy(best_policy)