Exemplo n.º 1
0
def main():
    """Compare tabular and tile-coding value estimates on the random walk."""
    episodes = 2000
    runs = 30
    true_value = np.load("TrueValueFunction.npy")

    # Tabular feature encoding experiment.
    glue = RLGlue(RandomWalkEnvironment(), TabularAgent())
    tabular_rmse = experiment(glue, episodes, runs)

    # Tile-coding experiment.
    glue = RLGlue(RandomWalkEnvironment(), TileAgent())
    tile_rmse = experiment(glue, episodes, runs)

    xs = range(episodes)
    plt.plot(xs, tabular_rmse, label="Tabular feature encoding")
    plt.plot(xs, tile_rmse, label="Tile coding")
    plt.xlabel("Episodes")
    plt.ylabel("RMSE")
    plt.legend()
    plt.show()
Exemplo n.º 2
0
def main():
    """Reproduce the optimistic-greedy vs. realistic-epsilon-greedy plot."""
    steps_per_episode = 1000  # max number of steps in an episode
    repetitions = 2000  # number of repetitions of the experiment

    result1 = experiment2(RLGlue(OneStateEnvironment(), RandomAgent1()),
                          repetitions, steps_per_episode)
    result2 = experiment2(RLGlue(OneStateEnvironment(), RandomAgent2()),
                          repetitions, steps_per_episode)

    xs = [i for i in range(steps_per_episode)]
    plt.plot(xs, result1, color="grey")
    plt.plot(xs, result2, color="blue")

    # Render the y axis as percentages of optimal-action choices.
    plt.yticks([0, .2, .4, .6, .8, 1],
               ['0%', '20%', '40%', '60%', '80%', '100%'])
    plt.text(200, .85, r'Optimisic, greedy Q1=5, ε=0')
    plt.text(600, .55, r'Realistic,ε-greedy Q1=0, ε=0.1')
    plt.ylabel('%\nOptimal\naction', rotation=0)
    plt.xlabel('Steps')
    plt.show()
Exemplo n.º 3
0
def foreground(servos, time_steps, episodes, plotting_data):
    """Run `episodes` episodes of the DanceBot, `time_steps` steps each."""
    glue = RLGlue(ServoEnvironment(servos), DanceBot(plotting_data))

    print("\nRunning {} episodes with {} time-steps each.".format(episodes,
                                                                  time_steps))
    for _ in range(episodes):
        glue.rl_episode(time_steps)
Exemplo n.º 4
0
def foreground(servos, time_steps, episodes, plotting_data):
    """Drive the DanceBot agent through the requested number of episodes."""
    agent = DanceBot(plotting_data)
    environment = ServoEnvironment(servos)
    glue = RLGlue(environment, agent)

    banner = "\nRunning {} episodes with {} time-steps each."
    print(banner.format(episodes, time_steps))
    for _ in range(episodes):
        glue.rl_episode(time_steps)
Exemplo n.º 5
0
def main(agent_info, agent_class, env_info, env_class, steps, param_info):
    """Run at most 5 episodes (capped at `steps` total steps) and save the
    step index at which each episode terminated."""
    rl_glue = RLGlue(env_class, agent_class)

    max_steps = steps
    max_episodes = 5
    # Episodes that never terminate keep the sentinel value `max_steps`.
    episode_end = np.full(max_episodes, max_steps, dtype=float)
    cum_reward = 0
    step = 0
    episodes = 0

    agent_info.update({"actions": env_class.actions})
    rl_glue.rl_init(agent_info, env_info)

    while step < max_steps and episodes < max_episodes:
        rl_glue.rl_start()
        is_terminal = False

        while not is_terminal and step < max_steps:
            reward, state, action, is_terminal = rl_glue.rl_step()
            cum_reward += reward
            step += 1

        if is_terminal:
            episode_end[episodes] = step
            episodes += 1
        rl_glue.rl_cleanup()

    save_results(episode_end, "{}".format(param_info))
Exemplo n.º 6
0
def testPolicy(policy):
    """Evaluate `policy` for one 1000-step run; return per-step rewards."""
    rlglue = RLGlue(Environment(), testAgent(policy))
    rlglue.rl_init()

    n_steps, n_runs = 1000, 1
    # rewards[step] accumulates the reward across all runs for that step.
    rewards = [0] * n_steps
    for _ in range(n_runs):
        rlglue.rl_init()
        rlglue.rl_start()

        terminal = False
        for step in range(n_steps):
            if not terminal:
                r, s, a, terminal = rlglue.rl_step()
                rewards[step] += r

    # Average across runs (a single run here, so this is a no-op).
    return [total / n_runs for total in rewards]
def main(agent_info, agent_class, steps, filename):
    """Run episodes on the floating horsetrack until `steps` total steps,
    recording the step at which each episode terminated."""
    env_class = floating_horsetrack_environment.Environment
    rl_glue = RLGlue(env_class, agent_class)

    episode_end = []
    cum_reward = 0
    step = 0

    agent_info.update({"actions": env_class.actions})
    rl_glue.rl_init(agent_info)

    while step < steps:
        rl_glue.rl_start()
        is_terminal = False

        while not is_terminal and step < steps:
            reward, state, action, is_terminal = rl_glue.rl_step()
            cum_reward += reward
            step += 1

        if is_terminal:
            episode_end.append(step)
        rl_glue.rl_cleanup()

    save_results(episode_end, len(episode_end), "data/{}".format(filename))
Exemplo n.º 8
0
def main():
    """Train until success: average final reward > 0.78 over the last 100
    episodes, then report the solving episode."""
    num_eps = 200000

    agent = Agent()
    env = Environment()
    rlglue = RLGlue(env, agent)
    del agent, env

    rlglue.rl_init()
    rewards = []
    for ep in range(num_eps):
        rlglue.rl_start()
        terminal = False
        reward = 0
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
            if ep > 1000:
                # Render and slow down late episodes for inspection.
                rlglue.rl_env_message('renderON')
                print(state)
                time.sleep(0.1)
        # Only the final reward of the episode is recorded.
        rewards.append(reward)
        if ep >= 99:
            if np.average(rewards[ep - 99:ep + 1]) > 0.78:
                # BUG FIX: '%' binds tighter than '+', so the original
                # `'solved at episode %d' % ep+1` evaluated as
                # `('...' % ep) + 1` and raised TypeError (str + int).
                print('solved at episode %d' % (ep + 1))
                break
Exemplo n.º 9
0
def main():
    """Run a random agent on the horsetrack for 1000 episodes, recording
    the number of steps each episode took."""
    env_class = horsetrack_environment.Environment
    agent_class = random_agent.Agent
    rl_glue = RLGlue(env_class, agent_class)

    num_episodes = 1000
    max_steps = 100000

    print("\tPrinting one dot for every run: {}".format(num_episodes),
          end=' ')
    print("total runs to complete.")

    total_steps = [0] * max_steps

    for episode in range(num_episodes):
        rl_glue.rl_init(agent_info={"actions": env_class.actions})
        rl_glue.rl_start()

        is_terminal = False
        while rl_glue.num_steps < max_steps and not is_terminal:
            reward, state, action, is_terminal = rl_glue.rl_step()

        total_steps[episode] = rl_glue.num_steps

        rl_glue.rl_cleanup()
        print(".", end='')  # one dot of progress per episode
        sys.stdout.flush()

    save_results(total_steps, len(total_steps), "RL_EXP_OUT.dat")
    print("\nDone")
Exemplo n.º 10
0
def main(data_output_location="new_data"):
    """Sweep (epsilon, alpha) settings on the horsetrack environment and
    save each configuration's termination times to a timestamped .dat file."""
    env_class = horsetrack_environment.Environment
    agent_class = random_agent.Agent

    # Module name with its package prefix stripped, used in output filenames.
    agent_name = agent_class.__module__[agent_class.__module__.find(".") + 1:]
    environment_name = env_class.__module__[env_class.__module__.find(".") +
                                            1:]

    rl_glue = RLGlue(env_class, agent_class)
    max_total_steps = 100_000

    for epsilon in [0.0, 0.1]:
        for alpha in [2, 1, 0.5, 0.25, 0.125, 0.0625]:
            print("Running Agent: {} on Environment: {}.".format(
                agent_name, environment_name))
            rl_glue.rl_init(agent_init_info={
                "actions": [-1, 1],
                "world_size": 100,
                "epsilon": epsilon,
                "alpha": alpha
            })

            termination_times = []
            steps_taken = 0
            while steps_taken < max_total_steps:
                rl_glue.rl_start()
                is_terminal = False

                while steps_taken < max_total_steps and not is_terminal:
                    reward, state, action, is_terminal = rl_glue.rl_step()
                    steps_taken += 1

                rl_glue.rl_cleanup()
                sys.stdout.flush()

                if is_terminal:
                    termination_times.append(steps_taken)

            # Seconds since the Unix epoch, keeps filenames unique per sweep.
            epoch_datetime = int(
                (datetime.datetime.now() -
                 datetime.datetime.utcfromtimestamp(0)).total_seconds())

            save_results(
                termination_times, len(termination_times),
                "{}/{}_{}__{}__epsilon{}__alpha{}.dat".format(
                    data_output_location, epoch_datetime, agent_name,
                    environment_name, epsilon, alpha))

    print("\nDone")
Exemplo n.º 11
0
def main():
    """Run up to 5000 training episodes per run, stopping each run at the
    first policy that tests at 100%; plot each run's performance curve."""
    num_eps = 5000
    num_runs = 10
    random.seed(0)
    np.random.seed(0)

    agent = Agent()
    env = Environment()
    rlglue = RLGlue(env, agent)
    del agent, env

    for _ in range(num_runs):
        rlglue.rl_init()
        performances = []
        for ep in range(num_eps):
            rlglue.rl_start()
            terminal = False
            while not terminal:
                reward, state, action, terminal = rlglue.rl_step()

            # Stop this run at the first policy that performs perfectly.
            performance = testPolicy(rlglue.rl_agent_message('policy')) * 100
            performances.append(performance)
            if performance >= 100:
                print('Episode: %d' % (ep + 1))
                break
        plt.plot(performances)
    plt.savefig('test.png')
Exemplo n.º 12
0
def question_4():
    """Run 50 training runs of 200 episodes; save per-episode step counts
    and the mean/standard-error of the total rewards."""
    rlglue = RLGlue(Environment(), Agent())

    num_episodes = 200
    num_runs = 50
    max_eps_steps = 1000000

    steps = np.zeros([num_runs, num_episodes])
    rewards = []
    for run in range(num_runs):
        print("run number : ", run + 1)
        rlglue.rl_init()
        for episode in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[run, episode] = rlglue.num_ep_steps()
        rewards.append(rlglue.total_reward())

    mean = sum(rewards) / len(rewards)
    # Standard error of the mean across runs.
    stder = statistics.stdev(rewards) / math.sqrt(len(rewards))
    print("mean:", mean)
    print("std:", stder)
    np.save('bonus_steps', steps)
    np.save("mean", mean)
    np.save("stder", stder)
Exemplo n.º 13
0
def run_experiment():
    """Train a random agent on BlackJack, periodically printing the average
    return and learned policy, then save the policy and per-episode returns.
    """

    # Specify hyper-parameters.
    num_runs = 1
    max_episodes = 1000000
    max_steps_per_episode = 100
    num_states = 181
    num_actions = 2
    alpha = 0.01  # step size
    eps = 0.1  # exploration rate
    Q1 = 0  # initial action-value estimate

    # results[e] accumulates the return of episode e across runs.
    results = np.zeros(max_episodes)
    results_run = 0

    agent = RandomAgent(num_states, num_actions, alpha, eps, Q1)
    environment = BlackJack()
    rlglue = RLGlue(environment, agent)

    print(
        "\nPrinting one dot for every run: {0} total runs to complete".format(
            num_runs))

    for run in range(num_runs):
        # Seed per run so each run is reproducible.
        np.random.seed(run)
        results_run = 0.0

        rlglue.rl_init()
        for e in range(1, max_episodes + 1):
            rlglue.rl_start()
            for s in range(max_steps_per_episode):
                r, _, _, terminal = rlglue.rl_step()
                results_run += r
                results[e - 1] += r

                if terminal:
                    break

            # Progress report every 10000 episodes.
            if e % 10000 == 0:
                print(
                    "\nEpisode {}: average return till episode is {}, and policy is"
                    .format(e, results_run / e))
                print(rlglue.rl_agent_message("printPolicy"))
        print(".")

    print("Average return over experiment: {}".format(
        (results / num_runs).mean()))

    # Save final policy to file -- change file name as necessary.
    with open("policy.txt", 'w') as f:
        f.write(rlglue.rl_agent_message("printPolicy"))

    # Save all the experiment data for analysis -- change file name as
    # necessary.
    save_results(results / num_runs, max_episodes, "RL_EXP_OUT.dat")
def main(data_output_location="data"):
    """Run 2000 bandit episodes and save the fraction of optimal actions
    taken at each step."""
    rl_glue = RLGlue(example_environment.Environment, example_agent.Agent)

    num_episodes = 2000
    max_steps = 1000

    print("\tPrinting one dot for every run: {}".format(num_episodes), end=' ')
    print("total runs to complete.")

    optimal_action = [0] * max_steps

    agent_data = {}
    for _ in range(num_episodes):
        # run_episode mutates optimal_action in place and threads agent_data
        # from one episode to the next.
        agent_data = run_episode(rl_glue_instance=rl_glue,
                                 max_steps=max_steps,
                                 optimal_action=optimal_action,
                                 agent_data=agent_data)

    prop_optimal = [count / num_episodes for count in optimal_action]
    save_results(prop_optimal, max_steps,
                 "{}/RL_EXP_OUT.dat".format(data_output_location))
    print("\nDone")
def main():
    """Interactively configure a bandit agent, run the experiment, and write
    per-step optimal-action rates to <name>.csv."""
    choice = input(
        "Enter 1 to select question3 agent, enter 2 to select bonus agent: ")
    if choice == '1':
        Q1 = float(input("Enter value of Q1: "))
        alpha = float(input("Enter value of alpha: "))
        epsilon = float(input("Enter value of epsilon: "))
        agent = BanditAgent(Q1, alpha, epsilon)
    elif choice == '2':
        Q1 = float(input("Enter value of Q1: "))
        c = float(input("Enter value of c: "))
        alpha = float(input("Enter value of alpha: "))
        agent = Ucb_BanditAgent(Q1, c, alpha)
    else:
        # BUG FIX: the original fell through with `agent` undefined for any
        # other input and crashed later with a NameError; fail fast instead.
        raise ValueError(
            "Invalid choice: expected '1' or '2', got {!r}".format(choice))
    name = input("Input output file name: ")

    max_steps = 1000  # max number of steps in an episode
    num_runs = 2000  # number of repetitions of the experiment

    # Create and pass agent and environment objects to RLGlue
    environment = BanditEnv()
    rlglue = RLGlue(environment, agent)
    del agent, environment  # don't use these anymore

    # run the experiment
    optimalAction = BanditExp(rlglue, num_runs, max_steps)
    result = optimalAction / num_runs
    print(result)
    with open(name + '.csv', 'w') as out_file:
        for i in range(max_steps):
            out_file.write("%f\n" % result[i])
Exemplo n.º 16
0
def question_3():
    """Train a tile-coding agent on MountainCar-style dynamics, then sample
    the learned state-value surface on a 50x50 grid and save it.

    Writes the heights row-by-row to a text file named 'value' and also
    saves the grid as 'value.npy'.
    """
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 1000000
    for _ in range(num_runs):
        rlglue.rl_init()
        i = 0
        for i in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            print(i)  # episode progress
    fout = open('value', 'w')
    steps = 50  # grid resolution per dimension
    # Learned weight vector and tile-coding index hash table.
    w, iht = rlglue.rl_agent_message("ValueFunction")
    Q = np.zeros([steps, steps])
    for i in range(steps):
        for j in range(steps):
            values = []
            # Evaluate each of the 3 actions at grid point (i, j); the
            # arguments scale position in [-1.2, 0.5] and velocity in
            # [-0.07, 0.07] into 8-tile coordinates.
            for a in range(3):
                value = 0
                for index in tiles(iht, 8, [
                        8 * (-1.2 + (i * 1.7 / steps)) / 1.7, 8 *
                    (-0.07 + (j * 0.14 / steps)) / 0.14
                ], [a]):
                    # Negated sum of active weights (cost-to-go style).
                    value -= w[index]
                values.append(value)
            height = max(values)
            fout.write(repr(height) + ' ')
            Q[j][i] = height
        fout.write('\n')
    fout.close()
    np.save("value", Q)
Exemplo n.º 17
0
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    """Train for num_runs x num_episodes, saving the per-episode reward sums
    under results/ and zipping the directory."""
    rl_glue = RLGlue(environment, agent)

    num_runs = experiment_parameters["num_runs"]
    num_episodes = experiment_parameters["num_episodes"]
    # One row of episode reward sums per run.
    agent_sum_reward = np.zeros((num_runs, num_episodes))

    env_info = {}
    agent_info = agent_parameters

    # One agent setting; seed everything with the run index so each run
    # is reproducible.
    for run in range(1, num_runs + 1):
        agent_info["seed"] = run
        agent_info["network_config"]["seed"] = run
        env_info["seed"] = run

        rl_glue.rl_init(agent_info, env_info)

        for episode in tqdm(range(1, num_episodes + 1)):
            rl_glue.rl_episode(experiment_parameters["timeout"])
            episode_reward = rl_glue.rl_agent_message("get_sum_reward")
            agent_sum_reward[run - 1, episode - 1] = episode_reward

    save_name = "{}".format(rl_glue.agent.name)
    if not os.path.exists('results'):
        os.makedirs('results')
    np.save("results/sum_reward_{}".format(save_name), agent_sum_reward)
    shutil.make_archive('results', 'zip', 'results')
Exemplo n.º 18
0
def main():
    """Run experiment2 on the one-state environment and print the result."""
    max_steps = 100  # max number of steps in an episode
    num_runs = 10  # number of repetitions of the experiment

    rlglue = RLGlue(OneStateEnvironment(), RandomAgent())
    result = experiment2(rlglue, num_runs, max_steps)
    print("experiment2 average reward: {}\n".format(result))
Exemplo n.º 19
0
def run_experiment(env_info,
                   agent_info,
                   num_episodes=5000,
                   experiment_name=None,
                   plot_freq=100,
                   true_values_file=None,
                   value_error_threshold=1e-8):
    """Train a TD agent on the cliff-walk environment, visualizing value
    estimates every `plot_freq` episodes, optionally grading them against a
    file of true values. Returns the final value estimates."""
    rl_glue = RLGlue(CliffWalkEnvironment, TDAgent)
    rl_glue.rl_init(agent_info, env_info)

    manager = Manager(env_info,
                      agent_info,
                      true_values_file=true_values_file,
                      experiment_name=experiment_name)
    for episode in range(1, num_episodes + 1):
        rl_glue.rl_episode(0)  # 0 means no step limit
        if episode % plot_freq == 0:
            manager.visualize(rl_glue.agent.agent_message("get_values"),
                              episode)

    values = rl_glue.agent.agent_message("get_values")
    if true_values_file is not None:
        # Grading: the Manager checks the TD values against the stored true
        # values (within a small allowance) and that the root mean squared
        # value error is close to 0.
        manager.run_tests(values, value_error_threshold)

    return values
Exemplo n.º 20
0
def better_run_experiment(num_runs, num_episodes):
    """Run Expected Sarsa with tile coding on MountainCar for `num_runs`
    runs of `num_episodes` episodes, plot the mean steps-per-episode curve,
    and report the mean total reward with its standard error."""
    all_steps = []
    agent = ag.ExpectedSarsaAgent
    env = enviro.MountainEnvironment

    total_reward_per_run = []
    for run in range(num_runs):
        start = time.time()
        if run % 5 == 0:
            print("RUN: {}".format(run))

        # NOTE(review): this draws a single scalar used as the initial
        # weight value -- presumably intended; confirm against the agent.
        initial_weights = np.random.uniform(-0.001, 0)
        agent_info = {
            "num_tilings": 32,
            "num_tiles": 4,
            "iht_size": 4096,
            "epsilon": 0.1,
            "gamma": 1,
            "alpha": 0.7 / 32,
            "initial_weights": initial_weights,
            "num_actions": 3
        }
        env_info = {
            "min_position": -1.2,
            "max_position": 0.5,
            "min_velocity": -0.07,
            "max_velocity": 0.07,
            "gravity": 0.0025,
            "action_discount": 0.001
        }
        rl_glue = RLGlue(env, agent)
        rl_glue.rl_init(agent_info, env_info)
        steps_per_episode = []

        for episode in range(num_episodes):
            rl_glue.rl_episode(15000)  # 15000 is the episode step cap
            steps_per_episode.append(rl_glue.num_steps)
        # Every step earns reward -1, so total reward = -(total steps).
        total_reward = np.sum(steps_per_episode) * -1
        all_steps.append(np.array(steps_per_episode))
        print("Run time: {}".format(time.time() - start))
        total_reward_per_run.append(total_reward)

    data = np.mean(total_reward_per_run)
    data_std_err = np.std(total_reward_per_run, axis=0) / np.sqrt(num_runs)
    plt.title("Expected Sarsa MountainCar (Alternate Parameters)",
              fontdict={
                  'fontsize': 16,
                  'fontweight': 25
              },
              pad=15.0)
    # BUG FIX: axis label was misspelled "Epsiode".
    plt.xlabel("Episode", labelpad=5.0)
    plt.ylabel("Steps per Episode (averaged over " + str(num_runs) + " runs)",
               labelpad=10.0)
    plt.plot(np.mean(np.array(all_steps), axis=0))
    plt.show()
    np.save("ExpectedSarsa_test", np.array(all_steps))
    print("mean: ", data)
    print("standard error: ", data_std_err)
Exemplo n.º 21
0
def run_experiment(env_info,
                   agent_info,
                   num_episodes=5000,
                   experiment_name=None,
                   plot_freq=100,
                   true_values_file=None,
                   value_error_threshold=1e-8):
    """Train a TD agent on the cliff walk, plotting the value function every
    `plot_freq` episodes; return the final value estimates."""
    env = CliffWalkEnvironment
    agent = TDAgent
    rl_glue = RLGlue(env, agent)
    rl_glue.rl_init(agent_info, env_info)

    manager = Manager(env_info,
                      agent_info,
                      true_values_file=true_values_file,
                      experiment_name=experiment_name)
    for episode_index in range(1, num_episodes + 1):
        rl_glue.rl_episode(0)  # no cap on episode length
        if episode_index % plot_freq == 0:
            manager.visualize(rl_glue.agent.agent_message("get_values"),
                              episode_index)

    return rl_glue.agent.agent_message("get_values")
Exemplo n.º 22
0
def main():
    """Compare the exploring and greedy agents on the 1-D bandit environment
    and plot both curves."""
    max_steps = 1000  # max number of steps in an episode --> 1000
    num_runs = 2000  # number of repetitions of the experiment --> 2000

    # Epsilon-optimistic approach: explores 10% of the time.
    rlglue = RLGlue(Environment1D(), RandomAgent())
    curve_explore = experiment(rlglue, num_runs, max_steps)
    plt.plot(curve_explore, label='something', color='blue')

    # Greedy approach: never explores.
    rlglue = RLGlue(Environment1D(), RandomAgent2())
    curve_greedy = experiment(rlglue, num_runs, max_steps)
    plt.plot(curve_greedy, label='something', color='red')
    plt.show()
Exemplo n.º 23
0
def main():
    """Plot the optimal-action percentage of a random agent over 1000
    steps, averaged over 2000 runs."""
    max_steps = 1000  # max number of steps in an episode
    num_runs = 2000  # number of repetitions of the experiment

    rlglue = RLGlue(OneStateEnvironment(), RandomAgent())
    result = experiment2(rlglue, num_runs, max_steps)

    plt.plot(range(1, max_steps + 1), result)
    plt.yticks(np.arange(0, 100, step=20))
    plt.ylabel("Optimal %")
    plt.xlabel("Steps")
    plt.show()
Exemplo n.º 24
0
def main():
    """Plot the per-episode value error (VE) against the stored true value
    function."""
    max_steps = 2000  # max number of steps in an episode --> 2000
    num_runs = 30  # number of repetitions of the experiment --> 30
    val_func = np.load('TrueValueFunction.npy')

    rlglue = RLGlue(Environment(), Agent())
    curve = experiment(val_func, rlglue, num_runs, max_steps, [])

    plt.plot(curve, label='something', color='blue')
    plt.ylabel('VE')
    plt.xlabel('Episodes')
    plt.show()
Exemplo n.º 25
0
def tiling(real_value):
    """Run 30 training runs of 2000 episodes each and write the average
    per-episode RMSE of the tile-coding value estimate to tiling_output.txt.

    real_value: the true value function to compare estimates against.
    """
    num_runs = 30
    num_episodes = 2000
    rl = RLGlue(RandomWalkEnvironment(), Agent2())
    error = np.zeros(num_episodes)
    for run in range(num_runs):
        rl.rl_init()
        for episode in range(num_episodes):
            rl.rl_episode(10000)
            estimate = rl.RL_agent_message("ValueFunction")
            # Accumulate the RMSE against the true value function.
            error[episode] += np.sqrt(
                np.mean(np.power(real_value - estimate, 2)))
        rl.RL_cleanup()

    with open("tiling_output.txt", "w") as out:
        for i in range(num_episodes):
            # BUG FIX: the accumulator sums over 30 runs but the original
            # divided by 10; divide by the actual run count. Also write one
            # value per line -- the original fused all numbers with no
            # separator, making the file unparseable.
            out.write(format(error[i] / num_runs) + "\n")
Exemplo n.º 26
0
def main():
    """Load a pickled policy and report its test performance."""
    # Seed rng's for consistent testing
    random.seed(0)
    np.random.seed(0)

    # Generate agent, environment and RLGlue
    env = Environment()
    agent = Agent(env.get_actions())
    rlglue = RLGlue(env, agent)
    del agent, env

    # BUG FIX: pickle.load(open(...)) leaked the file handle; use a
    # context manager so the file is always closed.
    with open('policy.pickle', 'rb') as f:
        policy = pickle.load(f)

    # Test policy
    result = testPolicy(policy)
    print('result:', result)
Exemplo n.º 27
0
Arquivo: main.py Projeto: healqq/lumia
def run_experiment(env_info, agent_info, 
                   num_episodes=5000,
                   value_error_threshold=1e-8,
                   plot_freq=10):
    """Train a TD agent on the grid environment, printing the grid state
    every `plot_freq` episodes; return [final values, per-episode steps]."""
    rl_glue = RLGlue(GridEnvironment, TDAgent)
    rl_glue.rl_init(agent_info, env_info)

    steps = []
    for episode in range(1, num_episodes + 1):
        rl_glue.rl_episode(0)  # no step limit
        steps.append(rl_glue.agent.agent_message("get_steps"))
        if episode % plot_freq == 0:
            values = rl_glue.agent.agent_message("get_values")
            print(rl_glue.environment.env_message("get_grid_state"))
        rl_glue.rl_cleanup()

    return [rl_glue.agent.agent_message("get_values"), steps]
Exemplo n.º 28
0
def testPolicy(policy):
    """Return the fraction of 100 evaluation episodes that end with a
    positive final reward."""
    rlglue = RLGlue(Environment(), testAgent(policy))
    rlglue.rl_init()

    wins = 0
    for _ in range(100):
        rlglue.rl_start()
        terminal = False
        reward = None
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
        # Count the episode as a success if its final reward is positive.
        if reward > 0:
            wins += 1

    return wins / 100
def question_3():
    """Run 1000 training episodes and return the agent's value-function
    data ([X, Y, Z] = position, velocity, state-value)."""
    rlglue = RLGlue(Environment(), Agent())

    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])
    for run in range(num_runs):  # only 1 run
        print("1000 episode run : ", run)
        rlglue.rl_init()
        for episode in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[run, episode] = rlglue.num_ep_steps()
        # Value-function list from the agent (message code 1).
        Return = rlglue.rl_agent_message(1)
    return Return
Exemplo n.º 30
0
def question_3():
    """Train for 1000 episodes, then sample the learned cost-to-go surface
    on a 50x50 (position, velocity) grid, write it to a text file, and
    render it as a 3-D surface plot saved to cost-to-go.png.
    """
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    max_eps_steps = 100000
    num_episodes = 1000
    num_runs = 1
    numActions=3

    rlglue.rl_init()
    for e in range(num_episodes):
        rlglue.rl_episode(max_eps_steps)

    # Learned weight vector from the agent.
    weights = rlglue.rl_agent_message("3D plot of the cast-to-go")

    fout = open('value','w')
    steps = 50  # grid resolution per dimension
    z = np.zeros((50,50))
    for i in range(steps):
        for j in range(steps):
            values = []
            # Evaluate each action at grid point (i, j); the tile arguments
            # scale position in [-1.2, 0.5] and velocity in [-0.07, 0.07]
            # into 8-tile coordinates.
            for a in range(numActions):
                tile = [8*(-1.2+(i*1.7/steps))/1.7,8*(-0.07+(j*0.14/steps))/0.14]
                inds =  agent.get_index(tile,a)
                values.append(np.sum([weights[i] for i in inds]))
            # Cost-to-go is the negated best action value.
            height = max(values)
            z[j][i]=-height
            fout.write(repr(-height)+' ')
        fout.write('\n')
    fout.close()

    fig = plt.figure()
    ax = fig.add_subplot(111,projection ='3d')
    x = np.arange(-1.2,0.5,1.7/50)
    y = np.arange(-0.07,0.07,0.14/50)
    x,y = np.meshgrid(x,y)
    ax.set_xticks([-1.2, 0.5])
    ax.set_yticks([0.07, -0.07])
    ax.set_ylabel('Velocity')
    ax.set_xlabel('Position')
    ax.set_zlabel('Cost-To-Go')
    ax.plot_surface(x,y,z)
    plt.savefig('cost-to-go.png')
    plt.show()
    # NOTE(review): this saves the scalar 50 (grid resolution), not the
    # step counts -- looks unintentional; confirm what was meant.
    np.save('steps', steps)
Exemplo n.º 31
0
def question_1(num_episodes):
    """Train for `num_episodes` episodes and return the per-episode step
    counts."""
    rlglue = RLGlue(Environment(), Agent())
    max_eps_steps = 100000

    steps = np.zeros(num_episodes)
    rlglue.rl_init()
    for episode in tqdm(range(num_episodes)):
        rlglue.rl_episode(max_eps_steps)
        steps[episode] = rlglue.num_ep_steps()

    return steps