Example #1
    def mountain_car(self, i):
        qrl = MountainCar(env_name='MountainCar-v0',
                          learning_rate=0.97**i,
                          discount=0.99**i,
                          iterations=1000)
        qrl.run()
        num_rewards = len(qrl.rewards)
        return self.rolling_mean(qrl.rewards,
                                 window=num_rewards // 100,
                                 strides=num_rewards // 50)
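The rolling_mean helper called above is not part of the snippet. A minimal standalone sketch, assuming it averages window-sized slices of the reward list taken every strides entries (the name and behavior are assumptions, not the original method):

import numpy as np

# Hypothetical sketch of a rolling mean over strided windows; the actual
# self.rolling_mean method is defined elsewhere and may differ.
def rolling_mean(values, window, strides):
    values = np.asarray(values, dtype=float)
    window = max(window, 1)
    strides = max(strides, 1)
    starts = range(0, max(len(values) - window + 1, 1), strides)
    return [values[s:s + window].mean() for s in starts]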
Example #2
def fig_10_2():
    fig, ax = plt.subplots()
    qhat, nab_qhat = get_qhats(N_TIL, N_TLGS)

    for alp in FIG_10_2_ALP_L:
        print(f"[alpha={alp}]")
        tot_n_steps = np.zeros(FIG_10_2_N_EP)
        for seed in range(FIG_10_2_N_RUNS):
            print(f"[RUN #{seed}]")
            alg = EpisodicSemiGradientTD0(MountainCar(),
                                          alp,
                                          N_TIL * N_TLGS,
                                          eps=0)
            alg.seed(seed)
            tot_n_steps += np.array(
                alg.pol_eva(qhat, nab_qhat, FIG_10_2_N_EP, FIG_10_2_G))
        plt.plot(tot_n_steps / FIG_10_2_N_RUNS, label=f'alpha={alp}')
    plt.yscale('log')
    xticks, yticks = [0, 500], [100, 200, 400, 1000]
    plot_figure(ax, 'Figure 10.2', xticks, xticks, 'Episode', yticks, yticks,
                'Steps\nper episode\n(log scale)')
    fig.set_size_inches(20, 14)
    plt.legend()
    save_plot('fig10.2', dpi=100)
    plt.show()
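EpisodicSemiGradientTD0 and the qhat/nab_qhat pair returned by get_qhats are defined elsewhere in the repository. As a rough sketch of the update that pol_eva is assumed to apply on each transition (the weight vector w and the call signatures below are inferred from how the functions are used above, not taken from the actual class):

# Semi-gradient Sarsa/TD(0) step on a linear action-value approximation:
# w <- w + alpha * [R + gamma * qhat(S', A', w) - qhat(S, A, w)] * grad qhat(S, A, w)
def semi_gradient_step(w, qhat, nab_qhat, s, a, r, s_next, a_next, alpha, gamma, done):
    target = r if done else r + gamma * qhat(s_next, a_next, w)
    td_error = target - qhat(s, a, w)
    return w + alpha * td_error * nab_qhat(s, a, w)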
Example #3
def fig_10_4():
    fig, ax = plt.subplots()
    qhat, nab_qhat = get_qhats(N_TIL, N_TLGS)

    alg = nStepSemiGradSarsa(MountainCar(), 0, N_TIL * N_TLGS, 0, 0)
    for n in FIG_10_4_ALP_BND:
        alg.n = n
        print(f"[n={n}]")
        steps_l = []
        alpha_l = np.linspace(*FIG_10_4_ALP_BND[n], FIG_10_4_ALP_PTS)
        for alpha in alpha_l:
            alg.a = alpha / N_TLGS
            print(f"[alpha={alg.a}]")
            tot_steps = 0
            for seed in range(FIG_10_4_N_RUNS):
                print(f"[RUN #{seed}]")
                alg.reset()
                alg.seed(seed)
                for ep in range(FIG_10_4_N_EP):
                    tot_steps += alg.pol_eva(None,
                                             qhat,
                                             nab_qhat,
                                             1,
                                             FIG_10_4_G,
                                             max_steps=1000)[0]
            steps_l.append(tot_steps / (FIG_10_4_N_RUNS * FIG_10_4_N_EP))
        plt.plot(alpha_l, steps_l, label=f'n={n}')
    xticks, yticks = np.linspace(0, 1.5, 4), np.linspace(220, 300, 5)
    left_title = (
        f'Mountain Car\nSteps per\nepisode\n(averaged \nover ' +
        f'first\n{FIG_10_4_N_EP} episodes\nand {FIG_10_4_N_RUNS} runs)')
    plot_figure(ax,
                'Figure 10.4',
                list(xticks) + [1.8],
                xticks,
                f'alpha * number of tilings ({N_TLGS})',
                yticks,
                yticks,
                left_title,
                labelpad=20)
    fig.set_size_inches(20, 14)
    plt.legend()
    save_plot('fig10.4', dpi=100)
    plt.show()
Example #4
def fig_12_11():
    fig, ax = plt.subplots()
    F, qhat = get_fn_mc(N_TIL, N_TLGS)
    for alg_name in FIG_12_11_ALG_STR.keys():
        steps_l = []
        alpha_l = np.linspace(*FIG_12_11_ALP_BND[alg_name], FIG_12_11_N_PTS)
        for alpha in alpha_l:
            alg = alg_name(MountainCar(), alpha / N_TLGS, N_TIL * N_TLGS,
                           FIG_12_11_LAM, F, qhat, FIG_12_11_EPS, FIG_12_11_G)
            print(f"[ALPHA={alg.a}]")
            tot_steps = 0
            for seed in range(FIG_12_11_N_RUNS):
                print(f"[RUN #{seed}]")
                alg.reset()
                alg.seed(seed)
                for ep in range(FIG_12_11_N_EP):
                    tot_steps += alg.pol_eva(None,
                                             1,
                                             max_steps=FIG_12_11_MAX_STEPS)[0]
            steps_l.append(tot_steps / (FIG_12_11_N_RUNS * FIG_12_11_N_EP))
        plt.plot(alpha_l,
                 -np.array(steps_l),
                 label=FIG_12_11_ALG_STR[alg_name])
    xticks, yticks = np.linspace(0.2, 2, 10), np.linspace(-550, -150, 9)
    xnames = map(lambda x: str(x)[:3], xticks)
    left_title = (
        f'Mountain Car\nReward per\nepisode\n(averaged \nover ' +
        f'first\n{FIG_12_11_N_EP} episodes\n{FIG_12_11_N_RUNS} runs)')
    plot_figure(ax,
                'Figure 12.11',
                xticks,
                xnames,
                f'alpha * number of tilings ({N_TLGS})',
                yticks,
                yticks,
                left_title,
                labelpad=45)
    fig.set_size_inches(20, 14)
    plt.legend()
    save_plot('fig12.11', dpi=100)
    plt.show()
Example #5
def fig_12_10():
    fig, ax = plt.subplots()
    for lam in FIG_12_10_LAM_L:
        print(f"[LAM={lam}]")
        steps_l = []
        alpha_l = np.linspace(FIG_12_10_ALP_MIN, FIG_12_10_ALP_MAX,
                              FIG_12_10_N_PTS)
        for alpha in alpha_l:
            F, qhat = get_fn_mc(N_TIL, N_TLGS)
            alg = SarsaLam(MountainCar(), alpha / N_TLGS, N_TIL * N_TLGS, lam,
                           F, qhat, FIG_12_10_EPS, FIG_12_10_G)
            print(f"[ALPHA={alg.a}]")
            tot_steps = 0
            for seed in range(FIG_12_10_N_RUNS):
                print(f"[RUN #{seed}]")
                alg.reset()
                alg.seed(seed)
                for ep in range(FIG_12_10_N_EP):
                    print(f"[EP #{ep}]")
                    tot_steps += alg.pol_eva(None,
                                             1,
                                             max_steps=FIG_12_10_MAX_STEPS)[0]
            steps_l.append(tot_steps / (FIG_12_10_N_RUNS * FIG_12_10_N_EP))
        plt.plot(alpha_l, steps_l, label=f'lam={lam}')
    xticks, yticks = np.linspace(0.5, 1.5, 5), np.linspace(180, 300, 7)
    left_title = (
        f'Mountain Car\nSteps per\nepisode\n(averaged \nover ' +
        f'first\n{FIG_12_10_N_EP} episodes\n{FIG_12_10_N_RUNS} runs)')
    plot_figure(ax,
                'Figure 12.10',
                list(xticks) + [1.6],
                xticks,
                f'alpha * number of tilings ({N_TLGS})', [160] + list(yticks),
                yticks,
                left_title,
                labelpad=35)
    fig.set_size_inches(20, 14)
    plt.legend()
    save_plot('fig12.10', dpi=100)
    plt.show()
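SarsaLam is not shown here; the sketch below illustrates one Sarsa(lambda) update with accumulating eligibility traces, which is the standard algorithm behind Figure 12.10. The feature-vector interface is an assumption for illustration, not the class's actual API:

import numpy as np

# One Sarsa(lambda) step with accumulating traces on a linear approximation.
# x and x_next are feature vectors for (S, A) and (S', A').
def sarsa_lambda_step(w, z, x, x_next, r, alpha, gamma, lam, done):
    q = np.dot(w, x)
    q_next = 0.0 if done else np.dot(w, x_next)
    delta = r + gamma * q_next - q        # TD error
    z = gamma * lam * z + x               # decay, then accumulate the trace
    w = w + alpha * delta * z
    return w, z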
Example #6
def fig_10_3():
    fig, ax = plt.subplots()
    qhat, nab_qhat = get_qhats(N_TIL, N_TLGS)

    for (n, alp) in zip(FIG_10_3_N_L, FIG_10_3_ALP_L):
        print(f"[n={n}, alpha={alp}]")
        tot_n_steps = np.zeros(FIG_10_3_N_EP)
        for seed in range(FIG_10_3_N_RUNS):
            print(f"[RUN #{seed}]")
            alg = nStepSemiGradSarsa(MountainCar(), alp, N_TIL * N_TLGS, 0, n)
            alg.seed(seed)
            tot_n_steps += np.array(
                alg.pol_eva(None, qhat, nab_qhat, FIG_10_3_N_EP, FIG_10_3_G))
        plt.plot(tot_n_steps / FIG_10_3_N_RUNS, label=f'n={n}')
    plt.yscale('log')
    xticks, yticks = [0, 500], [100, 200, 400, 1000]
    plot_figure(ax, 'Figure 10.3', xticks, xticks, 'Episode', yticks, yticks,
                'Steps\nper episode\n(log scale)')
    fig.set_size_inches(20, 14)
    plt.legend()
    save_plot('fig10.3', dpi=100)
    plt.show()
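nStepSemiGradSarsa bootstraps on the n-step return G_{t:t+n}. A small sketch of that return, assuming a list of the n observed rewards and a bootstrap value for the state-action pair reached after n steps (helper name and signature are illustrative only):

# n-step return: discounted sum of the next n rewards plus a discounted
# bootstrap from the value estimate at the end of the window.
def n_step_return(rewards, gamma, bootstrap_value):
    g = 0.0
    for r in reversed(rewards):           # rewards R_{t+1} ... R_{t+n}
        g = r + gamma * g
    return g + (gamma ** len(rewards)) * bootstrap_value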
Example #7
def fig_10_1():
    def plot_and_save(filename, title, alg, n_ep, max_steps=np.inf):
        fig = plt.figure()
        alg.pol_eva(qhat, nab_qhat, n_ep, FIG_10_1_G, max_steps=max_steps)
        print_qhat_mc(alg, fig, '111', title)
        fig.set_size_inches(20, 14)
        save_plot(filename, dpi=100)
        plt.show()

    qhat, nab_qhat = get_qhats(N_TIL, N_TLGS)
    env = MountainCar()
    alg = EpisodicSemiGradientTD0(env, FIG_10_1_ALP, N_TIL * N_TLGS, eps=0)
    alg.seed(0)
    plot_and_save(f'fig10.1_{FIG_10_1_STEPS}_steps', f'Step {FIG_10_1_STEPS}',
                  alg, 1, FIG_10_1_STEPS)

    tot_ep = 1
    for ep in FIG_10_1_EP_L:
        alg.pol_eva(qhat, nab_qhat, ep - tot_ep, FIG_10_1_G)
        plot_and_save(f'fig10.1_{ep}_episodes', f'Episode {ep}', alg,
                      ep - tot_ep)
        tot_ep += (ep - tot_ep)
Example #8
def main():

    env = MountainCar(mass=0.2, friction=0.3, delta_t=0.1)

    # Define the state arrays for velocity and position
    tot_action = 3  # Three possible actions
    tot_bins = 12  # the value used to discretize the space
    velocity_state_array = np.linspace(-1.5,
                                       +1.5,
                                       num=tot_bins - 1,
                                       endpoint=False)
    position_state_array = np.linspace(-1.2,
                                       +0.5,
                                       num=tot_bins - 1,
                                       endpoint=False)

    # Random policy as a square matrix of size (tot_bins x tot_bins)
    # Three possible actions represented by three integers
    policy_matrix = np.random.randint(low=0,
                                      high=tot_action,
                                      size=(tot_bins,
                                            tot_bins)).astype(np.float32)
    print("Policy Matrix:")
    print(policy_matrix)

    # The state-action matrix and the visit counter
    # The rows are the velocities and the columns the positions.
    state_action_matrix = np.zeros((tot_action, tot_bins * tot_bins))
    visit_counter_matrix = np.zeros((tot_action, tot_bins * tot_bins))

    # Variables
    gamma = 0.999
    alpha = 0.001
    tot_episode = 100000
    epsilon_start = 0.9  # those are the values for epsilon decay
    epsilon_stop = 0.1
    epsilon_decay_step = 3000
    print_episode = 500  # print every...
    movie_episode = 10000  # movie saved every...
    reward_list = list()
    step_list = list()

    for episode in range(tot_episode):
        epsilon = return_decayed_value(epsilon_start,
                                       epsilon_stop,
                                       episode,
                                       decay_step=epsilon_decay_step)
        # Reset and return the first observation
        observation = env.reset(exploring_starts=False)
        # The observation is digitized, meaning that an integer corresponding
        # to the bin where the raw float belongs is obtained and used as a replacement.
        observation = (np.digitize(observation[1], velocity_state_array),
                       np.digitize(observation[0], position_state_array))
        is_starting = True
        cumulated_reward = 0
        for step in range(100):
            #Take the action from the action matrix
            #action = policy_matrix[observation[0], observation[1]]
            #Take the action using epsilon-greedy
            action = return_epsilon_greedy_action(policy_matrix,
                                                  observation,
                                                  epsilon=epsilon)
            if (is_starting):
                action = np.random.randint(0, tot_action)
                is_starting = False
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            new_observation = (np.digitize(new_observation[1],
                                           velocity_state_array),
                               np.digitize(new_observation[0],
                                           position_state_array))
            new_action = policy_matrix[new_observation[0], new_observation[1]]
            #Updating the state-action matrix
            state_action_matrix = update_state_action(
                state_action_matrix, visit_counter_matrix, observation,
                new_observation, action, new_action, reward, alpha, gamma)
            #Updating the policy
            policy_matrix = update_policy(policy_matrix, state_action_matrix,
                                          observation)
            #Increment the visit counter
            visit_counter_matrix = update_visit_counter(
                visit_counter_matrix, observation, action)
            observation = new_observation
            cumulated_reward += reward
            if done: break

        # Store the data for statistics
        reward_list.append(cumulated_reward)
        step_list.append(step)
        # Printing utilities
        if (episode % print_episode == 0):
            print("")
            print("Episode: " + str(episode + 1))
            print("Epsilon: " + str(epsilon))
            print("Episode steps: " + str(step + 1))
            print("Cumulated Reward: " + str(cumulated_reward))
            print("Policy matrix: ")
            print_policy(policy_matrix)
        if (episode % movie_episode == 0):
            print("Saving the reward plot in: ./reward.png")
            plot_curve(reward_list,
                       filepath="./reward.png",
                       x_label="Episode",
                       y_label="Reward",
                       x_range=(0, len(reward_list)),
                       y_range=(-1.1, 1.1),
                       color="red",
                       kernel_size=500,
                       alpha=0.4,
                       grid=True)
            print("Saving the step plot in: ./step.png")
            plot_curve(step_list,
                       filepath="./step.png",
                       x_label="Episode",
                       y_label="Steps",
                       x_range=(0, len(step_list)),
                       y_range=(-0.1, 100),
                       color="blue",
                       kernel_size=500,
                       alpha=0.4,
                       grid=True)
            print("Saving the gif in: ./mountain_car.gif")
            env.render(file_path='./mountain_car.gif', mode='gif')
            print("Complete!")

    # Save reward and steps in npz file for later use
    # np.savez("./statistics.npz", reward=np.asarray(reward_list), step=np.asarray(step_list))
    # Time to check the utility matrix obtained
    print("Policy matrix after " + str(tot_episode) + " episodes:")
    print_policy(policy_matrix)
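The helpers return_decayed_value and return_epsilon_greedy_action are imported from elsewhere in the project. The sketches below show one plausible implementation of each; the decay schedule and the default arguments are assumptions, not the original code:

import numpy as np

# Hypothetical exponential decay from starting_value toward minimum_value.
def return_decayed_value(starting_value, minimum_value, global_step, decay_step):
    decayed = starting_value * np.power(0.9, global_step / decay_step)
    return max(decayed, minimum_value)

# Hypothetical epsilon-greedy choice: random action with probability epsilon,
# otherwise the action stored in the policy matrix for this discretized state.
def return_epsilon_greedy_action(policy_matrix, observation, epsilon=0.1, tot_actions=3):
    if np.random.uniform(0, 1) <= epsilon:
        return np.random.randint(low=0, high=tot_actions)
    return int(policy_matrix[observation[0], observation[1]])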
Example #9
def main():

    env = MountainCar(mass=0.2, friction=0.3, delta_t=0.1)

    # Define the state arrays for velocity and position
    tot_action = 3  # Three possible actions
    tot_bins = 12  # the value used to discretize the space
    velocity_state_array = np.linspace(-1.5, +1.5, num=tot_bins-1,
                                       endpoint=False)
    position_state_array = np.linspace(-1.2, +0.5, num=tot_bins-1,
                                       endpoint=False)

    # Random policy as a square matrix of size (tot_bins x tot_bins)
    # Three possible actions represented by three integers
    policy_matrix = np.random.randint(low=0, high=tot_action, size=(tot_bins,tot_bins))
    print("Policy Matrix:")
    print(policy_matrix)

    # The state-action matrix and the visit counter
    # The rows are the velocities and the columns the positions.
    state_action_matrix = np.zeros((tot_action, tot_bins*tot_bins))
    visit_counter_matrix = np.zeros((tot_action, tot_bins*tot_bins))

    # Variables
    gamma = 0.999
    alpha = 0.001
    tot_episode = 100000
    epsilon_start = 0.9  # those are the values for epsilon decay
    epsilon_stop = 0.1
    epsilon_decay_step = 3000
    print_episode = 500  # print every...
    movie_episode = 10000  # movie saved every...
    reward_list = list()
    step_list = list()

    for episode in range(tot_episode):
        epsilon = return_decayed_value(epsilon_start, epsilon_stop, episode,
                                       decay_step=epsilon_decay_step)
        # Reset and return the first observation
        observation = env.reset(exploring_starts=False)
        # The observation is digitized, meaning that an integer corresponding
        # to the bin where the raw float belongs is obtained and used as a replacement.
        observation = (np.digitize(observation[1], velocity_state_array),
                       np.digitize(observation[0], position_state_array))
        is_starting = True
        cumulated_reward = 0
        for step in range(100):
            #Take the action from the action matrix
            #action = policy_matrix[observation[0], observation[1]]
            #Take the action using epsilon-greedy
            action = return_epsilon_greedy_action(policy_matrix, observation,
                                                  epsilon=epsilon)
            if(is_starting):
                action = np.random.randint(0, tot_action)
                is_starting = False
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            new_observation = (np.digitize(new_observation[1],
                                           velocity_state_array),
                               np.digitize(new_observation[0],
                                           position_state_array))
            new_action = policy_matrix[new_observation[0], new_observation[1]]
            #Updating the state-action matrix
            state_action_matrix = update_state_action(state_action_matrix,
                                                      visit_counter_matrix,
                                                      observation,
                                                      new_observation,
                                                      action,
                                                      new_action,
                                                      reward,
                                                      alpha,
                                                      gamma,
                                                      tot_bins)
            #Updating the policy
            policy_matrix = update_policy(policy_matrix,
                                          state_action_matrix,
                                          observation,
                                          tot_bins)
            #Increment the visit counter
            visit_counter_matrix = update_visit_counter(visit_counter_matrix,
                                                        observation,
                                                        action,
                                                        tot_bins)
            observation = new_observation
            cumulated_reward += reward
            if done: break

        # Store the data for statistics
        reward_list.append(cumulated_reward)
        step_list.append(step)
        # Printing utilities
        if(episode % print_episode == 0):
            print("")
            print("Episode: " + str(episode+1))
            print("Epsilon: " + str(epsilon))
            print("Episode steps: " + str(step+1))
            print("Cumulated Reward: " + str(cumulated_reward))
            print("Policy matrix: ")
            print_policy(policy_matrix)
        if(episode % movie_episode == 0):
            print("Saving the reward plot in: ./reward.png")
            plot_curve(reward_list, filepath="./reward.png",
                       x_label="Episode", y_label="Reward",
                       x_range=(0, len(reward_list)), y_range=(-1.1,1.1),
                       color="red", kernel_size=500,
                       alpha=0.4, grid=True)
            print("Saving the step plot in: ./step.png")
            plot_curve(step_list, filepath="./step.png",
                       x_label="Episode", y_label="Steps",
                       x_range=(0, len(step_list)), y_range=(-0.1,100),
                       color="blue", kernel_size=500,
                       alpha=0.4, grid=True)
            print("Saving the gif in: ./mountain_car.gif")
            env.render(file_path='./mountain_car.gif', mode='gif')
            print("Complete!")

    # Save reward and steps in npz file for later use
    # np.savez("./statistics.npz", reward=np.asarray(reward_list), step=np.asarray(step_list))
    # Time to check the utility matrix obtained
    print("Policy matrix after " + str(tot_episode) + " episodes:")
    print_policy(policy_matrix)
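update_state_action, update_policy and update_visit_counter are also defined elsewhere. The core of the algorithm is a tabular SARSA(0) update on the flattened state-action matrix; the sketch below assumes states are flattened as row * tot_bins + column and that a constant step size alpha is used (the real helper may instead scale the step size by the visit counter):

# Sketch of the tabular SARSA(0) update that update_state_action is assumed to apply.
def sarsa_update(state_action_matrix, observation, new_observation,
                 action, new_action, reward, alpha, gamma, tot_bins):
    col = observation[0] * tot_bins + observation[1]
    col_new = new_observation[0] * tot_bins + new_observation[1]
    q = state_action_matrix[int(action), col]
    q_next = state_action_matrix[int(new_action), col_new]
    state_action_matrix[int(action), col] = q + alpha * (reward + gamma * q_next - q)
    return state_action_matrix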
Example #10

from mountain_car import MountainCar
import random

my_car = MountainCar(mass=0.2, friction=0.3, delta_t=0.1)
cumulated_reward = 0
print("Starting random agent...")
for step in range(100):
    action = random.randint(a=0, b=2)
    observation, reward, done = my_car.step(action)
    cumulated_reward += reward
    if done: break
print("Finished after: " + str(step+1) + " steps")
print("Cumulated Reward: " + str(cumulated_reward))
print("Saving the gif in: ./mountain_car.gif")
my_car.render(file_path='./mountain_car.gif', mode='gif')
print("Complete!")
Example #11
DECAY = 0.99
ALPHA = 0.9

BIN_NUM = 64

train_times = 31250
vis_times = 10

output_path = 'output/mountain_car_sarsa/'
if not os.path.exists(output_path):
    os.makedirs(output_path)


def random_action(_, n):
    return np.random.randint(n)


bin_shape = (BIN_NUM, BIN_NUM)
games = MountainCar(PARA_SIZE, REPLAY_SIZE)
inc_learner = BinIncLearner(Discretization(state_bounds, bin_shape), bin_shape,
                            (action_n, ), ALPHA)

framework = SARSA(games, inc_learner, random_action, DECAY)

for v in xrange(vis_times):
    framework.loop(train_times)
    visualize_states(output_path + 'states_%i.png' % v,
                     inc_learner.eval_batch_no_default)

games.close()
Example #12

from mountain_car import MountainCar
import random

my_car = MountainCar(mass=0.2, friction=0.3, delta_t=0.1)
cumulated_reward = 0
print("Starting random agent...")
for step in range(100):
    action = random.randint(a=0, b=2)
    observation, reward, done = my_car.step(action)
    cumulated_reward += reward
    if done: break
print("Finished after: " + str(step + 1) + " steps")
print("Cumulated Reward: " + str(cumulated_reward))
print("Saving the gif in: ./mountain_car.gif")
my_car.render(file_path='./mountain_car.gif', mode='gif')
print("Complete!")
Example #13
import argparse
import os
from mountain_car import MountainCar
import matplotlib.pyplot as plt

ENV_DICT = {
    'mountain_car': MountainCar(mnt=False),
}


def play(env):
    def refresh():
        os.system('cls' if os.name == 'nt' else 'clear')
        print(env)

    while True:
        env.reset()
        done = False
        v = []
        while not done:
            key = ''
            while key not in env.keys:
                refresh()
                key = input("press key\n$>")
                if key == "exit()":
                    exit()
                if (key == 'p'):
                    env.show(n_pts=10000)
                if (key == 'v'):
                    plt.plot(v)
                    plt.show()
Example #14
    """The world's simplest agent!"""

    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        return self.action_space.sample()


if __name__ == '__main__':
    logging.basicConfig()
    log = logging.getLogger("mountain-car")
    log.setLevel(level='INFO')

    # we will use our environment (wrapper of OpenAI env)
    mountain_car = MountainCar()

    # specify which agent you want to use:
    # BonsaiAgent, which uses the trained Brain, or
    # RandomAgent, which randomly selects the next action
    agent = BonsaiAgent()

    episode_count = 100

    try:
        for i in range(episode_count):
            #start a new episode and get the new state
            mountain_car.episode_start()
            state = mountain_car.get_state()

            while True: