Python MountainCar.MountainCar примеры, mountaincar.MountainCar.MountainCar Python примеры использования

Пример #1

0

Показать файл

def sarsa_lambda_mc(lr, l, baseparams, eps, epoch=100, base='fourier'):
    """Train a linear Sarsa(lambda) agent on MountainCar.

    After every training episode the current weights are evaluated with
    ``MountainCarEpisode.run_with_w`` and the result is recorded.

    Args:
        lr: Step size applied to the weight update.
        l: Trace-decay factor multiplying the eligibility trace each step.
        baseparams: Basis settings. ``'order'`` is read for the Fourier
            basis, ``'num_tilings'`` for tile coding; the whole dict is
            also forwarded to the ``fa`` helpers.
        eps: Callable taking the episode index and returning the
            exploration parameter handed to ``estimation.epsilon_greedy``.
        epoch: Number of training episodes.
        base: ``'fourier'`` or ``'tile'``.

    Returns:
        Array of length ``epoch`` holding the evaluation result recorded
        after each training episode.

    Raises:
        ValueError: If ``base`` is neither ``'fourier'`` nor ``'tile'``.
    """
    # Fail early with a clear message; previously an unknown basis left the
    # weights as None and crashed later on ``w.shape``.
    if base not in ('fourier', 'tile'):
        raise ValueError(
            "unsupported base %r; expected 'fourier' or 'tile'" % (base,))

    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    # One weight row covering every action's feature slice.
    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    else:
        num_tilings = baseparams['num_tilings']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))

    for x in range(epoch):
        s = mc.d_zero()

        # e <- 0
        e = np.zeros(w.shape)

        # Choose a from s using the epsilon-greedy policy derived from q.
        first_q = estimation.epsilon_greedy(
            fa.qw(w, s, actions, base, baseparams), actions, eps(x))
        a = np.random.choice(actions, 1, p=first_q)[0]

        count = 0

        # An episode stops at the right bound or after 1000 steps.
        while s[0] < mc.right_bound and count < 1e3:
            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)

            # Choose a' from s' using the policy derived from q.
            pi_temp = estimation.epsilon_greedy(
                fa.qw(w, new_s, actions, base, baseparams), actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # s' is terminal exactly when the loop condition above would stop
            # on it, so use that same test instead of comparing against a
            # hard-coded state literal.
            if new_s[0] >= mc.right_bound:
                new_q = 0
            else:
                new_q = fa.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = fa.qw_ele(w, s, a, actions, base, baseparams)

            # e <- gamma * lambda * e + dq/dw, with gamma fixed at 1 here.
            e = l * e + dqdw
            # delta <- r + gamma * q(s', a') - q(s, a), gamma = 1.
            delta = r + new_q - q
            # w <- w + alpha * delta * e
            w += lr * delta * e

            s = new_s
            a = new_a
            count += 1

        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w(w, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards

Пример #2

0

Показать файл

Файл: mountainCarEnv.py Проект: osigaud/ArmModelPython

 def __init__(self):
     """Create the MountainCar environment and reset all bookkeeping fields.

     The performance log is a ``result_log`` built with ``algo="DDPG"``,
     ``l1=20`` and ``l2=10``.
     """
     self.env = MountainCar()

     # Scalar settings and counters, assigned in their original order.
     self.noiseRange, self.om = 1.0, 0
     self.alpha, self.beta = 0.6, 0.4
     self.t = self.totStep = self.r = self.ep = 0

     self.perfs = result_log(algo="DDPG", l1=20, l2=10)
     self.actif = True

Пример #3

0

Показать файл

Файл: policy_improvement.py Проект: stonezhng/cs687-ReinforcementLearning-Homework

def sarsa_mountaincar(lr, baseparams, eps, epoch=100, base='fourier'):
    """Train a linear one-step Sarsa agent on MountainCar.

    After every training episode the current weights are evaluated with
    ``MountainCarEpisode.run_with_w`` and the result is recorded.

    Note that a training episode has no step limit: it runs until the
    position reaches ``mc.right_bound``.

    Args:
        lr: Step size applied to the weight update.
        baseparams: Basis settings. ``'order'`` is read for the Fourier
            basis, ``'num_tilings'`` for tile coding; the whole dict is
            also forwarded to the ``fa`` helpers.
        eps: Callable taking the episode index and returning the
            exploration parameter handed to ``estimation.epsilon_greedy``.
        epoch: Number of training episodes.
        base: ``'fourier'`` or ``'tile'``.

    Returns:
        Array of length ``epoch`` holding the evaluation result recorded
        after each training episode.

    Raises:
        ValueError: If ``base`` is neither ``'fourier'`` nor ``'tile'``.
    """
    # Fail early with a clear message; previously an unknown basis left the
    # weights as None and failed somewhere downstream.
    if base not in ('fourier', 'tile'):
        raise ValueError(
            "unsupported base %r; expected 'fourier' or 'tile'" % (base,))

    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    # One weight row covering every action's feature slice.
    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * (order + 1)**len(s)))
    else:
        num_tilings = baseparams['num_tilings']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))

    for x in range(epoch):
        s = mc.d_zero()

        # Choose a from s using the epsilon-greedy policy derived from q.
        first_q = estimation.epsilon_greedy(
            fa.qw(w, s, actions, base, baseparams), actions, eps(x))
        a = np.random.choice(actions, 1, p=first_q)[0]

        while s[0] < mc.right_bound:
            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)

            # Choose a' from s' using the policy derived from q.
            pi_temp = estimation.epsilon_greedy(
                fa.qw(w, new_s, actions, base, baseparams), actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # The value of a terminal state is zero by definition, so do not
            # bootstrap from the approximator once s' ends the episode (the
            # same test the loop condition uses).
            if new_s[0] >= mc.right_bound:
                new_q = 0
            else:
                new_q = fa.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = fa.qw_ele(w, s, a, actions, base, baseparams)

            # w <- w + alpha * (r + q(s', a') - q(s, a)) * dq/dw
            w += lr * (r + new_q - q) * dqdw

            s = new_s
            a = new_a

        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w(w, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards

Пример #4

0

Показать файл

 def __init__(self, logger=None):
     """Create the MountainCar environment and reset all bookkeeping fields.

     Args:
         logger: Object stored as ``self.perfs``. When it is ``None`` a
             ``result_log(algo="DDPG", l1=20, l2=10)`` is created instead.
     """
     self.env = MountainCar()
     self.noiseRange = 1.0
     self.noiseMax = 1.0
     self.om = 0
     self.alpha = 0.6
     self.beta = 0.4
     self.t = 0
     self.totStep = 0
     self.r = 0
     self.ep = 0
     # Identity test rather than ``==``: a logger that customises __eq__
     # must never be mistaken for "no logger supplied".
     if logger is None:
         self.perfs = result_log(algo="DDPG", l1=20, l2=10)
     else:
         self.perfs = logger
     self.actif = True

Пример #5

0

Показать файл

Файл: agent.py Проект: rayush7/Continuous_Mountain_Car_Problem_Actor_Critic_RL_Solution

    def __init__(self):
        """Build the environment, hyperparameters, networks and buffers.

        Creates a MountainCar instance, an actor and a critic together with
        target copies that are loaded with the online networks' state
        dicts, one Adam optimizer per online network, the noise object and
        the replay buffer.
        """
        env = MountainCar()
        self.mc = env

        # Environment-facing sizes; the action bound is read from env.F.
        self.state_dim, self.action_dim = 2, 1
        self.max_action = env.F

        # Training settings.
        self.batch_size = 32
        self.gamma, self.tau = 0.99, 0.001
        self.actor_lr, self.critic_lr = 0.0001, 0.0004
        self.noise = OU_Noise(self.action_dim, 0)
        self.explore = 100000
        self.epsilon = 0.3
        self.counter = 0

        def make_actor():
            # Fresh actor network moved to the module-level ``device``.
            return Actor(self.state_dim, self.action_dim,
                         self.max_action).to(device)

        def make_critic():
            # Fresh critic network moved to the module-level ``device``.
            return Critic(self.state_dim, self.action_dim).to(device)

        # Networks are created in the same order as before: actor, actor
        # target, critic, critic target.
        self.actor = make_actor()
        self.actor_target = make_actor()
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          self.actor_lr)

        self.critic = make_critic()
        self.critic_target = make_critic()
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           self.critic_lr)

        self.replay_buffer = ReplayBuffer()
        self.num_training = 0

        # No state has been recorded yet.
        self.state_curr = self.state_prev = None

Пример #6

0

Показать файл

Файл: episodic_SARSA_w_FA.py Проект: c-boe/Reinforcement-learning

if __name__ == "__main__":

    # Learning settings passed to episodic_sg_SARSA below.
    num_episodes = 500
    alpha, epsilon, gamma = 0.1 / 8, 0.05, 1

    # One two-element range per state dimension
    # (presumably position and velocity bounds -- TODO confirm).
    dim1, dim2 = np.array((-1.2, 0.5)), np.array((-0.07, 0.07))
    dims = np.array((dim1, dim2))

    # Tiling layout.
    num_actions, num_tilings, tiles_per_dim = 3, 8, 8
    idcs_per_action = num_tilings * tiles_per_dim**len(dims)

    # One displacement vector per tiling.
    displ_vecs = np.array(
        [
            (1, 3), (3, 1), (-1, 3), (3, -1),
            (1, -3), (-3, 1), (-1, -3), (-3, -1),
        ],
        dtype="float",
    )

    #%%
    fvecs = feature_vecs(dims, num_tilings, tiles_per_dim, displ_vecs)
    weights = init_weights(num_tilings, tiles_per_dim, len(dims), num_actions)
    env = MountainCar()

    #%%
    episodic_sg_SARSA(
        env, fvecs, weights, alpha, epsilon, gamma, num_actions, num_episodes)

Пример #7

0

Показать файл

 def __init__(self, g=10.0, d=100.0, H=10., m=10.0, F=3.0):
     """Instantiate a new environment in its initial state.

     The five arguments are forwarded unchanged to ``MountainCar``;
     ``R`` and ``T`` are fixed at 50.0 and 0.0.
     """
     forwarded = dict(g=g, d=d, H=H, m=m, F=F)
     self.mc = MountainCar(**forwarded, R=50.0, T=0.0)

Пример #8

0

Показать файл

Файл: mountaincar_exp.py Проект: Optimization-Tools/gp-for-interpretable-rl

from mountaincar import MountainCar
from gp_gym_info import info
import gym

# GP parameters, written into the imported ``info`` mapping one key at a time.
for _key, _value in (
        ("env_name", "MountainCar-v0"),
        ("pop_size", 100),
        ("max_gens", 10),
        ("max_depth", 1),
        ("num_eps", 100),
):
    info[_key] = _value

# Train, then print what train() returned followed by the result of fit().
agent = MountainCar(info)
best_program = agent.train()
print(best_program)
f = agent.fit(best_program, 100, 200, render=False)
print(f)

Пример #9

0

Показать файл

Файл: reinforce.py Проект: stonezhng/cs687-ReinforcementLearning-Homework

def reinforce_mc(alpha, beta, l, baseparams, eps, epoch=100, base='fourier'):
    """Train a softmax policy on MountainCar with REINFORCE and a baseline.

    Each epoch rolls out one episode (at most 1000 steps), then walks the
    recorded history once to accumulate the policy gradient and to update
    the state-value weights with an eligibility trace. After the policy
    update the weights are evaluated with
    ``MountainCarEpisode.run_with_w_softmax``.

    Args:
        alpha: Step size for the state-value weights ``w``.
        beta: Step size for the policy weights ``theta``.
        l: Decay factor multiplying the eligibility trace each step.
        baseparams: Basis settings; ``'order'`` is read here and the whole
            dict is forwarded to the ``fa`` helpers.
        eps: Callable taking the epoch index and returning the second
            argument handed to ``estimation.softmax``.
        epoch: Number of training episodes.
        base: Only ``'fourier'`` is supported.

    Returns:
        Array of length ``epoch`` holding the evaluation result recorded
        after each training episode.

    Raises:
        ValueError: If ``base`` is not ``'fourier'``.
    """
    # Only the Fourier branch ever created the weights; any other basis used
    # to fail later with a NameError on ``w``.
    if base != 'fourier':
        raise ValueError(
            "unsupported base %r; only 'fourier' is implemented" % (base,))

    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    order = baseparams['order']
    s = mc.d_zero()
    theta = np.zeros((1, len(actions) * (order + 1)**len(s)))
    w = np.zeros((1, (order + 1)**len(s)))

    for x in range(epoch):
        s = mc.d_zero()
        e = np.zeros(w.shape)

        hist_s, hist_a, hist_r, hist_pi = [], [], [], []

        count = 0
        dj = np.zeros(theta.shape)

        # Roll out one episode: until the right bound or 1000 steps.
        while s[0] < mc.right_bound and count < 1000:
            pi_temp = estimation.softmax(
                fa.qw(theta, s, actions, base, baseparams), eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]

            new_s, r = mc.P_and_R(s, a)

            hist_a.append(a)
            hist_s.append(s)
            hist_r.append(r)
            hist_pi.append(pi_temp)

            s = new_s
            count += 1

        # Undiscounted return-to-go for every step, built in one backward
        # pass instead of re-summing the reward tail for each step.
        returns = [0] * len(hist_r)
        running = 0
        for i in reversed(range(len(hist_r))):
            running += hist_r[i]
            returns[i] = running

        last = len(hist_s) - 1
        for i, (state, action, reward, pi) in enumerate(
                zip(hist_s, hist_a, hist_r, hist_pi)):
            # ``w`` is only changed at the end of this iteration, so ``v``
            # serves both as the baseline and in the TD error below.
            v, dv = fa.vw(w, state, base, baseparams)
            dj += (returns[i] - v) * dsoftmax(state, action, order, actions,
                                              pi)
            e = l * e + dv

            # The step after the last recorded one contributes no value.
            if i == last:
                next_v = 0
            else:
                next_v = fa.vw(w, hist_s[i + 1], base, baseparams)[0]
            delta = reward + next_v - v

            w += alpha * delta * e
        theta += beta * dj

        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w_softmax(theta, eps(x), base,
                                                      baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards

Пример #10

0

Показать файл

Файл: agent.py Проект: rayush7/Continuous_Mountain_Car_Problem_Actor_Critic_RL_Solution

 def __init__(self):
     """Init a new agent.

     Creates a MountainCar instance and keeps it on the agent as
     ``self.mc``.
     """
     # Previously the instance was bound to a local name and discarded as
     # soon as the constructor returned; keep it reachable from the agent.
     self.mc = MountainCar()

Пример #11

0

Показать файл

Файл: actor_critic.py Проект: stonezhng/cs687-ReinforcementLearning-Homework

def actor_critic_mc(lr, l, baseparams, eps, epoch=100, base='fourier'):
    """Train a softmax actor with a TD(lambda) critic on MountainCar.

    Each step samples an action from the softmax policy, updates the
    state-value weights with an eligibility trace, and moves the policy
    weights along the log-policy gradient scaled by the TD error. After
    every episode (at most 1000 steps) the policy is evaluated with
    ``MountainCarEpisode.run_with_w_softmax``.

    Args:
        lr: Step size used for both the critic and the actor update.
        l: Trace-decay factor; the trace is scaled by ``l * mc.gamma``.
        baseparams: Basis settings; ``'order'`` is read here and the whole
            dict is forwarded to the ``fa`` helpers.
        eps: Callable taking the episode index and returning the second
            argument handed to ``estimation.softmax``.
        epoch: Number of training episodes.
        base: Only ``'fourier'`` is supported.

    Returns:
        Array of length ``epoch`` holding the evaluation result recorded
        after each training episode.

    Raises:
        ValueError: If ``base`` is not ``'fourier'``.
    """
    # Only the Fourier branch ever created the weights; any other basis used
    # to crash later on ``w.shape`` with ``w`` still None.
    if base != 'fourier':
        raise ValueError(
            "unsupported base %r; only 'fourier' is implemented" % (base,))

    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    order = baseparams['order']
    s = mc.d_zero()
    w = np.zeros((1, (order + 1)**len(s)))
    theta = np.zeros((1, len(actions) * (order + 1)**len(s)))

    for x in range(epoch):
        s = mc.d_zero()
        # Critic trace: ev <- 0
        e = np.zeros(w.shape)

        count = 0

        # For each time step, until s is terminal or 1000 steps have passed.
        while s[0] < mc.right_bound and count < 1000:

            pi_temp = estimation.softmax(
                fa.qw(theta, s, actions, base, baseparams), eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]

            # Gradient of ln pi(s, a) w.r.t. theta, one feature slice per
            # action. The features depend only on s, so compute them once
            # rather than once per action.
            phi = fa.fourier_phi_mc(s, order).T
            width = phi.shape[1]
            dtheta = np.zeros((1, len(actions) * (order + 1)**len(s)))
            for idx, candidate in enumerate(actions):
                block = slice(idx * width, (idx + 1) * width)
                if candidate == a:
                    dtheta[:, block] = (1 - pi_temp[idx]) * phi
                else:
                    dtheta[:, block] = -pi_temp[idx] * phi

            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)

            # Critic update using TD(lambda).
            v, dv = fa.vw(w, s, base, baseparams)
            # s' is terminal exactly when the loop condition above would
            # stop on it, which includes landing precisely on the bound.
            if new_s[0] >= mc.right_bound:
                new_v = 0
            else:
                new_v = fa.vw(w, new_s, base, baseparams)[0]

            # ev <- gamma * lambda * ev + dv/dw
            e = l * mc.gamma * e
            e += dv
            # delta <- r + gamma * v(s') - v(s)
            delta = r + mc.gamma * new_v - v
            # w <- w + alpha * delta * ev
            w += lr * delta * e

            # Actor update: theta <- theta + alpha * delta * d ln pi / d theta
            theta += lr * delta * dtheta

            s = new_s
            count += 1

        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w_softmax(theta, eps(x), base,
                                                      baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards

Python MountainCar.MountainCar примеры использования