示例#1
0
class Environment:
    """Agent-facing wrapper around a ``MountainCar`` simulation.

    The simulation object is stored on ``self.mc``; every method below
    delegates to it.
    """

    def __init__(self, g=10.0, d=100.0, H=10., m=10.0, F=3.0):
        """Instantiate a new environment in its initial state.

        The five arguments are forwarded unchanged to ``MountainCar``;
        ``R`` and ``T`` are fixed to 50.0 and 0.0.
        """
        physics = dict(g=g, d=d, H=H, m=m, F=F)
        self.mc = MountainCar(R=50.0, T=0.0, **physics)

    def reset(self):
        """Reset the simulation, then drop the car at a random position.

        The position is drawn uniformly between ``-1.3 * d`` and
        ``-0.7 * d`` (near the bottom, per the original comment).
        """
        car = self.mc
        car.reset()
        car.x = np.random.uniform(-car.d * 1.3, -car.d * 0.7)

    def get_range(self):
        """Return ``[lower, upper]`` as ``[-1.5 * d, 0.0]``."""
        lower = -1.5 * self.mc.d
        return [lower, 0.0]

    def observe(self):
        """Return the current observation as the tuple ``(x, vx)``."""
        car = self.mc
        return (car.x, car.vx)

    def act(self, action):
        """Apply ``action`` to the simulation and report the outcome.

        Returns a ``(reward, status)`` tuple where ``status`` is
        ``"victory"`` for a strictly positive reward and ``None`` otherwise.
        """
        reward = self.mc.act(action)
        if reward > 0.0:
            return (reward, "victory")
        return (reward, None)
示例#2
0
 def __init__(self):
     """Create the wrapped MountainCar and zero all bookkeeping state."""
     self.env = MountainCar()

     # Noise-related settings; names suggest exploration noise -- confirm
     # against the method of this class that consumes them.
     self.noiseRange = 1.0
     self.alpha, self.beta = 0.6, 0.4
     self.om = 0

     # Counters (current step, total steps, accumulated reward, episode
     # index) all start from zero.
     self.t = self.totStep = self.r = self.ep = 0

     # Result logger, labelled for the "DDPG" algorithm.
     self.perfs = result_log(algo="DDPG", l1=20, l2=10)
     self.actif = True
示例#3
0
 def __init__(self, logger=None):
     """Create the wrapped MountainCar and zero all bookkeeping state.

     Args:
         logger: Optional object stored as ``self.perfs``. When omitted
             (``None``) a fresh ``result_log`` labelled "DDPG" is created.
     """
     self.env = MountainCar()
     self.noiseRange = 1.0
     self.noiseMax = 1.0
     self.om = 0
     self.alpha = 0.6
     self.beta = 0.4
     self.t = 0
     self.totStep = 0
     self.r = 0
     self.ep = 0
     # Identity test: ``== None`` would dispatch to a custom ``__eq__`` on
     # the logger and could wrongly discard a caller-supplied instance.
     if logger is None:
         self.perfs = result_log(algo="DDPG", l1=20, l2=10)
     else:
         self.perfs = logger
     self.actif = True
示例#4
0
class MountainCarEnv(Env):
    """Adapter exposing a ``MountainCar`` simulation through the ``Env`` API.

    Besides forwarding actions to the simulation, the class adds a
    temporally correlated noise term to every action and records
    per-episode statistics in ``self.perfs``.
    """
    print_interval = 100

    def __init__(self):
        """Create the simulation, the noise state and the episode counters."""
        self.env = MountainCar()
        self.noiseRange = 1.0   # scale applied to each Gaussian noise sample
        self.om = 0             # current value of the noise process
        self.alpha = 0.6        # fraction of ``om`` removed at every step
        self.beta = 0.4         # weight of the fresh Gaussian sample
        self.t = 0              # steps taken in the current episode
        self.totStep = 0        # steps accumulated over finished episodes
        self.r = 0              # reward accumulated in the current episode
        self.ep = 0             # number of resets so far
        self.perfs = result_log(algo="DDPG", l1=20, l2=10)
        self.actif = True
        # Live plotting is disabled; it used ``self.plot = result_plot()``.

    def state(self):
        """Return the current observation wrapped in a one-element list."""
        return [self.env.getObservation()]

    def act(self, action):
        """Apply ``action`` plus noise and return ``(noisy_action, [reward])``.

        ``action`` is indexed after the noise is added, so it is presumably a
        NumPy array or similar -- confirm against the caller.
        """
        actNoise = action + self.noise_func()
        self.env.performAction(actNoise[0])
        r = self.env.getReward()
        self.t += 1
        self.r += r
        return actNoise, [r]

    def reset(self, noise=True):
        """Start a new episode and log the statistics of the previous one.

        Args:
            noise: When false the noise scale is set to 0.0 for the coming
                episode; otherwise a new scale is drawn uniformly from
                [0, 1].
        """
        self.actif = True
        self.env.reset()
        self.om = 0
        self.totStep += self.t
        # Nothing is logged while the total step count is still zero, i.e.
        # before any step has been taken.
        if self.totStep != 0:
            self.perfs.addData(self.totStep, self.t, self.r)
        self.t = 0
        self.r = 0
        self.ep += 1
        if not noise:
            self.noiseRange = 0.0
        else:
            self.noiseRange = random.uniform(0., 1.0)

    def noise_func(self):
        """Advance the noise process by one step and return its new value.

        The previous value is reduced by ``alpha * om`` and a Gaussian
        sample scaled by ``beta * noiseRange`` is added.
        """
        self.om = self.om - self.alpha * self.om + self.beta * random.gauss(0, 1) * self.noiseRange
        return self.om

    def isFinished(self):
        """Return True once the episode is over.

        The result latches: after the first True, ``actif`` stays False
        until ``reset`` is called.
        """
        if self.actif and not self.env.isFinished():
            return False
        self.actif = False
        return True

    def getActionSize(self):
        """Return the action dimensionality (1)."""
        return 1

    def getStateSize(self):
        """Return the state dimensionality (2)."""
        return 2

    def getActionBounds(self):
        """Return ``[[1.2], [-1.2]]`` (presumably upper bounds, then lower)."""
        return [[1.2], [-1.2]]

    def printEpisode(self):
        """Print a timestamped one-line summary of the current episode."""
        # A single pre-formatted string prints identically under the
        # Python 2 print statement and the Python 3 print function; the
        # original comma-separated print statement is a syntax error on
        # Python 3. The spacing reproduces the original output exactly.
        print("%s  Episode :  %s  steps :  %s  reward :  %s" % (
            time.strftime("[%H:%M:%S]"), self.ep, self.t, self.r))

    def performances(self):
        """Placeholder: plotting of ``self.perfs`` is currently disabled."""
        pass
示例#5
0
def sarsa_lambda_mc(lr, l, baseparams, eps, epoch=100, base='fourier'):
    """Train a linear Sarsa(lambda) agent on MountainCar, evaluating each epoch.

    Args:
        lr: Step size applied to the weight update.
        l: Trace-decay factor (lambda) of the eligibility trace.
        baseparams: Mapping of basis settings. ``'order'`` is read for the
            fourier basis, ``'num_tilings'`` and ``'tiles_per_tiling'`` for
            the tile basis; the mapping is also forwarded to the ``fa``
            helpers.
        eps: Callable mapping the epoch index to the exploration parameter
            handed to ``estimation.epsilon_greedy``.
        epoch: Number of training episodes.
        base: ``'fourier'`` or ``'tile'``.

    Returns:
        NumPy array of length ``epoch`` holding the value returned by
        ``MountainCarEpisode.run_with_w`` after each training episode.

    Raises:
        ValueError: If ``base`` names an unsupported basis.
    """
    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions
    gamma = 1  # undiscounted task; the original hard-coded this factor as 1

    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        # Kept so the number of initial-state draws matches the fourier path.
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))
    else:
        # Previously fell through with w = None and failed later on w.shape.
        raise ValueError("unsupported base: %r" % (base,))

    for x in range(epoch):
        s = mc.d_zero()

        # e <- 0
        e = np.zeros(w.shape)

        # Choose a from s using the epsilon-greedy policy derived from q.
        first_q = estimation.epsilon_greedy(fa.qw(w, s, actions, base, baseparams), actions, eps(x))
        a = np.random.choice(actions, 1, p=first_q)[0]

        count = 0

        while s[0] < mc.right_bound and count < 1e3:
            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)

            # Choose a' from s' using the policy derived from q.
            pi_temp = estimation.epsilon_greedy(fa.qw(w, new_s, actions, base, baseparams), actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # A terminal successor contributes no future value. Use the same
            # criterion as the loop condition instead of exact equality with
            # a hard-coded [0.5, 0] list, which never matches a tuple state,
            # is ambiguous for an array state and is fragile for floats.
            if new_s[0] >= mc.right_bound:
                new_q = 0
            else:
                new_q = fa.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = fa.qw_ele(w, s, a, actions, base, baseparams)

            # e <- gamma * lambda * e + dq_w(s, a)/dw
            e = l * gamma * e + dqdw
            # delta <- r + gamma * q_w(s', a') - q_w(s, a)
            delta = r + gamma * new_q - q
            # w <- w + alpha * delta * e
            w += lr * delta * e

            s = new_s
            a = new_a
            count += 1

        # Evaluate the current weights with a separate episode.
        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w(w, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
def sarsa_mountaincar(lr, baseparams, eps, epoch=100, base='fourier',
                      max_steps=None):
    """Train a linear one-step Sarsa agent on MountainCar, evaluating each epoch.

    Args:
        lr: Step size applied to the weight update.
        baseparams: Mapping of basis settings. ``'order'`` is read for the
            fourier basis, ``'num_tilings'`` and ``'tiles_per_tiling'`` for
            the tile basis; the mapping is also forwarded to the ``fa``
            helpers.
        eps: Callable mapping the epoch index to the exploration parameter
            handed to ``estimation.epsilon_greedy``.
        epoch: Number of training episodes.
        base: ``'fourier'`` or ``'tile'``.
        max_steps: Optional cap on the steps of one training episode. The
            default ``None`` keeps the original behaviour: the episode runs
            until the car reaches ``mc.right_bound``, however long it takes.

    Returns:
        NumPy array of length ``epoch`` holding the value returned by
        ``MountainCarEpisode.run_with_w`` after each training episode.

    Raises:
        ValueError: If ``base`` names an unsupported basis.
    """
    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * (order + 1)**len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams[
            'tiles_per_tiling']
        # Kept so the number of initial-state draws matches the fourier path.
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))
    else:
        # Previously fell through with w = None and failed inside fa.qw.
        raise ValueError("unsupported base: %r" % (base,))

    for x in range(epoch):
        s = mc.d_zero()

        # Choose a from s using the epsilon-greedy policy derived from q.
        first_q = estimation.epsilon_greedy(
            fa.qw(w, s, actions, base, baseparams), actions, eps(x))
        a = np.random.choice(actions, 1, p=first_q)[0]

        count = 0

        while s[0] < mc.right_bound and (max_steps is None
                                         or count < max_steps):
            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)

            # Choose a' from s' using the policy derived from q.
            pi_temp = estimation.epsilon_greedy(
                fa.qw(w, new_s, actions, base, baseparams), actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # w <- w + lr * (r + q_w(s', a') - q_w(s, a)) * dq_w(s, a)/dw
            # NOTE(review): the successor value is not zeroed when s' is
            # terminal, unlike sarsa_lambda_mc -- confirm this is intended.
            new_q = fa.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = fa.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw

            s = new_s
            a = new_a
            count += 1

        # Evaluate the current weights with a separate episode.
        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w(w, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
    def __init__(self):
        """Build the environment, the actor/critic networks and their targets.

        Each target network starts as an exact copy of its online
        counterpart (weights loaded from the online ``state_dict``).
        """
        self.mc = MountainCar()

        # Environment description; the action bound is the car's ``F``.
        self.state_dim, self.action_dim = 2, 1
        self.max_action = self.mc.F

        # Training hyper-parameters.
        self.batch_size = 32
        self.gamma = 0.99
        self.tau = 0.001
        self.actor_lr, self.critic_lr = 0.0001, 0.0004
        self.noise = OU_Noise(self.action_dim, 0)
        self.explore = 100000
        self.epsilon = 0.3
        self.counter = 0

        # Actor: online network, target copy, optimiser. The construction
        # order of the networks is kept as-is.
        actor_args = (self.state_dim, self.action_dim, self.max_action)
        self.actor = Actor(*actor_args).to(device)
        self.actor_target = Actor(*actor_args).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          self.actor_lr)

        # Critic: online network, target copy, optimiser.
        critic_args = (self.state_dim, self.action_dim)
        self.critic = Critic(*critic_args).to(device)
        self.critic_target = Critic(*critic_args).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           self.critic_lr)

        self.replay_buffer = ReplayBuffer()
        self.num_training = 0

        # No state has been observed yet.
        self.state_curr = None
        self.state_prev = None
if __name__ == "__main__":

    # Learning hyper-parameters.
    alpha = 0.1 / 8
    epsilon = 0.05
    gamma = 1
    num_episodes = 500

    # Value range of each of the two state dimensions, stacked into one
    # (2, 2) array.
    dim1 = np.array([-1.2, 0.5])
    dim2 = np.array([-0.07, 0.07])
    dims = np.array([dim1, dim2])

    # Tile-coding layout.
    num_actions = 3
    num_tilings = 8
    tiles_per_dim = 8

    idcs_per_action = num_tilings * tiles_per_dim ** len(dims)

    # One displacement vector per tiling.
    displ_vecs = np.array(
        [
            [1, 3], [3, 1], [-1, 3], [3, -1],
            [1, -3], [-3, 1], [-1, -3], [-3, -1],
        ],
        dtype=float,
    )

    #%%
    fvecs = feature_vecs(dims, num_tilings, tiles_per_dim, displ_vecs)

    weights = init_weights(num_tilings, tiles_per_dim, len(dims), num_actions)
    env = MountainCar()

    #%%
    episodic_sg_SARSA(
        env, fvecs, weights, alpha, epsilon, gamma, num_actions, num_episodes
    )
示例#9
0
        self.s_t = s_tp1
        self.a_t = a_tp1

        return self.actions[a_tp1] if a_tp1 != None else None

    def reset(self):
        """Forget the stored state and action (both become ``None``)."""
        self.s_t = self.a_t = None

    def proposeAction(self, state):
        """Return the action the policy selects for ``state``.

        ``self.policy(state)`` yields an index into ``self.actions``.
        """
        choices = self.actions
        return choices[self.policy(state)]

# Domain constructor.
# The mountain car domain can be replaced by the inverted pendulum: swap the
# active line below for the commented-out SwingPendulum line.
domain = MountainCar(random_start= False, max_episode=10000)
# domain = SwingPendulum(random_start= False, max_episode_length=10000)

# Tile-coding feature map over state components 0 and 1, with the state range
# taken from the domain and a bias term enabled.
# NOTE(review): `input_indicies` is the keyword spelling this call uses;
# confirm it against TileCoding's signature before "correcting" it.
phi = TileCoding(input_indicies = [np.array([0,1])],
                 ntiles = [10],
                 ntilings=[10],
                 state_range = domain.state_range,
                 bias_term = True)

# Linear TD value function with one output per discrete action of the domain,
# built on the tile-coding features above.
valuefn = LinearTD(len(domain.discrete_actions),
                   phi,
                   alpha = 0.01,
                   gamma= 0.995)

# this is a sub-optimal policy for mountain car that naively always
# increases the energy of the system. You can swap the e-greedy policy
示例#10
0
 def __init__(self, g=10.0, d=100.0, H=10., m=10.0, F=3.0):
     """Instantiate a new environment in its initial state.

     The five arguments are forwarded unchanged to ``MountainCar``;
     ``R`` and ``T`` are fixed to 50.0 and 0.0.
     """
     physics = dict(g=g, d=d, H=H, m=m, F=F)
     self.mc = MountainCar(R=50.0, T=0.0, **physics)
from mountaincar import MountainCar
from gp_gym_info import info
import gym

# GP Parameters
# These entries are written into the shared `info` mapping imported from
# gp_gym_info and handed to the agent below.
info["env_name"] = "MountainCar-v0"
info["pop_size"] = 100
info["max_gens"] = 10
info["max_depth"] = 1
info["num_eps"] = 100

# Train the agent and print the best program it found.
agent = MountainCar(info)
best_program = agent.train()
print(best_program)
# Score the best program without rendering and print the result.
# NOTE(review): the meaning of the positional 100 and 200 is defined by
# agent.fit -- confirm against its signature.
f = agent.fit(best_program, 100, 200, render=False)
print(f)
def reinforce_mc(alpha, beta, l, baseparams, eps, epoch=100, base='fourier'):
    """Train a softmax policy on MountainCar with a baseline, evaluating each epoch.

    Every epoch samples one episode (at most 1000 steps), then walks through
    it to accumulate the policy-gradient step for ``theta`` and to update
    the baseline weights ``w`` with an eligibility trace.

    Args:
        alpha: Step size of the baseline (``w``) update.
        beta: Step size of the policy (``theta``) update.
        l: Decay factor applied to the baseline's eligibility trace.
        baseparams: Mapping of basis settings; ``'order'`` is read here and
            the mapping is forwarded to the ``fa`` helpers.
        eps: Callable mapping the epoch index to the second argument of
            ``estimation.softmax``.
        epoch: Number of training episodes.
        base: Only ``'fourier'`` is supported.

    Returns:
        NumPy array of length ``epoch`` holding the value returned by
        ``MountainCarEpisode.run_with_w_softmax`` after each episode.

    Raises:
        ValueError: If ``base`` is not ``'fourier'``.
    """
    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        n_features = (order + 1)**len(s)
        theta = np.zeros((1, len(actions) * n_features))
        w = np.zeros((1, n_features))
    else:
        # Previously ``w`` was simply left unbound for any other basis.
        raise ValueError("unsupported base: %r" % (base,))

    for x in range(epoch):
        s = mc.d_zero()
        e = np.zeros(w.shape)

        hist_s = []
        hist_a = []
        hist_r = []
        hist_pi = []

        count = 0
        dj = np.zeros(theta.shape)

        # Sample one episode: until s is terminal or the step cap is hit.
        while s[0] < mc.right_bound and count < 1000:

            pi_temp = estimation.softmax(
                fa.qw(theta, s, actions, base, baseparams), eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]

            new_s, r = mc.P_and_R(s, a)

            hist_a.append(a)
            hist_s.append(s)
            hist_r.append(r)
            hist_pi.append(pi_temp)

            s = new_s
            count += 1

        # Undiscounted return-to-go for every step, built in one backward
        # pass instead of re-summing the reward tail for each step (O(n)
        # rather than O(n^2)). The summation order is reversed, which is
        # exact for integer-valued rewards.
        returns = [0] * len(hist_r)
        g = 0
        for i in reversed(range(len(hist_r))):
            g += hist_r[i]
            returns[i] = g

        last = len(hist_s) - 1
        for i, (s_i, a_i, pi_i) in enumerate(zip(hist_s, hist_a, hist_pi)):
            # ``w`` is unchanged until the update at the end of this
            # iteration, so one evaluation of the baseline at s_i suffices.
            v, dv = fa.vw(w, s_i, base, baseparams)
            dj += (returns[i] - v) * dsoftmax(s_i, a_i, order, actions, pi_i)
            e = l * e + dv

            # The last recorded step has no successor value.
            if i == last:
                next_v = 0
            else:
                next_v = fa.vw(w, hist_s[i + 1], base, baseparams)[0]
            delta = hist_r[i] + next_v - v

            w += alpha * delta * e
        theta += beta * dj

        # Evaluate the current policy with a separate episode.
        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w_softmax(theta, eps(x), base,
                                                      baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
 def __init__(self):
     """Initialise a new agent.

     Only the creation of a ``MountainCar`` instance is visible in this
     fragment.
     """
     # NOTE(review): `mc` is a local name and is discarded when __init__
     # returns; the fragment may be truncated here, or `self.mc` may have
     # been intended -- confirm against the full class.
     mc = MountainCar()
from deap import gp

import random
# Fix the seed of the stdlib RNG so runs are repeatable.
random.seed(5)

# env = gym.make("MountainCarContinuous-v0")
# print(env.action_space.low)

# GP parameters written into the shared `info` mapping and handed to the
# agent below.
info["env_name"] = "MountainCarContinuous-v0"
info["pop_size"] = 100
info["max_gens"] = 10
info["max_depth"] = 1
info["tournament_size"] = 5
info["num_eps"] = 10
agent = MountainCar(info)

# Containers for the sweep that follows: candidate force magnitudes
# 0.0, 0.1, ..., 0.9 (the stop value 1.0 is excluded by np.arange), the
# fitness recorded for each, and a 1-based progress counter.
solutions = {}
force_values = np.arange(0.0, 1.0, 0.1)
fitness_scores = []
counter = 1
for force in force_values:
    solution = "IFLTE(0.0, velocity, {}, {})".format(force, -force)
    f = agent.fit(solution, 100, 200, render=False)[0]
    fitness_scores.append(f)

    # Timing
    print(counter)
    counter += 1

    if f >= 90:
def actor_critic_mc(lr, l, baseparams, eps, epoch=100, base='fourier'):
    """Train a softmax actor with a TD(lambda) critic on MountainCar.

    Args:
        lr: Step size shared by the critic (``w``) and actor (``theta``)
            updates.
        l: Trace-decay factor (lambda) of the critic's eligibility trace.
        baseparams: Mapping of basis settings; ``'order'`` is read here and
            the mapping is forwarded to the ``fa`` helpers.
        eps: Callable mapping the epoch index to the second argument of
            ``estimation.softmax``.
        epoch: Number of training episodes.
        base: Only ``'fourier'`` is supported (the actor gradient is built
            from ``fa.fourier_phi_mc``).

    Returns:
        NumPy array of length ``epoch`` holding the value returned by
        ``MountainCarEpisode.run_with_w_softmax`` after each episode.

    Raises:
        ValueError: If ``base`` is not ``'fourier'``.
    """
    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        n_features = (order + 1)**len(s)
        w = np.zeros((1, n_features))
        theta = np.zeros((1, len(actions) * n_features))
    else:
        # Previously fell through with w = None and failed on w.shape.
        raise ValueError("unsupported base: %r" % (base,))

    for x in range(epoch):
        s = mc.d_zero()
        # Critic eligibility trace: e_v <- 0
        e = np.zeros(w.shape)

        count = 0

        # For each time step, until s is terminal or the step cap is hit.
        while s[0] < mc.right_bound and count < 1000:

            pi_temp = estimation.softmax(
                fa.qw(theta, s, actions, base, baseparams), eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]

            # Gradient of ln pi(s, a) w.r.t. theta for a linear softmax:
            # the chosen action's block gets (1 - pi) * phi, every other
            # block gets -pi * phi. phi depends only on s, so it is computed
            # once per step rather than once per action.
            phi = fa.fourier_phi_mc(s, order).T
            width = phi.shape[1]
            dtheta = np.zeros((1, len(actions) * (order + 1)**len(s)))
            for idx, candidate in enumerate(actions):
                if candidate == a:
                    coeff = 1 - pi_temp[idx]
                else:
                    coeff = -pi_temp[idx]
                dtheta[:, idx * width:(idx + 1) * width] = coeff * phi

            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)

            # Critic update using TD(lambda).
            v, dv = fa.vw(w, s, base, baseparams)
            # A terminal successor has zero value. ``>=`` mirrors the loop
            # condition; the original ``>`` missed a state sitting exactly
            # on the bound, which the loop already treats as terminal.
            if new_s[0] >= mc.right_bound:
                new_v = 0
            else:
                new_v = fa.vw(w, new_s, base, baseparams)[0]

            # e_v <- gamma * lambda * e_v + dv_w(s)/dw
            e = l * mc.gamma * e
            e += dv
            # delta <- r + gamma * v_w(s') - v_w(s)
            delta = r + mc.gamma * new_v - v
            # w <- w + alpha * delta * e_v
            w += lr * delta * e

            # Actor update: theta <- theta + alpha * delta * d ln pi / d theta
            theta += lr * delta * dtheta

            s = new_s
            count += 1

        # Evaluate the current policy with a separate episode.
        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w_softmax(theta, eps(x), base,
                                                      baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards