import numpy as np


class Environment:

    def __init__(self, g=10.0, d=100.0, H=10., m=10.0, F=3.0):
        """Instantiate a new environment in its initial state."""
        self.mc = MountainCar(g=g, d=d, H=H, m=m, F=F, R=50.0, T=0.0)

    def reset(self):
        self.mc.reset()
        # place the car at a random place near the bottom
        self.mc.x = np.random.uniform(-self.mc.d * 1.3, -self.mc.d * 0.7)

    def get_range(self):
        return [-1.5 * self.mc.d, 0.0]

    def observe(self):
        """Return the current observation that the agent can make of the
        environment, if applicable.
        """
        return (self.mc.x, self.mc.vx)

    def act(self, action):
        """Perform the given action on the environment and return a reward."""
        reward = self.mc.act(action)
        return (reward, "victory" if reward > 0.0 else None)
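# A minimal interaction sketch for the Environment wrapper above (hypothetical
# usage, not part of the original code). It assumes the MountainCar class from
# this module; F_MAX mirrors the default force bound F=3.0 of the constructor.
F_MAX = 3.0
env = Environment()
env.reset()
for step in range(1000):
    x, vx = env.observe()                      # current position and velocity
    action = np.random.uniform(-F_MAX, F_MAX)  # random force within the assumed bound
    reward, outcome = env.act(action)
    if outcome == "victory":
        print("reached the top after", step + 1, "steps, reward:", reward)
        break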
def __init__(self):
    self.env = MountainCar()
    self.noiseRange = 1.0
    self.om = 0
    self.alpha = 0.6
    self.beta = 0.4
    self.t = 0
    self.totStep = 0
    self.r = 0
    self.ep = 0
    self.perfs = result_log(algo="DDPG", l1=20, l2=10)
    self.actif = True
def __init__(self, logger=None):
    self.env = MountainCar()
    self.noiseRange = 1.0
    self.noiseMax = 1.0
    self.om = 0
    self.alpha = 0.6
    self.beta = 0.4
    self.t = 0
    self.totStep = 0
    self.r = 0
    self.ep = 0
    if logger is None:
        self.perfs = result_log(algo="DDPG", l1=20, l2=10)
    else:
        self.perfs = logger
    self.actif = True
import random
import time


class MountainCarEnv(Env):

    print_interval = 100

    def __init__(self):
        self.env = MountainCar()
        self.noiseRange = 1.0
        self.om = 0
        self.alpha = 0.6
        self.beta = 0.4
        self.t = 0
        self.totStep = 0
        self.r = 0
        self.ep = 0
        self.perfs = result_log(algo="DDPG", l1=20, l2=10)
        self.actif = True
        # self.plot = result_plot()

    def state(self):
        return [self.env.getObservation()]

    def act(self, action):
        actNoise = action + self.noise_func()
        self.env.performAction(actNoise[0])
        r = self.env.getReward()
        self.t += 1
        self.r += r
        return actNoise, [r]

    def reset(self, noise=True):
        self.actif = True
        self.env.reset()
        self.om = 0
        self.totStep += self.t
        if self.totStep != 0:
            self.perfs.addData(self.totStep, self.t, self.r)
        self.t = 0
        self.r = 0
        self.ep += 1
        if not noise:
            self.noiseRange = 0.0
        else:
            self.noiseRange = random.uniform(0., 1.0)

    def noise_func(self):
        # first-order filtered Gaussian noise (Ornstein-Uhlenbeck-like)
        self.om = self.om - self.alpha * self.om + self.beta * random.gauss(0, 1) * self.noiseRange
        return self.om

    def isFinished(self):
        if self.actif and not self.env.isFinished():
            return False
        else:
            self.actif = False
            return True

    def getActionSize(self):
        return 1

    def getStateSize(self):
        return 2

    def getActionBounds(self):
        return [[1.2], [-1.2]]

    def printEpisode(self):
        print(time.strftime("[%H:%M:%S]"), " Episode : ", self.ep,
              " steps : ", self.t, " reward : ", self.r)

    def performances(self):
        pass
        # self.plot.clear()
        # self.plot.add_row(self.perfs)
def sarsa_lambda_mc(lr, l, baseparams, eps, epoch=100, base='fourier'):
    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))

    for x in range(epoch):
        s = mc.d_zero()
        # e ← 0
        e = np.zeros(w.shape)
        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax)
        first_q = estimation.epsilon_greedy(fa.qw(w, s, actions, base, baseparams), actions, eps(x))
        # pi_s = pe.epsilon_greedy(pe.qw(w, s, order, actions, base), actions, eps)
        a = np.random.choice(actions, 1, p=first_q)[0]
        count = 0
        while s[0] < mc.right_bound and count < 1e3:
            # Take action a and observe r and s′
            new_s, r = mc.P_and_R(s, a)
            # Choose a′ from s′ using a policy derived from q
            pi_temp = estimation.epsilon_greedy(fa.qw(w, new_s, actions, base, baseparams), actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            if new_s == [0.5, 0]:
                new_q = 0
            else:
                new_q = fa.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = fa.qw_ele(w, s, a, actions, base, baseparams)
            # e ← γλe + ∂qw(s,a)/∂w
            e = l * 1 * e + dqdw
            # δ ← r + γ qw(s′,a′) − qw(s,a)
            delta = r + 1 * new_q - q
            # w ← w + αδe
            w += lr * delta * e
            # print(w)
            s = new_s
            a = new_a
            count += 1
        # print('update end')

        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w(w, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])

    return estimated_rewards
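# sarsa_lambda_mc expects eps to be a callable mapping the episode index to an
# exploration rate for epsilon_greedy. A simple decaying schedule (illustrative
# helper with assumed values, not part of the original code) could be:
def eps_schedule(episode, start=0.3, decay=0.995, floor=0.01):
    """Exponentially decaying epsilon, clipped at a small floor."""
    return max(floor, start * decay ** episode)

# e.g.:
# rewards = sarsa_lambda_mc(lr=0.001, l=0.9, baseparams={'order': 3},
#                           eps=eps_schedule, epoch=100, base='fourier')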
def sarsa_mountaincar(lr, baseparams, eps, epoch=100, base='fourier'):
    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))

    for x in range(epoch):
        s = mc.d_zero()
        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax)
        first_q = estimation.epsilon_greedy(fa.qw(w, s, actions, base, baseparams), actions, eps(x))
        # pi_s = pe.epsilon_greedy(pe.qw(w, s, order, actions, base), actions, eps)
        a = np.random.choice(actions, 1, p=first_q)[0]
        count = 0
        while s[0] < mc.right_bound:
            # Take action a and observe r and s′
            new_s, r = mc.P_and_R(s, a)
            # Choose a′ from s′ using a policy derived from q
            pi_temp = estimation.epsilon_greedy(fa.qw(w, new_s, actions, base, baseparams), actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # w += lr * (r + pe.qw_fourier_ele(w, new_s, new_a, order, actions) -
            #            pe.qw_fourier_ele(w, s, a, order, actions)) * pe.dqwdw_fourier(s, a, order, actions)
            new_q = fa.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = fa.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw
            s = new_s
            a = new_a
            count += 1

        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w(w, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])

    return estimated_rewards
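# For the 'tile' base, baseparams must carry both 'num_tilings' and
# 'tiles_per_tiling'. An illustrative call (the numbers below are assumptions,
# not tuned values from the original experiments):
rewards = sarsa_mountaincar(lr=0.01,
                            baseparams={'num_tilings': 8, 'tiles_per_tiling': 8},
                            eps=lambda episode: 0.05,
                            epoch=100,
                            base='tile')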
def __init__(self):
    self.mc = MountainCar()

    # Params for env
    self.state_dim = 2
    self.action_dim = 1
    self.max_action = self.mc.F

    # Params for training
    self.batch_size = 32
    self.gamma = 0.99
    self.tau = 0.001
    self.actor_lr = 0.0001
    self.critic_lr = 0.0004
    self.noise = OU_Noise(self.action_dim, 0)
    self.explore = 100000
    self.epsilon = 0.3
    self.counter = 0

    # Init
    self.actor = Actor(self.state_dim, self.action_dim, self.max_action).to(device)
    self.actor_target = Actor(self.state_dim, self.action_dim, self.max_action).to(device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = optim.Adam(self.actor.parameters(), self.actor_lr)

    self.critic = Critic(self.state_dim, self.action_dim).to(device)
    self.critic_target = Critic(self.state_dim, self.action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = optim.Adam(self.critic.parameters(), self.critic_lr)

    self.replay_buffer = ReplayBuffer()
    self.num_training = 0
    self.state_curr = None
    self.state_prev = None
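# The constructor above relies on an OU_Noise class whose definition is not
# shown here. The following is a minimal Ornstein-Uhlenbeck sketch with the
# same constructor signature (action dimension, mean); it is an assumption
# about the interface, not the original implementation.
import numpy as np


class OU_Noise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck process)."""

    def __init__(self, action_dim, mu, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta   # pull strength toward the mean
        self.sigma = sigma   # scale of the Gaussian perturbation
        self.state = np.copy(self.mu)

    def reset(self):
        self.state = np.copy(self.mu)

    def sample(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state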
if __name__ == "__main__":
    alpha = 0.1 / 8  # step size, scaled by the number of tilings
    epsilon = 0.05
    gamma = 1
    num_episodes = 500

    dim1 = np.array((-1.2, 0.5))
    dim2 = np.array((-0.07, 0.07))
    dims = np.array((dim1, dim2))
    num_actions = 3
    num_tilings = 8
    tiles_per_dim = 8
    idcs_per_action = num_tilings * tiles_per_dim ** len(dims)
    displ_vecs = np.array([(1, 3), (3, 1), (-1, 3), (3, -1),
                           (1, -3), (-3, 1), (-1, -3), (-3, -1)], dtype="float")

    #%%
    fvecs = feature_vecs(dims, num_tilings, tiles_per_dim, displ_vecs)
    weights = init_weights(num_tilings, tiles_per_dim, len(dims), num_actions)
    env = MountainCar()

    #%%
    episodic_sg_SARSA(env, fvecs, weights, alpha, epsilon, gamma, num_actions, num_episodes)
        self.s_t = s_tp1
        self.a_t = a_tp1
        return self.actions[a_tp1] if a_tp1 is not None else None

    def reset(self):
        self.s_t = None
        self.a_t = None

    def proposeAction(self, state):
        return self.actions[self.policy(state)]


# domain constructor:
# you can replace the mountain car domain by the inverted pendulum
domain = MountainCar(random_start=False, max_episode=10000)
# domain = SwingPendulum(random_start=False, max_episode_length=10000)

phi = TileCoding(input_indicies=[np.array([0, 1])],
                 ntiles=[10],
                 ntilings=[10],
                 state_range=domain.state_range,
                 bias_term=True)

valuefn = LinearTD(len(domain.discrete_actions), phi, alpha=0.01, gamma=0.995)

# this is a sub-optimal policy for mountain car that naively always
# increases the energy of the system. You can swap the e-greedy policy
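# The energy-pumping heuristic referred to in the comment above simply
# accelerates in the direction of the current velocity. A standalone sketch
# (hypothetical helper; it assumes a state of the form [position, velocity]
# and three discrete actions ordered left / coast / right):
def pumping_policy(state):
    """Return the index of the action that pushes along the current velocity."""
    velocity = state[1]
    if velocity > 0:
        return 2   # push right
    if velocity < 0:
        return 0   # push left
    return 1       # coast when standing still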
from mountaincar import MountainCar
from gp_gym_info import info
import gym

# GP Parameters
info["env_name"] = "MountainCar-v0"
info["pop_size"] = 100
info["max_gens"] = 10
info["max_depth"] = 1
info["num_eps"] = 100

agent = MountainCar(info)
best_program = agent.train()
print(best_program)

f = agent.fit(best_program, 100, 200, render=False)
print(f)
def reinforce_mc(alpha, beta, l, baseparams, eps, epoch=100, base='fourier'):
    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    theta = None
    order = 0
    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        theta = np.zeros((1, len(actions) * (order + 1) ** len(s)))
        w = np.zeros((1, (order + 1) ** len(s)))
        # theta = np.zeros((len(s), 3))

    for x in range(epoch):
        s = mc.d_zero()
        e = np.zeros(w.shape)
        hist_s = []
        hist_a = []
        hist_r = []
        hist_pi = []
        count = 0
        dj = np.zeros(theta.shape)

        # for each time step, until s is the terminal absorbing state do
        while s[0] < mc.right_bound and count < 1000:
            pi_temp = estimation.softmax(fa.qw(theta, s, actions, base, baseparams), eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            new_s, r = mc.P_and_R(s, a)
            hist_a.append(a)
            hist_s.append(s)
            hist_r.append(r)
            hist_pi.append(pi_temp)
            s = new_s
            count += 1

        for i in range(len(hist_a)):
            g = 0
            for j in range(i, len(hist_s)):
                g += hist_r[j]
            v, dv = fa.vw(w, hist_s[i], base, baseparams)
            dj += (g - v) * dsoftmax(hist_s[i], hist_a[i], order, actions, hist_pi[i])
            e = l * e + dv
            if i == len(hist_s) - 1:
                delta = hist_r[i] + 0 - fa.vw(w, hist_s[i], base, baseparams)[0]
            else:
                delta = (hist_r[i] + fa.vw(w, hist_s[i + 1], base, baseparams)[0]
                         - fa.vw(w, hist_s[i], base, baseparams)[0])
            w += alpha * delta * e

        theta += beta * dj

        epi = MountainCarEpisode(mc)
        # print(theta)
        estimated_rewards[x] = epi.run_with_w_softmax(theta, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])

    return estimated_rewards
def __init__(self): """Init a new agent. """ mc = MountainCar()
from deap import gp
import random
import numpy as np

from mountaincar import MountainCar
from gp_gym_info import info

random.seed(5)

# env = gym.make("MountainCarContinuous-v0")
# print(env.action_space.low)

info["env_name"] = "MountainCarContinuous-v0"
info["pop_size"] = 100
info["max_gens"] = 10
info["max_depth"] = 1
info["tournament_size"] = 5
info["num_eps"] = 10

agent = MountainCar(info)

solutions = {}
force_values = np.arange(0.0, 1.0, 0.1)
fitness_scores = []
counter = 1

for force in force_values:
    solution = "IFLTE(0.0, velocity, {}, {})".format(force, -force)
    f = agent.fit(solution, 100, 200, render=False)[0]
    fitness_scores.append(f)

    # Timing
    print(counter)
    counter += 1

    if f >= 90:
def actor_critic_mc(lr, l, baseparams, eps, epoch=100, base='fourier'):
    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    w = None
    theta = None
    order = 0
    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        w = np.zeros((1, (order + 1) ** len(s)))
        theta = np.zeros((1, len(actions) * (order + 1) ** len(s)))
        # theta = np.zeros((len(s), 3))

    for x in range(epoch):
        s = mc.d_zero()
        # ev ← 0
        e = np.zeros(w.shape)
        # et ← 0
        # et = np.zeros(theta.shape)
        count = 0

        # for each time step, until s is the terminal absorbing state do
        while s[0] < mc.right_bound and count < 1000:
            pi_temp = estimation.softmax(fa.qw(theta, s, actions, base, baseparams), eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            # print(a)
            # print(pi_temp)

            # dydtheta_list = []
            # for na in actions:
            #     dydtheta_list.append(fa.qw_ele(theta, s, na, actions, base, baseparams)[1])
            # dtheta = estimation.dsoftmax(fa.qw(theta, s, actions, base, baseparams),
            #                              dydtheta_list, actions.index(a), eps(x))

            # ∂ ln π(s,a,θ)/∂θ for the softmax policy over Fourier features
            dtheta = np.zeros((1, len(actions) * (order + 1) ** len(s)))
            for idx in range(len(actions)):
                phi = fa.fourier_phi_mc(s, order).T
                if actions[idx] == a:
                    # print('target')
                    dtheta[:, idx * phi.shape[1]:(idx + 1) * phi.shape[1]] = (1 - pi_temp[idx]) * phi
                else:
                    dtheta[:, idx * phi.shape[1]:(idx + 1) * phi.shape[1]] = -pi_temp[idx] * phi

            # Take action a and observe r and s′
            new_s, r = mc.P_and_R(s, a)

            # Critic update using TD(λ)
            # ev ← γλev + ∂vw(s)
            v, dv = fa.vw(w, s, base, baseparams)
            if new_s[0] > mc.right_bound:
                new_v = 0
            else:
                new_v = fa.vw(w, new_s, base, baseparams)[0]
            e = l * mc.gamma * e
            e += dv
            # δ ← r + γ vw(s′) − vw(s)
            delta = r + mc.gamma * new_v - v
            # w ← w + αδev
            w += lr * delta * e

            # Actor update
            # θ ← θ + αγ^t δ ∂ ln π(s,a,θ)
            theta += lr * delta * dtheta

            s = new_s
            count += 1

        epi = MountainCarEpisode(mc)
        # print(theta)
        estimated_rewards[x] = epi.run_with_w_softmax(theta, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])

    return estimated_rewards
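# An illustrative call to actor_critic_mc (the parameter values below are
# assumptions, not tuned; eps is passed through to estimation.softmax as its
# second argument, presumably an exploration/temperature schedule):
rewards = actor_critic_mc(lr=0.001, l=0.8, baseparams={'order': 3},
                          eps=lambda episode: 1.0, epoch=100, base='fourier')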