def seed(self, seed=None):
    self.np_random, seed1 = seeding.np_random(seed)
    # Derive a random seed. This gets passed as a uint, but gets
    # checked as an int elsewhere, so we need to keep it below
    # 2**31.
    seed2 = seeding.hash_seed(seed1 + 1) % 2**31
    # Empirically, we need to seed before loading the ROM.
    self.ale.setInt(b'random_seed', seed2)
    self.ale.loadROM(self.game_path)

    if self.game_mode is not None:
        modes = self.ale.getAvailableModes()
        assert self.game_mode in modes, (
            "Invalid game mode \"{}\" for game {}.\nAvailable modes are: {}"
        ).format(self.game_mode, self.game, modes)
        self.ale.setMode(self.game_mode)

    if self.game_difficulty is not None:
        difficulties = self.ale.getAvailableDifficulties()
        assert self.game_difficulty in difficulties, (
            "Invalid game difficulty \"{}\" for game {}.\nAvailable difficulties are: {}"
        ).format(self.game_difficulty, self.game, difficulties)
        self.ale.setDifficulty(self.game_difficulty)

    return [seed1, seed2]
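# Hypothetical usage sketch (not part of the original code): shows how the two
# seeds returned above might be recorded to reproduce an Atari run.
# `make_atari_env` is an illustrative factory name, not an actual API.
def seeded_env(make_atari_env, seed=0):
    env = make_atari_env()
    seed1, seed2 = env.seed(seed)   # seed1 -> np_random, seed2 -> ALE 'random_seed'
    return env, (seed1, seed2)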
def main():
    np_random, _ = seeding.np_random(None)
    np.random.seed()
    eta = 0.0
    reward_noise = 1.0
    P_init = 10.
    theta_noise = None
    env = models.Inv_Pendulum()
    test_env = models.Inv_Pendulum()
    x = KTD_Q(phi=env.phi[0], gamma=0.95, P_init=P_init,
              theta0=np.zeros(30,), theta_noise=theta_noise,
              eta=eta, reward_noise=reward_noise, anum=3, kappa=10.0)
    performance = []
    episode = 0
    step = 0
    state = env.reset(np_random)
    while episode < 1000:
        action = np.random.choice(3,)
        reward, n_state, done = env.observe(state, action)
        x.update_V(state, action, n_state, reward)
        state = n_state
        step += 1
        if done or (step > 3000):
            if episode % 50 == 0:
                performance.append(test(x, test_env))
                print("After %d steps, Episode %d: %d" % (step, episode, performance[-1]))
            episode += 1
            step = 0
            state = env.reset(np_random)
    plt.plot(performance)
    plt.show()
def __init__(self, env_name, discount, TH, memory_size=None):
    """A base class.

    Parameters
    ----------
    env_name : experimental domain name in models.py
    discount : the discount factor in MDP
    TH : finite-time horizon (maximum learning steps)
    memory_size : Experience Replay memory size
    """
    self.env = envs.make(env_name, type='classic_mdp')
    self.discount = discount
    self.states = []
    self.actions = []
    self.rewards = []
    self.np_random, _ = seeding.np_random(None)
    self.test_counts = []
    self.test_rewards = []
    self.Q_err = []
    self.Q_target = np.array(self.env.optQ(self.discount)).astype(np.float16)
    self.visits = np.zeros((self.env.snum, self.env.anum))
    self.memory_size = memory_size
    self.replayMem = {(i, j): [] for i in range(self.env.snum)
                      for j in range(self.env.anum)}
    if TH is not None:
        self.env.set_time(TH)
def test(x, env):
    np_random, _ = seeding.np_random(None)
    state = env.reset(np_random)
    step = 0
    done = False
    while (not done) and (step < 3000):
        action = np.argmax([np.dot(x.theta, x.phi(state, a)) for a in range(x.anum)])
        reward, n_state, done = env.observe(state, action)
        step += 1
        state = n_state
    return step
def __init__(self, scene, discount, init_policy, TH, useGym, memory_size):
    self.scene = scene
    self.obj = gym.make(scene) if useGym else mdl.model_assign(scene)
    self.discount = discount
    self.states = []
    self.actions = []
    self.rewards = []
    self.init_policy = init_policy
    self.np_random, _ = seeding.np_random(None)
    self.test_counts = []
    self.test_rewards = []
    self.Q_err = []
    self.visits = np.zeros((self.obj.snum, self.obj.anum))
    self.useGym = useGym
    self.replayMem = []
    self.memory_size = memory_size
    if TH is not None:
        self.obj.set_time(TH)
def __init__(self, scene, discount, initQ, TH, memory_size):
    """Tabular RL

    Parameters
    ----------
    scene : a name of the task you want to test (see models.py)
    discount : discount factor in MDP
    initQ : initial Q value (initialize all Q values with the same number)
    TH : finite-time horizon (maximum learning steps)
    memory_size : Experience Replay memory size
    """
    self.env = envs.make(scene)
    self.discount = discount
    self.states, self.actions, self.rewards = [], [], []
    self.visits = np.zeros((self.env.snum, self.env.anum), dtype=int)
    self.np_random, _ = seeding.np_random(None)
    self.test_counts = []
    self.test_rewards = []
    self.dim = (self.env.snum, self.env.anum)
    if initQ is None:
        self.init_params()
    else:
        self.Q = initQ * np.ones(self.dim, dtype=float)
    if hasattr(self.env, 'terminal_states'):
        for ts in self.env.terminal_states:
            self.Q[ts, :] = 0.0
    self.Q_err = []
    self.Q_target = np.array(self.env.optQ(self.discount)).astype(np.float16)
    self.memory_size = memory_size
    self.replayMem = {(i, j): [] for i in range(self.env.snum)
                      for j in range(self.env.anum)}
    if TH is not None:
        self.env.set_time(TH)
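# Hypothetical usage sketch (not part of the original code): assumes a concrete
# tabular agent subclass built on this __init__ and a task name registered in
# models.py; both names below are illustrative only.
# agent = TabularAgent('loop', discount=0.9, initQ=0.0, TH=5000, memory_size=None)
# agent.Q.shape == (agent.env.snum, agent.env.anum)   # Q-table initialized to initQ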
def seed(self, seed=None):
    self.np_random, seed = np_random(seed)
    return [seed]
def _seed(self, seed=None):
    self.np_random, seed = seeding.np_random(seed)
    self.game.np_random = self.np_random
    return seed
def seed(self, seed=None):
    self.np_random, seed = seeding.np_random(seed)
    self.curr_seed = seed
    return [seed]
def policy_iter(env, discount, threshold, T=5000):
    V = np.zeros(env.snum)
    policy = np.random.choice(env.anum, env.snum)
    np_random, _ = seeding.np_random(None)
    p_stable = False
    trans_dict = {}
    rew_dict = {}
    slip_prob = env.slip
    if env.stochastic_reward:
        slip_prob_r = env.slip_r
    # Build one-step transition and expected-reward tables by temporarily
    # forcing the slip probabilities to 0 or 1 and observing the outcomes.
    for state in env.eff_states:
        for action in range(env.anum):
            transM = np.zeros(env.snum)
            rewM = np.zeros(env.snum)
            if env.stochastic:
                env.slip = 0.0
                if env.stochastic_reward:
                    env.slip_r = 0.0
                    r0, s_n0, _ = env.observe(state, action, np_random)
                    transM[s_n0] = 1.0 - slip_prob
                    rewM[s_n0] = (1.0 - slip_prob_r) * r0
                    env.slip_r = 1.0
                    r1, s_n1, _ = env.observe(state, action, np_random)
                    rewM[s_n1] += slip_prob_r * r1
                    assert (s_n0 == s_n1)
                else:
                    r0, s_n0, _ = env.observe(state, action, np_random)
                    transM[s_n0] = 1.0 - slip_prob
                    rewM[s_n0] = r0
                env.slip = 1.0
                if env.stochastic_reward:
                    env.slip_r = 0.0
                    r0, s_n0, _ = env.observe(state, action, np_random)
                    transM[s_n0] = slip_prob  # slip transition occurs with probability slip_prob
                    rewM[s_n0] = (1.0 - slip_prob_r) * r0
                    env.slip_r = 1.0
                    r1, s_n1, _ = env.observe(state, action, np_random)
                    rewM[s_n1] += slip_prob_r * r1
                else:
                    r1, s_n1, _ = env.observe(state, action, np_random)
                    transM[s_n1] = slip_prob
                    rewM[s_n1] = r1
            else:
                if env.stochastic_reward:
                    env.slip_r = 0.0
                    r0, s_n0, _ = env.observe(state, action, np_random)
                    transM[s_n0] = 1.0
                    rewM[s_n0] = (1.0 - slip_prob_r) * r0
                    env.slip_r = 1.0
                    r1, s_n1, _ = env.observe(state, action, np_random)
                    if s_n1 != s_n0:
                        print("Transition is stochastic!")
                    rewM[s_n1] += slip_prob_r * r1
                else:
                    r0, s_n0, _ = env.observe(state, action, np_random)
                    transM[s_n0] = 1.0
                    rewM[s_n0] = r0
            trans_dict[(state, action)] = transM
            rew_dict[(state, action)] = rewM
    it = 0
    # Restore the original slip probabilities before iterating.
    env.slip = slip_prob
    if env.stochastic_reward:
        env.slip_r = slip_prob_r
    while not p_stable:
        # Policy evaluation: iterate the Bellman expectation backup until convergence.
        delta = 1.0
        t = 0
        while delta > threshold and t < T:
            delta = 0
            for s in env.eff_states:
                v_prev = V[s]
                V[s] = sum([trans_dict[(s, policy[s])][s_next]
                            * (rew_dict[(s, policy[s])][s_next]
                               + int((s_next < env.goal[0]) or (s_next >= env.goal[1]))
                               * discount * V[s_next])
                            for s_next in range(env.snum)])
                delta = max(delta, abs(v_prev - V[s]))
            t += 1
        # Policy improvement: act greedily with respect to the current value estimates.
        p_stable = True
        for s in env.eff_states:
            u_old = policy[s]
            q_val = [sum([trans_dict[(s, u)][s_next]
                          * (rew_dict[(s, u)][s_next]
                             + int((s_next < env.goal[0]) or (s_next >= env.goal[1]))
                             * discount * V[s_next])
                          for s_next in range(env.snum)])
                     for u in range(env.anum)]
            if max(q_val) - min(q_val) < 0.001:
                policy[s] = 0
            else:
                policy[s] = np.argmax(q_val)
            if u_old != policy[s]:
                p_stable = False
        it += 1
    print("after %d iterations" % it)
    # Recover the Q-function from the converged state values.
    Q = np.zeros((env.snum, env.anum))
    for s in env.eff_states:
        for a in range(env.anum):
            Q[s][a] = sum([trans_dict[(s, a)][s_next]
                           * (rew_dict[(s, a)][s_next]
                              + int((s_next < env.goal[0]) or (s_next >= env.goal[1]))
                              * discount * V[s_next])
                           for s_next in range(env.snum)])
    return V, Q, policy
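# Hypothetical usage sketch (not part of the original code): assumes an
# environment object from models.py exposing snum, anum, slip, stochastic,
# stochastic_reward, eff_states, goal, and observe(); the task name is
# illustrative only.
# env = envs.make('loop', type='classic_mdp')
# V, Q, policy = policy_iter(env, discount=0.95, threshold=1e-5)
# greedy_action = policy[state]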
def seed(self, seed=None):
    self.np_random, seed = seeding.np_random(seed)
def seed(self, seed=None): """Seed the PRNG of this space. """ self._np_random, seed = seeding.np_random(seed) return [seed]