Example #1
File: atari_env.py  Project: bluddy/lean_rl
    def seed(self, seed=None):
        self.np_random, seed1 = seeding.np_random(seed)
        # Derive a random seed. This gets passed as a uint, but gets
        # checked as an int elsewhere, so we need to keep it below
        # 2**31.
        seed2 = seeding.hash_seed(seed1 + 1) % 2**31
        # Empirically, we need to seed before loading the ROM.
        self.ale.setInt(b'random_seed', seed2)
        self.ale.loadROM(self.game_path)

        if self.game_mode is not None:
            modes = self.ale.getAvailableModes()

            assert self.game_mode in modes, (
                "Invalid game mode \"{}\" for game {}.\nAvailable modes are: {}"
            ).format(self.game_mode, self.game, modes)
            self.ale.setMode(self.game_mode)

        if self.game_difficulty is not None:
            difficulties = self.ale.getAvailableDifficulties()

            assert self.game_difficulty in difficulties, (
                "Invalid game difficulty \"{}\" for game {}.\nAvailable difficulties are: {}"
            ).format(self.game_difficulty, self.game, difficulties)
            self.ale.setDifficulty(self.game_difficulty)

        return [seed1, seed2]
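A brief usage sketch for the method above; the constructor call is an illustrative assumption, not taken from the project:

env = AtariEnv(game_path='pong.bin')   # hypothetical constructor arguments
seed1, seed2 = env.seed(42)            # seed1 seeds np_random; seed2 (< 2**31) seeds the ALE before loadROM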
Example #2
def main():
	# Seed a NumPy RandomState for the environment; the global NumPy RNG drives exploration.
	np_random, _ = seeding.np_random(None)
	np.random.seed()
	eta = 0.0
	reward_noise = 1.0
	P_init = 10.
	theta_noise = None
	env = models.Inv_Pendulum()
	test_env = models.Inv_Pendulum()
	x = KTD_Q(phi=env.phi[0], gamma=0.95, P_init=P_init, theta0=np.zeros(30,),
	          theta_noise=theta_noise, eta=eta, reward_noise=reward_noise,
	          anum=3, kappa=10.0)
	performance = []
	episode = 0
	step = 0
	state = env.reset(np_random)
	while episode < 1000:
		# Explore with uniformly random actions; KTD-Q updates from each observed transition.
		action = np.random.choice(3,)
		reward, n_state, done = env.observe(state, action)
		x.update_V(state, action, n_state, reward)
		state = n_state
		step += 1
		if done or (step > 3000):
			if episode % 50 == 0:
				performance.append(test(x, test_env))
				print("After %d steps, Episode %d: %d" % (step, episode, performance[-1]))
			episode += 1
			step = 0
			state = env.reset(np_random)

	plt.plot(performance)
	plt.show()
Example #3
File: adfq.py  Project: coco66/ADFQ
 def __init__(self, env_name, discount, TH, memory_size=None):
     """A base class.
 	Parameters
 	----------
 	env_name : experimental domain name in models.py
 	discount : the discount factor in MDP
 	TH : finite-time horizon (maximum learning steps)
 	memory_size : Experience Replay memory size
     """
     self.env = envs.make(env_name, type='classic_mdp')
     self.discount = discount
     self.states = []
     self.actions = []
     self.rewards = []
     self.np_random, _ = seeding.np_random(None)
     self.test_counts = []
     self.test_rewards = []
     self.Q_err = []
     self.Q_target = np.array(self.env.optQ(self.discount)).astype(
         np.float16)
     self.visits = np.zeros((self.env.snum, self.env.anum))
     self.memory_size = memory_size
     self.replayMem = {(i, j): []
                       for i in range(self.env.snum)
                       for j in range(self.env.anum)}
     if TH is not None:
         self.env.set_time(TH)
Example #4
def test(x, env):
	np_random, _ = seeding.np_random(None)
	state = env.reset(np_random)
	step = 0
	done = False
	while (not done) and (step < 3000):
		# Act greedily with respect to the learned linear Q-function.
		action = np.argmax([np.dot(x.theta, x.phi(state, a)) for a in range(x.anum)])
		reward, n_state, done = env.observe(state, action)
		step += 1
		state = n_state
	return step
Example #5
	def __init__(self, scene, discount, init_policy, TH, useGym, memory_size):
		self.scene = scene
		self.obj = gym.make(scene) if useGym else mdl.model_assign(scene)
		self.discount = discount
		self.states = []
		self.actions = []
		self.rewards = []
		self.init_policy = init_policy
		self.np_random, _ = seeding.np_random(None)
		self.test_counts = []
		self.test_rewards = []
		self.Q_err = []
		self.visits = np.zeros((self.obj.snum, self.obj.anum))
		self.useGym = useGym
		self.replayMem = []
		self.memory_size = memory_size
		if TH is not None:
			self.obj.set_time(TH)
Example #6
    def __init__(self, scene, discount, initQ, TH, memory_size):
        """Tabular RL
		Parameters
		----------
		scene : A name of a task you want to test. (See models.py)
		alpha : learning rate of Q-learning
		discount : discount factor in MDP
		initQ : initial Q value (initialize all Q values with the same number)
		TH : finite-time horizon (maximum learning steps)
		memory_size : Experience Replay memory size
		"""
        self.env = envs.make(scene)
        self.discount = discount
        self.states, self.actions, self.rewards = [], [], []
        self.visits = np.zeros((self.env.snum, self.env.anum), dtype=int)
        self.np_random, _ = seeding.np_random(None)
        self.test_counts = []
        self.test_rewards = []
        self.dim = (self.env.snum, self.env.anum)
        if initQ is None:
            self.init_params()
        else:
            self.Q = initQ * np.ones(self.dim, dtype=float)
        if hasattr(self.env, 'terminal_states'):
            for ts in self.env.terminal_states:
                self.Q[ts, :] = 0.0
        self.Q_err = []
        self.Q_target = np.array(self.env.optQ(self.discount)).astype(
            np.float16)
        self.memory_size = memory_size
        self.replayMem = {(i, j): []
                          for i in range(self.env.snum)
                          for j in range(self.env.anum)}

        if TH is not None:
            self.env.set_time(TH)
Example #7
 def seed(self, seed=None):
     self.np_random, seed = np_random(seed)
     return [seed]
Example #8
 def _seed(self, seed=None):
     self.np_random, seed = seeding.np_random(seed)
     self.game.np_random = self.np_random
     return seed
Example #9
 def seed(self, seed=None):
     self.np_random, seed = seeding.np_random(seed)
     self.curr_seed = seed
     return [seed]
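Examples #7 through #9 only create and store the RandomState; a minimal sketch of how such an environment typically consumes it later (the state shape and ranges below are assumptions for illustration):

 def reset(self):
     # Draw the initial state from the seeded RandomState so runs are reproducible.
     self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
     return np.array(self.state)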
Example #10
def policy_iter(env, discount, threshold, T=5000):

    V = np.zeros(env.snum)
    policy = np.random.choice(env.anum, env.snum)
    np_random, _ = seeding.np_random(None)
    p_stable = False
    trans_dict = {}
    rew_dict = {}
    slip_prob = env.slip
    if env.stochastic_reward:
        slip_prob_r = env.slip_r
    for state in env.eff_states:
        for action in range(env.anum):
            transM = np.zeros(env.snum)
            rewM = np.zeros(env.snum)

            if env.stochastic:
                env.slip = 0.0
                if env.stochastic_reward:
                    env.slip_r = 0.0
                    r0, s_n0, _ = env.observe(state, action, np_random)
                    transM[s_n0] = 1.0 - slip_prob
                    rewM[s_n0] = (1.0 - slip_prob_r) * r0

                    env.slip_r = 1.0
                    r1, s_n1, _ = env.observe(state, action, np_random)
                    rewM[s_n1] += slip_prob_r * r1
                    assert (s_n0 == s_n1)
                else:
                    r0, s_n0, _ = env.observe(state, action, np_random)
                    transM[s_n0] = 1.0 - slip_prob
                    rewM[s_n0] = r0

                env.slip = 1.0
                if env.stochastic_reward:
                    env.slip_r = 0.0
                    r0, s_n0, _ = env.observe(state, action, np_random)
                    transM[s_n0] = slip_prob  # the slip outcome is weighted by the slip probability
                    rewM[s_n0] = (1.0 - slip_prob_r) * r0

                    env.slip_r = 1.0
                    r1, s_n1, _ = env.observe(state, action, np_random)
                    rewM[s_n1] += slip_prob_r * r1
                else:
                    r1, s_n1, _ = env.observe(state, action, np_random)
                    transM[s_n1] = slip_prob
                    rewM[s_n1] = r1
            else:
                if env.stochastic_reward:
                    env.slip_r = 0.0
                    r0, s_n0, _ = env.observe(state, action, np_random)
                    transM[s_n0] = 1.0
                    rewM[s_n0] = (1.0 - slip_prob_r) * r0

                    env.slip_r = 1.0
                    r1, s_n1, _ = env.observe(state, action, np_random)
                    if s_n1 != s_n0:
                        print("Transition is stochastic!")
                    rewM[s_n1] += slip_prob_r * r1
                else:
                    r0, s_n0, _ = env.observe(state, action, np_random)
                    transM[s_n0] = 1.0
                    rewM[s_n0] = r0

            trans_dict[(state, action)] = transM
            rew_dict[(state, action)] = rewM
    it = 0
    env.slip = slip_prob
    if env.stochastic_reward:
        env.slip_r = slip_prob_r
    while (not p_stable):
        delta = 1.0
        t = 0
        while (delta > threshold and t < T):
            delta = 0
            for s in env.eff_states:
                v_prev = V[s]
                V[s] = sum([ trans_dict[(s,policy[s])][s_next] * (rew_dict[(s,policy[s])][s_next] \
                  + int((s_next<env.goal[0]) or (s_next>=env.goal[1]))*discount*V[s_next]) \
                  for s_next in range(env.snum)])
                delta = max(delta, abs(v_prev - V[s]))
            t += 1
        p_stable = True
        for s in env.eff_states:
            u_old = policy[s]
            q_val = [sum([ trans_dict[(s,u)][s_next] * (rew_dict[(s,u)][s_next] \
              + int((s_next<env.goal[0]) or (s_next>=env.goal[1]))*discount*V[s_next]) \
              for s_next in range(env.snum)]) for u in range(env.anum)]

            if max(q_val) - min(q_val) < 0.001:
                policy[s] = 0
            else:
                policy[s] = np.argmax(q_val)
                if not (u_old == policy[s]):
                    p_stable = False
        it += 1
    print("after %d iterations" % it)
    Q = np.zeros((env.snum, env.anum))
    for s in env.eff_states:
        for a in range(env.anum):
            Q[s][a] = sum([ trans_dict[(s,a)][s_next] * (rew_dict[(s,a)][s_next] \
               + int((s_next<env.goal[0]) or (s_next>=env.goal[1]))*discount*V[s_next]) \
               for s_next in range(env.snum)])

    return V, Q, policy
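A minimal usage sketch for policy_iter, assuming an environment built with the same envs module used elsewhere in these examples (the task name and threshold are illustrative assumptions):

env = envs.make('loop', type='classic_mdp')   # hypothetical task name
V, Q, policy = policy_iter(env, discount=0.95, threshold=0.01)
print(policy)                                  # greedy action chosen for each state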
Example #11
 def seed(self, seed=None):
     self.np_random, seed = seeding.np_random(seed)
Example #12
 def seed(self, seed=None):
     """Seed the PRNG of this space. """
     self._np_random, seed = seeding.np_random(seed)
     return [seed]
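A space seeded this way typically draws samples from the stored RandomState; a minimal sketch (the Discrete-style space and its self.n attribute are assumptions):

 def sample(self):
     # Sketch only: sample an element of a Discrete-style space from the seeded RNG.
     return self._np_random.randint(self.n)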