Example #1
def sample_states(env, q_fn, visitation_probs, n_sample, ent_wt):
    # Draw n_sample (state, action) pairs from the flattened (dS, dA)
    # visitation distribution, then sample one successor state per pair.
    # Assumes numpy as np plus get_policy / flat_to_one_hot from the
    # surrounding module.
    dS, dA = visitation_probs.shape
    samples = np.random.choice(np.arange(dS * dA),
                               size=n_sample,
                               p=visitation_probs.reshape(dS * dA))
    policy = get_policy(q_fn, ent_wt=ent_wt)
    observations = samples // dA  # recover the state index of each sample
    actions = samples % dA        # recover the action index of each sample
    a_logprobs = np.log(policy[observations, actions])

    # Sample a next state for each pair from the tabular transition
    # distribution and store it as a one-hot vector.
    observations_next = []
    for i in range(n_sample):
        t_distr = env.tabular_trans_distr(observations[i], actions[i])
        next_state = flat_to_one_hot(np.random.choice(np.arange(len(t_distr)),
                                                      p=t_distr),
                                     ndim=dS)
        observations_next.append(next_state)
    observations_next = np.array(observations_next)

    return {
        'observations': flat_to_one_hot(observations, ndim=dS),
        'actions': flat_to_one_hot(actions, ndim=dA),
        'a_logprobs': a_logprobs,
        'observations_next': observations_next
    }
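
A helper these snippets rely on but do not define is flat_to_one_hot. As a rough sketch only, consistent with how it is called here (a single integer index, or an array of indices, mapped to one-hot vectors of length ndim), it could look like the code below; the actual helper in the source module may differ in details such as dtype or argument handling.

import numpy as np

def flat_to_one_hot(val, ndim):
    # Convert an integer index (or an array of indices) into one-hot
    # vector(s) of length ndim: scalar -> (ndim,), shape (N,) -> (N, ndim).
    val = np.asarray(val)
    out = np.zeros(val.shape + (ndim,))
    if val.ndim == 0:
        out[int(val)] = 1.0
    else:
        out[np.arange(val.shape[0]), val] = 1.0
    return out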
Example #2
    def step(self, a):
        # Sample the next state from the tabular transition distribution
        # for the current state under action a.
        transition_probs = self.transitions[self.cur_state, a]
        next_state = np.random.choice(np.arange(self.nstates), p=transition_probs)
        r = self.reward[self.cur_state, a, next_state]
        self.cur_state = next_state
        obs = flat_to_one_hot(self.cur_state, ndim=self.nstates)

        # Optionally end the episode as soon as a positive reward is collected.
        done = False
        if self.terminate_on_reward and r > 0:
            done = True
        return obs, r, done, {}
Example #3
    def initial_state_distribution(self):
        # One-hot distribution concentrated on the fixed initial state.
        return flat_to_one_hot(self.init_state, ndim=self.nstates)
Example #4
    def reset(self):
        # Reset to the fixed initial state and return its one-hot observation.
        self.cur_state = self.init_state
        obs = flat_to_one_hot(self.cur_state, ndim=self.nstates)
        return obs
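
The reset/step pair above follows the classic Gym-style convention of returning (obs, reward, done, info). As a self-contained usage sketch only, the toy below wires a stand-in environment with the same interface to a random-action rollout; the class name, constructor arguments, and reward layout are illustrative assumptions, not taken from the original codebase.

import numpy as np

def flat_to_one_hot(val, ndim):
    out = np.zeros(ndim)
    out[val] = 1.0
    return out

class RandomTabularEnv:
    # Stand-in environment exposing the same reset()/step() interface as the
    # snippets above: random transition table, reward for reaching the last state.
    def __init__(self, nstates=5, nactions=2, terminate_on_reward=True, seed=0):
        rng = np.random.RandomState(seed)
        self.nstates = nstates
        self.nactions = nactions
        self.terminate_on_reward = terminate_on_reward
        self.transitions = rng.dirichlet(np.ones(nstates), size=(nstates, nactions))
        self.reward = np.zeros((nstates, nactions, nstates))
        self.reward[:, :, nstates - 1] = 1.0
        self.init_state = 0

    def reset(self):
        self.cur_state = self.init_state
        return flat_to_one_hot(self.cur_state, ndim=self.nstates)

    def step(self, a):
        transition_probs = self.transitions[self.cur_state, a]
        next_state = np.random.choice(np.arange(self.nstates), p=transition_probs)
        r = self.reward[self.cur_state, a, next_state]
        self.cur_state = next_state
        obs = flat_to_one_hot(self.cur_state, ndim=self.nstates)
        done = bool(self.terminate_on_reward and r > 0)
        return obs, r, done, {}

env = RandomTabularEnv()
obs = env.reset()
total_reward = 0.0
for t in range(100):
    a = np.random.randint(env.nactions)  # random policy, purely for illustration
    obs, r, done, _ = env.step(a)
    total_reward += r
    if done:
        break
print('episode return:', total_reward)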