Example #1
File: OC.py  Project: konichuvak/hrl
    def choose_option(self, state, epsilon=0.01):
        """ Picks the optimal option in an epsilon-greedy way """
        if random.random() > epsilon:
            option = self.options[randargmax(self.Q[state])]
        else:
            option = random.choice(self.options)
        return option
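
All of these examples rely on a `randargmax` helper that is not shown. A minimal sketch of what it presumably does (argmax with ties broken uniformly at random, so early indices are not systematically favoured in untrained value tables); this sketch is an assumption, not the repository's code:

import numpy as np

def randargmax(values, rng=np.random):
    """ Hypothetical helper: index of the maximum entry, ties broken at random """
    values = np.asarray(values)
    candidates = np.flatnonzero(values == values.max())
    return int(rng.choice(candidates))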
Example #2
File: policies.py  Project: konichuvak/hrl
    def pmf(self, state, action_values):
        """ ε-greedy distribution over the options available in `state` """
        if isinstance(action_values, torch.Tensor):
            action_values = action_values.detach().numpy().squeeze()
        probs = np.array([o.initiation(state) for o in self.options],
                         dtype=float)
        # Mask out options that cannot be initiated here so the argmax ignores them
        action_values[np.array(np.abs(probs - 1), dtype=bool)] = -np.inf
        # ε spread over the available non-greedy options, 1 - ε on the greedy one
        probs[probs == 1] = self.ε / (probs.sum() - 1)
        probs[randargmax(action_values)] = 1 - self.ε
        assert np.isclose(np.sum(probs), 1.)
        return probs
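
The construction above spreads ε uniformly over the non-greedy options that can be initiated in `state` and gives the greedy available option the remaining 1 − ε. A toy, self-contained illustration of the same arithmetic (numbers invented for clarity):

import numpy as np

eps = 0.1
initiation = np.array([1., 0., 1., 1.])      # which options are available in this state
action_values = np.array([0.2, 0.9, 0.5, 0.1])

masked = action_values.copy()
masked[initiation == 0] = -np.inf            # an unavailable option can never be the greedy pick
probs = initiation.copy()
probs[probs == 1] = eps / (probs.sum() - 1)  # ε split across the available non-greedy options
probs[np.argmax(masked)] = 1 - eps           # the greedy available option gets the rest
print(probs)                                 # [0.05 0.   0.9  0.05], sums to 1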
Example #3
    def _q_learning(self,
                    q_values: np.ndarray = None,
                    epsilon: float = 0.1,
                    alpha: float = 0.1,
                    gamma: float = 0.9):
        """ Learns the optimal policy to get to the hallway from anywhere within a room """

        # State is the agent's direction plus its (x, y) cell in the 10x10 grid
        # Actions are the primitives {left, right, forward}
        if q_values is None:
            q_values = np.zeros((3, 4, 10, 10))

        state = (
            self.env.agent_dir,
            *self.env.agent_pos,
        )
        done = False

        while not done:
            # self.env.render()
            # time.sleep(0.0005)

            a = randargmax(q_values[:, state[0], state[1], state[2]])
            a = self._epsilon_greedy(a, epsilon)
            obs, reward, done, info = self.env.step(a)

            # Note: we could infer the state of the agent from obs, but get it directly instead
            state_next = (self.env.agent_dir, *self.env.agent_pos)
            a_next = randargmax(q_values[:, state_next[0], state_next[1],
                                         state_next[2]])

            q_index = a, state[0], state[1], state[2]
            q_index_next = a_next, state_next[0], state_next[1], state_next[2]
            q_values[q_index] += alpha * (reward + gamma *
                                          (q_values[q_index_next]) -
                                          q_values[q_index])

            state = state_next

        return q_values
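
`self._epsilon_greedy(a, epsilon)` is not shown in this excerpt. A plausible standalone sketch of its behaviour (an assumption, with the three primitive actions hard-coded for illustration):

import random

def epsilon_greedy(a, epsilon, n_actions=3):
    """ Hypothetical sketch: keep the greedy action with probability 1 - epsilon,
        otherwise pick one of the n_actions primitive actions uniformly at random """
    if random.random() < epsilon:
        return random.randrange(n_actions)
    return a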
Example #4
    def sample(self, phi):
        """ ε-greedy action selection: random action with probability ε, else greedy """
        if self.rng.uniform() < self.epsilon:
            return int(self.rng.randint(self.weights.shape[1]))
        return randargmax(self.value(phi))
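
`self.weights`, `self.value`, and `self.rng` belong to the surrounding class, which is not shown. A reasonable guess is a linear action-value approximator with one column of weights per action; a hypothetical reconstruction for illustration only (the `sample` method above, together with the `randargmax` sketch earlier, would slot into it):

import numpy as np

class LinearEGreedy:
    """ Hypothetical reconstruction: weights have shape (n_features, n_actions) """
    def __init__(self, n_features, n_actions, epsilon=0.1, seed=0):
        self.weights = np.zeros((n_features, n_actions))
        self.epsilon = epsilon
        self.rng = np.random.RandomState(seed)

    def value(self, phi):
        # One estimated value per action for feature vector phi
        return phi @ self.weights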
Example #5
File: SMDP.py  Project: konichuvak/hrl
    def q_learning(self,
                   n_episodes: int,
                   γ: float = 0.9,
                   Q: np.ndarray = None,
                   N: np.ndarray = None,
                   α: float = None,
                   render: bool = False):

        env = self.env.unwrapped
        n_options = len(self.options)
        state_space_dim = (4, env.width, env.height)
        dim = (n_options, *state_space_dim)

        if Q is None:
            Q = np.zeros(dim)
        if N is None:
            N = np.zeros(dim)

        for episode in range(n_episodes):

            self.env.reset()
            state = (env.agent_dir, *reversed(env.agent_pos))
            executing_option = self.policy(Q, state)
            done = False

            while not done:

                # Step through environment
                a = executing_option.policy(state)
                obs, reward, done, info = self.env.step(a)
                # TODO: infer the state of the agent from obs, i.e. make it POMDP
                s_next = (env.agent_dir, *reversed(env.agent_pos))

                if render:
                    action_name = list(env.actions)[a].name
                    self.logger.debug(f"State: {state}, "
                                      f"Option: {executing_option}, "
                                      f"Action: {action_name}, "
                                      f"Next State: {s_next}")
                    self.env.render()
                    time.sleep(0.05)

                # Update option
                executing_option.k += 1
                executing_option.cumulant += γ**executing_option.k * reward

                # Check for termination condition and update action-values
                if executing_option.termination_function(s_next) == 1 or done:

                    start_state = (self.option_idx_dict[executing_option.name],
                                   *executing_option.starting_state)

                    # Determine the step-size
                    if α is None:
                        N[start_state] += 1
                        alpha = 1 / N[start_state]
                    else:
                        alpha = α

                    # Update Q in the direction of the optimal action
                    r = executing_option.cumulant
                    k = executing_option.k
                    o = randargmax(Q[(slice(None), *s_next)])
                    target = r + γ**k * Q[(o, *s_next)]
                    Q[start_state] += alpha * (target - Q[start_state])

                    # Choose the next option
                    executing_option = self.policy(Q, s_next)

                # Move on to the next state
                state = s_next
            yield Q, self.env.step_count

        return Q, self.env.step_count
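
The update applied in the termination branch is the SMDP Q-learning rule Q(s, o) += α * (r + γ**k * max_o' Q(s', o') - Q(s, o)), where r is the discounted reward accumulated while the option ran, k is how many steps it took, and s' is the state in which it terminated. A toy numeric illustration of a single update (values invented for clarity):

alpha, gamma = 0.5, 0.9
r, k = 2.0, 3                      # option's accumulated discounted reward and duration
q_start, q_next_best = 1.0, 4.0    # Q[start_state] and max over options at the termination state

target = r + gamma ** k * q_next_best    # 2.0 + 0.729 * 4.0 = 4.916
q_start += alpha * (target - q_start)    # 1.0 + 0.5 * (4.916 - 1.0) = 2.958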