Example #1
    # Assumes numpy (as np) and the parse_dataset / get_action_features
    # helpers are imported at module level.
    def fit(self, dataset):
        # Project the collected transitions onto the feature map phi and build
        # the state-action feature matrix for the actions actually taken.
        phi_state, action, reward, phi_next_state, absorbing, _ = parse_dataset(
            dataset, self.phi)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)

        # Policy iteration: repeat the least-squares solve until the weights
        # stop changing.
        norm = np.inf
        while norm > self._epsilon:
            # Evaluate the current Q-function on the next states; absorbing
            # (terminal) states contribute no future value.
            q = self.approximator.predict(phi_next_state)
            if np.any(absorbing):
                q *= 1 - absorbing.reshape(-1, 1)

            # Greedy policy improvement: pick the best next action and build
            # the corresponding state-action features.
            next_action = np.argmax(q, axis=1).reshape(-1, 1)
            phi_next_state_next_action = get_action_features(
                phi_next_state, next_action, self.mdp_info.action_space.n)

            # Accumulate the linear system A w = b.
            tmp = phi_state_action - self.mdp_info.gamma *\
                phi_next_state_next_action
            self._A += phi_state_action.T.dot(tmp)
            self._b += (phi_state_action.T.dot(reward)).reshape(-1, 1)

            # Solve for the new weights, falling back to the pseudo-inverse
            # when A is rank-deficient.
            old_w = self.approximator.get_weights()
            if np.linalg.matrix_rank(self._A) == self._A.shape[1]:
                w = np.linalg.solve(self._A, self._b).ravel()
            else:
                w = np.linalg.pinv(self._A).dot(self._b).ravel()
            self.approximator.set_weights(w)

            # Stop once the weight update falls below the tolerance.
            norm = np.linalg.norm(w - old_w)
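
For reference, each pass of the loop above accumulates a least-squares system and re-solves it. Writing \(\Phi\) for the state-action features of the dataset, \(\Phi'\) for the features of the greedy next actions, \(r\) for the rewards, \(\gamma\) for the discount and \(\epsilon\) for the tolerance (symbols introduced here only to annotate the code), the iteration is

\[
A \mathrel{+}= \Phi^{\top}\big(\Phi - \gamma\,\Phi'\big), \qquad
b \mathrel{+}= \Phi^{\top} r, \qquad
w = A^{-1} b \;\;\big(\text{or } w = A^{+} b \text{ if } A \text{ is rank-deficient}\big),
\]

stopping once \(\lVert w - w_{\text{old}} \rVert \le \epsilon\). As far as the snippet shows, this is an LSTD-Q solve inside a least-squares policy-iteration loop.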
Example #2
    # Assumes numpy-style weight vectors and the get_action_features helper
    # in scope.
    def _update(self, state, action, reward, next_state, absorbing):
        # Features of the current state and of the taken state-action pair,
        # plus the current Q-value estimate.
        phi_state = self.phi(state)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)
        q_current = self.Q.predict(phi_state, action)

        # On the first update there is no previous estimate to compare with.
        if self._q_old is None:
            self._q_old = q_current

        alpha = self.alpha(state, action)

        # Dutch eligibility-trace update.
        e_phi = self.e.dot(phi_state_action)
        self.e = self.mdp_info.gamma * self._lambda * self.e + alpha * (
            1. - self.mdp_info.gamma * self._lambda * e_phi) * phi_state_action

        # Sample the next action on-policy and evaluate its Q-value; absorbing
        # (terminal) states contribute no future value.
        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        # TD error computed against the estimate carried over from the
        # previous step.
        delta = reward + self.mdp_info.gamma * q_next - self._q_old

        # Weight update: eligibility-trace term plus a correction term.
        theta = self.Q.get_weights()
        theta += delta * self.e + alpha * (self._q_old -
                                           q_current) * phi_state_action
        self.Q.set_weights(theta)

        self._q_old = q_next
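
For reference, the update above corresponds term by term to a true-online-style SARSA(\(\lambda\)) update with dutch traces. Writing \(\phi\) for the state-action features, \(e\) for the eligibility trace, \(q\) for the current estimate of \(Q(s, a)\) and \(q_{\text{old}}\) for the estimate carried from the previous step (symbols introduced here only to annotate the code):

\[
e \leftarrow \gamma\lambda\, e + \alpha\big(1 - \gamma\lambda\, e^{\top}\phi\big)\,\phi, \qquad
\delta = r + \gamma\, q_{\text{next}} - q_{\text{old}},
\]
\[
\theta \leftarrow \theta + \delta\, e + \alpha\,\big(q_{\text{old}} - q\big)\,\phi, \qquad
q_{\text{old}} \leftarrow q_{\text{next}}.
\]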