def calculate_expected_phi_ns_na(self, s, a, ns_samples):
    # Calculate the expected next feature vector, phi(ns, pi(ns)), given s
    # and a. Eqns. 2.20 and 2.25 in [Geramifard et al. 2012 FTML paper].
    if hasFunction(self.domain, 'expectedStep'):
        p, r, ns, t = self.domain.expectedStep(s, a)
        phi_ns_na = np.zeros(
            self.representation.features_num *
            self.domain.actions_num)
        for j in xrange(len(p)):
            na = self.policy.pi(ns[j])
            # Weight each possible next state's feature vector by its
            # transition probability.
            phi_ns_na += p[j] * self.representation.phi_sa(ns[j], na)
    else:
        # Monte Carlo estimate: average the feature vectors of the sampled
        # next states; axis=0 keeps the result a feature vector.
        next_states, rewards = self.domain.sampleStep(s, a, ns_samples)
        phi_ns_na = np.mean(
            [self.representation.phi_sa(next_states[i],
                                        self.policy.pi(next_states[i]))
             for i in xrange(ns_samples)],
            axis=0)
    return phi_ns_na
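
When the domain exposes expectedStep, the loop above is just the probability-weighted sum E[phi(ns, pi(ns)) | s, a] = sum_j p_j * phi(ns_j, pi(ns_j)). Below is a minimal, self-contained sketch of that sum using a toy tabular one-hot phi_sa and a fixed policy; phi_sa, pi, p, and ns here are illustrative stand-ins, not the rlpy API.

import numpy as np

n_states, n_actions = 3, 2
features_num = n_states  # one-hot tabular features, for illustration only

def phi_sa(s, a):
    # Place the one-hot state features in the block belonging to action a.
    phi = np.zeros(features_num * n_actions)
    phi[a * features_num + s] = 1.0
    return phi

def pi(s):
    # Toy deterministic policy: always pick action 0.
    return 0

# Hypothetical output of expectedStep(s, a): next states and probabilities.
p = np.array([0.7, 0.2, 0.1])
ns = np.array([0, 1, 2])

expected_phi = np.zeros(features_num * n_actions)
for j in range(len(p)):
    expected_phi += p[j] * phi_sa(ns[j], pi(ns[j]))

print(expected_phi)  # [0.7 0.2 0.1 0.  0.  0. ]
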
def calculate_expected_phi_ns_na(self, s, a, ns_samples):
    # Calculate the expected next feature vector, phi(ns, pi(ns)), given s
    # and a. Eqns. 2.20 and 2.25 in [Geramifard et al. 2012 FTML paper].
    if hasFunction(self.domain, 'expectedStep'):
        p, r, ns, t, pa = self.domain.expectedStep(s, a)
        phi_ns_na = np.zeros(self.representation.features_num *
                             self.domain.actions_num)
        for j in xrange(len(p)):
            na = self.policy.pi(ns[j], t[j], pa[j])
            phi_ns_na += p[j] * self.representation.phi_sa(ns[j], t[j], na)
    else:
        next_states, rewards = self.domain.sampleStep(s, a, ns_samples)
        # Average the sampled next feature vectors along axis 0 so the
        # result remains a feature vector rather than a scalar.
        phi_ns_na = np.mean([
            self.representation.phi_sa(next_states[i],
                                       self.policy.pi(next_states[i]))
            for i in xrange(ns_samples)
        ], axis=0)
    return phi_ns_na
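
In the sampleStep branch the estimate is the sample mean of the next feature vectors. A small illustrative snippet (the random feature vectors below are placeholders, not rlpy output) shows why the mean must be taken along axis 0:

import numpy as np

rng = np.random.default_rng(0)
ns_samples = 5
# Pretend these are phi_sa(ns_i, pi(ns_i)) for five sampled next states.
sampled_phis = [rng.integers(0, 2, size=6).astype(float)
                for _ in range(ns_samples)]

print(np.mean(sampled_phis))          # scalar: mean of every element
print(np.mean(sampled_phis, axis=0))  # length-6 expected feature vector
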
Example #3
    def Q_oneStepLookAhead(self, s, a, ns_samples, policy=None):
        """
        Returns the state-action value, Q(s,a), by performing a one-step
        look-ahead on the domain.

        .. note::
            For an example of how this function works, see
            `Line 8 of Figure 4.3 <http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node43.html>`_
            in Sutton and Barto 1998.

        If the domain does not define ``expectedStep()``, this function uses
        ``ns_samples`` samples to estimate the one-step look-ahead.
        If a policy is passed (as in policy evaluation), it is used to
        generate the action for the next state.
        Otherwise the best action is selected.

        .. note::
            This function should not be called from any RL algorithm unless
            the underlying domain is an approximation of the true model.

        :param s: The given state
        :param a: The given action
        :param ns_samples: The number of samples used to estimate the one-step look-ahead.
        :param policy: (optional) Used to select the action in the next state
            (*after* taking action a) when estimating the one-step look-ahead.
            If ``policy == None``, the best action will be selected.

        :return: The one-step lookahead state-action value, Q(s,a).
        """
        self.continuous_state_starting_samples = 10
        # Hash the new state for the incremental tabular case
        if hasFunction(self, 'addState'):
            self.addState(s)

        discount_factor = self.domain.discount_factor
        if hasFunction(self.domain, 'expectedStep'):
            p, r, ns, t, p_actions = self.domain.expectedStep(s, a)
            Q = 0
            for j in range(len(p)):
                if policy is None:
                    Q += p[j, 0] * (r[j, 0] + discount_factor *
                                    self.V(ns[j, :], t[j, :], p_actions[j]))
                else:
                    # For some domains, such as BlocksWorld, the Bellman
                    # backup may be applied to impossible states that have
                    # no possible actions. This check makes sure at least
                    # one action exists in the next state, so the backup
                    # with the fixed policy is valid.
                    if len(self.domain.possibleActions(ns[j, :])):
                        na = policy.pi(ns[j, :], t[j, :],
                                       self.domain.possibleActions(ns[j, :]))
                        Q += p[j, 0] * (r[j, 0] + discount_factor *
                                        self.Q(ns[j, :], t[j, :], na))
        else:
            # See if they are in cache:
            key = tuple(np.hstack((s, [a])))
            cacheHit = self.expectedStepCached.get(key)
            if cacheHit is None:
                # Not found in cache => calculate and store in cache.
                # For a continuous domain, sample
                # <continuous_state_starting_samples> points within the
                # discretized grid cell of s and draw <ns_samples> /
                # <continuous_state_starting_samples> next states from each;
                # otherwise take <ns_samples> samples for the state.

                # First put s in the middle of the grid:
                # shout(self,s)
                s = self.stateInTheMiddleOfGrid(s)
                # print "After:", shout(self,s)
                if len(self.domain.continuous_dims):
                    next_states = np.empty(
                        (ns_samples, self.domain.state_space_dims))
                    rewards = np.empty(ns_samples)
                    # number of next-state samples per sampled initial state
                    ns_samples_ = old_div(
                        ns_samples, self.continuous_state_starting_samples)
                    for i in range(self.continuous_state_starting_samples):
                        # sample a random state within the grid cell
                        # corresponding to the input s
                        new_s = s.copy()
                        for d in range(self.domain.state_space_dims):
                            w = self.binWidth_per_dim[d]
                            # Sample each dimension of the new_s within the
                            # cell
                            new_s[d] = (self.random_state.rand() -
                                        .5) * w + s[d]
                            # If the dimension is discrete, cast the
                            # sampled value to int
                            if d not in self.domain.continuous_dims:
                                new_s[d] = int(new_s[d])
                        # print new_s
                        ns, r = self.domain.sampleStep(new_s, a, ns_samples_)
                        next_states[i * ns_samples_:(i + 1) *
                                    ns_samples_, :] = ns
                        rewards[i * ns_samples_:(i + 1) * ns_samples_] = r
                else:
                    next_states, rewards = self.domain.sampleStep(
                        s, a, ns_samples)
                self.expectedStepCached[key] = [next_states, rewards]
            else:
                # print "USED CACHED"
                next_states, rewards = cacheHit
            if policy is None:
                Q = np.mean([
                    rewards[i] + discount_factor * self.V(next_states[i, :])
                    for i in range(ns_samples)
                ])
            else:
                Q = np.mean([
                    rewards[i] + discount_factor *
                    self.Q(next_states[i, :], policy.pi(next_states[i, :]))
                    for i in range(ns_samples)
                ])
        return Q
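
Both branches above compute the same backup, Q(s,a) = sum_{s'} p(s'|s,a) * [r(s,a,s') + gamma * V(s')], either exactly from expectedStep() or as a sample mean. A minimal stand-alone sketch on a toy two-outcome model (p, r, V_next, and the sample count are made up for illustration, not rlpy data):

import numpy as np

discount_factor = 0.9

# Hypothetical expectedStep-style output for a single (s, a) pair:
# transition probabilities, rewards, and next-state values.
p = np.array([0.8, 0.2])
r = np.array([1.0, 0.0])
V_next = np.array([2.0, 5.0])

# Exact one-step lookahead: Q(s,a) = sum_j p_j * (r_j + gamma * V(ns_j)).
Q_exact = np.sum(p * (r + discount_factor * V_next))

# Sampled version, mirroring the sampleStep() branch: draw next states
# according to p and average the backed-up values.
rng = np.random.default_rng(0)
ns_samples = 10000
idx = rng.choice(len(p), size=ns_samples, p=p)
Q_sampled = np.mean(r[idx] + discount_factor * V_next[idx])

print(Q_exact)    # 3.14 (up to float rounding)
print(Q_sampled)  # approaches 3.14 as ns_samples grows
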