def calculate_expected_phi_ns_na(self, s, a, ns_samples):
    # Calculate the expected next feature vector, phi(ns, pi(ns)), given s
    # and a. Eqns 2.20 and 2.25 in [Geramifard et al. 2012 FTML Paper].
    if hasFunction(self.domain, 'expectedStep'):
        # Exact expectation: enumerate the possible next states with their
        # probabilities and accumulate the weighted feature vectors.
        p, r, ns, t, pa = self.domain.expectedStep(s, a)
        phi_ns_na = np.zeros(
            self.representation.features_num * self.domain.actions_num)
        for j in range(len(p)):
            na = self.policy.pi(ns[j], t[j], pa[j])
            phi_ns_na += p[j] * self.representation.phi_sa(ns[j], t[j], na)
    else:
        # Monte-Carlo estimate: sample ns_samples next states and average
        # the feature vectors of the (next state, policy action) pairs.
        # Sampled next states are treated as non-terminal, since
        # sampleStep() does not report terminal flags.
        next_states, rewards = self.domain.sampleStep(s, a, ns_samples)
        phi_ns_na = np.mean(
            [self.representation.phi_sa(
                next_states[i], False,
                self.policy.pi(next_states[i], False,
                               self.domain.possibleActions(next_states[i])))
             for i in range(ns_samples)],
            axis=0)  # average element-wise to keep the feature-vector shape
    return phi_ns_na
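# Illustrative note, not part of the original class: with an exact model, the
# loop above computes the expected next feature vector
#     E[phi(s', pi(s')) | s, a] = \sum_{s'} P(s' | s, a) * phi_sa(s', pi(s')),
# and the sampling branch replaces the sum with the Monte-Carlo average
#     (1 / ns_samples) * \sum_{i=1}^{ns_samples} phi_sa(s'_i, pi(s'_i)),
# which is the estimator referenced as Eqns 2.20 and 2.25 in
# [Geramifard et al. 2012 FTML Paper].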
def Q_oneStepLookAhead(self, s, a, ns_samples, policy=None):
    """
    Returns the state-action value, Q(s,a), by performing a one-step
    look-ahead on the domain.

    .. note::
        For an example of how this function works, see
        `Line 8 of Figure 4.3 <http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node43.html>`_
        in Sutton and Barto 1998.

    If the domain does not define ``expectedStep()``, this function uses
    ``ns_samples`` samples to estimate the one-step look-ahead.
    If a policy is passed (as in policy evaluation), it is used to generate
    the action for the next state; otherwise the best action is selected.

    .. note::
        This function should not be called by any RL algorithm unless
        the underlying domain is an approximation of the true model.

    :param s: The given state
    :param a: The given action
    :param ns_samples: The number of samples used to estimate the
        one-step look-ahead.
    :param policy: (optional) Used to select the action in the next state
        (*after* taking action a) when estimating the one-step look-ahead.
        If ``policy == None``, the best action will be selected.

    :return: The one-step look-ahead state-action value, Q(s,a).
    """
    self.continuous_state_starting_samples = 10
    # Hash the new state for the incremental tabular case
    if hasFunction(self, 'addState'):
        self.addState(s)

    discount_factor = self.domain.discount_factor
    if hasFunction(self.domain, 'expectedStep'):
        # Exact expectation over the enumerated next states
        p, r, ns, t, p_actions = self.domain.expectedStep(s, a)
        Q = 0
        for j in range(len(p)):
            if policy is None:
                Q += p[j, 0] * (r[j, 0] + discount_factor *
                                self.V(ns[j, :], t[j, :], p_actions[j]))
            else:
                # For some domains, such as BlocksWorld, the Bellman backup
                # may reach impossible states with no possible actions.
                # This check makes sure at least one action exists in the
                # next state, so the backup with the fixed policy is valid.
                if len(self.domain.possibleActions(ns[j, :])):
                    na = policy.pi(ns[j, :], t[j, :],
                                   self.domain.possibleActions(ns[j, :]))
                    Q += p[j, 0] * (r[j, 0] + discount_factor *
                                    self.Q(ns[j, :], t[j, :], na))
    else:
        # See if the sampled transitions are already in the cache:
        key = tuple(np.hstack((s, [a])))
        cacheHit = self.expectedStepCached.get(key)
        if cacheHit is None:
            # Not found in cache => calculate and store in cache.
            # If the domain is continuous, sample
            # <continuous_state_starting_samples> starting points within the
            # discretized grid cell of s and draw
            # <ns_samples>/<continuous_state_starting_samples> next states
            # from each. Otherwise take <ns_samples> for the state.

            # First put s in the middle of its grid cell:
            s = self.stateInTheMiddleOfGrid(s)
            if len(self.domain.continuous_dims):
                next_states = np.empty(
                    (ns_samples, self.domain.state_space_dims))
                rewards = np.empty(ns_samples)
                # Number of next states drawn per sampled starting state
                ns_samples_ = ns_samples // self.continuous_state_starting_samples
                for i in range(self.continuous_state_starting_samples):
                    # Sample a random state within the grid cell
                    # corresponding to the input s
                    new_s = s.copy()
                    for d in range(self.domain.state_space_dims):
                        w = self.binWidth_per_dim[d]
                        # Sample each dimension of new_s within the cell
                        new_s[d] = (self.random_state.rand() - .5) * w + s[d]
                        # If the dimension is discrete, cast the sampled
                        # value to an int
                        if d not in self.domain.continuous_dims:
                            new_s[d] = int(new_s[d])
                    ns, r = self.domain.sampleStep(new_s, a, ns_samples_)
                    next_states[i * ns_samples_:(i + 1) * ns_samples_, :] = ns
                    rewards[i * ns_samples_:(i + 1) * ns_samples_] = r
            else:
                next_states, rewards = self.domain.sampleStep(s, a, ns_samples)
            self.expectedStepCached[key] = [next_states, rewards]
        else:
            next_states, rewards = cacheHit
        # Monte-Carlo estimate of the one-step look-ahead. Sampled next
        # states are treated as non-terminal, since sampleStep() does not
        # report terminal flags.
        if policy is None:
            Q = np.mean(
                [rewards[i] + discount_factor *
                 self.V(next_states[i, :], False,
                        self.domain.possibleActions(next_states[i, :]))
                 for i in range(ns_samples)])
        else:
            Q = np.mean(
                [rewards[i] + discount_factor *
                 self.Q(next_states[i, :], False,
                        policy.pi(next_states[i, :], False,
                                  self.domain.possibleActions(next_states[i, :])))
                 for i in range(ns_samples)])
    return Q
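# Illustrative note, not part of the original class: the exact branch above
# evaluates the one-step look-ahead (Bellman) backup
#     Q(s, a) = \sum_{s'} P(s' | s, a) * [ r(s, a, s') + \gamma * V(s') ],
# with V(s') replaced by Q(s', policy.pi(s')) when a fixed policy is supplied
# (policy evaluation rather than the greedy backup). The sampling branch
# estimates the same quantity as the average of r_i + \gamma * V(s'_i) over
# the ns_samples cached transitions.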