def expectedStep(self, s, a):
    # Returns k possible outcomes
    #  p: k-by-1    probability of each transition
    #  r: k-by-1    rewards
    # ns: k-by-|s|  next state
    #  t: k-by-1    terminal values
    # pa: k-by-??   possible actions for each next state
    actions = self.possibleActions(s)
    k = len(actions)
    # Make Probabilities
    intended_action_index = findElemArray1D(a, actions)
    p = np.ones((k, 1)) * self.NOISE / (k * 1.)
    p[intended_action_index, 0] += 1 - self.NOISE
    # Make next states
    ns = np.tile(s, (k, 1)).astype(int)
    actions = self.ACTIONS[actions]
    ns += actions
    # Make next possible actions
    pa = np.array([self.possibleActions(sn) for sn in ns])
    # Make rewards
    r = np.ones((k, 1)) * self.STEP_REWARD
    goal = self.map[ns[:, 0], ns[:, 1]] == self.GOAL
    pit = self.map[ns[:, 0], ns[:, 1]] == self.PIT
    r[goal] = self.GOAL_REWARD
    r[pit] = self.PIT_REWARD
    # Make terminals
    t = np.zeros((k, 1), bool)
    t[goal] = True
    t[pit] = True
    return p, r, ns, t, pa
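
# --- Usage sketch (not library code) ---------------------------------------
# A minimal sketch of how expectedStep can drive a one-step Bellman backup.
# It assumes a GridWorld-style rlpy domain instance `domain`, a 2-D value
# array `V` with one entry per grid cell, and a discount factor `gamma`;
# these names are illustrative, not part of the source above.
import numpy as np

def one_step_backup(domain, V, s, a, gamma=0.9):
    # Expected immediate reward plus discounted value over all possible outcomes.
    p, r, ns, t, pa = domain.expectedStep(s, a)
    # Terminal outcomes contribute their reward only; the rest add the
    # discounted value of the next cell.
    v_next = np.array([[0.0] if term else [V[sn[0], sn[1]]]
                       for sn, term in zip(ns, t[:, 0])])
    return float(np.sum(p * (r + gamma * v_next)))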
def top(self, A, s):
    # Returns the block on top of block A. Returns an empty array if nothing
    # is on top of A.
    on_A = findElemArray1D(A, s)
    on_A = np.setdiff1d(on_A, [A])  # s[i] == i encodes that block i is on the table.
    return on_A
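
# --- Illustrative snippet (an assumption about the encoding, not library code)
# The comment above implies the BlocksWorld state stores, for each block i, the
# block it rests on, with s[i] == i meaning block i is on the table. A plain
# NumPy version of the same lookup:
import numpy as np

# Three blocks: 0 on the table, 1 stacked on 0, 2 on the table.
s_example = np.array([0, 0, 2])

def top_of(A, s):
    # Blocks whose entry equals A rest on A; drop A itself, which only matches
    # when A sits on the table (s[A] == A).
    on_A = np.flatnonzero(s == A)
    return np.setdiff1d(on_A, [A])

print(top_of(0, s_example))  # -> [1]
print(top_of(2, s_example))  # -> []  (nothing on block 2)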
def bestActions(self, s, terminal, p_actions, phi_s=None):
    """
    Returns a list of the best actions at a given state.
    If *phi_s* [the feature vector at state *s*] is given, it is used to
    speed up code by preventing re-computation within this function.

    See :py:meth:`~rlpy.Representations.Representation.Representation.bestAction`

    :param s: The given state
    :param terminal: Whether or not the state *s* is a terminal one.
    :param p_actions: The indices of the actions available in state *s*.
    :param phi_s: (optional) the feature vector at state (s).

    :return: A list of the best actions at the given state.
    """
    Qs = self.Qs(s, terminal, phi_s)
    Qs = Qs[p_actions]
    # Find the indices of the best actions
    ind = findElemArray1D(Qs, Qs.max())
    return np.array(p_actions)[ind]
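
# --- Usage sketch (not library code) ---------------------------------------
# Assuming `representation` is a trained rlpy Representation and `domain`
# exposes possibleActions, a greedy policy can break ties among the returned
# best actions at random; the names here are illustrative.
import numpy as np

def greedy_action(representation, domain, s, terminal, rng=np.random):
    p_actions = domain.possibleActions(s)
    best = representation.bestActions(s, terminal, p_actions)
    # Every entry of `best` attains the maximal Q-value, so any of them is an
    # optimal greedy choice.
    return rng.choice(best)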