from typing import List, Tuple

import numpy as np


def true_values_for_sample(self, states, actions, assume_optimal_policy: bool):
    """Convert feature-encoded actions to string action names, then delegate
    to the base-class ground-truth value computation."""
    string_actions = [self.features_to_action(action) for action in actions]
    return GridworldBase.true_values_for_sample(
        self, states, string_actions, assume_optimal_policy
    )
def transition_probabilities(self, state, action) -> np.ndarray:
    """Return the next-state distribution; the cheat action is deterministic."""
    if action == "C":
        # Cheating teleports to a fixed next state, so the distribution over
        # the width * height grid cells is one-hot.
        next_state = self._cheat_step(state)
        probabilities = np.zeros((self.width * self.height,))
        probabilities[next_state] = 1
        return probabilities
    else:
        return GridworldBase.transition_probabilities(self, state, action)
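# A minimal sanity-check sketch, not part of the original class: `env` stands in
# for any instance exposing the methods above.
def _check_cheat_is_one_hot(env, state: int) -> None:
    """Verify that the cheat action induces a deterministic (one-hot) transition."""
    probs = env.transition_probabilities(state, "C")
    assert probs.shape == (env.width * env.height,)
    assert probs.sum() == 1.0 and (probs == 1.0).sum() == 1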
def true_values_for_sample(self, states, actions, assume_optimal_policy: bool):
    """Decode one-hot enum action features back to string action names, then
    delegate to the base-class ground-truth value computation."""
    string_actions = []
    for action in actions:
        # Each action is a single-key dict; the key is the action's feature id,
        # which is offset by num_states, so subtracting recovers the index
        # into ACTIONS.
        feature_id = int(list(action.keys())[0])
        string_actions.append(self.ACTIONS[feature_id - self.num_states])
    return GridworldBase.true_values_for_sample(
        self, states, string_actions, assume_optimal_policy
    )
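# Worked decoding example (all concrete values assumed for illustration): on a
# 5x5 grid, num_states == 25 and enum action features are one-hot dicts keyed
# by num_states + action_index, so:
#
#     ACTIONS = ["L", "R", "U", "D", "C"]        # hypothetical ordering
#     action = {27: 1.0}
#     ACTIONS[int(list(action.keys())[0]) - 25]  # -> "U"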
def possible_next_actions(self, state, ignore_terminal=False) -> List[str]:
    if ignore_terminal is False and self.is_terminal(state):
        return []
    possible_actions = GridworldBase.possible_next_actions(
        self, state, ignore_terminal
    )
    if ignore_terminal is False:
        # When ignoring terminal states, also leave out the cheat action.
        possible_actions.append("C")
    return possible_actions
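# A hedged check of the availability rule above: the cheat action should appear
# exactly when terminal states are not being ignored (`env` is a hypothetical
# instance).
def _check_cheat_availability(env, state: int) -> None:
    with_cheat = env.possible_next_actions(state)
    without_cheat = env.possible_next_actions(state, ignore_terminal=True)
    assert ("C" in with_cheat) or env.is_terminal(state)
    assert "C" not in without_cheat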
def step(self, action: str, with_possible=True) -> Tuple[int, float, bool, List[str]]:
    if action == "C":
        self._state: int = self._cheat_step(self._state)
        reward = self.reward(self._state)
        possible_next_action = self.possible_next_actions(self._state)
        return (
            self._state,
            reward,
            self.is_terminal(self._state),
            possible_next_action,
        )
    else:
        # Forward with_possible so the flag is not silently dropped when
        # delegating non-cheat actions to the base class.
        return GridworldBase.step(self, action, with_possible)
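# A short rollout sketch under stated assumptions: `env` is a hypothetical
# instance, episodes terminate within `max_steps`, and reading env._state
# directly mirrors how the methods above use it.
def _example_cheat_rollout(env, max_steps: int = 100) -> float:
    """Roll out one episode, always cheating when the cheat action is offered."""
    total_reward = 0.0
    for _ in range(max_steps):
        possible = env.possible_next_actions(env._state)
        if not possible:  # terminal state: no actions remain
            break
        action = "C" if "C" in possible else possible[0]
        _, reward, terminal, _ = env.step(action)
        total_reward += reward
        if terminal:
            break
    return total_reward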
def true_rewards_for_sample(self, states, actions):
    string_actions = [self.features_to_action(action) for action in actions]
    return GridworldBase.true_rewards_for_sample(self, states, string_actions)
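# Usage sketch tying the feature-to-string conversion together; `env`, the
# sampler name, and the batch shapes are all assumptions for illustration:
#
#     states, actions = env.generate_samples(...)  # hypothetical sampler
#     rewards = env.true_rewards_for_sample(states, actions)
#     values = env.true_values_for_sample(states, actions, assume_optimal_policy=True)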