def __call__(self, action=None):
    if action is None:
        # No action given: reset to the start state and begin a new episode.
        self.curr_pos = self.start_pos
        self.episode_steps = 0
        self.start_episode()
        return self.state()
    else:
        self.episode_steps += 1
        assert action in self.actions
        r, c = self.curr_pos
        # With probability p the requested action is executed; the remaining
        # probability mass is spread uniformly over the other actions.
        # Assumes `array` (e.g. numpy's) and `utils.weighted_sample` are
        # available at module level.
        p = self.correct_action_probability
        N = len(self.actions)
        distr = array([(1 - p) / (N - 1)] * N)
        distr[self.actions.index(action)] = p
        a = utils.weighted_sample(distr)
        dr, dc = self.action_map[self.actions[a]]
        # Move only if the destination cell is legal (not a wall / off-grid).
        if self.move_okay(r + dr, c + dc):
            r, c = self.curr_pos = (r + dr, c + dc)
        if (r, c) == self.goal_pos:
            self.verbose("!!! GOAL !!!")
            return rl.TERMINAL_STATE, self.goal_reward
        elif self.timeout and self.episode_steps > self.timeout:
            return rl.TERMINAL_STATE, self.step_reward
        else:
            return self.state(), self.step_reward
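# A minimal sketch of the call protocol this environment implements: calling it
# with no argument resets the episode and returns the initial sensation; calling
# it with an action returns a (sensation, reward) pair.  This driver is
# illustrative only -- `agent_fn` and `terminal_state` stand in for whatever
# agent callable and terminal sentinel (e.g. rl.TERMINAL_STATE) are in use.
def run_episode(env, agent_fn, terminal_state):
    total_reward = 0.0
    sensation = env()                      # reset: no action argument
    while sensation != terminal_state:
        action = agent_fn(sensation)       # choose an action for this sensation
        sensation, reward = env(action)    # step: returns (sensation, reward)
        total_reward += reward
    return total_reward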
def policy(self, sensation):
    """
    Given a sensation, return an action.

    Uses self.action_selection to get a distribution over the
    agent's actions.  Uses self.applicable_actions to prevent
    selecting inapplicable actions.  Returns 0 if
    is_terminal(sensation).
    """
    if not is_terminal(sensation):
        actions = self.applicable_actions(sensation)
        return actions[weighted_sample(self.policy_fn(sensation, actions))]
    else:
        # In the terminal state, the action is irrelevant.
        return 0
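# A hedged sketch of the kind of distribution policy_fn is expected to return:
# one probability per entry of `actions`, suitable for indexing via
# weighted_sample.  The epsilon-greedy factory below is illustrative only;
# `q_values` is a hypothetical helper returning one Q-value per applicable
# action and is not part of the code above.
def make_epsilon_greedy_policy_fn(q_values, epsilon=0.1):
    def policy_fn(sensation, actions):
        qs = q_values(sensation, actions)
        best = qs.index(max(qs))
        # Spread epsilon uniformly, then put the remaining mass on the greedy action.
        distr = [epsilon / len(actions)] * len(actions)
        distr[best] += 1.0 - epsilon
        return distr
    return policy_fn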