class ExpectedSarsaAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount, step_size):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space
        self.action_values = ActionValuesTable(self.environment.action_space,
                                               step_size=step_size)

    def train_episode(self):
        prev_observation = self.environment.reset()
        done = False
        while not done:
            action = self.act(prev_observation)
            observation, reward, done, _ = self.environment.step(action)
            sample_return = reward + self.discount * self.expected_value(
                observation)
            self.action_values.record(prev_observation, action, sample_return)
            prev_observation = observation

    def expected_value(self, state):
        # Expectation of the action values under the behavior policy; backing
        # up this expectation instead of a sampled next action's value is what
        # distinguishes Expected Sarsa from plain Sarsa.
        greedy_action = self.action_values.greedy_action(state)
        expectation = 0
        for action in spaces.enumerate(self.environment.action_space):
            likelihood = self.exploration_strategy.action_probability(
                action, greedy_action)
            expectation += likelihood * self.action_values[state, action]
        return expectation

    def act_greedily(self, observation):
        return self.action_values.greedy_action(observation)
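# Neither ActionValuesTable nor the spaces helper module is shown in this
# excerpt. The agents only depend on the small interface below, so here is a
# minimal sketch of one plausible implementation: a defaultdict keyed on
# (state, action) pairs, a constant-step-size update when step_size is given,
# and a weighted sample average otherwise (matching the weight= argument the
# off-policy Monte Carlo agent passes). The storage scheme, tie-breaking, and
# the assumption of a gym.spaces.Discrete action space are all guesses, not
# the original code.
from collections import defaultdict


class ActionValuesTable:
    def __init__(self, action_space, step_size=None):
        self.action_space = action_space
        self.step_size = step_size
        self.values = defaultdict(float)
        self.cumulative_weights = defaultdict(float)

    def __getitem__(self, state_action):
        return self.values[state_action]

    def record(self, state, action, sample_return, weight=1):
        key = (state, action)
        if self.step_size is not None:
            step = self.step_size
        else:
            # Incremental weighted average, as in weighted importance sampling.
            self.cumulative_weights[key] += weight
            step = weight / self.cumulative_weights[key]
        self.values[key] += step * (sample_return - self.values[key])

    def greedy_action(self, state):
        # Ties break toward the lowest-indexed action.
        return max(range(self.action_space.n),
                   key=lambda action: self.values[(state, action)])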
class OnPolicyMonteCarloAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space
        self.action_values = ActionValuesTable(self.environment.action_space)

    def train_episode(self):
        states, actions, rewards = [], [], []
        observation = self.environment.reset()
        done = False
        while not done:
            states.append(observation)
            action = self.act(observation)
            actions.append(action)
            observation, reward, done, _ = self.environment.step(action)
            rewards.append(reward)

        episode_return = 0
        for state, action, reward in zip(reversed(states), reversed(actions),
                                         reversed(rewards)):
            episode_return = reward + self.discount * episode_return
            self.action_values.record(state, action, episode_return)

    def act_greedily(self, observation):
        return self.action_values.greedy_action(observation)
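# To make the backward accumulation concrete: with discount 0.9 and rewards
# [1, 0, 2] (numbers chosen arbitrarily), iterating in reverse yields the
# return from each step onward in a single pass.
episode_return, returns = 0, []
for reward in reversed([1, 0, 2]):
    episode_return = reward + 0.9 * episode_return
    returns.append(episode_return)
print(returns)  # ≈ [2.0, 1.8, 2.62], i.e. G_2, G_1, G_0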
class DoubleQLearningAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount, step_size):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space
        self.action_values = [
            ActionValuesTable(self.environment.action_space,
                              step_size=step_size),
            ActionValuesTable(self.environment.action_space,
                              step_size=step_size)
        ]
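    # train_episode and act_greedily for this agent aren't shown above. Below
    # is a sketch of the standard double Q-learning update under the same
    # environment and ActionValuesTable assumptions as the other agents: on
    # each step one table (picked at random) chooses the greedy next action
    # while the other table supplies its value estimate, which counters the
    # maximization bias of single-table Q-learning. `random` is assumed to be
    # imported at module scope.
    def train_episode(self):
        prev_observation = self.environment.reset()
        done = False
        while not done:
            action = self.act(prev_observation)
            observation, reward, done, _ = self.environment.step(action)
            updated, other = random.sample(self.action_values, 2)
            greedy_action = updated.greedy_action(observation)
            sample_return = reward + self.discount * other[observation,
                                                           greedy_action]
            updated.record(prev_observation, action, sample_return)
            prev_observation = observation

    def act_greedily(self, observation):
        # One common choice (an assumption here): act greedily with respect
        # to the sum of both tables' estimates.
        return max(
            spaces.enumerate(self.environment.action_space),
            key=lambda action: sum(table[observation, action]
                                   for table in self.action_values))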
class SarsaAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount, step_size):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space
        self.action_values = ActionValuesTable(self.environment.action_space,
                                               step_size=step_size)

    def train_episode(self):
        prev_observation = self.environment.reset()
        prev_action = self.act(prev_observation)
        done = False
        while not done:
            observation, reward, done, _ = self.environment.step(prev_action)
            action = self.act(observation)
            sample_return = reward + self.discount * self.action_values[
                observation, action]
            self.action_values.record(prev_observation, prev_action,
                                      sample_return)
            prev_observation, prev_action = observation, action

    def act_greedily(self, observation):
        return self.action_values.greedy_action(observation)
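# A minimal driver for any of these agents, assuming a gym-style environment
# with discrete spaces (the pre-0.26 gym API, matching the reset/step
# signatures used above) and an EpsilonGreedy exploration strategy defined
# elsewhere in the project. The environment id, hyperparameters, and episode
# count are illustrative, not tuned.
import gym

environment = gym.make('FrozenLake-v0')
agent = SarsaAgent(environment,
                   exploration_strategy=EpsilonGreedy(epsilon=0.1),
                   discount=0.99,
                   step_size=0.5)
for _ in range(5000):
    agent.train_episode()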
class OffPolicyMonteCarloAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space
        self.action_values = ActionValuesTable(self.environment.action_space)

    def train_episode(self):
        states, actions, rewards = [], [], []
        observation = self.environment.reset()
        done = False
        while not done:
            states.append(observation)
            action = self.act(observation)
            actions.append(action)
            observation, reward, done, _ = self.environment.step(action)
            rewards.append(reward)

        episode_return = 0
        importance_sampling_ratio = 1
        for state, action, reward in zip(reversed(states), reversed(actions),
                                         reversed(rewards)):
            episode_return = reward + self.discount * episode_return
            self.action_values.record(state, action, episode_return,
                                      weight=importance_sampling_ratio)
            greedy_action = self.action_values.greedy_action(state)
            if action != greedy_action:
                # Since the target policy is greedy, the importance sampling
                # ratio will be 0 if the greedy action was not chosen by the
                # behavior policy, resulting in the returns from earlier in
                # the episode having 0 weight
                break
            importance_sampling_ratio = importance_sampling_ratio * (
                1 / self.exploration_strategy.action_probability(
                    action, greedy_action))

    def act_greedily(self, observation):
        return self.action_values.greedy_action(observation)
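# To put numbers on the ratio update: suppose the behavior policy is
# epsilon-greedy with epsilon = 0.1 over 4 actions (values chosen purely for
# illustration). The target policy is greedy, so each greedy step taken from
# the end of the episode multiplies the ratio by 1 / b(greedy | s); any
# non-greedy step zeroes the ratio, which is what the break above
# short-circuits.
epsilon, n_actions = 0.1, 4
b_greedy = 1 - epsilon + epsilon / n_actions  # 0.925
importance_sampling_ratio = 1
for _ in range(3):  # three consecutive greedy steps at the end of an episode
    importance_sampling_ratio *= 1 / b_greedy
print(importance_sampling_ratio)  # ≈ 1.264: earlier steps get larger weights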