Example #1
class ExpectedSarsaAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount, step_size):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space

        self.action_values = ActionValuesTable(self.environment.action_space,
                                               step_size=step_size)

    def train_episode(self):
        prev_observation = self.environment.reset()
        done = False
        while not done:
            action = self.act(prev_observation)
            observation, reward, done, _ = self.environment.step(action)
            sample_return = reward + self.discount * self.expected_value(
                observation)
            self.action_values.record(prev_observation, action, sample_return)
            prev_observation = observation

    def expected_value(self, state):
        greedy_action = self.action_values.greedy_action(state)
        expectation = 0
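        # Weight each action's value by the probability that the behavior policy
        # (e.g. epsilon-greedy around the greedy action) assigns to it.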
        for action in spaces.enumerate(self.environment.action_space):
            likelihood = self.exploration_strategy.action_probability(
                action, greedy_action)
            expectation += likelihood * self.action_values[state, action]
        return expectation

    def act_greedily(self, observation):
        return self.action_values.greedy_action(observation)
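
The agents in this listing lean on a few helpers that are not shown: ActionValuesTable, the exploration strategy's action_probability, and spaces.enumerate. Below is a minimal sketch of what they might look like for a gym-style Discrete action space; the names and call signatures are taken from the examples, but the bodies are assumptions rather than the project's actual implementation.

from collections import defaultdict

import numpy as np


class ActionValuesTable:
    """Tabular Q(s, a) with sample-average or constant step-size updates (assumed)."""

    def __init__(self, action_space, step_size=None):
        self.n_actions = action_space.n
        self.step_size = step_size
        self.values = defaultdict(lambda: np.zeros(self.n_actions))
        self.counts = defaultdict(lambda: np.zeros(self.n_actions))

    def __getitem__(self, state_action):
        state, action = state_action
        return self.values[state][action]

    def record(self, state, action, sample_return, weight=1):
        # Move the estimate toward the (weighted) sample return.
        self.counts[state][action] += weight
        if self.step_size is None:
            step = weight / self.counts[state][action]  # sample average
        else:
            step = self.step_size * weight              # constant step size
        self.values[state][action] += step * (sample_return - self.values[state][action])

    def greedy_action(self, state):
        return int(np.argmax(self.values[state]))


class EpsilonGreedy:
    """Assumed exploration strategy exposing the action_probability call used above."""

    def __init__(self, epsilon, n_actions):
        self.epsilon = epsilon
        self.n_actions = n_actions

    def action_probability(self, action, greedy_action):
        # Probability that an epsilon-greedy behavior policy picks `action`.
        if action == greedy_action:
            return (1 - self.epsilon) + self.epsilon / self.n_actions
        return self.epsilon / self.n_actions


def enumerate_actions(action_space):
    # Stand-in for the spaces.enumerate helper used above (Discrete spaces only).
    return range(action_space.n)

With a sample-average update (step_size=None), record behaves like the weighted incremental mean the Monte Carlo agents need; a constant step_size gives the exponential recency weighting the TD agents ask for.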
Example #2
class OnPolicyMonteCarloAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space

        self.action_values = ActionValuesTable(self.environment.action_space)

    def train_episode(self):
        states, actions, rewards = [], [], []
        observation = self.environment.reset()
        done = False
        while not done:
            states.append(observation)
            action = self.act(observation)
            actions.append(action)
            observation, reward, done, _ = self.environment.step(action)
            rewards.append(reward)

        episode_return = 0
        for state, action, reward in zip(reversed(states), reversed(actions),
                                         reversed(rewards)):
            episode_return = reward + self.discount * episode_return
            self.action_values.record(state, action, episode_return)

    def act_greedily(self, observation):
        return self.action_values.greedy_action(observation)
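
For readers tracing the reversed loop above: the discounted return is accumulated back to front, so each recorded value is G_t = r_{t+1} + discount * G_{t+1}. A standalone trace with made-up rewards and a discount of 0.9:

rewards = [1.0, 0.0, 5.0]  # rewards of a hypothetical three-step episode
discount = 0.9

episode_return = 0
returns = []
for reward in reversed(rewards):
    episode_return = reward + discount * episode_return
    returns.append(episode_return)
returns.reverse()
print(returns)  # [5.05, 4.5, 5.0], i.e. the return from each of the three steps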
Example #3
    def __init__(self, environment, exploration_strategy, discount, step_size):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space

        self.action_values = [
            ActionValuesTable(self.environment.action_space,
                              step_size=step_size),
            ActionValuesTable(self.environment.action_space,
                              step_size=step_size)
        ]
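
Example #3 shows only the constructor; the pair of tables strongly suggests a double-learning agent (Double Q-learning style). A minimal sketch of the train_episode such an agent might pair with, reusing the ActionValuesTable interface from the other examples; this method is not part of the original excerpt:

    def train_episode(self):
        import random  # would normally sit at module level

        prev_observation = self.environment.reset()
        done = False
        while not done:
            action = self.act(prev_observation)
            observation, reward, done, _ = self.environment.step(action)

            # Pick one table at random to update; the other table evaluates the
            # greedy action, which is what removes the maximization bias of
            # single-table Q-learning.
            update, evaluate = random.sample(self.action_values, 2)
            greedy_action = update.greedy_action(observation)
            sample_return = reward + self.discount * evaluate[observation,
                                                              greedy_action]
            update.record(prev_observation, action, sample_return)
            prev_observation = observation

Greedy action selection for such an agent would typically act on the sum of the two tables' estimates.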
Example #4
class SarsaAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount, step_size):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space

        self.action_values = ActionValuesTable(self.environment.action_space,
                                               step_size=step_size)

    def train_episode(self):
        prev_observation = self.environment.reset()
        prev_action = self.act(prev_observation)
        done = False
        while not done:
            observation, reward, done, _ = self.environment.step(prev_action)
            action = self.act(observation)
            sample_return = reward + self.discount * self.action_values[observation, action]
            self.action_values.record(prev_observation, prev_action, sample_return)
            prev_observation, prev_action = observation, action

    def act_greedily(self, observation):
        return self.action_values.greedy_action(observation)
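
Seen side by side, the temporal-difference agents above differ only in their bootstrap target; the Q-learning line is not in any of the excerpts and is shown purely for contrast:

# SARSA (Example #4): bootstrap on the action actually taken next.
target = reward + self.discount * self.action_values[observation, action]

# Expected SARSA (Example #1): bootstrap on the expectation under the behavior policy.
target = reward + self.discount * self.expected_value(observation)

# Q-learning (not shown in the excerpts): bootstrap on the greedy action.
target = reward + self.discount * self.action_values[
    observation, self.action_values.greedy_action(observation)]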
Example #5
class OffPolicyMonteCarloAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space

        self.action_values = ActionValuesTable(self.environment.action_space)

    def train_episode(self):
        states, actions, rewards = [], [], []
        observation = self.environment.reset()
        done = False
        while not done:
            states.append(observation)
            action = self.act(observation)
            actions.append(action)
            observation, reward, done, _ = self.environment.step(action)
            rewards.append(reward)

        episode_return = 0
        importance_sampling_ratio = 1
        for state, action, reward in zip(reversed(states), reversed(actions),
                                         reversed(rewards)):
            episode_return = reward + self.discount * episode_return
            self.action_values.record(state,
                                      action,
                                      episode_return,
                                      weight=importance_sampling_ratio)

            greedy_action = self.action_values.greedy_action(state)
            if action != greedy_action:
                # Since the target policy is greedy, the importance sampling ratio
                # is 0 whenever the behavior policy deviated from the greedy action,
                # so returns from earlier in the episode get zero weight and the
                # backward pass can stop here.
                break
            # The target policy assigns probability 1 to the greedy action, so the
            # ratio only divides by the behavior policy's probability of choosing it.
            importance_sampling_ratio = importance_sampling_ratio * (
                1 / self.exploration_strategy.action_probability(
                    action, greedy_action))

    def act_greedily(self, observation):
        return self.action_values.greedy_action(observation)
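
A hypothetical driver showing how these agents might be wired together, assuming a gym environment with discrete spaces and the pre-0.26 gym API that the examples' step calls imply; EpsilonGreedy is the sketch above, and the environment name and hyperparameters are purely illustrative:

import gym

environment = gym.make("FrozenLake-v1")
exploration = EpsilonGreedy(epsilon=0.1, n_actions=environment.action_space.n)
agent = SarsaAgent(environment, exploration, discount=0.99, step_size=0.1)

for _ in range(10_000):
    agent.train_episode()

# Evaluate the learned policy greedily.
observation = environment.reset()
done = False
while not done:
    observation, reward, done, _ = environment.step(agent.act_greedily(observation))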