class ExpectedSarsaAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount, step_size):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space
        self.action_values = ActionValuesTable(self.environment.action_space,
                                               step_size=step_size)

    def train_episode(self):
        prev_observation = self.environment.reset()
        done = False
        while not done:
            action = self.act(prev_observation)
            observation, reward, done, _ = self.environment.step(action)
            sample_return = reward + self.discount * self.expected_value(
                observation)
            self.action_values.record(prev_observation, action, sample_return)
            prev_observation = observation

    def expected_value(self, state):
        # Expectation of the action values under the behavior policy; backing
        # up this expectation instead of a sampled next action's value is what
        # distinguishes Expected Sarsa from plain Sarsa.
        greedy_action = self.action_values.greedy_action(state)
        expectation = 0
        for action in spaces.enumerate(self.environment.action_space):
            likelihood = self.exploration_strategy.action_probability(
                action, greedy_action)
            expectation += likelihood * self.action_values[state, action]
        return expectation

    def act_greedily(self, observation):
        return self.action_values.greedy_action(observation)
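# Neither ActionValuesTable nor the spaces helper module is shown in this
# excerpt. The agents only depend on the small interface below, so here is a
# minimal sketch of one plausible implementation: a defaultdict keyed on
# (state, action) pairs, a constant-step-size update when step_size is given,
# and a weighted sample average otherwise (matching the weight= argument the
# off-policy Monte Carlo agent passes). The storage scheme, tie-breaking, and
# the assumption of a gym.spaces.Discrete action space are all guesses, not
# the original code.
from collections import defaultdict


class ActionValuesTable:
    def __init__(self, action_space, step_size=None):
        self.action_space = action_space
        self.step_size = step_size
        self.values = defaultdict(float)
        self.cumulative_weights = defaultdict(float)

    def __getitem__(self, state_action):
        return self.values[state_action]

    def record(self, state, action, sample_return, weight=1):
        key = (state, action)
        if self.step_size is not None:
            step = self.step_size
        else:
            # Incremental weighted average, as in weighted importance sampling.
            self.cumulative_weights[key] += weight
            step = weight / self.cumulative_weights[key]
        self.values[key] += step * (sample_return - self.values[key])

    def greedy_action(self, state):
        # Ties break toward the lowest-indexed action.
        return max(range(self.action_space.n),
                   key=lambda action: self.values[(state, action)])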
class OnPolicyMonteCarloAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space
        self.action_values = ActionValuesTable(self.environment.action_space)

    def train_episode(self):
        states, actions, rewards = [], [], []
        observation = self.environment.reset()
        done = False
        while not done:
            states.append(observation)
            action = self.act(observation)
            actions.append(action)
            observation, reward, done, _ = self.environment.step(action)
            rewards.append(reward)

        episode_return = 0
        for state, action, reward in zip(reversed(states), reversed(actions),
                                         reversed(rewards)):
            episode_return = reward + self.discount * episode_return
            self.action_values.record(state, action, episode_return)

    def act_greedily(self, observation):
        return self.action_values.greedy_action(observation)
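# To make the backward accumulation concrete: with discount 0.9 and rewards
# [1, 0, 2] (numbers chosen arbitrarily), iterating in reverse yields the
# return from each step onward in a single pass.
episode_return, returns = 0, []
for reward in reversed([1, 0, 2]):
    episode_return = reward + 0.9 * episode_return
    returns.append(episode_return)
print(returns)  # ≈ [2.0, 1.8, 2.62], i.e. G_2, G_1, G_0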
class DoubleQLearningAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount, step_size):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space
        self.action_values = [
            ActionValuesTable(self.environment.action_space,
                              step_size=step_size),
            ActionValuesTable(self.environment.action_space,
                              step_size=step_size)
        ]
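    # train_episode and act_greedily for this agent aren't shown above. Below
    # is a sketch of the standard double Q-learning update under the same
    # environment and ActionValuesTable assumptions as the other agents: on
    # each step one table (picked at random) chooses the greedy next action
    # while the other table supplies its value estimate, which counters the
    # maximization bias of single-table Q-learning. `random` is assumed to be
    # imported at module scope.
    def train_episode(self):
        prev_observation = self.environment.reset()
        done = False
        while not done:
            action = self.act(prev_observation)
            observation, reward, done, _ = self.environment.step(action)
            updated, other = random.sample(self.action_values, 2)
            greedy_action = updated.greedy_action(observation)
            sample_return = reward + self.discount * other[observation,
                                                           greedy_action]
            updated.record(prev_observation, action, sample_return)
            prev_observation = observation

    def act_greedily(self, observation):
        # One common choice (an assumption here): act greedily with respect
        # to the sum of both tables' estimates.
        return max(
            spaces.enumerate(self.environment.action_space),
            key=lambda action: sum(table[observation, action]
                                   for table in self.action_values))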
class SarsaAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount, step_size):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space
        self.action_values = ActionValuesTable(self.environment.action_space,
                                               step_size=step_size)

    def train_episode(self):
        prev_observation = self.environment.reset()
        prev_action = self.act(prev_observation)
        done = False
        while not done:
            observation, reward, done, _ = self.environment.step(prev_action)
            action = self.act(observation)
            sample_return = reward + self.discount * self.action_values[
                observation, action]
            self.action_values.record(prev_observation, prev_action,
                                      sample_return)
            prev_observation, prev_action = observation, action

    def act_greedily(self, observation):
        return self.action_values.greedy_action(observation)
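# A minimal driver for any of these agents, assuming a gym-style environment
# with discrete spaces (the pre-0.26 gym API, matching the reset/step
# signatures used above) and an EpsilonGreedy exploration strategy defined
# elsewhere in the project. The environment id, hyperparameters, and episode
# count are illustrative, not tuned.
import gym

environment = gym.make('FrozenLake-v0')
agent = SarsaAgent(environment,
                   exploration_strategy=EpsilonGreedy(epsilon=0.1),
                   discount=0.99,
                   step_size=0.5)
for _ in range(5000):
    agent.train_episode()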
class OffPolicyMonteCarloAgent(BaseAgent):
    def __init__(self, environment, exploration_strategy, discount):
        super().__init__(environment, exploration_strategy, discount)
        # TODO: make sure environment has a discrete action and observation space
        self.action_values = ActionValuesTable(self.environment.action_space)

    def train_episode(self):
        states, actions, rewards = [], [], []
        observation = self.environment.reset()
        done = False
        while not done:
            states.append(observation)
            action = self.act(observation)
            actions.append(action)
            observation, reward, done, _ = self.environment.step(action)
            rewards.append(reward)

        episode_return = 0
        importance_sampling_ratio = 1
        for state, action, reward in zip(reversed(states), reversed(actions),
                                         reversed(rewards)):
            episode_return = reward + self.discount * episode_return
            self.action_values.record(state, action, episode_return,
                                      weight=importance_sampling_ratio)
            greedy_action = self.action_values.greedy_action(state)
            if action != greedy_action:
                # Since the target policy is greedy, the importance sampling
                # ratio will be 0 if the greedy action was not chosen by the
                # behavior policy, resulting in the returns from earlier in
                # the episode having 0 weight
                break
            importance_sampling_ratio = importance_sampling_ratio * (
                1 / self.exploration_strategy.action_probability(
                    action, greedy_action))

    def act_greedily(self, observation):
        return self.action_values.greedy_action(observation)
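# To put numbers on the ratio update: suppose the behavior policy is
# epsilon-greedy with epsilon = 0.1 over 4 actions (values chosen purely for
# illustration). The target policy is greedy, so each greedy step taken from
# the end of the episode multiplies the ratio by 1 / b(greedy | s); any
# non-greedy step zeroes the ratio, which is what the break above
# short-circuits.
epsilon, n_actions = 0.1, 4
b_greedy = 1 - epsilon + epsilon / n_actions  # 0.925
importance_sampling_ratio = 1
for _ in range(3):  # three consecutive greedy steps at the end of an episode
    importance_sampling_ratio *= 1 / b_greedy
print(importance_sampling_ratio)  # ≈ 1.264: earlier steps get larger weights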