class ReplayBufferTest(unittest.TestCase):
    def setUp(self):
        self.batch_size = 2
        self.replay_buffer = ReplayBuffer(10, self.batch_size, "cpu")
        self.populate_replay_buffer()

    def populate_replay_buffer(self, n=5):
        for _ in range(n):
            self.replay_buffer.add(0.0, 0.0, 0.0, 0.0, 0.0)

    def test_add(self):
        l1 = len(self.replay_buffer)
        self.replay_buffer.add(0.0, 0.0, 0.0, 0.0, 0.0)
        l2 = len(self.replay_buffer)
        self.assertNotEqual(l1, l2)

    def test_sample(self):
        s, a, r, ns, d = self.replay_buffer.sample()
        self.assertEqual(s.shape[0], self.batch_size)
        self.assertEqual(a.shape[0], self.batch_size)
        self.assertEqual(r.shape[0], self.batch_size)
        self.assertEqual(ns.shape[0], self.batch_size)
        self.assertEqual(d.shape[0], self.batch_size)
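# Hedged sketch (not part of the tested module): a minimal ReplayBuffer with the
# interface these tests assume -- add(), sample(), and __len__() -- returning
# batched torch tensors. The real ReplayBuffer in this repo may differ in detail.
import random
from collections import deque, namedtuple

import torch


class MinimalReplayBuffer:
    """Fixed-size buffer storing (state, action, reward, next_state, done) tuples."""

    def __init__(self, buffer_size, batch_size, device, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Appends one experience tuple to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Returns a random batch as tensors of shape (batch_size, 1) per field."""
        batch = random.sample(self.memory, k=self.batch_size)

        def stack(field):
            values = [getattr(e, field) for e in batch]
            return torch.tensor(values, dtype=torch.float32,
                                device=self.device).unsqueeze(1)

        return (stack("state"), stack("action"), stack("reward"),
                stack("next_state"), stack("done"))

    def __len__(self):
        return len(self.memory)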
class DQNAgent(Agent):
    """DQN Agent implementation."""

    # TODO: Consider how to extend this to accept multiple agents?
    # TODO: Add noise to DQN?
    # TODO: Ensure that this cannot be changed in other ways
    # TODO: Look up original values for these params
    REQUIRED_HYPERPARAMETERS = {
        "buffer_size": int(1e7),
        "batch_size": 32,
        "gamma": 0.99,
        "learning_rate": 2.5e-4,
        "tau": 1e-3,
        "learn_every": 4,
        "hard_update_every": 10000
    }

    ALGORITHM = "DQN"

    def __init__(self, state_size: int, action_size: int, qnetwork_local=None,
                 qnetwork_target=None, optimizer=None, new_hyperparameters=None,
                 seed: int = 0, device: str = "cpu", model_output_dir: str = None,
                 opt_soft_update: bool = False, opt_ddqn: bool = False):
        """Initialize a DQNAgent object.

        Args:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
            qnetwork_local (torch.nn.Module): Local Q-Network model.
            qnetwork_target (torch.nn.Module): Target Q-Network model.
            optimizer (torch.optim): Local Q-Network optimizer.
            new_hyperparameters (dict): New hyperparameter values.
            seed (int): Random seed.
            device (str): Identifier for the device to be used by PyTorch.
            model_output_dir (str): Directory where state dicts will be saved.
            opt_soft_update (bool): Use soft updates instead of hard updates.
            opt_ddqn (bool): Use Double DQN for `expected_Q`.

        Returns:
            An instance of DQNAgent.
        """
        super(DQNAgent, self).__init__(new_hyperparameters=new_hyperparameters)

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.device = device
        self.time_step = 0

        if qnetwork_local:
            self.qnetwork_local = qnetwork_local
        else:
            self.qnetwork_local = QNetwork(state_size, action_size).to(self.device)

        if qnetwork_target:
            self.qnetwork_target = qnetwork_target
        else:
            self.qnetwork_target = QNetwork(state_size, action_size).to(self.device)

        if optimizer:
            self.optimizer = optimizer
        else:
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=self.LEARNING_RATE)

        # Replay memory
        self.memory = ReplayBuffer(self.BUFFER_SIZE, self.BATCH_SIZE,
                                   self.device, seed)

        # User options
        self.opt_soft_update = opt_soft_update
        self.opt_ddqn = opt_ddqn
        self.model_output_dir = model_output_dir

        self.state_dicts = [
            (self.qnetwork_local, "qnetwork_local_params"),
            (self.optimizer, "optimizer_params"),
        ]

        # Ensure local and target networks start with the same weights
        hard_update(self.qnetwork_local, self.qnetwork_target)

    def __str__(self) -> str:
        """Helper to output the network architecture for the agent.

        Returns:
            A string representation of this algorithm.
        """
        return "{}\n{}\n{}\n{}".format(
            "Q-Network (Local):", self.qnetwork_local,
            "Q-Network (Target):", self.qnetwork_target)

    def origin(self) -> str:
        """Helper to get the original paper for this algorithm.

        Returns:
            The original paper for this algorithm.
        """
        return 'https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf'

    def description(self) -> str:
        """Helper to get a brief description of this algorithm.

        Returns:
            A brief description of this algorithm.
        """
        description = (
            'DQN is an algorithm created by DeepMind that brings together the power '
            'of the Q-Learning algorithm with the advantages of generalization through '
            'function approximation. It uses a deep neural network to estimate a '
            'Q-value function. As such, the input to the network is the current state '
            'of the environment, and the output is the Q-value for each possible action.'
        )
        return description

    def step(self, state, action, reward, next_state, done, logger=None) -> None:
        """Saves experience to replay memory and updates model weights.

        Args:
            state: Environment state.
            action: Environment action.
            reward: Reward for the action above.
            next_state: Next environment state.
            done (bool): Whether the environment has terminated.
            logger (Logger): An instance of Logger.
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn every `learn_every` time steps
        self.time_step += 1
        if self.time_step % self.LEARN_EVERY == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, logger=logger)

    def act(self, state, eps=0.0, add_noise=False, logger=None):
        """Returns an action for the given state as per the current policy.

        Args:
            state: The current state of the environment.
            eps (float): Epsilon, for epsilon-greedy action selection.
            add_noise (bool): Controls addition of noise.
            logger (Logger): An instance of Logger.

        Returns:
            Action for the given state as per the current policy.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, logger=None) -> None:
        """Updates value parameters using the given batch of experience tuples.

        Args:
            experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done) tuples.
            logger (Logger): An instance of Logger.
        """
        states, actions, rewards, next_states, dones = experiences

        if self.opt_ddqn:
            # Double DQN
            non_final_next_states = next_states * (1 - dones)
            # Get the actions themselves, not their output values
            _, next_state_actions = self.qnetwork_local(
                non_final_next_states).max(1, keepdim=True)
            next_Q_targets = self.qnetwork_target(
                non_final_next_states).gather(1, next_state_actions)
            target_Q = rewards + (self.GAMMA * next_Q_targets * (1 - dones))
        else:
            # Vanilla DQN
            next_max_a = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)
            target_Q = rewards + (self.GAMMA * next_max_a * (1 - dones))

        expected_Q = self.qnetwork_local(states)
        if len(actions.shape) == 1:
            actions = actions.unsqueeze(1)
        expected_Q = torch.gather(expected_Q, 1, actions.long())

        # Compute and minimize the loss
        loss = F.mse_loss(expected_Q, target_Q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update the target network
        if self.opt_soft_update:
            soft_update(self.qnetwork_local, self.qnetwork_target, self.TAU)
        elif self.time_step % self.HARD_UPDATE_EVERY == 0:
            hard_update(self.qnetwork_local, self.qnetwork_target)

        if logger:
            loss = loss.cpu().detach().item()
            logger.add_scalar('loss', loss, self.time_step)
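# Hedged sketch (assumption): `soft_update` and `hard_update` are called above but
# not defined in this section; the repo presumably imports them from a utils module.
# The illustrative versions below only document the expected behavior and match the
# call signatures used here: (source_network, target_network[, tau]).
def hard_update(source, target):
    """Copies the source network's weights into the target network."""
    target.load_state_dict(source.state_dict())


def soft_update(source, target, tau):
    """Polyak averaging: target = tau * source + (1 - tau) * target."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            tau * source_param.data + (1.0 - tau) * target_param.data)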
class DDPGAgent(Agent):
    """DDPG Agent implementation."""

    REQUIRED_HYPERPARAMETERS = {
        "buffer_size": int(1e6),
        "batch_size": 64,
        "gamma": 0.99,
        "tau": 1e-3,
        "learning_rate_actor": 1e-4,
        "learning_rate_critic": 1e-3,
        "weight_decay": 1e-2,
        "learn_every": 4,
        "hard_update_every": 4
    }

    def __init__(self, state_size: int, action_size: int, num_agents: int,
                 actor_local=None, actor_target=None, actor_optimizer=None,
                 critic_local=None, critic_target=None, critic_optimizer=None,
                 new_hyperparameters=None, seed: int = 0, device: str = "cpu",
                 model_output_dir: str = None, enable_logger: bool = False,
                 logger_path: str = None, logger_comment: str = None,
                 opt_soft_update: bool = False):
        """Initialize a DDPGAgent object.

        Args:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
            num_agents (int): Number of agents in the environment.
            actor_local (torch.nn.Module): Local Actor model.
            actor_target (torch.nn.Module): Target Actor model.
            actor_optimizer (torch.optim): Actor optimizer.
            critic_local (torch.nn.Module): Local Critic model.
            critic_target (torch.nn.Module): Target Critic model.
            critic_optimizer (torch.optim): Critic optimizer.
            new_hyperparameters (dict): New hyperparameter values.
            seed (int): Random seed.
            device (str): Identifier for the device to be used by PyTorch.
            model_output_dir (str): Directory where state dicts will be saved.
            opt_soft_update (bool): Use soft updates instead of hard updates.

        Returns:
            An instance of DDPGAgent.
        """
        super(DDPGAgent, self).__init__(new_hyperparameters=new_hyperparameters,
                                        enable_logger=enable_logger,
                                        logger_path=logger_path,
                                        logger_comment=logger_comment)

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        self.device = device
        self.time_step = 0

        # Actor Network (w/ Target Network)
        self.actor_local = actor_local if actor_local else Actor(
            state_size, action_size, seed).to(device)
        self.actor_target = actor_target if actor_target else Actor(
            state_size, action_size, seed).to(device)
        self.actor_optimizer = actor_optimizer if actor_optimizer else optim.Adam(
            self.actor_local.parameters(), lr=self.LEARNING_RATE_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = critic_local if critic_local else Critic(
            state_size, action_size, seed).to(device)
        self.critic_target = critic_target if critic_target else Critic(
            state_size, action_size, seed).to(device)
        self.critic_optimizer = critic_optimizer if critic_optimizer else optim.Adam(
            self.critic_local.parameters(), lr=self.LEARNING_RATE_CRITIC,
            weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(self.BUFFER_SIZE, self.BATCH_SIZE,
                                   self.device, seed)

        # User options
        self.opt_soft_update = opt_soft_update
        self.model_output_dir = model_output_dir

        self.state_dicts = [
            (self.actor_local, "actor_local_params"),
            (self.actor_optimizer, "actor_optimizer_params"),
            (self.critic_local, "critic_local_params"),
            (self.critic_optimizer, "critic_optimizer_params"),
        ]

        # Ensure local and target networks start with the same weights
        hard_update(self.actor_local, self.actor_target)
        hard_update(self.critic_local, self.critic_target)

    def __str__(self) -> str:
        """Helper to output the network architecture for the agent.

        Returns:
            A string representation of this algorithm.
        """
        return "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}".format(
            "Actor (Local):", self.actor_local,
            "Actor (Target):", self.actor_target,
            "Critic (Local):", self.critic_local,
            "Critic (Target):", self.critic_target)

    def origin(self) -> str:
        """Helper to get the original paper for this algorithm.

        Returns:
            The original paper for this algorithm.
        """
        return 'https://arxiv.org/pdf/1509.02971.pdf'

    def description(self) -> str:
        """Helper to get a brief description of this algorithm.

        Returns:
            A brief description of this algorithm.
        """
        description = (
            'DDPG was introduced as an actor-critic method that performs well '
            'in environments with a continuous action space, which is a known '
            'limitation of the popular DQN algorithm. It improves on the '
            'deterministic policy gradient (DPG) algorithm by using a neural '
            'network to take advantage of generalization and function approximation.'
        )
        return description

    def step(self, states, actions, rewards, next_states, dones, logger=None) -> None:
        """Saves experience in replay memory and uses random samples from the buffer to learn.

        Args:
            states: Environment states.
            actions: Environment actions.
            rewards: Rewards for the actions above.
            next_states: Next environment states.
            dones (bool): Booleans indicating if the environment has terminated.
            logger (Logger): An instance of Logger.
        """
        if self.num_agents == 1:
            self.memory.add(states, actions, rewards, next_states, dones)
        else:
            # TODO: Refactor this to not assume that the objects come in the correct shape
            for i in range(self.num_agents):
                self.memory.add(states[i], actions[i], rewards[i],
                                next_states[i], dones[i])

        # Learn every `learn_every` time steps
        self.time_step += 1
        if self.time_step % self.LEARN_EVERY == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, logger=logger)

    def act(self, state, add_noise: bool = True, logger=None):
        """Chooses an action for the current state based on the current policy.

        Args:
            state: The current state of the environment.
            add_noise (bool): Controls addition of noise.
            logger (Logger): An instance of Logger.

        Returns:
            Actions for the given state as per the current policy.
        """
        state = torch.from_numpy(state).float().to(self.device)

        if self.num_agents == 1:
            self.actor_local.eval()
            with torch.no_grad():
                action = self.actor_local(state).cpu().data.numpy()
            self.actor_local.train()

            if add_noise:
                action += self.noise.sample()

            # TODO: Have a parameter that controls this?
            # return np.clip(action, -1, 1)
            return action
        else:
            actions = np.zeros((self.num_agents, self.action_size))
            self.actor_local.eval()
            with torch.no_grad():
                for i, s in enumerate(state):
                    # Populate the list of actions one state at a time
                    actions[i, :] = self.actor_local(s).cpu().data.numpy()
            self.actor_local.train()

            if add_noise:
                actions += self.noise.sample()

            # TODO: Have a parameter that controls this?
            # return np.clip(actions, -1, 1)
            return actions

    def learn(self, experiences, logger=None) -> None:
        """Updates policy and value parameters using the given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))

        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Args:
            experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done) tuples.
            logger (Logger): An instance of Logger.
        """
        states, actions, rewards, next_states, dones = experiences

        ### Update critic
        # Get predicted next-state actions and Q-values from the target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for the current states (y_i)
        Q_targets = rewards + (self.GAMMA * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Gradient clipping to stabilize learning
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        ### Update actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Update target networks
        if self.opt_soft_update:
            soft_update(self.actor_local, self.actor_target, self.TAU)
            soft_update(self.critic_local, self.critic_target, self.TAU)
        elif self.time_step % self.HARD_UPDATE_EVERY == 0:
            hard_update(self.actor_local, self.actor_target)
            hard_update(self.critic_local, self.critic_target)

        if logger:
            actor_loss = actor_loss.cpu().detach().item()
            critic_loss = critic_loss.cpu().detach().item()
            logger.add_scalars('loss', {
                "actor loss": actor_loss,
                "critic loss": critic_loss,
            }, self.time_step)
class MADDPGAgent(Agent):
    """MADDPG implementation."""

    REQUIRED_HYPERPARAMETERS = {
        "buffer_size": int(1e6),
        "batch_size": 64,
        "gamma": 0.99,
        "tau": 1e-3,
        "learning_rate_actor": 1e-4,
        "learning_rate_critic": 1e-3,
        "weight_decay": 1e-2,
        "learn_every": 4,
        "hard_update_every": 5
    }

    def __init__(self, state_size, action_size, num_agents, agents=None,
                 new_hyperparameters=None, seed=0, device="cpu",
                 model_output_dir=None, enable_logger=False, logger_path=None,
                 logger_comment=None, opt_soft_update=False):
        """Initialize a MADDPGAgent wrapper.

        Args:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
            num_agents (int): Number of agents in the environment.
        """
        super(MADDPGAgent, self).__init__(
            new_hyperparameters=new_hyperparameters,
            enable_logger=enable_logger,
            logger_path=logger_path,
            logger_comment=logger_comment
        )

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        self.device = device
        self.time_step = 0

        if agents:
            self.agents = agents
        else:
            self.agents = [DDPGAgent(state_size, action_size, agent_id=i + 1,
                                     handler=self)
                           for i in range(num_agents)]

        # Replay memory
        self.memory = ReplayBuffer(self.BUFFER_SIZE, self.BATCH_SIZE,
                                   self.device, seed)

        # User options
        self.opt_soft_update = opt_soft_update
        self.model_output_dir = model_output_dir

    def reset(self):
        """Resets OU Noise for each agent."""
        for agent in self.agents:
            agent.reset()

    def act(self, observations, add_noise=False, logger=None):
        """Picks an action for each agent given their individual observations
        and the current policy."""
        actions = []
        for agent, observation in zip(self.agents, observations):
            action = agent.act(observation, add_noise=add_noise)
            actions.append(action)
        return np.array(actions)

    def step(self, observations, actions, rewards, next_observations, dones, logger=None):
        """Saves experience in replay memory and uses random samples from the buffer to learn."""
        observations = observations.reshape(1, -1)
        actions = actions.reshape(1, -1)
        next_observations = next_observations.reshape(1, -1)
        self.memory.add(observations, actions, rewards, next_observations, dones)

        # Learn every `learn_every` time steps
        self.time_step += 1
        if self.time_step % self.LEARN_EVERY == 0:
            if len(self.memory) > self.BATCH_SIZE:
                for a_i, agent in enumerate(self.agents):
                    experiences = self.memory.sample()
                    self.learn(experiences, a_i, logger=logger)

    def learn(self, experiences, agent_number, logger=None):
        """Helper that gathers actions from each agent for the `experiences` tuple
        used to update the weights of the agent with ID = `agent_number`.

        Each entry in the `experiences` tuple contains observations from every
        agent, so before using the tuple to update the weights of one agent, all
        agents need to contribute to generating `next_actions` and `actions_pred`.
        This is because the critic takes the combined observations and actions
        from all agents as its input.
        """
        next_actions = []
        actions_pred = []
        states, _, _, next_states, _ = experiences

        next_states = next_states.reshape(-1, self.num_agents, self.state_size)
        states = states.reshape(-1, self.num_agents, self.state_size)

        for a_i, agent in enumerate(self.agents):
            agent_id_tensor = self._get_agent_number(a_i)

            state = states.index_select(1, agent_id_tensor).squeeze(1)
            next_state = next_states.index_select(1, agent_id_tensor).squeeze(1)

            next_actions.append(agent.actor_target(next_state))
            actions_pred.append(agent.actor_local(state))

        next_actions = torch.cat(next_actions, dim=1).to(self.device)
        actions_pred = torch.cat(actions_pred, dim=1).to(self.device)

        agent = self.agents[agent_number]
        agent.learn(experiences, next_actions, actions_pred, logger=logger)

    def _get_agent_number(self, i):
        """Helper to get an agent's number as a Torch tensor."""
        return torch.tensor([i]).to(self.device)
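# Hedged sketch (assumption): illustrates how the centralized critic input described
# in MADDPGAgent.learn can be assembled. The helper name `build_critic_input` and
# the example shapes are illustrative only; the actual Critic in this repo may
# consume its inputs differently.
import torch


def build_critic_input(states, actions, num_agents, state_size, action_size):
    """Flattens per-agent observations and actions into one joint vector per sample.

    states:  tensor of shape (batch, num_agents, state_size)
    actions: tensor of shape (batch, num_agents, action_size)
    returns: (obs_all, act_all) with shapes (batch, num_agents * state_size)
             and (batch, num_agents * action_size)
    """
    obs_all = states.reshape(-1, num_agents * state_size)
    act_all = actions.reshape(-1, num_agents * action_size)
    return obs_all, act_all


# Example: 2 agents, 24-dimensional observations, 2-dimensional actions
states = torch.zeros(64, 2, 24)
actions = torch.zeros(64, 2, 2)
obs_all, act_all = build_critic_input(states, actions, 2, 24, 2)
assert obs_all.shape == (64, 48) and act_all.shape == (64, 4)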