Example #1
class DDPGAgent():
    """
    Deep deterministic policy gradient agent as described in
    https://arxiv.org/abs/1509.02971.

    This agent is meant to operate on low dimensional inputs, not raw pixels.

    To use the agent, get action predictions with act() and feed the resulting
    experiences to learn(); see the usage sketch after this class.
    """
    def __init__(self, state_size, action_size, num_agents):
        """ Initialize agent.

        Params
        ======
        state_size (integer): Size of input state vector
        action_size (integer): Size of action vector
        num_agents (integer): Number of simultaneous agents in the environment
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        # Keep the networks on the same device as the tensors built in
        # vectorize_experiences()
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Actor
        self.local_actor_network = ActorNetwork(state_size,
                                                action_size).to(self.device)
        self.target_actor_network = ActorNetwork(state_size,
                                                 action_size).to(self.device)
        self.actor_optimizer = optim.Adam(
            self.local_actor_network.parameters(), lr=ACTOR_LEARNING_RATE)

        # Critic
        self.local_critic_network = CriticNetwork(state_size,
                                                  action_size).to(self.device)
        self.target_critic_network = CriticNetwork(state_size,
                                                   action_size).to(self.device)
        self.critic_optimizer = optim.Adam(
            self.local_critic_network.parameters(),
            lr=CRITIC_LEARNING_RATE,
            weight_decay=CRITIC_WEIGHT_DECAY)

        self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE,
                                          None)
        self.steps = 0
        self.random_process = OrnsteinUhlenbeckProcess(
            (num_agents, action_size), sigma=RANDOM_SIGMA, theta=RANDOM_THETA)

    def act(self, states, noise=True):
        """
        Returns an action vector based on the current game state.

        Params
        ======
        states (array_like): A matrix of game states (each row represents the
            state of an agent)
        noise (boolean): Add random noise to the predicted action.  Aids
            exploration of the environment during training.
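
        For states shaped (num_agents, state_size), the returned value is a
        (num_agents, action_size) numpy array with every entry clipped to
        [-1, 1].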
        """

        self.local_actor_network.eval()
        with torch.no_grad():
            actions = self.local_actor_network(
                torch.tensor(states, dtype=torch.float32).to(
                    self.device)).cpu().numpy()
        self.local_actor_network.train()
        if noise:
            actions = actions + self.random_process.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def vectorize_experiences(self, experiences):
        """Vectorizes experience objects for use by pytorch

        Params
        ======
            experiences (array_like of Experience objects): Experiences to
                vectorize
        """
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences
                       if e is not None])).float().to(self.device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences
                       if e is not None])).float().to(self.device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences
                       if e is not None])).float().to(self.device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences
                       if e is not None])).float().to(self.device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences if e is not None
                       ]).astype(np.uint8)).float().to(self.device)

        return (states, actions, rewards, next_states, dones)

    def normalize(self, to_normalize):
        """
        Normalize the input along dimension 0 (i.e. standardize each column)
        using the formula (value - mean) / std

        Params
        ======
        to_normalize (array_like): Values to normalize
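
        Example (illustrative): for a column of values [1., 3.] the mean is 2
        and the (unbiased) std is roughly 1.414, so the normalized column is
        approximately [-0.707, 0.707].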
        """

        std = to_normalize.std(0)
        mean = to_normalize.mean(0)
        return (to_normalize - mean) / (std + 1e-5)

    def soft_update(self, target_parameters, local_parameters):
        """
        Updates the given target network parameters with the local parameters
        using a soft update strategy: tau * local + (1-tau) * target
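
        For example, if TAU were 1e-3, each target weight would move 0.1% of
        the way toward the corresponding local weight per update, so the
        target networks slowly track the local networks.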
        """

        for target, local in zip(target_parameters, local_parameters):
            target.data.copy_(TAU * local.data + (1.0 - TAU) * target.data)

    def train(self, experiences):
        """
        Trains the actor and critic networks using a minibatch of experiences

        Params
        ======
        experiences (array_like of Experience): Minibatch of experiences
        """
        states, actions, rewards, next_states, dones = self.vectorize_experiences(
            experiences)
        rewards = self.normalize(rewards)

        # Use the target networks to compute the TD target:
        # r + GAMMA * Q_target(s', mu_target(s')), zeroed at episode end
        next_actions = self.target_actor_network(next_states)
        q_target = (rewards + GAMMA * self.target_critic_network(
            next_states, next_actions) * (1 - dones)).detach()

        # Calculate the predicted q value
        q_predicted = self.local_critic_network(states, actions)

        # Update critic network
        critic_loss = F.mse_loss(q_predicted, q_target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.local_critic_network.parameters(),
                                       1)
        self.critic_optimizer.step()

        # Update the actor using the deterministic policy gradient
        actions_predicted = self.local_actor_network(states)
        policy_loss = -self.local_critic_network(states,
                                                 actions_predicted).mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.target_actor_network.parameters(),
                         self.local_actor_network.parameters())
        self.soft_update(self.target_critic_network.parameters(),
                         self.local_critic_network.parameters())

    def learn(self, experience):
        """
        Tells the agent to learn from an experience.  This may not immediately
        result in training since this agent uses a replay buffer.

        Params
        ======
        experience (Experience): An experience used to teach the agent.
        """
        self.replay_buffer.add(experience)
        self.steps += 1
        if self.steps % STEPS_BETWEEN_TRAINING == 0 and len(
                self.replay_buffer) >= BATCH_SIZE:
            for i in range(ITERATIONS_PER_TRAINING):
                self.train(self.replay_buffer.sample(BATCH_SIZE))

    def save(self, filename):
        """Saves learned params of underlying networks to a checkpoint file.

        Params
        ======
            filename (string): Target file.  actor- and critic- are prepended
                for the actor and critic network, respectively.
        """
        torch.save(self.local_actor_network.state_dict(), "actor-" + filename)
        torch.save(self.local_critic_network.state_dict(),
                   "critic-" + filename)

    def load(self, filename):
        """Loads learned params generated by save() into underlying networks.

        Params
        ======
            filename (string): Path to file.  There should be an actor- and
                critic- version of this file.
        """
        self.local_actor_network.load_state_dict(
            torch.load("actor-" + filename))
        self.target_actor_network.load_state_dict(
            torch.load("actor-" + filename))

        self.local_critic_network.load_state_dict(
            torch.load("critic-" + filename))
        self.target_critic_network.load_state_dict(
            torch.load("critic-" + filename))

    def end_episode(self):
        """
        Tell the agent that an episode is complete.
        """
        self.random_process.reset()
        self.steps = 0
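

# Illustrative usage sketch: the class docstring says to drive the agent with
# act() and teach it with learn(); the loop below shows that contract against a
# random stand-in environment.  It assumes the supporting classes and
# hyperparameter constants used above (ActorNetwork, CriticNetwork,
# ReplayBuffer, OrnsteinUhlenbeckProcess, etc.) are in scope, and the
# Experience namedtuple here is a hypothetical stand-in matching the fields
# that vectorize_experiences() reads from the replay buffer.
if __name__ == "__main__":
    from collections import namedtuple

    import numpy as np

    Experience = namedtuple(
        "Experience", ["state", "action", "reward", "next_state", "done"])

    num_agents, state_size, action_size = 2, 33, 4
    agent = DDPGAgent(state_size, action_size, num_agents)
    states = np.random.randn(num_agents, state_size)
    for step in range(200):
        # (num_agents, action_size) matrix of actions with exploration noise
        actions = agent.act(states)
        # Stand-in for env.step(actions)
        next_states = np.random.randn(num_agents, state_size)
        rewards = np.random.randn(num_agents)
        dones = [False] * num_agents
        for s, a, r, ns, d in zip(states, actions, rewards, next_states, dones):
            agent.learn(Experience(s, a, r, ns, d))
        states = next_states
    agent.end_episode()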
Example #2
class MADDPGAgent():
    """
    Multi-agent deep deterministic policy gradient (MADDPG) agent as described
    in https://arxiv.org/pdf/1706.02275.pdf.

    This agent is meant to operate on low dimensional inputs, not raw pixels.

    To use the agent, get action predictions with act() and feed the resulting
    experiences to learn(); see the usage sketch after this class.
    """
    def __init__(self, state_size, action_size, num_agents):
        """ Initialize agent.

        Params
        ======
        state_size (integer): Size of input state vector
        action_size (integer): Size of action vector
        num_agents (integer): Number of simultaneous agents in the environment
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        # Keep all networks on the same device as the tensors built from the
        # replay buffer
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Actor (shared by all agents)
        self.actor = ActorNetwork(state_size, action_size).to(self.device)
        self.actor_target = ActorNetwork(state_size,
                                         action_size).to(self.device)
        self.soft_update(self.actor_target.parameters(),
                         self.actor.parameters(), 1)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=ACTOR_LEARNING_RATE)

        # Create one critic per agent
        self.critics = []
        self.critic_targets = []
        self.critic_optimizers = []
        for i in range(num_agents):

            # Critic
            # Note: the critic sees the concatenated states and actions of all
            # agents, hence state_size * num_agents and action_size * num_agents
            critic = CriticNetwork(state_size * num_agents,
                                   action_size * num_agents).to(self.device)
            self.critics.append(critic)
            self.critic_targets.append(
                CriticNetwork(state_size * num_agents,
                              action_size * num_agents).to(self.device))
            self.soft_update(self.critic_targets[-1].parameters(),
                             critic.parameters(), 1)
            self.critic_optimizers.append(
                optim.Adam(critic.parameters(),
                           lr=CRITIC_LEARNING_RATE,
                           weight_decay=CRITIC_WEIGHT_DECAY))

        self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE,
                                          None)
        self.steps = 0
        self.random_process = OrnsteinUhlenbeckProcess((1, action_size),
                                                       sigma=RANDOM_SIGMA,
                                                       theta=RANDOM_THETA)

    def act(self, all_states, noise=True):
        """
        Returns an action vector based on the current game state.

        Params
        ======
        all_states (array_like): A matrix of game states (each row represents the
            state of an agent)
        noise (boolean): Add random noise to the predicted action.  Aids
            exploration of the environment during training.
        """
        #print("states")
        #print(states)
        all_actions = []

        # Generate actions for each 'agent'
        for state in all_states:
            actor = self.actor
            actor.eval()
            with torch.no_grad():
                actions = actor(
                    torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(
                        self.device)).cpu().numpy()
            actor.train()
            if noise:
                actions = actions + self.random_process.sample()
            actions = np.clip(actions, -1, 1)
            all_actions.append(actions)
        return np.vstack(all_actions)

    def predict_and_vectorize_actions(self, experiences, agent_index):
        """
        Return a vectorized form of the predicted actions.  As dictated by the
        MADDPG algorithm, the agent corresponding to agent_index predicts its
        action using the actor, while the remaining agents use the actions
        already contained in the experience.

        Params
        ======
        experiences (array_like of Experience): Minibatch of experiences
        agent_index (integer): Offset into the agent array.  This is the
            agent making a prediction
        """
        actions = []
        for i in range(self.num_agents):
            if i == agent_index:
                actor = self.actor
                states = torch.from_numpy(
                    np.vstack([
                        e.states[i] for e in experiences if e is not None
                    ])).float().to(self.device)
                actions.append(actor(states))
            else:
                actions.append(
                    torch.from_numpy(
                        np.vstack([
                            e.actions[i] for e in experiences if e is not None
                        ])).float().to(self.device))
        return torch.cat(actions, dim=1)

    def predict_and_vectorize_next_actions(self, experiences):
        """
        Predicts next_actions based on next_states from the experience minibatch
        using the agents' actor targets.  The resulting actions are returned
        as torch tensors.

        Params
        ======
        experiences (array_like of Experience): Minibatch of experiences
        """
        next_actions = []
        for i in range(self.num_agents):
            next_states = torch.from_numpy(
                np.vstack([
                    e.next_states[i] for e in experiences if e is not None
                ])).float().to(self.device)
            next_actions.append(self.actor_target(next_states).detach())
        return torch.cat(next_actions, dim=1)

    def vectorize_actions_and_states(self, experiences):
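        """
        Concatenates each experience's per-agent actions, states and
        next_states into full joint vectors (one row per experience) for use
        by the centralized critics.

        Params
        ======
        experiences (array_like of Experience): Minibatch of experiences
        """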
        actions = torch.from_numpy(
            np.vstack([
                np.concatenate(e.actions) for e in experiences if e is not None
            ])).float().to(self.device)
        full_states = torch.from_numpy(
            np.vstack([
                np.concatenate(e.states) for e in experiences if e is not None
            ])).float().to(self.device)
        full_next_states = torch.from_numpy(
            np.vstack([
                np.concatenate(e.next_states) for e in experiences
                if e is not None
            ])).float().to(self.device)
        return (actions, full_states, full_next_states)

    def vectorize_per_agent_data(self, experiences, agent_index):
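        """
        Extracts the states, rewards, next_states and dones belonging to the
        agent at agent_index from the experience minibatch as torch tensors.

        Params
        ======
        experiences (array_like of Experience): Minibatch of experiences
        agent_index (integer): Offset of the agent whose data to extract
        """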
        states = torch.from_numpy(
            np.vstack([
                e.states[agent_index] for e in experiences if e is not None
            ])).float().to(self.device)
        rewards = torch.from_numpy(
            np.vstack([
                e.rewards[agent_index] for e in experiences if e is not None
            ])).float().to(self.device)
        next_states = torch.from_numpy(
            np.vstack([
                e.next_states[agent_index] for e in experiences
                if e is not None
            ])).float().to(self.device)
        dones = torch.from_numpy(
            np.vstack([
                e.dones[agent_index] for e in experiences if e is not None
            ]).astype(np.uint8)).float().to(self.device)

        return (states, rewards, next_states, dones)

    def normalize(self, to_normalize):
        """
        Normalize the input along dimension 0 (i.e. standardize each column)
        using the formula (value - mean) / std

        Params
        ======
        to_normalize (array_like): Values to normalize
        """

        std = to_normalize.std(0)
        mean = to_normalize.mean(0)
        return (to_normalize - mean) / (std + 1e-5)

    def soft_update(self, target_parameters, local_parameters, tau=TAU):
        """
        Updates the given target network parameters with the local parameters
        using a soft update strategy: tau * local + (1-tau) * target
        """

        for target, local in zip(target_parameters, local_parameters):
            target.data.copy_(tau * local.data + (1.0 - tau) * target.data)

    def train(self, experiences):
        """
        Trains the actor and critic networks using a minibatch of experiences

        Params
        ======
        experiences (array_like of Experience): Minibatch of experiences
        """

        # Transform agent-independent data into vectorized tensors
        next_actions = self.predict_and_vectorize_next_actions(experiences)
        actions, full_states, full_next_states = self.vectorize_actions_and_states(
            experiences)

        # Iterate through each agent
        for i in range(self.num_agents):

            # Transform agent dependent data into vectorized tensors
            states, rewards, next_states, dones = self.vectorize_per_agent_data(
                experiences, i)
            rewards = self.normalize(rewards)

            # Grab networks for this agent offset
            critic = self.critics[i]
            critic_target = self.critic_targets[i]
            critic_optimizer = self.critic_optimizers[i]
            actor = self.actor
            actor_target = self.actor_target
            actor_optimizer = self.actor_optimizer

            # Use the target critic to compute the TD target:
            # r + GAMMA * Q_target(s', a'), zeroed at episode end
            q_target = (rewards + GAMMA * critic_target(
                full_next_states, next_actions) * (1 - dones)).detach()

            # Calculate the predicted q value
            q_predicted = critic(full_states, actions)

            # Update critic network
            critic_loss = F.mse_loss(q_predicted, q_target)
            critic_optimizer.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(critic.parameters(), 1)
            critic_optimizer.step()

            # Update the actor using the deterministic policy gradient
            actions_predicted = self.predict_and_vectorize_actions(
                experiences, i)
            policy_loss = -critic(full_states, actions_predicted).mean()
            actor_optimizer.zero_grad()
            policy_loss.backward()
            actor_optimizer.step()

            # Soft update target networks
            self.soft_update(actor_target.parameters(), actor.parameters())
            self.soft_update(critic_target.parameters(), critic.parameters())

    def learn(self, experience):
        """
        Tells the agent to learn from an experience.  This may not immediately
        result in training since this agent uses a replay buffer.

        Params
        ======
        experience (Experience): An experience used to teach the agent.
        """
        self.replay_buffer.add(experience)
        self.steps += 1
        if self.steps % STEPS_BETWEEN_TRAINING == 0 and len(
                self.replay_buffer) >= BATCH_SIZE:
            self.train(self.replay_buffer.sample(BATCH_SIZE))

    def save(self, filename):
        """Saves learned params of underlying networks to a checkpoint file.

        Params
        ======
            filename (string): Target file.  actor- and critic- are prepended
                for the actor and critic networks, respectively.  Each agent
                has its own critic, so the critic networks are saved as
                critic-i- where i represents the agent offset (0 based).
        """
        torch.save(self.actor.state_dict(), "actor-" + filename)
        for i in range(self.num_agents):
            torch.save(self.critics[i].state_dict(),
                       "critic-" + str(i) + "-" + filename)

    def load(self, filename):
        """Loads learned params generated by save() into underlying networks.

        Params
        ======
            filename (string): Path to file.  There should be an actor- and
                critic- version of this file.
        """
        self.actor.load_state_dict(torch.load("actor-" + filename))
        self.actor_target.load_state_dict(torch.load("actor-" + filename))
        for i in range(self.num_agents):
            self.critics[i].load_state_dict(
                torch.load("critic-" + str(i) + "-" + filename))
            self.critic_targets[i].load_state_dict(
                torch.load("critic-" + str(i) + "-" + filename))

    def end_episode(self):
        """
        Tell the agent that an episode is complete.
        """
        self.random_process.reset()
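

# Illustrative usage sketch: the MADDPG agent is driven the same way as the
# DDPG agent above, except that each Experience holds per-agent arrays
# (states, actions, rewards, next_states, dones), matching how train() indexes
# them.  The supporting classes and hyperparameter constants used above are
# assumed to be in scope; the Experience namedtuple and the random
# "environment" below are hypothetical stand-ins.
if __name__ == "__main__":
    from collections import namedtuple

    import numpy as np

    Experience = namedtuple(
        "Experience",
        ["states", "actions", "rewards", "next_states", "dones"])

    num_agents, state_size, action_size = 2, 24, 2
    agent = MADDPGAgent(state_size, action_size, num_agents)
    states = np.random.randn(num_agents, state_size)
    for step in range(200):
        # One row of actions per agent, exploration noise included
        actions = agent.act(states)
        # Stand-in for env.step(actions)
        next_states = np.random.randn(num_agents, state_size)
        rewards = np.random.randn(num_agents)
        dones = np.zeros(num_agents, dtype=bool)
        agent.learn(Experience(states, actions, rewards, next_states, dones))
        states = next_states
    agent.end_episode()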