Example #1

# Imports assumed by both examples. Actor, Critic and ReplayBuffer come from
# the surrounding project and are not shown on this page; a minimal OUNoise
# sketch is given after Example #1.
import os
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# assumed module-level device selection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class SingleDDPGAgent:
    """
        Single agent DDPG.
        Interacts with and learns from the environment.
    """
    def __init__(self, state_size, action_size, cfg, num_agents=1, agent_id=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            cfg (config object): main configuration with other passed settings
            num_agents (int): optional (default: 1). If >1, the critic's state and
                            action input sizes are multiplied accordingly (used by MADDPG).
            agent_id (int): optional (default: 0). Set agent id for MADDPG.
        """
        print("Initializing single DDPG agent!")

        self.state_size = state_size
        self.action_size = action_size
        random.seed(cfg.random_seed)
        self.n_agents = num_agents
        self.agent_id = agent_id

        self.cfg = cfg

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, cfg.random_seed,
                                 cfg.dense_layers_actor).to(device)
        self.actor_target = Actor(state_size, action_size, cfg.random_seed,
                                  cfg.dense_layers_actor).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=cfg.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * num_agents,
                                   action_size * num_agents, cfg.random_seed,
                                   cfg.dense_layers_critic).to(device)
        self.critic_target = Critic(state_size * num_agents,
                                    action_size * num_agents, cfg.random_seed,
                                    cfg.dense_layers_critic).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=cfg.lr_critic,
                                           weight_decay=cfg.weight_decay)

        self.hard_copy_weights(self.critic_local, self.critic_target)
        self.hard_copy_weights(self.actor_local, self.actor_target)

        self.t_step = 0

        # Noise process
        self.noise = OUNoise(action_size,
                             cfg.random_seed,
                             theta=cfg.theta_ou,
                             sigma=cfg.sigma_ou)

        # Replay memory
        self.memory = ReplayBuffer(action_size, cfg.buffer_size,
                                   cfg.batch_size, cfg.random_seed, cfg)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        max_prio = self.memory.get_max_priority()
        self.memory.add(state, action, reward, next_state, max_prio, done)

        # Learn every cfg.update_every time steps,
        # provided enough samples are available in memory.
        self.t_step = (self.t_step + 1) % self.cfg.update_every
        if self.t_step == 0:
            if len(self.memory) > self.cfg.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.cfg.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state.view(
                1, -1)).squeeze(0).cpu().numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def target_act(self, state):
        """Let the target network return an action."""
        self.actor_target.eval()
        with torch.no_grad():
            action_target = self.actor_target(state)

        # clamp on the tensor directly; callers such as MADDPG's learn()
        # expect a torch tensor, not a numpy array
        return torch.clamp(action_target, -1, 1)

    def reset(self):
        self.t_step = 0
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of batched (s, a, r, s', prio, done, indices) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, priorities, dones, indices = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)

        if self.cfg.prioritized_replay:
            # importance-sampling weights, normalized so the largest weight is 1
            weights = 1. / (
                (self.cfg.batch_size * priorities)**self.cfg.priority_beta)
            weights /= weights.max()
            # new transition priorities from the absolute TD-error between
            # target and local network predictions
            td_errors = (Q_targets - Q_expected).detach().abs().squeeze()
            self.memory.update_prios(indices, td_errors.cpu().numpy())
            # weight each squared TD-error by its bias-annealing weight
            critic_loss = (weights * (Q_expected - Q_targets.detach())**2).mean()
        else:
            critic_loss = F.mse_loss(Q_expected, Q_targets.detach())
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.cfg.tau)
        self.soft_update(self.actor_local, self.actor_target, self.cfg.tau)

    @staticmethod
    def hard_copy_weights(local_model, target_model):
        """Update model parameters.

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_weights(self, model_save_path, suffix=""):
        """
        Simple method to save network weights.
        """
        # actors
        torch.save(
            self.actor_local.state_dict(),
            os.path.join(model_save_path,
                         "weights_actor_local{:s}.pth".format(suffix)))
        torch.save(
            self.actor_target.state_dict(),
            os.path.join(model_save_path,
                         "weights_actor_target{:s}.pth".format(suffix)))
        # critics
        torch.save(
            self.critic_local.state_dict(),
            os.path.join(model_save_path,
                         "weights_critic_local{:s}.pth".format(suffix)))
        torch.save(
            self.critic_target.state_dict(),
            os.path.join(model_save_path,
                         "weights_critic_target{:s}.pth".format(suffix)))

    def load_weights(self, model_save_path, suffix=""):
        """
        Method to load network weights from saved files.
        """
        self.actor_local.load_state_dict(
            torch.load(
                os.path.join(model_save_path,
                             "weights_actor_local{:s}.pth".format(suffix))))
        self.actor_target.load_state_dict(
            torch.load(
                os.path.join(model_save_path,
                             "weights_actor_target{:s}.pth".format(suffix))))

        self.critic_local.load_state_dict(
            torch.load(
                os.path.join(model_save_path,
                             "weights_critic_local{:s}.pth".format(suffix))))
        self.critic_target.load_state_dict(
            torch.load(
                os.path.join(model_save_path,
                             "weights_critic_target{:s}.pth".format(suffix))))
Example #2
class MultiDDPGAgent:
    """ Multi-agent DDPG implementation."""
    def __init__(self, state_size, action_size, num_agents, cfg):
        """Initialize a MADDPG Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): Number of agents in environment
            cfg (config object): main configuration with other settings
        """
        print("Initializing MADDPG agent with {:d} agents!".format(num_agents))

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(cfg.random_seed)

        self.cfg = cfg

        # initialize the list of single agents (e.g. 2 for Tennis)
        self.agents = []
        for aid in range(num_agents):
            agent = SingleDDPGAgent(state_size,
                                    action_size,
                                    cfg,
                                    num_agents=num_agents,
                                    agent_id=aid)
            self.agents.append(agent)

        self.t_step = 0

        # Noise process
        self.noise_scale = self.cfg.noise_scale
        self.noise = OUNoise(action_size,
                             cfg.random_seed,
                             theta=cfg.theta_ou,
                             sigma=cfg.sigma_ou)

        # while this flag is set, the replay buffer is only being filled
        # with random memories; no learning takes place yet
        self.prefetching = True

        # Replay memory for shared experiences (all agents)
        self.memory = ReplayBuffer(action_size, cfg.buffer_size,
                                   cfg.batch_size, cfg.random_seed, cfg)

    def add_noise(self):
        if self.cfg.use_ou:
            return self.noise_scale * self.noise.sample()
        else:
            # Gaussian noise
            return self.noise_scale * np.random.normal(0, 1.0,
                                                       self.action_size)

    def reset(self):
        self.t_step = 0
        self.noise.reset()

    def act(self, state_all, add_noise=True):
        """
        Let all agents act.
        Receives full state tensor of all agents
        and outputs all actions (num_agents x action_size).
        """
        actions = []
        for aid in range(self.num_agents):
            # only add noise after pre-loading memories
            noise = 0
            if not self.prefetching and add_noise:
                noise = self.add_noise()
            # clip again so the noisy action stays within the valid range
            actions.append(
                np.clip(
                    self.agents[aid].act(state_all[aid], add_noise=False) +
                    noise, -1, 1))

        return actions

    def _target_act(self, states_all):
        """
        Internal function used by learn function.
        Gets target network actions for all agents.
        """
        target_actions = []
        for aid in range(self.num_agents):
            # states_all format (batch size, num_agents, state size)
            target_actions.append(self.agents[aid].target_act(
                states_all[:, aid, :]))

        return target_actions

    def step(self, states, actions, rewards, next_states, dones):
        """ Save experiences in the global memory.
            Once the memory is large enough, sample from it to train each agent.
        """
        max_prio = self.memory.get_max_priority()
        self.memory.add(states, actions, rewards, next_states, max_prio, dones)

        # start training if memory size large enough.
        if len(self.memory) >= max(self.cfg.batch_size, self.cfg.init_replay):
            if self.prefetching:
                self.prefetching = False
                print("Pre-loading of memories complete, starting training!")
        else:
            return

        self.t_step = (self.t_step + 1) % self.cfg.learn_every
        if self.t_step == 0:
            for _ in range(self.cfg.learn_steps):
                self.learn_all()

        self.noise_scale = max(self.noise_scale * self.cfg.noise_decay,
                               self.cfg.noise_scale_min)

    def learn_all(self):
        """Generates a full batch of samples and runs one learning step per agent."""
        samples = self.memory.sample()
        for aid in range(self.num_agents):
            # learn() already soft-updates this agent's target networks
            self.learn(samples, aid)

    def learn(self, samples, agent_number):
        """
            Update critic and actor networks of given agent using provided
            samples from replay memory.
        """
        # from memory
        states, actions, rewards, next_states, priorities, dones, indices = samples

        # creating full states and next_states with shape (batch_size, -1)
        batch_size = self.cfg.batch_size
        full_states = states.view(batch_size, -1)
        full_next_states = next_states.view(batch_size, -1)

        # selecting the correct agent
        agent = self.agents[agent_number]

        # 1. Update of critic
        agent.critic_optimizer.zero_grad()

        # critic loss = batch mean of the squared TD-error (y - Q(s, a))^2, with
        # y = current reward + discount * Q'(s_{t+1}, a_{t+1}) from target critic Q'

        # per-agent target actions, concatenated along dim 1 to shape
        # (batch_size, num_agents * action_size)
        target_actions = torch.cat(self._target_act(
            next_states.view(batch_size, self.num_agents, -1)),
                                   dim=1)

        # get next q values from target critic
        q_next = agent.critic_target(full_next_states,
                                     target_actions.to(device))

        y = rewards[:, agent_number].view(-1, 1) + \
            self.cfg.gamma * q_next * (1 - dones[:, agent_number].view(-1, 1))

        q = agent.critic_local(full_states, actions.view(batch_size, -1))

        if self.cfg.loss_l == 1:
            huber_loss = torch.nn.SmoothL1Loss()
            critic_loss = huber_loss(q, y.detach())
        elif self.cfg.loss_l == 2:
            critic_loss = F.mse_loss(q, y.detach())
        else:
            raise ValueError("L{:d} loss is not supported!".format(
                self.cfg.loss_l))

        # optimization of critic (local) loss
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # 2. Update of actor network using policy gradient
        agent.actor_optimizer.zero_grad()

        # actions from each agent's current policy; detach all but the learning
        # agent so gradients only flow through that agent's actor
        obs_per_agent = states.view(batch_size, self.num_agents, -1)
        q_input = []
        for i in range(self.num_agents):
            action_i = self.agents[i].actor_local(obs_per_agent[:, i, :])
            q_input.append(action_i if i == agent_number else action_i.detach())

        q_input = torch.cat(q_input, dim=1)

        # combine all observations and actions as input to the centralized critic

        # get the actual policy gradient here
        actor_loss = -agent.critic_local(full_states, q_input).mean()

        # optimize
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.actor_local.parameters(), 1)
        agent.actor_optimizer.step()

        # soft update the models
        agent.soft_update(agent.critic_local, agent.critic_target,
                          self.cfg.tau)
        agent.soft_update(agent.actor_local, agent.actor_target, self.cfg.tau)

    def soft_update_all(self):
        """soft update targets"""
        for agent in self.agents:
            agent.soft_update(agent.critic_local, agent.critic_target,
                              self.cfg.tau)
            agent.soft_update(agent.actor_local, agent.actor_target,
                              self.cfg.tau)

    def save_weights(self, model_save_path):
        """
        Simple method to save network weights.
        """
        for aid, agent in enumerate(self.agents):
            agent.save_weights(model_save_path, suffix="_{:d}".format(aid))

    def load_weights(self, model_save_path):
        """
        Method to load network weights from saved files.
        """
        for aid, agent in enumerate(self.agents):
            agent.load_weights(model_save_path, suffix="_{:d}".format(aid))
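
For context, a hedged usage sketch for MultiDDPGAgent on a Tennis-like
two-agent task follows. Every cfg field mirrors an attribute the classes above
actually read, but the concrete values, the sizes (24/2/2) and the `env` object
are illustrative assumptions, not part of the original code.

from types import SimpleNamespace

# hypothetical configuration; every value is an illustrative assumption
cfg = SimpleNamespace(
    random_seed=0, buffer_size=int(1e6), batch_size=128, init_replay=1024,
    gamma=0.99, tau=1e-3, lr_actor=1e-4, lr_critic=1e-3, weight_decay=0.0,
    dense_layers_actor=(256, 128), dense_layers_critic=(256, 128),
    update_every=1, learn_every=1, learn_steps=1, loss_l=2,
    use_ou=True, theta_ou=0.15, sigma_ou=0.2,
    noise_scale=1.0, noise_decay=0.999, noise_scale_min=0.1,
    prioritized_replay=False, priority_beta=0.4)

agent = MultiDDPGAgent(state_size=24, action_size=2, num_agents=2, cfg=cfg)

for episode in range(1000):
    states = env.reset()      # `env` is a placeholder: returns (num_agents, state_size)
    agent.reset()
    while True:
        actions = agent.act(states)
        next_states, rewards, dones = env.step(actions)  # assumed env API
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        if any(dones):
            break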