Example #1
import pickle
from pathlib import Path

import numpy as np
import torch
import torch.nn.functional as F

# Agent, Actor, TwinCritic and soft_update are provided elsewhere in the
# repository; self.buffer is expected to be set up by the Agent base class.


class TD3Agent(Agent):
    """
        The TD3Agent class implements a trainable TD3 agent.

        Parameters
        ----------

        logger: Logger
            The variable specifies a logger for model management, plotting and printing.
        obs_dim: int
            The variable specifies the dimension of observation space vector.
         action_space: ndarray
            The variable specifies the action space of environment.
        userconfig:
            The variable specifies the config settings.
        """
    def __init__(self, logger, obs_dim, action_space, userconfig):
        super().__init__(logger=logger,
                         obs_dim=obs_dim,
                         action_dim=action_space.shape[0],
                         userconfig=userconfig)

        self._observation_dim = obs_dim
        self._action_space = action_space
        self._action_n = action_space.shape[0]
        self._config = {
            "eps": 0.05,
            "discount": 0.95,
            "buffer_size": int(1e5),
            "batch_size": 128,
            "learning_rate_actor": 0.0002,
            "learning_rate_critic": 0.0002,
            "hidden_sizes": [256, 256],
            'tau': 0.0001,
            'noise': 0.2,
            'noise_clip': 0.5
        }

        self._config.update(userconfig)
        self._eps = self._config['eps']
        self._tau = self._config['tau']
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.eval_mode = False

        if self._config.get('lr_milestones') is None:
            raise ValueError(
                'lr_milestones argument cannot be None!\nExample: --lr_milestones=100 200 300'
            )

        # lr_milestones is expected as a list whose first element is a
        # space-separated string of milestones, e.g. '100 200 300'.
        lr_milestones = [
            int(x) for x in (self._config['lr_milestones'][0]).split(' ')
        ]

        # Critics
        self.critics = TwinCritic(
            self._observation_dim,
            self._action_n,
            hidden_sizes=self._config['hidden_sizes'],
            learning_rate=self._config['learning_rate_critic'],
            lr_milestones=lr_milestones,
            lr_factor=self._config['lr_factor'],
            device=self._config['device'])

        self.critics_target = TwinCritic(
            self._observation_dim,
            self._action_n,
            hidden_sizes=self._config['hidden_sizes'],
            learning_rate=self._config['learning_rate_critic'],
            lr_milestones=lr_milestones,
            lr_factor=self._config['lr_factor'],
            device=self._config['device'])

        # Actor
        self.actor = Actor(self._observation_dim,
                           self._action_n,
                           hidden_sizes=self._config['hidden_sizes'],
                           learning_rate=self._config['learning_rate_actor'],
                           lr_milestones=lr_milestones,
                           lr_factor=self._config['lr_factor'],
                           device=self._config['device'])
        self.actor_target = Actor(
            self._observation_dim,
            self._action_n,
            hidden_sizes=self._config['hidden_sizes'],
            learning_rate=self._config['learning_rate_actor'],
            lr_milestones=lr_milestones,
            lr_factor=self._config['lr_factor'],
            device=self._config['device'])

    def eval(self):
        self.eval_mode = True

    def train_mode(self):
        self.eval_mode = False

    def act(self, observation, noise=0, evaluation=False):
        state = torch.from_numpy(observation).float().to(self.device)
        action = self.actor.forward(state)
        action = action.detach().cpu().numpy()[0]

        if noise != 0 and not evaluation:
            action = (action +
                      np.random.normal(0, noise, size=action.shape[0]))
        return action.clip(-1, 1)

    def schedulers_step(self):
        self.critics.lr_scheduler.step()
        self.critics_target.lr_scheduler.step()
        self.actor.lr_scheduler.step()
        self.actor_target.lr_scheduler.step()

    def store_transition(self, transition):
        self.buffer.add_transition(transition)

    @staticmethod
    def load_model(fpath):
        with open(Path(fpath), 'rb') as inp:
            return pickle.load(inp)

    def train(self, total_step_counter, iter_fit=32):
        losses = []

        for i in range(iter_fit):
            data = self.buffer.sample(batch_size=self._config['batch_size'])
            s = torch.FloatTensor(np.stack(data[:, 0])).to(self.device)

            s_next = torch.FloatTensor(np.stack(data[:, 3])).to(self.device)
            a = torch.FloatTensor(np.stack(
                data[:, 1])[:, None]).squeeze(dim=1).to(self.device)

            rew = torch.FloatTensor(np.stack(
                data[:, 2])[:, None]).squeeze(dim=1).to(self.device)

            done = torch.FloatTensor(np.stack(
                data[:, 4])[:,
                            None]).squeeze(dim=1).to(self.device)  # done flag

            # Target policy smoothing (TD3): perturb the target action with
            # clipped Gaussian noise.
            noise = torch.FloatTensor(a.cpu()).data.normal_(
                0, self._config['noise']).to(self.device)
            noise = noise.clamp(-self._config['noise_clip'],
                                self._config['noise_clip'])
            a_next = (self.actor_target(s_next).to(self.device) + noise).clamp(
                -1, 1)

            # Clipped double-Q learning: use the minimum of the two target critics.
            Q1_target, Q2_target = self.critics_target(s_next, a_next)
            target_Q = torch.min(Q1_target,
                                 Q2_target).squeeze(dim=1).to(self.device)

            # Bellman target (terminal transitions are masked out via the done flag)
            targets = rew + self._config['discount'] * target_Q * (1.0 - done)

            # optimize critic
            targets = targets.to(self.device)
            Q1_current, Q2_current = self.critics(s, a)
            Q1_current = Q1_current.squeeze(dim=1).to(self.device)
            Q2_current = Q2_current.squeeze(dim=1).to(self.device)
            critic_loss = F.mse_loss(Q1_current, targets) + F.mse_loss(
                Q2_current, targets)

            losses.append(critic_loss.item())
            self.critics.optimizer.zero_grad()
            critic_loss.backward()
            self.critics.optimizer.step()

            # Delayed policy and target-network updates (TD3 policy delay)
            if ((total_step_counter - 1) * iter_fit + i +
                    1) % self._config['update_target_every'] == 0:
                # optimize actor
                actions = self.actor.forward(s)
                actor_loss = -self.critics.Q1(s, actions).mean()
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                self.actor.optimizer.step()
                # Polyak-average the target networks toward the online networks

                soft_update(self.critics_target, self.critics, self._tau)
                soft_update(self.actor_target, self.actor, self._tau)

        return losses
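
The TD3Agent above relies on a soft_update helper that is defined elsewhere in the repository. A minimal sketch of such a helper, assuming it performs the standard Polyak averaging used for TD3/DDPG target networks (target <- tau * source + (1 - tau) * target):

import torch


def soft_update(target_net: torch.nn.Module,
                source_net: torch.nn.Module,
                tau: float) -> None:
    """Blend the source parameters into the target network in place."""
    with torch.no_grad():
        for target_param, source_param in zip(target_net.parameters(),
                                              source_net.parameters()):
            target_param.data.mul_(1.0 - tau)
            target_param.data.add_(tau * source_param.data)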
Example #2
import torch
import torch.nn.functional as F
import torch.optim as optim

# Actor, Critic, OUNoise and ExperienceReplayLog are provided elsewhere in the
# repository.


class DDPGAgent:
    def __init__(self,
                 env,
                 gamma,
                 tau,
                 buffer_maxlen,
                 critic_learning_rate,
                 actor_learning_rate,
                 max_action=1):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.noise = OUNoise(env.action_space)
        self.iter = 0.0
        self.noisy = False
        self.max_action = max_action

        print(self.action_dim)
        print(self.obs_dim)

        # RL hyperparameters
        self.gamma = gamma
        self.tau = tau

        # Initialize critic and actor networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim,
                                    self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim,
                           self.max_action).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim,
                                  self.max_action).to(self.device)

        # Copy online network parameters into the target networks
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        # Set Optimization algorithms
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)

        self.replay_buffer = ExperienceReplayLog(buffer_maxlen)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()

        # Add OU exploration noise when noisy mode is enabled
        if self.noisy:
            action = self.noise.get_action(action, self.iter)
            self.iter += 1

        return action

    def update(self, batch_size):
        # Sample a single batch of transitions from the replay buffer
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(
            batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        # Compute current Q values and the bootstrapped target
        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch,
                                            next_actions.detach())
        # masks are assumed to hold 1 - done, so terminal states contribute no bootstrap value
        expected_Q = reward_batch + self.gamma * next_Q * masks

        # Update Critic network
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())

        self.critic_optimizer.zero_grad()
        q_loss.backward()

        self.critic_optimizer.step()

        # Update Actor network
        policy_loss = -self.critic.forward(
            state_batch, self.actor.forward(state_batch)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Update Actor and Critic target networks
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))
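
A rough training-loop sketch for DDPGAgent with illustrative hyperparameters, assuming the older Gym reset/step API and that ExperienceReplayLog exposes push(state, action, reward, next_state, mask) and __len__ methods (the buffer's interface is not shown above, so adapt these calls to the actual API):

import gym

env = gym.make("Pendulum-v1")
agent = DDPGAgent(env,
                  gamma=0.99,
                  tau=0.005,
                  buffer_maxlen=100_000,
                  critic_learning_rate=1e-3,
                  actor_learning_rate=1e-4)
agent.noisy = True  # enable OU exploration noise
batch_size = 64

for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        # push() and len() are assumed buffer methods; mask = 1 - done
        agent.replay_buffer.push(state, action, reward, next_state,
                                 1.0 - float(done))
        state = next_state
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)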