Example #1
import logging

import numpy as np
import pandas as pd
import torch

# Actor, Critic, ReplayMemory and the tt() tensor helper are project-local
# modules assumed to be importable; a possible sketch of them is given after
# this example.


class TD3Agent:
    def __init__(self,
                 env,
                 n_episodes=3000,
                 time_steps=500,
                 gamma=0.99,
                 batch_size=64,
                 memory_capacity=100000,
                 tau=1e-2,
                 lr=0.00001,
                 pi_update_steps=2,
                 render=False):
        self.env = env
        self.gamma = gamma
        self.time_steps = time_steps
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity
        self.tau = tau
        self.lr = lr
        self.pi_update_steps = pi_update_steps
        self.render = render

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Create actor and critic networks
        self.actor = Actor(state_dim=self.state_dim,
                           action_dim=self.action_dim).to(self.device)
        self.actor_target = Actor(state_dim=self.state_dim,
                                  action_dim=self.action_dim).to(self.device)

        self.critic = Critic(state_dim=self.state_dim,
                             action_dim=self.action_dim).to(self.device)
        self.critic_target = Critic(state_dim=self.state_dim,
                                    action_dim=self.action_dim).to(self.device)

        # Same weights for target network as for original network
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.critic_loss_fct = torch.nn.MSELoss()

        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=self.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.lr * 10)

        self.n_episodes = n_episodes
        self.replay_memory = ReplayMemory(capacity=self.memory_capacity,
                                          batch_size=batch_size)

        self.res = pd.DataFrame({
            'episodes': [],
            'states': [],
            'rewards': [],
            'steps': [],
            'actor_losses': [],
            'critic_losses': [],
        })

    def train(self):
        for i in range(self.n_episodes):
            state = self.env.reset()

            for step in range(self.time_steps):
                if self.render:
                    self.env.render()

                state = tt(state)
                action = self.actor(state).cpu().detach().numpy()

                noise = np.random.normal(0,
                                         0.1,
                                         size=self.env.action_space.shape[0])
                action = np.clip(action + noise, self.env.action_space.low[0],
                                 self.env.action_space.high[0])
                next_state, reward, done, _ = self.env.step(action)

                # Save step in memory
                self.replay_memory.append(state=state,
                                          action=action,
                                          reward=reward,
                                          next_state=next_state,
                                          done=done)

                res = {
                    'episodes': i + 1,
                    'states': state.tolist(),
                    'rewards': reward,
                    'steps': step + 1
                }

                # Start training, if batch size reached
                if len(self.replay_memory) < self.batch_size:
                    self.res = pd.concat([self.res, pd.DataFrame([res])],
                                         ignore_index=True)
                    continue

                # Sample batch from memory
                states, actions, rewards, next_states, dones = self.replay_memory.sample_batch()

                # Critic loss
                q1, q2 = self.critic(states, actions)
                next_actions = self.actor_target(next_states)

                # Target policy smoothing: clipped Gaussian noise on the target actions
                noise = (torch.randn_like(actions) * 0.2).clamp(-0.5, 0.5)
                next_actions = (next_actions + noise).clamp(
                    self.env.action_space.low[0],
                    self.env.action_space.high[0])
                # Get next state q values by Clipped Double Q-Learning
                q1_ns, q2_ns = self.critic_target(next_states,
                                                  next_actions.detach())
                q_ns = torch.min(q1_ns, q2_ns)
                # Do not bootstrap over terminal transitions; keep the target fixed
                td_target = (rewards +
                             self.gamma * (1 - dones.float()) * q_ns).detach()

                loss_critic = self.critic_loss_fct(
                    q1, td_target) + self.critic_loss_fct(q2, td_target)
                res['critic_losses'] = float(loss_critic)

                # Optimize critic
                self.critic_optim.zero_grad()
                loss_critic.backward()
                self.critic_optim.step()

                # Delayed Policy Updates
                if step % self.pi_update_steps == 0:
                    q1, _ = self.critic(states, self.actor(states))
                    # Actor loss
                    loss_actor = -q1.mean()
                    res['actor_losses'] = float(loss_actor)

                    # Optimize actor
                    self.actor_optim.zero_grad()
                    loss_actor.backward()
                    self.actor_optim.step()

                    # update target networks
                    for param, target_param in zip(
                            self.critic.parameters(),
                            self.critic_target.parameters()):
                        target_param.data.copy_(self.tau * param.data +
                                                (1 - self.tau) *
                                                target_param.data)

                    for param, target_param in zip(
                            self.actor.parameters(),
                            self.actor_target.parameters()):
                        target_param.data.copy_(self.tau * param.data +
                                                (1 - self.tau) *
                                                target_param.data)

                self.res = pd.concat([self.res, pd.DataFrame([res])],
                                     ignore_index=True)
                state = next_state
                if done:
                    break

            logging.info(f'Episode {i + 1}:')
            logging.info(
                f'\t Steps: {self.res.loc[self.res["episodes"] == i + 1]["steps"].max()}'
            )
            logging.info(
                f'\t Reward: {self.res.loc[self.res["episodes"] == i + 1]["rewards"].sum()}'
            )

        self.env.close()
        return self.res
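
Both examples rely on helpers that are not shown on this page: Actor and Critic networks, a ReplayMemory buffer and a tt() conversion function. Below is a minimal sketch of what those pieces could look like for the TD3 agent in Example #1; the layer sizes, buffer layout and device handling are assumptions, not the original project's code. The DDPG agent in Example #2 expects a Critic that returns a single Q-value rather than a twin pair.

import random
from collections import deque

import numpy as np
import torch
import torch.nn as nn

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def tt(x):
    # Convert an array-like (state, batch, ...) to a float32 tensor on DEVICE.
    return torch.as_tensor(x, dtype=torch.float32).to(DEVICE)


class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, hidden_dim), nn.ReLU(),
                                 nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
                                 nn.Linear(hidden_dim, action_dim), nn.Tanh())

    def forward(self, state):
        return self.net(state)


class Critic(nn.Module):
    # Twin Q-networks, as required by the clipped double Q-learning step above.
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        self.q1 = nn.Sequential(nn.Linear(state_dim + action_dim, hidden_dim),
                                nn.ReLU(), nn.Linear(hidden_dim, hidden_dim),
                                nn.ReLU(), nn.Linear(hidden_dim, 1))
        self.q2 = nn.Sequential(nn.Linear(state_dim + action_dim, hidden_dim),
                                nn.ReLU(), nn.Linear(hidden_dim, hidden_dim),
                                nn.ReLU(), nn.Linear(hidden_dim, 1))

    def forward(self, state, action):
        sa = torch.cat([state, action], dim=-1)
        return self.q1(sa), self.q2(sa)


class ReplayMemory:
    # Uniform experience replay over (s, a, r, s', done) transitions.
    def __init__(self, capacity, batch_size):
        self.buffer = deque(maxlen=capacity)
        self.batch_size = batch_size

    def __len__(self):
        return len(self.buffer)

    @staticmethod
    def _to_numpy(x):
        if torch.is_tensor(x):
            return x.detach().cpu().numpy()
        return np.asarray(x, dtype=np.float32)

    def append(self, state, action, reward, next_state, done):
        self.buffer.append((self._to_numpy(state), self._to_numpy(action),
                            float(reward), self._to_numpy(next_state),
                            float(done)))

    def sample_batch(self):
        batch = random.sample(self.buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        # Rewards/dones become column vectors so they broadcast against (B, 1) Q-values
        return (tt(np.stack(states)), tt(np.stack(actions)),
                tt(np.array(rewards)).unsqueeze(1), tt(np.stack(next_states)),
                tt(np.array(dones)).unsqueeze(1))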
Example #2
import logging

import numpy as np
import pandas as pd
import torch

# Actor, Critic, ReplayMemory and tt() follow the same pattern as in
# Example #1, except that this Critic returns a single Q-value instead of
# a twin pair.


class DDPGAgent:
    def __init__(self,
                 env,
                 n_episodes=3000,
                 time_steps=500,
                 gamma=0.99,
                 batch_size=32,
                 memory_capacity=100000,
                 tau=1e-2,
                 eps=0.1,
                 lr=0.00001,
                 render=False):
        self.env = env
        self.gamma = gamma
        self.time_steps = time_steps
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity
        self.tau = tau
        self.eps = eps
        self.lr = lr
        self.render = render

        # Create actor and critic networks
        self.actor = Actor(state_dim=self.state_dim,
                           action_dim=self.action_dim)
        self.actor_target = Actor(state_dim=self.state_dim,
                                  action_dim=self.action_dim)

        self.critic = Critic(state_dim=self.state_dim,
                             action_dim=self.action_dim)
        self.critic_target = Critic(state_dim=self.state_dim,
                                    action_dim=self.action_dim)

        # Same weights for target network as for original network
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.critic_loss_fct = torch.nn.MSELoss()

        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=self.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.lr * 10)

        self.n_episodes = n_episodes

        self.replay_memory = ReplayMemory(capacity=self.memory_capacity,
                                          batch_size=batch_size)

        self.res = pd.DataFrame({
            'episodes': [],
            'states': [],
            'rewards': [],
            'steps': []
        })

    def train(self):
        for i in range(self.n_episodes):
            steps = 0
            state = self.env.reset()

            for step in range(self.time_steps):
                if self.render:
                    self.env.render()

                state = tt(state)
                action = self.actor(state).detach().numpy()

                # Exploration
                p = np.random.random()
                if p < self.eps:
                    # Sample a random action from the environment's own action space
                    action = self.env.action_space.sample()
                # Do one step in env
                next_state, reward, done, _ = self.env.step(action)

                res = {
                    'episodes': i + 1,
                    'states': state.tolist(),
                    'rewards': reward,
                    'steps': step + 1
                }

                # Save step in memory
                self.replay_memory.append(state=state,
                                          action=action,
                                          reward=reward,
                                          next_state=next_state,
                                          done=done)

                # Start training, if batch size reached
                if len(self.replay_memory) < self.batch_size:
                    continue

                # Sample batch from memory
                states, actions, rewards, next_states, dones = self.replay_memory.sample_batch()

                # Critic loss
                q_values = self.critic(states, actions)
                next_actions = self.actor_target(next_states)
                q_values_ns = self.critic_target(next_states,
                                                 next_actions.detach())
                # Do not bootstrap over terminal transitions; keep the target fixed
                td_target = (rewards + self.gamma * (1 - dones.float()) *
                             q_values_ns).detach()
                loss_critic = self.critic_loss_fct(q_values, td_target)

                # Actor loss
                loss_actor = -(self.critic(states, self.actor(states)).mean())

                # Optimize actor
                self.actor_optim.zero_grad()
                loss_actor.backward()
                self.actor_optim.step()

                # Optimize critic
                self.critic_optim.zero_grad()
                loss_critic.backward()
                self.critic_optim.step()

                # update target networks
                for target_param, param in zip(self.actor_target.parameters(),
                                               self.actor.parameters()):
                    target_param.data.copy_(param.data * self.tau +
                                            target_param.data *
                                            (1.0 - self.tau))

                for target_param, param in zip(self.critic_target.parameters(),
                                               self.critic.parameters()):
                    target_param.data.copy_(param.data * self.tau +
                                            target_param.data *
                                            (1.0 - self.tau))

                self.res = pd.concat([self.res, pd.DataFrame([res])],
                                     ignore_index=True)

                state = next_state
                steps += 1

                if done:
                    break

            logging.info(f'Episode {i + 1}:')
            logging.info(
                f'\t Steps: {self.res.loc[self.res["episodes"] == i + 1]["steps"].max()}'
            )
            logging.info(
                f'\t Reward: {self.res.loc[self.res["episodes"] == i + 1]["rewards"].sum()}'
            )

        self.env.close()
        return self.res
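
A possible way to run one of these agents end to end, assuming an older gym release whose reset() returns only the observation and whose step() returns the 4-tuple (next_state, reward, done, info) that both examples expect; the environment name and hyperparameters are only illustrative:

import logging

import gym

logging.basicConfig(level=logging.INFO)

# Pendulum has a 1-D continuous action space, so it fits both agents.
env = gym.make("Pendulum-v1")
agent = TD3Agent(env, n_episodes=100, time_steps=200)
results = agent.train()

# Total reward per episode, taken from the returned DataFrame
print(results.groupby("episodes")["rewards"].sum().tail())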