Example #1
import os

import torch
import torch.nn.functional as F
import torch.optim as optim

# Memory, SacActor and Critic are helper classes defined elsewhere in this
# code base (replay buffer, policy network and Q-network).


class SacAgent(object):
    actor_store_dir = 'actor'
    q_net_1_store_dir = 'q_1'
    q_net_2_store_dir = 'q_2'

    def __init__(self, env, batch_size):
        self.batch_size = batch_size
        self.tau = 1e-2
        memory_size = 1000000
        self.gamma = 0.99
        self.q_lr = 3e-4
        self.actor_lr = 3e-4
        self.alpha_lr = 3e-3

        self.update_step = 0
        self.delay_step = 2

        self.action_range = [env.action_space.low, env.action_space.high]

        self.memory = Memory(memory_size)

        # entropy temperature
        self.alpha = 0.2
        self.target_entropy = -torch.prod(torch.Tensor(
            env.action_space.shape)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=self.alpha_lr)

        self.actor = SacActor(env.observation_space.shape[0],
                              env.action_space.shape[0])
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)

        self.q_net_1 = Critic(env.observation_space.shape[0],
                              env.action_space.shape[0])
        self.q_net_1_target = Critic(env.observation_space.shape[0],
                                     env.action_space.shape[0])
        self.copy_networks(self.q_net_1, self.q_net_1_target)
        self.q_net_1_optimizer = optim.Adam(self.q_net_1.parameters(),
                                            lr=self.q_lr)

        self.q_net_2 = Critic(env.observation_space.shape[0],
                              env.action_space.shape[0])
        self.q_net_2_target = Critic(env.observation_space.shape[0],
                                     env.action_space.shape[0])
        self.copy_networks(self.q_net_2, self.q_net_2_target)
        self.q_net_2_optimizer = optim.Adam(self.q_net_2.parameters(),
                                            lr=self.q_lr)

    def copy_networks(self, org_net, dest_net):
        for dest_param, param in zip(dest_net.parameters(),
                                     org_net.parameters()):
            dest_param.data.copy_(param.data)

    def get_test_action(self, state):
        # fully deterministic action (tanh of the policy mean); acting
        # deterministically at test time is not always the best choice
        state = torch.FloatTensor(state).unsqueeze(0)
        mean, log_std = self.actor.forward(state)
        action = torch.tanh(mean)
        action = action.detach().squeeze(0).numpy()
        return self.rescale_action(action)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        action, log_pi = self.actor.sample(state)
        action = action.detach().squeeze(0).numpy()
        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
            (self.action_range[1] + self.action_range[0]) / 2.0

    def save(self, state, action, reward, new_state, cost, fail):
        self.memory.push(state, action, reward, new_state, cost, fail)

    def save_model(self, data_dir):
        actor_dir = os.path.join(data_dir, self.actor_store_dir)
        torch.save(self.actor, actor_dir)
        q_net_1_dir = os.path.join(data_dir, self.q_net_1_store_dir)
        torch.save(self.q_net_1, q_net_1_dir)
        q_net_2_dir = os.path.join(data_dir, self.q_net_2_store_dir)
        torch.save(self.q_net_2, q_net_2_dir)

    def load_model(self, data_dir):
        actor_dir = os.path.join(data_dir, self.actor_store_dir)
        self.actor = torch.load(actor_dir)

        q_net_1_dir = os.path.join(data_dir, self.q_net_1_store_dir)
        self.q_net_1 = torch.load(q_net_1_dir)
        self.copy_networks(self.q_net_1, self.q_net_1_target)

        q_net_2_dir = os.path.join(data_dir, self.q_net_2_store_dir)
        self.q_net_2 = torch.load(q_net_2_dir)
        self.copy_networks(self.q_net_2, self.q_net_2_target)

    def update(self, num=1):
        for _ in range(num):
            self.__one_update()

    def __one_update(self):
        if len(self.memory) < self.batch_size:
            return
        states, actions, rewards, next_states, costs, fails = self.memory.get_batch(
            self.batch_size)
        # mask that zeroes the bootstrap term for transitions that ended in failure
        not_fails = (fails == 0)

        next_actions, next_log_pi = self.actor.sample(next_states)

        # soft Bellman target: min of the two target critics minus the entropy term
        next_q_1 = self.q_net_1_target(next_states, next_actions)
        next_q_2 = self.q_net_2_target(next_states, next_actions)
        next_q_target = torch.min(next_q_1,
                                  next_q_2) - self.alpha * next_log_pi
        expected_q = rewards - costs + not_fails * self.gamma * next_q_target

        curr_q_1 = self.q_net_1.forward(states, actions)
        curr_q_2 = self.q_net_2.forward(states, actions)
        q1_loss = F.mse_loss(curr_q_1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q_2, expected_q.detach())

        self.q_net_1_optimizer.zero_grad()
        q1_loss.backward()
        self.q_net_1_optimizer.step()

        self.q_net_2_optimizer.zero_grad()
        q2_loss.backward()
        self.q_net_2_optimizer.step()

        # delayed update for policy network and target q networks
        new_actions, log_pi = self.actor.sample(states)
        if self.update_step % self.delay_step == 0:
            min_q = torch.min(self.q_net_1.forward(states, new_actions),
                              self.q_net_2.forward(states, new_actions))
            actor_loss = (self.alpha * log_pi - min_q).mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # soft (Polyak) update of the target critics
            for target_param, param in zip(self.q_net_1_target.parameters(),
                                           self.q_net_1.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

            for target_param, param in zip(self.q_net_2_target.parameters(),
                                           self.q_net_2.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        # update temperature
        alpha_loss = (self.log_alpha *
                      (-log_pi - self.target_entropy).detach()).mean()

        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        self.alpha = self.log_alpha.exp()

        self.update_step += 1
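
A minimal usage sketch for SacAgent (not part of the original example): it assumes a Gym-style continuous-control environment with the older API (env.reset() returns an observation, env.step() returns four values) and that Memory stores transitions in the (state, action, reward, next_state, cost, fail) layout used above. The environment name, episode count and batch size are placeholders.

import gym

env = gym.make('Pendulum-v1')
agent = SacAgent(env, batch_size=256)

for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        # no safety-cost signal in this sketch; episode termination doubles as the fail flag
        agent.save(state, action, reward, next_state, 0.0, float(done))
        agent.update()
        state = next_state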
Example #2
import os

import torch
import torch.nn as nn
import torch.optim as optim

# Memory, DdpgActor and Critic are helper classes defined elsewhere in this
# code base (replay buffer, deterministic policy network and Q-network).


class DdpgAgent(object):
    actor_store_dir = 'actor'
    critic_store_dir = 'critic'

    def __init__(self, env, batch_size):
        self.batch_size = batch_size
        self.tau = 1e-2
        memory_size = 1000000
        self.gamma = 0.99
        actor_learning_rate = 1e-4
        critic_learning_rate = 1e-3
        self.critic_loss_fn = nn.MSELoss()

        self.actor = DdpgActor(env.observation_space.shape[0],
                               env.action_space.shape[0],
                               env.action_space.high, env.action_space.low)
        self.actor_target = DdpgActor(env.observation_space.shape[0],
                                      env.action_space.shape[0],
                                      env.action_space.high,
                                      env.action_space.low)
        self.copy_networks(self.actor, self.actor_target)

        self.critic = Critic(env.observation_space.shape[0],
                             env.action_space.shape[0])
        self.critic_target = Critic(env.observation_space.shape[0],
                                    env.action_space.shape[0])
        self.copy_networks(self.critic, self.critic_target)

        self.memory = Memory(memory_size)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)

    def copy_networks(self, org_net, dest_net):
        for dest_param, param in zip(dest_net.parameters(),
                                     org_net.parameters()):
            dest_param.data.copy_(param.data)

    def get_action(self, state):
        tensor_state = torch.from_numpy(state).float().unsqueeze(0)
        # exploratory action: the actor adds its own noise in noisy_forward
        tensor_action = self.actor.noisy_forward(tensor_state)
        return tensor_action.detach().numpy()[0]

    def get_test_action(self, state):
        # deterministic (noise-free) action for evaluation
        tensor_state = torch.from_numpy(state).float().unsqueeze(0)
        tensor_action = self.actor.forward(tensor_state)
        return tensor_action.detach().numpy()[0]

    def save(self, state, action, reward, new_state, cost, fail):
        self.memory.push(state, action, reward, new_state, cost, fail)

    def save_model(self, data_dir):
        actor_dir = os.path.join(data_dir, self.actor_store_dir)
        torch.save(self.actor, actor_dir)
        critic_dir = os.path.join(data_dir, self.critic_store_dir)
        torch.save(self.critic, critic_dir)

    def load_model(self, data_dir):
        actor_dir = os.path.join(data_dir, self.actor_store_dir)
        self.actor = torch.load(actor_dir)
        self.copy_networks(self.actor, self.actor_target)

        critic_dir = os.path.join(data_dir, self.critic_store_dir)
        self.critic = torch.load(critic_dir)
        self.copy_networks(self.critic, self.critic_target)

    def update(self, num=1):
        for _ in range(num):
            self.__one_update()
        self.actor.reset_noise()

    def __one_update(self):
        if len(self.memory) < self.batch_size:
            return
        states, actions, rewards, next_states, costs, fails = self.memory.get_batch(
            self.batch_size)

        # Bellman target from the target actor and target critic; the bootstrap
        # term is zeroed for transitions that ended in failure
        states_q_values = self.critic.forward(states, actions)
        next_actions = self.actor_target.forward(next_states)
        next_states_q_value = self.critic_target.forward(
            next_states, next_actions.detach())
        not_fails = (fails == 0)
        next_states_q_value = next_states_q_value * not_fails
        new_q_value = rewards - costs + (self.gamma * next_states_q_value)
        critic_loss = self.critic_loss_fn(states_q_values, new_q_value)

        # deterministic policy gradient: maximize the critic's value of the actor's actions
        actor_loss = -self.critic.forward(states,
                                          self.actor.forward(states)).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # soft (Polyak) update of the target networks
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))
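
A similarly hedged sketch of checkpointing and evaluating DdpgAgent after training; the checkpoint directory is a placeholder and the same older Gym API is assumed. save_model/load_model store and restore whole modules via torch.save/torch.load, so the directory only has to exist.

import os

import gym

env = gym.make('Pendulum-v1')
agent = DdpgAgent(env, batch_size=128)

# ... fill the replay memory with agent.save(...) and train with agent.update(),
#     as in the SAC sketch above ...

checkpoint_dir = 'checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)
agent.save_model(checkpoint_dir)
agent.load_model(checkpoint_dir)

# evaluate with the deterministic, noise-free policy
state = env.reset()
done = False
episode_return = 0.0
while not done:
    action = agent.get_test_action(state)
    state, reward, done, info = env.step(action)
    episode_return += reward
print('evaluation return:', episode_return)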