Example #1
    def init(self, net_dim, state_dim, action_dim, if_per):
        self.action_dim = action_dim
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.act = ActorMPO(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.SmoothL1Loss()
        self.act_optimizer = torch.optim.Adam(self.act.parameters(), lr=self.learning_rate)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=self.learning_rate)

        self.eta = np.random.rand()
        self.eta_kl_mu = 0.
        self.eta_kl_sigma = 0.
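
The ActorMPO and Critic classes used here are not part of the listing. The sketch below is an assumption about their interfaces, reconstructed from the calls in these examples (a Q-network over (state, action), and a Gaussian policy exposing get_distribution / get_action with loc and scale_tril); the layer sizes and the diagonal covariance are illustrative only.

# Minimal sketch (assumptions): Critic and ActorMPO interfaces matching the calls above.
import torch
import torch.nn as nn


class Critic(nn.Module):
    def __init__(self, net_dim, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim + action_dim, net_dim), nn.ReLU(),
                                 nn.Linear(net_dim, net_dim), nn.ReLU(),
                                 nn.Linear(net_dim, 1))

    def forward(self, state, action):
        return self.net(torch.cat((state, action), dim=1))  # Q(s, a), shape (B, 1)


class ActorMPO(nn.Module):
    def __init__(self, net_dim, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, net_dim), nn.ReLU(),
                                 nn.Linear(net_dim, net_dim), nn.ReLU())
        self.mu_head = nn.Linear(net_dim, action_dim)
        self.log_std_head = nn.Linear(net_dim, action_dim)

    def get_distribution(self, state):
        tmp = self.net(state)
        mu = torch.tanh(self.mu_head(tmp))
        std = self.log_std_head(tmp).clamp(-20, 2).exp()
        # diagonal Cholesky factor; a full lower-triangular scale_tril is also common
        return torch.distributions.MultivariateNormal(mu, scale_tril=torch.diag_embed(std))

    def get_action(self, state):
        return self.get_distribution(state).sample()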
Example #2
    def init(self, net_dim, state_dim, action_dim, if_per=False):
        self.ou_noise = OrnsteinUhlenbeckNoise(size=action_dim, sigma=self.ou_explore_noise)
        # I don't recommend using OU-Noise
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=self.learning_rate)

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.act_optimizer = torch.optim.Adam(self.act.parameters(), lr=self.learning_rate)

        self.criterion = torch.nn.SmoothL1Loss(reduction='none' if if_per else 'mean')
        if if_per:
            self.get_obj_critic = self.get_obj_critic_per
        else:
            self.get_obj_critic = self.get_obj_critic_raw
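
OrnsteinUhlenbeckNoise is constructed above but not defined in the listing. A minimal sketch that matches the call OrnsteinUhlenbeckNoise(size=action_dim, sigma=...) follows; the theta and dt defaults are assumptions.

# Minimal sketch (assumption): an Ornstein-Uhlenbeck process matching
# OrnsteinUhlenbeckNoise(size=action_dim, sigma=...) as called above.
import numpy as np


class OrnsteinUhlenbeckNoise:
    def __init__(self, size, theta=0.15, sigma=0.3, dt=1e-2):
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.x = np.zeros(size)

    def __call__(self) -> np.ndarray:
        # x_{t+1} = x_t + theta * (0 - x_t) * dt + sigma * sqrt(dt) * N(0, I)
        self.x += self.theta * (0.0 - self.x) * self.dt \
                  + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.x.shape)
        return self.x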
Example #3
class AgentMPO(AgentBase):
    def __init__(self):
        super().__init__()
        self.epsilon_dual = 0.1
        self.epsilon_kl_mu = 0.01
        self.epsilon_kl_sigma = 0.01
        self.epsilon_kl = 0.01
        self.alpha = 10
        self.sample_a_num = 64
        self.lagrange_iteration_num = 5

    def init(self, net_dim, state_dim, action_dim, if_per):
        self.action_dim = action_dim
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.act = ActorMPO(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.SmoothL1Loss()
        self.act_optimizer = torch.optim.Adam(self.act.parameters(),
                                              lr=self.learning_rate)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(),
                                              lr=self.learning_rate)

        self.eta = np.random.rand()
        self.eta_kl_mu = 0.
        self.eta_kl_sigma = 0.

    def select_action(self, state) -> np.ndarray:
        states = torch.as_tensor((state, ),
                                 dtype=torch.float32,
                                 device=self.device).detach_()
        action = self.act.get_action(states)[0]
        return action.cpu().numpy()

    def update_net(self, buffer, target_step, batch_size,
                   repeat_times) -> (float, float):
        buffer.update_now_len_before_sample()
        # pre-allocated sample buffers (hard-coded shapes; not used in the loop below)
        t_a = torch.empty([256, 64, 3],
                          dtype=torch.float32,
                          device=self.device)
        t_s = torch.empty([256, 64, 15],
                          dtype=torch.float32,
                          device=self.device)
        obj_critic = None
        for _ in range(int(target_step * repeat_times)):
            # Policy Evaluation
            with torch.no_grad():
                reward, mask, a, state, next_s = buffer.sample_batch(
                    batch_size)
                pi_next_s = self.act_target.get_distribution(next_s)
                sampled_next_a = pi_next_s.sample(
                    (self.sample_a_num, )).transpose(0, 1)
                expanded_next_s = next_s[:, None, :].expand(
                    -1, self.sample_a_num, -1)
                expected_next_q = self.cri_target(
                    expanded_next_s.reshape(-1, state.shape[1]),
                    sampled_next_a.reshape(-1, a.shape[1]))
                expected_next_q = expected_next_q.reshape(
                    batch_size, self.sample_a_num)
                expected_next_q = expected_next_q.mean(dim=1)
                expected_next_q = expected_next_q.unsqueeze(dim=1)
                q_label = reward + mask * expected_next_q
            q = self.cri(state, a)
            obj_critic = self.criterion(q, q_label)
            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()
            self.soft_update(self.cri_target, self.cri, self.soft_update_tau)

            # Policy Improvement
            # Sample M additional actions for each state
            with torch.no_grad():
                pi_b = self.act_target.get_distribution(state)
                sampled_a = pi_b.sample((self.sample_a_num, ))
                expanded_s = state[None, ...].expand(self.sample_a_num, -1, -1)
                target_q = self.cri_target.forward(
                    expanded_s.reshape(-1, state.shape[1]),  # (M * B, ds)
                    sampled_a.reshape(-1, a.shape[1])  # (M * B, da)
                ).reshape(self.sample_a_num, batch_size)
                target_q_np = target_q.cpu().numpy()

            # E step
            def dual(eta):
                max_q = np.max(target_q_np, 0)
                return eta * self.epsilon_dual + np.mean(max_q) \
                       + eta * np.mean(np.log(np.mean(np.exp((target_q_np - max_q) / eta), axis=0)))

            bounds = [(1e-6, None)]
            res = minimize(dual,
                           np.array([self.eta]),
                           method='SLSQP',
                           bounds=bounds)
            self.eta = res.x[0]

            qij = torch.softmax(target_q / self.eta,
                                dim=0)  # (M, B) or (da, B)

            # M step
            pi = self.act.get_distribution(state)
            loss_p = torch.mean(qij * (
                pi.expand((self.sample_a_num, batch_size)).log_prob(
                    sampled_a)  # (M, B)
                + pi_b.expand((self.sample_a_num, batch_size)).log_prob(
                    sampled_a)  # (M, B)
            ))
            # mean_loss_p.append((-loss_p).item())
            kl_mu, kl_sigma = gaussian_kl(mu_i=pi_b.loc,
                                          mu=pi.loc,
                                          A_i=pi_b.scale_tril,
                                          A=pi.scale_tril)
            if np.isnan(kl_mu.item()):
                print('kl_μ is nan')
                embed()
            if np.isnan(kl_sigma.item()):
                print('kl_Σ is nan')
                embed()

            self.eta_kl_mu -= self.alpha * (self.epsilon_kl_mu -
                                            kl_mu).detach().item()
            self.eta_kl_sigma -= self.alpha * (self.epsilon_kl_sigma -
                                               kl_sigma).detach().item()
            self.eta_kl_mu = 0.0 if self.eta_kl_mu < 0.0 else self.eta_kl_mu
            self.eta_kl_sigma = 0.0 if self.eta_kl_sigma < 0.0 else self.eta_kl_sigma
            obj_actor = -(loss_p + self.eta_kl_mu *
                          (self.epsilon_kl_mu - kl_mu) + self.eta_kl_sigma *
                          (self.epsilon_kl_sigma - kl_sigma))
            self.act_optimizer.zero_grad()
            obj_actor.backward()
            self.act_optimizer.step()
            self.soft_update(self.act_target, self.act, self.soft_update_tau)

        self.update_record(obj_a=obj_actor.item(),
                           obj_c=obj_critic.item(),
                           loss_pi=loss_p.item(),
                           est_q=q_label.mean().item(),
                           max_kl_mu=kl_mu.item(),
                           max_kl_sigma=kl_sigma.item(),
                           eta=self.eta)
        return self.train_record
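
The M-step calls gaussian_kl(mu_i, mu, A_i, A), which is not included in the example. The helper below is a reconstruction of the usual decoupled KL terms for MPO, not the exact function from the source: the mean term is evaluated under the reference covariance A_i A_i^T, and the covariance term compares the two covariances.

# Sketch (assumption): decoupled Gaussian KL terms as typically used in the MPO M-step.
import torch


def bt(m):
    return m.transpose(-2, -1)  # batched transpose


def btr(m):
    return m.diagonal(dim1=-2, dim2=-1).sum(-1)  # batched trace


def gaussian_kl(mu_i, mu, A_i, A):
    """Return (C_mu, C_sigma), averaged over the batch.
    C_mu:    0.5 * (mu - mu_i)^T S_i^{-1} (mu - mu_i), using the old covariance S_i.
    C_sigma: 0.5 * (tr(S^{-1} S_i) - n + log(det S / det S_i))."""
    n = A.size(-1)
    mu_i = mu_i.unsqueeze(-1)              # (B, n, 1)
    mu = mu.unsqueeze(-1)
    sigma_i = A_i @ bt(A_i)                # (B, n, n), old covariance
    sigma = A @ bt(A)                      # (B, n, n), new covariance
    inner_mu = (bt(mu - mu_i) @ sigma_i.inverse() @ (mu - mu_i)).squeeze()
    inner_sigma = torch.log(sigma.det() / sigma_i.det()) - n + btr(sigma.inverse() @ sigma_i)
    return 0.5 * inner_mu.mean(), 0.5 * inner_sigma.mean()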
Example #4
class AgentDDPG(AgentBase):
    def __init__(self):
        super().__init__()
        self.ou_explore_noise = 0.3  # explore noise of action
        self.ou_noise = None

    def init(self, net_dim, state_dim, action_dim, if_per=False):
        self.ou_noise = OrnsteinUhlenbeckNoise(size=action_dim,
                                               sigma=self.ou_explore_noise)
        # I don't recommend using OU-Noise
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(),
                                              lr=self.learning_rate)

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.act_optimizer = torch.optim.Adam(self.act.parameters(),
                                              lr=self.learning_rate)

        self.criterion = torch.nn.SmoothL1Loss(
            reduction='none' if if_per else 'mean')
        if if_per:
            self.get_obj_critic = self.get_obj_critic_per
        else:
            self.get_obj_critic = self.get_obj_critic_raw

    def select_action(self, state) -> np.ndarray:
        states = torch.as_tensor((state, ),
                                 dtype=torch.float32,
                                 device=self.device).detach_()
        action = self.act(states)[0].cpu().numpy()
        return (action + self.ou_noise()).clip(-1, 1)

    def update_net(self, buffer, target_step, batch_size,
                   repeat_times) -> (float, float):
        buffer.update_now_len_before_sample()

        obj_critic = obj_actor = None  # just for print return
        for _ in range(int(target_step * repeat_times)):
            obj_critic, state = self.get_obj_critic(buffer, batch_size)
            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()
            self.soft_update(self.cri_target, self.cri, self.soft_update_tau)

            action_pg = self.act(state)  # policy gradient
            obj_actor = -self.cri_target(state, action_pg).mean()
            self.act_optimizer.zero_grad()
            obj_actor.backward()
            self.act_optimizer.step()
            self.soft_update(self.act_target, self.act, self.soft_update_tau)
        self.update_record(obj_a=obj_actor.item(), obj_c=obj_critic.item())
        return self.train_record

    def get_obj_critic_raw(self, buffer, batch_size):
        with torch.no_grad():
            reward, mask, action, state, next_s = buffer.sample_batch(
                batch_size)
            next_q = self.cri_target(next_s, self.act_target(next_s))
            q_label = reward + mask * next_q
        q_value = self.cri(state, action)
        obj_critic = self.criterion(q_value, q_label)
        return obj_critic, state

    def get_obj_critic_per(self, buffer, batch_size):
        with torch.no_grad():
            reward, mask, action, state, next_s, is_weights = buffer.sample_batch(
                batch_size)
            next_q = self.cri_target(next_s, self.act_target(next_s))
            q_label = reward + mask * next_q
        q_value = self.cri(state, action)
        obj_critic = (self.criterion(q_value, q_label) * is_weights).mean()

        td_error = (q_label - q_value.detach()).abs()
        buffer.td_error_update(td_error)
        return obj_critic, state
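
Both agents also call self.soft_update(target_net, current_net, tau), presumably inherited from AgentBase and not shown. A minimal Polyak-averaging sketch consistent with that call signature:

# Sketch (assumption): Polyak averaging as used via self.soft_update(target, current, tau).
import torch


def soft_update(target_net: torch.nn.Module, current_net: torch.nn.Module, tau: float):
    # target <- tau * current + (1 - tau) * target
    with torch.no_grad():
        for tar, cur in zip(target_net.parameters(), current_net.parameters()):
            tar.data.copy_(tau * cur.data + (1.0 - tau) * tar.data)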