Example #1
import torch
from copy import deepcopy
# AgentBase, Actor, Critic and soft_target_update are assumed to be defined
# elsewhere in ElegantRL and available in this module's scope.


class AgentDDPG(AgentBase):
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.explore_noise = 0.05  # std of the Gaussian exploration noise added to actions

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam([{
            'params': self.act.parameters(),
            'lr': learning_rate
        }, {
            'params': self.cri.parameters(),
            'lr': learning_rate
        }])

    def select_actions(self, states):  # states = (state, ...)
        states = torch.as_tensor(states,
                                 dtype=torch.float32,
                                 device=self.device)
        actions = self.act(states)
        actions = (actions +
                   torch.randn_like(actions) * self.explore_noise).clamp(
                       -1, 1)
        return actions.detach().cpu().numpy()

    def update_policy(self, buffer, max_step, batch_size, repeat_times):
        buffer.update__now_len__before_sample()
        obj_critic = obj_actor = None  # placeholders for the objective values returned for logging
        for _ in range(int(max_step * repeat_times)):
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.random_sample(
                    batch_size)
                next_q = self.cri_target(next_s, self.act_target(next_s))
                q_label = reward + mask * next_q
            q_value = self.cri(state, action)
            obj_critic = self.criterion(q_value, q_label)

            q_value_pg = self.act(state)  # action from the actor, used for the policy-gradient term
            obj_actor = -self.cri_target(state, q_value_pg).mean()

            obj_united = obj_actor + obj_critic  # objective
            self.optimizer.zero_grad()
            obj_united.backward()
            self.optimizer.step()

            soft_target_update(self.cri_target, self.cri)
            soft_target_update(self.act_target, self.act)
        return obj_actor.item(), obj_critic.item()
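
Both examples call soft_target_update without defining it. Below is a minimal sketch of what such a Polyak-averaging helper usually looks like; the parameter name tau and the value 0.005 are assumptions, not necessarily ElegantRL's exact implementation.

def soft_target_update(target_net, current_net, tau=0.005):
    # Polyak averaging: target <- tau * current + (1 - tau) * target
    for target_param, current_param in zip(target_net.parameters(), current_net.parameters()):
        target_param.data.copy_(tau * current_param.data + (1.0 - tau) * target_param.data)
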
Example #2
import torch
from copy import deepcopy
# AgentBase, Actor, Critic and soft_target_update are assumed to be defined
# elsewhere in ElegantRL and available in this module's scope.


class AgentDDPG(AgentBase):
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.explore_noise = 0.05  # std of the Gaussian exploration noise added to actions
        # self.ou_noise = OrnsteinUhlenbeckNoise(size=action_dim, sigma=0.3)  # I don't recommend OU-Noise

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam([{'params': self.act.parameters(), 'lr': learning_rate},
                                           {'params': self.cri.parameters(), 'lr': learning_rate}])

    def select_actions(self, states):  # states = (state, ...)
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = self.act(states)
        # actions = actions.detach().cpu().numpy()
        # return (actions + self.ou_noise()).clip(-1, 1)
        actions = (actions + torch.randn_like(actions) * self.explore_noise).clamp(-1, 1)
        return actions.detach().cpu().numpy()

    def update_policy(self, buffer, max_step, batch_size, repeat_times):
        """ Contribution of DDPG (Deep Deterministic Policy Gradient)
        1. Policy Gradient with Deep network: DQN + DPG -> DDPG
           Q_value = reward + gamma * next_Q_value
           Q-learning -> DQN (Deep Q-learning): (discrete state space Q-table -> continuous state space Q-net)
           DQN + DPG -> DDPG: (discrete action space Q-net -> continuous action space Policy Gradient)
        2. experience replay buffer for stabilizing training
        3. soft target update for stabilizing training
        """
        buffer.update__now_len__before_sample()

        obj_critic = obj_actor = None  # placeholders for the objective values returned for logging
        for _ in range(int(max_step * repeat_times)):
            """critic (train Critic network using Supervised Deep learning)
            the optimization objective of critic is minimizing loss function 'criterion(q_value, q_label)'
            minimize criterion(q_eval, label) to train a critic
            We input state-action to a critic (policy function), critic will output a q_value estimation.
            A better action will get higher q_value from critic.  
            """
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.random_sample(batch_size)
                next_q = self.cri_target(next_s, self.act_target(next_s))
                q_label = reward + mask * next_q
            q_value = self.cri(state, action)
            obj_critic = self.criterion(q_value, q_label)

            """actor (Policy Gradient)
            the optimization objective of actor is maximizing value function 'critic(state, actor(state))'
            maximize cri(state, action) is equal to minimize -cri(state, action)
            Accurately, it is more appropriate to call 'actor_obj' as 'actor_objective'.

            We train critic output q_value close to q_label
                by minimizing the error provided by loss function of critic.
            We train actor output action which gets higher q_value from critic
                by maximizing the q_value provided by policy function.
            We call it Policy Gradient (PG). The gradient for actor is provided by a policy function.
                By the way, Generative Adversarial Networks (GANs) is a kind of Policy Gradient.
                The gradient for Generator (Actor) is provided by a Discriminator (Critic).
            """
            q_value_pg = self.act(state)  # action from the actor, used for the policy-gradient term
            obj_actor = -self.cri_target(state, q_value_pg).mean()

            """united objective
            I can write in this way:
            
            self.optimizer_of_actor.zero_grad()
            obj_actor.backward()
            self.optimizer_of_actor.step()
            
            self.optimizer_of_critic.zero_grad()
            obj_critic.backward()
            self.optimizer_of_critic.step()
            
            I use one single optimizer for both networks in order to speed up training
            """
            obj_united = obj_actor + obj_critic  # objective
            self.optimizer.zero_grad()
            obj_united.backward()
            self.optimizer.step()

            soft_target_update(self.cri_target, self.cri)
            soft_target_update(self.act_target, self.act)
        return obj_actor.item(), obj_critic.item()
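
The constructor of Example #2 references an OrnsteinUhlenbeckNoise class that is commented out ("I don't recommend OU-Noise"). For completeness, here is a minimal sketch of such a zero-mean OU noise process; it reuses the size/sigma parameters from that comment and is an illustration, not ElegantRL's implementation.

import numpy as np

class OrnsteinUhlenbeckNoise:
    def __init__(self, size, theta=0.15, sigma=0.3, dt=1e-2):
        # Ornstein-Uhlenbeck process with zero mean:
        # x_{t+1} = x_t + theta * (0 - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        self.size, self.theta, self.sigma, self.dt = size, theta, sigma, dt
        self.x = np.zeros(size)

    def __call__(self):
        self.x += (-self.theta * self.x * self.dt
                   + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.size))
        return self.x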