def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
    super().__init__()
    self.explore_noise = 0.05  # explore noise of action
    # self.ou_noise = OrnsteinUhlenbeckNoise(size=action_dim, sigma=0.3)  # I don't recommend OU-Noise

    self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
    self.act_target = deepcopy(self.act)
    self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
    self.cri_target = deepcopy(self.cri)

    self.criterion = torch.nn.SmoothL1Loss()
    self.optimizer = torch.optim.Adam([{'params': self.act.parameters(), 'lr': learning_rate},
                                       {'params': self.cri.parameters(), 'lr': learning_rate}])
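The OrnsteinUhlenbeckNoise referenced in the commented-out line is not defined in this snippet. A minimal sketch of such a temporally correlated noise process, assuming a zero mean and the call signature used above (the class name and sigma mirror the comment; theta and dt are illustrative defaults, not values from the original code), could look like this:

import numpy as np

class OrnsteinUhlenbeckNoise:
    def __init__(self, size, theta=0.15, sigma=0.3, dt=1e-2):
        # size: action dimension; theta: mean-reversion rate; sigma: noise scale; dt: time step
        self.size, self.theta, self.sigma, self.dt = size, theta, sigma, dt
        self.x = np.zeros(size)  # internal state of the process, starts at the mean (0)

    def __call__(self):
        # discretized OU update: x <- x + theta * (0 - x) * dt + sigma * sqrt(dt) * N(0, 1)
        gaussian = np.random.normal(size=self.size)
        self.x += self.theta * (0.0 - self.x) * self.dt + self.sigma * np.sqrt(self.dt) * gaussian
        return self.x

Because the OU process is auto-correlated, consecutive exploration steps drift in the same direction. The plain Gaussian noise used in select_actions below is simpler and, as the comment notes, is what the author recommends.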
class AgentDDPG(AgentBase):
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.explore_noise = 0.05  # explore noise of action
        # self.ou_noise = OrnsteinUhlenbeckNoise(size=action_dim, sigma=0.3)  # I don't recommend OU-Noise

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam([{'params': self.act.parameters(), 'lr': learning_rate},
                                           {'params': self.cri.parameters(), 'lr': learning_rate}])

    def select_actions(self, states):  # states = (state, ...)
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = self.act(states)
        # actions = actions.detach().cpu().numpy()
        # return (actions + self.ou_noise()).clip(-1, 1)
        actions = (actions + torch.randn_like(actions) * self.explore_noise).clamp(-1, 1)
        return actions.detach().cpu().numpy()

    def update_policy(self, buffer, max_step, batch_size, repeat_times):
        """
        Contributions of DDPG (Deep Deterministic Policy Gradient):
        1. Policy Gradient with a deep network: DQN + DPG -> DDPG
           Q_value = reward + gamma * next_Q_value
           Q-learning -> DQN (Deep Q-Network): discrete state space (Q-table) -> continuous state space (Q-net)
           DQN + DPG -> DDPG: discrete action space (Q-net) -> continuous action space (Policy Gradient)
        2. An experience replay buffer to stabilize training.
        3. Soft target updates to stabilize training.
        """
        buffer.update__now_len__before_sample()
        obj_critic = obj_actor = None  # placeholders, just for the return/print
        for _ in range(int(max_step * repeat_times)):
            """critic (train the Critic network with supervised deep learning)
            The optimization objective of the critic is to minimize the loss function
            'criterion(q_value, q_label)'. We feed a state-action pair to the critic
            (a value function), and the critic outputs a q_value estimation.
            A better action gets a higher q_value from the critic.
            """
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.random_sample(batch_size)
                next_q = self.cri_target(next_s, self.act_target(next_s))
                q_label = reward + mask * next_q  # mask folds in gamma (and zeroes out terminal states)
            q_value = self.cri(state, action)
            obj_critic = self.criterion(q_value, q_label)

            """actor (Policy Gradient)
            The optimization objective of the actor is to maximize the value function
            'critic(state, actor(state))'. Maximizing cri(state, action) is equivalent to
            minimizing -cri(state, action). To be precise, 'obj_actor' is short for
            'actor objective'.

            We train the critic to output a q_value close to q_label by minimizing the
            critic's loss. We train the actor to output actions that get a higher q_value
            from the critic by maximizing the q_value the critic provides. We call this
            Policy Gradient (PG): the gradient for the actor is provided by a value function.
            By the way, Generative Adversarial Networks (GANs) are a kind of Policy Gradient:
            the gradient for the Generator (Actor) is provided by a Discriminator (Critic).
            """
            q_value_pg = self.act(state)  # policy gradient
            obj_actor = -self.cri_target(state, q_value_pg).mean()

            """united objective
            I could write it this way:
                self.optimizer_of_actor.zero_grad()
                obj_actor.backward()
                self.optimizer_of_actor.step()

                self.optimizer_of_critic.zero_grad()
                obj_critic.backward()
                self.optimizer_of_critic.step()
            I use one single optimizer for both networks in order to speed up training.
            """
            obj_united = obj_actor + obj_critic  # objective
            self.optimizer.zero_grad()
            obj_united.backward()
            self.optimizer.step()

            soft_target_update(self.cri_target, self.cri)
            soft_target_update(self.act_target, self.act)
        return obj_actor.item(), obj_critic.item()