class A2C(object):

    def __init__(self, args):
        self.args = args

        self.actor = Actor(args)
        self.actor_target = Actor(args)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(args)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        hard_update(self.actor_target, self.actor)  # Make sure target starts with the same weights
        hard_update(self.critic_target, self.critic)

    def update_parameters(self, batch):
        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        done_batch = torch.cat(batch.done)

        # Critic Update
        vals = self.critic.forward(state_batch)
        # The bootstrapped target must not backpropagate into the critic
        # (replaces the deprecated `.volatile` flags used previously)
        with torch.no_grad():
            new_vals = self.critic.forward(next_state_batch) * (1 - done_batch)
            targets = reward_batch + self.gamma * new_vals
        self.critic_optim.zero_grad()
        dt = self.loss(vals, targets)
        dt.backward()
        self.critic_optim.step()

        # Actor Update
        self.actor_optim.zero_grad()
        # Round-tripping through numpy detaches these tensors from the critic's graph
        state_batch = utils.to_tensor(utils.to_numpy(state_batch))
        targets = utils.to_tensor(utils.to_numpy(targets))
        vals = utils.to_tensor(utils.to_numpy(vals))

        action_logs = self.actor.forward(state_batch)
        entropy_loss = torch.mean(entropy(torch.exp(action_logs)))
        action_logs = F.log_softmax(action_logs, dim=1)
        dt = targets - vals  # advantage estimate

        # Gather the log-probability of the action actually taken in each transition
        alogs = []
        for i, action in enumerate(action_batch):
            action_i = int(action.cpu().data.numpy())
            alogs.append(action_logs[i, action_i])
        alogs = torch.stack(alogs).unsqueeze(0)  # stack, not cat: entries are 0-dim scalars

        policy_loss = -torch.mean(dt * alogs.t())
        actor_loss = policy_loss - entropy_loss
        actor_loss.backward()
        self.actor_optim.step()
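# The target-network helpers called above (hard_update / soft_update) are not shown in
# this excerpt. The minimal sketch below shows what they are assumed to do, based only
# on how they are called here: hard_update copies the source network's weights into the
# target, and soft_update performs a Polyak average with rate tau. This is a sketch, not
# the repository's actual utils implementation.
def hard_update(target, source):
    # Copy every parameter of `source` into `target` (used once at construction)
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)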
class Actor_Critic(object):

    def __init__(self, state_dim, action_dim, gamma, tau, buffer_size, is_mem_cuda, out_act):
        self.actor = Actor(state_dim, action_dim, is_evo=False, out_act=out_act)
        self.actor_target = Actor(state_dim, action_dim, is_evo=False, out_act=out_act)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = gamma
        self.tau = tau
        self.loss = nn.MSELoss()
        self.replay_buffer = ReplayMemory(buffer_size, is_mem_cuda)
        self.exploration_noise = OUNoise(action_dim)

        hard_update(self.actor_target, self.actor)  # Make sure target starts with the same weights
        hard_update(self.critic_target, self.critic)

    def act(self, state, is_noise):
        state = utils.to_tensor(state).unsqueeze(0)
        action = self.actor.forward(state)
        action = action.detach().numpy().flatten()
        if is_noise:
            action += self.exploration_noise.noise()
        return action

    def train_from_batch(self, batch):
        env_state_batch = torch.cat(batch.state)
        goal_batch = torch.cat(batch.goal)
        uvfa_states = torch.cat((env_state_batch, goal_batch), dim=1).detach()
        next_env_state_batch = torch.cat(batch.next_state)
        next_uvfa_states = torch.cat((next_env_state_batch, goal_batch), dim=1).detach()
        action_batch = torch.cat(batch.action).detach()
        reward_batch = torch.cat(batch.reward).detach()
        # if self.args.use_done_mask:
        done_batch = torch.cat(batch.done)

        # Load everything to GPU if not already
        # if self.args.is_memory_cuda and not self.args.is_cuda:
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic_target.cuda()
        self.critic.cuda()
        uvfa_states = uvfa_states.cuda()
        next_uvfa_states = next_uvfa_states.cuda()
        action_batch = action_batch.cuda()
        reward_batch = reward_batch.cuda()
        # if self.args.use_done_mask:
        done_batch = done_batch.cuda()

        # Critic Update
        with torch.no_grad():
            next_action_batch = self.actor_target.forward(next_uvfa_states)
            next_q = self.critic_target.forward(next_uvfa_states, next_action_batch)
            # if self.args.use_done_mask:
            next_q = next_q * (1 - done_batch.float())  # Done mask
            target_q = reward_batch + (self.gamma * next_q)

        self.critic_optim.zero_grad()
        current_q = self.critic.forward(uvfa_states.detach(), action_batch.detach())
        dt = self.loss(current_q, target_q)
        dt.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), 10)
        self.critic_optim.step()

        # Actor Update
        self.actor_optim.zero_grad()
        policy_loss = -self.critic.forward(uvfa_states, self.actor.forward(uvfa_states))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), 10)
        self.actor_optim.step()

        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        # Nets back to CPU if using memory_cuda
        self.actor.cpu()
        self.actor_target.cpu()
        self.critic_target.cpu()
        self.critic.cpu()
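# Actor_Critic.act adds exploration noise from an OUNoise object that is not defined in
# this excerpt. The sketch below shows a conventional Ornstein-Uhlenbeck noise process
# of the kind commonly paired with DDPG-style actors. The constructor signature
# OUNoise(action_dim) matches the call above, but the parameter values (mu, theta, sigma)
# are assumptions for illustration, not the repository's actual settings.
import numpy as np


class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu          # long-run mean of the process
        self.theta = theta    # mean-reversion rate
        self.sigma = sigma    # noise scale
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); returns the updated state
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state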
class DDPG(object):

    def __init__(self, args):
        self.args = args

        self.actor = Actor(args, init=True)
        self.actor_target = Actor(args, init=True)
        self.actor_optim = Adam(self.actor.parameters(), lr=0.5e-4)

        self.critic = Critic(args)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=0.5e-3)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        hard_update(self.actor_target, self.actor)  # Make sure target starts with the same weights
        hard_update(self.critic_target, self.critic)

    def update_parameters(self, batch):
        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        if self.args.use_done_mask:
            done_batch = torch.cat(batch.done)

        # Load everything to GPU if not already
        if self.args.is_memory_cuda and not self.args.is_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()
            self.critic.cuda()
            state_batch = state_batch.cuda()
            next_state_batch = next_state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            if self.args.use_done_mask:
                done_batch = done_batch.cuda()

        # Critic Update
        next_action_batch = self.actor_target.forward(next_state_batch)
        with torch.no_grad():
            next_q = self.critic_target.forward(next_state_batch, next_action_batch)
        if self.args.use_done_mask:
            next_q = next_q * (1 - done_batch.float())  # Done mask
        target_q = reward_batch + (self.gamma * next_q)

        self.critic_optim.zero_grad()
        current_q = self.critic.forward(state_batch, action_batch)
        dt = self.loss(current_q, target_q)
        dt.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), 10)
        self.critic_optim.step()

        # Actor Update
        self.actor_optim.zero_grad()
        policy_loss = -self.critic.forward(state_batch, self.actor.forward(state_batch))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), 10)
        self.actor_optim.step()

        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        # Nets back to CPU if using memory_cuda
        if self.args.is_memory_cuda and not self.args.is_cuda:
            self.actor.cpu()
            self.actor_target.cpu()
            self.critic_target.cpu()
            self.critic.cpu()
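# A minimal usage sketch for DDPG.update_parameters. The Transition namedtuple, the
# example_ddpg_step driver and the replay_memory.sample call are hypothetical names
# introduced only for illustration; the class itself merely requires that `batch`
# expose .state, .next_state, .action, .reward and .done as sequences of tensors that
# torch.cat can join along dim 0.
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'done'))


def example_ddpg_step(agent, replay_memory, batch_size=128):
    # Hypothetical driver: sample transitions, regroup them field-wise, run one update
    transitions = replay_memory.sample(batch_size)   # assumed buffer API
    batch = Transition(*zip(*transitions))           # tuple of transitions -> field-wise tuples
    agent.update_parameters(batch)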
class Off_Policy_Algo(object):
    """Class implementing TD3 and DDPG off-policy learners.

    Parameters:
        wwid: Id tag passed through to the Actor
        algo_name (str): One of 'TD3', 'TD3_max', 'TD3_actor_min', 'DDPG' or 'dis'
        state_dim (int): State dimension
        action_dim (int): Action dimension
        actor_lr (float): Actor learning rate
        critic_lr (float): Critic learning rate
        gamma (float): Discount factor
        tau (float): Target-network soft-update rate
        init_w (bool): Whether to apply the custom weight initialization
    """

    def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w=True):
        self.algo_name = algo_name
        self.gamma = gamma
        self.tau = tau
        self.HLoss = HLoss()

        # Initialize actors
        self.actor = Actor(state_dim, action_dim, wwid, self.algo_name)
        if init_w:
            self.actor.apply(utils.init_weights)
        self.actor_target = Actor(state_dim, action_dim, wwid, self.algo_name)
        utils.hard_update(self.actor_target, self.actor)
        self.actor_optim = Adam(self.actor.parameters(), actor_lr)

        self.critic = Critic(state_dim, action_dim)
        if init_w:
            self.critic.apply(utils.init_weights)
        self.critic_target = Critic(state_dim, action_dim)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        self.loss = nn.MSELoss()

        if torch.cuda.is_available():
            self.actor_target.cuda()
            self.critic_target.cuda()
            self.actor.cuda()
            self.critic.cuda()

        self.num_critic_updates = 0

        # Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def save_net(self, path):
        torch.save(self.actor.state_dict(), path)

    def act(self, state):
        return self.actor(state)

    def share_memory(self):
        self.actor.share_memory()
        self.actor_target.share_memory()
        self.critic.share_memory()
        self.critic_target.share_memory()

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

        Parameters:
            tensor (tensor): tensor
            tracker (object): logger

        Returns:
            None
        """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, num_epoch=1, **kwargs):
        """Runs a step of Bellman update and policy gradient using a batch of experiences

        Parameters:
            state_batch (tensor): Current States
            next_state_batch (tensor): Next States
            action_batch (tensor): Actions
            reward_batch (tensor): Rewards
            done_batch (tensor): Done batch
            num_epoch (int): Number of learning iterations to run with the same data

        Returns:
            None
        """
        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):

            ########### CRITIC UPDATE ####################

            # Compute next q-val, next_v and target
            with torch.no_grad():
                # Policy Noise
                policy_noise = np.random.normal(0, kwargs['policy_noise'],
                                                (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -kwargs['policy_noise_clip'], kwargs['policy_noise_clip'])

                # Compute next action_batch
                # next_action_batch = self.actor_target.turn_max_into_onehot(self.actor_target.Gumbel_softmax_sample_distribution(next_state_batch, use_cuda=True)) \
                #     if self.algo_name == 'dis' else self.actor_target.forward(next_state_batch) + policy_noise.cuda()  # this should use one-hot from logits
                next_action_batch = self.actor_target.turn_max_into_onehot(self.actor_target.forward(next_state_batch)) \
                    if self.algo_name == 'dis' else self.actor_target.forward(next_state_batch) + policy_noise.cuda()  # this should use one-hot from logits
                if random.random() < 0.0001:
                    print('off_policy line 114, changed next action batch')
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                # Compute Q-val and value of next state masking by done
                q1, q2, _ = self.critic_target.forward(next_state_batch, next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2

                # Select which q to use as next-q (depends on algo)
                if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min' or self.algo_name == 'dis':
                    next_q = torch.min(q1, q2)
                elif self.algo_name == 'DDPG':
                    next_q = q1
                elif self.algo_name == 'TD3_max':
                    next_q = torch.max(q1, q2)

                # Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward(state_batch, action_batch)  # here the action batch should be the soft version
            self.compute_stats(current_q1, self.q)
            dt = self.loss(current_q1, target_q)
            if self.algo_name == 'TD3' or self.algo_name == 'TD3_max' or self.algo_name == 'dis':
                dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())
            # print(dt.item(), "off_policy_algo line 136")
            dt.backward()
            self.critic_optim.step()
            self.num_critic_updates += 1

            # Delayed Actor Update
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
                actor_actions = self.actor.Gumbel_softmax_sample_distribution(state_batch, use_cuda=True) \
                    if self.algo_name == 'dis' else self.actor.forward(state_batch)
                # actor_actions = self.actor.forward(state_batch)
                # if random.random() < 0.001: print('actor action changed')
                Q1, Q2, val = self.critic.forward(state_batch, actor_actions)

                # if self.args.use_advantage: policy_loss = -(Q1 - val)
                policy_loss = -Q1 + 0.1 * self.HLoss(actor_actions)  # HLoss is a single scalar, directly regularizes logits?
                if random.random() < 0.0005:
                    print('added entropy regularization, off_policy_algo 161')

                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()
                # print(policy_loss, 'off_policy line 157')

                self.actor_optim.zero_grad()
                policy_loss.backward(retain_graph=True)
                self.actor_optim.step()

                # if random.random() <= 0.001:
                #     self.test_actor_gradient_descent(state_batch)

            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
                utils.soft_update(self.actor_target, self.actor, self.tau)
                utils.soft_update(self.critic_target, self.critic, self.tau)

    def test_actor_gradient_descent(self, state_batch):
        # This method tests whether running gradient descent on the actor actually decreases the loss
        print("test_actor_gradient_descent, off_policy_algo line 179")
        for i in range(10):
            actor_actions = self.actor.forward(state_batch)
            print("logits_", self.actor.w_out(self.actor.logits(state_batch))[0])
            print("action_batch", actor_actions[0])
            Q1, Q2, val = self.critic.forward(state_batch, actor_actions)
            policy_loss = -Q1
            policy_loss = policy_loss.mean()
            print("policy_loss at i = ", i, " is ", policy_loss)
            self.actor_optim.zero_grad()
            policy_loss.backward(retain_graph=True)
            print("gradient_", self.actor.f1.bias.grad[0])
            self.actor_optim.step()
            print("bias_", self.actor.f1.bias[0])
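# The entropy regularizer HLoss used in the actor update above is not defined in this
# excerpt. The sketch below shows one common way such a module is written, consistent
# with the call self.HLoss(actor_actions) returning a single scalar; it treats its input
# as a batch of logits. This is an assumed stand-in, not the repository's implementation.
import torch.nn as nn
import torch.nn.functional as F


class HLoss(nn.Module):
    """Entropy of the softmax distribution implied by a batch of logits, summed to one scalar."""

    def forward(self, x):
        # -sum_i p_i * log p_i, where p = softmax(x, dim=1), summed over the batch as well
        b = F.softmax(x, dim=1) * F.log_softmax(x, dim=1)
        return -1.0 * b.sum()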
class Off_Policy_Algo(object):
    """Class implementing TD3 and DDPG off-policy learners.

    Parameters:
        wwid: Id tag passed through to the Actor
        algo_name (str): One of 'TD3', 'TD3_max', 'TD3_actor_min' or 'DDPG'
        state_dim (int): State dimension
        action_dim (int): Action dimension
        actor_lr (float): Actor learning rate
        critic_lr (float): Critic learning rate
        gamma (float): Discount factor
        tau (float): Target-network soft-update rate
        init_w (bool): Whether to apply the custom weight initialization
    """

    def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w=True):
        self.algo_name = algo_name
        self.gamma = gamma
        self.tau = tau

        # Initialize actors
        self.actor = Actor(state_dim, action_dim, wwid)
        if init_w:
            self.actor.apply(utils.init_weights)
        self.actor_target = Actor(state_dim, action_dim, wwid)
        utils.hard_update(self.actor_target, self.actor)
        self.actor_optim = Adam(self.actor.parameters(), actor_lr)

        self.critic = Critic(state_dim, action_dim)
        if init_w:
            self.critic.apply(utils.init_weights)
        self.critic_target = Critic(state_dim, action_dim)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        self.loss = nn.MSELoss()

        self.actor_target.cuda()
        self.critic_target.cuda()
        self.actor.cuda()
        self.critic.cuda()

        self.num_critic_updates = 0

        # Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

        Parameters:
            tensor (tensor): tensor
            tracker (object): logger

        Returns:
            None
        """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, num_epoch=1, **kwargs):
        """Runs a step of Bellman update and policy gradient using a batch of experiences

        Parameters:
            state_batch (tensor): Current States
            next_state_batch (tensor): Next States
            action_batch (tensor): Actions
            reward_batch (tensor): Rewards
            done_batch (tensor): Done batch
            num_epoch (int): Number of learning iterations to run with the same data

        Returns:
            None
        """
        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):

            ########### CRITIC UPDATE ####################

            # Compute next q-val, next_v and target
            with torch.no_grad():
                # Policy Noise
                policy_noise = np.random.normal(0, kwargs['policy_noise'],
                                                (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -kwargs['policy_noise_clip'], kwargs['policy_noise_clip'])

                # Compute next action_batch
                next_action_batch = self.actor_target.forward(next_state_batch) + policy_noise.cuda()
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                # Compute Q-val and value of next state masking by done
                q1, q2, _ = self.critic_target.forward(next_state_batch, next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2

                # Select which q to use as next-q (depends on algo)
                if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min':
                    next_q = torch.min(q1, q2)
                elif self.algo_name == 'DDPG':
                    next_q = q1
                elif self.algo_name == 'TD3_max':
                    next_q = torch.max(q1, q2)

                # Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward(state_batch, action_batch)
            self.compute_stats(current_q1, self.q)
            dt = self.loss(current_q1, target_q)
            if self.algo_name == 'TD3' or self.algo_name == 'TD3_max':
                dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())
            dt.backward()
            self.critic_optim.step()
            self.num_critic_updates += 1

            # Delayed Actor Update
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
                actor_actions = self.actor.forward(state_batch)
                Q1, Q2, val = self.critic.forward(state_batch, actor_actions)

                # if self.args.use_advantage: policy_loss = -(Q1 - val)
                policy_loss = -Q1
                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                self.actor_optim.zero_grad()
                policy_loss.backward(retain_graph=True)
                self.actor_optim.step()

            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
                utils.soft_update(self.actor_target, self.actor, self.tau)
                utils.soft_update(self.critic_target, self.critic, self.tau)
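# A minimal usage sketch for Off_Policy_Algo.update_parameters. The keyword arguments it
# reads from **kwargs (policy_noise, policy_noise_clip, policy_ups_freq) are the TD3
# hyperparameters for target-policy smoothing and delayed actor updates; the function
# name and the numeric values below are conventional TD3 defaults assumed here for
# illustration only, not settings taken from this repository.
def example_td3_update(learner, state_batch, next_state_batch, action_batch, reward_batch, done_batch):
    learner.update_parameters(
        state_batch, next_state_batch, action_batch, reward_batch, done_batch,
        num_epoch=1,
        policy_noise=0.2,        # std of Gaussian noise added to the target action
        policy_noise_clip=0.5,   # clipping range for that noise
        policy_ups_freq=2,       # actor/target updates once every 2 critic updates
    )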