def test_polyak():
    param1, param2 = th.nn.Parameter(th.ones((5, 5))), th.nn.Parameter(th.zeros((5, 5)))
    target1, target2 = th.nn.Parameter(th.ones((5, 5))), th.nn.Parameter(th.zeros((5, 5)))
    tau = 0.1
    polyak_update([param1], [param2], tau)
    with th.no_grad():
        for param, target_param in zip([target1], [target2]):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
    assert th.allclose(param1, target1)
    assert th.allclose(param2, target2)
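# For reference, a minimal sketch of the `polyak_update` helper exercised by the test above,
# assuming the Stable-Baselines3-style signature polyak_update(params, target_params, tau)
# that soft-copies parameters in place (illustrative, not the library's exact implementation):
from typing import Iterable

import torch as th


def polyak_update_sketch(params: Iterable[th.Tensor], target_params: Iterable[th.Tensor], tau: float) -> None:
    """In-place soft update: target <- tau * param + (1 - tau) * target."""
    with th.no_grad():
        for param, target_param in zip(params, target_params):
            target_param.data.mul_(1.0 - tau)
            target_param.data.add_(param.data, alpha=tau)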
def _on_step(self) -> None:
    """
    Update the exploration rate and target network if needed.
    This method is called in ``collect_rollouts()`` after each step in the environment.
    """
    if self.num_timesteps % self.target_update_interval == 0:
        polyak_update(self.q_net.parameters(), self.q_net_target.parameters(), self.tau)

    self.exploration_rate = self.exploration_schedule(self._current_progress_remaining)
    logger.record("rollout/exploration rate", self.exploration_rate)
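# `exploration_schedule` above maps the remaining training progress (1.0 at the start of
# training, 0.0 at the end) to an epsilon value. A minimal linear schedule sketch; the
# parameter names (initial_eps, final_eps, exploration_fraction) are illustrative assumptions:
def make_linear_exploration_schedule(initial_eps: float = 1.0, final_eps: float = 0.05,
                                     exploration_fraction: float = 0.1):
    """Return a function of progress_remaining (1.0 -> 0.0) yielding epsilon."""
    def schedule(progress_remaining: float) -> float:
        progress = 1.0 - progress_remaining  # fraction of training completed
        if progress > exploration_fraction:
            return final_eps
        return initial_eps + progress / exploration_fraction * (final_eps - initial_eps)
    return schedule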
def train(self, gradient_steps: int, batch_size: int = 100) -> None:
    # Update learning rate according to lr schedule
    self._update_learning_rate([self.actor.optimizer, self.critic.optimizer])

    actor_losses, critic_losses = [], []

    for gradient_step in range(gradient_steps):
        # Sample replay buffer
        replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)

        with th.no_grad():
            # Select action according to policy and add clipped noise
            noise = replay_data.actions.clone().data.normal_(0, self.target_policy_noise)
            noise = noise.clamp(-self.target_noise_clip, self.target_noise_clip)
            next_actions = (self.actor_target(replay_data.next_observations) + noise).clamp(-1, 1)

            # Compute the target Q value: min over all critics targets
            targets = th.cat(self.critic_target(replay_data.next_observations, next_actions), dim=1)
            target_q, _ = th.min(targets, dim=1, keepdim=True)
            target_q = replay_data.rewards + (1 - replay_data.dones) * self.gamma * target_q

        # Get current Q estimates for each critic network
        current_q_estimates = self.critic(replay_data.observations, replay_data.actions)

        # Compute critic loss
        critic_loss = sum([F.mse_loss(current_q, target_q) for current_q in current_q_estimates])
        critic_losses.append(critic_loss.item())

        # Optimize the critics
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # Delayed policy updates
        if gradient_step % self.policy_delay == 0:
            # Compute actor loss
            actor_loss = -self.critic.q1_forward(replay_data.observations,
                                                 self.actor(replay_data.observations)).mean()
            actor_losses.append(actor_loss.item())

            # Optimize the actor
            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            polyak_update(self.critic.parameters(), self.critic_target.parameters(), self.tau)
            polyak_update(self.actor.parameters(), self.actor_target.parameters(), self.tau)

    self._n_updates += gradient_steps
    logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
    logger.record("train/actor_loss", np.mean(actor_losses))
    logger.record("train/critic_loss", np.mean(critic_losses))
def learn_step(self, idxs, transition_batch, weights):
    Otm1, old_action, env_rew, done, Ot = transition_batch
    batch_size = len(Ot)

    observations = torch.tensor(Otm1, device=self.device)
    actions = torch.tensor(old_action, device=self.device)
    rewards = torch.tensor(env_rew, device=self.device)
    done = torch.tensor(done, device=self.device).float()
    next_observations = torch.tensor(Ot, device=self.device)
    # weights = torch.tensor(weights, device=self.device)

    # Keep gradients here: actions_pred / log_probs_pred feed the actor loss below
    actions_pred, log_probs_pred = self.policy(observations)

    # Entropy coefficient (alpha) update
    ent_coef = torch.exp(self.log_ent_coef.detach())
    ent_coef_loss = -(self.log_ent_coef * (log_probs_pred + self.target_entropy).detach()).mean()

    self.ent_coef_optimizer.zero_grad()
    ent_coef_loss.backward()
    self.ent_coef_optimizer.step()

    with torch.no_grad():
        next_actions, next_log_prob = self.policy(next_observations)
        targets = self.policy.critic_target(next_observations, next_actions)
        targets = torch.stack(targets, dim=1)
        target_q, _ = torch.min(targets, dim=1)
        # add entropy term
        target_q = target_q - ent_coef * next_log_prob
        # td error + entropy term
        q_backup = rewards + (1 - done) * self.gamma * target_q

    current_q_estimates = self.policy.critic(observations, actions)
    # Mean of the per-critic MSE losses (the original called `mean(...)` on a Python list of tensors)
    critic_loss = torch.stack([F.mse_loss(current_q, q_backup) for current_q in current_q_estimates]).mean()

    self.critic_optimizer.zero_grad()
    critic_loss.backward()
    self.critic_optimizer.step()

    q_values_pi = torch.stack(self.policy.critic.forward(observations, actions_pred), dim=1)
    min_qf_pi, _ = torch.min(q_values_pi, dim=1, keepdim=True)
    actor_loss = (ent_coef * log_probs_pred - min_qf_pi).mean()

    self.policy_optimizer.zero_grad()
    actor_loss.backward()
    self.policy_optimizer.step()

    polyak_update(self.policy.critic.parameters(), self.policy.critic_target.parameters(), self.tau)

    logger = self.logger
    logger.record_mean("ent_coef_loss", ent_coef_loss.item())
    logger.record_mean("critic_loss", critic_loss.item())
    logger.record_mean("actor_loss", actor_loss.item())
    logger.record_mean("q_backup", q_backup.mean().item())
    logger.record("policy_lr", self.policy_optimizer.get_last_lr())
def _on_update(self) -> None:
    if self._n_updates % self.target_update_interval == 0:
        polyak_update(self.q_net.parameters(), self.q_net_target.parameters(), self.tau)
        if not self.share:
            polyak_update(self.v_mlp_extractor.parameters(), self.v_mlp_extractor_target.parameters(), self.tau)

    if self.KL:
        if self._n_updates % 2 == 0:
            # if self.vloss_tracker.full and self.vloss_tracker.mean() < 5:
            self.train_mode = 'policy'
            # Careful: train() will call _on_update(), so the tracker must be cleared before train()
            self.vloss_tracker.clear()
            self.train(gradient_steps=self.gradient_steps, batch_size=self.batch_size)
            # self.train(gradient_steps=self.replay_buffer.size() * 2 // self.batch_size, batch_size=self.batch_size)
            self.train_mode = 'value'
            # print("policy updated")

    if self._n_updates % self.behav_update_interval == 0:
        polyak_update(self.action_net.parameters(), self.behav_net.parameters(), tau=self.behav_tau)
        if not self.share:
            polyak_update(self.a_mlp_extractor.parameters(), self.a_mlp_extractor_target.parameters(), tau=self.behav_tau)
        self.trajectories = [Trajectory(self.device) for i in range(self.n_envs)]
        self.trajectory_buffer.reset()

    self.exploration_rate = self.exploration_schedule(self._current_progress_remaining)
def train(self, gradient_steps: int, batch_size: int = 64) -> None:
    # Switch to train mode (this affects batch norm / dropout)
    self.policy.set_training_mode(True)
    # Update optimizers learning rate
    optimizers = [self.actor.optimizer, self.critic.optimizer]
    if self.ent_coef_optimizer is not None:
        optimizers += [self.ent_coef_optimizer]

    # Update learning rate according to lr schedule
    self._update_learning_rate(optimizers)

    ent_coef_losses, ent_coefs = [], []
    actor_losses, critic_losses = [], []

    for gradient_step in range(gradient_steps):
        # Sample replay buffer
        replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)

        # We need to sample because `log_std` may have changed between two gradient steps
        if self.use_sde:
            self.actor.reset_noise()

        # Action by the current actor for the sampled state
        actions_pi, log_prob = self.actor.action_log_prob(replay_data.observations)
        log_prob = log_prob.reshape(-1, 1)

        ent_coef_loss = None
        if self.ent_coef_optimizer is not None:
            # Important: detach the variable from the graph
            # so we don't change it with other losses
            # see https://github.com/rail-berkeley/softlearning/issues/60
            ent_coef = th.exp(self.log_ent_coef.detach())
            ent_coef_loss = -(self.log_ent_coef * (log_prob + self.target_entropy).detach()).mean()
            ent_coef_losses.append(ent_coef_loss.item())
        else:
            ent_coef = self.ent_coef_tensor

        ent_coefs.append(ent_coef.item())

        # Optimize entropy coefficient, also called
        # entropy temperature or alpha in the paper
        if ent_coef_loss is not None:
            self.ent_coef_optimizer.zero_grad()
            ent_coef_loss.backward()
            self.ent_coef_optimizer.step()

        with th.no_grad():
            # Select action according to policy
            next_actions, next_log_prob = self.actor.action_log_prob(replay_data.next_observations)
            # Compute the next Q values: min over all critics targets
            next_q_values = th.cat(self.critic_target(replay_data.next_observations, next_actions), dim=1)
            next_q_values, _ = th.min(next_q_values, dim=1, keepdim=True)
            # add entropy term
            next_q_values = next_q_values - ent_coef * next_log_prob.reshape(-1, 1)
            # td error + entropy term
            target_q_values = replay_data.rewards + (1 - replay_data.dones) * self.gamma * next_q_values

        # Get current Q-values estimates for each critic network
        # using action from the replay buffer
        current_q_values = self.critic(replay_data.observations, replay_data.actions)

        # Compute critic loss
        critic_loss = 0.5 * sum([F.mse_loss(current_q, target_q_values) for current_q in current_q_values])
        critic_losses.append(critic_loss.item())

        # Optimize the critic
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # Compute actor loss
        # Alternative: actor_loss = th.mean(log_prob - qf1_pi)
        # Mean over all critic networks
        q_values_pi = th.cat(self.critic.forward(replay_data.observations, actions_pi), dim=1)
        min_qf_pi, _ = th.min(q_values_pi, dim=1, keepdim=True)
        actor_loss = (ent_coef * log_prob - min_qf_pi).mean()
        actor_losses.append(actor_loss.item())

        # Optimize the actor
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

        # Update target networks
        if gradient_step % self.target_update_interval == 0:
            polyak_update(self.critic.parameters(), self.critic_target.parameters(), self.tau)

    self._n_updates += gradient_steps

    self.logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
    self.logger.record("train/ent_coef", np.mean(ent_coefs))
    self.logger.record("train/actor_loss", np.mean(actor_losses))
    self.logger.record("train/critic_loss", np.mean(critic_losses))
    if len(ent_coef_losses) > 0:
        self.logger.record("train/ent_coef_loss", np.mean(ent_coef_losses))
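# The entropy-coefficient update above relies on `self.target_entropy`. A small sketch of the
# common heuristic used when it is set automatically (target entropy = -dim(action space) for
# continuous actions); this mirrors the usual SAC default and is stated here as an assumption:
import numpy as np


def default_target_entropy(action_space) -> float:
    """Heuristic target entropy for a continuous (Box) action space."""
    return -float(np.prod(action_space.shape))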
def train(self, gradient_steps: int, batch_size: int) -> None:
    # Set mppisac coefficient (either constant or using schedule)
    if isinstance(self.mppisac_coef, float):
        mppisac_coef = self.mppisac_coef
        assert (mppisac_coef <= 1.0) and (mppisac_coef >= 0.0), "MPPISAC_coef should be between 0.0 and 1.0"
    else:
        mppisac_coef = self.mppisac_coef_schedule.value(
            step=self.num_timesteps - self.learning_starts,
            total_steps=self._total_timesteps)

    # Update optimizers learning rate
    optimizers = [self.actor.optimizer, self.critic.optimizer]
    if self.ent_coef_optimizer is not None:
        optimizers += [self.ent_coef_optimizer]

    # Update learning rate according to lr schedule
    self._update_learning_rate(optimizers)

    ent_coef_losses, ent_coefs = [], []
    actor_losses, critic_losses, mppisac_losses = [], [], []
    mb_train_losses, mb_valid_losses = [], []

    for gradient_step in range(gradient_steps):
        # Sample replay buffer
        replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)

        # Train the model-based MPPICTRL dynamics model
        if mppisac_coef > 0.0:
            mb_valid_loss = self.mbctrl.validate(
                states=replay_data.observations,
                next_states=replay_data.next_observations,
                actions=replay_data.actions,
            )
            mb_valid_losses.append(mb_valid_loss)

            mb_train_loss = self.mbctrl.train(
                states=replay_data.observations,
                next_states=replay_data.next_observations,
                actions=replay_data.actions,
            )
            mb_train_losses.append(np.mean(mb_train_loss))

            # MPPICTRL's suggested actions for observations
            # t1 = time.time()
            if self.mppisac_nprocesses == 1:
                actions_mb = self.mbctrl.act(replay_data.observations)
            else:
                with Pool(self.mppisac_nprocesses) as pool:
                    actions_mb_mp = pool.map(self.mbctrl.act, replay_data.observations)
                actions_mb = torch.stack(actions_mb_mp)
            # t2 = time.time()
            # to_log = f"Processing {replay_data.observations.shape[0]} observations in {t2-t1:.2f} seconds [{(t2-t1)/replay_data.observations.shape[0]:.3f}sec/action] using {self.mppisac_nprocesses} cpus"
            # logger.log(to_log, level=20)
            actions_mb = self.policy.scale_action(actions_mb)

            # Deterministic actions from the current SAC actor, used for the behavioral cloning loss
            actions_pi_deterministic = self.actor(obs=replay_data.observations, deterministic=True)

        # We need to sample because `log_std` may have changed between two gradient steps
        if self.use_sde:
            self.actor.reset_noise()

        # Action by the current actor for the sampled state
        actions_pi, log_prob = self.actor.action_log_prob(replay_data.observations)
        log_prob = log_prob.reshape(-1, 1)

        ent_coef_loss = None
        if self.ent_coef_optimizer is not None:
            # Important: detach the variable from the graph
            # so we don't change it with other losses
            # see https://github.com/rail-berkeley/softlearning/issues/60
            ent_coef = torch.exp(self.log_ent_coef.detach())
            ent_coef_loss = -(self.log_ent_coef * (log_prob + self.target_entropy).detach()).mean()
            ent_coef_losses.append(ent_coef_loss.item())
        else:
            ent_coef = self.ent_coef_tensor

        ent_coefs.append(ent_coef.item())

        # Optimize entropy coefficient, also called
        # entropy temperature or alpha in the paper
        if ent_coef_loss is not None:
            self.ent_coef_optimizer.zero_grad()
            ent_coef_loss.backward()
            self.ent_coef_optimizer.step()

        with torch.no_grad():
            # Select action according to policy
            next_actions, next_log_prob = self.actor.action_log_prob(replay_data.next_observations)
            # Compute the target Q value: min over all critics targets
            targets = torch.cat(self.critic_target(replay_data.next_observations, next_actions), dim=1)
            target_q, _ = torch.min(targets, dim=1, keepdim=True)
            # add entropy term
            target_q = target_q - ent_coef * next_log_prob.reshape(-1, 1)
            # td error + entropy term
            q_backup = replay_data.rewards + (1 - replay_data.dones) * self.gamma * target_q

            # q(s, MPPI_action), used when comparing Q-values instead of actions
            if mppisac_coef > 0.0 and self.mppisac_use_qdiff:
                actions_mb_qs = torch.cat(self.critic_target(replay_data.observations, actions_mb), dim=1)
                actions_mb_q, _ = torch.min(actions_mb_qs, dim=1, keepdim=True)

        # Get current Q estimates for each critic network
        # using action from the replay buffer
        current_q_estimates = self.critic(replay_data.observations, replay_data.actions)

        # Compute critic loss
        critic_loss = 0.5 * sum([F.mse_loss(current_q, q_backup) for current_q in current_q_estimates])
        critic_losses.append(critic_loss.item())

        # Optimize the critic
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # Compute actor loss
        # Alternative: actor_loss = th.mean(log_prob - qf1_pi)
        # Mean over all critic networks
        q_values_pi = torch.cat(self.critic.forward(replay_data.observations, actions_pi), dim=1)
        min_qf_pi, _ = torch.min(q_values_pi, dim=1, keepdim=True)
        sac_actor_loss = (ent_coef * log_prob - min_qf_pi).mean()

        # --- Behavioral Cloning (MPPISAC) loss ---
        if mppisac_coef > 0.0 and self.mppisac_use_qdiff:
            # using Q-values
            mppisac_loss = F.mse_loss(min_qf_pi, actions_mb_q, reduction="none").mean()
        elif mppisac_coef > 0.0:
            # using actions
            mppisac_loss = F.mse_loss(actions_pi_deterministic, actions_mb, reduction="none").mean()
        else:
            mppisac_loss = torch.tensor(0.0)
        mppisac_losses.append(mppisac_loss.item())

        actor_loss = (1.0 - mppisac_coef) * sac_actor_loss + mppisac_coef * mppisac_loss
        actor_losses.append(actor_loss.item())

        # Optimize the actor
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

        # Update target networks
        if gradient_step % self.target_update_interval == 0:
            polyak_update(self.critic.parameters(), self.critic_target.parameters(), self.tau)

    self._n_updates += gradient_steps

    logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
    logger.record("train/ent_coef", np.mean(ent_coefs))
    logger.record("train/actor_loss", np.mean(actor_losses))
    logger.record("train/critic_loss", np.mean(critic_losses))
    logger.record("train/mppi-sac_loss", np.mean(mppisac_losses))
    logger.record("train/mppi-sac_coef", mppisac_coef)
    logger.record("mbctrl/train_loss", np.mean(mb_train_losses))
    logger.record("mbctrl/valid_loss", np.mean(mb_valid_losses))
    if len(ent_coef_losses) > 0:
        logger.record("train/ent_coef_loss", np.mean(ent_coef_losses))
    self._dump_logs()
def train(self, gradient_steps: int, batch_size: int = 64) -> None:
    # Update optimizers learning rate
    optimizers = [self.actor.optimizer, self.critic.optimizer]
    # Update learning rate according to lr schedule
    self._update_learning_rate(optimizers)

    actor_losses, critic_losses = [], []

    for gradient_step in range(gradient_steps):
        # Sample replay buffer
        replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)
        batch_size = replay_data.observations.size(0)

        # Critic update
        with th.no_grad():
            target_next_actions = self.actor_target.forward(replay_data.next_observations)
            target_next_actions_q, _ = self.critic_target.forward(
                replay_data.next_observations, target_next_actions, self.action_dist_samples)
            target_next_actions_q = target_next_actions_q.transpose(1, 2)
            target_expected_Q = replay_data.rewards.unsqueeze(-1) + \
                (1 - replay_data.dones.unsqueeze(-1)) * self.gamma * target_next_actions_q

        expected_Q, taus = self.critic.forward(
            replay_data.observations, replay_data.actions, self.action_dist_samples)

        # Quantile Huber loss
        td_error = target_expected_Q - expected_Q
        huber_1 = calculate_huber_loss(td_error, 1.0)
        quantil_1 = abs(taus - (td_error.detach() < 0).float()) * huber_1 / 1.0
        critic_loss = (quantil_1.sum(dim=1).mean(dim=1, keepdim=True) * replay_data.weights).mean()
        critic_losses.append(critic_loss.item())

        # Optimize critic
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        clip_grad_norm_(self.critic.parameters(), 1)
        self.critic.optimizer.step()

        # Actor update
        actions = self.actor.forward(replay_data.observations)
        actions_q = self.critic.get_qvalues(replay_data.observations, actions)
        actor_loss = -actions_q.mean()

        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()
        actor_losses.append(actor_loss.item())

        self.replay_buffer.update_priorities(
            replay_data.indices,
            np.clip(abs(td_error.sum(dim=1).mean(dim=1, keepdim=True).data.cpu().numpy()), -1, 1))

        if gradient_step % self.target_update_interval == 0:
            polyak_update(self.actor.parameters(), self.actor_target.parameters(), self.tau)
            polyak_update(self.critic.parameters(), self.critic_target.parameters(), self.tau)

    self._n_updates += gradient_steps
    logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
    logger.record("train/actor_loss", np.mean(actor_losses))
    logger.record("train/critic_loss", np.mean(critic_losses))
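# `calculate_huber_loss` is referenced above but not defined in this snippet. A plausible
# element-wise implementation with threshold k, as typically paired with quantile-regression
# critics (a sketch, not necessarily the author's exact helper):
import torch as th


def calculate_huber_loss_sketch(td_error: th.Tensor, k: float = 1.0) -> th.Tensor:
    """Element-wise Huber loss: quadratic below k, linear above."""
    return th.where(td_error.abs() <= k, 0.5 * td_error.pow(2), k * (td_error.abs() - 0.5 * k))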
def train(self, gradient_steps: int, batch_size: int = 100) -> None:
    # Update learning rate according to lr schedule
    self._update_learning_rate([self.actor.optimizer, self.critic.optimizer])

    actor_losses, critic_losses = [], []

    for i in range(gradient_steps):
        use_bc_loss = False
        self._n_updates += 1
        updatas = 0

        # Sample replay buffer: alternate between expert demonstrations and the regular buffer
        # if self.use_expert_demonstration == True and updatas < (gradient_steps / 2):
        #     self.use_expert_demonstration = 0
        if self._n_updates % 2 == 0 and self.use_expert_demonstration == 1:
            replay_data = self.expert_buffer.sample(batch_size, env=self._vec_normalize_env)
            use_bc_loss = True
            # print('buffer_sample')
            # print(replay_data.actions)
        else:
            replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)
            # print('replay_sample')
            # print(replay_data.actions)

        # replay_expert_data = self.expert_buffer.sample(batch_size, env=self._vec_normalize_env)
        # replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)
        # print(test_data.next_observations[0])
        # replay_data = self.expert_buffer.sample(batch_size, env=self._vec_normalize_env)
        # print(type(replay_data.next_observations))
        # use_bc_loss = True

        with th.no_grad():
            # Select action according to policy and add clipped noise
            noise = replay_data.actions.clone().data.normal_(0, self.target_policy_noise)
            noise = noise.clamp(-self.target_noise_clip, self.target_noise_clip)
            next_actions = (self.actor_target(replay_data.next_observations) + noise).clamp(-1, 1)

            # Compute the next Q-values: min over all critics targets
            next_q_values = th.cat(self.critic_target(replay_data.next_observations, next_actions), dim=1)
            next_q_values, _ = th.min(next_q_values, dim=1, keepdim=True)
            target_q_values = replay_data.rewards + (1 - replay_data.dones) * self.gamma * next_q_values

        # Get current Q-values estimates for each critic network
        current_q_values = self.critic(replay_data.observations, replay_data.actions)

        # Compute critic loss
        critic_loss = sum([F.mse_loss(current_q, target_q_values) for current_q in current_q_values])
        critic_losses.append(critic_loss.item())

        # Optimize the critics
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # Delayed policy updates
        if self._n_updates % self.policy_delay == 0:
            if use_bc_loss:
                # Compute actor loss with an additional behavior cloning term
                current_actions = self.actor(replay_data.observations)
                bc_loss = sum([
                    F.mse_loss(current_action, replay_data.actions)
                    for current_action in current_actions
                ])
                actor_loss = -self.critic.q1_forward(
                    replay_data.observations, self.actor(replay_data.observations)).mean() + bc_loss
                actor_losses.append(actor_loss.item())
            else:
                actor_loss = -self.critic.q1_forward(
                    replay_data.observations, self.actor(replay_data.observations)).mean()
                actor_losses.append(actor_loss.item())

            # Optimize the actor
            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            polyak_update(self.critic.parameters(), self.critic_target.parameters(), self.tau)
            polyak_update(self.actor.parameters(), self.actor_target.parameters(), self.tau)

    logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
    if len(actor_losses) > 0:
        logger.record("train/actor_loss", np.mean(actor_losses))
    logger.record("train/critic_loss", np.mean(critic_losses))
def run(params: argparse.Namespace):
    Task.init()
    sb3_utils.set_random_seed(params.seed, using_cuda=use_cuda)
    writer = helper.get_summary_writer(__file__, params)
    env = helper.make_env(params, 'env')

    q = network.get_model_class(params)(env).to(device)
    q_hat = network.get_model_class(params)(env).to(device)
    q_hat.load_state_dict(q.state_dict())

    replay_buffer = ReplayBuffer(params.replay_size)
    # todo check optimizer
    opt = optim.Adam(q.parameters(), lr=params.learning_rate)

    all_rewards = []
    state = env.reset()
    episode_reward = [0]
    episode_no = 0

    for t in range(1, params.max_ts + 1):
        # order of terms important so that the call to 'next(eps)'
        # does not decrease epsilon
        epsilon = get_epsilon(params.epsilon_start, params.epsilon_end, params.epsilon_decay, t)
        if random.random() < epsilon:
            a = random.randrange(env.action_space.n)
        else:
            val = q(np.expand_dims(state, axis=0))
            # equivalent to q(...).max(1)[1].data[0]
            # (selects max tensor with .max(1) and its index with ...[1])
            a = torch.argmax(val).item()

        s_tp1, r, done, infos = env.step(a)
        episode_reward = list(map(add, episode_reward, [r]))
        replay_buffer.add(state, a, r, s_tp1, done)
        state = s_tp1

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = [0]
            episode_no += 1

        # replay buffer reached minimum capacity
        if len(replay_buffer) > params.start_train_ts:
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(params.batch_size)
            rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1).to(device)
            actions = torch.tensor(actions).unsqueeze(1).to(device)
            dones = torch.tensor(dones).unsqueeze(1).to(device)

            if True:
                with torch.no_grad():
                    # Compute the target Q values
                    target_q = q_hat(obses_tp1)
                    # Follow greedy policy: use the one with the highest value
                    target_q, _ = target_q.max(dim=1)
                    # Avoid potential broadcast issue
                    target_q = target_q.reshape(-1, 1)
                    # 1-step TD target
                    target_q = rewards + ~dones * params.gamma * target_q

                # Get current Q estimates
                current_q = q(obses_t)
                # Retrieve the q-values for the actions from the replay buffer
                current_q = torch.gather(current_q, dim=1, index=actions.long())

                # Compute Huber loss (less sensitive to outliers)
                loss = F.smooth_l1_loss(current_q, target_q)
            else:
                val_tp1 = q(obses_tp1)
                val_t = q(obses_t)
                val_hat_tp1 = q_hat(obses_tp1)
                # .T to iterate over columns of the array: https://stackoverflow.com/a/10148855/256002
                r = torch.from_numpy(rewards).to(device)
                # if params.summed_q:
                #     head = heads[idx]
                # else:
                #     head = heads[mirrored_envs.use_for_decisions_idx]
                a = torch.argmax(val_tp1, dim=1)
                td_errors = r + ~dones * params.gamma * val_hat_tp1.gather(1, a.unsqueeze(1)).squeeze()
                q_vals = val_t.gather(1, actions).squeeze()
                # loss = (td_errors.detach() - q_vals).pow(2).mean()
                loss = F.smooth_l1_loss(q_vals, td_errors.detach())

            if done:
                writer.add_scalar("loss_idx", loss.data, episode_no)
                writer.add_scalar("total_loss", loss.data, episode_no)

            # Optimize the policy
            opt.zero_grad()
            loss.backward()
            # Clip gradient norm
            torch.nn.utils.clip_grad_norm_(q.parameters(), params.max_grad_norm)
            opt.step()

            if t % params.target_network_update_f == 0:
                print('weights copied')
                sb3_utils.polyak_update(q.parameters(), q_hat.parameters(), 1.0)

        if done:
            for idx, ep_reward in enumerate(all_rewards[-1]):
                helper.add_scalar(writer, "episode_reward_idx{}".format(idx), ep_reward, episode_no, params)
            helper.add_scalar(writer, "steps_count", infos['steps_count'], episode_no, params)

            if episode_no % params.log_interval == 0:
                # print('replaybuffer size:', len(replay_buffer))
                out_str = "Timestep {}".format(t)
                if len(all_rewards) > 0:
                    out_str += ", Reward: {}".format(all_rewards[-1])
                out_str += ", done: {}".format(done)
                out_str += ', steps_count {}'.format(infos['steps_count'])
                out_str += ', epsilon {}'.format(epsilon)
                print(out_str)

    helper.close_summary_writer(writer)
def train(self, gradient_steps: int, batch_size: int = 64) -> None:
    # Update optimizers learning rate
    optimizers = [self.actor.optimizer, self.critic.optimizer]
    if self.ent_coef_optimizer is not None:
        optimizers += [self.ent_coef_optimizer]

    # Update learning rate according to lr schedule
    self._update_learning_rate(optimizers)

    ent_coef_losses, ent_coefs = [], []
    actor_losses, critic_losses = [], []

    for gradient_step in range(gradient_steps):
        # Sample replay buffer
        replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)

        # We need to sample because `log_std` may have changed between two gradient steps
        if self.use_sde:
            self.actor.reset_noise()

        # Action by the current actor for the sampled state
        actions_pi, log_prob = self.actor.action_log_prob(replay_data.observations)
        log_prob = log_prob.reshape(-1, 1)

        ent_coef_loss = None
        if self.ent_coef_optimizer is not None:
            # Important: detach the variable from the graph
            # so we don't change it with other losses
            # see https://github.com/rail-berkeley/softlearning/issues/60
            ent_coef = th.exp(self.log_ent_coef.detach())
            ent_coef_loss = -(self.log_ent_coef * (log_prob + self.target_entropy).detach()).mean()
            ent_coef_losses.append(ent_coef_loss.item())
        else:
            ent_coef = self.ent_coef_tensor

        ent_coefs.append(ent_coef.item())
        self.replay_buffer.ent_coef = ent_coef.item()

        # Optimize entropy coefficient, also called
        # entropy temperature or alpha in the paper
        if ent_coef_loss is not None:
            self.ent_coef_optimizer.zero_grad()
            ent_coef_loss.backward()
            self.ent_coef_optimizer.step()

        with th.no_grad():
            # Select action according to policy
            next_actions, next_log_prob = self.actor.action_log_prob(replay_data.next_observations)
            # Compute and cut quantiles at the next state
            # batch x nets x quantiles
            next_quantiles = self.critic_target(replay_data.next_observations, next_actions)

            # Sort and drop top k quantiles to control overestimation.
            n_target_quantiles = self.critic.quantiles_total - self.top_quantiles_to_drop_per_net * self.critic.n_critics
            next_quantiles, _ = th.sort(next_quantiles.reshape(batch_size, -1))
            next_quantiles = next_quantiles[:, :n_target_quantiles]

            # td error + entropy term
            target_quantiles = next_quantiles - ent_coef * next_log_prob.reshape(-1, 1)
            target_quantiles = replay_data.rewards + (1 - replay_data.dones) * self.gamma * target_quantiles
            # Make target_quantiles broadcastable to (batch_size, n_critics, n_target_quantiles).
            target_quantiles.unsqueeze_(dim=1)

        # Get current Quantile estimates using action from the replay buffer
        current_quantiles = self.critic(replay_data.observations, replay_data.actions)
        # Compute critic loss, not summing over the quantile dimension as in the paper.
        critic_loss = quantile_huber_loss(current_quantiles, target_quantiles, sum_over_quantiles=False)
        critic_losses.append(critic_loss.item())

        # Optimize the critic
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # Compute actor loss
        qf_pi = self.critic(replay_data.observations, actions_pi).mean(dim=2).mean(dim=1, keepdim=True)
        actor_loss = (ent_coef * log_prob - qf_pi).mean()
        actor_losses.append(actor_loss.item())

        # Optimize the actor
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

        # Update target networks
        if gradient_step % self.target_update_interval == 0:
            polyak_update(self.critic.parameters(), self.critic_target.parameters(), self.tau)

    self._n_updates += gradient_steps

    logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
    logger.record("train/ent_coef", np.mean(ent_coefs))
    logger.record("train/actor_loss", np.mean(actor_losses))
    logger.record("train/critic_loss", np.mean(critic_losses))
    if len(ent_coef_losses) > 0:
        logger.record("train/ent_coef_loss", np.mean(ent_coef_losses))
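# `quantile_huber_loss` comes from sb3-contrib; below is a simplified sketch of its behaviour for
# the shapes used above (current_quantiles: (batch, n_critics, n_quantiles), target_quantiles:
# (batch, 1, n_target_quantiles)). This is an approximation for illustration, not a verbatim copy
# of the library helper:
import torch as th


def quantile_huber_loss_sketch(current_quantiles: th.Tensor, target_quantiles: th.Tensor,
                               sum_over_quantiles: bool = False) -> th.Tensor:
    n_quantiles = current_quantiles.shape[-1]
    # Quantile fraction midpoints, broadcastable over the target-quantile dimension
    taus = (th.arange(n_quantiles, device=current_quantiles.device, dtype=th.float32) + 0.5) / n_quantiles
    taus = taus.view(1, 1, -1, 1)
    # Pairwise TD errors: (batch, n_critics, n_quantiles, n_target_quantiles)
    pairwise_delta = target_quantiles.unsqueeze(-2) - current_quantiles.unsqueeze(-1)
    abs_delta = pairwise_delta.abs()
    # Huber transform with threshold 1.0
    huber = th.where(abs_delta > 1.0, abs_delta - 0.5, 0.5 * pairwise_delta ** 2)
    # Asymmetric weighting by the quantile fractions
    loss = th.abs(taus - (pairwise_delta.detach() < 0).float()) * huber
    if sum_over_quantiles:
        return loss.sum(dim=-2).mean()
    return loss.mean()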
def train_mer(self, gradient_steps: int, batch_size: int = 64) -> None:
    optimizers = [self.actor.optimizer, self.critic.optimizer]
    if self.ent_coef_optimizer is not None:
        optimizers += [self.ent_coef_optimizer]
    optimizers_to_reset = optimizers

    # Reset optimizers:
    for i_optimizer, optimizer in enumerate(optimizers_to_reset):
        optimizer.__init__(optimizer.param_groups[0]['params'])
        optimizers[i_optimizer] = optimizer

    # Update optimizers learning rate
    base_lr = self.lr_schedule(self._current_progress_remaining)

    ent_coef_losses, ent_coefs = [], []
    actor_losses, critic_losses = [], []

    # Save initial weights of model - for actor and critic but not for critic_target
    models_to_update = [self.actor, self.critic]
    initial_state_dicts = [deepcopy(model.state_dict()) for model in models_to_update]
    initial_log_ent_coef = self.log_ent_coef.detach().clone()

    current_example_ind = randint(0, gradient_steps - 1)

    for gradient_step in range(gradient_steps):
        # Sample replay buffer or current example, and update learning rate accordingly
        if gradient_step == current_example_ind:
            replay_data = self.current_experience_buffer.sample(1, env=self._vec_normalize_env)
            for optimizer in optimizers:
                update_learning_rate(optimizer, base_lr * self.mer_s)
        else:
            replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)
            for optimizer in optimizers:
                update_learning_rate(optimizer, base_lr)

        # We need to sample because `log_std` may have changed between two gradient steps
        if self.use_sde:
            self.actor.reset_noise()

        # Action by the current actor for the sampled state
        actions_pi, log_prob = self.actor.action_log_prob(replay_data.observations)
        log_prob = log_prob.reshape(-1, 1)

        ent_coef_loss = None
        if self.ent_coef_optimizer is not None:
            # Important: detach the variable from the graph
            # so we don't change it with other losses
            # see https://github.com/rail-berkeley/softlearning/issues/60
            ent_coef = th.exp(self.log_ent_coef.detach())
            ent_coef_loss = -(self.log_ent_coef * (log_prob + self.target_entropy).detach()).mean()
            ent_coef_losses.append(ent_coef_loss.item())
        else:
            ent_coef = self.ent_coef_tensor

        ent_coefs.append(ent_coef.item())

        # Optimize entropy coefficient, also called
        # entropy temperature or alpha in the paper
        if ent_coef_loss is not None:
            self.ent_coef_optimizer.zero_grad()
            ent_coef_loss.backward()
            self.ent_coef_optimizer.step()

        with th.no_grad():
            # Select action according to policy
            next_actions, next_log_prob = self.actor.action_log_prob(replay_data.next_observations)
            # Compute the target Q value: min over all critics targets
            targets = th.cat(self.critic_target(replay_data.next_observations, next_actions), dim=1)
            target_q, _ = th.min(targets, dim=1, keepdim=True)
            # add entropy term
            target_q = target_q - ent_coef * next_log_prob.reshape(-1, 1)
            # td error + entropy term
            q_backup = replay_data.rewards + (1 - replay_data.dones) * self.gamma * target_q

        # Get current Q estimates for each critic network
        # using action from the replay buffer
        current_q_estimates = self.critic(replay_data.observations, replay_data.actions)

        # Compute critic loss
        critic_loss = 0.5 * sum([F.mse_loss(current_q, q_backup) for current_q in current_q_estimates])
        critic_losses.append(critic_loss.item())

        # Optimize the critic
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # Compute actor loss
        # Alternative: actor_loss = th.mean(log_prob - qf1_pi)
        # Mean over all critic networks
        q_values_pi = th.cat(self.critic.forward(replay_data.observations, actions_pi), dim=1)
        min_qf_pi, _ = th.min(q_values_pi, dim=1, keepdim=True)
        actor_loss = (ent_coef * log_prob - min_qf_pi).mean()
        actor_losses.append(actor_loss.item())

        # Optimize the actor
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

    # Perform Reptile step
    for i_model, model in enumerate(models_to_update):
        self.reptile_step_state_dict(model, initial_state_dicts[i_model])
    self.log_ent_coef.data = self.reptile_step_tensor(self.log_ent_coef.data, initial_log_ent_coef.data)

    # Update target networks
    polyak_update(self.critic.parameters(), self.critic_target.parameters(), self.tau)

    self._n_updates += gradient_steps

    logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
    logger.record("train/ent_coef", np.mean(ent_coefs))
    logger.record("train/actor_loss", np.mean(actor_losses))
    logger.record("train/critic_loss", np.mean(critic_losses))
    if len(ent_coef_losses) > 0:
        logger.record("train/ent_coef_loss", np.mean(ent_coef_losses))
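# `reptile_step_state_dict` / `reptile_step_tensor` are not defined in this excerpt. In the
# usual Reptile/MER meta-update, the weights are interpolated back toward their pre-inner-loop
# values; below is a sketch assuming a hypothetical interpolation rate `self.mer_gamma`:
def reptile_step_tensor(self, current: th.Tensor, initial: th.Tensor) -> th.Tensor:
    """Reptile outer step: move only part of the way from the initial to the updated weights."""
    return initial + self.mer_gamma * (current - initial)


def reptile_step_state_dict(self, model: th.nn.Module, initial_state_dict: dict) -> None:
    """Apply the Reptile interpolation to every entry of `model`'s state dict, in place."""
    new_state_dict = {
        name: self.reptile_step_tensor(tensor, initial_state_dict[name])
        for name, tensor in model.state_dict().items()
    }
    model.load_state_dict(new_state_dict)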
def train(self, gradient_steps: int, batch_size: int = 64) -> None:
    # Update optimizers learning rate
    optimizers = [self.actor.optimizer, self.critic.optimizer]
    # Update learning rate according to lr schedule
    self._update_learning_rate(optimizers)

    mean_loss_q, mean_loss_p, mean_loss_l, max_kl_μ, max_kl_Σ, max_kl = [], [], [], [], [], []
    actor_losses, critic_losses = [], []

    for gradient_step in range(gradient_steps):
        # Sample replay buffer
        replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)
        batch_size = replay_data.observations.size(0)

        with th.no_grad():
            # Sample "action_samples" num additional actions
            target_next_action_mean, target_next_action_cholesky, _ = self.actor_target.get_action_dist_params(
                replay_data.next_observations)
            target_next_action_dist = MultivariateNormal(
                target_next_action_mean, scale_tril=target_next_action_cholesky)
            target_sampled_next_actions = target_next_action_dist.sample(
                (self.action_samples, )).transpose(0, 1)

            # Compute mean of q values for the samples
            # Expand next_observation to match self.action_samples
            expanded_next_observations = replay_data.next_observations[:, None, :].expand(
                -1, self.action_samples, -1)
            target_sampled_next_actions_expected_q = get_min_critic_tensor(
                self.critic_target.forward(
                    expanded_next_observations.reshape(-1, self.features_dim),
                    target_sampled_next_actions.reshape(-1, self.action_dim))).reshape(
                        batch_size, self.action_samples).mean(dim=1)

            # Compute total expected return
            target_sampled_expected_return = replay_data.rewards.squeeze() + \
                (1 - replay_data.dones.squeeze()) * self.gamma * target_sampled_next_actions_expected_q

        # Optimize the critic
        critic_qs = self.critic.forward(replay_data.observations, replay_data.actions)
        critic_loss = 0.5 * sum([
            self.critic_loss(current_q.squeeze(), target_sampled_expected_return)
            for current_q in critic_qs
        ])
        critic_losses.append(critic_loss.item())

        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # Sample additional actions for E-Step
        with th.no_grad():
            target_action_mean, target_action_cholesky, _ = self.actor_target.get_action_dist_params(
                replay_data.observations)
            target_action_dist = MultivariateNormal(target_action_mean, scale_tril=target_action_cholesky)
            sampled_actions = target_action_dist.sample((self.action_samples, ))

            # Compute q values for the samples
            # Expand next_observation to match self.action_samples
            expanded_observations = replay_data.observations[None, ...].expand(self.action_samples, -1, -1)
            target_sampled_actions_expected_q = get_min_critic_tensor(
                self.critic_target.forward(
                    expanded_observations.reshape(-1, self.features_dim),
                    sampled_actions.reshape(-1, self.action_dim))).reshape(
                        self.action_samples, batch_size)
            target_sampled_actions_expected_q_np = target_sampled_actions_expected_q.cpu().numpy()

        # Define dual function
        def dual(η):
            max_q = np.max(target_sampled_actions_expected_q_np, 0)
            return η * self.ε_dual + np.mean(max_q) \
                + η * np.mean(np.log(np.mean(np.exp((target_sampled_actions_expected_q_np - max_q) / η), axis=0)))

        bounds = [(1e-6, None)]
        self.η = np.max([self.η, 1e-6])
        res = minimize(dual, np.array([self.η]), method='SLSQP', bounds=bounds)
        self.η = res.x[0]

        qij = th.softmax(target_sampled_actions_expected_q / self.η, dim=0)

        # M-Step
        for _ in range(self.lagrange_iterations):
            action_mean, action_cholesky, _ = self.actor.get_action_dist_params(replay_data.observations)
            π1 = MultivariateNormal(action_mean, scale_tril=target_action_cholesky)
            π2 = MultivariateNormal(target_action_mean, scale_tril=action_cholesky)
            loss_p = th.mean(
                qij * (π1.expand((self.action_samples, batch_size)).log_prob(sampled_actions)
                       + π2.expand((self.action_samples, batch_size)).log_prob(sampled_actions)))
            mean_loss_p.append((-loss_p).item())

            kl_μ, kl_Σ = gaussian_kl(
                μ_target=target_action_mean, μ=action_mean,
                A_target=target_action_cholesky, A=action_cholesky)
            max_kl_μ.append(kl_μ.item())
            max_kl_Σ.append(kl_Σ.item())

            self.η_kl_μ -= self.α * (self.ε_kl_μ - kl_μ).detach().item()
            self.η_kl_Σ -= self.α * (self.ε_kl_Σ - kl_Σ).detach().item()

            if self.η_kl_μ < 0.0:
                self.η_kl_μ = 0.0
            if self.η_kl_Σ < 0.0:
                self.η_kl_Σ = 0.0

            self.actor.optimizer.zero_grad()
            actor_loss = -(loss_p
                           + self.η_kl_μ * (self.ε_kl_μ - kl_μ)
                           + self.η_kl_Σ * (self.ε_kl_Σ - kl_Σ))
            actor_losses.append(actor_loss.item())

            # Optimize actor
            actor_loss.backward()
            clip_grad_norm_(self.actor.parameters(), 0.1)
            self.actor.optimizer.step()

        if gradient_step % self.target_update_interval == 0:
            polyak_update(self.actor.parameters(), self.actor_target.parameters(), self.tau)
            polyak_update(self.critic.parameters(), self.critic_target.parameters(), self.tau)

    self._n_updates += gradient_steps
    logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
    logger.record("train/actor_loss", np.mean(actor_losses))
    logger.record("train/critic_loss", np.mean(critic_losses))
    logger.record("train/actor_policy_loss", np.mean(mean_loss_p))
    logger.record("train/max_kl_mean", np.max(max_kl_μ))
    logger.record("train/mean_kl_mean", np.mean(max_kl_μ))
    logger.record("train/max_kl_std", np.max(max_kl_Σ))
    logger.record("train/mean_kl_std", np.mean(max_kl_Σ))
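# `get_min_critic_tensor` is not defined in this excerpt. Given how it is used above (reducing
# the tuple of per-critic Q outputs to one value per input row), a plausible helper is sketched
# below; treat it as an assumption about the author's code rather than the actual implementation:
import torch as th


def get_min_critic_tensor_sketch(critic_outputs) -> th.Tensor:
    """Element-wise minimum over a tuple/list of per-critic Q-value tensors of shape (N, 1)."""
    return th.min(th.cat([q.reshape(-1, 1) for q in critic_outputs], dim=1), dim=1, keepdim=True).values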