def train_shared(self):
    '''Trains the network when the actor and critic share parameters'''
    clock = self.body.env.clock
    if self.to_train == 1:
        # update old net
        torch.cuda.empty_cache()
        net_util.copy(self.net, self.old_net)
        batch = self.sample()
        total_loss = torch.tensor(0.0, device=self.net.device)
        for _ in range(self.training_epoch):
            with torch.no_grad():
                advs, v_targets = self.calc_advs_v_targets(batch)
            policy_loss = self.calc_policy_loss(batch, advs)  # from actor
            val_loss = self.calc_val_loss(batch, v_targets)  # from critic
            loss = policy_loss + val_loss
            # retain for entropies etc.
            self.net.training_step(loss=loss, lr_clock=clock, retain_graph=True)
            total_loss += loss
        loss = total_loss / self.training_epoch
        # reset
        self.to_train = 0
        self.body.flush()
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
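# A minimal standalone sketch of the kind of quantities calc_advs_v_targets returns:
# GAE advantages and value targets (advs + v_preds). Whether this agent uses GAE or
# n-step returns depends on its spec, so this is an assumed illustration, not the
# repo's actual calc_advs_v_targets.
import torch

def calc_gaes(rewards, dones, v_preds, next_v_pred, gamma=0.99, lam=0.95):
    # rewards, dones, v_preds: tensors of shape (T,); next_v_pred: scalar bootstrap value
    T = len(rewards)
    v_all = torch.cat([v_preds, next_v_pred.view(1)])
    advs = torch.zeros(T)
    gae = 0.0
    for t in reversed(range(T)):
        not_done = 1.0 - dones[t]
        delta = rewards[t] + gamma * v_all[t + 1] * not_done - v_all[t]
        gae = delta + gamma * lam * not_done * gae
        advs[t] = gae
    v_targets = advs + v_preds  # targets for the critic regression
    return advs, v_targets

rewards = torch.ones(5)
dones = torch.tensor([0., 0., 0., 0., 1.])
v_preds = torch.zeros(5)
advs, v_targets = calc_gaes(rewards, dones, v_preds, torch.tensor(0.0))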
def train_shared(self):
    '''Trains the network when the actor and critic share parameters'''
    if self.to_train == 1:
        # update old net
        net_util.copy(self.net, self.old_net)
        batch = self.sample()
        total_loss = torch.tensor(0.0, device=self.net.device)
        for _ in range(self.training_epoch):
            with torch.no_grad():
                advs, v_targets = self.calc_advs_v_targets(batch)
            policy_loss = self.calc_policy_loss(batch, advs)  # from actor
            val_loss = self.calc_val_loss(batch, v_targets)  # from critic
            loss = policy_loss + val_loss
            # retain for entropies etc.
            self.net.training_step(loss=loss, retain_graph=True, global_net=self.global_nets.get('net'))
            total_loss += loss
        loss = total_loss / self.training_epoch
        # reset
        self.to_train = 0
        self.body.entropies = []
        self.body.log_probs = []
        logger.debug(f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}')
        return loss.item()
    else:
        return np.nan
def train_separate(self):
    '''Trains the network when the actor and critic do not share parameters'''
    if self.to_train == 1:
        batch = self.sample()
        total_loss = torch.tensor(0.0)
        for _ in range(self.training_epoch):
            loss = self.calc_loss(batch)
            # to reuse loss for critic
            loss.backward = partial(loss.backward, retain_graph=True)
            self.net.training_step(loss=loss)
            # critic.optim.step using the same loss
            loss.backward = partial(loss.backward, retain_graph=False)
            self.critic.training_step(loss=loss)
            total_loss += loss
        loss = total_loss.mean()
        net_util.copy(self.net, self.old_net)
        net_util.copy(self.critic, self.old_critic)
        # reset
        self.to_train = 0
        self.body.log_probs = []
        self.body.entropies = []
        logger.debug(f'Loss: {loss:.2f}')
        self.last_loss = loss.item()
    return self.last_loss
def train_shared(self):
    '''Trains the network when the actor and critic share parameters'''
    if self.to_train == 1:
        batch = self.sample()
        total_loss = torch.tensor(0.0)
        for _ in range(self.training_epoch):
            with torch.no_grad():
                advs, v_targets = self.calc_advs_v_targets(batch)
            policy_loss = self.calc_policy_loss(batch, advs)  # from actor
            val_loss = self.calc_val_loss(batch, v_targets)  # from critic
            loss = policy_loss + val_loss
            # retain for entropies etc.
            self.net.training_step(loss=loss, retain_graph=True)
            total_loss += loss.cpu()
        loss = total_loss / self.training_epoch
        net_util.copy(self.net, self.old_net)
        # reset
        self.to_train = 0
        self.body.log_probs = []
        self.body.entropies = []
        logger.debug(f'Loss: {loss:.2f}')
        self.last_loss = loss.item()
    return self.last_loss
def train_step(self, loss, optim, lr_scheduler=None, clock=None, global_net=None):
    if lr_scheduler is not None:
        lr_scheduler.step(epoch=ps.get(clock, 'frame'))
    optim.zero_grad()
    loss.backward()
    if self.previous_grads is not None:
        # track the running average of gradient norms and clip against it
        total_norm = 0.0
        for p in self.parameters():
            param_norm = p.grad.data.norm(2)
            total_norm += param_norm.item() ** 2
        total_norm = total_norm ** (1. / 2)
        self.previous_grads.append(total_norm)
        average_grad = sum(self.previous_grads) / len(self.previous_grads)
        if self.clip_grad_val is not None:
            nn.utils.clip_grad_norm_(self.parameters(), min(average_grad * self.grad_scaler, self.clip_grad_val))
        else:
            nn.utils.clip_grad_norm_(self.parameters(), average_grad * self.grad_scaler)
    elif self.clip_grad_val is not None:
        nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val)
    if global_net is not None:
        net_util.push_global_grads(self, global_net)
    optim.step()
    if global_net is not None:
        net_util.copy(global_net, self)
    if clock is not None:
        clock.tick('opt_step')
    return loss
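# A minimal standalone sketch of the adaptive clipping used in train_step above,
# assuming `previous_grads` is a bounded deque of recent gradient L2 norms and
# `grad_scaler` is a multiplier on their running average. The deque length and
# the constants below are hypothetical configuration, not taken from a spec.
import collections
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
previous_grads = collections.deque(maxlen=100)  # bounded history of grad norms
grad_scaler = 1.5  # clip threshold = grad_scaler * running-average norm
clip_grad_val = 0.5  # hard upper bound, as in the elif branch above

loss = model(torch.randn(8, 4)).pow(2).mean()
loss.backward()
# total L2 norm of the current gradients
total_norm = sum(p.grad.data.norm(2).item() ** 2 for p in model.parameters()) ** 0.5
previous_grads.append(total_norm)
average_grad = sum(previous_grads) / len(previous_grads)
nn.utils.clip_grad_norm_(model.parameters(), min(average_grad * grad_scaler, clip_grad_val))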
def update_nets(self):
    '''Update target networks'''
    if util.frame_mod(self.body.env.clock.frame, self.q1_net.update_frequency, self.body.env.num_envs):
        if self.q1_net.update_type == 'replace':
            net_util.copy(self.q1_net, self.target_q1_net)
            net_util.copy(self.q2_net, self.target_q2_net)
        elif self.q1_net.update_type == 'polyak':
            net_util.polyak_update(self.q1_net, self.target_q1_net, self.q1_net.polyak_coef)
            net_util.polyak_update(self.q2_net, self.target_q2_net, self.q2_net.polyak_coef)
        else:
            raise ValueError('Unknown q1_net.update_type. Should be "replace" or "polyak". Exiting.')
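# A minimal sketch of the two target-update modes referenced above: 'replace'
# hard-copies parameters, while 'polyak' soft-updates them with a coefficient.
# These bodies follow the common pattern and are assumed equivalents, not the
# exact net_util.copy / net_util.polyak_update implementations.
import torch.nn as nn

def hard_copy(src_net, tar_net):
    # 'replace': overwrite all target parameters with the source parameters
    tar_net.load_state_dict(src_net.state_dict())

def polyak_update(src_net, tar_net, polyak_coef=0.005):
    # 'polyak': exponential moving average of source into target
    for src_param, tar_param in zip(src_net.parameters(), tar_net.parameters()):
        tar_param.data.copy_(polyak_coef * src_param.data + (1.0 - polyak_coef) * tar_param.data)

q_net, target_q_net = nn.Linear(4, 2), nn.Linear(4, 2)
hard_copy(q_net, target_q_net)             # 'replace'
polyak_update(q_net, target_q_net, 0.005)  # 'polyak'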
def train_step(self, loss, optim, lr_scheduler, clock, global_net=None):
    lr_scheduler.step(epoch=ps.get(clock, 'frame'))
    optim.zero_grad()
    loss.backward()
    if self.clip_grad_val is not None:
        nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val)
    if global_net is not None:
        net_util.push_global_grads(self, global_net)
    optim.step()
    if global_net is not None:
        net_util.copy(global_net, self)
    clock.tick('opt_step')
    return loss
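# A minimal sketch of the global-gradient push used by train_step when a
# global_net is provided (the Hogwild/A3C pattern): local gradients are copied
# onto the shared global network, the optimizer steps the global parameters,
# and the updated weights are pulled back locally. push_global_grads' real body
# is assumed to follow this standard pattern.
import torch
import torch.nn as nn

def push_global_grads(local_net, global_net):
    for local_param, global_param in zip(local_net.parameters(), global_net.parameters()):
        global_param._grad = local_param.grad  # share the locally computed gradient

global_net = nn.Linear(4, 2)
global_net.share_memory()  # shared across worker processes
local_net = nn.Linear(4, 2)
local_net.load_state_dict(global_net.state_dict())
optim = torch.optim.Adam(global_net.parameters(), lr=1e-3)

optim.zero_grad()
loss = local_net(torch.randn(8, 4)).pow(2).mean()
loss.backward()
push_global_grads(local_net, global_net)
optim.step()
local_net.load_state_dict(global_net.state_dict())  # pull updated weights back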
def update_nets(self):
    total_t = self.body.env.clock.total_t
    if total_t % self.net.update_frequency == 0:
        if self.net.update_type == 'replace':
            logger.debug('Updating target_net by replacing')
            net_util.copy(self.net, self.target_net)
        elif self.net.update_type == 'polyak':
            logger.debug('Updating net by averaging')
            net_util.polyak_update(self.net, self.target_net, self.net.polyak_coef)
        else:
            raise ValueError('Unknown net.update_type. Should be "replace" or "polyak". Exiting.')
def train(self):
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        net_util.copy(self.net, self.old_net)  # update old net
        batch = self.sample()
        clock.set_batch_size(len(batch))
        with torch.no_grad():
            states = batch['states']
            if self.body.env.is_venv:
                states = math_util.venv_unpack(states)
            # NOTE states is massive with batch_size = time_horizon * num_envs. Chunk up so forward pass can fit into device esp. GPU
            num_chunks = int(len(states) / self.minibatch_size)
            v_preds_chunks = [self.calc_v(states_chunk, use_cache=False) for states_chunk in torch.chunk(states, num_chunks)]
            v_preds = torch.cat(v_preds_chunks)
            advs, v_targets = self.calc_advs_v_targets(batch, v_preds)
            # piggy back on batch, but remember to not pack or unpack
            batch['advs'], batch['v_targets'] = advs, v_targets
        if self.body.env.is_venv:  # unpack if venv for minibatch sampling
            for k, v in batch.items():
                if k not in ('advs', 'v_targets'):
                    batch[k] = math_util.venv_unpack(v)
        total_loss = torch.tensor(0.0)
        for _ in range(self.training_epoch):
            minibatches = util.split_minibatch(batch, self.minibatch_size)
            for minibatch in minibatches:
                if self.body.env.is_venv:  # re-pack to restore proper shape
                    for k, v in minibatch.items():
                        if k not in ('advs', 'v_targets'):
                            minibatch[k] = math_util.venv_pack(v, self.body.env.num_envs)
                advs, v_targets = minibatch['advs'], minibatch['v_targets']
                pdparams, v_preds = self.calc_pdparam_v(minibatch)
                policy_loss = self.calc_policy_loss(minibatch, pdparams, advs)  # from actor
                val_loss = self.calc_val_loss(v_preds, v_targets)  # from critic
                if self.shared:  # shared network
                    loss = policy_loss + val_loss
                    self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                else:
                    self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                    self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net)
                    loss = policy_loss + val_loss
                total_loss += loss
        loss = total_loss / self.training_epoch / len(minibatches)
        # reset
        self.to_train = 0
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
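# A minimal sketch of the venv pack/unpack reshaping that train() relies on:
# vector-env batches are stored as (time_horizon, num_envs, ...) and must be
# flattened to (time_horizon * num_envs, ...) for chunked forward passes and
# minibatching, then restored. The math_util names come from the code above;
# these bodies are assumed equivalents, not the library's exact implementation.
import torch

def venv_unpack(v):
    # (time, num_envs, *shape) -> (time * num_envs, *shape)
    return v.reshape(-1, *v.shape[2:])

def venv_pack(v, num_envs):
    # (time * num_envs, *shape) -> (time, num_envs, *shape)
    return v.reshape(-1, num_envs, *v.shape[1:])

states = torch.randn(128, 8, 4)  # time_horizon=128, num_envs=8, state_dim=4
flat = venv_unpack(states)       # shape (1024, 4), ready for chunked forward passes
assert venv_pack(flat, 8).shape == states.shape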
def init_nets(self, global_nets=None):
    '''
    Networks: net(actor/policy), q1_net, target_q1_net, q2_net, target_q2_net
    All networks are separate, and have the same hidden layer architectures and optim specs, so tuning is minimal
    '''
    self.shared = False  # SAC does not share networks
    NetClass = getattr(net, self.net_spec['type'])
    # main actor network
    self.net = NetClass(self.net_spec, self.body.state_dim, net_util.get_out_dim(self.body))
    self.net_names = ['net']
    # two critic Q-networks to mitigate positive bias in q_loss and speed up training, uses q_net.py with prefix Q
    QNetClass = getattr(net, 'Q' + self.net_spec['type'])
    q_in_dim = [self.body.state_dim, self.body.action_dim]
    self.q1_net = QNetClass(self.net_spec, q_in_dim, 1)
    self.target_q1_net = QNetClass(self.net_spec, q_in_dim, 1)
    self.q2_net = QNetClass(self.net_spec, q_in_dim, 1)
    self.target_q2_net = QNetClass(self.net_spec, q_in_dim, 1)
    self.net_names += ['q1_net', 'target_q1_net', 'q2_net', 'target_q2_net']
    net_util.copy(self.q1_net, self.target_q1_net)
    net_util.copy(self.q2_net, self.target_q2_net)
    # temperature variable to be learned, and its target entropy
    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.net.device)
    self.alpha = self.log_alpha.detach().exp()
    if self.body.is_discrete:
        self.target_entropy = -self.body.action_space.n
    else:
        self.target_entropy = -np.product(self.body.action_space.shape)
    # init net optimizer and its lr scheduler
    self.optim = net_util.get_optim(self.net, self.net.optim_spec)
    self.lr_scheduler = net_util.get_lr_scheduler(self.optim, self.net.lr_scheduler_spec)
    self.q1_optim = net_util.get_optim(self.q1_net, self.q1_net.optim_spec)
    self.q1_lr_scheduler = net_util.get_lr_scheduler(self.q1_optim, self.q1_net.lr_scheduler_spec)
    self.q2_optim = net_util.get_optim(self.q2_net, self.q2_net.optim_spec)
    self.q2_lr_scheduler = net_util.get_lr_scheduler(self.q2_optim, self.q2_net.lr_scheduler_spec)
    self.alpha_optim = net_util.get_optim(self.log_alpha, self.net.optim_spec)
    self.alpha_lr_scheduler = net_util.get_lr_scheduler(self.alpha_optim, self.net.lr_scheduler_spec)
    net_util.set_global_nets(self, global_nets)
    self.post_init_nets()
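# A minimal sketch of how the learned temperature set up in init_nets is typically
# updated in SAC: log_alpha is trained to keep the policy's entropy near
# target_entropy. The loss form below is the standard SAC temperature objective,
# assumed here rather than copied from this agent's own alpha-loss method.
import torch

log_alpha = torch.zeros(1, requires_grad=True)
alpha_optim = torch.optim.Adam([log_alpha], lr=3e-4)
target_entropy = -6.0  # e.g. -np.product(action_space.shape) for a 6-dim action

log_probs = torch.randn(32)  # log pi(a|s) for a sampled batch (placeholder values)
alpha_loss = -(log_alpha * (log_probs + target_entropy).detach()).mean()
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()
alpha = log_alpha.detach().exp()  # temperature used in the policy and Q losses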
def train_separate(self):
    '''Trains the network when the actor and critic do not share parameters'''
    if self.to_train == 1:
        net_util.copy(self.net, self.old_net)
        batch = self.sample()
        policy_loss = self.train_actor(batch)
        val_loss = self.train_critic(batch)
        loss = val_loss + abs(policy_loss)
        # reset
        self.to_train = 0
        self.body.log_probs = []
        self.body.entropies = []
        logger.debug(f'Loss: {loss:.4f}')
        self.last_loss = loss.item()
    return self.last_loss
def train_separate(self):
    '''Trains the network when the actor and critic do not share parameters'''
    if self.to_train == 1:
        batch = self.sample()
        policy_loss = self.train_actor(batch)
        val_loss = self.train_critic(batch)
        loss = val_loss + abs(policy_loss)
        net_util.copy(self.net, self.old_net)
        net_util.copy(self.critic, self.old_critic)
        # reset
        self.to_train = 0
        self.body.log_probs = []
        self.body.entropies = []
        logger.debug(f'Loss: {loss:.2f}')
        self.last_loss = loss.item()
    return self.last_loss
def train_separate(self):
    '''Trains the network when the actor and critic do not share parameters'''
    if self.to_train == 1:
        net_util.copy(self.net, self.old_net)
        batch = self.sample()
        policy_loss = self.train_actor(batch)
        val_loss = self.train_critic(batch)
        loss = val_loss + abs(policy_loss)
        # reset
        self.to_train = 0
        self.body.entropies = []
        self.body.log_probs = []
        logger.debug(f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}')
        return loss.item()
    else:
        return np.nan
def train_shared(self):
    '''Trains the network when the actor and critic share parameters'''
    if self.to_train == 1:
        batch = self.sample()
        total_loss = torch.tensor(0.0)
        for _ in range(self.training_epoch):
            loss = self.calc_loss(batch)
            self.net.training_step(loss=loss)
            total_loss += loss
        loss = total_loss.mean()
        net_util.copy(self.net, self.old_net)
        # reset
        self.to_train = 0
        self.body.log_probs = []
        self.body.entropies = []
        logger.debug(f'Loss: {loss:.2f}')
        self.last_loss = loss.item()
    return self.last_loss
def train_separate(self):
    '''Trains the network when the actor and critic do not share parameters'''
    clock = self.body.env.clock
    if self.to_train == 1:
        torch.cuda.empty_cache()
        net_util.copy(self.net, self.old_net)
        batch = self.sample()
        policy_loss = self.train_actor(batch)
        val_loss = self.train_critic(batch)
        loss = val_loss + policy_loss
        # reset
        self.to_train = 0
        self.body.flush()
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan