def training_step(self, x=None, y=None, loss=None, retain_graph=False, global_net=None): ''' Takes a single training step: one forward and one backwards pass. For most RL usage we have custom, often complicated, loss functions; compute the loss value yourself, put it in a pytorch tensor, and pass it in as loss ''' self.train() self.zero_grad() self.optim.zero_grad() if loss is None: out = self(x) loss = self.loss_fn(out, y) assert not torch.isnan(loss).any(), loss if net_util.to_assert_trained(): # to accommodate split model in inherited classes model = getattr(self, 'model', None) or getattr(self, 'model_body') assert_trained = net_util.gen_assert_trained(model) loss.backward(retain_graph=retain_graph) if self.clip_grad: logger.debug(f'Clipping gradient: {self.clip_grad_val}') torch.nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val) if global_net is None: self.optim.step() else: # distributed training with global net net_util.push_global_grad(self, global_net) self.optim.step() net_util.pull_global_param(self, global_net) if net_util.to_assert_trained(): model = getattr(self, 'model', None) or getattr(self, 'model_body') assert_trained(model, loss) logger.debug(f'Net training_step loss: {loss}') return loss
def train_shared(self): ''' Trains the network when the actor and critic share parameters loss = self.policy_loss_coef * policy_loss + self.val_loss_coef * val_loss ''' if self.to_train == 1: batch = self.sample() with torch.no_grad(): advs, v_targets = self.calc_advs_v_targets(batch) policy_loss = self.calc_policy_loss(batch, advs) # from actor val_loss = self.calc_val_loss(batch, v_targets) # from critic loss = policy_loss + val_loss self.net.training_step(loss=loss, global_net=self.global_nets.get('net')) # reset self.to_train = 0 self.body.entropies = [] self.body.log_probs = [] logger.debug( f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}' ) return loss.item() else: return np.nan
def test_logger(test_str): logger.critical(test_str) logger.debug(test_str) logger.error(test_str) logger.exception(test_str) logger.info(test_str) logger.warning(test_str)
def calc_log_probs(algorithm, net, body, batch): ''' Method to calculate log_probs fresh from batch data Body already stores log_prob from self.net. This is used for PPO where log_probs needs to be recalculated. ''' states, actions = batch['states'], batch['actions'] action_dim = body.action_dim is_multi_action = ps.is_iterable(action_dim) # construct log_probs for each state-action pdparams = algorithm.calc_pdparam(states, net=net) pdparams = guard_multi_pdparams(pdparams, body) assert len(pdparams) == len(states), f'batch_size of pdparams: {len(pdparams)} vs states: {len(states)}' pdtypes = ACTION_PDS[body.action_type] ActionPD = getattr(distributions, body.action_pdtype) log_probs = [] for idx, pdparam in enumerate(pdparams): if not is_multi_action: # already cloned for multi_action above pdparam = pdparam.clone() # clone for grad safety _action, action_pd = sample_action_pd(ActionPD, pdparam, body) log_probs.append(action_pd.log_prob(actions[idx].float()).sum(dim=0)) log_probs = torch.stack(log_probs) assert not torch.isnan(log_probs).any(), f'log_probs: {log_probs}, \npdparams: {pdparams} \nactions: {actions}' logger.debug(f'log_probs: {log_probs}') return log_probs
def get_session_data(session): ''' Gather data from session: MDP, Agent, Env data, hashed by aeb; then aggregate. @returns {dict, dict} session_mdp_data, session_data ''' data_names = AGENT_DATA_NAMES + ENV_DATA_NAMES mdp_data_names = ['t', 'epi'] + data_names agg_data_names = ['epi'] + list(DATA_AGG_FNS.keys()) data_h_v_dict = {data_name: session.aeb_space.get_history_v(data_name) for data_name in data_names} session_mdp_data, session_data = {}, {} for aeb in session.aeb_space.aeb_list: data_h_dict = {data_name: data_h_v[aeb] for data_name, data_h_v in data_h_v_dict.items()} # trim back to remove any incomplete sessions due to multienv termination complete_done_h = np.trim_zeros(data_h_dict['done'], 'b') # offset properly to bin separate episodes reset_bin = np.concatenate([[0.], complete_done_h[:-1]]) data_len = len(reset_bin) reset_idx = reset_bin.astype('bool') nonreset_idx = ~reset_idx data_h_dict['t'] = np.ones(reset_idx.shape) data_h_dict['epi'] = reset_idx.astype(int).cumsum() mdp_df = pd.DataFrame({ data_name: data_h_dict[data_name][:data_len] for data_name in mdp_data_names}) mdp_df = mdp_df.reindex(mdp_data_names, axis=1) aeb_df = mdp_df[agg_data_names].groupby('epi').agg(DATA_AGG_FNS) aeb_df.reset_index(drop=False, inplace=True) session_mdp_data[aeb], session_data[aeb] = mdp_df, aeb_df logger.debug(f'{session_data}') data_size_in_bytes = util.memory_size(session_mdp_data) logger.debug(f'Size of session data: {data_size_in_bytes} MB') if data_size_in_bytes > 25: logger.warning('Session data > 25 MB') return session_mdp_data, session_data
def update_lr(self): assert 'lr' in self.optim_param old_lr = self.optim_param['lr'] self.optim_param['lr'] = old_lr * 0.9 logger.debug( f'Learning rate decayed from {old_lr} to {self.optim_param["lr"]}') self.optim = net_util.get_optim_multinet(self.params, self.optim_param)
def training_step(self, xs=None, ys=None, loss=None, retain_graph=False, lr_clock=None): ''' Takes a single training step: one forward and one backwards pass. Both x and y are lists of the same length, one x and y per environment ''' self.lr_scheduler.step(epoch=ps.get(lr_clock, 'total_t')) self.train() self.optim.zero_grad() if loss is None: outs = self(xs) total_loss = torch.tensor(0.0, device=self.device) for out, y in zip(outs, ys): loss = self.loss_fn(out, y) total_loss += loss loss = total_loss assert not torch.isnan(loss).any(), loss if net_util.to_assert_trained(): assert_trained = net_util.gen_assert_trained(self) loss.backward(retain_graph=retain_graph) if self.clip_grad_val is not None: nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val) self.optim.step() if net_util.to_assert_trained(): assert_trained(self, loss) self.store_grad_norms() logger.debug(f'Net training_step loss: {loss}') return loss
def calc_gae_advs_v_targets(self, batch, v_preds): ''' Calculate GAE, and advs = GAE, v_targets = advs + v_preds See GAE from Schulman et al. https://arxiv.org/pdf/1506.02438.pdf ''' next_states = batch['next_states'][-1] if not self.body.env.is_venv: next_states = next_states.unsqueeze(dim=0) with torch.no_grad(): next_v_pred = self.calc_v(next_states, use_cache=False) v_preds = v_preds.detach() # adv does not accumulate grad if self.body.env.is_venv: v_preds = math_util.venv_pack(v_preds, self.body.env.num_envs) next_v_pred = next_v_pred.unsqueeze(dim=0) v_preds_all = torch.cat((v_preds, next_v_pred), dim=0) advs = math_util.calc_gaes(batch['rewards'], batch['dones'], v_preds_all, self.gamma, self.lam) v_targets = advs + v_preds advs = math_util.standardize( advs) # standardize only for advs, not v_targets if self.body.env.is_venv: advs = math_util.venv_unpack(advs) v_targets = math_util.venv_unpack(v_targets) logger.debug(f'advs: {advs}\nv_targets: {v_targets}') return advs, v_targets
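# Hedged sketch of the GAE recursion consumed above (Schulman et al., https://arxiv.org/abs/1506.02438),
# written against plain torch tensors. The function name and shapes are illustrative assumptions;
# this is not the math_util.calc_gaes implementation, just the delta/decay recursion it is assumed to perform.
import torch

def gae_sketch(rewards, dones, v_preds_all, gamma, lam):
    '''rewards, dones: shape (T,); v_preds_all: shape (T+1,), including the bootstrap next-state value.'''
    T = rewards.shape[0]
    advs = torch.zeros_like(rewards)
    not_dones = 1 - dones
    future_adv = torch.tensor(0.0)
    for t in reversed(range(T)):
        # TD residual, then exponentially weighted accumulation with decay gamma * lam
        delta = rewards[t] + gamma * v_preds_all[t + 1] * not_dones[t] - v_preds_all[t]
        future_adv = delta + gamma * lam * not_dones[t] * future_adv
        advs[t] = future_adv
    return advs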
def train(self): ''' Completes one training step for the agent if it is time to train. i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency. Each training step consists of sampling n batches from the agent's memory. For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times Otherwise this function does nothing. ''' if util.in_eval_lab_modes(): return np.nan clock = self.body.env.clock if self.to_train == 1: total_loss = torch.tensor(0.0) for _ in range(self.training_iter): batch = self.sample() clock.set_batch_size(len(batch)) for _ in range(self.training_batch_iter): loss = self.calc_q_loss(batch) self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net) total_loss += loss loss = total_loss / (self.training_iter * self.training_batch_iter) # reset self.to_train = 0 logger.debug( f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}' ) return loss.item() else: return np.nan
def train_shared(self): ''' Trains the network when the actor and critic share parameters ''' if self.to_train == 1: # update old net net_util.copy(self.net, self.old_net) batch = self.sample() total_loss = torch.tensor(0.0, device=self.net.device) for _ in range(self.training_epoch): with torch.no_grad(): advs, v_targets = self.calc_advs_v_targets(batch) policy_loss = self.calc_policy_loss(batch, advs) # from actor val_loss = self.calc_val_loss(batch, v_targets) # from critic loss = policy_loss + val_loss # retain for entropies etc. self.net.training_step(loss=loss, retain_graph=True, global_net=self.global_nets.get('net')) total_loss += loss loss = total_loss / self.training_epoch # reset self.to_train = 0 self.body.entropies = [] self.body.log_probs = [] logger.debug(f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}') return loss.item() else: return np.nan
def calc_q_loss(self, batch): '''Compute the Q value loss using predicted and target Q values from the appropriate networks''' states = batch['states'] next_states = batch['next_states'] q_preds = self.net(states) with torch.no_grad(): # Use online_net to select actions in next state online_next_q_preds = self.online_net(next_states) # Use eval_net to calculate next_q_preds for actions chosen by online_net next_q_preds = self.eval_net(next_states) act_q_preds = q_preds.gather( -1, batch['actions'].long().unsqueeze(-1)).squeeze(-1) online_actions = online_next_q_preds.argmax(dim=-1, keepdim=True) max_next_q_preds = next_q_preds.gather(-1, online_actions).squeeze(-1) max_q_targets = batch['rewards'] + self.gamma * ( 1 - batch['dones']) * max_next_q_preds logger.debug( f'act_q_preds: {act_q_preds}\nmax_q_targets: {max_q_targets}') q_loss = self.net.loss_fn(act_q_preds, max_q_targets) # TODO use the same loss_fn but do not reduce yet if 'Prioritized' in util.get_class_name(self.body.memory): # PER errors = (max_q_targets - act_q_preds.detach()).abs().cpu().numpy() self.body.memory.update_priorities(errors) return q_loss
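# Minimal sketch of the double-DQN target construction used in calc_q_loss, on plain tensors:
# the online net picks the argmax next action, the eval/target net scores it. Names and the
# gamma default are illustrative, not the library's API.
import torch

def double_dqn_targets_sketch(rewards, dones, online_next_q, eval_next_q, gamma=0.99):
    '''rewards, dones: (batch,); online_next_q, eval_next_q: (batch, num_actions).'''
    online_actions = online_next_q.argmax(dim=-1, keepdim=True)      # action selection
    max_next_q = eval_next_q.gather(-1, online_actions).squeeze(-1)  # action evaluation
    return rewards + gamma * (1 - dones) * max_next_q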
def space_train(self): ''' Completes one training step for the agent if it is time to train. i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency. Each training step consists of sampling n batches from the agent's memory. For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times Otherwise this function does nothing. ''' if util.in_eval_lab_modes(): self.body.flush() return np.nan clock = self.body.env.clock # main clock tick = util.s_get(self, 'aeb_space.clock').get(clock.max_tick_unit) self.to_train = (tick > self.training_start_step and tick % self.training_frequency == 0) if self.to_train == 1: total_loss = torch.tensor(0.0, device=self.net.device) for _ in range(self.training_epoch): batch = self.space_sample() for _ in range(self.training_batch_epoch): loss = self.calc_q_loss(batch) self.net.training_step(loss=loss, lr_clock=clock) total_loss += loss loss = total_loss / (self.training_epoch * self.training_batch_epoch) # reset self.to_train = 0 for body in self.agent.nanflat_body_a: body.flush() logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}') return loss.item() else: return np.nan
def init_nets(self): '''Initialize nets with multi-task dimensions, and set net params''' # NOTE: Separate init from MultitaskDQN despite similarities so that this implementation can support arbitrary sized state and action heads (e.g. multiple layers) net_spec = self.agent.spec['net'] if len(net_spec['hid_layers']) > 0: state_head_out_d = int(net_spec['hid_layers'][0] / 4) else: state_head_out_d = 16 self.state_dims = [ [body.state_dim, state_head_out_d] for body in self.agent.nanflat_body_a] self.action_dims = [ [body.action_dim] for body in self.agent.nanflat_body_a] self.total_state_dim = sum([s[0] for s in self.state_dims]) self.total_action_dim = sum([a[0] for a in self.action_dims]) logger.debug( f'State dims: {self.state_dims}, total: {self.total_state_dim}') logger.debug( f'Action dims: {self.action_dims}, total: {self.total_action_dim}') net_kwargs = util.compact_dict(dict( hid_layers_activation=_.get(net_spec, 'hid_layers_activation'), optim_param=_.get(net_spec, 'optim'), loss_param=_.get(net_spec, 'loss'), clamp_grad=_.get(net_spec, 'clamp_grad'), clamp_grad_val=_.get(net_spec, 'clamp_grad_val'), )) self.net = getattr(net, net_spec['type'])( self.state_dims, net_spec['hid_layers'], self.action_dims, **net_kwargs) self.target_net = getattr(net, net_spec['type'])( self.state_dims, net_spec['hid_layers'], self.action_dims, **net_kwargs) self.online_net = self.target_net self.eval_net = self.target_net util.set_attr(self, _.pick(net_spec, [ 'batch_size', 'update_type', 'update_frequency', 'polyak_weight', ]))
def test_logger(test_multiline_str): logger.critical(test_multiline_str) logger.debug(test_multiline_str) logger.error(test_multiline_str) logger.exception(test_multiline_str) logger.info(test_multiline_str) logger.warning(test_multiline_str)
def train(self): ''' Completes one training step for the agent if it is time to train. i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency. Each training step consists of sampling n batches from the agent's memory. For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times Otherwise this function does nothing. ''' total_t = util.s_get(self, 'aeb_space.clock').get('total_t') self.to_train = (total_t > self.training_min_timestep and total_t % self.training_frequency == 0) if self.to_train == 1: total_loss = torch.tensor(0.0) for _ in range(self.training_epoch): batch = self.sample() for _ in range(self.training_batch_epoch): with torch.no_grad(): q_targets = self.calc_q_targets(batch) loss = self.net.training_step(batch['states'], q_targets) total_loss += loss loss = total_loss / (self.training_epoch * self.training_batch_epoch) # reset self.to_train = 0 self.body.log_probs = [] self.body.entropies = [] logger.debug(f'Loss: {loss}') self.last_loss = loss.item() return self.last_loss
def get_session_data(session): '''Gather data from session: MDP, Agent, Env data, and form session_data.''' aeb_space = session.aeb_space data_names = AGENT_DATA_NAMES + ENV_DATA_NAMES agg_data_names = ['epi'] + list(DATA_AGG_FNS.keys()) data_h_v_dict = { data_name: aeb_space.get_history_v(data_name) for data_name in data_names } session_df_data = {} session_data = {} for aeb in aeb_space.aeb_list: data_h_dict = { data_name: data_h_v[aeb] for data_name, data_h_v in data_h_v_dict.items() } reset_idx = np.isnan(data_h_dict['done']) nonreset_idx = ~reset_idx epi_h = reset_idx.astype(int).cumsum() t_h = np.ones(reset_idx.shape) data_h_dict['epi'] = epi_h data_h_dict['t'] = t_h df = pd.DataFrame({ data_name: data_h_dict[data_name][nonreset_idx] for data_name in ['epi', 't'] + data_names }) aeb_df = df[agg_data_names].groupby('epi').agg(DATA_AGG_FNS) aeb_df.reset_index(drop=False, inplace=True) # TODO save full data to db session_df_data[aeb] = df session_data[aeb] = aeb_df logger.debug(f'{session_data}') return session_data
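# Toy illustration of the per-episode aggregation pattern get_session_data relies on: group the
# flat per-timestep frame by 'epi' and apply per-column aggregators. The inline agg dict stands
# in for DATA_AGG_FNS; the column names and values here are made up for the example.
import pandas as pd

df = pd.DataFrame({
    'epi': [0, 0, 1, 1, 1],
    't': [1, 2, 1, 2, 3],
    'reward': [1.0, 2.0, 0.0, 1.0, 3.0],
})
aeb_df = df.groupby('epi').agg({'t': 'count', 'reward': 'sum'})
aeb_df.reset_index(drop=False, inplace=True)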
def normalize_states_and_next_states(body, batch, episodic_flag=None): ''' Convenience function for normalizing the states and next states in a batch of data ''' logger.debug(f'states: {batch["states"]}') logger.debug(f'next states: {batch["next_states"]}') episodic = episodic_flag if episodic_flag is not None else body.memory.is_episodic logger.debug( f'Episodic: {episodic}, episodic_flag: {episodic_flag}, body.memory: {body.memory.is_episodic}' ) if episodic: normalized = [] for epi in batch['states']: normalized.append(normalize_state(body, epi)) batch['states'] = normalized normalized = [] for epi in batch['next_states']: normalized.append(normalize_state(body, epi)) batch['next_states'] = normalized else: batch['states'] = normalize_state(body, batch['states']) batch['next_states'] = normalize_state(body, batch['next_states']) logger.debug(f'normalized states: {batch["states"]}') logger.debug(f'normalized next states: {batch["next_states"]}') return batch
def calc_gae_advs_v_targets(self, batch): ''' Calculate the GAE advantages and value targets for training actor and critic respectively adv_targets = GAE (see math_util method) v_targets = adv_targets + v_preds before output, adv_targets is standardized (so v_targets used the unstandardized version) Used for training with GAE ''' v_preds = self.calc_v(batch['states']) # calc next_state boundary value and concat with above for efficiency next_v_pred_tail = self.calc_v(batch['next_states'][-1:]) next_v_preds = torch.cat([v_preds[1:], next_v_pred_tail], dim=0) # ensure val for next_state is 0 at done next_v_preds = next_v_preds * (1 - batch['dones']) # v_targets = gae_targets + v_preds adv_targets = math_util.calc_gaes(batch['rewards'], v_preds, next_v_preds, self.gamma, self.lam) v_targets = adv_targets + v_preds if torch.cuda.is_available() and self.net.gpu: adv_targets = adv_targets.cuda() v_targets = v_targets.cuda() # standardization trick # guard nan std by setting to 0 and add small const adv_std = adv_targets.std() adv_std[adv_std != adv_std] = 0 adv_std += 1e-08 adv_targets = (adv_targets - adv_targets.mean()) / adv_std logger.debug(f'adv_targets: {adv_targets}\nv_targets: {v_targets}') return adv_targets, v_targets
def train_separate(self): ''' Trains the network when the actor and critic are separate networks ''' clock = self.body.env.clock if self.to_train == 1: # onpolicy update super_loss = super(SIL, self).train_separate() # offpolicy sil update with random minibatch total_sil_loss = torch.tensor(0.0, device=self.net.device) for _ in range(self.training_epoch): batch = self.replay_sample() for _ in range(self.training_batch_epoch): sil_policy_loss, sil_val_loss = self.calc_sil_policy_val_loss( batch) self.net.training_step(loss=sil_policy_loss, lr_clock=clock, retain_graph=True) self.critic.training_step(loss=sil_val_loss, lr_clock=clock) total_sil_loss += sil_policy_loss + sil_val_loss sil_loss = total_sil_loss / self.training_epoch loss = super_loss + sil_loss logger.debug( f'Trained {self.name} at epi: {clock.get("epi")}, total_t: {clock.get("total_t")}, t: {clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}' ) return loss.item() else: return np.nan
def training_step(self, x=None, y=None, loss=None, retain_graph=False, lr_clock=None): ''' Takes a single training step: one forward and one backwards pass. For most RL usage we have custom, often complicated, loss functions; compute the loss value yourself, put it in a pytorch tensor, and pass it in as loss ''' if hasattr(self, 'model_tails') and x is not None: raise ValueError( 'Loss computation from x,y not supported for multitails') self.lr_scheduler.step(epoch=ps.get(lr_clock, 'total_t')) self.train() self.optim.zero_grad() if loss is None: out = self(x) loss = self.loss_fn(out, y) assert not torch.isnan(loss).any(), loss if net_util.to_assert_trained(): assert_trained = net_util.gen_assert_trained(self) loss.backward(retain_graph=retain_graph) if self.clip_grad_val is not None: nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val) self.optim.step() if net_util.to_assert_trained(): assert_trained(self, loss) self.store_grad_norms() logger.debug(f'Net training_step loss: {loss}') return loss
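# Stand-alone sketch of the train-step contract these training_step methods follow (assumed toy
# module, optimizer, and clip value; not the library's Net class): zero grads, forward, loss,
# backward, optional gradient clipping, optimizer step.
import torch
import torch.nn as nn

net = nn.Linear(4, 2)
optim = torch.optim.Adam(net.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()
x, y = torch.randn(8, 4), torch.randn(8, 2)

optim.zero_grad()
loss = loss_fn(net(x), y)
loss.backward()
nn.utils.clip_grad_norm_(net.parameters(), max_norm=0.5)  # analogue of clip_grad_val
optim.step()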
def train(self): ''' Completes one training step for the agent if it is time to train. Otherwise this function does nothing. ''' if util.in_eval_lab_modes(): return np.nan clock = self.body.env.clock if self.to_train == 1: batch = self.sample() clock.set_batch_size(len(batch)) loss = self.calc_q_loss(batch) self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net) # reset self.to_train = 0 logger.debug( f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}' ) return loss.item() else: return np.nan
def train(self): '''Train actor critic by computing the loss in batch efficiently''' if util.in_eval_lab_modes(): return np.nan clock = self.body.env.clock if self.to_train == 1: batch = self.sample() clock.set_batch_size(len(batch)) pdparams, v_preds = self.calc_pdparam_v(batch) advs, v_targets = self.calc_advs_v_targets(batch, v_preds) policy_loss = self.calc_policy_loss(batch, pdparams, advs) # from actor val_loss = self.calc_val_loss(v_preds, v_targets) # from critic if self.shared: # shared network loss = policy_loss + val_loss self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net) else: self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net) self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net) loss = policy_loss + val_loss # reset self.to_train = 0 logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}') return loss.item() else: return np.nan
def calc_q_targets(self, batch): '''Compute the target Q values for multitask network by iterating through the slices corresponding to bodies, and computing the singleton function''' q_preds = self.net.wrap_eval(batch['states']) # Use online_net to select actions in next state online_next_q_preds = self.online_net.wrap_eval(batch['next_states']) next_q_preds = self.eval_net.wrap_eval(batch['next_states']) start_idx = 0 multi_q_targets = [] # iterate over body, use slice with proper idx offset for b, body_batch in enumerate(batch['body_batches']): body = self.agent.nanflat_body_a[b] end_idx = start_idx + body.action_dim _, action_idxs = torch.max(online_next_q_preds[:, start_idx:end_idx], dim=1) # Offset action index properly action_idxs += start_idx batch_size = len(body_batch['dones']) max_next_q_preds = next_q_preds[range(batch_size), action_idxs] max_q_targets = body_batch['rewards'] + self.gamma * ( 1 - body_batch['dones']) * max_next_q_preds max_q_targets.unsqueeze_(1) q_targets = (max_q_targets * body_batch['actions']) + ( q_preds[:, start_idx:end_idx] * (1 - body_batch['actions'])) multi_q_targets.append(q_targets) start_idx = end_idx q_targets = torch.cat(multi_q_targets, dim=1) logger.debug(f'q_targets: {q_targets}') return q_targets
def training_step(self, xs=None, ys=None, loss=None, retain_graph=False): ''' Takes a single training step: one forward and one backwards pass. Both x and y are lists of the same length, one x and y per environment ''' self.train() self.zero_grad() self.optim.zero_grad() if loss is None: outs = self(xs) total_loss = torch.tensor(0.0) for out, y in zip(outs, ys): loss = self.loss_fn(out, y) total_loss += loss.cpu() assert not torch.isnan(total_loss).any() if net_util.to_assert_trained(): assert_trained = net_util.gen_assert_trained(self.model_body) total_loss.backward(retain_graph=retain_graph) if self.clip_grad: logger.debug(f'Clipping gradient: {self.clip_grad_val}') torch.nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val) self.optim.step() if net_util.to_assert_trained(): assert_trained(self.model_body) return total_loss
def train_separate(self): ''' Trains the network when the actor and critic are separate networks ''' if self.to_train == 1: batch = self.sample() total_loss = torch.tensor(0.0) for _ in range(self.training_epoch): loss = self.calc_loss(batch) # to reuse loss for critic loss.backward = partial(loss.backward, retain_graph=True) self.net.training_step(loss=loss) # critic.optim.step using the same loss loss.backward = partial(loss.backward, retain_graph=False) self.critic.training_step(loss=loss) total_loss += loss loss = total_loss.mean() net_util.copy(self.net, self.old_net) net_util.copy(self.critic, self.old_critic) # reset self.to_train = 0 self.body.log_probs = [] self.body.entropies = [] logger.debug(f'Loss: {loss:.2f}') self.last_loss = loss.item() return self.last_loss
def training_step(self, x=None, y=None, loss=None, retain_graph=False, global_net=None): '''Takes a single training step: one forward and one backwards pass''' self.train() self.zero_grad() self.optim.zero_grad() if loss is None: out = self(x) loss = self.loss_fn(out, y) assert not torch.isnan(loss).any(), loss if net_util.to_assert_trained(): assert_trained = net_util.gen_assert_trained(self.rnn_model) loss.backward(retain_graph=retain_graph) if self.clip_grad: logger.debug(f'Clipping gradient: {self.clip_grad_val}') torch.nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val) if global_net is None: self.optim.step() else: # distributed training with global net net_util.push_global_grad(self, global_net) self.optim.step() net_util.pull_global_param(self, global_net) if net_util.to_assert_trained(): assert_trained(self.rnn_model, loss) logger.debug(f'Net training_step loss: {loss}') return loss
def training_step(self, xs=None, ys=None, loss=None, retain_graph=False, global_net=None): ''' Takes a single training step: one forward and one backwards pass. Both x and y are lists of the same length, one x and y per environment ''' self.train() self.zero_grad() self.optim.zero_grad() if loss is None: outs = self(xs) total_loss = torch.tensor(0.0, device=self.device) for out, y in zip(outs, ys): loss = self.loss_fn(out, y) total_loss += loss loss = total_loss assert not torch.isnan(loss).any(), loss if net_util.to_assert_trained(): assert_trained = net_util.gen_assert_trained(self.model_body) loss.backward(retain_graph=retain_graph) if self.clip_grad: logger.debug(f'Clipping gradient: {self.clip_grad_val}') torch.nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val) if global_net is None: self.optim.step() else: # distributed training with global net net_util.push_global_grad(self, global_net) self.optim.step() net_util.pull_global_param(self, global_net) if net_util.to_assert_trained(): assert_trained(self.model_body, loss) logger.debug(f'Net training_step loss: {loss}') return loss
def train_shared(self): ''' Trains the network when the actor and critic share parameters ''' clock = self.body.env.clock if self.to_train == 1: # update old net torch.cuda.empty_cache() net_util.copy(self.net, self.old_net) batch = self.sample() total_loss = torch.tensor(0.0, device=self.net.device) for _ in range(self.training_epoch): with torch.no_grad(): advs, v_targets = self.calc_advs_v_targets(batch) policy_loss = self.calc_policy_loss(batch, advs) # from actor val_loss = self.calc_val_loss(batch, v_targets) # from critic loss = policy_loss + val_loss # retain for entropies etc. self.net.training_step(loss=loss, lr_clock=clock, retain_graph=True) total_loss += loss loss = total_loss / self.training_epoch # reset self.to_train = 0 self.body.flush() logger.debug( f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}' ) return loss.item() else: return np.nan
def compute_q_target_values(self, batch): '''Computes the target Q values for a batch of experiences. Note that the net references may differ based on the algorithm.''' q_sts = self.net.wrap_eval(batch['states']) # Use act_select network to select actions in next state q_next_st_acts = self.online_net.wrap_eval(batch['next_states']) _val, q_next_acts = torch.max(q_next_st_acts, dim=1) logger.debug(f'Q next action: {q_next_acts.size()}') # Select q_next_st_maxs based on action selected in q_next_acts # Evaluate the action selection using the eval net q_next_sts = self.eval_net.wrap_eval(batch['next_states']) logger.debug(f'Q next_states: {q_next_sts.size()}') idx = torch.from_numpy(np.array(list(range(self.batch_size)))) q_next_st_maxs = q_next_sts[idx, q_next_acts] q_next_st_maxs.unsqueeze_(1) logger.debug(f'Q next_states max {q_next_st_maxs.size()}') # Compute final q_target using reward and estimated best Q value from the next state if there is one # Make future reward 0 if the current state is done q_targets_max = batch['rewards'].data + self.gamma * \ torch.mul((1 - batch['dones'].data), q_next_st_maxs) logger.debug(f'Q targets max: {q_targets_max.size()}') # We only want to train the network for the action selected # For all other actions we set the q_target = q_sts # So that the loss for these actions is 0 q_targets = torch.mul(q_targets_max, batch['actions'].data) + \ torch.mul(q_sts, (1 - batch['actions'].data)) logger.debug(f'Q targets: {q_targets.size()}') return q_targets
def train_shared(self): ''' Trains the network when the actor and critic share parameters ''' if self.to_train == 1: batch = self.sample() total_loss = torch.tensor(0.0) for _ in range(self.training_epoch): with torch.no_grad(): advs, v_targets = self.calc_advs_v_targets(batch) policy_loss = self.calc_policy_loss(batch, advs) # from actor val_loss = self.calc_val_loss(batch, v_targets) # from critic loss = policy_loss + val_loss # retain for entropies etc. self.net.training_step(loss=loss, retain_graph=True) total_loss += loss.cpu() loss = total_loss / self.training_epoch net_util.copy(self.net, self.old_net) # reset self.to_train = 0 self.body.log_probs = [] self.body.entropies = [] logger.debug(f'Loss: {loss:.2f}') self.last_loss = loss.item() return self.last_loss
def get_session_data(session): ''' Gather data from session: MDP, Agent, Env data, hashed by aeb; then aggregate. @returns {dict, dict} session_mdp_data, session_data ''' data_names = AGENT_DATA_NAMES + ENV_DATA_NAMES mdp_data_names = ['t', 'epi'] + data_names agg_data_names = ['epi'] + list(DATA_AGG_FNS.keys()) data_h_v_dict = { data_name: session.aeb_space.get_history_v(data_name) for data_name in data_names } session_mdp_data, session_data = {}, {} for aeb in session.aeb_space.aeb_list: data_h_dict = { data_name: data_h_v[aeb] for data_name, data_h_v in data_h_v_dict.items() } # remove any incomplete session timesteps from tail (due to multienv termination) complete_done_h = np.trim_zeros(data_h_dict['done'], 'b') data_len = len(complete_done_h) reset_idx = np.isnan(complete_done_h) nonreset_idx = ~reset_idx data_h_dict['t'] = np.ones(reset_idx.shape) data_h_dict['epi'] = reset_idx.astype(int).cumsum() mdp_df = pd.DataFrame({ data_name: data_h_dict[data_name][:data_len][nonreset_idx] for data_name in mdp_data_names }) mdp_df = mdp_df.reindex(mdp_data_names, axis=1) aeb_df = mdp_df[agg_data_names].groupby('epi').agg(DATA_AGG_FNS) aeb_df.reset_index(drop=False, inplace=True) session_mdp_data[aeb], session_data[aeb] = mdp_df, aeb_df logger.debug(f'{session_data}') return session_mdp_data, session_data
def update_online_stats(body, state): ''' Method to calculate the running mean and standard deviation of the state space. See https://www.johndcook.com/blog/standard_deviation/ for more details for n >= 1 M_n = M_n-1 + (state - M_n-1) / n S_n = S_n-1 + (state - M_n-1) * (state - M_n) variance = S_n / (n - 1) std_dev = sqrt(variance) ''' logger.debug(f'mean: {body.state_mean}, std: {body.state_std_dev}, num examples: {body.state_n}') # Assumes only one state is given if ("Atari" in body.memory.__class__.__name__): assert state.ndim == 3 elif getattr(body.memory, 'raw_state_dim', False): assert state.size == body.memory.raw_state_dim else: assert state.size == body.state_dim or state.shape == body.state_dim mean = body.state_mean body.state_n += 1 if np.isnan(mean).any(): assert np.isnan(body.state_std_dev_int) assert np.isnan(body.state_std_dev) body.state_mean = state body.state_std_dev_int = 0 body.state_std_dev = 0 else: assert body.state_n > 1 body.state_mean = mean + (state - mean) / body.state_n body.state_std_dev_int = body.state_std_dev_int + (state - mean) * (state - body.state_mean) body.state_std_dev = np.sqrt(body.state_std_dev_int / (body.state_n - 1)) # Guard against very small std devs if (body.state_std_dev < 1e-8).any(): body.state_std_dev[np.where(body.state_std_dev < 1e-8)] += 1e-8 logger.debug(f'new mean: {body.state_mean}, new std: {body.state_std_dev}, num examples: {body.state_n}')
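# Stand-alone numpy version of the Welford update described in the docstring above (illustrative;
# not the body-attribute bookkeeping used by update_online_stats):
# M_n = M_{n-1} + (x - M_{n-1}) / n, S_n = S_{n-1} + (x - M_{n-1}) * (x - M_n), std = sqrt(S_n / (n - 1)).
import numpy as np

def welford_update_sketch(mean, s_int, n, x):
    n += 1
    new_mean = mean + (x - mean) / n
    new_s_int = s_int + (x - mean) * (x - new_mean)
    std = np.sqrt(new_s_int / (n - 1)) if n > 1 else np.zeros_like(new_mean)
    return new_mean, new_s_int, n, std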
def reset(self, state_space): logger.debug('AgentSpace.reset') _action_v, _loss_v, _explore_var_v = self.aeb_space.init_data_v(AGENT_DATA_NAMES) for agent in self.agents: state_a = state_space.get(a=agent.a) agent.reset(state_a) _action_space, _loss_space, _explore_var_space = self.aeb_space.add(AGENT_DATA_NAMES, [_action_v, _loss_v, _explore_var_v]) return _action_space
def reset(self): logger.debug('EnvSpace.reset') _reward_v, state_v, done_v = self.aeb_space.init_data_v(ENV_DATA_NAMES) for env in self.envs: _reward_e, state_e, done_e = env.reset() state_v[env.e, 0:len(state_e)] = state_e done_v[env.e, 0:len(done_e)] = done_e _reward_space, state_space, done_space = self.aeb_space.add(ENV_DATA_NAMES, [_reward_v, state_v, done_v]) logger.debug(f'\nstate_space: {state_space}') return _reward_space, state_space, done_space
def calc_val_loss(self, batch, v_targets): '''Calculate the critic's value loss''' v_targets = v_targets.unsqueeze(dim=-1) v_preds = self.calc_v(batch['states'], evaluate=False).unsqueeze_(dim=-1) assert v_preds.shape == v_targets.shape val_loss = self.val_loss_coef * self.net.loss_fn(v_preds, v_targets) if torch.cuda.is_available() and self.net.gpu: val_loss = val_loss.cuda() logger.debug(f'Critic value loss: {val_loss:.2f}') return val_loss
def act(self, state_space): data_names = ['action'] action_v, = self.aeb_space.init_data_v(data_names) for agent in self.agents: a = agent.a state_a = state_space.get(a=a) action_a = agent.act(state_a) action_v[a, 0:len(action_a)] = action_a action_space, = self.aeb_space.add(data_names, [action_v]) logger.debug(f'\naction_space: {action_space}') return action_space
def step(self, action_space): reward_v, state_v, done_v = self.aeb_space.init_data_v(ENV_DATA_NAMES) for env in self.envs: e = env.e action_e = action_space.get(e=e) reward_e, state_e, done_e = env.step(action_e) reward_v[e, 0:len(reward_e)] = reward_e state_v[e, 0:len(state_e)] = state_e done_v[e, 0:len(done_e)] = done_e reward_space, state_space, done_space = self.aeb_space.add(ENV_DATA_NAMES, [reward_v, state_v, done_v]) logger.debug(f'\nreward_space: {reward_space}\nstate_space: {state_space}\ndone_space: {done_space}') return reward_space, state_space, done_space
def calc_policy_loss(self, batch, advs): '''Calculate the actor's policy loss''' assert len(self.body.log_probs) == len(advs), f'{len(self.body.log_probs)} vs {len(advs)}' log_probs = torch.stack(self.body.log_probs) policy_loss = - self.policy_loss_coef * log_probs * advs if self.add_entropy: entropies = torch.stack(self.body.entropies) policy_loss += (-self.entropy_coef * entropies) policy_loss = torch.mean(policy_loss) if torch.cuda.is_available() and self.net.gpu: policy_loss = policy_loss.cuda() logger.debug(f'Actor policy loss: {policy_loss:.2f}') return policy_loss
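# Plain-tensor sketch of the entropy-regularized policy-gradient loss computed above: the mean of
# -coef * log_prob * adv minus an entropy bonus. The coefficient defaults are illustrative, not the spec's.
import torch

def policy_loss_sketch(log_probs, advs, entropies, policy_loss_coef=1.0, entropy_coef=0.01):
    loss = -policy_loss_coef * log_probs * advs - entropy_coef * entropies
    return loss.mean()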
def train(self): if util.get_lab_mode() == 'enjoy': return np.nan if self.to_train == 1: batch = self.sample() loss = self.calc_policy_loss(batch) self.net.training_step(loss=loss) # reset self.to_train = 0 self.body.log_probs = [] self.body.entropies = [] logger.debug(f'Policy loss: {loss}') self.last_loss = loss.item() return self.last_loss
def update_nets(self): total_t = util.s_get(self, 'aeb_space.clock').get('total_t') if self.net.update_type == 'replace': if total_t % self.net.update_frequency == 0: logger.debug('Updating target_net by replacing') self.target_net.load_state_dict(self.net.state_dict()) self.online_net = self.target_net self.eval_net = self.target_net elif self.net.update_type == 'polyak': logger.debug('Updating target_net by polyak averaging') net_util.polyak_update(self.net, self.target_net, self.net.polyak_coef) self.online_net = self.target_net self.eval_net = self.target_net else: raise ValueError('Unknown net.update_type. Should be "replace" or "polyak". Exiting.')
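# Illustrative soft (polyak) target update assumed to match the spirit of net_util.polyak_update:
# blend the target parameters toward the online net with coefficient beta (name and blending
# convention are assumptions, not copied from the library).
import torch

def polyak_update_sketch(src_net, tgt_net, beta=0.995):
    with torch.no_grad():
        for src_param, tgt_param in zip(src_net.parameters(), tgt_net.parameters()):
            tgt_param.copy_(beta * tgt_param + (1.0 - beta) * src_param)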
def update(self, action_space, reward_space, state_space, done_space): data_names = ['loss', 'explore_var'] loss_v, explore_var_v = self.aeb_space.init_data_v(data_names) for agent in self.agents: a = agent.a action_a = action_space.get(a=a) reward_a = reward_space.get(a=a) state_a = state_space.get(a=a) done_a = done_space.get(a=a) loss_a, explore_var_a = agent.update(action_a, reward_a, state_a, done_a) loss_v[a, 0:len(loss_a)] = loss_a explore_var_v[a, 0:len(explore_var_a)] = explore_var_a loss_space, explore_var_space = self.aeb_space.add(data_names, [loss_v, explore_var_v]) logger.debug(f'\nloss_space: {loss_space}\nexplore_var_space: {explore_var_space}') return loss_space, explore_var_space
def fn_decay_lr(net, fn): ''' Decay learning rate for the net module; this only computes and returns the new lr, for the caller to set on the appropriate nets. In the future, might add more flexible lr adjustment, like boosting and decaying on need. ''' space_clock = util.s_get(net.algorithm, 'aeb_space.clock') total_t = space_clock.get('total_t') start_val, end_val = net.optim_spec['lr'], 1e-6 anneal_total_t = net.lr_anneal_timestep or max(10e6, 60 * net.lr_decay_frequency) if total_t >= net.lr_decay_min_timestep and total_t % net.lr_decay_frequency == 0: logger.debug(f'anneal_total_t: {anneal_total_t}, total_t: {total_t}') new_lr = fn(start_val, end_val, anneal_total_t, total_t) return new_lr else: return no_decay(net)
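# Example of a decay fn with the signature fn_decay_lr passes through (start_val, end_val,
# anneal_total_t, total_t); the interface is inferred from the call above, and this body is
# just a plain linear anneal for illustration.
def linear_decay_sketch(start_val, end_val, anneal_total_t, total_t):
    frac = min(total_t / anneal_total_t, 1.0)
    return start_val + frac * (end_val - start_val)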
def train_separate(self): ''' Trains the network when the actor and critic are separate networks loss = val_loss + abs(policy_loss) ''' if self.to_train == 1: batch = self.sample() policy_loss = self.train_actor(batch) val_loss = self.train_critic(batch) loss = val_loss + abs(policy_loss) # reset self.to_train = 0 self.body.entropies = [] self.body.log_probs = [] logger.debug(f'Total loss: {loss:.2f}') self.last_loss = loss.item() return self.last_loss
def train_separate(self): ''' Trains the network when the actor and critic are separate networks ''' if self.to_train == 1: batch = self.sample() policy_loss = self.train_actor(batch) val_loss = self.train_critic(batch) loss = val_loss + abs(policy_loss) net_util.copy(self.net, self.old_net) net_util.copy(self.critic, self.old_critic) # reset self.to_train = 0 self.body.log_probs = [] self.body.entropies = [] logger.debug(f'Loss: {loss:.2f}') self.last_loss = loss.item() return self.last_loss
def train(self): ''' Completes one training step for the agent if it is time to train. Otherwise this function does nothing. ''' if util.get_lab_mode() == 'enjoy': return np.nan if self.to_train == 1: batch = self.sample() with torch.no_grad(): q_targets = self.calc_q_targets(batch) loss = self.net.training_step(batch['states'], q_targets) # reset self.to_train = 0 self.body.log_probs = [] self.body.entropies = [] logger.debug(f'Loss: {loss}') self.last_loss = loss.item() return self.last_loss
def training_step(self, x=None, y=None, loss=None, retain_graph=False): '''Takes a single training step: one forward and one backwards pass''' self.train() self.zero_grad() self.optim.zero_grad() if loss is None: out = self(x) loss = self.loss_fn(out, y) assert not torch.isnan(loss).any(), loss if net_util.to_assert_trained(): assert_trained = net_util.gen_assert_trained(self.conv_model) loss.backward(retain_graph=retain_graph) if self.clip_grad: logger.debug(f'Clipping gradient: {self.clip_grad_val}') torch.nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val) self.optim.step() if net_util.to_assert_trained(): assert_trained(self.conv_model) return loss
def train_shared(self): ''' Trains the network when the actor and critic share parameters loss = self.policy_loss_coef * policy_loss + self.val_loss_coef * val_loss ''' if self.to_train == 1: batch = self.sample() with torch.no_grad(): advs, v_targets = self.calc_advs_v_targets(batch) policy_loss = self.calc_policy_loss(batch, advs) # from actor val_loss = self.calc_val_loss(batch, v_targets) # from critic loss = policy_loss + val_loss self.net.training_step(loss=loss) # reset self.to_train = 0 self.body.log_probs = [] self.body.entropies = [] logger.debug(f'Total loss: {loss:.2f}') self.last_loss = loss.item() return self.last_loss
def calc_sil_policy_val_loss(self, batch): ''' Calculate the SIL policy and value losses for actor and critic sil_policy_loss = -log_prob * max(R - v_pred, 0) sil_val_loss = (max(R - v_pred, 0)^2) / 2 This is called on a randomly sampled batch from experience replay ''' returns = math_util.calc_returns(batch, self.gamma) v_preds = self.calc_v(batch['states'], evaluate=False) clipped_advs = torch.clamp(returns - v_preds, min=0.0) log_probs = self.calc_log_probs(batch) sil_policy_loss = self.sil_policy_loss_coef * torch.mean(- log_probs * clipped_advs) sil_val_loss = self.sil_val_loss_coef * torch.pow(clipped_advs, 2) / 2 sil_val_loss = torch.mean(sil_val_loss) if torch.cuda.is_available() and self.net.gpu: sil_policy_loss = sil_policy_loss.cuda() sil_val_loss = sil_val_loss.cuda() logger.debug(f'SIL actor policy loss: {sil_policy_loss:.2f}') logger.debug(f'SIL critic value loss: {sil_val_loss:.2f}') return sil_policy_loss, sil_val_loss
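# Plain-tensor sketch of the SIL losses given in the docstring above (loss coefficients omitted):
# sil_policy_loss = -log_prob * max(R - v_pred, 0), sil_val_loss = max(R - v_pred, 0)^2 / 2.
import torch

def sil_losses_sketch(returns, v_preds, log_probs):
    clipped_advs = torch.clamp(returns - v_preds, min=0.0)
    policy_loss = torch.mean(-log_probs * clipped_advs)
    val_loss = torch.mean(torch.pow(clipped_advs, 2) / 2)
    return policy_loss, val_loss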
def train(self): ''' Completes one training step for the agent if it is time to train. i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency. Each training step consists of sampling n batches from the agent's memory. For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times Otherwise this function does nothing. ''' if util.get_lab_mode() == 'enjoy': return np.nan total_t = util.s_get(self, 'aeb_space.clock').get('total_t') self.to_train = (total_t > self.training_min_timestep and total_t % self.training_frequency == 0) is_per = util.get_class_name(self.agent.nanflat_body_a[0].memory) == 'PrioritizedReplay' if self.to_train == 1: total_loss = torch.tensor(0.0) for _ in range(self.training_epoch): batch = self.sample() for _ in range(self.training_batch_epoch): with torch.no_grad(): q_targets = self.calc_q_targets(batch) if is_per: q_preds = self.net.wrap_eval(batch['states']) errors = torch.abs(q_targets - q_preds) errors = errors.sum(dim=1).unsqueeze_(dim=1) for body in self.agent.nanflat_body_a: body.memory.update_priorities(errors) loss = self.net.training_step(batch['states'], q_targets) total_loss += loss.cpu() loss = total_loss / (self.training_epoch * self.training_batch_epoch) # reset self.to_train = 0 self.body.log_probs = [] self.body.entropies = [] logger.debug(f'Loss: {loss}') self.last_loss = loss.item() return self.last_loss
def time_fn(*args, **kwargs): start = time.time() output = fn(*args, **kwargs) end = time.time() logger.debug(f'Timed: {fn.__name__} {round((end - start) * 1000, 4)}ms') return output
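# Assumed enclosing decorator that time_fn above would be the inner closure of; the wrapper form
# and the use of functools.wraps are inferred for illustration, not taken from the source.
import functools
import time

def timeit_sketch(fn):
    @functools.wraps(fn)
    def time_fn(*args, **kwargs):
        start = time.time()
        output = fn(*args, **kwargs)
        end = time.time()
        print(f'Timed: {fn.__name__} {round((end - start) * 1000, 4)}ms')
        return output
    return time_fn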
def check_api(*args, **kwargs): # TODO name-based data check for api methods output = fn(*args, **kwargs) logger.debug(f'API method: {fn.__name__}, output: {output}') return output
def __init__(self): logger.debug('Monitor initialized.')