def train(self):
    '''
    Completes one training step for the agent if it is time to train, i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
    Each training step consists of sampling n batches from the agent's memory.
    For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times.
    Otherwise this function does nothing.
    '''
    t = util.s_get(self, 'aeb_space.clock').get('total_t')
    if t > self.training_min_timestep and t % self.training_frequency == 0:
        logger.debug(f'Training at t: {t}')
        nanflat_loss_a = np.zeros(self.agent.body_num)
        for _b in range(self.training_epoch):
            batch_losses = np.zeros(self.agent.body_num)
            batch = self.sample()
            for _i in range(self.training_iters_per_batch):
                q_targets = self.compute_q_target_values(batch)
                y = [Variable(q) for q in q_targets]
                losses = self.net.training_step(batch['states'], y)
                logger.debug(f'losses {losses}')
                batch_losses += losses
            batch_losses /= self.training_iters_per_batch
            nanflat_loss_a += batch_losses
        nanflat_loss_a /= self.training_epoch
        loss_a = self.nanflat_to_data_a('loss', nanflat_loss_a)
        return loss_a
    else:
        logger.debug('NOT training')
        return np.nan
def space_train(self):
    '''
    Completes one training step for the agent if it is time to train, i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
    Each training step consists of sampling n batches from the agent's memory.
    For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times.
    Otherwise this function does nothing.
    '''
    if util.in_eval_lab_modes():
        self.body.flush()
        return np.nan
    clock = self.body.env.clock  # main clock
    tick = util.s_get(self, 'aeb_space.clock').get(clock.max_tick_unit)
    self.to_train = (tick > self.training_start_step and tick % self.training_frequency == 0)
    if self.to_train == 1:
        total_loss = torch.tensor(0.0, device=self.net.device)
        for _ in range(self.training_epoch):
            batch = self.space_sample()
            for _ in range(self.training_batch_epoch):
                loss = self.calc_q_loss(batch)
                self.net.training_step(loss=loss, lr_clock=clock)
                total_loss += loss
        loss = total_loss / (self.training_epoch * self.training_batch_epoch)
        # reset
        self.to_train = 0
        for body in self.agent.nanflat_body_a:
            body.flush()
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def train(self):
    '''
    Completes one training step for the agent if it is time to train, i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
    Each training step consists of sampling n batches from the agent's memory.
    For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times.
    Otherwise this function does nothing.
    '''
    total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
    self.to_train = (total_t > self.training_min_timestep and total_t % self.training_frequency == 0)
    if self.to_train == 1:
        total_loss = torch.tensor(0.0)
        for _ in range(self.training_epoch):
            batch = self.sample()
            for _ in range(self.training_batch_epoch):
                with torch.no_grad():
                    q_targets = self.calc_q_targets(batch)
                loss = self.net.training_step(batch['states'], q_targets)
                total_loss += loss
        loss = total_loss / (self.training_epoch * self.training_batch_epoch)
        # reset
        self.to_train = 0
        self.body.log_probs = []
        self.body.entropies = []
        logger.debug(f'Loss: {loss}')
        self.last_loss = loss.item()
    return self.last_loss
def update_explore_var(self):
    '''Updates the explore variables'''
    space_clock = util.s_get(self, 'aeb_space.clock')
    nanflat_explore_var_a = self.action_policy_update(self, space_clock)
    explore_var_a = self.nanflat_to_data_a('explore_var', nanflat_explore_var_a)
    return explore_var_a
def train(self):
    '''
    Completes one training step for the agent if it is time to train, i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
    Each training step consists of sampling n batches from the agent's memory.
    For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times.
    Otherwise this function does nothing.
    '''
    t = util.s_get(self, 'aeb_space.clock').get('total_t')
    if t > self.training_min_timestep and t % self.training_frequency == 0:
        logger.debug(f'Training at t: {t}')
        total_loss = 0.0
        for _b in range(self.training_epoch):
            batch = self.sample()
            batch_loss = 0.0
            for _i in range(self.training_iters_per_batch):
                q_targets = self.compute_q_target_values(batch)
                y = Variable(q_targets)
                loss = self.net.training_step(batch['states'], y)
                batch_loss += loss.data[0]
            batch_loss /= self.training_iters_per_batch
            total_loss += batch_loss
        total_loss /= self.training_epoch
        logger.debug(f'total_loss {total_loss}')
        return total_loss
    else:
        logger.debug('NOT training')
        return np.nan
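# A hedged sketch of the Bellman backup that a compute_q_target_values
# helper like the one called above typically performs for a vanilla DQN.
# The batch keys, self.gamma, self.eval_net, and the one-hot layout of
# batch['actions'] are illustrative assumptions, not the exact SLM-Lab code.
import torch

def compute_q_target_values_sketch(self, batch):
    q_sts = self.net.wrap_eval(batch['states'])  # current Q(s, .) estimates
    q_next_st = self.eval_net.wrap_eval(batch['next_states'])  # Q(s', .) from the frozen net
    q_next_max, _ = torch.max(q_next_st, dim=1, keepdim=True)
    # r + gamma * max_a' Q(s', a'), with the bootstrap zeroed at terminal states
    q_targets_max = batch['rewards'] + self.gamma * (1 - batch['dones']) * q_next_max
    # update only the Q value of the action taken; keep the current estimates
    # elsewhere so their TD error is zero
    q_targets = q_targets_max * batch['actions'] + q_sts * (1 - batch['actions'])
    return q_targets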
def update(self):
    '''Update the agent after training'''
    space_clock = util.s_get(self, 'aeb_space.clock')
    for net in [self.net]:
        net.update_lr(space_clock)
    explore_vars = [self.action_policy_update(self, body) for body in self.agent.nanflat_body_a]
    explore_var_a = self.nanflat_to_data_a('explore_var', explore_vars)
    return explore_var_a
def train(self):
    '''
    Completes one training step for the agent if it is time to train, i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
    Each training step consists of sampling n batches from the agent's memory.
    For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times.
    Otherwise this function does nothing.
    '''
    t = util.s_get(self, 'aeb_space.clock').get('total_t')
    if t > self.training_min_timestep and t % self.training_frequency == 0:
        logger.debug(f'Training at t: {t}')
        total_loss = 0.0
        total_losses = None
        for _b in range(self.training_epoch):
            batch = self.sample()
            batch_loss = 0.0
            batch_losses = None
            for _i in range(self.training_iters_per_batch):
                q_targets = self.compute_q_target_values(batch)
                y = [Variable(q) for q in q_targets]
                loss, losses = self.net.training_step(batch['states'], y)
                logger.debug(f'loss {loss}')
                logger.debug(f'losses {losses}')
                batch_loss += loss
                if batch_losses is None:
                    batch_losses = losses
                else:
                    batch_losses = [sum(x) for x in zip(batch_losses, losses)]
            batch_loss /= self.training_iters_per_batch
            batch_losses = [float(x) / self.training_iters_per_batch for x in batch_losses]
            total_loss += batch_loss
            if total_losses is None:
                total_losses = batch_losses
            else:
                total_losses = [sum(x) for x in zip(total_losses, batch_losses)]
        total_loss /= self.training_epoch
        total_losses = [float(x) / self.training_epoch for x in total_losses]
        if t % 25 == 0:
            logger.info(f'total_loss {total_loss}')
            logger.info(f'total losses {total_losses}')
        # TODO: Return other losses as well.
        return total_loss
    else:
        logger.debug('NOT training')
        return np.nan
def update_nets(self):
    res = super(DoubleDQN, self).update_nets()
    total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
    if self.net.update_type == 'replace':
        if total_t % self.net.update_frequency == 0:
            self.online_net = self.net
            self.eval_net = self.target_net
    elif self.net.update_type == 'polyak':
        self.online_net = self.net
        self.eval_net = self.target_net
def decay_learning_rate(algo, nets):
    '''
    Decay learning rate for each net by the decay method update_lr() defined in them.
    In the future, might add more flexible lr adjustment, like boosting and decaying on need.
    '''
    space_clock = util.s_get(algo, 'aeb_space.clock')
    t = space_clock.get('total_t')
    if algo.decay_lr and t > algo.decay_lr_min_timestep:
        if t % algo.decay_lr_frequency == 0:
            for net in nets:
                net.update_lr()
def update(self):
    super(DoubleDQN, self).update()
    space_clock = util.s_get(self, 'aeb_space.clock')
    t = space_clock.get('t')
    if self.update_type == 'replace':
        if t % self.update_frequency == 0:
            self.online_net = self.net
            self.eval_net = self.target_net
    elif self.update_type == 'polyak':
        self.online_net = self.net
        self.eval_net = self.target_net
    return self.explore_var
def update(self):
    space_clock = util.s_get(self, 'aeb_space.clock')
    nets = [self.net] if self.share_architecture else [self.net, self.critic]
    for net in nets:
        net.update_lr(space_clock)
    explore_vars = [self.action_policy_update(self, body) for body in self.agent.nanflat_body_a]
    explore_var_a = self.nanflat_to_data_a('explore_var', explore_vars)
    return explore_var_a
def update_nets(self):
    total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
    if self.net.update_type == 'replace':
        if total_t % self.net.update_frequency == 0:
            logger.debug('Updating target_net by replacing')
            self.target_net.load_state_dict(self.net.state_dict())
            self.online_net = self.target_net
            self.eval_net = self.target_net
    elif self.net.update_type == 'polyak':
        logger.debug('Updating net by averaging')
        net_util.polyak_update(self.net, self.target_net, self.net.polyak_coef)
        self.online_net = self.target_net
        self.eval_net = self.target_net
    else:
        raise ValueError('Unknown net.update_type. Should be "replace" or "polyak". Exiting.')
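# A hedged sketch of the net_util.polyak_update call above, assuming
# polyak_coef is the weight kept on the existing target parameters:
# theta_target <- coef * theta_target + (1 - coef) * theta_source.
# Not necessarily the exact SLM-Lab implementation.
def polyak_update_sketch(net, target_net, polyak_coef=0.9):
    for src_param, tar_param in zip(net.parameters(), target_net.parameters()):
        # soft-update each target parameter in place toward the source net
        tar_param.data.copy_(polyak_coef * tar_param.data + (1.0 - polyak_coef) * src_param.data)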
def fn_decay_lr(net, fn):
    '''
    Decay learning rate for net module; only returns the new lr for the user to set on the appropriate nets.
    In the future, might add more flexible lr adjustment, like boosting and decaying on need.
    '''
    space_clock = util.s_get(net.algorithm, 'aeb_space.clock')
    total_t = space_clock.get('total_t')
    start_val, end_val = net.optim_spec['lr'], 1e-6
    anneal_total_t = net.lr_anneal_timestep or max(10e6, 60 * net.lr_decay_frequency)
    if total_t >= net.lr_decay_min_timestep and total_t % net.lr_decay_frequency == 0:
        logger.debug(f'anneal_total_t: {anneal_total_t}, total_t: {total_t}')
        new_lr = fn(start_val, end_val, anneal_total_t, total_t)
        return new_lr
    else:
        return no_decay(net)
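# A hedged example of a decay fn that fn_decay_lr above could be given:
# linear annealing from start_val to end_val over anneal_total_t steps,
# clamped at end_val. The name and exact curve are illustrative assumptions.
def linear_decay(start_val, end_val, anneal_total_t, total_t):
    slope = (end_val - start_val) / anneal_total_t
    return max(start_val + slope * total_t, end_val)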
def space_train(self):
    '''
    Completes one training step for the agent if it is time to train, i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
    Each training step consists of sampling n batches from the agent's memory.
    For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times.
    Otherwise this function does nothing.
    '''
    if util.get_lab_mode() == 'enjoy':
        return np.nan
    total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
    self.to_train = (total_t > self.training_min_timestep and total_t % self.training_frequency == 0)
    is_per = util.get_class_name(self.agent.nanflat_body_a[0].memory) == 'PrioritizedReplay'
    if self.to_train == 1:
        total_loss = torch.tensor(0.0, device=self.net.device)
        for _ in range(self.training_epoch):
            batch = self.space_sample()
            for _ in range(self.training_batch_epoch):
                with torch.no_grad():
                    q_targets = self.calc_q_targets(batch)
                if is_per:
                    q_preds = self.net.wrap_eval(batch['states'])
                    errors = torch.abs(q_targets - q_preds)
                    errors = errors.sum(dim=1).unsqueeze_(dim=1)
                    for body in self.agent.nanflat_body_a:
                        body.memory.update_priorities(errors)
                loss = self.net.training_step(batch['states'], q_targets, global_net=self.global_nets.get('net'))
                total_loss += loss
        loss = total_loss / (self.training_epoch * self.training_batch_epoch)
        # reset
        self.to_train = 0
        for body in self.agent.nanflat_body_a:
            body.entropies = []
            body.log_probs = []
        logger.debug(f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}')
        return loss.item()
    else:
        return np.nan
def update_nets(self):
    # NOTE: Once polyak updating for multi-headed networks is supported via updates to flatten_params and load_params, this can be removed
    space_clock = util.s_get(self, 'aeb_space.clock')
    t = space_clock.get('t')
    if self.update_type == 'replace':
        if t % self.update_frequency == 0:
            logger.debug('Updating target_net by replacing')
            self.target_net = deepcopy(self.net)
            self.online_net = self.target_net
            self.eval_net = self.target_net
    elif self.update_type == 'polyak':
        logger.error('"polyak" updating not supported yet for MultiHeadDQN, please use "replace" instead. Exiting.')
        sys.exit()
    else:
        logger.error('Unknown net.update_type. Should be "replace" or "polyak". Exiting.')
        sys.exit()
def update_nets(self):
    space_clock = util.s_get(self, 'aeb_space.clock')
    t = space_clock.get('t')
    if self.update_type == 'replace':
        if t % self.update_frequency == 0:
            logger.debug('Updating target_net by replacing')
            self.target_net = deepcopy(self.net)
            self.online_net = self.target_net
            self.eval_net = self.target_net
    elif self.update_type == 'polyak':
        logger.debug('Updating net by averaging')
        avg_params = self.polyak_weight * net_util.flatten_params(self.target_net) + \
            (1 - self.polyak_weight) * net_util.flatten_params(self.net)
        self.target_net = net_util.load_params(self.target_net, avg_params)
        self.online_net = self.target_net
        self.eval_net = self.target_net
    else:
        logger.error('Unknown net.update_type. Should be "replace" or "polyak". Exiting.')
        sys.exit()
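# A minimal sketch of the net_util.flatten_params/load_params pair assumed
# by the polyak branch above: concatenate every parameter into one flat
# vector so the weighted average is a single tensor op, then write the
# averaged vector back parameter by parameter. Illustrative only.
import torch

def flatten_params_sketch(net):
    return torch.cat([param.data.view(-1) for param in net.parameters()])

def load_params_sketch(net, flattened):
    offset = 0
    for param in net.parameters():
        numel = param.numel()
        param.data.copy_(flattened[offset:offset + numel].view_as(param))
        offset += numel
    return net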
def train(self):
    '''
    Completes one training step for the agent if it is time to train.
    Otherwise this function does nothing.
    '''
    t = util.s_get(self, 'aeb_space.clock').get('total_t')
    if self.to_train == 1:
        logger.debug3(f'Training at t: {t}')
        batch = self.sample()
        if batch['states'].size(0) < 2:
            logger.info('Batch too small to train with, skipping...')
            self.to_train = 0
            return np.nan
        q_targets = self.compute_q_target_values(batch)
        if torch.cuda.is_available() and self.gpu:
            q_targets = q_targets.cuda()
        y = Variable(q_targets)
        loss = self.net.training_step(batch['states'], y)
        logger.debug(f'loss {loss.data[0]}')
        self.to_train = 0
        return loss.data[0]
    else:
        logger.debug3('NOT training')
        return np.nan
def train(self):
    '''
    Completes one training step for the agent if it is time to train, i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
    Each training step consists of sampling n batches from the agent's memory.
    For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times.
    Otherwise this function does nothing.
    '''
    if util.get_lab_mode() == 'enjoy':
        return np.nan
    total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
    self.to_train = (total_t > self.training_min_timestep and total_t % self.training_frequency == 0)
    is_per = util.get_class_name(self.agent.nanflat_body_a[0].memory) == 'PrioritizedReplay'
    if self.to_train == 1:
        total_loss = torch.tensor(0.0)
        for _ in range(self.training_epoch):
            batch = self.sample()
            for _ in range(self.training_batch_epoch):
                with torch.no_grad():
                    q_targets = self.calc_q_targets(batch)
                if is_per:
                    q_preds = self.net.wrap_eval(batch['states'])
                    errors = torch.abs(q_targets - q_preds)
                    errors = errors.sum(dim=1).unsqueeze_(dim=1)
                    for body in self.agent.nanflat_body_a:
                        body.memory.update_priorities(errors)
                loss = self.net.training_step(batch['states'], q_targets)
                total_loss += loss.cpu()
        loss = total_loss / (self.training_epoch * self.training_batch_epoch)
        # reset
        self.to_train = 0
        self.body.log_probs = []
        self.body.entropies = []
        logger.debug(f'Loss: {loss}')
        self.last_loss = loss.item()
    return self.last_loss
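# A hedged sketch of the memory.update_priorities call in the PER branch
# above: the absolute TD errors of the last sampled batch become the new
# priorities, with a small epsilon so no transition's sampling probability
# hits zero. self.epsilon, self.batch_idxs and self.priorities are
# illustrative assumptions about the PrioritizedReplay internals.
def update_priorities_sketch(self, errors):
    new_priorities = (errors.detach().cpu().numpy() + self.epsilon).flatten()
    for idx, priority in zip(self.batch_idxs, new_priorities):
        self.priorities[idx] = priority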
def test_s_get(test_agent):
    spec = util.s_get(test_agent, 'aeb_space.spec')
    assert _.is_dict(spec)
    spec = util.s_get(test_agent, 'aeb_space').spec
    assert _.is_dict(spec)
def test_s_get(test_agent):
    spec = util.s_get(test_agent, 'aeb_space.spec')
    assert ps.is_dict(spec)
    spec = util.s_get(test_agent, 'aeb_space').spec
    assert ps.is_dict(spec)
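# Usage sketch for util.s_get, exercised by the tests above: given an
# object in the AEB space hierarchy and a dotted attribute path, it
# traverses the hierarchy to resolve the target, e.g.
#   util.s_get(agent, 'aeb_space.clock')
# reaches the same object as agent.agent_space.aeb_space.clock.
# A minimal illustrative resolver, not the SLM-Lab implementation;
# the .space parent link is an assumption.
from functools import reduce

def s_get_sketch(obj, attr_path):
    attrs = attr_path.split('.')
    node = obj
    while not hasattr(node, attrs[0]):
        node = node.space  # climb to the containing space
    return reduce(getattr, attrs, node)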
def update(self):
    '''Updates the explore variables'''
    space_clock = util.s_get(self, 'aeb_space.clock')
    self.action_policy_update(self, space_clock)
    return self.explore_var
def update_explore_var(self):
    space_clock = util.s_get(self, 'aeb_space.clock')
    self.action_policy_update(self, space_clock)