def normalize_state(body, state):
    '''
    Normalizes one or more states using a running mean and standard deviation
    Details of the normalization from Deep RL Bootcamp, L6
    https://www.youtube.com/watch?v=8EcdaCk9KaQ&feature=youtu.be
    '''
    same_shape = False if type(state) == list else state.shape == body.state_mean.shape
    has_preprocess = getattr(body.memory, 'preprocess_state', False)
    if ('Atari' in util.get_class_name(body.memory)):
        # never normalize atari, it has its own normalization step
        logger.debug('skipping normalizing for Atari, already handled by preprocess')
        return state
    elif ('Replay' in util.get_class_name(body.memory)) and has_preprocess:
        # normalization handled by preprocess_state function in the memory
        logger.debug('skipping normalizing, already handled by preprocess')
        return state
    elif same_shape:
        # if not atari, always normalize the state the first time we see it during act
        # if the shape is not transformed in some way
        if np.sum(body.state_std_dev) == 0:
            return np.clip(state - body.state_mean, -10, 10)
        else:
            return np.clip((state - body.state_mean) / body.state_std_dev, -10, 10)
    else:
        # broadcastable sample from an un-normalized memory so we should normalize
        logger.debug('normalizing sample from memory')
        if np.sum(body.state_std_dev) == 0:
            return np.clip(state - body.state_mean, -10, 10)
        else:
            return np.clip((state - body.state_mean) / body.state_std_dev, -10, 10)
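# A minimal sketch (not part of the source above) of the clipped z-score normalization that
# normalize_state applies in the same_shape branch. The SimpleNamespace stand-in for `body`
# and its attribute values are assumptions for illustration only.
import numpy as np
from types import SimpleNamespace

demo_body = SimpleNamespace(
    state_mean=np.array([0.5, -1.0]),
    state_std_dev=np.array([2.0, 0.1]),
)
demo_state = np.array([1.5, 0.0])
# (state - mean) / std, clipped to [-10, 10] to bound outliers
normed = np.clip((demo_state - demo_body.state_mean) / demo_body.state_std_dev, -10, 10)
print(normed)  # -> [0.5, 10.0]; the second component saturates at the clip bound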
def calc_q_loss(self, batch):
    '''Compute the Q value loss using predicted and target Q values from the appropriate networks'''
    states = batch['states']
    next_states = batch['next_states']
    q_preds = self.net(states)
    with torch.no_grad():
        # Use online_net to select actions in next state
        online_next_q_preds = self.online_net(next_states)
        # Use eval_net to calculate next_q_preds for actions chosen by online_net
        next_q_preds = self.eval_net(next_states)
    act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1)
    online_actions = online_next_q_preds.argmax(dim=-1, keepdim=True)
    max_next_q_preds = next_q_preds.gather(-1, online_actions).squeeze(-1)
    max_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * max_next_q_preds
    logger.debug(f'act_q_preds: {act_q_preds}\nmax_q_targets: {max_q_targets}')
    q_loss = self.net.loss_fn(act_q_preds, max_q_targets)
    # TODO use the same loss_fn but do not reduce yet
    if 'Prioritized' in util.get_class_name(self.body.memory):  # PER
        errors = (max_q_targets - act_q_preds.detach()).abs().cpu().numpy()
        self.body.memory.update_priorities(errors)
    return q_loss
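# A toy sketch (assumed tensors, not from the source) of the Double-DQN target computed in
# calc_q_loss above: the online network picks the argmax action, the eval/target network
# supplies that action's Q value, and terminal transitions are masked out by (1 - done).
import torch

gamma = 0.99
rewards = torch.tensor([1.0, 0.0])
dones = torch.tensor([0.0, 1.0])
online_next_q_preds = torch.tensor([[0.2, 0.9], [0.8, 0.1]])  # from the online net
next_q_preds = torch.tensor([[0.3, 0.7], [0.5, 0.4]])         # from the eval/target net

online_actions = online_next_q_preds.argmax(dim=-1, keepdim=True)       # [[1], [0]]
max_next_q_preds = next_q_preds.gather(-1, online_actions).squeeze(-1)  # [0.7, 0.5]
max_q_targets = rewards + gamma * (1 - dones) * max_next_q_preds
print(max_q_targets)  # tensor([1.6930, 0.0000]); the done transition does not bootstrap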
def init_global_nets(algorithm):
    '''
    Initialize global_nets for Hogwild using an identical instance of an algorithm from an isolated Session.
    In spec.meta.distributed, specify either:
    - 'shared': global network parameters are shared all the time. In this mode, the algorithm's local network is replaced directly by global_net by overriding the attribute of the same name.
    - 'synced': global network parameters are periodically synced to the local network after each gradient push. In this mode, the algorithm keeps a separate `global_{net}` reference for each of its networks.
    '''
    dist_mode = algorithm.agent.spec['meta']['distributed']
    assert dist_mode in ('shared', 'synced'), f'Unrecognized distributed mode {dist_mode}'
    global_nets = {}
    for net_name in algorithm.net_names:
        optim_name = net_name.replace('net', 'optim')
        if not hasattr(algorithm, optim_name):  # only for trainable network, i.e. has an optim
            continue
        g_net = getattr(algorithm, net_name)
        g_net.share_memory()  # make net global
        if dist_mode == 'shared':  # use the same name to override the local net
            global_nets[net_name] = g_net
        else:  # keep a separate reference for syncing
            global_nets[f'global_{net_name}'] = g_net
        # if optim is Global, set to override the local optim and its scheduler
        optim = getattr(algorithm, optim_name)
        if 'Global' in util.get_class_name(optim):
            optim.share_memory()  # make optim global
            global_nets[optim_name] = optim
            lr_scheduler_name = net_name.replace('net', 'lr_scheduler')
            lr_scheduler = getattr(algorithm, lr_scheduler_name)
            global_nets[lr_scheduler_name] = lr_scheduler
    logger.info(f'Initialized global_nets attr {list(global_nets.keys())} for Hogwild')
    return global_nets
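# A minimal sketch (plain torch.nn modules, no SLM Lab objects assumed) of what share_memory()
# does for the Hogwild setup in init_global_nets: it moves the parameter storage into shared
# memory so worker processes can update the same weights lock-free.
import torch.nn as nn

g_net = nn.Linear(4, 2)
print(next(g_net.parameters()).is_shared())  # False: parameters live in process-local memory
g_net.share_memory()  # same call used on each global net above
print(next(g_net.parameters()).is_shared())  # True: workers passed g_net via torch.multiprocessing now share its parameters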
def index_lab_comp(self, lab_comp):
    '''
    Update info space coor when initializing lab component, and return its coor and index.
    Does not apply to AEB entities.
    @returns {tuple, int} data_coor, index
    @example

    class Session:
        def __init__(self, spec):
            self.coor, self.index = info_space.index_lab_comp(self)
    '''
    axis = util.get_class_name(lab_comp, lower=True)
    self.advance_coor(axis)
    coor = self.coor.copy()
    index = coor[axis]
    return coor, index
def calc_q_loss(self, batch):
    '''Compute the Q value loss using predicted and target Q values from the appropriate networks'''
    q_preds = self.net.wrap_eval(batch['states'])
    act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1)
    next_q_preds = self.net.wrap_eval(batch['next_states'])
    # Bellman equation: compute max_q_targets using reward and max estimated Q values (0 if no next_state)
    # keepdim=False so max_next_q_preds matches the shape of the squeezed act_q_preds
    max_next_q_preds, _ = next_q_preds.max(dim=-1, keepdim=False)
    max_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * max_next_q_preds
    max_q_targets = max_q_targets.detach()
    q_loss = self.net.loss_fn(act_q_preds, max_q_targets)
    # TODO use the same loss_fn but do not reduce yet
    if 'Prioritized' in util.get_class_name(self.body.memory):  # PER
        errors = torch.abs(max_q_targets - act_q_preds.detach())
        self.body.memory.update_priorities(errors)
    return q_loss
def space_train(self):
    '''
    Completes one training step for the agent if it is time to train,
    i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
    Each training step consists of sampling n batches from the agent's memory.
    For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times.
    Otherwise this function does nothing.
    '''
    if util.get_lab_mode() == 'enjoy':
        return np.nan
    total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
    self.to_train = (total_t > self.training_min_timestep and total_t % self.training_frequency == 0)
    is_per = util.get_class_name(self.agent.nanflat_body_a[0].memory) == 'PrioritizedReplay'
    if self.to_train == 1:
        total_loss = torch.tensor(0.0, device=self.net.device)
        for _ in range(self.training_epoch):
            batch = self.space_sample()
            for _ in range(self.training_batch_epoch):
                with torch.no_grad():
                    q_targets = self.calc_q_targets(batch)
                    if is_per:
                        q_preds = self.net.wrap_eval(batch['states'])
                        errors = torch.abs(q_targets - q_preds)
                        errors = errors.sum(dim=1).unsqueeze_(dim=1)
                        for body in self.agent.nanflat_body_a:
                            body.memory.update_priorities(errors)
                loss = self.net.training_step(batch['states'], q_targets, global_net=self.global_nets.get('net'))
                total_loss += loss
        loss = total_loss / (self.training_epoch * self.training_batch_epoch)
        # reset
        self.to_train = 0
        for body in self.agent.nanflat_body_a:
            body.entropies = []
            body.log_probs = []
        logger.debug(f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}')
        return loss.item()
    else:
        return np.nan
def init_params(module, init_fn):
    '''Initialize module's weights using init_fn, and biases to 0.0'''
    bias_init = 0.0
    classname = util.get_class_name(module)
    if 'Net' in classname:  # skip if it's a net, not pytorch layer
        pass
    elif any(k in classname for k in ('BatchNorm', 'Conv', 'Linear')):
        init_fn(module.weight)
        nn.init.constant_(module.bias, bias_init)
    elif 'GRU' in classname:
        for name, param in module.named_parameters():
            if 'weight' in name:
                init_fn(param)
            elif 'bias' in name:
                nn.init.constant_(param, bias_init)
    else:
        pass
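# A usage sketch (illustrative, not from the source): an initializer like init_params is
# typically mapped over every submodule with nn.Module.apply. The inline _init below mirrors
# only the Linear branch so the sketch stays self-contained; with the real function you would
# pass something like functools.partial(init_params, init_fn=nn.init.orthogonal_) instead.
import torch.nn as nn

def _init(module, init_fn=nn.init.orthogonal_):
    # mirror of the Linear branch: init weights with init_fn, biases with 0.0
    if isinstance(module, nn.Linear):
        init_fn(module.weight)
        nn.init.constant_(module.bias, 0.0)

model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))
model.apply(_init)  # apply() recurses over all submodules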
def calc_q_loss(self, batch):
    '''Compute the Q value loss for Hydra network by applying the singleton logic on the generalized aggregate.'''
    q_preds = torch.stack(self.net.wrap_eval(batch['states']))
    act_q_preds = q_preds.gather(-1, torch.stack(batch['actions']).long().unsqueeze(-1)).squeeze(-1)
    # Use online_net to select actions in next state
    online_next_q_preds = torch.stack(self.online_net.wrap_eval(batch['next_states']))
    # Use eval_net to calculate next_q_preds for actions chosen by online_net
    next_q_preds = torch.stack(self.eval_net.wrap_eval(batch['next_states']))
    max_next_q_preds = next_q_preds.gather(-1, online_next_q_preds.argmax(dim=-1, keepdim=True)).squeeze(-1)
    max_q_targets = torch.stack(batch['rewards']) + self.gamma * (1 - torch.stack(batch['dones'])) * max_next_q_preds
    q_loss = self.net.loss_fn(act_q_preds, max_q_targets)
    # TODO use the same loss_fn but do not reduce yet
    for body in self.agent.nanflat_body_a:
        if 'Prioritized' in util.get_class_name(body.memory):  # PER
            errors = torch.abs(max_q_targets - act_q_preds)
            body.memory.update_priorities(errors)
    return q_loss
def calc_q_loss(self, batch):
    '''Compute the Q value loss using predicted and target Q values from the appropriate networks'''
    q_preds = self.net.wrap_eval(batch['states'])
    act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1)
    # Use online_net to select actions in next state
    online_next_q_preds = self.online_net.wrap_eval(batch['next_states'])
    # Use eval_net to calculate next_q_preds for actions chosen by online_net
    next_q_preds = self.eval_net.wrap_eval(batch['next_states'])
    max_next_q_preds = next_q_preds.gather(-1, online_next_q_preds.argmax(dim=-1, keepdim=True)).squeeze(-1)
    max_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * max_next_q_preds
    max_q_targets = max_q_targets.detach()
    q_loss = self.net.loss_fn(act_q_preds, max_q_targets)
    # TODO use the same loss_fn but do not reduce yet
    if 'Prioritized' in util.get_class_name(self.body.memory):  # PER
        errors = torch.abs(max_q_targets - act_q_preds.detach())
        self.body.memory.update_priorities(errors)
    return q_loss
def get_coor_idx(self, lab_comp):
    '''
    Get info space coor when initializing lab component, and return its coor and index.
    Does not apply to AEB entities.
    @returns {tuple, int} data_coor, index
    @example

    class Session:
        def __init__(self, spec):
            self.coor, self.index = info_space.get_coor_idx(self)

    info_space.tick('session')
    session = Session(spec, info_space)
    '''
    axis = util.get_class_name(lab_comp, lower=True)
    coor = self.coor.copy()
    index = coor[axis]
    return coor, index
def update_online_stats(body, state):
    '''
    Method to calculate the running mean and standard deviation of the state space.
    See https://www.johndcook.com/blog/standard_deviation/ for more details
    for n >= 1:
        M_n = M_{n-1} + (state - M_{n-1}) / n
        S_n = S_{n-1} + (state - M_{n-1}) * (state - M_n)
        variance = S_n / (n - 1)
        std_dev = sqrt(variance)
    '''
    logger.debug(f'mean: {body.state_mean}, std: {body.state_std_dev}, num examples: {body.state_n}')
    # Assumes only one state is given
    if ('Atari' in util.get_class_name(body.memory)):
        assert state.ndim == 3
    elif getattr(body.memory, 'raw_state_dim', False):
        assert state.size == body.memory.raw_state_dim
    else:
        assert state.size == body.state_dim or state.shape == body.state_dim
    mean = body.state_mean
    body.state_n += 1
    if np.isnan(mean).any():
        assert np.isnan(body.state_std_dev_int)
        assert np.isnan(body.state_std_dev)
        body.state_mean = state
        body.state_std_dev_int = 0
        body.state_std_dev = 0
    else:
        assert body.state_n > 1
        body.state_mean = mean + (state - mean) / body.state_n
        body.state_std_dev_int = body.state_std_dev_int + (state - mean) * (state - body.state_mean)
        body.state_std_dev = np.sqrt(body.state_std_dev_int / (body.state_n - 1))
        # Guard against very small std devs
        if (body.state_std_dev < 1e-8).any():
            body.state_std_dev[np.where(body.state_std_dev < 1e-8)] += 1e-8
    logger.debug(f'new mean: {body.state_mean}, new std: {body.state_std_dev}, num examples: {body.state_n}')
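# A small standalone numeric check (not from the source) of the Welford-style update used in
# update_online_stats: streaming the samples one at a time reproduces the batch mean and the
# sample standard deviation computed by numpy with ddof=1.
import numpy as np

samples = np.array([1.0, 4.0, 2.0, 8.0])
mean, s_int, std, n = np.nan, np.nan, np.nan, 0
for x in samples:
    n += 1
    if np.isnan(mean):
        mean, s_int, std = x, 0.0, 0.0
    else:
        prev_mean = mean
        mean = prev_mean + (x - prev_mean) / n        # M_n = M_{n-1} + (x - M_{n-1}) / n
        s_int = s_int + (x - prev_mean) * (x - mean)  # S_n = S_{n-1} + (x - M_{n-1}) * (x - M_n)
        std = np.sqrt(s_int / (n - 1))                # std_dev = sqrt(S_n / (n - 1))

print(mean, std)                            # 3.75, ~3.0957
print(samples.mean(), samples.std(ddof=1))  # identical values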
def init_parameters(module, init_fn):
    '''
    Initializes module's weights using init_fn, a weight initialization function from nn.init,
    and module's biases to 0.0.
    Applies to BatchNorm, GRU, Linear and Conv layers; other modules are left untouched.
    '''
    bias_init = 0.0
    classname = util.get_class_name(module)
    if 'BatchNorm' in classname:
        init_fn(module.weight)
        nn.init.constant_(module.bias, bias_init)
    elif 'GRU' in classname:
        for name, param in module.named_parameters():
            if 'weight' in name:
                init_fn(param)
            elif 'bias' in name:
                nn.init.constant_(param, 0.0)
    elif 'Linear' in classname or ('Conv' in classname and 'Net' not in classname):
        init_fn(module.weight)
        nn.init.constant_(module.bias, bias_init)
def calc_q_loss(self, batch):
    '''Compute the Q value loss using predicted and target Q values from the appropriate networks'''
    states = batch['states']
    next_states = batch['next_states']
    q_preds = self.net(states)
    with torch.no_grad():
        next_q_preds = self.net(next_states)
    act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1)
    # Bellman equation: compute max_q_targets using reward and max estimated Q values (0 if no next_state)
    max_next_q_preds, _ = next_q_preds.max(dim=-1, keepdim=False)
    max_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * max_next_q_preds
    logger.debug(f'act_q_preds: {act_q_preds}\nmax_q_targets: {max_q_targets}')
    q_loss = self.net.loss_fn(act_q_preds, max_q_targets)
    # TODO use the same loss_fn but do not reduce yet
    if 'Prioritized' in util.get_class_name(self.body.memory):  # PER
        errors = (max_q_targets - act_q_preds.detach()).abs().cpu().numpy()
        self.body.memory.update_priorities(errors)
    return q_loss
def train(self):
    '''
    Completes one training step for the agent if it is time to train,
    i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
    Each training step consists of sampling n batches from the agent's memory.
    For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times.
    Otherwise this function does nothing.
    '''
    if util.get_lab_mode() == 'enjoy':
        return np.nan
    total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
    self.to_train = (total_t > self.training_min_timestep and total_t % self.training_frequency == 0)
    is_per = util.get_class_name(self.agent.nanflat_body_a[0].memory) == 'PrioritizedReplay'
    if self.to_train == 1:
        total_loss = torch.tensor(0.0)
        for _ in range(self.training_epoch):
            batch = self.sample()
            for _ in range(self.training_batch_epoch):
                with torch.no_grad():
                    q_targets = self.calc_q_targets(batch)
                    if is_per:
                        q_preds = self.net.wrap_eval(batch['states'])
                        errors = torch.abs(q_targets - q_preds)
                        errors = errors.sum(dim=1).unsqueeze_(dim=1)
                        for body in self.agent.nanflat_body_a:
                            body.memory.update_priorities(errors)
                loss = self.net.training_step(batch['states'], q_targets)
                total_loss += loss.cpu()
        loss = total_loss / (self.training_epoch * self.training_batch_epoch)
        # reset
        self.to_train = 0
        self.body.log_probs = []
        self.body.entropies = []
        logger.debug(f'Loss: {loss}')
        self.last_loss = loss.item()
    return self.last_loss
def is_discrete(self, a):
    '''Check if an agent (brain) is subject to discrete actions'''
    assert a == 0, 'OpenAI Gym supports only single body, use a=0'
    return util.get_class_name(self.action_spaces[a]) != 'Box'  # continuous
def try_update_per(self, q_preds, q_targets):
    '''Update PER priorities using the absolute TD errors |q_preds - q_targets|, if the memory is prioritized'''
    if 'Prioritized' in util.get_class_name(self.body.memory):  # PER
        with torch.no_grad():
            errors = (q_preds - q_targets).abs().cpu().numpy()
        self.body.memory.update_priorities(errors)
def _is_discrete(self, action_space):
    '''Check if an action space is discrete'''
    return util.get_class_name(action_space) != 'Box'