def __init__(
        self,
        observation_shape,
        action_size,
        hidden_sizes=None,  # None for default (see below).
        lstm_size=256,
        nonlinearity=torch.nn.ReLU,
        normalize_observation=False,
        norm_obs_clip=10,
        norm_obs_var_clip=1e-6,
        ):
    """Build the MLP encoder, LSTM core, and linear output head.

    The LSTM input concatenates the MLP features with the previous action
    and previous reward (hence ``+ action_size + 1``); the head emits
    ``2 * action_size + 1`` units.  Optionally attaches a running
    mean/std model for observation normalization.
    """
    super().__init__()
    self._obs_n_dim = len(observation_shape)
    self._action_size = action_size
    hidden_sizes = hidden_sizes or [256, 256]
    flat_obs_size = int(np.prod(observation_shape))
    self.mlp = MlpModel(
        input_size=flat_obs_size,
        hidden_sizes=hidden_sizes,
        output_size=None,
        nonlinearity=nonlinearity,
    )
    # With no hidden layers the MLP passes the flat observation through.
    feature_size = hidden_sizes[-1] if hidden_sizes else flat_obs_size
    self.lstm = torch.nn.LSTM(feature_size + action_size + 1, lstm_size)
    self.head = torch.nn.Linear(lstm_size, action_size * 2 + 1)
    self.normalize_observation = normalize_observation
    if normalize_observation:
        self.obs_rms = RunningMeanStdModel(observation_shape)
        self.norm_obs_clip = norm_obs_clip
        self.norm_obs_var_clip = norm_obs_var_clip
def __init__(
        self,
        observation_shape,
        action_size,
        hidden_sizes=None,
        lstm_size=None,
        lstm_skip=True,
        constraint=True,
        hidden_nonlinearity="tanh",  # "tanh"/"relu" string, or a module class.
        mu_nonlinearity="tanh",  # As above, or None for a purely linear mu head.
        init_log_std=0.,
        normalize_observation=True,
        var_clip=1e-6,
        ):
    """Build MLP body, optional LSTM core, and mu/value/constraint heads.

    Nonlinearities may be given as strings ("tanh"/"relu") so they can be
    written in config files, or directly as module classes (or None for
    ``mu_nonlinearity``).
    """
    super().__init__()
    # FIX: convert only string specs; previously ANY non-string value
    # (including torch.nn.Tanh itself, or None) hit the `else` and raised
    # ValueError, making the `mu_nonlinearity is not None` branch below
    # unreachable.  String behavior is unchanged.
    if isinstance(hidden_nonlinearity, str):
        if hidden_nonlinearity == "tanh":
            hidden_nonlinearity = torch.nn.Tanh
        elif hidden_nonlinearity == "relu":
            hidden_nonlinearity = torch.nn.ReLU
        else:
            raise ValueError(f"Unrecognized hidden_nonlinearity string: {hidden_nonlinearity}")
    if isinstance(mu_nonlinearity, str):
        if mu_nonlinearity == "tanh":
            mu_nonlinearity = torch.nn.Tanh
        elif mu_nonlinearity == "relu":
            mu_nonlinearity = torch.nn.ReLU
        else:
            raise ValueError(f"Unrecognized mu_nonlinearity string: {mu_nonlinearity}")
    self._obs_ndim = len(observation_shape)
    input_size = int(np.prod(observation_shape))
    self.body = MlpModel(
        input_size=input_size,
        hidden_sizes=hidden_sizes or [256, 256],
        nonlinearity=hidden_nonlinearity,
    )
    last_size = self.body.output_size
    if lstm_size:
        # LSTM input: body features + previous action + previous reward.
        lstm_input_size = last_size + action_size + 1
        self.lstm = torch.nn.LSTM(lstm_input_size, lstm_size)
        last_size = lstm_size
    else:
        self.lstm = None
    mu_linear = torch.nn.Linear(last_size, action_size)
    if mu_nonlinearity is not None:
        self.mu = torch.nn.Sequential(mu_linear, mu_nonlinearity())
    else:
        self.mu = mu_linear
    self.value = torch.nn.Linear(last_size, 1)
    if constraint:
        self.constraint = torch.nn.Linear(last_size, 1)
    else:
        self.constraint = None
    # State-independent learned log-std for the Gaussian policy.
    self.log_std = torch.nn.Parameter(init_log_std * torch.ones(action_size))
    self._lstm_skip = lstm_skip
    if normalize_observation:
        self.obs_rms = RunningMeanStdModel(observation_shape)
        self.var_clip = var_clip
    self.normalize_observation = normalize_observation
def __init__(
        self,
        observation_shape,
        action_size,
        option_size,
        hidden_sizes=None,  # None for default (see below).
        hidden_nonlinearity=torch.nn.Tanh,  # Module form.
        mu_nonlinearity=torch.nn.Tanh,  # Module form.
        init_log_std=0.,
        normalize_observation=True,
        norm_obs_clip=10,
        norm_obs_var_clip=1e-6,
        baselines_init=True,  # Orthogonal initialization of sqrt(2) until last layer, then 0.01 for policy, 1 for value
        use_interest=False,  # IOC sigmoid interest functions
        use_diversity=False,  # TDEOC q entropy output
        use_attention=False,
        ):
    """Instantiate neural net modules according to inputs."""
    super().__init__()
    from functools import partial
    self._obs_ndim = len(observation_shape)
    input_size = int(np.prod(observation_shape))
    hidden_sizes = hidden_sizes or [64, 64]
    inits_mu = inits_v = None
    if baselines_init:
        inits_mu = (np.sqrt(2), 0.01)
        inits_v = (np.sqrt(2), 1.)
    # Shared preprocessor MLP class for the option-critic head.
    body_mlp_class = partial(MlpModel, hidden_sizes=hidden_sizes,
                             output_size=None,
                             nonlinearity=hidden_nonlinearity,
                             inits=inits_v)
    # FIX: inits_v[1] / inits_mu[1] were indexed unconditionally, raising
    # TypeError when baselines_init=False (both tuples are None then).
    # Fall back to the standard gains; with baselines_init=False the head
    # presumably ignores them — TODO confirm against
    # OptionCriticHead_IndependentPreprocessor.
    self.model = OptionCriticHead_IndependentPreprocessor(
        input_size=input_size,
        input_module_class=body_mlp_class,
        output_size=action_size,
        option_size=option_size,
        intra_option_policy='continuous',
        intra_option_kwargs={'init_log_std': init_log_std,
                             'mu_nonlinearity': mu_nonlinearity},
        input_module_kwargs={},
        use_interest=use_interest,
        use_diversity=use_diversity,
        use_attention=use_attention,
        baselines_init=baselines_init,
        orthogonal_init_base=inits_v[1] if inits_v is not None else 1.,
        orthogonal_init_pol=inits_mu[1] if inits_mu is not None else 0.01,
    )
    if normalize_observation:
        self.obs_rms = RunningMeanStdModel(observation_shape)
        self.norm_obs_clip = norm_obs_clip
        self.norm_obs_var_clip = norm_obs_var_clip
    self.normalize_observation = normalize_observation
    self.use_interest = use_interest
    self.use_diversity = use_diversity
    self.use_attention = use_attention
def __init__(
        self,
        observation_shape,
        output_size,
        hidden_sizes=None,  # None for default (see below).
        lstm_size=256,
        nonlinearity=torch.nn.ReLU,
        normalize_observation=False,
        norm_obs_clip=10,
        norm_obs_var_clip=1e-6,
        ):
    """Instantiate neural net module according to inputs.

    MLP encoder feeds an LSTM (input also carries previous action and
    reward, hence ``+ output_size + 1``), followed by separate linear
    policy and value heads.
    """
    super().__init__()
    self._obs_n_dim = len(observation_shape)
    hidden_sizes = hidden_sizes or [256, 256]
    flat_obs_size = int(np.prod(observation_shape))
    self.mlp = MlpModel(
        input_size=flat_obs_size,
        hidden_sizes=hidden_sizes,
        output_size=None,
        nonlinearity=nonlinearity,
    )
    # With no hidden layers the MLP passes the flat observation through.
    feature_size = hidden_sizes[-1] if hidden_sizes else flat_obs_size
    self.lstm = torch.nn.LSTM(feature_size + output_size + 1, lstm_size)
    self.pi = torch.nn.Linear(lstm_size, output_size)
    self.value = torch.nn.Linear(lstm_size, 1)
    self.normalize_observation = normalize_observation
    if normalize_observation:
        self.obs_rms = RunningMeanStdModel(observation_shape)
        self.norm_obs_clip = norm_obs_clip
        self.norm_obs_var_clip = norm_obs_var_clip
def __init__(
        self,
        observation_shape,
        action_size,
        policy_hidden_sizes=None,
        policy_hidden_nonlinearity=torch.nn.Tanh,
        value_hidden_sizes=None,
        value_hidden_nonlinearity=torch.nn.Tanh,
        init_log_std=0.,
        min_std=0.,
        normalize_observation=False,
        norm_obs_clip=10,
        norm_obs_var_clip=1e-6,
        policy_inputs_indices=None,
        ):
    """Build separate policy (mu) and value MLPs.

    The policy may read only a subset of the flattened observation,
    selected by ``policy_inputs_indices`` (default: all indices); the
    value net always sees the full observation.
    """
    super().__init__()
    self.min_std = min_std
    self._obs_ndim = len(observation_shape)
    input_size = int(np.prod(observation_shape))
    # Default: policy sees every flattened observation element.
    self.policy_inputs_indices = policy_inputs_indices if policy_inputs_indices is not None else list(
        range(input_size))
    policy_hidden_sizes = [
        400, 300
    ] if policy_hidden_sizes is None else policy_hidden_sizes
    value_hidden_sizes = [
        400, 300
    ] if value_hidden_sizes is None else value_hidden_sizes
    self.mu = MlpModel(input_size=len(self.policy_inputs_indices),
                       hidden_sizes=policy_hidden_sizes,
                       output_size=action_size,
                       nonlinearity=policy_hidden_nonlinearity)
    self.v = MlpModel(
        input_size=input_size,  # Full observation for the value net.
        hidden_sizes=value_hidden_sizes,
        output_size=1,
        nonlinearity=value_hidden_nonlinearity,
    )
    # Parameterized so effective std = exp(_log_std) + min_std (see the
    # log_std property).  NOTE(review): np.log(np.exp(init_log_std) -
    # min_std) is NaN / raises when min_std >= exp(init_log_std) —
    # confirm callers keep min_std below exp(init_log_std).
    self._log_std = torch.nn.Parameter(
        (np.log(np.exp(init_log_std) - self.min_std)) * torch.ones(action_size))
    if normalize_observation:
        self.obs_rms = RunningMeanStdModel(observation_shape)
        self.norm_obs_clip = norm_obs_clip
        self.norm_obs_var_clip = norm_obs_var_clip
    self.normalize_observation = normalize_observation
def __init__(
        self,
        observation_shape,
        action_size,
        hidden_sizes=None,  # None for default (see below).
        hidden_nonlinearity=torch.nn.Tanh,  # Module form.
        mu_nonlinearity=torch.nn.Tanh,  # Module form.
        init_log_std=0.,
        normalize_observation=True,
        norm_obs_clip=10,
        norm_obs_var_clip=1e-6,
        baselines_init=True,  # Orthogonal initialization of sqrt(2) until last layer, then 0.01 for policy, 1 for value
        ):
    """Instantiate neural net modules according to inputs.

    Both the policy-mean and value MLPs are compiled with
    ``torch.jit.script``; ``log_std`` is a state-independent learned
    parameter.
    """
    super().__init__()
    self._obs_ndim = len(observation_shape)
    input_size = int(np.prod(observation_shape))
    hidden_sizes = hidden_sizes or [64, 64]
    inits_mu = inits_v = None
    if baselines_init:
        # (gain, final-layer scale) pairs for orthogonal init.
        inits_mu = (np.sqrt(2), 0.01)
        inits_v = (np.sqrt(2), 1.)
    mu_mlp = torch.jit.script(
        MlpModel(input_size=input_size,
                 hidden_sizes=hidden_sizes,
                 output_size=action_size,
                 nonlinearity=hidden_nonlinearity,
                 inits=inits_mu))
    if mu_nonlinearity is not None:
        # Squash (e.g. tanh) the mean output.
        self.mu = torch.nn.Sequential(mu_mlp, mu_nonlinearity())
    else:
        self.mu = mu_mlp
    self.v = torch.jit.script(
        MlpModel(input_size=input_size,
                 hidden_sizes=hidden_sizes,
                 output_size=1,
                 nonlinearity=hidden_nonlinearity,
                 inits=inits_v))
    self.log_std = torch.nn.Parameter(init_log_std * torch.ones(action_size))
    if normalize_observation:
        self.obs_rms = RunningMeanStdModel(observation_shape)
        self.norm_obs_clip = norm_obs_clip
        self.norm_obs_var_clip = norm_obs_var_clip
    self.normalize_observation = normalize_observation
def __init__(
        self,
        RndCls,  # type: BaseFeatureExtractor
        rnd_model_kwargs):
    """
    Constructs target and distillation model. Assumes identical
    architectures. Also constructs normalization models for observation
    and intrinsic rewards.
    """
    super().__init__()
    # Two identically-configured feature extractors; the target stays
    # random while the distillation model is trained to match it.
    self.target_model = RndCls(**rnd_model_kwargs)
    self.distill_model = RndCls(**rnd_model_kwargs)
    rnd_param_init_(self.target_model)
    rnd_param_init_(self.distill_model)
    self.obs_rms = RunningMeanStdModel(
        wrap(rnd_model_kwargs["input_shape"])
    )  # Requires RndCls takes input_shape
    self.int_rff = None  # Intrinsic reward forward filter (this stores a discounted sum of non-episodic rewards)
    self.int_rff_rms = RunningMeanStdModel(torch.Size(
        [1]))  # Intrinsic reward forward filter RMS model
    self.update_norm = True  # Default to updating obs and int_rew normalization models
def __init__(
        self,
        observation_shape,
        action_size,
        hidden_sizes=None,  # None for default (see below).
        hidden_nonlinearity=torch.nn.Tanh,  # Module form.
        mu_nonlinearity=torch.nn.Tanh,  # Module form.
        init_log_std=0.,
        normalize_observation=False,
        norm_obs_clip=10,
        norm_obs_var_clip=1e-6,
        ):
    """Instantiate policy-mean MLP, value MLP, and learned log-std."""
    super().__init__()
    self._obs_ndim = len(observation_shape)
    flat_size = int(np.prod(observation_shape))
    layer_sizes = hidden_sizes or [64, 64]
    mean_net = MlpModel(
        input_size=flat_size,
        hidden_sizes=layer_sizes,
        output_size=action_size,
        nonlinearity=hidden_nonlinearity,
    )
    # Optionally squash the mean output (e.g. with tanh).
    self.mu = (mean_net if mu_nonlinearity is None
               else torch.nn.Sequential(mean_net, mu_nonlinearity()))
    self.v = MlpModel(
        input_size=flat_size,
        hidden_sizes=layer_sizes,
        output_size=1,
        nonlinearity=hidden_nonlinearity,
    )
    # State-independent learned log-std for the Gaussian policy.
    self.log_std = torch.nn.Parameter(init_log_std * torch.ones(action_size))
    self.normalize_observation = normalize_observation
    if normalize_observation:
        self.obs_rms = RunningMeanStdModel(observation_shape)
        self.norm_obs_clip = norm_obs_clip
        self.norm_obs_var_clip = norm_obs_var_clip
def __init__(
        self,
        image_shape,
        action_size,
        hidden_sizes=512,
        stop_conv_grad=False,
        channels=None,  # Defaults below.
        kernel_sizes=None,
        strides=None,
        paddings=None,
        kiaming_init=True,
        normalize_conv_out=False,
        ):
    """Build conv trunk plus a single MLP emitting action logits and a
    value estimate (output size ``action_size + 1``).

    FIX: corrected log-message typo ("Modeul" -> "Model").
    """
    super().__init__()
    c, h, w = image_shape
    self.conv = Conv2dModel(
        in_channels=c,
        channels=channels or [32, 64, 64],
        kernel_sizes=kernel_sizes or [8, 4, 3],
        strides=strides or [4, 2, 1],
        paddings=paddings,
    )
    self._conv_out_size = self.conv.conv_out_size(h=h, w=w)
    self.pi_v_mlp = MlpModel(
        input_size=self._conv_out_size,
        hidden_sizes=hidden_sizes,
        output_size=action_size + 1,  # pi logits + 1 value unit.
    )
    if kiaming_init:
        self.apply(weight_init)
    self.stop_conv_grad = stop_conv_grad
    logger.log("Model stopping gradient at CONV." if stop_conv_grad
        else "Model using gradients on all parameters.")
    if normalize_conv_out:  # Haven't seen this make a difference yet.
        logger.log("Model normalizing conv output across all pixels.")
        self.conv_rms = RunningMeanStdModel((1, ))
        self.var_clip = 1e-6
    self.normalize_conv_out = normalize_conv_out
class AtariPgModel(torch.nn.Module):
    """Atari policy-gradient model: conv trunk + one MLP emitting action
    logits and a value estimate.  Can feed in conv and/or fc1 layer from
    pre-trained model, or have it initialize new ones (if initializing
    new, must provide image_shape).

    FIX: corrected log-message typo ("Modeul" -> "Model").
    """

    def __init__(
            self,
            image_shape,
            action_size,
            hidden_sizes=512,
            stop_conv_grad=False,
            channels=None,  # Defaults below.
            kernel_sizes=None,
            strides=None,
            paddings=None,
            kiaming_init=True,
            normalize_conv_out=False,
            ):
        super().__init__()
        c, h, w = image_shape
        self.conv = Conv2dModel(
            in_channels=c,
            channels=channels or [32, 64, 64],
            kernel_sizes=kernel_sizes or [8, 4, 3],
            strides=strides or [4, 2, 1],
            paddings=paddings,
        )
        self._conv_out_size = self.conv.conv_out_size(h=h, w=w)
        # Single MLP: first action_size outputs are pi logits, last is value.
        self.pi_v_mlp = MlpModel(
            input_size=self._conv_out_size,
            hidden_sizes=hidden_sizes,
            output_size=action_size + 1,
        )
        if kiaming_init:
            self.apply(weight_init)
        self.stop_conv_grad = stop_conv_grad
        logger.log("Model stopping gradient at CONV." if stop_conv_grad
            else "Model using gradients on all parameters.")
        if normalize_conv_out:  # Haven't seen this make a difference yet.
            logger.log("Model normalizing conv output across all pixels.")
            self.conv_rms = RunningMeanStdModel((1, ))
            self.var_clip = 1e-6
        self.normalize_conv_out = normalize_conv_out

    def forward(self, observation, prev_action, prev_reward):
        """Return (pi, v, conv) with leading dims matching input
        ([T,B], [B], or [])."""
        if observation.dtype == torch.uint8:
            img = observation.type(torch.float)
            img = img.mul_(1.0 / 255)  # In-place OK: img is a fresh tensor.
        else:
            img = observation
        lead_dim, T, B, img_shape = infer_leading_dims(img, 3)
        conv = self.conv(img.view(T * B, *img_shape))
        if self.stop_conv_grad:
            conv = conv.detach()
        if self.normalize_conv_out:
            conv_var = self.conv_rms.var
            conv_var = torch.clamp(conv_var, min=self.var_clip)
            # stddev of uniform [a,b] = (b-a)/sqrt(12), 1/sqrt(12)~0.29
            # then allow [0, 10]?
            conv = torch.clamp(0.29 * conv / conv_var.sqrt(), 0, 10)
        pi_v = self.pi_v_mlp(conv.view(T * B, -1))
        pi = F.softmax(pi_v[:, :-1], dim=-1)
        v = pi_v[:, -1]
        pi, v, conv = restore_leading_dims((pi, v, conv), lead_dim, T, B)
        return pi, v, conv

    def update_conv_rms(self, observation):
        """Update running stats of conv outputs (no grad); no-op unless
        normalize_conv_out."""
        if self.normalize_conv_out:
            with torch.no_grad():
                if observation.dtype == torch.uint8:
                    img = observation.type(torch.float)
                    img = img.mul_(1.0 / 255)
                else:
                    img = observation
                lead_dim, T, B, img_shape = infer_leading_dims(img, 3)
                conv = self.conv(img.view(T * B, *img_shape))
                self.conv_rms.update(conv.view(-1, 1))

    def parameters(self):
        # Overrides nn.Module.parameters: exclude conv params from the
        # optimizer when conv gradients are stopped.
        if not self.stop_conv_grad:
            yield from self.conv.parameters()
        yield from self.pi_v_mlp.parameters()

    def named_parameters(self):
        # Keep consistent with parameters() above.
        if not self.stop_conv_grad:
            yield from self.conv.named_parameters()
        yield from self.pi_v_mlp.named_parameters()

    @property
    def conv_out_size(self):
        return self._conv_out_size
class RndBonusModule(SelfSupervisedModule):
    """
    Random Network Distillation Module. Produces intrinsic rewards as the
    prediction error between the feature embeddings from a target and
    distilled model, both randomly initialized.
    """

    def __init__(
            self,
            RndCls,  # type: BaseFeatureExtractor
            rnd_model_kwargs):
        """
        Constructs target and distillation model. Assumes identical
        architectures. Also constructs normalization models for
        observation and intrinsic rewards.
        """
        super().__init__()
        # Two identically-configured extractors; the target stays random
        # while the distillation model learns to match it.
        self.target_model = RndCls(**rnd_model_kwargs)
        self.distill_model = RndCls(**rnd_model_kwargs)
        rnd_param_init_(self.target_model)
        rnd_param_init_(self.distill_model)
        self.obs_rms = RunningMeanStdModel(
            wrap(rnd_model_kwargs["input_shape"])
        )  # Requires RndCls takes input_shape
        self.int_rff = None  # Intrinsic reward forward filter (this stores a discounted sum of non-episodic rewards)
        self.int_rff_rms = RunningMeanStdModel(torch.Size(
            [1]))  # Intrinsic reward forward filter RMS model
        self.update_norm = True  # Default to updating obs and int_rew normalization models

    def normalize_obs(self, obs):
        """
        Normalizes observations according to specifications in
        https://arxiv.org/abs/1810.12894. This is necessary since the
        target network is fixed and cannot adjust to varying environments.
        This model should be initialized in the sampler by running a small
        number of observations through it.

        WARNING: If observations are already normalized using a different
        model / formulation, this will cause issues if this model is
        initialized on raw obs in the sampler.
        """
        obs = obs.to(
            dtype=torch.float32)  # Obs may be byte tensor (e.g. 8-bit pixels)
        if self.update_norm:
            self.obs_rms.update(obs)
        # Standardize, then clip to [-5, 5] per the RND paper.
        obs = (obs - self.obs_rms.mean) / torch.sqrt(self.obs_rms.var + 1e-5)
        obs = torch.clamp(obs, min=-5, max=5)
        return obs

    def normalize_int_rew(self, int_rew, gamma=0.99):
        """
        Normalizes intrinsic rewards according to specifications in
        https://arxiv.org/abs/1810.12894.

        This is done to remove the need to search for optimal intrinsic
        reward scaling factors in between different environments. This
        model is *not* expected to be initialized, if following the
        authors' implementation.
        """
        # Update rewards forward filter and gather batch of results.
        rff_batch = torch.empty_like(int_rew)
        int_rff_prior = self.int_rff  # Saved so state can be restored below.
        for i, rews in enumerate(int_rew):
            if self.int_rff is None:
                self.int_rff = rews
            else:
                # Discounted running sum across time steps.
                self.int_rff = self.int_rff * gamma + rews
            rff_batch[i, :] = self.int_rff
        # Update intrinsic rff rms for int rew normalization if updating norm models
        if self.update_norm:
            batch_size = rff_batch.numel()
            self.int_rff_rms.update(rff_batch.view((batch_size, 1)))
        else:
            # Reset rff prior state if not updating norm models
            self.int_rff = int_rff_prior
        # Normalize by dividing out running std of rff values
        return int_rew / torch.sqrt(self.int_rff_rms.var)

    def forward(self, next_obs):
        """
        Runs forward pass for distillation and target models, producing
        intrinsic bonuses and distillation model loss. Note the
        self-supervised losses of the models are unused (and are
        presumably placeholders with a value of zero).
        """
        next_obs = self.normalize_obs(next_obs)
        distill_feat, _ = self.distill_model(next_obs)
        target_feat, _ = self.target_model(next_obs)
        pred_errors = torch.mean((distill_feat - target_feat.detach())**2,
                                 dim=-1)  # Maintains batch dimension
        distill_loss = torch.mean(pred_errors)  # Reduces batch dimension
        int_rew = pred_errors.detach()
        return int_rew, distill_loss
class CppoModel(torch.nn.Module):
    """Constrained-PPO model: MLP body, optional LSTM core, Gaussian
    policy head (mu + learned log_std), value head, and optional
    constraint-value head."""

    def __init__(
            self,
            observation_shape,
            action_size,
            hidden_sizes=None,
            lstm_size=None,
            lstm_skip=True,
            constraint=True,
            hidden_nonlinearity="tanh",  # or "relu"
            mu_nonlinearity="tanh",
            init_log_std=0.,
            normalize_observation=True,
            var_clip=1e-6,
            ):
        super().__init__()
        if hidden_nonlinearity == "tanh":  # So these can be strings in config file.
            hidden_nonlinearity = torch.nn.Tanh
        elif hidden_nonlinearity == "relu":
            hidden_nonlinearity = torch.nn.ReLU
        else:
            raise ValueError(f"Unrecognized hidden_nonlinearity string: {hidden_nonlinearity}")
        if mu_nonlinearity == "tanh":  # So these can be strings in config file.
            mu_nonlinearity = torch.nn.Tanh
        elif mu_nonlinearity == "relu":
            mu_nonlinearity = torch.nn.ReLU
        else:
            raise ValueError(f"Unrecognized mu_nonlinearity string: {mu_nonlinearity}")
        self._obs_ndim = len(observation_shape)
        input_size = int(np.prod(observation_shape))
        self.body = MlpModel(
            input_size=input_size,
            hidden_sizes=hidden_sizes or [256, 256],
            nonlinearity=hidden_nonlinearity,
        )
        last_size = self.body.output_size
        if lstm_size:
            # LSTM input: body features + previous action + previous reward.
            lstm_input_size = last_size + action_size + 1
            self.lstm = torch.nn.LSTM(lstm_input_size, lstm_size)
            last_size = lstm_size
        else:
            self.lstm = None
        mu_linear = torch.nn.Linear(last_size, action_size)
        # NOTE(review): mu_nonlinearity can never be None here — any
        # non-"tanh"/"relu" value raised ValueError above.
        if mu_nonlinearity is not None:
            self.mu = torch.nn.Sequential(mu_linear, mu_nonlinearity())
        else:
            self.mu = mu_linear
        self.value = torch.nn.Linear(last_size, 1)
        if constraint:
            self.constraint = torch.nn.Linear(last_size, 1)
        else:
            self.constraint = None
        # State-independent learned log-std for the Gaussian policy.
        self.log_std = torch.nn.Parameter(init_log_std * torch.ones(action_size))
        self._lstm_skip = lstm_skip
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.var_clip = var_clip
        self.normalize_observation = normalize_observation

    def forward(self, observation, prev_action, prev_reward, init_rnn_state=None):
        """Return (mu, log_std, value[, rnn_state]); value is a ValueInfo
        carrying the state value and (optionally) the constraint value.
        Leading dims of the inputs ([T,B], [B], or []) are restored on
        the outputs."""
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_ndim)
        if self.normalize_observation:
            obs_var = self.obs_rms.var
            if self.var_clip is not None:
                obs_var = torch.clamp(obs_var, min=self.var_clip)
            # Standardize, then clip to [-10, 10].
            observation = torch.clamp((observation - self.obs_rms.mean) /
                obs_var.sqrt(), -10, 10)
        fc_x = self.body(observation.view(T * B, -1))
        if self.lstm is not None:
            lstm_inputs = [fc_x, prev_action, prev_reward]
            lstm_input = torch.cat([x.view(T, B, -1) for x in lstm_inputs], dim=2)
            init_rnn_state = None if init_rnn_state is None else tuple(init_rnn_state)
            lstm_out, (hn, cn) = self.lstm(lstm_input, init_rnn_state)
            lstm_out = lstm_out.view(T * B, -1)
            if self._lstm_skip:
                # Residual connection around the LSTM.
                fc_x = fc_x + lstm_out
            else:
                fc_x = lstm_out
        mu = self.mu(fc_x)
        log_std = self.log_std.repeat(T * B, 1)
        v = self.value(fc_x).squeeze(-1)
        mu, log_std, v = restore_leading_dims((mu, log_std, v), lead_dim, T, B)
        if self.constraint is None:
            value = ValueInfo(value=v, c_value=None)
        else:
            c = self.constraint(fc_x).squeeze(-1)
            c = restore_leading_dims(c, lead_dim, T, B)
            value = ValueInfo(value=v, c_value=c)
        outputs = (mu, log_std, value)
        if self.lstm is not None:
            # Recurrent variant also returns the final hidden state.
            outputs += (RnnState(h=hn, c=cn),)
        return outputs

    def update_obs_rms(self, observation):
        # Update observation running mean/var (no-op when normalization off).
        if not self.normalize_observation:
            return
        self.obs_rms.update(observation)
def __init__(
        self,
        observation_shape,
        action_size,
        option_size,
        hidden_sizes=None,  # None for default (see below).
        hidden_nonlinearity=torch.nn.Tanh,  # Module form.
        mu_nonlinearity=torch.nn.Tanh,  # Module form.
        init_log_std=0.,
        normalize_observation=True,
        norm_obs_clip=10,
        norm_obs_var_clip=1e-6,
        baselines_init=True,  # Orthogonal initialization of sqrt(2) until last layer, then 0.01 for policy, 1 for value
        use_interest=False,  # IOC sigmoid interest functions
        ):
    """Instantiate neural net modules according to inputs."""
    super().__init__()
    self._obs_ndim = len(observation_shape)
    input_size = int(np.prod(observation_shape))
    hidden_sizes = hidden_sizes or [64, 64]
    inits_mu = inits_v = None
    if baselines_init:
        # (gain, final-layer scale) pairs for orthogonal init.
        inits_mu = (np.sqrt(2), 0.01)
        inits_v = (np.sqrt(2), 1.)
    # Body for intra-option policy mean
    mu_mlp = MlpModel(
        input_size=input_size,
        hidden_sizes=hidden_sizes,
        output_size=None,
        nonlinearity=hidden_nonlinearity,
        inits=inits_mu
    )
    # Intra-option policy. Outputs tanh mu if exists, else unactivated
    # linear. Also logstd.
    self.mu = torch.nn.Sequential(mu_mlp,
        ContinuousIntraOptionPolicy(input_size=mu_mlp.output_size,
                                    num_options=option_size,
                                    num_actions=action_size,
                                    ortho_init=baselines_init,
                                    ortho_init_value=inits_mu[-1],
                                    init_log_std=init_log_std,
                                    mu_nonlinearity=mu_nonlinearity))
    # Option value. Pure linear
    self.q = MlpModel(
        input_size=input_size,
        hidden_sizes=hidden_sizes,
        output_size=option_size,
        nonlinearity=hidden_nonlinearity,
        inits=inits_v
    )
    # Option termination. MLP with sigmoid at end.
    self.beta = torch.nn.Sequential(MlpModel(
        input_size=input_size,
        hidden_sizes=hidden_sizes,
        output_size=option_size,
        nonlinearity=hidden_nonlinearity,
        inits=inits_v
    ), torch.nn.Sigmoid())
    # Softmax policy over options
    self.pi_omega = torch.nn.Sequential(MlpModel(
        input_size=input_size,
        hidden_sizes=hidden_sizes,
        output_size=option_size,
        nonlinearity=hidden_nonlinearity,
        inits=inits_v
    ), torch.nn.Softmax(-1))
    # Per-option sigmoid interest functions (Dummy passthrough when unused).
    self.pi_omega_I = torch.nn.Sequential(MlpModel(
        input_size=input_size,
        hidden_sizes=hidden_sizes,
        output_size=option_size,
        nonlinearity=hidden_nonlinearity,
        inits=inits_v
    ), torch.nn.Sigmoid()) if use_interest else Dummy(option_size)
    if normalize_observation:
        self.obs_rms = RunningMeanStdModel(observation_shape)
        self.norm_obs_clip = norm_obs_clip
        self.norm_obs_var_clip = norm_obs_var_clip
    self.normalize_observation = normalize_observation
    self.use_interest = use_interest
class MujocoFfModel(torch.nn.Module):
    """
    Model commonly used in Mujoco locomotion agents: an MLP which outputs
    distribution means, separate parameter for learned log_std, and separate
    MLP for state-value estimate.
    """

    def __init__(
            self,
            observation_shape,
            action_size,
            hidden_sizes=None,  # None for default (see below).
            hidden_nonlinearity=torch.nn.Tanh,  # Module form.
            mu_nonlinearity=torch.nn.Tanh,  # Module form.
            init_log_std=0.,
            normalize_observation=True,
            norm_obs_clip=10,
            norm_obs_var_clip=1e-6,
            baselines_init=True,  # Orthogonal initialization of sqrt(2) until last layer, then 0.01 for policy, 1 for value
            ):
        """Instantiate neural net modules according to inputs."""
        super().__init__()
        self._obs_ndim = len(observation_shape)
        input_size = int(np.prod(observation_shape))
        hidden_sizes = hidden_sizes or [64, 64]
        inits_mu = inits_v = None
        if baselines_init:
            # (gain, final-layer scale) pairs for orthogonal init.
            inits_mu = (np.sqrt(2), 0.01)
            inits_v = (np.sqrt(2), 1.)
        # Both MLPs compiled with TorchScript for speed.
        mu_mlp = torch.jit.script(
            MlpModel(input_size=input_size,
                     hidden_sizes=hidden_sizes,
                     output_size=action_size,
                     nonlinearity=hidden_nonlinearity,
                     inits=inits_mu))
        if mu_nonlinearity is not None:
            self.mu = torch.nn.Sequential(mu_mlp, mu_nonlinearity())
        else:
            self.mu = mu_mlp
        self.v = torch.jit.script(
            MlpModel(input_size=input_size,
                     hidden_sizes=hidden_sizes,
                     output_size=1,
                     nonlinearity=hidden_nonlinearity,
                     inits=inits_v))
        # State-independent learned log-std.
        self.log_std = torch.nn.Parameter(init_log_std * torch.ones(action_size))
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.norm_obs_clip = norm_obs_clip
            self.norm_obs_var_clip = norm_obs_var_clip
        self.normalize_observation = normalize_observation

    def forward(self, observation, prev_action, prev_reward):
        """
        Compute mean, log_std, and value estimate from input state. Infers
        leading dimensions of input: can be [T,B], [B], or []; provides
        returns with same leading dims. Intermediate feedforward layers
        process as [T*B,H], with T=1,B=1 when not given. Used both in
        sampler and in algorithm (both via the agent).
        """
        # Infer (presence of) leading dimensions: [T,B], [B], or [].
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_ndim)
        if self.normalize_observation:
            obs_var = self.obs_rms.var
            if self.norm_obs_var_clip is not None:
                obs_var = torch.clamp(obs_var, min=self.norm_obs_var_clip)
            # Standardize, then clip to +/- norm_obs_clip.
            observation = torch.clamp(
                (observation - self.obs_rms.mean) / obs_var.sqrt(),
                -self.norm_obs_clip, self.norm_obs_clip)
        obs_flat = observation.view(T * B, -1)
        mu = self.mu(obs_flat)
        v = self.v(obs_flat).squeeze(-1)
        log_std = self.log_std.repeat(T * B, 1)
        # Restore leading dimensions: [T,B], [B], or [], as input.
        mu, log_std, v = restore_leading_dims((mu, log_std, v), lead_dim, T, B)
        return mu, log_std, v

    def update_obs_rms(self, observation):
        # Update observation running mean/var (no-op when normalization off).
        if self.normalize_observation:
            self.obs_rms.update(observation)
class RefactoredMujocoOcFfModel(torch.nn.Module):
    """
    Model commonly used in Mujoco locomotion agents: an MLP which outputs
    distribution means, separate parameter for learned log_std, and separate
    MLPs for state-option-value estimate, termination probabilities. Policy
    over options
    """

    def __init__(
            self,
            observation_shape,
            action_size,
            option_size,
            hidden_sizes=None,  # None for default (see below).
            hidden_nonlinearity=torch.nn.Tanh,  # Module form.
            mu_nonlinearity=torch.nn.Tanh,  # Module form.
            init_log_std=0.,
            normalize_observation=True,
            norm_obs_clip=10,
            norm_obs_var_clip=1e-6,
            baselines_init=True,  # Orthogonal initialization of sqrt(2) until last layer, then 0.01 for policy, 1 for value
            use_interest=False,  # IOC sigmoid interest functions
            use_diversity=False,  # TDEOC q entropy output
            use_attention=False,
            ):
        """Instantiate neural net modules according to inputs."""
        super().__init__()
        from functools import partial
        self._obs_ndim = len(observation_shape)
        input_size = int(np.prod(observation_shape))
        hidden_sizes = hidden_sizes or [64, 64]
        inits_mu = inits_v = None
        if baselines_init:
            # (gain, final-layer scale) pairs for orthogonal init.
            inits_mu = (np.sqrt(2), 0.01)
            inits_v = (np.sqrt(2), 1.)
        # NOTE(review): with baselines_init=False, inits_v/inits_mu remain
        # None and the inits_v[1]/inits_mu[1] subscripts below raise
        # TypeError — confirm whether that flag value is ever used.
        body_mlp_class = partial(MlpModel, hidden_sizes=hidden_sizes,
                                 output_size=None,
                                 nonlinearity=hidden_nonlinearity,
                                 inits=inits_v)
        self.model = OptionCriticHead_IndependentPreprocessor(
            input_size=input_size,
            input_module_class=body_mlp_class,
            output_size=action_size,
            option_size=option_size,
            intra_option_policy='continuous',
            intra_option_kwargs={'init_log_std': init_log_std,
                                 'mu_nonlinearity': mu_nonlinearity},
            input_module_kwargs={},
            use_interest=use_interest,
            use_diversity=use_diversity,
            use_attention=use_attention,
            baselines_init=baselines_init,
            orthogonal_init_base=inits_v[1],
            orthogonal_init_pol=inits_mu[1]
        )
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.norm_obs_clip = norm_obs_clip
            self.norm_obs_var_clip = norm_obs_var_clip
        self.normalize_observation = normalize_observation
        self.use_interest = use_interest
        self.use_diversity = use_diversity
        self.use_attention = use_attention

    def forward(self, observation, prev_action, prev_reward):
        """
        Compute mean, log_std, q-value, and termination estimates from input
        state. Infers leading dimensions of input: can be [T,B], [B], or [];
        provides returns with same leading dims. Intermediate feedforward
        layers process as [T*B,H], with T=1,B=1 when not given. Used both in
        sampler and in algorithm (both via the agent).
        """
        # Infer (presence of) leading dimensions: [T,B], [B], or [].
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_ndim)
        if self.normalize_observation:
            obs_var = self.obs_rms.var
            if self.norm_obs_var_clip is not None:
                obs_var = torch.clamp(obs_var, min=self.norm_obs_var_clip)
            observation = torch.clamp((observation - self.obs_rms.mean) /
                obs_var.sqrt(), -self.norm_obs_clip, self.norm_obs_clip)
        obs_flat = observation.view(T * B, -1)
        (mu, logstd), beta, q, pi_I, q_ent = self.model(obs_flat)
        log_std = logstd.repeat(T * B, 1, 1)  # One copy per batch element.
        # Restore leading dimensions: [T,B], [B], or [], as input.
        mu, log_std, q, beta, pi, q_ent = restore_leading_dims(
            (mu, log_std, q, beta, pi_I, q_ent), lead_dim, T, B)
        # q_ent (diversity output) is restored but not returned here.
        return mu, log_std, beta, q, pi

    def update_obs_rms(self, observation):
        # Update observation running mean/var (no-op when normalization off).
        if self.normalize_observation:
            self.obs_rms.update(observation)
class MujocoOCFfModel(torch.nn.Module):
    """
    Model commonly used in Mujoco locomotion agents: an MLP which outputs
    distribution means, separate parameter for learned log_std, and separate
    MLPs for state-option-value estimate, termination probabilities. Policy
    over options
    """

    def __init__(
            self,
            observation_shape,
            action_size,
            option_size,
            hidden_sizes=None,  # None for default (see below).
            hidden_nonlinearity=torch.nn.Tanh,  # Module form.
            mu_nonlinearity=torch.nn.Tanh,  # Module form.
            init_log_std=0.,
            normalize_observation=True,
            norm_obs_clip=10,
            norm_obs_var_clip=1e-6,
            baselines_init=True,  # Orthogonal initialization of sqrt(2) until last layer, then 0.01 for policy, 1 for value
            use_interest=False,  # IOC sigmoid interest functions
            ):
        """Instantiate neural net modules according to inputs."""
        super().__init__()
        self._obs_ndim = len(observation_shape)
        input_size = int(np.prod(observation_shape))
        hidden_sizes = hidden_sizes or [64, 64]
        inits_mu = inits_v = None
        if baselines_init:
            # (gain, final-layer scale) pairs for orthogonal init.
            inits_mu = (np.sqrt(2), 0.01)
            inits_v = (np.sqrt(2), 1.)
        # Body for intra-option policy mean
        mu_mlp = MlpModel(
            input_size=input_size,
            hidden_sizes=hidden_sizes,
            output_size=None,
            nonlinearity=hidden_nonlinearity,
            inits=inits_mu
        )
        # Intra-option policy. Outputs tanh mu if exists, else unactivated
        # linear. Also logstd.
        self.mu = torch.nn.Sequential(mu_mlp,
            ContinuousIntraOptionPolicy(input_size=mu_mlp.output_size,
                                        num_options=option_size,
                                        num_actions=action_size,
                                        ortho_init=baselines_init,
                                        ortho_init_value=inits_mu[-1],
                                        init_log_std=init_log_std,
                                        mu_nonlinearity=mu_nonlinearity))
        # Option value. Pure linear
        self.q = MlpModel(
            input_size=input_size,
            hidden_sizes=hidden_sizes,
            output_size=option_size,
            nonlinearity=hidden_nonlinearity,
            inits=inits_v
        )
        # Option termination. MLP with sigmoid at end.
        self.beta = torch.nn.Sequential(MlpModel(
            input_size=input_size,
            hidden_sizes=hidden_sizes,
            output_size=option_size,
            nonlinearity=hidden_nonlinearity,
            inits=inits_v
        ), torch.nn.Sigmoid())
        # Softmax policy over options
        self.pi_omega = torch.nn.Sequential(MlpModel(
            input_size=input_size,
            hidden_sizes=hidden_sizes,
            output_size=option_size,
            nonlinearity=hidden_nonlinearity,
            inits=inits_v
        ), torch.nn.Softmax(-1))
        # Per-option sigmoid interest functions (Dummy passthrough when unused).
        self.pi_omega_I = torch.nn.Sequential(MlpModel(
            input_size=input_size,
            hidden_sizes=hidden_sizes,
            output_size=option_size,
            nonlinearity=hidden_nonlinearity,
            inits=inits_v
        ), torch.nn.Sigmoid()) if use_interest else Dummy(option_size)
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.norm_obs_clip = norm_obs_clip
            self.norm_obs_var_clip = norm_obs_var_clip
        self.normalize_observation = normalize_observation
        self.use_interest = use_interest

    def forward(self, observation, prev_action, prev_reward):
        """
        Compute mean, log_std, q-value, and termination estimates from input
        state. Infers leading dimensions of input: can be [T,B], [B], or [];
        provides returns with same leading dims. Intermediate feedforward
        layers process as [T*B,H], with T=1,B=1 when not given. Used both in
        sampler and in algorithm (both via the agent).
        """
        # Infer (presence of) leading dimensions: [T,B], [B], or [].
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_ndim)
        if self.normalize_observation:
            obs_var = self.obs_rms.var
            if self.norm_obs_var_clip is not None:
                obs_var = torch.clamp(obs_var, min=self.norm_obs_var_clip)
            observation = torch.clamp((observation - self.obs_rms.mean) /
                obs_var.sqrt(), -self.norm_obs_clip, self.norm_obs_clip)
        obs_flat = observation.view(T * B, -1)
        mu, logstd = self.mu(obs_flat)
        q = self.q(obs_flat)
        log_std = logstd.repeat(T * B, 1, 1)  # One copy per batch element.
        beta = self.beta(obs_flat)
        pi = self.pi_omega(obs_flat)
        I = self.pi_omega_I(obs_flat)
        # Restore leading dimensions: [T,B], [B], or [], as input.
        mu, log_std, q, beta, pi, I = restore_leading_dims(
            (mu, log_std, q, beta, pi, I), lead_dim, T, B)
        pi = pi * I  # Torch multinomial will normalize
        return mu, log_std, beta, q, pi

    def update_obs_rms(self, observation):
        # Update observation running mean/var (no-op when normalization off).
        if self.normalize_observation:
            self.obs_rms.update(observation)
class ModelPgNNContinuousSelective(torch.nn.Module):
    """Feedforward policy-gradient model for continuous actions in which the
    policy head may read only a selected subset of the (flattened) observation
    vector, while the value head always receives the full observation.  The
    action log-std is a state-independent learned parameter with an optional
    floor ``min_std`` on the effective standard deviation.
    """

    def __init__(
            self,
            observation_shape,
            action_size,
            policy_hidden_sizes=None,
            policy_hidden_nonlinearity=torch.nn.Tanh,
            value_hidden_sizes=None,
            value_hidden_nonlinearity=torch.nn.Tanh,
            init_log_std=0.,
            min_std=0.,
            normalize_observation=False,
            norm_obs_clip=10,
            norm_obs_var_clip=1e-6,
            policy_inputs_indices=None,
            ):
        """Build the policy (mu) and value (v) MLPs, the log-std parameter,
        and, optionally, the running observation-normalization model.

        ``policy_inputs_indices`` selects which flattened-observation entries
        the policy sees; ``None`` means all of them.
        """
        super().__init__()
        self.min_std = min_std
        self._obs_ndim = len(observation_shape)
        flat_obs_size = int(np.prod(observation_shape))
        if policy_inputs_indices is None:
            policy_inputs_indices = list(range(flat_obs_size))
        self.policy_inputs_indices = policy_inputs_indices
        if policy_hidden_sizes is None:
            policy_hidden_sizes = [400, 300]
        if value_hidden_sizes is None:
            value_hidden_sizes = [400, 300]
        self.mu = MlpModel(
            input_size=len(self.policy_inputs_indices),
            hidden_sizes=policy_hidden_sizes,
            output_size=action_size,
            nonlinearity=policy_hidden_nonlinearity,
        )
        self.v = MlpModel(
            input_size=flat_obs_size,
            hidden_sizes=value_hidden_sizes,
            output_size=1,
            nonlinearity=value_hidden_nonlinearity,
        )
        # Parameterize so the *effective* std, exp(_log_std) + min_std, starts
        # exactly at exp(init_log_std); see the log_std property.
        raw_log_std = np.log(np.exp(init_log_std) - self.min_std)
        self._log_std = torch.nn.Parameter(raw_log_std * torch.ones(action_size))
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.norm_obs_clip = norm_obs_clip
            self.norm_obs_var_clip = norm_obs_var_clip
        self.normalize_observation = normalize_observation

    @property
    def log_std(self):
        # Effective log-std; min_std acts as a floor on the std itself.
        return (self._log_std.exp() + self.min_std).log()

    def forward(self, observation, prev_action, prev_reward):
        """Return (mu, log_std, v), each restored to the input's leading dims
        ([T,B], [B], or []).  Feedforward layers process as [T*B, H]."""
        lead_dim, n_t, n_b, _ = infer_leading_dims(observation, self._obs_ndim)
        if self.normalize_observation:
            # Standardize with running stats; clamp variance away from zero.
            obs_var = self.obs_rms.var
            if self.norm_obs_var_clip is not None:
                obs_var = torch.clamp(obs_var, min=self.norm_obs_var_clip)
            standardized = (observation - self.obs_rms.mean) / obs_var.sqrt()
            observation = torch.clamp(standardized, -self.norm_obs_clip,
                self.norm_obs_clip)
        flat_obs = observation.view(n_t * n_b, -1)
        mu = self.mu(flat_obs[:, self.policy_inputs_indices])
        v = self.v(flat_obs).squeeze(-1)
        log_std = self.log_std.repeat(n_t * n_b, 1)
        mu, log_std, v = restore_leading_dims(
            (mu, log_std, v), lead_dim, n_t, n_b)
        return mu, log_std, v

    def update_obs_rms(self, observation):
        # Update running observation statistics (no-op when normalization off).
        if self.normalize_observation:
            self.obs_rms.update(observation)
class MujocoLstmModel(torch.nn.Module):
    """Recurrent model for Mujoco locomotion agents: an observation MLP feeds
    an LSTM (which also receives the previous action and reward), and a single
    linear head emits distribution means, log_std, and a state-value estimate.
    """

    def __init__(
            self,
            observation_shape,
            action_size,
            hidden_sizes=None,  # None -> default [256, 256].
            lstm_size=256,
            nonlinearity=torch.nn.ReLU,
            normalize_observation=False,
            norm_obs_clip=10,
            norm_obs_var_clip=1e-6,
            ):
        """Build MLP body, LSTM, and output head; optionally set up the
        running observation-normalization model."""
        super().__init__()
        self._obs_n_dim = len(observation_shape)
        self._action_size = action_size
        hidden_sizes = hidden_sizes or [256, 256]
        flat_obs_size = int(np.prod(observation_shape))
        self.mlp = MlpModel(
            input_size=flat_obs_size,
            hidden_sizes=hidden_sizes,
            output_size=None,  # No final linear layer; last hidden is output.
            nonlinearity=nonlinearity,
        )
        body_out_size = hidden_sizes[-1] if hidden_sizes else flat_obs_size
        # LSTM input: MLP features + previous action + previous reward (1).
        self.lstm = torch.nn.LSTM(body_out_size + action_size + 1, lstm_size)
        # Head layout: [mu (action_size) | log_std (action_size) | value (1)].
        self.head = torch.nn.Linear(lstm_size, action_size * 2 + 1)
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.norm_obs_clip = norm_obs_clip
            self.norm_obs_var_clip = norm_obs_var_clip
        self.normalize_observation = normalize_observation

    def forward(self, observation, prev_action, prev_reward, init_rnn_state):
        """
        Compute mean, log_std, and value estimate from input state.  Infers
        leading dimensions of input: can be [T,B], [B], or []; provides
        returns with same leading dims.  Feedforward layers process as
        [T*B,H], recurrent layers as [T,B,H], with T=1,B=1 when not given.
        Used both in sampler and in algorithm (both via the agent).  Also
        returns the next RNN state.
        """
        # Infer (presence of) leading dimensions: [T,B], [B], or [].
        lead_dim, n_t, n_b, _ = infer_leading_dims(observation, self._obs_n_dim)
        if self.normalize_observation:
            # Standardize with running stats; clamp variance away from zero.
            obs_var = self.obs_rms.var
            if self.norm_obs_var_clip is not None:
                obs_var = torch.clamp(obs_var, min=self.norm_obs_var_clip)
            observation = torch.clamp(
                (observation - self.obs_rms.mean) / obs_var.sqrt(),
                -self.norm_obs_clip,
                self.norm_obs_clip,
            )
        features = self.mlp(observation.view(n_t * n_b, -1))
        lstm_input = torch.cat([
            features.view(n_t, n_b, -1),
            prev_action.view(n_t, n_b, -1),
            prev_reward.view(n_t, n_b, 1),
        ], dim=2)
        if init_rnn_state is not None:
            # Namedtuple -> plain (h, c) tuple, as torch.nn.LSTM expects.
            init_rnn_state = tuple(init_rnn_state)
        lstm_out, (hn, cn) = self.lstm(lstm_input, init_rnn_state)
        outputs = self.head(lstm_out.view(n_t * n_b, -1))
        mu = outputs[:, :self._action_size]
        log_std = outputs[:, self._action_size:-1]
        v = outputs[:, -1]
        # Restore leading dimensions: [T,B], [B], or [], as input.
        mu, log_std, v = restore_leading_dims(
            (mu, log_std, v), lead_dim, n_t, n_b)
        # Model should always leave B-dimension in rnn state: [N,B,H].
        next_rnn_state = RnnState(h=hn, c=cn)
        return mu, log_std, v, next_rnn_state

    def update_obs_rms(self, observation):
        # Update running observation statistics (no-op when normalization off).
        if self.normalize_observation:
            self.obs_rms.update(observation)