Example #1
 def __init__(
     self,
     observation_shape,
     action_size,
     hidden_sizes=None,  # None for default (see below).
     lstm_size=256,
     nonlinearity=torch.nn.ReLU,
     normalize_observation=False,
     norm_obs_clip=10,
     norm_obs_var_clip=1e-6,
 ):
     super().__init__()
     self._obs_n_dim = len(observation_shape)
     self._action_size = action_size
     hidden_sizes = hidden_sizes or [256, 256]
     mlp_input_size = int(np.prod(observation_shape))
     self.mlp = MlpModel(
         input_size=mlp_input_size,
         hidden_sizes=hidden_sizes,
         output_size=None,
         nonlinearity=nonlinearity,
     )
     mlp_output_size = hidden_sizes[-1] if hidden_sizes else mlp_input_size
     self.lstm = torch.nn.LSTM(mlp_output_size + action_size + 1, lstm_size)
     self.head = torch.nn.Linear(lstm_size, action_size * 2 + 1)
     if normalize_observation:
         self.obs_rms = RunningMeanStdModel(observation_shape)
         self.norm_obs_clip = norm_obs_clip
         self.norm_obs_var_clip = norm_obs_var_clip
     self.normalize_observation = normalize_observation
Example #2
 def __init__(
         self,
         observation_shape,
         action_size,
         hidden_sizes=None,
         lstm_size=None,
         lstm_skip=True,
         constraint=True,
         hidden_nonlinearity="tanh",  # or "relu"
         mu_nonlinearity="tanh",
         init_log_std=0.,
         normalize_observation=True,
         var_clip=1e-6,
         ):
     super().__init__()
     if hidden_nonlinearity == "tanh":  # So these can be strings in config file.
         hidden_nonlinearity = torch.nn.Tanh
     elif hidden_nonlinearity == "relu":
         hidden_nonlinearity = torch.nn.ReLU
     else:
         raise ValueError(f"Unrecognized hidden_nonlinearity string: {hidden_nonlinearity}")
     if mu_nonlinearity == "tanh":  # So these can be strings in config file.
         mu_nonlinearity = torch.nn.Tanh
     elif mu_nonlinearity == "relu":
         mu_nonlinearity = torch.nn.ReLU
     else:
         raise ValueError(f"Unrecognized mu_nonlinearity string: {mu_nonlinearity}")
     self._obs_ndim = len(observation_shape)
     input_size = int(np.prod(observation_shape))
     self.body = MlpModel(
         input_size=input_size,
         hidden_sizes=hidden_sizes or [256, 256],
         nonlinearity=hidden_nonlinearity,
     )
     last_size = self.body.output_size
     if lstm_size:
         lstm_input_size = last_size + action_size + 1
         self.lstm = torch.nn.LSTM(lstm_input_size, lstm_size)
         last_size = lstm_size
     else:
         self.lstm = None
     mu_linear = torch.nn.Linear(last_size, action_size)
     if mu_nonlinearity is not None:
         self.mu = torch.nn.Sequential(mu_linear, mu_nonlinearity())
     else:
         self.mu = mu_linear
     self.value = torch.nn.Linear(last_size, 1)
     if constraint:
         self.constraint = torch.nn.Linear(last_size, 1)
     else:
         self.constraint = None
     self.log_std = torch.nn.Parameter(init_log_std *
         torch.ones(action_size))
     self._lstm_skip = lstm_skip
     if normalize_observation:
         self.obs_rms = RunningMeanStdModel(observation_shape)
         self.var_clip = var_clip
     self.normalize_observation = normalize_observation
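The string checks at the top let hidden_nonlinearity and mu_nonlinearity be plain strings in a config file. A compact dict-based equivalent (a sketch, not the code above):

import torch

NONLINEARITIES = {"tanh": torch.nn.Tanh, "relu": torch.nn.ReLU}

def resolve_nonlinearity(name):
    try:
        return NONLINEARITIES[name]
    except KeyError:
        raise ValueError(f"Unrecognized nonlinearity string: {name}")

hidden_cls = resolve_nonlinearity("tanh")   # -> torch.nn.Tanh (the class, not an instance)
layer = hidden_cls()                        # instantiate when building the network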
Example #3
 def __init__(
         self,
         observation_shape,
         action_size,
         option_size,
         hidden_sizes=None,  # None for default (see below).
         hidden_nonlinearity=torch.nn.Tanh,  # Module form.
         mu_nonlinearity=torch.nn.Tanh,  # Module form.
         init_log_std=0.,
         normalize_observation=True,
         norm_obs_clip=10,
         norm_obs_var_clip=1e-6,
         baselines_init=True,  # Orthogonal initialization of sqrt(2) until last layer, then 0.01 for policy, 1 for value
         use_interest=False,  # IOC sigmoid interest functions
         use_diversity=False,  # TDEOC q entropy output
         use_attention=False,
         ):
     """Instantiate neural net modules according to inputs."""
     super().__init__()
     from functools import partial
     self._obs_ndim = len(observation_shape)
     input_size = int(np.prod(observation_shape))
     hidden_sizes = hidden_sizes or [64, 64]
     inits_mu = inits_v = None
     if baselines_init:
         inits_mu = (np.sqrt(2), 0.01)
         inits_v = (np.sqrt(2), 1.)
     body_mlp_class = partial(MlpModel, hidden_sizes=hidden_sizes, output_size=None, nonlinearity=hidden_nonlinearity, inits=inits_v)
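      # Note: indexing inits_v / inits_mu below assumes baselines_init=True (both are None otherwise).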
     self.model = OptionCriticHead_IndependentPreprocessor(
         input_size=input_size,
         input_module_class=body_mlp_class,
         output_size=action_size,
         option_size=option_size,
         intra_option_policy='continuous',
         intra_option_kwargs={'init_log_std': init_log_std, 'mu_nonlinearity': mu_nonlinearity},
         input_module_kwargs={},
         use_interest=use_interest,
         use_diversity=use_diversity,
         use_attention=use_attention,
         baselines_init=baselines_init,
         orthogonal_init_base=inits_v[1],
         orthogonal_init_pol=inits_mu[1]
     )
     if normalize_observation:
         self.obs_rms = RunningMeanStdModel(observation_shape)
         self.norm_obs_clip = norm_obs_clip
         self.norm_obs_var_clip = norm_obs_var_clip
     self.normalize_observation = normalize_observation
     self.use_interest = use_interest
     self.use_diversity = use_diversity
     self.use_attention = use_attention
Example #4
    def __init__(
        self,
        observation_shape,
        output_size,
        hidden_sizes=None,  # None for default (see below).
        lstm_size=256,
        nonlinearity=torch.nn.ReLU,
        normalize_observation=False,
        norm_obs_clip=10,
        norm_obs_var_clip=1e-6,
    ):
        """Instantiate neural net module according to inputs."""
        super().__init__()
        self._obs_n_dim = len(observation_shape)
        hidden_sizes = hidden_sizes or [256, 256]
        mlp_input_size = int(np.prod(observation_shape))
        self.mlp = MlpModel(
            input_size=mlp_input_size,
            hidden_sizes=hidden_sizes,
            output_size=None,
            nonlinearity=nonlinearity,
        )

        mlp_output_size = hidden_sizes[-1] if hidden_sizes else mlp_input_size
        self.lstm = torch.nn.LSTM(mlp_output_size + output_size + 1, lstm_size)
        self.pi = torch.nn.Linear(lstm_size, output_size)
        self.value = torch.nn.Linear(lstm_size, 1)
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.norm_obs_clip = norm_obs_clip
            self.norm_obs_var_clip = norm_obs_var_clip
        self.normalize_observation = normalize_observation
Example #5
    def __init__(
        self,
        observation_shape,
        action_size,
        policy_hidden_sizes=None,
        policy_hidden_nonlinearity=torch.nn.Tanh,
        value_hidden_sizes=None,
        value_hidden_nonlinearity=torch.nn.Tanh,
        init_log_std=0.,
        min_std=0.,
        normalize_observation=False,
        norm_obs_clip=10,
        norm_obs_var_clip=1e-6,
        policy_inputs_indices=None,
    ):
        super().__init__()
        self.min_std = min_std
        self._obs_ndim = len(observation_shape)
        input_size = int(np.prod(observation_shape))
        self.policy_inputs_indices = policy_inputs_indices if policy_inputs_indices is not None else list(
            range(input_size))

        policy_hidden_sizes = [
            400, 300
        ] if policy_hidden_sizes is None else policy_hidden_sizes
        value_hidden_sizes = [
            400, 300
        ] if value_hidden_sizes is None else value_hidden_sizes
        self.mu = MlpModel(input_size=len(self.policy_inputs_indices),
                           hidden_sizes=policy_hidden_sizes,
                           output_size=action_size,
                           nonlinearity=policy_hidden_nonlinearity)
        self.v = MlpModel(
            input_size=input_size,
            hidden_sizes=value_hidden_sizes,
            output_size=1,
            nonlinearity=value_hidden_nonlinearity,
        )
        self._log_std = torch.nn.Parameter(
            (np.log(np.exp(init_log_std) - self.min_std)) *
            torch.ones(action_size))
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.norm_obs_clip = norm_obs_clip
            self.norm_obs_var_clip = norm_obs_var_clip
        self.normalize_observation = normalize_observation
Example #6
 def __init__(
     self,
     observation_shape,
     action_size,
     hidden_sizes=None,  # None for default (see below).
     hidden_nonlinearity=torch.nn.Tanh,  # Module form.
     mu_nonlinearity=torch.nn.Tanh,  # Module form.
     init_log_std=0.,
     normalize_observation=True,
     norm_obs_clip=10,
     norm_obs_var_clip=1e-6,
     baselines_init=True,  # Orthogonal initialization of sqrt(2) until last layer, then 0.01 for policy, 1 for value
 ):
     """Instantiate neural net modules according to inputs."""
     super().__init__()
     self._obs_ndim = len(observation_shape)
     input_size = int(np.prod(observation_shape))
     hidden_sizes = hidden_sizes or [64, 64]
     inits_mu = inits_v = None
     if baselines_init:
         inits_mu = (np.sqrt(2), 0.01)
         inits_v = (np.sqrt(2), 1.)
     mu_mlp = torch.jit.script(
         MlpModel(input_size=input_size,
                  hidden_sizes=hidden_sizes,
                  output_size=action_size,
                  nonlinearity=hidden_nonlinearity,
                  inits=inits_mu))
     if mu_nonlinearity is not None:
         self.mu = torch.nn.Sequential(mu_mlp, mu_nonlinearity())
     else:
         self.mu = mu_mlp
     self.v = torch.jit.script(
         MlpModel(input_size=input_size,
                  hidden_sizes=hidden_sizes,
                  output_size=1,
                  nonlinearity=hidden_nonlinearity,
                  inits=inits_v))
     self.log_std = torch.nn.Parameter(init_log_std *
                                       torch.ones(action_size))
     if normalize_observation:
         self.obs_rms = RunningMeanStdModel(observation_shape)
         self.norm_obs_clip = norm_obs_clip
         self.norm_obs_var_clip = norm_obs_var_clip
     self.normalize_observation = normalize_observation
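This example (and Example #14) wraps its MLPs in torch.jit.script. A minimal sketch of the same pattern, with plain torch.nn layers standing in for MlpModel:

import torch

# Hypothetical stand-in for MlpModel: a plain two-layer Tanh MLP.
mlp = torch.nn.Sequential(
    torch.nn.Linear(8, 64),
    torch.nn.Tanh(),
    torch.nn.Linear(64, 2),
)
scripted_mlp = torch.jit.script(mlp)    # Compile once; forward calls run through TorchScript.
out = scripted_mlp(torch.randn(5, 8))   # Same call signature as the eager module.
print(out.shape)                        # torch.Size([5, 2])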
Example #7
 def __init__(
         self,
         RndCls,  # type: BaseFeatureExtractor
         rnd_model_kwargs):
     """
     Constructs target and distillation model. Assumes identical architectures.
     Also constructs normalization models for observation and intrinsic rewards.
     """
     super().__init__()
     self.target_model = RndCls(**rnd_model_kwargs)
     self.distill_model = RndCls(**rnd_model_kwargs)
     rnd_param_init_(self.target_model)
     rnd_param_init_(self.distill_model)
     self.obs_rms = RunningMeanStdModel(
         wrap(rnd_model_kwargs["input_shape"])
     )  # Requires RndCls takes input_shape
     self.int_rff = None  # Intrinsic reward forward filter (this stores a discounted sum of non-episodic rewards)
     self.int_rff_rms = RunningMeanStdModel(torch.Size(
         [1]))  # Intrinsic reward forward filter RMS model
     self.update_norm = True  # Default to updating obs and int_rew normalization models
Example #8
 def __init__(
     self,
     observation_shape,
     action_size,
     hidden_sizes=None,  # None for default (see below).
     hidden_nonlinearity=torch.nn.Tanh,  # Module form.
     mu_nonlinearity=torch.nn.Tanh,  # Module form.
     init_log_std=0.,
     normalize_observation=False,
     norm_obs_clip=10,
     norm_obs_var_clip=1e-6,
 ):
     """Instantiate neural net modules according to inputs."""
     super().__init__()
     self._obs_ndim = len(observation_shape)
     input_size = int(np.prod(observation_shape))
     hidden_sizes = hidden_sizes or [64, 64]
     mu_mlp = MlpModel(
         input_size=input_size,
         hidden_sizes=hidden_sizes,
         output_size=action_size,
         nonlinearity=hidden_nonlinearity,
     )
     if mu_nonlinearity is not None:
         self.mu = torch.nn.Sequential(mu_mlp, mu_nonlinearity())
     else:
         self.mu = mu_mlp
     self.v = MlpModel(
         input_size=input_size,
         hidden_sizes=hidden_sizes,
         output_size=1,
         nonlinearity=hidden_nonlinearity,
     )
     self.log_std = torch.nn.Parameter(init_log_std *
                                       torch.ones(action_size))
     if normalize_observation:
         self.obs_rms = RunningMeanStdModel(observation_shape)
         self.norm_obs_clip = norm_obs_clip
         self.norm_obs_var_clip = norm_obs_var_clip
     self.normalize_observation = normalize_observation
Example #9
    def __init__(
        self,
        image_shape,
        action_size,
        hidden_sizes=512,
        stop_conv_grad=False,
        channels=None,  # Defaults below.
        kernel_sizes=None,
        strides=None,
        paddings=None,
        kiaming_init=True,
        normalize_conv_out=False,
    ):
        super().__init__()
        c, h, w = image_shape
        self.conv = Conv2dModel(
            in_channels=c,
            channels=channels or [32, 64, 64],
            kernel_sizes=kernel_sizes or [8, 4, 3],
            strides=strides or [4, 2, 1],
            paddings=paddings,
        )
        self._conv_out_size = self.conv.conv_out_size(h=h, w=w)
        self.pi_v_mlp = MlpModel(
            input_size=self._conv_out_size,
            hidden_sizes=hidden_sizes,
            output_size=action_size + 1,
        )
        if kiaming_init:
            self.apply(weight_init)

        self.stop_conv_grad = stop_conv_grad
        logger.log("Model stopping gradient at CONV." if stop_conv_grad else
                   "Model using gradients on all parameters.")
        if normalize_conv_out:
            # Haven't seen this make a difference yet.
            logger.log("Model normalizing conv output across all pixels.")
            self.conv_rms = RunningMeanStdModel((1, ))
            self.var_clip = 1e-6
        self.normalize_conv_out = normalize_conv_out
Example #10
class AtariPgModel(torch.nn.Module):
    """Can feed in conv and/or fc1 layer from pre-trained model, or have it
    initialize new ones (if initializing new, must provide image_shape)."""
    def __init__(
        self,
        image_shape,
        action_size,
        hidden_sizes=512,
        stop_conv_grad=False,
        channels=None,  # Defaults below.
        kernel_sizes=None,
        strides=None,
        paddings=None,
        kiaming_init=True,
        normalize_conv_out=False,
    ):
        super().__init__()
        c, h, w = image_shape
        self.conv = Conv2dModel(
            in_channels=c,
            channels=channels or [32, 64, 64],
            kernel_sizes=kernel_sizes or [8, 4, 3],
            strides=strides or [4, 2, 1],
            paddings=paddings,
        )
        self._conv_out_size = self.conv.conv_out_size(h=h, w=w)
        self.pi_v_mlp = MlpModel(
            input_size=self._conv_out_size,
            hidden_sizes=hidden_sizes,
            output_size=action_size + 1,
        )
        if kiaming_init:
            self.apply(weight_init)

        self.stop_conv_grad = stop_conv_grad
        logger.log("Model stopping gradient at CONV." if stop_conv_grad else
                   "Model using gradients on all parameters.")
        if normalize_conv_out:
            # Haven't seen this make a difference yet.
            logger.log("Model normalizing conv output across all pixels.")
            self.conv_rms = RunningMeanStdModel((1, ))
            self.var_clip = 1e-6
        self.normalize_conv_out = normalize_conv_out

    def forward(self, observation, prev_action, prev_reward):
        if observation.dtype == torch.uint8:
            img = observation.type(torch.float)
            img = img.mul_(1.0 / 255)
        else:
            img = observation

        lead_dim, T, B, img_shape = infer_leading_dims(img, 3)
        conv = self.conv(img.view(T * B, *img_shape))

        if self.stop_conv_grad:
            conv = conv.detach()
        if self.normalize_conv_out:
            conv_var = self.conv_rms.var
            conv_var = torch.clamp(conv_var, min=self.var_clip)
            # stddev of uniform [a,b] = (b-a)/sqrt(12), 1/sqrt(12)~0.29
            # then allow [0, 10]?
            conv = torch.clamp(0.29 * conv / conv_var.sqrt(), 0, 10)

        pi_v = self.pi_v_mlp(conv.view(T * B, -1))
        pi = F.softmax(pi_v[:, :-1], dim=-1)
        v = pi_v[:, -1]

        pi, v, conv = restore_leading_dims((pi, v, conv), lead_dim, T, B)
        return pi, v, conv

    def update_conv_rms(self, observation):
        if self.normalize_conv_out:
            with torch.no_grad():
                if observation.dtype == torch.uint8:
                    img = observation.type(torch.float)
                    img = img.mul_(1.0 / 255)
                else:
                    img = observation
                lead_dim, T, B, img_shape = infer_leading_dims(img, 3)
                conv = self.conv(img.view(T * B, *img_shape))
                self.conv_rms.update(conv.view(-1, 1))

    def parameters(self):
        if not self.stop_conv_grad:
            yield from self.conv.parameters()
        yield from self.pi_v_mlp.parameters()

    def named_parameters(self):
        if not self.stop_conv_grad:
            yield from self.conv.named_parameters()
        yield from self.pi_v_mlp.named_parameters()

    @property
    def conv_out_size(self):
        return self._conv_out_size
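The forward method above accepts observations with leading dims [T,B], [B], or none, via rlpyt's infer_leading_dims / restore_leading_dims helpers. A rough, self-contained sketch of what that flattening amounts to (not the library's implementation):

import math
import torch

def flatten_leading(x, obs_ndim):
    """Collapse any leading [T,B] / [B] dims so modules see a flat [T*B, ...] batch."""
    lead_shape = x.shape[:x.dim() - obs_ndim]           # (), (B,) or (T, B)
    return x.reshape(math.prod(lead_shape), *x.shape[x.dim() - obs_ndim:]), lead_shape

def restore_leading(x, lead_shape):
    """Put the original leading dims back on a [T*B, ...] result."""
    return x.reshape(*lead_shape, *x.shape[1:])

obs = torch.rand(7, 4, 3, 84, 84)                       # [T=7, B=4] image observations
flat, lead = flatten_leading(obs, obs_ndim=3)           # -> [28, 3, 84, 84]
out = flat.flatten(1).mean(dim=1, keepdim=True)         # stand-in for conv + head
print(restore_leading(out, lead).shape)                 # torch.Size([7, 4, 1])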
Example #11
class RndBonusModule(SelfSupervisedModule):
    """
    Random Network Distillation Module. Produces intrinsic
    rewards as the prediction error between the feature
    embeddings from a target and distilled model, both
    randomly initialized.
    """
    def __init__(
            self,
            RndCls,  # type: BaseFeatureExtractor
            rnd_model_kwargs):
        """
        Constructs target and distillation model. Assumes identical architectures.
        Also constructs normalization models for observation and intrinsic rewards.
        """
        super().__init__()
        self.target_model = RndCls(**rnd_model_kwargs)
        self.distill_model = RndCls(**rnd_model_kwargs)
        rnd_param_init_(self.target_model)
        rnd_param_init_(self.distill_model)
        self.obs_rms = RunningMeanStdModel(
            wrap(rnd_model_kwargs["input_shape"])
        )  # Requires RndCls takes input_shape
        self.int_rff = None  # Intrinsic reward forward filter (this stores a discounted sum of non-episodic rewards)
        self.int_rff_rms = RunningMeanStdModel(torch.Size(
            [1]))  # Intrinsic reward forward filter RMS model
        self.update_norm = True  # Default to updating obs and int_rew normalization models

    def normalize_obs(self, obs):
        """
        Normalizes observations according to specifications in
        https://arxiv.org/abs/1810.12894. This is necessary since the target
        network is fixed and cannot adjust to varying environments.

        This model should be initialized in the sampler by running
        a small number of observations through it.

        WARNING: If observations are already normalized using
        a different model / formulation, this will cause issues
        if this model is initialized on raw obs in the sampler.
        """
        obs = obs.to(
            dtype=torch.float32)  # Obs may be byte tensor (e.g. 8-bit pixels)
        if self.update_norm:
            self.obs_rms.update(obs)
        obs = (obs - self.obs_rms.mean) / torch.sqrt(self.obs_rms.var + 1e-5)
        obs = torch.clamp(obs, min=-5, max=5)
        return obs

    def normalize_int_rew(self, int_rew, gamma=0.99):
        """
        Normalizes intrinsic rewards according to specifications in
        https://arxiv.org/abs/1810.12894. This is done to remove the
        need to search for optimal intrinsic reward scaling factors in between
        different environments.

        This model is *not* expected to be initialized, if following the authors'
        implementation.
        """
        # Update rewards forward filter and gather batch of results
        rff_batch = torch.empty_like(int_rew)
        int_rff_prior = self.int_rff
        for i, rews in enumerate(int_rew):
            if self.int_rff is None:
                self.int_rff = rews
            else:
                self.int_rff = self.int_rff * gamma + rews
            rff_batch[i, :] = self.int_rff

        # Update intrinsic rff rms for int rew normalization if updating norm models
        if self.update_norm:
            batch_size = rff_batch.numel()
            self.int_rff_rms.update(rff_batch.view((batch_size, 1)))
        else:  # Reset rff prior state if not updating norm models
            self.int_rff = int_rff_prior

        # Normalize by dividing out running std of rff values
        return int_rew / torch.sqrt(self.int_rff_rms.var)

    def forward(self, next_obs):
        """
        Runs forward pass for distillation and target models, producing intrinsic
        bonuses and distillation model loss. Note the self-supervised losses of
        the models are unused (and are presumably placeholders with a value of zero).
        """
        next_obs = self.normalize_obs(next_obs)
        distill_feat, _ = self.distill_model(next_obs)
        target_feat, _ = self.target_model(next_obs)
        pred_errors = torch.mean((distill_feat - target_feat.detach())**2,
                                 dim=-1)  # Maintains batch dimension
        distill_loss = torch.mean(pred_errors)  # Reduces batch dimension
        int_rew = pred_errors.detach()
        return int_rew, distill_loss
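A toy, self-contained sketch of the RND bonus the module above computes: feature prediction error between a frozen, randomly initialized target network and a trainable distillation network. The tiny linear nets here are placeholders, not the BaseFeatureExtractor assumed by RndCls.

import torch

obs = torch.randn(32, 8)                       # batch of (already normalized) observations
target = torch.nn.Linear(8, 16)                # randomly initialized, then frozen
for p in target.parameters():
    p.requires_grad_(False)
distill = torch.nn.Linear(8, 16)               # trained to match the target features

pred_err = ((distill(obs) - target(obs)) ** 2).mean(dim=-1)  # per-sample bonus, shape [32]
int_rew = pred_err.detach()                    # intrinsic reward passed to the RL algorithm
distill_loss = pred_err.mean()                 # scalar loss used to train the distill net
distill_loss.backward()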
Example #12
class CppoModel(torch.nn.Module):

    def __init__(
            self,
            observation_shape,
            action_size,
            hidden_sizes=None,
            lstm_size=None,
            lstm_skip=True,
            constraint=True,
            hidden_nonlinearity="tanh",  # or "relu"
            mu_nonlinearity="tanh",
            init_log_std=0.,
            normalize_observation=True,
            var_clip=1e-6,
            ):
        super().__init__()
        if hidden_nonlinearity == "tanh":  # So these can be strings in config file.
            hidden_nonlinearity = torch.nn.Tanh
        elif hidden_nonlinearity == "relu":
            hidden_nonlinearity = torch.nn.ReLU
        else:
            raise ValueError(f"Unrecognized hidden_nonlinearity string: {hidden_nonlinearity}")
        if mu_nonlinearity == "tanh":  # So these can be strings in config file.
            mu_nonlinearity = torch.nn.Tanh
        elif mu_nonlinearity == "relu":
            mu_nonlinearity = torch.nn.ReLU
        else:
            raise ValueError(f"Unrecognized mu_nonlinearity string: {mu_nonlinearity}")
        self._obs_ndim = len(observation_shape)
        input_size = int(np.prod(observation_shape))
        self.body = MlpModel(
            input_size=input_size,
            hidden_sizes=hidden_sizes or [256, 256],
            nonlinearity=hidden_nonlinearity,
        )
        last_size = self.body.output_size
        if lstm_size:
            lstm_input_size = last_size + action_size + 1
            self.lstm = torch.nn.LSTM(lstm_input_size, lstm_size)
            last_size = lstm_size
        else:
            self.lstm = None
        mu_linear = torch.nn.Linear(last_size, action_size)
        if mu_nonlinearity is not None:
            self.mu = torch.nn.Sequential(mu_linear, mu_nonlinearity())
        else:
            self.mu = mu_linear
        self.value = torch.nn.Linear(last_size, 1)
        if constraint:
            self.constraint = torch.nn.Linear(last_size, 1)
        else:
            self.constraint = None
        self.log_std = torch.nn.Parameter(init_log_std *
            torch.ones(action_size))
        self._lstm_skip = lstm_skip
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.var_clip = var_clip
        self.normalize_observation = normalize_observation

    def forward(self, observation, prev_action, prev_reward, init_rnn_state=None):
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_ndim)
        if self.normalize_observation:
            obs_var = self.obs_rms.var
            if self.var_clip is not None:
                obs_var = torch.clamp(obs_var, min=self.var_clip)
            observation = torch.clamp((observation - self.obs_rms.mean) /
                obs_var.sqrt(), -10, 10)
        fc_x = self.body(observation.view(T * B, -1))
        if self.lstm is not None:
            lstm_inputs = [fc_x, prev_action, prev_reward]
            lstm_input = torch.cat([x.view(T, B, -1) for x in lstm_inputs],
                dim=2)
            # lstm_input = torch.cat([
            #     fc_x.view(T, B, -1),
            #     prev_action.view(T, B, -1),
            #     prev_reward.view(T, B, -1),
            #     ], dim=2)
            init_rnn_state = None if init_rnn_state is None else tuple(init_rnn_state)
            lstm_out, (hn, cn) = self.lstm(lstm_input, init_rnn_state)
            lstm_out = lstm_out.view(T * B, -1)
            if self._lstm_skip:
                fc_x = fc_x + lstm_out
            else:
                fc_x = lstm_out

        mu = self.mu(fc_x)
        log_std = self.log_std.repeat(T * B, 1)
        v = self.value(fc_x).squeeze(-1)
        mu, log_std, v = restore_leading_dims((mu, log_std, v), lead_dim, T, B)

        if self.constraint is None:
            value = ValueInfo(value=v, c_value=None)
        else:
            c = self.constraint(fc_x).squeeze(-1)
            c = restore_leading_dims(c, lead_dim, T, B)
            value = ValueInfo(value=v, c_value=c)

        outputs = (mu, log_std, value)
        if self.lstm is not None:
            outputs += (RnnState(h=hn, c=cn),)

        return outputs

    def update_obs_rms(self, observation):
        if not self.normalize_observation:
            return
        self.obs_rms.update(observation)
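A self-contained sketch of how the LSTM input above is assembled: MLP features, previous action, and previous reward are reshaped to [T, B, -1] and concatenated along the feature dimension (the scalar reward accounts for the "+ 1" in the LSTM input size). Shapes are illustrative.

import torch

T, B, feat, act = 5, 4, 256, 6
fc_x = torch.randn(T * B, feat)                # MLP output, flat [T*B, H]
prev_action = torch.randn(T, B, act)
prev_reward = torch.randn(T, B)

lstm = torch.nn.LSTM(feat + act + 1, 128)
lstm_input = torch.cat([
    fc_x.view(T, B, -1),
    prev_action.view(T, B, -1),
    prev_reward.view(T, B, 1),                 # reward contributes the "+ 1" input feature
], dim=2)                                      # -> [T, B, feat + act + 1]
lstm_out, (hn, cn) = lstm(lstm_input)          # init state defaults to zeros
print(lstm_out.shape, hn.shape)                # [5, 4, 128], [1, 4, 128]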
Example #13
 def __init__(
         self,
         observation_shape,
         action_size,
         option_size,
         hidden_sizes=None,  # None for default (see below).
         hidden_nonlinearity=torch.nn.Tanh,  # Module form.
         mu_nonlinearity=torch.nn.Tanh,  # Module form.
         init_log_std=0.,
         normalize_observation=True,
         norm_obs_clip=10,
         norm_obs_var_clip=1e-6,
         baselines_init=True,  # Orthogonal initialization of sqrt(2) until last layer, then 0.01 for policy, 1 for value
         use_interest=False,  # IOC sigmoid interest functions
         ):
     """Instantiate neural net modules according to inputs."""
     super().__init__()
     self._obs_ndim = len(observation_shape)
     input_size = int(np.prod(observation_shape))
     hidden_sizes = hidden_sizes or [64, 64]
     inits_mu = inits_v = None
     if baselines_init:
         inits_mu = (np.sqrt(2), 0.01)
         inits_v = (np.sqrt(2), 1.)
     # Body for intra-option policy mean
     mu_mlp = MlpModel(
         input_size=input_size,
         hidden_sizes=hidden_sizes,
         output_size=None,
         nonlinearity=hidden_nonlinearity,
         inits=inits_mu
     )
      # Intra-option policy. Outputs tanh mu if mu_nonlinearity is given, else an unactivated linear mu; also outputs log_std.
     self.mu = torch.nn.Sequential(mu_mlp, ContinuousIntraOptionPolicy(input_size=mu_mlp.output_size,
                                                                       num_options=option_size,
                                                                       num_actions=action_size,
                                                                       ortho_init=baselines_init,
                                                                       ortho_init_value=inits_mu[-1],
                                                                       init_log_std=init_log_std,
                                                                       mu_nonlinearity=mu_nonlinearity))
     # Option value. Pure linear
     self.q = MlpModel(
         input_size=input_size,
         hidden_sizes=hidden_sizes,
         output_size=option_size,
         nonlinearity=hidden_nonlinearity,
         inits=inits_v
     )
     # Option termination. MLP with sigmoid at end
     self.beta = torch.nn.Sequential(MlpModel(
         input_size=input_size,
         hidden_sizes=hidden_sizes,
         output_size=option_size,
         nonlinearity=hidden_nonlinearity,
         inits=inits_v
     ), torch.nn.Sigmoid())
     # self.log_std = torch.nn.Parameter(init_log_std * torch.ones(action_size))
     # Softmax policy over options
     self.pi_omega = torch.nn.Sequential(MlpModel(
         input_size=input_size,
         hidden_sizes=hidden_sizes,
         output_size=option_size,
         nonlinearity=hidden_nonlinearity,
         inits=inits_v
     ), torch.nn.Softmax(-1))
     # Per-option sigmoid interest functions
     self.pi_omega_I = torch.nn.Sequential(MlpModel(
         input_size=input_size,
         hidden_sizes=hidden_sizes,
         output_size=option_size,
         nonlinearity=hidden_nonlinearity,
         inits=inits_v
     ), torch.nn.Sigmoid()) if use_interest else Dummy(option_size)
     if normalize_observation:
         self.obs_rms = RunningMeanStdModel(observation_shape)
         self.norm_obs_clip = norm_obs_clip
         self.norm_obs_var_clip = norm_obs_var_clip
     self.normalize_observation = normalize_observation
     self.use_interest = use_interest
Example #14
class MujocoFfModel(torch.nn.Module):
    """
    Model commonly used in Mujoco locomotion agents: an MLP which outputs
    distribution means, separate parameter for learned log_std, and separate
    MLP for state-value estimate.
    """
    def __init__(
        self,
        observation_shape,
        action_size,
        hidden_sizes=None,  # None for default (see below).
        hidden_nonlinearity=torch.nn.Tanh,  # Module form.
        mu_nonlinearity=torch.nn.Tanh,  # Module form.
        init_log_std=0.,
        normalize_observation=True,
        norm_obs_clip=10,
        norm_obs_var_clip=1e-6,
        baselines_init=True,  # Orthogonal initialization of sqrt(2) until last layer, then 0.01 for policy, 1 for value
    ):
        """Instantiate neural net modules according to inputs."""
        super().__init__()
        self._obs_ndim = len(observation_shape)
        input_size = int(np.prod(observation_shape))
        hidden_sizes = hidden_sizes or [64, 64]
        inits_mu = inits_v = None
        if baselines_init:
            inits_mu = (np.sqrt(2), 0.01)
            inits_v = (np.sqrt(2), 1.)
        mu_mlp = torch.jit.script(
            MlpModel(input_size=input_size,
                     hidden_sizes=hidden_sizes,
                     output_size=action_size,
                     nonlinearity=hidden_nonlinearity,
                     inits=inits_mu))
        if mu_nonlinearity is not None:
            self.mu = torch.nn.Sequential(mu_mlp, mu_nonlinearity())
        else:
            self.mu = mu_mlp
        self.v = torch.jit.script(
            MlpModel(input_size=input_size,
                     hidden_sizes=hidden_sizes,
                     output_size=1,
                     nonlinearity=hidden_nonlinearity,
                     inits=inits_v))
        self.log_std = torch.nn.Parameter(init_log_std *
                                          torch.ones(action_size))
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.norm_obs_clip = norm_obs_clip
            self.norm_obs_var_clip = norm_obs_var_clip
        self.normalize_observation = normalize_observation

    def forward(self, observation, prev_action, prev_reward):
        """
        Compute mean, log_std, and value estimate from input state. Infers
        leading dimensions of input: can be [T,B], [B], or []; provides
        returns with same leading dims.  Intermediate feedforward layers
        process as [T*B,H], with T=1,B=1 when not given. Used both in sampler
        and in algorithm (both via the agent).
        """
        # Infer (presence of) leading dimensions: [T,B], [B], or [].
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_ndim)

        if self.normalize_observation:
            obs_var = self.obs_rms.var
            if self.norm_obs_var_clip is not None:
                obs_var = torch.clamp(obs_var, min=self.norm_obs_var_clip)
            observation = torch.clamp(
                (observation - self.obs_rms.mean) / obs_var.sqrt(),
                -self.norm_obs_clip, self.norm_obs_clip)

        obs_flat = observation.view(T * B, -1)
        mu = self.mu(obs_flat)
        v = self.v(obs_flat).squeeze(-1)
        log_std = self.log_std.repeat(T * B, 1)

        # Restore leading dimensions: [T,B], [B], or [], as input.
        mu, log_std, v = restore_leading_dims((mu, log_std, v), lead_dim, T, B)

        return mu, log_std, v

    def update_obs_rms(self, observation):
        if self.normalize_observation:
            self.obs_rms.update(observation)
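A minimal sketch of the observation normalization applied in forward: a running mean/variance model with a variance floor (norm_obs_var_clip) and symmetric clipping (norm_obs_clip). TinyRunningMeanStd below is an illustrative stand-in for RunningMeanStdModel, not its actual implementation.

import torch

class TinyRunningMeanStd:
    """Illustrative stand-in for RunningMeanStdModel (batch-wise moment updates)."""
    def __init__(self, shape):
        self.mean = torch.zeros(shape)
        self.var = torch.ones(shape)
        self.count = 1e-4

    def update(self, x):                                # x: [N, *shape]
        b_mean, b_var, b_n = x.mean(0), x.var(0, unbiased=False), x.shape[0]
        delta = b_mean - self.mean
        tot = self.count + b_n
        new_mean = self.mean + delta * b_n / tot
        m2 = self.var * self.count + b_var * b_n + delta ** 2 * self.count * b_n / tot
        self.mean, self.var, self.count = new_mean, m2 / tot, tot

rms = TinyRunningMeanStd((11,))
obs = torch.randn(64, 11) * 3 + 5
rms.update(obs)
obs_var = torch.clamp(rms.var, min=1e-6)                # norm_obs_var_clip
norm_obs = torch.clamp((obs - rms.mean) / obs_var.sqrt(), -10, 10)  # norm_obs_clip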
Example #15
class RefactoredMujocoOcFfModel(torch.nn.Module):
    """
    Model commonly used in Mujoco locomotion agents: an MLP which outputs
    distribution means, separate parameter for learned log_std, and separate
    MLPs for state-option-value estimate, termination probabilities. Policy over options
    """

    def __init__(
            self,
            observation_shape,
            action_size,
            option_size,
            hidden_sizes=None,  # None for default (see below).
            hidden_nonlinearity=torch.nn.Tanh,  # Module form.
            mu_nonlinearity=torch.nn.Tanh,  # Module form.
            init_log_std=0.,
            normalize_observation=True,
            norm_obs_clip=10,
            norm_obs_var_clip=1e-6,
            baselines_init=True,  # Orthogonal initialization of sqrt(2) until last layer, then 0.01 for policy, 1 for value
            use_interest=False,  # IOC sigmoid interest functions
            use_diversity=False,  # TDEOC q entropy output
            use_attention=False,
            ):
        """Instantiate neural net modules according to inputs."""
        super().__init__()
        from functools import partial
        self._obs_ndim = len(observation_shape)
        input_size = int(np.prod(observation_shape))
        hidden_sizes = hidden_sizes or [64, 64]
        inits_mu = inits_v = None
        if baselines_init:
            inits_mu = (np.sqrt(2), 0.01)
            inits_v = (np.sqrt(2), 1.)
        body_mlp_class = partial(MlpModel, hidden_sizes=hidden_sizes, output_size=None, nonlinearity=hidden_nonlinearity, inits=inits_v)
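        # Note: indexing inits_v / inits_mu below assumes baselines_init=True (both are None otherwise).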
        self.model = OptionCriticHead_IndependentPreprocessor(
            input_size=input_size,
            input_module_class=body_mlp_class,
            output_size=action_size,
            option_size=option_size,
            intra_option_policy='continuous',
            intra_option_kwargs={'init_log_std': init_log_std, 'mu_nonlinearity': mu_nonlinearity},
            input_module_kwargs={},
            use_interest=use_interest,
            use_diversity=use_diversity,
            use_attention=use_attention,
            baselines_init=baselines_init,
            orthogonal_init_base=inits_v[1],
            orthogonal_init_pol=inits_mu[1]
        )
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.norm_obs_clip = norm_obs_clip
            self.norm_obs_var_clip = norm_obs_var_clip
        self.normalize_observation = normalize_observation
        self.use_interest = use_interest
        self.use_diversity = use_diversity
        self.use_attention = use_attention

    def forward(self, observation, prev_action, prev_reward):
        """
        Compute mean, log_std, q-value, and termination estimates from input state. Infers
        leading dimensions of input: can be [T,B], [B], or []; provides
        returns with same leading dims.  Intermediate feedforward layers
        process as [T*B,H], with T=1,B=1 when not given. Used both in sampler
        and in algorithm (both via the agent).
        """
        # Infer (presence of) leading dimensions: [T,B], [B], or [].
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_ndim)
        if self.normalize_observation:
            obs_var = self.obs_rms.var
            if self.norm_obs_var_clip is not None:
                obs_var = torch.clamp(obs_var, min=self.norm_obs_var_clip)
            observation = torch.clamp((observation - self.obs_rms.mean) /
                obs_var.sqrt(), -self.norm_obs_clip, self.norm_obs_clip)

        obs_flat = observation.view(T * B, -1)
        (mu, logstd), beta, q, pi_I, q_ent = self.model(obs_flat)
        log_std = logstd.repeat(T * B, 1, 1)
        # Restore leading dimensions: [T,B], [B], or [], as input.
        mu, log_std, q, beta, pi, q_ent = restore_leading_dims((mu, log_std, q, beta, pi_I, q_ent), lead_dim, T, B)
        return mu, log_std, beta, q, pi

    def update_obs_rms(self, observation):
        if self.normalize_observation:
            self.obs_rms.update(observation)
Example #16
class MujocoOCFfModel(torch.nn.Module):
    """
    Model commonly used in Mujoco locomotion agents: an MLP which outputs
    distribution means, separate parameter for learned log_std, and separate
    MLPs for state-option-value estimate, termination probabilities. Policy over options
    """

    def __init__(
            self,
            observation_shape,
            action_size,
            option_size,
            hidden_sizes=None,  # None for default (see below).
            hidden_nonlinearity=torch.nn.Tanh,  # Module form.
            mu_nonlinearity=torch.nn.Tanh,  # Module form.
            init_log_std=0.,
            normalize_observation=True,
            norm_obs_clip=10,
            norm_obs_var_clip=1e-6,
            baselines_init=True,  # Orthogonal initialization of sqrt(2) until last layer, then 0.01 for policy, 1 for value
            use_interest=False,  # IOC sigmoid interest functions
            ):
        """Instantiate neural net modules according to inputs."""
        super().__init__()
        self._obs_ndim = len(observation_shape)
        input_size = int(np.prod(observation_shape))
        hidden_sizes = hidden_sizes or [64, 64]
        inits_mu = inits_v = None
        if baselines_init:
            inits_mu = (np.sqrt(2), 0.01)
            inits_v = (np.sqrt(2), 1.)
        # Body for intra-option policy mean
        mu_mlp = MlpModel(
            input_size=input_size,
            hidden_sizes=hidden_sizes,
            output_size=None,
            nonlinearity=hidden_nonlinearity,
            inits=inits_mu
        )
        # Intra-option policy. Outputs tanh mu if mu_nonlinearity is given, else an unactivated linear mu; also outputs log_std.
        self.mu = torch.nn.Sequential(mu_mlp, ContinuousIntraOptionPolicy(input_size=mu_mlp.output_size,
                                                                          num_options=option_size,
                                                                          num_actions=action_size,
                                                                          ortho_init=baselines_init,
                                                                          ortho_init_value=inits_mu[-1],
                                                                          init_log_std=init_log_std,
                                                                          mu_nonlinearity=mu_nonlinearity))
        # Option value. Pure linear
        self.q = MlpModel(
            input_size=input_size,
            hidden_sizes=hidden_sizes,
            output_size=option_size,
            nonlinearity=hidden_nonlinearity,
            inits=inits_v
        )
        # Option termination. MLP with sigmoid at end
        self.beta = torch.nn.Sequential(MlpModel(
            input_size=input_size,
            hidden_sizes=hidden_sizes,
            output_size=option_size,
            nonlinearity=hidden_nonlinearity,
            inits=inits_v
        ), torch.nn.Sigmoid())
        # self.log_std = torch.nn.Parameter(init_log_std * torch.ones(action_size))
        # Softmax policy over options
        self.pi_omega = torch.nn.Sequential(MlpModel(
            input_size=input_size,
            hidden_sizes=hidden_sizes,
            output_size=option_size,
            nonlinearity=hidden_nonlinearity,
            inits=inits_v
        ), torch.nn.Softmax(-1))
        # Per-option sigmoid interest functions
        self.pi_omega_I = torch.nn.Sequential(MlpModel(
            input_size=input_size,
            hidden_sizes=hidden_sizes,
            output_size=option_size,
            nonlinearity=hidden_nonlinearity,
            inits=inits_v
        ), torch.nn.Sigmoid()) if use_interest else Dummy(option_size)
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.norm_obs_clip = norm_obs_clip
            self.norm_obs_var_clip = norm_obs_var_clip
        self.normalize_observation = normalize_observation
        self.use_interest = use_interest

    def forward(self, observation, prev_action, prev_reward):
        """
        Compute mean, log_std, q-value, and termination estimates from input state. Infers
        leading dimensions of input: can be [T,B], [B], or []; provides
        returns with same leading dims.  Intermediate feedforward layers
        process as [T*B,H], with T=1,B=1 when not given. Used both in sampler
        and in algorithm (both via the agent).
        """
        # Infer (presence of) leading dimensions: [T,B], [B], or [].
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_ndim)
        if self.normalize_observation:
            obs_var = self.obs_rms.var
            if self.norm_obs_var_clip is not None:
                obs_var = torch.clamp(obs_var, min=self.norm_obs_var_clip)
            observation = torch.clamp((observation - self.obs_rms.mean) /
                obs_var.sqrt(), -self.norm_obs_clip, self.norm_obs_clip)

        obs_flat = observation.view(T * B, -1)
        mu, logstd = self.mu(obs_flat)
        q = self.q(obs_flat)
        log_std = logstd.repeat(T * B, 1, 1)
        beta = self.beta(obs_flat)
        pi = self.pi_omega(obs_flat)
        I = self.pi_omega_I(obs_flat)
        # Restore leading dimensions: [T,B], [B], or [], as input.
        mu, log_std, q, beta, pi, I = restore_leading_dims((mu, log_std, q, beta, pi, I), lead_dim, T, B)
        pi = pi * I  # Torch multinomial will normalize
        return mu, log_std, beta, q, pi

    def update_obs_rms(self, observation):
        if self.normalize_observation:
            self.obs_rms.update(observation)
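The final pi = pi * I above weights the softmax policy over options by the per-option interest; because torch.multinomial renormalizes its (nonnegative) weights, the agent can sample options directly from the unnormalized product. A toy sketch:

import torch

pi_omega = torch.softmax(torch.randn(4, 3), dim=-1)    # policy over 3 options, batch of 4
interest = torch.sigmoid(torch.randn(4, 3))            # per-option interest in (0, 1)
weighted = pi_omega * interest                         # unnormalized option preferences
option = torch.multinomial(weighted, num_samples=1)    # multinomial renormalizes each row
print(option.squeeze(-1))                              # sampled option index per batch entry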
Example #17
class ModelPgNNContinuousSelective(torch.nn.Module):
    def __init__(
        self,
        observation_shape,
        action_size,
        policy_hidden_sizes=None,
        policy_hidden_nonlinearity=torch.nn.Tanh,
        value_hidden_sizes=None,
        value_hidden_nonlinearity=torch.nn.Tanh,
        init_log_std=0.,
        min_std=0.,
        normalize_observation=False,
        norm_obs_clip=10,
        norm_obs_var_clip=1e-6,
        policy_inputs_indices=None,
    ):
        super().__init__()
        self.min_std = min_std
        self._obs_ndim = len(observation_shape)
        input_size = int(np.prod(observation_shape))
        self.policy_inputs_indices = policy_inputs_indices if policy_inputs_indices is not None else list(
            range(input_size))

        policy_hidden_sizes = [
            400, 300
        ] if policy_hidden_sizes is None else policy_hidden_sizes
        value_hidden_sizes = [
            400, 300
        ] if value_hidden_sizes is None else value_hidden_sizes
        self.mu = MlpModel(input_size=len(self.policy_inputs_indices),
                           hidden_sizes=policy_hidden_sizes,
                           output_size=action_size,
                           nonlinearity=policy_hidden_nonlinearity)
        self.v = MlpModel(
            input_size=input_size,
            hidden_sizes=value_hidden_sizes,
            output_size=1,
            nonlinearity=value_hidden_nonlinearity,
        )
        self._log_std = torch.nn.Parameter(
            (np.log(np.exp(init_log_std) - self.min_std)) *
            torch.ones(action_size))
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.norm_obs_clip = norm_obs_clip
            self.norm_obs_var_clip = norm_obs_var_clip
        self.normalize_observation = normalize_observation

    @property
    def log_std(self):
        return (self._log_std.exp() + self.min_std).log()

    def forward(self, observation, prev_action, prev_reward):
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_ndim)
        if self.normalize_observation:
            obs_var = self.obs_rms.var
            if self.norm_obs_var_clip is not None:
                obs_var = torch.clamp(obs_var, min=self.norm_obs_var_clip)
            observation = torch.clamp(
                (observation - self.obs_rms.mean) / obs_var.sqrt(),
                -self.norm_obs_clip, self.norm_obs_clip)
        obs_flat = observation.view(T * B, -1)
        mu = self.mu(obs_flat[:, self.policy_inputs_indices])
        v = self.v(obs_flat).squeeze(-1)
        log_std = self.log_std.repeat(T * B, 1)

        # Restore leading dimensions: [T,B], [B], or [], as input.
        mu, log_std, v = restore_leading_dims((mu, log_std, v), lead_dim, T, B)

        return mu, log_std, v

    def update_obs_rms(self, observation):
        if self.normalize_observation:
            self.obs_rms.update(observation)
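The _log_std parameterization above enforces a standard-deviation floor: the parameter starts at log(exp(init_log_std) - min_std), and the log_std property returns log(exp(_log_std) + min_std), so the effective std equals exp(init_log_std) at init and never drops below min_std. A quick numerical check:

import numpy as np
import torch

init_log_std, min_std = 0.0, 0.1
raw = torch.nn.Parameter(torch.full((3,), float(np.log(np.exp(init_log_std) - min_std))))
log_std = (raw.exp() + min_std).log()                # what the log_std property computes
print(log_std.exp())                                 # ~1.0 at init, i.e. exp(init_log_std)
print(((raw - 100.0).exp() + min_std).log().exp())   # as raw -> -inf, std -> min_std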
Example #18
class MujocoLstmModel(torch.nn.Module):
    """
    Recurrent model for Mujoco locomotion agents: an MLP into an LSTM which
    outputs distribution means, log_std, and state-value estimate.
    """
    def __init__(
        self,
        observation_shape,
        action_size,
        hidden_sizes=None,  # None for default (see below).
        lstm_size=256,
        nonlinearity=torch.nn.ReLU,
        normalize_observation=False,
        norm_obs_clip=10,
        norm_obs_var_clip=1e-6,
    ):
        super().__init__()
        self._obs_n_dim = len(observation_shape)
        self._action_size = action_size
        hidden_sizes = hidden_sizes or [256, 256]
        mlp_input_size = int(np.prod(observation_shape))
        self.mlp = MlpModel(
            input_size=mlp_input_size,
            hidden_sizes=hidden_sizes,
            output_size=None,
            nonlinearity=nonlinearity,
        )
        mlp_output_size = hidden_sizes[-1] if hidden_sizes else mlp_input_size
        self.lstm = torch.nn.LSTM(mlp_output_size + action_size + 1, lstm_size)
        self.head = torch.nn.Linear(lstm_size, action_size * 2 + 1)
        if normalize_observation:
            self.obs_rms = RunningMeanStdModel(observation_shape)
            self.norm_obs_clip = norm_obs_clip
            self.norm_obs_var_clip = norm_obs_var_clip
        self.normalize_observation = normalize_observation

    def forward(self, observation, prev_action, prev_reward, init_rnn_state):
        """
        Compute mean, log_std, and value estimate from input state. Infer
        leading dimensions of input: can be [T,B], [B], or []; provides
        returns with same leading dims.  Intermediate feedforward layers
        process as [T*B,H], and recurrent layers as [T,B,H], with T=1,B=1 when
        not given. Used both in sampler and in algorithm (both via the agent).
        Also returns the next RNN state.
        """
        # Infer (presence of) leading dimensions: [T,B], [B], or [].
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_n_dim)

        if self.normalize_observation:
            obs_var = self.obs_rms.var
            if self.norm_obs_var_clip is not None:
                obs_var = torch.clamp(obs_var, min=self.norm_obs_var_clip)
            observation = torch.clamp(
                (observation - self.obs_rms.mean) / obs_var.sqrt(),
                -self.norm_obs_clip,
                self.norm_obs_clip,
            )

        mlp_out = self.mlp(observation.view(T * B, -1))
        lstm_input = torch.cat(
            [
                mlp_out.view(T, B, -1),
                prev_action.view(T, B, -1),
                prev_reward.view(T, B, 1),
            ],
            dim=2,
        )
        init_rnn_state = None if init_rnn_state is None else tuple(
            init_rnn_state)
        lstm_out, (hn, cn) = self.lstm(lstm_input, init_rnn_state)
        outputs = self.head(lstm_out.view(T * B, -1))
        mu = outputs[:, :self._action_size]
        log_std = outputs[:, self._action_size:-1]
        v = outputs[:, -1]

        # Restore leading dimensions: [T,B], [B], or [], as input.
        mu, log_std, v = restore_leading_dims((mu, log_std, v), lead_dim, T, B)
        # Model should always leave B-dimension in rnn state: [N,B,H]
        next_rnn_state = RnnState(h=hn, c=cn)

        return mu, log_std, v, next_rnn_state

    def update_obs_rms(self, observation):
        if self.normalize_observation:
            self.obs_rms.update(observation)
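The single linear head above packs the policy mean, log_std, and value into one output of width action_size * 2 + 1; a small sketch of the slicing used in forward:

import torch

action_size, lstm_size = 6, 256
head = torch.nn.Linear(lstm_size, action_size * 2 + 1)
out = head(torch.randn(10, lstm_size))          # [T*B, 2 * action_size + 1]
mu = out[:, :action_size]                       # first action_size columns
log_std = out[:, action_size:-1]                # next action_size columns
v = out[:, -1]                                  # last column is the value estimate
print(mu.shape, log_std.shape, v.shape)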