Example #1
class GaussianEncoder:
    """Encodes observations into a diagonal-Gaussian latent distribution.

    Builds two MLPs over the same observation input — one producing the
    per-dimension means and one producing the per-dimension log-variances —
    and wraps them in a DiagGaussian from which latent samples are drawn.
    """

    def __init__(
        self,
        name,
        ob_dim,
        latent_dim,
        in_layer=None,
        out_activation=None,
        hidden_dims=None,  # default (64, 64, 64); None avoids a shared mutable default
        hidden_activation=tf.nn.tanh,
        weight_init=tf.contrib.layers.xavier_initializer,
        bias_init=tf.zeros_initializer,
        reuse_scope=False,
    ):
        """Build the encoder graph under variable scope `name`.

        Args:
            name: variable-scope name for all encoder variables.
            ob_dim: dimensionality of a single observation.
            latent_dim: dimensionality of the latent code.
            in_layer: optional existing tensor to use as the observation
                input; if None, a new placeholder is created.
            out_activation: activation applied to each MLP's output layer.
            hidden_dims: hidden-layer sizes for both MLPs; defaults to
                (64, 64, 64) when None.
            hidden_activation: activation for hidden layers.
            weight_init: weight initializer factory passed through to MLP.
            bias_init: bias initializer factory passed through to MLP.
            reuse_scope: passed to tf.variable_scope's `reuse` argument.
        """
        # Guard against the mutable-default-argument pitfall: the previous
        # signature used hidden_dims=[64, 64, 64], a list shared by every call.
        if hidden_dims is None:
            hidden_dims = [64, 64, 64]

        with tf.variable_scope(name, reuse=reuse_scope):
            if in_layer is None:
                self.obs = tf.placeholder(tf.float32,
                                          shape=[None, ob_dim],
                                          name='obs')
            else:
                self.obs = in_layer

            # Mean head: maps observations to latent means.
            self.mean_network = MLP('means',
                                    ob_dim,
                                    latent_dim,
                                    out_activation=out_activation,
                                    hidden_dims=hidden_dims,
                                    hidden_activation=hidden_activation,
                                    weight_init=weight_init,
                                    bias_init=bias_init,
                                    in_layer=self.obs)
            self.means = self.mean_network.layers['out']

            # Log-variance head: same architecture, separate parameters.
            self.log_var_network = MLP('log_vars',
                                       ob_dim,
                                       latent_dim,
                                       out_activation=out_activation,
                                       hidden_dims=hidden_dims,
                                       hidden_activation=hidden_activation,
                                       weight_init=weight_init,
                                       bias_init=bias_init,
                                       in_layer=self.obs)
            self.log_vars = self.log_var_network.layers['out']

            self.distribution = DiagGaussian(self.means, self.log_vars)
            self.zs = self.distribution.sample()

    def sample_encode(self, obs, global_session):
        """Sample latent codes for a batch of observations.

        Args:
            obs: batch of observations, shape [batch, ob_dim].
            global_session: tf.Session used to evaluate the sample op.

        Returns:
            Sampled latent codes, shape [batch, latent_dim].
        """
        zs = global_session.run(self.zs, feed_dict={self.obs: obs})
        return zs
Example #2
File: policies.py — Project: alvinzz/IRL
class GaussianMLPPolicy:
    """Diagonal-Gaussian MLP policy with a value head and a PPO optimizer.

    Builds a mean network over observations, a log-variance network (either
    a full MLP over observations, or — when `var_network` is False — a
    zero-hidden-layer MLP that acts as obs-independent trainable params),
    a value network, and the distribution/sampling/log-prob ops needed for
    policy-gradient training.
    """

    def __init__(
            self,
            name,
            ob_dim,
            action_dim,
            var_network=False,  # NN if true, else trainable params indep of obs
            out_activation=None,
            hidden_dims=None,  # default (64, 64); None avoids a shared mutable default
            hidden_activation=tf.nn.tanh,
            weight_init=tf.contrib.layers.xavier_initializer,
            bias_init=tf.zeros_initializer,
            optimizer=ClipPPO):
        """Build the policy graph under variable scope `name`.

        Args:
            name: variable-scope name for all policy variables.
            ob_dim: dimensionality of a single observation.
            action_dim: dimensionality of a single action.
            var_network: if True the log-variances are a full MLP of the
                observation; otherwise they come from an MLP with no hidden
                layers (effectively obs-independent trainable parameters).
            out_activation: activation on each network's output layer.
            hidden_dims: hidden-layer sizes; defaults to (64, 64) when None.
            hidden_activation: activation for hidden layers.
            weight_init: weight initializer factory passed through to MLP.
            bias_init: bias initializer factory passed through to MLP.
            optimizer: optimizer class, constructed as
                optimizer(ob_dim, action_dim, self).
        """
        # Guard against the mutable-default-argument pitfall: the previous
        # signature used hidden_dims=[64, 64], a list shared by every call.
        if hidden_dims is None:
            hidden_dims = [64, 64]

        with tf.variable_scope(name):
            self.obs = tf.placeholder(tf.float32,
                                      shape=[None, ob_dim],
                                      name='obs')

            # policy net
            self.mean_network = MLP('means',
                                    ob_dim,
                                    action_dim,
                                    out_activation=out_activation,
                                    hidden_dims=hidden_dims,
                                    hidden_activation=hidden_activation,
                                    weight_init=weight_init,
                                    bias_init=bias_init,
                                    in_layer=self.obs)
            self.means = self.mean_network.layers['out']

            # The two original branches differed only in hidden_dims:
            # [] yields a single affine layer, i.e. (near) obs-independent
            # trainable log-variance parameters.
            log_var_hidden_dims = hidden_dims if var_network else []
            self.log_var_network = MLP('log_vars',
                                       ob_dim,
                                       action_dim,
                                       out_activation=out_activation,
                                       hidden_dims=log_var_hidden_dims,
                                       hidden_activation=hidden_activation,
                                       weight_init=weight_init,
                                       bias_init=bias_init,
                                       in_layer=self.obs)
            self.log_vars = self.log_var_network.layers['out']

            self.distribution = DiagGaussian(self.means, self.log_vars)
            self.sampled_actions = self.distribution.sample()

            self.actions = tf.placeholder(tf.float32,
                                          shape=[None, action_dim],
                                          name='actions')
            self.action_log_probs = self.distribution.log_prob(self.actions)
            self.entropies = self.distribution.entropy()

            # value net
            self.value_network = MLP('values',
                                     ob_dim,
                                     1,
                                     out_activation=out_activation,
                                     hidden_dims=hidden_dims,
                                     hidden_activation=hidden_activation,
                                     weight_init=weight_init,
                                     bias_init=bias_init,
                                     in_layer=self.obs)
            self.values = self.value_network.layers['out']

            # training, PPO for now
            self.optimizer = optimizer(ob_dim, action_dim, self)

    def act(self, obs, global_session):
        """Sample actions for a batch of observations.

        Args:
            obs: batch of observations, shape [batch, ob_dim].
            global_session: tf.Session used to evaluate the sample op.

        Returns:
            Sampled actions, shape [batch, action_dim].
        """
        actions = global_session.run(self.sampled_actions,
                                     feed_dict={self.obs: obs})
        return actions

    def rollout_data(self, obs, actions, global_session):
        """Evaluate per-step quantities needed for a training rollout.

        Args:
            obs: batch of observations, shape [batch, ob_dim].
            actions: batch of actions taken, shape [batch, action_dim].
            global_session: tf.Session used to evaluate the ops.

        Returns:
            Tuple (action_log_probs, values, entropies) for the batch.
        """
        action_log_probs, values, entropies = global_session.run(
            [self.action_log_probs, self.values, self.entropies],
            feed_dict={
                self.obs: obs,
                self.actions: actions
            })
        return action_log_probs, values, entropies
Example #3
class CNNPolicy_with_var(nn.Module):
    """Atari-style CNN actor-critic whose critic outputs a mean AND a log-variance.

    The shared trunk is three conv layers plus a 512-unit linear layer
    (input images are assumed to be uint8-scaled, divided by 255 — confirm
    against the caller). The critic head produces (value_mean, value_logvar);
    the actor head feeds a project-defined Categorical or DiagGaussian
    distribution depending on the action space.
    """

    def __init__(self, num_inputs, action_space):
        """Build the network.

        Args:
            num_inputs: number of input image channels.
            action_space: gym-style space; "Discrete" -> Categorical head,
                "Box" -> DiagGaussian head, anything else raises.

        Raises:
            NotImplementedError: for unsupported action-space types.
        """
        super(CNNPolicy_with_var, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)

        self.linear1 = nn.Linear(32 * 7 * 7, 512)

        self.critic_linear1 = nn.Linear(512, 200)
        self.critic_linear_mean = nn.Linear(200, 1)
        self.critic_linear_logvar = nn.Linear(200, 1)

        self.actor_linear1 = nn.Linear(512, 200)

        if action_space.__class__.__name__ == "Discrete":
            num_outputs = action_space.n
            self.dist = Categorical(200, num_outputs)
        elif action_space.__class__.__name__ == "Box":
            num_outputs = action_space.shape[0]
            self.dist = DiagGaussian(200, num_outputs)
        else:
            raise NotImplementedError

        self.train()
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize all weights; scale ReLU layers by the ReLU gain."""
        self.apply(weights_init)

        relu_gain = nn.init.calculate_gain('relu')
        self.conv1.weight.data.mul_(relu_gain)
        self.conv2.weight.data.mul_(relu_gain)
        self.conv3.weight.data.mul_(relu_gain)
        self.linear1.weight.data.mul_(relu_gain)

        # Small initial action means for continuous control.
        if self.dist.__class__.__name__ == "DiagGaussian":
            self.dist.fc_mean.weight.data.mul_(0.01)

    def _features(self, inputs):
        """Shared conv trunk: images -> [B, 512] feature vectors.

        Extracted to remove the duplicated pipeline that previously
        appeared verbatim in both forward() and action_dist().
        """
        x = F.relu(self.conv1(inputs / 255.0))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(-1, 32 * 7 * 7)
        return F.relu(self.linear1(x))  # [B, 512]

    def forward(self, inputs):
        """Return (value_mean, value_logvar, actor_features).

        actor_features ([B, 200]) are the pre-distribution actor activations.
        """
        x = self._features(inputs)

        x_a = F.relu(self.actor_linear1(x))

        x_v = F.relu(self.critic_linear1(x))
        value_mean = self.critic_linear_mean(x_v)
        value_logvar = self.critic_linear_logvar(x_v)

        return value_mean, value_logvar, x_a

    def action_dist(self, inputs):
        """Return the action probabilities/distribution params for `inputs`."""
        x = self._features(inputs)
        x_a = F.relu(self.actor_linear1(x))
        return self.dist.action_probs(x_a)

    def act(self, inputs, deterministic=False):
        """Run the network and sample (or take the mode of) an action."""
        value_mean, value_logvar, x_a = self.forward(inputs)
        action = self.dist.sample(x_a, deterministic=deterministic)
        return value_mean, value_logvar, action

    def evaluate_actions(self, inputs, actions):
        """Return value stats plus log-probs and entropy for given actions."""
        value_mean, value_logvar, x_a = self.forward(inputs)
        action_log_probs, dist_entropy = self.dist.evaluate_actions(x_a, actions)
        return value_mean, value_logvar, action_log_probs, dist_entropy