class GaussianEncoder:
    """Diagonal-Gaussian encoder: maps observations to a latent distribution.

    Builds two MLPs over the same input — one producing latent means, one
    producing latent log-variances — wraps them in a DiagGaussian, and exposes
    a sampling op for drawing latent codes.
    """

    def __init__(
            self,
            name,
            ob_dim,
            latent_dim,
            in_layer=None,
            out_activation=None,
            hidden_dims=None,
            hidden_activation=tf.nn.tanh,
            weight_init=tf.contrib.layers.xavier_initializer,
            bias_init=tf.zeros_initializer,
            reuse_scope=False,
    ):
        """Build the encoder graph under variable scope *name*.

        Args:
            name: variable-scope name for all encoder variables.
            ob_dim: observation dimensionality (placeholder width).
            latent_dim: dimensionality of the latent code.
            in_layer: optional existing tensor to encode; a fresh placeholder
                is created when None.
            out_activation: activation on the MLP output layers.
            hidden_dims: hidden layer sizes; defaults to [64, 64, 64].
            hidden_activation: activation for hidden layers.
            weight_init / bias_init: initializers forwarded to MLP.
            reuse_scope: passed to tf.variable_scope(reuse=...).
        """
        # Use a None sentinel instead of a mutable list default (shared
        # across calls); [64, 64, 64] was the original default.
        if hidden_dims is None:
            hidden_dims = [64, 64, 64]
        with tf.variable_scope(name, reuse=reuse_scope):
            # Either create a fresh input placeholder or chain onto the
            # caller-supplied layer.
            if in_layer is None:
                self.obs = tf.placeholder(tf.float32, shape=[None, ob_dim],
                                          name='obs')
            else:
                self.obs = in_layer
            # Mean head.
            self.mean_network = MLP('means', ob_dim, latent_dim,
                                    out_activation=out_activation,
                                    hidden_dims=hidden_dims,
                                    hidden_activation=hidden_activation,
                                    weight_init=weight_init,
                                    bias_init=bias_init,
                                    in_layer=self.obs)
            self.means = self.mean_network.layers['out']
            # Log-variance head (separate network, same input).
            self.log_var_network = MLP('log_vars', ob_dim, latent_dim,
                                       out_activation=out_activation,
                                       hidden_dims=hidden_dims,
                                       hidden_activation=hidden_activation,
                                       weight_init=weight_init,
                                       bias_init=bias_init,
                                       in_layer=self.obs)
            self.log_vars = self.log_var_network.layers['out']
            self.distribution = DiagGaussian(self.means, self.log_vars)
            self.zs = self.distribution.sample()

    def sample_encode(self, obs, global_session):
        """Sample latent codes for a batch of observations via the session."""
        zs = global_session.run(self.zs, feed_dict={self.obs: obs})
        return zs
class GaussianMLPPolicy:
    """Diagonal-Gaussian MLP policy with a value head, trained with PPO.

    Holds the policy mean network, a log-variance head, a value network, and
    the training op built by *optimizer* (ClipPPO by default).
    """

    def __init__(
            self,
            name,
            ob_dim,
            action_dim,
            var_network=False,  # NN if true, else hidden-free head
            out_activation=None,
            hidden_dims=None,
            hidden_activation=tf.nn.tanh,
            weight_init=tf.contrib.layers.xavier_initializer,
            bias_init=tf.zeros_initializer,
            optimizer=ClipPPO):
        """Build policy, value, and training graphs under scope *name*.

        Args:
            name: variable-scope name.
            ob_dim: observation dimensionality.
            action_dim: action dimensionality.
            var_network: if True the log-variances get their own full MLP;
                otherwise a hidden-free MLP head is used.
            hidden_dims: hidden layer sizes; defaults to [64, 64].
            optimizer: class constructed as optimizer(ob_dim, action_dim, self).
        """
        # None sentinel instead of a mutable list default; [64, 64] was the
        # original default.
        if hidden_dims is None:
            hidden_dims = [64, 64]
        with tf.variable_scope(name):
            self.obs = tf.placeholder(tf.float32, shape=[None, ob_dim],
                                      name='obs')

            # Policy mean network.
            self.mean_network = MLP('means', ob_dim, action_dim,
                                    out_activation=out_activation,
                                    hidden_dims=hidden_dims,
                                    hidden_activation=hidden_activation,
                                    weight_init=weight_init,
                                    bias_init=bias_init,
                                    in_layer=self.obs)
            self.means = self.mean_network.layers['out']

            # Log-variance head: full MLP when var_network, else hidden-free.
            # The two original branches differed only in hidden_dims, so they
            # are merged. NOTE(review): even with hidden_dims=[] the log-vars
            # are still computed from self.obs, which contradicts the original
            # "trainable params indep of obs" comment — confirm the intended
            # MLP semantics before relying on obs-independence.
            log_var_hidden = hidden_dims if var_network else []
            self.log_var_network = MLP('log_vars', ob_dim, action_dim,
                                       out_activation=out_activation,
                                       hidden_dims=log_var_hidden,
                                       hidden_activation=hidden_activation,
                                       weight_init=weight_init,
                                       bias_init=bias_init,
                                       in_layer=self.obs)
            self.log_vars = self.log_var_network.layers['out']

            self.distribution = DiagGaussian(self.means, self.log_vars)
            self.sampled_actions = self.distribution.sample()
            self.actions = tf.placeholder(tf.float32,
                                          shape=[None, action_dim],
                                          name='actions')
            self.action_log_probs = self.distribution.log_prob(self.actions)
            self.entropies = self.distribution.entropy()

            # Value network (scalar output).
            self.value_network = MLP('values', ob_dim, 1,
                                     out_activation=out_activation,
                                     hidden_dims=hidden_dims,
                                     hidden_activation=hidden_activation,
                                     weight_init=weight_init,
                                     bias_init=bias_init,
                                     in_layer=self.obs)
            self.values = self.value_network.layers['out']

            # Training, PPO for now.
            self.optimizer = optimizer(ob_dim, action_dim, self)

    def act(self, obs, global_session):
        """Sample actions for a batch of observations."""
        actions = global_session.run(self.sampled_actions,
                                     feed_dict={self.obs: obs})
        return actions

    def rollout_data(self, obs, actions, global_session):
        """Return (action_log_probs, values, entropies) for a rollout batch."""
        action_log_probs, values, entropies = global_session.run(
            [self.action_log_probs, self.values, self.entropies],
            feed_dict={
                self.obs: obs,
                self.actions: actions
            })
        return action_log_probs, values, entropies
class CNNPolicy_with_var(nn.Module):
    """Atari-style CNN actor-critic whose critic outputs a Gaussian value
    estimate (mean and log-variance) instead of a point value.

    The conv trunk expects inputs whose spatial size reduces to 7x7 after the
    three conv layers (e.g. 84x84 frames); pixel values are scaled by 1/255.
    """

    def __init__(self, num_inputs, action_space):
        super(CNNPolicy_with_var, self).__init__()
        # Shared convolutional trunk.
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)
        self.linear1 = nn.Linear(32 * 7 * 7, 512)
        # Critic head: distributional (mean, log-variance) value estimate.
        self.critic_linear1 = nn.Linear(512, 200)
        self.critic_linear_mean = nn.Linear(200, 1)
        self.critic_linear_logvar = nn.Linear(200, 1)
        # Actor head feeding the action distribution.
        self.actor_linear1 = nn.Linear(512, 200)

        # Pick the action distribution from the gym action-space type.
        if action_space.__class__.__name__ == "Discrete":
            num_outputs = action_space.n
            self.dist = Categorical(200, num_outputs)
        elif action_space.__class__.__name__ == "Box":
            num_outputs = action_space.shape[0]
            self.dist = DiagGaussian(200, num_outputs)
        else:
            raise NotImplementedError

        self.train()
        self.reset_parameters()

    def reset_parameters(self):
        """Apply weights_init, then scale trunk weights by the ReLU gain."""
        self.apply(weights_init)
        relu_gain = nn.init.calculate_gain('relu')
        self.conv1.weight.data.mul_(relu_gain)
        self.conv2.weight.data.mul_(relu_gain)
        self.conv3.weight.data.mul_(relu_gain)
        self.linear1.weight.data.mul_(relu_gain)
        if self.dist.__class__.__name__ == "DiagGaussian":
            # Shrink initial action means for continuous control.
            self.dist.fc_mean.weight.data.mul_(0.01)

    def _trunk(self, inputs):
        """Shared conv trunk + first FC layer; returns [B, 512] features.

        Scales raw pixel inputs from [0, 255] to [0, 1] before the convs.
        """
        x = F.relu(self.conv1(inputs / 255.0))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(-1, 32 * 7 * 7)
        return F.relu(self.linear1(x))

    def _actor_features(self, features):
        """Actor-head features [B, 200] from trunk features."""
        return F.relu(self.actor_linear1(features))

    def forward(self, inputs):
        """Return (value_mean, value_logvar, actor_features) for *inputs*."""
        x = self._trunk(inputs)
        x_a = self._actor_features(x)
        x_v = F.relu(self.critic_linear1(x))
        value_mean = self.critic_linear_mean(x_v)
        value_logvar = self.critic_linear_logvar(x_v)
        return value_mean, value_logvar, x_a

    def action_dist(self, inputs):
        """Action probabilities for *inputs*.

        Previously duplicated the whole trunk pipeline inline; now reuses the
        shared helpers so the two forward paths cannot drift apart.
        """
        x_a = self._actor_features(self._trunk(inputs))
        return self.dist.action_probs(x_a)

    def act(self, inputs, deterministic=False):
        """Sample an action (argmax when deterministic); returns
        (value_mean, value_logvar, action)."""
        value_mean, value_logvar, x_a = self.forward(inputs)
        action = self.dist.sample(x_a, deterministic=deterministic)
        return value_mean, value_logvar, action

    def evaluate_actions(self, inputs, actions):
        """Return (value_mean, value_logvar, action_log_probs, dist_entropy)
        for given observation/action pairs."""
        value_mean, value_logvar, x_a = self.forward(inputs)
        action_log_probs, dist_entropy = self.dist.evaluate_actions(x_a, actions)
        return value_mean, value_logvar, action_log_probs, dist_entropy