def omniglot_conv_encoder(inputs, r_dim, is_training, nonlinearity=None, bn=True,
                          kernel_initializer=None, kernel_regularizer=None,
                          counters={}):
    name = get_name("omniglot_conv_encoder", counters)
    print("construct", name, "...")
    with tf.variable_scope(name):
        with arg_scope([conv2d, dense], nonlinearity=nonlinearity, bn=bn,
                       kernel_initializer=kernel_initializer,
                       kernel_regularizer=kernel_regularizer,
                       is_training=is_training):
            outputs = inputs
            # 3x3 convolutions, downsampling twice via the stride-2 layers
            outputs = conv2d(outputs, 64, 3, 1, "SAME")
            outputs = conv2d(outputs, 64, 3, 2, "SAME")
            outputs = conv2d(outputs, 128, 3, 1, "SAME")
            outputs = conv2d(outputs, 128, 3, 2, "SAME")
            # 4x4 VALID convolutions collapse the remaining spatial extent
            outputs = conv2d(outputs, 256, 4, 1, "VALID")
            outputs = conv2d(outputs, 256, 4, 1, "VALID")
            outputs = tf.reshape(outputs, [-1, 256])
            # final linear projection to the representation size (no activation, no BN)
            r = dense(outputs, r_dim, nonlinearity=None, bn=False)
            return r
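# Usage sketch (assumption, not from the original source): it relies on the
# project's conv2d/dense/arg_scope/get_name utilities already being imported,
# and on 28x28x1 Omniglot inputs, for which the two 4x4 VALID convolutions
# collapse the spatial dimensions to 1x1 before the reshape. `images_ph`,
# `training_ph`, and r_dim=256 are illustrative names/values, not from the repo.
import tensorflow as tf

images_ph = tf.placeholder(tf.float32, [None, 28, 28, 1])   # Omniglot batch
training_ph = tf.placeholder(tf.bool, [])                    # gates batch norm
r = omniglot_conv_encoder(images_ph, r_dim=256, is_training=training_ph,
                          nonlinearity=tf.nn.relu, bn=True)
# r: [batch_size, 256] representation, e.g. fed to an aggregator or latent head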
def a_net(s):
    # Gaussian policy head: state features -> Normal(mu, sigma)
    mu = tf.layers.dense(s, 1)[:, 0]
    sig = tf.nn.softplus(tf.layers.dense(s, 1))[:, 0]
    pi = tf.distributions.Normal(mu, sig)
    return pi
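# Usage sketch (illustrative, not part of the original source): the Normal
# distribution returned by a_net is typically sampled for the action, and its
# log-probability is weighted by an advantage estimate in a policy-gradient
# loss. `state_dim`, `s_ph`, and `adv_ph` are hypothetical names chosen here.
import tensorflow as tf

state_dim = 3                                          # assumed state size
s_ph = tf.placeholder(tf.float32, [None, state_dim])   # batch of states
adv_ph = tf.placeholder(tf.float32, [None])            # advantage estimates
pi = a_net(s_ph)
a = tf.squeeze(pi.sample(1), axis=0)                   # one action per state
pg_loss = -tf.reduce_mean(pi.log_prob(tf.stop_gradient(a)) * adv_ph)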
def _create_network(self, view_space, feature_space):
    # input placeholders
    input_view = tf.placeholder(tf.float32, (None,) + view_space)
    input_feature = tf.placeholder(tf.float32, (None,) + feature_space)
    input_act_prob = tf.placeholder(tf.float32, (None, self.num_actions))
    action = tf.placeholder(tf.int32, [None])
    reward = tf.placeholder(tf.float32, [None])

    hidden_size = [256]

    # fully connected encoders for the observation view and the feature vector
    flatten_view = tf.reshape(
        input_view, [-1, np.prod([v.value for v in input_view.shape[1:]])])
    h_view = tf.layers.dense(flatten_view, units=hidden_size[0],
                             activation=tf.nn.relu)
    h_emb = tf.layers.dense(input_feature, units=hidden_size[0],
                            activation=tf.nn.relu)

    concat_layer = tf.concat([h_view, h_emb], axis=1)
    dense = tf.layers.dense(concat_layer, units=hidden_size[0] * 2,
                            activation=tf.nn.relu)

    # policy head (logits sharpened by a 0.1 temperature before the softmax)
    policy = tf.layers.dense(dense / 0.1, units=self.num_actions,
                             activation=tf.nn.softmax)
    policy = tf.clip_by_value(policy, 1e-10, 1 - 1e-10)

    self.calc_action = tf.multinomial(tf.log(policy), 1)

    # value head, additionally conditioned on the mean action distribution
    emb_prob = tf.layers.dense(input_act_prob, units=64, activation=tf.nn.relu)
    dense_prob = tf.layers.dense(emb_prob, units=32, activation=tf.nn.relu)
    concat_layer = tf.concat([concat_layer, dense_prob], axis=1)
    dense = tf.layers.dense(concat_layer, units=hidden_size[0],
                            activation=tf.nn.relu)
    value = tf.layers.dense(dense, units=1)
    value = tf.reshape(value, (-1,))

    # actor-critic losses
    action_mask = tf.one_hot(action, self.num_actions)
    advantage = tf.stop_gradient(reward - value)
    log_policy = tf.log(policy + 1e-6)
    log_prob = tf.reduce_sum(log_policy * action_mask, axis=1)

    pg_loss = -tf.reduce_mean(advantage * log_prob)
    vf_loss = self.value_coef * tf.reduce_mean(tf.square(reward - value))
    neg_entropy = self.ent_coef * tf.reduce_mean(
        tf.reduce_sum(policy * log_policy, axis=1))
    total_loss = pg_loss + vf_loss + neg_entropy

    # train op (gradients clipped by global norm)
    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    gradients, variables = zip(*optimizer.compute_gradients(total_loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    train_op = optimizer.apply_gradients(zip(gradients, variables))

    self.input_view = input_view
    self.input_feature = input_feature
    self.input_act_prob = input_act_prob
    self.action = action
    self.reward = reward

    self.policy, self.value = policy, value
    self.train_op = train_op
    self.pg_loss, self.vf_loss, self.reg_loss = pg_loss, vf_loss, neg_entropy
    self.total_loss = total_loss
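# Training-step sketch (assumption, not taken from the original class): once
# _create_network has built the graph, one gradient update would look roughly
# like the function below. `agent`, `sess`, and the `batch` dict of numpy
# arrays are hypothetical names introduced here for illustration only.
def _train_step_sketch(agent, sess, batch):
    feed = {
        agent.input_view: batch["view"],          # (B,) + view_space
        agent.input_feature: batch["feature"],    # (B,) + feature_space
        agent.input_act_prob: batch["act_prob"],  # (B, num_actions) mean probs
        agent.action: batch["action"],            # (B,) int32 action ids
        agent.reward: batch["reward"],            # (B,) return / value targets
    }
    _, pg, vf, ent = sess.run(
        [agent.train_op, agent.pg_loss, agent.vf_loss, agent.reg_loss],
        feed_dict=feed)
    return pg, vf, ent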