Example #1
def omniglot_conv_encoder(inputs,
                          r_dim,
                          is_training,
                          nonlinearity=None,
                          bn=True,
                          kernel_initializer=None,
                          kernel_regularizer=None,
                          counters={}):
    name = get_name("omniglot_conv_encoder", counters)
    print("construct", name, "...")
    with tf.variable_scope(name):
        with arg_scope([conv2d, dense],
                       nonlinearity=nonlinearity,
                       bn=bn,
                       kernel_initializer=kernel_initializer,
                       kernel_regularizer=kernel_regularizer,
                       is_training=is_training):
            outputs = inputs
            outputs = conv2d(outputs, 64, 3, 1, "SAME")
            outputs = conv2d(outputs, 64, 3, 2, "SAME")
            outputs = conv2d(outputs, 128, 3, 1, "SAME")
            outputs = conv2d(outputs, 128, 3, 2, "SAME")
            outputs = conv2d(outputs, 256, 4, 1, "VALID")
            outputs = conv2d(outputs, 256, 4, 1, "VALID")
            outputs = tf.reshape(outputs, [-1, 256])
            r = dense(outputs, r_dim, nonlinearity=None, bn=False)
            return r
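Below, a hedged usage sketch (not from the source): it assumes the project's conv2d, dense, arg_scope and get_name helpers are in scope, and that Omniglot images are fed as 28x28x1 tensors, for which the two VALID 4x4 convolutions reduce the spatial dimensions to 1x1 and match the reshape to [-1, 256].

import tensorflow as tf

# Hypothetical call site; the image size, r_dim=64 and the initializer are assumptions.
images = tf.placeholder(tf.float32, [None, 28, 28, 1])
is_training = tf.placeholder(tf.bool, [])
r = omniglot_conv_encoder(images,
                          r_dim=64,
                          is_training=is_training,
                          nonlinearity=tf.nn.relu,
                          kernel_initializer=tf.glorot_uniform_initializer())
# r: a [batch_size, 64] representation vector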
Example #2
def a_net(s):
    mu = tf.layers.dense(s, 1)[:, 0]
    sig = tf.nn.softplus(tf.layers.dense(s, 1))[:, 0]
    pi = tf.distributions.Normal(mu, sig)
    return pi
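A minimal usage sketch for the Gaussian policy head above, assuming a 3-dimensional state placeholder (the dimension and the variable names are assumptions, not from the source):

import tensorflow as tf

# Hypothetical usage of a_net; everything below is illustrative.
s = tf.placeholder(tf.float32, [None, 3])
pi = a_net(s)
a = tf.squeeze(pi.sample(1), axis=0)   # one sampled scalar action per state, shape [batch]
log_prob = pi.log_prob(a)              # per-state log-density, usable in a policy-gradient loss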
Example #3
File: ac.py Project: jk-cim/mfrl
    def _create_network(self, view_space, feature_space):
        # input
        input_view = tf.placeholder(tf.float32, (None, ) + view_space)
        input_feature = tf.placeholder(tf.float32, (None, ) + feature_space)
        input_act_prob = tf.placeholder(tf.float32, (None, self.num_actions))
        action = tf.placeholder(tf.int32, [None])

        reward = tf.placeholder(tf.float32, [None])

        hidden_size = [256]

        # fully connected
        flatten_view = tf.reshape(
            input_view,
            [-1, np.prod([v.value for v in input_view.shape[1:]])])
        h_view = tf.layers.dense(flatten_view,
                                 units=hidden_size[0],
                                 activation=tf.nn.relu)

        h_emb = tf.layers.dense(input_feature,
                                units=hidden_size[0],
                                activation=tf.nn.relu)

        concat_layer = tf.concat([h_view, h_emb], axis=1)
        dense = tf.layers.dense(concat_layer,
                                units=hidden_size[0] * 2,
                                activation=tf.nn.relu)

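        # policy head: features are scaled by 1/0.1 before the softmax layer (a temperature-like
        # sharpening trick), and the probabilities are clipped for numerical stability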
        policy = tf.layers.dense(dense / 0.1,
                                 units=self.num_actions,
                                 activation=tf.nn.softmax)
        policy = tf.clip_by_value(policy, 1e-10, 1 - 1e-10)

        self.calc_action = tf.multinomial(tf.log(policy), 1)

        # for value obtain
        emb_prob = tf.layers.dense(input_act_prob, units=64, activation=tf.nn.relu)
        dense_prob = tf.layers.dense(emb_prob, units=32, activation=tf.nn.relu)
        concat_layer = tf.concat([concat_layer, dense_prob], axis=1)
        dense = tf.layers.dense(concat_layer,
                                units=hidden_size[0],
                                activation=tf.nn.relu)
        value = tf.layers.dense(dense, units=1)
        value = tf.reshape(value, (-1, ))

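        # actor-critic losses: the one-hot mask selects the log-prob of the taken action, and
        # stop_gradient keeps the critic from being updated through the policy-gradient term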
        action_mask = tf.one_hot(action, self.num_actions)
        advantage = tf.stop_gradient(reward - value)

        log_policy = tf.log(policy + 1e-6)
        log_prob = tf.reduce_sum(log_policy * action_mask, axis=1)

        pg_loss = -tf.reduce_mean(advantage * log_prob)
        vf_loss = self.value_coef * tf.reduce_mean(tf.square(reward - value))
        neg_entropy = self.ent_coef * tf.reduce_mean(
            tf.reduce_sum(policy * log_policy, axis=1))
        total_loss = pg_loss + vf_loss + neg_entropy

        # train op (clip gradient)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        gradients, variables = zip(*optimizer.compute_gradients(total_loss))
        gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        self.train_op = optimizer.apply_gradients(zip(gradients, variables))

        self.input_view = input_view
        self.input_feature = input_feature
        self.input_act_prob = input_act_prob
        self.action = action
        self.reward = reward

        self.policy, self.value = policy, value
        self.pg_loss, self.vf_loss, self.reg_loss = pg_loss, vf_loss, neg_entropy
        self.total_loss = total_loss
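For completeness, a hedged sketch of how one training step for this network might be driven from outside the class; sess, model and the batch arrays are illustrative names, not part of the jk-cim/mfrl project:

# Hypothetical training step; all names below are assumptions.
feed_dict = {
    model.input_view: batch_view,           # (batch,) + view_space
    model.input_feature: batch_feature,     # (batch,) + feature_space
    model.input_act_prob: batch_act_prob,   # mean action distribution, (batch, num_actions)
    model.action: batch_action,             # taken actions, (batch,)
    model.reward: batch_return,             # return targets, (batch,)
}
_, pg, vf = sess.run([model.train_op, model.pg_loss, model.vf_loss],
                     feed_dict=feed_dict)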