def pdfromlatent(self,
                 latent_vector,
                 init_scale=1.0,
                 init_bias=0.0,
                 trainable_variance=True,
                 init_logstd=0.,
                 clip=None,
                 trainable_bias=True):
    # Categorical variant: trainable_variance, init_logstd and clip are accepted only to
    # keep the signature uniform with the diagonal-Gaussian variant below; they are unused here.
    pdparam = fc(latent_vector,
                 'pi',
                 self.size,
                 init_scale=init_scale,
                 init_bias=init_bias,
                 trainable_bias=trainable_bias)
    return self.pdfromflat(pdparam), pdparam
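
This appears to mirror the categorical pdfromlatent in baselines' common/distributions.py: the latent features pass through a single fully connected 'pi' layer whose output is the flat (logit) parameter vector. A minimal, hypothetical usage sketch, assuming baselines' make_pdtype and a Discrete Gym action space (names and shapes are illustrative only):

# Hypothetical usage sketch (TF1 graph mode); make_pdtype is from baselines.common.distributions.
import gym
import tensorflow as tf
from baselines.common.distributions import make_pdtype

action_space = gym.spaces.Discrete(6)
latent = tf.placeholder(tf.float32, [None, 128])           # features from the policy network
pdtype = make_pdtype(action_space)
pd, pdparam = pdtype.pdfromlatent(latent, init_scale=0.01)
sampled_action = pd.sample()                                # one action per batch element
neglogp = pd.neglogp(sampled_action)                        # -log pi(a|s), used in policy-gradient losses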
    def network_fn(X):
        # Scale uint8 image observations to [0, 1].
        h = tf.cast(X, tf.float32) / 255.

        activ = tf.nn.relu
        # Two strided convolutions followed by a 128-unit fully connected layer.
        h = activ(
            conv(h,
                 'c1',
                 nf=8,
                 rf=8,
                 stride=4,
                 init_scale=np.sqrt(2),
                 **conv_kwargs))
        h = activ(
            conv(h,
                 'c2',
                 nf=16,
                 rf=4,
                 stride=2,
                 init_scale=np.sqrt(2),
                 **conv_kwargs))
        h = conv_to_fc(h)
        h = activ(fc(h, 'fc1', nh=128, init_scale=np.sqrt(2)))
        # No recurrent state for this feed-forward network.
        return h, None
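
network_fn is presumably the inner closure of a small CNN builder in the style of baselines' cnn_small, with conv, conv_to_fc and fc coming from baselines.a2c.utils; the None it returns is the (absent) recurrent state. A hedged usage sketch with an Atari-style observation placeholder (shapes are assumptions):

# Hypothetical usage (TF1 graph mode); an 84x84x4 stacked-frame observation is assumed.
X = tf.placeholder(tf.uint8, [None, 84, 84, 4], name='obs')
latent, recurrent_state = network_fn(X)   # latent: [batch, 128] features, recurrent_state: None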
def pdfromlatent(self,
                 latent_vector,
                 init_scale=1.0,
                 init_bias=0.0,
                 trainable_variance=True,
                 init_logstd=0.,
                 trainable_bias=True,
                 clip=None):
    # Mean of the diagonal Gaussian comes from a fully connected layer on the latent features.
    mean = fc(latent_vector,
              'pi',
              self.size,
              init_scale=init_scale,
              init_bias=init_bias,
              trainable_bias=trainable_bias)
    if clip is not None:
        mean = tf.clip_by_value(mean, clip[0], clip[1])
    # One shared log-std row, optionally frozen via trainable_variance.
    logstd = tf.get_variable(
        name='pi/logstd',
        shape=[1, self.size],
        initializer=tf.constant_initializer(value=init_logstd),
        trainable=trainable_variance)
    # Broadcast logstd across the batch and pack [mean, logstd] into the flat parameters.
    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    return self.pdfromflat(pdparam), mean
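
The `mean * 0.0 + logstd` term is a broadcasting trick: it tiles the single [1, size] log-std row across the batch so it can be concatenated with the per-sample mean, and pdfromflat later splits the resulting [batch, 2*size] tensor back into mean and log-std. A small NumPy sketch of the same broadcast (shapes are illustrative):

import numpy as np

mean = np.zeros((4, 3), dtype=np.float32)          # batch of 4, action dimension 3
logstd = np.full((1, 3), -0.5, dtype=np.float32)   # one shared log-std row
pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)
print(pdparam.shape)                               # (4, 6): first 3 columns mean, last 3 log-std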
Example #4
    def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None,
                 sess=None, trainable_variance=True, init_logstd=0, clip=None, **tensors):
        """
        Parameters:
        ----------
        env                 RL environment

        observations        tensorflow placeholder in which the observations will be fed

        latent              latent state from which policy distribution parameters should be inferred

        estimate_q          if True, build a Q-value head over the discrete actions instead of a scalar value head

        vf_latent           latent state from which the value function should be inferred (if None, latent is used)

        sess                tensorflow session to run calculations in (if None, default session is used)

        trainable_variance  whether the Gaussian policy's log-std variable is trainable (forwarded to pdfromlatent)

        init_logstd         initial value of the Gaussian policy's log-std (forwarded to pdfromlatent)

        clip                optional (low, high) pair used to clip the policy mean (forwarded to pdfromlatent)

        **tensors           tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent

        vf_latent = tf.layers.flatten(vf_latent)
        latent = tf.layers.flatten(latent)

        self.pdtype = make_pdtype(env.action_space)

        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01,
                                                    trainable_variance=trainable_variance,
                                                    init_logstd=init_logstd,
                                                    clip=clip)

        self.stochastic = tf.placeholder(dtype=tf.bool, shape=())
        self.action = tf_util.switch(self.stochastic, self.pd.sample(), self.pd.mode())
        self.neglogp = self.pd.neglogp(self.action)
        # Note: despite the name, `logits` holds the same softmax probabilities as `prob`.
        self.logits = tf.nn.softmax(self.pd.flatparam())
        self.sess = sess
        self.prob = tf.nn.softmax(self.pd.flatparam())
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="pi/pi")
        if len(self.vars) == 0:
            self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="pi")
        self.set_from_flat = tf_util.SetFromFlat(self.vars)
        try:
            # Discrete action space: score-function gradient of the mean log-probability
            # of the selected actions.
            self.action_ph = tf.placeholder(tf.int64, [None], name='targets_placeholder')
            self.action_selected = action_selected = tf.one_hot(self.action_ph, env.action_space.n)
            #out = tf.reduce_sum(tf.reduce_sum(tf.log(self.logits+1e-5)*action_selected, axis=1))
            out = tf.reduce_mean(tf.log(tf.reduce_sum(self.prob * action_selected, axis=1)))
            gradients = tf.gradients(out, self.vars)
        except AttributeError:
            # Continuous (Box) action space: action_space has no `.n`, so use the
            # distribution's log-probability of the fed actions instead.
            self.action_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + env.action_space.shape,
                                            name='targets_placeholder')
            gradients = tf.gradients(-self.pd.neglogp(self.action_ph), self.vars)
        if gradients[0] is not None:
            flat_grad = tf_util.GetFlat(gradients).op
            self.compute_gradients = tf_util.function(
                inputs=[self.X, self.action_ph],
                outputs=[flat_grad]
            )
        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.vf = self.q
        else:
            self.vf = fc(vf_latent, 'vf', 1)
            self.vf = self.vf[:, 0]
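
Once constructed, the policy exposes compute_gradients (only when the gradient exists) as a tf_util.function mapping a batch of observations and taken actions to the flattened gradient over the 'pi' variables. A hedged usage sketch, assuming the graph above has been built in the current default graph and that obs_batch / act_batch hold rollout data of matching shapes:

# Hypothetical usage; obs_batch and act_batch are rollout arrays whose shapes match
# `observations` and `action_ph`. tf_util.function runs against the default session.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    flat_grad = policy.compute_gradients(obs_batch, act_batch)[0]   # flattened d(log pi)/d(theta)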