def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0,
                 trainable_variance=True, init_logstd=0., clip=None,
                 trainable_bias=True):
    # trainable_variance, init_logstd and clip are accepted only for interface
    # parity with the diagonal-Gaussian pdtype; they are unused for categorical actions.
    pdparam = fc(latent_vector, 'pi', self.size, init_scale=init_scale,
                 init_bias=init_bias, trainable_bias=trainable_bias)
    return self.pdfromflat(pdparam), pdparam
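# Hedged sketch (assumes baselines.common.distributions.make_pdtype and a TF 1.x
# graph): how the categorical variant above is typically reached. The pdtype is
# built from a discrete action space and the second return value is the raw
# logits tensor produced by the 'pi' fully connected layer.
import gym
import tensorflow as tf
from baselines.common.distributions import make_pdtype

action_space = gym.spaces.Discrete(6)                    # illustrative discrete action set
pdtype = make_pdtype(action_space)                        # -> CategoricalPdType
latent = tf.placeholder(tf.float32, shape=[None, 128])    # latent features from the policy network
pd, logits = pdtype.pdfromlatent(latent, init_scale=0.01)
sampled_action = pd.sample()                              # stochastic action draw
greedy_action = pd.mode()                                 # argmax action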
def network_fn(X):
    h = tf.cast(X, tf.float32) / 255.
    activ = tf.nn.relu
    h = activ(conv(h, 'c1', nf=8, rf=8, stride=4, init_scale=np.sqrt(2), **conv_kwargs))
    h = activ(conv(h, 'c2', nf=16, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
    h = conv_to_fc(h)
    h = activ(fc(h, 'fc1', nh=128, init_scale=np.sqrt(2)))
    return h, None
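# Hedged usage sketch (assumes TF 1.x graph mode, Atari-style 84x84x4 uint8
# frames, and an empty conv_kwargs in the enclosing scope): the feed-forward
# network above maps raw pixels to a 128-dimensional latent; the second return
# value is None because there is no recurrent state.
import tensorflow as tf

obs_ph = tf.placeholder(tf.uint8, shape=(None, 84, 84, 4))   # stacked frames (assumed shape)
latent, recurrent_state = network_fn(obs_ph)
assert recurrent_state is None                               # purely feed-forward
print(latent.get_shape())                                    # (?, 128)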
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0,
                 trainable_variance=True, init_logstd=0., trainable_bias=True,
                 clip=None):
    mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale,
              init_bias=init_bias, trainable_bias=trainable_bias)
    if clip is not None:
        # Optionally bound the mean of the Gaussian policy.
        mean = tf.clip_by_value(mean, clip[0], clip[1])
    # State-independent log standard deviation.
    logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size],
                             initializer=tf.constant_initializer(value=init_logstd),
                             trainable=trainable_variance)
    # Broadcast logstd to the batch shape of mean and pack as [mean | logstd].
    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    return self.pdfromflat(pdparam), mean
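# Hedged sketch (assumes baselines.common.distributions.DiagGaussianPd) of why the
# flat parameter is packed as [mean | logstd]: the distribution splits it back in
# half along the last axis, so the state-independent logstd must first be
# broadcast to the batch shape of mean.
import tensorflow as tf
from baselines.common.distributions import DiagGaussianPd

action_dim = 3                                                    # illustrative action dimension
flat = tf.placeholder(tf.float32, shape=[None, 2 * action_dim])   # [mean | logstd]
pd = DiagGaussianPd(flat)
sample = pd.sample()            # mean + std * N(0, I)
neglogp = pd.neglogp(sample)    # per-sample negative log-likelihood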
def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None,
             trainable_variance=True, init_logstd=0, clip=None, **tensors):
    """
    Parameters:
    ----------
    env                 RL environment

    observations        tensorflow placeholder in which the observations will be fed

    latent              latent state from which policy distribution parameters should be inferred

    estimate_q          if True, also estimate a Q function (discrete action spaces only)

    vf_latent           latent state from which value function should be inferred (if None, then latent is used)

    sess                tensorflow session to run calculations in (if None, default session is used)

    trainable_variance  whether the log-std of a Gaussian policy is trainable

    init_logstd         initial value of the log-std of a Gaussian policy

    clip                optional (low, high) bounds applied to the Gaussian policy mean

    **tensors           tensorflow tensors for additional attributes such as state or mask
    """
    self.X = observations
    self.state = tf.constant([])
    self.initial_state = None
    self.__dict__.update(tensors)

    vf_latent = vf_latent if vf_latent is not None else latent
    vf_latent = tf.layers.flatten(vf_latent)
    latent = tf.layers.flatten(latent)

    self.pdtype = make_pdtype(env.action_space)
    self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01,
                                                trainable_variance=trainable_variance,
                                                init_logstd=init_logstd, clip=clip)

    self.stochastic = tf.placeholder(dtype=tf.bool, shape=())
    self.action = tf_util.switch(self.stochastic, self.pd.sample(), self.pd.mode())
    self.neglogp = self.pd.neglogp(self.action)
    self.sess = sess
    # Both attributes hold the softmaxed flat parameters (action probabilities);
    # `logits` keeps its name for backward compatibility.
    self.logits = tf.nn.softmax(self.pd.flatparam())
    self.prob = tf.nn.softmax(self.pd.flatparam())

    self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="pi/pi")
    if len(self.vars) == 0:
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="pi")
    self.set_from_flat = tf_util.SetFromFlat(self.vars)

    try:
        # Discrete action space: gradient of the mean log-probability of the selected actions.
        self.action_ph = tf.placeholder(tf.int64, [None], name='targets_placeholder')
        self.action_selected = action_selected = tf.one_hot(self.action_ph, env.action_space.n)
        out = tf.reduce_mean(tf.log(tf.reduce_sum(self.prob * action_selected, axis=1)))
        gradients = tf.gradients(out, self.vars)
    except AttributeError:
        # Continuous action space (no `n` attribute): use the distribution's log-likelihood directly.
        self.action_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + env.action_space.shape,
                                        name='targets_placeholder')
        gradients = tf.gradients(-self.pd.neglogp(self.action_ph), self.vars)

    if gradients[0] is not None:
        flat_grad = tf_util.GetFlat(gradients).op
        self.compute_gradients = tf_util.function(
            inputs=[self.X, self.action_ph],
            outputs=[flat_grad]
        )

    if estimate_q:
        assert isinstance(env.action_space, gym.spaces.Discrete)
        self.q = fc(vf_latent, 'q', env.action_space.n)
        self.vf = self.q
    else:
        self.vf = fc(vf_latent, 'vf', 1)
        self.vf = self.vf[:, 0]
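# Hedged usage sketch (assumes a constructed PolicyWithValue instance `policy` for a
# discrete-action environment, that gradients[0] was not None so compute_gradients
# exists, and an active default TF session): it returns the flattened gradient of
# the mean log-probability of the chosen actions w.r.t. the trainable "pi" variables.
import numpy as np

obs_batch = np.zeros((32, 84, 84, 4), dtype=np.uint8)   # illustrative observation batch
act_batch = np.zeros((32,), dtype=np.int64)             # chosen discrete actions
flat_grad = policy.compute_gradients(obs_batch, act_batch)[0]
print(flat_grad.shape)                                   # one flat vector over all policy variables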