示例#1
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_stack,
                 reuse=False,
                 n_lstm=256):
        """
        Build the recurrent ACER policy graph: CNN features -> LSTM -> policy and Q-value heads.

        The placeholders read here (``self.obs_ph``, ``self.masks_ph``, ``self.states_ph``) and
        ``self.n_act`` are presumably created by the parent constructor -- confirm against the base class.

        :param sess: forwarded unchanged to the parent constructor
        :param ob_space: forwarded unchanged to the parent constructor
        :param ac_space: forwarded unchanged to the parent constructor
        :param n_env: (int) number of environments; used to split the batch into sequences
            and as the first dimension of ``initial_state``
        :param n_steps: (int) number of steps; used to split the batch into sequences
        :param n_stack: forwarded unchanged to the parent constructor
        :param reuse: (bool) passed as ``reuse`` to the "model" variable scope
        :param n_lstm: (int) hidden size handed to the LSTM layer; ``initial_state`` is ``2 * n_lstm`` wide
        """
        super(AcerLstmPolicy, self).__init__(
            sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse, n_lstm)

        with tf.variable_scope("model", reuse=reuse):
            cnn_features = nature_cnn(self.obs_ph)

            # Recurrent core: reshape the flat batch into sequences, run the LSTM,
            # then flatten the outputs back into a batch.
            feature_seq = batch_to_seq(cnn_features, n_env, n_steps)
            mask_seq = batch_to_seq(self.masks_ph, n_env, n_steps)
            lstm_out, self.snew = lstm(
                feature_seq, mask_seq, self.states_ph, 'lstm1', n_hidden=n_lstm)
            lstm_out = seq_to_batch(lstm_out)

            # Two linear heads on top of the LSTM output.
            logits = linear(lstm_out, 'pi', self.n_act, init_scale=0.01)
            self.policy = tf.nn.softmax(logits)  # softmax output, not the raw logits
            self.q_value = linear(lstm_out, 'q', self.n_act)

        # The action is sampled from the raw logits rather than from the softmax output.
        self.action = sample(logits)
        lstm_state_shape = (n_env, n_lstm * 2)
        self.initial_state = np.zeros(lstm_state_shape, dtype=np.float32)
示例#2
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_batch,
                 n_steps,
                 n_lstm=256,
                 reuse=False,
                 layer_norm=False,
                 **kwargs):
        """
        Build the recurrent actor-critic graph: CNN features -> LSTM -> value head and action distribution.

        ``self.obs_ph``, ``self.masks_ph``, ``self.states_ph``, ``self.n_env`` and ``self.pdtype`` are
        read here and are presumably set up by the parent constructor -- confirm against the base class.

        :param sess: forwarded unchanged to the parent constructor
        :param ob_space: forwarded unchanged to the parent constructor
        :param ac_space: forwarded unchanged to the parent constructor
        :param n_batch: forwarded unchanged to the parent constructor
        :param n_steps: (int) number of steps; used to split the batch into sequences
        :param n_lstm: (int) hidden size handed to the LSTM layer; ``initial_state`` is ``2 * n_lstm`` wide
        :param reuse: (bool) passed as ``reuse`` to the "model" variable scope
        :param layer_norm: (bool) forwarded as ``layer_norm`` to the LSTM layer
        :param kwargs: (dict) extra keyword arguments forwarded to ``nature_cnn``
        """
        super(LstmPolicy, self).__init__(
            sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse)

        with tf.variable_scope("model", reuse=reuse):
            cnn_features = nature_cnn(self.obs_ph, **kwargs)

            # Reshape the flat batch into sequences, run the LSTM, flatten back.
            feature_seq = batch_to_seq(cnn_features, self.n_env, n_steps)
            mask_seq = batch_to_seq(self.masks_ph, self.n_env, n_steps)
            lstm_out, self.snew = lstm(
                feature_seq,
                mask_seq,
                self.states_ph,
                'lstm1',
                n_hidden=n_lstm,
                layer_norm=layer_norm)
            lstm_out = seq_to_batch(lstm_out)

            # Value head first, then the action distribution, both on the LSTM output.
            value_out = linear(lstm_out, 'v', 1)
            distribution, policy_out = self.pdtype.proba_distribution_from_latent(lstm_out)
            self.proba_distribution = distribution
            self.policy = policy_out

        # Column 0 of the single-output value head.
        self._value = value_out[:, 0]
        self.action = self.proba_distribution.sample()
        self.neglogp = self.proba_distribution.neglogp(self.action)
        lstm_state_shape = (self.n_env, n_lstm * 2)
        self.initial_state = np.zeros(lstm_state_shape, dtype=np.float32)
        self.value_fn = value_out
    def proba_distribution_from_latent(self, latent_vector, init_scale=1.0, init_bias=0.0):
        """
        Build the probability distribution from latent values.

        A linear layer scoped 'pi' with ``self.n_cat`` outputs maps the latent values to the
        flat distribution parameters, which are then handed to ``proba_distribution_from_flat``.

        :param latent_vector: ([float]) the latent values
        :param init_scale: (float) the initial scale, forwarded to the linear layer
        :param init_bias: (float) the initial bias, forwarded to the linear layer
        :return: (tuple) the distribution built from the flat parameters, and the flat
            parameters themselves (the output of the 'pi' layer)
        """
        flat_params = linear(
            latent_vector,
            'pi',
            self.n_cat,
            init_scale=init_scale,
            init_bias=init_bias)
        distribution = self.proba_distribution_from_flat(flat_params)
        return distribution, flat_params
    def proba_distribution_from_latent(self, latent_vector, init_scale=1.0, init_bias=0.0):
        """
        Build the probability distribution from latent values.

        The mean comes from a linear layer scoped 'pi' with ``self.size`` outputs. The log standard
        deviation is a separate variable named 'logstd' of shape ``[1, self.size]``, initialised to
        zeros, that does not depend on the latent values.

        :param latent_vector: ([float]) the latent values
        :param init_scale: (float) the initial scale, forwarded to the linear layer
        :param init_bias: (float) the initial bias, forwarded to the linear layer
        :return: (tuple) the distribution built from the concatenated ``[mean, logstd]`` parameters,
            and the mean tensor
        """
        action_mean = linear(
            latent_vector,
            'pi',
            self.size,
            init_scale=init_scale,
            init_bias=init_bias)
        log_std = tf.get_variable(
            name='logstd',
            shape=[1, self.size],
            initializer=tf.zeros_initializer())
        # Adding logstd to a zeroed copy of the mean presumably broadcasts the
        # [1, size] variable to the shape of the mean so both halves can be concatenated.
        tiled_log_std = action_mean * 0.0 + log_std
        flat_params = tf.concat([action_mean, tiled_log_std], axis=1)
        distribution = self.proba_distribution_from_flat(flat_params)
        return distribution, action_mean
示例#5
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_stack,
                 reuse=False):
        """
        Build the feed-forward ACER policy graph: CNN features -> policy and Q-value heads.

        ``self.obs_ph`` and ``self.n_act`` are read here and are presumably set up by the
        parent constructor -- confirm against the base class.

        :param sess: forwarded unchanged to the parent constructor
        :param ob_space: forwarded unchanged to the parent constructor
        :param ac_space: forwarded unchanged to the parent constructor
        :param n_env: forwarded unchanged to the parent constructor
        :param n_steps: forwarded unchanged to the parent constructor
        :param n_stack: forwarded unchanged to the parent constructor
        :param reuse: (bool) passed as ``reuse`` to the "model" variable scope
        """
        super(AcerCnnPolicy, self).__init__(
            sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse)

        with tf.variable_scope("model", reuse=reuse):
            cnn_features = nature_cnn(self.obs_ph)

            # Two linear heads that share the CNN features.
            logits = linear(cnn_features, 'pi', self.n_act, init_scale=0.01)
            self.policy = tf.nn.softmax(logits)  # softmax output, not the raw logits
            self.q_value = linear(cnn_features, 'q', self.n_act)

        # The action is sampled from the raw logits rather than from the softmax output.
        self.action = sample(logits)
        self.initial_state = []  # not stateful
示例#6
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_batch,
                 n_steps,
                 n_lstm=256,
                 reuse=False,
                 _type="cnn",
                 **kwargs):
        """
        Build a feed-forward actor-critic graph with either a CNN or an MLP feature extractor.

        ``self.processed_x`` and ``self.pdtype`` are read here and are presumably set up by the
        parent constructor -- confirm against the base class.

        :param sess: forwarded unchanged to the parent constructor
        :param ob_space: forwarded unchanged to the parent constructor
        :param ac_space: forwarded unchanged to the parent constructor
        :param n_batch: forwarded unchanged to the parent constructor
        :param n_steps: forwarded unchanged to the parent constructor
        :param n_lstm: forwarded unchanged to the parent constructor (not used otherwise here)
        :param reuse: (bool) passed as ``reuse`` to the "model" variable scope
        :param _type: (str) "cnn" selects the ``nature_cnn`` extractor; any other value selects
            the branch with separate two-layer tanh towers for policy and value
        :param kwargs: (dict) extra keyword arguments forwarded to ``nature_cnn`` (CNN branch only)
        """
        super(FeedForwardPolicy, self).__init__(
            sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse)

        def tanh_tower(inputs, first_scope, second_scope):
            # Two stacked 64-unit linear layers, each followed by tanh.
            hidden = tf.tanh(
                linear(inputs, first_scope, n_hidden=64, init_scale=np.sqrt(2)))
            return tf.tanh(
                linear(hidden, second_scope, n_hidden=64, init_scale=np.sqrt(2)))

        with tf.variable_scope("model", reuse=reuse):
            if _type != "cnn":
                # Separate towers: the policy and the value estimate do not share hidden layers.
                flat_obs = tf.layers.flatten(self.processed_x)
                policy_latent = tanh_tower(flat_obs, 'pi_fc1', 'pi_fc2')
                value_latent = tanh_tower(flat_obs, 'vf_fc1', 'vf_fc2')
                value_out = linear(value_latent, 'vf', 1)[:, 0]
            else:
                # Shared CNN features feed both the value head and the policy.
                policy_latent = nature_cnn(self.processed_x, **kwargs)
                value_out = linear(policy_latent, 'v', 1)[:, 0]

            distribution, policy_out = self.pdtype.proba_distribution_from_latent(
                policy_latent, init_scale=0.01)
            self.proba_distribution = distribution
            self.policy = policy_out

        self.action = self.proba_distribution.sample()
        self.neglogp = self.proba_distribution.neglogp(self.action)
        self.initial_state = None
        self.value_fn = value_out
示例#7
0
def nature_cnn(unscaled_images, **kwargs):
    """
    CNN from Nature paper.

    The input is cast to float32 and divided by 255, passed through three ReLU-activated
    convolutional layers ('c1', 'c2', 'c3'), flattened with ``conv_to_fc`` and finally fed
    to a ReLU-activated 512-unit linear layer ('fc1').

    :param unscaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
    :return: (TensorFlow Tensor) The CNN output layer
    """
    relu = tf.nn.relu
    # One row per convolutional layer: (scope, n_filters, filter_size, stride).
    conv_specs = (
        ('c1', 32, 8, 4),
        ('c2', 64, 4, 2),
        ('c3', 64, 3, 1),
    )

    hidden = tf.cast(unscaled_images, tf.float32) / 255.
    for scope, n_filters, filter_size, stride in conv_specs:
        hidden = relu(
            conv(hidden,
                 scope,
                 n_filters=n_filters,
                 filter_size=filter_size,
                 stride=stride,
                 init_scale=np.sqrt(2),
                 **kwargs))

    flattened = conv_to_fc(hidden)
    dense = linear(flattened, 'fc1', n_hidden=512, init_scale=np.sqrt(2))
    return relu(dense)