Example #1
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 scope="policy",
                 nlstm=256):
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features. It might slow down the training."
            )
        self.layernormalize = layernormalize
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(ac_space)
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(None, None) + ob_space.shape,
                                        name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder([None, None],
                                                           name='ac')
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            pdparamsize = self.ac_pdtype.param_shape()[0]

            sh = tf.shape(self.ph_ob)
            x = flatten_two_dims(self.ph_ob)
            self.flat_features = self.get_features(x, reuse=False)
            self.features = unflatten_first_dim(self.flat_features, sh)

            with tf.variable_scope(scope, reuse=False):
                # h, self.dropout_assign_ops = choose_cnn(processed_x)
                # xs = batch_to_seq(h, nenv, nsteps)
                # ms = batch_to_seq(M, nenv, nsteps)
                # h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
                # h5 = seq_to_batch(h5)
                # vf = fc(h5, 'v', 1)[:,0]
                # self.pd, self.pi = self.pdtype.pdfromlatent(h5)
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
                vpred = fc(x,
                           name='value_function_output',
                           units=1,
                           activation=None)
            pdparam = unflatten_first_dim(pdparam, sh)
            self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
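Note: this example (and most of those below) relies on two helpers, flatten_two_dims and unflatten_first_dim, that are not shown on this page. A minimal sketch of what they presumably do, assuming the leading axes are (num_envs, num_steps) and the trailing dimensions are statically known:

import tensorflow as tf

def flatten_two_dims(x):
    # merge the leading (batch, time) axes: [B, T, ...] -> [B*T, ...]
    return tf.reshape(x, [-1] + x.get_shape().as_list()[2:])

def unflatten_first_dim(x, sh):
    # undo flatten_two_dims; sh is tf.shape() of the original [B, T, ...] tensor
    return tf.reshape(x, [sh[0], sh[1]] + x.get_shape().as_list()[1:])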
Example #2
    def __init__(self, ob_space, ac_space, hidsize,
                  feat_dim, layernormalize, nl, scope="policy"):
        if layernormalize:
            print("Warning: policy is operating on top of layer-normed features. It might slow down the training.")
        self.layernormalize = layernormalize
        self.nl = nl
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(ac_space)
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(None, None) + ob_space.shape, name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            pdparamsize = self.ac_pdtype.param_shape()[0]

            sh = tf.shape(self.ph_ob)
            x = flatten_two_dims(self.ph_ob)
            self.flat_features = self.get_features(x, reuse=False)
            self.features = unflatten_first_dim(self.flat_features, sh)

            with tf.variable_scope(scope, reuse=False):
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
                vpred = fc(x, name='value_function_output', units=1, activation=None)
            pdparam = unflatten_first_dim(pdparam, sh)
            self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
    def get_loss(self):
        nl = tf.nn.leaky_relu
        ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
        sh = tf.shape(ac)
        ac = flatten_two_dims(ac)
        ac_four_dim = tf.expand_dims(tf.expand_dims(ac, 1), 1)

        def add_ac(x):
            if x.get_shape().ndims == 2:
                return tf.concat([x, ac], axis=-1)
            elif x.get_shape().ndims == 4:
                sh = tf.shape(x)
                return tf.concat(
                    [
                        x,
                        ac_four_dim + tf.zeros(
                            [
                                sh[0], sh[1], sh[2],
                                ac_four_dim.get_shape()[3].value
                            ],
                            tf.float32,
                        ),
                    ],
                    axis=-1,
                )

        with tf.variable_scope(self.scope):
            x = flatten_two_dims(self.features)
            mu, log_sigma_squared = unet(x,
                                         nl=nl,
                                         feat_dim=self.feat_dim,
                                         cond=add_ac)
            mu = unflatten_first_dim(mu, sh)
            log_sigma_squared = unflatten_first_dim(log_sigma_squared, sh)
        prediction_pixels = mu * self.ob_std + self.ob_mean
        if self.ama == "true":
            mse = tf.square(mu - 2 * tf.stop_gradient(self.out_features))
            dynamics_reward = tf.reduce_mean((mse - tf.exp(log_sigma_squared)),
                                             axis=[2, 3, 4])
            if self.clip_ama == "true":
                dynamics_reward = tf.clip_by_value(dynamics_reward, 0, 1e6)
            loss = tf.reduce_mean(
                (tf.exp(-log_sigma_squared) *
                 (mse) + self.uncertainty_penalty * log_sigma_squared),
                axis=[2, 3, 4],
            )
        elif self.ama == "false":
            mse = tf.square(mu - tf.stop_gradient(self.out_features))
            dynamics_reward = tf.reduce_mean(mse, axis=[2, 3, 4])
            loss = dynamics_reward
        else:
            raise ValueError("Please specify whether to use AMA or not")
        return (
            loss,
            dynamics_reward,
            prediction_pixels,
            log_sigma_squared,
        )
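The ama == "true" branch above is the familiar heteroscedastic-regression objective: the network predicts a mean mu and a log-variance, the squared error is down-weighted wherever the predicted variance is large, and uncertainty_penalty scales the log-variance regulariser. A standalone NumPy restatement for reference (function and argument names are mine, not from the source):

import numpy as np

def heteroscedastic_loss(mu, target, log_sigma_sq, uncertainty_penalty=1.0):
    # per-element loss: exp(-log sigma^2) * (mu - target)^2 + penalty * log sigma^2
    mse = np.square(mu - target)
    return np.exp(-log_sigma_sq) * mse + uncertainty_penalty * log_sigma_sq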
Example #4
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 scope='policy'):
        self.layernormalize = layernormalize
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        self.hidsize = hidsize
        self.feat_dim = feat_dim
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(ac_space)
            self.placeholder_observation = tf.placeholder(dtype=tf.int32,
                                                          shape=(None, None) +
                                                          ob_space.shape,
                                                          name='observation')
            self.placeholder_action = self.ac_pdtype.sample_placeholder(
                [None, None], name='action')
            self.pd = self.vpred = None
            self.scope = scope
            pdparamsize = self.ac_pdtype.param_shape()[0]

            shape = tf.shape(self.placeholder_observation)
            x = flatten_two_dims(self.placeholder_observation)
            self.flat_features = self.get_features(x, reuse=False)
            self.features = unflatten_first_dim(self.flat_features, shape)

            with tf.variable_scope(scope, reuse=False):
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
                value_pred = fc(x,
                                name='value_func_output',
                                units=1,
                                activation=None)
            pdparam = unflatten_first_dim(pdparam, shape)
            self.vpred = unflatten_first_dim(value_pred, shape)[:, :, 0]
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
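The constructors in Examples #1, #2 and #4 only build the graph; evaluating the policy is left to the caller. A hedged sketch of how the resulting tensors would typically be fetched, using the attribute names from Example #4 and assuming policy is an instance of the class above (the session handling and the dummy batch are illustrative, not from the source):

import numpy as np
import tensorflow as tf

sess = tf.Session()
sess.run(tf.global_variables_initializer())
# dummy observation batch shaped (n_envs, n_steps) + ob_space.shape
obs = np.zeros((4, 1) + policy.ob_space.shape, dtype=np.int32)
actions, values, neglogps = sess.run(
    [policy.a_samp, policy.vpred, policy.nlp_samp],
    feed_dict={policy.placeholder_observation: obs})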
Example #5
 def decoder(self, z):  # z is the mean of the VAE posterior distribution, shape=(None,None,512)
     nl = tf.nn.leaky_relu
     z_has_timesteps = (z.get_shape().ndims == 3)
     if z_has_timesteps:
         sh = tf.shape(z)
         z = flatten_two_dims(z)  # (None,512)
     with tf.variable_scope(self.scope + "decoder"):
         # de-convolution network; with spherical_obs=True the output has z.shape=(None,84,84,4)
         z = small_deconvnet(z,
                             nl=nl,
                             ch=4 if self.spherical_obs else 8,
                             positional_bias=True)
         if z_has_timesteps:
             z = unflatten_first_dim(z, sh)
         if self.spherical_obs:  # spherical loss: scale is one shared constant across all dimensions, which simplifies the computation
             scale = tf.get_variable(name="scale",
                                     shape=(),
                                     dtype=tf.float32,
                                     initializer=tf.ones_initializer())
             scale = tf.maximum(scale, -4.)
             scale = tf.nn.softplus(scale)
             scale = scale * tf.ones_like(z)
         else:
             z, scale = tf.split(z, 2, -1)  # split the output into mu and scale
             scale = tf.nn.softplus(scale)
         # scale = tf.Print(scale, [scale])
         return tf.distributions.Normal(loc=z, scale=scale)
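The decoder returns a tf.distributions.Normal over normalized pixels; presumably the reconstruction term of the VAE loss is then just the negative log-likelihood of the target observation under that distribution. A hedged sketch (obs_dist, target and next_ob are names introduced here, not taken from the source):

obs_dist = self.decoder(z)                                    # Normal(loc, scale) over (None, 84, 84, 4)
target = (tf.to_float(next_ob) - self.ob_mean) / self.ob_std  # normalized target observation
recon_loss = -tf.reduce_mean(obs_dist.log_prob(target))       # negative log-likelihood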
Example #6
    def predict_next(self, reuse):
        if isinstance(self.ac_space, gym.spaces.Discrete):
            ac = tf.one_hot(self.ac, get_action_n(self.ac_space), axis=2)
        else:
            ac = self.ac
        sh = tf.shape(ac)
        ac = flatten_two_dims(ac)

        def add_ac(x):
            return tf.concat([x, ac], axis=-1)

        with tf.variable_scope(self.scope, reuse=reuse):
            x = flatten_two_dims(self.features)
            x = tf.layers.dense(add_ac(x),
                                self.hidsize,
                                activation=tf.nn.leaky_relu)

            def residual(x):
                res = tf.layers.dense(add_ac(x),
                                      self.hidsize,
                                      activation=tf.nn.leaky_relu)
                res = tf.layers.dense(add_ac(res),
                                      self.hidsize,
                                      activation=None)
                return x + res

            for _ in range(4):
                x = residual(x)
            n_out_features = self.out_features.get_shape()[-1].value
            x = tf.layers.dense(add_ac(x), n_out_features, activation=None)
            x = unflatten_first_dim(x, sh)
        return x
Example #7
    def predict_next(self, reuse):
        nl = tf.nn.leaky_relu
        if isinstance(self.ac_space, gym.spaces.Discrete):
            ac = tf.one_hot(self.ac, get_action_n(self.ac_space), axis=2)
        else:
            ac = self.ac
        sh = tf.shape(ac)
        ac = flatten_two_dims(ac)
        ac_four_dim = tf.expand_dims(tf.expand_dims(ac, 1), 1)

        def add_ac(x):
            if x.get_shape().ndims == 2:
                return tf.concat([x, ac], axis=-1)
            elif x.get_shape().ndims == 4:
                sh = tf.shape(x)
                return tf.concat([
                    x, ac_four_dim + tf.zeros([
                        sh[0], sh[1], sh[2],
                        ac_four_dim.get_shape()[3].value
                    ], tf.float32)
                ],
                                 axis=-1)

        with tf.variable_scope(self.scope, reuse=reuse):
            x = flatten_two_dims(self.features)
            x = unet(x, nl=nl, feat_dim=self.feat_dim, cond=add_ac)
            x = unflatten_first_dim(x, sh)
        self.prediction_pixels = x * self.ob_std + self.ob_mean
        # return tf.reduce_mean((x - tf.stop_gradient(self.out_features)) ** 2, [2, 3, 4])
        return x
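The commented-out return above shows how this prediction is typically turned into a per-timestep curiosity signal: the mean squared error against the target features, averaged over the spatial and channel axes. Spelled out (tensor names as in the snippet; the reuse flag is an assumption):

prediction = self.predict_next(reuse=tf.AUTO_REUSE)
# per-(env, timestep) error; axes [2, 3, 4] are the feature map's H, W and C
intrinsic_reward = tf.reduce_mean(
    tf.square(prediction - tf.stop_gradient(self.out_features)), axis=[2, 3, 4])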
Example #8
    def get_loss(self):
        ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
        sh = tf.shape(ac)
        ac = flatten_two_dims(ac)

        def add_ac(x):
            return tf.concat([x, ac], axis=-1)

        with tf.variable_scope(self.scope):
            x = flatten_two_dims(self.features)
            x = tf.layers.dense(add_ac(x),
                                self.hidsize,
                                activation=tf.nn.leaky_relu)

            def residual(x):
                res = tf.layers.dense(add_ac(x),
                                      self.hidsize,
                                      activation=tf.nn.leaky_relu)
                res = tf.layers.dense(add_ac(res),
                                      self.hidsize,
                                      activation=None)
                return x + res

            for _ in range(4):
                x = residual(x)
            n_out_features = self.out_features.get_shape()[-1].value
            x = tf.layers.dense(add_ac(x), n_out_features, activation=None)
            x = unflatten_first_dim(x, sh)
        return tf.reduce_mean((x - tf.stop_gradient(self.out_features))**2, -1)
    def get_loss(self):
        nl = tf.nn.leaky_relu
        ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
        sh = tf.shape(ac)
        ac = flatten_two_dims(ac)
        ac_four_dim = tf.expand_dims(tf.expand_dims(ac, 1), 1)

        def add_ac(x):
            if x.get_shape().ndims == 2:
                return tf.concat([x, ac], axis=-1)
            elif x.get_shape().ndims == 4:
                sh = tf.shape(x)
                return tf.concat([
                    x, ac_four_dim + tf.zeros([
                        sh[0], sh[1], sh[2],
                        ac_four_dim.get_shape()[3].value
                    ], tf.float32)
                ],
                                 axis=-1)

        with tf.variable_scope(self.scope):
            x = flatten_two_dims(self.features)
            x = unet(x, nl=nl, feat_dim=self.feat_dim, cond=add_ac)
            x = unflatten_first_dim(x, sh)
        self.prediction_pixels = x * self.ob_std + self.ob_mean
        return tf.reduce_mean((x - tf.stop_gradient(self.out_features))**2,
                              [2, 3, 4])
    def get_loss(self, ac):
        sh = tf.shape(ac)
        ac = flatten_two_dims(ac)

        def add_ac(x):
            return tf.concat([x, ac], axis=-1)

        with tf.variable_scope(self.scope):
            x = flatten_two_dims(self.features)
            x = tf.layers.dense(add_ac(x),
                                self.hidsize,
                                activation=tf.nn.leaky_relu,
                                reuse=tf.AUTO_REUSE)

            def residual(x):
                res = tf.layers.dense(add_ac(x),
                                      self.hidsize,
                                      activation=tf.nn.leaky_relu,
                                      reuse=tf.AUTO_REUSE)
                res = tf.layers.dense(add_ac(res),
                                      self.hidsize,
                                      activation=None,
                                      reuse=tf.AUTO_REUSE)
                return x + res

            for _ in range(4):
                x = residual(x)
            n_out_features = self.out_features.get_shape()[-1].value
            x = tf.layers.dense(add_ac(x),
                                n_out_features,
                                activation=None,
                                reuse=tf.AUTO_REUSE)
            x = unflatten_first_dim(x, sh)
            return x
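This variant takes the one-hot action as an argument, returns the predicted next features rather than a loss, and passes reuse=tf.AUTO_REUSE to every dense layer, presumably so the graph can be built more than once without variable-creation errors. The residual block it repeats four times can be restated on its own as follows (a sketch; the function name is mine):

import tensorflow as tf

def residual_block(x, ac, hidsize):
    # concatenate the flattened one-hot action, apply two dense layers,
    # and add the result back onto the input
    h = tf.layers.dense(tf.concat([x, ac], axis=-1), hidsize, activation=tf.nn.leaky_relu)
    h = tf.layers.dense(tf.concat([h, ac], axis=-1), hidsize, activation=None)
    return x + h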
 def decoder(self, z):
     nl = tf.nn.leaky_relu
     z_has_timesteps = (z.get_shape().ndims == 3)
     if z_has_timesteps:
         sh = tf.shape(z)
         z = flatten_two_dims(z)
     with tf.variable_scope(self.scope + "decoder"):
         z = small_deconvnet(z,
                             nl=nl,
                             ch=4 if self.spherical_obs else 8,
                             positional_bias=True)
         if z_has_timesteps:
             z = unflatten_first_dim(z, sh)
         if self.spherical_obs:
             scale = tf.get_variable(name="scale",
                                     shape=(),
                                     dtype=tf.float32,
                                     initializer=tf.ones_initializer())
             scale = tf.maximum(scale, -4.)
             scale = tf.nn.softplus(scale)
             scale = scale * tf.ones_like(z)
         else:
             z, scale = tf.split(z, 2, -1)
             scale = tf.nn.softplus(scale)
         # scale = tf.Print(scale, [scale])
         return tf.distributions.Normal(loc=z, scale=scale)
    def get_last_features(self, x, reuse):
        x_has_timesteps = (x.get_shape().ndims == 5)
        if x_has_timesteps:
            sh = tf.shape(x)
            x = flatten_two_dims(x)

        with tf.variable_scope(self.scope+"_features", reuse=reuse):
            x = (tf.to_float(x) - self.ob_mean) / self.ob_std
            x = small_convnet(x, nl=self.nl, feat_dim=self.feat_dim, last_nl=None, layernormalize=self.layernormalize)

            if x_has_timesteps:
                x = unflatten_first_dim(x, sh)
            x = tf.reshape(x, [-1, sh[1], self.feat_dim])
        with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
            init_1 = tf.contrib.rnn.LSTMStateTuple(self.last_c_in_1, self.last_h_in_1)
            if self.lstm2_size:
                init_2 = tf.contrib.rnn.LSTMStateTuple(self.last_c_in_2, self.last_h_in_2)
            if self.aux_input:
                prev_rews = tf.expand_dims(self.ph_last_rew, -1)
                x = tf.concat([x, prev_rews], -1)
            x, c_out_1, h_out_1 = lstm(self.lstm1_size)(x, initial_state=init_1)
            if self.lstm2_size:
                if self.aux_input:
                    prev_acs = tf.one_hot(self.ph_last_ac, depth=self.num_actions)
                    x = tf.concat([x, tf.cast(prev_acs, tf.float32)], -1)
                    x = tf.concat([x, self.ph_last_vel], -1)

                x, c_out_2, h_out_2  = lstm(self.lstm2_size)(x, initial_state=init_2)
        return x
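get_last_features calls an lstm(size) factory that is not shown and whose result is invoked as layer(x, initial_state=init). A plausible reading is a Keras-style recurrent layer that returns the full output sequence plus the final state tensors; the sketch below is an assumption, not the repo's actual helper:

import tensorflow as tf

def lstm(units):
    # returns (sequence_output, final_state_1, final_state_2) when called on a
    # [batch, time, features] tensor, matching the three-way unpacking above
    return tf.keras.layers.LSTM(units, return_sequences=True, return_state=True)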
Example #13
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 n_env,
                 n_steps,
                 reuse,
                 n_lstm=256,
                 scope="policy"):
        super(RnnPolicy, self).__init__(ob_space, ac_space, hidsize, ob_mean,
                                        ob_std, feat_dim, layernormalize, nl,
                                        n_env, n_steps, reuse, n_lstm, scope)

        with tf.variable_scope(scope, reuse=self.reuse):
            ## Use features
            x = self.flat_features

            input_sequence = batch_to_seq(x, self.n_env, self.n_steps)
            masks = batch_to_seq(self.masks_ph, self.n_env, self.n_steps)
            rnn_output, self.snew = lstm(input_sequence,
                                         masks,
                                         self.states_ph,
                                         'lstm1',
                                         n_hidden=n_lstm,
                                         layer_norm=False)
            rnn_output = seq_to_batch(rnn_output)
            layernorm(rnn_output)

            ## Concat
            q = self.flat_features
            q = tf.concat([q, rnn_output], axis=1)
            q = fc(q, units=hidsize, activation=activ, name="fc1")
            q = fc(q, units=hidsize, activation=activ, name="fc2")

            pdparam, vpred = self.get_pdparam(q)

        self.pdparam = pdparam = unflatten_first_dim(pdparam, self.sh)
        self.vpred = unflatten_first_dim(vpred, self.sh)[:, :, 0]
        self.pd = pd = self.ac_pdtype.proba_distribution_from_flat(pdparam)
        self.a_samp = pd.sample()
        self.entropy = pd.entropy()
        self.nlp_samp = pd.neglogp(self.a_samp)
    def set_dynamics(self, dynamics):
        self.dynamics = dynamics
        with tf.variable_scope(self.scope):
            shaped = tf.shape(self.ph_ob)
            flat = flatten_two_dims(self.ph_ob)
            features = self.dynamics.auxiliary_task.get_features(flat, reuse=tf.AUTO_REUSE)
            pdparam = self.get_pdparam(features, False)
            pdparam = unflatten_first_dim(pdparam, shaped)
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)

            '''Alternate ac for forward dynamics'''
            pdparam_alt = self.get_pdparam(self.extracted_features, True)
            pdparam_alt = unflatten_first_dim(pdparam_alt, shaped)
            self.a_samp_alt = self.ac_pdtype.pdfromflat(pdparam_alt).sample()
    def __init__(self, ob_space, ac_space, hidsize,
                 ob_mean, ob_std, feat_dim, layernormalize, nl, scope="policy"):
        if layernormalize:
            print("Warning: policy is operating on top of layer-normed features. It might slow down the training.")
        self.layernormalize = layernormalize
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std

        ''' Defining variables that'll be initialized with dynamics '''
        self.dynamics = None
        self.a_samp = None
        self.entropy = None
        self.nlp_samp = None
        self.a_samp_alt = None

        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(ac_space)
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(None, None) + ob_space.shape, name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            self.pdparamsize = self.ac_pdtype.param_shape()[0]

            sh = tf.shape(self.ph_ob)
            x = flatten_two_dims(self.ph_ob)
            self.flat_features = self.get_features(x, reuse=False)
            self.features = unflatten_first_dim(self.flat_features, sh)

            self.extracted_features = tf.placeholder(dtype=tf.float32,
                                                     shape=self.flat_features.shape)

            with tf.variable_scope(scope, reuse=False):
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                vpred = fc(x, name='value_function_output', units=1, activation=None)
                y = fc(vpred,  units=hidsize, activation=activ)
                y = fc(y, units=hidsize, activation=activ)
            self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
Example #16
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 n_env,
                 n_steps,
                 reuse,
                 n_lstm=256,
                 scope="policy"):
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features. It might slow down the training."
            )
        self.layernormalize = layernormalize
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        self.n_env = n_env
        self.n_steps = n_steps
        self.n_batch = n_env * n_steps
        self.n_lstm = n_lstm
        self.reuse = reuse
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            # self.ac_pdtype = make_pdtype(ac_space)
            self.ac_pdtype = make_proba_dist_type(ac_space)
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(self.n_env, self.n_steps) +
                                        ob_space.shape,
                                        name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder(
                [self.n_env, self.n_steps], name='ac')
            self.masks_ph = tf.placeholder(tf.float32,
                                           [self.n_env, self.n_steps],
                                           name="masks_ph")  # mask (done t-1)
            self.flat_masks_ph = tf.reshape(self.masks_ph,
                                            [self.n_env * self.n_steps])
            self.states_ph = tf.placeholder(tf.float32,
                                            [self.n_env, n_lstm * 2],
                                            name="states_ph")  # states
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            self.pdparamsize = self.ac_pdtype.param_shape()[0]

            self.sh = tf.shape(self.ph_ob)
            x = flatten_two_dims(self.ph_ob)
            self.flat_features = self.get_features(x, reuse=self.reuse)
            self.features = unflatten_first_dim(self.flat_features, self.sh)
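Compared with the earlier policies, this recurrent variant adds masks_ph (episode-done flags from the previous step) and states_ph (the flattened LSTM cell and hidden state). A hedged sketch of how they would typically be fed at the start of a rollout (policy and obs_batch are assumed; the zero initialisation is the usual convention, not taken from the source):

import numpy as np

# zero initial LSTM state and "no episode just ended" masks for the first step
initial_state = np.zeros((policy.n_env, policy.n_lstm * 2), dtype=np.float32)
initial_masks = np.zeros((policy.n_env, policy.n_steps), dtype=np.float32)
feed = {policy.ph_ob: obs_batch,
        policy.states_ph: initial_state,
        policy.masks_ph: initial_masks}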
 def get_loss(self):
     with tf.variable_scope(self.scope):
         x = tf.concat([self.features, self.next_features], 2)
         sh = tf.shape(x)
         x = flatten_two_dims(x)
         x = fc(x, units=self.policy.hidsize, activation=activ)
         x = fc(x, units=self.ac_space.n, activation=None)
         param = unflatten_first_dim(x, sh)
         idfpd = self.policy.ac_pdtype.pdfromflat(param)
         return idfpd.neglogp(self.ac)
Example #18
    def get_loss(self):
        ac = self.ac
        sh = ac.shape
        ac = flatten_dims(ac, len(self.ac_space.shape))
        ac = torch.zeros(ac.shape + (self.ac_space.n, )).scatter_(
            1,
            torch.tensor(ac).unsqueeze(1),
            1)  # one_hot(self.ac, self.ac_space.n, axis=2)
        ac = unflatten_first_dim(ac, sh)

        features = self.features
        next_features = self.next_features
        assert features.shape[:-1] == ac.shape[:-1]
        sh = features.shape
        x = flatten_dims(features, 1)
        ac = flatten_dims(ac, 1)
        x = self.loss_net(x, ac)
        x = unflatten_first_dim(x, sh)
        return torch.mean((x - next_features)**2, -1)
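The scatter_-based construction above is just a one-hot encoding of the discrete actions; for integer indices it matches torch.nn.functional.one_hot. A small check (shapes assumed):

import torch
import torch.nn.functional as F

ac = torch.tensor([2, 0, 1])   # flattened discrete actions
n = 4                          # plays the role of self.ac_space.n
scatter_version = torch.zeros(ac.shape + (n,)).scatter_(1, ac.unsqueeze(1), 1)
onehot_version = F.one_hot(ac, n).float()
assert torch.equal(scatter_version, onehot_version)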
        def forward_predictor(x):
            x = tf.layers.dense(add_ac(x),
                                self.hidsize,
                                activation=tf.nn.leaky_relu)

            for _ in range(4):
                x = residual(x)
            n_out_features = self.out_features.get_shape()[-1].value
            x = tf.layers.dense(add_ac(x), n_out_features, activation=None)
            x = unflatten_first_dim(x, sh)
            return x
Example #20
 def get_features(self, x):
     x_has_timesteps = (x.get_shape().ndims == 5)
     if x_has_timesteps:
         sh = x.shape
         x = flatten_dims(x, self.ob_space.n)
     x = np.transpose(x,
                      [i for i in range(len(x.shape) - 3)] + [-1, -3, -2])
     x = (x - self.ob_mean) / self.ob_std
     x = self.features_model(x)
     if x_has_timesteps:
         x = unflatten_first_dim(x, sh)
     return x
Example #21
 def get_features(self, x, reuse):
     nl = tf.nn.leaky_relu
     x_has_timesteps = (x.get_shape().ndims == 5)
     if x_has_timesteps:
         sh = tf.shape(x)
         x = flatten_two_dims(x)
     with tf.variable_scope(self.scope + "_features", reuse=reuse):
         x = (tf.to_float(x) - self.ob_mean) / self.ob_std
         x = small_convnet(x, nl=nl, feat_dim=self.feat_dim, last_nl=nl, layernormalize=False)
     if x_has_timesteps:
         x = unflatten_first_dim(x, sh)
     return x
 def get_features(self, x):
     x_has_timesteps = (len(x.shape) == 5)
     if x_has_timesteps:
         sh = x.shape
         x = flatten_dims(x, len(self.ob_space.shape))
     x = (x - self.ob_mean) / self.ob_std
     x = np.transpose(x, [i for i in range(len(x.shape) - 3)] +
                      [-1, -3, -2])  # transpose channel axis
     x = self.features_model(torch.tensor(x))
     if x_has_timesteps:
         x = unflatten_first_dim(x, sh)
     return x
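The PyTorch get_features variants transpose the observation from channels-last to channels-first before the conv net; for the usual 4-D case the index expression reduces to a plain [N, H, W, C] -> [N, C, H, W] permutation. A small illustration (shapes assumed):

import numpy as np

x = np.zeros((16, 84, 84, 4))                          # [N, H, W, C]
perm = [i for i in range(x.ndim - 3)] + [-1, -3, -2]   # -> [0, -1, -3, -2]
assert np.transpose(x, perm).shape == (16, 4, 84, 84)  # [N, C, H, W]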
Example #23
 def get_loss(self, reuse=False):
     with tf.variable_scope(self.scope, reuse=reuse):
         x = tf.concat([self.features, self.next_features], 2)
         sh = tf.shape(x)
         x = flatten_two_dims(x)
         x = fc(x, units=self.policy.hidsize, activation=activ)
         # x = fc(x, units=self.ac_space.n, activation=None)
         x = fc(x, units=get_action_n(self.ac_space), activation=None)
         param = unflatten_first_dim(x, sh)
         # idfpd = self.policy.ac_pdtype.pdfromflat(param)
         idfpd = self.policy.ac_pdtype.proba_distribution_from_flat(param)
         return idfpd.neglogp(self.ac)
Example #24
    def get_features(self, x, reuse):
        x_has_timesteps = (x.get_shape().ndims == 5)
        if x_has_timesteps:
            sh = tf.shape(x)
            x = flatten_two_dims(x)

        with tf.variable_scope(self.scope + "_features", reuse=reuse):
            x = tf.to_float(x)
            x = small_convnet(x, nl=self.nl, feat_dim=self.feat_dim, last_nl=None, layernormalize=self.layernormalize)

        if x_has_timesteps:
            x = unflatten_first_dim(x, sh)
        return x
Example #25
 def get_features(self, x, reuse):
     if (x.get_shape().ndims == 5):
         shape = tf.shape(x)
         x = flatten_two_dims(x)
     with tf.variable_scope(self.scope + '_features', reuse=reuse):
         x = (tf.cast(x, tf.float32) - self.ob_mean) / self.ob_std
         x = small_convnet(x,
                           nl=self.nl,
                           feat_dim=self.feat_dim,
                           last_nl=None,
                           layernormalize=self.layernormalize)
     if (x.get_shape().ndims == 5):
         x = unflatten_first_dim(x, shape)
     return x
Example #26
    def get_features(self, x):
        x_has_timesteps = (len(x.shape) == 5)
        if x_has_timesteps:
            sh = x.shape  # note: torch has no torch.shape(); use the tensor's .shape
            x = flatten_two_dims(x)

        x = (x - self.ob_mean) / self.ob_std
        x = np.transpose(x, [i for i in range(len(x.shape) - 3)] +
                         [-1, -3, -2])  # [N, H, W, C] --> [N, C, H, W]
        x = self.features_model(torch.tensor(x))

        if x_has_timesteps:
            x = unflatten_first_dim(x, sh)
        return x
Example #27
 def update_features(self, ob, ac):
     sh = ob.shape  # ob.shape = [nenvs, timestep, H, W, C]. Can timestep > 1 ?
     x = flatten_dims(ob, len(self.ob_space.shape))  # flatten the first two dims of ob, giving shape [N, H, W, C]
     flat_features = self.get_features(x)  # [N, feat_dim]
     self.flat_features = flat_features
     hidden = self.pd_hidden(flat_features)
     pdparam = self.pd_head(hidden)
     vpred = self.vf_head(hidden)
     self.vpred = unflatten_first_dim(vpred, sh)  # [nenvs, timestep, v]
     self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
     self.ac = ac
     self.ob = ob
 def decoder(self, z):
     z_has_timesteps = (len(z.shape) == 3)
     if z_has_timesteps:
         sh = z.shape
         z = flatten_dims(z, 1)
     z = self.decoder_model(z)
     if z_has_timesteps:
         z = unflatten_first_dim(z, sh)
     if self.spherical_obs:
         scale = torch.max(self.scale, torch.tensor(-4.0))
         scale = torch.nn.functional.softplus(scale)
         scale = scale * torch.ones(z.shape)
     else:
         z, scale = torch.split(z, [4, 4], -3)
         scale = torch.nn.functional.softplus(scale)
     return torch.distributions.normal.Normal(z, scale)
Example #29
 def get_loss(self):
     # Build the inverse dynamics model: feed [feature(obs), feature(obs_next)] in and get action parameters out,
     # form a Gaussian or softmax distribution over actions, and use its neglogp as the inverse dynamics loss.
     with tf.variable_scope(self.scope):
         # features.shape=(None,None,512), next_features.shape=(None,None,512)
         x = tf.concat([self.features, self.next_features], 2)  # x.shape=(None,None,1024)
         sh = tf.shape(x)
         x = flatten_two_dims(x)  # (None, 1024): feature and next_feature fused together
         x = fc(x, units=self.policy.hidsize, activation=activ)  # (None,512)
         x = fc(x, units=self.ac_space.n, activation=None)  # (None,4)  action logits
         param = unflatten_first_dim(x, sh)  # (None,None,4)  restore the leading dims
         idfpd = self.policy.ac_pdtype.pdfromflat(param)  # build a distribution from the logits
         # For a continuous action space this is a Gaussian negative log-likelihood;
         # for a discrete action space it is a softmax (cross-entropy) loss.
         return idfpd.neglogp(self.ac)  # shape equals the first two dims: (None,None)
Example #30
    def get_loss(self):
        sh = tf.shape(self.features)
        with tf.variable_scope(self.scope):
            x = flatten_two_dims(self.features)
            x = tf.layers.dense(x, self.hidsize, activation=tf.nn.relu)
            x = tf.layers.dense(x, self.hidsize, activation=tf.nn.relu)

            # def residual(x):
            #     res = tf.layers.dense(x, self.hidsize, activation=tf.nn.relu)
            #     res = tf.layers.dense(x, self.hidsize, activation=None)
            #     return x + res

            # for _ in range(4):
            #     x = residual(x)

            n_out_features = self.out_features.get_shape()[-1].value
            x = tf.layers.dense(x, n_out_features, activation=None)
            x = unflatten_first_dim(x, sh)
        return tf.reduce_mean((x - tf.stop_gradient(self.out_features))**2, -1)