Example #1
    def _init(self, ob_space, ac_space, hid_layers=[],
              deterministic=True, diagonal=True, trainable_std=True,
              use_bias=True, use_critic=False,
              seed=None, verbose=True,
              hidden_W_init=U.normc_initializer(1.0),
              higher_mean_init=None,
              higher_logstd_init=tf.constant_initializer(np.log(0.11)),
              const_std_init=False):
        """Params:
            ob_space: task observation space
            ac_space : task action space
            hid__layers: list with width of each hidden layer
            deterministic: whether the actor is deterministic
            diagonal: whether the higher order policy has a diagonal covariance
            matrix
            use_bias: whether to include bias in neurons
            use_critic: whether to include a critic network
            seed: optional random seed
        """
        # Check environment's shapes
        assert isinstance(ob_space, gym.spaces.Box)
        assert len(ac_space.shape) == 1
        # Set seed
        if seed is not None:
            set_global_seeds(seed)
        # Set some attributes
        self.diagonal = diagonal
        self.use_bias = use_bias
        batch_length = None  # Accepts a sequence of episodes of arbitrary length
        self.ac_dim = ac_space.shape[0]
        self.ob_dim = ob_space.shape[0]
        self.linear = not hid_layers
        self.verbose = verbose
        self._ob = ob = U.get_placeholder(
            name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))

        # Actor (N.B.: weight initialization is irrelevant)
        with tf.variable_scope('actor'):
            last_out = ob
            for i, hid_size in enumerate(hid_layers):
                # MLP feature extraction
                last_out = tf.nn.tanh(
                    tf.layers.dense(last_out, hid_size,
                                    name='fc%i' % (i+1),
                                    kernel_initializer=hidden_W_init,
                                    use_bias=use_bias))
            if deterministic and isinstance(ac_space, gym.spaces.Box):
                # Deterministic action selection
                self.actor_mean = actor_mean = \
                    tf.layers.dense(last_out, ac_space.shape[0],
                                    name='action',
                                    kernel_initializer=hidden_W_init,
                                    use_bias=use_bias)
            else:
                raise NotImplementedError

        # Get flattened actor weights
        with tf.variable_scope('actor') as scope:
            self.actor_weights = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                scope=scope.name)
            # flatten weights
            self.flat_actor_weights = tf.concat(
                [tf.reshape(w, [-1]) for w in self.actor_weights], axis=0)
            self._n_actor_weights = n_actor_weights = \
                self.flat_actor_weights.shape[0]

        # Higher order policy (Gaussian)
        with tf.variable_scope('higher'):
            if higher_mean_init is None:
                # Initial means sampled from a normal distribution with stddev 0.01
                higher_mean_init = tf.where(
                    tf.not_equal(self.flat_actor_weights,
                                 tf.constant(0, dtype=tf.float32)),
                    tf.random_normal(shape=[n_actor_weights.value],
                                     stddev=0.01),
                    tf.zeros(shape=[n_actor_weights]))  # bias init always zero
            self.higher_mean = tf.get_variable(
                name='higher_mean',
                initializer=higher_mean_init,
                shape=self.flat_actor_weights.get_shape())
            # Keep the weights' domain compact
            # self.higher_mean = higher_mean = tf.clip_by_value(
            #     self.higher_mean, -1, 1, 'higher_mean_clipped')
            higher_mean = self.higher_mean
            if diagonal:
                if const_std_init:
                    self.higher_logstd = higher_logstd = \
                        tf.get_variable(
                            name='higher_logstd',
                            initializer=higher_logstd_init,
                            trainable=trainable_std)
                else:
                    self.higher_logstd = higher_logstd = \
                        tf.get_variable(
                            name='higher_logstd',
                            shape=[n_actor_weights],
                            initializer=higher_logstd_init,
                            trainable=trainable_std)
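                # Flat pd parameters: concat of the mean and one log-std per
                # weight ('higher_mean * 0. + higher_logstd' broadcasts the
                # log-std, possibly a scalar, to the mean's shape)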
                pdparam = tf.concat([higher_mean,
                                     higher_mean * 0. + higher_logstd],
                                    axis=0)
                self.pdtype = pdtype = \
                    DiagGaussianPdType(n_actor_weights.value)
            else:
                # Cholesky covariance matrix
                self.higher_logstd = higher_logstd = tf.get_variable(
                    name='higher_logstd',
                    shape=[n_actor_weights*(n_actor_weights + 1)//2],
                    initializer=tf.initializers.constant(0.))
                pdparam = tf.concat([higher_mean, higher_logstd],
                                    axis=0)
                self.pdtype = pdtype = CholeskyGaussianPdType(
                    n_actor_weights.value)

        # Sample actor weights
        self.pd = pdtype.pdfromflat(pdparam)
        sampled_actor_params = self.pd.sample()
        symm_sampled_actor_params = self.pd.sample_symmetric()
        self._sample_actor_params = U.function([], [sampled_actor_params])
        self._sample_symm_actor_params = U.function(
            [], list(symm_sampled_actor_params))

        # Assign actor weights
        with tf.variable_scope('actor') as scope:
            actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                             scope=scope.name)
            self._use_sampled_actor_params = \
                U.assignFromFlat(actor_params, sampled_actor_params)
            self._get_actor_params = U.GetFlat(actor_params)
            self._set_actor_params = U.SetFromFlat(actor_params)

        # Act
        self._action = action = actor_mean
        self._act = U.function([ob], [action])

        # Manage higher policy weights
        with tf.variable_scope('higher') as scope:
            self._higher_params = higher_params = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
            self.flat_higher_params = tf.concat([tf.reshape(w, [-1]) for w in
                                                 self._higher_params], axis=0)
            self._n_higher_params = self.flat_higher_params.shape[0]
            self._get_flat_higher_params = U.GetFlat(higher_params)
            self._set_higher_params = U.SetFromFlat(self._higher_params)

        # Evaluating
        self._actor_params_in = actor_params_in = \
            U.get_placeholder(name='actor_params_in',
                              dtype=tf.float32,
                              shape=[batch_length] + [n_actor_weights])
        self._rets_in = rets_in = \
            U.get_placeholder(name='returns_in',
                              dtype=tf.float32,
                              shape=[batch_length])
        ret_mean, ret_var = tf.nn.moments(rets_in, axes=[0])
        ret_std = tf.sqrt(ret_var)  # tf.nn.moments returns the variance
        self._get_ret_mean = U.function([self._rets_in], [ret_mean])
        self._get_ret_std = U.function([self._rets_in], [ret_std])
        self._logprobs = logprobs = self.pd.logp(actor_params_in)
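        # Gradient of sum_i logp(theta_i) * R_i w.r.t. the higher-policy
        # parameters: the PGPE gradient estimate times the batch size
        # (divide by the number of episodes to obtain the estimate itself)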
        pgpe_times_n = U.flatgrad(logprobs*rets_in, higher_params)
        self._get_pgpe_times_n = U.function([actor_params_in, rets_in],
                                            [pgpe_times_n])
        self._get_actor_mean = U.function([ob], [self.actor_mean])
        self._get_higher_mean = U.function([ob], [self.higher_mean])
        self._get_higher_std = U.function([], tf.exp([self.higher_logstd]))

        # Batch off-policy PGPE
        self._probs = tf.exp(logprobs)
        self._behavioral = None
        self._renyi_other = None

        # Renyi computation
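        # exp(sum_i log sigma_i) = prod_i sigma_i, i.e. the determinant of the
        # diagonal standard-deviation matrix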
        self._det_sigma = tf.exp(tf.reduce_sum(self.higher_logstd))

        # Fisher computation (diagonal case)
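        # For a factorized Gaussian the Fisher information is 1/sigma_i^2 =
        # exp(-2 * logstd_i) for each mean component and the constant 2 for
        # each log-std component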
        mean_fisher_diag = tf.exp(-2*self.higher_logstd)
        if trainable_std:
            cov_fisher_diag = mean_fisher_diag*0 + 2
            self._fisher_diag = tf.concat(
                [mean_fisher_diag, cov_fisher_diag], axis=0)
        else:
            self._fisher_diag = mean_fisher_diag
        self._get_fisher_diag = U.function([], [self._fisher_diag])
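
A minimal usage sketch for the policy above, assuming a hypothetical wrapper class PeMlpPolicy whose constructor ends up calling the _init shown here; the environment name and the training loop are illustrative only, while the helpers used (_sample_actor_params, _set_actor_params, _act, _get_pgpe_times_n) are the ones built in _init:

import gym

env = gym.make('MountainCarContinuous-v0')    # any Box-observation, Box-action task
pol = PeMlpPolicy('pi', env.observation_space, env.action_space,
                  hid_layers=[], seed=0)      # hypothetical constructor calling _init

thetas, rets = [], []
for episode in range(100):
    theta = pol._sample_actor_params()[0]     # draw actor weights from the higher policy
    pol._set_actor_params(theta)              # load them into the actor network
    ob, done, ret = env.reset(), False, 0.
    while not done:
        ac = pol._act(ob[None])[0][0]         # deterministic action of the sampled actor
        ob, rew, done, _ = env.step(ac)
        ret += rew
    thetas.append(theta)
    rets.append(ret)

# _get_pgpe_times_n returns the gradient summed over the batch, so divide by N
grad = pol._get_pgpe_times_n(thetas, rets)[0] / len(rets)
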
Example #2
    def _init(self,
              ob_space,
              ac_space,
              hid_layers=[],
              deterministic=True,
              diagonal=True,
              use_bias=False,
              use_critic=False,
              seed=None,
              verbose=True,
              zero_init=False):
        """Params:
            ob_space: task observation space
            ac_space : task action space
            hid__layers: list with width of each hidden layer
            deterministic: whether the actor is deterministic
            diagonal: whether the higher order policy has a diagonal covariance
            matrix
            use_bias: whether to include bias in neurons
            use_critic: whether to include a critic network
            seed: optional random seed
        """
        assert isinstance(ob_space, gym.spaces.Box)
        assert len(ac_space.shape) == 1
        self.diagonal = diagonal
        self.use_bias = use_bias
        batch_length = None  # Accepts a sequence of episodes of arbitrary length
        self.observation_space = ob_space
        self.action_space = ac_space
        self.ac_dim = ac_space.shape[0]
        self.ob_dim = ob_space.shape[0]
        self.hid_layers = hid_layers
        self.deterministic = deterministic
        self.use_critic = use_critic
        self.linear = not hid_layers
        self.verbose = verbose

        if seed is not None:
            set_global_seeds(seed)

        self._ob = ob = U.get_placeholder(name="ob",
                                          dtype=tf.float32,
                                          shape=[None] + list(ob_space.shape))

        # Critic (normally not used)
        if use_critic:
            with tf.variable_scope('critic'):
                last_out = ob
                for i, hid_size in enumerate(hid_layers):
                    last_out = tf.nn.tanh(
                        tf.layers.dense(
                            last_out,
                            hid_size,
                            name="fc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))
                self.vpred = tf.layers.dense(
                    last_out,
                    1,
                    name='final',
                    kernel_initializer=U.normc_initializer(1.0))[:, 0]

        # Actor (N.B.: weight initialization is irrelevant)
        with tf.variable_scope('actor'):
            last_out = ob
            for i, hid_size in enumerate(hid_layers):
                # MLP feature extraction
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name='fc%i' % (i + 1),
                        kernel_initializer=tf.initializers.constant(0.),
                        use_bias=use_bias))
            if deterministic and isinstance(ac_space, gym.spaces.Box):
                # Deterministic action selection
                self.actor_mean = actor_mean = tf.layers.dense(
                    last_out,
                    ac_space.shape[0],
                    name='action',
                    kernel_initializer=tf.initializers.constant(0.),
                    use_bias=use_bias)
            else:
                raise NotImplementedError  # Currently supports only deterministic action policies

        # Get flattened actor weights
        with tf.variable_scope('actor') as scope:
            self.actor_weights = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
            # flatten weights
            self.flat_actor_weights = tf.concat(
                [tf.reshape(w, [-1]) for w in self.actor_weights], axis=0)
            self._n_actor_weights = n_actor_weights = \
                self.flat_actor_weights.shape[0]

        # Higher order policy (Gaussian)
        with tf.variable_scope('higher'):
            if zero_init:
                higher_mean_init = tf.where(
                    tf.not_equal(self.flat_actor_weights,
                                 tf.constant(0, dtype=tf.float32)),
                    tf.zeros(shape=[n_actor_weights.value]),
                    tf.zeros(shape=[n_actor_weights]))
            else:
                # Initial means sampled from a normal distribution with stddev 0.01
                higher_mean_init = tf.where(
                    tf.not_equal(self.flat_actor_weights,
                                 tf.constant(0, dtype=tf.float32)),
                    tf.random_normal(shape=[n_actor_weights.value],
                                     stddev=0.01),
                    tf.zeros(shape=[n_actor_weights]))
            self.higher_mean = higher_mean = tf.get_variable(
                name='higher_mean', initializer=higher_mean_init)

            if diagonal:
                # Diagonal covariance matrix; all log-stds initialized to 0 (std = 1)
                self.higher_logstd = higher_logstd = tf.get_variable(
                    name='higher_logstd',
                    shape=[n_actor_weights],
                    initializer=tf.initializers.constant(0.))
                pdparam = tf.concat(
                    [higher_mean, higher_mean * 0. + higher_logstd], axis=0)
                self.pdtype = pdtype = DiagGaussianPdType(
                    n_actor_weights.value)
            else:
                # Cholesky covariance matrix
                self.higher_logstd = higher_logstd = tf.get_variable(
                    name='higher_logstd',
                    shape=[n_actor_weights * (n_actor_weights + 1) // 2],
                    initializer=tf.initializers.constant(0.))
                pdparam = tf.concat([higher_mean, higher_logstd], axis=0)
                self.pdtype = pdtype = CholeskyGaussianPdType(
                    n_actor_weights.value)

        # Sample actor weights
        self.pd = pdtype.pdfromflat(pdparam)
        sampled_actor_params = self.pd.sample()
        symm_sampled_actor_params = self.pd.sample_symmetric()
        self._sample_symm_actor_params = U.function(
            [], list(symm_sampled_actor_params))
        self._sample_actor_params = U.function([], [sampled_actor_params])

        # Assign actor weights
        with tf.variable_scope('actor') as scope:
            actor_params = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
            self._use_sampled_actor_params = U.assignFromFlat(
                actor_params, sampled_actor_params)
            self._set_actor_params = U.SetFromFlat(actor_params)
            self._get_actor_params = U.GetFlat(actor_params)

        # Act
        self._action = action = actor_mean
        self._act = U.function([ob], [action])

        # Higher policy weights
        with tf.variable_scope('higher') as scope:
            self._higher_params = higher_params = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
            # flatten weights
            self.flat_higher_params = tf.concat(
                [tf.reshape(w, [-1]) for w in self._higher_params], axis=0)
            self._n_higher_params = self.flat_higher_params.shape[0]
            self._get_flat_higher_params = U.GetFlat(higher_params)
            self._set_higher_params = U.SetFromFlat(self._higher_params)

        # Batch PGPE
        self._actor_params_in = actor_params_in = \
                U.get_placeholder(name='actor_params_in',
                                  dtype=tf.float32,
                                  shape=[batch_length] + [n_actor_weights])
        self._rets_in = rets_in = U.get_placeholder(name='returns_in',
                                                    dtype=tf.float32,
                                                    shape=[batch_length])
        ret_mean, ret_var = tf.nn.moments(rets_in, axes=[0])
        ret_std = tf.sqrt(ret_var)  # tf.nn.moments returns the variance
        self._get_ret_mean = U.function([self._rets_in], [ret_mean])
        self._get_ret_std = U.function([self._rets_in], [ret_std])
        self._logprobs = logprobs = self.pd.logp(actor_params_in)
        pgpe_times_n = U.flatgrad(logprobs * rets_in, higher_params)
        self._get_pgpe_times_n = U.function([actor_params_in, rets_in],
                                            [pgpe_times_n])

        # One-episode PGPE
        # Used N times to compute the baseline -> can we do better?
        self._one_actor_param_in = one_actor_param_in = U.get_placeholder(
            name='one_actor_param_in',
            dtype=tf.float32,
            shape=[n_actor_weights])
        one_logprob = self.pd.logp(one_actor_param_in)
        score = U.flatgrad(one_logprob, higher_params)
        score_norm = tf.norm(score)
        self._get_score = U.function([one_actor_param_in], [score])
        self._get_score_norm = U.function([one_actor_param_in], [score_norm])

        # Batch off-policy PGPE
        self._probs = tf.exp(logprobs)
        self._behavioral = None
        self._renyi_other = None

        # One-episode off-policy PGPE
        self._one_prob = tf.exp(one_logprob)

        # Renyi computation
        self._det_sigma = tf.exp(tf.reduce_sum(self.higher_logstd))

        # Fisher computation (diagonal case)
        mean_fisher_diag = tf.exp(-2 * self.higher_logstd)
        cov_fisher_diag = mean_fisher_diag * 0 + 2
        self._fisher_diag = tf.concat([mean_fisher_diag, cov_fisher_diag],
                                      axis=0)
        self._get_fisher_diag = U.function([], [self._fisher_diag])

        # Multiple importance sampling
        self._memory = None
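
The handles created above can be combined into a complete update step. A hedged sketch follows (the update rule itself is illustrative, not taken from this code): the diagonal Fisher returned by _get_fisher_diag can drive an approximate natural-gradient ascent step on the flattened higher-policy parameters, where pol denotes a hypothetical instance wrapping the _init above and thetas/rets are the sampled actor weights and episode returns:

import numpy as np

def natural_pgpe_step(pol, thetas, rets, lr=1e-2):
    # Vanilla PGPE gradient: _get_pgpe_times_n sums over the batch, so divide by N
    grad = pol._get_pgpe_times_n(thetas, rets)[0] / len(rets)
    # Diagonal Fisher of the higher-order Gaussian; elementwise division gives
    # an approximate natural gradient F^-1 g (ordering assumed to match the
    # flattened [mean, log-std] parameter vector)
    fisher = pol._get_fisher_diag()[0]
    nat_grad = grad / np.clip(fisher, 1e-8, None)
    # Gradient-ascent step on the flat higher-policy parameters
    params = pol._get_flat_higher_params()
    pol._set_higher_params(params + lr * nat_grad)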