Exemplo n.º 1
0
    def _init(self, ob_space, ac_space, hidden_sizes):
        """Build the value-function and policy graphs for this object.

        Both networks are stacks of tanh dense layers (one per entry in
        ``hidden_sizes``) fed with the same normalized, clipped observation.

        Args:
            ob_space: Observation space; only its ``shape`` is read here.
            ac_space: Action space; ``shape[0]`` sets the policy output width.
            hidden_sizes: Iterable of hidden-layer widths, used for both
                networks.
        """
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.hidden_sizes = hidden_sizes

        # Observation input with a leading batch dimension of unknown size.
        ob_shape = [None] + list(self.ob_space.shape)
        ob = tfu.get_placeholder(name="ob", dtype=tf.float32, shape=ob_shape)

        # Standardize with the running statistics, then clip to [-5, 5].
        self.obs_norm = tfu.RunningMeanStd(shape=self.ob_space.shape)
        standardized = (ob - self.obs_norm.mean) / self.obs_norm.std
        obs = tf.clip_by_value(standardized, -5.0, 5.0)

        # Value-function network: layers "vffc1".."vffcN", then "vffinal".
        vf_hidden = obs
        for layer_idx, width in enumerate(hidden_sizes, start=1):
            vf_pre = tfu.dense(vf_hidden,
                               width,
                               "vffc%i" % layer_idx,
                               weight_init=tfu.normc_initializer(1.0))
            vf_hidden = tf.nn.tanh(vf_pre)
        vf_out = tfu.dense(vf_hidden,
                           1,
                           "vffinal",
                           weight_init=tfu.normc_initializer(1.0))
        # Drop the width-1 output axis so vpred has one value per row.
        self.vpred = vf_out[:, 0]

        # Policy network: layers "policyfc1".."policyfcN", then "polfinal".
        pol_hidden = obs
        for layer_idx, width in enumerate(hidden_sizes, start=1):
            pol_pre = tfu.dense(pol_hidden,
                                width,
                                "policyfc%i" % layer_idx,
                                weight_init=tfu.normc_initializer(1.0))
            pol_hidden = tf.nn.tanh(pol_pre)

        ac_dim = self.ac_space.shape[0]
        mean = tfu.dense(pol_hidden,
                         ac_dim,
                         "polfinal",
                         tfu.normc_initializer(0.01))
        # A single log-std row, independent of the observation.
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, ac_dim],
                                 initializer=tf.zeros_initializer())
        # Adding logstd to a zeroed copy of mean gives it mean's shape
        # (presumably via broadcasting over the batch axis) for the concat.
        logstd_like_mean = mean * 0.0 + logstd
        pdparam = tf.concat([mean, logstd_like_mean], axis=1)
        self.pd = DiagGaussianPd(pdparam)

        # Boolean flag handed to tfu.switch to pick sample() vs. mode().
        stochastic = tfu.get_placeholder(name="stochastic",
                                         dtype=tf.bool,
                                         shape=())
        self.ac = tfu.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = tfu.function([stochastic, ob], [self.ac, self.vpred])
        self._predict = tfu.function([ob], self.vpred)

        # For pickle
        self.flatvars = tfu.GetFlat(self.get_trainable_variables())
        self.unflatvars = tfu.SetFromFlat(self.get_trainable_variables())
Exemplo n.º 2
0
    def ppo_surrogate(self):
        """Build the KL-penalized surrogate objective and its Adam update.

        The surrogate is the mean of ``ratio * adv``, where ``ratio`` is
        ``exp(logp_new(actions) - logp_old(actions))``. The minimized loss
        is ``-surr + self.beta * mean_kl(old_pi, pi)``, optimized over
        ``self.pol_var_list`` with learning rate ``self.learn_rate``.

        Returns:
            A ``(surr_fun, optim_fun)`` tuple of ``tfu.function`` objects,
            both taking ``[obs, actions, adv]``: the first evaluates the
            surrogate, the second runs the optimizer op.
        """
        # Placeholders are looked up by name rather than created here.
        obs = tfu.get_placeholder_cached(name="ob")
        actions = tfu.get_placeholder_cached(name="actions")
        adv = tfu.get_placeholder_cached(name="advantages")

        kl = self.old_pi.pd.kl(self.pi.pd)
        meankl = tf.reduce_mean(kl)

        ratio = tf.exp(self.pi.pd.logp(actions) - self.old_pi.pd.logp(actions))
        surr = tf.reduce_mean(ratio * adv)

        # surr is already a scalar mean, so it is negated directly; averaging
        # it a second time would be a no-op.
        loss = -surr + self.beta * meankl
        optim = tf.train.AdamOptimizer(self.learn_rate).minimize(
            loss, var_list=self.pol_var_list)

        surr_fun = tfu.function([obs, actions, adv], surr)
        optim_fun = tfu.function([obs, actions, adv], optim)

        return surr_fun, optim_fun
Exemplo n.º 3
0
    def ppo_surrogate(self):
        """Build the clipped surrogate objective and its Adam update.

        The probability ratio ``exp(logp_new - logp_old)`` is clipped to
        ``[1 - c, 1 + c]`` with ``c = self.clip_coef * lrmult``, and the
        surrogate is the mean of the element-wise minimum of the clipped
        and unclipped ``ratio * adv`` terms. Its negation is minimized over
        ``self.pol_var_list`` with learning rate ``self.learn_rate``.

        ``self.clip_coef`` is only read: the scaled coefficient is kept in
        a local so that building the graph does not overwrite the configured
        value with a tensor (and a repeated call does not compound the
        ``lrmult`` factor).

        Returns:
            A ``(surr_fun, optim_fun)`` tuple of ``tfu.function`` objects,
            both taking ``[obs, actions, adv, lrmult]``: the first evaluates
            the surrogate, the second runs the optimizer op.
        """
        obs = tfu.get_placeholder_cached(name="ob")
        actions = tfu.get_placeholder_cached(name="actions")
        adv = tfu.get_placeholder_cached(name="advantages")
        # Scalar multiplier fed at run time to scale the clipping range.
        lrmult = tfu.get_placeholder(name="lrmult", dtype=tf.float32, shape=[])

        clip_coef = self.clip_coef * lrmult
        ratio = tf.exp(self.pi.pd.logp(actions) - self.old_pi.pd.logp(actions))
        ratio_clip = tf.clip_by_value(ratio, 1 - clip_coef, 1 + clip_coef)
        surr = tf.reduce_mean(tf.minimum(ratio * adv, ratio_clip * adv))

        # surr is already a scalar mean, so it is negated directly.
        optim = tf.train.AdamOptimizer(self.learn_rate).minimize(
            -surr, var_list=self.pol_var_list)

        surr_fun = tfu.function([obs, actions, adv, lrmult], surr)
        optim_fun = tfu.function([obs, actions, adv, lrmult], optim)

        return surr_fun, optim_fun
Exemplo n.º 4
0
    def build_graph(self):
        """Create placeholders, variable lists, ops and callables for training.

        Sets on ``self``:
            all_var_list, pol_var_list, vf_var_list: trainable variables of
                ``self.pi``, split by variable-name prefix.
            get_flat / set_from_flat: flat accessors for the policy variables.
            vf_get_flat / vf_set_from_flat: same for the value variables.
            kl_old_new, entropy, CPI_surrogate, flat_vpg, vf_loss, vf_optim:
                ``tfu.function`` wrappers around the corresponding ops.
            assign_old_new: copies every variable of ``self.pi`` into
                ``self.old_pi``.

        Raises:
            AssertionError: if the policy/value variable counts do not match
                the relation expected for the distribution type.
        """
        # Define all placeholders and get all variable lists
        actions = tfu.get_placeholder(name="actions",
                                      dtype=tf.float32,
                                      shape=[None, self.pi.ac_shape])
        obs = tfu.get_placeholder_cached(name="ob")
        adv = tfu.get_placeholder(name="advantages",
                                  dtype=tf.float32,
                                  shape=[None])
        ret = tfu.get_placeholder(name="returns",
                                  dtype=tf.float32,
                                  shape=[None])
        self.all_var_list = self.pi.get_trainable_variables()
        self.pol_var_list = [
            v for v in self.all_var_list
            if v.name.startswith(("pi/pol", "pi/logstd"))
        ]
        self.vf_var_list = [
            v for v in self.all_var_list if v.name.startswith("pi/vff")
        ]

        # Sanity-check the split. Strings are compared by value: an identity
        # test (`is`) against a literal only succeeds by interning accident.
        pd_name = self.pi.pd.__name__
        if pd_name == 'DiagGaussian':
            # The Gaussian policy is expected to own one extra variable.
            assert len(self.pol_var_list) == len(self.vf_var_list) + 1
        elif pd_name == 'Categorical':
            assert len(self.pol_var_list) == len(self.vf_var_list)

        # Define operations
        kl = self.old_pi.pd.kl(self.pi.pd)
        mean_kl = tf.reduce_mean(kl)
        entropy = self.pi.pd.entropy()
        mean_ent = tf.reduce_mean(entropy)
        # self.surrogate() is defined outside this method.
        cpi_surr = self.surrogate()
        flat_vpg = tfu.flatgrad(cpi_surr, self.pol_var_list)
        # Mean squared error between predicted values and fed returns.
        vferr = tf.reduce_mean(tf.square(self.pi.vpred - ret))
        vfoptim = tf.train.AdamOptimizer(learning_rate=self.alpha_vf).minimize(
            vferr, var_list=self.vf_var_list)

        # Define functions for operations
        self.get_flat = tfu.GetFlat(self.pol_var_list)
        self.set_from_flat = tfu.SetFromFlat(self.pol_var_list)
        self.vf_get_flat = tfu.GetFlat(self.vf_var_list)
        self.vf_set_from_flat = tfu.SetFromFlat(self.vf_var_list)
        self.kl_old_new = tfu.function([obs, actions], mean_kl)
        self.entropy = tfu.function([obs, actions], mean_ent)
        self.CPI_surrogate = tfu.function([obs, actions, adv], cpi_surr)
        self.flat_vpg = tfu.function([obs, actions, adv], flat_vpg)
        self.vf_loss = tfu.function([obs, ret], vferr)
        self.vf_optim = tfu.function([obs, ret], vfoptim)

        # No inputs or outputs: running it only applies the assign updates.
        self.assign_old_new = tfu.function(
            [], [],
            updates=[
                tf.assign(oldv, newv) for (oldv, newv) in zip(
                    self.old_pi.get_variables(), self.pi.get_variables())
            ])