Example #1
    def _build_graph(self, **kwargs):
        """ We treat tfFunctionApproximator as the stochastic map of the policy
        (which inputs ph_x and outputs ts_yh) and build additional
        attributes/methods required by Policy """
        # build tf.Variables
        # add attributes ph_x, ts_nor_x, ts_y, _yh, _sh_vars,
        #                ph_y, ts_pi, ts_logp, ts_pid, ts_pir, ts_pi_given_r
        tfFunctionApproximator._build_graph(self, **kwargs)
        # r_dim: dimension of randomness in generating actions.
        # build additional graphs for Policy
        # build conditional distribution
        self._pi = self._yh
        self._pi_given_r = U.function([self.ph_x, self.ph_r],
                                      self.ts_pi_given_r)
        self._pid = U.function([self.ph_x],
                               self.ts_pid)  # derandomized actions
        # actions concatenated with the randomness used to generate them
        self._pir = U.function([self.ph_x], self.ts_pir)
        self._logp = U.function([self.ph_x, self.ph_y], self.ts_logp)
        self._logp_grad = U.function([self.ph_x, self.ph_y],
                                     tf.gradients(self.ts_logp, self.ts_vars))
        # build fvp operator (this depends only on self)
        ph_g, ts_grads = self._sh_vars.build_flat_ph()
        ts_kl = self.build_kl(self, self, p1_sg=True)
        ts_kl_grads = U.gradients(ts_kl,
                                  self.ts_vars)  # grad to the 2nd arg of KL
        ts_inner_prod = tf.add_n([
            tf.reduce_sum(kg * v)
            for (kg, v) in zipsame(ts_kl_grads, ts_grads)
        ])
        ts_fvp = U.gradients(
            ts_inner_prod,
            self.ts_vars)  # Fisher-vector product (Fisher information matrix times ph_g)
        ts_fvp = tf.concat([tf.reshape(f, [-1]) for f in ts_fvp],
                           axis=-1)  # flatten into one contiguous vector
        self._fvp = U.function([self.ph_x, ph_g], ts_fvp)

        # build nabla_logp_f
        ts_loss = tf.reduce_sum(self.ph_f * self.ts_logp)  # sum!!
        ts_grads = U.gradients(ts_loss, self.ts_vars)
        # fill None with zeros; otherwise session.run would attempt to fetch None
        ts_grads = [
            g if g is not None else tf.zeros_like(v)
            for (v, g) in zipsame(self.ts_vars, ts_grads)
        ]
        # need to flatten
        compute_ts_grad = U.function([self.ph_x, self.ph_y, self.ph_f],
                                     ts_grads)
        self.nabla_logp_f = lambda x, y, f: flatten(compute_ts_grad(x, y, f))
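
The _fvp function built above maps a flat vector to the product of the policy's Fisher information matrix with that vector, without ever materializing the matrix. Such a callable is typically consumed by an iterative solver; below is a minimal conjugate-gradient sketch for turning it into a natural-gradient direction. The solver itself, the damping term, and the wiring fvp = lambda v: policy._fvp(obs_batch, v) are illustrative assumptions, not part of the snippet above.

import numpy as np

def conjugate_gradient(fvp, b, iters=10, tol=1e-10, damping=1e-2):
    """Solve (F + damping * I) v = b using only Fisher-vector products.

    fvp maps a flat vector to F @ vector, e.g. the hypothetical wiring
    fvp = lambda v: policy._fvp(obs_batch, v) for the snippet above.
    """
    x = np.zeros_like(b)
    r = b.copy()       # residual of b - (F + damping*I) x, with x = 0
    p = r.copy()       # current search direction
    rs_old = r.dot(r)
    for _ in range(iters):
        Ap = fvp(p) + damping * p
        alpha = rs_old / (p.dot(Ap) + 1e-12)
        x += alpha * p
        r -= alpha * Ap
        rs_new = r.dot(r)
        if rs_new < tol:
            break
        p = r + (rs_new / rs_old) * p
        rs_old = rs_new
    return x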
Example #2
    def _build_graph(self, **kwargs):
        """ We treat tfFunctionApproximator as the stochastic map of the policy
        (which inputs ph_x and outputs ts_yh) and build additional
        attributes/methods required by Policy """
        # build tf.Variables
        # add attributes ph_x, ts_nor_x, ts_y, _yh, _sh_vars,
        #                ph_y, ts_pi, ts_logp, ts_pid
        tfFunctionApproximator._build_graph(self, **kwargs)

        # build additional graphs for Policy
        # build conditional distribution
        self._pi = self._yh
        self._pid = U.function([self.ph_x], self.ts_pid)
        self._logp = U.function([self.ph_x, self.ph_y], self.ts_logp)
        # build fvp operator (this depends only on self)
        ph_g, ts_grads = self._sh_vars.build_flat_ph()
        ts_kl = self.build_kl(self, self, p1_sg=True)
        ts_kl_grads = U.gradients(ts_kl,
                                  self.ts_vars)  # grad to the 2nd arg of KL
        ts_inner_prod = tf.add_n([
            tf.reduce_sum(kg * v)
            for (kg, v) in zipsame(ts_kl_grads, ts_grads)
        ])
        ts_fvp = U.gradients(
            ts_inner_prod,
            self.ts_vars)  # Fisher-vector product (Fisher information matrix times ph_g)
        ts_fvp = tf.concat([tf.reshape(f, [-1]) for f in ts_fvp],
                           axis=-1)  # flatten into one contiguous vector
        self._fvp = U.function([self.ph_x, ph_g], ts_fvp)
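
In both examples, self._sh_vars.build_flat_ph() returns a single flat placeholder ph_g together with tensors reshaped back to the individual variable shapes; that is what lets ts_grads be paired elementwise with ts_kl_grads while _fvp still accepts one contiguous vector. Below is a rough plain-numpy sketch of that flatten/unflatten bookkeeping; the shapes argument and the function name are made up for illustration, and the real helper builds symbolic tf tensors rather than numpy arrays.

import numpy as np

def unflatten(flat_vec, shapes):
    """Split a flat vector into arrays matching the given variable shapes."""
    pieces, start = [], 0
    for shape in shapes:
        size = int(np.prod(shape))
        pieces.append(np.reshape(flat_vec[start:start + size], shape))
        start += size
    return pieces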
Example #3
    def _build_graph(self, **bg_kwargs):
        ts_loss, ph_args = self._build_loss_op(**bg_kwargs)
        # define compute_loss and compute_grad wrt the loss
        self._compute_loss = U.function(ph_args, ts_loss)
        ts_grads = U.gradients(ts_loss, self._ts_vars)
        # fill None with zeros; otherwise session.run would attempt to fetch None
        ts_grads = [g if g is not None else tf.zeros_like(v) for (v, g) in
                    zipsame(self._ts_vars, ts_grads)]
        self._compute_grad = U.function(ph_args, ts_grads)
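
The None-filling step exists because tf.gradients returns None for every variable the loss does not depend on, and session.run cannot fetch None. A minimal standalone illustration of that behavior, written against the TF1-style graph API these snippets assume (using tf.compat.v1 so it also runs under TF2):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

w = tf.Variable([1.0, 2.0], name="w")
b = tf.Variable([0.5], name="b")    # deliberately unused in the loss below
loss = tf.reduce_sum(w * w)

grads = tf.gradients(loss, [w, b])  # -> [grad_w, None], since loss ignores b
# Replace the None entry with zeros so every element can be fetched.
grads = [g if g is not None else tf.zeros_like(v) for v, g in zip([w, b], grads)]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(grads))  # [array([2., 4.], ...), array([0.], ...)]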
Example #4
def get_valcombs_and_keys(ranges):
    # Each range is a flat list alternating a key with its list of values.
    keys = []
    for r in ranges:
        keys += r[::2]  # keys sit at the even indices
    # Values within one range are zipped so they vary together; the
    # Cartesian product then crosses the different ranges.
    values = [list(zipsame(*r[1::2])) for r in ranges]
    cs = itertools.product(*values)
    combs = []
    for c in cs:
        comb = []
        for x in c:
            comb += x  # concatenate the zipped value groups into one flat combination
        print(comb)
        combs.append(comb)
    return combs, keys
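
A hypothetical call may make the input convention clearer: within one range the value lists are zipped element-wise, while separate ranges are crossed. The keys 'lr', 'seed', and 'batch_size' below are made up for illustration.

ranges = [
    ['lr', [1e-2, 1e-3], 'seed', [0, 1]],  # (lr, seed) pairs vary together
    ['batch_size', [32, 64]],              # crossed with the pairs above
]
combs, keys = get_valcombs_and_keys(ranges)
# keys  == ['lr', 'seed', 'batch_size']
# combs == [[0.01, 0, 32], [0.01, 0, 64], [0.001, 1, 32], [0.001, 1, 64]]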