def advs(self, ro, lambd=None, use_is=None, ref_policy=None):  # advantage function
    """ Compute adv (evaluated at ro) wrt ref_policy.

        ro: a list of Rollout instances.

        Note `ref_policy` is only considered when `use_is` is True; in that
        case, if `ref_policy` is None, the adv is computed wrt
        `self.ref_policy`. When `use_is` is False, the adv is biased toward
        the behavior policy that collected the data.
    """
    if use_is is None:
        use_is = self.use_is
    vfns = self.vfns(ro)
    if use_is == 'multi':
        ws = self.weights(ro, ref_policy)  # importance weights
        advs = [self._pe.adv(rollout.rws, vf, rollout.done, w=w, lambd=lambd)
                for rollout, vf, w in zipsame(ro, vfns, ws)]
    else:
        advs = [self._spe.adv(rollout.rws, vf, rollout.done, w=1.0, lambd=lambd)
                for rollout, vf in zipsame(ro, vfns)]
    return advs, vfns
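# Hedged usage sketch (not from the source): how `advs` might be called from a
# learner's update step. `gen_ro` and the `oracle` attribute name are
# hypothetical placeholders; only the call signature of `advs` above is real.
#
#   ro = gen_ro(policy, min_n_samples=2000)        # list of Rollout instances
#   advs, vfns = oracle.advs(ro, lambd=0.98)       # GAE-style advantages
#   adv = np.concatenate(advs)                     # flatten across rollouts
#   adv = (adv - adv.mean()) / (adv.std() + 1e-8)  # optional normalization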
def _build_graph(self, **bg_kwargs):
    ts_loss, ph_args = self._build_loss_op(**bg_kwargs)
    # Define compute_loss and compute_grad wrt the loss.
    self._compute_loss = U.function(ph_args, ts_loss)
    ts_grads = U.gradients(ts_loss, self._ts_vars)
    # Fill None gradients with zeros; otherwise tf.run would attempt to fetch None.
    ts_grads = [g if g is not None else tf.zeros_like(v)
                for (v, g) in zipsame(self._ts_vars, ts_grads)]
    self._compute_grad = U.function(ph_args, ts_grads)
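# Why the None-filling above is needed (illustrative sketch, not part of the
# source): TensorFlow returns None for variables that do not influence the
# loss, and fetching None in a session/function call fails.
#
#   import tensorflow as tf
#
#   v_used = tf.Variable(1.0)
#   v_unused = tf.Variable(2.0)
#   with tf.GradientTape() as tape:
#       loss = v_used * 3.0                       # v_unused never touches the loss
#   grads = tape.gradient(loss, [v_used, v_unused])
#   # grads == [<tf.Tensor 3.0>, None]            -> replace None with zeros_like
#   grads = [g if g is not None else tf.zeros_like(v)
#            for v, g in zip([v_used, v_unused], grads)]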
def ts_fvp0(self, ts_xs, ts_ys, ts_gs):
    """ Computes F(self.pi)*g based on the expected outer product of the score
        function. """
    dummy = tf.ones(shape=(len(ts_xs),))
    with tf.GradientTape(watch_accessed_variables=False) as gt:
        gt.watch(dummy)
        ts_sum_logp_grads = self.ts_logp_grad(ts_xs, ts_ys, dummy)
        ts_pd = tf.math.accumulate_n([tf.reduce_sum(u * v)
                                      for (u, v) in zipsame(ts_sum_logp_grads, ts_gs)])
    ts_fs = gt.gradient(ts_pd, dummy)  # shape (N,)
    N = tf.constant(len(dummy), dtype=tf_float)
    return self.ts_logp_grad(ts_xs, ts_ys, ts_fs / N)
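# Sketch of the trick used in ts_fvp0 above (my reading, not from the source):
# with per-sample weights d (the `dummy` ones vector),
#     s(d) = sum_i d_i * grad_theta logp(y_i | x_i),
# so  <s(d), g> is linear in d and
#     d/d d_i <s(d), g> = <grad_theta logp(y_i | x_i), g>.
# Hence ts_fs[i] holds the per-sample inner product grad(logp_i) . g, and
# re-weighting the score by ts_fs / N yields
#     (1/N) sum_i grad(logp_i) * (grad(logp_i) . g)  ~=  E[grad grad^T] g = F g,
# i.e. a Fisher-vector product without ever forming F explicitly.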
def ts_fvp(self, ts_xs, ts_gs):
    """ Computes F(self.pi)*g based on the Hessian of the KL divergence. """
    with tf.GradientTape(watch_accessed_variables=False) as gt:
        gt.watch(self.ts_variables)
        with tf.GradientTape(watch_accessed_variables=False) as gt2:
            gt2.watch(self.ts_variables)
            # TODO add sample weight below??
            ts_kl = self.ts_kl(self, ts_xs, p1_sg=True)
        ts_kl_grads = gt2.gradient(ts_kl, self.ts_variables)
        ts_pd = tf.math.accumulate_n([tf.reduce_sum(kg * v)
                                      for (kg, v) in zipsame(ts_kl_grads, ts_gs)])
    ts_fvp = gt.gradient(ts_pd, self.ts_variables)
    return ts_fvp
def ts_fvp(self, ts_xs, ts_gs):
    """ Computes F(self.pi)*g, where F is the Fisher information matrix and g
        is an np.ndarray in the same shape as self.variable. """
    with tf.GradientTape() as gt:
        gt.watch(self.ts_variables)
        with tf.GradientTape() as gt2:
            gt2.watch(self.ts_variables)
            # TODO add sample weight below??
            ts_kl = self.ts_kl(self, ts_xs, p1_sg=True)
        ts_kl_grads = gt2.gradient(ts_kl, self.ts_variables)
        ts_pd = tf.add_n([tf.reduce_sum(kg * v)
                          for (kg, v) in zipsame(ts_kl_grads, ts_gs)])
    ts_fvp = gt.gradient(ts_pd, self.ts_variables)
    return ts_fvp
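# Hedged usage sketch (not from the source): Fisher-vector products like ts_fvp
# are typically consumed by conjugate gradient to solve F x = g for a natural
# gradient direction without materializing F. The helper below is a generic CG
# routine on a matrix-vector callable; `mvp`, `damping`, and the assumption that
# the per-variable lists have been flattened into single numpy vectors are
# illustrative choices, not part of the source.

import numpy as np

def conjugate_gradient(mvp, g, iters=10, tol=1e-10, damping=1e-3):
    """Solve (F + damping*I) x = g given only the product function mvp(v) = F v."""
    x = np.zeros_like(g)
    r = g.copy()                # residual g - A x, with x = 0 initially
    p = r.copy()                # search direction
    rs = r.dot(r)
    for _ in range(iters):
        Ap = mvp(p) + damping * p
        alpha = rs / (p.dot(Ap) + 1e-12)
        x += alpha * p
        r -= alpha * Ap
        rs_new = r.dot(r)
        if rs_new < tol:
            break
        p = r + (rs_new / rs) * p
        rs = rs_new
    return x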
def get_combs_and_keys(ranges):
    # Each entry in `ranges` alternates keys (even indices) and value lists
    # (odd indices); collect the keys and take the Cartesian product of the
    # zipped value lists.
    keys = []
    for r in ranges:
        keys += r[::2]
    values = [list(zipsame(*r[1::2])) for r in ranges]
    combs = []
    for c in itertools.product(*values):
        comb = []
        for x in c:
            comb += x
        combs.append(comb)
    return combs, keys
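# Hedged usage example (not from the source), showing the structure of `ranges`
# that the code above assumes: each entry alternates a key with its list of
# candidate values, and the outputs enumerate every combination.
#
#   ranges = [['lr', [0.1, 0.01]],
#             ['seed', [0, 1, 2]]]
#   combs, keys = get_combs_and_keys(ranges)
#   # keys  -> ['lr', 'seed']
#   # combs -> [[0.1, 0], [0.1, 1], [0.1, 2], [0.01, 0], [0.01, 1], [0.01, 2]]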
def split(self, ro, policy_as_expert):
    # Split ro into two phases.
    rollouts = ro.to_list()
    ro_mix = [rollouts[i] for i in self._ind_ro_mix]
    ro_pol = [rollouts[i] for i in self._ind_ro_pol]
    assert (len(ro_mix) + len(ro_pol)) == len(rollouts)
    ro_exps = [[] for _ in range(len(self.experts))]
    for r, t, s, k in zipsame(ro_mix, self._t_switch, self._scale, self._k_star):
        assert len(r) >= t  # because t >= 1
        if not policy_as_expert or k < len(self.experts) - 1:
            # We assume the last expert is the learner.
            r = r[t:]
            r.weight = 1.0
            ro_exps[k].append(r)
    if policy_as_expert:
        ro_pol += ro_exps[-1]
        del ro_exps[-1]
    ro_exps = [Dataset(ro_exp) for ro_exp in ro_exps]
    ro_pol = Dataset(ro_pol)
    return ro_exps, ro_pol
def _build_graph(self, **kwargs):
    """ We treat tfFunctionApproximator as the stochastic map of the policy
        (which takes ph_x as input and outputs ts_yh) and build the additional
        attributes/methods required by Policy. """
    # Build tf.Variables; adds attributes ph_x, ts_nor_x, ts_y, _yh, _sh_vars,
    # ph_y, ts_pi, ts_logp, ts_pid.
    tfFunctionApproximator._build_graph(self, **kwargs)

    # Build additional graphs for Policy.
    # Build the conditional distribution.
    self._pi = self._yh
    self._pid = U.function([self.ph_x], self.ts_pid)
    self._logp = U.function([self.ph_x, self.ph_y], self.ts_logp)

    # Build the fvp operator (this depends only on self).
    ph_g, ts_grads = self._sh_vars.build_flat_ph()
    ts_kl = self.build_kl(self, self, p1_sg=True)
    ts_kl_grads = U.gradients(ts_kl, self.ts_vars)  # grad wrt the 2nd arg of KL
    ts_inner_prod = tf.add_n([tf.reduce_sum(kg * v)
                              for (kg, v) in zipsame(ts_kl_grads, ts_grads)])
    ts_fvp = U.gradients(ts_inner_prod, self.ts_vars)  # Fisher (information matrix) vector product
    ts_fvp = tf.concat([tf.reshape(f, [-1]) for f in ts_fvp], axis=-1)  # flatten into a single vector
    self._fvp = U.function([self.ph_x, ph_g], ts_fvp)
def mean_variable(self, val):
    vals = unflatten(val, shapes=self.mean_var_shapes)
    for var, v in zipsame(self.ts_mean_variables, vals):
        var.assign(v)
def variables(self, vals):
    # vals can be a list of np.ndarray or tf.Tensor
    for var, val in zipsame(self.ts_variables, vals):
        var.assign(val)
def var_assign(ts_var, x):
    # Assign value(s) x to tf.Variable(s) ts_var.
    if isinstance(ts_var, list):
        return [vv.assign(xx) for vv, xx in zipsame(ts_var, x)]
    else:
        return ts_var.assign(x)
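# Hedged usage example (not from the source): var_assign accepts either a
# single tf.Variable or a list of them, paired with matching value(s).
#
#   import numpy as np
#   import tensorflow as tf
#
#   w = tf.Variable(np.zeros(3, dtype=np.float32))
#   b = tf.Variable(0.0)
#   var_assign(w, np.ones(3, dtype=np.float32))                # single variable
#   var_assign([w, b], [np.zeros(3, dtype=np.float32), 1.0])   # list of variables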