def _build_graph(self, **kwargs):
    """Extend the function-approximator graph with a loss and an update op.

    The base graph is built first by delegating to ``cls._build_graph``;
    the weighted loss and the gradient-application op are stacked on top.

    Added attributes: ph_y, ph_w, ph_lr
    Added methods: _compute_loss, _apply_gradients

    Args:
        **kwargs: forwarded unchanged to ``cls._build_graph``.
    """
    # Function approximator. This method reads self.ph_x, self.ts_yh and
    # self.ts_vars afterwards -- presumably provided by this call.
    cls._build_graph(self, **kwargs)

    # Loss: target placeholder plus one weight per sample.
    self.ph_y = tf.placeholder(shape=[None, self.y_dim], name="y",
                               dtype=tf_float)
    self.ph_w = tf.placeholder(shape=[None], name='w', dtype=tf_float)
    loss = self._build_loss(self.ts_yh, self.ph_y, self.ph_w)  # user-defined

    # Optimizer: pair each gradient with its variable as (grad, var).
    grads = U.gradients(loss, self.ts_vars)
    grads_and_vars = list(zip(grads, self.ts_vars))
    self.ph_lr = tf.placeholder(shape=[], name="learning_rate",
                                dtype=tf_float)
    apply_op = self._build_apply_gradients(grads_and_vars, self.ph_lr)

    # Callables over the placeholders defined above.
    self._compute_loss = U.function(
        [self.ph_x, self.ph_y, self.ph_w], loss)
    self._apply_gradients = U.function(
        [self.ph_x, self.ph_y, self.ph_w, self.ph_lr], apply_op)
def _build_graph(self, **bg_kwargs):
    """Build the loss op and expose callables for the loss and its gradient.

    Sets ``_compute_loss`` and ``_compute_grad``; both take the placeholders
    returned by ``_build_loss_op`` as their inputs.

    Args:
        **bg_kwargs: forwarded unchanged to ``_build_loss_op``.
    """
    loss, placeholders = self._build_loss_op(**bg_kwargs)

    # Loss evaluator.
    self._compute_loss = U.function(placeholders, loss)

    # Gradient of the loss w.r.t. every entry of self._ts_vars.
    raw_grads = U.gradients(loss, self._ts_vars)

    # Replace None gradients with zeros of the variable's shape; otherwise
    # running the function would attempt to fetch None.
    filled_grads = []
    for var, grad in zipsame(self._ts_vars, raw_grads):
        if grad is None:
            grad = tf.zeros_like(var)
        filled_grads.append(grad)

    self._compute_grad = U.function(placeholders, filled_grads)
def kl(self, other, x, reversesd=False):
    """Evaluate the op from ``build_kl`` for ``self`` and ``other`` at ``x``.

    The compiled function is cached in ``self._kl_cache`` under a key made of
    ``id(other)`` and ``reversesd``, so the graph op is built at most once
    per key.

    Args:
        other: object of exactly the same type as ``self``.
        x: input passed as both ``self.ph_x`` and ``other.ph_x``.
        reversesd: if True the op is ``build_kl(self, other)``; otherwise it
            is ``build_kl(other, self)``.

    Returns:
        Whatever the cached callable returns for ``x``.
    """
    assert type(other) == type(self)

    # NOTE(review): the key relies on id(other); if ``other`` is garbage
    # collected its id may be reused by a new object -- confirm callers keep
    # ``other`` alive for as long as the cache is used.
    cache_key = str(id(other)) + str(reversesd)
    cache = self._kl_cache

    # Fast path: the function was already built for this key.
    if cache[cache_key] is not None:
        return cache[cache_key](x)

    if reversesd:
        ts_kl = self.build_kl(self, other)
    else:
        ts_kl = self.build_kl(other, self)
    kl_fn = U.function([self.ph_x, other.ph_x], ts_kl)

    def evaluate(_x):
        # the same input is supplied for both placeholders
        return kl_fn(_x, _x)

    cache[cache_key] = evaluate
    return cache[cache_key](x)
def _build_graph(self, **kwargs):
    """Build the policy graph on top of the tfFunctionApproximator graph.

    The tfFunctionApproximator is treated as the stochastic map of the
    policy (input ph_x, output ts_yh). This method then adds the extra
    callables required by Policy: ``_pi``, ``_pid``, ``_logp`` and ``_fvp``.

    Args:
        **kwargs: forwarded unchanged to
            ``tfFunctionApproximator._build_graph``.
    """
    # Build the tf.Variables and the base graph. This method reads ph_x,
    # ph_y, ts_pid, ts_logp, _yh, _sh_vars and ts_vars afterwards --
    # presumably all provided by this call.
    tfFunctionApproximator._build_graph(self, **kwargs)

    # Conditional distribution: sampling, ts_pid evaluation, log-probability.
    self._pi = self._yh
    self._pid = U.function([self.ph_x], self.ts_pid)
    self._logp = U.function([self.ph_x, self.ph_y], self.ts_logp)

    # Fisher-vector product operator (depends only on self).
    ph_g, vec_parts = self._sh_vars.build_flat_ph()
    kl_self = self.build_kl(self, self, p1_sg=True)
    # gradient w.r.t. the 2nd argument of the KL
    kl_grads = U.gradients(kl_self, self.ts_vars)

    # <grad KL, v>, accumulated over the variables
    partial_sums = []
    for kl_grad, vec in zipsame(kl_grads, vec_parts):
        partial_sums.append(tf.reduce_sum(kl_grad * vec))
    inner_prod = tf.add_n(partial_sums)

    # Fisher (information matrix) and vector product, one piece per variable
    fvp_parts = U.gradients(inner_prod, self.ts_vars)
    # flatten and join the pieces into one contiguous vector
    flat_fvp = tf.concat([tf.reshape(part, [-1]) for part in fvp_parts],
                         axis=-1)
    self._fvp = U.function([self.ph_x, ph_g], flat_fvp)
def _build_graph(self, **kwargs):
    """Build the input-to-output graph via the user-provided _build_func_apprx.

    Once all tf.Variables exist, a Shaper object is stored as ``_sh_vars``
    for convenient manipulation of the tf.Variables inside the graph.

    Added attributes: ph_x, ts_nor_x, ts_yh, _yh, _sh_vars

    Args:
        **kwargs: forwarded unchanged to ``_build_func_apprx``.
    """
    # input placeholder
    input_shape = [None, self.x_dim]
    self.ph_x = tf.placeholder(shape=input_shape, name="input",
                               dtype=tf_float)

    # normalizer ops for whitening the input
    self.ts_nor_x = self._nor.build_nor_ops(self.ph_x)

    # parameterized function approximator and its callable form
    self.ts_yh = self._build_func_apprx(self.ts_nor_x, **kwargs)
    self._yh = U.function([self.ph_x], self.ts_yh)

    # Shaper over the trainable variables, for transforming between
    # contiguous and list representations
    self._sh_vars = U.Shaper(self.ts_vars)
def _build_dist(self, ts_nor_x, ph_y):
    """Build the sampling, log-probability and mean ops of the distribution.

    The mean comes from ``cls._build_func_apprx``; the log standard deviation
    is a separate variable of shape ``[self.y_dim]`` that is clipped from
    below at ``log(self._min_std)``.

    Side effects: sets ts_mean, _ts_logstd, _ts_stop_std_grad, ts_logstd,
    ts_noise, _std, _set_logstd and _set_stop_std_grad on ``self``.

    Args:
        ts_nor_x: tensor passed to ``cls._build_func_apprx`` (presumably the
            normalized input -- confirm against the caller).
        ph_y: passed to ``_build_logp`` as the point at which the
            log-probability is evaluated.

    Returns:
        Tuple ``(ts_pi, ts_logp, ts_pid)``: mean plus noise, the op returned
        by ``_build_logp``, and the mean itself.
    """
    # mean and std
    self.ts_mean = cls._build_func_apprx(self, ts_nor_x)  # use the tfFunctionApproximator to define mean
    self._ts_logstd = tf.get_variable(
        'logstd', shape=[self.y_dim], initializer=tf.constant_initializer(self._init_logstd))
    # non-trainable boolean flag, initialised to False, that selects the
    # branch of the tf.cond below
    self._ts_stop_std_grad = tf.get_variable('stop_std_grad', initializer=tf.constant(False), trainable=False)
    _ts_logstd = tf.cond(self._ts_stop_std_grad,  # whether to stop gradient
                         true_fn=lambda: tf.stop_gradient(self._ts_logstd),
                         false_fn=lambda: self._ts_logstd)
    # make sure the distribution does not degenerate
    self.ts_logstd = tf.maximum(tf.to_float(np.log(self._min_std)), _ts_logstd)
    ts_std = tf.exp(self.ts_logstd)
    # callable returning the current (clipped) std; takes no inputs
    self._std = U.function([], ts_std)
    # setters built by U.build_set for the raw logstd variable and the flag
    self._set_logstd = U.build_set([self._ts_logstd])
    self._set_stop_std_grad = U.build_set([self._ts_stop_std_grad])
    # pi
    # NOTE(review): the noise is sampled with the shape of ts_std, not of
    # ts_mean; if ts_mean carries a batch dimension the same noise vector
    # would be broadcast across rows -- confirm this is intended.
    self.ts_noise = tf.random_normal(tf.shape(ts_std), stddev=ts_std, seed=self.seed)
    ts_pi = self.ts_mean + self.ts_noise
    ts_pid = self.ts_mean  # the mean itself, without the noise term
    # logp
    ts_logp = self._build_logp(self.y_dim, ph_y, self.ts_mean, self.ts_logstd)
    return ts_pi, ts_logp, ts_pid