def __init__(self, params, cost_fun, batch_generator=None,
             stepsize_schedule=ConstantStepsizeSchedule(0.01),
             session=tf.get_default_session(), dtype=tf.float64, seed=None):
    """ Initialize the sampler base class. Sets up member variables and
        initializes uninitialized target parameters in the current
        `tensorflow.Graph`.

    Parameters
    ----------
    params : list of `tensorflow.Variable` objects
        Target parameters for which we want to sample new values.
    cost_fun : callable
        Function that takes `params` as input and returns a
        1-d `tensorflow.Tensor` that contains the cost-value.
        Frequently denoted with `U` in literature.
    batch_generator : `BatchGenerator`, optional
        Iterable which returns dictionaries to feed into
        tensorflow.Session.run() calls to evaluate the cost function.
        Defaults to `None` which indicates that no batches shall be fed.
    stepsize_schedule : pysgmcmc.stepsize_schedules.StepsizeSchedule
        Iterator class that produces a stream of stepsize values that
        we can use in our samplers.
        See also: `pysgmcmc.stepsize_schedules`
    session : `tensorflow.Session`, optional
        Session object which knows about the external part of the graph
        (which defines `cost`, and possibly batches).
        Used internally to evaluate (burn-in/sample) the sampler.
    dtype : tensorflow.DType, optional
        Type of elements of `tensorflow.Tensor` objects used in this sampler.
        Defaults to `tensorflow.float64`.
    seed : int, optional
        Random seed to use.
        Defaults to `None`.

    See Also
    --------
    pysgmcmc.sampling.BurnInMCMCSampler:
        Abstract base class for samplers that perform a burn-in phase
        to tune their own hyperparameters.
        Inherits from `sampling.MCMCSampler`.

    """
    # Sanitize inputs
    assert batch_generator is None or hasattr(batch_generator, "__next__")
    assert seed is None or isinstance(seed, int)
    assert isinstance(session, (tf.Session, tf.InteractiveSession))
    assert isinstance(dtype, tf.DType)
    assert callable(cost_fun)

    self.dtype = dtype
    self.n_iterations = 0
    self.seed = seed

    assert hasattr(stepsize_schedule, "update")
    assert hasattr(stepsize_schedule, "__next__")
    assert hasattr(stepsize_schedule, "initial_value")

    self.stepsize_schedule = stepsize_schedule
    self.batch_generator = batch_generator
    self.session = session
    self.params = params

    # set up costs
    self.cost_fun = cost_fun
    self.cost = cost_fun(self.params)

    # compute vectorized clones of all parameters
    self.vectorized_params = [vectorize(param) for param in self.params]

    self.epsilon = tf.Variable(self.stepsize_schedule.initial_value,
                               dtype=self.dtype, name="epsilon",
                               trainable=False)

    # Initialize uninitialized parameters before usage in any sampler.
    init = tf.variables_initializer(
        uninitialized_params(
            session=self.session,
            params=self.params + self.vectorized_params + [self.epsilon]
        )
    )
    self.session.run(init)

    # query this later to determine the next sample
    self.theta_t = [None] * len(params)
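# Hedged sketch of how subclasses of this base class are typically driven:
# via the iterator protocol that `self.theta_t` feeds into. `SomeSampler` is a
# hypothetical placeholder for any concrete subclass, and the `(sample, cost)`
# tuple returned by `next()` is an assumption about the enclosing class, not
# something defined in this method.
#
#     >>> import tensorflow as tf
#     >>> params = [tf.Variable(0., dtype=tf.float64, name="x")]
#     >>> def cost_fun(params):
#     ...     return 0.5 * tf.square(params[0])
#     >>> with tf.Session() as session:
#     ...     sampler = SomeSampler(params=params, cost_fun=cost_fun,
#     ...                           session=session, dtype=tf.float64)
#     ...     for _ in range(100):
#     ...         sample, cost = next(sampler)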
def __init__(self, params, cost_fun, batch_generator=None,
             stepsize_schedule=ConstantStepsizeSchedule(0.01),
             burn_in_steps=3000, mdecay=0.05, scale_grad=1.0,
             session=tf.get_default_session(), dtype=tf.float64, seed=None):
    """ Initialize the sampler parameters and set up a tensorflow.Graph
        for later queries.

    Parameters
    ----------
    params : list of tensorflow.Variable objects
        Target parameters for which we want to sample new values.
    cost_fun : callable
        Function that takes `params` as input and returns a
        1-d `tensorflow.Tensor` that contains the cost-value.
        Frequently denoted with `U` in literature.
    batch_generator : iterable, optional
        Iterable which returns dictionaries to feed into
        tensorflow.Session.run() calls to evaluate the cost function.
        Defaults to `None` which indicates that no batches shall be fed.
    stepsize_schedule : pysgmcmc.stepsize_schedules.StepsizeSchedule
        Iterator class that produces a stream of stepsize values that
        we can use in our samplers.
        See also: `pysgmcmc.stepsize_schedules`
    burn_in_steps : int, optional
        Number of burn-in steps to perform. In each burn-in step, this
        sampler will adapt its own internal parameters to decrease its error.
        Defaults to `3000`.

        For reference see:
        `Bayesian Optimization with Robust Bayesian Neural Networks.
        <http://aad.informatik.uni-freiburg.de/papers/16-NIPS-BOHamiANN.pdf>`_
    mdecay : float, optional
        (Constant) momentum decay per time-step.
        Defaults to `0.05`.

        For reference see:
        `Bayesian Optimization with Robust Bayesian Neural Networks.
        <http://aad.informatik.uni-freiburg.de/papers/16-NIPS-BOHamiANN.pdf>`_
    scale_grad : float, optional
        Value that is used to scale the magnitude of the noise used during
        sampling. In a typical batches-of-data setting this usually
        corresponds to the number of examples in the entire dataset.
        Defaults to `1.0` which corresponds to no scaling.
    session : tensorflow.Session, optional
        Session object which knows about the external part of the graph
        (which defines `cost`, and possibly batches).
        Used internally to evaluate (burn-in/sample) the sampler.
    dtype : tensorflow.DType, optional
        Type of elements of `tensorflow.Tensor` objects used in this sampler.
        Defaults to `tensorflow.float64`.
    seed : int, optional
        Random seed to use.
        Defaults to `None`.

    See Also
    --------
    pysgmcmc.sampling.BurnInMCMCSampler:
        Base class for `SGHMCSampler` that specifies how actual sampling
        is performed (using iterator protocol, e.g. `next(sampler)`).

    """
    # Set up BurnInMCMCSampler base class:
    # initialize member variables common to all samplers
    # and run initializers for all uninitialized variables in `params`
    # (to avoid errors in the graph definitions below).
    super().__init__(params=params, cost_fun=cost_fun,
                     burn_in_steps=burn_in_steps,
                     batch_generator=batch_generator,
                     seed=seed, dtype=dtype, session=session,
                     stepsize_schedule=stepsize_schedule)

    #  Initialize graph constants {{{ #

    noise = tf.constant(0., name="noise", dtype=dtype)

    scale_grad = tf.constant(scale_grad, dtype=dtype, name="scale_grad")

    epsilon_scaled = tf.divide(self.epsilon, tf.sqrt(scale_grad),
                               name="epsilon_scaled")

    mdecay = tf.constant(mdecay, name="mdecay", dtype=dtype)

    #  }}} Initialize graph constants #

    grads = [
        vectorize(gradient)
        for gradient in tf.gradients(self.cost, params)
    ]

    #  Initialize internal sampler parameters {{{ #

    tau = [
        tf.Variable(tf.ones_like(param, dtype=dtype), dtype=dtype,
                    name="tau_{}".format(i), trainable=False)
        for i, param in enumerate(self.vectorized_params)
    ]

    r = [
        tf.Variable(1. / (tau[i].initialized_value() + 1),
                    name="R_{}".format(i), trainable=False)
        for i, param in enumerate(self.vectorized_params)
    ]

    g = [
        tf.Variable(tf.ones_like(param, dtype=dtype), dtype=dtype,
                    name="g_{}".format(i), trainable=False)
        for i, param in enumerate(self.vectorized_params)
    ]

    v_hat = [
        tf.Variable(tf.ones_like(param, dtype=dtype), dtype=dtype,
                    name="v_hat_{}".format(i), trainable=False)
        for i, param in enumerate(self.vectorized_params)
    ]

    # Initialize mass matrix inverse
    minv = [
        tf.Variable(tf.divide(tf.constant(1., dtype=dtype),
                              tf.sqrt(v_hat[i].initialized_value())),
                    name="minv_{}".format(i), trainable=False)
        for i, param in enumerate(self.vectorized_params)
    ]

    # Initialize momentum
    V = [
        tf.Variable(tf.zeros_like(param, dtype=dtype), dtype=dtype,
                    name="v_{}".format(i), trainable=False)
        for i, param in enumerate(self.vectorized_params)
    ]

    #  }}} Initialize internal sampler parameters #

    self.minv_t = [None] * len(params)  # gets burned-in

    # R_t = 1 / (tau + 1); shouldn't it be 1 / tau according to the terms?
    # It is not, and changing it to that breaks everything!
    # Why?

    for i, (param, grad) in enumerate(zip(params, grads)):
        vectorized_param = self.vectorized_params[i]

        #  Burn-in logic {{{ #
        r_t = tf.assign(r[i], 1. / (tau[i] + 1), name="r_t_{}".format(i))

        # r_t should always use the old value of tau
        with tf.control_dependencies([r_t]):
            tau_t = tf.assign_add(
                tau[i],
                safe_divide(-g[i] * g[i] * tau[i], v_hat[i]) + 1,
                name="tau_t_{}".format(i)
            )

            # minv = v_hat^{-1/2} = 1 / sqrt(v_hat)
            self.minv_t[i] = tf.assign(
                minv[i],
                safe_divide(1., safe_sqrt(v_hat[i])),
                name="minv_t_{}".format(i)
            )

            # tau_t, minv_t should always use the old values of g, v_hat
            with tf.control_dependencies([tau_t, self.minv_t[i]]):
                g_t = tf.assign_add(
                    g[i], -r_t * g[i] + r_t * grad,
                    name="g_t_{}".format(i)
                )
                v_hat_t = tf.assign_add(
                    v_hat[i], -r_t * v_hat[i] + r_t * grad ** 2,
                    name="v_hat_t_{}".format(i)
                )

                #  }}} Burn-in logic #

                with tf.control_dependencies([g_t, v_hat_t]):
                    #  Draw random normal sample {{{ #

                    # Equation 10, variance of normal sample
                    # 2 * epsilon ** 2 * mdecay * Minv - 0 (noise is 0) - epsilon ** 4
                    # = 2 * epsilon ** 2 * epsilon * v_hat^{-1/2} * C * Minv
                    # = 2 * epsilon ** 3 * v_hat^{-1/2} * C * v_hat^{-1/2} - epsilon ** 4

                    # (co-)variance of normal sample
                    noise_scale = (
                        tf.constant(2., dtype=dtype) *
                        epsilon_scaled ** tf.constant(2., dtype=dtype) *
                        mdecay * self.minv_t[i] -
                        tf.constant(2., dtype=dtype) *
                        epsilon_scaled ** tf.constant(3., dtype=dtype) *
                        tf.square(self.minv_t[i]) * noise -
                        epsilon_scaled ** 4
                    )

                    # turn into stddev
                    sigma = tf.sqrt(tf.maximum(noise_scale, 1e-16),
                                    name="sigma_{}".format(i))

                    sample = self._draw_noise_sample(
                        sigma=sigma, shape=vectorized_param.shape
                    )

                    #  }}} Draw random sample #

                    #  HMC Update {{{ #

                    # Equation 10: right side, where:
                    # Minv = v_hat^{-1/2}, Mdecay = epsilon * v_hat^{-1/2} * C
                    v_t = tf.assign_add(
                        V[i],
                        -self.epsilon ** 2 * self.minv_t[i] * grad -
                        mdecay * V[i] + sample,
                        name="v_t_{}".format(i)
                    )

                    # Equation 10: left side
                    vectorized_Theta_t = tf.assign_add(
                        vectorized_param, v_t
                    )

                    self.theta_t[i] = tf.assign(
                        param,
                        unvectorize(vectorized_Theta_t,
                                    original_shape=param.shape),
                        name="theta_t_{}".format(i)
                    )

                    #  }}} HMC Update #
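# Hedged usage sketch for this constructor on a toy 1-d Gaussian target.
# The import path `pysgmcmc.samplers.sghmc` and the `(sample, cost)` pair
# returned by `next(sampler)` are assumptions about the surrounding package,
# not verified here.
#
#     >>> import tensorflow as tf
#     >>> from pysgmcmc.samplers.sghmc import SGHMCSampler
#     >>> params = [tf.Variable(0., dtype=tf.float64, name="x")]
#     >>> def cost_fun(params):
#     ...     # negative log density of a standard normal, up to a constant
#     ...     return 0.5 * tf.square(params[0])
#     >>> with tf.Session() as session:
#     ...     sampler = SGHMCSampler(params=params, cost_fun=cost_fun,
#     ...                            burn_in_steps=1000, mdecay=0.05,
#     ...                            scale_grad=1.0, session=session,
#     ...                            dtype=tf.float64, seed=1)
#     ...     samples = [next(sampler)[0] for _ in range(2000)]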
def __init__(self, params, cost_fun, tf_scope="default", batch_generator=None, stepsize_schedule=ConstantStepsizeSchedule(0.001), mass=1.0, speed_of_light=0.5, D=1.0, Bhat=0.0, session=tf.get_default_session(), dtype=tf.float64, seed=None): """ Initialize the sampler parameters and set up a tensorflow.Graph for later queries. Parameters ---------- params : list of tensorflow.Variable objects Target parameters for which we want to sample new values. Cost : tensorflow.Tensor 1-d Cost tensor that depends on `params`. Frequently denoted as U(theta) in literature. batch_generator : BatchGenerator, optional Iterable which returns dictionaries to feed into tensorflow.Session.run() calls to evaluate the cost function. Defaults to `None` which indicates that no batches shall be fed. stepsize_schedule : pysgmcmc.stepsize_schedules.StepsizeSchedule Iterator class that produces a stream of stepsize values that we can use in our samplers. See also: `pysgmcmc.stepsize_schedules` mass : float, optional mass constant. Defaults to `1.0`. speed_of_light : float, optional "Speed of light" constant. TODO EXTEND DOKU Defaults to `1.0`. D : float, optional Diffusion constant. Defaults to `1.0`. Bhat : float, optional TODO: Documentation session : tensorflow.Session, optional Session object which knows about the external part of the graph (which defines `Cost`, and possibly batches). Used internally to evaluate (burn-in/sample) the sampler. dtype : tensorflow.DType, optional Type of elements of `tensorflow.Tensor` objects used in this sampler. Defaults to `tensorflow.float64`. seed : int, optional Random seed to use. Defaults to `None`. See Also ---------- pysgmcmc.sampling.MCMCSampler: Base class for `RelativisticSGHMCSampler` that specifies how actual sampling is performed (using iterator protocol, e.g. `next(sampler)`). """ # Set up MCMCSampler base class: # initialize member variables common to all samplers # and run initializers for all uninitialized variables in `params` # (to avoid errors in the graph definitions below). super().__init__(params=params, cost_fun=cost_fun, batch_generator=batch_generator, tf_scope=tf_scope, stepsize_schedule=stepsize_schedule, seed=seed, dtype=dtype, session=session) # Use `-self.Cost` since the rest of the implementation expects # a log likelihood (instead of the *negative* log likelihood that # we normally use as costs) grads = [ vectorize(gradient) for gradient in tf.gradients(-self.cost, params) ] with tf.variable_scope(tf_scope, reuse=tf.AUTO_REUSE): D = tf.constant(D, dtype=dtype) b_hat = tf.constant(Bhat, dtype=dtype) # In internal implementation, stick to mathematical formulas. # For users, prefer readability. m = tf.constant(mass, dtype=dtype) c = tf.constant(speed_of_light, dtype=dtype) momentum = [] for i in range(len(params)): momentum_params = [] for momentum_sample in _sample_relativistic_momentum( m=mass, c=speed_of_light, n_params=self.vectorized_params[i].shape[0], seed=self.seed): momentum_params.append(momentum_sample) momentum_params = tf.reshape(momentum_params, self.vectorized_params[i].shape) momentum_params = tf.Variable(momentum_params, dtype=dtype) momentum.append(momentum_params) # momentum = [ # tf.Variable(momentum_sample, dtype=dtype) # for momentum_sample in _sample_relativistic_momentum( # m=mass, c=speed_of_light, n_params=len(self.params), seed=self.seed # ) # ] # # In internal implementation, stick to mathematical formulas. # # For users, prefer readability. 
# m = tf.constant(mass, dtype=dtype) # c = tf.constant(speed_of_light, dtype=dtype) for i, (param, grad) in enumerate(zip(params, grads)): vectorized_param = self.vectorized_params[i] p_grad = self.epsilon * momentum[i] / ( m * tf.sqrt(momentum[i] * momentum[i] / (tf.square(m) * tf.square(c)) + 1)) n = tf.sqrt( self.epsilon * (2 * D - self.epsilon * b_hat)) * tf.random_normal( shape=vectorized_param.shape, dtype=dtype, seed=seed) momentum_t = tf.assign_add( momentum[i], tf.reshape(self.epsilon * grad + n - D * p_grad, momentum[i].shape)) p_grad_new = self.epsilon * momentum_t / ( m * tf.sqrt(momentum_t * momentum_t / (tf.square(m) * tf.square(c)) + 1)) vectorized_theta_t = tf.assign_add( vectorized_param, tf.reshape(p_grad_new, vectorized_param.shape)) self.theta_t[i] = tf.assign( param, unvectorize(vectorized_theta_t, original_shape=param.shape))
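# Hedged usage sketch for the relativistic SGHMC constructor on a toy target.
# The import path `pysgmcmc.samplers.relativistic_sghmc` and the behaviour of
# `next(sampler)` are assumptions about the surrounding package.
#
#     >>> import tensorflow as tf
#     >>> from pysgmcmc.samplers.relativistic_sghmc import (
#     ...     RelativisticSGHMCSampler
#     ... )
#     >>> params = [tf.Variable(0., dtype=tf.float64, name="x")]
#     >>> def cost_fun(params):
#     ...     return 0.5 * tf.square(params[0])
#     >>> with tf.Session() as session:
#     ...     sampler = RelativisticSGHMCSampler(
#     ...         params=params, cost_fun=cost_fun, mass=1.0,
#     ...         speed_of_light=0.5, session=session, dtype=tf.float64
#     ...     )
#     ...     samples = [next(sampler)[0] for _ in range(1000)]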
def __init__(self, params, cost_fun, batch_generator=None,
             stepsize_schedule=ConstantStepsizeSchedule(0.01),
             burn_in_steps=3000, A=1.0, scale_grad=1.0,
             session=tf.get_default_session(), dtype=tf.float64, seed=None):
    """ Initialize the sampler parameters and set up a tensorflow.Graph
        for later queries.

    Parameters
    ----------
    params : list of tensorflow.Variable objects
        Target parameters for which we want to sample new values.
    cost_fun : callable
        Function that takes `params` as input and returns a
        1-d `tensorflow.Tensor` that contains the cost-value.
        Frequently denoted with `U` in literature.
    batch_generator : BatchGenerator, optional
        Iterable which returns dictionaries to feed into
        tensorflow.Session.run() calls to evaluate the cost function.
        Defaults to `None` which indicates that no batches shall be fed.
    stepsize_schedule : pysgmcmc.stepsize_schedules.StepsizeSchedule
        Iterator class that produces a stream of stepsize values that
        we can use in our samplers.
        See also: `pysgmcmc.stepsize_schedules`
    burn_in_steps : int, optional
        Number of burn-in steps to perform. In each burn-in step, this
        sampler will adapt its own internal parameters to decrease its error.
        Defaults to `3000`.

        For reference see:
        `Bayesian Optimization with Robust Bayesian Neural Networks.
        <http://aad.informatik.uni-freiburg.de/papers/16-NIPS-BOHamiANN.pdf>`_
    A : float, optional
        TODO Doku
        Defaults to `1.0`.
    scale_grad : float, optional
        Value that is used to scale the magnitude of the noise used during
        sampling. In a typical batches-of-data setting this usually
        corresponds to the number of examples in the entire dataset.
        Defaults to `1.0` which corresponds to no scaling.
    session : tensorflow.Session, optional
        Session object which knows about the external part of the graph
        (which defines `cost`, and possibly batches).
        Used internally to evaluate (burn-in/sample) the sampler.
    dtype : tensorflow.DType, optional
        Type of elements of `tensorflow.Tensor` objects used in this sampler.
        Defaults to `tensorflow.float64`.
    seed : int, optional
        Random seed to use.
        Defaults to `None`.

    See Also
    --------
    pysgmcmc.sampling.BurnInMCMCSampler:
        Base class for `SGLDSampler` that specifies how actual sampling
        is performed (using iterator protocol, e.g. `next(sampler)`).

    """
    super().__init__(params=params, cost_fun=cost_fun,
                     batch_generator=batch_generator,
                     burn_in_steps=burn_in_steps,
                     stepsize_schedule=stepsize_schedule,
                     seed=seed, session=session, dtype=dtype)

    n_params = len(params)

    #  Initialize graph constants {{{ #

    A = tf.constant(A, name="A", dtype=dtype)
    noise = tf.constant(0., name="noise", dtype=dtype)
    scale_grad = tf.constant(scale_grad, name="scale_grad", dtype=dtype)

    #  }}} Initialize graph constants #

    grads = [
        vectorize(gradient)
        for gradient in tf.gradients(self.cost, params)
    ]

    #  Initialize internal sampler parameters {{{ #

    tau = [
        tf.Variable(tf.ones_like(param, dtype=dtype), dtype=dtype,
                    name="tau_{}".format(i), trainable=False)
        for i, param in enumerate(self.vectorized_params)
    ]

    R = [
        tf.Variable(1. / (tau[i].initialized_value() + 1),
                    name="R_{}".format(i), trainable=False)
        for i, param in enumerate(self.vectorized_params)
    ]

    g = [
        tf.Variable(tf.ones_like(param, dtype=dtype), dtype=dtype,
                    name="g_{}".format(i), trainable=False)
        for i, param in enumerate(self.vectorized_params)
    ]

    v_hat = [
        tf.Variable(tf.ones_like(param, dtype=dtype), dtype=dtype,
                    name="v_hat_{}".format(i), trainable=False)
        for i, param in enumerate(self.vectorized_params)
    ]

    #  Initialize mass matrix inverse {{{ #

    minv = [
        tf.Variable(tf.divide(tf.constant(1., dtype=dtype),
                              tf.sqrt(v_hat[i].initialized_value())),
                    name="minv_{}".format(i), trainable=False)
        for i, param in enumerate(self.vectorized_params)
    ]

    #  }}} Initialize mass matrix inverse #

    #  }}} Initialize internal sampler parameters #

    self.minv_t = [None] * n_params  # gets burned-in

    for i, (param, grad) in enumerate(zip(params, grads)):
        vectorized_param = self.vectorized_params[i]

        #  Burn-in logic {{{ #
        r_t = tf.assign(R[i], 1. / (tau[i] + 1.), name="r_t_{}".format(i))

        # r_t should always use the old value of tau
        with tf.control_dependencies([r_t]):
            tau_t = tf.assign_add(
                tau[i],
                safe_divide(-g[i] * g[i] * tau[i], v_hat[i]) + 1,
                name="tau_t_{}".format(i)
            )

            self.minv_t[i] = tf.assign(
                minv[i],
                safe_divide(1., safe_sqrt(v_hat[i])),
                name="minv_t_{}".format(i)
            )

            # tau_t, minv_t should always use the old values of g, v_hat
            with tf.control_dependencies([tau_t, self.minv_t[i]]):
                g_t = tf.assign_add(
                    g[i], -r_t * g[i] + r_t * grad,
                    name="g_t_{}".format(i)
                )
                v_hat_t = tf.assign_add(
                    v_hat[i], -r_t * v_hat[i] + r_t * grad ** 2,
                    name="v_hat_t_{}".format(i)
                )

                #  }}} Burn-in logic #

                with tf.control_dependencies([g_t, v_hat_t]):
                    #  Draw random sample {{{ #

                    sigma = safe_sqrt(
                        2. * self.epsilon *
                        safe_divide((self.minv_t[i] * (A - noise)),
                                    scale_grad)
                    )

                    sample = self._draw_noise_sample(
                        sigma=sigma, shape=vectorized_param.shape
                    )

                    #  }}} Draw random sample #

                    #  SGLD Update {{{ #

                    vectorized_theta_t = tf.assign_add(
                        vectorized_param,
                        -self.epsilon * self.minv_t[i] * A * grad + sample,
                    )

                    self.theta_t[i] = tf.assign(
                        param,
                        unvectorize(vectorized_theta_t,
                                    original_shape=param.shape),
                        name="Theta_t_{}".format(i)
                    )

                    #  }}} SGLD Update #
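# Hedged usage sketch for the SGLD constructor. The import path
# `pysgmcmc.samplers.sgld` and the `(sample, cost)` pair returned by
# `next(sampler)` are assumptions about the surrounding package.
#
#     >>> import tensorflow as tf
#     >>> from pysgmcmc.samplers.sgld import SGLDSampler
#     >>> params = [tf.Variable(0., dtype=tf.float64, name="x")]
#     >>> def cost_fun(params):
#     ...     return 0.5 * tf.square(params[0])
#     >>> with tf.Session() as session:
#     ...     sampler = SGLDSampler(params=params, cost_fun=cost_fun,
#     ...                           burn_in_steps=1000, scale_grad=1.0,
#     ...                           session=session, dtype=tf.float64, seed=1)
#     ...     samples = [next(sampler)[0] for _ in range(2000)]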
def __init__(self, particles, cost_fun, tf_scope="default", batch_generator=None, stepsize_schedule=ConstantStepsizeSchedule(0.1), alpha=0.9, fudge_factor=1e-6, session=tf.get_default_session(), dtype=tf.float64, seed=None): """ Initialize the sampler parameters and set up a tensorflow.Graph for later queries. Parameters ---------- particles : List[tensorflow.Variable] List of particles each representing a (different) guess of the target parameters of this sampler. cost_fun : callable Function that takes `params` of *one* particle as input and returns a 1-d `tensorflow.Tensor` that contains the cost-value. Frequently denoted with `U` in literature. batch_generator : iterable, optional Iterable which returns dictionaries to feed into tensorflow.Session.run() calls to evaluate the cost function. Defaults to `None` which indicates that no batches shall be fed. stepsize_schedule : pysgmcmc.stepsize_schedules.StepsizeSchedule Iterator class that produces a stream of stepsize values that we can use in our samplers. See also: `pysgmcmc.stepsize_schedules` alpha : float, optional TODO DOKU Defaults to `0.9`. fudge_factor : float, optional TODO DOKU Defaults to `1e-6`. session : tensorflow.Session, optional Session object which knows about the external part of the graph (which defines `Cost`, and possibly batches). Used internally to evaluate (burn-in/sample) the sampler. dtype : tensorflow.DType, optional Type of elements of `tensorflow.Tensor` objects used in this sampler. Defaults to `tensorflow.float64`. seed : int, optional Random seed to use. Defaults to `None`. See Also ---------- pysgmcmc.sampling.MCMCSampler: Base class for `SteinVariationalGradientDescentSampler` that specifies how actual sampling is performed (using iterator protocol, e.g. `next(sampler)`). """ assert isinstance(alpha, (int, float)) assert isinstance(fudge_factor, (int, float)) # assert callable(cost_fun) # self.particles = tf.stack(particles) self.particles = particles # def cost_fun_wrapper(params): # return tf.map_fn(lambda particle: cost_fun(particle), self.particles) # cost_fun_wrapper.__name__ = "potential_energy" # cost_fun.__name__ # super().__init__( self._init_basic( params=particles, cost_fun=cost_fun, # cost_fun_wrapper, tf_scope=tf_scope, batch_generator=batch_generator, session=session, seed=seed, dtype=dtype, stepsize_schedule=stepsize_schedule ) with tf.variable_scope(tf_scope, reuse=tf.AUTO_REUSE): fudge_factor = tf.constant( fudge_factor, dtype=self.dtype, name="fudge_factor" ) self.epsilon = tf.Variable( stepsize_schedule.initial_value, dtype=self.dtype, name="stepsize" ) stack_vectorized_params = tf.stack(self.vectorized_params) self.n_particles = tf.cast( # self.particles.shape[0], self.dtype stack_vectorized_params.shape[0], self.dtype ) historical_grad = tf.get_variable( "historical_grad", stack_vectorized_params.shape, dtype=dtype, initializer=tf.zeros_initializer() ) self.session.run( tf.variables_initializer([historical_grad, self.epsilon]) ) # lnpgrad = tf.squeeze(tf.gradients(self.cost, self.particles)) grads = [] for i, cost in enumerate(cost_fun): grads.append(tf.concat([vectorize(gradient) for gradient in tf.gradients(cost, self.particles[i])], axis=0)) lnpgrad = tf.squeeze(grads) kernel_matrix, kernel_gradients = self.svgd_kernel(stack_vectorized_params) # self.svgd_kernel(self.particles) grad_theta = tf.divide( tf.matmul(kernel_matrix, lnpgrad) + kernel_gradients, self.n_particles ) historical_grad_t = tf.assign( historical_grad, alpha * historical_grad + (1. 
- alpha) * (grad_theta ** 2) ) adj_grad = tf.divide( grad_theta, fudge_factor + tf.sqrt(historical_grad_t) ) for i, particle in enumerate(self.particles): vectorized_Theta_t = tf.assign_sub( self.vectorized_params[i], self.epsilon * adj_grad[i] ) start_idx = 0 for j, param in enumerate(particle): flat_shape = tf.reduce_prod(param.shape) vectorized_param = vectorized_Theta_t[start_idx:start_idx+flat_shape] self.theta_t[i*len(particle) + j] = tf.assign( param, tf.reshape(vectorized_param, shape=param.shape), name="theta_t_%d_%d" % (i, j) ) start_idx += flat_shape return
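# Hedged usage sketch for the SVGD constructor. The body above iterates over
# `cost_fun`, so this sketch passes one cost tensor per particle; that calling
# convention and the import path `pysgmcmc.samplers.svgd` are assumptions
# about the surrounding package, not verified here.
#
#     >>> import tensorflow as tf
#     >>> from pysgmcmc.samplers.svgd import (
#     ...     SteinVariationalGradientDescentSampler
#     ... )
#     >>> n_particles = 3
#     >>> particles = [
#     ...     [tf.Variable(tf.random_normal([1], dtype=tf.float64),
#     ...                  name="x_{}".format(k))]
#     ...     for k in range(n_particles)
#     ... ]
#     >>> costs = [0.5 * tf.square(particle[0]) for particle in particles]
#     >>> with tf.Session() as session:
#     ...     sampler = SteinVariationalGradientDescentSampler(
#     ...         particles=particles, cost_fun=costs, alpha=0.9,
#     ...         session=session, dtype=tf.float64, seed=1
#     ...     )
#     ...     sample, cost = next(sampler)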