def __init__(self, normal_sampler_fn, log_likelihood_fn, seed=None, name=None):
  """Initializes this transition kernel.

  Args:
    normal_sampler_fn: Python callable that takes in a seed and returns a
      sample from a multivariate normal distribution. Note that the shape of
      the samples must agree with `log_likelihood_fn`.
    log_likelihood_fn: Python callable which takes an argument like
      `current_state` (or `*current_state` if it is a list) and returns its
      (possibly unnormalized) log-likelihood.
    seed: Python integer to seed the random number generator.
    name: Python `str` name prefixed to Ops created by this function.
      Default value: `None` (i.e., 'slice_sampler_kernel').
  """
  self._seed_stream = tfp_util.SeedStream(
      seed, salt='elliptical_slice_sampler')
  self._parameters = dict(normal_sampler_fn=normal_sampler_fn,
                          log_likelihood_fn=log_likelihood_fn,
                          seed=seed,
                          name=name)
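For context, here is a minimal usage sketch of a kernel with this constructor. It assumes the kernel is exposed as `tfp.experimental.mcmc.EllipticalSliceSampler` (the exact export path may differ across TFP versions); the toy prior and likelihood are made up for illustration.

```python
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

# Prior over the latent state: a 2-dimensional standard normal.
def normal_sampler_fn(seed):
  return tf.random.normal([2], seed=seed)

# (Possibly unnormalized) log-likelihood of a toy observation model.
def log_likelihood_fn(state):
  return tf.reduce_sum(tfd.Normal(loc=state, scale=0.5).log_prob([1., -1.]))

kernel = tfp.experimental.mcmc.EllipticalSliceSampler(
    normal_sampler_fn=normal_sampler_fn,
    log_likelihood_fn=log_likelihood_fn,
    seed=42)

# With `trace_fn=None`, `sample_chain` returns only the chain states.
samples = tfp.mcmc.sample_chain(
    num_results=500,
    current_state=tf.zeros([2]),
    kernel=kernel,
    num_burnin_steps=100,
    trace_fn=None)
```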
def _get_mixing_indices(size, seed=None, name=None):
  """Generates an array of indices suitable for the mutation operation.

  The mutation operation in differential evolution requires that, for every
  element of the population, three distinct other elements be chosen to
  produce a trial candidate. This function generates an array of shape
  [size, 3] satisfying the properties that:
    (a). array[i, :] does not contain the index 'i'.
    (b). array[i, :] does not contain any repeated indices.
    (c). All elements in the array are between 0 and size - 1 inclusive.

  Args:
    size: Scalar integer `Tensor`. The number of samples as well as the range
      of the indices to sample from.
    seed: `int` or None. The random seed for this `Op`. If `None`, no seed is
      applied.
      Default value: `None`.
    name: Python `str` name prefixed to Ops created by this function.
      Default value: 'get_mixing_indices'.

  Returns:
    sample: A `Tensor` of shape [size, 3] and same dtype as `size` containing
      samples without replacement between 0 and size - 1 (inclusive), with the
      `i`th row not including the number `i`.
  """
  with tf1.name_scope(
      name, default_name='get_mixing_indices', values=[size]):
    size = tf.convert_to_tensor(value=size)
    dtype = size.dtype
    seed_stream = tfp_util.SeedStream(seed, salt='get_mixing_indices')
    first = tf.random.uniform([size],
                              maxval=size-1,
                              dtype=dtype,
                              seed=seed_stream())
    second = tf.random.uniform([size],
                               maxval=size-2,
                               dtype=dtype,
                               seed=seed_stream())
    third = tf.random.uniform([size],
                              maxval=size-3,
                              dtype=dtype,
                              seed=seed_stream())

    # Shift `second` if it is on top of or to the right of `first`.
    second = tf1.where(first < second, x=second, y=second + 1)
    smaller = tf.math.minimum(first, second)
    larger = tf.math.maximum(first, second)
    # Shift the third one so it does not coincide with either the first or the
    # second number. Assuming first < second, shift by 1 if the number is in
    # [first, second) and by 2 if the number is greater than or equal to the
    # second.
    third = tf1.where(third < smaller, x=third, y=third + 1)
    third = tf1.where(third < larger, x=third, y=third + 1)
    sample = tf.stack([first, second, third], axis=1)
    to_avoid = tf.expand_dims(tf.range(size), axis=-1)
    sample = tf1.where(sample < to_avoid, x=sample, y=sample + 1)
    return sample
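The shifting argument above can be checked in isolation. A small NumPy sketch (illustrative only, not the library code) reproduces the same sequence of shifts for a single row and asserts properties (a)-(c):

```python
import numpy as np

def mix_row(i, size, rng):
  # Draw raw indices from successively smaller ranges, as in the TF code.
  first = rng.integers(size - 1)
  second = rng.integers(size - 2)
  third = rng.integers(size - 3)
  # Shift `second` past `first` so the two are distinct.
  if second >= first:
    second += 1
  smaller, larger = min(first, second), max(first, second)
  # Shift `third` past both occupied positions.
  if third >= smaller:
    third += 1
  if third >= larger:
    third += 1
  row = np.array([first, second, third])
  # Finally shift everything at or above `i`, so `i` itself never appears.
  row[row >= i] += 1
  return row

rng = np.random.default_rng(0)
size = 10
for i in range(size):
  row = mix_row(i, size, rng)
  assert i not in row                      # Property (a).
  assert len(set(row)) == 3                # Property (b).
  assert row.min() >= 0 and row.max() < size  # Property (c).
```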
def _joint_sample_n(self, n, seed=None): """Draw a joint sample from the prior over latents and observations. This sampler is specific to LocalLevel models and is faster than the generic LinearGaussianStateSpaceModel implementation. Args: n: `int` `Tensor` number of samples to draw. seed: Optional `int` `Tensor` seed for the random number generator. Returns: latents: `float` `Tensor` of shape `concat([[n], self.batch_shape, [self.num_timesteps, self.latent_size]], axis=0)` representing samples of latent trajectories. observations: `float` `Tensor` of shape `concat([[n], self.batch_shape, [self.num_timesteps, self.observation_size]], axis=0)` representing samples of observed series generated from the sampled `latents`. """ with tf.name_scope('joint_sample_n'): strm = util.SeedStream(seed, 'LocalLevelStateSpaceModel_joint_sample_n') if self.batch_shape.is_fully_defined(): batch_shape = self.batch_shape.as_list() else: batch_shape = self.batch_shape_tensor() sample_and_batch_shape = tf.cast( prefer_static.concat([[n], batch_shape], axis=0), tf.int32) # Sample the initial timestep from the prior. Since we want # this sample to have full batch shape (not just the batch shape # of the self.initial_state_prior object which might in general be # smaller), we augment the sample shape to include whatever # extra batch dimensions are required. initial_level = self.initial_state_prior.sample( linear_gaussian_ssm._augment_sample_shape( # pylint: disable=protected-access self.initial_state_prior, sample_and_batch_shape, self.validate_args), seed=strm()) # Sample the latent random walk and observed noise, more efficiently than # the generic loop in `LinearGaussianStateSpaceModel`. level_jumps = (tf.random.normal(prefer_static.concat( [sample_and_batch_shape, [self.num_timesteps - 1]], axis=0), dtype=self.dtype, seed=strm()) * self.level_scale[..., tf.newaxis]) prior_level_sample = tf.cumsum(tf.concat( [initial_level, level_jumps], axis=-1), axis=-1) prior_observation_sample = prior_level_sample + ( # Sample noise. tf.random.normal(prefer_static.shape(prior_level_sample), dtype=self.dtype, seed=strm()) * self.observation_noise_scale[..., tf.newaxis]) return (prior_level_sample[..., tf.newaxis], prior_observation_sample[..., tf.newaxis])
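The speedup comes from sampling the whole latent random walk at once: draw every level increment independently, then cumulatively sum. A standalone sketch of that trick in plain TensorFlow (the scales are made up and this is not tied to the class above):

```python
import tensorflow as tf

num_timesteps = 50
level_scale = 0.3
observation_noise_scale = 0.1

# Initial level drawn from a prior (a standard normal, for illustration only).
initial_level = tf.random.normal([1])

# Sample all subsequent level increments in one call...
level_jumps = level_scale * tf.random.normal([num_timesteps - 1])

# ...and cumulatively sum them to obtain the latent random walk.
levels = tf.cumsum(tf.concat([initial_level, level_jumps], axis=0))

# Observations are the latent level plus iid Gaussian noise.
observations = levels + observation_noise_scale * tf.random.normal(
    [num_timesteps])
```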
def prior_sample(self, num_timesteps, initial_step=0, params_sample_shape=(), trajectories_sample_shape=(), seed=None): """Sample from the joint prior over model parameters and trajectories. Args: num_timesteps: Scalar `int` `Tensor` number of timesteps to model. initial_step: Optional scalar `int` `Tensor` specifying the starting timestep. Default value: 0. params_sample_shape: Number of possible worlds to sample iid from the parameter prior, or more generally, `Tensor` `int` shape to fill with iid samples. Default value: [] (i.e., draw a single sample and don't expand the shape). trajectories_sample_shape: For each sampled set of parameters, number of trajectories to sample, or more generally, `Tensor` `int` shape to fill with iid samples. Default value: [] (i.e., draw a single sample and don't expand the shape). seed: Python `int` random seed. Returns: trajectories: `float` `Tensor` of shape `trajectories_sample_shape + params_sample_shape + [num_timesteps, 1]` containing all sampled trajectories. param_samples: list of sampled parameter value `Tensor`s, in order corresponding to `self.parameters`, each of shape `params_sample_shape + prior.batch_shape + prior.event_shape`. """ seed = tfp_util.SeedStream(seed, salt='StructuralTimeSeries_prior_sample') with tf1.name_scope('prior_sample', values=[ num_timesteps, params_sample_shape, trajectories_sample_shape ]): param_samples = [ p.prior.sample(params_sample_shape, seed=seed(), name=p.name) for p in self.parameters ] model = self.make_state_space_model(num_timesteps=num_timesteps, initial_step=initial_step, param_vals=param_samples) return model.sample(trajectories_sample_shape, seed=seed()), param_samples
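A usage sketch, assuming a simple `tfp.sts.LocalLinearTrend` component with its default priors (which do not require an observed series):

```python
import tensorflow_probability as tfp

model = tfp.sts.LocalLinearTrend()

trajectories, param_samples = model.prior_sample(
    num_timesteps=100,
    params_sample_shape=[5],        # 5 independent draws of parameters...
    trajectories_sample_shape=[3],  # ...and 3 trajectories for each draw.
    seed=42)

# Per the docstring above, `trajectories` should have shape [3, 5, 100, 1],
# and `param_samples` is a list aligned with `model.parameters`, each entry
# here of shape [5] since the default priors are scalar.
```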
def _get_starting_population(initial_population, initial_position, population_size, population_stddev, seed): """Constructs the initial population. If an initial population is not already provided, this function constructs a population by adding random normal noise to the initial position. Args: initial_population: None or a list of `Tensor`s. The initial population. initial_position: None or a list of `Tensor`s. The initial position. If initial_population is None, this argument must not be None. population_size: Scalar integer `Tensor`. The number of members in the population. If the initial population is not None, this parameter is ignored. population_stddev: A positive scalar real `Tensor` of the same dtype as `initial_position` or `initial_population` (whichever is not None). This parameter is ignored if `initial_population` is specified. Used to generate the population from the `initial_position` by adding random normal noise with zero mean and the specified standard deviation. seed: Seed for random number generation. Returns: A list of `Tensor`s. The initial population. """ if initial_population is not None: return [tf.convert_to_tensor(value=part) for part in initial_population] # Constructs the population by adding normal noise to the initial position. seed_stream = tfp_util.SeedStream(seed, salt='get_starting_population') population = [] for part in initial_position: part = tf.convert_to_tensor(value=part) part_event_shape = tf.shape(input=part) # We only draw population_size-1 random vectors because we want to ensure # that the supplied position is part of the population. The first member # is set to be the initial_position. population_part_shape = tf.concat([[population_size-1], part_event_shape], axis=0) population_part = tf.random.normal(population_part_shape, stddev=population_stddev, dtype=part.dtype.base_dtype, seed=seed_stream()) population_part += part population_part = tf.concat([[part], population_part], axis=0) population.append(population_part) return population
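A toy sketch of the same construction outside the optimizer (illustrative values only): draw `population_size - 1` noisy copies of the initial position and prepend the original point so it is always a member of the population.

```python
import tensorflow as tf

population_size = 8
population_stddev = 0.5
initial_position = [tf.constant([1.0, 2.0, 3.0])]

population = []
for part in initial_position:
  # population_size - 1 noisy copies around `part`.
  noise = tf.random.normal(
      [population_size - 1] + part.shape.as_list(),
      stddev=population_stddev, dtype=part.dtype, seed=1)
  # Prepend the supplied position as the first member.
  population.append(tf.concat([[part], part + noise], axis=0))

# population[0] has shape [8, 3]; its first row equals initial_position[0].
```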
def build_factored_surrogate_posterior( event_shape=None, constraining_bijectors=None, initial_unconstrained_loc=_sample_uniform_initial_loc, initial_unconstrained_scale=1e-2, trainable_distribution_fn=_build_trainable_normal_dist, seed=None, validate_args=False, name=None): """Builds a joint variational posterior that factors over model variables. By default, this method creates an independent trainable Normal distribution for each variable, transformed using a bijector (if provided) to match the support of that variable. This makes extremely strong assumptions about the posterior: that it is approximately normal (or transformed normal), and that all model variables are independent. Args: event_shape: `Tensor` shape, or nested structure of `Tensor` shapes, specifying the event shape(s) of the posterior variables. constraining_bijectors: Optional `tfb.Bijector` instance, or nested structure of such instances, defining support(s) of the posterior variables. The structure must match that of `event_shape` and may contain `None` values. A posterior variable will be modeled as `tfd.TransformedDistribution(underlying_dist, constraining_bijector)` if a corresponding constraining bijector is specified, otherwise it is modeled as supported on the unconstrained real line. initial_unconstrained_loc: Optional Python `callable` with signature `tensor = initial_unconstrained_loc(shape, seed)` used to sample real-valued initializations for the unconstrained representation of each variable. May alternately be a nested structure of `Tensor`s, giving specific initial locations for each variable; these must have structure matching `event_shape` and shapes determined by the inverse image of `event_shape` under `constraining_bijectors`, which may optionally be prefixed with a common batch shape. Default value: `functools.partial(tf.random.uniform, minval=-2., maxval=2., dtype=tf.float32)`. initial_unconstrained_scale: Optional scalar float `Tensor` initial scale for the unconstrained distributions, or a nested structure of `Tensor` initial scales for each variable. Default value: `1e-2`. trainable_distribution_fn: Optional Python `callable` with signature `trainable_dist = trainable_distribution_fn(initial_loc, initial_scale, event_ndims, validate_args)`. This is called for each model variable to build the corresponding factor in the surrogate posterior. It is expected that the distribution returned is supported on unconstrained real values. Default value: `functools.partial( tfp.experimental.vi.build_trainable_location_scale_distribution, distribution_fn=tfd.Normal)`, i.e., a trainable Normal distribution. seed: Python integer to seed the random number generator. This is used only when `initial_loc` is not specified. validate_args: Python `bool`. Whether to validate input with asserts. This imposes a runtime cost. If `validate_args` is `False`, and the inputs are invalid, correct behavior is not guaranteed. Default value: `False`. name: Python `str` name prefixed to ops created by this function. Default value: `None` (i.e., 'build_factored_surrogate_posterior'). Returns: surrogate_posterior: A `tfd.Distribution` instance whose samples have shape and structure matching that of `event_shape` or `initial_loc`. 
  ### Examples

  Consider a Gamma model with unknown parameters, expressed as a joint
  Distribution:

  ```python
  Root = tfd.JointDistributionCoroutine.Root
  def model_fn():
    concentration = yield Root(tfd.Exponential(1.))
    rate = yield Root(tfd.Exponential(1.))
    y = yield tfd.Sample(tfd.Gamma(concentration=concentration, rate=rate),
                         sample_shape=4)
  model = tfd.JointDistributionCoroutine(model_fn)
  ```

  Let's use variational inference to approximate the posterior over the
  data-generating parameters for some observed `y`. We'll build a surrogate
  posterior distribution by specifying the shapes of the latent
  `concentration` and `rate` parameters, and that both are constrained to be
  positive.

  ```python
  surrogate_posterior = tfp.experimental.vi.build_factored_surrogate_posterior(
    event_shape=model.event_shape_tensor()[:-1],  # Omit the observed `y`.
    constraining_bijectors=[tfb.Softplus(),   # Concentration is positive.
                            tfb.Softplus()])  # Rate is positive.
  ```

  This creates a trainable joint distribution, defined by variables in
  `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior`
  to fit this distribution by minimizing a divergence to the true posterior.

  ```python
  y = [0.2, 0.5, 0.3, 0.7]
  losses = tfp.vi.fit_surrogate_posterior(
    lambda concentration, rate: model.log_prob([concentration, rate, y]),
    surrogate_posterior=surrogate_posterior,
    num_steps=100,
    optimizer=tf.optimizers.Adam(0.1),
    sample_size=10)

  # After optimization, samples from the surrogate will approximate
  # samples from the true posterior.
  samples = surrogate_posterior.sample(100)
  posterior_mean = [tf.reduce_mean(x) for x in samples]     # mean ~= [1.1, 2.1]
  posterior_std = [tf.math.reduce_std(x) for x in samples]  # std ~= [0.3, 0.8]
  ```

  If we wanted to initialize the optimization at a specific location, we could
  specify one when we build the surrogate posterior. This function requires the
  initial location to be specified in *unconstrained* space; we do this by
  inverting the constraining bijectors (note this section also demonstrates the
  creation of a dict-structured model).

  ```python
  initial_loc = {'concentration': 0.4, 'rate': 0.2}
  constraining_bijectors = {'concentration': tfb.Softplus(),  # Positive.
                            'rate': tfb.Softplus()}           # Positive.
  initial_unconstrained_loc = tf.nest.map_structure(
    lambda b, x: b.inverse(x) if b is not None else x,
    constraining_bijectors, initial_loc)
  surrogate_posterior = tfp.experimental.vi.build_factored_surrogate_posterior(
    event_shape=tf.nest.map_structure(tf.shape, initial_loc),
    constraining_bijectors=constraining_bijectors,
    initial_unconstrained_loc=initial_unconstrained_loc,
    initial_unconstrained_scale=1e-4)
  ```
  """
  with tf.name_scope(name or 'build_factored_surrogate_posterior'):
    seed = tfp_util.SeedStream(seed, salt='build_factored_surrogate_posterior')

    # Convert event shapes to Tensors.
    shallow_structure = _get_event_shape_shallow_structure(event_shape)
    event_shape = nest.map_structure_up_to(
        shallow_structure, lambda s: tf.convert_to_tensor(s, dtype=tf.int32),
        event_shape)
    flat_event_shapes = tf.nest.flatten(event_shape)

    # For simplicity, we'll work with flattened lists of state parts and
    # repack the structure at the end.
    if constraining_bijectors is not None:
      flat_bijectors = tf.nest.flatten(constraining_bijectors)
    else:
      flat_bijectors = [None for _ in flat_event_shapes]
    flat_unconstrained_event_shapes = [
        b.inverse_event_shape_tensor(s) if b is not None else s
        for s, b in zip(flat_event_shapes, flat_bijectors)]

    # Construct initial locations for the internal unconstrained dists.
if callable( initial_unconstrained_loc): # Sample random initialization. flat_unconstrained_locs = [ initial_unconstrained_loc(shape=s, seed=seed()) for s in flat_unconstrained_event_shapes ] else: # Use provided initialization. flat_unconstrained_locs = nest.flatten_up_to( shallow_structure, initial_unconstrained_loc, check_types=False) if nest.is_nested(initial_unconstrained_scale): flat_unconstrained_scales = nest.flatten_up_to( shallow_structure, initial_unconstrained_scale, check_types=False) else: flat_unconstrained_scales = [ initial_unconstrained_scale for _ in flat_unconstrained_locs ] # Extract the rank of each event, so that we build distributions with the # correct event shapes. flat_unconstrained_event_ndims = [ prefer_static.rank_from_shape(s) for s in flat_unconstrained_event_shapes ] # Build the component surrogate posteriors. flat_component_dists = [] for initial_loc, initial_scale, event_ndims, bijector in zip( flat_unconstrained_locs, flat_unconstrained_scales, flat_unconstrained_event_ndims, flat_bijectors): unconstrained_dist = trainable_distribution_fn( initial_loc=initial_loc, initial_scale=initial_scale, event_ndims=event_ndims, validate_args=validate_args) flat_component_dists.append( bijector(unconstrained_dist ) if bijector is not None else unconstrained_dist) component_distributions = tf.nest.pack_sequence_as( event_shape, flat_component_dists) # Return a `Distribution` object whose events have the specified structure. return (joint_distribution_util. independent_joint_distribution_from_structure( component_distributions, validate_args=validate_args))
def _binary_crossover(population, population_size, mutants, crossover_prob,
                      seed):
  """Performs recombination by binary crossover for the current population.

  Let v_i denote the i'th component of a population member v and m_i the
  corresponding component of its mutant vector m. The crossed-over vector w is
  determined by setting w_i = m_i with probability `crossover_prob` and
  w_i = v_i otherwise. In addition, DE requires that at least one of the
  components is crossed over (otherwise we end up with no change). This is
  done by choosing an index, say k, uniformly at random and forcing a
  crossover at that position (i.e. w_k = m_k). This is the scheme implemented
  in this function.

  Args:
    population: A Python list of `Tensor`s where each `Tensor` in the list
      must be of rank at least 1 and all the elements must have a common
      first dimension. The base population to cross over.
    population_size: A scalar integer `Tensor`. The number of elements in the
      population (i.e. size of the first dimension of any member of
      `population`).
    mutants: A Python list of `Tensor`s with the same structure as
      `population`. The mutated population.
    crossover_prob: A positive real scalar `Tensor` bounded above by 1.0. The
      probability of a crossover being performed for each axis.
    seed: `int` or None. The random seed for this `Op`. If `None`, no seed is
      applied.

  Returns:
    A list of `Tensor`s of the same structure, dtype and shape as
    `population`. The recombined population.
  """
  sizes = [tf.cast(tf.size(input=x), dtype=tf.float64) for x in population]
  seed_stream = tfp_util.SeedStream(seed, salt='binary_crossover')
  force_crossover_group = distributions.Categorical(sizes).sample(
      [population_size, 1], seed=seed_stream())
  recombinants = []
  for i, population_part in enumerate(population):
    pop_part_flat = tf.reshape(population_part, [population_size, -1])
    mutant_part_flat = tf.reshape(mutants[i], [population_size, -1])
    part_size = tf.size(input=population_part) // population_size
    force_crossovers = tf.one_hot(
        tf.random.uniform([population_size],
                          minval=0,
                          maxval=part_size,
                          dtype=tf.int32,
                          seed=seed_stream()),
        part_size,
        on_value=True,
        off_value=False,
        dtype=tf.bool)  # Tensor of shape [population_size, part_size].
    group_mask = tf.math.equal(force_crossover_group, i)
    force_crossovers &= group_mask
    do_binary_crossover = tf.random.uniform(
        [population_size, part_size],
        dtype=crossover_prob.dtype.base_dtype,
        seed=seed_stream()) < crossover_prob
    do_binary_crossover |= force_crossovers
    recombinant_flat = tf1.where(
        do_binary_crossover, x=mutant_part_flat, y=pop_part_flat)
    recombinant = tf.reshape(recombinant_flat, tf.shape(input=population_part))
    recombinants.append(recombinant)
  return recombinants
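A NumPy sketch of the crossover scheme for a single, already-flattened population part; the categorical grouping across multiple parts is omitted for simplicity, and all values are illustrative.

```python
import numpy as np

rng = np.random.default_rng(0)
population_size, part_size = 4, 6
crossover_prob = 0.9

pop = rng.normal(size=[population_size, part_size])
mutants = rng.normal(size=[population_size, part_size])

# Independent Bernoulli(crossover_prob) decision per site.
do_crossover = rng.uniform(size=[population_size, part_size]) < crossover_prob

# Force at least one crossover per member by picking one column at random.
forced_col = rng.integers(part_size, size=population_size)
do_crossover[np.arange(population_size), forced_col] = True

# Take the mutant's value where crossover happens, else keep the original.
recombinants = np.where(do_crossover, mutants, pop)
```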
def one_step( objective_function, population, population_values=None, differential_weight=0.5, crossover_prob=0.9, seed=None, name=None): """Performs one step of the differential evolution algorithm. Args: objective_function: A Python callable that accepts a batch of possible solutions and returns the values of the objective function at those arguments as a rank 1 real `Tensor`. This specifies the function to be minimized. The input to this callable may be either a single `Tensor` or a Python `list` of `Tensor`s. The signature must match the format of the argument `population`. (i.e., objective_function(*population) must return the value of the function to be minimized). population: `Tensor` or Python `list` of `Tensor`s representing the current population vectors. Each `Tensor` must be of the same real dtype. The first dimension indexes individual population members while the rest of the dimensions are consumed by the value function. For example, if the population is a single `Tensor` of shape [n, m1, m2], then `n` is the population size and the output of `objective_function` applied to the population is a `Tensor` of shape [n]. If the population is a python list of `Tensor`s then each `Tensor` in the list should have the first axis of a common size, say `n` and `objective_function(*population)` should return a `Tensor of shape [n]. The population must have at least 4 members for the algorithm to work correctly. population_values: A `Tensor` of rank 1 and real dtype. The result of applying `objective_function` to the `population`. If not supplied it is computed using the `objective_function`. Default value: None. differential_weight: Real scalar `Tensor`. Must be positive and less than 2.0. The parameter controlling the strength of mutation. Default value: 0.5 crossover_prob: Real scalar `Tensor`. Must be between 0 and 1. The probability of recombination per site. Default value: 0.9 seed: `int` or None. The random seed for this `Op`. If `None`, no seed is applied. Default value: None. name: (Optional) Python str. The name prefixed to the ops created by this function. If not supplied, the default name 'one_step' is used. Default value: None Returns: A sequence containing the following elements (in order): next_population: A `Tensor` or Python `list` of `Tensor`s of the same structure as the input population. The population at the next generation. next_population_values: A `Tensor` of same shape and dtype as input `population_values`. The function values for the `next_population`. """ with tf1.name_scope( name, 'one_step', [population, population_values, differential_weight, crossover_prob]): population, _ = _ensure_list(population) if population_values is None: population_values = objective_function(*population) population_size = tf.shape(input=population[0])[0] seed_stream = tfp_util.SeedStream(seed, salt='one_step') mixing_indices = _get_mixing_indices(population_size, seed=seed_stream()) # Construct the mutated solution vectors. There is one for each member of # the population. mutants = _get_mutants(population, population_size, mixing_indices, differential_weight) # Perform recombination between the parents and the mutants. 
candidates = _binary_crossover(population, population_size, mutants, crossover_prob, seed=seed_stream()) candidate_values = objective_function(*candidates) if population_values is None: population_values = objective_function(*population) infinity = tf.zeros_like(population_values) + np.inf population_values = tf1.where( tf.math.is_nan(population_values), x=infinity, y=population_values) to_replace = candidate_values < population_values next_population = [ tf1.where(to_replace, x=candidates_part, y=population_part) for candidates_part, population_part in zip(candidates, population) ] next_values = tf1.where( to_replace, x=candidate_values, y=population_values) return next_population, next_values
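A usage sketch of `one_step` on a toy quadratic. It assumes the function is exported as `tfp.optimizer.differential_evolution_one_step`. Two details follow from the code above: `crossover_prob` is passed as a `Tensor` because `_binary_crossover` reads its dtype, and the returned population is a Python list of `Tensor`s (via `_ensure_list`), so we keep the population as a single-element list throughout.

```python
import tensorflow as tf
import tensorflow_probability as tfp

# Minimize f(x) = ||x - 1||^2, evaluated independently for each member.
def objective_fn(x):
  return tf.reduce_sum((x - 1.) ** 2, axis=-1)

population = [tf.random.normal([20, 2], seed=7)]  # One state part, 20 members.
values = objective_fn(*population)

for step in range(50):
  population, values = tfp.optimizer.differential_evolution_one_step(
      objective_fn,
      population,
      population_values=values,
      differential_weight=0.5,
      crossover_prob=tf.constant(0.9),  # Tensor, so its dtype is available.
      seed=step)

best = population[0][tf.argmin(values)]  # Should approach [1., 1.].
```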
def fit_with_hmc(model, observed_time_series, num_results=100, num_warmup_steps=50, num_leapfrog_steps=15, initial_state=None, initial_step_size=None, chain_batch_shape=(), num_variational_steps=150, variational_optimizer=None, variational_sample_size=5, seed=None, name=None): """Draw posterior samples using Hamiltonian Monte Carlo (HMC). Markov chain Monte Carlo (MCMC) methods are considered the gold standard of Bayesian inference; under suitable conditions and in the limit of infinitely many draws they generate samples from the true posterior distribution. HMC [1] uses gradients of the model's log-density function to propose samples, allowing it to exploit posterior geometry. However, it is computationally more expensive than variational inference and relatively sensitive to tuning. This method attempts to provide a sensible default approach for fitting StructuralTimeSeries models using HMC. It first runs variational inference as a fast posterior approximation, and initializes the HMC sampler from the variational posterior, using the posterior standard deviations to set per-variable step sizes (equivalently, a diagonal mass matrix). During the warmup phase, it adapts the step size to target an acceptance rate of 0.75, which is thought to be in the desirable range for optimal mixing [2]. Args: model: An instance of `StructuralTimeSeries` representing a time-series model. This represents a joint distribution over time-series and their parameters with batch shape `[b1, ..., bN]`. observed_time_series: `float` `Tensor` of shape `concat([sample_shape, model.batch_shape, [num_timesteps, 1]]) where `sample_shape` corresponds to i.i.d. observations, and the trailing `[1]` dimension may (optionally) be omitted if `num_timesteps > 1`. May optionally be an instance of `tfp.sts.MaskedTimeSeries`, which includes a mask `Tensor` to specify timesteps with missing observations. num_results: Integer number of Markov chain draws. Default value: `100`. num_warmup_steps: Integer number of steps to take before starting to collect results. The warmup steps are also used to adapt the step size towards a target acceptance rate of 0.75. Default value: `50`. num_leapfrog_steps: Integer number of steps to run the leapfrog integrator for. Total progress per HMC step is roughly proportional to `step_size * num_leapfrog_steps`. Default value: `15`. initial_state: Optional Python `list` of `Tensor`s, one for each model parameter, representing the initial state(s) of the Markov chain(s). These should have shape `concat([chain_batch_shape, param.prior.batch_shape, param.prior.event_shape])`. If `None`, the initial state is set automatically using a sample from a variational posterior. Default value: `None`. initial_step_size: Python `list` of `Tensor`s, one for each model parameter, representing the step size for the leapfrog integrator. Must broadcast with the shape of `initial_state`. Larger step sizes lead to faster progress, but too-large step sizes make rejection exponentially more likely. If `None`, the step size is set automatically using the standard deviation of a variational posterior. Default value: `None`. chain_batch_shape: Batch shape (Python `tuple`, `list`, or `int`) of chains to run in parallel. Default value: `[]` (i.e., a single chain). num_variational_steps: Python `int` number of steps to run the variational optimization to determine the initial state and step sizes. Default value: `150`. variational_optimizer: Optional `tf.train.Optimizer` instance to use in the variational optimization. 
If `None`, defaults to `tf.train.AdamOptimizer(0.1)`. Default value: `None`. variational_sample_size: Python `int` number of Monte Carlo samples to use in estimating the variational divergence. Larger values may stabilize the optimization, but at higher cost per step in time and memory. Default value: `1`. seed: Python integer to seed the random number generator. name: Python `str` name prefixed to ops created by this function. Default value: `None` (i.e., 'fit_with_hmc'). Returns: samples: Python `list` of `Tensors` representing posterior samples of model parameters, with shapes `[concat([[num_results], chain_batch_shape, param.prior.batch_shape, param.prior.event_shape]) for param in model.parameters]`. kernel_results: A (possibly nested) `tuple`, `namedtuple` or `list` of `Tensor`s representing internal calculations made within the HMC sampler. #### Examples Assume we've built a structural time-series model: ```python day_of_week = tfp.sts.Seasonal( num_seasons=7, observed_time_series=observed_time_series, name='day_of_week') local_linear_trend = tfp.sts.LocalLinearTrend( observed_time_series=observed_time_series, name='local_linear_trend') model = tfp.sts.Sum(components=[day_of_week, local_linear_trend], observed_time_series=observed_time_series) ``` To draw posterior samples using HMC under default settings: ```python samples, kernel_results = tfp.sts.fit_with_hmc(model, observed_time_series) print("acceptance rate: {}".format( np.mean(kernel_results.inner_results.inner_results.is_accepted, axis=0))) print("posterior means: {}".format( {param.name: np.mean(param_draws, axis=0) for (param, param_draws) in zip(model.parameters, samples)})) ``` We can also run multiple chains. This may help diagnose convergence issues and allows us to exploit vectorization to draw samples more quickly, although warmup still requires the same number of sequential steps. ```python from matplotlib import pylab as plt samples, kernel_results = tfp.sts.fit_with_hmc( model, observed_time_series, chain_batch_shape=[10]) print("acceptance rate: {}".format( np.mean(kernel_results.inner_results.inner_results.is_accepted, axis=0))) # Plot the sampled traces for each parameter. If the chains have mixed, their # traces should all cover the same region of state space, frequently crossing # over each other. for (param, param_draws) in zip(model.parameters, samples): if param.prior.event_shape.ndims > 0: print("Only plotting traces for scalar parameters, skipping {}".format( param.name)) continue plt.figure(figsize=[10, 4]) plt.title(param.name) plt.plot(param_draws.numpy()) plt.ylabel(param.name) plt.xlabel("HMC step") # Combining the samples from multiple chains into a single dimension allows # us to easily pass sampled parameters to downstream forecasting methods. combined_samples = [np.reshape(param_draws, [-1] + list(param_draws.shape[2:])) for param_draws in samples] ``` For greater flexibility, you may prefer to implement your own sampler using the TensorFlow Probability primitives in `tfp.mcmc`. The following recipe constructs a basic HMC sampler, using a `TransformedTransitionKernel` to incorporate constraints on the parameter space. 
```python transformed_hmc_kernel = tfp.mcmc.TransformedTransitionKernel( inner_kernel=tfp.mcmc.DualAveragingStepSizeAdaptation( inner_kernel=tfp.mcmc.HamiltonianMonteCarlo( target_log_prob_fn=model.joint_log_prob(observed_time_series), step_size=step_size, num_leapfrog_steps=num_leapfrog_steps, state_gradients_are_stopped=True, seed=seed), num_adaptation_steps = int(0.8 * num_warmup_steps)), bijector=[param.bijector for param in model.parameters]) # Initialize from a Uniform[-2, 2] distribution in unconstrained space. initial_state = [tfp.sts.sample_uniform_initial_state( param, return_constrained=True) for param in model.parameters] samples, kernel_results = tfp.mcmc.sample_chain( kernel=transformed_hmc_kernel, num_results=num_results, current_state=initial_state, num_burnin_steps=num_warmup_steps) ``` #### References [1]: Radford Neal. MCMC Using Hamiltonian Dynamics. _Handbook of Markov Chain Monte Carlo_, 2011. https://arxiv.org/abs/1206.1901 [2] M.J. Betancourt, Simon Byrne, and Mark Girolami. Optimizing The Integrator Step Size for Hamiltonian Monte Carlo. https://arxiv.org/abs/1411.6669 """ with tf.name_scope(name or 'fit_with_hmc') as name: seed = tfp_util.SeedStream(seed, salt='StructuralTimeSeries_fit_with_hmc') observed_time_series = sts_util.pad_batch_dimension_for_multiple_chains( observed_time_series, model, chain_batch_shape=chain_batch_shape) target_log_prob_fn = model.joint_log_prob(observed_time_series) # Initialize state and step sizes from a variational posterior if not # specified. if initial_step_size is None or initial_state is None: variational_posterior = build_factored_surrogate_posterior( model, batch_shape=chain_batch_shape, seed=seed()) if variational_optimizer is None: variational_optimizer = tf1.train.AdamOptimizer( learning_rate=0.1 ) # TODO(b/137299119) Replace with TF2 optimizer. loss_curve = vi.fit_surrogate_posterior( target_log_prob_fn, variational_posterior, sample_size=variational_sample_size, num_steps=num_variational_steps, optimizer=variational_optimizer, seed=seed()) with tf.control_dependencies([loss_curve]): if initial_state is None: posterior_sample = variational_posterior.sample() initial_state = [ posterior_sample[p.name] for p in model.parameters ] # Set step sizes using the unconstrained variational distribution. if initial_step_size is None: q_dists_by_name, _ = variational_posterior.sample_distributions( ) initial_step_size = [ q_dists_by_name[p.name].distribution.stddev() for p in model.parameters ] # Run HMC to sample from the posterior on parameters. @tf.function(autograph=False) def run_hmc(): return mcmc.sample_chain( num_results=num_results, current_state=initial_state, num_burnin_steps=num_warmup_steps, kernel=mcmc.DualAveragingStepSizeAdaptation( inner_kernel=mcmc.TransformedTransitionKernel( inner_kernel=mcmc.HamiltonianMonteCarlo( target_log_prob_fn=target_log_prob_fn, step_size=initial_step_size, num_leapfrog_steps=num_leapfrog_steps, state_gradients_are_stopped=True, seed=seed()), bijector=[ param.bijector for param in model.parameters ]), num_adaptation_steps=int(num_warmup_steps * 0.8)), parallel_iterations=1 if seed is not None else 10) samples, kernel_results = run_hmc() return samples, kernel_results
def build_factored_surrogate_posterior(model, batch_shape=(), seed=None, name=None): """Build a variational posterior that factors over model parameters. The surrogate posterior consists of independent Normal distributions for each parameter with trainable `loc` and `scale`, transformed using the parameter's `bijector` to the appropriate support space for that parameter. Args: model: An instance of `StructuralTimeSeries` representing a time-series model. This represents a joint distribution over time-series and their parameters with batch shape `[b1, ..., bN]`. batch_shape: Batch shape (Python `tuple`, `list`, or `int`) of initial states to optimize in parallel. Default value: `()`. (i.e., just run a single optimization). seed: Python integer to seed the random number generator. name: Python `str` name prefixed to ops created by this function. Default value: `None` (i.e., 'build_factored_surrogate_posterior'). Returns: variational_posterior: `tfd.JointDistributionNamed` defining a trainable surrogate posterior over model parameters. Samples from this distribution are Python `dict`s with Python `str` parameter names as keys. ### Examples Assume we've built a structural time-series model: ```python day_of_week = tfp.sts.Seasonal( num_seasons=7, observed_time_series=observed_time_series, name='day_of_week') local_linear_trend = tfp.sts.LocalLinearTrend( observed_time_series=observed_time_series, name='local_linear_trend') model = tfp.sts.Sum(components=[day_of_week, local_linear_trend], observed_time_series=observed_time_series) ``` To fit the model to data, we define a surrogate posterior and fit it by optimizing a variational bound: ```python surrogate_posterior = tfp.sts.build_factored_surrogate_posterior( model=model) loss_curve = tfp.vi.fit_surrogate_posterior( target_log_prob_fn=model.joint_log_prob(observed_time_series), surrogate_posterior=surrogate_posterior, optimizer=tf.optimizers.Adam(learning_rate=0.1), num_steps=200) posterior_samples = surrogate_posterior.sample(50) # In graph mode, we would need to write: # with tf.control_dependencies([loss_curve]): # posterior_samples = surrogate_posterior.sample(50) ``` For more control, we can also build and optimize a variational loss manually: ```python @tf.function(autograph=False) # Ensure the loss is computed efficiently def loss_fn(): return tfp.vi.monte_carlo_variational_loss( model.joint_log_prob(observed_time_series), surrogate_posterior, sample_size=10) optimizer = tf.optimizers.Adam(learning_rate=0.1) for step in range(200): with tf.GradientTape() as tape: loss = loss_fn() grads = tape.gradient(loss, surrogate_posterior.trainable_variables) optimizer.apply_gradients( zip(grads, surrogate_posterior.trainable_variables)) if step % 20 == 0: print('step {} loss {}'.format(step, loss)) posterior_samples = surrogate_posterior.sample(50) ``` """ with tf.name_scope(name or 'build_factored_surrogate_posterior'): seed = tfp_util.SeedStream( seed, salt='StructuralTimeSeries_build_factored_surrogate_posterior') variational_posterior = collections.OrderedDict() for param in model.parameters: variational_posterior[ param.name] = _build_posterior_for_one_parameter( param, batch_shape=batch_shape, seed=seed()) return joint_distribution_named_lib.JointDistributionNamed( variational_posterior)
def _build_sampler_loop_body(model, observed_time_series, is_missing=None, compile_steps_with_xla=False, seed=None): """Builds a Gibbs sampler for the given model and observed data. Args: model: A `tf.sts.StructuralTimeSeries` model instance. This must be of the form constructed by `build_model_for_gibbs_sampling`. observed_time_series: Float `Tensor` time series of shape `[..., num_timesteps]`. is_missing: Optional `bool` `Tensor` of shape `[..., num_timesteps]`. A `True` value indicates that the observation for that timestep is missing. compile_steps_with_xla: Optional Python `bool`. If `True`, XLA compilation is used to accelerate sampling steps when supported. seed: Optional `Python` `int` seed controlling the sampled values. Returns: sampler_loop_body: Python callable that performs a single cycle of Gibbs sampling. Its first argument is a `GibbsSamplerState`, and it returns a new `GibbsSamplerState`. The second argument (passed by `tf.scan`) is ignored. """ # Require that the model has exactly the parameters expected by # `GibbsSamplerState`. observation_noise_param, level_scale_param, weights_param = model.parameters if (('observation_noise' not in observation_noise_param.name) or ('level_scale' not in level_scale_param.name) or ('weights' not in weights_param.name)): raise ValueError('Model parameters {} do not match the expected sampler ' 'state.'.format(model.parameters)) level_component = model.components[0] if not isinstance(level_component, sts.LocalLevel): raise ValueError('Expected the first model component to be an instance of ' '`tfp.sts.LocalLevel`; instead saw {}'.format( level_component)) if is_missing is not None: # Ensure series does not contain NaNs. observed_time_series = tf.where(is_missing, tf.zeros_like(observed_time_series), observed_time_series) num_observed_steps = prefer_static.shape(observed_time_series)[-1] design_matrix = _get_design_matrix(model).to_dense()[:num_observed_steps] # Compile the functions that sample from Gibbs conditional posteriors. # In principle, we should XLA-compile the entire loop body or even the entire # `fit_with_gibbs_sampling` loop. However, XLA can't currently compile the # gamma sampling op inside `_resample_scale` (b/141253568), so for now we # leave that method uncompiled but compile the other two sampling steps. # Empirically, the vast majority of sampling time is spent in # `resample_level`, so compiling it gives us most of the wins. # TODO(davmre): Wrap the entire sampling loop in `tf.function` while still # XLA-compiling these pieces as appropriate. # TODO(b/141253568): XLA-compile the entire sampling loop. compiled_resample_level = _build_resample_level_fn( initial_state_prior=level_component.initial_state_prior, is_missing=is_missing, compile_with_xla=compile_steps_with_xla) compiled_resample_weights = tf.function( _resample_weights, autograph=False, experimental_compile=compile_steps_with_xla) compiled_resample_scale = tf.function( _resample_scale, autograph=False, experimental_compile=False) # Untransform scale priors -> variance priors by reaching thru Sqrt bijector. level_scale_variance_prior = level_scale_param.prior.distribution observation_noise_variance_prior = observation_noise_param.prior.distribution # InverseGamma samplers are currently stateful, so we only need (and want) # a single seed for each, shared across loop iterations. 
strm = tfp_util.SeedStream(seed, salt='_sampler_loop_body') observation_noise_scale_seed = strm() level_scale_seed = strm() def sampler_loop_body(previous_sample, _): """Runs one sampler iteration, resampling all model variables.""" (weights_seed, level_seed, loop_seed) = samplers.split_seed( previous_sample.seed, n=3, salt='sampler_loop_body') # We encourage a reasonable initialization by sampling the weights first, # so at the first step they are regressed directly against the observed # time series. If we instead sampled the level first it might 'explain away' # some observed variation that we would ultimately prefer to explain through # the regression weights, because the level can represent arbitrary # variation, while the weights are limited to representing variation in the # subspace given by the design matrix. weights = compiled_resample_weights( design_matrix=design_matrix, target_residuals=(observed_time_series - previous_sample.level), observation_noise_scale=previous_sample.observation_noise_scale, weights_prior_scale=weights_param.prior.distribution.scale, is_missing=is_missing, seed=weights_seed) regression_residuals = observed_time_series - tf.linalg.matvec( design_matrix, weights) level = compiled_resample_level( observed_residuals=regression_residuals, level_scale=previous_sample.level_scale, observation_noise_scale=previous_sample.observation_noise_scale, seed=level_seed) # Estimate level scale from the empirical changes in level. level_scale = compiled_resample_scale( prior_scale=level_scale_variance_prior.scale, prior_concentration=level_scale_variance_prior.concentration, observed_residuals=level[..., 1:] - level[..., :-1], is_missing=None, seed=level_scale_seed) # Estimate noise scale from the residuals. observation_noise_scale = compiled_resample_scale( prior_scale=observation_noise_variance_prior.scale, prior_concentration=observation_noise_variance_prior.concentration, observed_residuals=regression_residuals - level, is_missing=is_missing, seed=observation_noise_scale_seed) return GibbsSamplerState( observation_noise_scale=observation_noise_scale, level_scale=level_scale, weights=weights, level=level, seed=loop_seed) return sampler_loop_body
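The loop body above is designed to be driven by `tf.scan`, threading the seed through the carried `GibbsSamplerState`. A self-contained toy sketch of that pattern, with a made-up `ToyState` standing in for the real `GibbsSamplerState` and `tfp.random.split_seed` assumed as the public counterpart of the internal `samplers.split_seed`:

```python
import collections
import tensorflow as tf
import tensorflow_probability as tfp

# Toy stand-in for `GibbsSamplerState`: just a scalar level and the seed.
ToyState = collections.namedtuple('ToyState', ['level', 'seed'])

def toy_loop_body(previous_state, _):
  # Split the carried seed, as the body above does, so each sweep uses fresh
  # randomness while the whole chain stays reproducible.
  jump_seed, next_seed = tfp.random.split_seed(previous_state.seed, n=2)
  level = previous_state.level + tf.random.stateless_normal([], seed=jump_seed)
  return ToyState(level=level, seed=next_seed)

initial_state = ToyState(level=tf.zeros([]),
                         seed=tf.constant([0, 42], dtype=tf.int32))

# `tf.scan` threads the state through the sweeps; the `elems` values are
# ignored, exactly as in the sampler loop body above.
draws = tf.scan(toy_loop_body, elems=tf.range(100), initializer=initial_state)
# draws.level has shape [100]: one value per sweep.
```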
def build_factored_variational_loss(model,
                                    observed_time_series,
                                    init_batch_shape=(),
                                    seed=None,
                                    name=None):
  """Build a loss function for variational inference in STS models.

  Variational inference searches for the distribution within some family of
  approximate posteriors that minimizes a divergence between the approximate
  posterior `q(z)` and true posterior `p(z|observed_time_series)`. By
  converting inference to optimization, it's generally much faster than
  sampling-based inference algorithms such as HMC. The tradeoff is that the
  approximating family rarely contains the true posterior, so it may miss
  important aspects of posterior structure (in particular, dependence between
  variables) and should not be blindly trusted. Results may vary; it's
  generally wise to compare to HMC to evaluate whether inference quality is
  sufficient for your task at hand.

  This method constructs a loss function for variational inference using the
  Kullback-Leibler divergence `KL[q(z) || p(z|observed_time_series)]`, with an
  approximating family given by independent Normal distributions transformed
  to the appropriate parameter space for each parameter. Minimizing this loss
  (the negative ELBO) maximizes a lower bound on the log model evidence
  `log p(observed_time_series)`. This is equivalent to the 'mean-field' method
  implemented in [1] and is a standard approach. The resulting posterior
  approximations are unimodal; they will tend to underestimate posterior
  uncertainty when the true posterior contains multiple modes (the `KL[q||p]`
  divergence encourages choosing a single mode) or dependence between
  variables.

  Args:
    model: An instance of `StructuralTimeSeries` representing a
      time-series model. This represents a joint distribution over
      time-series and their parameters with batch shape `[b1, ..., bN]`.
    observed_time_series: `float` `Tensor` of shape
      `concat([sample_shape, model.batch_shape, [num_timesteps, 1]])` where
      `sample_shape` corresponds to i.i.d. observations, and the trailing
      `[1]` dimension may (optionally) be omitted if `num_timesteps > 1`. May
      optionally be an instance of `tfp.sts.MaskedTimeSeries`, which includes
      a mask `Tensor` to specify timesteps with missing observations.
    init_batch_shape: Batch shape (Python `tuple`, `list`, or `int`) of
      initial states to optimize in parallel.
      Default value: `()`. (i.e., just run a single optimization).
    seed: Python integer to seed the random number generator.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'build_factored_variational_loss').

  Returns:
    variational_loss: `float` `Tensor` of shape
      `concat([init_batch_shape, model.batch_shape])`, encoding a stochastic
      estimate of an upper bound on the negative model evidence `-log p(y)`.
      Minimizing this loss performs variational inference; the gap between the
      variational bound and the true (generally unknown) model evidence
      corresponds to the divergence `KL[q||p]` between the approximate and
      true posterior.
    variational_distributions: `collections.OrderedDict` giving the
      approximate posterior for each model parameter. The keys are Python
      `str` parameter names in order, corresponding to
      `[param.name for param in model.parameters]`. The values are
      `tfd.Distribution` instances with batch shape
      `concat([init_batch_shape, model.batch_shape])`; these will typically be
      of the form `tfd.TransformedDistribution(tfd.Normal(...),
      bijector=param.bijector)`.
#### Examples Assume we've built a structural time-series model: ```python day_of_week = tfp.sts.Seasonal( num_seasons=7, observed_time_series=observed_time_series, name='day_of_week') local_linear_trend = tfp.sts.LocalLinearTrend( observed_time_series=observed_time_series, name='local_linear_trend') model = tfp.sts.Sum(components=[day_of_week, local_linear_trend], observed_time_series=observed_time_series) ``` To run variational inference, we simply construct the loss and optimize it: ```python (variational_loss, variational_distributions) = tfp.sts.build_factored_variational_loss( model=model, observed_time_series=observed_time_series) train_op = tf.train.AdamOptimizer(0.1).minimize(variational_loss) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for step in range(200): _, loss_ = sess.run((train_op, variational_loss)) if step % 20 == 0: print("step {} loss {}".format(step, loss_)) posterior_samples_ = sess.run({ param_name: q.sample(50) for param_name, q in variational_distributions.items()}) ``` As a more complex example, we might try to avoid local optima by optimizing from multiple initializations in parallel, and selecting the result with the lowest loss: ```python (variational_loss, variational_distributions) = tfp.sts.build_factored_variational_loss( model=model, observed_time_series=observed_time_series, init_batch_shape=[10]) train_op = tf.train.AdamOptimizer(0.1).minimize(variational_loss) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for step in range(200): _, loss_ = sess.run((train_op, variational_loss)) if step % 20 == 0: print("step {} losses {}".format(step, loss_)) # Draw multiple samples to reduce Monte Carlo error in the optimized # variational bounds. avg_loss = np.mean( [sess.run(variational_loss) for _ in range(25)], axis=0) best_posterior_idx = np.argmin(avg_loss, axis=0).astype(np.int32) ``` #### References [1]: Alp Kucukelbir, Dustin Tran, Rajesh Ranganath, Andrew Gelman, and David M. Blei. Automatic Differentiation Variational Inference. In _Journal of Machine Learning Research_, 2017. https://arxiv.org/abs/1603.00788 """ with tf.name_scope(name or 'build_factored_variational_loss') as name: seed = tfp_util.SeedStream( seed, salt='StructuralTimeSeries_build_factored_variational_loss') variational_posterior = build_factored_surrogate_posterior( model, batch_shape=init_batch_shape, seed=seed()) # Multiple initializations (similar to HMC chains) manifest as an extra # param batch dimension, so we need to add corresponding batch dimension(s) # to `observed_time_series`. observed_time_series = sts_util.pad_batch_dimension_for_multiple_chains( observed_time_series, model, chain_batch_shape=init_batch_shape) loss = vi.monte_carlo_variational_loss( model.joint_log_prob(observed_time_series), surrogate_posterior=variational_posterior, sample_size=1, seed=seed()) ds, _ = variational_posterior._flat_sample_distributions() # pylint: disable=protected-access ds_dict = variational_posterior._model_unflatten(ds) # pylint: disable=protected-access variational_distributions = collections.OrderedDict([ (p.name, ds_dict[p.name]) for p in model.parameters]) return loss, variational_distributions