def _make_elbo_t(observed_RVs, global_RVs, local_RVs, potentials,
                 n_mcsamples, random_seed):
    global_order = pm.ArrayOrdering([v for v in global_RVs])
    local_order = pm.ArrayOrdering([v for v in local_RVs])
    inarray_g, uw_g, replace_g, c_g = _join_global_RVs(global_RVs, global_order)
    inarray_l, uw_l, replace_l, c_l = _join_local_RVs(local_RVs, local_order)

    logpt = _make_logpt(global_RVs, local_RVs, observed_RVs, potentials)
    replace = replace_g
    replace.update(replace_l)
    logpt = theano.clone(logpt, replace, strict=False)

    elbo = _elbo_t(logpt, uw_g, uw_l, inarray_g, inarray_l, c_g, c_l,
                   n_mcsamples, random_seed)

    return elbo, uw_l, uw_g
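# For intuition only: a minimal NumPy sketch (an illustration, not the Theano
# graph built by the functions in this module) of the one-sample meanfield
# ELBO estimate. `logp` stands for any joint log-density over the flattened
# parameter vector; `u` and `w` are the variational means and log-stds.
import numpy as np

def _elbo_one_sample_sketch(logp, u, w, rng=np.random):
    # Reparameterization trick: z ~ N(u, exp(w)**2) via z = u + exp(w) * eps
    eps = rng.standard_normal(u.shape)
    z = u + np.exp(w) * eps
    # Entropy of a diagonal Gaussian with log-stds w
    entropy = 0.5 * u.size * (1.0 + np.log(2.0 * np.pi)) + np.sum(w)
    return logp(z) + entropy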
def _init_uw_global_shared(start, global_RVs):
    global_order = pm.ArrayOrdering([v for v in global_RVs])
    start = {v.name: start[v.name] for v in global_RVs}
    bij = pm.DictToArrayBijection(global_order, start)
    u_start = bij.map(start)
    # Log-stds start at 0, i.e., the initial posterior std is exp(0) = 1
    w_start = np.zeros_like(u_start)
    uw_start = floatX(np.concatenate([u_start, w_start]))
    uw_global_shared = theano.shared(uw_start, 'uw_global_shared')

    return uw_global_shared, bij
def __init__(self, model, observed):
    self.model = model
    self.observed = observed
    vars = pm.inputvars(model.cont_vars)

    bij = pm.DictToArrayBijection(pm.ArrayOrdering(vars), model.test_point)
    self.logp = bij.mapf(model.fastlogp)
    self.dlogp = bij.mapf(model.fastdlogp(vars))
    self.num_vars = len(vars)
def __init__(self, model):
    """
    Parameters
    ----------
    model : pymc3.Model
        The probability model, written with Theano shared variables to form
        any observations. The Theano shared variables are set during
        inference.
    """
    self.model = model
    vars = pm.inputvars(model.cont_vars)
    self.n_vars = len(vars)

    bij = pm.DictToArrayBijection(pm.ArrayOrdering(vars), model.test_point)
    self.logp = bij.mapf(model.fastlogp)
    self.dlogp = bij.mapf(model.fastdlogp(vars))
def __init__(self, model):
    """
    Parameters
    ----------
    model : pymc3.Model
        The probability model, written with Theano shared variables to form
        any observations and with `transform=None` for any latent variables.
        The Theano shared variables are set during inference, and all latent
        variables live on their original (constrained) space.
    """
    self.model = model
    self.n_vars = None
    vars = pm.inputvars(model.cont_vars)

    bij = pm.DictToArrayBijection(pm.ArrayOrdering(vars), model.test_point)
    self.logp = bij.mapf(model.fastlogp)
    self.dlogp = bij.mapf(model.fastdlogp(vars))
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
         optimizer=None, learning_rate=.001, epsilon=.1, random_seed=None):
    """Perform automatic differentiation variational inference (ADVI).

    This function implements meanfield ADVI, where the variational
    posterior distribution is assumed to be a spherical Gaussian without
    correlation between parameters and is fit to the true posterior
    distribution. The means and standard deviations of the variational
    posterior are referred to as variational parameters.

    The return value of this function is an :code:`ADVIFit` object, which
    holds the variational parameters. If you want to draw samples from the
    variational posterior, pass the :code:`ADVIFit` object to
    :code:`pymc3.variational.sample_vp()`.

    The variational parameters are defined on the transformed space, which
    is required to do ADVI on an unconstrained parameter space as described
    in [KTR+2016]. The parameters in the :code:`ADVIFit` object are in the
    transformed space, while traces returned by :code:`sample_vp()` are in
    the original space as obtained by MCMC sampling methods in PyMC3.

    The variational parameters are optimized with the given optimizer, which
    is a function that returns a dictionary of parameter updates as provided
    to a Theano function. If no optimizer is provided, optimization is
    performed with a modified version of Adagrad, where only the last
    n_window gradient vectors are used to control the learning rate and
    older gradient vectors are ignored. n_window denotes the size of the
    time window and is fixed to 10.

    Parameters
    ----------
    vars : object
        Random variables.
    start : dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    accurate_elbo : bool
        If true, 100 MC samples are used for accurate calculation of ELBO.
    optimizer : (loss, tensor) -> dict or OrderedDict
        A function that returns parameter updates given loss and parameter
        tensor. If :code:`None` (default), a default Adagrad optimizer is
        used with parameters :code:`learning_rate` and :code:`epsilon`
        below.
    learning_rate : float
        Base learning rate for Adagrad. This parameter is ignored when an
        optimizer is given.
    epsilon : float
        Offset in the denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when an optimizer is given.
    random_seed : int or None
        Seed to initialize random state. None uses current seed.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.
        'means' is the mean, 'stds' is the standard deviation, and
        'elbo_vals' is the trace of ELBO values during optimization.

    References
    ----------
    .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and
        Blei, D. M. (2016). Automatic Differentiation Variational
        Inference. arXiv preprint arXiv:1603.00788.
""" model = pm.modelcontext(model) if start is None: start = model.test_point if vars is None: vars = model.vars vars = pm.inputvars(vars) if not pm.model.all_continuous(vars): raise ValueError('Model should not include discrete RVs for ADVI.') n_mcsamples = 100 if accurate_elbo else 1 # Prepare optimizer if optimizer is None: optimizer = adagrad_optimizer(learning_rate, epsilon) # Create variational gradient tensor elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples, random_seed=random_seed) # Set starting values for var, share in shared.items(): share.set_value(start[str(var)]) order = pm.ArrayOrdering(vars) bij = pm.DictToArrayBijection(order, start) u_start = bij.map(start) w_start = np.zeros_like(u_start) uw = np.concatenate([u_start, w_start]) # Create parameter update function used in the training loop uw_shared = theano.shared(uw, 'uw_shared') elbo = pm.CallableTensor(elbo)(uw_shared) updates = optimizer(loss=-1 * elbo, param=[uw_shared]) f = theano.function([], [uw_shared, elbo], updates=updates) # Optimization loop elbos = np.empty(n) try: progress = trange(n) for i in progress: uw_i, e = f() elbos[i] = e if i % (n // 10) == 0 and i > 0: avg_elbo = elbos[i - n // 10:i].mean() progress.set_description('Average ELBO = {:,.5g}'.format(avg_elbo)) except KeyboardInterrupt: elbos = elbos[:i] avg_elbo = elbos[i - n // 10:].mean() pm._log.info('Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.format( i, 100 * i // n, avg_elbo)) else: avg_elbo = elbos[-n // 10:].mean() pm._log.info('Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo)) # Estimated parameters l = int(uw_i.size / 2) u = bij.rmap(uw_i[:l]) w = bij.rmap(uw_i[l:]) # w is in log space for var in w.keys(): w[var] = np.exp(w[var]) return ADVIFit(u, w, elbos)
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
         optimizer=None, learning_rate=.001, epsilon=.1, mode=None,
         tol_obj=0.01, eval_elbo=100, random_seed=None, progressbar=True):
    """Perform automatic differentiation variational inference (ADVI).

    This function implements meanfield ADVI, where the variational
    posterior distribution is assumed to be a spherical Gaussian without
    correlation between parameters and is fit to the true posterior
    distribution. The means and standard deviations of the variational
    posterior are referred to as variational parameters.

    The return value of this function is an :code:`ADVIFit` object, which
    holds the variational parameters. If you want to draw samples from the
    variational posterior, pass the :code:`ADVIFit` object to
    :code:`pymc3.variational.sample_vp()`.

    The variational parameters are defined on the transformed space, which
    is required to do ADVI on an unconstrained parameter space as described
    in [KTR+2016]. The parameters in the :code:`ADVIFit` object are in the
    transformed space, while traces returned by :code:`sample_vp()` are in
    the original space as obtained by MCMC sampling methods in PyMC3.

    The variational parameters are optimized with the given optimizer, which
    is a function that returns a dictionary of parameter updates as provided
    to a Theano function. If no optimizer is provided, optimization is
    performed with a modified version of Adagrad, where only the last
    n_window gradient vectors are used to control the learning rate and
    older gradient vectors are ignored. n_window denotes the size of the
    time window and is fixed to 10.

    Parameters
    ----------
    vars : object
        Random variables.
    start : dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    accurate_elbo : bool
        If true, 100 MC samples are used for accurate calculation of ELBO.
    optimizer : (loss, tensor) -> dict or OrderedDict
        A function that returns parameter updates given loss and parameter
        tensor. If :code:`None` (default), a default Adagrad optimizer is
        used with parameters :code:`learning_rate` and :code:`epsilon`
        below.
    learning_rate : float
        Base learning rate for Adagrad. This parameter is ignored when an
        optimizer is given.
    epsilon : float
        Offset in the denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when an optimizer is given.
    tol_obj : float
        Relative tolerance for testing convergence of ELBO.
    eval_elbo : int
        Window for checking convergence of ELBO. Convergence is checked at
        every multiple of eval_elbo.
    random_seed : int or None
        Seed to initialize random state. None uses current seed.
    mode : string or `Mode` instance
        Compilation mode passed to Theano functions.
    progressbar : bool
        Whether or not to display a progress bar in the command line. The
        bar shows the percentage of completion, the sampling speed in
        samples per second (SPS), the estimated remaining time until
        completion ("expected time of arrival"; ETA), and the current ELBO.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.
        'means' is the mean, 'stds' is the standard deviation, and
        'elbo_vals' is the trace of ELBO values during optimization.

    References
    ----------
    .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and
        Blei, D. M. (2016). Automatic Differentiation Variational
        Inference. arXiv preprint arXiv:1603.00788.
""" model = pm.modelcontext(model) if start is None: start = model.test_point if vars is None: vars = model.vars vars = pm.inputvars(vars) if len(vars) == 0: raise ValueError('No free random variables to fit.') if not pm.model.all_continuous(vars): raise ValueError('Model can not include discrete RVs for ADVI.') n_mcsamples = 100 if accurate_elbo else 1 # Prepare optimizer if optimizer is None: optimizer = adagrad_optimizer(learning_rate, epsilon) # Create variational gradient tensor elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples, random_seed=random_seed) # Set starting values for var, share in shared.items(): share.set_value(start[str(var)]) order = pm.ArrayOrdering(vars) bij = pm.DictToArrayBijection(order, start) u_start = bij.map(start) w_start = np.zeros_like(u_start) uw = np.concatenate([u_start, w_start]) # Create parameter update function used in the training loop uw_shared = theano.shared(uw, 'uw_shared') elbo = pm.CallableTensor(elbo)(uw_shared) updates = optimizer(loss=-1 * elbo, param=[uw_shared]) f = theano.function([], [uw_shared, elbo], updates=updates, mode=mode) # For tracking convergence of ELBO window_size = int(max(0.1 * n // eval_elbo, 2.0)) circ_buff = deque([], maxlen=window_size) # Optimization loop elbos = np.empty(n) divergence_flag = False progress = trange(n) if progressbar else range(n) try: uw_i, elbo_current = f() if np.isnan(elbo_current): raise FloatingPointError('NaN occurred in ADVI optimization.') for i in progress: uw_i, e = f() if np.isnan(e): raise FloatingPointError('NaN occurred in ADVI optimization.') elbos[i] = e if progressbar: if n < 10: progress.set_description('ELBO = {:,.5g}'.format(elbos[i])) elif i % (n // 10) == 0 and i > 0: avg_elbo = infmean(elbos[i - n // 10:i]) progress.set_description( 'Average ELBO = {:,.5g}'.format(avg_elbo)) if i % eval_elbo == 0: elbo_prev = elbo_current elbo_current = elbos[i] delta_elbo = abs((elbo_current - elbo_prev) / elbo_prev) circ_buff.append(delta_elbo) avg_delta = np.mean(circ_buff) med_delta = np.median(circ_buff) if i > 0 and avg_delta < tol_obj: pm._log.info('Mean ELBO converged.') elbos = elbos[:(i + 1)] break elif i > 0 and med_delta < tol_obj: pm._log.info('Median ELBO converged.') elbos = elbos[:(i + 1)] break if i > 10 * eval_elbo: if med_delta > 0.5 or avg_delta > 0.5: divergence_flag = True else: divergence_flag = False except KeyboardInterrupt: elbos = elbos[:i] if n < 10: pm._log.info( 'Interrupted at {:,d} [{:.0f}%]: ELBO = {:,.5g}'.format( i, 100 * i // n, elbos[i])) else: avg_elbo = infmean(elbos[i - n // 10:i]) pm._log.info( 'Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'. format(i, 100 * i // n, avg_elbo)) else: if n < 10: pm._log.info('Finished [100%]: ELBO = {:,.5g}'.format(elbos[-1])) else: avg_elbo = infmean(elbos[-n // 10:]) pm._log.info( 'Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo)) finally: if progressbar: progress.close() if divergence_flag: pm._log.info('Evidence of divergence detected, inspect ELBO.') # Estimated parameters l = int(uw_i.size / 2) u = bij.rmap(uw_i[:l]) w = bij.rmap(uw_i[l:]) # w is in log space for var in w.keys(): w[var] = np.exp(w[var]) return ADVIFit(u, w, elbos)
def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1,
                   minibatch_RVs=None, minibatch_tensors=None,
                   minibatches=None, local_RVs=None, observed_RVs=None,
                   encoder_params=None, total_size=None, optimizer=None,
                   learning_rate=.001, epsilon=.1, random_seed=None):
    """Perform mini-batch ADVI.

    This function implements mini-batch ADVI with the meanfield
    approximation. Autoencoding variational inference is also supported.

    The log probability terms for mini-batches, corresponding to RVs in
    minibatch_RVs, are scaled to (total_size) / (the number of samples in
    each mini-batch), where total_size is an argument for the total data
    size.

    minibatch_tensors is a list of tensors (can be shared variables) to
    which mini-batch samples are set during the optimization. In most cases,
    these tensors are observations for RVs in the model.

    local_RVs and observed_RVs are used for autoencoding variational Bayes.
    Both sets of RVs are associated with each of the given samples. The
    difference is that local_RVs are unknown and their posterior
    distributions are approximated.

    local_RVs is an OrderedDict whose keys are RVs and whose values are
    pairs of two objects. The first is the Theano expression of the
    variational parameters (mean and log of std) of the approximate
    posterior, which are encoded from the given samples by an arbitrary
    deterministic function, e.g., an MLP. The second is a scaling constant
    by which the log probability term corresponding to the RV is multiplied.

    observed_RVs is also an OrderedDict with RVs as keys, but its values are
    only the scaling constants as in local_RVs. In this case, total_size is
    ignored.

    If local_RVs is None (thus not using an autoencoder), the following two
    settings are equivalent:

    - observed_RVs=OrderedDict([(rv, total_size / minibatch_size)])
    - minibatch_RVs=[rv], total_size=total_size

    where minibatch_size is minibatch_tensors[0].shape[0].

    The variational parameters and the parameters of the autoencoder are
    simultaneously optimized with the given optimizer, which is a function
    that returns a dictionary of parameter updates as provided to a Theano
    function. See the docstring of pymc3.variational.advi().

    Parameters
    ----------
    vars : object
        List of random variables. If None, variational posteriors (normal
        distributions) are fit for all RVs in the given model.
    start : dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    n_mcsamples : int
        Number of Monte Carlo samples to approximate ELBO.
    minibatch_RVs : list of ObservedRVs
        Random variables in the model for which mini-batch tensors are set.
        When this argument is given, both of the arguments local_RVs and
        observed_RVs must be None.
    minibatch_tensors : list of (tensors or shared variables)
        Tensors used to create ObservedRVs in minibatch_RVs.
    minibatches : generator of list
        Generates a set of minibatches when calling next(). The length of
        the returned list must equal the number of random variables in
        `minibatch_tensors`.
    total_size : int
        Total size of training samples. This is used to appropriately scale
        the log likelihood terms corresponding to mini-batches in ELBO.
    local_RVs : OrderedDict
        Include encoded variational parameters and a scaling constant for
        the corresponding RV. See the above description.
    observed_RVs : OrderedDict
        Include a scaling constant for the corresponding RV. See the above
        description.
    encoder_params : list of theano shared variables
        Parameters of the encoder.
    optimizer : (loss, list of shared variables) -> dict or OrderedDict
        A function that returns parameter updates given loss and shared
        variables of parameters. If :code:`None` (default), a default
        Adagrad optimizer is used with parameters :code:`learning_rate` and
        :code:`epsilon` below.
    learning_rate : float
        Base learning rate for Adagrad. This parameter is ignored when an
        optimizer is given.
    epsilon : float
        Offset in the denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when an optimizer is given.
    random_seed : int
        Seed to initialize random state.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.
    """
    theano.config.compute_test_value = 'ignore'

    model = pm.modelcontext(model)
    vars = inputvars(vars if vars is not None else model.vars)
    start = start if start is not None else model.test_point
    check_discrete_rvs(vars)
    _check_minibatches(minibatch_tensors, minibatches)

    if encoder_params is None:
        encoder_params = []

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # For backward compatibility in how input arguments are given
    local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs,
                                        observed_RVs, minibatch_tensors,
                                        total_size)

    # Replace local_RVs with transformed variables
    ds = model.deterministics

    def get_transformed(v):
        if v in ds:
            return v.transformed
        return v
    local_RVs = OrderedDict([(get_transformed(v), (uw, s))
                             for v, (uw, s) in local_RVs.items()])

    # Get global variables
    global_RVs = list(set(vars) - set(list(local_RVs) + list(observed_RVs)))

    # Ordering for concatenation of random variables
    global_order = pm.ArrayOrdering([v for v in global_RVs])
    local_order = pm.ArrayOrdering([v for v in local_RVs])

    # ELBO wrt variational parameters
    inarray_g, uw_g, replace_g = _join_global_RVs(global_RVs, global_order)
    inarray_l, uw_l, replace_l = _join_local_RVs(local_RVs, local_order)
    logpt = _make_logpt(global_RVs, local_RVs, observed_RVs, model)
    replace = replace_g
    replace.update(replace_l)
    logp = theano.clone(logpt, replace, strict=False)
    elbo = _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l,
                   n_mcsamples, random_seed)
    del logpt

    # Replacement tensors of variational parameters in the graph
    replaces = dict()

    # Variational parameters for global RVs
    if 0 < len(global_RVs):
        uw_global_shared, bij = _init_uw_global_shared(start, global_RVs,
                                                       global_order)
        replaces.update({uw_g: uw_global_shared})

    # Variational parameters for local RVs, encoded from samples in
    # mini-batches
    if 0 < len(local_RVs):
        uws = [uw for _, (uw, _) in local_RVs.items()]
        uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] +
                                          [uw[1].ravel() for uw in uws])
        replaces.update({uw_l: uw_local_encoded})

    # Replace tensors of variational parameters in ELBO
    elbo = theano.clone(elbo, OrderedDict(replaces), strict=False)

    # Replace input shared variables with tensors
    def is_shared(t):
        return isinstance(t, theano.compile.sharedvalue.SharedVariable)
    tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors]
    updates = OrderedDict(
        {t: t_ for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)})
    elbo = theano.clone(elbo, updates, strict=False)

    # Create parameter update function used in the training loop
    params = list(encoder_params)  # copy to avoid mutating the caller's list
    if 0 < len(global_RVs):
        params += [uw_global_shared]
    updates = OrderedDict(optimizer(loss=-1 * elbo, param=params))
    f = theano.function(tensors, elbo, updates=updates)

    # Optimization loop
    elbos = np.empty(n)
    progress = tqdm.trange(n)
    for i in progress:
        e = f(*next(minibatches))
        elbos[i] = e
        if i % (n // 10) == 0 and i > 0:
            avg_elbo = elbos[i - n // 10:i].mean()
            progress.set_description('Average ELBO = {:,.2f}'.format(avg_elbo))

    pm._log.info('Finished minibatch ADVI: ELBO = {:,.2f}'.format(elbos[-1]))

    # Variational parameters of global RVs
    if 0 < len(global_RVs):
        l = int(uw_global_shared.get_value(borrow=True).size / 2)
        u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l])
        w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:])
        # w is in log space
        for var in w.keys():
            w[var] = np.exp(w[var])
    else:
        u = dict()
        w = dict()

    return ADVIFit(u, w, elbos)
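# A minimal usage sketch with assumed toy data: `x_shared` receives one
# mini-batch per iteration, and the likelihood is rescaled by
# total_size / minibatch_size as described in the docstring. The data and
# the generator are illustrative assumptions.
import numpy as np
import pymc3 as pm
import theano

data = np.random.randn(1000).astype(theano.config.floatX)
batch_size = 100
x_shared = theano.shared(data[:batch_size])

def minibatch_gen():
    while True:
        idx = np.random.choice(len(data), batch_size, replace=False)
        yield [data[idx]]

with pm.Model() as model_:
    mu = pm.Normal('mu', mu=0., sd=10.)
    x = pm.Normal('x', mu=mu, sd=1., observed=x_shared)
    fit = pm.variational.advi_minibatch(
        n=3000, minibatch_tensors=[x_shared], minibatch_RVs=[x],
        minibatches=minibatch_gen(), total_size=len(data))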
def bijection(self):
    return pm.DictToArrayBijection(
        pm.ArrayOrdering(pm.inputvars(self.model.cont_vars)),
        self.model.test_point)
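# Illustrative round trip through the bijection (hypothetical instance `h`
# of the enclosing class):
#
#     bij = h.bijection()
#     arr = bij.map(h.model.test_point)   # point dict -> flat ndarray
#     point = bij.rmap(arr)               # flat ndarray -> point dict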