def delta_logp(logp, vars, shared):
    [logp0], inarray0 = pm.join_nonshared_inputs([logp], vars, shared)

    tensor_type = inarray0.type
    inarray1 = tensor_type("inarray1")

    logp1 = pm.CallableTensor(logp0)(inarray1)

    # Compile a function returning logp(inarray1) - logp(inarray0), i.e. the
    # change in log-probability between a proposed point and the current point.
    f = aesara.function([inarray1, inarray0], logp1 - logp0)
    f.trust_input = True
    return f
def delta_logp(point, logp, vars, shared):
    [logp0], inarray0 = pm.join_nonshared_inputs(point, [logp], vars, shared)

    tensor_type = inarray0.type
    inarray1 = tensor_type("inarray1")

    logp1 = pm.CallableTensor(logp0)(inarray1)

    # Compile a function returning logp(inarray1) - logp(inarray0), i.e. the
    # change in log-probability between a proposed point and the current point.
    f = compile_rv_inplace([inarray1, inarray0], logp1 - logp0)
    f.trust_input = True
    return f
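# Illustrative sketch (not part of the original module): how the compiled
# difference function returned by delta_logp can be used in a Metropolis-style
# accept/reject step. The helper name `_metropolis_accept_sketch` and its
# arguments are hypothetical; the acceptance rule is the standard comparison
# of the log acceptance ratio against a log-uniform draw.
def _metropolis_accept_sketch(delta_logp_func, q_proposal, q_current):
    """Return the proposed point if accepted, otherwise the current point."""
    import numpy as np

    # delta_logp_func(q_proposal, q_current) == logp(q_proposal) - logp(q_current)
    log_accept_ratio = delta_logp_func(q_proposal, q_current)
    if np.isfinite(log_accept_ratio) and np.log(np.random.uniform()) < log_accept_ratio:
        return q_proposal
    return q_current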
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
         optimizer=None, learning_rate=.001, epsilon=.1, random_seed=None):
    """Perform automatic differentiation variational inference (ADVI).

    This function implements mean-field ADVI, where the variational posterior
    distribution is assumed to be a spherical Gaussian without correlation
    between parameters and is fit to the true posterior distribution. The
    means and standard deviations of the variational posterior are referred
    to as variational parameters.

    The return value of this function is an :code:`ADVIFit` object, which
    holds the variational parameters. If you want to draw samples from the
    variational posterior, pass the :code:`ADVIFit` object to
    :code:`pymc3.variational.sample_vp()`.

    The variational parameters are defined on the transformed space, which is
    required to do ADVI on an unconstrained parameter space as described in
    [KTR+2016]. The parameters in the :code:`ADVIFit` object are in the
    transformed space, while traces returned by :code:`sample_vp()` are in
    the original space, as with the MCMC sampling methods in PyMC3.

    The variational parameters are optimized with the given optimizer, which
    is a function that returns a dictionary of parameter updates as provided
    to a Theano function. If no optimizer is provided, optimization is
    performed with a modified version of Adagrad, where only the last
    (n_window) gradient vectors are used to control the learning rate and
    older gradient vectors are ignored. n_window denotes the size of the time
    window and is fixed to 10.

    Parameters
    ----------
    vars : object
        Random variables.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    accurate_elbo : bool
        If True, 100 MC samples are used for accurate calculation of the ELBO.
    optimizer : (loss, tensor) -> dict or OrderedDict
        A function that returns parameter updates given loss and parameter
        tensor. If :code:`None` (default), a default Adagrad optimizer is
        used with parameters :code:`learning_rate` and :code:`epsilon` below.
    learning_rate : float
        Base learning rate for Adagrad. This parameter is ignored when an
        optimizer is given.
    epsilon : float
        Offset in the denominator of the learning-rate scale in Adagrad.
        This parameter is ignored when an optimizer is given.
    random_seed : int or None
        Seed to initialize random state. None uses the current seed.

    Returns
    -------
    ADVIFit
        Named tuple with fields 'means', 'stds', and 'elbo_vals'. 'means' are
        the variational means, 'stds' the standard deviations, and 'elbo_vals'
        the trace of ELBO values during optimization.

    References
    ----------
    .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and
        Blei, D. M. (2016). Automatic Differentiation Variational Inference.
        arXiv preprint arXiv:1603.00788.
    """
    model = pm.modelcontext(model)
    if start is None:
        start = model.test_point
    if vars is None:
        vars = model.vars
    vars = pm.inputvars(vars)

    if not pm.model.all_continuous(vars):
        raise ValueError('Model should not include discrete RVs for ADVI.')

    n_mcsamples = 100 if accurate_elbo else 1

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # Create variational gradient tensor
    elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples,
                              random_seed=random_seed)

    # Set starting values
    for var, share in shared.items():
        share.set_value(start[str(var)])

    order = pm.ArrayOrdering(vars)
    bij = pm.DictToArrayBijection(order, start)
    u_start = bij.map(start)
    w_start = np.zeros_like(u_start)
    uw = np.concatenate([u_start, w_start])

    # Create parameter update function used in the training loop
    uw_shared = theano.shared(uw, 'uw_shared')
    elbo = pm.CallableTensor(elbo)(uw_shared)
    updates = optimizer(loss=-1 * elbo, param=[uw_shared])
    f = theano.function([], [uw_shared, elbo], updates=updates)

    # Optimization loop
    elbos = np.empty(n)
    try:
        progress = trange(n)
        for i in progress:
            uw_i, e = f()
            elbos[i] = e
            if i % (n // 10) == 0 and i > 0:
                avg_elbo = elbos[i - n // 10:i].mean()
                progress.set_description('Average ELBO = {:,.5g}'.format(avg_elbo))
    except KeyboardInterrupt:
        elbos = elbos[:i]
        avg_elbo = elbos[i - n // 10:].mean()
        pm._log.info('Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.format(
            i, 100 * i // n, avg_elbo))
    else:
        avg_elbo = elbos[-n // 10:].mean()
        pm._log.info('Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo))

    # Estimated parameters
    l = int(uw_i.size / 2)
    u = bij.rmap(uw_i[:l])
    w = bij.rmap(uw_i[l:])
    # w is in log space
    for var in w.keys():
        w[var] = np.exp(w[var])

    return ADVIFit(u, w, elbos)
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
         optimizer=None, learning_rate=.001, epsilon=.1, mode=None,
         tol_obj=0.01, eval_elbo=100, random_seed=None, progressbar=True):
    """Perform automatic differentiation variational inference (ADVI).

    This function implements mean-field ADVI, where the variational posterior
    distribution is assumed to be a spherical Gaussian without correlation
    between parameters and is fit to the true posterior distribution. The
    means and standard deviations of the variational posterior are referred
    to as variational parameters.

    The return value of this function is an :code:`ADVIFit` object, which
    holds the variational parameters. If you want to draw samples from the
    variational posterior, pass the :code:`ADVIFit` object to
    :code:`pymc3.variational.sample_vp()`.

    The variational parameters are defined on the transformed space, which is
    required to do ADVI on an unconstrained parameter space as described in
    [KTR+2016]. The parameters in the :code:`ADVIFit` object are in the
    transformed space, while traces returned by :code:`sample_vp()` are in
    the original space, as with the MCMC sampling methods in PyMC3.

    The variational parameters are optimized with the given optimizer, which
    is a function that returns a dictionary of parameter updates as provided
    to a Theano function. If no optimizer is provided, optimization is
    performed with a modified version of Adagrad, where only the last
    (n_window) gradient vectors are used to control the learning rate and
    older gradient vectors are ignored. n_window denotes the size of the time
    window and is fixed to 10.

    Parameters
    ----------
    vars : object
        Random variables.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    accurate_elbo : bool
        If True, 100 MC samples are used for accurate calculation of the ELBO.
    optimizer : (loss, tensor) -> dict or OrderedDict
        A function that returns parameter updates given loss and parameter
        tensor. If :code:`None` (default), a default Adagrad optimizer is
        used with parameters :code:`learning_rate` and :code:`epsilon` below.
    learning_rate : float
        Base learning rate for Adagrad. This parameter is ignored when an
        optimizer is given.
    epsilon : float
        Offset in the denominator of the learning-rate scale in Adagrad.
        This parameter is ignored when an optimizer is given.
    tol_obj : float
        Relative tolerance for testing convergence of the ELBO.
    eval_elbo : int
        Window for checking convergence of the ELBO. Convergence is checked
        at every multiple of eval_elbo.
    random_seed : int or None
        Seed to initialize random state. None uses the current seed.
    mode : string or `Mode` instance
        Compilation mode passed to Theano functions.
    progressbar : bool
        Whether or not to display a progress bar in the command line. The bar
        shows the percentage of completion, the sampling speed in samples per
        second (SPS), the estimated remaining time until completion
        ("expected time of arrival"; ETA), and the current ELBO.

    Returns
    -------
    ADVIFit
        Named tuple with fields 'means', 'stds', and 'elbo_vals'. 'means' are
        the variational means, 'stds' the standard deviations, and 'elbo_vals'
        the trace of ELBO values during optimization.

    References
    ----------
    .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and
        Blei, D. M. (2016). Automatic Differentiation Variational Inference.
        arXiv preprint arXiv:1603.00788.
    """
    model = pm.modelcontext(model)
    if start is None:
        start = model.test_point
    if vars is None:
        vars = model.vars
    vars = pm.inputvars(vars)

    if len(vars) == 0:
        raise ValueError('No free random variables to fit.')

    if not pm.model.all_continuous(vars):
        raise ValueError('Model can not include discrete RVs for ADVI.')

    n_mcsamples = 100 if accurate_elbo else 1

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # Create variational gradient tensor
    elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples,
                              random_seed=random_seed)

    # Set starting values
    for var, share in shared.items():
        share.set_value(start[str(var)])

    order = pm.ArrayOrdering(vars)
    bij = pm.DictToArrayBijection(order, start)
    u_start = bij.map(start)
    w_start = np.zeros_like(u_start)
    uw = np.concatenate([u_start, w_start])

    # Create parameter update function used in the training loop
    uw_shared = theano.shared(uw, 'uw_shared')
    elbo = pm.CallableTensor(elbo)(uw_shared)
    updates = optimizer(loss=-1 * elbo, param=[uw_shared])
    f = theano.function([], [uw_shared, elbo], updates=updates, mode=mode)

    # For tracking convergence of ELBO
    window_size = int(max(0.1 * n // eval_elbo, 2.0))
    circ_buff = deque([], maxlen=window_size)

    # Optimization loop
    elbos = np.empty(n)
    divergence_flag = False
    progress = trange(n) if progressbar else range(n)
    try:
        uw_i, elbo_current = f()
        if np.isnan(elbo_current):
            raise FloatingPointError('NaN occurred in ADVI optimization.')
        for i in progress:
            uw_i, e = f()
            if np.isnan(e):
                raise FloatingPointError('NaN occurred in ADVI optimization.')
            elbos[i] = e
            if progressbar:
                if n < 10:
                    progress.set_description('ELBO = {:,.5g}'.format(elbos[i]))
                elif i % (n // 10) == 0 and i > 0:
                    avg_elbo = infmean(elbos[i - n // 10:i])
                    progress.set_description(
                        'Average ELBO = {:,.5g}'.format(avg_elbo))
            if i % eval_elbo == 0:
                elbo_prev = elbo_current
                elbo_current = elbos[i]
                delta_elbo = abs((elbo_current - elbo_prev) / elbo_prev)
                circ_buff.append(delta_elbo)
                avg_delta = np.mean(circ_buff)
                med_delta = np.median(circ_buff)
                if i > 0 and avg_delta < tol_obj:
                    pm._log.info('Mean ELBO converged.')
                    elbos = elbos[:(i + 1)]
                    break
                elif i > 0 and med_delta < tol_obj:
                    pm._log.info('Median ELBO converged.')
                    elbos = elbos[:(i + 1)]
                    break
                if i > 10 * eval_elbo:
                    if med_delta > 0.5 or avg_delta > 0.5:
                        divergence_flag = True
                    else:
                        divergence_flag = False
    except KeyboardInterrupt:
        elbos = elbos[:i]
        if n < 10:
            pm._log.info(
                'Interrupted at {:,d} [{:.0f}%]: ELBO = {:,.5g}'.format(
                    i, 100 * i // n, elbos[i]))
        else:
            avg_elbo = infmean(elbos[i - n // 10:i])
            pm._log.info(
                'Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.format(
                    i, 100 * i // n, avg_elbo))
    else:
        if n < 10:
            pm._log.info('Finished [100%]: ELBO = {:,.5g}'.format(elbos[-1]))
        else:
            avg_elbo = infmean(elbos[-n // 10:])
            pm._log.info(
                'Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo))
    finally:
        if progressbar:
            progress.close()

    if divergence_flag:
        pm._log.info('Evidence of divergence detected, inspect ELBO.')

    # Estimated parameters
    l = int(uw_i.size / 2)
    u = bij.rmap(uw_i[:l])
    w = bij.rmap(uw_i[l:])
    # w is in log space
    for var in w.keys():
        w[var] = np.exp(w[var])

    return ADVIFit(u, w, elbos)