def create_prior(K, a_p=1, b_p=1, a_gamma=1, b_gamma=1, m_loc=0, g_loc=0.1,
                 m_sigma=3, s_sigma=2, m_nu=0, s_nu=1, m_skew=0, g_skew=0.1,
                 dtype=np.float64):
    return tfd.JointDistributionNamed(dict(
        p=tfd.Beta(dtype(a_p), dtype(b_p)),
        gamma_C=tfd.Gamma(dtype(a_gamma), dtype(b_gamma)),
        gamma_T=tfd.Gamma(dtype(a_gamma), dtype(b_gamma)),
        eta_C=tfd.Dirichlet(tf.ones(K, dtype=dtype) / K),
        eta_T=tfd.Dirichlet(tf.ones(K, dtype=dtype) / K),
        nu=tfd.Sample(tfd.LogNormal(dtype(m_nu), dtype(s_nu)), sample_shape=K),
        sigma_sq=tfd.Sample(tfd.InverseGamma(dtype(m_sigma), dtype(s_sigma)),
                            sample_shape=K),
        loc=lambda sigma_sq: tfd.Independent(
            tfd.Normal(dtype(m_loc), g_loc * tf.sqrt(sigma_sq)),
            reinterpreted_batch_ndims=1),
        skew=lambda sigma_sq: tfd.Independent(
            tfd.Normal(dtype(m_skew), g_skew * tf.sqrt(sigma_sq)),
            reinterpreted_batch_ndims=1),
    ))
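# Minimal usage sketch (assumes `np`, `tf`, and `tfd` are imported as in the
# rest of this file): draw once from the prior and score the draw.
prior = create_prior(K=5)
prior_draw = prior.sample()            # dict keyed by parameter name
prior_lp = prior.log_prob(prior_draw)  # scalar joint log-density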
def create_dp_sb_gmm(nobs, K, dtype=np.float64):
    return tfd.JointDistributionNamed(dict(
        # Mixture means
        mu=tfd.Independent(tfd.Normal(np.zeros(K, dtype), 3),
                           reinterpreted_batch_ndims=1),
        # Mixture scales
        sigma=tfd.Independent(tfd.LogNormal(loc=np.full(K, -2, dtype), scale=0.5),
                              reinterpreted_batch_ndims=1),
        # Mixture weights (stick-breaking construction)
        alpha=tfd.Gamma(concentration=dtype(1.0), rate=dtype(10.0)),
        v=lambda alpha: tfd.Independent(
            # NOTE: Dave Moore suggests doing this instead, to ensure
            # that a batch dimension in alpha doesn't conflict with
            # the other parameters.
            tfd.Beta(np.ones(K - 1, dtype), alpha[..., tf.newaxis]),
            reinterpreted_batch_ndims=1),
        # Observations (likelihood)
        obs=lambda mu, sigma, v: tfd.Sample(
            tfd.MixtureSameFamily(
                # The component indicator is marginalized over.
                mixture_distribution=tfd.Categorical(probs=stickbreak(v)),
                components_distribution=tfd.Normal(mu, sigma)),
            sample_shape=nobs)))
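# `stickbreak` is assumed to be defined elsewhere in this file. A minimal
# sketch of the transform it is expected to implement, mapping K-1
# stick-breaking fractions in (0, 1) to K mixture weights that sum to 1;
# the name `stickbreak_sketch` is hypothetical:
def stickbreak_sketch(v):
    batch_ndims = len(v.shape) - 1
    cumprod_one_minus_v = tf.math.cumprod(1 - v, axis=-1)
    one_v = tf.pad(v, [[0, 0]] * batch_ndims + [[0, 1]], constant_values=1)
    c_one = tf.pad(cumprod_one_minus_v,
                   [[0, 0]] * batch_ndims + [[1, 0]], constant_values=1)
    # weights: [v1, v2(1-v1), v3(1-v1)(1-v2), ..., prod_k (1-vk)]
    return one_v * c_one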
def create_model(n_C, n_T, K, neg_inf=-10, dtype=np.float64):
    return tfd.JointDistributionNamed(dict(
        p=tfd.Beta(dtype(1), dtype(1)),
        gamma_C=tfd.Gamma(dtype(3), dtype(3)),
        gamma_T=tfd.Gamma(dtype(3), dtype(3)),
        eta_C=tfd.Dirichlet(tf.ones(K, dtype=dtype) / K),
        eta_T=tfd.Dirichlet(tf.ones(K, dtype=dtype) / K),
        loc=tfd.Sample(tfd.Normal(dtype(0), dtype(1)), sample_shape=K),
        sigma_sq=tfd.Sample(tfd.InverseGamma(dtype(3), dtype(2)),
                            sample_shape=K),
        y_C=lambda gamma_C, eta_C, loc, sigma_sq: mix(
            gamma_C, eta_C, loc, tf.sqrt(sigma_sq), dtype(neg_inf), n_C),
        y_T=lambda gamma_C, gamma_T, eta_C, eta_T, p, loc, sigma_sq: mix_T(
            gamma_C, gamma_T, eta_C, eta_T, p, loc, tf.sqrt(sigma_sq),
            dtype(neg_inf), n_T)))
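# Hedged sketch (not from the original): conditioning the joint on observed
# outcomes to form an unnormalized posterior for MCMC. `y_C_obs` and
# `y_T_obs` are hypothetical placeholders for the observed data arrays, and
# `mix`/`mix_T` are assumed to be defined elsewhere in this file.
model = create_model(n_C=len(y_C_obs), n_T=len(y_T_obs), K=5)

def target_log_prob_fn(p, gamma_C, gamma_T, eta_C, eta_T, loc, sigma_sq):
    return model.log_prob(p=p, gamma_C=gamma_C, gamma_T=gamma_T,
                          eta_C=eta_C, eta_T=eta_T, loc=loc,
                          sigma_sq=sigma_sq, y_C=y_C_obs, y_T=y_T_obs)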
def create_distributions(self):
    """Create prior and surrogate distribution objects."""
    self.bijectors = {
        'u': tfb.Softplus(),
        'v': tfb.Softplus(),
        'u_eta': tfb.Softplus(),
        'u_tau': tfb.Softplus(),
        's': tfb.Softplus(),
        's_eta': tfb.Softplus(),
        's_tau': tfb.Softplus(),
        'w': tfb.Softplus()
    }
    symmetry_breaking_decay = self.symmetry_breaking_decay**tf.cast(
        tf.range(self.latent_dim), self.dtype)[tf.newaxis, ...]

    distribution_dict = {
        'v': tfd.Independent(
            tfd.HalfNormal(scale=0.1 * tf.ones(
                (self.latent_dim, self.feature_dim), dtype=self.dtype)),
            reinterpreted_batch_ndims=2),
        'w': tfd.Independent(
            tfd.HalfNormal(scale=tf.ones(
                (1, self.feature_dim), dtype=self.dtype)),
            reinterpreted_batch_ndims=2)
    }
    if self.horseshoe_plus:
        distribution_dict = {
            **distribution_dict,
            'u': lambda u_eta, u_tau: tfd.Independent(
                tfd.HalfNormal(scale=u_eta * u_tau * symmetry_breaking_decay),
                reinterpreted_batch_ndims=2),
            'u_eta': tfd.Independent(
                tfd.HalfCauchy(
                    loc=tf.zeros((self.feature_dim, self.latent_dim),
                                 dtype=self.dtype),
                    scale=tf.ones((self.feature_dim, self.latent_dim),
                                  dtype=self.dtype)),
                reinterpreted_batch_ndims=2),
            'u_tau': tfd.Independent(
                tfd.HalfCauchy(
                    loc=tf.zeros((1, self.latent_dim), dtype=self.dtype),
                    scale=tf.ones((1, self.latent_dim),
                                  dtype=self.dtype) * self.u_tau_scale),
                reinterpreted_batch_ndims=2),
        }
        distribution_dict['s'] = lambda s_eta, s_tau: tfd.Independent(
            tfd.HalfNormal(scale=s_eta * s_tau),
            reinterpreted_batch_ndims=2)
        distribution_dict['s_eta'] = tfd.Independent(
            tfd.HalfCauchy(
                loc=tf.zeros((2, self.feature_dim), dtype=self.dtype),
                scale=tf.ones((2, self.feature_dim), dtype=self.dtype)),
            reinterpreted_batch_ndims=2)
        distribution_dict['s_tau'] = tfd.Independent(
            tfd.HalfCauchy(
                loc=tf.zeros((1, self.feature_dim), dtype=self.dtype),
                scale=tf.ones((1, self.feature_dim),
                              dtype=self.dtype) * self.s_tau_scale),
            reinterpreted_batch_ndims=2)

        self.bijectors['u_eta_a'] = tfb.Softplus()
        self.bijectors['u_tau_a'] = tfb.Softplus()
        self.bijectors['s_eta_a'] = tfb.Softplus()
        self.bijectors['s_tau_a'] = tfb.Softplus()

        distribution_dict['u_eta'] = lambda u_eta_a: tfd.Independent(
            SqrtInverseGamma(
                concentration=0.5 * tf.ones(
                    (self.feature_dim, self.latent_dim), dtype=self.dtype),
                scale=1.0 / u_eta_a),
            reinterpreted_batch_ndims=2)
        distribution_dict['u_eta_a'] = tfd.Independent(
            tfd.InverseGamma(
                concentration=0.5 * tf.ones(
                    (self.feature_dim, self.latent_dim), dtype=self.dtype),
                scale=tf.ones((self.feature_dim, self.latent_dim),
                              dtype=self.dtype)),
            reinterpreted_batch_ndims=2)
        distribution_dict['u_tau'] = lambda u_tau_a: tfd.Independent(
            SqrtInverseGamma(
                concentration=0.5 * tf.ones(
                    (1, self.latent_dim), dtype=self.dtype),
                scale=1.0 / u_tau_a),
            reinterpreted_batch_ndims=2)
        distribution_dict['u_tau_a'] = tfd.Independent(
            tfd.InverseGamma(
                concentration=0.5 * tf.ones(
                    (1, self.latent_dim), dtype=self.dtype),
                scale=tf.ones((1, self.latent_dim),
                              dtype=self.dtype) / self.u_tau_scale**2),
            reinterpreted_batch_ndims=2)
        distribution_dict['s_eta'] = lambda s_eta_a: tfd.Independent(
            SqrtInverseGamma(
                concentration=0.5 * tf.ones(
                    (2, self.feature_dim), dtype=self.dtype),
                scale=1.0 / s_eta_a),
            reinterpreted_batch_ndims=2)
        distribution_dict['s_eta_a'] = tfd.Independent(
            tfd.InverseGamma(
                concentration=0.5 * tf.ones(
                    (2, self.feature_dim), dtype=self.dtype),
                scale=tf.ones((2, self.feature_dim), dtype=self.dtype)),
            reinterpreted_batch_ndims=2)
        distribution_dict['s_tau'] = lambda s_tau_a: tfd.Independent(
            SqrtInverseGamma(
                concentration=0.5 * tf.ones(
                    (1, self.feature_dim), dtype=self.dtype),
                scale=1.0 / s_tau_a),
            reinterpreted_batch_ndims=2)
        distribution_dict['s_tau_a'] = tfd.Independent(
            tfd.InverseGamma(
                concentration=0.5 * tf.ones(
                    (1, self.feature_dim), dtype=self.dtype),
                scale=tf.ones((1, self.feature_dim),
                              dtype=self.dtype) / self.s_tau_scale**2),
            reinterpreted_batch_ndims=2)
    else:
        distribution_dict = {
            **distribution_dict,
            'u': tfd.Independent(
                AbsHorseshoe(
                    scale=(self.u_tau_scale * symmetry_breaking_decay *
                           tf.ones((self.feature_dim, self.latent_dim),
                                   dtype=self.dtype))),
                reinterpreted_batch_ndims=2),
            's': tfd.Independent(
                AbsHorseshoe(scale=self.s_tau_scale * tf.ones(
                    (1, self.feature_dim), dtype=self.dtype)),
                reinterpreted_batch_ndims=2)
        }

    self.prior_distribution = tfd.JointDistributionNamed(distribution_dict)

    surrogate_dict = {
        'v': self.bijectors['v'](build_trainable_normal_dist(
            -6. * tf.ones((self.latent_dim, self.feature_dim),
                          dtype=self.dtype),
            5e-4 * tf.ones((self.latent_dim, self.feature_dim),
                           dtype=self.dtype),
            2, strategy=self.strategy)),
        'w': self.bijectors['w'](build_trainable_normal_dist(
            -6. * tf.ones((1, self.feature_dim), dtype=self.dtype),
            5e-4 * tf.ones((1, self.feature_dim), dtype=self.dtype),
            2, strategy=self.strategy))
    }
    if self.horseshoe_plus:
        surrogate_dict = {
            **surrogate_dict,
            'u': self.bijectors['u'](build_trainable_normal_dist(
                -6. * tf.ones((self.feature_dim, self.latent_dim),
                              dtype=self.dtype),
                5e-4 * tf.ones((self.feature_dim, self.latent_dim),
                               dtype=self.dtype),
                2, strategy=self.strategy)),
            'u_eta': self.bijectors['u_eta'](build_trainable_InverseGamma_dist(
                3 * tf.ones((self.feature_dim, self.latent_dim),
                            dtype=self.dtype),
                tf.ones((self.feature_dim, self.latent_dim),
                        dtype=self.dtype),
                2, strategy=self.strategy)),
            'u_tau': self.bijectors['u_tau'](build_trainable_InverseGamma_dist(
                3 * tf.ones((1, self.latent_dim), dtype=self.dtype),
                tf.ones((1, self.latent_dim), dtype=self.dtype),
                2, strategy=self.strategy)),
        }
        surrogate_dict['s_eta'] = self.bijectors['s_eta'](
            build_trainable_InverseGamma_dist(
                tf.ones((2, self.feature_dim), dtype=self.dtype),
                tf.ones((2, self.feature_dim), dtype=self.dtype),
                2, strategy=self.strategy))
        surrogate_dict['s_tau'] = self.bijectors['s_tau'](
            build_trainable_InverseGamma_dist(
                tf.ones((1, self.feature_dim), dtype=self.dtype),
                tf.ones((1, self.feature_dim), dtype=self.dtype),
                2, strategy=self.strategy))
        surrogate_dict['s'] = self.bijectors['s'](
            build_trainable_normal_dist(
                tf.ones((2, self.feature_dim), dtype=self.dtype) *
                tf.cast([[-2.], [-1.]], dtype=self.dtype),
                1e-3 * tf.ones((2, self.feature_dim), dtype=self.dtype),
                2, strategy=self.strategy))

        self.bijectors['u_eta_a'] = tfb.Softplus()
        self.bijectors['u_tau_a'] = tfb.Softplus()
        surrogate_dict['u_eta_a'] = self.bijectors['u_eta_a'](
            build_trainable_InverseGamma_dist(
                2. * tf.ones((self.feature_dim, self.latent_dim),
                             dtype=self.dtype),
                tf.ones((self.feature_dim, self.latent_dim),
                        dtype=self.dtype),
                2, strategy=self.strategy))
        surrogate_dict['u_tau_a'] = self.bijectors['u_tau_a'](
            build_trainable_InverseGamma_dist(
                2. * tf.ones((1, self.latent_dim), dtype=self.dtype),
                tf.ones((1, self.latent_dim),
                        dtype=self.dtype) / self.u_tau_scale**2,
                2, strategy=self.strategy))

        self.bijectors['s_eta_a'] = tfb.Softplus()
        self.bijectors['s_tau_a'] = tfb.Softplus()
        surrogate_dict['s_eta_a'] = self.bijectors['s_eta_a'](
            build_trainable_InverseGamma_dist(
                2. * tf.ones((2, self.feature_dim), dtype=self.dtype),
                tf.ones((2, self.feature_dim), dtype=self.dtype),
                2, strategy=self.strategy))
        surrogate_dict['s_tau_a'] = self.bijectors['s_tau_a'](
            build_trainable_InverseGamma_dist(
                2. * tf.ones((1, self.feature_dim), dtype=self.dtype),
                tf.ones((1, self.feature_dim),
                        dtype=self.dtype) / self.s_tau_scale**2,
                2, strategy=self.strategy))
    else:
        surrogate_dict = {
            **surrogate_dict,
            's': self.bijectors['s'](build_trainable_normal_dist(
                tf.ones((2, self.feature_dim), dtype=self.dtype) *
                tf.cast([[-2.], [-1.]], dtype=self.dtype),
                1e-3 * tf.ones((2, self.feature_dim), dtype=self.dtype),
                2, strategy=self.strategy)),
            'u': self.bijectors['u'](build_trainable_normal_dist(
                -9. * tf.ones((self.feature_dim, self.latent_dim),
                              dtype=self.dtype),
                5e-4 * tf.ones((self.feature_dim, self.latent_dim),
                               dtype=self.dtype),
                2, strategy=self.strategy))
        }

    self.surrogate_distribution = tfd.JointDistributionNamed(surrogate_dict)
    self.surrogate_vars = self.surrogate_distribution.variables
    self.var_list = list(surrogate_dict.keys())
    self.set_calibration_expectations()
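# `build_trainable_normal_dist` and `build_trainable_InverseGamma_dist` are
# helpers defined elsewhere in this package. A minimal sketch of the pattern
# the former presumably follows (a Normal with trainable loc and a
# softplus-constrained trainable scale, wrapped in Independent); the exact
# signature and the unused `strategy` argument are assumptions:
import tensorflow_probability as tfp

def build_trainable_normal_dist_sketch(loc_init, scale_init, event_ndims,
                                       strategy=None):
    loc = tf.Variable(loc_init, name='loc')
    scale = tfp.util.TransformedVariable(scale_init, tfb.Softplus(),
                                         name='scale')
    return tfd.Independent(tfd.Normal(loc=loc, scale=scale),
                           reinterpreted_batch_ndims=event_ndims)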
# NOTE: The head of `compute_LK` was cut off in this excerpt. The
# reconstruction below (squared-exponential kernel plus diagonal jitter) is
# an assumption inferred from how it is called in `compute_f`.
def compute_LK(alpha, rho, X, jitter=1e-6):
    kernel = tfp.math.psd_kernels.ExponentiatedQuadratic(alpha, rho)
    K = kernel.matrix(X[..., np.newaxis], X[..., np.newaxis])
    K += jitter * tf.eye(X.shape[0], dtype=K.dtype)
    return tf.linalg.cholesky(K)

def compute_f(alpha, rho, beta, eta):
    LK = compute_LK(alpha, rho, X)
    f = tf.linalg.matvec(LK, eta)  # LK * eta (matrix * vector)
    return f + beta[..., tf.newaxis]

# GP Binary Classification Model.
gpc_model = tfd.JointDistributionNamed(dict(
    alpha=tfd.LogNormal(dtype(0), dtype(1)),
    rho=tfd.LogNormal(dtype(0), dtype(1)),
    beta=tfd.Normal(dtype(0), dtype(1)),
    eta=tfd.Sample(tfd.Normal(dtype(0), dtype(1)), sample_shape=X.shape[0]),
    # NOTE: `Sample` and `Independent` resemble, respectively,
    # `filldist` and `arraydist` in Turing.
    obs=lambda alpha, rho, beta, eta: tfd.Independent(
        tfd.Bernoulli(logits=compute_f(alpha, rho, beta, eta)),
        reinterpreted_batch_ndims=1)))

### MODEL SET UP ###
# For some reason, this is needed for the compiler
# to know the correct model parameter dimensions.
_ = gpc_model.sample()

# Parameters as they appear in the model definition.
# NOTE: Initial values should be defined in the order they appear in the model.
ordered_params = ['alpha', 'rho', 'beta', 'eta']
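# Hedged usage sketch (not from the original excerpt): condition the joint on
# observed binary labels `y` (a hypothetical 0/1 array of length X.shape[0])
# to get the unnormalized posterior density for MCMC.
def gpc_target_log_prob_fn(alpha, rho, beta, eta):
    return gpc_model.log_prob(alpha=alpha, rho=rho, beta=beta, eta=eta, obs=y)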
# Here we will use the squared exponential covariance function:
#
# $$
# \alpha^2 \cdot \exp\left\{-\frac{d^2}{2\rho^2}\right\}
# $$
#
# where $\alpha$ is the amplitude of the covariance; $\rho$ is the length
# scale, which controls how slowly information decays with distance (a larger
# $\rho$ means a point remains informative about data far away); and $d$ is
# the distance between inputs.

# In[4]:

# Specify GP model.
gp_model = tfd.JointDistributionNamed(dict(
    amplitude=tfd.LogNormal(dtype(0), dtype(0.1)),  # amplitude
    length_scale=tfd.LogNormal(dtype(0), dtype(1)),  # length scale
    v=tfd.LogNormal(dtype(0), dtype(1)),  # observation noise variance
    obs=lambda length_scale, amplitude, v: tfd.GaussianProcess(
        kernel=tfp.math.psd_kernels.ExponentiatedQuadratic(
            amplitude, length_scale),
        index_points=X[..., np.newaxis],
        observation_noise_variance=v)))

# Run graph to make sure it works.
_ = gp_model.sample()

# Initial values.
initial_state = [
    1e-1 * tf.ones([], dtype=np.float64, name='amplitude'),
    1e-1 * tf.ones([], dtype=np.float64, name='length_scale'),
    1e-1 * tf.ones([], dtype=np.float64, name='v')
]
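# A hedged sanity check (illustrative, not from the original): the
# ExponentiatedQuadratic kernel used above implements
# amplitude^2 * exp(-d^2 / (2 * length_scale^2)).
k = tfp.math.psd_kernels.ExponentiatedQuadratic(amplitude=2.0, length_scale=0.5)
d = 0.3
assert np.isclose(k.apply([0.0], [d]).numpy(),
                  2.0**2 * np.exp(-d**2 / (2 * 0.5**2)))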
# (qmu_loc, qmu_rho, qsigma_loc, qsigma_rho, and qv_loc are defined above
# this excerpt.)
qv_rho = tf.Variable(tf.random.normal([ncomponents - 1], dtype=np.float64) - 1,
                     name='qv_rho')
qalpha_loc = tf.Variable(tf.random.normal([], dtype=np.float64),
                         name='qalpha_loc')
qalpha_rho = tf.Variable(tf.random.normal([], dtype=np.float64),
                         name='qalpha_rho')

# Create variational distribution.
surrogate_posterior = tfd.JointDistributionNamed(dict(
    # qmu
    mu=tfd.Independent(tfd.Normal(qmu_loc, tf.nn.softplus(qmu_rho)),
                       reinterpreted_batch_ndims=1),
    # qsigma
    sigma=tfd.Independent(tfd.LogNormal(qsigma_loc, tf.nn.softplus(qsigma_rho)),
                          reinterpreted_batch_ndims=1),
    # qv
    v=tfd.Independent(tfd.LogitNormal(qv_loc, tf.nn.softplus(qv_rho)),
                      reinterpreted_batch_ndims=1),
    # qalpha
    alpha=tfd.LogNormal(qalpha_loc, tf.nn.softplus(qalpha_rho))))

# In[12]:

# Run optimizer.
# @tf.function(autograph=False, experimental_compile=True)  # Makes it slower?
def run_advi(optimizer, sample_size=1, num_steps=2000, seed=1):
    return tfp.vi.fit_surrogate_posterior(
        target_log_prob_fn=target_log_prob_fn,
        # NOTE: The remaining arguments were truncated in the source; they
        # are completed here from this wrapper's own parameters.
        surrogate_posterior=surrogate_posterior,
        optimizer=optimizer,
        num_steps=num_steps,
        sample_size=sample_size,
        seed=seed)
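# Hedged usage sketch (the learning rate is an assumption, not from the
# source): fit the surrogate and keep the ELBO loss trace.
opt = tf.optimizers.Adam(learning_rate=1e-2)
elbo_losses = run_advi(opt)  # one loss value per optimization step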
from tensorflow_probability import distributions as tfd

N = 1000
dists = {"A": {}, "B": {}}
for i in range(N):
    dists["A"][i] = tfd.Poisson(rate=1e-6)
    dists["B"][i] = tfd.Poisson(rate=1e-6)

joint = tfd.JointDistributionNamed(dists)
samples = joint.sample(N)
print("joint.log_prob =", joint.log_prob(samples))
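# A hedged alternative sketch (not from the original): because every rate is
# identical, the same joint can be expressed with two batched components
# instead of 2 * N scalar distributions, which is far cheaper to build and
# to evaluate.
import tensorflow as tf

joint_batched = tfd.JointDistributionNamed(dict(
    A=tfd.Independent(tfd.Poisson(rate=1e-6 * tf.ones(N)),
                      reinterpreted_batch_ndims=1),
    B=tfd.Independent(tfd.Poisson(rate=1e-6 * tf.ones(N)),
                      reinterpreted_batch_ndims=1)))
batched_samples = joint_batched.sample(N)  # 'A' and 'B' each of shape [N, N]
print("joint_batched.log_prob =", joint_batched.log_prob(batched_samples))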