def log_p_y_z(self):
    if self.continuous:
        h_decoder = softplus(dot(self.W_zh, self.z.T) + self.b_zh)
        if self.numHiddenLayers_decoder == 2:
            h_decoder = softplus(dot(self.W_hh, h_decoder) + self.b_hh)
        mu_decoder = dot(self.W_hy1, h_decoder) + self.b_hy1
        log_sigma_decoder = 0.5 * (dot(self.W_hy2, h_decoder) + self.b_hy2)
        log_pyz = T.sum(-(0.5 * np.log(2 * np.pi) + log_sigma_decoder)
                        - 0.5 * ((self.y_miniBatch.T - mu_decoder) / T.exp(log_sigma_decoder))**2)
        log_sigma_decoder.name = 'log_sigma_decoder'
        mu_decoder.name = 'mu_decoder'
        h_decoder.name = 'h_decoder'
        log_pyz.name = 'log_p_y_z'
    else:
        h_decoder = tanh(dot(self.W_zh, self.z) + self.b_zh)
        if self.numHiddenLayers_decoder == 2:
            h_decoder = softplus(dot(self.W_hh, h_decoder) + self.b_hh)
        y_hat = sigmoid(dot(self.W_hy1, h_decoder) + self.b_hy1)
        log_pyz = -T.nnet.binary_crossentropy(y_hat, self.y_miniBatch).sum()
        h_decoder.name = 'h_decoder'
        y_hat.name = 'y_hat'
        log_pyz.name = 'log_p_y_z'
    return log_pyz

def create_new_data_function(self):
    # self.z_test = sharedZeroMatrix(self.Q, 1, 'z_test')
    h_decoder = softplus(dot(self.W_zh, self.z_test.T) + self.b_zh)
    if self.numHiddenLayers_decoder == 2:
        h_decoder = softplus(dot(self.W_hh, h_decoder) + self.b_hh)
    mu_decoder = dot(self.W_hy1, h_decoder) + self.b_hy1
    self.new_data_function = th.function([], mu_decoder, no_default_updates=True)
    return mu_decoder

def forward_propagation(self):
    # layer 1 -> layer 2
    self.z1 = self.linear_line_with_self(self.x, self.w1, self.b1)
    self.z2 = self.linear_line_with_self(self.x, self.w2, self.b2)
    self.a1 = softplus(self.z1)
    self.a2 = softplus(self.z2)
    # layer 2 -> layer 3
    self.z3 = self.linear_line_with_self(self.a1, self.w3, 0)
    self.z4 = self.linear_line_with_self(self.a2, self.w4, 0)
    self.h = self.z3 + self.z4 + self.b3

def forward_propagation(Teta):
    # layer 1 -> layer 2
    z1 = linear_line(x, Teta[3], Teta[0])
    z2 = linear_line(x, Teta[4], Teta[1])
    a1 = softplus(z1)
    a2 = softplus(z2)
    # layer 2 -> layer 3
    z3 = linear_line(a1, Teta[5], 0)
    z4 = linear_line(a2, Teta[6], 0)
    h = z3 + z4 + Teta[2]
    return h

def moment_match(self, y, cav_mean, cav_cov, hyp=None, power=1.0, cubature_func=None):
    """
    Closed form Gaussian moment matching.
    Calculates the log partition function of the EP tilted distribution:
        logZₙ = log ∫ 𝓝ᵃ(yₙ|fₙ,σ²) 𝓝(fₙ|mₙ,vₙ) dfₙ = E[𝓝(yₙ|fₙ,σ²)]
    and its derivatives w.r.t. mₙ, which are required for moment matching.
    :param y: observed data (yₙ) [scalar]
    :param cav_mean: cavity mean (mₙ) [scalar]
    :param cav_cov: cavity variance (vₙ) [scalar]
    :param hyp: observation noise variance (σ²) [scalar]
    :param power: EP power / fraction (a) - this is never required for the Gaussian likelihood [scalar]
    :param cubature_func: not used
    :return:
        lZ: the log partition function, logZₙ [scalar]
        dlZ: first derivative of logZₙ w.r.t. mₙ (if derivatives=True) [scalar]
        d2lZ: second derivative of logZₙ w.r.t. mₙ (if derivatives=True) [scalar]
    """
    hyp = softplus(self.hyp) if hyp is None else hyp
    return gaussian_moment_match(y, cav_mean, cav_cov, hyp)

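# For reference, a minimal standalone NumPy sketch of the closed-form quantities that a helper like
# gaussian_moment_match (called above) returns. This is an illustrative assumption, not the library's
# own implementation: for a Gaussian likelihood the tilted integral is itself Gaussian, so
# logZₙ = log 𝓝(yₙ|mₙ, vₙ+σ²) and its derivatives w.r.t. mₙ are available in closed form.
import numpy as np

def gaussian_moment_match_sketch(y, cav_mean, cav_cov, noise_var):
    s = cav_cov + noise_var                                          # marginal variance vₙ + σ²
    lZ = -0.5 * np.log(2 * np.pi * s) - 0.5 * (y - cav_mean) ** 2 / s
    dlZ = (y - cav_mean) / s                                         # ∂logZₙ/∂mₙ
    d2lZ = -1.0 / s                                                  # ∂²logZₙ/∂mₙ²
    return lZ, dlZ, d2lZ
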
def reconstruct_test_datum(self):
    self.y_test = self.y[np.random.choice(self.N, 1)]  # pick one random observation (self.y is a shared matrix)
    h_qX = softplus(plus(dot(self.W1_qX, self.y_test.T), self.b1_qX))
    mu_qX = plus(dot(self.W2_qX, h_qX), self.b2_qX)
    log_sigma_qX = mul(0.5, plus(dot(self.W3_qX, h_qX), self.b3_qX))
    self.phi_test = mu_qX.T  # [BxR]
    (self.Phi_test, self.cPhi_test, self.iPhi_test, self.logDetPhi_test) \
        = diagCholInvLogDet_fromLogDiag(log_sigma_qX)
    self.Xz_test = plus(self.phi_test, dot(self.cPhi_test, self.xi[0, :]))
    self.Kzz_test = kfactory.kernel(self.Xz_test, None, self.log_theta)
    self.Kzu_test = kfactory.kernel(self.Xz_test, self.Xu, self.log_theta)
    self.A_test = dot(self.Kzu_test, self.iKuu)
    self.C_test = minus(self.Kzz_test, dot(self.A_test, self.Kzu_test.T))
    self.cC_test, self.iC_test, self.logDetC_test = cholInvLogDet(self.C_test, self.B, self.jitter)
    self.u_test = plus(self.kappa, dot(self.cKappa, self.alpha))
    self.mu_test = dot(self.A_test, self.u_test)
    self.z_test = plus(self.mu_test, dot(self.cC_test, self.beta[0, :]))

def negative_log_predictive_density(self, t=None, y=None, r=None):
    """
    Compute the (normalised) negative log predictive density (NLPD) of the test data yₙ*:
        NLPD = -∑ₙ log ∫ p(yₙ*|fₙ*) 𝓝(fₙ*|mₙ*,vₙ*) dfₙ*
    where fₙ* is the function value at the test location.
    The above can be computed using the EP moment matching method, which we vectorise using vmap.
    :param t: test time steps [M, 1]
    :param y: test observations [M, 1]
    :param r: test spatial locations [M, R]
    :return:
        NLPD: the negative log predictive density for the test data
    """
    if t is None:
        t, y, r = self.t, self.y, self.r
    (t, y, r, r_test, dt, train_id, test_id, mask) = test_input_admin(self.t, self.y, self.r, t, y, r)
    # are spatial test locations a different size to training locations?
    return_full = r_test.shape[1] != r.shape[1]
    # run the filter and smooth across both train and test points
    posterior_mean, posterior_cov, _ = self.predict_everywhere(y, r, dt, train_id, mask,
                                                               sampling=False, return_full=return_full)
    test_mean, test_cov = posterior_mean[test_id], posterior_cov[test_id]
    hyp_prior, hyp_lik = softplus_list(self.prior.hyp), softplus(self.likelihood.hyp)
    if return_full:
        measure_func = vmap(self.compute_measurement, (0, 0, 0, None))
        test_mean, test_cov = measure_func(r_test, test_mean, test_cov, hyp_prior)
    # vectorise the EP moment matching method
    lpd_func = vmap(self.likelihood.moment_match, (0, 0, 0, None, None, None))
    log_predictive_density, _, _ = lpd_func(y[test_id], test_mean, test_cov, hyp_lik, 1, None)
    return -np.mean(log_predictive_density)  # mean = normalised sum

def _free_energy_with_z(self, z):
    """Return binary RBM style free energy in shape: [batch_size]"""
    zbias_term = tf.matmul(z, self.zbias, transpose_b=True)
    zbias_term = tf.reshape(zbias_term, [-1])  # flatten
    h_total_input = tf.matmul(z, self.weights) + self.hbias
    softplus_term = utils.softplus(h_total_input)
    sum_softplus = tf.reduce_sum(softplus_term, 1)
    return -zbias_term - sum_softplus

def save_tensorboard_embeddings(self, u, v, embedding_dim, name_u, name_v,
                                writer, global_step, matrix_bin):
    u = softplus(u.weight.detach().cpu().numpy().reshape((self.vert, self.horz, embedding_dim)))
    u = np.expand_dims(np.stack([u, u, u], axis=0), axis=0)
    writer.add_images(name_u, refactor(u), global_step=global_step)
    v = softplus(v.weight.detach().cpu().numpy())
    fig = plt.figure()
    plt.plot(v)
    writer.add_figure(name_v, fig, global_step=global_step)
    dot_product = np.dot(u, v.T)[0, 0, ...]
    myocardium_dot_prod = self.get_video(dot_product, matrix_bin, cmap='rainbow')
    writer.add_video(name_u + '_' + name_v + '_dotprod', myocardium_dot_prod, global_step=global_step)

def __init__(self, variance=0.1):
    """
    :param hyp: observation noise
    """
    super().__init__(hyp=variance)
    self.name = 'Audio Amplitude Demodulation'
    self.link_fn = lambda f: softplus(f)
    self.dlink_fn = lambda f: sigmoid(f)  # derivative of the link function

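# Sanity check (standalone NumPy sketch, not part of the class above): the derivative of the softplus
# link really is the logistic sigmoid, d/df log(1 + eᶠ) = 1/(1 + e⁻ᶠ), which is why dlink_fn is sigmoid.
import numpy as np

def softplus(f):
    return np.log1p(np.exp(f))           # log(1 + e^f)

def sigmoid(f):
    return 1.0 / (1.0 + np.exp(-f))      # = d/df softplus(f)

f = np.linspace(-5.0, 5.0, 101)
eps = 1e-6
numeric_grad = (softplus(f + eps) - softplus(f - eps)) / (2 * eps)
assert np.allclose(numeric_grad, sigmoid(f), atol=1e-6)
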
def conditional_moments(self, f, hyp=None):
    """
    The first two conditional moments of a Gaussian are the mean and variance:
        E[y|f] = f
        Var[y|f] = σ²
    """
    hyp = softplus(self.hyp) if hyp is None else hyp
    return f, hyp.reshape(-1, 1)

def produce(self, controller):
    """
    writeHead.produce(controller) ->
        key (batchsize x N), add (batchsize x N), erase (batchsize x N),
        shift (batchsize x 3), sharpen (batchsize x 1), strengthen (batchsize x 1),
        interpolation (batchsize x 1)

    Produces the controller parameters used to manipulate/write memory.

    @param controller: a batchsize x controller_size matrix, representing the output of the controller
    """
    # key, add, erase -> batchsize x N
    key = T.dot(controller, self.weights["controller->key"])
    add = T.tanh(T.dot(controller, self.weights["controller->add"]))
    erase = T.nnet.sigmoid(T.dot(controller, self.weights["controller->erase"]))  # SIGMOID

    # shift -> batchsize x 3
    shift = T.nnet.softmax(T.dot(controller, self.weights["controller->shift"]))  # SOFTMAX
    backward_shift = shift[:, 0]
    stay_forward_shift = shift[:, 1:3]  # represents the shift values for STAY and FORWARD
    zeros_size = self.memory_slots - 3
    # We concatenate along the second axis, moving the first element (the backward shift) to the end.
    # Example with 7 memory slots (the padding zeros are wrapped in [] for clarity):
    #   0.2 0.9 0.1 [0.0 0.0 0.0 0.0]  ->  0.9 0.1 [0.0 0.0 0.0 0.0] 0.2
    true_shift = T.concatenate([stay_forward_shift,
                                T.zeros([self.batch_size, zeros_size]),
                                backward_shift.reshape([self.batch_size, 1])], axis=1)  # WRAP

    # sharpen, strengthen, interpolation -> batchsize x 1
    # sharpen and strengthen must both be at least 1, so we apply the softplus function (Graves et al., 2016)
    sharpen = softplus(T.dot(controller, self.weights["controller->sharpen"]))        # SOFTPLUS
    strengthen = softplus(T.dot(controller, self.weights["controller->strengthen"]))  # SOFTPLUS
    interpolation = T.nnet.sigmoid(T.dot(controller, self.weights["controller->interpolation"]))  # SIGMOID

    return (key, add, erase, true_shift, T.addbroadcast(sharpen, 1),
            T.addbroadcast(strengthen, 1), T.addbroadcast(interpolation, 1))

def free_energy(self, vis_samples):
    """Compute the free energy defined on visibles.

    return: free energy of shape: [batch_size, 1]
    """
    vbias_term = tf.matmul(vis_samples, self.vbias, transpose_b=True)
    vbias_term = tf.reshape(vbias_term, [-1])  # flatten
    h_total_input = tf.matmul(vis_samples, self.weights) + self.hbias
    softplus_term = utils.softplus(h_total_input)
    sum_softplus = tf.reduce_sum(softplus_term, 1)
    return -vbias_term - sum_softplus

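# The softplus form of the free energy follows from marginalising the binary hidden units of the RBM
# energy E(v,h) = -v'Wh - b'v - c'h, giving F(v) = -b'v - Σⱼ softplus(cⱼ + v'W[:,j]). Below is a
# standalone NumPy sketch (illustrative only, not part of the class above) that brute-forces the sum
# over hidden configurations and checks it against the softplus expression:
import itertools
import numpy as np

rng = np.random.default_rng(0)
num_vis, num_hid = 4, 3
W = rng.normal(size=(num_vis, num_hid))
vbias = rng.normal(size=num_vis)
hbias = rng.normal(size=num_hid)
v = rng.integers(0, 2, size=num_vis).astype(float)

# softplus form: F(v) = -b'v - Σⱼ log(1 + exp(cⱼ + v'W[:,j]))
free_energy_softplus = -vbias @ v - np.sum(np.log1p(np.exp(v @ W + hbias)))

# brute force: F(v) = -log Σ_h exp(-E(v,h)) over all binary hidden states
energies = []
for bits in itertools.product([0.0, 1.0], repeat=num_hid):
    h = np.array(bits)
    energies.append(-(v @ W @ h) - vbias @ v - hbias @ h)
free_energy_bruteforce = -np.log(np.sum(np.exp(-np.array(energies))))

assert np.allclose(free_energy_softplus, free_energy_bruteforce)
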
def evaluate_log_likelihood(self, y, f, hyp=None):
    """
    Evaluate the log-Gaussian function log𝓝(yₙ|fₙ,σ²).
    Can be used to evaluate Q cubature points.
    :param y: observed data yₙ [scalar]
    :param f: mean, i.e. the latent function value fₙ [Q, 1]
    :param hyp: likelihood variance σ² [scalar]
    :return: log𝓝(yₙ|fₙ,σ²), where σ² is the observation noise [Q, 1]
    """
    hyp = softplus(self.hyp) if hyp is None else hyp
    return -0.5 * np.log(2 * pi * hyp) - 0.5 * (y - f) ** 2 / hyp

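# Quick standalone check (assumes SciPy is available; illustrative only, with made-up numbers) that
# the expression above matches a reference Gaussian log-density:
import numpy as np
from scipy.stats import norm

y, f, noise_var = 0.3, np.array([[0.1], [0.5]]), 0.2
log_lik = -0.5 * np.log(2 * np.pi * noise_var) - 0.5 * (y - f) ** 2 / noise_var
assert np.allclose(log_lik, norm.logpdf(y, loc=f, scale=np.sqrt(noise_var)))
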
def __init__(self, link='exp'):
    """
    :param link: link function, either 'exp' or 'logistic'
    """
    super().__init__(hyp=None)
    if link == 'exp':
        self.link_fn = lambda mu: np.exp(mu)
        self.dlink_fn = lambda mu: np.exp(mu)
    elif link == 'logistic':
        self.link_fn = lambda mu: softplus(mu)
        self.dlink_fn = lambda mu: sigmoid(mu)
    else:
        raise NotImplementedError('link function not implemented')
    self.name = 'Poisson'

def __init__(self, link='softplus'):
    """
    :param link: link function, either 'exp' or 'softplus' (note that the link is modified with an offset)
    """
    super().__init__(hyp=None)
    if link == 'exp':
        self.link_fn = lambda mu: np.exp(mu - 0.5)
        self.dlink_fn = lambda mu: np.exp(mu - 0.5)
    elif link == 'softplus':
        self.link_fn = lambda mu: softplus(mu - 0.5) + 1e-10
        self.dlink_fn = lambda mu: sigmoid(mu - 0.5)
    else:
        raise NotImplementedError('link function not implemented')
    self.name = 'Heteroscedastic Noise'

def gradient_step(i, state, mod):
    params = get_params(state)
    mod.prior.hyp = params[0]
    mod.likelihood.hyp = params[1]
    # grad(Filter) + Smoother:
    # neg_log_marg_lik, gradients = mod.run()
    neg_log_marg_lik, gradients = mod.run_two_stage()
    prior_params = softplus_list(params[0])
    # print('iter %2d: var1=%1.2f len1=%1.2f om1=%1.2f var2=%1.2f len2=%1.2f om2=%1.2f var3=%1.2f len3=%1.2f om3=%1.2f '
    #       'var4=%1.2f len4=%1.2f var5=%1.2f len5=%1.2f var6=%1.2f len6=%1.2f '
    #       'vary=%1.2f, nlml=%2.2f' %
    #       (i, prior_params[0][0], prior_params[0][1], prior_params[0][2],
    #        prior_params[1][0], prior_params[1][1], prior_params[1][2],
    #        prior_params[2][0], prior_params[2][1], prior_params[2][2],
    #        prior_params[3][0], prior_params[3][1],
    #        prior_params[4][0], prior_params[4][1],
    #        prior_params[5][0], prior_params[5][1],
    #        softplus(params[1]), neg_log_marg_lik))
    # print('iter %2d: len1=%1.2f om1=%1.2f len2=%1.2f om2=%1.2f len3=%1.2f om3=%1.2f '
    #       'var4=%1.2f len4=%1.2f var5=%1.2f len5=%1.2f var6=%1.2f len6=%1.2f '
    #       'vary=%1.2f, nlml=%2.2f' %
    #       (i, prior_params[0][0], prior_params[0][1],
    #        prior_params[1][0], prior_params[1][1],
    #        prior_params[2][0], prior_params[2][1],
    #        prior_params[3][0], prior_params[3][1],
    #        prior_params[4][0], prior_params[4][1],
    #        prior_params[5][0], prior_params[5][1],
    #        softplus(params[1]), neg_log_marg_lik))
    print('iter %2d: len1=%1.2f om1=%1.2f len2=%1.2f om2=%1.2f len3=%1.2f om3=%1.2f '
          'len4=%1.2f len5=%1.2f len6=%1.2f '
          'vary=%1.2f, nlml=%2.2f' %
          (i, prior_params[0][0], prior_params[0][1],
           prior_params[1][0], prior_params[1][1],
           prior_params[2][0], prior_params[2][1],
           prior_params[3], prior_params[4], prior_params[5],
           softplus(params[1]), neg_log_marg_lik))
    if plot_intermediate:
        plot(mod, i)
    return opt_update(i, gradients, state)

for _ in range(args.maxIter):
    # Maximize ELBO
    grads = elementwise_grad(elbo)((lambda_pi, lambda_phi, lambda_m,
                                    lambda_beta, lambda_nu, lambda_w))

    # Variational parameter updates (gradient ascent)
    lambda_pi -= ps['lambda_pi'] * grads[0]
    lambda_phi -= ps['lambda_phi'] * grads[1]
    lambda_m -= ps['lambda_m'] * grads[2]
    lambda_beta -= ps['lambda_beta'] * grads[3]
    lambda_nu -= ps['lambda_nu'] * grads[4]
    lambda_w -= ps['lambda_w'] * grads[5]

    # keep the updated parameters in their feasible domains (simplex / positive / PSD)
    lambda_phi = agnp.array([softmax(lambda_phi[i]) for i in range(N)])
    lambda_beta = softplus(lambda_beta)
    lambda_nu = softplus(lambda_nu)
    lambda_pi = softplus(lambda_pi)
    lambda_w = agnp.array([agnp.dot(lambda_w[k], lambda_w[k].T) for k in range(K)])

    # ELBO computation
    lb = elbo((lambda_pi, lambda_phi, lambda_m, lambda_beta, lambda_nu, lambda_w))
    lbs.append(lb)

    if VERBOSE:
        print('\n******* ITERATION {} *******'.format(n_iters))
        print('lambda_pi: {}'.format(lambda_pi))
        print('lambda_beta: {}'.format(lambda_beta))
        print('lambda_nu: {}'.format(lambda_nu))

def __init__(self, num_ds_dim=4):
    super(SigmoidFlow, self).__init__()
    self.num_ds_dim = num_ds_dim
    self.act_a = lambda x: utils.softplus(x)
    self.act_b = lambda x: x
    self.act_w = lambda x: utils.softmax(x, dim=2)

def predict(self, y=None, dt=None, mask=None, site_params=None, sampling=False,
            r=None, return_full=False, compute_nlpd=True):
    """
    Calculate the posterior predictive distribution p(f*|f,y) by filtering and smoothing across the
    training & test locations.
    This function is also used during posterior sampling to smooth the auxiliary data sampled from
    the prior. The output shapes depend on return_full.
    :param y: observations (nans at test locations) [M, 1]
    :param dt: step sizes Δtₙ = tₙ - tₙ₋₁ [M, 1]
    :param mask: a boolean array signifying which elements are observed and which are nan [M, 1]
    :param site_params: the sites computed during a previous inference procedure [2, M, obs_dim]
    :param sampling: notify whether we are doing posterior sampling
    :param r: spatial locations [M, R]
    :param return_full: flag to notify if we are handling the case where spatial test locations are
                        a different size to training locations
    :param compute_nlpd: flag to notify whether to compute the negative log predictive density of
                         the test data
    :return:
        posterior_mean: the posterior predictive mean [M, state_dim] or [M, obs_dim]
        posterior_cov: the posterior predictive (co)variance [M, M, state_dim] or [M, obs_dim]
        site_params: the site parameters. If none are provided then new sites are computed [2, M, obs_dim]
    """
    y = self.y_all if y is None else y
    r = self.r_all if r is None else r
    dt = self.dt_all if dt is None else dt
    mask = self.mask if mask is None else mask
    params = [self.prior.hyp.copy(), self.likelihood.hyp.copy()]
    site_params = self.sites.site_params if site_params is None else site_params
    if site_params is not None and not sampling:
        # construct a vector of site parameters that is the full size of the test data.
        # test site parameters are 𝓝(0,∞), and will not be used
        site_mean = np.zeros([dt.shape[0], self.func_dim, 1])
        site_cov = 1e5 * np.tile(np.eye(self.func_dim), (dt.shape[0], 1, 1))
        # replace parameters at training locations with the supplied sites
        site_mean = index_add(site_mean, index[self.train_id], site_params[0])
        site_cov = index_update(site_cov, index[self.train_id], site_params[1])
        site_params = (site_mean, site_cov)
    _, (filter_mean, filter_cov, site_params) = self.kalman_filter(y, dt, params, True, mask, site_params, r)
    _, posterior_mean, posterior_cov = self.rauch_tung_striebel_smoother(params, filter_mean, filter_cov,
                                                                         dt, True, return_full, None, None, r)
    if compute_nlpd:
        nlpd_test = self.negative_log_predictive_density(self.t_all[self.test_id], self.y_all[self.test_id],
                                                         posterior_mean[self.test_id],
                                                         posterior_cov[self.test_id],
                                                         softplus_list(params[0]), softplus(params[1]),
                                                         return_full)
    else:
        nlpd_test = np.nan
    # in the spatial model, the train and test points may be of different size. This deals with that situation:
    if return_full:
        measure_func = vmap(self.compute_measurement, (0, 0, 0, None))
        posterior_mean, posterior_cov = measure_func(self.r_test,
                                                     posterior_mean[self.test_id],
                                                     posterior_cov[self.test_id],
                                                     softplus_list(self.prior.hyp))
    return posterior_mean, posterior_cov, site_params, nlpd_test

def variance(self):
    return softplus(self.hyp)

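# This is the usual trick of storing an unconstrained hyperparameter and reading it back through
# softplus so the variance is always positive. A standalone sketch of initialising and recovering it
# (the softplus_inv helper is an illustrative assumption, not necessarily the library's own):
import numpy as np

def softplus(x):
    return np.log1p(np.exp(x))

def softplus_inv(y):
    return np.log(np.expm1(y))           # inverse map, used to set the raw parameter

desired_variance = 0.1
raw_hyp = softplus_inv(desired_variance)                 # unconstrained value that gets optimised
assert np.isclose(softplus(raw_hyp), desired_variance)   # reading it back recovers a positive variance
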
"wb") as fp: pickle.dump(nlpd, fp) # with open("output/" + str(method) + "_" + str(fold) + "_nlpd.txt", "rb") as fp: # nlpd_show = pickle.load(fp) # print(nlpd_show) if plot_final: x_pred = model.t_all[:, 0] # link = model.likelihood.link_fn # lb = posterior_mean[:, 0, 0] - np.sqrt(posterior_var[:, 0, 0]) * 1.96 # ub = posterior_mean[:, 0, 0] + np.sqrt(posterior_var[:, 0, 0]) * 1.96 test_id = model.test_id posterior_mean_subbands = posterior_mean[:, :3, 0] posterior_mean_modulators = softplus(posterior_mean[:, 3:, 0]) posterior_mean_sig = np.sum(posterior_mean_subbands * posterior_mean_modulators, axis=-1) posterior_var_subbands = posterior_var[:, :3, 0] posterior_var_modulators = softplus(posterior_var[:, 3:, 0]) print('plotting ...') plt.figure(1, figsize=(12, 5)) plt.clf() plt.plot(x, y, 'k', label='signal', linewidth=0.6) plt.plot(x_test, y_test, 'g.', label='test', markersize=4) plt.plot(x_pred, posterior_mean_sig, 'r', label='posterior mean',
def sparsity_cost(self, vis):
    p_target = tf.constant(0.01, dtype=tf.float32, shape=[1, self.num_hid])
    h_total_input = tf.matmul(vis, self.weights) + self.hbias
    penalty = (-tf.matmul(p_target, h_total_input, transpose_b=True)
               + tf.reduce_sum(utils.softplus(h_total_input), 1))
    return tf.reduce_mean(penalty)

def kalman_filter(self, y, dt, params, store=False, mask=None, site_params=None, r=None):
    """
    Run the Kalman filter to get p(fₙ|y₁,...,yₙ).
    The Kalman update step involves some control flow to work out whether we are
        i) initialising the sites
        ii) using supplied sites
        iii) performing a Gaussian update with fixed parameters (e.g. in posterior sampling or ELBO calc.)
    If store is True then we compute and return the intermediate filtering distributions
    p(fₙ|y₁,...,yₙ) and sites sₙ(fₙ), otherwise we do not store the intermediates and simply
    return the energy / negative log-marginal likelihood, -log p(y).
    :param y: observed data [N, obs_dim]
    :param dt: step sizes Δtₙ = tₙ - tₙ₋₁ [N, 1]
    :param params: the model parameters, i.e the hyperparameters of the prior & likelihood
    :param store: flag to notify whether to store the intermediates
    :param mask: boolean array signifying which elements of y are observed [N, obs_dim]
    :param site_params: the Gaussian approximate likelihoods [2, N, obs_dim]
    :param r: spatial input locations
    :return:
        if store is True:
            neg_log_marg_lik: the filter energy, i.e. negative log-marginal likelihood -log p(y),
                              used for hyperparameter optimisation (learning) [scalar]
            filtered_mean: intermediate filtering means [N, state_dim, 1]
            filtered_cov: intermediate filtering covariances [N, state_dim, state_dim]
            site_mean: mean of the approximate likelihood sₙ(fₙ) [N, obs_dim]
            site_cov: variance of the approximate likelihood sₙ(fₙ) [N, obs_dim]
        otherwise:
            neg_log_marg_lik: the filter energy, i.e. negative log-marginal likelihood -log p(y),
                              used for hyperparameter optimisation (learning) [scalar]
    """
    theta_prior, theta_lik = softplus_list(params[0]), softplus(params[1])
    # all model components that are not static must be computed inside the function
    self.update_model(theta_prior)
    N = dt.shape[0]
    with loops.Scope() as s:
        s.neg_log_marg_lik = 0.0  # negative log-marginal likelihood
        s.m, s.P = self.minf, self.Pinf
        if store:
            s.filtered_mean = np.zeros([N, self.state_dim, 1])
            s.filtered_cov = np.zeros([N, self.state_dim, self.state_dim])
            s.site_mean = np.zeros([N, self.func_dim, 1])
            s.site_cov = np.zeros([N, self.func_dim, self.func_dim])
        for n in s.range(N):
            y_n = y[n][..., np.newaxis]
            # -- KALMAN PREDICT --
            #  mₙ⁻ = Aₙ mₙ₋₁
            #  Pₙ⁻ = Aₙ Pₙ₋₁ Aₙ' + Qₙ, where Qₙ = Pinf - Aₙ Pinf Aₙ'
            A = self.prior.state_transition(dt[n], theta_prior)
            m_ = A @ s.m
            P_ = A @ (s.P - self.Pinf) @ A.T + self.Pinf
            # --- KALMAN UPDATE ---
            # Given previous predicted mean mₙ⁻ and cov Pₙ⁻, incorporate yₙ to get filtered mean mₙ &
            # cov Pₙ and compute the marginal likelihood p(yₙ|y₁,...,yₙ₋₁)
            H = self.prior.measurement_model(r[n], theta_prior)
            predict_mean = H @ m_
            predict_cov = H @ P_ @ H.T
            if mask is not None:  # note: this is a bit redundant but may come in handy in multi-output problems
                y_n = np.where(mask[n][..., np.newaxis], predict_mean[:y_n.shape[0]], y_n)  # fill in masked obs with expectation
            log_lik_n, site_mean, site_cov = self.sites.update(self.likelihood, y_n, predict_mean,
                                                               predict_cov, theta_lik, None)
            if site_params is not None:  # use supplied site parameters to perform the update
                site_mean, site_cov = site_params[0][n], site_params[1][n]
            # modified Kalman update (see Nickisch et al., ICML 2018 or Wilkinson et al., ICML 2019):
            S = predict_cov + site_cov
            HP = H @ P_
            K = solve(S, HP).T  # PH'(S^-1)
            s.m = m_ + K @ (site_mean - predict_mean)
            s.P = P_ - K @ HP
            if mask is not None:  # note: this is a bit redundant but may come in handy in multi-output problems
                s.m = np.where(np.any(mask[n]), m_, s.m)
                s.P = np.where(np.any(mask[n]), P_, s.P)
                log_lik_n = np.where(mask[n][..., 0], np.zeros_like(log_lik_n), log_lik_n)
            s.neg_log_marg_lik -= np.sum(log_lik_n)
            if store:
                s.filtered_mean = index_add(s.filtered_mean, index[n, ...], s.m)
                s.filtered_cov = index_add(s.filtered_cov, index[n, ...], s.P)
                s.site_mean = index_add(s.site_mean, index[n, ...], site_mean)
                s.site_cov = index_add(s.site_cov, index[n, ...], site_cov)
        if store:
            return s.neg_log_marg_lik, (s.filtered_mean, s.filtered_cov, (s.site_mean, s.site_cov))
        return s.neg_log_marg_lik

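# A minimal standalone NumPy sketch of the single predict/update step described in the comments above,
# written in plain Kalman form with observation matrix H and noise covariance R (the class's modified
# update replaces R with the site covariance). All names here are illustrative assumptions.
import numpy as np

def kalman_step(m, P, A, Q, H, R, y):
    # predict: mₙ⁻ = A mₙ₋₁,  Pₙ⁻ = A Pₙ₋₁ A' + Q
    m_pred = A @ m
    P_pred = A @ P @ A.T + Q
    # update: incorporate the observation yₙ
    S = H @ P_pred @ H.T + R                   # innovation covariance
    K = np.linalg.solve(S, H @ P_pred).T       # Kalman gain, PH'S⁻¹
    m_new = m_pred + K @ (y - H @ m_pred)
    P_new = P_pred - K @ H @ P_pred
    # contribution to the log-marginal likelihood: log p(yₙ|y₁,...,yₙ₋₁) = log 𝓝(yₙ|Hmₙ⁻, S)
    resid = y - H @ m_pred
    d = y.shape[0]
    log_lik = (-0.5 * (d * np.log(2 * np.pi) + np.linalg.slogdet(S)[1]
                       + resid.T @ np.linalg.solve(S, resid))).item()
    return m_new, P_new, log_lik
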
def rauch_tung_striebel_smoother(self, params, m_filtered, P_filtered, dt, store=False,
                                 return_full=False, y=None, site_params=None, r=None):
    """
    Run the RTS smoother to get p(fₙ|y₁,...,y_N),
    i.e. compute p(f)𝚷ₙsₙ(fₙ) where sₙ(fₙ) are the sites (approx. likelihoods).
    If sites are provided, then it is assumed they are to be updated, which is done by
    calling the site-specific update() method.
    :param params: the model parameters, i.e the hyperparameters of the prior & likelihood
    :param m_filtered: the intermediate distribution means computed during filtering [N, state_dim, 1]
    :param P_filtered: the intermediate distribution covariances computed during filtering [N, state_dim, state_dim]
    :param dt: step sizes Δtₙ = tₙ - tₙ₋₁ [N, 1]
    :param store: a flag determining whether to store and return state mean and covariance
    :param return_full: a flag determining whether to return the full state distribution or just the function(s)
    :param y: observed data [N, obs_dim]
    :param site_params: the Gaussian approximate likelihoods [2, N, obs_dim]
    :param r: spatial input locations
    :return:
        var_exp: the sum of the variational expectations [scalar]
        smoothed_mean: the posterior marginal means [N, obs_dim]
        smoothed_var: the posterior marginal variances [N, obs_dim]
        site_params: the updated sites [2, N, obs_dim]
    """
    theta_prior, theta_lik = softplus_list(params[0]), softplus(params[1])
    # all model components that are not static must be computed inside the function
    self.update_model(theta_prior)
    N = dt.shape[0]
    dt = np.concatenate([dt[1:], np.array([0.0])], axis=0)
    with loops.Scope() as s:
        s.m, s.P = m_filtered[-1, ...], P_filtered[-1, ...]
        if return_full:
            s.smoothed_mean = np.zeros([N, self.state_dim, 1])
            s.smoothed_cov = np.zeros([N, self.state_dim, self.state_dim])
        else:
            s.smoothed_mean = np.zeros([N, self.func_dim, 1])
            s.smoothed_cov = np.zeros([N, self.func_dim, self.func_dim])
        if site_params is not None:
            s.site_mean = np.zeros([N, self.func_dim, 1])
            s.site_var = np.zeros([N, self.func_dim, self.func_dim])
        for n in s.range(N - 1, -1, -1):
            # --- First compute the smoothing distribution: ---
            A = self.prior.state_transition(dt[n], theta_prior)  # closed form integration of transition matrix
            m_predicted = A @ m_filtered[n, ...]
            tmp_gain_cov = A @ P_filtered[n, ...]
            P_predicted = A @ (P_filtered[n, ...] - self.Pinf) @ A.T + self.Pinf
            # backward Kalman gain:
            #  G = F * A' * P^{-1}
            # since both F(iltered) and P(redictive) are cov matrices, thus self-adjoint, we can take the transpose:
            #  = (P^{-1} * A * F)'
            G_transpose = solve(P_predicted, tmp_gain_cov)  # (P^-1)AF
            s.m = m_filtered[n, ...] + G_transpose.T @ (s.m - m_predicted)
            s.P = P_filtered[n, ...] + G_transpose.T @ (s.P - P_predicted) @ G_transpose
            H = self.prior.measurement_model(r[n], theta_prior)
            if store:
                if return_full:
                    s.smoothed_mean = index_add(s.smoothed_mean, index[n, ...], s.m)
                    s.smoothed_cov = index_add(s.smoothed_cov, index[n, ...], s.P)
                else:
                    s.smoothed_mean = index_add(s.smoothed_mean, index[n, ...], H @ s.m)
                    s.smoothed_cov = index_add(s.smoothed_cov, index[n, ...], H @ s.P @ H.T)
            # --- Now update the site parameters: ---
            if site_params is not None:
                # extract mean and var from state:
                post_mean, post_cov = H @ s.m, H @ s.P @ H.T
                # calculate the new sites
                _, site_mu, site_cov = self.sites.update(self.likelihood, y[n][..., np.newaxis],
                                                         post_mean, post_cov, theta_lik,
                                                         (site_params[0][n], site_params[1][n]))
                s.site_mean = index_add(s.site_mean, index[n, ...], site_mu)
                s.site_var = index_add(s.site_var, index[n, ...], site_cov)
        if site_params is not None:
            site_params = (s.site_mean, s.site_var)
        if store:
            return site_params, s.smoothed_mean, s.smoothed_cov
        return site_params

with open("output/" + str(method) + "_" + str(fold) + "_nlpd.txt", "wb") as fp: pickle.dump(nlpd, fp) # with open("output/" + str(method) + "_" + str(fold) + "_nlpd.txt", "rb") as fp: # nlpd_show = pickle.load(fp) # print(nlpd_show) if plot_final: def diag(Q): vectorised_diag = vmap(jnp.diag, 0) return vectorised_diag(Q) posterior_mean_subbands = posterior_mean[:, :3] posterior_mean_modulators = softplus(posterior_mean[:, 3:]) posterior_mean_sig = np.sum(posterior_mean_subbands * posterior_mean_modulators, axis=-1) posterior_var_subbands = diag(posterior_var[:, :3, :3]) posterior_var_modulators = softplus(diag(posterior_var[:, 3:, 3:])) lb_subbands = posterior_mean_subbands - np.sqrt( posterior_var_subbands) * 1.96 ub_subbands = posterior_mean_subbands + np.sqrt( posterior_var_subbands) * 1.96 lb_modulators = softplus(posterior_mean_modulators - np.sqrt(posterior_var_modulators) * 1.96) ub_modulators = softplus(posterior_mean_modulators + np.sqrt(posterior_var_modulators) * 1.96) color1 = [0.2667, 0.4471, 0.7098] # blue
def __init__(self,
             numberOfInducingPoints,      # Number of inducing points in sparse GP
             batchSize,                   # Size of mini batch
             dimX,                        # Dimensionality of the latent co-ordinates
             dimZ,                        # Dimensionality of the latent variables
             data,                        # [NxP] matrix of observations
             kernelType='ARD',
             encoderType_qX='FreeForm2',  # 'MLP', 'Kernel'
             encoderType_rX='FreeForm2',  # 'MLP', 'Kernel'
             Xu_optimise=False,
             numberOfEncoderHiddenUnits=10
             ):

    self.numTestSamples = 5000

    # set the data
    data = np.asarray(data, dtype=precision)
    self.N = data.shape[0]  # Number of observations
    self.P = data.shape[1]  # Dimension of each observation
    self.M = numberOfInducingPoints
    self.B = batchSize
    self.R = dimX
    self.Q = dimZ
    self.H = numberOfEncoderHiddenUnits

    self.encoderType_qX = encoderType_qX
    self.encoderType_rX = encoderType_rX
    self.Xu_optimise = Xu_optimise

    self.y = th.shared(data)
    self.y.name = 'y'

    if kernelType == 'RBF':
        self.numberOfKernelParameters = 2
    elif kernelType == 'RBFnn':
        self.numberOfKernelParameters = 1
    elif kernelType == 'ARD':
        self.numberOfKernelParameters = self.R + 1
    else:
        raise RuntimeError('Unrecognised kernel type')

    self.lowerBound = -np.inf  # Lower bound

    self.numberofBatchesPerEpoch = int(np.ceil(np.float32(self.N) / self.B))
    numPad = self.numberofBatchesPerEpoch * self.B - self.N

    self.batchStream = srng.permutation(n=self.N)
    self.padStream = srng.choice(size=(numPad,), a=self.N,
                                 replace=False, p=None, ndim=None, dtype='int32')
    self.batchStream.name = 'batchStream'
    self.padStream.name = 'padStream'

    self.iterator = th.shared(0)
    self.iterator.name = 'iterator'

    self.allBatches = T.reshape(T.concatenate((self.batchStream, self.padStream)),
                                [self.numberofBatchesPerEpoch, self.B])
    self.currentBatch = T.flatten(self.allBatches[self.iterator, :])
    self.allBatches.name = 'allBatches'
    self.currentBatch.name = 'currentBatch'

    self.y_miniBatch = self.y[self.currentBatch, :]
    self.y_miniBatch.name = 'y_miniBatch'

    self.jitterDefault = np.float64(0.0001)
    self.jitterGrowthFactor = np.float64(1.1)
    self.jitter = th.shared(np.asarray(self.jitterDefault, dtype='float64'), name='jitter')

    kfactory = kernelFactory(kernelType)

    # kernel parameters
    self.log_theta = sharedZeroMatrix(1, self.numberOfKernelParameters, 'log_theta',
                                      broadcastable=(True, False))  # parameters of Kuu, Kuf, Kff
    self.log_omega = sharedZeroMatrix(1, self.numberOfKernelParameters, 'log_omega',
                                      broadcastable=(True, False))  # parameters of Kuu, Kuf, Kff
    self.log_gamma = sharedZeroMatrix(1, self.numberOfKernelParameters, 'log_gamma',
                                      broadcastable=(True, False))  # parameters of Kuu, Kuf, Kff

    # Random variables
    self.xi = srng.normal(size=(self.B, self.R), avg=0.0, std=1.0, ndim=None)
    self.alpha = srng.normal(size=(self.M, self.Q), avg=0.0, std=1.0, ndim=None)
    self.beta = srng.normal(size=(self.B, self.Q), avg=0.0, std=1.0, ndim=None)
    self.xi.name = 'xi'
    self.alpha.name = 'alpha'
    self.beta.name = 'beta'

    self.sample_xi = th.function([], self.xi)
    self.sample_alpha = th.function([], self.alpha)
    self.sample_beta = th.function([], self.beta)

    self.sample_batchStream = th.function([], self.batchStream)
    self.sample_padStream = th.function([], self.padStream)

    self.getCurrentBatch = th.function([], self.currentBatch, no_default_updates=True)

    # Compute parameters of q(X)
    if self.encoderType_qX == 'FreeForm1' or self.encoderType_qX == 'FreeForm2':
        # Have a normal variational distribution over location of latent co-ordinates
        self.phi_full = sharedZeroMatrix(self.N, self.R, 'phi_full')
        self.phi = self.phi_full[self.currentBatch, :]
        self.phi.name = 'phi'

        if encoderType_qX == 'FreeForm1':
            self.Phi_full_sqrt = sharedZeroMatrix(self.N, self.N, 'Phi_full_sqrt')
            Phi_batch_sqrt = self.Phi_full_sqrt[self.currentBatch][:, self.currentBatch]
            Phi_batch_sqrt.name = 'Phi_batch_sqrt'
            self.Phi = dot(Phi_batch_sqrt, Phi_batch_sqrt.T, 'Phi')
            self.cPhi, _, self.logDetPhi = cholInvLogDet(self.Phi, self.B, 0)
            self.qX_vars = [self.Phi_full_sqrt, self.phi_full]
        else:
            self.Phi_full_logdiag = sharedZeroArray(self.N, 'Phi_full_logdiag')
            Phi_batch_logdiag = self.Phi_full_logdiag[self.currentBatch]
            Phi_batch_logdiag.name = 'Phi_batch_logdiag'
            self.Phi, self.cPhi, _, self.logDetPhi \
                = diagCholInvLogDet_fromLogDiag(Phi_batch_logdiag, 'Phi')
            self.qX_vars = [self.Phi_full_logdiag, self.phi_full]

    elif self.encoderType_qX == 'MLP':
        # Auto encode
        self.W1_qX = sharedZeroMatrix(self.H, self.P, 'W1_qX')
        self.W2_qX = sharedZeroMatrix(self.R, self.H, 'W2_qX')
        self.W3_qX = sharedZeroMatrix(1, self.H, 'W3_qX')
        self.b1_qX = sharedZeroVector(self.H, 'b1_qX', broadcastable=(False, True))
        self.b2_qX = sharedZeroVector(self.R, 'b2_qX', broadcastable=(False, True))
        self.b3_qX = sharedZeroVector(1, 'b3_qX', broadcastable=(False, True))
        # [HxB] = softplus( [HxP] . [BxP]^T + repmat([Hx1],[1,B]) )
        h_qX = softplus(plus(dot(self.W1_qX, self.y_miniBatch.T), self.b1_qX), 'h_qX')
        # [RxB] = sigmoid( [RxH] . [HxB] + repmat([Rx1],[1,B]) )
        mu_qX = plus(dot(self.W2_qX, h_qX), self.b2_qX, 'mu_qX')
        # [1xB] = 0.5 * ( [1xH] . [HxB] + repmat([1x1],[1,B]) )
        log_sigma_qX = mul(0.5, plus(dot(self.W3_qX, h_qX), self.b3_qX), 'log_sigma_qX')

        self.phi = mu_qX.T  # [BxR]
        self.Phi, self.cPhi, self.iPhi, self.logDetPhi \
            = diagCholInvLogDet_fromLogDiag(log_sigma_qX, 'Phi')
        self.qX_vars = [self.W1_qX, self.W2_qX, self.W3_qX, self.b1_qX, self.b2_qX, self.b3_qX]

    elif self.encoderType_qX == 'Kernel':
        # Draw the latent coordinates from a GP with data co-ordinates
        self.Phi = kfactory.kernel(self.y_miniBatch, None, self.log_gamma, 'Phi')
        self.phi = sharedZeroMatrix(self.B, self.R, 'phi')
        (self.cPhi, self.iPhi, self.logDetPhi) = cholInvLogDet(self.Phi, self.B, self.jitter)
        self.qX_vars = [self.log_gamma]

    else:
        raise RuntimeError('Unrecognised encoding for q(X): ' + self.encoderType_qX)

    # Variational distribution q(u)
    self.kappa = sharedZeroMatrix(self.M, self.Q, 'kappa')
    self.Kappa_sqrt = sharedZeroMatrix(self.M, self.M, 'Kappa_sqrt')
    self.Kappa = dot(self.Kappa_sqrt, self.Kappa_sqrt.T, 'Kappa')
    (self.cKappa, self.iKappa, self.logDetKappa) = cholInvLogDet(self.Kappa, self.M, 0)
    self.qu_vars = [self.Kappa_sqrt, self.kappa]

    # Calculate latent co-ordinates Xf
    # [BxR] = [BxR] + [BxB] . [BxR]
    self.Xz = plus(self.phi, dot(self.cPhi, self.xi), 'Xf')
    # Inducing points co-ordinates
    self.Xu = sharedZeroMatrix(self.M, self.R, 'Xu')

    # Kernels
    self.Kzz = kfactory.kernel(self.Xz, None, self.log_theta, 'Kff')
    self.Kuu = kfactory.kernel(self.Xu, None, self.log_theta, 'Kuu')
    self.Kzu = kfactory.kernel(self.Xz, self.Xu, self.log_theta, 'Kfu')
    self.cKuu, self.iKuu, self.logDetKuu = cholInvLogDet(self.Kuu, self.M, self.jitter)

    # Variational distribution
    # A has dims [BxM] = [BxM] . [MxM]
    self.A = dot(self.Kzu, self.iKuu, 'A')
    # C is the covariance of the conditional distribution q(z|u,Xf)
    self.C = minus(self.Kzz, dot(self.A, self.Kzu.T), 'C')
    self.cC, self.iC, self.logDetC = cholInvLogDet(self.C, self.B, self.jitter)

    # Sample u_q from q(u_q) = N(u_q; kappa_q, Kappa) [MxQ]
    self.u = plus(self.kappa, dot(self.cKappa, self.alpha), 'u')
    # compute mean of z
    # [BxQ] = [BxM] . [MxQ]
    self.mu = dot(self.A, self.u, 'mu')
    # Sample z from q(z|u,X) = N(mu_q, C)  [BxQ]
    self.z = plus(self.mu, dot(self.cC, self.beta), 'z')

    self.qz_vars = [self.log_theta]

    self.iUpsilon = plus(self.iKappa, dot(self.A.T, dot(self.iC, self.A)), 'iUpsilon')
    _, self.Upsilon, self.negLogDetUpsilon = cholInvLogDet(self.iUpsilon, self.M, self.jitter)

    if self.encoderType_rX == 'MLP':
        self.W1_rX = sharedZeroMatrix(self.H, self.Q + self.P, 'W1_rX')
        self.W2_rX = sharedZeroMatrix(self.R, self.H, 'W2_rX')
        self.W3_rX = sharedZeroMatrix(self.R, self.H, 'W3_rX')
        self.b1_rX = sharedZeroVector(self.H, 'b1_rX', broadcastable=(False, True))
        self.b2_rX = sharedZeroVector(self.R, 'b2_rX', broadcastable=(False, True))
        self.b3_rX = sharedZeroVector(self.R, 'b3_rX', broadcastable=(False, True))
        # [HxB] = softplus( [Hx(Q+P)] . [(Q+P)xB] + repmat([Hx1], [1,B]) )
        h_rX = softplus(plus(dot(self.W1_rX, T.concatenate((self.z.T, self.y_miniBatch.T))), self.b1_rX), 'h_rX')
        # [RxB] = softplus( [RxH] . [HxB] + repmat([Rx1], [1,B]) )
        mu_rX = plus(dot(self.W2_rX, h_rX), self.b2_rX, 'mu_rX')
        # [RxB] = 0.5*( [RxH] . [HxB] + repmat([Rx1], [1,B]) )
        log_sigma_rX = mul(0.5, plus(dot(self.W3_rX, h_rX), self.b3_rX), 'log_sigma_rX')

        self.tau = mu_rX.T
        # Diagonal optimisation of Tau
        self.Tau_isDiagonal = True
        self.Tau = T.reshape(log_sigma_rX, [self.B * self.R, 1])
        self.logDetTau = T.sum(log_sigma_rX)
        self.Tau.name = 'Tau'
        self.logDetTau.name = 'logDetTau'
        self.rX_vars = [self.W1_rX, self.W2_rX, self.W3_rX, self.b1_rX, self.b2_rX, self.b3_rX]

    elif self.encoderType_rX == 'Kernel':
        self.tau = sharedZeroMatrix(self.B, self.R, 'tau')
        # Tau_r [BxB] = kernel( [[BxQ]^T,[BxP]^T].T )
        Tau_r = kfactory.kernel(T.concatenate((self.z.T, self.y_miniBatch.T)).T, None, self.log_omega, 'Tau_r')
        (cTau_r, iTau_r, logDetTau_r) = cholInvLogDet(Tau_r, self.B, self.jitter)
        # self.Tau = slinalg.kron(T.eye(self.R), Tau_r)
        self.cTau = slinalg.kron(cTau_r, T.eye(self.R))
        self.iTau = slinalg.kron(iTau_r, T.eye(self.R))
        self.logDetTau = logDetTau_r * self.R
        self.tau.name = 'tau'
        # self.Tau.name = 'Tau'
        self.cTau.name = 'cTau'
        self.iTau.name = 'iTau'
        self.logDetTau.name = 'logDetTau'
        self.Tau_isDiagonal = False
        self.rX_vars = [self.log_omega]

    else:
        raise RuntimeError('Unrecognised encoding for r(X|z)')

    # Gradient variables - should be all the th.shared variables
    # We always want to optimise these variables
    if self.Xu_optimise:
        self.gradientVariables = [self.Xu]
    else:
        self.gradientVariables = []
    self.gradientVariables.extend(self.qu_vars)
    self.gradientVariables.extend(self.qz_vars)
    self.gradientVariables.extend(self.qX_vars)
    self.gradientVariables.extend(self.rX_vars)

    self.lowerBounds = []

    self.condKappa = myCond()(self.Kappa)
    self.condKappa.name = 'condKappa'
    self.Kappa_conditionNumber = th.function([], self.condKappa, no_default_updates=True)

    self.condKuu = myCond()(self.Kuu)
    self.condKuu.name = 'condKuu'
    self.Kuu_conditionNumber = th.function([], self.condKuu, no_default_updates=True)

    self.condC = myCond()(self.C)
    self.condC.name = 'condC'
    self.C_conditionNumber = th.function([], self.condC, no_default_updates=True)

    self.condUpsilon = myCond()(self.Upsilon)
    self.condUpsilon.name = 'condUpsilon'
    self.Upsilon_conditionNumber = th.function([], self.condUpsilon, no_default_updates=True)

    self.Xz_get_value = th.function([], self.Xz, no_default_updates=True)