def _M_step(self):
    C, N, X = self.C, self.N, self.X
    denoms = np.sum(self.Q, axis=0)

    # update cluster priors
    self.pi = denoms / N

    # update cluster means
    nums_mu = [np.dot(self.Q[:, c], X) for c in range(C)]
    for ix, (num, den) in enumerate(zip(nums_mu, denoms)):
        self.mu[ix, :] = num / den if den > 0 else np.zeros_like(num)

    # update cluster covariances
    for c in range(C):
        mu_c = self.mu[c, :]
        n_c = denoms[c]

        outer = np.zeros((self.d, self.d))
        for i in range(N):
            wic = self.Q[i, c]
            xi = self.X[i, :]
            outer += wic * np.outer(xi - mu_c, xi - mu_c)

        outer = outer / n_c if n_c > 0 else outer
        self.sigma[c, :, :] = outer

    assert_allclose(np.sum(self.pi), 1, err_msg="{}".format(np.sum(self.pi)))
def loss(y, y_pred, t_mean, t_log_var):
    """
    Variational lower bound for a Bernoulli VAE.

    Parameters
    ----------
    y : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, N)`
        The original images.
    y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, N)`
        The VAE reconstruction of the images.
    t_mean : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, T)`
        Mean of the variational distribution :math:`q(t \mid x)`.
    t_log_var : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, T)`
        Log of the variance vector of the variational distribution
        :math:`q(t \mid x)`.

    Returns
    -------
    loss : float
        The VLB, averaged across the batch.
    """
    # prevent nan on log(0)
    eps = 2.220446049250313e-16
    y_pred = np.clip(y_pred, eps, 1 - eps)

    # reconstruction loss: binary cross-entropy
    rec_loss = -np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred), axis=1)

    # KL divergence between the variational distribution q and the prior p,
    # a unit gaussian
    kl_loss = -0.5 * np.sum(1 + t_log_var - t_mean ** 2 - np.exp(t_log_var), axis=1)
    loss = np.mean(kl_loss + rec_loss)
    return loss
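
# Illustrative usage sketch (not part of the library): call the `loss`
# function above on toy data. The shapes -- n_ex=4 examples, N=8 pixels,
# T=2 latent dimensions -- are made up; any consistent shapes would do.
import numpy as np

_rng = np.random.RandomState(0)
_y = _rng.randint(0, 2, size=(4, 8)).astype(float)  # toy binary "images"
_y_pred = _rng.uniform(0.05, 0.95, size=(4, 8))      # reconstructions in (0, 1)
_t_mean = _rng.randn(4, 2)                           # variational means
_t_log_var = _rng.randn(4, 2)                        # variational log-variances

print(loss(_y, _y_pred, _t_mean, _t_log_var))        # scalar VLB, averaged over the batch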
def conv2D_naive(X, W, stride, pad, dilation=0):
    """
    A slow but more straightforward implementation of a 2D "convolution"
    (technically, cross-correlation) of input `X` with a collection of
    kernels `W`.

    Notes
    -----
    This implementation uses ``for`` loops and direct indexing to perform the
    convolution. As a result, it is slower than the vectorized :func:`conv2D`
    function that relies on the :func:`col2im` and :func:`im2col`
    transformations.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
        Input volume.
    W : :py:class:`ndarray <numpy.ndarray>` of shape `(kernel_rows, kernel_cols, in_ch, out_ch)`
        The volume of convolution weights/kernels.
    stride : int
        The stride of each convolution kernel.
    pad : tuple, int, or 'same'
        The padding amount. If 'same', add padding to ensure that the output
        of a 2D convolution with a kernel of `kernel_shape` and stride
        `stride` produces an output volume of the same dimensions as the
        input. If 2-tuple, specifies the number of padding rows and columns
        to add *on both sides* of the rows/columns in `X`. If 4-tuple,
        specifies the number of rows/columns to add to the top, bottom, left,
        and right of the input volume.
    dilation : int
        Number of pixels inserted between kernel elements. Default is 0.

    Returns
    -------
    Z : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)`
        The convolution of `X` with `W`.
    """
    s, d = stride, dilation
    X_pad, p = pad2D(X, pad, W.shape[:2], stride=s, dilation=d)

    pr1, pr2, pc1, pc2 = p
    fr, fc, in_ch, out_ch = W.shape
    n_ex, in_rows, in_cols, in_ch = X.shape

    # update effective filter shape based on dilation factor
    fr, fc = fr * (d + 1) - d, fc * (d + 1) - d

    out_rows = int((in_rows + pr1 + pr2 - fr) / s + 1)
    out_cols = int((in_cols + pc1 + pc2 - fc) / s + 1)

    Z = np.zeros((n_ex, out_rows, out_cols, out_ch))
    for m in range(n_ex):
        for c in range(out_ch):
            for i in range(out_rows):
                for j in range(out_cols):
                    i0, i1 = i * s, (i * s) + fr
                    j0, j1 = j * s, (j * s) + fc

                    window = X_pad[m, i0:i1:(d + 1), j0:j1:(d + 1), :]
                    Z[m, i, j, c] = np.sum(window * W[:, :, :, c])
    return Z
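
# A small sketch (not from the library) of the output-shape arithmetic used in
# `conv2D_naive` above. The padding values are hypothetical, symmetric
# "same"-style padding for a 3x3 kernel at stride 1.
_in_rows, _in_cols = 32, 32        # input spatial dims
_fr, _fc = 3, 3                    # kernel rows/cols
_s, _d = 1, 0                      # stride, dilation
_pr1 = _pr2 = _pc1 = _pc2 = 1      # padding on each side

# effective kernel size after inserting `d` zeros between kernel elements
_fr_eff, _fc_eff = _fr * (_d + 1) - _d, _fc * (_d + 1) - _d

_out_rows = (_in_rows + _pr1 + _pr2 - _fr_eff) // _s + 1
_out_cols = (_in_cols + _pc1 + _pc2 - _fc_eff) // _s + 1
print(_out_rows, _out_cols)        # 32 32 -> same spatial size as the input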
def loss(y, y_pred):
    """
    Compute the cross-entropy (log) loss.

    Notes
    -----
    This method returns the sum (not the average!) of the losses for each
    sample.

    Parameters
    ----------
    y : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        Class labels (one-hot with `m` possible classes) for each of `n`
        examples.
    y_pred : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        Probabilities of each of `m` classes for the `n` examples in the
        batch.

    Returns
    -------
    loss : float
        The sum of the cross-entropy across classes and examples.
    """
    is_binary(y)
    is_stochastic(y_pred)

    # prevent taking the log of 0
    eps = 2.220446049250313e-16

    # each example is associated with a single class; sum the negative log
    # probability of the correct label over all samples in the batch.
    # observe that we are taking advantage of the fact that y is one-hot
    # encoded
    cross_entropy = -np.sum(y * np.log(y_pred + eps))
    return cross_entropy
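
# Sketch (not a library test): because `y` is one-hot, multiplying by `y` and
# summing picks out the log-probability of the true class for each example.
# The probabilities below are made up.
import numpy as np

_y = np.array([[1., 0., 0.], [0., 0., 1.]])              # true classes: 0 and 2
_y_hat = np.array([[0.7, 0.2, 0.1], [0.1, 0.3, 0.6]])    # predicted class probabilities
_manual = -(np.log(0.7) + np.log(0.6))
print(_manual, -np.sum(_y * np.log(_y_hat)))             # both print the same value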
def minkowski(x, y, p):
    """
    Compute the Minkowski-`p` distance between two real vectors.

    Notes
    -----
    The Minkowski-`p` distance between two vectors **x** and **y** is

    .. math::

        d(\mathbf{x}, \mathbf{y}) = \left( \sum_i |x_i - y_i|^p \\right)^{1/p}

    Parameters
    ----------
    x,y : :py:class:`ndarray <numpy.ndarray>` s of shape `(N,)`
        The two vectors to compute the distance between
    p : float >= 1
        The parameter of the distance function. When `p = 1`, this is the
        `L1` distance, and when `p = 2`, this is the `L2` distance. For
        `p < 1`, Minkowski-`p` does not satisfy the triangle inequality and
        hence is not a valid distance metric.

    Returns
    -------
    d : float
        The Minkowski-`p` distance between **x** and **y**.
    """
    return np.sum(np.abs(x - y) ** p) ** (1 / p)
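
# Sanity-check sketch: for p=1 the Minkowski distance reduces to the L1
# (Manhattan) distance and for p=2 to the L2 (Euclidean) distance, using plain
# numpy expressions as the reference. The vectors are arbitrary.
import numpy as np

_rng = np.random.RandomState(0)
_x, _y = _rng.randn(10), _rng.randn(10)

assert np.isclose(minkowski(_x, _y, 1), np.sum(np.abs(_x - _y)))
assert np.isclose(minkowski(_x, _y, 2), np.sqrt(np.sum((_x - _y) ** 2)))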
def _p_decreasing(self, loss_history, i):
    """
    Compute the probability that the slope of the OLS fit to the loss
    history is negative.

    Parameters
    ----------
    loss_history : numpy array of shape (N,)
        The sequence of loss values for the previous `N` minibatches.
    i : int
        Compute P(Slope < 0) beginning at index i in `history`.

    Returns
    -------
    p_decreasing : float
        The probability that the slope of the OLS fit to loss_history is
        less than or equal to 0.
    """
    loss = loss_history[i:]
    N = len(loss)

    # perform OLS on the loss entries to calc the slope mean
    X = np.c_[np.ones(N), np.arange(i, len(loss_history))]
    intercept, s_mean = np.linalg.inv(X.T @ X) @ X.T @ loss
    loss_pred = s_mean * X[:, 1] + intercept

    # compute the variance of our loss predictions and use this to compute
    # the (unbiased) estimate of the slope variance
    loss_var = 1 / (N - 2) * np.sum((loss - loss_pred) ** 2)
    s_var = (12 * loss_var) / (N ** 3 - N)

    # compute the probability that a random sample from a Gaussian
    # parameterized by s_mean and s_var is less than or equal to 0
    p_decreasing = gaussian_cdf(0, s_mean, s_var)
    return p_decreasing
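
# Stand-alone sketch of the idea above (not the scheduler method itself),
# using scipy.stats.norm.cdf in place of the module's `gaussian_cdf` helper:
# fit an OLS line to a toy loss history and compute P(slope <= 0) under the
# slope's sampling distribution. The loss values are made up.
import numpy as np
from scipy.stats import norm

_loss_hist = np.array([1.0, 0.9, 0.85, 0.8, 0.82, 0.75, 0.7])
_N = len(_loss_hist)
_X = np.c_[np.ones(_N), np.arange(_N)]
_intercept, _slope = np.linalg.inv(_X.T @ _X) @ _X.T @ _loss_hist
_resid_var = np.sum((_loss_hist - (_slope * _X[:, 1] + _intercept)) ** 2) / (_N - 2)
_slope_var = 12 * _resid_var / (_N ** 3 - _N)
print(norm.cdf(0, loc=_slope, scale=np.sqrt(_slope_var)))  # close to 1 => loss is decreasing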
def _loss(self, X, target, neg_samples):
    """Actual computation of NCE loss"""
    fstr = "X must have shape (n_ex, n_c, n_in), but got {} dims instead"
    assert X.ndim == 3, fstr.format(X.ndim)

    W = self.parameters["W"]
    b = self.parameters["b"]

    # sample negative samples from the noise distribution
    if neg_samples is None:
        neg_samples = self.noise_sampler(self.num_negative_samples)
    assert len(neg_samples) == self.num_negative_samples

    # get the probability of the negative sample class and the target
    # class under the noise distribution
    p_neg_samples = self.noise_sampler.probs[neg_samples]
    p_target = np.atleast_2d(self.noise_sampler.probs[target])

    # save the noise samples for debugging
    noise_samples = (neg_samples, p_target, p_neg_samples)

    # compute the logit for the negative samples and target
    Z_target = X @ W[target].T + b[0, target]
    Z_neg = X @ W[neg_samples].T + b[0, neg_samples]

    # subtract the log probability of each label under the noise dist
    if self.subtract_log_label_prob:
        n, m = Z_target.shape[0], Z_neg.shape[0]
        Z_target[range(n), ...] -= np.log(p_target)
        Z_neg[range(m), ...] -= np.log(p_neg_samples)

    # only retain the probability of the target under its associated
    # minibatch example
    aa, _, cc = Z_target.shape
    Z_target = Z_target[range(aa), :, range(cc)][..., None]

    # p_target = (n_ex, n_c, 1)
    # p_neg = (n_ex, n_c, n_samples)
    pred_p_target = self.act_fn(Z_target)
    pred_p_neg = self.act_fn(Z_neg)

    # if we're in evaluation mode, ignore the negative samples - just
    # return the binary cross entropy on the targets
    y_pred = pred_p_target
    if self.trainable:
        # (n_ex, n_c, 1 + n_samples) (target is first column)
        y_pred = np.concatenate((y_pred, pred_p_neg), axis=-1)

    n_targets = 1
    y_true = np.zeros_like(y_pred)
    y_true[..., :n_targets] = 1

    # binary cross entropy
    eps = 2.220446049250313e-16
    np.clip(y_pred, eps, 1 - eps, y_pred)
    loss = -np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return loss, Z_target, Z_neg, y_pred, y_true, noise_samples
def _E_step(self):
    for i in range(self.N):
        x_i = self.X[i, :]

        denom_vals = []
        for c in range(self.C):
            pi_c = self.pi[c]
            mu_c = self.mu[c, :]
            sigma_c = self.sigma[c, :, :]

            log_pi_c = np.log(pi_c)
            log_p_x_i = log_gaussian_pdf(x_i, mu_c, sigma_c)

            # log N(X_i | mu_c, Sigma_c) + log pi_c
            denom_vals.append(log_p_x_i + log_pi_c)

        # log \sum_c exp{ log N(X_i | mu_c, Sigma_c) + log pi_c }
        log_denom = logsumexp(denom_vals)
        q_i = np.exp([num - log_denom for num in denom_vals])
        assert_allclose(np.sum(q_i), 1, err_msg="{}".format(np.sum(q_i)))

        self.Q[i, :] = q_i
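
# Stand-alone sketch of the per-point E-step computation above, using scipy's
# multivariate normal logpdf and logsumexp in place of the module helpers
# `log_gaussian_pdf` and `logsumexp`. All values (point, priors, means,
# covariances) are made up.
import numpy as np
from scipy.special import logsumexp as _lse
from scipy.stats import multivariate_normal as _mvn

_x_i = np.array([0.5, -0.2])
_pi = np.array([0.3, 0.7])
_mu = np.array([[0.0, 0.0], [1.0, -1.0]])
_sigma = np.array([np.eye(2), np.eye(2)])

_log_joint = np.array(
    [np.log(_pi[c]) + _mvn.logpdf(_x_i, _mu[c], _sigma[c]) for c in range(2)]
)
_q_i = np.exp(_log_joint - _lse(_log_joint))
print(_q_i, _q_i.sum())  # responsibilities for the two clusters; they sum to 1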
def _maximize_gamma(self):
    """
    Optimize variational parameter gamma

        γ_t = α_t + \sum_{n=1}^{N_d} ϕ_{t, n}
    """
    D = self.D
    phi = self.phi
    alpha = self.alpha

    gamma = np.tile(alpha, (D, 1)) + np.array(
        list(map(lambda x: np.sum(x, axis=0), phi))
    )
    return gamma
def test_HMM():
    np.random.seed(12345)
    np.set_printoptions(precision=5, suppress=True)

    P = default_hmm()
    ls, obs = P["latent_states"], P["obs_types"]

    # generate a new sequence
    O = generate_training_data(P, n_steps=30, n_examples=25)

    tol = 1e-5
    n_runs = 5
    best, best_theirs = (-np.inf, []), (-np.inf, [])
    for _ in range(n_runs):
        hmm = MultinomialHMM()
        A_, B_, pi_ = hmm.fit(O, ls, obs, tol=tol, verbose=True)

        theirs = MHMM(
            tol=tol,
            verbose=True,
            n_iter=int(1e9),
            transmat_prior=1,
            startprob_prior=1,
            algorithm="viterbi",
            n_components=len(ls),
        )

        O_flat = O.reshape(1, -1).flatten().reshape(-1, 1)
        theirs = theirs.fit(O_flat, lengths=[O.shape[1]] * O.shape[0])

        hmm2 = MultinomialHMM(A=A_, B=B_, pi=pi_)
        like = np.sum([hmm2.log_likelihood(obs) for obs in O])
        like_theirs = theirs.score(O_flat, lengths=[O.shape[1]] * O.shape[0])

        if like > best[0]:
            best = (like, {"A": A_, "B": B_, "pi": pi_})

        if like_theirs > best_theirs[0]:
            best_theirs = (
                like_theirs,
                {
                    "A": theirs.transmat_,
                    "B": theirs.emissionprob_,
                    "pi": theirs.startprob_,
                },
            )

    print("Final log likelihood of sequence: {:.5f}".format(best[0]))
    print("Final log likelihood of sequence (theirs): {:.5f}".format(best_theirs[0]))
    plot_matrices(P, best, best_theirs)
def _maximize_beta(self):
    """
    Optimize model parameter beta

        β_{t, n} ∝ \sum_{d=1}^D \sum_{i=1}^{N_d} ϕ_{d, t, n} [i = n]
    """
    T = self.T
    V = self.V
    phi = self.phi
    beta = self.beta
    corpus = self.corpus

    for n in range(V):
        # Construct binary mask [i == n] to be the same shape as phi
        mask = [np.tile((doc == n), (T, 1)).T for doc in corpus]
        beta[n, :] = np.sum(
            np.array(list(map(lambda x: np.sum(x, axis=0), phi * mask))), axis=0
        )

    # Normalize over words
    for t in range(T):
        beta[:, t] = beta[:, t] / np.sum(beta[:, t])

    return beta
def VLB(self):
    """
    Return the variational lower bound associated with the current model
    parameters.
    """
    phi = self.phi
    alpha = self.alpha
    beta = self.beta
    gamma = self.gamma
    corpus = self.corpus

    D = self.D
    T = self.T
    N = self.N

    a, b, c, _d = 0, 0, 0, 0
    for d in range(D):
        a += (
            gammaln(np.sum(alpha))
            - np.sum(gammaln(alpha))
            + np.sum([(alpha[t] - 1) * dg(gamma, d, t) for t in range(T)])
        )

        _d += (
            gammaln(np.sum(gamma[d, :]))
            - np.sum(gammaln(gamma[d, :]))
            + np.sum([(gamma[d, t] - 1) * dg(gamma, d, t) for t in range(T)])
        )

        for n in range(N[d]):
            w_n = int(corpus[d][n])

            b += np.sum([phi[d][n, t] * dg(gamma, d, t) for t in range(T)])
            c += np.sum([phi[d][n, t] * np.log(beta[w_n, t]) for t in range(T)])
            _d += np.sum([phi[d][n, t] * np.log(phi[d][n, t]) for t in range(T)])

    return a + b + c - _d
def predict(self, X):
    """
    Generate predictions for the targets associated with the rows in `X`.

    Parameters
    ----------
    X : numpy array of shape `(N', M')`
        An array of `N'` examples to generate predictions on.

    Returns
    -------
    y : numpy array of shape `(N', \*)`
        Predicted targets for the `N'` rows in `X`.
    """
    predictions = []
    H = self.hyperparameters
    for x in X:
        pred = None
        nearest = self._ball_tree.nearest_neighbors(H["k"], x)
        targets = [n.val.item() for n in nearest]

        if H["classifier"]:
            if H["weights"] == "uniform":
                pred = Counter(targets).most_common(1)[0][0]
            elif H["weights"] == "distance":
                # pick the label whose inverse-distance-weighted vote is largest
                best_score = -np.inf
                for label in set(targets):
                    scores = [1 / n.distance for n in nearest if n.val == label]
                    if np.sum(scores) > best_score:
                        best_score = np.sum(scores)
                        pred = label
        else:
            if H["weights"] == "uniform":
                pred = np.mean(targets)
            elif H["weights"] == "distance":
                weights = [1 / n.distance for n in nearest]
                pred = np.average(targets, weights=weights)

        predictions.append(pred)
    return np.array(predictions)
def _maximize_phi(self):
    """
    Optimize variational parameter phi

        ϕ_{t, n} ∝ β_{t, w_n} e^( Ψ(γ_t) )
    """
    D = self.D
    N = self.N
    T = self.T

    phi = self.phi
    beta = self.beta
    gamma = self.gamma
    corpus = self.corpus

    for d in range(D):
        for n in range(N[d]):
            for t in range(T):
                w_n = int(corpus[d][n])
                phi[d][n, t] = beta[w_n, t] * np.exp(dg(gamma, d, t))

            # Normalize over topics
            phi[d][n, :] = phi[d][n, :] / np.sum(phi[d][n, :])
    return phi
def euclidean(x, y):
    """
    Compute the Euclidean (`L2`) distance between two real vectors.

    Notes
    -----
    The Euclidean distance between two vectors **x** and **y** is

    .. math::

        d(\mathbf{x}, \mathbf{y}) = \sqrt{ \sum_i (x_i - y_i)^2 }

    Parameters
    ----------
    x,y : :py:class:`ndarray <numpy.ndarray>` s of shape `(N,)`
        The two vectors to compute the distance between

    Returns
    -------
    d : float
        The L2 distance between **x** and **y**.
    """
    return np.sqrt(np.sum((x - y) ** 2))
def manhattan(x, y):
    """
    Compute the Manhattan (`L1`) distance between two real vectors.

    Notes
    -----
    The Manhattan distance between two vectors **x** and **y** is

    .. math::

        d(\mathbf{x}, \mathbf{y}) = \sum_i |x_i - y_i|

    Parameters
    ----------
    x,y : :py:class:`ndarray <numpy.ndarray>` s of shape `(N,)`
        The two vectors to compute the distance between

    Returns
    -------
    d : float
        The L1 distance between **x** and **y**.
    """
    return np.sum(np.abs(x - y))
def hamming(x, y):
    """
    Compute the Hamming distance between two integer-valued vectors.

    Notes
    -----
    The Hamming distance between two vectors **x** and **y** is

    .. math::

        d(\mathbf{x}, \mathbf{y}) = \\frac{1}{N} \sum_i \mathbb{1}_{x_i \\neq y_i}

    Parameters
    ----------
    x,y : :py:class:`ndarray <numpy.ndarray>` s of shape `(N,)`
        The two vectors to compute the distance between. Both vectors
        should be integer-valued.

    Returns
    -------
    d : float
        The Hamming distance between **x** and **y**.
    """
    return np.sum(x != y) / len(x)
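
# Tiny example (a sketch, not a test in the library): two length-4 integer
# vectors that differ in exactly one position have a normalized Hamming
# distance of 1/4.
import numpy as np

print(hamming(np.array([1, 0, 2, 3]), np.array([1, 0, 2, 1])))  # 0.25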
def _maximize_alpha(self, max_iters=1000, tol=0.1):
    """
    Optimize alpha using Blei's O(n) Newton-Raphson modification for a
    Hessian with special structure
    """
    D = self.D
    T = self.T

    alpha = self.alpha
    gamma = self.gamma

    for _ in range(max_iters):
        alpha_old = alpha

        # Calculate gradient
        g = D * (digamma(np.sum(alpha)) - digamma(alpha)) + np.sum(
            digamma(gamma) - np.tile(digamma(np.sum(gamma, axis=1)), (T, 1)).T,
            axis=0,
        )

        # Calculate Hessian diagonal component
        h = -D * polygamma(1, alpha)

        # Calculate Hessian constant component
        z = D * polygamma(1, np.sum(alpha))

        # Calculate constant
        c = np.sum(g / h) / (z ** (-1.0) + np.sum(h ** (-1.0)))

        # Update alpha
        alpha = alpha - (g - c) / h

        # Check convergence
        if np.sqrt(np.mean(np.square(alpha - alpha_old))) < tol:
            break

    return alpha
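
# Verification sketch of the O(T) Newton step used above: when the Hessian has
# the special form H = diag(h) + z * 1 1^T, the Sherman-Morrison identity gives
# (H^{-1} g)_i = (g_i - c) / h_i with c computed exactly as in the loop. The
# values of g, h, and z below are arbitrary.
import numpy as np

_g = np.array([0.5, -1.2, 0.3, 2.0])
_h = np.array([-3.0, -2.5, -4.0, -1.5])   # diagonal term (negative, like -D * polygamma)
_z = 2.0                                  # constant rank-one term

_H = np.diag(_h) + _z * np.ones((4, 4))
_c = np.sum(_g / _h) / (1.0 / _z + np.sum(1.0 / _h))

print(np.linalg.solve(_H, _g))            # full O(T^3) solve
print((_g - _c) / _h)                     # O(T) shortcut -- same values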
def fit(
    self, O, latent_state_types, observation_types, pi=None, tol=1e-5, verbose=False
):
    """
    Given an observation sequence `O` and the set of possible latent states,
    learn the MLE HMM parameters `A` and `B`.

    Notes
    -----
    Model fitting is done iteratively using the Baum-Welch/Forward-Backward
    algorithm, a special case of the EM algorithm. We begin with an initial
    estimate for the transition (`A`) and emission (`B`) matrices and then
    use these to derive better and better estimates by computing the forward
    probability for an observation and then dividing that probability mass
    among all the paths that contributed to it.

    Parameters
    ----------
    O : :py:class:`ndarray <numpy.ndarray>` of shape `(I, T)`
        The set of `I` training observations, each of length `T`.
    latent_state_types : list of length `N`
        The collection of valid latent states.
    observation_types : list of length `V`
        The collection of valid observation states.
    pi : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The prior probability of each latent state. If None, assume each
        latent state is equally likely a priori. Default is None.
    tol : float
        The tolerance value. If the difference in log likelihood between two
        epochs is less than this value, terminate training. Default is 1e-5.
    verbose : bool
        Print training stats after each epoch. Default is False.

    Returns
    -------
    A : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)`
        The estimated transition matrix.
    B : :py:class:`ndarray <numpy.ndarray>` of shape `(N, V)`
        The estimated emission matrix.
    pi : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The estimated prior probabilities of each latent state.
    """
    if O.ndim == 1:
        O = O.reshape(1, -1)

    # observations
    self.O = O

    # number of training examples (I) and their lengths (T)
    self.I, self.T = self.O.shape

    # number of types of observation
    self.V = len(observation_types)

    # number of latent state types
    self.N = len(latent_state_types)

    # Uniform initialization of prior over latent states
    self.pi = pi
    if self.pi is None:
        self.pi = np.ones(self.N)
        self.pi = self.pi / self.pi.sum()

    # Uniform initialization of A
    self.A = np.ones((self.N, self.N))
    self.A = self.A / self.A.sum(axis=1)[:, None]

    # Random initialization of B
    self.B = np.random.rand(self.N, self.V)
    self.B = self.B / self.B.sum(axis=1)[:, None]

    # iterate E and M steps until the convergence criterion is met
    step, delta = 0, np.inf
    ll_prev = np.sum(np.array([self.log_likelihood(o) for o in self.O]))

    while delta > tol:
        gamma, xi, phi = self._Estep()
        self.A, self.B, self.pi = self._Mstep(gamma, xi, phi)
        ll = np.sum(np.array([self.log_likelihood(o) for o in self.O]))
        delta = ll - ll_prev
        ll_prev = ll
        step += 1

        if verbose:
            fstr = "[Epoch {}] LL: {:.3f} Delta: {:.5f}"
            print(fstr.format(step, ll_prev, delta))

    return self.A, self.B, self.pi
def dg(gamma, d, t):
    """
    E[log X_t] where X_t ~ Dir
    """
    return digamma(gamma[d, t]) - digamma(np.sum(gamma[d, :]))
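
# Monte Carlo sanity check (illustrative only): for X ~ Dirichlet(gamma[d]),
# E[log X_t] = digamma(gamma[d, t]) - digamma(sum_t gamma[d, t]), which is the
# quantity `dg` returns. The concentration values below are arbitrary.
import numpy as np
from scipy.special import digamma as _digamma

_gamma = np.array([[2.0, 5.0, 3.0]])                      # one "document" row
_samples = np.random.dirichlet(_gamma[0], size=200000)
_mc = np.mean(np.log(_samples[:, 1]))
_analytic = _digamma(_gamma[0, 1]) - _digamma(_gamma[0].sum())
print(_mc, _analytic)  # should agree to roughly two decimal places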
def mel_spectrogram(
    x,
    window_duration=0.025,
    stride_duration=0.01,
    mean_normalize=True,
    window="hamming",
    n_filters=20,
    center=True,
    alpha=0.95,
    fs=44000,
):
    """
    Apply the Mel-filterbank to the power spectrum for a signal `x`.

    Notes
    -----
    The Mel spectrogram is the projection of the power spectrum of the framed
    and windowed signal onto the basis set provided by the Mel filterbank.

    Parameters
    ----------
    x : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A 1D signal consisting of N samples.
    window_duration : float
        The duration of each frame / window (in seconds). Default is 0.025.
    stride_duration : float
        The duration of the hop between consecutive windows (in seconds).
        Default is 0.01.
    mean_normalize : bool
        Whether to subtract the coefficient means from the final filter
        values to improve the signal-to-noise ratio. Default is True.
    window : {'hamming', 'hann', 'blackman_harris'}
        The windowing function to apply to the signal before FFT. Default is
        'hamming'.
    n_filters : int
        The number of mel filters to include in the filterbank. Default is
        20.
    center : bool
        Whether the `k` th frame of the signal should *begin* at index
        ``x[k * stride_len]`` (center = False) or be *centered* at
        ``x[k * stride_len]`` (center = True). Default is True.
    alpha : float in [0, 1)
        The coefficient for the preemphasis filter. A value of 0 corresponds
        to no filtering. Default is 0.95.
    fs : int
        The sample rate/frequency for the signal. Default is 44000.

    Returns
    -------
    filter_energies : :py:class:`ndarray <numpy.ndarray>` of shape `(G, n_filters)`
        The (possibly mean_normalized) power for each filter in the Mel
        filterbank (i.e., the Mel spectrogram). Rows correspond to frames,
        columns to filters.
    energy_per_frame : :py:class:`ndarray <numpy.ndarray>` of shape `(G,)`
        The total energy in each frame of the signal.
    """
    eps = np.finfo(float).eps
    window_fn = WindowInitializer()(window)

    stride = round(stride_duration * fs)
    frame_width = round(window_duration * fs)
    N = frame_width

    # add a preemphasis filter to the raw signal
    x = preemphasis(x, alpha)

    # convert signal to overlapping frames and apply a window function
    x = np.pad(x, N // 2, "reflect") if center else x
    frames = to_frames(x, frame_width, stride, fs)

    window = np.tile(window_fn(frame_width), (frames.shape[0], 1))
    frames = frames * window

    # compute the power spectrum
    power_spec = power_spectrum(frames)
    energy_per_frame = np.sum(power_spec, axis=1)
    energy_per_frame[energy_per_frame == 0] = eps

    # compute the power at each filter in the Mel filterbank
    fbank = mel_filterbank(N, n_filters=n_filters, fs=fs)
    filter_energies = power_spec @ fbank.T
    filter_energies -= np.mean(filter_energies, axis=0) if mean_normalize else 0
    filter_energies[filter_energies == 0] = eps
    return filter_energies, energy_per_frame
def marginal_log_likelihood(self, kernel_params=None):
    """
    Compute the log of the marginal likelihood (i.e., the log model
    evidence), :math:`p(y \mid X, \\text{kernel_params})`.

    Notes
    -----
    Under the GP regression model, the marginal likelihood is normally
    distributed:

    .. math::

        y \mid X, \\theta  \sim  \mathcal{N}(0, K + \\alpha I)

    Hence,

    .. math::

        \log p(y \mid X, \\theta) =
            -0.5 \log \det(K + \\alpha I) -
            0.5 y^\\top (K + \\alpha I)^{-1} y - \\frac{n}{2} \log 2 \pi

    where :math:`K = \\text{kernel}(X, X)`, :math:`\\theta` is the set of
    kernel parameters, and `n` is the number of dimensions in `K`.

    Parameters
    ----------
    kernel_params : dict
        Parameters for the kernel function. If None, calculate the marginal
        likelihood under the kernel parameters defined at model
        initialization. Default is None.

    Returns
    -------
    marginal_log_likelihood : float
        The log likelihood of the training targets given the kernel
        parameterized by `kernel_params` and the training inputs,
        marginalized over all functions `f`.
    """
    X = self.parameters["X"]
    y = self.parameters["y"]
    alpha = self.hyperparameters["alpha"]

    K = self.parameters["GP_cov"]
    if kernel_params is not None:
        # create a new kernel with parameters `kernel_params` and recalc
        # the GP covariance matrix
        summary_dict = self.kernel.summary_dict()
        summary_dict["parameters"].update(kernel_params)
        kernel = KernelInitializer(summary_dict)()
        K = kernel(X, X)

    # add isotropic noise to kernel diagonal
    K += np.eye(K.shape[0]) * alpha

    Kinv = inv(K)
    Klogdet = -0.5 * slogdet(K)[1]
    const = K.shape[0] / 2 * np.log(2 * np.pi)

    # handle both uni- and multidimensional target values
    if y.ndim == 1:
        y = y[:, np.newaxis]

    # sum over each dimension of y
    marginal_ll = np.sum(
        [Klogdet - 0.5 * np.dot(np.dot(_y.T, Kinv), _y) - const for _y in y.T]
    )
    return marginal_ll
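
# Stand-alone sketch of the closed-form expression in the docstring above
# (not the class method): evaluate
#     log p(y | X) = -0.5 log det(K + alpha*I) - 0.5 y^T (K + alpha*I)^{-1} y - (n/2) log(2*pi)
# for a toy, hypothetical RBF kernel built directly with numpy.
import numpy as np
from numpy.linalg import inv as _inv, slogdet as _slogdet

_rng = np.random.RandomState(0)
_X = _rng.randn(5, 1)
_y = _rng.randn(5)
_alpha = 1e-3

_K = np.exp(-0.5 * (_X - _X.T) ** 2) + _alpha * np.eye(5)   # RBF kernel + isotropic noise
_mll = (
    -0.5 * _slogdet(_K)[1]
    - 0.5 * _y @ _inv(_K) @ _y
    - 0.5 * len(_y) * np.log(2 * np.pi)
)
print(_mll)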