def _forward(self, Obs): """ Computes the forward probability trellis for an HMM parameterized by :math:`(A, B, \pi)`. Notes ----- The forward trellis (sometimes referred to as `alpha` in the HMM literature), is a 2D array where entry `i`, `j` represents the probability under the HMM of being in latent state `i` after seeing the first `j` observations: .. math:: \mathtt{forward[i,j]} = P(o_1,\ldots,o_j,q_j=i|A,B,\pi) Here :math:`q_j = i` indicates that the hidden state at time `j` is of type `i`. The DP step is:: forward[i,j] = sum_{s'=1}^N forward[s',j-1] * A[s',i] * B[i,o_j] = sum_{s'=1}^N P(o_1,\ldots,o_{j-1},q_{j-1}=s'|A,B,pi) * P(q_j=i|q_{j-1}=s') * P(o_j|q_j=i) In words, ``forward[i,j]`` is the weighted sum of the values computed on the previous timestep. The weight on each previous state value is the product of the probability of transitioning from that state to state `i` and the probability of emitting observation `j` in state `i`. Parameters ---------- Obs : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)` An observation sequence of length `T`. Returns ------- forward : :py:class:`ndarray <numpy.ndarray>` of shape `(N, T)` The forward trellis. """ eps = self.eps T = Obs.shape[0] # initialize the forward probability matrix forward = np.zeros((self.N, T)) ot = Obs[0] for s in range(self.N): forward[s, 0] = np.log(self.pi[s] + eps) + np.log(self.B[s, ot] + eps) for t in range(1, T): ot = Obs[t] for s in range(self.N): forward[s, t] = logsumexp( [ forward[s_, t - 1] + np.log(self.A[s_, s] + eps) + np.log(self.B[s, ot] + eps) for s_ in range(self.N) ] ) return forward
def _backward(self, Obs): """ Compute the backward probability trellis for an HMM parameterized by :math:`(A, B, \pi)`. Notes ----- The backward trellis (sometimes referred to as `beta` in the HMM literature), is a 2D array where entry `i`,`j` represents the probability of seeing the observations from time `j+1` onward given that the HMM is in state `i` at time `j` .. math:: \mathtt{backward[i,j]} = P(o_{j+1},o_{j+2},...,o_T|q_j=i,A,B,\pi) Here :math:`q_j = i` indicates that the hidden state at time `j` is of type `i`. The DP step is:: backward[i,j] = sum_{s'=1}^N backward[s',j+1] * A[i, s'] * B[s',o_{j+1}] = sum_{s'=1}^N P(o_{j+1},o_{j+2},...,o_T|q_j=i,A,B,pi) * P(q_{j+1}=s'|q_{j}=i) * P(o_{j+1}|q_{j+1}=s') In words, ``backward[i,j]`` is the weighted sum of the values computed on the following timestep. The weight on each state value from the `j+1`'th timestep is the product of the probability of transitioning from state i to that state and the probability of emitting observation `j+1` from that state. Parameters ---------- Obs : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)` A single observation sequence of length `T`. Returns ------- backward : :py:class:`ndarray <numpy.ndarray>` of shape `(N, T)` The backward trellis. """ eps = self.eps T = Obs.shape[0] # initialize the backward trellis backward = np.zeros((self.N, T)) for s in range(self.N): backward[s, T - 1] = 0 for t in reversed(range(T - 1)): ot1 = Obs[t + 1] for s in range(self.N): backward[s, t] = logsumexp( [ np.log(self.A[s, s_] + eps) + np.log(self.B[s_, ot1] + eps) + backward[s_, t + 1] for s_ in range(self.N) ] ) return backward
def _loss(self, X, target, neg_samples): """Actual computation of NCE loss""" fstr = "X must have shape (n_ex, n_c, n_in), but got {} dims instead" assert X.ndim == 3, fstr.format(X.ndim) W = self.parameters["W"] b = self.parameters["b"] # sample negative samples from the noise distribution if neg_samples is None: neg_samples = self.noise_sampler(self.num_negative_samples) assert len(neg_samples) == self.num_negative_samples # get the probability of the negative sample class and the target # class under the noise distribution p_neg_samples = self.noise_sampler.probs[neg_samples] p_target = np.atleast_2d(self.noise_sampler.probs[target]) # save the noise samples for debugging noise_samples = (neg_samples, p_target, p_neg_samples) # compute the logit for the negative samples and target Z_target = X @ W[target].T + b[0, target] Z_neg = X @ W[neg_samples].T + b[0, neg_samples] # subtract the log probability of each label under the noise dist if self.subtract_log_label_prob: n, m = Z_target.shape[0], Z_neg.shape[0] Z_target[range(n), ...] -= np.log(p_target) Z_neg[range(m), ...] -= np.log(p_neg_samples) # only retain the probability of the target under its associated # minibatch example aa, _, cc = Z_target.shape Z_target = Z_target[range(aa), :, range(cc)][..., None] # p_target = (n_ex, n_c, 1) # p_neg = (n_ex, n_c, n_samples) pred_p_target = self.act_fn(Z_target) pred_p_neg = self.act_fn(Z_neg) # if we're in evaluation mode, ignore the negative samples - just # return the binary cross entropy on the targets y_pred = pred_p_target if self.trainable: # (n_ex, n_c, 1 + n_samples) (target is first column) y_pred = np.concatenate((y_pred, pred_p_neg), axis=-1) n_targets = 1 y_true = np.zeros_like(y_pred) y_true[..., :n_targets] = 1 # binary cross entropy eps = 2.220446049250313e-16 np.clip(y_pred, eps, 1 - eps, y_pred) loss = -np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)) return loss, Z_target, Z_neg, y_pred, y_true, noise_samples
def loss(y, y_pred):
    """
    Summed cross-entropy (log loss) between one-hot labels and predictions.

    Notes
    -----
    The value returned is the total over the batch, not the mean.

    Parameters
    ----------
    y : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        One-hot class labels for `n` examples and `m` classes.
    y_pred : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        Predicted probability of each of the `m` classes for each of the
        `n` examples.

    Returns
    -------
    loss : float
        The cross-entropy summed across classes and examples.
    """
    # input checks are delegated to the helper predicates
    is_binary(y)
    is_stochastic(y_pred)

    # shift by machine epsilon so log(0) is never evaluated
    eps = 2.220446049250313e-16
    log_pred = np.log(y_pred + eps)

    # `y` is one-hot, so the product keeps only the log-probability assigned
    # to the correct class of each example
    return -np.sum(y * log_pred)
def logsumexp(log_probs, axis=None):
    """
    Numerically stable ``log(sum(exp(log_probs)))``.

    Redefines scipy.special.logsumexp,
    see: http://bayesjumping.net/log-sum-exp-trick/

    Parameters
    ----------
    log_probs : array-like
        Values in log space.
    axis : int, tuple of int, or None
        Axis or axes to reduce over. If None, reduce over every element.
        Default is None.

    Returns
    -------
    out : float or :py:class:`ndarray <numpy.ndarray>`
        A scalar when `axis` is None, otherwise an array with the reduced
        axes removed.
    """
    log_probs = np.asarray(log_probs)

    # shift by the max of each reduced slice (not the global max), so a slice
    # lying far below the global max does not underflow to log(0)
    _max = np.max(log_probs, axis=axis, keepdims=True)

    # a non-finite max would produce inf - inf = nan; shift by 0 instead so
    # all -inf input gives -inf and a +inf entry gives +inf
    shift = np.where(np.isfinite(_max), _max, 0)

    # log(0) = -inf is the correct answer when every term is -inf
    with np.errstate(divide="ignore"):
        log_sum = np.log(np.exp(log_probs - shift).sum(axis=axis))
    return np.squeeze(shift, axis=axis) + log_sum
def fn(self, z):
    r"""
    Evaluate the softplus activation on the elements of input `z`.

    .. math::

        \text{SoftPlus}(z_i) = \log(1 + e^{z_i})

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    out : float or :py:class:`ndarray <numpy.ndarray>`
        The softplus of each element of `z`.
    """
    # log(e^0 + e^z): same quantity as log(1 + e^z), but evaluated without
    # overflowing for large positive z (where a naive exp(z) becomes inf)
    return np.logaddexp(0, z)
def log_gaussian_pdf(x_i, mu, sigma):
    """
    Evaluate log N(x_i | mu, sigma), the log density of a multivariate
    Gaussian with mean `mu` and covariance `sigma` at the point `x_i`.
    """
    centered = x_i - mu

    # n * log(2 * pi), where n is the dimensionality
    norm_const = len(mu) * np.log(2 * np.pi)

    # log |det(sigma)|; the sign returned by slogdet is not used
    log_det = np.linalg.slogdet(sigma)[1]

    # (x - mu)^T sigma^{-1} (x - mu) via a linear solve
    mahalanobis = np.dot(centered, np.linalg.solve(sigma, centered))

    return -0.5 * (norm_const + log_det + mahalanobis)
def logsumexp(log_probs, axis=None):
    """
    Numerically stable ``log(sum(exp(log_probs)))``.

    Redefines scipy.special.logsumexp,
    see: http://bayesjumping.net/log-sum-exp-trick/

    Parameters
    ----------
    log_probs : array-like
        Values in log space.
    axis : int, tuple of int, or None
        Axis or axes to reduce over. If None, reduce over every element.
        Default is None.

    Returns
    -------
    out : float or :py:class:`ndarray <numpy.ndarray>`
        A Python float when the result is a scalar (e.g. `axis` is None),
        otherwise an array with the reduced axes removed.
    """
    log_probs = np.asarray(log_probs)

    # shift by the max of each reduced slice (not the global max), so a slice
    # lying far below the global max does not underflow to log(0)
    _max = np.max(log_probs, axis=axis, keepdims=True)

    # a non-finite max would produce inf - inf = nan; shift by 0 instead so
    # all -inf input gives -inf and a +inf entry gives +inf
    shift = np.where(np.isfinite(_max), _max, 0)

    # log(0) = -inf is the correct answer when every term is -inf
    with np.errstate(divide="ignore"):
        log_sum = np.log(np.exp(log_probs - shift).sum(axis=axis))
    result = np.squeeze(shift, axis=axis) + log_sum

    # scalar results are still returned as a plain float; reductions along an
    # axis are returned as arrays (float() cannot convert those)
    return float(result) if np.ndim(result) == 0 else result
def VLB(self):
    """
    Evaluate the variational lower bound for the current model parameters.

    The bound is accumulated over every document `d` and every word position
    `n` of that document as ``alpha_term + topic_term + word_term - q_term``,
    reading ``phi``, ``alpha``, ``beta``, ``gamma``, ``corpus``, ``D``, ``T``
    and ``N`` from the instance.

    Returns
    -------
    vlb : float
        The value of the lower bound.
    """
    phi, gamma = self.phi, self.gamma
    alpha, beta = self.alpha, self.beta
    corpus = self.corpus
    n_docs, n_topics, doc_lengths = self.D, self.T, self.N

    alpha_term = 0  # terms involving the prior parameter `alpha`
    topic_term = 0  # phi-weighted dg(gamma, d, t) terms
    word_term = 0  # phi-weighted log beta terms
    q_term = 0  # terms involving only `gamma` and `phi`; subtracted at the end

    for d in range(n_docs):
        gamma_d = gamma[d, :]
        phi_d = phi[d]

        alpha_term += (
            gammaln(np.sum(alpha))
            - np.sum(gammaln(alpha))
            + np.sum([(alpha[t] - 1) * dg(gamma, d, t) for t in range(n_topics)])
        )

        q_term += (
            gammaln(np.sum(gamma_d))
            - np.sum(gammaln(gamma_d))
            + np.sum([(gamma[d, t] - 1) * dg(gamma, d, t) for t in range(n_topics)])
        )

        for n in range(doc_lengths[d]):
            word = int(corpus[d][n])

            topic_term += np.sum(
                [phi_d[n, t] * dg(gamma, d, t) for t in range(n_topics)]
            )
            word_term += np.sum(
                [phi_d[n, t] * np.log(beta[word, t]) for t in range(n_topics)]
            )
            q_term += np.sum(
                [phi_d[n, t] * np.log(phi_d[n, t]) for t in range(n_topics)]
            )

    return alpha_term + topic_term + word_term - q_term
def _E_step(self):
    """
    Update the responsibilities ``self.Q`` in place.

    For every example `i`, row ``self.Q[i, :]`` is set to the normalized
    values of ``pi_c * N(X_i | mu_c, Sigma_c)`` over the `C` components. The
    normalization is carried out in log space and each row is asserted to
    sum to 1.
    """
    for i in range(self.N):
        x_i = self.X[i, :]

        # log N(X_i | mu_c, Sigma_c) + log pi_c, one entry per component
        log_joint = [
            log_gaussian_pdf(x_i, self.mu[c, :], self.sigma[c, :, :])
            + np.log(self.pi[c])
            for c in range(self.C)
        ]

        # log \sum_c exp{ log N(X_i | mu_c, Sigma_c) + log pi_c }
        log_evidence = logsumexp(log_joint)

        q_i = np.exp([term - log_evidence for term in log_joint])
        assert_allclose(np.sum(q_i), 1, err_msg="{}".format(np.sum(q_i)))

        self.Q[i, :] = q_i
def mfcc(
    x,
    fs=44000,
    n_mfccs=13,
    alpha=0.95,
    center=True,
    n_filters=20,
    window="hann",
    normalize=True,
    lifter_coef=22,
    stride_duration=0.01,
    window_duration=0.025,
    replace_intercept=True,
):
    r"""
    Compute the Mel-frequency cepstral coefficients (MFCC) for a signal.

    Notes
    -----
    Computing MFCC features proceeds in the following stages:

        1. Convert the signal into overlapping frames and apply a window fn
        2. Compute the power spectrum at each frame
        3. Apply the mel filterbank to the power spectra to get mel filterbank
           powers
        4. Take the logarithm of the mel filterbank powers at each frame
        5. Take the discrete cosine transform (DCT) of the log filterbank
           energies and retain only the first k coefficients to further reduce
           the dimensionality

    MFCCs were developed in the context of HMM-GMM automatic speech
    recognition (ASR) systems and can be used to provide a somewhat
    speaker/pitch invariant representation of phonemes.

    Parameters
    ----------
    x : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A 1D signal consisting of N samples
    fs : int
        The sample rate/frequency for the signal. Default is 44000.
    n_mfccs : int
        The number of cepstral coefficients to return (including the
        intercept coefficient). Default is 13.
    alpha : float in [0, 1)
        The preemphasis coefficient. A value of 0 corresponds to no
        filtering. Default is 0.95.
    center : bool
        Whether the kth frame of the signal should *begin* at index
        ``x[k * stride_len]`` (center = False) or be *centered* at
        ``x[k * stride_len]`` (center = True). Default is True.
    n_filters : int
        The number of filters to include in the Mel filterbank. Default is
        20.
    normalize : bool
        Whether to mean-normalize the MFCC values. Default is True.
    lifter_coef : int in :math:`[0, +\infty)`
        The cepstral filter coefficient. 0 corresponds to no filtering,
        larger values correspond to greater amounts of smoothing. Default is
        22.
    window : {'hamming', 'hann', 'blackman_harris'}
        The windowing function to apply to the signal before taking the DFT.
        Default is 'hann'.
    stride_duration : float
        The duration of the hop between consecutive windows (in seconds).
        Default is 0.01.
    window_duration : float
        The duration of each frame / window (in seconds). Default is 0.025.
    replace_intercept : bool
        Replace the first MFCC coefficient (the intercept term) with the log
        of the total frame energy instead. Default is True.

    Returns
    -------
    mfccs : :py:class:`ndarray <numpy.ndarray>` of shape `(G, C)`
        Matrix of Mel-frequency cepstral coefficients. Rows correspond to
        frames, columns to cepstral coefficients
    """
    # project the power spectrum of the framed + windowed signal onto the mel
    # scale; normalization is done later, on the cepstral coefficients
    filter_energies, frame_energies = mel_spectrogram(
        x=x,
        fs=fs,
        alpha=alpha,
        center=center,
        window=window,
        n_filters=n_filters,
        mean_normalize=False,
        window_duration=window_duration,
        stride_duration=stride_duration,
    )

    log_energies = 10 * np.log10(filter_energies)

    # a per-frame DCT compacts the log-mel energies into the leading
    # coefficients, so everything past the first `n_mfccs` is dropped
    cepstra = np.array([DCT(frame) for frame in log_energies])
    mfccs = cepstral_lifter(cepstra[:, :n_mfccs], D=lifter_coef)

    if normalize:
        mfccs -= np.mean(mfccs, axis=0)

    if replace_intercept:
        # the 0th coefficient says nothing about the spectral shape; the log
        # of the frame energy is more informative
        mfccs[:, 0] = np.log(frame_energies)

    return mfccs
def marginal_log_likelihood(self, kernel_params=None):
    r"""
    Compute the log of the marginal likelihood (i.e., the log model
    evidence), :math:`p(y \mid X, \text{kernel_params})`.

    Notes
    -----
    Under the GP regression model, the marginal likelihood is normally
    distributed:

    .. math::

        y | X, \theta \sim \mathcal{N}(0, K + \alpha I)

    Hence,

    .. math::

        \log p(y \mid X, \theta) =
            -0.5 \log \det(K + \alpha I) -
                0.5 y^\top (K + \alpha I)^{-1} y - \frac{n}{2} \log 2 \pi

    where :math:`K = \text{kernel}(X, X)`, :math:`\theta` is the set of
    kernel parameters, and `n` is the number of dimensions in `K`.

    The stored covariance ``self.parameters["GP_cov"]`` is not modified by
    this method.

    Parameters
    ----------
    kernel_params : dict
        Parameters for the kernel function. If None, calculate the
        marginal likelihood under the kernel parameters defined at model
        initialization. Default is None.

    Returns
    -------
    marginal_log_likelihood : float
        The log likelihood of the training targets given the kernel
        parameterized by `kernel_params` and the training inputs,
        marginalized over all functions `f`. For multidimensional targets
        the per-column log likelihoods are summed.
    """
    X = self.parameters["X"]
    y = self.parameters["y"]
    alpha = self.hyperparameters["alpha"]

    K = self.parameters["GP_cov"]
    if kernel_params is not None:
        # create a new kernel with parameters `kernel_params` and recalc
        # the GP covariance matrix
        summary_dict = self.kernel.summary_dict()
        summary_dict["parameters"].update(kernel_params)

        kernel = KernelInitializer(summary_dict)()
        K = kernel(X, X)

    n = K.shape[0]

    # add isotropic noise to the kernel diagonal. Build a new matrix here: an
    # in-place `K += ...` would write into the cached `GP_cov` and add
    # `alpha` to its diagonal again on every call
    K = K + alpha * np.eye(n)

    _, logdet = np.linalg.slogdet(K)
    const = n / 2 * np.log(2 * np.pi)

    # handle both uni- and multidimensional target values
    if y.ndim == 1:
        y = y[:, np.newaxis]

    # y_k^T K^{-1} y_k for each target column k, via a linear solve rather
    # than an explicit matrix inverse
    quad = np.sum(y * np.linalg.solve(K, y), axis=0)

    # sum over each dimension of y
    marginal_ll = np.sum(-0.5 * logdet - 0.5 * quad - const)
    return marginal_ll
def _Mstep(self, gamma, xi, phi): """ Run a single M-step update for the Baum-Welch/Forward-Backward algorithm. Parameters ---------- gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, T)` The estimated state-occupancy count matrix. xi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, N, T)` The estimated state-state transition count matrix. phi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N)` The estimated starting count matrix for each latent state. Returns ------- A : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)` The estimated transition matrix. B : :py:class:`ndarray <numpy.ndarray>` of shape `(N, V)` The estimated emission matrix. pi : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)` The estimated prior probabilities for each latent state. """ eps = self.eps # initialize the estimated transition (A) and emission (B) matrices A = np.zeros((self.N, self.N)) B = np.zeros((self.N, self.V)) pi = np.zeros(self.N) count_gamma = np.zeros((self.I, self.N, self.V)) count_xi = np.zeros((self.I, self.N, self.N)) for i in range(self.I): Obs = self.O[i, :] for si in range(self.N): for vk in range(self.V): # if not (Obs == vk).any(): if not int(Obs[0]) == vk: # count_gamma[i, si, vk] = -np.inf count_gamma[i, si, vk] = np.log(eps) else: count_gamma[i, si, vk] = logsumexp(gamma[i, si, Obs == vk]) for sj in range(self.N): count_xi[i, si, sj] = logsumexp(xi[i, si, sj, :]) pi = logsumexp(phi, axis=0) - np.log(self.I + eps) np.testing.assert_almost_equal(np.exp(pi).sum(), 1) for si in range(self.N): for vk in range(self.V): B[si, vk] = logsumexp(count_gamma[:, si, vk]) - logsumexp( count_gamma[:, si, :] ) for sj in range(self.N): A[si, sj] = logsumexp(count_xi[:, si, sj]) - logsumexp( count_xi[:, si, :] ) np.testing.assert_almost_equal(np.exp(A[si, :]).sum(), 1) np.testing.assert_almost_equal(np.exp(B[si, :]).sum(), 1) return np.exp(A), np.exp(B), np.exp(pi)
def _Estep(self): """ Run a single E-step update for the Baum-Welch/Forward-Backward algorithm. This step estimates ``xi`` and ``gamma``, the excepted state-state transition counts and the expected state-occupancy counts, respectively. ``xi[i,j,k]`` gives the probability of being in state `i` at time `k` and state `j` at time `k+1` given the observed sequence `O` and the current estimates for transition (`A`) and emission (`B`) matrices:: xi[i,j,k] = P(q_k=i,q_{k+1}=j|O,A,B,pi) = P(q_k=i,q_{k+1}=j,O|A,B,pi) / P(O|A,B,pi) = [ P(o_1,o_2,...,o_k,q_k=i|A,B,pi) * P(q_{k+1}=j|q_k=i) * P(o_{k+1}|q_{k+1}=j) * P(o_{k+2},o_{k+3},...,o_T|q_{k+1}=j,A,B,pi) ] / P(O|A,B,pi) = [ fwd[j, k] * self.A[j, i] * self.B[i, o_{k+1}] * bwd[i, k + 1] ] / fwd[:, T].sum() The expected number of transitions from state `i` to state `j` across the entire sequence is then the sum over all timesteps: ``xi[i,j,:].sum()``. ``gamma[i,j]`` gives the probability of being in state `i` at time `j` .. math:: \mathtt{gamma[i,j]} = P(q_j = i \mid O, A, B, \pi) Returns ------- gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, T)` The estimated state-occupancy count matrix. xi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, N, T)` The estimated state-state transition count matrix. phi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N)` The estimated prior counts for each latent state. 
""" eps = self.eps gamma = np.zeros((self.I, self.N, self.T)) xi = np.zeros((self.I, self.N, self.N, self.T)) phi = np.zeros((self.I, self.N)) for i in range(self.I): Obs = self.O[i, :] fwd = self._forward(Obs) bwd = self._backward(Obs) log_likelihood = logsumexp(fwd[:, self.T - 1]) t = self.T - 1 for si in range(self.N): gamma[i, si, t] = fwd[si, t] + bwd[si, t] - log_likelihood phi[i, si] = fwd[si, 0] + bwd[si, 0] - log_likelihood for t in range(self.T - 1): ot1 = Obs[t + 1] for si in range(self.N): gamma[i, si, t] = fwd[si, t] + bwd[si, t] - log_likelihood for sj in range(self.N): xi[i, si, sj, t] = ( fwd[si, t] + np.log(self.A[si, sj] + eps) + np.log(self.B[sj, ot1] + eps) + bwd[sj, t + 1] - log_likelihood ) return gamma, xi, phi
def decode(self, O):
    r"""
    Given the HMM parameterized by :math:`(A, B, \pi)` and an observation
    sequence :math:`O = o_1, \ldots, o_T`, compute the most probable
    sequence of latent states, :math:`Q = q_1, \ldots, q_T`.

    Notes
    -----
    HMM decoding is done efficiently via DP using the Viterbi algorithm,
    which produces a 2D trellis, ``viterbi``, where entry `i`, `j` represents
    the probability under the HMM of being in state `i` at time `j` after
    having passed through the *most probable* state sequence
    :math:`q_1,\ldots,q_{j-1}`:

    .. math::

        \mathtt{viterbi[i,j]} =
            \max_{q_1,\ldots,q_{j-1}}
                P(o_1,\ldots,o_j,q_1,\ldots,q_{j-1},q_j=i \mid A,B,\pi)

    Here :math:`q_j = i` indicates that the hidden state at time `j` is of
    type `i`, and :math:`\max_{q_1,\ldots,q_{j-1}}` represents the maximum
    over all possible latent state sequences for the first `j-1`
    observations.

    The DP step is:

    .. math::

        \mathtt{viterbi[i,j]}
            &= \max_{s'=1}^N \mathtt{viterbi[s',j-1]} \cdot
                \mathtt{A[s',i]} \cdot \mathtt{B[i,o_j]} \\
            &= \max_{s'=1}^N
                P(o_1,\ldots,o_{j-1},q_1,\ldots,q_{j-1}=s' \mid A,B,\pi)
                P(q_j=i \mid q_{j-1}=s') P(o_j \mid q_j=i)

    In words, ``viterbi[i,j]`` is the *maximum* over the weighted values
    computed on the previous timestep. The weight on each value is the
    product of the probability of transitioning from that state to state `i`
    and the probability of emitting observation `j` in state `i`.

    The implementation stores the trellis in log space: products become
    sums, and every parameter is smoothed with ``self.eps`` before taking
    its log.

    To compute the most probable state sequence we maintain a second
    trellis, ``back_pointer``, whose `i`, `j` entry contains the value of the
    latent state at timestep `j-1` that is most likely to lead to latent
    state `i` at timestep `j`.

    When we have completed the ``viterbi`` and ``back_pointer`` trellises for
    all `T` timesteps/observations, we greedily move backwards through the
    ``back_pointer`` trellis to construct the best path for the full
    sequence of observations.

    Parameters
    ----------
    O : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)` or `(1, T)`
        An observation sequence of length `T`.

    Returns
    -------
    best_path : list of length `T`
        The most probable sequence of latent states for observations `O`.
    best_path_log_prob : float
        The **log** probability of the latent state sequence in `best_path`
        under the HMM.

    Raises
    ------
    ValueError
        If `O` contains more than one sequence.
    """
    eps = self.eps

    if O.ndim == 1:
        O = O.reshape(1, -1)

    # number of observations in each sequence
    T = O.shape[1]

    # number of sequences
    I = O.shape[0]
    if I != 1:
        raise ValueError("Can only decode a single sequence (O.shape[0] must be 1)")

    # the smoothed log-parameters do not depend on the timestep
    log_pi = np.log(np.asarray(self.pi) + eps)
    log_A = np.log(np.asarray(self.A) + eps)
    log_B = np.log(np.asarray(self.B) + eps)

    # initialize the viterbi and back_pointer matrices
    viterbi = np.zeros((self.N, T))
    back_pointer = np.zeros((self.N, T), dtype=int)
    viterbi[:, 0] = log_pi + log_B[:, O[0, 0]]

    for t in range(1, T):
        # scores[s_prev, s]: log prob of the best path that is in `s_prev`
        # at t-1, moves to `s`, and emits o_t there
        scores = viterbi[:, t - 1, None] + log_A + log_B[None, :, O[0, t]]
        viterbi[:, t] = scores.max(axis=0)
        back_pointer[:, t] = scores.argmax(axis=0)

    best_path_log_prob = viterbi[:, T - 1].max()

    # backtrack through the trellis to get the most likely sequence of
    # latent states
    pointer = viterbi[:, T - 1].argmax()
    best_path = [pointer]
    for t in reversed(range(1, T)):
        pointer = back_pointer[pointer, t]
        best_path.append(pointer)
    best_path = best_path[::-1]
    return best_path, best_path_log_prob