def _forward(self, O, scale=True):
    '''
    Calculates the forward variable, alpha: the probability of the partial
    observation sequence O1 O2 ... Ot (until time t) and state Si at time t.

    PARAMETERS
    ----------
    O {TxD}: observation matrix with a sequence of T observations, each having dimension D
    scale {Boolean}: default True

    RETURNS
    -------
    lnP {Float}: log probability of the observation sequence O
    lnAlpha {T,N}: log of the forward variable: the probability of the partial
        observation sequence O1 O2 ... Ot (until time t) and state Si at time t.
    lnC (T,): log of the scaling coefficients for each observation
    '''
    O = unsqueeze(O, 2)
    T, D = O.shape

    # check dimensions of provided observations agree with the trained emission distributions
    dim = self._B[0].mu.shape[1]
    if D != dim:
        raise ValueError('GHMM: observation dimension does not agree with the trained emission distributions for the model')

    # calculate lnP for each observation for each state's emission distribution
    # lnP_obs {T, N}
    lnP_obs = np.zeros([T, self.N])
    for i in range(self.N):
        lnP_obs[:, i] = self._B[i].calcLnP(O)

    # forward variable, alpha {T,N}
    lnAlpha = np.zeros([T, self.N])
    # initialize vector of scaling coefficients
    lnC = np.zeros(T)

    # Step 1: Initialization
    lnAlpha[0, :] = np.log(self._pi) + lnP_obs[0, :]
    if scale:
        lnC[0] = -logsumexp(lnAlpha[0, :])
        lnAlpha[0, :] += lnC[0]

    # Step 2: Induction
    for t in range(1, T):
        lnAlpha[t, :] = logsumexp(lnAlpha[[t - 1], :].T + np.log(self._A), axis=0) + lnP_obs[t, :]
        if scale:
            # scale using the alpha values at the current time step
            lnC[t] = -logsumexp(lnAlpha[t, :])
            lnAlpha[t, :] += lnC[t]

    # Step 3: Termination
    if scale:
        lnP = -np.sum(lnC)
    else:
        lnP = logsumexp(lnAlpha[T - 1, :])

    return lnP, lnAlpha, lnC
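# Standalone sketch (not part of the GHMM class above): the same scaled,
# log-space forward recursion on a toy 2-state HMM with discrete emissions,
# using scipy.special.logsumexp. All names and numbers here are illustrative
# assumptions, not taken from the model above.
import numpy as np
from scipy.special import logsumexp

ln_pi = np.log([0.6, 0.4])                  # initial state distribution
ln_A = np.log([[0.7, 0.3], [0.4, 0.6]])     # transition matrix A[i, j]
ln_B = np.log([[0.9, 0.1], [0.2, 0.8]])     # emission matrix B[state, symbol]
obs = [0, 1, 1]                             # observed symbol indices

T, N = len(obs), 2
lnAlpha = np.zeros((T, N))
lnC = np.zeros(T)

# Step 1: Initialization
lnAlpha[0] = ln_pi + ln_B[:, obs[0]]
lnC[0] = -logsumexp(lnAlpha[0])
lnAlpha[0] += lnC[0]

# Step 2: Induction, alpha_t(j) = [sum_i alpha_{t-1}(i) A(i,j)] * B_j(O_t)
for t in range(1, T):
    lnAlpha[t] = logsumexp(lnAlpha[t - 1][:, np.newaxis] + ln_A, axis=0) + ln_B[:, obs[t]]
    lnC[t] = -logsumexp(lnAlpha[t])         # scale with the current time step
    lnAlpha[t] += lnC[t]

# Step 3: Termination, ln P(O | model) recovered from the scaling coefficients
print(-np.sum(lnC))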
def logit_cost(self, theta, X, y):
    tt = X.shape[0]  # number of training examples
    theta = np.reshape(theta, (len(theta), 1))
    lgsum = utils.logsumexp(X.dot(theta))
    lgsumneg = utils.logsumexp(-X.dot(theta))
    J = (1. / tt) * (np.transpose(y).dot(lgsumneg) + np.transpose(1 - y).dot(lgsum))
    return J
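# Hedged, standalone sketch: if utils.logsumexp above acts element-wise as
# log(1 + exp(.)) (an assumption about that helper, not verified here), the
# cost is the usual averaged cross-entropy, which can also be written directly
# with np.logaddexp:
import numpy as np

def logit_cost_stable(theta, X, y):
    # y * log(1 + e^{-X theta}) + (1 - y) * log(1 + e^{X theta}), averaged
    z = X.dot(np.ravel(theta))      # linear scores, shape (n,)
    y = np.ravel(y)
    return np.mean(y * np.logaddexp(0.0, -z) + (1 - y) * np.logaddexp(0.0, z))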
def _expect(self, X, verbose=False):
    '''
    Expectation step of the expectation maximization algorithm.

    PARAMETERS
    ----------
    X {NxD}: training data

    RETURNS
    -------
    lnP (N,): ln[sum_M p(l)*p(Xi | l)]
        ln probabilities of each observation in the training data,
        marginalizing over mixture components to get ln[p(Xi)]
    posteriors {NxM}: p(l | Xi)
        Posterior probabilities of each mixture component for each observation.
    '''
    N, _ = X.shape
    lnP_Xi_l = np.zeros([N, self.M])

    # zero correction
    self._Sigma[self._Sigma == 0.0] += self._zeroCorr

    if hasattr(slinalg, 'solve_triangular'):
        # only in scipy since 0.9
        solve_triangular = slinalg.solve_triangular
    else:
        # slower, but works
        solve_triangular = slinalg.solve

    # for each mixture component
    for l in range(0, self.M):
        X_mu = X - self._mu[l, :]

        if self.covType == 'diag':
            sig_l = np.diag(self._Sigma[l, :, :])
            lnP_Xi_l[:, l] = -0.5 * (self.D * np.log(2.0 * np.pi)
                                     + np.sum((X_mu ** 2) / sig_l, axis=1)
                                     + np.sum(np.log(sig_l)))
        elif self.covType == 'full':
            try:
                # cholesky decomposition => U*U.T = _Sigma[l,:,:]
                U = slinalg.cholesky(self._Sigma[l, :, :], lower=True)
            except slinalg.LinAlgError:
                # reinitialization trick is from scikit learn GMM
                if verbose:
                    print("Sigma is not positive definite. Reinitializing ...")
                self._Sigma[l, :, :] = 1e-6 * np.eye(self.D)
                U = 1000.0 * self._Sigma[l, :, :]

            Q = solve_triangular(U, X_mu.T, lower=True)
            lnP_Xi_l[:, l] = -0.5 * (self.D * np.log(2.0 * np.pi)
                                     + 2.0 * np.sum(np.log(np.diag(U)))
                                     + np.sum(Q ** 2, axis=0))

    lnP_Xi_l += self._lnw

    # calculate sum of probabilities (marginalizing over mixtures)
    lnP = logsumexp(lnP_Xi_l, axis=1)
    posteriors = np.exp(lnP_Xi_l - lnP[:, np.newaxis])

    return lnP, posteriors
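# Standalone sketch of the same E-step computation for a full-covariance
# mixture, written against plain scipy/numpy (function and argument names are
# illustrative assumptions, not the class API above).
import numpy as np
from scipy import linalg as slinalg
from scipy.special import logsumexp

def gmm_posteriors(X, mu, Sigma, ln_w):
    """X: (N, D) data; mu: (M, D) means; Sigma: (M, D, D) covariances;
    ln_w: (M,) log mixture weights.  Returns (lnP, posteriors)."""
    N, D = X.shape
    M = mu.shape[0]
    ln_p = np.zeros((N, M))
    for l in range(M):
        L = slinalg.cholesky(Sigma[l], lower=True)                 # Sigma = L L^T
        Q = slinalg.solve_triangular(L, (X - mu[l]).T, lower=True)
        # ln N(x; mu, Sigma) = -0.5 * (D ln 2pi + ln|Sigma| + Mahalanobis^2)
        ln_p[:, l] = -0.5 * (D * np.log(2.0 * np.pi)
                             + 2.0 * np.sum(np.log(np.diag(L)))
                             + np.sum(Q ** 2, axis=0))
    ln_p += ln_w                                                   # add ln p(l)
    lnP = logsumexp(ln_p, axis=1)                                  # ln p(x_i)
    posteriors = np.exp(ln_p - lnP[:, np.newaxis])                 # p(l | x_i)
    return lnP, posteriors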
def logpartition(self):
    if self.nature == 'Bernoulli':
        self.logZ = np.logaddexp(0, self.weights).sum(-1)
    elif self.nature == 'Spin':
        self.logZ = np.logaddexp(self.weights, -self.weights).sum(-1)
    elif self.nature == 'Potts':
        self.logZ = utilities.logsumexp(self.weights, axis=-1).sum(-1)
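# Minimal standalone check of the three closed forms above, assuming weights
# has shape (M, N) for Bernoulli/Spin units and (M, N, n_c) for Potts units
# (the toy arrays below are illustrative).
import numpy as np
from scipy.special import logsumexp

w = np.random.randn(4, 6)              # toy Bernoulli/Spin weights, (M, N)
w_potts = np.random.randn(4, 6, 3)     # toy Potts weights, (M, N, n_c)

logZ_bernoulli = np.logaddexp(0, w).sum(-1)        # sum over v in {0, 1}
logZ_spin = np.logaddexp(w, -w).sum(-1)            # sum over v in {-1, +1}
logZ_potts = logsumexp(w_potts, axis=-1).sum(-1)   # sum over the n_c colours

# brute-force check for the first Bernoulli unit: sum_j log(e^0 + e^{w_j})
assert np.allclose(logZ_bernoulli[0], np.log(1.0 + np.exp(w[0])).sum())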
def partial_EM(self, data, cond_muh_ijk, indices, weights=None, eps=1e-4, maxiter=10, verbose=0):
    (i, j, k) = indices
    converged = False
    previous_L = utilities.average(self.likelihood(data), weights=weights) / self.N
    mini_epochs = 0
    if verbose:
        print('Partial EM %s, L = %.3f' % (mini_epochs, previous_L))

    while not converged:
        if self.nature in ['Bernoulli', 'Spin']:
            f = np.dot(data, self.weights[[i, j, k], :].T)
        elif self.nature == 'Potts':
            f = cy_utilities.compute_output_C(
                data, self.weights[[i, j, k], :, :],
                np.zeros([data.shape[0], 3], dtype=curr_float))

        tmp = f - self.logZ[np.newaxis, [i, j, k]]
        tmp -= tmp.max(-1)[:, np.newaxis]
        cond_muh = np.exp(tmp) * self.muh[np.newaxis, [i, j, k]]
        cond_muh /= cond_muh.sum(-1)[:, np.newaxis]
        cond_muh *= cond_muh_ijk[:, np.newaxis]

        self.muh[[i, j, k]] = utilities.average(cond_muh, weights=weights)
        self.cum_muh = np.cumsum(self.muh)
        self.gh[[i, j, k]] = np.log(self.muh[[i, j, k]])
        self.gh -= self.gh.mean()

        if self.nature == 'Bernoulli':
            self.cond_muv[[i, j, k]] = utilities.average_product(
                cond_muh, data, mean1=True, weights=weights) / self.muh[[i, j, k], np.newaxis]
            self.weights[[i, j, k]] = np.log(
                (self.cond_muv[[i, j, k]] + eps) / (1 - self.cond_muv[[i, j, k]] + eps))
            self.logZ[[i, j, k]] = np.logaddexp(0, self.weights[[i, j, k]]).sum(-1)
        elif self.nature == 'Spin':
            self.cond_muv[[i, j, k]] = utilities.average_product(
                cond_muh, data, mean1=True, weights=weights) / self.muh[[i, j, k], np.newaxis]
            self.weights[[i, j, k]] = 0.5 * np.log(
                (1 + self.cond_muv[[i, j, k]] + eps) / (1 - self.cond_muv[[i, j, k]] + eps))
            self.logZ[[i, j, k]] = np.logaddexp(
                self.weights[[i, j, k]], -self.weights[[i, j, k]]).sum(-1)
        elif self.nature == 'Potts':
            self.cond_muv[[i, j, k]] = utilities.average_product(
                cond_muh, data, c2=self.n_c, mean1=True, weights=weights) / self.muh[[i, j, k], np.newaxis, np.newaxis]
            self.cum_cond_muv[[i, j, k]] = np.cumsum(self.cond_muv[[i, j, k]], axis=-1)
            self.weights[[i, j, k]] = np.log(self.cond_muv[[i, j, k]] + eps)
            self.weights[[i, j, k]] -= self.weights[[i, j, k]].mean(-1)[:, :, np.newaxis]
            self.logZ[[i, j, k]] = utilities.logsumexp(self.weights[[i, j, k]], axis=-1).sum(-1)

        current_L = utilities.average(self.likelihood(data), weights=weights) / self.N
        mini_epochs += 1
        converged = (mini_epochs >= maxiter) | (np.abs(current_L - previous_L) < eps)
        previous_L = current_L.copy()
        if verbose:
            print('Partial EM %s, L = %.3f' % (mini_epochs, current_L))

    return current_L
def likelihood(self, data):
    if data.ndim == 1:
        data = data[np.newaxis, :]
    if self.nature in ['Bernoulli', 'Spin']:
        f = np.dot(data, self.weights.T)
    elif self.nature == 'Potts':
        f = cy_utilities.compute_output_C(
            data, self.weights, np.zeros([data.shape[0], self.M], dtype=curr_float))
    return utilities.logsumexp(
        f - self.logZ[np.newaxis, :] + np.log(self.muh)[np.newaxis, :], axis=1)
def pseudo_likelihood(self, x):
    if self.nature not in ['Bernoulli', 'Spin', 'Potts']:
        print('PL not supported for continuous data')
    else:
        fields = self.compute_fields_eff(x)
        if self.nature == 'Bernoulli':
            return (fields * x - np.logaddexp(fields, 0)).mean(1)
        elif self.nature == 'Spin':
            return (fields * x - np.logaddexp(fields, -fields)).mean(1)
        elif self.nature == 'Potts':
            return (cy_utilities.substitute_C(fields, x)
                    - utilities.logsumexp(fields, axis=2)).mean(1)
def likelihood_and_expectation(self, data):
    if self.nature in ['Bernoulli', 'Spin']:
        f = np.dot(data, self.weights.T)
    elif self.nature == 'Potts':
        f = cy_utilities.compute_output_C(
            data, self.weights, np.zeros([data.shape[0], self.M], dtype=curr_float))
    L = utilities.logsumexp(
        f - self.logZ[np.newaxis, :] + np.log(self.muh)[np.newaxis, :], axis=1)
    cond_muh = np.exp(f - self.logZ[np.newaxis, :]) * self.muh[np.newaxis, :]
    cond_muh /= cond_muh.sum(-1)[:, np.newaxis]
    return L, cond_muh
def __call__(self, a):
    if self._partition is None:
        aMax = np.max(a)
        return np.exp(a - aMax) / np.sum(np.exp(a - aMax))
    else:
        activation = np.zeros_like(a)
        pPart = 0
        for part in self._partition:
            lnAct = a[pPart:part] - logsumexp(a[pPart:part])
            # clamp values to avoid numerical overflow/underflow
            lnAct[lnAct <= -self.clamp] = -self.clamp
            lnAct[lnAct >= self.clamp] = self.clamp
            activation[pPart:part] = np.exp(lnAct)
            pPart = part

        # now calculate softmax over last partition boundary to the end
        lnAct = a[pPart:] - logsumexp(a[pPart:])
        # clamp values to avoid numerical overflow/underflow
        lnAct[lnAct <= -self.clamp] = -self.clamp
        lnAct[lnAct >= self.clamp] = self.clamp
        activation[pPart:] = np.exp(lnAct)

        return activation
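# Standalone sketch of the same idea as __call__ above: independent softmaxes
# over contiguous partitions of an activation vector, computed in log space
# with logsumexp (the clamping step is omitted; names are illustrative).
import numpy as np
from scipy.special import logsumexp

def partitioned_softmax(a, boundaries):
    """a: 1-D activations; boundaries: sorted partition end indices,
    e.g. boundaries=[3] splits a 5-vector into a[0:3] and a[3:5]."""
    out = np.empty_like(a, dtype=float)
    start = 0
    for end in list(boundaries) + [len(a)]:
        if end > start:                 # skip an empty trailing slice
            out[start:end] = np.exp(a[start:end] - logsumexp(a[start:end]))
        start = end
    return out

print(partitioned_softmax(np.array([1.0, 2.0, 3.0, 0.5, 0.5]), [3]))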
def merge_split(self, proposed_merge_split, eps=1e-6):
    i, j, k = proposed_merge_split
    old_mui = self.muh[i].copy()
    old_muj = self.muh[j].copy()
    old_muk = self.muh[k].copy()

    self.muh[i] = old_mui + old_muj
    self.muh[k] = old_muk / 2
    self.muh[j] = old_muk / 2
    self.gh = np.log(self.muh)
    self.gh -= self.gh.mean()
    self.cum_muh = np.cumsum(self.muh)

    old_cond_muvi = self.cond_muv[i].copy()
    old_cond_muvj = self.cond_muv[j].copy()
    old_cond_muvk = self.cond_muv[k].copy()

    self.cond_muv[i] = (old_cond_muvi * old_mui + old_cond_muvj * old_muj) / (old_mui + old_muj)

    if self.nature == 'Potts':
        noise = np.random.rand(self.N, self.n_c)
        noise /= noise.sum(-1)[:, np.newaxis]
    elif self.nature == 'Bernoulli':
        noise = np.random.rand(self.N)
        noise /= noise.sum(-1)
    elif self.nature == 'Spin':
        noise = (2 * np.random.rand(self.N) - 1)

    self.cond_muv[j] = 0.95 * old_cond_muvk + 0.05 * noise

    if self.nature == 'Bernoulli':
        self.weights[[i, j]] = np.log(
            (self.cond_muv[[i, j]] + eps) / (1 - self.cond_muv[[i, j]] + eps))
        self.logZ[[i, j]] = np.logaddexp(0, self.weights[[i, j]]).sum(-1)
    elif self.nature == 'Spin':
        self.weights[[i, j]] = 0.5 * np.log(
            (1 + self.cond_muv[[i, j]] + eps) / (1 - self.cond_muv[[i, j]] + eps))
        self.logZ[[i, j]] = np.logaddexp(
            self.weights[[i, j]], -self.weights[[i, j]]).sum(-1)
    elif self.nature == 'Potts':
        self.cum_cond_muv[[i, j]] = np.cumsum(self.cond_muv[[i, j]], axis=-1)
        self.weights[[i, j]] = np.log(self.cond_muv[[i, j]] + eps)
        self.weights[[i, j]] -= self.weights[[i, j]].mean(-1)[:, :, np.newaxis]
        self.logZ[[i, j]] = utilities.logsumexp(self.weights[[i, j]], axis=-1).sum(-1)
def _backward(self, O, lnC):
    '''
    Calculates the backward variable, beta: the probability of the partial
    observation sequence OT OT-1 ... Ot+1 (backwards to time t+1) given
    state Si at time t.

    PARAMETERS
    ----------
    O {TxD}: observation matrix with a sequence of T observations, each having dimension D
    lnC (T,): log of the scaling coefficients for each observation calculated from the forward pass

    RETURNS
    -------
    lnBeta {T,N}: log of the backward variable: the probability of the partial
        observation sequence OT OT-1 ... Ot+1 (backwards to time t+1) given
        state Si at time t
    '''
    O = unsqueeze(O, 2)
    T, D = O.shape

    # check dimensions of provided observations agree with the trained emission distributions
    dim = self._B[0].mu.shape[1]
    if D != dim:
        raise ValueError('GHMM: observation dimension does not agree with the trained emission distributions for the model')

    # calculate lnP for each observation for each state's emission distribution
    # lnP_obs {T, N}
    lnP_obs = np.zeros([T, self.N])
    for i in range(0, self.N):
        lnP_obs[:, i] = self._B[i].calcLnP(O)

    # backward variable, beta {T,N}
    # Step 1: Initialization
    # since ln(1) = 0
    lnBeta = np.zeros([T, self.N]) + lnC[T - 1]

    # Step 2: Induction
    for t in reversed(range(T - 1)):
        lnBeta[t, :] = logsumexp(np.log(self._A) + lnP_obs[t + 1, :] + lnBeta[t + 1, :], axis=1) + lnC[t]

    return lnBeta