import numpy as np
import scipy.linalg as spl
import scipy.sparse as spm
import scipy.sparse.linalg as spsl

# `softmax`, `n_classes` and `OneHotLabels` are assumed to be defined
# elsewhere in the module.


def _compute_loss(self, scores, targets):
    num_train = scores.shape[0]
    probabilities = softmax(scores)
    # average cross-entropy loss over the batch
    loss = -np.sum(
        np.log(probabilities[np.arange(num_train), targets])) / num_train
    # gradient of the loss w.r.t. the scores: (p - y) / N
    probabilities[np.arange(num_train), targets] -= 1
    dsoftmax = probabilities / num_train
    return loss, dsoftmax
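# --- sketch: finite-difference check of the (p - y) / N gradient -------------
# A minimal, self-contained sanity check of the analytic gradient returned by
# _compute_loss. The local _softmax is an assumption standing in for the
# module's own softmax; shapes and data below are made up for the example.
def _demo_gradient_check():
    import numpy as np

    def _softmax(s):
        e = np.exp(s - s.max(axis=1, keepdims=True))  # shift for stability
        return e / e.sum(axis=1, keepdims=True)

    def _loss(s, t):
        n = s.shape[0]
        return -np.sum(np.log(_softmax(s)[np.arange(n), t])) / n

    rng = np.random.default_rng(0)
    scores = rng.normal(size=(4, 3))
    targets = np.array([0, 2, 1, 1])

    # analytic gradient: (p - y) / N
    analytic = _softmax(scores)
    analytic[np.arange(4), targets] -= 1
    analytic /= 4

    # numerical gradient via central differences
    eps = 1e-6
    numeric = np.zeros_like(scores)
    for i in range(4):
        for j in range(3):
            up, down = scores.copy(), scores.copy()
            up[i, j] += eps
            down[i, j] -= eps
            numeric[i, j] = (_loss(up, targets)
                             - _loss(down, targets)) / (2 * eps)

    assert np.allclose(analytic, numeric, atol=1e-5)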
def predict(self, new_data, **kwargs):
    # make sure new_data is shaped like the training data
    if new_data.shape[1] != 28 * 28 + 1 and new_data.shape[1] != 28 * 28:
        new_data = new_data.reshape(new_data.shape[0], 28 * 28)
    if self.add_bias and new_data.shape[1] != 28 * 28 + 1:
        new_data = np.hstack((new_data, np.ones((new_data.shape[0], 1))))
    scores = np.dot(new_data, self.W.T)
    # softmax is monotonic, so applying it before the argmax does not
    # change the predicted class; it is kept here only for readability
    probs = softmax(scores)
    return np.argmax(probs, axis=1)
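# --- sketch: input shapes accepted by predict --------------------------------
# Illustrates the reshape / bias-augmentation path above on MNIST-sized
# input. The batch size and zero data are made up for the example.
def _demo_predict_shapes():
    import numpy as np
    raw = np.zeros((5, 28, 28))                 # e.g. a batch of 5 raw images
    flat = raw.reshape(raw.shape[0], 28 * 28)   # flattened, as predict expects
    with_bias = np.hstack((flat, np.ones((flat.shape[0], 1))))
    assert with_bias.shape == (5, 28 * 28 + 1)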
def sample(self, latent_mean, latent_cov):
    num_samples = latent_mean.shape[0]
    pi_star = np.zeros((num_samples, n_classes))
    print("\t\tPerforming Monte Carlo sampling from the posterior latent function")
    for k in range(num_samples):
        # draw sampling_steps latent vectors for test point k
        f_sampled = np.random.multivariate_normal(mean=latent_mean[k],
                                                  cov=latent_cov[k],
                                                  size=self.sampling_steps)
        # the class posterior is a softmax; accumulate it over the draws
        pi_star[k, :] = np.sum(softmax(f_sampled), axis=0)
    pi_star /= float(self.sampling_steps)
    return pi_star
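# --- sketch: Monte Carlo estimate of E[softmax(f)] ---------------------------
# What sample() computes per test point: average the softmax of latent draws
# from N(mean, cov). The mean, covariance and draw count below are made up
# for the example, and the row-wise softmax is a local stand-in.
def _demo_mc_softmax(n_draws=1000):
    import numpy as np
    mean = np.array([1.0, 0.0, -1.0])
    cov = 0.1 * np.eye(3)
    f = np.random.multivariate_normal(mean, cov, size=n_draws)  # (n_draws, 3)
    e = np.exp(f - f.max(axis=1, keepdims=True))
    p = e / e.sum(axis=1, keepdims=True)
    return p.mean(axis=0)  # estimated class posterior; entries sum to 1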
def compute_latent_mean_cov_multiclass(self, cov_matrix_train, cov_matrix_test,
                                       f_posterior):
    print("\t\tComputing latent mean and covariance")
    num_test_samples = cov_matrix_test['hetero'][0].shape[0]
    # for details see Algorithm 3.4, p. 51 of Rasmussen & Williams,
    # "Gaussian Processes for Machine Learning"
    pi = softmax(f_posterior)
    E = list()
    for cls in range(n_classes):
        # compute pi and use it as Pi too
        pi_sqrt_cls = spm.diags(np.sqrt(pi[cls]), format='csc')
        # L = cholesky(I + D_c^(1/2) * K * D_c^(1/2))
        L = spl.cholesky(
            (spm.identity(self.num_samples) + pi_sqrt_cls.dot(
                spm.csc_matrix(cov_matrix_train[cls]).dot(
                    pi_sqrt_cls))).toarray(),
            lower=True)
        # E_c = D_c^(1/2) * (L^T \ (L \ D_c^(1/2)))
        E.append((pi_sqrt_cls.dot(
            spsl.spsolve(spm.csc_matrix(L.T),
                         spsl.spsolve(spm.csc_matrix(L),
                                      pi_sqrt_cls)))).toarray())
    E = np.asarray(E)
    # M = cholesky(sum_c E_c)
    M = spl.cholesky(np.sum(E, axis=0), lower=True)
    latent_means = np.zeros((num_test_samples, n_classes))
    latent_covs = np.zeros((num_test_samples, n_classes, n_classes))
    for cls in range(n_classes):
        # mu_c = k_c(X_*, X) * (y_c - pi_c)
        latent_means[:, cls] = cov_matrix_test['hetero'][cls].dot(
            self.one_hot_targets[:, cls] - pi[cls])
        b = E[cls].dot(cov_matrix_test['hetero'][cls].T)
        c = E[cls].dot(spl.solve(M.T, spl.solve(M, b)))
        for cls_hat in range(n_classes):
            # row-wise dot products yield only the needed diagonal
            latent_covs[:, cls, cls_hat] = np.einsum(
                'ij,ij->i', cov_matrix_test['hetero'][cls_hat], c.T)
        latent_covs[:, cls, cls] += cov_matrix_test['auto'][cls] - \
            np.einsum('ij,ij->i', cov_matrix_test['hetero'][cls], b.T)
    return latent_means, latent_covs
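# --- sketch: the einsum trick used above -------------------------------------
# np.einsum('ij,ij->i', A, B) computes the diagonal of A @ B.T without forming
# the full product, which is what fills latent_covs one entry per test point.
# The matrices below are random and made up for the example.
def _demo_einsum_diag():
    import numpy as np
    A = np.random.rand(4, 6)
    B = np.random.rand(4, 6)
    assert np.allclose(np.einsum('ij,ij->i', A, B), np.diag(A @ B.T))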
def approximate_multiclass(self, cov_matrix, targets, latent_init):
    print("\t\tComputing the Laplace approximation with Newton iterations")
    self.one_hot_targets = OneHotLabels(n_classes).generate_labels(targets)
    self.iter_counter = 0
    self.num_samples = targets.shape[0]
    # initialise the latent function
    f = latent_init.copy()
    # for details see Algorithm 3.3, p. 50 of Rasmussen & Williams,
    # "Gaussian Processes for Machine Learning"
    while not self._is_converged(f):
        pi = softmax(f)
        E = list()
        z = list()
        for cls in range(n_classes):
            # compute pi and use it as Pi too
            pi_sqrt_cls = spm.diags(np.sqrt(pi[cls]), format='csc')
            # L = cholesky(I + D_c^(1/2) * K * D_c^(1/2))
            L = spl.cholesky(
                (spm.identity(self.num_samples) + pi_sqrt_cls.dot(
                    spm.csc_matrix(cov_matrix[cls]).dot(
                        pi_sqrt_cls))).toarray(),
                lower=True)
            # E_c = D_c^(1/2) * (L^T \ (L \ D_c^(1/2)))
            E.append((pi_sqrt_cls.dot(
                spsl.spsolve(spm.csc_matrix(L.T),
                             spsl.spsolve(spm.csc_matrix(L),
                                          pi_sqrt_cls)))).toarray())
            # z_c = sum_i log(L_ii)
            z.append(np.sum(np.log(np.diagonal(L))))
        E = np.asarray(E)
        # M = cholesky(sum_c E_c)
        M = spl.cholesky(np.sum(E, axis=0), lower=True)
        b = list()
        c = list()
        for cls in range(n_classes):
            # compute Pi * Pi^T * f; note that Pi * Pi^T is symmetric,
            # which leaves room for optimisation
            PiPiTf_cls = np.zeros(self.num_samples)
            for cls_prime in range(n_classes):
                PiPiTf_cls += pi[cls] * pi[cls_prime] * f[cls_prime]
            # b = (D - Pi * Pi^T) * f + y - pi
            b_cls = pi[cls] * f[cls] - PiPiTf_cls \
                + self.one_hot_targets[:, cls] - pi[cls]
            # c = E * K * b
            c_cls = E[cls].dot(cov_matrix[cls].dot(b_cls))
            b.append(b_cls)
            c.append(c_cls)
        c = np.asarray(c)
        b = np.asarray(b)
        # a = b - c + E * R * (M^T \ (M \ (R^T * c)))
        a = (b.ravel() - c.ravel()
             + np.vstack(E).dot(
                 spl.solve(M.T, spl.solve(M, np.sum(c, axis=0))))) \
            .reshape((n_classes, -1))
        # f = K * a
        for cls in range(n_classes):
            f[cls] = cov_matrix[cls].dot(a[cls])
    # approximate log marginal likelihood from Algorithm 3.3:
    # -1/2 a^T f + y^T f - sum_i log(sum_c exp f_i^c) - sum_c z_c
    approx_log_marg_likelihood = -0.5 * a.ravel().dot(f.ravel()) \
        + self.one_hot_targets.T.ravel().dot(f.ravel()) \
        - np.sum(np.log(np.sum(np.exp(f), axis=0))) - np.sum(z)
    return f, approx_log_marg_likelihood
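# --- sketch: a plausible _is_converged ---------------------------------------
# _is_converged is referenced above but not shown in this section. One common
# choice (an assumption, not necessarily what this class actually does) is to
# stop when the latent function stops moving or an iteration cap is reached.
# The tol and max_iter defaults here are made up for the sketch.
def _is_converged(self, f, tol=1e-6, max_iter=100):
    self.iter_counter += 1
    if self.iter_counter > max_iter:
        return True
    prev = getattr(self, '_f_prev', None)  # latent function from the last call
    self._f_prev = f.copy()
    return prev is not None and np.max(np.abs(f - prev)) < tol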