def _estimate_log_prob_resp(self, X, group=None):
    """Estimate log probabilities and responsibilities for each sample.

    Compute the log probabilities, weighted log probabilities per
    component and responsibilities for each sample in X with respect to
    the current state of the model.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    group : optional
        Passed through to the weighted log probability estimate.

    Returns
    -------
    log_prob_norm : array, shape (n_samples,)
        log p(X)

    log_responsibilities : array, shape (n_samples, n_components)
        logarithm of the responsibilities
    """
    weighted_log_prob = self._estimate_weighted_log_prob(X, group=group)
    log_prob_norm = logsumexp(weighted_log_prob, axis=1)
    with np.errstate(under='ignore'):
        # ignore underflow
        log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis]
    return log_prob_norm, log_resp
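# A minimal standalone sketch (toy numbers, not the model's real
# weighted log probabilities) of the normalization performed above:
# subtracting the per-row logsumexp turns weighted log probabilities
# into log responsibilities whose exponentials sum to one.
import numpy as np
from scipy.special import logsumexp

weighted_log_prob = np.log(np.array([[0.2, 0.6], [0.9, 0.3]]))
log_prob_norm = logsumexp(weighted_log_prob, axis=1)
log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis]

# each row of the responsibilities sums to one
assert np.allclose(np.exp(log_resp).sum(axis=1), 1.0)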
def estimate_predict(self, X, y, X_test):
    _, n_features = X.shape
    self.n_features_ = n_features
    labelbin = LabelBinarizer()
    Y = labelbin.fit_transform(y)
    self.classes_ = labelbin.classes_
    if Y.shape[1] == 1:
        Y = np.concatenate((1 - Y, Y), axis=1)
    n_effective_classes = Y.shape[1]
    self.starting_values(n_effective_classes, n_features)
    self.count(X, Y)
    alpha = 0.01
    self.update_feature_log_distribution(alpha)
    self.update_class_log_distribution()

    # The maximum a posteriori (MAP) prediction
    jll = self.joint_log_likelihood(X_test)
    log_prob_x = logsumexp(jll, axis=1)
    predict_log_prob = jll - np.atleast_2d(log_prob_x).T
    predict_prob = np.exp(predict_log_prob)
    predict = self.classes_[np.argmax(jll, axis=1)]
    return predict, predict_prob
def test_multinomial_loss_ground_truth():
    # n_samples, n_features, n_classes = 4, 2, 3
    n_classes = 3
    X = np.array([[1.1, 2.2], [2.2, -4.4], [3.3, -2.2], [1.1, 1.1]])
    y = np.array([0, 1, 2, 0])
    lbin = LabelBinarizer()
    Y_bin = lbin.fit_transform(y)

    weights = np.array([[0.1, 0.2, 0.3], [1.1, 1.2, -1.3]])
    intercept = np.array([1., 0, -.2])
    sample_weights = np.array([0.8, 1, 1, 0.8])

    prediction = np.dot(X, weights) + intercept
    logsumexp_prediction = logsumexp(prediction, axis=1)
    p = prediction - logsumexp_prediction[:, np.newaxis]
    loss_1 = -(sample_weights[:, np.newaxis] * p * Y_bin).sum()
    diff = sample_weights[:, np.newaxis] * (np.exp(p) - Y_bin)
    grad_1 = np.dot(X.T, diff)

    weights_intercept = np.vstack((weights, intercept)).T.ravel()
    loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin,
                                               0.0, sample_weights)
    grad_2 = grad_2.reshape(n_classes, -1)
    grad_2 = grad_2[:, :-1].T

    assert_almost_equal(loss_1, loss_2)
    assert_array_almost_equal(grad_1, grad_2)

    # ground truth
    loss_gt = 11.680360354325961
    grad_gt = np.array([[-0.557487, -1.619151, +2.176638],
                        [-0.903942, +5.258745, -4.354803]])
    assert_almost_equal(loss_1, loss_gt)
    assert_array_almost_equal(grad_1, grad_gt)
def _e_step(self, X):
    """
    Parameters
    ----------
    X: array-like, (n_samples, n_features)
        The data.

    Output
    ------
    out: dict

    out['log_resp']: array-like, (n_samples, n_components)
        The responsibilities.

    out['obs_nll']: float
        The observed negative log-likelihood of the data at the
        current parameters.
    """
    log_prob = self.log_probs(X)
    log_resp = self.log_resps(log_prob)
    obs_nll = -logsumexp(log_prob, axis=1).mean()
    return {'log_resp': log_resp, 'obs_nll': obs_nll}
def _estimate_prob_resp(self, X):
    """Estimate log probabilities and hard responsibilities for each sample.

    Compute the log probabilities, weighted log probabilities per
    component and responsibilities for each sample in X with respect to
    the current state of the model. The responsibilities are hardened
    into a one-hot assignment of each sample to its most likely
    component.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    Returns
    -------
    log_prob_norm : array, shape (n_samples,)
        log p(X)

    resp : array, shape (n_samples, n_components)
        One-hot responsibilities: 1 for the most likely component of
        each sample, 0 elsewhere.
    """
    weighted_log_prob = self._estimate_weighted_log_prob(X)
    log_prob_norm = logsumexp(weighted_log_prob, axis=1)
    with np.errstate(under='ignore'):
        # ignore underflow
        log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis]

    # harden the soft responsibilities into a one-hot assignment
    resp = np.zeros_like(log_resp)
    n_samples, _ = log_resp.shape
    resp[np.arange(n_samples), np.argmax(log_resp, axis=1)] = 1
    return log_prob_norm, resp
def p_i(X, i):
    diff_embedded = X[i] - X
    dist_embedded = np.einsum('ij,ij->i', diff_embedded, diff_embedded)
    dist_embedded[i] = np.inf

    # compute exponentiated distances (use the log-sum-exp trick to
    # avoid numerical instabilities)
    exp_dist_embedded = np.exp(-dist_embedded - logsumexp(-dist_embedded))
    return exp_dist_embedded
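# A hedged illustration (toy distances only) of why p_i normalizes with
# logsumexp: for large distances the naive softmax underflows to 0/0,
# while exp(-d - logsumexp(-d)) stays well defined.
import numpy as np
from scipy.special import logsumexp

dist = np.array([1000.0, 1001.0, 1002.0])

naive = np.exp(-dist) / np.exp(-dist).sum()  # 0/0 -> nan + RuntimeWarning
stable = np.exp(-dist - logsumexp(-dist))    # well defined

print(naive)   # [nan nan nan]
print(stable)  # [0.665... 0.244... 0.090...]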
def _e_step(self, X):
    """
    Parameters
    ----------
    X: The observed data.

    Output
    ------
    E_out: dict

    E_out['log_resp']: array-like

    E_out['obs_nll']: float

    E_out['evals']: array-like, (n_blocks, )

    E_out['eig_var']: array-like
    """
    # standard E-step
    log_prob = self.log_probs(X)
    log_resp = self.log_resps(log_prob)
    obs_nll = -logsumexp(log_prob, axis=1).mean()

    if self.n_blocks is not None:
        B = self.n_blocks
    else:
        B = len(self.eval_weights)

    assert self.__mode in ['lap_pen', 'fine_tune_bd']
    if self.__mode == 'lap_pen' and self.n_blocks != 1:
        if self.lap == 'sym':
            evals, eig_var = geigh_Lsym_bp_smallest(X=self.bd_weights_,
                                                    rank=B,
                                                    zero_tol=1e-10,
                                                    method='tsym')
        elif self.lap == 'un':
            Lun = get_unnorm_laplacian_bp(self.bd_weights_)
            all_evals, all_evecs = eigh_wrapper(Lun)
            eig_var = all_evecs[:, -B:]
            evals = all_evals[-B:]
    else:
        # if self.__mode == 'fine_tune_bd':
        evals = None
        eig_var = None

    return {'log_resp': log_resp,
            'obs_nll': obs_nll,
            'evals': evals,
            'eig_var': eig_var}
def predict(self, test_set):
    predictions = []
    predict_prob = []
    for example in test_set:
        cleaned_example = self.tokenize(example)
        post_prob = self.joint_log_likelihood(cleaned_example)
        predictions.append(self.classes_[np.argmax(post_prob)])
        log_prob_x = logsumexp(post_prob)
        predict_log_prob = post_prob - np.atleast_2d(log_prob_x).T
        predict_prob.append(np.exp(predict_log_prob))
    return np.array(predictions), np.concatenate(predict_prob, axis=0)
def preProba(self, testData):
    # accumulate per-feature log probabilities from the 200 univariate
    # classifiers, one per 'var_i' column
    logSum = np.zeros(shape=(testData.shape[0], 2))
    for i in range(200):
        clf = self.clf[i]
        prob = clf.predict_proba(testData[['var_' + str(i)]])
        logSum += np.log(prob)

    # add the class log priors
    logSum += np.array([
        np.log(self.zero) - np.log(self.total),
        np.log(self.one) - np.log(self.total)
    ])

    # normalize so the two class probabilities sum to one
    log_prob_x = logsumexp(logSum, axis=1)
    return np.exp(logSum - np.atleast_2d(log_prob_x).T)
def _loss_grad_lbfgs(self, A, X, mask, sign=1.0):
    if self.n_iter_ == 0 and self.verbose:
        header_fields = ['Iteration', 'Objective Value', 'Time(s)']
        header_fmt = '{:>10} {:>20} {:>10}'
        header = header_fmt.format(*header_fields)
        cls_name = self.__class__.__name__
        print('[{cls}]'.format(cls=cls_name))
        print('[{cls}] {header}\n[{cls}] {sep}'.format(cls=cls_name,
                                                       header=header,
                                                       sep='-' * len(header)))

    start_time = time.time()
    A = A.reshape(-1, X.shape[1])
    X_embedded = np.dot(X, A.T)  # (n_samples, n_components)

    # Compute softmax distances
    p_ij = pairwise_distances(X_embedded, squared=True)
    np.fill_diagonal(p_ij, np.inf)
    p_ij = np.exp(-p_ij - logsumexp(-p_ij, axis=1)[:, np.newaxis])
    # (n_samples, n_samples)

    # Compute loss
    masked_p_ij = p_ij * mask
    p = masked_p_ij.sum(axis=1, keepdims=True)  # (n_samples, 1)
    loss = p.sum()

    # Compute gradient of loss w.r.t. `transform`
    weighted_p_ij = masked_p_ij - p_ij * p
    weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T
    np.fill_diagonal(weighted_p_ij_sym, -weighted_p_ij.sum(axis=0))
    gradient = 2 * (X_embedded.T.dot(weighted_p_ij_sym)).dot(X)

    if self.verbose:
        start_time = time.time() - start_time
        values_fmt = '[{cls}] {n_iter:>10} {loss:>20.6e} {start_time:>10.2f}'
        print(values_fmt.format(cls=self.__class__.__name__,
                                n_iter=self.n_iter_,
                                loss=loss,
                                start_time=start_time))
        sys.stdout.flush()

    self.n_iter_ += 1
    return sign * loss, sign * gradient.ravel()
def score_samples(self, X):
    """Compute the weighted log probabilities for each sample.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row
        corresponds to a single data point.

    Returns
    -------
    log_prob : array, shape (n_samples,)
        Log probabilities of each data point in X.
    """
    self._check_is_fitted()
    X = _check_X(X, None, self.means_.shape[1])
    return logsumexp(self._estimate_weighted_log_prob(X), axis=1)
def _estimate_log_resp(self, X, tau, use_prior=False):
    logP_mtrx = self.predict_logP_mtrx(X)
    if use_prior:
        log_weights = np.log(self.pi)
    else:
        log_weights = np.log(np.ones(self.n_components) / self.n_components)
    weighted_logP_mtrx = logP_mtrx + log_weights
    weighted_logP_mtrx = weighted_logP_mtrx * (1 / tau)
    log_prob_norm = logsumexp(weighted_logP_mtrx, axis=1)
    with np.errstate(under="ignore"):
        log_resp = weighted_logP_mtrx - log_prob_norm[:, np.newaxis]
    return log_resp
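# A small sketch (toy values, not the model's predict_logP_mtrx) of the
# tau tempering above: dividing the weighted log probabilities by tau
# before normalizing sharpens (tau < 1) or flattens (tau > 1) the
# responsibilities.
import numpy as np
from scipy.special import logsumexp

weighted_logP = np.array([[-1.0, -2.0, -3.0]])

for tau in (0.25, 1.0, 4.0):
    tempered = weighted_logP / tau
    log_resp = tempered - logsumexp(tempered, axis=1)[:, np.newaxis]
    print(tau, np.exp(log_resp).round(3))
# tau=0.25 -> nearly one-hot; tau=4.0 -> close to uniform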
def score_samples(self, X):
    """
    Computes the observed data log-likelihood for each sample.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row
        corresponds to a single data point.

    Returns
    -------
    log_prob : array, shape (n_samples,)
        Log probabilities of each data point in X.
    """
    check_is_fitted(self)
    # X = _check_X(X, None, self.metadata_['n_features'])
    return logsumexp(self.log_probs(X), axis=1)
def _loss(self, flatA, X, y):
    if self.n_iter_ == 0 and self.verbose:
        header_fields = ['Iteration', 'Objective Value', 'Time(s)']
        header_fmt = '{:>10} {:>20} {:>10}'
        header = header_fmt.format(*header_fields)
        cls_name = self.__class__.__name__
        print('[{cls}]'.format(cls=cls_name))
        print('[{cls}] {header}\n[{cls}] {sep}'.format(cls=cls_name,
                                                       header=header,
                                                       sep='-' * len(header)))

    start_time = time.time()
    A = flatA.reshape((-1, X.shape[1]))
    X_embedded = np.dot(X, A.T)
    dist = pairwise_distances(X_embedded, squared=True)
    np.fill_diagonal(dist, np.inf)
    softmax = np.exp(-dist - logsumexp(-dist, axis=1)[:, np.newaxis])
    yhat = softmax.dot(y)
    ydiff = yhat - y
    cost = (ydiff ** 2).sum()

    # also compute the gradient
    W = softmax * ydiff[:, np.newaxis] * (y - yhat[:, np.newaxis])
    W_sym = W + W.T
    np.fill_diagonal(W_sym, -W.sum(axis=0))
    grad = 4 * (X_embedded.T.dot(W_sym)).dot(X)

    if self.verbose:
        start_time = time.time() - start_time
        values_fmt = '[{cls}] {n_iter:>10} {loss:>20.6e} {start_time:>10.2f}'
        print(values_fmt.format(cls=self.__class__.__name__,
                                n_iter=self.n_iter_,
                                loss=cost,
                                start_time=start_time))
        sys.stdout.flush()

    self.n_iter_ += 1
    return cost, grad.ravel()
def _loss_grad_lbfgs(self, A, X, mask, sign=1.0):
    if self.n_iter_ == 0 and self.verbose:
        header_fields = ['Iteration', 'Objective Value', 'Time(s)']
        header_fmt = '{:>10} {:>20} {:>10}'
        header = header_fmt.format(*header_fields)
        cls_name = self.__class__.__name__
        print('[{cls}]'.format(cls=cls_name))
        print('[{cls}] {header}\n[{cls}] {sep}'.format(cls=cls_name,
                                                       header=header,
                                                       sep='-' * len(header)))

    start_time = time.time()
    A = A.reshape(-1, X.shape[1])
    X_embedded = np.dot(X, A.T)  # (n_samples, num_dims)

    # Compute softmax distances
    p_ij = pairwise_distances(X_embedded, squared=True)
    np.fill_diagonal(p_ij, np.inf)
    p_ij = np.exp(-p_ij - logsumexp(-p_ij, axis=1)[:, np.newaxis])
    # (n_samples, n_samples)

    # Compute loss
    masked_p_ij = p_ij * mask
    p = masked_p_ij.sum(axis=1, keepdims=True)  # (n_samples, 1)
    loss = p.sum()

    # Compute gradient of loss w.r.t. `transform`
    weighted_p_ij = masked_p_ij - p_ij * p
    weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T
    np.fill_diagonal(weighted_p_ij_sym, -weighted_p_ij.sum(axis=0))
    gradient = 2 * (X_embedded.T.dot(weighted_p_ij_sym)).dot(X)

    if self.verbose:
        start_time = time.time() - start_time
        values_fmt = '[{cls}] {n_iter:>10} {loss:>20.6e} {start_time:>10.2f}'
        print(values_fmt.format(cls=self.__class__.__name__,
                                n_iter=self.n_iter_,
                                loss=loss,
                                start_time=start_time))
        sys.stdout.flush()

    self.n_iter_ += 1
    return sign * loss, sign * gradient.ravel()
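# A hedged, self-contained sketch of the calling convention
# _loss_grad_lbfgs follows: a callable returning (loss, gradient) that
# scipy's L-BFGS driver consumes via jac=True. The quadratic objective
# here is a stand-in for illustration, not the NCA loss above.
import numpy as np
from scipy.optimize import minimize

def loss_grad(x, sign=1.0):
    loss = np.sum((x - 3.0) ** 2)
    grad = 2.0 * (x - 3.0)
    return sign * loss, sign * grad

res = minimize(loss_grad, x0=np.zeros(4), method='L-BFGS-B', jac=True)
assert np.allclose(res.x, 3.0)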
def predict_log_proba(self, X):
    """
    Return log-probability estimates for the test vector X.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]

    Returns
    -------
    C : array-like, shape = [n_samples, n_classes]
        Returns the log-probability of the samples for each class in
        the model. The columns correspond to the classes in sorted
        order, as they appear in the attribute `classes_`.
    """
    jll = self._joint_log_likelihood(X)
    # normalize by P(x) = P(f_1, ..., f_n)
    log_prob_x = logsumexp(jll, axis=1)
    return jll - np.atleast_2d(log_prob_x).T
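# A toy sketch of the normalization above, with made-up jll values:
# subtracting log P(x) = logsumexp(jll, axis=1) turns joint
# log-likelihoods into per-class log probabilities, and it cannot
# change the argmax (MAP) prediction.
import numpy as np
from scipy.special import logsumexp

jll = np.array([[-3.2, -1.1, -4.0],
                [-0.5, -2.7, -1.9]])
log_prob_x = logsumexp(jll, axis=1)
log_proba = jll - np.atleast_2d(log_prob_x).T

assert np.allclose(np.exp(log_proba).sum(axis=1), 1.0)
assert np.array_equal(np.argmax(jll, axis=1), np.argmax(log_proba, axis=1))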
def _m_step_clust_params(self, X, log_resp):
    """
    M step. Each view's cluster parameters can be updated independently.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    log_resp : array-like, shape (n_samples, n_components)
        Logarithm of the posterior probabilities (or responsibilities)
        of each sample in X.
    """
    # For each view-cluster pair, record which columns of log_resp to
    # logsumexp over: joint component k contributes to view v's
    # cluster self._get_view_clust_idx(k)[v].
    vc_axes2sum = [[[] for c in range(self.view_models_[v].n_components)]
                   for v in range(self.n_views)]
    for k in range(self.n_components):
        view_idxs = self._get_view_clust_idx(k)
        for v in range(self.n_views):
            vc_axes2sum[v][view_idxs[v]].append(k)
        # idx_0, idx_1 = self._get_view_clust_idx(k)
        # vc_axes2sum[0][idx_0].append(k)
        # vc_axes2sum[1][idx_1].append(k)

    view_params = [None for v in range(self.n_views)]
    for v in range(self.n_views):
        view_log_resp = []
        # for each view-component, logsumexp the responsibilities
        for c in range(self.view_models_[v].n_components):
            axes2sum = vc_axes2sum[v][c]
            view_log_resp.append(logsumexp(log_resp[:, axes2sum], axis=1))
        view_log_resp = np.array(view_log_resp).T
        view_params[v] = self.view_models_[v]._m_step_clust_params(
            X=X[v], log_resp=view_log_resp)

    return view_params
def _estimate_responsibilities(x, weights, means, precisions_cholesky,
                               covariance_type):
    """Estimate log-likelihood and responsibilities for the given data
    portion.

    Compute the sum of log-likelihoods, the count of samples, and the
    responsibilities for each sample in the data portion with respect
    to the current state of the model.

    Parameters
    ----------
    x : collection of depth 2
        Blocks of a horizontal portion of the data.

    weights : array-like, shape (n_components,)
        The weights of the current components.

    means : array-like, shape (n_components, n_features)
        The centers of the current components.

    precisions_cholesky : array-like
        The Cholesky decomposition of the sample precisions of the
        current components. The shape depends on the covariance_type.

    covariance_type : {'full', 'tied', 'diag', 'spherical'}
        The type of precision matrices.

    Returns
    -------
    log_prob_norm_x : tuple
        tuple(sum, count) for log p(x)

    responsibilities : array-like, shape (x.shape[0], n_components)
    """
    x = Array._merge_blocks(x)
    weighted_log_prob = _estimate_weighted_log_prob(x, weights, means,
                                                    precisions_cholesky,
                                                    covariance_type)
    log_prob_norm = logsumexp(weighted_log_prob, axis=1)
    log_prob_norm_sum = np.sum(log_prob_norm)
    count = len(log_prob_norm)
    with np.errstate(under='ignore'):
        # ignore underflow
        resp = np.exp(weighted_log_prob - log_prob_norm[:, np.newaxis])
    return (log_prob_norm_sum, count), resp
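# A hedged sketch of how the (sum, count) tuples returned above can be
# reduced across horizontal data blocks: the per-sample mean
# log-likelihood of the full data is the ratio of the pooled sums.
# `partials` stands in for the tuples gathered from each block.
partials = [(-12.4, 10), (-30.1, 25), (-7.7, 5)]

total_log_prob = sum(s for s, _ in partials)
total_count = sum(c for _, c in partials)
mean_log_prob = total_log_prob / total_count  # e.g. for convergence checks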
def _estimate_log_prob_gamma(self, X):
    """Estimate log probabilities and Gamma weights for each sample.

    Compute the log probability and the prior Gamma weights of the
    samples in X with respect to the current state of the model.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    Returns
    -------
    log_prob_norm : float
        Mean of the logarithms of the probabilities of each sample in X

    gamma_priors : array, shape (n_samples,)
        Gamma weights of each sample in X.
    """
    log_prob = self._estimate_log_prob(X)
    log_prob_norm = logsumexp(log_prob)
    gamma_priors = self._estimate_gamma_priors(X)
    return log_prob_norm, gamma_priors
def _approx_bound(self, X, doc_topic_distr, sub_sampling):
    """Estimate the variational bound.

    Estimate the variational bound over "all documents" using only the
    documents passed in as X. Since the log-likelihood of each word
    cannot be computed directly, we use this bound to estimate it.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Document word matrix.

    doc_topic_distr : array, shape=(n_samples, n_components)
        Document topic distribution. In the literature, this is called
        gamma.

    sub_sampling : boolean, optional, (default=False)
        Compensate for subsampling of documents. It is used to
        calculate the bound in online learning.

    Returns
    -------
    score : float
    """

    def _loglikelihood(prior, distr, dirichlet_distr, size):
        # calculate log-likelihood
        score = np.sum((prior - distr) * dirichlet_distr)
        score += np.sum(gammaln(distr) - gammaln(prior))
        score += np.sum(gammaln(prior * size) - gammaln(np.sum(distr, 1)))
        return score

    is_sparse_x = sp.issparse(X)
    n_samples, n_components = doc_topic_distr.shape
    n_features = self.components_.shape[1]
    score = 0

    dirichlet_doc_topic = _dirichlet_expectation_2d(doc_topic_distr)
    dirichlet_component_ = _dirichlet_expectation_2d(self.components_)
    doc_topic_prior = self.doc_topic_prior_
    topic_word_prior = self.topic_word_prior_

    if is_sparse_x:
        X_data = X.data
        X_indices = X.indices
        X_indptr = X.indptr

    # E[log p(docs | theta, beta)]
    for idx_d in range(n_samples):
        if is_sparse_x:
            ids = X_indices[X_indptr[idx_d]:X_indptr[idx_d + 1]]
            cnts = X_data[X_indptr[idx_d]:X_indptr[idx_d + 1]]
        else:
            ids = np.nonzero(X[idx_d, :])[0]
            cnts = X[idx_d, ids]
        temp = (dirichlet_doc_topic[idx_d, :, np.newaxis]
                + dirichlet_component_[:, ids])
        norm_phi = logsumexp(temp, axis=0)
        score += np.dot(cnts, norm_phi)

    # compute E[log p(theta | alpha) - log q(theta | gamma)]
    score += _loglikelihood(doc_topic_prior, doc_topic_distr,
                            dirichlet_doc_topic, self._n_components)

    # Compensate for the subsampling of the population of documents
    if sub_sampling:
        doc_ratio = float(self.total_samples) / n_samples
        score *= doc_ratio

    # E[log p(beta | eta) - log q (beta | lambda)]
    score += _loglikelihood(topic_word_prior, self.components_,
                            dirichlet_component_, n_features)

    return score
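# A toy check (random values, assumed shapes) of the per-document term
# above: for each word id, logsumexp over topics of
# E[log theta_dk] + E[log beta_kw] bounds the word's log-likelihood,
# which is then weighted by the word's count.
import numpy as np
from scipy.special import logsumexp

rng = np.random.RandomState(0)
dirichlet_doc_topic_d = rng.randn(4)    # E[log theta_d] over 4 topics
dirichlet_component_ = rng.randn(4, 7)  # E[log beta] over a 7-word vocabulary
ids, cnts = np.array([1, 3, 6]), np.array([2.0, 1.0, 5.0])

temp = dirichlet_doc_topic_d[:, np.newaxis] + dirichlet_component_[:, ids]
norm_phi = logsumexp(temp, axis=0)  # one value per distinct word
score = np.dot(cnts, norm_phi)      # document's contribution to the bound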
def predict_log_proba(self, X):
    jll = self._joint_log_likelihood(X)
    # normalize by P(x) = P(f_1, ..., f_n)
    log_prob_x = logsumexp(jll, axis=1)
    return jll - np.atleast_2d(log_prob_x).T