def objective(K, y, alpha, lamda, beta, w):
    """Objective function for lasso kernel learning."""
    obj = .5 * sum(squared_norm(
        alpha[j].dot(K[j].T.dot(w)) - y[j]) for j in range(len(K)))
    obj += lamda * np.abs(w).sum()
    obj += beta * sum(squared_norm(a) for a in alpha)
    return obj
def _f(x, _Z_0, Z_0, loss_res, nabla_con, nabla_pen, loss_func, S, C):
    # Note: the `_Z_0` argument is a callable; the name is immediately
    # rebound to the first element of its result.
    _Z_0, A = _Z_0(x[0], x[1], Z_0, loss_res, nabla_con, nabla_pen)
    loss_res = loss_gen(loss_func, S, _Z_0) - C
    # loss_res_A = loss_gen(loss_func, S, A) - C
    # return squared_norm(loss_res) + squared_norm(loss_res - loss_res_A)
    return squared_norm(loss_res) + squared_norm(_Z_0 - A) / (
        S.shape[1] * S.shape[2])
def objective_admm(K, y, alpha, lamda, beta, w, w1, w2):
    """Objective function for lasso kernel learning."""
    obj = .5 * sum(squared_norm(
        np.dot(alpha[j], K[j].T.dot(w)) - y[j]) for j in range(len(K)))
    obj += lamda * np.abs(w1).sum()
    obj += beta * squared_norm(w2)
    return obj
def test_norm_squared_norm():
    X = np.random.RandomState(42).randn(50, 63)
    X *= 100        # check stability
    X += 200

    assert_almost_equal(np.linalg.norm(X.ravel()), norm(X))
    assert_almost_equal(norm(X) ** 2, squared_norm(X), decimal=6)
    assert_almost_equal(np.linalg.norm(X), np.sqrt(squared_norm(X)),
                        decimal=6)
def _fit_projected_gradient(X, W, H, tol, max_iter, nls_max_iter, alpha,
                            l1_ratio):
    gradW = (np.dot(W, np.dot(H, H.T)) -
             safe_sparse_dot(X, H.T, dense_output=True))
    gradH = (np.dot(np.dot(W.T, W), H) -
             safe_sparse_dot(W.T, X, dense_output=True))

    init_grad = squared_norm(gradW) + squared_norm(gradH.T)
    # max(0.001, tol) to force alternating minimizations of W and H
    tolW = max(0.001, tol) * np.sqrt(init_grad)
    tolH = tolW

    for n_iter in range(1, max_iter + 1):
        # stopping condition as discussed in paper
        proj_grad_W = squared_norm(gradW * np.logical_or(gradW < 0, W > 0))
        proj_grad_H = squared_norm(gradH * np.logical_or(gradH < 0, H > 0))

        if (proj_grad_W + proj_grad_H) / init_grad < tol ** 2:
            break

        # update W
        Wt, gradWt, iterW = _nls_subproblem(X.T, H.T, W.T, tolW, nls_max_iter,
                                            alpha=alpha, l1_ratio=l1_ratio)
        W, gradW = Wt.T, gradWt.T

        if iterW == 1:
            tolW = 0.1 * tolW

        # update H
        H, gradH, iterH = _nls_subproblem(X, W, H, tolH, nls_max_iter,
                                          alpha=alpha, l1_ratio=l1_ratio)
        if iterH == 1:
            tolH = 0.1 * tolH

    H[H == 0] = 0  # fix up negative zeros

    if n_iter == max_iter:
        Wt, _, _ = _nls_subproblem(X.T, H.T, W.T, tolW, nls_max_iter,
                                   alpha=alpha, l1_ratio=l1_ratio)
        W = Wt.T

    return W, H, n_iter
def _fit_projected_gradient(X, W, H, tol, max_iter, nls_max_iter, alpha,
                            l1_ratio, sparseness, beta, eta):
    """Compute Non-negative Matrix Factorization (NMF) with Projected Gradient.

    References
    ----------
    C.-J. Lin. Projected gradient methods for non-negative matrix
    factorization. Neural Computation, 19(2007), 2756-2779.
    http://www.csie.ntu.edu.tw/~cjlin/nmf/

    P. Hoyer. Non-negative Matrix Factorization with Sparseness Constraints.
    Journal of Machine Learning Research 2004.
    """
    gradW = (np.dot(W, np.dot(H, H.T)) -
             safe_sparse_dot(X, H.T, dense_output=True))
    gradH = (np.dot(np.dot(W.T, W), H) -
             safe_sparse_dot(W.T, X, dense_output=True))

    init_grad = squared_norm(gradW) + squared_norm(gradH.T)
    # max(0.001, tol) to force alternating minimizations of W and H
    tolW = max(0.001, tol) * np.sqrt(init_grad)
    tolH = tolW

    for n_iter in range(1, max_iter + 1):
        # stopping condition as discussed in paper
        proj_grad_W = squared_norm(gradW * np.logical_or(gradW < 0, W > 0))
        proj_grad_H = squared_norm(gradH * np.logical_or(gradH < 0, H > 0))

        if (proj_grad_W + proj_grad_H) / init_grad < tol ** 2:
            break

        # update W
        W, gradW, iterW = _update_projected_gradient_w(
            X, W, H, tolW, nls_max_iter, alpha, l1_ratio, sparseness,
            "L2", beta, eta)
        if iterW == 1:
            tolW = 0.1 * tolW

        # update H
        H, gradH, iterH = _update_projected_gradient_h(
            X, W, H, tolH, nls_max_iter, alpha, l1_ratio, sparseness,
            "L1", beta, eta)
        if iterH == 1:
            tolH = 0.1 * tolH

    H[H == 0] = 0  # fix up negative zeros

    if n_iter == max_iter:
        W, _, _ = _update_projected_gradient_w(
            X, W, H, tol, nls_max_iter, alpha, l1_ratio, sparseness,
            "L2", beta, eta)

    return W, H, n_iter
def test_norm_squared_norm():
    X = np.random.RandomState(42).randn(50, 63)
    X *= 100        # check stability
    X += 200

    assert_almost_equal(np.linalg.norm(X.ravel()), norm(X))
    assert_almost_equal(norm(X) ** 2, squared_norm(X), decimal=6)
    assert_almost_equal(np.linalg.norm(X), np.sqrt(squared_norm(X)),
                        decimal=6)

    # Check the warning with an int array and np.dot potential overflow
    assert_warns_message(
        UserWarning, 'Array type is integer, np.dot may '
        'overflow. Data should be float type to avoid this issue',
        squared_norm, X.astype(int))
def _objective_func(self, w):
    bias, wf = self._split_coefficents(w)

    l_plus, xv_plus, l_minus, xv_minus = self._counter.calculate(wf)  # pylint: disable=unused-variable
    xw = self._xw

    val = 0.5 * squared_norm(wf)
    if self._has_time:
        val += 0.5 * self._regr_penalty * squared_norm(
            self.y_compressed - bias - xw.compress(self.regr_mask, axis=0))

    val += 0.5 * self._rank_penalty * numexpr.evaluate(
        'sum(xw * ((l_plus + l_minus) * xw - xv_plus - xv_minus - 2 * (l_minus - l_plus)) + l_minus)')

    return val
def _beta_divergence(X, W, H, square_root=False):
    """Compute the beta-divergence of X and dot(W, H).

    Parameters
    ----------
    X : float or array-like, shape (n_samples, n_features)

    W : float or dense array-like, shape (n_samples, n_components)

    H : float or dense array-like, shape (n_components, n_features)

    square_root : boolean, default False
        If True, return np.sqrt(2 * res)

    Returns
    -------
    res : float
        Beta divergence of X and np.dot(W, H)
    """
    if not sp.issparse(X):
        X = np.atleast_2d(X)
    W = np.atleast_2d(W)
    H = np.atleast_2d(H)

    # Avoid the creation of the dense np.dot(W, H) if X is sparse.
    if sp.issparse(X):
        norm_X = np.dot(X.data, X.data)
        norm_WH = trace_dot(np.dot(np.dot(W.T, W), H), H)
        cross_prod = trace_dot((X * H.T), W)
        res = (norm_X + norm_WH - 2. * cross_prod) / 2
    else:
        res = squared_norm(X - np.dot(W, H)) / 2

    if square_root:
        return np.sqrt(res * 2)
    else:
        return res
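# The sparse branch above avoids forming the dense product np.dot(W, H) by
# expanding ||X - WH||_F^2 = ||X||_F^2 + tr(H^T (W^T W) H) - 2 tr(W^T X H^T).
# A numpy-only check of that identity (a sketch with made-up shapes):
import numpy as np

def _check_frobenius_expansion():
    rng = np.random.RandomState(0)
    X = rng.rand(5, 4)
    W, H = rng.rand(5, 3), rng.rand(3, 4)
    lhs = np.linalg.norm(X - np.dot(W, H)) ** 2
    rhs = (np.dot(X.ravel(), X.ravel())
           + np.trace(np.linalg.multi_dot((H, H.T, W.T, W)))
           - 2 * np.trace(np.linalg.multi_dot((W.T, X, H.T))))
    assert np.isclose(lhs, rhs)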
def _beta_divergence_dense(X, W, H, beta):
    """Compute the beta-divergence of X and W.H for dense array only.

    Used as a reference for testing nmf._beta_divergence.
    """
    WH = np.dot(W, H)

    if beta == 2:
        return squared_norm(X - WH) / 2

    WH_Xnonzero = WH[X != 0]
    X_nonzero = X[X != 0]
    np.maximum(WH_Xnonzero, 1e-9, out=WH_Xnonzero)

    if beta == 1:
        res = np.sum(X_nonzero * np.log(X_nonzero / WH_Xnonzero))
        res += WH.sum() - X.sum()
    elif beta == 0:
        div = X_nonzero / WH_Xnonzero
        res = np.sum(div) - X.size - np.sum(np.log(div))
    else:
        res = (X_nonzero ** beta).sum()
        res += (beta - 1) * (WH ** beta).sum()
        res -= beta * (X_nonzero * (WH_Xnonzero ** (beta - 1))).sum()
        res /= beta * (beta - 1)

    return res
def loss(self, w):
    """Compute negative partial log-likelihood.

    Parameters
    ----------
    w : array, shape = [n_features]
        Estimate of coefficients

    Returns
    -------
    loss : float
        Average negative partial log-likelihood
    """
    xw = numpy.dot(self.x, w)

    at_risk = numpy.empty(self.x.shape[0])
    for i in range(self.x.shape[0]):
        idx = self.time >= self.time[i]
        at_risk[i] = logsumexp(xw[idx])

    loss = numpy.mean(self.event * (xw - at_risk))
    if self.alpha > 0:
        loss -= 0.5 * self.alpha * squared_norm(w)
    return -loss
def _update_center(self, X):
    """Fix labels and weights, update centers."""
    centers_old = self.cluster_centers_.copy()

    if self.cluster_method == 'k-means':
        cluster_center = kmeans_center
    elif self.cluster_method == 'k-median':
        cluster_center = kmedian_center
    else:
        raise ValueError("cluster_method must be 'k-means' or 'k-median'")

    # Choose the data belonging to cluster k and
    # update the cluster center with its mean (or median).
    for k in range(self.n_clusters):
        mask = self.labels_ == k
        self.cluster_centers_[k] = cluster_center(X[mask])

    # Check for empty clusters.
    if np.isnan(self.cluster_centers_).any():
        raise ValueError('Each cluster must have at least one member')

    center_shift_total = squared_norm(self.cluster_centers_ - centers_old)
    return center_shift_total
def choose_alpha(alpha, x, S, n_samples, beta, lamda, gamma, theta=.99,
                 max_iter=1000):
    """Choose alpha for backtracking.

    References
    ----------
    Salzo S. (2017). https://doi.org/10.1137/16M1073741
    """
    eps = .5
    partial_J = partial(_J, x, beta=beta, lamda=lamda, gamma=gamma, S=S,
                        n_samples=n_samples)
    partial_f = partial(_f, n_samples=n_samples, S=S)
    gradient_ = _gradient(x, S, n_samples)
    for i in range(max_iter):
        iter_diff = partial_J(alpha=alpha) - x
        obj_diff = partial_f(K=partial_J(alpha=alpha)) - partial_f(K=x)
        if obj_diff - _scalar_product_3d(iter_diff, gradient_) <= theta / (
                gamma * alpha) * squared_norm(iter_diff) + 1e-16:
            return alpha
        alpha *= eps
    return alpha
def _beta_divergence_dense(X, W, H, beta):
    """Compute the beta-divergence of X and W.H for dense array only.

    Used as a reference for testing nmf._beta_divergence.
    """
    if isinstance(X, numbers.Number):
        W = np.array([[W]])
        H = np.array([[H]])
        X = np.array([[X]])

    WH = np.dot(W, H)

    if beta == 2:
        return squared_norm(X - WH) / 2

    WH_Xnonzero = WH[X != 0]
    X_nonzero = X[X != 0]
    np.maximum(WH_Xnonzero, 1e-9, out=WH_Xnonzero)

    if beta == 1:
        res = np.sum(X_nonzero * np.log(X_nonzero / WH_Xnonzero))
        res += WH.sum() - X.sum()
    elif beta == 0:
        div = X_nonzero / WH_Xnonzero
        res = np.sum(div) - X.size - np.sum(np.log(div))
    else:
        res = (X_nonzero ** beta).sum()
        res += (beta - 1) * (WH ** beta).sum()
        res -= beta * (X_nonzero * (WH_Xnonzero ** (beta - 1))).sum()
        res /= beta * (beta - 1)

    return res
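# A minimal sanity check for the reference implementation above (a sketch:
# for beta == 2 the divergence reduces to half the squared Frobenius norm,
# so it can be compared against np.linalg.norm directly):
import numpy as np

def _check_beta_divergence_frobenius():
    rng = np.random.RandomState(0)
    X = rng.rand(4, 5)
    W, H = rng.rand(4, 2), rng.rand(2, 5)
    d = _beta_divergence_dense(X, W, H, beta=2)
    assert np.isclose(d, 0.5 * np.linalg.norm(X - np.dot(W, H)) ** 2)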
def _solve_weight_vector(similarities, grouping_matrix, delta):
    """Solve for the weight vector of the similarities, used for
    _solve_omega and _solve_pi.

    Parameters
    ----------
    similarities : np.ndarray, shape (n_similarities, n_features * (n_features - 1) / 2)
        similarity matrices (condensed form)

    grouping_matrix : np.ndarray, shape (n_features, n_communities)

    delta : float

    Returns
    -------
    weights : np.ndarray, shape (1, n_similarities)
    """
    # some type checks
    if np.any(similarities < 0):
        raise ValueError('similarities contain invalid values (< 0)')
    if delta <= 0:
        raise ValueError('delta value of {0} not allowed, '
                         'needs to be > 0'.format(delta))

    sigma = np.dot(grouping_matrix, grouping_matrix.T)
    n_similarities = len(similarities)

    # preallocate vector
    a = np.zeros(n_similarities)
    for i in range(n_similarities):
        a[i] = squared_norm(squareform(similarities[i]) - sigma)

    # solve for the weights
    weight = simplex_projection(a / (2 * delta))
    return np.atleast_2d(weight)
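# `_solve_weight_vector` relies on a `simplex_projection` helper that is not
# shown here. A sketch of what such a helper usually computes, assuming it is
# the standard Euclidean projection onto the probability simplex
# (Duchi et al., ICML 2008); the name `simplex_projection_sketch` is ours:
import numpy as np

def simplex_projection_sketch(v):
    """Project the 1-D vector v onto {w : w >= 0, sum(w) = 1}."""
    u = np.sort(v)[::-1]                # sort in decreasing order
    css = np.cumsum(u)
    ind = np.arange(1, len(v) + 1)
    cond = u - (css - 1) / ind > 0      # components that stay positive
    rho = ind[cond][-1]
    theta = (css[cond][-1] - 1) / rho   # shift that enforces sum(w) = 1
    return np.maximum(v - theta, 0)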
def _fit(self, X):
    """Fit the LatentTimeMatrixDecomposition model to X.

    Parameters
    ----------
    X : ndarray, shape (n_time, n_samples, n_features), or
            (n_samples, n_features, n_time)
        Matrix to decompose.
    """
    self.precision_, self.latent_, self.n_iter_ = \
        latent_time_matrix_decomposition(
            X, alpha=self.alpha, tau=self.tau, rho=self.rho, beta=self.beta,
            eta=self.eta, mode=self.mode, tol=self.tol, rtol=self.rtol,
            psi=self.psi, phi=self.phi, max_iter=self.max_iter,
            verbose=self.verbose, return_n_iter=True, return_history=False,
            update_rho_options=self.update_rho_options,
            compute_objective=self.compute_objective)
    self.reconstruction_err_ = squared_norm(X - self.get_observed_precision())
    return self
def kmeans(X, k):
    # `random_state`, `assign_labels` and `maximization` are expected to
    # come from the enclosing module.
    samples = X.shape[0]
    best_inertia = None
    best_labels = None

    x_squared_norms = row_norms(X, squared=True)
    seeds = random_state.permutation(samples)[:k]
    centers = X[seeds]
    centers = centers.toarray()
    distances = numpy.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # Iterations
    for i in range(100):
        centers_old = centers.copy()
        labels, inertia = assign_labels(X, x_squared_norms, centers,
                                        distances, samples)
        centers = maximization(X, labels, distances, centers, samples)
        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_inertia = inertia
        center_shift = squared_norm(centers_old - centers)

    if center_shift > 0:
        best_labels, best_inertia = assign_labels(X, x_squared_norms, centers,
                                                  distances, samples)
    return best_labels
def nlog_likelihood(self, w):
    """Compute negative partial log-likelihood.

    Parameters
    ----------
    w : array, shape = (n_features,)
        Estimate of coefficients

    Returns
    -------
    loss : float
        Average negative partial log-likelihood
    """
    time = self.time
    n_samples = self.x.shape[0]
    xw = numpy.dot(self.x, w)

    loss = 0
    risk_set = 0
    k = 0
    for i in range(n_samples):
        ti = time[i]
        while k < n_samples and ti == time[k]:
            risk_set += numpy.exp(xw[k])
            k += 1

        if self.event[i]:
            loss -= (xw[i] - numpy.log(risk_set)) / n_samples

    # add regularization term to log-likelihood
    return loss + self.alpha * squared_norm(w) / (2. * n_samples)
def objectiveFLGL(emp_cov, K, R, T, H, U, mu, eta, rho):
    res = -fast_logdet(R) + np.sum(R * emp_cov)
    res += rho / 2. * squared_norm(R - T + U + np.linalg.multi_dot(
        (K.T, linalg.pinvh(H), K)))
    res += mu * l1_od_norm(H)
    res += eta * l1_od_norm(T)
    return res
def objective(S, R, Z_0, Z_1, Z_2, W_0, W_1, W_2, alpha, tau, beta, eta, psi,
              phi):
    """Objective for latent variable time-varying matrix decomposition."""
    obj = squared_norm(S - R)
    obj += alpha * sum(map(l1_od_norm, Z_0))
    obj += tau * sum(map(partial(np.linalg.norm, ord="nuc"), W_0))
    obj += beta * sum(map(psi, Z_2 - Z_1))
    obj += eta * sum(map(phi, W_2 - W_1))
    return obj
def _objective_func(self, w):
    self._update_constraints(w)

    l_plus, xv_plus, l_minus, xv_minus = self._counter.calculate(w)
    x = self._counter.x
    xs = numpy.dot(x, w)

    val = 0.5 * squared_norm(w)
    if self._has_time:
        val += 0.5 * self._regr_penalty * squared_norm(
            self.y_compressed - xs.compress(self.regr_mask, axis=0))

    val += 0.5 * self._rank_penalty * numexpr.evaluate(
        'sum(xs * ((l_plus + l_minus) * xs - xv_plus - xv_minus - 2 * (l_minus - l_plus)) + l_minus)')

    return val
def test_loss_gradients_hessp_intercept(
    base_loss, sample_weight, l2_reg_strength, X_sparse
):
    """Test that loss and gradient handle intercept correctly."""
    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=False)
    loss_inter = LinearModelLoss(base_loss=base_loss(), fit_intercept=True)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features,
        seed=42
    )

    X[:, -1] = 1  # make last column of 1 to mimic intercept term
    X_inter = X[
        :, :-1
    ]  # exclude intercept column as it is added automatically by loss_inter

    if X_sparse:
        X = sparse.csr_matrix(X)

    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])

    l, g = loss.loss_gradient(
        coef, X, y, sample_weight=sample_weight,
        l2_reg_strength=l2_reg_strength
    )
    _, hessp = loss.gradient_hessian_product(
        coef, X, y, sample_weight=sample_weight,
        l2_reg_strength=l2_reg_strength
    )
    l_inter, g_inter = loss_inter.loss_gradient(
        coef, X_inter, y, sample_weight=sample_weight,
        l2_reg_strength=l2_reg_strength
    )
    _, hessp_inter = loss_inter.gradient_hessian_product(
        coef, X_inter, y, sample_weight=sample_weight,
        l2_reg_strength=l2_reg_strength
    )

    # Note, that intercept gets no L2 penalty.
    assert l == pytest.approx(
        l_inter + 0.5 * l2_reg_strength * squared_norm(coef.T[-1])
    )

    g_inter_corrected = g_inter
    g_inter_corrected.T[-1] += l2_reg_strength * coef.T[-1]
    assert_allclose(g, g_inter_corrected)

    s = np.random.RandomState(42).randn(*coef.shape)
    h = hessp(s)
    h_inter = hessp_inter(s)
    h_inter_corrected = h_inter
    h_inter_corrected.T[-1] += l2_reg_strength * s.T[-1]
    assert_allclose(h, h_inter_corrected)
def norm(x):
    """Dot product-based Euclidean norm implementation.

    See: http://fseoane.net/blog/2011/computing-the-vector-norm/

    Parameters
    ----------
    x : array-like
        Vector for which to compute the norm
    """
    return sqrt(squared_norm(x))
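# Nearly every snippet in this file calls a `squared_norm` helper. A minimal
# sketch of it, assuming the semantics of sklearn.utils.extmath.squared_norm
# (squared Euclidean norm of a vector, squared Frobenius norm of a matrix):
import numpy as np

def squared_norm_sketch(x):
    """Squared Euclidean (or Frobenius) norm of x via a single dot product."""
    x = np.ravel(x, order='K')  # flatten without copying when possible
    return np.dot(x, x)

# e.g. squared_norm_sketch(np.arange(3.0)) == 5.0, and for a 2-D array X,
# np.sqrt(squared_norm_sketch(X)) matches np.linalg.norm(X) (Frobenius).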
def _kmeans_spark(X, n_clusters, max_iter=300, worker_nums=10,
                  init='k-means++', random_state=None, tol=1e-4):
    from pyspark import SparkContext, SparkConf

    conf = SparkConf().setAppName('K-Means_Spark').setMaster(
        'local[%d]' % worker_nums)
    sc = SparkContext(conf=conf)
    data = sc.parallelize(X)
    data.cache()

    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    x_squared_norms = row_norms(X, squared=True)
    # x_squared_norms = data.map(lambda x: (x*x).sum(axis=0)).collect()
    # x_squared_norms = np.array(x_squared_norms, dtype='float64')

    centers = _init_centroids(X, n_clusters, init, random_state,
                              x_squared_norms=x_squared_norms)

    bs = X.shape[0] // worker_nums  # integer batch size per worker
    data_temp = []
    for i in range(worker_nums - 1):
        data_temp.append(X[i * bs:(i + 1) * bs])
    data_temp.append(X[(worker_nums - 1) * bs:])
    data_temp = np.array(data_temp, dtype='float64')
    data_temp = sc.parallelize(data_temp)
    data_temp.cache()

    for i in range(max_iter):
        centers_old = centers.copy()

        all_distances = data_temp.map(
            lambda x: euclidean_distances(centers, x, squared=True)).collect()
        temp_all_distances = all_distances[0]
        # use a separate index so the outer iteration counter is not clobbered
        for j in range(1, worker_nums):
            temp_all_distances = np.hstack((temp_all_distances,
                                            all_distances[j]))
        all_distances = temp_all_distances

        # all_distances = data.map(lambda x: euclidean_distances(centers, x, squared=True)).collect()
        # # reshape, from (1, n_samples, k) to (k, n_samples)
        # all_distances = np.asarray(all_distances, dtype="float64").T[0]

        # Assignment, also called E-step of EM
        labels, inertia = _labels_inertia(X, x_squared_norms, centers,
                                          all_distances=all_distances)
        # re-computation of the centroids, also called M-step of EM
        centers = _centers(X, labels, n_clusters)

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            break

    return best_centers, best_labels, best_inertia
def fit(self, X):
    self.n_samples = X.shape[0]
    self.n_features = X.shape[1]
    if self.balanced:
        self.cluster_size = int(self.n_samples / self.n_clusters)

    # Place k centroids randomly.
    centers = self.init_centers(X)

    best_labels, best_inertia, best_centers = None, None, None
    for i in range(self.max_iterations):
        centers_old = centers.copy()

        # Get labels and inertia.
        if not self.balanced:
            labels, inertia = self.get_labels_and_inertia(X, centers)
        else:
            labels, inertia = self.get_labels_and_inertia_extended(X, centers)

        # Move the centers to the mean of the points assigned to them.
        centers = self.move_to_mean(X, labels)
        print("Iteration {:2d}, inertia {:.3f}".format(i, inertia))

        # Update the labels and centers if the inertia is the minimum.
        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        # Check whether the centers moved.
        center_shift_total = squared_norm(centers_old - centers)
        print("center shift {:f}".format(center_shift_total))
        if center_shift_total == 0:
            print("Converged at iteration {:d}: center shift {:f}".format(
                i, center_shift_total))
            break

    # For the case where it stops due to reaching max iterations.
    if center_shift_total > 0:
        if not self.balanced:
            best_labels, best_inertia = self.get_labels_and_inertia(
                X, best_centers)
        else:
            best_labels, best_inertia = self.get_labels_and_inertia_extended(
                X, best_centers)

    # Convert arrays to lists for grading purposes.
    list_best_centers = []
    for centroid in best_centers:
        list_best_centers.append(list(centroid))

    return list(best_labels), list_best_centers
def _compute_RMSETOTAL(D, A, Ft, P, O, S, R):
    fD = 0
    numElementsD = 0
    fA = 0
    numElementsA = 0
    baselineSum = sum([norm(mat) for mat in D]) + sum([norm(mat) for mat in A])
    for i, Di in enumerate(D):
        PiSiFt = P[i] @ S[i] @ Ft
        fD += squared_norm(Di - PiSiFt) / squared_norm(Di)
        # numElementsD += Di.size
    for i, Ai in enumerate(A):
        OiRiFt = O[i] @ R[i] @ Ft
        fA += squared_norm(Ai - OiRiFt) / squared_norm(Ai)
        # numElementsA += Ai.size
    answer = fD + fA  # /(numElementsD+numElementsA)
    return math.sqrt(answer), math.sqrt(fA)  # /numElementsA)
def _compute_RMSETOTAL(D, A, F, P, O):
    fD = 0
    numElementsD = 0
    fA = 0
    numElementsA = 0
    for i, Di in enumerate(D):
        PiF = dot(P[i], F)
        fD += squared_norm(Di - PiF)
        # numElementsD += Di.size
        numElementsD += squared_norm(Di)
    for i, Ai in enumerate(A):
        OiF = dot(O[i], F)
        fA += squared_norm(Ai - OiF)
        # numElementsA += Ai.size
        numElementsA += squared_norm(Ai)
    answer = (fD + fA) / (numElementsD + numElementsA)
    return answer, fA / numElementsA
def _kmeans_single_lloyd(X, n_clusters, max_iter=300, init='k-means||',
                         verbose=False, x_squared_norms=None,
                         random_state=None, tol=1e-4,
                         precompute_distances=True, oversampling_factor=2,
                         init_max_iter=None):
    centers = k_init(X, n_clusters, init=init,
                     oversampling_factor=oversampling_factor,
                     random_state=random_state, max_iter=init_max_iter)
    dt = X.dtype
    X = X.astype(np.float32)
    P = X.shape[1]
    for i in range(max_iter):
        t0 = tic()
        centers = centers.astype('f4')
        labels, distances = pairwise_distances_argmin_min(
            X, centers, metric='euclidean', metric_kwargs={"squared": True})
        labels = labels.astype(np.int32)
        distances = distances.astype(np.float32)

        r = da.atop(_centers_dense, 'ij',
                    X, 'ij',
                    labels, 'i',
                    n_clusters, None,
                    distances, 'i',
                    adjust_chunks={"i": n_clusters, "j": P},
                    dtype='f8')
        new_centers = da.from_delayed(sum(r.to_delayed().flatten()),
                                      (n_clusters, P), X.dtype)
        counts = da.bincount(labels, minlength=n_clusters)
        new_centers = new_centers / counts[:, None]
        new_centers, = compute(new_centers)

        # Convergence check
        shift = squared_norm(centers - new_centers)
        t1 = tic()
        logger.info("Lloyd loop %2d. Shift: %0.4f [%.2f s]", i, shift,
                    t1 - t0)
        if shift < tol:
            break
        centers = new_centers

    if shift > 1e-7:
        labels, distances = pairwise_distances_argmin_min(X, centers)
    inertia = distances.astype(dt).sum()
    centers = centers.astype(dt)
    labels = labels.astype(np.int64)
    return labels, inertia, centers, i + 1
def _multinomial_loss_and_gradient(w, X, Y, alpha, sample_weight, xStd,
                                   standardization):
    # print("coefficients = " + str(w))
    _, n_features = X.shape
    _, n_classes = Y.shape
    n_samples = np.sum(sample_weight)
    sample_weight = sample_weight[:, np.newaxis]
    fit_intercept = (w.size == n_classes * (n_features + 1))
    grad = np.zeros((n_classes, n_features + bool(fit_intercept)))

    # Calculate loss value
    w = w.reshape(n_classes, -1)
    if fit_intercept:
        intercept = w[:, -1]
        w = w[:, :-1]
    else:
        intercept = 0
    p = safe_sparse_dot(X, w.T) + intercept
    p -= logsumexp(p, axis=1)[:, np.newaxis]

    if standardization:
        l2reg = 0.5 * alpha * squared_norm(w)
        l2reg_grad = alpha * w
    else:
        _w = w / xStd
        l2reg = 0.5 * alpha * squared_norm(_w)
        l2reg_grad = alpha * _w / xStd

    loss = -(sample_weight * Y * p).sum() / n_samples + l2reg
    # print("loss = " + str(loss))

    diff = sample_weight * (np.exp(p) - Y)
    grad[:, :n_features] = safe_sparse_dot(diff.T, X) / n_samples
    grad[:, :n_features] += l2reg_grad
    # print("grad = " + str(grad))
    if fit_intercept:
        grad[:, -1] = diff.sum(axis=0) / n_samples
    return loss, grad.ravel()
def logistic_loss(w, X, Y, alpha):
    """
    Implementation of the logistic loss function when Y is a probability
    distribution.

    loss = -SUM_i SUM_k y_ik * log(P[yi == k]) + alpha * ||w||^2
    """
    n_classes = Y.shape[1]
    n_features = X.shape[1]
    intercept = 0

    if n_classes > 2:
        fit_intercept = w.size == (n_classes * (n_features + 1))
        w = w.reshape(n_classes, -1)
        if fit_intercept:
            intercept = w[:, -1]
            w = w[:, :-1]
    else:
        fit_intercept = w.size == (n_features + 1)
        if fit_intercept:
            intercept = w[-1]
            w = w[:-1]

    z = safe_sparse_dot(X, w.T) + intercept

    if n_classes == 2:
        # in the binary case, simply compute the logistic function
        p = np.vstack([log_logistic(-z), log_logistic(z)]).T
    else:
        # compute the logistic function for each class and normalize
        denom = expit(z)
        denom = denom.sum(axis=1).reshape((denom.shape[0], -1))
        p = log_logistic(z)
        loss = -(Y * p).sum()
        loss += np.log(denom).sum()  # Y.sum() = 1
        loss += 0.5 * alpha * squared_norm(w)
        return loss

    loss = -(Y * p).sum() + 0.5 * alpha * squared_norm(w)
    return loss
def _multinomial_loss(w, X, Y, alpha, sample_weight):
    """Computes multinomial loss and class probabilities.

    Parameters
    ----------
    w : ndarray, shape (n_classes * n_features,) or
        (n_classes * (n_features + 1),)
        Coefficient vector.

    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    Y : ndarray, shape (n_samples, n_classes)
        Transformed labels according to the output of LabelBinarizer.

    alpha : float
        Regularization parameter. alpha is equal to 1 / C.

    sample_weight : ndarray, shape (n_samples,), optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    loss : float
        Multinomial loss.

    p : ndarray, shape (n_samples, n_classes)
        Estimated class probabilities.

    w : ndarray, shape (n_classes, n_features)
        Reshaped param vector excluding intercept terms.
    """
    n_classes = Y.shape[1]
    n_features = X.shape[1]
    fit_intercept = w.size == (n_classes * (n_features + 1))
    w = w.reshape(n_classes, -1)
    sample_weight = sample_weight[:, np.newaxis]
    if fit_intercept:
        intercept = w[:, -1]
        w = w[:, :-1]
    else:
        intercept = 0
    p = safe_sparse_dot(X, w.T)
    p += intercept
    p -= logsumexp(p, axis=1)[:, np.newaxis]
    loss = -(sample_weight * Y * p).sum()
    loss += 0.5 * alpha * squared_norm(w)
    p = np.exp(p, p)
    return loss, p, w
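# A quick usage sketch for `_multinomial_loss` above, on made-up data. It
# assumes numpy, scipy.special.logsumexp and a dense-compatible
# `safe_sparse_dot` are in scope, as in the snippet itself:
import numpy as np

def _demo_multinomial_loss():
    rng = np.random.RandomState(0)
    n_samples, n_features, n_classes = 6, 3, 4
    X = rng.randn(n_samples, n_features)
    Y = np.eye(n_classes)[rng.randint(n_classes, size=n_samples)]  # one-hot
    w = rng.randn(n_classes * (n_features + 1))  # includes intercepts
    loss, p, w_mat = _multinomial_loss(w, X, Y, alpha=1.0,
                                       sample_weight=np.ones(n_samples))
    assert np.allclose(p.sum(axis=1), 1.0)   # rows of p are probabilities
    assert w_mat.shape == (n_classes, n_features)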
def _update_center(self, X):
    """Fix weights and labels, update centers."""
    centers_old = self.cluster_centers_.copy()

    # Choose the data belonging to cluster k and
    # update the cluster center with its mean.
    for k in range(self.n_clusters):
        mask = self.labels_ == k
        self.cluster_centers_[k] = np.mean(X[mask], axis=0)

    center_shift_total = squared_norm(self.cluster_centers_ - centers_old)
    return center_shift_total
def _objective_func(self, beta_bias):
    bias, beta = self._split_coefficents(beta_bias)

    Kw = self._Kw

    val = 0.5 * numpy.dot(beta, Kw)
    if self._has_time:
        val += 0.5 * self._regr_penalty * squared_norm(
            self.y_compressed - bias - Kw.compress(self.regr_mask, axis=0))

    l_plus, xv_plus, l_minus, xv_minus = self._counter.calculate(beta)
    val += 0.5 * self._rank_penalty * numexpr.evaluate(
        'sum(Kw * ((l_plus + l_minus) * Kw - xv_plus - xv_minus - 2 * (l_minus - l_plus)) + l_minus)')

    return val
def kmeansopt(data, k, rng, T=50, method='kmeans', tol=1e-4):
    centroids = []
    lable = []
    if method == 'kmeans++':
        centroids = optimize_centroids(data, centroids, k, rng)
    else:
        centroids = ramdon_centroids(data, centroids, k, rng)
    # print("initial centroids")
    # print(centroids)
    old_centroids = []
    # result_dict = {}
    Iteration = 0
    clusters = [[] for i in range(k)]

    # while Iteration < T and not compare(old_centroids, centroids):
    while Iteration < T:
        clusters = [[] for i in range(k)]
        clusters, lable = euclidean(data, centroids, clusters)
        # print("The %d times cluster" % Iteration)
        # print(clusters)

        # recalculate centroids from the existing clusters
        index = 0
        old_centroids = list(centroids)
        # print(Iteration)
        for cluster in clusters:
            centroids[index] = np.mean(cluster, axis=0).tolist()
            index += 1
        # for num in range(0, len(clusters)):
        #     for ld in clusters[num]:
        #         result_dict[str(ld)] = num
        # print(centroids)

        centroids_matrix = np.matrix(centroids)
        old_centroids_matrix = np.matrix(old_centroids)
        shift = squared_norm(old_centroids_matrix - centroids_matrix)
        if shift <= tol:
            # print("Converged, break")
            break
        Iteration += 1
    # end of inner loop
    return clusters, centroids, lable
def temp_log_loss(w, X, Y, alpha):
    n_classes = Y.shape[1]
    w = w.reshape(n_classes, -1)
    intercept = w[:, -1]
    w = w[:, :-1]
    z = safe_sparse_dot(X, w.T) + intercept

    denom = expit(z)
    denom = denom.sum(axis=1).reshape((denom.shape[0], -1))
    p = log_logistic(z)

    loss = -(Y * p).sum()
    loss += np.log(denom).sum()
    loss += 0.5 * alpha * squared_norm(w)
    return loss
def _multinomial_loss(w, X, Y, alpha):
    sample_weight = np.ones(len(Y))
    n_classes = Y.shape[1]
    n_features = X.shape[1]
    fit_intercept = w.size == (n_classes * (n_features + 1))
    w = w.reshape(n_classes, -1)
    sample_weight = sample_weight[:, np.newaxis]
    if fit_intercept:
        intercept = w[:, -1]
        w = w[:, :-1]
    else:
        intercept = 0
    p = safe_sparse_dot(X, w.T)
    p += intercept
    p -= logsumexp(p, axis=1)[:, np.newaxis]
    loss = -(sample_weight * Y * p).sum()
    loss += 0.5 * alpha * squared_norm(w)
    p = np.exp(p, p)
    return loss, p, w
def _kmeans_single(X, n_clusters, max_iter=300, init='k-means++',
                   random_state=None, tol=1e-4):
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    x_squared_norms = row_norms(X, squared=True)
    centers = _init_centroids(X, n_clusters, init, random_state,
                              x_squared_norms=x_squared_norms)
    # distances = np.zeros(shape=(X.shape[0],), dtype=np.float64)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # Assignment, also called E-step of EM
        labels, inertia = _labels_inertia(X, x_squared_norms, centers)
        # re-computation of the centroids, also called M-step of EM
        centers = _centers(X, labels, n_clusters)

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            break

    if shift > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers)

    return best_centers, best_labels, best_inertia
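# The stopping rule above (`squared_norm(centers_old - centers) <= tol`) is
# the standard Lloyd convergence test. A self-contained numpy-only sketch of
# the same E-step/M-step pattern, with no empty-cluster handling for brevity:
import numpy as np

def lloyd_sketch(X, n_clusters, max_iter=300, tol=1e-4, seed=0):
    rng = np.random.RandomState(seed)
    centers = X[rng.choice(len(X), n_clusters, replace=False)]
    labels = np.zeros(len(X), dtype=int)
    for _ in range(max_iter):
        centers_old = centers.copy()
        # E-step: assign each point to its nearest center
        d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(-1)
        labels = d2.argmin(axis=1)
        # M-step: move each center to the mean of its assigned points
        centers = np.array([X[labels == k].mean(axis=0)
                            for k in range(n_clusters)])
        # stop when the total squared center shift falls below tol
        if ((centers_old - centers) ** 2).sum() <= tol:
            break
    return labels, centers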
def enet_kernel_learning(
        K, y, lamda=0.01, beta=0.01, gamma='auto', max_iter=100, verbose=0,
        tol=1e-4, return_n_iter=True):
    """Elastic Net kernel learning.

    Solve the following problem via alternating minimisation:
        min sum_{i=1}^p 1/2 ||alpha_i * w * K_i - y_i||^2
        + lamda ||w||_1 + beta ||w||_2^2
    """
    n_patients = len(K)
    n_kernels = len(K[0])
    coef = np.ones(n_kernels)
    alpha = [np.zeros(K[j].shape[2]) for j in range(n_patients)]

    # KKT = [K[j].T.dot(K[j]) for j in range(len(K))]
    # print(KKT[0].shape)
    if gamma == 'auto':
        lipschitz_constant = np.array([
            sum(np.linalg.norm(K_j[i].dot(K_j[i].T))
                for i in range(K_j.shape[0]))
            for K_j in K])
        gamma = 1. / lipschitz_constant

    objective_new = 0
    for iteration_ in range(max_iter):
        w_old = coef.copy()
        alpha_old = [a.copy() for a in alpha]
        objective_old = objective_new

        # update w
        A = [K[j].dot(alpha[j]) for j in range(n_patients)]
        alpha_coef_K = [alpha[j].dot(K[j].T.dot(coef))
                        for j in range(n_patients)]
        gradient = sum((alpha_coef_K[j] - y[j]).dot(A[j].T)
                       for j in range(n_patients))
        # gradient_2 = coef.dot(sum(
        #     np.dot(K[j].dot(alpha[j]), K[j].dot(alpha[j]).T)
        #     for j in range(len(K)))) - sum(
        #         y[j].dot(K[j].dot(alpha[j]).T) for j in range(len(K)))
        # gradient = coef.dot(sum(
        #     alpha[j].dot(KKT[j].dot(alpha[j])) for j in range(len(K)))) - sum(
        #         y[j].dot(K[j].dot(alpha[j]).T) for j in range(len(K)))
        # gradient += 2 * beta * coef
        coef = soft_thresholding(coef - gamma * gradient, lamda=lamda * gamma)

        # update alpha
        # for j in range(len(K)):
        #     alpha[j] = _solve_cholesky_kernel(
        #         K[j].T.dot(coef), y[j][..., None], lamda).ravel()
        A = [K[j].T.dot(coef) for j in range(n_patients)]
        alpha_coef_K = [alpha[j].dot(K[j].T.dot(coef))
                        for j in range(n_patients)]
        gradient = [(alpha_coef_K[j] - y[j]).dot(A[j].T) + 2 * beta * alpha[j]
                    for j in range(n_patients)]
        alpha = [alpha[j] - gamma * gradient[j] for j in range(n_patients)]

        objective_new = objective(K, y, alpha, lamda, beta, coef)
        objective_difference = abs(objective_new - objective_old)
        snorm = np.sqrt(squared_norm(coef - w_old) + sum(
            squared_norm(a - a_old) for a, a_old in zip(alpha, alpha_old)))

        obj = objective(K, y, alpha, lamda, beta, coef)
        if verbose and iteration_ % 10 == 0:
            print("obj: %.4f, snorm: %.4f" % (obj, snorm))

        if snorm < tol and objective_difference < tol:
            break
        if np.isnan(snorm) or np.isnan(objective_difference):
            raise ValueError('Optimisation diverged: NaN encountered in '
                             'convergence checks')
    else:
        warnings.warn("Objective did not converge.")

    return_list = [alpha, coef]
    if return_n_iter:
        return_list.append(iteration_)
    return return_list
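# `soft_thresholding` above (also used by the ADMM variants below) is the
# proximal operator of the l1 norm. A minimal sketch, assuming the usual
# elementwise definition prox_{t*||.||_1}(x) = sign(x) * max(|x| - t, 0):
import numpy as np

def soft_thresholding_sketch(x, lamda):
    """Elementwise soft-thresholding (l1 proximal operator)."""
    return np.sign(x) * np.maximum(np.abs(x) - lamda, 0)

# e.g. soft_thresholding_sketch(np.array([-2., 0.5, 3.]), 1.0)
# gives array([-1., 0., 2.]).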
def objective_admm2(x, y, alpha, lamda, beta, w1):
    """Objective function for lasso kernel learning."""
    obj = .5 * sum(squared_norm(x[j] - y[j]) for j in range(len(x)))
    obj += lamda * np.abs(w1).sum()
    obj += beta * sum(squared_norm(a) for a in alpha)
    return obj
def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
                                   init='k-means++', verbose=False,
                                   x_squared_norms=None, random_state=None,
                                   tol=1e-4, precompute_distances=True):
    '''
    Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
    '''
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment
        # TODO: _labels_inertia should be done with cosine distance
        #       since ||a - b||^2 = 2(1 - cos(a, b)) when a, b are unit
        #       normalized, this doesn't really matter.
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters,
                                              distances)

        # l2-normalize centers (this is the main contribution here)
        centers = normalize(centers)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
def enet_kernel_learning_admm(
        K, y, lamda=0.01, beta=0.01, rho=1., max_iter=100, verbose=0,
        rtol=1e-4, tol=1e-4, return_n_iter=True, update_rho_options=None):
    """Elastic Net kernel learning.

    Solve the following problem via ADMM:
        min sum_{i=1}^p 1/2 ||alpha_i * w * K_i - y_i||^2
        + lamda ||w||_1 + beta ||w||_2^2
    """
    n_patients = len(K)
    n_kernels = len(K[0])
    coef = np.ones(n_kernels)

    u_1 = np.zeros(n_kernels)
    u_2 = np.zeros(n_kernels)
    w_1 = np.zeros(n_kernels)
    w_2 = np.zeros(n_kernels)

    w_1_old = w_1.copy()
    w_2_old = w_2.copy()

    checks = []
    for iteration_ in range(max_iter):
        # update alpha
        # solve (AtA + 2I)^-1 (Aty) with A = wK
        A = [K[j].T.dot(coef) for j in range(n_patients)]
        KK = [A[j].dot(A[j].T) for j in range(n_patients)]
        yy = [y[j].dot(A[j]) for j in range(n_patients)]
        alpha = [_solve_cholesky_kernel(
            KK[j], yy[j][..., None], 2).ravel() for j in range(n_patients)]
        # alpha = [_solve_cholesky_kernel(
        #     K_dot_coef[j], y[j][..., None], 0).ravel()
        #     for j in range(n_patients)]

        w_1 = soft_thresholding(coef + u_1, lamda / rho)
        w_2 = prox_laplacian(coef + u_2, beta / rho)

        # equivalent to alpha_dot_K
        # solve (sum(AtA) + 2*rho I)^-1 (sum(Aty) + rho(w1+w2-u1-u2))
        # with A = K * alpha
        A = [K[j].dot(alpha[j]) for j in range(n_patients)]
        KK = sum(A[j].dot(A[j].T) for j in range(n_patients))
        yy = sum(y[j].dot(A[j].T) for j in range(n_patients))
        yy += rho * (w_1 + w_2 - u_1 - u_2)
        coef = _solve_cholesky_kernel(KK, yy[..., None], 2 * rho).ravel()

        # update residuals
        u_1 += coef - w_1
        u_2 += coef - w_2

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(squared_norm(coef - w_1) + squared_norm(coef - w_2))
        snorm = rho * np.sqrt(
            squared_norm(w_1 - w_1_old) + squared_norm(w_2 - w_2_old))

        obj = objective_admm(K, y, alpha, lamda, beta, coef, w_1, w_2)
        check = convergence(
            obj=obj, rnorm=rnorm, snorm=snorm,
            e_pri=np.sqrt(2 * coef.size) * tol + rtol * max(
                np.sqrt(squared_norm(coef) + squared_norm(coef)),
                np.sqrt(squared_norm(w_1) + squared_norm(w_2))),
            e_dual=np.sqrt(2 * coef.size) * tol + rtol * rho * (
                np.sqrt(squared_norm(u_1) + squared_norm(u_2))))

        w_1_old = w_1.copy()
        w_2_old = w_2.copy()

        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check)

        checks.append(check)
        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual \
                and iteration_ > 1:
            break

        rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_,
                             **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        u_1 *= rho / rho_new
        u_2 *= rho / rho_new
        rho = rho_new
    else:
        warnings.warn("Objective did not converge.")

    return_list = [alpha, coef]
    if return_n_iter:
        return_list.append(iteration_)
    return return_list
def _norm(x):
    """Dot product-based Euclidean norm implementation.

    See: http://fseoane.net/blog/2011/computing-the-vector-norm/
    """
    return np.sqrt(squared_norm(x))
def matrix_factorization(X, H=None, n_components=None, init=None,
                         update_H=True, tol=1e-4, max_iter=200, alpha=0.01,
                         beta=0.02):
    n_samples, n_features = X.shape
    if n_components is None:
        n_components = n_features

    # check W and H, or initialize them
    if not update_H:
        W = np.zeros((n_samples, n_components))
    else:
        W, H = _initialize_nmf(X, n_components, init=init, eps=1e-6)
    print(W)
    print(H)

    n_iter = 0
    e_before = 0
    for step in range(max_iter):
        n_iter = step + 1
        print(n_iter)
        xs, ys = X.nonzero()  # the row and column indices of nonzero entries
        W_temp = W
        ER = X - np.dot(W, H)  # the error matrix

        for i in range(n_samples):
            for k in range(n_components):
                total = 0
                for j in ys[xs == i]:
                    total += ER[i][j] * H[k][j]
                t = W[i][k] + alpha * (total - beta * W[i][k])
                if t < 0:
                    # halve the step size until the update stays non-negative
                    a = alpha
                    for l in range(10):
                        a /= 2
                        t = W[i][k] + a * (total - beta * W[i][k])
                        if t >= 0:
                            break
                    if t < 0:
                        t = W[i][k]
                W[i][k] = t

        if update_H:
            for j in range(n_features):
                for k in range(n_components):
                    total = 0
                    for i in xs[ys == j]:
                        total += ER[i][j] * W_temp[i][k]
                    t = H[k][j] + alpha * (total - beta * H[k][j])
                    if t < 0:
                        a = alpha
                        for l in range(10):
                            a /= 2
                            t = H[k][j] + a * (total - beta * H[k][j])
                            if t >= 0:
                                break
                        if t < 0:
                            t = H[k][j]
                    H[k][j] = t

        E = (X - np.dot(W, H)) * (X > 0)
        e = squared_norm(E) + beta * (squared_norm(W) + squared_norm(H))
        # if step > 0:
        #     if abs(e / e_before - 1) < tol:
        #         break
        # e_before = e
        print(e)
        if e < tol:
            break

    if n_iter == max_iter:
        print("Maximum number of iterations %d reached. Increase it to"
              " improve convergence." % max_iter)

    return W, H, n_iter
def norm(x):
    return sqrt(squared_norm(x))
b = pd.DataFrame(temp)
print(b)
b0 = b.fillna(0)
X = np.array(b0)
# print(X.mean())
# print((X[X > 0]).mean())
# X[X == 0] = (X[X > 0]).mean()
# print(X)
# U, S, V = randomized_svd(X, 20)
U, S, V = svds(sparse.csr_matrix(X), k=50, maxiter=2000)
S = vector_to_diagonal(S)
print(X)
print(U)
print(S)
print(V)

recon = pd.DataFrame(np.dot(U, np.dot(S, V)), b0.index, b0.columns)
recon[recon < 1] = 1
recon[recon > 5] = 5
# recon.to_csv('svdrecon.csv')
print(recon)

d = (t0 - recon) * (t0 > 0)
# d.to_csv('svdd.csv')
d.fillna(0, inplace=True)
print(squared_norm(d))
def _objective_func(self, w):
    z = self.Aw.shape[0] + squared_norm(self.AXw) - 2. * self.AXw.sum()
    val = 0.5 * squared_norm(w) + 0.5 * self.alpha * z
    return val
def enet_kernel_learning_admm2(
        K, y, lamda=0.01, beta=0.01, rho=1., max_iter=100, verbose=0,
        rtol=1e-4, tol=1e-4, return_n_iter=True, update_rho_options=None):
    """Elastic Net kernel learning.

    Solve the following problem via ADMM:
        min sum_{i=1}^p 1/2 ||y_i - alpha_i * sum_{k=1}^{n_k} w_k * K_{ik}||^2
        + lamda ||w||_1 + beta sum_{j=1}^{c_i} ||alpha_j||_2^2
    """
    n_patients = len(K)
    n_kernels = len(K[0])
    coef = np.ones(n_kernels)
    alpha = [np.zeros(K[j].shape[2]) for j in range(n_patients)]
    u = [np.zeros(K[j].shape[1]) for j in range(n_patients)]
    u_1 = np.zeros(n_kernels)
    w_1 = np.zeros(n_kernels)

    x_old = [np.zeros(K[0].shape[1]) for j in range(n_patients)]
    w_1_old = w_1.copy()
    # w_2_old = w_2.copy()

    checks = []
    for iteration_ in range(max_iter):
        # update x
        A = [K[j].T.dot(coef) for j in range(n_patients)]
        x = [prox_laplacian(y[j] + rho * (A[j].T.dot(alpha[j]) - u[j]),
                            rho / 2.) for j in range(n_patients)]

        # update alpha
        # solve (AtA + 2I)^-1 (Aty) with A = wK
        KK = [rho * A[j].dot(A[j].T) for j in range(n_patients)]
        yy = [rho * A[j].dot(x[j] + u[j]) for j in range(n_patients)]
        alpha = [_solve_cholesky_kernel(
            KK[j], yy[j][..., None], 2 * beta).ravel()
            for j in range(n_patients)]

        # equivalent to alpha_dot_K
        # solve (sum(AtA) + 2*rho I)^-1 (sum(Aty) + rho(w1+w2-u1-u2))
        # with A = K * alpha
        A = [K[j].dot(alpha[j]) for j in range(n_patients)]
        KK = sum(A[j].dot(A[j].T) for j in range(n_patients))
        yy = sum(A[j].dot(x[j] + u[j]) for j in range(n_patients))
        yy += w_1 - u_1
        coef = _solve_cholesky_kernel(KK, yy[..., None], 1).ravel()

        w_1 = soft_thresholding(coef + u_1, lamda / rho)
        # w_2 = prox_laplacian(coef + u_2, beta / rho)

        # update residuals
        alpha_coef_K = [
            alpha[j].dot(K[j].T.dot(coef)) for j in range(n_patients)]
        residuals = [x[j] - alpha_coef_K[j] for j in range(n_patients)]
        u = [u[j] + residuals[j] for j in range(n_patients)]
        u_1 += coef - w_1

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(
            squared_norm(coef - w_1) +
            sum(squared_norm(residuals[j]) for j in range(n_patients)))
        snorm = rho * np.sqrt(
            squared_norm(w_1 - w_1_old) +
            sum(squared_norm(x[j] - x_old[j]) for j in range(n_patients)))

        obj = objective_admm2(x, y, alpha, lamda, beta, w_1)
        check = convergence(
            obj=obj, rnorm=rnorm, snorm=snorm,
            e_pri=np.sqrt(coef.size + sum(
                x[j].size for j in range(n_patients))) * tol + rtol * max(
                    np.sqrt(squared_norm(coef) + sum(squared_norm(
                        alpha_coef_K[j]) for j in range(n_patients))),
                    np.sqrt(squared_norm(w_1) + sum(squared_norm(
                        x[j]) for j in range(n_patients)))),
            e_dual=np.sqrt(coef.size + sum(
                x[j].size for j in range(n_patients))) * tol + rtol * rho * (
                    np.sqrt(squared_norm(u_1) + sum(squared_norm(
                        u[j]) for j in range(n_patients)))))

        w_1_old = w_1.copy()
        x_old = [x[j].copy() for j in range(n_patients)]

        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check)

        checks.append(check)
        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual \
                and iteration_ > 1:
            break

        rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_,
                             **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        u = [u[j] * (rho / rho_new) for j in range(n_patients)]
        u_1 *= rho / rho_new
        rho = rho_new
    else:
        warnings.warn("Objective did not converge.")

    return_list = [alpha, coef]
    if return_n_iter:
        return_list.append(iteration_)
    return return_list
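# Both ADMM routines above adapt the penalty `rho` through an `update_rho`
# helper that is not shown. A sketch of the usual residual-balancing rule
# (Boyd et al. 2011, sec. 3.4.1); the constants are common defaults, not
# values taken from this code base:
def update_rho_sketch(rho, rnorm, snorm, mu=10., tau_inc=2., tau_dec=2.,
                      **kwargs):
    """Grow rho when the primal residual dominates, shrink it when the dual
    residual dominates, and leave it unchanged otherwise."""
    if rnorm > mu * snorm:
        return rho * tau_inc
    if snorm > mu * rnorm:
        return rho / tau_dec
    return rho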
def _objective_func(self, w):
    val = 0.5 * squared_norm(w) + 0.5 * self.alpha * squared_norm(self.L)
    return val