def test_primal_dual_relationship():
    y = y_diabetes.reshape(-1, 1)
    coef = _solve_cholesky(X_diabetes, y, alpha=[1e-2])
    K = np.dot(X_diabetes, X_diabetes.T)
    dual_coef = _solve_cholesky_kernel(K, y, alpha=[1e-2])
    coef2 = np.dot(X_diabetes.T, dual_coef).T
    assert_array_almost_equal(coef, coef2)

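# The identity exercised by this test (the primal ridge coefficients equal
# X^T times the dual coefficients) can be checked with plain NumPy, without
# the private scikit-learn helpers. A minimal sketch on synthetic data; all
# names below are illustrative only:
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(50, 8)
y = rng.randn(50, 1)
alpha = 1e-2

# Primal ridge: coef = (X^T X + alpha I)^{-1} X^T y
coef_primal = np.linalg.solve(X.T.dot(X) + alpha * np.eye(8), X.T.dot(y))

# Dual (kernel) ridge: dual_coef = (X X^T + alpha I)^{-1} y, coef = X^T dual_coef
K = X.dot(X.T)
dual_coef = np.linalg.solve(K + alpha * np.eye(50), y)
coef_dual = X.T.dot(dual_coef)

np.testing.assert_allclose(coef_primal, coef_dual, atol=1e-7)
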
def test_ridge_sample_weights_in_feature_space():
    """Check that Cholesky solver in feature space applies sample_weights
    correctly.
    """
    rng = np.random.RandomState(42)

    n_samples_list = [5, 6, 7] * 2
    n_features_list = [7, 6, 5] * 2
    n_targets_list = [1, 1, 1, 2, 2, 2]
    noise = 1.
    alpha = 2.
    alpha = np.atleast_1d(alpha)

    for n_samples, n_features, n_targets in zip(n_samples_list,
                                                n_features_list,
                                                n_targets_list):
        X = rng.randn(n_samples, n_features)
        beta = rng.randn(n_features, n_targets)
        Y = X.dot(beta)
        Y_noisy = Y + rng.randn(*Y.shape) * np.sqrt((Y ** 2).sum(0)) * noise

        K = X.dot(X.T)
        sample_weights = 1. + (rng.randn(n_samples) ** 2) * 10

        coef_sample_space = _solve_cholesky_kernel(
            K, Y_noisy, alpha, sample_weight=sample_weights)

        coef_feature_space = _solve_cholesky(
            X, Y_noisy, alpha, sample_weight=sample_weights)

        assert_array_almost_equal(X.T.dot(coef_sample_space),
                                  coef_feature_space.T)

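# For reference, the feature-space quantity this test compares against is the
# closed-form weighted ridge solution coef = (X^T W X + alpha I)^{-1} X^T W y.
# A minimal NumPy sketch of that formula (illustrative names; this is not the
# scikit-learn helper itself):
import numpy as np


def weighted_ridge_primal(X, y, alpha, sample_weight):
    """Weighted ridge coefficients in feature space, W = diag(sample_weight)."""
    Xw = X * sample_weight[:, None]                  # W X (row-wise scaling)
    A = X.T.dot(Xw) + alpha * np.eye(X.shape[1])     # X^T W X + alpha I
    b = Xw.T.dot(y)                                  # X^T W y
    return np.linalg.solve(A, b)
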
def predict(self, X):
    Weights_S = np.multiply(self.alphas.ravel(), np.sum(self.E_mat, axis=1))
    Weights_all = np.hstack(
        (np.ones(self.n_aux_samples), (Weights_S / np.max(Weights_S))))
    Ysp = np.vstack((self.Y_aux_, self.y_fit_))
    Xsp = np.vstack((self.X_aux_, self.X_fit_))
    KK = self._get_kernel(Xsp)
    Kx = self._get_kernel(Xsp, X)
    self.a_prediction = _solve_cholesky_kernel(KK, Ysp, self.lmbd,
                                               sample_weight=Weights_all)
    y_prediction = np.dot(Kx.T, self.a_prediction)
    return y_prediction

def fit(self, X, y=None, sample_weight=None):
    """Fit Kernel Ridge regression model

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training data

    y : array-like, shape = [n_samples] or [n_samples, n_targets]
        Target values

    sample_weight : float or numpy array of shape [n_samples]
        Individual weights for each sample, ignored if None is passed.

    Returns
    -------
    self : returns an instance of self.
    """
    # Convert data
    X, y = check_X_y(X, y, multi_output=True)

    n_samples = X.shape[0]
    K = self._get_kernel(X)
    alpha = np.atleast_1d(self.alpha)

    ravel = False
    if len(y.shape) == 1:
        y = y.reshape(-1, 1)
        ravel = True

    copy = self.kernel == "precomputed"
    self.dual_coef_ = _solve_cholesky_kernel(K, y, alpha,
                                             sample_weight, copy)
    if ravel:
        self.dual_coef_ = self.dual_coef_.ravel()

    self.X_fit_ = X

    return self

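# Typical use of the estimator whose ``fit`` is shown above, through the
# public scikit-learn API (the data and parameter values are arbitrary
# examples):
import numpy as np
from sklearn.kernel_ridge import KernelRidge

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = X[:, 0] - 2 * X[:, 1] + 0.1 * rng.randn(100)

model = KernelRidge(alpha=1.0, kernel="rbf", gamma=0.1)
model.fit(X, y)            # internally solves (K + alpha I) dual_coef = y
y_pred = model.predict(X)  # kernel between new and fitted points @ dual_coef_
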
def enet_kernel_learning_admm2(
        K, y, lamda=0.01, beta=0.01, rho=1., max_iter=100, verbose=0,
        rtol=1e-4, tol=1e-4, return_n_iter=True, update_rho_options=None):
    """Elastic Net kernel learning.

    Solve the following problem via ADMM:
        min sum_{i=1}^p 1/2 ||y_i - alpha_i * sum_{k=1}^{n_k} w_k * K_{ik}||^2
            + lamda ||w||_1 + beta sum_{j=1}^{c_i} ||alpha_j||_2^2
    """
    n_patients = len(K)
    n_kernels = len(K[0])
    coef = np.ones(n_kernels)
    alpha = [np.zeros(K[j].shape[2]) for j in range(n_patients)]
    u = [np.zeros(K[j].shape[1]) for j in range(n_patients)]
    u_1 = np.zeros(n_kernels)
    w_1 = np.zeros(n_kernels)

    x_old = [np.zeros(K[0].shape[1]) for j in range(n_patients)]
    w_1_old = w_1.copy()
    # w_2_old = w_2.copy()

    checks = []
    for iteration_ in range(max_iter):
        # update x
        A = [K[j].T.dot(coef) for j in range(n_patients)]
        x = [prox_laplacian(y[j] + rho * (A[j].T.dot(alpha[j]) - u[j]),
                            rho / 2.) for j in range(n_patients)]

        # update alpha
        # solve (AtA + 2I)^-1 (Aty) with A = wK
        KK = [rho * A[j].dot(A[j].T) for j in range(n_patients)]
        yy = [rho * A[j].dot(x[j] + u[j]) for j in range(n_patients)]
        alpha = [_solve_cholesky_kernel(
            KK[j], yy[j][..., None], 2 * beta).ravel()
            for j in range(n_patients)]

        # equivalent to alpha_dot_K
        # solve (sum(AtA) + 2*rho I)^-1 (sum(Aty) + rho(w1+w2-u1-u2))
        # with A = K * alpha
        A = [K[j].dot(alpha[j]) for j in range(n_patients)]
        KK = sum(A[j].dot(A[j].T) for j in range(n_patients))
        yy = sum(A[j].dot(x[j] + u[j]) for j in range(n_patients))
        yy += w_1 - u_1
        coef = _solve_cholesky_kernel(KK, yy[..., None], 1).ravel()

        w_1 = soft_thresholding(coef + u_1, lamda / rho)
        # w_2 = prox_laplacian(coef + u_2, beta / rho)

        # update residuals
        alpha_coef_K = [
            alpha[j].dot(K[j].T.dot(coef)) for j in range(n_patients)]
        residuals = [x[j] - alpha_coef_K[j] for j in range(n_patients)]
        u = [u[j] + residuals[j] for j in range(n_patients)]
        u_1 += coef - w_1

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(
            squared_norm(coef - w_1) +
            sum(squared_norm(residuals[j]) for j in range(n_patients)))
        snorm = rho * np.sqrt(
            squared_norm(w_1 - w_1_old) +
            sum(squared_norm(x[j] - x_old[j]) for j in range(n_patients)))

        obj = objective_admm2(x, y, alpha, lamda, beta, w_1)
        check = convergence(
            obj=obj, rnorm=rnorm, snorm=snorm,
            e_pri=np.sqrt(coef.size + sum(
                x[j].size for j in range(n_patients))) * tol + rtol * max(
                    np.sqrt(squared_norm(coef) + sum(squared_norm(
                        alpha_coef_K[j]) for j in range(n_patients))),
                    np.sqrt(squared_norm(w_1) + sum(squared_norm(
                        x[j]) for j in range(n_patients)))),
            e_dual=np.sqrt(coef.size + sum(
                x[j].size for j in range(n_patients))) * tol + rtol * rho * (
                    np.sqrt(squared_norm(u_1) + sum(squared_norm(
                        u[j]) for j in range(n_patients)))))

        w_1_old = w_1.copy()
        x_old = [x[j].copy() for j in range(n_patients)]

        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check)

        checks.append(check)
        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual \
                and iteration_ > 1:
            break

        rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_,
                             **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        u = [u[j] * (rho / rho_new) for j in range(n_patients)]
        u_1 *= rho / rho_new
        rho = rho_new
    else:
        warnings.warn("Objective did not converge.")

    return_list = [alpha, coef]
    if return_n_iter:
        return_list.append(iteration_)
    return return_list

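# ``soft_thresholding`` and ``prox_laplacian`` are imported helpers not shown
# in these snippets. A minimal sketch of the standard proximal operators they
# presumably implement (an assumption, not the library's actual code):
import numpy as np


def soft_thresholding(a, lamda):
    """prox of lamda * ||.||_1, applied elementwise."""
    return np.sign(a) * np.maximum(np.abs(a) - lamda, 0.)


def prox_laplacian(a, lamda):
    """prox of lamda * ||.||_2^2, i.e. simple shrinkage a / (1 + 2 * lamda)."""
    return a / (1. + 2. * lamda)
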
def enet_kernel_learning_admm(
        K, y, lamda=0.01, beta=0.01, rho=1., max_iter=100, verbose=0,
        rtol=1e-4, tol=1e-4, return_n_iter=True, update_rho_options=None):
    """Elastic Net kernel learning.

    Solve the following problem via ADMM:
        min sum_{i=1}^p 1/2 ||alpha_i * w * K_i - y_i||^2
            + lamda ||w||_1 + beta ||w||_2^2
    """
    n_patients = len(K)
    n_kernels = len(K[0])
    coef = np.ones(n_kernels)
    u_1 = np.zeros(n_kernels)
    u_2 = np.zeros(n_kernels)
    w_1 = np.zeros(n_kernels)
    w_2 = np.zeros(n_kernels)

    w_1_old = w_1.copy()
    w_2_old = w_2.copy()

    checks = []
    for iteration_ in range(max_iter):
        # update alpha
        # solve (AtA + 2I)^-1 (Aty) with A = wK
        A = [K[j].T.dot(coef) for j in range(n_patients)]
        KK = [A[j].dot(A[j].T) for j in range(n_patients)]
        yy = [y[j].dot(A[j]) for j in range(n_patients)]
        alpha = [_solve_cholesky_kernel(
            KK[j], yy[j][..., None], 2).ravel() for j in range(n_patients)]
        # alpha = [_solve_cholesky_kernel(
        #     K_dot_coef[j], y[j][..., None], 0).ravel()
        #     for j in range(n_patients)]

        w_1 = soft_thresholding(coef + u_1, lamda / rho)
        w_2 = prox_laplacian(coef + u_2, beta / rho)

        # equivalent to alpha_dot_K
        # solve (sum(AtA) + 2*rho I)^-1 (sum(Aty) + rho(w1+w2-u1-u2))
        # with A = K * alpha
        A = [K[j].dot(alpha[j]) for j in range(n_patients)]
        KK = sum(A[j].dot(A[j].T) for j in range(n_patients))
        yy = sum(y[j].dot(A[j].T) for j in range(n_patients))
        yy += rho * (w_1 + w_2 - u_1 - u_2)
        coef = _solve_cholesky_kernel(KK, yy[..., None], 2 * rho).ravel()

        # update residuals
        u_1 += coef - w_1
        u_2 += coef - w_2

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(squared_norm(coef - w_1) + squared_norm(coef - w_2))
        snorm = rho * np.sqrt(
            squared_norm(w_1 - w_1_old) + squared_norm(w_2 - w_2_old))

        obj = objective_admm(K, y, alpha, lamda, beta, coef, w_1, w_2)
        check = convergence(
            obj=obj, rnorm=rnorm, snorm=snorm,
            e_pri=np.sqrt(2 * coef.size) * tol + rtol * max(
                np.sqrt(squared_norm(coef) + squared_norm(coef)),
                np.sqrt(squared_norm(w_1) + squared_norm(w_2))),
            e_dual=np.sqrt(2 * coef.size) * tol + rtol * rho * (
                np.sqrt(squared_norm(u_1) + squared_norm(u_2))))

        w_1_old = w_1.copy()
        w_2_old = w_2.copy()

        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check)

        checks.append(check)
        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual \
                and iteration_ > 1:
            break

        rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_,
                             **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        u_1 *= rho / rho_new
        u_2 *= rho / rho_new
        rho = rho_new
    else:
        warnings.warn("Objective did not converge.")

    return_list = [alpha, coef]
    if return_n_iter:
        return_list.append(iteration_)
    return return_list

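# ``update_rho`` is likewise not shown here. A sketch of the residual-balancing
# rule from Boyd et al. that such helpers commonly implement (hypothetical name
# and defaults; an assumption about the helper's behaviour, not its code):
def residual_balancing_rho(rho, rnorm, snorm, iteration=None,
                           mu=10., tau_inc=2., tau_dec=2.):
    """Grow rho when the primal residual dominates, shrink it otherwise."""
    if rnorm > mu * snorm:
        return rho * tau_inc
    if snorm > mu * rnorm:
        return rho / tau_dec
    return rho
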
def lasso_kernel_admm(
        K, y, lamda=0.01, rho=1., max_iter=100, verbose=0, rtol=1e-4,
        tol=1e-4, return_n_iter=True, update_rho_options=None,
        sample_weight=None):
    """Elastic Net kernel learning.

    Solve the following problem via ADMM:
        min sum_{i=1}^p 1/2 ||y_i - alpha_i * sum_{k=1}^{n_k} w_k * K_{ik}||^2
            + lamda ||w||_1 + beta sum_{j=1}^{c_i} ||alpha_j||_2^2
    """
    n_kernels, n_samples, n_features = K.shape
    coef = np.ones(n_kernels)

    # alpha = [np.zeros(K[j].shape[2]) for j in range(n_patients)]
    # u = [np.zeros(K[j].shape[1]) for j in range(n_patients)]
    w_1 = coef.copy()
    u_1 = np.zeros(n_kernels)

    # x_old = [np.zeros(K[0].shape[1]) for j in range(n_patients)]
    w_1_old = w_1.copy()

    Y = y[:, None].dot(y[:, None].T)

    checks = []
    for iteration_ in range(max_iter):
        # update w
        KK = 2 * np.tensordot(K, K.T, axes=([1, 2], [0, 1]))
        yy = 2 * np.tensordot(Y, K, axes=([0, 1], [1, 2]))
        yy += rho * (w_1 - u_1)
        coef = _solve_cholesky_kernel(KK, yy[..., None], rho).ravel()

        w_1 = soft_thresholding(coef + u_1, lamda / rho)
        # w_2 = prox_laplacian(coef + u_2, beta / rho)

        u_1 += coef - w_1

        # diagnostics, reporting, termination checks
        rnorm = np.sqrt(squared_norm(coef - w_1))
        snorm = rho * np.sqrt(squared_norm(w_1 - w_1_old))

        obj = lasso_objective(Y, coef, K, w_1, lamda)
        check = convergence(
            obj=obj, rnorm=rnorm, snorm=snorm,
            e_pri=np.sqrt(coef.size) * tol + rtol * max(
                np.sqrt(squared_norm(coef)), np.sqrt(squared_norm(w_1))),
            e_dual=np.sqrt(coef.size) * tol + rtol * rho * (
                np.sqrt(squared_norm(u_1))))

        w_1_old = w_1.copy()

        if verbose:
            print("obj: %.4f, rnorm: %.4f, snorm: %.4f,"
                  "eps_pri: %.4f, eps_dual: %.4f" % check)

        checks.append(check)
        if check.rnorm <= check.e_pri and check.snorm <= check.e_dual \
                and iteration_ > 1:
            break

        rho_new = update_rho(rho, rnorm, snorm, iteration=iteration_,
                             **(update_rho_options or {}))
        # scaled dual variables should be also rescaled
        u_1 *= rho / rho_new
        rho = rho_new
    else:
        warnings.warn("Objective did not converge.")

    return_list = [coef]
    if return_n_iter:
        return_list.append(iteration_)
    return return_list

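# In the unweighted case, each ``_solve_cholesky_kernel(KK, yy, rho)`` call in
# the ADMM updates above amounts to one regularised linear solve. A minimal,
# equivalent computation with SciPy (not the scikit-learn helper itself):
import numpy as np
from scipy import linalg


def ridge_kernel_solve(K, y, alpha):
    """Return (K + alpha I)^{-1} y for symmetric positive semi-definite K."""
    return linalg.solve(K + alpha * np.eye(K.shape[0]), y, assume_a="pos")
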
def fit(self, X, y=None, sample_weight=None):
    """Fit Kernel Ridge regression model

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training data

    y : array-like, shape = [n_samples] or [n_samples, n_targets]
        Target values

    sample_weight : float or array-like of shape [n_samples]
        Individual weights for each sample, ignored if None is passed.

    Returns
    -------
    self : returns an instance of self.
    """
    # Convert data
    X, y = check_X_y(X, y, accept_sparse=("csr", "csc"),
                     multi_output=True, y_numeric=True)
    if sample_weight is not None and not isinstance(sample_weight, float):
        sample_weight = check_array(sample_weight, ensure_2d=False)

    if self.kernel_mat is None:
        self.kernel_mat = self._get_kernel(X, nystroem_kernel=self.nystroem)
    alpha = np.atleast_1d(self.alpha)  # XXX not needed anymore for KTBoost?

    ravel = False
    if len(y.shape) == 1:
        y = y.reshape(-1, 1)
        ravel = True

    if self.nystroem:
        # XXX maybe this can be done better (without the need for copying)
        y = y[self.component_indices].copy()
        if sample_weight is not None:
            sample_weight = sample_weight[self.component_indices].copy()
        X = X[self.component_indices].copy()

    if self.sparse:
        if self.solve_kernel is None:
            # Need to copy since for the weighted case, the matrix gets modified.
            K = self.kernel_mat.copy()
            self.dual_coef_ = _solve_cholesky_kernel_sparse(
                K, y, alpha, sample_weight)
        else:
            self.dual_coef_ = self.solve_kernel(y)
    else:
        if self.solve_kernel is None:
            # Need to copy since for the weighted case, the matrix gets modified.
            self.dual_coef_ = _solve_cholesky_kernel(
                self.kernel_mat, y, alpha, sample_weight, copy=True)
        else:
            self.dual_coef_ = self.solve_kernel.dot(y)

    if ravel:
        self.dual_coef_ = self.dual_coef_.ravel()

    self.X_fit_ = X

    return self

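# The "matrix gets modified" comments above refer to how sample weights are
# folded into the kernel system in place. A sketch of the usual sqrt-weight
# rescaling (it mirrors scikit-learn's approach; treat it as an illustration,
# not the exact helper):
import numpy as np


def weighted_kernel_ridge_dual(K, y, alpha, sample_weight):
    """Dual coefficients of weighted kernel ridge; y has shape (n, n_targets)."""
    sw = np.sqrt(sample_weight)
    Kw = K * np.outer(sw, sw)            # W^{1/2} K W^{1/2} (as a new array)
    yw = y * sw[:, None]                 # W^{1/2} y
    dual = np.linalg.solve(Kw + alpha * np.eye(K.shape[0]), yw)
    return dual * sw[:, None]            # rescale so that coef = X^T dual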