def _check_non_neg_array(self, X, whom):
    """Check X format.

    Check X format and make sure there is no negative value in X.

    Parameters
    ----------
    X : array-like
    """
    X = check_array(X)
    check_non_negative(X, whom)
    return X

def _check_non_neg_array(self, X, whom):
    """Check X format.

    Check X format and make sure there is no negative value in X.

    Parameters
    ----------
    X : array-like or sparse matrix
    """
    X = check_array(X, accept_sparse='csr')
    check_non_negative(X, whom)
    return X

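# A minimal sketch (not from the original sources) of how the two helpers
# above behave: check_array normalizes the container and check_non_negative
# raises ValueError on any negative entry. The `whom` string below is an
# arbitrary label, not taken from the snippets.
import numpy as np
from scipy import sparse as sp
from sklearn.utils import check_array
from sklearn.utils.validation import check_non_negative

X = sp.csr_matrix(np.array([[1.0, 0.0], [2.0, 3.0]]))
X = check_array(X, accept_sparse='csr')     # keeps CSR, validates dtype/shape
check_non_negative(X, "DemoEstimator")      # passes: all entries >= 0

try:
    check_non_negative(np.array([[1.0, -1.0]]), "DemoEstimator")
except ValueError as exc:
    print(exc)  # "Negative values in data passed to DemoEstimator"
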
def fit(self, X, y=None, **params):
    X = check_array(X, accept_sparse=('csr', 'csc'),
                    dtype=[np.float64, np.float32])
    try:
        check_non_negative(X, whom='')
    except ValueError:
        raise ValueError("Some non-informative error msg")
    return super().fit(X, y, **params)

def test_check_non_negative(retype):
    A = np.array([[1, 1, 0, 0],
                  [1, 1, 0, 0],
                  [0, 0, 0, 0],
                  [0, 0, 0, 0]])
    X = retype(A)
    check_non_negative(X, "")

    X = retype([[0, 0], [0, 0]])
    check_non_negative(X, "")

    A[0, 0] = -1
    X = retype(A)
    assert_raises_regex(ValueError, "Negative ", check_non_negative, X, "")

def _fit_transform(self, X, y=None, W=None, H=None, update_H=True):
    X = check_array(X, accept_sparse=('csr', 'csc'))
    check_non_negative(X, "NMF (input X)")

    n_samples, n_features = X.shape
    n_components = self.n_components
    if n_components is None:
        n_components = n_features

    if (not isinstance(n_components, numbers.Integral) or
            n_components <= 0):
        raise ValueError("Number of components must be a positive integer;"
                         " got (n_components=%r)" % n_components)
    if (not isinstance(self.max_iter, numbers.Integral) or
            self.max_iter < 0):
        raise ValueError("Maximum number of iterations must be a positive "
                         "integer; got (max_iter=%r)" % self.max_iter)
    if not isinstance(self.tol, numbers.Number) or self.tol < 0:
        raise ValueError("Tolerance for stopping criteria must be "
                         "positive; got (tol=%r)" % self.tol)

    # check W and H, or initialize them
    if self.init == 'custom' and update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        _check_init(W, (n_samples, n_components), "NMF (input W)")
    elif not update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        W = np.zeros((n_samples, n_components))
    else:
        W, H = _initialize_nmf(X, n_components, init=self.init,
                               random_state=self.random_state)

    if update_H:  # fit_transform
        W, H, n_iter = _fit_projected_gradient(X, W, H, self.tol,
                                               self.max_iter,
                                               self.nls_max_iter,
                                               self.alpha, self.l1_ratio)
    else:  # transform
        Wt, _, n_iter = _nls_subproblem(X.T, H.T, W.T, self.tol,
                                        self.nls_max_iter,
                                        alpha=self.alpha,
                                        l1_ratio=self.l1_ratio)
        W = Wt.T

    if n_iter == self.max_iter and self.tol > 0:
        warnings.warn("Maximum number of iteration %d reached. Increase it"
                      " to improve convergence." % self.max_iter,
                      ConvergenceWarning)

    return W, H, n_iter

def _check_init(A, shape, whom):
    """Check initialization of A.

    Method copied from scikit-learn NMF source code.
    """
    A = check_array(A)
    if np.shape(A) != shape:
        raise ValueError('Array with wrong shape passed to %s. Expected %s, '
                         'but got %s ' % (whom, shape, np.shape(A)))
    check_non_negative(A, whom)
    if np.max(A) == 0:
        raise ValueError('Array passed to %s is full of zeros.' % whom)

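# Hedged usage sketch for _check_init as defined above: it validates shape,
# non-negativity, and that the candidate W or H is not all zeros. The inputs
# here are illustrative, not from the source.
import numpy as np

H0 = np.array([[0.5, 0.0, 1.0],
               [0.1, 0.2, 0.0]])
_check_init(H0, (2, 3), "NMF (input H)")   # passes silently
# _check_init(np.zeros((2, 3)), (2, 3), "NMF (input H)") would raise:
#   "Array passed to NMF (input H) is full of zeros."
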
def _check_alphas(self):
    create_path = self.alphas is None
    if create_path:
        if self.n_alphas <= 0:
            raise ValueError("n_alphas must be a positive integer")
        alphas = numpy.empty(int(self.n_alphas), dtype=numpy.float64)
    else:
        alphas = column_or_1d(self.alphas, warn=True)
        assert_all_finite(alphas)
        check_non_negative(alphas, "alphas")
        assert_all_finite(alphas)
    return alphas, create_path

def _check_penalty_factor(self, n_features):
    if self.penalty_factor is None:
        penalty_factor = numpy.ones(n_features, dtype=numpy.float64)
    else:
        pf = column_or_1d(self.penalty_factor, warn=True)
        if pf.shape[0] != n_features:
            raise ValueError(
                "penalty_factor must be array of length n_features (%d), "
                "but got %d" % (n_features, pf.shape[0]))
        assert_all_finite(pf)
        check_non_negative(pf, "penalty_factor")
        penalty_factor = pf * n_features / pf.sum()
        assert_all_finite(penalty_factor)
    return penalty_factor

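# Worked example (illustrative, not from the source): the rescaling
# pf * n_features / pf.sum() makes the penalty factors average to 1, so the
# overall regularization strength is independent of their absolute scale.
import numpy as np

pf = np.array([1.0, 2.0, 1.0, 4.0])   # n_features = 4, sum = 8
penalty_factor = pf * 4 / pf.sum()    # -> [0.5, 1.0, 0.5, 2.0]
print(penalty_factor.mean())          # 1.0
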
def _fit_transform(self, X, y=None, W=None, H=None, update_H=True):
    X = check_array(X, accept_sparse=('csr', 'csc'))
    check_non_negative(X, "NMF (input X)")

    n_samples, n_features = X.shape
    n_components = self.n_components
    if n_components is None:
        n_components = n_features

    if (not isinstance(n_components, INTEGER_TYPES) or
            n_components <= 0):
        raise ValueError("Number of components must be a positive integer;"
                         " got (n_components=%r)" % n_components)
    if not isinstance(self.max_iter, INTEGER_TYPES) or self.max_iter < 0:
        raise ValueError("Maximum number of iterations must be a positive "
                         "integer; got (max_iter=%r)" % self.max_iter)
    if not isinstance(self.tol, numbers.Number) or self.tol < 0:
        raise ValueError("Tolerance for stopping criteria must be "
                         "positive; got (tol=%r)" % self.tol)

    # check W and H, or initialize them
    if self.init == 'custom' and update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        _check_init(W, (n_samples, n_components), "NMF (input W)")
    elif not update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        W = np.zeros((n_samples, n_components))
    else:
        W, H = _initialize_nmf(X, n_components, init=self.init,
                               random_state=self.random_state)

    if update_H:  # fit_transform
        W, H, n_iter = _fit_projected_gradient(
            X, W, H, self.tol, self.max_iter, self.nls_max_iter,
            self.alpha, self.l1_ratio)
    else:  # transform
        Wt, _, n_iter = _nls_subproblem(X.T, H.T, W.T, self.tol,
                                        self.nls_max_iter,
                                        alpha=self.alpha,
                                        l1_ratio=self.l1_ratio)
        W = Wt.T

    if n_iter == self.max_iter and self.tol > 0:
        warnings.warn("Maximum number of iteration %d reached. Increase it"
                      " to improve convergence." % self.max_iter,
                      ConvergenceWarning)

    return W, H, n_iter

def fit(self, X, y=None):
    """Learn the idf (n_samples) and beta vectors (n_features).

    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        A matrix of term/token counts.
    """
    check_array(X, accept_sparse=['csr', 'csc'])
    if not sp.issparse(X):
        X = sp.csc_matrix(X)
    n_samples, n_features = X.shape

    # if X is an array of zeros, raise ValueError
    if X.sum() == 0:
        raise ValueError("X is an array of zeros")

    # raise a ValueError if there are negative values in X
    check_non_negative(X, "Bm25Transformer")

    df = _document_frequency(X)

    if self.smooth_idf and self.es_style_idf_smooth:
        # implementing what elasticsearch does
        df = df.astype(float)
        n_samples_calc = n_samples - df + 0.5
        idf = np.log(1.0 + n_samples_calc / (df + 0.5))
    else:
        # original version, identical to what TfidfTransformer implements:
        # perform idf smoothing if required
        df += int(self.smooth_idf)
        n_samples_calc = n_samples + int(self.smooth_idf)
        # log+1 instead of log makes sure terms with zero idf don't get
        # suppressed entirely.
        idf = np.log(float(n_samples_calc) / df) + 1.0

    self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
    return self

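# Numeric sanity check (illustrative): the Elasticsearch-style idf
# log(1 + (n - df + 0.5) / (df + 0.5)) stays strictly positive even for a
# term that occurs in every document.
import numpy as np

n_samples = 4
df = np.array([1.0, 4.0])   # a rare term and a ubiquitous term
idf_es = np.log(1.0 + (n_samples - df + 0.5) / (df + 0.5))
print(idf_es)               # approx [1.204, 0.105]
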
def _check_params(self, n_features):
    if not 0 < self.l1_ratio <= 1:
        raise ValueError("l1_ratio must be in interval ]0;1], but was %f"
                         % self.l1_ratio)

    if self.tol <= 0:
        raise ValueError("tolerance must be positive, but was %f"
                         % self.tol)

    if self.penalty_factor is None:
        penalty_factor = numpy.ones(n_features, dtype=numpy.float64)
    else:
        pf = column_or_1d(self.penalty_factor, warn=True)
        if pf.shape[0] != n_features:
            raise ValueError(
                "penalty_factor must be array of length n_features (%d), "
                "but got %d" % (n_features, pf.shape[0]))
        assert_all_finite(pf)
        check_non_negative(pf, "penalty_factor")
        penalty_factor = pf * n_features / pf.sum()
        assert_all_finite(penalty_factor)

    create_path = self.alphas is None
    if create_path:
        if self.n_alphas <= 0:
            raise ValueError("n_alphas must be a positive integer")
        alphas = numpy.empty(int(self.n_alphas), dtype=numpy.float64)
    else:
        alphas = column_or_1d(self.alphas, warn=True)
        assert_all_finite(alphas)
        check_non_negative(alphas, "alphas")
        assert_all_finite(alphas)

    if self.max_iter <= 0:
        raise ValueError("max_iter must be a positive integer")

    return create_path, alphas.astype(numpy.float64), \
        penalty_factor.astype(numpy.float64)

def test_check_non_negative(retype):
    A = np.array([[1, 1, 0, 0],
                  [1, 1, 0, 0],
                  [0, 0, 0, 0],
                  [0, 0, 0, 0]])
    X = retype(A)
    check_non_negative(X, "")

    X = retype([[0, 0], [0, 0]])
    check_non_negative(X, "")

    A[0, 0] = -1
    X = retype(A)
    with pytest.raises(ValueError, match="Negative "):
        check_non_negative(X, "")

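# In scikit-learn's test suite this test is parametrized over container
# types. A minimal sketch of such a parametrization (the exact set of
# containers used upstream may differ):
import numpy as np
import pytest
import scipy.sparse as sp
from sklearn.utils.validation import check_non_negative

@pytest.mark.parametrize(
    "retype",
    [np.asarray, sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.lil_matrix],
)
def test_check_non_negative_containers(retype):
    check_non_negative(retype([[1, 0], [0, 2]]), "")
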
def _initialize_nmf(X, n_components, init=None, eps=1e-6, random_state=None):
    """Algorithms for NMF initialization.

    Computes an initial guess for the non-negative
    rank k matrix approximation for X: X = WH

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The data matrix to be decomposed.

    n_components : integer
        The number of components desired in the approximation.

    init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar'
        Method used to initialize the procedure.
        Default: 'nndsvd' if n_components < n_features, otherwise 'random'.
        Valid options:

        - 'random': non-negative random matrices, scaled with:
            sqrt(X.mean() / n_components)

        - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
            initialization (better for sparseness)

        - 'nndsvda': NNDSVD with zeros filled with the average of X
            (better when sparsity is not desired)

        - 'nndsvdar': NNDSVD with zeros filled with small random values
            (generally faster, less accurate alternative to NNDSVDa
            for when sparsity is not desired)

        - 'custom': use custom matrices W and H

    eps : float
        Truncate all values less than this in output to zero.

    random_state : int seed, RandomState instance, or None (default)
        Random number generator seed control, used in 'nndsvdar' and
        'random' modes.

    Returns
    -------
    W : array-like, shape (n_samples, n_components)
        Initial guesses for solving X ~= WH

    H : array-like, shape (n_components, n_features)
        Initial guesses for solving X ~= WH

    References
    ----------
    C. Boutsidis, E. Gallopoulos: SVD based initialization: A head start for
    nonnegative matrix factorization - Pattern Recognition, 2008
    http://tinyurl.com/nndsvd
    """
    check_non_negative(X, "NMF initialization")
    n_samples, n_features = X.shape

    if init is None:
        if n_components < n_features:
            init = 'nndsvd'
        else:
            init = 'random'

    # Random initialization
    if init == 'random':
        avg = np.sqrt(X.mean() / n_components)
        rng = check_random_state(random_state)
        H = avg * rng.randn(n_components, n_features)
        W = avg * rng.randn(n_samples, n_components)
        # we do not write np.abs(H, out=H) to stay compatible with
        # numpy 1.5 and earlier where the 'out' keyword is not
        # supported as a kwarg on ufuncs
        np.abs(H, H)
        np.abs(W, W)
        return W, H

    # NNDSVD initialization
    U, S, V = randomized_svd(X, n_components, random_state=random_state)
    W, H = np.zeros(U.shape), np.zeros(V.shape)

    # The leading singular triplet is non-negative
    # so it can be used as is for initialization.
    W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])
    H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])

    for j in range(1, n_components):
        x, y = U[:, j], V[j, :]

        # extract positive and negative parts of column vectors
        x_p, y_p = np.maximum(x, 0), np.maximum(y, 0)
        x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0))

        # and their norms
        x_p_nrm, y_p_nrm = norm(x_p), norm(y_p)
        x_n_nrm, y_n_nrm = norm(x_n), norm(y_n)

        m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm

        # choose update
        if m_p > m_n:
            u = x_p / x_p_nrm
            v = y_p / y_p_nrm
            sigma = m_p
        else:
            u = x_n / x_n_nrm
            v = y_n / y_n_nrm
            sigma = m_n

        lbd = np.sqrt(S[j] * sigma)
        W[:, j] = lbd * u
        H[j, :] = lbd * v

    W[W < eps] = 0
    H[H < eps] = 0

    if init == "nndsvd":
        pass
    elif init == "nndsvda":
        avg = X.mean()
        W[W == 0] = avg
        H[H == 0] = avg
    elif init == "nndsvdar":
        rng = check_random_state(random_state)
        avg = X.mean()
        W[W == 0] = abs(avg * rng.randn(len(W[W == 0])) / 100)
        H[H == 0] = abs(avg * rng.randn(len(H[H == 0])) / 100)
    else:
        raise ValueError(
            'Invalid init parameter: got %r instead of one of %r' %
            (init, (None, 'random', 'nndsvd', 'nndsvda', 'nndsvdar')))

    return W, H

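# Quick check (illustrative, assumes the module-level imports used by
# _initialize_nmf above): NNDSVD seeds W and H from the truncated SVD of X,
# so both factors are non-negative with the requested shapes.
import numpy as np

rng = np.random.RandomState(0)
X = np.abs(rng.randn(6, 5))
W, H = _initialize_nmf(X, n_components=2, init='nndsvd', random_state=0)
print(W.shape, H.shape)                 # (6, 2) (2, 5)
print(np.all(W >= 0), np.all(H >= 0))   # True True
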
def non_negative_factorization(X, W=None, H=None, n_components=None,
                               init='random', update_H=True, solver='cd',
                               tol=1e-4, max_iter=200, alpha=0.,
                               l1_ratio=0., regularization=None,
                               random_state=None, verbose=0, shuffle=False,
                               nls_max_iter=2000, sparseness=None, beta=1,
                               eta=0.1):
    r"""Compute Non-negative Matrix Factorization (NMF)

    Find two non-negative matrices (W, H) whose product approximates the non-
    negative matrix X. This factorization can be used for example for
    dimensionality reduction, source separation or topic extraction.

    The objective function is::

        0.5 * ||X - WH||_Fro^2
        + alpha * l1_ratio * ||vec(W)||_1
        + alpha * l1_ratio * ||vec(H)||_1
        + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2
        + 0.5 * alpha * (1 - l1_ratio) * ||H||_Fro^2

    Where::

        ||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm)
        ||vec(A)||_1 = \sum_{i,j} abs(A_{ij}) (Elementwise L1 norm)

    The objective function is minimized with an alternating minimization of W
    and H. If H is given and update_H=False, it solves for W only.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Constant matrix.

    W : array-like, shape (n_samples, n_components)
        If init='custom', it is used as initial guess for the solution.

    H : array-like, shape (n_components, n_features)
        If init='custom', it is used as initial guess for the solution.
        If update_H=False, it is used as a constant, to solve for W only.

    n_components : integer
        Number of components, if n_components is not set all features
        are kept.

    init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom'
        Method used to initialize the procedure.
        Default: 'nndsvd' if n_components < n_features, otherwise random.
        Valid options:

        - 'random': non-negative random matrices, scaled with:
            sqrt(X.mean() / n_components)

        - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
            initialization (better for sparseness)

        - 'nndsvda': NNDSVD with zeros filled with the average of X
            (better when sparsity is not desired)

        - 'nndsvdar': NNDSVD with zeros filled with small random values
            (generally faster, less accurate alternative to NNDSVDa
            for when sparsity is not desired)

        - 'custom': use custom matrices W and H

    update_H : boolean, default: True
        Set to True, both W and H will be estimated from initial guesses.
        Set to False, only W will be estimated.

    solver : 'pg' | 'cd'
        Numerical solver to use:
        'pg' is a (deprecated) Projected Gradient solver.
        'cd' is a Coordinate Descent solver.

    tol : float, default: 1e-4
        Tolerance of the stopping condition.

    max_iter : integer, default: 200
        Maximum number of iterations before timing out.

    alpha : double, default: 0.
        Constant that multiplies the regularization terms.

    l1_ratio : double, default: 0.
        The regularization mixing parameter, with 0 <= l1_ratio <= 1.
        For l1_ratio = 0 the penalty is an elementwise L2 penalty
        (aka Frobenius Norm).
        For l1_ratio = 1 it is an elementwise L1 penalty.
        For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.

    regularization : 'both' | 'components' | 'transformation' | None
        Select whether the regularization affects the components (H), the
        transformation (W), both or none of them.

    random_state : integer seed, RandomState instance, or None (default)
        Random number generator seed control.

    verbose : integer, default: 0
        The verbosity level.

    shuffle : boolean, default: False
        If true, randomize the order of coordinates in the CD solver.

    nls_max_iter : integer, default: 2000
        Number of iterations in NLS subproblem.
        Used only in the deprecated 'pg' solver.

    sparseness : 'data' | 'components' | None, default: None
        Where to enforce sparsity in the model.
        Used only in the deprecated 'pg' solver.

    beta : double, default: 1
        Degree of sparseness, if sparseness is not None. Larger values mean
        more sparseness. Used only in the deprecated 'pg' solver.

    eta : double, default: 0.1
        Degree of correctness to maintain, if sparsity is not None. Smaller
        values mean larger error. Used only in the deprecated 'pg' solver.

    Returns
    -------
    W : array-like, shape (n_samples, n_components)
        Solution to the non-negative least squares problem.

    H : array-like, shape (n_components, n_features)
        Solution to the non-negative least squares problem.

    n_iter : int
        Actual number of iterations.

    References
    ----------
    C.-J. Lin. Projected gradient methods for non-negative matrix
    factorization. Neural Computation, 19(2007), 2756-2779.
    http://www.csie.ntu.edu.tw/~cjlin/nmf/

    Cichocki, Andrzej, and P. H. A. N. Anh-Huy. "Fast local algorithms for
    large scale nonnegative matrix and tensor factorizations." IEICE
    transactions on fundamentals of electronics, communications and computer
    sciences 92.3: 708-721, 2009.
    """
    X = check_array(X, accept_sparse=('csr', 'csc'))
    check_non_negative(X, "NMF (input X)")
    _check_string_param(sparseness, solver)

    n_samples, n_features = X.shape
    if n_components is None:
        n_components = n_features

    if not isinstance(n_components, INTEGER_TYPES) or n_components <= 0:
        raise ValueError("Number of components must be a positive integer;"
                         " got (n_components=%r)" % n_components)
    if not isinstance(max_iter, INTEGER_TYPES) or max_iter < 0:
        raise ValueError("Maximum number of iterations must be a positive "
                         "integer; got (max_iter=%r)" % max_iter)
    if not isinstance(tol, numbers.Number) or tol < 0:
        raise ValueError("Tolerance for stopping criteria must be "
                         "positive; got (tol=%r)" % tol)

    # check W and H, or initialize them
    if init == 'custom' and update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        _check_init(W, (n_samples, n_components), "NMF (input W)")
    elif not update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        W = np.zeros((n_samples, n_components))
    else:
        W, H = _initialize_nmf(X, n_components, init=init,
                               random_state=random_state)

    if solver == 'pg':
        warnings.warn("'pg' solver will be removed in release 0.19."
                      " Use 'cd' solver instead.", DeprecationWarning)
        if update_H:  # fit_transform
            W, H, n_iter = _fit_projected_gradient(X, W, H, tol, max_iter,
                                                   nls_max_iter, alpha,
                                                   l1_ratio, sparseness,
                                                   beta, eta)
        else:  # transform
            W, H, n_iter = _update_projected_gradient_w(X, W, H, tol,
                                                        nls_max_iter, alpha,
                                                        l1_ratio, sparseness,
                                                        beta, eta)
    elif solver == 'cd':
        W, H, n_iter = _fit_coordinate_descent(X, W, H, tol, max_iter,
                                               alpha, l1_ratio,
                                               regularization,
                                               update_H=update_H,
                                               verbose=verbose,
                                               shuffle=shuffle,
                                               random_state=random_state)
    else:
        raise ValueError("Invalid solver parameter '%s'." % solver)

    if n_iter == max_iter:
        warnings.warn("Maximum number of iteration %d reached. Increase it to"
                      " improve convergence." % max_iter, ConvergenceWarning)

    return W, H, n_iter

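# Hedged usage sketch for the signature above (assumes its private solver
# helpers such as _fit_coordinate_descent are importable alongside it):
import numpy as np

X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
W, H, n_iter = non_negative_factorization(
    X, n_components=2, init='random', solver='cd', random_state=0)
print(np.linalg.norm(X - W @ H) < 1.0)  # reconstruction error is small
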
def topic_supervised_factorization(X, W=None, H=None, n_components=None,
                                   labels=None, init=None, update_H=True,
                                   tol=1e-4, max_iter=200,
                                   random_state=None, verbose=0):
    r"""Compute Topic Supervised Non-negative Matrix Factorization (TSNMF)

    Similar to NMF, this method finds two non-negative matrices (W, H) whose
    product approximates the non-negative matrix X, where W is constrained by
    a supervision matrix L. This factorization can be used for example for
    dimensionality reduction, source separation or topic extraction.

    The objective function is::

        ||X - Had(W,L)H||_Fro^2

    Where::

        ||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm)
        Had(W,L) is the Hadamard (entrywise) product

    The objective function is minimized with an alternating minimization of W
    and H. If H is given and update_H=False, it solves for W only.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Constant matrix.

    W : array-like, shape (n_samples, n_components)

    H : array-like, shape (n_components, n_features)
        If update_H=False, it is used as a constant, to solve for W only.

    n_components : integer
        Number of components, if n_components is not set all features
        are kept.

    labels : list of list
        The order of the sublists should match the order of the docs from X,
        and each sublist is a list of topic id tags.

    init : None | 'random' | 'nndsvd'
        Method used to initialize the procedure.
        Default: 'random'.
        Valid options:

        - None: 'nndsvd' if n_components < n_features, otherwise 'random'.

        - 'random': non-negative random matrices, scaled with:
            sqrt(X.mean() / n_components)

        - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
            initialization (better for sparseness)

    update_H : boolean, default: True
        Set to True, both W and H will be estimated from initial guesses.
        Set to False, only W will be estimated.

    tol : float, default: 1e-4
        Tolerance of the stopping condition.

    max_iter : integer, default: 200
        Maximum number of iterations before timing out.

    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : integer, default: 0
        The verbosity level.

    Returns
    -------
    W : array-like, shape (n_samples, n_components)
        Solution to the non-negative least squares problem.

    H : array-like, shape (n_components, n_features)
        Solution to the non-negative least squares problem.

    n_iter : int
        Actual number of iterations.
    """
    X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)  # from utils
    check_non_negative(X, "TSNMF (input X)")  # from utils.validation

    n_samples, n_features = X.shape
    if n_components is None:
        n_components = n_features

    # Validation from NMF sklearn source code
    if not isinstance(n_components, INTEGER_TYPES) or n_components <= 0:
        raise ValueError("Number of components must be a positive integer;"
                         " got (n_components=%r)" % n_components)
    if not isinstance(max_iter, INTEGER_TYPES) or max_iter < 0:
        raise ValueError("Maximum number of iterations must be a positive "
                         "integer; got (max_iter=%r)" % max_iter)
    if not isinstance(tol, numbers.Number) or tol < 0:
        raise ValueError("Tolerance for stopping criteria must be "
                         "positive; got (tol=%r)" % tol)

    # Supervision matrix
    L = create_constraint_matrix(labels, n_components)

    if not update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        # 'mu' solver should not be initialized by zeros
        avg = np.sqrt(X.mean() / n_components)
        W = np.full((n_samples, n_components), avg)
    else:
        W_init, H_init = _initialize_tsnmf(X, L, n_components, init=init,
                                           random_state=random_state)
        # override W or H if supplied
        if W is not None:
            _check_init(W, (n_samples, n_components), "NMF (input W)")
        else:
            W = W_init
        if H is not None:
            _check_init(H, (n_components, n_features), "NMF (input H)")
        else:
            H = H_init
        if init == 'nndsvd':
            W[(L == 1) & (W <= EPSILON)] = W.mean()

    W, H, n_iter = _fit_multiplicative_update(X, W, H, L, max_iter, tol,
                                              update_H, verbose)

    return W, H, n_iter

def fit(self, X, verbose=False):
    r"""Compute the basis matrix on the provided data matrix using the
    UoI\ :sub:`NMF` algorithm.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Data matrix to be decomposed.

    verbose : bool
        If True, outputs status updates.
    """
    if verbose:
        self._logger.setLevel(logging.DEBUG)
    else:
        self._logger.setLevel(logging.WARNING)

    check_non_negative(X, 'UoI NMF')
    n_samples, n_features = X.shape

    H_samples = {k: np.zeros((self.n_boots, k, n_features))
                 for k in self.ranks}
    rep_idx = self._rand.randint(n_samples, size=(self.n_boots, n_samples))

    for boot_idx in range(self.n_boots):
        self._logger.info("Bootstrap %d" % boot_idx)
        # compute NMF bases for k across bootstrap replicates
        sample = X[rep_idx[boot_idx]]
        for k_idx, k in enumerate(self.ranks):
            # concatenate k by p
            H_samples[k][boot_idx] = \
                self.nmf.set_params(n_components=k).fit(sample).components_

    if self.use_dissimilarity:
        gamma = np.zeros(len(self.ranks))
        # iterate over each rank
        for k_idx, k in enumerate(self.ranks):
            # extract the bases for each rank
            H_k = H_samples[k]
            for boot1, boot2 in combinations(range(self.n_boots), 2):
                gamma[k_idx] += dissimilarity(H_k[boot1], H_k[boot2])
        self.dissimilarity_ = (gamma * 2) / \
            (self.n_boots * (self.n_boots - 1))
        k_min = self.ranks[np.argmin(gamma)]
        H_pre_cluster = H_samples[k_min].reshape(
            (self.n_boots * k_min, n_features))
    else:
        H_pre_cluster = np.zeros(
            (self.n_boots * np.sum(self.ranks), n_features))
        start_idx = 0
        for k in self.ranks:
            H_k = H_samples[k].reshape((self.n_boots * k, n_features))
            end_idx = start_idx + self.n_boots * k
            H_pre_cluster[start_idx:end_idx] = H_k
            start_idx = end_idx

    # remove zero bases and normalize across features
    H_pre_cluster = H_pre_cluster[np.sum(H_pre_cluster, axis=1) != 0.0]
    H_pre_cluster = normalize(H_pre_cluster, norm='l2', axis=1)

    # cluster all bases
    self._logger.info("clustering bases samples")
    labels = self.cluster.fit_predict(H_pre_cluster)

    # compute consensus bases from clusters
    cluster_ids = np.unique(labels[labels != -1])
    n_clusters = cluster_ids.size
    self._logger.info("found %d bases, computing consensus bases"
                      % n_clusters)
    H_cons = np.zeros((n_clusters, n_features))
    for c_id in cluster_ids:
        H_cons[c_id, :] = self.cons_meth(H_pre_cluster[labels == c_id],
                                         axis=0)

    # re-normalize across features
    H_cons = normalize(H_cons, norm='l2', axis=1)

    self.components_ = H_cons
    self.n_components = self.components_.shape[0]
    self.bases_samples_ = H_pre_cluster
    self.bases_samples_labels_ = labels
    self.bootstraps_ = rep_idx
    self.reconstruction_err_ = None
    return self

def _initialize_tsnmf(X, L, n_components, init=None, eps=1e-6,
                      random_state=None):
    """Initialization of matrices W and H (X = WH).

    Returns
    -------
    W : array-like, shape (n_samples, n_components)
        Initial guesses for solving X ~= WH

    H : array-like, shape (n_components, n_features)
        Initial guesses for solving X ~= WH
    """
    check_non_negative(X, "TSNMF initialization")
    n_samples, n_features = X.shape

    if (init is not None and init != 'random'
            and n_components > min(n_samples, n_features)):
        raise ValueError(
            "init = '{}' can only be used when "
            "n_components <= min(n_samples, n_features)".format(init))

    if init is None:
        if n_components <= min(n_samples, n_features):
            init = 'nndsvd'
        else:
            init = 'random'

    if init == 'random':
        avg = np.sqrt(X.mean() / n_components)
        rng = check_random_state(random_state)
        H = avg * rng.randn(n_components, n_features)
        W = avg * rng.randn(n_samples, n_components)
        # we do not write np.abs(H, out=H) to stay compatible with
        # numpy 1.5 and earlier where the 'out' keyword is not
        # supported as a kwarg on ufuncs
        np.abs(H, H)
        np.abs(W, W)
        return W, H

    # NNDSVD initialization
    U, S, V = randomized_svd(X, n_components, random_state=random_state)
    W, H = np.zeros(U.shape), np.zeros(V.shape)

    # The leading singular triplet is non-negative
    # so it can be used as is for initialization.
    W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])
    H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])

    for j in range(1, n_components):
        x, y = U[:, j], V[j, :]

        # extract positive and negative parts of column vectors
        x_p, y_p = np.maximum(x, 0), np.maximum(y, 0)
        x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0))

        # and their norms
        x_p_nrm, y_p_nrm = norm(x_p), norm(y_p)
        x_n_nrm, y_n_nrm = norm(x_n), norm(y_n)

        m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm

        # choose update
        if m_p > m_n:
            u = x_p / x_p_nrm
            v = y_p / y_p_nrm
            sigma = m_p
        else:
            u = x_n / x_n_nrm
            v = y_n / y_n_nrm
            sigma = m_n

        lbd = np.sqrt(S[j] * sigma)
        W[:, j] = lbd * u
        H[j, :] = lbd * v

    W[W < eps] = 0
    H[H < eps] = 0

    if init == "nndsvd":
        pass
    else:
        raise ValueError(
            'Invalid init parameter: got %r instead of one of %r' %
            (init, (None, 'random', 'nndsvd')))

    return W, H

def _initialize_mf(M, n_components, init=None, eps=1e-6, random_state=None,
                   non_negative=False):
    """Algorithms for MF initialization.

    Computes an initial guess for the non-negative
    rank k matrix approximation for M: M = AB^T

    Parameters
    ----------
    M : array-like, shape (n_samples, n_features)
        The data matrix to be decomposed.

    n_components : integer
        The number of components desired in the approximation.

    init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'svd'
        Method used to initialize the procedure.
        Default: 'svd' if n_components < n_features, otherwise 'random'.
        Valid options:

        - 'random': non-negative random matrices, scaled with:
            sqrt(X.mean() / n_components)

        - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
            initialization (better for sparseness)

        - 'nndsvda': NNDSVD with zeros filled with the average of X
            (better when sparsity is not desired)

        - 'nndsvdar': NNDSVD with zeros filled with small random values
            (generally faster, less accurate alternative to NNDSVDa
            for when sparsity is not desired)

    non_negative : bool
        Whether to decompose into non-negative matrices.

    eps : float
        If non-negative, truncate all values less than this in output
        to zero.

    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used when ``random`` == 'nndsvdar' or 'random'.

    Returns
    -------
    A : array-like, shape (n_samples, n_components)
        Initial guesses for solving M ~= AB^T

    B : array-like, shape (n_features, n_components)
        Initial guesses for solving M ~= AB^T

    References
    ----------
    C. Boutsidis, E. Gallopoulos: SVD based initialization: A head start for
    nonnegative matrix factorization - Pattern Recognition, 2008
    http://tinyurl.com/nndsvd
    """
    if non_negative:
        check_non_negative(M, "MF initialization")

    n_samples, n_features = M.shape

    if init is None:
        if n_components < n_features:
            init = 'nndsvdar' if non_negative else 'svd'
        else:
            init = 'random'

    if init == 'random':
        avg = np.sqrt(np.abs(M.mean()) / n_components)
        rng = check_random_state(random_state)
        A = avg * rng.randn(n_samples, n_components)
        B = avg * rng.randn(n_components, n_features)
        if non_negative:
            np.abs(A, A)
            np.abs(B, B)

    elif init == 'svd':
        if non_negative:
            raise ValueError(
                'SVD initialization incompatible with NMF '
                '(use nndsvd instead)')
        if min(n_samples, n_features) < n_components:
            warnings.warn(
                'The number of components is smaller than the rank in svd '
                'initialization. The input will be padded with zeros to '
                'compensate for the lack of singular values.')
        # simple SVD based approximation
        U, S, V = randomized_svd(M, n_components, random_state=random_state)

        # randomized_svd only returns min(n_components, n_features,
        # n_samples) singular values and vectors; therefore, to retain the
        # desired shape, we need to pad and reshape the inputs
        if n_components > n_features:
            U_padded = np.zeros((U.shape[0], n_components))
            U_padded[:, :U.shape[1]] = U
            U = U_padded
            V_padded = np.zeros((n_components, V.shape[1]))
            V_padded[:V.shape[0], :] = V
            V = V_padded
            S_padded = np.zeros(n_components)
            S_padded[:S.shape[0]] = S
            S = S_padded

        S = np.diag(np.sqrt(S))
        A = np.dot(U, S)
        B = np.dot(S, V)

    elif init in ['nndsvd', 'nndsvda', 'nndsvdar']:
        if not non_negative:
            warnings.warn(
                '%s results in non-negative constrained factors, ' % init +
                'so SVD initialization should provide a better initial '
                'estimate')
        # NNDSVD initialization
        U, S, V = randomized_svd(M, n_components, random_state=random_state)
        A, B = np.zeros(U.shape), np.zeros(V.shape)

        # The leading singular triplet is non-negative
        # so it can be used as is for initialization.
        A[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])
        B[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])

        for j in range(1, n_components):
            x, y = U[:, j], V[j, :]

            # extract positive and negative parts of column vectors
            x_p, y_p = np.maximum(x, 0), np.maximum(y, 0)
            x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0))

            # and their norms
            x_p_nrm, y_p_nrm = norm(x_p), norm(y_p)
            x_n_nrm, y_n_nrm = norm(x_n), norm(y_n)

            m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm

            # choose update
            if m_p > m_n:
                u = x_p / x_p_nrm
                v = y_p / y_p_nrm
                sigma = m_p
            else:
                u = x_n / x_n_nrm
                v = y_n / y_n_nrm
                sigma = m_n

            lbd = np.sqrt(S[j] * sigma)
            A[:, j] = lbd * u
            B[j, :] = lbd * v

        A[A < eps] = 0
        B[B < eps] = 0

        if init == "nndsvd":
            pass
        elif init == "nndsvda":
            avg = M.mean()
            A[A == 0] = avg
            B[B == 0] = avg
        elif init == "nndsvdar":
            rng = check_random_state(random_state)
            avg = M.mean()
            A[A == 0] = abs(avg * rng.randn(len(A[A == 0])) / 100)
            B[B == 0] = abs(avg * rng.randn(len(B[B == 0])) / 100)
    else:
        raise ValueError("Invalid init argument")

    return A, B.T

def _fit(self, X, skip_num_points=0):
    """Private function to fit the model using X as training data."""

    if self.method not in ['barnes_hut', 'exact']:
        raise ValueError("'method' must be 'barnes_hut' or 'exact'")
    if self.angle < 0.0 or self.angle > 1.0:
        raise ValueError("'angle' must be between 0.0 - 1.0")
    if hasattr(self, 'square_distances'):
        if self.square_distances not in [True, 'legacy']:
            raise ValueError("'square_distances' must be True or 'legacy'.")
        if self.metric != "euclidean" and self.square_distances is not True:
            warnings.warn(
                "'square_distances' has been introduced in 0.24 "
                "to help phase out legacy squaring behavior. The "
                "'legacy' setting will be removed in 0.26, and the "
                "default setting will be changed to True. In 0.28, "
                "'square_distances' will be removed altogether, "
                "and distances will be squared by default. Set "
                "'square_distances'=True to silence this warning.",
                FutureWarning)
    if self.method == 'barnes_hut':
        if sklearn_check_version('0.23'):
            X = self._validate_data(X, accept_sparse=['csr'],
                                    ensure_min_samples=2,
                                    dtype=[np.float32, np.float64])
        else:
            X = check_array(X, accept_sparse=['csr'],
                            ensure_min_samples=2,
                            dtype=[np.float32, np.float64])
    else:
        if sklearn_check_version('0.23'):
            X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
                                    dtype=[np.float32, np.float64])
        else:
            X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                            dtype=[np.float32, np.float64])
    if self.metric == "precomputed":
        if isinstance(self.init, str) and self.init == 'pca':
            raise ValueError("The parameter init=\"pca\" cannot be "
                             "used with metric=\"precomputed\".")
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square distance matrix")
        check_non_negative(X, "TSNE.fit(). With metric='precomputed', X "
                              "should contain positive distances.")

        if self.method == "exact" and issparse(X):
            raise TypeError(
                'TSNE with method="exact" does not accept sparse '
                'precomputed distance matrix. Use method="barnes_hut" '
                'or provide the dense distance matrix.')

    if self.method == 'barnes_hut' and self.n_components > 3:
        raise ValueError("'n_components' should be inferior to 4 for the "
                         "barnes_hut algorithm as it relies on "
                         "quad-tree or oct-tree.")
    random_state = check_random_state(self.random_state)

    if self.early_exaggeration < 1.0:
        raise ValueError("early_exaggeration must be at least 1, but is {}"
                         .format(self.early_exaggeration))

    if self.n_iter < 250:
        raise ValueError("n_iter should be at least 250")

    n_samples = X.shape[0]

    neighbors_nn = None
    if self.method == "exact":
        # Retrieve the distance matrix, either using the precomputed one or
        # computing it.
        if self.metric == "precomputed":
            distances = X
        else:
            if self.verbose:
                print("[t-SNE] Computing pairwise distances...")

            if self.metric == "euclidean":
                # Euclidean is squared here, rather than using **= 2,
                # because euclidean_distances already calculates
                # squared distances, and returns np.sqrt(dist) for
                # squared=False.
                # Also, Euclidean is slower for n_jobs>1, so don't set here
                distances = pairwise_distances(X, metric=self.metric,
                                               squared=True)
            else:
                distances = pairwise_distances(X, metric=self.metric,
                                               n_jobs=self.n_jobs)

            if np.any(distances < 0):
                raise ValueError("All distances should be positive, the "
                                 "metric given is not correct")

            if self.metric != "euclidean" and \
                    getattr(self, 'square_distances', True) is True:
                distances **= 2

        # compute the joint probability distribution for the input space
        P = _joint_probabilities(distances, self.perplexity, self.verbose)
        assert np.all(np.isfinite(P)), "All probabilities should be finite"
        assert np.all(P >= 0), "All probabilities should be non-negative"
        assert np.all(P <= 1), ("All probabilities should be less "
                                "than or equal to one")
    else:
        # Compute the number of nearest neighbors to find.
        # LvdM uses 3 * perplexity as the number of neighbors.
        # In the event that we have very small # of points
        # set the neighbors to n - 1.
        n_neighbors = min(n_samples - 1, int(3. * self.perplexity + 1))

        if self.verbose:
            print("[t-SNE] Computing {} nearest neighbors..."
                  .format(n_neighbors))

        # Find the nearest neighbors for every point
        knn = NearestNeighbors(algorithm='auto',
                               n_jobs=self.n_jobs,
                               n_neighbors=n_neighbors,
                               metric=self.metric)
        t0 = time()
        knn.fit(X)
        duration = time() - t0
        if self.verbose:
            print("[t-SNE] Indexed {} samples in {:.3f}s...".format(
                n_samples, duration))

        t0 = time()
        distances_nn = knn.kneighbors_graph(mode='distance')
        duration = time() - t0
        if self.verbose:
            print("[t-SNE] Computed neighbors for {} samples "
                  "in {:.3f}s...".format(n_samples, duration))

        # Free the memory used by the ball_tree
        del knn

        if getattr(self, 'square_distances', True) is True or \
                self.metric == "euclidean":
            # knn returns the euclidean distance but we need it squared
            # to be consistent with the 'exact' method. Note that the
            # method was derived using the euclidean method as in the
            # input space. Not sure of the implication of using a different
            # metric.
            distances_nn.data **= 2

        # compute the joint probability distribution for the input space
        P = _joint_probabilities_nn(distances_nn, self.perplexity,
                                    self.verbose)

    if isinstance(self.init, np.ndarray):
        X_embedded = self.init
    elif self.init == 'pca':
        pca = PCA(n_components=self.n_components, svd_solver='randomized',
                  random_state=random_state)
        X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
    elif self.init == 'random':
        # The embedding is initialized with iid samples from Gaussians with
        # standard deviation 1e-4.
        X_embedded = 1e-4 * random_state.randn(
            n_samples, self.n_components).astype(np.float32)
    else:
        raise ValueError("'init' must be 'pca', 'random', or "
                         "a numpy array")

    # Degrees of freedom of the Student's t-distribution. The suggestion
    # degrees_of_freedom = n_components - 1 comes from
    # "Learning a Parametric Embedding by Preserving Local Structure"
    # Laurens van der Maaten, 2009.
    degrees_of_freedom = max(self.n_components - 1, 1)

    return self._tsne(P, degrees_of_freedom, n_samples,
                      X_embedded=X_embedded,
                      neighbors=neighbors_nn,
                      skip_num_points=skip_num_points)

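# Sanity sketch (illustrative, stock scikit-learn API): the 'exact' branch
# above squares Euclidean distances up front, and the Barnes-Hut branch
# squares the kNN-graph data to match, so both branches feed squared
# distances into the joint-probability computation.
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).randn(20, 3)
d_exact = pairwise_distances(X, metric="euclidean", squared=True)
g = NearestNeighbors(n_neighbors=5).fit(X).kneighbors_graph(mode="distance")
g.data **= 2   # now consistent with the squared exact distances
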
def non_negative_factorization(X, W=None, H=None, n_components=None,
                               init='random', update_H=True, solver='cd',
                               beta_loss='frobenius', tol=1e-4,
                               max_iter=200, alpha=0., l1_ratio=0.,
                               regularization=None, random_state=None,
                               verbose=0, shuffle=False,
                               distribution='gaussian', N=None, D=None):
    r"""Compute Non-negative Matrix Factorization (NMF)

    Find two non-negative matrices (W, H) whose product approximates the non-
    negative matrix X. This factorization can be used for example for
    dimensionality reduction, source separation or topic extraction.

    The objective function is::

        0.5 * ||X - WH||_Fro^2
        + alpha * l1_ratio * ||vec(W)||_1
        + alpha * l1_ratio * ||vec(H)||_1
        + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2
        + 0.5 * alpha * (1 - l1_ratio) * ||H||_Fro^2

    Where::

        ||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm)
        ||vec(A)||_1 = \sum_{i,j} abs(A_{ij}) (Elementwise L1 norm)

    For multiplicative-update ('mu') solver, the Frobenius norm
    (0.5 * ||X - WH||_Fro^2) can be changed into another beta-divergence
    loss, by changing the beta_loss parameter.

    The objective function is minimized with an alternating minimization of W
    and H. If H is given and update_H=False, it solves for W only.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Constant matrix.

    W : array-like, shape (n_samples, n_components)
        If init='custom', it is used as initial guess for the solution.

    H : array-like, shape (n_components, n_features)
        If init='custom', it is used as initial guess for the solution.
        If update_H=False, it is used as a constant, to solve for W only.

    n_components : integer
        Number of components, if n_components is not set all features
        are kept.

    init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom'
        Method used to initialize the procedure.
        Default: 'random'.
        Valid options:

        - 'random': non-negative random matrices, scaled with:
            sqrt(X.mean() / n_components)

        - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
            initialization (better for sparseness)

        - 'nndsvda': NNDSVD with zeros filled with the average of X
            (better when sparsity is not desired)

        - 'nndsvdar': NNDSVD with zeros filled with small random values
            (generally faster, less accurate alternative to NNDSVDa
            for when sparsity is not desired)

        - 'custom': use custom matrices W and H

    update_H : boolean, default: True
        Set to True, both W and H will be estimated from initial guesses.
        Set to False, only W will be estimated.

    solver : 'cd' | 'mu'
        Numerical solver to use:
        'cd' is a Coordinate Descent solver that uses Fast Hierarchical
            Alternating Least Squares (Fast HALS).
        'mu' is a Multiplicative Update solver.

        .. versionadded:: 0.17
           Coordinate Descent solver.

        .. versionadded:: 0.19
           Multiplicative Update solver.

    beta_loss : float or string, default 'frobenius'
        String must be in {'frobenius', 'kullback-leibler',
        'itakura-saito'}. Beta divergence to be minimized, measuring the
        distance between X and the dot product WH. Note that values
        different from 'frobenius' (or 2) and 'kullback-leibler' (or 1)
        lead to significantly slower fits. Note that for beta_loss <= 0
        (or 'itakura-saito'), the input matrix X cannot contain zeros.
        Used only in 'mu' solver.

        .. versionadded:: 0.19

    tol : float, default: 1e-4
        Tolerance of the stopping condition.

    max_iter : integer, default: 200
        Maximum number of iterations before timing out.

    alpha : double, default: 0.
        Constant that multiplies the regularization terms.

    l1_ratio : double, default: 0.
        The regularization mixing parameter, with 0 <= l1_ratio <= 1.
        For l1_ratio = 0 the penalty is an elementwise L2 penalty
        (aka Frobenius Norm).
        For l1_ratio = 1 it is an elementwise L1 penalty.
        For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.

    regularization : 'both' | 'components' | 'transformation' | None
        Select whether the regularization affects the components (H), the
        transformation (W), both or none of them.

    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance
        used by `np.random`.

    verbose : integer, default: 0
        The verbosity level.

    shuffle : boolean, default: False
        If true, randomize the order of coordinates in the CD solver.

    Returns
    -------
    W : array-like, shape (n_samples, n_components)
        Solution to the non-negative least squares problem.

    H : array-like, shape (n_components, n_features)
        Solution to the non-negative least squares problem.

    n_iter : int
        Actual number of iterations.

    Examples
    --------
    >>> import numpy as np
    >>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
    >>> from sklearn.decomposition import non_negative_factorization
    >>> W, H, n_iter = non_negative_factorization(X, n_components=2,
    ...     init='random', random_state=0)

    References
    ----------
    Cichocki, Andrzej, and P. H. A. N. Anh-Huy. "Fast local algorithms for
    large scale nonnegative matrix and tensor factorizations." IEICE
    transactions on fundamentals of electronics, communications and computer
    sciences 92.3: 708-721, 2009.

    Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix
    factorization with the beta-divergence. Neural Computation, 23(9).
    """
    X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)
    check_non_negative(X, "NMF (input X)")
    beta_loss = _check_string_param(solver, regularization, beta_loss, init)

    if safe_min(X) == 0 and beta_loss <= 0:
        raise ValueError("When beta_loss <= 0 and X contains zeros, "
                         "the solver may diverge. Please add small values to "
                         "X, or use a positive beta_loss.")

    n_samples, n_features = X.shape
    if n_components is None:
        n_components = n_features

    if not isinstance(n_components, INTEGER_TYPES) or n_components <= 0:
        raise ValueError("Number of components must be a positive integer;"
                         " got (n_components=%r)" % n_components)
    if not isinstance(max_iter, INTEGER_TYPES) or max_iter < 0:
        raise ValueError("Maximum number of iterations must be a positive "
                         "integer; got (max_iter=%r)" % max_iter)
    if not isinstance(tol, numbers.Number) or tol < 0:
        raise ValueError("Tolerance for stopping criteria must be "
                         "positive; got (tol=%r)" % tol)

    # check W and H, or initialize them
    if init == 'custom' and update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        _check_init(W, (n_samples, n_components), "NMF (input W)")
    elif not update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        # 'mu' solver should not be initialized by zeros
        if solver == 'mu':
            avg = np.sqrt(X.mean() / n_components)
            W = np.full((n_samples, n_components), avg)
        else:
            W = np.zeros((n_samples, n_components))
    else:
        W, H = _initialize_nmf(X, n_components, init=init,
                               random_state=random_state)

    l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization(
        alpha, l1_ratio, regularization)

    # custom update routine replaces the stock 'cd'/'mu' solvers
    W, H, n_iter = update(X, W, H, max_iter, distribution, N, D)

    if n_iter == max_iter and tol > 0:
        warnings.warn("Maximum number of iteration %d reached. Increase it to"
                      " improve convergence." % max_iter, ConvergenceWarning)

    return W, H, n_iter

def non_negative_factorization(X, W, H, n_components=None, update_H=True,
                               beta_loss='frobenius', tol=1e-4,
                               max_iter=200, alpha=0., l1_ratio=0.,
                               regularization=None, random_state=None,
                               verbose=0, shuffle=False):
    X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)
    check_non_negative(X, "NMF (input X)")
    beta_loss = _check_string_param(regularization, beta_loss)

    if X.min() == 0 and beta_loss <= 0:
        raise ValueError("When beta_loss <= 0 and X contains zeros, "
                         "the solver may diverge. Please add small values to "
                         "X, or use a positive beta_loss.")

    n_samples, n_features = X.shape
    if n_components is None:
        n_components = n_features

    if not isinstance(n_components, numbers.Integral) or n_components <= 0:
        raise ValueError("Number of components must be a positive integer;"
                         " got (n_components=%r)" % n_components)
    if not isinstance(max_iter, numbers.Integral) or max_iter < 0:
        raise ValueError("Maximum number of iterations must be a positive "
                         "integer; got (max_iter=%r)" % max_iter)
    if not isinstance(tol, numbers.Number) or tol < 0:
        raise ValueError("Tolerance for stopping criteria must be "
                         "positive; got (tol=%r)" % tol)

    # check W and H, or initialize them in X's dtype
    if H is not None and W is not None:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        _check_init(W, (n_samples, n_components), "NMF (input W)")
    elif H is not None and W is None:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        if H.dtype != X.dtype:
            raise TypeError("H should have the same dtype as X. Got "
                            "H.dtype = {}.".format(H.dtype))
        W = np.zeros((n_samples, n_components), dtype=X.dtype)
    elif H is None and W is None:
        avg = np.sqrt(X.mean() / n_components)
        rng = check_random_state(random_state)
        H = avg * rng.randn(n_components, n_features).astype(X.dtype,
                                                             copy=False)
        W = avg * rng.randn(n_samples, n_components).astype(X.dtype,
                                                            copy=False)
        np.abs(H, out=H)
        np.abs(W, out=W)

    W, H, n_iter = _fit_coordinate_descent(X, W, H, tol, max_iter,
                                           update_H=update_H,
                                           verbose=verbose, shuffle=shuffle,
                                           random_state=random_state)

    if n_iter == max_iter and tol > 0:
        warnings.warn("Maximum number of iteration %d reached. Increase it to"
                      " improve convergence." % max_iter, ConvergenceWarning)

    return W, H, n_iter

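# Hedged usage sketch for the variant above, which keeps W and H in X's
# dtype and initializes them randomly when neither is supplied (assumes the
# module's private helpers such as _fit_coordinate_descent are available):
import numpy as np

X = np.abs(np.random.RandomState(0).randn(6, 4)).astype(np.float32)
W, H, n_iter = non_negative_factorization(
    X, W=None, H=None, n_components=2, random_state=0)
print(W.dtype, H.dtype)  # expected: float32 float32, matching X
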
def non_negative_factorization(lam, N, D, X, W=None, H=None,
                               n_components=None, init='warn',
                               update_H=True, solver='mu', beta_loss='KL',
                               tol=1e-4, max_iter=400, alpha=0.,
                               l1_ratio=0., regularization=None,
                               random_state=None, verbose=0, shuffle=False):
    X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)
    check_non_negative(X, "NMF (input X)")
    beta_loss = _check_string_param(solver, regularization, beta_loss, init)

    if safe_min(X) == 0 and beta_loss <= 0:
        raise ValueError("When beta_loss <= 0 and X contains zeros, "
                         "the solver may diverge. Please add small values to "
                         "X, or use a positive beta_loss.")

    n_samples, n_features = X.shape
    if n_components is None:
        n_components = n_features

    if not isinstance(n_components, INTEGER_TYPES) or n_components <= 0:
        raise ValueError("Number of components must be a positive integer;"
                         " got (n_components=%r)" % n_components)
    if not isinstance(max_iter, INTEGER_TYPES) or max_iter < 0:
        raise ValueError("Maximum number of iterations must be a positive "
                         "integer; got (max_iter=%r)" % max_iter)
    if not isinstance(tol, numbers.Number) or tol < 0:
        raise ValueError("Tolerance for stopping criteria must be "
                         "positive; got (tol=%r)" % tol)

    if init == "warn":
        if n_components < n_features:
            warnings.warn("The default value of init will change from "
                          "random to None in 0.23 to make it consistent "
                          "with decomposition.NMF.", FutureWarning)
        init = "random"

    # check W and H, or initialize them
    if init == 'custom' and update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        _check_init(W, (n_samples, n_components), "NMF (input W)")
    elif not update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        avg = np.sqrt(X.mean() / n_components)
        W = np.full((n_samples, n_components), avg)
    else:
        W, H = _initialize_nmf(X, n_components, init=init,
                               random_state=random_state)

    l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization(
        alpha, l1_ratio, regularization)

    W, H, n_iter = _fit_multiplicative_update(lam, N, D, X, W, H, beta_loss,
                                              max_iter, tol, l1_reg_W,
                                              l1_reg_H, l2_reg_W, l2_reg_H,
                                              update_H, verbose)

    if n_iter == max_iter and tol > 0:
        warnings.warn("Maximum number of iteration %d reached. Increase it to"
                      " improve convergence." % max_iter, ConvergenceWarning)

    return W, H, n_iter

def daal_pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None,
                            force_all_finite=True, **kwds):
    """
    Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix inputs.
      ['nan_euclidean'] but it does not yet support sparse matrices.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
      'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
      'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which
    are valid scipy.spatial.distance metrics), the scikit-learn
    implementation will be used, which is faster and has support for sparse
    matrices (except for 'cityblock'). For a verbose description of the
    metrics from scikit-learn, see the __doc__ of the
    sklearn.pairwise.distance_metrics function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
            [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on
        each pair of instances (rows) and the resulting value recorded. The
        callable should take two arrays from X as input and return a value
        indicating the distance between them.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by
        breaking down the pairwise matrix into n_jobs even slices and
        computing them in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accept both np.inf and np.nan in array.
        - 'allow-nan': accept only np.nan values in array. Values cannot
          be infinite.

        .. versionadded:: 0.22

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith
        array from X and the jth array from Y.

    See also
    --------
    pairwise_distances_chunked : performs the same calculation as this
        function, but returns a generator of chunks of the distance matrix,
        in order to limit memory usage.
    paired_distances : Computes the distances between corresponding elements
        of two arrays.
    """
    if metric not in _VALID_METRICS and \
            not callable(metric) and metric != "precomputed":
        raise ValueError("Unknown metric %s. Valid metrics are %s, "
                         "or 'precomputed', or a callable"
                         % (metric, _VALID_METRICS))

    X = _daal_check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                          force_all_finite=force_all_finite)

    _patching_status = PatchingConditionsChain(
        "sklearn.metrics.pairwise_distances")
    _dal_ready = _patching_status.and_conditions([
        (metric == 'cosine' or metric == 'correlation',
         f"'{metric}' metric is not supported. "
         "Only 'cosine' and 'correlation' metrics are supported."),
        (Y is None, "Second feature array is not supported."),
        (not issparse(X), "X is sparse. Sparse input is not supported."),
        (X.dtype == np.float64,
         f"{X.dtype} X data type is not supported. "
         "Only np.float64 is supported.")
    ])
    _patching_status.write_log()

    if _dal_ready:
        if metric == 'cosine':
            return _daal4py_cosine_distance_dense(X)
        if metric == 'correlation':
            return _daal4py_correlation_distance_dense(X)
        raise ValueError(f"'{metric}' distance is wrong for daal4py.")

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True,
                                     force_all_finite=force_all_finite)
        whom = ("`pairwise_distances`. Precomputed distances "
                "need to have non-negative values.")
        check_non_negative(X, whom=whom)
        return X
    if metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric,
                       force_all_finite=force_all_finite, **kwds)
    else:
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        if dtype == bool and \
                (X.dtype != bool or (Y is not None and Y.dtype != bool)):
            msg = "Data was converted to boolean for metric %s" % metric
            warnings.warn(msg, DataConversionWarning)

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype,
                                     force_all_finite=force_all_finite)

        # precompute data-derived metric params
        params = _precompute_metric_params(X, Y, metric=metric, **kwds)
        kwds.update(**params)

        if effective_n_jobs(n_jobs) == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)

def daal_pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None,
                            **kwds):
    """
    Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix inputs.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
      'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
      'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
      'sqeuclidean', 'yule'] See the documentation for
      scipy.spatial.distance for details on these metrics. These metrics do
      not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which
    are valid scipy.spatial.distance metrics), the scikit-learn
    implementation will be used, which is faster and has support for sparse
    matrices (except for 'cityblock'). For a verbose description of the
    metrics from scikit-learn, see the __doc__ of the
    sklearn.pairwise.distance_metrics function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
            [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on
        each pair of instances (rows) and the resulting value recorded. The
        callable should take two arrays from X as input and return a value
        indicating the distance between them.

    n_jobs : int
        The number of jobs to use for the computation. This works by
        breaking down the pairwise matrix into n_jobs even slices and
        computing them in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but
        one are used.

    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith
        array from X and the jth array from Y.
    """
    if (metric not in _VALID_METRICS and
            not callable(metric) and metric != "precomputed"):
        raise ValueError("Unknown metric %s. "
                         "Valid metrics are %s, or 'precomputed', or a "
                         "callable" % (metric, _VALID_METRICS))
    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True)
        whom = ("`pairwise_distances`. Precomputed distances "
                "need to have non-negative values.")
        check_non_negative(X, whom=whom)
        return X
    elif ((metric == 'cosine') and (Y is None) and
          (not issparse(X)) and X.dtype == np.float64):
        return _daal4py_cosine_distance_dense(X)
    elif ((metric == 'correlation') and (Y is None) and
          (not issparse(X)) and X.dtype == np.float64):
        return _daal4py_correlation_distance_dense(X)
    elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric, **kwds)
    else:
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        if (dtype == bool and
                (X.dtype != bool or (Y is not None and Y.dtype != bool))):
            msg = "Data was converted to boolean for metric %s" % metric
            warnings.warn(msg, DataConversionWarning)

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype)

        # precompute data-derived metric params
        params = _precompute_metric_params(X, Y, metric=metric, **kwds)
        kwds.update(**params)

        if effective_n_jobs(n_jobs) == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)

def _fit(self, X, skip_num_points=0):
    """Private function to fit the model using X as training data."""

    X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
                            dtype=[np.float32, np.float64])
    if self.metric == "precomputed":
        if isinstance(self.init, str) and self.init == 'pca':
            raise ValueError("The parameter init=\"pca\" cannot be "
                             "used with metric=\"precomputed\".")
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square distance matrix")
        check_non_negative(X, "QSNE.fit(). With metric='precomputed', X "
                              "should contain positive distances.")
        if issparse(X):
            raise TypeError('QSNE does not accept sparse '
                            'precomputed distance matrix.')

    random_state = check_random_state(self.random_state)

    if self.early_exaggeration < 1.0:
        raise ValueError("early_exaggeration must be at least 1, but is {}"
                         .format(self.early_exaggeration))

    if self.n_iter < 250:
        raise ValueError("n_iter should be at least 250")

    n_samples = X.shape[0]
    neighbors_nn = None

    # Retrieve the distance matrix, either using the precomputed one or
    # computing it.
    if self.metric == "precomputed":
        distances = X
    else:
        if self.verbose:
            print("[q-SNE] Computing pairwise distances...")

        if self.metric == "euclidean":
            distances = pairwise_distances(X, metric=self.metric,
                                           squared=True)
        else:
            distances = pairwise_distances(X, metric=self.metric,
                                           n_jobs=self.n_jobs)

        if np.any(distances < 0):
            raise ValueError("All distances should be positive, the "
                             "metric given is not correct")

    # compute the joint probability distribution for the input space
    P = _joint_probabilities(distances, self.perplexity, self.verbose)
    assert np.all(np.isfinite(P)), "All probabilities should be finite"
    assert np.all(P >= 0), "All probabilities should be non-negative"
    assert np.all(P <= 1), ("All probabilities should be less "
                            "than or equal to one")

    if isinstance(self.init, np.ndarray):
        X_embedded = self.init
    elif self.init == 'pca':
        pca = PCA(n_components=self.n_components, svd_solver='randomized',
                  random_state=random_state)
        X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
    elif self.init == 'random':
        # The embedding is initialized with iid samples from Gaussians with
        # standard deviation 1e-4.
        X_embedded = 1e-4 * random_state.randn(
            n_samples, self.n_components).astype(np.float32)
    else:
        raise ValueError("'init' must be 'pca', 'random', or "
                         "a numpy array")

    return self._qsne(P, self.q, n_samples, X_embedded=X_embedded,
                      neighbors=neighbors_nn,
                      skip_num_points=skip_num_points)
