Example #1
 def _check_non_neg_array(self, X, whom):
     """check X format
     check X format and make sure no negative value in X.
     Parameters
     ----------
     X :  array-like
     """
     X = check_array(X)
     check_non_negative(X, whom)
     return X
Example #2
 def _check_non_neg_array(self, X, whom):
     """check X format
     check X format and make sure no negative value in X.
     Parameters
     ----------
     X :  array-like or sparse matrix
     """
     X = check_array(X, accept_sparse='csr')
     check_non_negative(X, whom)
     return X
Example #3
    def fit(self, X, y=None, **params):
        X = check_array(X,
                        accept_sparse=('csr', 'csc'),
                        dtype=[np.float64, np.float32])
        try:
            check_non_negative(X, whom='')
        except ValueError:
            raise ValueError("Some non-informative error msg")

        return super().fit(X, y, **params)
Example #4
def test_check_non_negative(retype):
    A = np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    X = retype(A)
    check_non_negative(X, "")
    X = retype([[0, 0], [0, 0]])
    check_non_negative(X, "")

    A[0, 0] = -1
    X = retype(A)
    assert_raises_regex(ValueError, "Negative ", check_non_negative, X, "")
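The `retype` argument of this test is presumably supplied by a pytest parametrization over dense and sparse constructors, as in scikit-learn's own test suite. A minimal sketch of such a parametrization (the exact constructor list is an assumption, not quoted from the test file):

import numpy as np
import scipy.sparse as sp
import pytest
from sklearn.utils.validation import check_non_negative

# Hypothetical parametrization for `retype`; scikit-learn's suite uses a
# similar list of dense and sparse constructors.
@pytest.mark.parametrize(
    "retype", [np.asarray, sp.csr_matrix, sp.csc_matrix, sp.coo_matrix])
def test_check_non_negative_smoke(retype):
    X = retype(np.array([[1, 0], [0, 2]]))
    check_non_negative(X, "")   # all entries non-negative, so no error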
Example #5
    def _fit_transform(self, X, y=None, W=None, H=None, update_H=True):
        X = check_array(X, accept_sparse=('csr', 'csc'))
        check_non_negative(X, "NMF (input X)")

        n_samples, n_features = X.shape
        n_components = self.n_components
        if n_components is None:
            n_components = n_features

        if (not isinstance(n_components, numbers.Integral)
                or n_components <= 0):
            raise ValueError("Number of components must be a positive integer;"
                             " got (n_components=%r)" % n_components)
        if (not isinstance(self.max_iter, numbers.Integral)
                or self.max_iter < 0):
            raise ValueError("Maximum number of iterations must be a positive "
                             "integer; got (max_iter=%r)" % self.max_iter)
        if not isinstance(self.tol, numbers.Number) or self.tol < 0:
            raise ValueError("Tolerance for stopping criteria must be "
                             "positive; got (tol=%r)" % self.tol)

        # check W and H, or initialize them
        if self.init == 'custom' and update_H:
            _check_init(H, (n_components, n_features), "NMF (input H)")
            _check_init(W, (n_samples, n_components), "NMF (input W)")
        elif not update_H:
            _check_init(H, (n_components, n_features), "NMF (input H)")
            W = np.zeros((n_samples, n_components))
        else:
            W, H = _initialize_nmf(X,
                                   n_components,
                                   init=self.init,
                                   random_state=self.random_state)

        if update_H:  # fit_transform
            W, H, n_iter = _fit_projected_gradient(X, W, H, self.tol,
                                                   self.max_iter,
                                                   self.nls_max_iter,
                                                   self.alpha, self.l1_ratio)
        else:  # transform
            Wt, _, n_iter = _nls_subproblem(X.T,
                                            H.T,
                                            W.T,
                                            self.tol,
                                            self.nls_max_iter,
                                            alpha=self.alpha,
                                            l1_ratio=self.l1_ratio)
            W = Wt.T

        if n_iter == self.max_iter and self.tol > 0:
            warnings.warn(
                "Maximum number of iteration %d reached. Increase it"
                " to improve convergence." % self.max_iter, ConvergenceWarning)

        return W, H, n_iter
Example #6
def _check_init(A, shape, whom):
    """Check initialization of A.
    Method copied from scikit-learn NMF source code
    """
    A = check_array(A)
    if np.shape(A) != shape:
        raise ValueError('Array with wrong shape passed to %s. Expected %s, '
                         'but got %s ' % (whom, shape, np.shape(A)))
    check_non_negative(A, whom)
    if np.max(A) == 0:
        raise ValueError('Array passed to %s is full of zeros.' % whom)
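Assuming the `_check_init` helper above is in scope, a quick sketch of the three failure modes it guards against (wrong shape, negative entries, all zeros):

import numpy as np

try:
    _check_init(np.ones((3, 2)), (2, 3), "NMF (input W)")
except ValueError as exc:
    print(exc)   # wrong shape

try:
    _check_init(np.array([[1.0, -0.5], [0.0, 2.0]]), (2, 2), "NMF (input W)")
except ValueError as exc:
    print(exc)   # raised by check_non_negative

try:
    _check_init(np.zeros((2, 3)), (2, 3), "NMF (input W)")
except ValueError as exc:
    print(exc)   # shape and non-negativity pass, then the all-zeros check fires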
Example #7
def test_check_non_negative(retype):
    A = np.array([[1, 1, 0, 0],
                  [1, 1, 0, 0],
                  [0, 0, 0, 0],
                  [0, 0, 0, 0]])
    X = retype(A)
    check_non_negative(X, "")
    X = retype([[0, 0], [0, 0]])
    check_non_negative(X, "")

    A[0, 0] = -1
    X = retype(A)
    assert_raises_regex(ValueError, "Negative ", check_non_negative, X, "")
Example #8
    def _check_alphas(self):
        create_path = self.alphas is None
        if create_path:
            if self.n_alphas <= 0:
                raise ValueError("n_alphas must be a positive integer")

            alphas = numpy.empty(int(self.n_alphas), dtype=numpy.float64)
        else:
            alphas = column_or_1d(self.alphas, warn=True)
            assert_all_finite(alphas)
            check_non_negative(alphas, "alphas")
            assert_all_finite(alphas)
        return alphas, create_path
Example #9
 def _check_penalty_factor(self, n_features):
     if self.penalty_factor is None:
         penalty_factor = numpy.ones(n_features, dtype=numpy.float64)
     else:
         pf = column_or_1d(self.penalty_factor, warn=True)
         if pf.shape[0] != n_features:
             raise ValueError(
                 "penalty_factor must be array of length n_features (%d), "
                 "but got %d" % (n_features, pf.shape[0]))
         assert_all_finite(pf)
         check_non_negative(pf, "penalty_factor")
         penalty_factor = pf * n_features / pf.sum()
         assert_all_finite(penalty_factor)
     return penalty_factor
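The rescaling in the branch above keeps the penalty weights non-negative and makes them sum to n_features. A small worked sketch of that arithmetic:

import numpy as np

pf = np.array([1.0, 2.0, 3.0])
n_features = 3
penalty_factor = pf * n_features / pf.sum()
print(penalty_factor)          # [0.5 1.  1.5]
print(penalty_factor.sum())    # 3.0 == n_features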
Example #10
    def _fit_transform(self, X, y=None, W=None, H=None, update_H=True):
        X = check_array(X, accept_sparse=('csr', 'csc'))
        check_non_negative(X, "NMF (input X)")

        n_samples, n_features = X.shape
        n_components = self.n_components
        if n_components is None:
            n_components = n_features

        if (not isinstance(n_components, INTEGER_TYPES) or
                n_components <= 0):
            raise ValueError("Number of components must be a positive integer;"
                             " got (n_components=%r)" % n_components)
        if not isinstance(self.max_iter, INTEGER_TYPES) or self.max_iter < 0:
            raise ValueError("Maximum number of iterations must be a positive "
                             "integer; got (max_iter=%r)" % self.max_iter)
        if not isinstance(self.tol, numbers.Number) or self.tol < 0:
            raise ValueError("Tolerance for stopping criteria must be "
                             "positive; got (tol=%r)" % self.tol)

        # check W and H, or initialize them
        if self.init == 'custom' and update_H:
            _check_init(H, (n_components, n_features), "NMF (input H)")
            _check_init(W, (n_samples, n_components), "NMF (input W)")
        elif not update_H:
            _check_init(H, (n_components, n_features), "NMF (input H)")
            W = np.zeros((n_samples, n_components))
        else:
            W, H = _initialize_nmf(X, n_components, init=self.init,
                                   random_state=self.random_state)

        if update_H:  # fit_transform
            W, H, n_iter = _fit_projected_gradient(
                X, W, H, self.tol, self.max_iter, self.nls_max_iter,
                self.alpha, self.l1_ratio)
        else:  # transform
            Wt, _, n_iter = _nls_subproblem(X.T, H.T, W.T, self.tol,
                                            self.nls_max_iter,
                                            alpha=self.alpha,
                                            l1_ratio=self.l1_ratio)
            W = Wt.T

        if n_iter == self.max_iter and self.tol > 0:
            warnings.warn("Maximum number of iteration %d reached. Increase it"
                          " to improve convergence." % self.max_iter,
                          ConvergenceWarning)

        return W, H, n_iter
Example #11
    def fit(self, X, y=None):
        """Learn the idf (n_samples) and beta vectors (n_features)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """

        check_array(X, accept_sparse=['csr', 'csc'])

        if not sp.issparse(X):
            X = sp.csc_matrix(X)

        n_samples, n_features = X.shape

        # if X is an array of zeros, raise ValueError
        if X.sum() == 0:
            raise ValueError("X is an array of zeros")

        # raise value error if there are negative values in X
        check_non_negative(X, "Bm25Transformer")

        df = _document_frequency(X)

        if self.smooth_idf and self.es_style_idf_smooth:
            # implementing what elasticsearch does
            df = df.astype(float)
            n_samples_calc = n_samples - df + 0.5
            idf = np.log(1.0 + n_samples_calc / (df + 0.5))

        else:
            # original version
            # identical to what TfidfTransformer implemented 

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples_calc = n_samples + int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = np.log(float(n_samples_calc) / df) + 1.0


        self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)

        return self
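A small numeric sketch of the two idf variants computed above, assuming smooth_idf=True, on a toy document-frequency vector (df) and corpus size (n_samples):

import numpy as np

n_samples = 4
df = np.array([1.0, 2.0, 4.0])   # document frequencies of three terms

# Elasticsearch-style smoothing (es_style_idf_smooth=True)
idf_es = np.log(1.0 + (n_samples - df + 0.5) / (df + 0.5))

# classic TfidfTransformer-style smoothing
idf_classic = np.log((n_samples + 1.0) / (df + 1.0)) + 1.0

print(idf_es)        # rarer terms (small df) get the largest weights
print(idf_classic)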
Example #12
    def _check_params(self, n_features):
        if not 0 < self.l1_ratio <= 1:
            raise ValueError("l1_ratio must be in interval ]0;1], but was %f" %
                             self.l1_ratio)

        if self.tol <= 0:
            raise ValueError("tolerance must be positive, but was %f" %
                             self.tol)

        if self.penalty_factor is None:
            penalty_factor = numpy.ones(n_features, dtype=numpy.float64)
        else:
            pf = column_or_1d(self.penalty_factor, warn=True)
            if pf.shape[0] != n_features:
                raise ValueError(
                    "penalty_factor must be array of length n_features (%d), "
                    "but got %d" % (n_features, pf.shape[0]))
            assert_all_finite(pf)
            check_non_negative(pf, "penalty_factor")
            penalty_factor = pf * n_features / pf.sum()
            assert_all_finite(penalty_factor)

        create_path = self.alphas is None
        if create_path:
            if self.n_alphas <= 0:
                raise ValueError("n_alphas must be a positive integer")

            alphas = numpy.empty(int(self.n_alphas), dtype=numpy.float64)
        else:
            alphas = column_or_1d(self.alphas, warn=True)
            assert_all_finite(alphas)
            check_non_negative(alphas, "alphas")
            assert_all_finite(alphas)

        if self.max_iter <= 0:
            raise ValueError("max_iter must be a positive integer")

        return create_path, alphas.astype(
            numpy.float64), penalty_factor.astype(numpy.float64)
Example #13
def test_check_non_negative(retype):
    A = np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    X = retype(A)
    check_non_negative(X, "")
    X = retype([[0, 0], [0, 0]])
    check_non_negative(X, "")

    A[0, 0] = -1
    X = retype(A)
    with pytest.raises(ValueError, match="Negative "):
        check_non_negative(X, "")
Example #14
def _initialize_nmf(X, n_components, init=None, eps=1e-6, random_state=None):
    """Algorithms for NMF initialization.

    Computes an initial guess for the non-negative
    rank k matrix approximation for X: X = WH

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The data matrix to be decomposed.

    n_components : integer
        The number of components desired in the approximation.

    init :  None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar'
        Method used to initialize the procedure.
        Default: 'nndsvd' if n_components < n_features, otherwise 'random'.
        Valid options:

        - 'random': non-negative random matrices, scaled with:
            sqrt(X.mean() / n_components)

        - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
            initialization (better for sparseness)

        - 'nndsvda': NNDSVD with zeros filled with the average of X
            (better when sparsity is not desired)

        - 'nndsvdar': NNDSVD with zeros filled with small random values
            (generally faster, less accurate alternative to NNDSVDa
            for when sparsity is not desired)

        - 'custom': use custom matrices W and H

    eps : float
        Truncate all values less than this in output to zero.

    random_state : int seed, RandomState instance, or None (default)
        Random number generator seed control, used in 'nndsvdar' and
        'random' modes.

    Returns
    -------
    W : array-like, shape (n_samples, n_components)
        Initial guesses for solving X ~= WH

    H : array-like, shape (n_components, n_features)
        Initial guesses for solving X ~= WH

    References
    ----------
    C. Boutsidis, E. Gallopoulos: SVD based initialization: A head start for
    nonnegative matrix factorization - Pattern Recognition, 2008
    http://tinyurl.com/nndsvd
    """
    check_non_negative(X, "NMF initialization")
    n_samples, n_features = X.shape

    if init is None:
        if n_components < n_features:
            init = 'nndsvd'
        else:
            init = 'random'

    # Random initialization
    if init == 'random':
        avg = np.sqrt(X.mean() / n_components)
        rng = check_random_state(random_state)
        H = avg * rng.randn(n_components, n_features)
        W = avg * rng.randn(n_samples, n_components)
        # we do not write np.abs(H, out=H) to stay compatible with
        # numpy 1.5 and earlier where the 'out' keyword is not
        # supported as a kwarg on ufuncs
        np.abs(H, H)
        np.abs(W, W)
        return W, H

    # NNDSVD initialization
    U, S, V = randomized_svd(X, n_components, random_state=random_state)
    W, H = np.zeros(U.shape), np.zeros(V.shape)

    # The leading singular triplet is non-negative
    # so it can be used as is for initialization.
    W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])
    H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])

    for j in range(1, n_components):
        x, y = U[:, j], V[j, :]

        # extract positive and negative parts of column vectors
        x_p, y_p = np.maximum(x, 0), np.maximum(y, 0)
        x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0))

        # and their norms
        x_p_nrm, y_p_nrm = norm(x_p), norm(y_p)
        x_n_nrm, y_n_nrm = norm(x_n), norm(y_n)

        m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm

        # choose update
        if m_p > m_n:
            u = x_p / x_p_nrm
            v = y_p / y_p_nrm
            sigma = m_p
        else:
            u = x_n / x_n_nrm
            v = y_n / y_n_nrm
            sigma = m_n

        lbd = np.sqrt(S[j] * sigma)
        W[:, j] = lbd * u
        H[j, :] = lbd * v

    W[W < eps] = 0
    H[H < eps] = 0

    if init == "nndsvd":
        pass
    elif init == "nndsvda":
        avg = X.mean()
        W[W == 0] = avg
        H[H == 0] = avg
    elif init == "nndsvdar":
        rng = check_random_state(random_state)
        avg = X.mean()
        W[W == 0] = abs(avg * rng.randn(len(W[W == 0])) / 100)
        H[H == 0] = abs(avg * rng.randn(len(H[H == 0])) / 100)
    else:
        raise ValueError(
            'Invalid init parameter: got %r instead of one of %r' %
            (init, (None, 'random', 'nndsvd', 'nndsvda', 'nndsvdar')))

    return W, H
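A hedged cross-check via scikit-learn's public NMF estimator that each init family used above yields non-negative factors of the expected shapes (a sketch; exact values depend on the scikit-learn version):

import numpy as np
from sklearn.decomposition import NMF

rng = np.random.RandomState(0)
X = np.abs(rng.randn(6, 5))

for init in ('random', 'nndsvd', 'nndsvda', 'nndsvdar'):
    model = NMF(n_components=3, init=init, random_state=0, max_iter=1000)
    W = model.fit_transform(X)
    H = model.components_
    assert W.shape == (6, 3) and H.shape == (3, 5)
    assert (W >= 0).all() and (H >= 0).all()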
Example #15
def non_negative_factorization(X,
                               W=None,
                               H=None,
                               n_components=None,
                               init='random',
                               update_H=True,
                               solver='cd',
                               tol=1e-4,
                               max_iter=200,
                               alpha=0.,
                               l1_ratio=0.,
                               regularization=None,
                               random_state=None,
                               verbose=0,
                               shuffle=False,
                               nls_max_iter=2000,
                               sparseness=None,
                               beta=1,
                               eta=0.1):
    """Compute Non-negative Matrix Factorization (NMF)

    Find two non-negative matrices (W, H) whose product approximates the non-
    negative matrix X. This factorization can be used for example for
    dimensionality reduction, source separation or topic extraction.

    The objective function is::

        0.5 * ||X - WH||_Fro^2
        + alpha * l1_ratio * ||vec(W)||_1
        + alpha * l1_ratio * ||vec(H)||_1
        + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2
        + 0.5 * alpha * (1 - l1_ratio) * ||H||_Fro^2

    Where::

        ||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm)
        ||vec(A)||_1 = \sum_{i,j} abs(A_{ij}) (Elementwise L1 norm)

    The objective function is minimized with an alternating minimization of W
    and H. If H is given and update_H=False, it solves for W only.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Constant matrix.

    W : array-like, shape (n_samples, n_components)
        If init='custom', it is used as initial guess for the solution.

    H : array-like, shape (n_components, n_features)
        If init='custom', it is used as initial guess for the solution.
        If update_H=False, it is used as a constant, to solve for W only.

    n_components : integer
        Number of components, if n_components is not set all features
        are kept.

    init :  None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom'
        Method used to initialize the procedure.
        Default: 'nndsvd' if n_components < n_features, otherwise random.
        Valid options:

        - 'random': non-negative random matrices, scaled with:
            sqrt(X.mean() / n_components)

        - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
            initialization (better for sparseness)

        - 'nndsvda': NNDSVD with zeros filled with the average of X
            (better when sparsity is not desired)

        - 'nndsvdar': NNDSVD with zeros filled with small random values
            (generally faster, less accurate alternative to NNDSVDa
            for when sparsity is not desired)

        - 'custom': use custom matrices W and H

    update_H : boolean, default: True
        Set to True, both W and H will be estimated from initial guesses.
        Set to False, only W will be estimated.

    solver : 'pg' | 'cd'
        Numerical solver to use:
        'pg' is a (deprecated) Projected Gradient solver.
        'cd' is a Coordinate Descent solver.

    tol : float, default: 1e-4
        Tolerance of the stopping condition.

    max_iter : integer, default: 200
        Maximum number of iterations before timing out.

    alpha : double, default: 0.
        Constant that multiplies the regularization terms.

    l1_ratio : double, default: 0.
        The regularization mixing parameter, with 0 <= l1_ratio <= 1.
        For l1_ratio = 0 the penalty is an elementwise L2 penalty
        (aka Frobenius Norm).
        For l1_ratio = 1 it is an elementwise L1 penalty.
        For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.

    regularization : 'both' | 'components' | 'transformation' | None
        Select whether the regularization affects the components (H), the
        transformation (W), both or none of them.

    random_state : integer seed, RandomState instance, or None (default)
        Random number generator seed control.

    verbose : integer, default: 0
        The verbosity level.

    shuffle : boolean, default: False
        If true, randomize the order of coordinates in the CD solver.

    nls_max_iter : integer, default: 2000
        Number of iterations in NLS subproblem.
        Used only in the deprecated 'pg' solver.

    sparseness : 'data' | 'components' | None, default: None
        Where to enforce sparsity in the model.
        Used only in the deprecated 'pg' solver.

    beta : double, default: 1
        Degree of sparseness, if sparseness is not None. Larger values mean
        more sparseness. Used only in the deprecated 'pg' solver.

    eta : double, default: 0.1
        Degree of correctness to maintain, if sparsity is not None. Smaller
        values mean larger error. Used only in the deprecated 'pg' solver.

    Returns
    -------
    W : array-like, shape (n_samples, n_components)
        Solution to the non-negative least squares problem.

    H : array-like, shape (n_components, n_features)
        Solution to the non-negative least squares problem.

    n_iter : int
        Actual number of iterations.

    References
    ----------
    C.-J. Lin. Projected gradient methods for non-negative matrix
    factorization. Neural Computation, 19(2007), 2756-2779.
    http://www.csie.ntu.edu.tw/~cjlin/nmf/

    Cichocki, Andrzej, and P. H. A. N. Anh-Huy. "Fast local algorithms for
    large scale nonnegative matrix and tensor factorizations."
    IEICE transactions on fundamentals of electronics, communications and
    computer sciences 92.3: 708-721, 2009.
    """

    X = check_array(X, accept_sparse=('csr', 'csc'))
    check_non_negative(X, "NMF (input X)")
    _check_string_param(sparseness, solver)

    n_samples, n_features = X.shape
    if n_components is None:
        n_components = n_features

    if not isinstance(n_components, INTEGER_TYPES) or n_components <= 0:
        raise ValueError("Number of components must be a positive integer;"
                         " got (n_components=%r)" % n_components)
    if not isinstance(max_iter, INTEGER_TYPES) or max_iter < 0:
        raise ValueError(
            "Maximum number of iterations must be a positive integer;"
            " got (max_iter=%r)" % max_iter)
    if not isinstance(tol, numbers.Number) or tol < 0:
        raise ValueError("Tolerance for stopping criteria must be "
                         "positive; got (tol=%r)" % tol)

    # check W and H, or initialize them
    if init == 'custom' and update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        _check_init(W, (n_samples, n_components), "NMF (input W)")
    elif not update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        W = np.zeros((n_samples, n_components))
    else:
        W, H = _initialize_nmf(X,
                               n_components,
                               init=init,
                               random_state=random_state)

    if solver == 'pg':
        warnings.warn(
            "'pg' solver will be removed in release 0.19."
            " Use 'cd' solver instead.", DeprecationWarning)
        if update_H:  # fit_transform
            W, H, n_iter = _fit_projected_gradient(X, W, H, tol, max_iter,
                                                   nls_max_iter, alpha,
                                                   l1_ratio, sparseness, beta,
                                                   eta)
        else:  # transform
            W, H, n_iter = _update_projected_gradient_w(
                X, W, H, tol, nls_max_iter, alpha, l1_ratio, sparseness, beta,
                eta)
    elif solver == 'cd':
        W, H, n_iter = _fit_coordinate_descent(X,
                                               W,
                                               H,
                                               tol,
                                               max_iter,
                                               alpha,
                                               l1_ratio,
                                               regularization,
                                               update_H=update_H,
                                               verbose=verbose,
                                               shuffle=shuffle,
                                               random_state=random_state)
    else:
        raise ValueError("Invalid solver parameter '%s'." % solver)

    if n_iter == max_iter:
        warnings.warn(
            "Maximum number of iteration %d reached. Increase it to"
            " improve convergence." % max_iter, ConvergenceWarning)

    return W, H, n_iter
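A minimal usage sketch with scikit-learn's public non_negative_factorization (only arguments that are stable across versions are passed; the variant above additionally exposes the deprecated 'pg' solver):

import numpy as np
from sklearn.decomposition import non_negative_factorization

X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
W, H, n_iter = non_negative_factorization(
    X, n_components=2, init='random', random_state=0)
print(W.shape, H.shape, n_iter)   # (6, 2) (2, 2) and the iteration count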
Example #16
def topic_supervised_factorization(X,
                                   W=None,
                                   H=None,
                                   n_components=None,
                                   labels=None,
                                   init=None,
                                   update_H=True,
                                   tol=1e-4,
                                   max_iter=200,
                                   random_state=None,
                                   verbose=0):
    """Compute Topic Supervised Non-negative Matrix Factorization (TSNMF)
    Similar to NMF, this method finds two non-negative matrices (W, H) whose
    product approximates the non-negative matrix X, where W is constrained by
    a supervision matrix L. This factorization can be used for example for
    dimensionality reduction, source separation or topic extraction.
    The objective function is::
        ||X - Had(W,L)H||_Fro^2
    Where::
        ||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm)
        Had(W,L) is the Hadamard (entrywise) product
    The objective function is minimized with an alternating minimization of W
    and H. If H is given and update_H=False, it solves for W only.
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Constant matrix.
    W : array-like, shape (n_samples, n_components)
    H : array-like, shape (n_components, n_features)
        If update_H=False, it is used as a constant, to solve for W only.
    n_components : integer
        Number of components, if n_components is not set all features
        are kept.
    labels : list of list
        The order of the sublists should match the order of the docs from X,
        and each sublist is a list of topic id tags
    init : None | 'random' | 'nndsvd'
        Method used to initialize the procedure.
        Default: 'random'.
        Valid options:
        - None: 'nndsvd' if n_components < n_features, otherwise 'random'.
        - 'random': non-negative random matrices, scaled with:
            sqrt(X.mean() / n_components)
        - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
            initialization (better for sparseness)
    update_H : boolean, default: True
        Set to True, both W and H will be estimated from initial guesses.
        Set to False, only W will be estimated.
    tol : float, default: 1e-4
        Tolerance of the stopping condition.
    max_iter : integer, default: 200
        Maximum number of iterations before timing out.
    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    verbose : integer, default: 0
        The verbosity level.
    Returns
    -------
    W : array-like, shape (n_samples, n_components)
        Solution to the non-negative least squares problem.
    H : array-like, shape (n_components, n_features)
        Solution to the non-negative least squares problem.
    n_iter : int
        Actual number of iterations.
    """

    X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)  # from utils
    check_non_negative(X, "TSNMF (input X)")  # from utils.validation

    n_samples, n_features = X.shape
    if n_components is None:
        n_components = n_features

    #Validation from NMF sklearn source code
    if not isinstance(n_components, INTEGER_TYPES) or n_components <= 0:
        raise ValueError("Number of components must be a positive integer;"
                         " got (n_components=%r)" % n_components)
    if not isinstance(max_iter, INTEGER_TYPES) or max_iter < 0:
        raise ValueError("Maximum number of iterations must be a positive "
                         "integer; got (max_iter=%r)" % max_iter)
    if not isinstance(tol, numbers.Number) or tol < 0:
        raise ValueError("Tolerance for stopping criteria must be "
                         "positive; got (tol=%r)" % tol)

    # Supervision matrix
    L = create_constraint_matrix(labels, n_components)

    if not update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        # 'mu' solver should not be initialized by zeros
        avg = np.sqrt(X.mean() / n_components)
        W = np.full((n_samples, n_components), avg)
    else:
        W_init, H_init = _initialize_tsnmf(X,
                                           L,
                                           n_components,
                                           init=init,
                                           random_state=random_state)

        # override W or H if supplied
        if W is not None:
            _check_init(W, (n_samples, n_components), "NMF (input W)")
        else:
            W = W_init

        if H is not None:
            _check_init(H, (n_components, n_features), "NMF (input H)")
        else:
            H = H_init

    if init == 'nndsvd':
        W[(L == 1) & (W <= EPSILON)] = W.mean()

    W, H, n_iter = _fit_multiplicative_update(X, W, H, L, max_iter, tol,
                                              update_H, verbose)
    return W, H, n_iter
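A hypothetical call based only on the signature and docstring above (the label format and the behavior of create_constraint_matrix are assumptions taken from this snippet, not a published API):

import numpy as np

X = np.abs(np.random.RandomState(0).randn(4, 6))
labels = [[0], [0, 1], [1], [1]]   # one list of topic ids per document

W, H, n_iter = topic_supervised_factorization(
    X, n_components=2, labels=labels, init='random',
    max_iter=100, random_state=0)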
Example #17
    def fit(self, X, verbose=False):
        r"""Compute the basis matrix on the provided data matrix using the
        UoI\ :sub:`NMF` algorithm.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Data matrix to be decomposed.
        verbose : bool
            If True, outputs status updates.
        """
        if verbose:
            self._logger.setLevel(logging.DEBUG)
        else:
            self._logger.setLevel(logging.WARNING)

        check_non_negative(X, 'UoI NMF')
        n_samples, n_features = X.shape

        H_samples = {
            k: np.zeros((self.n_boots, k, n_features))
            for k in self.ranks
        }

        rep_idx = self._rand.randint(n_samples, size=(self.n_boots, n_samples))
        for boot_idx in range(self.n_boots):
            self._logger.info("Bootstrap %d" % boot_idx)
            # compute NMF bases for k across bootstrap replicates
            sample = X[rep_idx[boot_idx]]
            for k_idx, k in enumerate(self.ranks):
                # concatenate k by p
                H_samples[k][boot_idx] = \
                    self.nmf.set_params(n_components=k).fit(sample).components_

        if self.use_dissimilarity:
            gamma = np.zeros(len(self.ranks))
            # iterate over each rank
            for k_idx, k in enumerate(self.ranks):
                # extract the bases for each rank
                H_k = H_samples[k]
                for boot1, boot2 in combinations(range(self.n_boots), 2):
                    gamma[k_idx] += dissimilarity(H_k[boot1], H_k[boot2])
            self.dissimilarity_ = (gamma * 2) /\
                                  (self.n_boots *
                                   (self.n_boots - 1))
            k_min = self.ranks[np.argmin(gamma)]
            H_pre_cluster = H_samples[k_min].reshape(
                (self.n_boots * k_min, n_features))
        else:
            H_pre_cluster = np.zeros(
                (self.n_boots * np.sum(self.ranks), n_features))
            start_idx = 0
            for k in self.ranks:
                H_k = H_samples[k].reshape((self.n_boots * k, n_features))
                end_idx = start_idx + self.n_boots * k
                H_pre_cluster[start_idx:end_idx] = H_k
                start_idx = end_idx

        # remove zero bases and normalize across features
        H_pre_cluster = H_pre_cluster[np.sum(H_pre_cluster, axis=1) != 0.0]
        H_pre_cluster = normalize(H_pre_cluster, norm='l2', axis=1)

        # cluster all bases
        self._logger.info("clustering bases samples")
        labels = self.cluster.fit_predict(H_pre_cluster)

        # compute consensus bases from clusters
        cluster_ids = np.unique(labels[labels != -1])
        n_clusters = cluster_ids.size
        self._logger.info("found %d bases, computing consensus bases" %
                          n_clusters)
        H_cons = np.zeros((n_clusters, n_features))

        for c_id in cluster_ids:
            H_cons[c_id, :] = self.cons_meth(H_pre_cluster[labels == c_id],
                                             axis=0)

        # re-normalize across features
        H_cons = normalize(H_cons, norm='l2', axis=1)

        self.components_ = H_cons
        self.n_components = self.components_.shape[0]
        self.bases_samples_ = H_pre_cluster
        self.bases_samples_labels_ = labels
        self.bootstraps_ = rep_idx
        self.reconstruction_err_ = None
        return self
Example #18
def _initialize_tsnmf(X,
                      L,
                      n_components,
                      init=None,
                      eps=1e-6,
                      random_state=None):
    """
    Initialization of matrices W and H (X = WH)

    Returns
    -------
    W : array-like, shape (n_samples, n_components)
        Initial guesses for solving X ~= WH
    H : array-like, shape (n_components, n_features)
        Initial guesses for solving X ~= WH
    """

    check_non_negative(X, "TSNMF initialization")
    n_samples, n_features = X.shape
    if (init is not None and init != 'random'
            and n_components > min(n_samples, n_features)):
        raise ValueError(
            "init = '{}' can only be used when "
            "n_components <= min(n_samples, n_features)".format(init))

    if init is None:
        if n_components <= min(n_samples, n_features):
            init = 'nndsvd'
        else:
            init = 'random'

    if init == 'random':
        avg = np.sqrt(X.mean() / n_components)
        rng = check_random_state(random_state)
        H = avg * rng.randn(n_components, n_features)
        W = avg * rng.randn(n_samples, n_components)
        # we do not write np.abs(H, out=H) to stay compatible with
        # numpy 1.5 and earlier where the 'out' keyword is not
        # supported as a kwarg on ufuncs
        np.abs(H, H)
        np.abs(W, W)
        return W, H

    # NNDSVD initialization
    U, S, V = randomized_svd(X, n_components, random_state=random_state)
    W, H = np.zeros(U.shape), np.zeros(V.shape)

    # The leading singular triplet is non-negative
    # so it can be used as is for initialization.
    W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])
    H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])

    for j in range(1, n_components):
        x, y = U[:, j], V[j, :]

        # extract positive and negative parts of column vectors
        x_p, y_p = np.maximum(x, 0), np.maximum(y, 0)
        x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0))

        # and their norms
        x_p_nrm, y_p_nrm = norm(x_p), norm(y_p)
        x_n_nrm, y_n_nrm = norm(x_n), norm(y_n)

        m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm

        # choose update
        if m_p > m_n:
            u = x_p / x_p_nrm
            v = y_p / y_p_nrm
            sigma = m_p
        else:
            u = x_n / x_n_nrm
            v = y_n / y_n_nrm
            sigma = m_n

        lbd = np.sqrt(S[j] * sigma)
        W[:, j] = lbd * u
        H[j, :] = lbd * v

    W[W < eps] = 0
    H[H < eps] = 0

    if init == "nndsvd":
        pass
    else:
        raise ValueError(
            'Invalid init parameter: got %r instead of one of %r' % (init, (
                None,
                'random',
                'nndsvd',
            )))

    return W, H
Example #19
def _initialize_mf(M,
                   n_components,
                   init=None,
                   eps=1e-6,
                   random_state=None,
                   non_negative=False):
    """Algorithms for MF initialization.

    Computes an initial guess for the non-negative
    rank k matrix approximation for M: M = AB^T

    Parameters
    ----------
    M : array-like, shape (n_samples, n_features)
        The data matrix to be decomposed.

    n_components : integer
        The number of components desired in the approximation.

    init :  None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'svd'
        Method used to initialize the procedure.
        Default: 'svd' if n_components < n_features, otherwise 'random'.
        Valid options:

        - 'random': non-negative random matrices, scaled with:
            sqrt(X.mean() / n_components)

        - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
            initialization (better for sparseness)

        - 'nndsvda': NNDSVD with zeros filled with the average of X
            (better when sparsity is not desired)

        - 'nndsvdar': NNDSVD with zeros filled with small random values
            (generally faster, less accurate alternative to NNDSVDa
            for when sparsity is not desired)

    non_negative: bool
        Whether to decompose into non-negative matrices.

    eps : float
        If non-negative, truncate all values less than this in output to zero.

    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used when ``random`` == 'nndsvdar' or 'random'.

    Returns
    -------
    A : array-like, shape (n_samples, n_components)
        Initial guesses for solving M ~= AB^T

    B : array-like, shape (n_features, n_components)
        Initial guesses for solving M ~= AB^T

    References
    ----------
    C. Boutsidis, E. Gallopoulos: SVD based initialization: A head start for
    nonnegative matrix factorization - Pattern Recognition, 2008
    http://tinyurl.com/nndsvd
    """
    if non_negative:
        check_non_negative(M, "MF initialization")

    n_samples, n_features = M.shape

    if init is None:
        if n_components < n_features:
            init = 'nndsvdar' if non_negative else 'svd'
        else:
            init = 'random'

    if init == 'random':
        avg = np.sqrt(np.abs(M.mean()) / n_components)
        rng = check_random_state(random_state)
        A = avg * rng.randn(n_samples, n_components)
        B = avg * rng.randn(n_components, n_features)
        if non_negative:
            np.abs(A, A)
            np.abs(B, B)

    elif init == 'svd':
        if non_negative:
            raise ValueError(
                'SVD initialization incompatible with NMF (use nndsvd instead)'
            )
        if min(n_samples, n_features) < n_components:
            warnings.warn(
                'The number of components is smaller than the rank in svd '
                'initialization. The input will be padded with zeros to '
                'compensate for the lack of singular values.')
        # simple SVD based approximation
        U, S, V = randomized_svd(M, n_components, random_state=random_state)
        # randomized_svd only returns min(n_components, n_features, n_samples)
        # singular values and vectors; therefore, to retain the desired shape,
        # we need to pad and reshape the inputs
        if n_components > n_features:
            U_padded = np.zeros((U.shape[0], n_components))
            U_padded[:, :U.shape[1]] = U
            U = U_padded
            V_padded = np.zeros((n_components, V.shape[1]))
            V_padded[:V.shape[0], :] = V
            V = V_padded
            S_padded = np.zeros(n_components)
            S_padded[:S.shape[0]] = S
            S = S_padded

        S = np.diag(np.sqrt(S))
        A = np.dot(U, S)
        B = np.dot(S, V)

    elif init in ['nndsvd', 'nndsvda', 'nndsvdar']:
        if not non_negative:
            warnings.warn(
                '%s results in non-negative constrained factors, ' % init +
                'so SVD initialization should provide a better initial '
                'estimate')
        # NNDSVD initialization
        U, S, V = randomized_svd(M, n_components, random_state=random_state)
        A, B = np.zeros(U.shape), np.zeros(V.shape)

        # The leading singular triplet is non-negative
        # so it can be used as is for initialization.
        A[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])
        B[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])

        for j in range(1, n_components):
            x, y = U[:, j], V[j, :]

            # extract positive and negative parts of column vectors
            x_p, y_p = np.maximum(x, 0), np.maximum(y, 0)
            x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0))

            # and their norms
            x_p_nrm, y_p_nrm = norm(x_p), norm(y_p)
            x_n_nrm, y_n_nrm = norm(x_n), norm(y_n)

            m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm

            # choose update
            if m_p > m_n:
                u = x_p / x_p_nrm
                v = y_p / y_p_nrm
                sigma = m_p
            else:
                u = x_n / x_n_nrm
                v = y_n / y_n_nrm
                sigma = m_n

            lbd = np.sqrt(S[j] * sigma)
            A[:, j] = lbd * u
            B[j, :] = lbd * v

        A[A < eps] = 0
        B[B < eps] = 0

        if init == "nndsvd":
            pass
        elif init == "nndsvda":
            avg = M.mean()
            A[A == 0] = avg
            B[B == 0] = avg
        elif init == "nndsvdar":
            rng = check_random_state(random_state)
            avg = M.mean()
            A[A == 0] = abs(avg * rng.randn(len(A[A == 0])) / 100)
            B[B == 0] = abs(avg * rng.randn(len(B[B == 0])) / 100)

    else:
        raise ValueError("Invalid init argument")

    return A, B.T
Example #20
    def _fit(self, X, skip_num_points=0):
        """Private function to fit the model using X as training data."""

        if self.method not in ['barnes_hut', 'exact']:
            raise ValueError("'method' must be 'barnes_hut' or 'exact'")
        if self.angle < 0.0 or self.angle > 1.0:
            raise ValueError("'angle' must be between 0.0 - 1.0")

        if hasattr(self, 'square_distances'):
            if self.square_distances not in [True, 'legacy']:
                raise ValueError(
                    "'square_distances' must be True or 'legacy'.")
            if self.metric != "euclidean" and self.square_distances is not True:
                warnings.warn(
                    ("'square_distances' has been introduced in 0.24"
                     "to help phase out legacy squaring behavior. The "
                     "'legacy' setting will be removed in 0.26, and the "
                     "default setting will be changed to True. In 0.28, "
                     "'square_distances' will be removed altogether,"
                     "and distances will be squared by default. Set "
                     "'square_distances'=True to silence this warning."),
                    FutureWarning)

        if self.method == 'barnes_hut':
            if sklearn_check_version('0.23'):
                X = self._validate_data(X,
                                        accept_sparse=['csr'],
                                        ensure_min_samples=2,
                                        dtype=[np.float32, np.float64])
            else:
                X = check_array(X,
                                accept_sparse=['csr'],
                                ensure_min_samples=2,
                                dtype=[np.float32, np.float64])
        else:
            if sklearn_check_version('0.23'):
                X = self._validate_data(X,
                                        accept_sparse=['csr', 'csc', 'coo'],
                                        dtype=[np.float32, np.float64])
            else:
                X = check_array(X,
                                accept_sparse=['csr', 'csc', 'coo'],
                                dtype=[np.float32, np.float64])
        if self.metric == "precomputed":
            if isinstance(self.init, str) and self.init == 'pca':
                raise ValueError("The parameter init=\"pca\" cannot be "
                                 "used with metric=\"precomputed\".")
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square distance matrix")

            check_non_negative(
                X, "TSNE.fit(). With metric='precomputed', X "
                "should contain positive distances.")

            if self.method == "exact" and issparse(X):
                raise TypeError(
                    'TSNE with method="exact" does not accept sparse '
                    'precomputed distance matrix. Use method="barnes_hut" '
                    'or provide the dense distance matrix.')

        if self.method == 'barnes_hut' and self.n_components > 3:
            raise ValueError("'n_components' should be inferior to 4 for the "
                             "barnes_hut algorithm as it relies on "
                             "quad-tree or oct-tree.")
        random_state = check_random_state(self.random_state)

        if self.early_exaggeration < 1.0:
            raise ValueError(
                "early_exaggeration must be at least 1, but is {}".format(
                    self.early_exaggeration))

        if self.n_iter < 250:
            raise ValueError("n_iter should be at least 250")

        n_samples = X.shape[0]

        neighbors_nn = None
        if self.method == "exact":
            # Retrieve the distance matrix, either using the precomputed one or
            # computing it.
            if self.metric == "precomputed":
                distances = X
            else:
                if self.verbose:
                    print("[t-SNE] Computing pairwise distances...")

                if self.metric == "euclidean":
                    # Euclidean is squared here, rather than using **= 2,
                    # because euclidean_distances already calculates
                    # squared distances, and returns np.sqrt(dist) for
                    # squared=False.
                    # Also, Euclidean is slower for n_jobs>1, so don't set here
                    distances = pairwise_distances(X,
                                                   metric=self.metric,
                                                   squared=True)
                else:
                    distances = pairwise_distances(X,
                                                   metric=self.metric,
                                                   n_jobs=self.n_jobs)

            if np.any(distances < 0):
                raise ValueError("All distances should be positive, the "
                                 "metric given is not correct")

            if self.metric != "euclidean" and \
                    getattr(self, 'square_distances', True) is True:
                distances **= 2

            # compute the joint probability distribution for the input space
            P = _joint_probabilities(distances, self.perplexity, self.verbose)
            assert np.all(np.isfinite(P)), "All probabilities should be finite"
            assert np.all(P >= 0), "All probabilities should be non-negative"
            assert np.all(P <= 1), ("All probabilities should be less "
                                    "than or equal to one")

        else:
            # Compute the number of nearest neighbors to find.
            # LvdM uses 3 * perplexity as the number of neighbors.
            # In the event that we have very small # of points
            # set the neighbors to n - 1.
            n_neighbors = min(n_samples - 1, int(3. * self.perplexity + 1))

            if self.verbose:
                print("[t-SNE] Computing {} nearest neighbors...".format(
                    n_neighbors))

            # Find the nearest neighbors for every point
            knn = NearestNeighbors(algorithm='auto',
                                   n_jobs=self.n_jobs,
                                   n_neighbors=n_neighbors,
                                   metric=self.metric)
            t0 = time()
            knn.fit(X)
            duration = time() - t0
            if self.verbose:
                print("[t-SNE] Indexed {} samples in {:.3f}s...".format(
                    n_samples, duration))

            t0 = time()
            distances_nn = knn.kneighbors_graph(mode='distance')
            duration = time() - t0
            if self.verbose:
                print("[t-SNE] Computed neighbors for {} samples "
                      "in {:.3f}s...".format(n_samples, duration))

            # Free the memory used by the ball_tree
            del knn

            if getattr(self, 'square_distances', True) is True or \
                    self.metric == "euclidean":
                # knn return the euclidean distance but we need it squared
                # to be consistent with the 'exact' method. Note that the
                # the method was derived using the euclidean method as in the
                # input space. Not sure of the implication of using a different
                # metric.
                distances_nn.data **= 2

            # compute the joint probability distribution for the input space
            P = _joint_probabilities_nn(distances_nn, self.perplexity,
                                        self.verbose)

        if isinstance(self.init, np.ndarray):
            X_embedded = self.init
        elif self.init == 'pca':
            pca = PCA(n_components=self.n_components,
                      svd_solver='randomized',
                      random_state=random_state)
            X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
        elif self.init == 'random':
            # The embedding is initialized with iid samples from Gaussians with
            # standard deviation 1e-4.
            X_embedded = 1e-4 * random_state.randn(
                n_samples, self.n_components).astype(np.float32)
        else:
            raise ValueError("'init' must be 'pca', 'random', or "
                             "a numpy array")

        # Degrees of freedom of the Student's t-distribution. The suggestion
        # degrees_of_freedom = n_components - 1 comes from
        # "Learning a Parametric Embedding by Preserving Local Structure"
        # Laurens van der Maaten, 2009.
        degrees_of_freedom = max(self.n_components - 1, 1)

        return self._tsne(P,
                          degrees_of_freedom,
                          n_samples,
                          X_embedded=X_embedded,
                          neighbors=neighbors_nn,
                          skip_num_points=skip_num_points)
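The check_non_negative call above is what rejects precomputed distance matrices containing negative entries. A minimal sketch of the supported path with scikit-learn's public TSNE (init='random' is used because init='pca' is disallowed with a precomputed metric):

import numpy as np
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).randn(30, 5)
D = pairwise_distances(X)              # square, symmetric, non-negative

emb = TSNE(n_components=2, metric='precomputed', init='random',
           perplexity=5, random_state=0).fit_transform(D)
print(emb.shape)                       # (30, 2)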
Example #21
def non_negative_factorization(X, W=None, H=None, n_components=None,
                               init='random', update_H=True, solver='cd',
                               beta_loss='frobenius', tol=1e-4,
                               max_iter=200, alpha=0., l1_ratio=0.,
                               regularization=None, random_state=None,
                               verbose=0, shuffle=False,
                               distribution='gaussian', N=None, D=None):
    r"""Compute Non-negative Matrix Factorization (NMF)
    Find two non-negative matrices (W, H) whose product approximates the non-
    negative matrix X. This factorization can be used for example for
    dimensionality reduction, source separation or topic extraction.
    The objective function is::
        0.5 * ||X - WH||_Fro^2
        + alpha * l1_ratio * ||vec(W)||_1
        + alpha * l1_ratio * ||vec(H)||_1
        + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2
        + 0.5 * alpha * (1 - l1_ratio) * ||H||_Fro^2
    Where::
        ||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm)
        ||vec(A)||_1 = \sum_{i,j} abs(A_{ij}) (Elementwise L1 norm)
    For multiplicative-update ('mu') solver, the Frobenius norm
    (0.5 * ||X - WH||_Fro^2) can be changed into another beta-divergence loss,
    by changing the beta_loss parameter.
    The objective function is minimized with an alternating minimization of W
    and H. If H is given and update_H=False, it solves for W only.
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Constant matrix.
    W : array-like, shape (n_samples, n_components)
        If init='custom', it is used as initial guess for the solution.
    H : array-like, shape (n_components, n_features)
        If init='custom', it is used as initial guess for the solution.
        If update_H=False, it is used as a constant, to solve for W only.
    n_components : integer
        Number of components, if n_components is not set all features
        are kept.
    init :  None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom'
        Method used to initialize the procedure.
        Default: 'random'.
        Valid options:
        - 'random': non-negative random matrices, scaled with:
            sqrt(X.mean() / n_components)
        - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
            initialization (better for sparseness)
        - 'nndsvda': NNDSVD with zeros filled with the average of X
            (better when sparsity is not desired)
        - 'nndsvdar': NNDSVD with zeros filled with small random values
            (generally faster, less accurate alternative to NNDSVDa
            for when sparsity is not desired)
        - 'custom': use custom matrices W and H
    update_H : boolean, default: True
        Set to True, both W and H will be estimated from initial guesses.
        Set to False, only W will be estimated.
    solver : 'cd' | 'mu'
        Numerical solver to use:
        'cd' is a Coordinate Descent solver that uses Fast Hierarchical
            Alternating Least Squares (Fast HALS).
        'mu' is a Multiplicative Update solver.
        .. versionadded:: 0.17
           Coordinate Descent solver.
        .. versionadded:: 0.19
           Multiplicative Update solver.
    beta_loss : float or string, default 'frobenius'
        String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}.
        Beta divergence to be minimized, measuring the distance between X
        and the dot product WH. Note that values different from 'frobenius'
        (or 2) and 'kullback-leibler' (or 1) lead to significantly slower
        fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input
        matrix X cannot contain zeros. Used only in 'mu' solver.
        .. versionadded:: 0.19
    tol : float, default: 1e-4
        Tolerance of the stopping condition.
    max_iter : integer, default: 200
        Maximum number of iterations before timing out.
    alpha : double, default: 0.
        Constant that multiplies the regularization terms.
    l1_ratio : double, default: 0.
        The regularization mixing parameter, with 0 <= l1_ratio <= 1.
        For l1_ratio = 0 the penalty is an elementwise L2 penalty
        (aka Frobenius Norm).
        For l1_ratio = 1 it is an elementwise L1 penalty.
        For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
    regularization : 'both' | 'components' | 'transformation' | None
        Select whether the regularization affects the components (H), the
        transformation (W), both or none of them.
    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    verbose : integer, default: 0
        The verbosity level.
    shuffle : boolean, default: False
        If true, randomize the order of coordinates in the CD solver.
    Returns
    -------
    W : array-like, shape (n_samples, n_components)
        Solution to the non-negative least squares problem.
    H : array-like, shape (n_components, n_features)
        Solution to the non-negative least squares problem.
    n_iter : int
        Actual number of iterations.
    Examples
    --------
    >>> import numpy as np
    >>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
    >>> from sklearn.decomposition import non_negative_factorization
    >>> W, H, n_iter = non_negative_factorization(X, n_components=2,
    ... init='random', random_state=0)
    References
    ----------
    Cichocki, Andrzej, and P. H. A. N. Anh-Huy. "Fast local algorithms for
    large scale nonnegative matrix and tensor factorizations."
    IEICE transactions on fundamentals of electronics, communications and
    computer sciences 92.3: 708-721, 2009.
    Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix
    factorization with the beta-divergence. Neural Computation, 23(9).
    """

    #print('My Non negative Factorization')

    X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)
    check_non_negative(X, "NMF (input X)")
    beta_loss = _check_string_param(solver, regularization, beta_loss, init)

    if safe_min(X) == 0 and beta_loss <= 0:
        raise ValueError("When beta_loss <= 0 and X contains zeros, "
                         "the solver may diverge. Please add small values to "
                         "X, or use a positive beta_loss.")

    n_samples, n_features = X.shape
    if n_components is None:
        n_components = n_features

    if not isinstance(n_components, INTEGER_TYPES) or n_components <= 0:
        raise ValueError("Number of components must be a positive integer;"
                         " got (n_components=%r)" % n_components)
    if not isinstance(max_iter, INTEGER_TYPES) or max_iter < 0:
        raise ValueError("Maximum number of iterations must be a positive "
                         "integer; got (max_iter=%r)" % max_iter)
    if not isinstance(tol, numbers.Number) or tol < 0:
        raise ValueError("Tolerance for stopping criteria must be "
                         "positive; got (tol=%r)" % tol)

    # check W and H, or initialize them
    if init == 'custom' and update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        _check_init(W, (n_samples, n_components), "NMF (input W)")
    elif not update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        # 'mu' solver should not be initialized by zeros
        if solver == 'mu':
            avg = np.sqrt(X.mean() / n_components)
            W = np.full((n_samples, n_components), avg)
        else:
            W = np.zeros((n_samples, n_components))
    else:
        W, H = _initialize_nmf(X, n_components, init=init,
                               random_state=random_state)


    l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization(
        alpha, l1_ratio, regularization)

    W, H, n_iter = update(X, W, H, max_iter, distribution, N, D)


    if n_iter == max_iter and tol > 0:
        warnings.warn("Maximum number of iteration %d reached. Increase it to"
                      " improve convergence." % max_iter, ConvergenceWarning)

    return W, H, n_iter
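
A note on the beta_loss values documented above: 'frobenius', 'kullback-leibler' and 'itakura-saito' correspond to beta = 2, 1 and 0. The helper below is only a minimal sketch of those standard element-wise definitions (the function name and the dense-array assumption are mine, not part of the snippet above); it also shows why X must not contain zeros when beta_loss <= 0.

import numpy as np

def beta_divergence_elementwise(x, y, beta):
    # Standard element-wise beta divergence:
    #   beta = 2 -> Frobenius, beta = 1 -> Kullback-Leibler, beta = 0 -> Itakura-Saito.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if beta == 2:
        return 0.5 * (x - y) ** 2
    if beta == 1:
        return x * np.log(x / y) - x + y
    if beta == 0:
        # Itakura-Saito divides by and takes the log of x/y, hence no zeros in x.
        return x / y - np.log(x / y) - 1
    return (x ** beta + (beta - 1) * y ** beta
            - beta * x * y ** (beta - 1)) / (beta * (beta - 1))

# For example, the Frobenius loss of a reconstruction would be
# beta_divergence_elementwise(X, W @ H, beta=2).sum().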
Example #22
0
def non_negative_factorization(X,
                               W,
                               H,
                               n_components=None,
                               update_H=True,
                               beta_loss='frobenius',
                               tol=1e-4,
                               max_iter=200,
                               alpha=0.,
                               l1_ratio=0.,
                               regularization=None,
                               random_state=None,
                               verbose=0,
                               shuffle=False):

    X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)
    check_non_negative(X, "NMF (input X)")
    beta_loss = _check_string_param(regularization, beta_loss)

    if X.min() == 0 and beta_loss <= 0:
        raise ValueError("When beta_loss <= 0 and X contains zeros, "
                         "the solver may diverge. Please add small values to "
                         "X, or use a positive beta_loss.")

    n_samples, n_features = X.shape
    if n_components is None:
        n_components = n_features

    if not isinstance(n_components, numbers.Integral) or n_components <= 0:
        raise ValueError("Number of components must be a positive integer;"
                         " got (n_components=%r)" % n_components)
    if not isinstance(max_iter, numbers.Integral) or max_iter < 0:
        raise ValueError("Maximum number of iterations must be a positive "
                         "integer; got (max_iter=%r)" % max_iter)
    if not isinstance(tol, numbers.Number) or tol < 0:
        raise ValueError("Tolerance for stopping criteria must be "
                         "positive; got (tol=%r)" % tol)

    if H is not None and W is not None:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        _check_init(W, (n_samples, n_components), "NMF (input W)")
    elif H is not None and W is None:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        if H.dtype != X.dtype:
            raise TypeError("H should have the same dtype as X. Got H.dtype = "
                            "{}.".format(H.dtype))
        W = np.zeros((n_samples, n_components), dtype=X.dtype)
    elif H is None and W is None:
        avg = np.sqrt(X.mean() / n_components)
        rng = check_random_state(random_state)
        H = avg * rng.randn(n_components, n_features).astype(X.dtype,
                                                             copy=False)
        W = avg * rng.randn(n_samples, n_components).astype(X.dtype,
                                                            copy=False)
        np.abs(H, out=H)
        np.abs(W, out=W)

    W, H, n_iter = _fit_coordinate_descent(X,
                                           W,
                                           H,
                                           tol,
                                           max_iter,
                                           update_H=update_H,
                                           verbose=verbose,
                                           shuffle=shuffle,
                                           random_state=random_state)

    if n_iter == max_iter and tol > 0:
        warnings.warn(
            "Maximum number of iteration %d reached. Increase it to"
            " improve convergence." % max_iter, ConvergenceWarning)

    return W, H, n_iter
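
For reference, when both W and H are None the snippet above initializes them with folded Gaussian noise scaled by sqrt(X.mean() / n_components). A standalone sketch of that scaling (array sizes are arbitrary, for illustration only):

import numpy as np

rng = np.random.RandomState(0)
X = np.abs(rng.randn(6, 4))                      # any non-negative data matrix
n_components = 2
avg = np.sqrt(X.mean() / n_components)
W = np.abs(avg * rng.randn(X.shape[0], n_components))   # non-negative random factors
H = np.abs(avg * rng.randn(n_components, X.shape[1]))
print(W.shape, H.shape)                          # (6, 2) (2, 4)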
Example #23
0
def non_negative_factorization(lam,
                               N,
                               D,
                               X,
                               W=None,
                               H=None,
                               n_components=None,
                               init='warn',
                               update_H=True,
                               solver='mu',
                               beta_loss='KL',
                               tol=1e-4,
                               max_iter=400,
                               alpha=0.,
                               l1_ratio=0.,
                               regularization=None,
                               random_state=None,
                               verbose=0,
                               shuffle=False):

    X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)
    check_non_negative(X, "NMF (input X)")
    beta_loss = _check_string_param(solver, regularization, beta_loss, init)

    if safe_min(X) == 0 and beta_loss <= 0:
        raise ValueError("When beta_loss <= 0 and X contains zeros, "
                         "the solver may diverge. Please add small values to "
                         "X, or use a positive beta_loss.")

    n_samples, n_features = X.shape
    if n_components is None:
        n_components = n_features

    if not isinstance(n_components, INTEGER_TYPES) or n_components <= 0:
        raise ValueError("Number of components must be a positive integer;"
                         " got (n_components=%r)" % n_components)
    if not isinstance(max_iter, INTEGER_TYPES) or max_iter < 0:
        raise ValueError("Maximum number of iterations must be a positive "
                         "integer; got (max_iter=%r)" % max_iter)
    if not isinstance(tol, numbers.Number) or tol < 0:
        raise ValueError("Tolerance for stopping criteria must be "
                         "positive; got (tol=%r)" % tol)

    if init == "warn":
        if n_components < n_features:
            warnings.warn(
                "The default value of init will change from "
                "random to None in 0.23 to make it consistent "
                "with decomposition.NMF.", FutureWarning)
        init = "random"

    # check W and H, or initialize them
    if init == 'custom' and update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")
        _check_init(W, (n_samples, n_components), "NMF (input W)")
    elif not update_H:
        _check_init(H, (n_components, n_features), "NMF (input H)")

        avg = np.sqrt(X.mean() / n_components)
        W = np.full((n_samples, n_components), avg)

    else:
        W, H = _initialize_nmf(X,
                               n_components,
                               init=init,
                               random_state=random_state)

    l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization(
        alpha, l1_ratio, regularization)

    W, H, n_iter = _fit_multiplicative_update(lam, N, D, X, W, H, beta_loss,
                                              max_iter, tol, l1_reg_W,
                                              l1_reg_H, l2_reg_W, l2_reg_H,
                                              update_H, verbose)

    if n_iter == max_iter and tol > 0:
        warnings.warn(
            "Maximum number of iteration %d reached. Increase it to"
            " improve convergence." % max_iter, ConvergenceWarning)

    return W, H, n_iter
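
The actual optimization above is delegated to _fit_multiplicative_update (not shown here, and extended with the extra lam, N and D arguments). For orientation only, a textbook Lee-Seung multiplicative-update step for the Kullback-Leibler loss, without regularization or those extras, looks roughly like this:

import numpy as np

def kl_mu_step(X, W, H, eps=1e-10):
    # One classic multiplicative update for the KL divergence; eps guards
    # against division by zero. This is the generic rule, not the customized
    # solver called above.
    WH = W @ H + eps
    H = H * (W.T @ (X / WH)) / (W.sum(axis=0)[:, None] + eps)
    WH = W @ H + eps
    W = W * ((X / WH) @ H.T) / (H.sum(axis=1)[None, :] + eps)
    return W, H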
Example #24
0
def daal_pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None,
                            force_all_finite=True, **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix inputs.
      ['nan_euclidean'] is also available, but it does not yet support
      sparse matrices.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them in
        parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accept both np.inf and np.nan in array.
        - 'allow-nan': accept only np.nan values in array. Values cannot
          be infinite.

        .. versionadded:: 0.22

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    See also
    --------
    pairwise_distances_chunked : performs the same calculation as this
        function, but returns a generator of chunks of the distance matrix, in
        order to limit memory usage.
    paired_distances : Computes the distances between corresponding
                       elements of two arrays
    """
    if metric not in _VALID_METRICS and not callable(metric) and metric != "precomputed":
        raise ValueError("Unknown metric %s. Valid metrics are %s, or 'precomputed', "
                         "or a callable" % (metric, _VALID_METRICS))

    X = _daal_check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                          force_all_finite=force_all_finite)

    _patching_status = PatchingConditionsChain(
        "sklearn.metrics.pairwise_distances")
    _dal_ready = _patching_status.and_conditions([
        (metric == 'cosine' or metric == 'correlation',
            f"'{metric}' metric is not supported. "
            "Only 'cosine' and 'correlation' metrics are supported."),
        (Y is None, "Second feature array is not supported."),
        (not issparse(X), "X is sparse. Sparse input is not supported."),
        (X.dtype == np.float64,
            f"{X.dtype} X data type is not supported. Only np.float64 is supported.")
    ])
    _patching_status.write_log()
    if _dal_ready:
        if metric == 'cosine':
            return _daal4py_cosine_distance_dense(X)
        if metric == 'correlation':
            return _daal4py_correlation_distance_dense(X)
        raise ValueError(f"'{metric}' distance is wrong for daal4py.")
    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True,
                                     force_all_finite=force_all_finite)
        whom = ("`pairwise_distances`. Precomputed distances "
                "need to have non-negative values.")
        check_non_negative(X, whom=whom)
        return X
    if metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric,
                       force_all_finite=force_all_finite, **kwds)
    else:
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        if dtype == bool and (X.dtype != bool or (Y is not None and Y.dtype != bool)):
            msg = "Data was converted to boolean for metric %s" % metric
            warnings.warn(msg, DataConversionWarning)

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype,
                                     force_all_finite=force_all_finite)

        # precompute data-derived metric params
        params = _precompute_metric_params(X, Y, metric=metric, **kwds)
        kwds.update(**params)

        if effective_n_jobs(n_jobs) == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
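
A usage sketch for the variant above, assuming it has been patched in place of sklearn.metrics.pairwise_distances (as the daal4py/scikit-learn-intelex patching is meant to do): a dense float64 array with metric='cosine' and no Y satisfies every condition checked through PatchingConditionsChain, so the daal4py kernel is used.

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(5, 3).astype(np.float64)
D = pairwise_distances(X, metric="cosine")   # eligible for _daal4py_cosine_distance_dense
print(D.shape)                               # (5, 5)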
Example #25
0
def daal_pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix inputs.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.

    n_jobs : int
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them in
        parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.

    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    """
    if (metric not in _VALID_METRICS
        and not callable(metric)
        and metric != "precomputed"):
        raise ValueError("Unknown metric %s. "
                         "Valid metrics are %s, or 'precomputed', or a "
                         "callable" % (metric, _VALID_METRICS))

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True)
        whom = ("`pairwise_distances`. Precomputed distances "
                "need to have non-negative values.")
        check_non_negative(X, whom=whom)
        return X
    elif ((metric == 'cosine') and (Y is None)
          and (not issparse(X)) and X.dtype == np.float64):
        return _daal4py_cosine_distance_dense(X)
    elif ((metric == 'correlation') and (Y is None) and
          (not issparse(X)) and X.dtype == np.float64):
        return _daal4py_correlation_distance_dense(X)
    elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric, **kwds)
    else:
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        if (dtype == bool
            and (X.dtype != bool or (Y is not None and Y.dtype != bool))):
            msg = "Data was converted to boolean for metric %s" % metric
            warnings.warn(msg, DataConversionWarning)

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype)

        # precompute data-derived metric params
        params = _precompute_metric_params(X, Y, metric=metric, **kwds)
        kwds.update(**params)

        if effective_n_jobs(n_jobs) == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
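
In both variants the metric='precomputed' branch only validates the matrix with check_non_negative and returns it unchanged. A minimal illustration, using the stock sklearn entry point for clarity:

import numpy as np
from sklearn.metrics import pairwise_distances

D = np.array([[0.0, 1.0],
              [1.0, 0.0]])
print(pairwise_distances(D, metric="precomputed"))   # returned as-is
# A negative entry would raise a ValueError from check_non_negative.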
Example #26
0
    def _fit(self, X, skip_num_points=0):
        """Private function to fit the model using X as training data."""

        X = self._validate_data(X,
                                accept_sparse=['csr', 'csc', 'coo'],
                                dtype=[np.float32, np.float64])
        if self.metric == "precomputed":
            if isinstance(self.init, str) and self.init == 'pca':
                raise ValueError("The parameter init=\"pca\" cannot be "
                                 "used with metric=\"precomputed\".")
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square distance matrix")

            check_non_negative(
                X, "QSNE.fit(). With metric='precomputed', X "
                "should contain positive distances.")

            if issparse(X):
                raise TypeError('QSNE does not accept sparse '
                                'precomputed distance matrix.')

        random_state = check_random_state(self.random_state)

        if self.early_exaggeration < 1.0:
            raise ValueError(
                "early_exaggeration must be at least 1, but is {}".format(
                    self.early_exaggeration))

        if self.n_iter < 250:
            raise ValueError("n_iter should be at least 250")

        n_samples = X.shape[0]

        neighbors_nn = None
        # Retrieve the distance matrix, either using the precomputed one or
        # computing it.
        if self.metric == "precomputed":
            distances = X
        else:
            if self.verbose:
                print("[q-SNE] Computing pairwise distances...")

            if self.metric == "euclidean":
                distances = pairwise_distances(X,
                                               metric=self.metric,
                                               squared=True)
            else:
                distances = pairwise_distances(X,
                                               metric=self.metric,
                                               n_jobs=self.n_jobs)

            if np.any(distances < 0):
                raise ValueError("All distances should be positive, the "
                                 "metric given is not correct")

        # compute the joint probability distribution for the input space
        P = _joint_probabilities(distances, self.perplexity, self.verbose)
        assert np.all(np.isfinite(P)), "All probabilities should be finite"
        assert np.all(P >= 0), "All probabilities should be non-negative"
        assert np.all(P <= 1), ("All probabilities should be less "
                                "than or equal to one")

        if isinstance(self.init, np.ndarray):
            X_embedded = self.init
        elif self.init == 'pca':
            pca = PCA(n_components=self.n_components,
                      svd_solver='randomized',
                      random_state=random_state)
            X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
        elif self.init == 'random':
            # The embedding is initialized with iid samples from Gaussians with
            # standard deviation 1e-4.
            X_embedded = 1e-4 * random_state.randn(
                n_samples, self.n_components).astype(np.float32)
        else:
            raise ValueError("'init' must be 'pca', 'random', or "
                             "a numpy array")

        return self._qsne(P,
                          self.q,
                          n_samples,
                          X_embedded=X_embedded,
                          neighbors=neighbors_nn,
                          skip_num_points=skip_num_points)
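
For the metric='precomputed' path in _fit above, the input must be a dense, square, non-negative distance matrix (and init cannot be 'pca'). A small sketch of constructing such an input:

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(10, 4)
D = pairwise_distances(X, metric="euclidean")        # dense, square, non-negative
assert D.shape[0] == D.shape[1] and np.all(D >= 0)   # the checks performed in _fit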