def fit_analyze_nmf(self):
        '''
        Fit NMF on the input matrix,
        print fit statistics,
        and write summary.html and model.html.
        '''

        #idx_df = list(self.df.index)

        t0 = time.time()  # time it
        self.W, self.H, nmf_model = run_nmf(self.X2, self.kw_nmf)
        self.nmf_model = nmf_model
        t1 = time.time()  # time it
        print "finished in  %4.4fmin for %s " % ((t1 - t0) / 60, 'run_nmf')
        print 'w ', self.W.shape, 'h', self.H.shape

        W_t = self.W.T

        print(self.W.shape, W_t.shape)
        # print(type(W_t))
        print(len(W_t[0, :]), W_t[0, :10])
        print('range for W: (%.2f - %.2f); range for H: (%.2f - %.2f)' %
              (np.min(self.W), np.max(self.W), np.min(self.H), np.max(self.H)))

        self.topic_terms = get_top_topics_terms(
            self.vectorizer, self.H, k_top_words=n_top_terms)  # n_top_terms assumed defined at module level

        self.print_topic_results_html()
        self.plot_hist_weight_best_topic_per_article()
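# For context, a hypothetical sketch of the two helpers the code above and
# below assumes (run_nmf and get_top_topics_terms), built here on scikit-learn;
# the project's actual helpers may differ in signature and behavior.
import numpy as np
from sklearn.decomposition import NMF


def run_nmf(X, kw_nmf=None):
    # Fit NMF on X and return (W, H, model):
    # W is (n_docs, k) document-topic weights, H is (k, n_terms) topic-term weights.
    model = NMF(**(kw_nmf or {}))
    W = model.fit_transform(X)
    H = model.components_
    return W, H, model


def get_top_topics_terms(vectorizer, H, k_top_words=10):
    # For each topic (row of H), return the k_top_words highest-weighted terms.
    terms = np.asarray(vectorizer.get_feature_names_out())
    return {i: terms[np.argsort(row)[::-1][:k_top_words]]
            for i, row in enumerate(H)}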
def test_run_nmf_nokw():
    print('\nfunction: %s' % inspect.stack()[0][3])
    nmf_max_iter = 6000
    model_name = 'run3_1'
    #func_stemmer = PorterStemmer()
    #func_tokenizer = word_tokenize
    # kw_tfidf = {'max_df': 0.90, 'stop_words': 'english', 'min_df': 10,\
    #            'tokenizer': func_tokenizer, 'ngram_range':(1,3)}
    kw_nmf = {'n_components': 2, 'max_iter': nmf_max_iter}
    X = np.array([random.random() for i in range(20)]).reshape((4, 5))
    print(X)
    W, H, nmf = run_nmf(X)  # defaults only; kw_nmf deliberately not passed
    print(W)
    n.assert_equal(W.shape[1], 2)  # assumes run_nmf defaults to 2 components
Example #3
def nmf(
    data: Union[MultimodalData, UnimodalData],
    n_components: int = 20,
    features: str = "highly_variable_features",
    space: str = "log",
    init: str = "nndsvdar",
    algo: str = "halsvar",
    mode: str = "batch",
    tol: float = 1e-4,
    use_gpu: bool = False,
    alpha_W: float = 0.0,
    l1_ratio_W: float = 0.0,
    alpha_H: float = 0.0,
    l1_ratio_H: float = 0.0,
    fp_precision: str = "float",
    n_jobs: int = -1,
    random_state: int = 0,
) -> None:
    """Perform Nonnegative Matrix Factorization (NMF) to the data using Frobenius norm. Steps include select features and L2 normalization and NMF and L2 normalization of resulting coordinates.

    The calculation uses `nmf-torch <https://github.com/lilab-bcb/nmf-torch>`_ package.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    n_components: ``int``, optional, default: ``20``.
        Number of NMF components to compute.

    features: ``str``, optional, default: ``"highly_variable_features"``.
        Keyword in ``data.var`` specifying the features used for NMF.

    space: ``str``, optional, default: ``log``.
        Choose from ``log`` and ``expression``. ``log`` works on log-transformed expression space; ``expression`` works on the original expression space (normalized by total UMIs).

    init: ``str``, optional, default: ``nndsvdar``.
        Method to initialize NMF. Options are ``random``, ``nndsvd``, ``nndsvda`` and ``nndsvdar``.

    algo: ``str``, optional, default: ``halsvar``
        Choose from ``mu`` (Multiplicative Update), ``hals`` (Hierarchical Alternating Least Squares), ``halsvar`` (a HALS variant that mimics ``bpp`` and can sometimes achieve better convergence) and ``bpp`` (alternating non-negative least squares with the Block Principal Pivoting method).

    mode: ``str``, optional, default: ``batch``
        Learning mode. Choose from ``batch`` and ``online``. Note that ``online`` only works when ``beta=2.0``; for any other beta loss, it falls back to the ``batch`` method.

    tol: ``float``, optional, default: ``1e-4``
        The tolerance used for the convergence check.

    use_gpu: ``bool``, optional, default: ``False``
        If ``True``, use GPU if available. Otherwise, use CPU only.

    alpha_W: ``float``, optional, default: ``0.0``
        A numeric scale factor which multiplies the regularization terms related to W.
        If zero or negative, no regularization regarding W is considered.

    l1_ratio_W: ``float``, optional, default: ``0.0``
        The ratio of the L1 penalty on W; must be between 0 and 1. The ratio of the L2 penalty on W is then (1 - l1_ratio_W).

    alpha_H: ``float``, optional, default: ``0.0``
        A numeric scale factor which multiplies the regularization terms related to H.
        If zero or negative, no regularization regarding H is considered.

    l1_ratio_H: ``float``, optional, default: ``0.0``
        The ratio of the L1 penalty on H; must be between 0 and 1. The ratio of the L2 penalty on H is then (1 - l1_ratio_H).

    fp_precision: ``str``, optional, default: ``float``
        The numeric precision on the results. Choose from ``float`` and ``double``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. ``-1`` refers to using all physical CPU cores.

    random_state: ``int``, optional, default: ``0``.
        Random seed for reproducibility.

    Returns
    -------
    ``None``.

    Update ``data.obsm``:

        * ``data.obsm["X_nmf"]``: Scaled NMF coordinates of shape ``(n_cells, n_components)``. Each column has a unit variance.

        * ``data.obsm["H"]``: The coordinate factor matrix of shape ``(n_cells, n_components)``.

    Update ``data.uns``:

        * ``data.uns["W"]``: The feature factor matrix of shape ``(n_HVFs, n_components)``.

        * ``data.uns["nmf_err"]``: The NMF loss.

        * ``data.uns["nmf_features"]``: Record the features used to perform NMF analysis.

    Examples
    --------
    >>> pg.nmf(data)
    """
    X = _select_and_scale_features(data, features=features, space=space)

    try:
        from nmf import run_nmf
    except ImportError as e:
        import sys
        logger.error(f"{e}\nNeed NMF-Torch! Try 'pip install nmf-torch'.")
        sys.exit(-1)

    H, W, err = run_nmf(
        X,
        n_components=n_components,
        init=init,
        algo=algo,
        mode=mode,
        tol=tol,
        n_jobs=eff_n_jobs(n_jobs),
        random_state=random_state,
        use_gpu=use_gpu,
        alpha_W=alpha_W,
        l1_ratio_W=l1_ratio_W,
        alpha_H=alpha_H,
        l1_ratio_H=l1_ratio_H,
        fp_precision=fp_precision,
    )

    data.uns["nmf_features"] = features  # record which feature to use
    data.uns["W"] = np.ascontiguousarray(
        W.T, dtype=np.float32
    )  # cannot be varm because numbers of features are not the same
    data.uns["nmf_err"] = err

    data.obsm["H"] = np.ascontiguousarray(H, dtype=np.float32)
    H = data.obsm["H"]
    data.obsm["X_nmf"] = H / np.linalg.norm(H, axis=0)
Example #4
def run_test(filename,
             algo,
             mode,
             k,
             n_jobs,
             fp=None,
             init='nndsvdar',
             loss='frobenius',
             tol=1e-4,
             max_iter=200,
             random_state=0,
             alpha=0.0,
             l1_ratio=0.0,
             chunk_size=5000):
    X = np.load(filename)
    print(X.shape)

    if loss == 'kullback-leibler':
        beta = 1
    elif loss == 'frobenius':
        beta = 2
    elif loss == 'itakura-saito':
        beta = 0
    else:
        raise ValueError("Beta loss not supported!")

    #if method == 'sklearn mu':
    #    model = sd.NMF(n_components=k, init=init, beta_loss=loss, tol=tol, max_iter=max_iter, random_state=random_state, solver='mu',
    #                alpha=alpha, l1_ratio=l1_ratio)
    #    with threadpool_limits(limits=n_jobs):
    #        ts_start = time.time()
    #        W1 = model.fit_transform(X)
    #        ts_end = time.time()
    #    H1 = model.components_
    #    err = beta_loss(torch.tensor(X), torch.tensor(W1 @ H1), torch.tensor(W1), torch.tensor(H1),
    #                     l1_reg_H=alpha*l1_ratio, l2_reg_H=alpha*(1-l1_ratio),
    #                     l1_reg_W=alpha*l1_ratio, l2_reg_W=alpha*(1-l1_ratio),
    #                     beta=beta, epsilon=EPSILON)
    #    print(f"H has {np.sum(W1!=0)} non-zero elements, W has {np.sum(H1!=0)} non-zero elements. Iterations: {model.n_iter_}.")
    #elif method == 'sklearn cd':
    #    model = sd.NMF(n_components=k, init=init, beta_loss=loss, tol=tol, max_iter=max_iter, random_state=random_state, solver='cd',
    #                alpha=alpha, l1_ratio=l1_ratio)
    #    with threadpool_limits(limits=n_jobs):
    #        ts_start = time.time()
    #        W1 = model.fit_transform(X)
    #        ts_end = time.time()
    #    H1 = model.components_
    #    err = beta_loss(torch.tensor(X), torch.tensor(W1 @ H1), torch.tensor(W1), torch.tensor(H1),
    #                     l1_reg_H=alpha*l1_ratio, l2_reg_H=alpha*(1-l1_ratio),
    #                     l1_reg_W=alpha*l1_ratio, l2_reg_W=alpha*(1-l1_ratio),
    #                     beta=beta, epsilon=EPSILON)
    #    err_double = beta_loss(torch.tensor(X), torch.tensor(W1 @ H1), torch.tensor(W1), torch.tensor(H1),
    #                     l1_reg_H=alpha*l1_ratio, l2_reg_H=alpha*(1-l1_ratio),
    #                     l1_reg_W=alpha*l1_ratio, l2_reg_W=alpha*(1-l1_ratio),
    #                     beta=beta, epsilon=EPSILON, dtype='double')
    #    print(f"H has {np.sum(W1!=0)} non-zero elements, W has {np.sum(H1!=0)} non-zero elements. Iterations: {model.n_iter_}.")
    ts_start = time.time()
    H, W, err = run_nmf(X,
                        k,
                        init=init,
                        beta_loss=loss,
                        algo=algo,
                        mode=mode,
                        tol=tol,
                        n_jobs=n_jobs,
                        random_state=random_state,
                        alpha_W=alpha,
                        l1_ratio_W=l1_ratio,
                        alpha_H=alpha,
                        l1_ratio_H=l1_ratio,
                        fp_precision='float')
    ts_end = time.time()
    err_confirm = beta_loss(torch.tensor(X),
                            torch.tensor(H @ W),
                            torch.tensor(H),
                            torch.tensor(W),
                            beta=beta,
                            epsilon=EPSILON,
                            l1_reg_H=alpha * l1_ratio,
                            l2_reg_H=alpha * (1 - l1_ratio),
                            l1_reg_W=alpha * l1_ratio,
                            l2_reg_W=alpha * (1 - l1_ratio))
    print(
        f"{algo} {mode} takes {ts_end - ts_start} s, with error {err} ({err_confirm} confirmed)."
    )
    if fp is not None:
        fp.write(f"{algo} {mode},{ts_end - ts_start} s,{err}\n")
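# Hypothetical driver for the benchmark above; the .npy path and the parameter
# grid are placeholders, not from the original script.
if __name__ == '__main__':
    with open('nmf_timings.csv', 'w') as fp:
        for algo in ('hals', 'halsvar', 'bpp'):
            for mode in ('batch', 'online'):
                run_test('dense_counts.npy', algo, mode, k=20, n_jobs=8, fp=fp)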