def nmf_topics(X, k, **kwargs):
    """Take a bootstrap sample from a corpus of documents and fit the sample
    using NMF to give a set of topic vectors, normalized such that the (z, w)
    entry of the returned array is the probability P(w|z) of word w occurring
    given the zth topic.

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The bag-of-words representation of the corpus of documents.

    k: int
        The number of topics to generate.

    kwargs:
        Further keyword arguments that can be passed on to the ``NMF``
        class. Possibilities include:
            * ``init``
            * ``beta_loss``
            * ``alpha``
            * ``solver``

        In addition, ``bootstrap`` (default True) controls whether a
        bootstrap sample is drawn before fitting, and ``random_state``
        seeds both the sampling and the NMF fit.

    Returns
    -------
    topics: array of shape (k, n_words)
        The topics generated from the bootstrap sample.
    """
    A = X.tocsr()

    if kwargs.get("bootstrap", True):
        # Sample n_docs rows with replacement to form the bootstrap sample
        rng = check_random_state(kwargs.get("random_state", None))
        bootstrap_sample_indices = rng.randint(0, A.shape[0], size=A.shape[0])
        B = A[bootstrap_sample_indices]
    else:
        B = A

    nmf = NMF(
        n_components=k,
        init=kwargs.get("init", "nndsvd"),
        beta_loss=kwargs.get("beta_loss", 1),
        alpha=kwargs.get("alpha", 0.0),
        solver=kwargs.get("solver", "mu"),
        random_state=kwargs.get("random_state", None),
    ).fit(B)

    # Row-normalize in place so each topic is a probability distribution P(w|z)
    topics = nmf.components_.copy()
    normalize(topics, axis=1)

    return topics
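# A minimal usage sketch for ``nmf_topics``, assuming scipy and scikit-learn
# are installed; the 100x500 random count matrix is purely illustrative
# stand-in data:
#
#     from scipy.sparse import rand as sparse_rand
#     X = sparse_rand(100, 500, density=0.05, format="csr", random_state=0)
#     topics = nmf_topics(X, k=5, random_state=42)
#     # topics has shape (5, 500); each row sums to 1, giving P(w|z)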
def plsa_refit(
    X,
    topics,
    sample_weight,
    n_iter=50,
    n_iter_per_test=10,
    tolerance=0.005,
    e_step_thresh=1e-32,
    random_state=None,
):
    """Routine for refitting values of P(z|d) given a fixed set of topics
    (i.e. P(w|z)). This allows fitting document vectors to a predefined set
    of topics (given, for example, by an ensemble result).

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The data matrix pLSA is attempting to fit to.

    topics: array of shape (n_topics, n_words)
        The fixed topics against which to fit the values of P(z|d).

    sample_weight: array of shape (n_docs,)
        Input document weights.

    n_iter: int
        The maximum number of iterations of EM to perform.

    n_iter_per_test: int
        The number of iterations between tests for relative improvement in
        log-likelihood.

    tolerance: float
        The threshold of relative improvement in log-likelihood required to
        continue iterations.

    e_step_thresh: float (optional, default=1e-32)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E
        step falls below this threshold then write a zero for P(z|w,d).

    random_state: int, RandomState instance or None, (optional, default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in initialization.

    Returns
    -------
    p_z_given_d: array of shape (n_docs, n_topics)
        The resulting model values of P(z|d).
    """
    A = X.tocoo().astype(np.float32)
    k = topics.shape[0]

    # Randomly initialize P(z|d) and row-normalize it into distributions
    rng = check_random_state(random_state)
    p_z_given_d = rng.rand(A.shape[0], k)
    normalize(p_z_given_d, axis=1)
    p_z_given_d = p_z_given_d.astype(np.float32)
    topics = topics.astype(np.float32)

    p_z_given_d = plsa_refit_inner(
        A.row,
        A.col,
        A.data,
        topics,
        p_z_given_d,
        sample_weight,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
    )

    return p_z_given_d
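# A minimal usage sketch for ``plsa_refit``, assuming ``topics`` is a
# row-normalized array of fixed topics (e.g. from ``nmf_topics`` above) and
# ``X`` is the bag-of-words matrix; uniform sample weights are a reasonable
# default when documents are equally important:
#
#     import numpy as np
#     weights = np.ones(X.shape[0], dtype=np.float32)
#     doc_vectors = plsa_refit(X, topics, weights, n_iter=50)
#     # doc_vectors[d] is the fitted distribution P(z|d) for document d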
def plsa_init(X, k, init="random", rng=np.random):
    """Initialize matrices for pLSA. Specifically, given data X, a number of
    topics k, and an initialization method, compute matrices for P(z|d) and
    P(w|z) that can be used to begin an EM optimization of pLSA.

    Various initialization approaches are available. The most
    straightforward is "random", which randomly initializes values for
    P(z|d) and P(w|z) and normalizes to make them probabilities. A second
    approach, borrowing from sklearn's NMF implementation, is to use a
    non-negative SVD approach ("nndsvd"). A third option is to use the fast
    coordinate descent, Frobenius-loss version of NMF and then normalize to
    make probabilities ("nmf"). Finally, if the ``init`` parameter is a
    tuple of ndarrays then these will be used, allowing for custom
    user-defined initializations.

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The data matrix pLSA is attempting to fit to.

    k: int
        The number of topics for pLSA to fit with.

    init: string or tuple (optional, default="random")
        The initialization method to use. This should be one of:
            * ``"random"``
            * ``"nndsvd"``
            * ``"nmf"``
        or a tuple of two ndarrays of shape (n_docs, n_topics) and
        (n_topics, n_words).

    rng: RandomState instance (optional, default=np.random)
        Seeded randomness generator. Used for random initialization.

    Returns
    -------
    p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words)
        Initialized arrays suitable for passing to pLSA optimization methods.
    """
    n = X.shape[0]
    m = X.shape[1]

    if init == "random":
        p_w_given_z = rng.rand(k, m)
        p_z_given_d = rng.rand(n, k)
    elif init == "nndsvd":
        # Taken from sklearn NMF implementation
        U, S, V = randomized_svd(X, k)
        p_z_given_d, p_w_given_z = np.zeros(U.shape), np.zeros(V.shape)

        # The leading singular triplet is non-negative
        # so it can be used as is for initialization.
        p_z_given_d[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])
        p_w_given_z[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])

        for j in range(1, k):
            x, y = U[:, j], V[j, :]

            # extract positive and negative parts of column vectors
            x_p, y_p = np.maximum(x, 0), np.maximum(y, 0)
            x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0))

            # and their norms
            x_p_nrm, y_p_nrm = norm(x_p), norm(y_p)
            x_n_nrm, y_n_nrm = norm(x_n), norm(y_n)

            m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm

            # choose update
            if m_p > m_n:
                u = x_p / x_p_nrm
                v = y_p / y_p_nrm
                sigma = m_p
            else:
                u = x_n / x_n_nrm
                v = y_n / y_n_nrm
                sigma = m_n

            lbd = np.sqrt(S[j] * sigma)
            p_z_given_d[:, j] = lbd * u
            p_w_given_z[j, :] = lbd * v
    elif init == "nmf":
        p_z_given_d, p_w_given_z, _ = non_negative_factorization(
            X,
            n_components=k,
            init="nndsvd",
            solver="cd",
            beta_loss=2,
            tol=1e-2,
            max_iter=100,
        )
    elif isinstance(init, (tuple, list)):
        p_z_given_d, p_w_given_z = init
    else:
        raise ValueError("Unrecognized init {}".format(init))

    # Normalize rows so both matrices are probability distributions
    normalize(p_w_given_z, axis=1)
    normalize(p_z_given_d, axis=1)

    return p_z_given_d, p_w_given_z
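# A minimal usage sketch for ``plsa_init``; the custom-initialization branch
# accepts any pair of non-negative arrays of the right shapes (the arrays
# W0 and H0 below are hypothetical):
#
#     p_z_given_d, p_w_given_z = plsa_init(X, k=5, init="nndsvd")
#     # p_z_given_d: (n_docs, 5), rows sum to 1; p_w_given_z: (5, n_words)
#
#     p_z_given_d, p_w_given_z = plsa_init(X, k=5, init=(W0, H0))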
def ensemble_fit(
    X,
    estimated_n_topics=10,
    model="plsa",
    init="random",
    min_samples=3,
    min_cluster_size=4,
    n_starts=16,
    n_jobs=8,
    parallelism="dask",
    topic_combination="hellinger_umap",
    n_iter=100,
    n_iter_per_test=10,
    tolerance=0.001,
    e_step_thresh=1e-16,
    lift_factor=1,
    beta_loss=1,
    alpha=0.0,
    solver="mu",
    random_state=None,
):
    """Generate a set of stable topics by using an ensemble of topic models
    and then clustering the results and generating representative topics for
    each cluster. Then generate a set of document vectors based on the
    selected stable topics.

    Parameters
    ----------
    X: array or sparse matrix of shape (n_docs, n_words)
        The bag-of-words matrix for the corpus to train on.

    estimated_n_topics: int (optional, default=10)
        The estimated number of topics. Note that the final number of topics
        produced can be more or less than this value; it serves only as a
        suggestion to the algorithm of the approximate number of topics to
        use.

    model: string (optional, default="plsa")
        The topic modeling method to use (either "plsa" or "nmf").

    init: string or tuple (optional, default="random")
        The initialization method to use. This should be one of:
            * ``"random"``
            * ``"nndsvd"``
            * ``"nmf"``
        or a tuple of two ndarrays of shape (n_docs, n_topics) and
        (n_topics, n_words).

    min_samples: int (optional, default=3)
        The min_samples parameter to use for HDBSCAN clustering.

    min_cluster_size: int (optional, default=4)
        The min_cluster_size parameter to use for HDBSCAN clustering.

    n_starts: int (optional, default=16)
        The number of bootstrap sampled topic models to run -- the size of
        the ensemble.

    n_jobs: int (optional, default=8)
        The number of parallel jobs to run at a time.

    parallelism: string (optional, default="dask")
        The parallelism model to use. Should be one of "dask" or "joblib".

    topic_combination: string (optional, default="hellinger_umap")
        The method of combining ensemble topics into a set of stable topics.
        Should be one of:
            * ``"hellinger_umap"``
            * ``"hellinger"``
            * ``"kl_divergence"``

    n_iter: int
        The maximum number of iterations of EM to perform.

    n_iter_per_test: int
        The number of iterations between tests for relative improvement in
        log-likelihood.

    tolerance: float
        The threshold of relative improvement in log-likelihood required to
        continue iterations.

    e_step_thresh: float (optional, default=1e-16)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E
        step falls below this threshold then write a zero for P(z|w,d).

    lift_factor: int (optional, default=1)
        Importance factor to apply to lift -- if high lift values are
        important to you then larger lift factors will be beneficial.

    beta_loss: float or string, (optional, default=1)
        The beta loss to use if using NMF for topic modeling.

    alpha: float (optional, default=0.0)
        The alpha parameter defining regularization if using NMF for topic
        modeling.

    solver: string, (optional, default="mu")
        The choice of solver if using NMF for topic modeling. Should be
        either "cd" or "mu".

    random_state: int, RandomState instance or None, (optional, default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in initialization.

    Returns
    -------
    doc_vectors, stable_topics: arrays of shape (n_docs, M) and (M, n_words)
        The vectors giving the probability of topics for each document, and
        the stable topics produced by the ensemble.
    """
    X = check_array(X, accept_sparse="csr")

    if issparse(X):
        X_coo = X.tocoo()
    else:
        X_coo = coo_matrix(X)

    # Lift is applied to the combined stable topics below, so the individual
    # ensemble runs are generated with lift_factor=1.
    all_topics = ensemble_of_topics(
        X_coo,
        estimated_n_topics,
        model,
        n_jobs,
        n_starts,
        parallelism,
        init=init,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
        lift_factor=1,
        beta_loss=beta_loss,
        alpha=alpha,
        solver=solver,
        random_state=random_state,
    )

    if topic_combination in _topic_combiner:
        cluster_topics = _topic_combiner[topic_combination]
    else:
        raise ValueError(
            "topic_combination must be one of {}".format(tuple(_topic_combiner.keys()))
        )

    stable_topics = cluster_topics(all_topics, min_samples, min_cluster_size)

    if lift_factor != 1:
        stable_topics **= lift_factor
        normalize(stable_topics, axis=1)

    if model == "plsa":
        doc_vectors = plsa_refit(
            X,
            stable_topics,
            np.ones(X.shape[0], dtype=np.float32),  # uniform document weights
            e_step_thresh=e_step_thresh,
            random_state=random_state,
        )
    elif model == "nmf":
        doc_vectors, _, _ = non_negative_factorization(
            X,
            H=stable_topics,
            n_components=stable_topics.shape[0],
            update_H=False,
            beta_loss=beta_loss,
            alpha=alpha,
            solver=solver,
        )
    else:
        raise ValueError('Model must be one of "plsa" or "nmf"')

    return doc_vectors, stable_topics