def test_lda_negative_input(): # test pass dense matrix with sparse negative input. X = np.full((5, 10), -1.) lda = LatentDirichletAllocation() regex = r"^Negative values in data passed" with pytest.raises(ValueError, match=regex): lda.fit(X)
def test_lda_no_component_error(): # test `perplexity` before `fit` rng = np.random.RandomState(0) X = rng.randint(4, size=(20, 10)) lda = LatentDirichletAllocation() regex = ("This LatentDirichletAllocation instance is not fitted yet. " "Call 'fit' with appropriate arguments before using this " "estimator.") with pytest.raises(NotFittedError, match=regex): lda.perplexity(X)
def test_lda_fit_transform(method): # Test LDA fit_transform & transform # fit_transform and transform result should be the same rng = np.random.RandomState(0) X = rng.randint(10, size=(50, 20)) lda = LatentDirichletAllocation(n_components=5, learning_method=method, random_state=rng) X_fit = lda.fit_transform(X) X_trans = lda.transform(X) assert_array_almost_equal(X_fit, X_trans, 4)
def test_lda_transform(): # Test LDA transform. # Transform result cannot be negative and should be normalized rng = np.random.RandomState(0) X = rng.randint(5, size=(20, 10)) n_components = 3 lda = LatentDirichletAllocation(n_components=n_components, random_state=rng) X_trans = lda.fit_transform(X) assert (X_trans > 0.0).any() assert_array_almost_equal(np.sum(X_trans, axis=1), np.ones(X_trans.shape[0]))
def test_lda_dense_input(): # Test LDA with dense input. rng = np.random.RandomState(0) n_components, X = _build_sparse_mtx() lda = LatentDirichletAllocation(n_components=n_components, learning_method='batch', random_state=rng) lda.fit(X.toarray()) correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] for component in lda.components_: # Find top 3 words in each LDA component top_idx = set(component.argsort()[-3:][::-1]) assert tuple(sorted(top_idx)) in correct_idx_grps
def test_lda_multi_jobs(method): n_components, X = _build_sparse_mtx() # Test LDA batch training with multi CPU rng = np.random.RandomState(0) lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2, learning_method=method, evaluate_every=1, random_state=rng) lda.fit(X) correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] for c in lda.components_: top_idx = set(c.argsort()[-3:][::-1]) assert tuple(sorted(top_idx)) in correct_idx_grps
def test_lda_fit_online(): # Test LDA online learning (`fit` method with 'online' learning) rng = np.random.RandomState(0) n_components, X = _build_sparse_mtx() lda = LatentDirichletAllocation(n_components=n_components, learning_offset=10., evaluate_every=1, learning_method='online', random_state=rng) lda.fit(X) correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] for component in lda.components_: # Find top 3 words in each LDA component top_idx = set(component.argsort()[-3:][::-1]) assert tuple(sorted(top_idx)) in correct_idx_grps
def test_lda_partial_fit_multi_jobs(): # Test LDA online training with multi CPU rng = np.random.RandomState(0) n_components, X = _build_sparse_mtx() lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2, learning_offset=5., total_samples=30, random_state=rng) for i in range(2): lda.partial_fit(X) correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] for c in lda.components_: top_idx = set(c.argsort()[-3:][::-1]) assert tuple(sorted(top_idx)) in correct_idx_grps
def test_lda_partial_fit(): # Test LDA online learning (`partial_fit` method) # (same as test_lda_batch) rng = np.random.RandomState(0) n_components, X = _build_sparse_mtx() lda = LatentDirichletAllocation(n_components=n_components, learning_offset=10., total_samples=100, random_state=rng) for i in range(3): lda.partial_fit(X) correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] for c in lda.components_: top_idx = set(c.argsort()[-3:][::-1]) assert tuple(sorted(top_idx)) in correct_idx_grps
def test_invalid_params(): # test `_check_params` method X = np.ones((5, 10)) invalid_models = ( ('n_components', LatentDirichletAllocation(n_components=0)), ('learning_method', LatentDirichletAllocation(learning_method='unknown')), ('total_samples', LatentDirichletAllocation(total_samples=0)), ('learning_offset', LatentDirichletAllocation(learning_offset=-1)), ) for param, model in invalid_models: regex = r"^Invalid %r parameter" % param with pytest.raises(ValueError, match=regex): model.fit(X)
def test_lda_empty_docs(): """Test LDA on empty document (all-zero rows).""" Z = np.zeros((5, 4)) for X in [Z, csr_matrix(Z)]: lda = LatentDirichletAllocation(max_iter=750).fit(X) assert_almost_equal(lda.components_.sum(axis=0), np.ones(lda.components_.shape[1]))
def test_lda_fit_perplexity(): # Test that the perplexity computed during fit is consistent with what is # returned by the perplexity method n_components, X = _build_sparse_mtx() lda = LatentDirichletAllocation(n_components=n_components, max_iter=1, learning_method='batch', random_state=0, evaluate_every=1) lda.fit(X) # Perplexity computed at end of fit method perplexity1 = lda.bound_ # Result of perplexity method on the train set perplexity2 = lda.perplexity(X) assert_almost_equal(perplexity1, perplexity2)
def test_lda_score(method): # Test LDA score for batch training # score should be higher after each iteration n_components, X = _build_sparse_mtx() lda_1 = LatentDirichletAllocation(n_components=n_components, max_iter=1, learning_method=method, total_samples=100, random_state=0) lda_2 = LatentDirichletAllocation(n_components=n_components, max_iter=10, learning_method=method, total_samples=100, random_state=0) lda_1.fit_transform(X) score_1 = lda_1.score(X) lda_2.fit_transform(X) score_2 = lda_2.score(X) assert score_2 >= score_1
def check_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities): n_components, X = _build_sparse_mtx() lda = LatentDirichletAllocation(n_components=n_components, max_iter=3, learning_method='batch', verbose=verbose, evaluate_every=evaluate_every, random_state=0) out = StringIO() old_out, sys.stdout = sys.stdout, out try: lda.fit(X) finally: sys.stdout = old_out n_lines = out.getvalue().count('\n') n_perplexity = out.getvalue().count('perplexity') assert expected_lines == n_lines assert expected_perplexities == n_perplexity
def test_lda_preplexity_mismatch(): # test dimension mismatch in `perplexity` method rng = np.random.RandomState(0) n_components = rng.randint(3, 6) n_samples = rng.randint(6, 10) X = np.random.randint(4, size=(n_samples, 10)) lda = LatentDirichletAllocation(n_components=n_components, learning_offset=5., total_samples=20, random_state=rng) lda.fit(X) # invalid samples invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components)) with pytest.raises(ValueError, match=r'Number of samples'): lda._perplexity_precomp_distr(X, invalid_n_samples) # invalid topic number invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1)) with pytest.raises(ValueError, match=r'Number of topics'): lda._perplexity_precomp_distr(X, invalid_n_components)
def test_lda_score_perplexity(): # Test the relationship between LDA score and perplexity n_components, X = _build_sparse_mtx() lda = LatentDirichletAllocation(n_components=n_components, max_iter=10, random_state=0) lda.fit(X) perplexity_1 = lda.perplexity(X, sub_sampling=False) score = lda.score(X) perplexity_2 = np.exp(-1. * (score / np.sum(X.data))) assert_almost_equal(perplexity_1, perplexity_2)
def test_perplexity_input_format(): # Test LDA perplexity for sparse and dense input # score should be the same for both dense and sparse input n_components, X = _build_sparse_mtx() lda = LatentDirichletAllocation(n_components=n_components, max_iter=1, learning_method='batch', total_samples=100, random_state=0) lda.fit(X) perp_1 = lda.perplexity(X) perp_2 = lda.perplexity(X.toarray()) assert_almost_equal(perp_1, perp_2)
def test_lda_transform_mismatch(): # test `n_features` mismatch in partial_fit and transform rng = np.random.RandomState(0) X = rng.randint(4, size=(20, 10)) X_2 = rng.randint(4, size=(10, 8)) n_components = rng.randint(3, 6) lda = LatentDirichletAllocation(n_components=n_components, random_state=rng) lda.partial_fit(X) with pytest.raises(ValueError, match=r"^The provided data has"): lda.partial_fit(X_2)
def test_lda_default_prior_params(): # default prior parameter should be `1 / topics` # and verbose params should not affect result n_components, X = _build_sparse_mtx() prior = 1. / n_components lda_1 = LatentDirichletAllocation(n_components=n_components, doc_topic_prior=prior, topic_word_prior=prior, random_state=0) lda_2 = LatentDirichletAllocation(n_components=n_components, random_state=0) topic_distr_1 = lda_1.fit_transform(X) topic_distr_2 = lda_2.fit_transform(X) assert_almost_equal(topic_distr_1, topic_distr_2)
def test_lda_partial_fit_dim_mismatch(): # test `n_features` mismatch in `partial_fit` rng = np.random.RandomState(0) n_components = rng.randint(3, 6) n_col = rng.randint(6, 10) X_1 = np.random.randint(4, size=(10, n_col)) X_2 = np.random.randint(4, size=(10, n_col + 1)) lda = LatentDirichletAllocation(n_components=n_components, learning_offset=5., total_samples=20, random_state=rng) lda.partial_fit(X_1) with pytest.raises(ValueError, match=r"^The provided data has"): lda.partial_fit(X_2)
def test_lda_perplexity(method): # Test LDA perplexity for batch training # perplexity should be lower after each iteration n_components, X = _build_sparse_mtx() lda_1 = LatentDirichletAllocation(n_components=n_components, max_iter=1, learning_method=method, total_samples=100, random_state=0) lda_2 = LatentDirichletAllocation(n_components=n_components, max_iter=10, learning_method=method, total_samples=100, random_state=0) lda_1.fit(X) perp_1 = lda_1.perplexity(X, sub_sampling=False) lda_2.fit(X) perp_2 = lda_2.perplexity(X, sub_sampling=False) assert perp_1 >= perp_2 perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True) perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True) assert perp_1_subsampling >= perp_2_subsampling