def test_gap_encoder(hashing, init, analyzer, add_words, n_samples=70):
    """Fit a GapEncoder and check output shape, topic norms and determinism."""
    corpus = fetch_20newsgroups(subset='train')['data']
    X = corpus[:n_samples]
    n_components = 10
    # Check the shape of the transformed output.
    enc = GapEncoder(
        n_components=n_components, hashing=hashing, init=init,
        analyzer=analyzer, add_words=add_words, random_state=42,
        rescale_W=True)
    enc.fit(X)
    out = enc.transform(X)
    assert out.shape == (n_samples, n_components), str(out.shape)
    assert len(set(out[0])) == n_components
    # With rescale_W=True each topic vector of W_ has unit L1 norm.
    l1_norms = np.abs(enc.W_).sum(axis=1)
    np.testing.assert_array_almost_equal(l1_norms, np.ones(n_components))
    # Refitting with the same seed must reproduce the same output.
    enc = GapEncoder(
        n_components=n_components, hashing=hashing, init=init,
        analyzer=analyzer, add_words=add_words, random_state=42)
    enc.fit(X)
    out_again = enc.transform(X)
    np.testing.assert_array_equal(out, out_again)
def test_analyzer(init1, analyzer1, analyzer2):
    """Check that two different analyzers produce different outputs.

    No error is raised when the outputs differ.
    """
    add_words = False
    n_samples = 70
    texts = fetch_20newsgroups(subset='train')['data'][:n_samples]
    X = np.array([texts, texts]).T
    n_components = 10
    # Output with the first analyzer.
    enc = GapEncoder(n_components=n_components, init='k-means++',
                     analyzer=analyzer1, add_words=add_words,
                     random_state=42, rescale_W=True)
    enc.fit(X)
    out_first = enc.transform(X)
    # Output with the second analyzer.
    enc = GapEncoder(n_components=n_components, init='k-means++',
                     analyzer=analyzer2, add_words=add_words,
                     random_state=42)
    enc.fit(X)
    out_second = enc.transform(X)
    # The two analyzers must not produce identical outputs.
    np.testing.assert_raises(
        AssertionError, np.testing.assert_array_equal,
        out_first, out_second)
def test_overflow_error():
    """GapEncoder.fit must not overflow or divide by zero on numeric strings.

    Numpy's 'over' and 'divide' warnings are promoted to errors so that any
    such issue makes the fit raise and fail the test.
    """
    # np.errstate restores the global floating-point error settings on exit;
    # the original bare np.seterr call leaked the 'raise' settings into every
    # test that ran afterwards.
    with np.errstate(over='raise', divide='raise'):
        rng = np.random.RandomState(0)
        # Integer bounds: passing floats (1e5, 1e6) to randint is deprecated.
        X = rng.randint(100000, 1000000, size=(8000, 1)).astype(str)
        enc = GapEncoder(n_components=2, batch_size=1, min_iter=1,
                         max_iter=1, random_state=0)
        enc.fit(X)
def test_get_feature_names(n_samples=70):
    """The number of feature labels must equal the number of components."""
    data = fetch_20newsgroups(subset='train')['data'][:n_samples]
    enc = GapEncoder()
    enc.fit(data)
    labels = enc.get_feature_names()
    # One label per fitted topic.
    assert len(labels) == enc.n_components
def test_score(n_samples=70):
    """Two identical columns must score exactly twice a single column."""
    texts = fetch_20newsgroups(subset='train')['data'][:n_samples]
    one_col = np.array(texts)[:, None]
    two_cols = np.hstack([one_col, one_col])
    enc = GapEncoder(random_state=42)
    enc.fit(one_col)
    score_one = enc.score(one_col)
    enc.fit(two_cols)
    score_two = enc.score(two_cols)
    # The score of duplicated columns is the sum of the per-column scores.
    assert score_one * 2 == score_two
def test_get_feature_names_out(n_samples=70):
    """Check label counts and the col_names options of get_feature_names_out."""
    texts = fetch_20newsgroups(subset='train')['data'][:n_samples]
    X = np.array([texts, texts]).T
    enc = GapEncoder(random_state=42)
    enc.fit(X)
    for labels in (enc.get_feature_names(), enc.get_feature_names_out()):
        # One set of topic labels per input column.
        assert len(labels) == enc.n_components * X.shape[1]
        # col_names='auto' prefixes labels with generated column names.
        auto_labels = enc.get_feature_names_out(col_names='auto')
        assert auto_labels[0] == 'col0: ' + labels[0]
        # Explicit column names are used as prefixes.
        named_labels = enc.get_feature_names_out(col_names=['abc', 'def'])
        assert named_labels[0] == 'abc: ' + labels[0]