def test_gap_encoder(hashing, init, analyzer, add_words, n_samples=70):
    """Check the GapEncoder's output shape, topic normalization and
    determinism.

    Fits a GapEncoder on a slice of the 20newsgroups data and asserts that:
    * the transformed output has shape ``(n_samples, n_components)`` with
      distinct activations per row,
    * each row of the topic matrix ``W_`` is L1-normalized (``rescale_W``),
    * refitting with the same ``random_state`` reproduces the output exactly.
    """
    X_txt = fetch_20newsgroups(subset='train')['data']
    X = X_txt[:n_samples]
    n_components = 10
    # Test output shape.
    encoder = GapEncoder(
        n_components=n_components, hashing=hashing, init=init,
        analyzer=analyzer, add_words=add_words,
        random_state=42, rescale_W=True)
    encoder.fit(X)
    y = encoder.transform(X)
    assert y.shape == (n_samples, n_components), str(y.shape)
    # Activations within a row should all differ (no degenerate topics).
    assert len(set(y[0])) == n_components
    # Test L1-norm of topics W: each topic should sum to 1.
    l1_norm_W = np.abs(encoder.W_).sum(axis=1)
    np.testing.assert_array_almost_equal(l1_norm_W, np.ones(n_components))
    # Test that the same seed returns the same output.
    encoder = GapEncoder(
        n_components=n_components, hashing=hashing, init=init,
        analyzer=analyzer, add_words=add_words, random_state=42)
    encoder.fit(X)
    y2 = encoder.transform(X)
    np.testing.assert_array_equal(y, y2)
def test_analyzer(init1, analyzer1, analyzer2):
    """Test that the output differs between two analyzers.

    Fits one GapEncoder per analyzer (e.g. 'word' vs 'char') with identical
    settings otherwise, and asserts that the two transformed outputs are NOT
    element-wise equal.
    """
    add_words = False
    n_samples = 70
    X_txt = fetch_20newsgroups(subset='train')['data'][:n_samples]
    # Two identical columns so the encoder runs on 2-D input.
    X = np.array([X_txt, X_txt]).T
    n_components = 10
    # Test first analyzer output.
    # BUG FIX: the init strategy was hard-coded to 'k-means++', silently
    # ignoring the `init1` parameter; use the parametrized value instead.
    encoder = GapEncoder(
        n_components=n_components, init=init1,
        analyzer=analyzer1, add_words=add_words,
        random_state=42, rescale_W=True)
    encoder.fit(X)
    y = encoder.transform(X)
    # Test the other analyzer output.
    encoder = GapEncoder(
        n_components=n_components, init=init1,
        analyzer=analyzer2, add_words=add_words, random_state=42)
    encoder.fit(X)
    y2 = encoder.transform(X)
    # Test inequality between the analyzer outputs: equality must raise.
    np.testing.assert_raises(
        AssertionError, np.testing.assert_array_equal, y, y2)
def test_partial_fit(n_samples=70):
    """Check that ``partial_fit`` on one batch matches ``fit_transform``.

    A single ``partial_fit`` call over the full data should yield (almost)
    the same encoding as a one-iteration ``fit_transform`` with
    ``batch_size`` equal to the number of samples.
    """
    X_txt = fetch_20newsgroups(subset='train')['data']
    X = X_txt[:n_samples]
    # Gap encoder with fit on one batch.
    enc = GapEncoder(random_state=42, batch_size=n_samples, max_iter=1)
    X_enc = enc.fit_transform(X)
    # Gap encoder with partial fit.
    enc = GapEncoder(random_state=42)
    enc.partial_fit(X)
    X_enc_partial = enc.transform(X)
    # Check that the encoded vectors are the same.
    np.testing.assert_almost_equal(X_enc, X_enc_partial)
# In the example below we select 3 labels to summarize each topic. topic_labels = enc.get_feature_names_out(n_labels=3) for k in range(len(topic_labels)): labels = topic_labels[k] print(f'Topic n°{k}: {labels}') ############################################################################### # As expected, topics capture labels that frequently co-occur. For instance, # the labels *firefighter*, *rescuer*, *rescue* appear together in # *Firefigther/Rescuer III*, or *Fire/Rescue Lieutenant*. # # This enables us to understand the encoding of different samples import matplotlib.pyplot as plt encoded_labels = enc.transform(X_dirty[:20]) plt.figure(figsize=(8, 10)) plt.imshow(encoded_labels) plt.xlabel('Latent topics', size=12) plt.xticks(range(0, 10), labels=topic_labels, rotation=50, ha='right') plt.ylabel('Data entries', size=12) plt.yticks(range(0, 20), labels=X_dirty[:20].to_numpy().flatten()) plt.colorbar().set_label(label='Topic activations', size=12) plt.tight_layout() plt.show() ############################################################################### # As we can see, each dirty category encodes on a small number of topics, # These can thus be reliably used to summarize each topic, which are in # effect latent categories captured from the data.