Exemplo n.º 1
0
def test_get_feature_names_out(n_samples=70):
    X_txt = fetch_20newsgroups(subset='train')['data'][:n_samples]
    X = np.array([X_txt, X_txt]).T
    enc = GapEncoder(random_state=42)
    enc.fit(X)
    for topic_labels in [enc.get_feature_names(), enc.get_feature_names_out()]:
        # Check number of labels
        assert len(topic_labels) == enc.n_components * X.shape[1]
        # Test different parameters for col_names
        topic_labels_2 = enc.get_feature_names_out(col_names='auto')
        assert topic_labels_2[0] == 'col0: ' + topic_labels[0]
        topic_labels_3 = enc.get_feature_names_out(col_names=['abc', 'def'])
        assert topic_labels_3[0] == 'abc: ' + topic_labels[0]
    return
X_enc = enc.fit_transform(X_dirty)
print(f'Shape of encoded vectors = {X_enc.shape}')

###############################################################################
# Interpreting encoded vectors
# ----------------------------
#
# The GapEncoder can be understood as a continuous encoding on a set of latent
# topics estimated from the data. The latent topics are built by
# capturing combinations of substrings that frequently co-occur, and encoded
# vectors correspond to their activations.
# To interpret these latent topics, we select for each of them a few labels
# from the input data with the highest activations.
# In the example below we select 3 labels to summarize each topic.

topic_labels = enc.get_feature_names_out(n_labels=3)
for k in range(len(topic_labels)):
    labels = topic_labels[k]
    print(f'Topic n°{k}: {labels}')

###############################################################################
# As expected, topics capture labels that frequently co-occur. For instance,
# the labels *firefighter*, *rescuer*, *rescue* appear together in
# *Firefigther/Rescuer III*, or *Fire/Rescue Lieutenant*.
#
# This enables us to understand the encoding of different samples

import matplotlib.pyplot as plt
encoded_labels = enc.transform(X_dirty[:20])
plt.figure(figsize=(8, 10))
plt.imshow(encoded_labels)