def test_get_feature_names(n_samples=70):
    """Check the labels returned by ``GapEncoder.get_feature_names``.

    The encoder is fitted on a two-column array in which both columns hold
    the same ``n_samples`` newsgroup documents. The test then verifies the
    number of labels and the prefix added by the ``col_names`` parameter.
    """
    documents = fetch_20newsgroups(subset='train')['data'][:n_samples]
    # Duplicate the text column so that the encoder sees two features.
    X = np.array([documents, documents]).T
    n_columns = X.shape[1]

    enc = GapEncoder(random_state=42)
    enc.fit(X)

    # Check number of labels: one per component and per input column.
    default_labels = enc.get_feature_names()
    expected_count = enc.n_components * n_columns
    assert len(default_labels) == expected_count

    # Test different parameters for col_names.
    first_label = default_labels[0]
    auto_labels = enc.get_feature_names(col_names='auto')
    assert auto_labels[0] == 'col0: ' + first_label

    named_labels = enc.get_feature_names(col_names=['abc', 'def'])
    assert named_labels[0] == 'abc: ' + first_label
def test_get_feature_names(n_samples=70):
    """Check that ``GapEncoder.get_feature_names`` yields one label per topic.

    The encoder is fitted on the first ``n_samples`` newsgroup documents and
    the number of returned labels is compared with ``n_components``.
    """
    # NOTE(review): another test with this exact name appears earlier in this
    # view; if both live in the same module, this definition shadows it and
    # only this one is collected -- confirm and rename if needed.
    documents = fetch_20newsgroups(subset='train')['data'][:n_samples]

    encoder = GapEncoder()
    encoder.fit(documents)

    # Check number of labels.
    labels = encoder.get_feature_names()
    assert len(labels) == encoder.n_components
# Fit the encoder on the dirty column and encode it in a single step.
X_enc = enc.fit_transform(X_dirty)
print(f'Shape of encoded vectors = {X_enc.shape}')

################################################################################
# Interpreting encoded vectors
# ----------------------------
#
# The GapEncoder can be understood as a continuous encoding on a set of latent
# topics estimated from the data. The latent topics are built by
# capturing combinations of substrings that frequently co-occur, and encoded
# vectors correspond to their activations.
# To interpret these latent topics, we select for each of them a few labels
# from the input data with the highest activations.
# In the example below we select 3 labels to summarize each topic.

topic_labels = enc.get_feature_names(n_labels=3)
# Print each topic index next to the labels that summarize it.
for k, labels in enumerate(topic_labels):
    print(f'Topic n°{k}: {labels}')

################################################################################
# As expected, topics capture labels that frequently co-occur. For instance,
# the labels *firefighter*, *rescuer*, *rescue* appear together in
# *Firefighter/Rescuer III*, or *Fire/Rescue Lieutenant*.
#
# This enables us to understand the encoding of different samples.

import matplotlib.pyplot as plt
# Encode only the first 20 samples and display their activations as an image.
encoded_labels = enc.transform(X_dirty[:20])
plt.figure(figsize=(8, 10))
plt.imshow(encoded_labels)