예제 #1
0
def test_gap_encoder(hashing, init, analyzer, add_words, n_samples=70):
    """Exercise GapEncoder on a slice of the 20 newsgroups training set.

    Checks, in order: the shape of the transformed output, that the first
    output row contains ``n_components`` distinct values, that the absolute
    values of ``W_`` sum to approximately 1 along axis 1, and that a second
    encoder built with the same seed reproduces the output exactly.
    """
    n_components = 10
    documents = fetch_20newsgroups(subset='train')['data']
    X = documents[:n_samples]

    # Keyword arguments common to both encoders built below.
    shared_params = dict(
        n_components=n_components,
        hashing=hashing,
        init=init,
        analyzer=analyzer,
        add_words=add_words,
        random_state=42,
    )

    # Output shape and distinctness of the first encoded row.
    first_encoder = GapEncoder(rescale_W=True, **shared_params)
    first_encoder.fit(X)
    y = first_encoder.transform(X)
    assert y.shape == (n_samples, n_components), str(y.shape)
    distinct_values = set(y[0])
    assert len(distinct_values) == n_components

    # L1 norm of the topics W.
    expected_norms = np.ones(n_components)
    l1_norm_W = np.abs(first_encoder.W_).sum(axis=1)
    np.testing.assert_array_almost_equal(l1_norm_W, expected_norms)

    # Same seed must give the same output (rescale_W left at its default).
    second_encoder = GapEncoder(**shared_params)
    second_encoder.fit(X)
    y2 = second_encoder.transform(X)
    np.testing.assert_array_equal(y, y2)
예제 #2
0
def test_analyzer(init1, analyzer1, analyzer2):
    """Check that two different analyzers produce different encodings.

    Two GapEncoders are fitted on the same two-column text array, one with
    ``analyzer1`` and one with ``analyzer2``. The test passes when their
    transformed outputs are NOT equal, i.e. when ``assert_array_equal``
    raises ``AssertionError``.

    Parameters
    ----------
    init1
        Initialisation method, forwarded as ``init`` to both encoders.
    analyzer1, analyzer2
        The two analyzers whose outputs are compared.
    """
    add_words = False
    n_samples = 70
    X_txt = fetch_20newsgroups(subset='train')['data'][:n_samples]
    # Duplicate the documents into two identical columns.
    X = np.array([X_txt, X_txt]).T
    n_components = 10

    # Output of the first analyzer. ``init1`` is forwarded here; it was
    # previously ignored in favour of a hard-coded 'k-means++'.
    encoder = GapEncoder(n_components=n_components,
                         init=init1,
                         analyzer=analyzer1,
                         add_words=add_words,
                         random_state=42,
                         rescale_W=True)
    encoder.fit(X)
    y = encoder.transform(X)

    # Output of the other analyzer (rescale_W is left at its default, as in
    # the original test).
    encoder = GapEncoder(n_components=n_components,
                         init=init1,
                         analyzer=analyzer2,
                         add_words=add_words,
                         random_state=42)
    encoder.fit(X)
    y2 = encoder.transform(X)

    # The two outputs must differ: equality is expected to fail.
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal, y,
                             y2)
예제 #3
0
def test_overflow_error():
    """Fit a GapEncoder while numpy raises on overflow and division errors.

    The input is 8000 rows of random integers in [1e5, 1e6) converted to
    strings. The test passes if fitting completes without numpy raising a
    floating-point error.
    """
    # np.errstate restores the previous error handling on exit. A bare
    # np.seterr call would leave 'raise' active for every later test.
    with np.errstate(over='raise', divide='raise'):
        r = np.random.RandomState(0)
        X = r.randint(1e5, 1e6, size=(8000, 1)).astype(str)
        enc = GapEncoder(n_components=2, batch_size=1, min_iter=1, max_iter=1,
                         random_state=0)
        enc.fit(X)
예제 #4
0
def test_get_feature_names(n_samples=70):
    """Check that get_feature_names() returns one label per component.

    A default GapEncoder is fitted on the first ``n_samples`` documents of
    the 20 newsgroups training set.
    """
    corpus = fetch_20newsgroups(subset='train')['data']
    enc = GapEncoder()
    enc.fit(corpus[:n_samples])
    # Number of labels must match the number of components.
    label_count = len(enc.get_feature_names())
    assert label_count == enc.n_components
예제 #5
0
def test_score(n_samples=70):
    """Check that duplicating a column doubles the GapEncoder score.

    The encoder is fitted and scored on a single text column, then on the
    same column stacked twice. The second score is expected to be twice the
    first.
    """
    X_txt = fetch_20newsgroups(subset='train')['data'][:n_samples]
    X1 = np.array(X_txt)[:, None]
    X2 = np.hstack([X1, X1])
    enc = GapEncoder(random_state=42)
    enc.fit(X1)
    score_X1 = enc.score(X1)
    enc.fit(X2)
    score_X2 = enc.score(X2)
    # Two identical columns should give twice the single-column score.
    # Compare with a tolerance: exact float equality can fail on rounding.
    np.testing.assert_allclose(score_X2, 2 * score_X1)
예제 #6
0
def test_get_feature_names_out(n_samples=70):
    """Check label count and ``col_names`` prefixing on a two-column input.

    Both ``get_feature_names()`` and ``get_feature_names_out()`` must return
    ``n_components`` labels per input column. The first label returned with
    ``col_names='auto'`` or an explicit list must be the plain label with the
    matching column prefix in front.
    """
    documents = fetch_20newsgroups(subset='train')['data'][:n_samples]
    X = np.array([documents, documents]).T
    enc = GapEncoder(random_state=42)
    enc.fit(X)

    label_variants = [enc.get_feature_names(), enc.get_feature_names_out()]
    for plain_labels in label_variants:
        # One label per component and per input column.
        expected_total = enc.n_components * X.shape[1]
        assert len(plain_labels) == expected_total

        # Automatic column names.
        auto_named = enc.get_feature_names_out(col_names='auto')
        assert auto_named[0] == 'col0: ' + plain_labels[0]

        # Explicit column names.
        custom_named = enc.get_feature_names_out(col_names=['abc', 'def'])
        assert custom_named[0] == 'abc: ' + plain_labels[0]