Пример #1
0
def test_combined_features():
    """Check count features of each branch alone and of their FeatureUnion.

    The token pipeline and the lexicon pipeline are first fitted separately
    on the training texts and compared against hard-coded count matrices.
    Both are then joined in a FeatureUnion, whose output is expected to be
    the token columns followed by the polarity columns.  Finally the first
    step of the union's second transformer must report
    ``tokens_from_lexicon == 1``.
    """
    train_X, train_y, _test_X, _test_y = train_test_data()

    # Branch 1: standard preprocessing followed by a count vectorizer.
    token_steps = [
        ('prep', preprocessing.std_prep()),
        ('frm', representation.count_vectorizer({'min_df': 1})),
    ]
    token_features = Pipeline(token_steps)
    token_counts = token_features.fit_transform(train_X).toarray()
    expected_tokens = np.array(
        [[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0],
         [0, 0, 0, 0, 1, 0, 1, 1, 0, 2, 0, 1]],
        np.int64,
    )
    assert (token_counts == expected_tokens).all()

    # Branch 2: lexicon preprocessing followed by a count vectorizer.
    polarity_steps = [
        ('prep', preprocessing.lex_prep()),
        ('frm', representation.count_vectorizer({'min_df': 1})),
    ]
    polarity_features = Pipeline(polarity_steps)
    polarity_counts = polarity_features.fit_transform(train_X).toarray()
    expected_polarity = np.array([[1, 7], [0, 7]], np.int64)
    assert (polarity_counts == expected_polarity).all()

    # Union of both branches, fitted again on the same training data.
    combined_features = FeatureUnion([
        ('token_features', token_features),
        ('polarity_features', polarity_features),
    ])
    combined_counts = combined_features.fit_transform(
        train_X, train_y).toarray()
    expected_combined = np.array(
        [[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 7],
         [0, 0, 0, 0, 1, 0, 1, 1, 0, 2, 0, 1, 0, 7]],
        np.int64,
    )
    assert (combined_counts == expected_combined).all()

    # Reach into the union: second transformer -> its first step's object.
    fitted_polarity_pipeline = combined_features.transformer_list[1][1]
    lexicon_step = fitted_polarity_pipeline.steps[0][1]
    assert lexicon_step.tokens_from_lexicon == 1
Пример #2
0
def cnn(name):
    """Return the CNN variant selected by *name*.

    ``'cnn_raw'`` yields ``CNN()`` constructed without arguments and
    ``'cnn_prep'`` yields a ``CNN`` constructed with
    ``preprocessing.std_prep()``.

    Raises:
        ValueError: if *name* is neither of the two known names.
    """
    if name == 'cnn_raw':
        return CNN()
    if name == 'cnn_prep':
        return CNN(preprocessing.std_prep())
    # Anything else is not a recognised pipeline name.
    raise ValueError("pipeline name is unknown.")
Пример #3
0
def test_preprocessing():
    """Check the first training text after each of the two preprocessors.

    ``preprocessing.std_prep()`` and ``preprocessing.lex_prep()`` are each
    fitted on the training texts, and the first transformed item is compared
    against a hard-coded expected string.
    """
    train_X, _train_y, _test_X, _test_y = train_test_data()

    std_output = preprocessing.std_prep().fit_transform(train_X)
    assert std_output[0] == "they twats all deserve an ass kicking ."

    lex_output = preprocessing.lex_prep().fit_transform(train_X)
    expected_labels = (
        "NEUTRAL HATE NEUTRAL NEUTRAL NEUTRAL NEUTRAL NEUTRAL NEUTRAL")
    assert lex_output[0] == expected_labels
Пример #4
0
def svm_sigmoid_embed():
    """Return a pipeline: std_prep -> 'glove' text embeddings -> sigmoid SVC.

    The classifier is ``svm.SVC`` with ``kernel='sigmoid'`` and
    ``gamma='scale'``.
    """
    prep_step = preprocessing.std_prep()
    embedding_step = representation.text2embeddings('glove')
    classifier = svm.SVC(kernel='sigmoid', gamma='scale')
    return pipeline(prep_step, embedding_step, classifier)
Пример #5
0
def svm_libsvc_embed():
    """Return a pipeline: std_prep -> 'wiki-news' embeddings -> LinearSVC.

    The classifier is ``svm.LinearSVC`` with ``max_iter=10000``,
    ``dual=False`` and ``C=0.1``.
    """
    prep_step = preprocessing.std_prep()
    embedding_step = representation.text2embeddings('wiki-news')
    classifier = svm.LinearSVC(max_iter=10000, dual=False, C=0.1)
    return pipeline(prep_step, embedding_step, classifier)
Пример #6
0
def svm_libsvc_tfidf():
    """Return a pipeline: std_prep -> tfidf_vectorizer -> LinearSVC.

    The classifier is ``svm.LinearSVC`` with ``max_iter=10000``,
    ``dual=False`` and ``C=0.1``.
    """
    prep_step = preprocessing.std_prep()
    tfidf_step = representation.tfidf_vectorizer()
    classifier = svm.LinearSVC(max_iter=10000, dual=False, C=0.1)
    return pipeline(prep_step, tfidf_step, classifier)
Пример #7
0
def svm_libsvc_counts_bigram():
    """Return a pipeline: std_prep -> count_vectorizer -> LinearSVC.

    The count vectorizer is configured with ``min_df=1`` and
    ``ngram_range=(2, 2)``; the classifier is ``svm.LinearSVC`` with
    ``max_iter=10000``, ``dual=False`` and ``C=0.1``.
    """
    prep_step = preprocessing.std_prep()
    vectorizer_params = {'min_df': 1, 'ngram_range': (2, 2)}
    counts_step = representation.count_vectorizer(vectorizer_params)
    classifier = svm.LinearSVC(max_iter=10000, dual=False, C=0.1)
    return pipeline(prep_step, counts_step, classifier)
Пример #8
0
def naive_bayes_counts_trigram():
    """Return a pipeline: std_prep -> count_vectorizer -> MultinomialNB.

    The count vectorizer is configured with ``min_df=1`` and
    ``ngram_range=(3, 3)``.
    """
    prep_step = preprocessing.std_prep()
    vectorizer_params = {'min_df': 1, 'ngram_range': (3, 3)}
    counts_step = representation.count_vectorizer(vectorizer_params)
    classifier = MultinomialNB()
    return pipeline(prep_step, counts_step, classifier)
Пример #9
0
def naive_bayes_tfidf():
    """Return a pipeline: std_prep -> tfidf_vectorizer -> MultinomialNB."""
    prep_step = preprocessing.std_prep()
    tfidf_step = representation.tfidf_vectorizer()
    classifier = MultinomialNB()
    return pipeline(prep_step, tfidf_step, classifier)
Пример #10
0
def naive_bayes_counts():
    """Return a pipeline: std_prep -> count_vectorizer -> MultinomialNB.

    The count vectorizer is configured with ``min_df=1``.
    """
    prep_step = preprocessing.std_prep()
    vectorizer_params = {'min_df': 1}
    counts_step = representation.count_vectorizer(vectorizer_params)
    classifier = MultinomialNB()
    return pipeline(prep_step, counts_step, classifier)
Пример #11
0
def random_forest_tfidf():
    """Return a pipeline: std_prep -> tfidf_vectorizer -> random forest.

    The classifier is ``RandomForestClassifier`` with ``random_state=42``
    and ``n_estimators=1000``.
    """
    prep_step = preprocessing.std_prep()
    tfidf_step = representation.tfidf_vectorizer()
    classifier = RandomForestClassifier(random_state=42, n_estimators=1000)
    return pipeline(prep_step, tfidf_step, classifier)
Пример #12
0
def random_forest_embed():
    """Return a pipeline: std_prep -> 'glove' embeddings -> random forest.

    The classifier is ``RandomForestClassifier`` with ``random_state=42``
    and ``n_estimators=1000``.
    """
    prep_step = preprocessing.std_prep()
    embedding_step = representation.text2embeddings('glove')
    classifier = RandomForestClassifier(random_state=42, n_estimators=1000)
    return pipeline(prep_step, embedding_step, classifier)
Пример #13
0
def naive_bayes_counts_lex():
    """Return a CombinedFeaturesPipeline feeding a MultinomialNB.

    Two (preprocessor, count_vectorizer) pairs are passed in order: first
    ``std_prep`` with a ``min_df=1`` count vectorizer, then ``lex_prep`` with
    its own ``min_df=1`` count vectorizer, followed by the classifier.
    """
    token_prep = preprocessing.std_prep()
    token_counts = representation.count_vectorizer({'min_df': 1})
    lexicon_prep = preprocessing.lex_prep()
    lexicon_counts = representation.count_vectorizer({'min_df': 1})
    classifier = MultinomialNB()
    return CombinedFeaturesPipeline(
        token_prep, token_counts,
        lexicon_prep, lexicon_counts,
        classifier,
    )