# Example 1
def test_combined_features():
    """FeatureUnion of token counts and lexicon-polarity counts.

    Checks each sub-pipeline's output on its own, then verifies the union
    concatenates the two feature blocks column-wise, and finally that the
    fitted lexicon preprocessor recorded exactly one token from the lexicon.
    """
    train_X, train_y, test_X, test_y = train_test_data()

    # Token-level bag-of-words pipeline.
    token_features = Pipeline([
        ('prep', preprocessing.std_prep()),
        ('frm', representation.count_vectorizer({'min_df': 1})),
    ])
    token_matrix = token_features.fit_transform(train_X)
    token_expected = np.array(
        [[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0],
         [0, 0, 0, 0, 1, 0, 1, 1, 0, 2, 0, 1]], np.int64)
    assert (token_matrix.toarray() == token_expected).all()

    # Lexicon-based polarity pipeline.
    polarity_features = Pipeline([
        ('prep', preprocessing.lex_prep()),
        ('frm', representation.count_vectorizer({'min_df': 1})),
    ])
    polarity_matrix = polarity_features.fit_transform(train_X)
    polarity_expected = np.array([[1, 7], [0, 7]], np.int64)
    assert (polarity_matrix.toarray() == polarity_expected).all()

    # The union should be the horizontal concatenation of the two blocks.
    combined_features = FeatureUnion([
        ('token_features', token_features),
        ('polarity_features', polarity_features),
    ])
    combined = combined_features.fit_transform(train_X, train_y).toarray()
    combined_expected = np.array(
        [[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 7],
         [0, 0, 0, 0, 1, 0, 1, 1, 0, 2, 0, 1, 0, 7]], np.int64)
    assert (combined == combined_expected).all()

    # Dig the lexicon preprocessor back out of the fitted union:
    # second transformer -> its first pipeline step -> the estimator itself.
    lex_step = combined_features.transformer_list[1][1].steps[0][1]
    assert lex_step.tokens_from_lexicon == 1
# Example 2
def test_count_vectorizer():
    """Vocabulary and count matrix of the default count vectorizer."""
    train_X, train_y, test_X, test_y = train_test_data()

    cv = representation.count_vectorizer()
    X = cv.fit_transform(train_X)

    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
    # if this wraps a sklearn CountVectorizer, migrating to
    # get_feature_names_out() may be needed — confirm the pinned version.
    expected_vocab = [
        'all', 'an', 'ass', 'deserve', 'hope', 'kicking', 'later', 'talk',
        'they', 'to', 'twats', 'you'
    ]
    assert cv.get_feature_names() == expected_vocab

    expected_counts = np.array(
        [[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0],
         [0, 0, 0, 0, 1, 0, 1, 1, 0, 2, 0, 1]], np.int64)
    assert (X.toarray() == expected_counts).all()
# Example 3
def test_representation():
    """End-to-end: Offenseval data through preprocessing and vectorization.

    Only asserts that vectorization actually replaced the raw strings with
    a numeric representation.
    """
    task = of.Offenseval()
    task.load(offenseval_data_dir)

    # Small train/dev split keeps the test fast.
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True, proportion_train=0.1, proportion_dev=0.01)

    # Tokenize only; leave casing, tweets, and lemmas untouched.
    prep = preprocessing.Preprocessor(tokenize=True,
                                      normalize_tweet=False,
                                      lowercase=False,
                                      lemmatize=False)
    tokenized = prep.transform(train_X)

    vectorizer = representation.count_vectorizer()
    vectors = vectorizer.fit_transform(tokenized, train_y)
    assert not isinstance(vectors[0], str)
# Example 4
def svm_libsvc_counts_bigram():
    """Linear SVM over bigram counts with standard preprocessing."""
    prep = preprocessing.std_prep()
    # Bigrams only (ngram_range (2, 2)); keep even hapax features (min_df 1).
    features = representation.count_vectorizer({'min_df': 1, 'ngram_range': (2, 2)})
    classifier = svm.LinearSVC(max_iter=10000, dual=False, C=0.1)
    return pipeline(prep, features, classifier)
# Example 5
def svm_libsvc_counts():
    """Linear SVM over default token counts with standard preprocessing."""
    classifier = svm.LinearSVC(max_iter=10000, dual=False, C=0.1)
    return pipeline(preprocessing.std_prep(),
                    representation.count_vectorizer(),
                    classifier)
# Example 6
def naive_bayes_counts_lex():
    """Multinomial naive Bayes over lexicon-preprocessed token counts."""
    features = representation.count_vectorizer({'min_df': 1})
    return pipeline(preprocessing.lex_prep(), features, MultinomialNB())
# Example 7
def naive_bayes_counts_trigram():
    """Multinomial naive Bayes over trigram counts with standard preprocessing."""
    # Trigrams only (ngram_range (3, 3)); keep even hapax features (min_df 1).
    features = representation.count_vectorizer({'min_df': 1, 'ngram_range': (3, 3)})
    return pipeline(preprocessing.std_prep(), features, MultinomialNB())