def test_char_ngram_analyzer():
    """Check character n-gram extraction on accented French text and on
    whitespace-heavy English text (head and tail of the n-gram stream)."""
    analyzer = CharNGramAnalyzer(min_n=3, max_n=6)

    # Accented French sentence: accents are expected to be stripped by the
    # analyzer's preprocessing (e.g. 'tres' below).
    french = u"J'ai mang\xe9 du kangourou ce midi, c'\xe9tait pas tr\xeas bon"
    french_ngrams = analyzer.analyze(french)
    assert_equal(french_ngrams[:5],
                 [u"j'a", u"'ai", u'ai ', u'i m', u' ma'])
    assert_equal(french_ngrams[-5:],
                 [u's tres', u' tres ', u'tres b', u'res bo', u'es bon'])

    # English sentence with embedded newlines and tabs: whitespace runs are
    # expected to be normalized to single spaces.
    english = "This \n\tis a test, really.\n\n I met Harry yesterday"
    english_ngrams = analyzer.analyze(english)
    assert_equal(english_ngrams[:5],
                 [u'thi', u'his', u'is ', u's i', u' is'])
    assert_equal(english_ngrams[-5:],
                 [u' yeste', u'yester', u'esterd', u'sterda', u'terday'])
def test_vectorizer_max_df():
    """Terms whose document frequency exceeds ``max_df`` are dropped from
    the fitted vocabulary; all other terms are kept."""
    corpus = [u'abc', u'dea']  # the letter a occurs in all strings

    # With max_df=1.0 nothing is filtered: 5 distinct letters survive.
    vectorizer = CountVectorizer(CharNGramAnalyzer(min_n=1, max_n=1),
                                 max_df=1.0)
    vectorizer.fit(corpus)
    assert u'a' in vectorizer.vocabulary.keys()
    assert_equals(len(vectorizer.vocabulary.keys()), 5)

    # Lowering max_df to 0.5 filters 'a' (present in 100% of documents).
    vectorizer.max_df = 0.5
    vectorizer.fit(corpus)
    assert u'a' not in vectorizer.vocabulary.keys()  # 'a' is ignored
    assert_equals(len(vectorizer.vocabulary.keys()), 4)  # the others remain
def get_clf(n=3, binarize=True):
    """Build a character n-gram classification pipeline.

    Parameters
    ----------
    n : int, default 3
        Maximum character n-gram length (minimum is always 1).
    binarize : bool, default True
        If True, binarize the counts and use a Bernoulli naive Bayes
        classifier; otherwise feed raw counts to a multinomial one.
    """
    vectorizer = CountVectorizer(
        CharNGramAnalyzer(min_n=1, max_n=n,
                          preprocessor=SimplePreprocessor()))
    steps = [('vectorizer', vectorizer)]
    if binarize:
        steps.append(('binarizer', Binarizer(copy=False)))
        steps.append(('clf', naive_bayes.BernoulliNB()))
    else:
        steps.append(('clf', naive_bayes.MultinomialNB()))
    return Pipeline(steps)
dataset = load_files(languages_data_folder)

# Split the dataset into training and test halves.
n_samples_total = dataset.filenames.shape[0]
# Floor division: slice indices must be integers (plain `/` would produce a
# float under Python 3 and crash the slicing below).
half = n_samples_total // 2


def _read_file(path):
    """Read a whole file, closing the handle promptly instead of leaking it."""
    with open(path) as f:
        return f.read()


docs_train = [_read_file(f) for f in dataset.filenames[:half]]
docs_test = [_read_file(f) for f in dataset.filenames[half:]]

y_train = dataset.target[:half]
y_test = dataset.target[half:]

# Build an analyzer that splits strings into sequences of 1 to 3 characters
# after applying the lower-casing preprocessor
analyzer = CharNGramAnalyzer(
    min_n=1,
    max_n=3,
    preprocessor=LowerCasePreprocessor(),
)

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)