def test_char_ngram_analyzer():
    """Check character n-gram extraction on accented French text and on
    whitespace-heavy English text (head and tail of the n-gram stream)."""
    analyzer = CharNGramAnalyzer(min_n=3, max_n=6)

    # Accented French sentence: accents are expected to be stripped by the
    # analyzer's preprocessing (e.g. 'tres' below).
    french = u"J'ai mang\xe9 du kangourou ce midi, c'\xe9tait pas tr\xeas bon"
    french_ngrams = analyzer.analyze(french)
    assert_equal(french_ngrams[:5],
                 [u"j'a", u"'ai", u'ai ', u'i m', u' ma'])
    assert_equal(french_ngrams[-5:],
                 [u's tres', u' tres ', u'tres b', u'res bo', u'es bon'])

    # English sentence with embedded newlines and tabs: whitespace runs are
    # expected to be normalized to single spaces.
    english = "This \n\tis a test, really.\n\n I met Harry yesterday"
    english_ngrams = analyzer.analyze(english)
    assert_equal(english_ngrams[:5],
                 [u'thi', u'his', u'is ', u's i', u' is'])
    assert_equal(english_ngrams[-5:],
                 [u' yeste', u'yester', u'esterd', u'sterda', u'terday'])
def test_vectorizer_max_df():
    """Terms whose document frequency exceeds ``max_df`` are dropped from
    the fitted vocabulary; all other terms are kept."""
    corpus = [u'abc', u'dea']  # the letter a occurs in all strings

    # With max_df=1.0 nothing is filtered: 5 distinct letters survive.
    vectorizer = CountVectorizer(CharNGramAnalyzer(min_n=1, max_n=1),
                                 max_df=1.0)
    vectorizer.fit(corpus)
    assert u'a' in vectorizer.vocabulary.keys()
    assert_equals(len(vectorizer.vocabulary.keys()), 5)

    # Lowering max_df to 0.5 filters 'a' (present in 100% of documents).
    vectorizer.max_df = 0.5
    vectorizer.fit(corpus)
    assert u'a' not in vectorizer.vocabulary.keys()  # 'a' is ignored
    assert_equals(len(vectorizer.vocabulary.keys()), 4)  # the others remain
def get_clf(n=3, binarize=True):
    """Build a character n-gram classification pipeline.

    Parameters
    ----------
    n : int, default 3
        Maximum character n-gram length (minimum is always 1).
    binarize : bool, default True
        If True, binarize the counts and use a Bernoulli naive Bayes
        classifier; otherwise feed raw counts to a multinomial one.
    """
    vectorizer = CountVectorizer(
        CharNGramAnalyzer(min_n=1, max_n=n,
                          preprocessor=SimplePreprocessor()))
    steps = [('vectorizer', vectorizer)]
    if binarize:
        steps.append(('binarizer', Binarizer(copy=False)))
        steps.append(('clf', naive_bayes.BernoulliNB()))
    else:
        steps.append(('clf', naive_bayes.MultinomialNB()))
    return Pipeline(steps)
dataset = load_files(languages_data_folder)

# Split the dataset into training and test halves.
n_samples_total = dataset.filenames.shape[0]
# Floor division: slice indices must be integers (plain `/` would produce a
# float under Python 3 and crash the slicing below).
half = n_samples_total // 2


def _read_file(path):
    """Read a whole file, closing the handle promptly instead of leaking it."""
    with open(path) as f:
        return f.read()


docs_train = [_read_file(f) for f in dataset.filenames[:half]]
docs_test = [_read_file(f) for f in dataset.filenames[half:]]

y_train = dataset.target[:half]
y_test = dataset.target[half:]

# Build an analyzer that splits strings into sequences of 1 to 3 characters
# after applying the lower-casing preprocessor
analyzer = CharNGramAnalyzer(
    min_n=1,
    max_n=3,
    preprocessor=LowerCasePreprocessor(),
)

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)