def tf_idf(tag_matrix):
    """Apply a TF-IDF weighting to *tag_matrix* and return the dense result.

    The transformer is fitted on *tag_matrix* itself, so the IDF statistics
    come from the same data that is being transformed.
    """
    # First positional argument (the norm) is deliberately left as None,
    # matching the original call signature of this old TfidfTransformer API.
    transformer = TfidfTransformer(None, use_idf=True)
    transformer.fit(tag_matrix)
    weighted = transformer.transform(tag_matrix)
    return weighted.todense()
def test_pickle():
    """Every vectorizer/transformer estimator must survive a pickle round-trip."""
    estimators = (
        CountVectorizer(),
        SparseCountVectorizer(),
        TfidfTransformer(),
        SparseTfidfTransformer(),
        Vectorizer(),
        SparseVectorizer(),
    )
    for estimator in estimators:
        payload = pickle.dumps(estimator)
        restored = pickle.loads(payload)
        # the unpickled object must come back as the exact same class
        assert_equal(type(restored), estimator.__class__)
def test_dense_vectorizer():
    """Exercise the dense term-count / TF-IDF pipeline on the junk-food corpus."""
    analyzer = WordNGramAnalyzer()
    train_data = [analyzer.analyze(doc) for doc in JUNK_FOOD_DOCS[:-1]]
    test_data = [analyzer.analyze(JUNK_FOOD_DOCS[-1])]

    # vectorize without a pre-built vocabulary: it is learned from the data
    counter = TermCountVectorizer()
    counts_train = counter.transform(train_data)
    assert_equal(counts_train[0, counter.vocabulary["pizza"]], 2)

    # a second vectorizer seeded with the vocabulary learned above
    counter_with_vocab = TermCountVectorizer(vocabulary=counter.vocabulary)

    # both vectorizers must agree on the test sample
    for vectorizer in (counter, counter_with_vocab):
        counts_test = vectorizer.transform(test_data)
        assert_equal(counts_test[0, vectorizer.vocabulary["coke"]], 1)
    # NOTE: counts_test intentionally leaks out of the loop and is reused below

    # TF-IDF weighting of the training counts
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit(counts_train).transform(counts_train)
    assert_equal(len(tfidf_transformer.idf), len(counter.vocabulary))
    assert_equal(tfidf.shape, (len(train_data), len(counter.vocabulary)))

    # TF-IDF weighting of unseen data
    tfidf_test = tfidf_transformer.transform(counts_test)
    assert_equal(tfidf_test.shape, (len(test_data), len(counter.vocabulary)))

    # term frequencies alone: no IDF vector, rows L1-normalized to 1.0
    tf_only = TfidfTransformer(use_idf=False)
    tf = tf_only.fit(counts_train).transform(counts_train)
    assert_equal(tf_only.idf, None)
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * len(train_data))

    # the one-shot TF-IDF vectorizer must match count-vectorizer + transformer
    direct_vectorizer = TfidfVectorizer()
    tfidf2 = direct_vectorizer.fit(train_data).transform(train_data)
    assert_array_almost_equal(tfidf, tfidf2)

    # ... and also on unseen data
    tfidf_test2 = direct_vectorizer.transform(test_data)
    assert_array_almost_equal(tfidf_test, tfidf_test2)
def test_vectorizer():
    """End-to-end check of CountVectorizer + TfidfTransformer + Vectorizer.

    Returns the computed tf-idf matrix and idf vector so callers can compare
    the dense and sparse variants of the pipeline.

    NOTE(review): another ``test_vectorizer`` definition appears later in this
    file; if both live in the same module, the later one shadows this one.
    """
    # results to be compared by the caller
    res = []

    # raw documents fed as an iterator to exercise streaming input
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # fit without a vocabulary: it is learned, with DF-based stop-word removal
    fitted_vec = CountVectorizer(max_df=0.5)
    counts_train = fitted_vec.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, fitted_vec.vocabulary[u"pizza"]], 2)

    # a second vectorizer sharing the vocabulary fitted above
    fixed_vec = CountVectorizer(vocabulary=fitted_vec.vocabulary)

    # both vectorizers must produce identical output on the test sample
    for vec in (fitted_vec, fixed_vec):
        counts_test = vec.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()
        assert_equal(counts_test[0, vec.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, vec.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, vec.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in vec.vocabulary)

        # stop word found automatically by the vectorizer DF thresholding:
        # words highly frequent across the whole corpus are likely
        # uninformative (real stop words or extraction artifacts)
        assert_false(u"copyright" in vec.vocabulary)

        # in the vocabulary but absent from this particular sample
        assert_equal(counts_test[0, vec.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, vec.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, vec.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, vec.vocabulary[u"pizza"]], 0)
    # NOTE: counts_test intentionally leaks out of the loop and is reused below

    # tf-idf weighting of the training counts
    idf_tr = TfidfTransformer(norm='l1')
    tfidf = toarray(idf_tr.fit(counts_train).transform(counts_train))
    assert_equal(len(idf_tr.idf_), len(fitted_vec.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(fitted_vec.vocabulary)))
    res.append(tfidf)
    res.append(idf_tr.idf_)

    # tf-idf weighting of unseen data
    tfidf_test = toarray(idf_tr.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(fitted_vec.vocabulary)))

    # term frequencies alone: no idf_ vector, rows L1-normalized to 1.0
    tf_tr = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(tf_tr.fit(counts_train).transform(counts_train))
    assert_equal(tf_tr.idf_, None)
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # the one-shot tfidf vectorizer must match count vectorizer + transformer
    train_data = iter(ALL_FOOD_DOCS[:-1])
    direct = Vectorizer(norm='l1')
    direct.tc.max_df = fitted_vec.max_df
    tfidf2 = toarray(direct.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # ... and also on unseen data
    tfidf_test2 = toarray(direct.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    return res
def test_vectorizer():
    """End-to-end check of CountVectorizer + TfidfTransformer + Vectorizer.

    Also checks that transforming with an unfitted (empty) vocabulary raises.

    NOTE(review): another ``test_vectorizer`` definition appears earlier in
    this file; if both live in the same module, this one shadows it.
    """
    # raw documents fed as an iterator to exercise streaming input
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # fit without a vocabulary: it is learned, with DF-based stop-word removal
    learned = CountVectorizer(max_df=0.5)
    counts_train = learned.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, learned.vocabulary[u"pizza"]], 2)

    # a second vectorizer reusing the vocabulary fitted above
    reused = CountVectorizer(vocabulary=learned.vocabulary)

    # both vectorizers must produce identical output on the test sample
    for vectorizer in (learned, reused):
        counts_test = vectorizer.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()
        assert_equal(counts_test[0, vectorizer.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, vectorizer.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, vectorizer.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in vectorizer.vocabulary)

        # stop word found automatically by the vectorizer DF thresholding:
        # words highly frequent across the whole corpus are likely
        # uninformative (real stop words or extraction artifacts)
        assert_false(u"copyright" in vectorizer.vocabulary)

        # in the vocabulary but absent from this particular sample
        assert_equal(counts_test[0, vectorizer.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, vectorizer.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, vectorizer.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, vectorizer.vocabulary[u"pizza"]], 0)
    # NOTE: counts_test intentionally leaks out of the loop and is reused below

    # tf-idf weighting of the training counts
    weighting = TfidfTransformer(norm='l1')
    tfidf = toarray(weighting.fit(counts_train).transform(counts_train))
    assert_equal(len(weighting.idf_), len(learned.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(learned.vocabulary)))

    # tf-idf weighting of unseen data
    tfidf_test = toarray(weighting.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(learned.vocabulary)))

    # term frequencies alone: no idf_ vector, rows L1-normalized to 1.0
    tf_weighting = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(tf_weighting.fit(counts_train).transform(counts_train))
    assert_equal(tf_weighting.idf_, None)
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # the one-shot tfidf vectorizer must match count vectorizer + transformer
    train_data = iter(ALL_FOOD_DOCS[:-1])
    one_shot = Vectorizer(norm='l1')
    one_shot.tc.max_df = learned.max_df
    tfidf2 = toarray(one_shot.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # ... and also on unseen data
    tfidf_test2 = toarray(one_shot.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # transforming with an empty (unfitted) vocabulary must fail loudly
    unfitted = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, unfitted.transform, train_data)
# Targets for the first-half / second-half split of the corpus.
# Use floor division so the result is a valid integer slice index on
# Python 3 as well (plain `/` yields a float there and raises TypeError).
y_train = dataset.target[:n_samples_total // 2]
y_test = dataset.target[n_samples_total // 2:]

# Build an analyzer that splits strings into sequences of 1 to 3 characters
# after applying the lower-casing preprocessor
analyzer = CharNGramAnalyzer(
    min_n=1,
    max_n=3,
    preprocessor=LowerCasePreprocessor(),
)

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report (print() call form works on both
# Python 2 and Python 3 with a single argument)
print(metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names))

# Plot the confusion matrix
# split the dataset in training and test set:
n_samples_total = dataset.filenames.shape[0]
# Use floor division so `split` is a valid integer slice index on Python 3
# as well (plain `/` yields a float there and raises TypeError).
split = (n_samples_total * 3) // 4
# NOTE(review): these comprehensions rely on refcounting to close the file
# handles; wrap in `with` (or a small helper) if handle exhaustion matters.
docs_train = [open(f).read() for f in dataset.filenames[:split]]
docs_test = [open(f).read() for f in dataset.filenames[split:]]
y_train = dataset.target[:split]
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

# Grid of parameters to explore: word n-gram size and DF-based stop-word
# thresholding of the count vectorizer
parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95,),
}

# Fit the pipeline on a subsample of the training set using grid search,
# parallelized over all available cores
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
# (`best_estimator` is the attribute name of this old API; modern
# scikit-learn spells it `best_estimator_`)
clf = grid_search.best_estimator.fit(docs_train, y_train)