def test_dense_tf_idf():
    hv = HashingVectorizer(dim=1000, probes=3)
    hv.vectorize(JUNK_FOOD_DOCS)
    hv.vectorize(NOTJUNK_FOOD_DOCS)

    # extract the TF-IDF data
    X = hv.get_tfidf()
    assert_equal(X.shape, (11, 1000))

    # label junk food as -1, the others as +1
    y = np.ones(X.shape[0])
    y[:6] = -1

    # train and test a classifier
    clf = DenseLinearSVC().fit(X[1:-1], y[1:-1])
    assert_equal(clf.predict([X[0]]), [-1])
    assert_equal(clf.predict([X[-1]]), [1])
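
# A minimal sketch of the "hashing trick" with multiple probes, to illustrate
# what the ``dim`` and ``probes`` parameters above control. This is a
# simplified stand-in, not HashingVectorizer's actual hashing or weighting
# scheme: the salted ``hash`` calls and the plain term-count accumulation are
# assumptions made for illustration only.
def _hashed_term_counts_sketch(tokens, dim=1000, probes=3):
    counts = np.zeros(dim)
    for token in tokens:
        # each token increments ``probes`` buckets chosen by salted hashes,
        # which spreads out collisions in the fixed-size vector
        for probe in range(probes):
            counts[hash((token, probe)) % dim] += 1.0
    return counts
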
def test_dense_sparse_idf_sanity():
    hv = HashingVectorizer(dim=100, probes=3)
    shv = SparseHashingVectorizer(dim=100, probes=3)

    hv.vectorize(JUNK_FOOD_DOCS)
    shv.vectorize(JUNK_FOOD_DOCS)

    # check that running TF-IDF estimates are the same
    dense_tf_idf = hv.get_tfidf()
    sparse_tfidf = shv.get_tfidf().todense()
    assert_array_almost_equal(dense_tf_idf, sparse_tfidf)

    # check that incremental behaviour stays the same
    hv.vectorize(NOTJUNK_FOOD_DOCS)
    shv.vectorize(NOTJUNK_FOOD_DOCS)

    dense_tf_idf = hv.get_tfidf()
    sparse_tfidf = shv.get_tfidf().todense()
    assert_array_almost_equal(dense_tf_idf, sparse_tfidf)
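
# The ``.todense()`` calls above exist because assert_array_almost_equal
# compares array-like values elementwise. A standalone sketch of the same
# dense-vs-sparse comparison pattern; the scipy lil_matrix used here is an
# assumption for illustration, not what SparseHashingVectorizer uses
# internally.
def _dense_sparse_comparison_sketch():
    import scipy.sparse as sp
    dense = np.array([[0.0, 1.5, 0.0],
                      [2.0, 0.0, 0.5]])
    sparse = sp.lil_matrix(dense.shape)
    sparse[0, 1] = 1.5
    sparse[1, 0] = 2.0
    sparse[1, 2] = 0.5
    # densify before comparing, as done in the test above
    assert_array_almost_equal(dense, sparse.todense())
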
def _load_document_classification(dataset_path, metadata, set_, sparse, **kw):
    """Loader implementation for the DocumentClassification format"""
    target = []
    target_names = {}
    filenames = []

    vectorizer = kw.get('vectorizer')
    if vectorizer is None:
        if sparse:
            vectorizer = SparseHashingVectorizer()
        else:
            vectorizer = HashingVectorizer()

    # TODO: make it possible to plug in a multi-pass system to filter out
    # tokens that occur in more than 30% of the documents, for instance.

    # TODO: use joblib.Parallel or multiprocessing to parallelize the
    # following (provided it is not IO bound)

    dataset_path = os.path.join(dataset_path, set_)
    folders = [f for f in sorted(os.listdir(dataset_path))
               if os.path.isdir(os.path.join(dataset_path, f))]
    for label, folder in enumerate(folders):
        target_names[label] = folder
        folder_path = os.path.join(dataset_path, folder)
        documents = [os.path.join(folder_path, d)
                     for d in sorted(os.listdir(folder_path))]
        vectorizer.vectorize_files(documents)
        target.extend(len(documents) * [label])
        filenames.extend(documents)

    return Bunch(data=vectorizer.get_vectors(),
                 target=np.array(target),
                 target_names=target_names,
                 filenames=filenames,
                 DESCR=metadata.get('description'))
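
# Usage sketch for the loader above, assuming the DocumentClassification
# layout of one sub-folder per category inside each set, e.g.
# ``some_dataset/train/<category>/<document files>``. The dataset path, set
# name and metadata dict below are hypothetical values for illustration only.
def _load_document_classification_example(dataset_path='/path/to/some_dataset'):
    metadata = {'description': 'hypothetical demo corpus'}
    dataset = _load_document_classification(dataset_path, metadata, 'train',
                                             sparse=True)
    # dataset.data holds one vector per document, dataset.target the
    # integer category labels (folder names are in dataset.target_names)
    return dataset.data, dataset.target
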