def test_dense_tf_idf():
    hv = HashingVectorizer(dim=1000, probes=3)
    # hash both document collections into the same running vectorizer
    hv.vectorize(JUNK_FOOD_DOCS)
    hv.vectorize(NOTJUNK_FOOD_DOCS)

    # extract the TF-IDF data
    X = hv.get_tfidf()
    assert_equal(X.shape, (11, 1000))

    # label junk food as -1, the others as +1
    y = np.ones(X.shape[0])
    y[:6] = -1

    # train on all but the first and last document, then check the held-out ones
    clf = DenseLinearSVC().fit(X[1:-1], y[1:-1])
    assert_equal(clf.predict([X[0]]), [-1])
    assert_equal(clf.predict([X[-1]]), [1])
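The test above exercises the hashing trick: each token is mapped into a fixed number of buckets (dim) by several hash functions (probes), so no vocabulary has to be kept in memory and collisions are smoothed out. The helper below is a minimal, self-contained sketch of that idea; the name hashed_term_counts and the use of Python's built-in hash are illustrative assumptions, not the actual HashingVectorizer implementation.

import numpy as np

def hashed_term_counts(docs, dim=1000, probes=3):
    # sketch only: the real vectorizer may tokenize, weight and hash differently
    counts = np.zeros((len(docs), dim))
    for i, doc in enumerate(docs):
        for token in doc.lower().split():
            for probe in range(probes):
                # derive `probes` distinct bucket indices per token
                bucket = hash((token, probe)) % dim
                counts[i, bucket] += 1.0 / probes
    return counts

docs = ["the pizza burger beer", "the quick brown fox"]
print(hashed_term_counts(docs, dim=16, probes=2).shape)  # (2, 16)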
Example #2
def test_dense_sparse_idf_sanity():
    hv = HashingVectorizer(dim=100, probes=3)
    shv = SparseHashingVectorizer(dim=100, probes=3)
    hv.vectorize(JUNK_FOOD_DOCS)
    shv.vectorize(JUNK_FOOD_DOCS)

    # check that the running TF-IDF estimates are the same
    dense_tf_idf = hv.get_tfidf()
    sparse_tf_idf = shv.get_tfidf().todense()
    assert_array_almost_equal(dense_tf_idf, sparse_tf_idf)

    # check that the incremental behaviour stays the same
    hv.vectorize(NOTJUNK_FOOD_DOCS)
    shv.vectorize(NOTJUNK_FOOD_DOCS)
    dense_tf_idf = hv.get_tfidf()
    sparse_tf_idf = shv.get_tfidf().todense()
    assert_array_almost_equal(dense_tf_idf, sparse_tf_idf)
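This sanity check only holds if the dense and sparse code paths compute numerically identical TF-IDF matrices. The snippet below illustrates the same kind of dense-versus-sparse comparison with plain NumPy and scipy.sparse; the simplified tfidf_dense/tfidf_sparse helpers are assumptions for illustration, not the vectorizers' actual weighting.

import numpy as np
import scipy.sparse as sp
from numpy.testing import assert_array_almost_equal

def tfidf_dense(counts):
    # simplified weighting: tf * log(n_docs / df)
    n_docs = counts.shape[0]
    df = (counts > 0).sum(axis=0)
    return counts * np.log(float(n_docs) / np.maximum(df, 1))

def tfidf_sparse(counts):
    # same computation routed through scipy.sparse: scale columns by their idf
    n_docs = counts.shape[0]
    df = (counts > 0).sum(axis=0)
    idf = np.log(float(n_docs) / np.maximum(df, 1))
    return sp.csr_matrix(counts) * sp.diags(idf)

counts = np.array([[2., 0., 1.],
                   [0., 1., 1.],
                   [1., 1., 0.]])
assert_array_almost_equal(tfidf_dense(counts), tfidf_sparse(counts).toarray())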
Example #3
def _load_document_classification(dataset_path, metadata, set_, sparse, **kw):
    """Loader implementation for the DocumentClassification format"""
    target = []
    target_names = {}
    filenames = []
    vectorizer = kw.get('vectorizer')
    if vectorizer is None:
        if sparse:
            vectorizer = SparseHashingVectorizer()
        else:
            vectorizer = HashingVectorizer()

    # TODO: make it possible to plug in a multi-pass system to filter out
    # tokens that occur in more than 30% of the documents, for instance.

    # TODO: use joblib.Parallel or multiprocessing to parallelize the
    # following loop (provided it is not IO-bound).

    dataset_path = os.path.join(dataset_path, set_)
    folders = [
        f for f in sorted(os.listdir(dataset_path))
        if os.path.isdir(os.path.join(dataset_path, f))
    ]
    for label, folder in enumerate(folders):
        target_names[label] = folder
        folder_path = os.path.join(dataset_path, folder)
        documents = [
            os.path.join(folder_path, d)
            for d in sorted(os.listdir(folder_path))
        ]
        vectorizer.vectorize_files(documents)
        target.extend(len(documents) * [label])
        filenames.extend(documents)

    return Bunch(data=vectorizer.get_vectors(),
                 target=np.array(target),
                 target_names=target_names,
                 filenames=filenames,
                 DESCR=metadata.get('description'))
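For reference, the loader expects a <dataset_path>/<set_>/<category>/<document> layout with one sub-folder per class. The sketch below builds a toy layout of that shape and replicates the label-assignment loop above; the build_toy_layout helper and the file names are made up for illustration, and the real loader additionally runs the vectorizer over each folder.

import os
import tempfile

def build_toy_layout(root):
    # two categories with two documents each, mirroring the expected layout
    for category, texts in [('junk', ['pizza beer', 'burger coke']),
                            ('notjunk', ['salad water', 'soup tea'])]:
        folder = os.path.join(root, 'train', category)
        os.makedirs(folder)
        for i, text in enumerate(texts):
            with open(os.path.join(folder, 'doc%d.txt' % i), 'w') as f:
                f.write(text)

root = tempfile.mkdtemp()
build_toy_layout(root)

# replicate the label assignment performed by the loader
dataset_path = os.path.join(root, 'train')
folders = sorted(f for f in os.listdir(dataset_path)
                 if os.path.isdir(os.path.join(dataset_path, f)))
target, target_names = [], {}
for label, folder in enumerate(folders):
    target_names[label] = folder
    documents = sorted(os.listdir(os.path.join(dataset_path, folder)))
    target.extend(len(documents) * [label])

print(target_names)  # {0: 'junk', 1: 'notjunk'}
print(target)        # [0, 0, 1, 1]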