Example #1
def test_dense_sparse_idf_sanity():
    hv = HashingVectorizer(dim=100, probes=3)
    shv = SparseHashingVectorizer(dim=100, probes=3)

    hv.vectorize(JUNK_FOOD_DOCS)
    shv.vectorize(JUNK_FOOD_DOCS)

    # check that the running TF-IDF estimates are the same
    dense_tf_idf = hv.get_tfidf()
    sparse_tfidf = shv.get_tfidf().todense()

    assert_array_almost_equal(dense_tf_idf, sparse_tfidf)

    # check that the incremental behaviour stays the same
    hv.vectorize(NOTJUNK_FOOD_DOCS)
    shv.vectorize(NOTJUNK_FOOD_DOCS)

    dense_tf_idf = hv.get_tfidf()
    sparse_tfidf = shv.get_tfidf().todense()

    assert_array_almost_equal(dense_tf_idf, sparse_tfidf)
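
The test above exercises the feature-hashing trick: each token is mapped through several hash functions ("probes") into a fixed-width vector of size dim, so no vocabulary needs to be stored. As an illustration of the idea only (this is not the original class, and the exact hashing scheme is an assumption), a minimal self-contained sketch:

import numpy as np
from hashlib import md5

def hash_vectorize(docs, dim=100, probes=3):
    # Illustration only: the real HashingVectorizer's hash scheme may differ.
    # Accumulate token counts into a fixed-width array; each token
    # increments `probes` buckets chosen by differently salted hashes.
    X = np.zeros((len(docs), dim))
    for i, doc in enumerate(docs):
        for token in doc.lower().split():
            for probe in range(probes):
                h = md5(f"{probe}:{token}".encode()).hexdigest()
                X[i, int(h, 16) % dim] += 1.0
    return X

print(hash_vectorize(["sweet sweet sugar", "salty salt"], dim=16, probes=2))
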
Example #2
def test_dense_tf_idf():
    hv = HashingVectorizer(dim=1000, probes=3)
    hv.vectorize(JUNK_FOOD_DOCS)
    hv.vectorize(NOTJUNK_FOOD_DOCS)

    # extract the TF-IDF data
    X = hv.get_tfidf()
    assert_equal(X.shape, (11, 1000))

    # label junk food as -1, the others as +1
    y = np.ones(X.shape[0])
    y[:6] = -1

    # train and test a classifier
    clf = DenseLinearSVC().fit(X[1:-1], y[1:-1])
    assert_equal(clf.predict([X[0]]), [-1])
    assert_equal(clf.predict([X[-1]]), [1])
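
get_tfidf() converts the running term counts into TF-IDF weights before the classifier is trained. The examples never show that computation, so as a reminder here is the textbook reweighting in plain NumPy (the original implementation may use different smoothing; this variant is an assumption):

import numpy as np

def textbook_tfidf(counts):
    # Textbook variant; the original get_tfidf() may smooth differently.
    # counts: dense (n_docs, n_terms) term-count matrix.
    tf = counts / np.maximum(counts.sum(axis=1, keepdims=True), 1)
    df = (counts > 0).sum(axis=0)              # document frequency per term
    idf = np.log(counts.shape[0] / np.maximum(df, 1))
    return tf * idf

counts = np.array([[2., 0., 1.],
                   [0., 1., 1.]])
print(textbook_tfidf(counts))
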
Example #3
def _load_document_classification(dataset_path, metadata, set_, sparse, **kw):
    """Loader implementation for the DocumentClassification format"""
    target = []
    target_names = {}
    filenames = []
    vectorizer = kw.get('vectorizer')
    if vectorizer is None:
        if sparse:
            vectorizer = SparseHashingVectorizer()
        else:
            vectorizer = HashingVectorizer()

    # TODO: make it possible to plug in a several-pass system to filter out
    # tokens that occur in more than 30% of the documents, for instance.

    # TODO: use joblib.Parallel or multiprocessing to parallelize the following
    # (provided this is not IO bound)

    dataset_path = os.path.join(dataset_path, set_)
    folders = [
        f for f in sorted(os.listdir(dataset_path))
        if os.path.isdir(os.path.join(dataset_path, f))
    ]
    for label, folder in enumerate(folders):
        target_names[label] = folder
        folder_path = os.path.join(dataset_path, folder)
        documents = [
            os.path.join(folder_path, d)
            for d in sorted(os.listdir(folder_path))
        ]
        vectorizer.vectorize_files(documents)
        target.extend(len(documents) * [label])
        filenames.extend(documents)

    return Bunch(data=vectorizer.get_vectors(),
                 target=np.array(target),
                 target_names=target_names,
                 filenames=filenames,
                 DESCR=metadata.get('description'))
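
A usage sketch for the loader: it expects one sub-folder per class under dataset_path/set_, assigns integer labels in sorted folder order, and returns a Bunch. The layout and file contents below are made up for illustration, and the call assumes the loader's own dependencies (HashingVectorizer, Bunch) are importable:

import os
import tempfile

# Illustrative layout; any on-disk corpus with this structure works:
#   <root>/train/<class_name>/<document files>
root = tempfile.mkdtemp()
for klass, text in [('junk', 'pizza burger coke'), ('notjunk', 'salad kale')]:
    folder = os.path.join(root, 'train', klass)
    os.makedirs(folder)
    with open(os.path.join(folder, 'doc1.txt'), 'w') as f:
        f.write(text)

data = _load_document_classification(root, {'description': 'toy corpus'},
                                     'train', sparse=False)
print(data.target_names)   # {0: 'junk', 1: 'notjunk'}
print(data.target)         # [0 1]
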
Example #4
def test_dense_tf_idf():
    hv = HashingVectorizer(dim=1000, probes=3)
    hv.vectorize(JUNK_FOOD_DOCS)
    hv.vectorize(NOTJUNK_FOOD_DOCS)

    # extract the TF-IDF data
    X = hv.get_tfidf()
    assert_equal(X.shape, (11, 1000))

    # label junk food as -1, the others as +1
    y = np.ones(X.shape[0])
    y[:6] = -1

    # train and test a classifier
    clf = LogisticRegression().fit(X[1:-1], y[1:-1])
    assert_equal(clf.predict([X[0]]), [-1])
    assert_equal(clf.predict([X[-1]]), [1])
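
The dim/probes constructor used throughout these examples comes from an early scikit-learn; on current scikit-learn the same end-to-end flow (hashed counts, TF-IDF reweighting, a linear classifier) would look roughly like the sketch below. The modern parameter names (n_features, alternate_sign) and the toy documents are assumptions standing in for JUNK_FOOD_DOCS / NOTJUNK_FOOD_DOCS:

import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

# Assumes the current sklearn API; toy docs stand in for the originals.
docs = ["pizza burger coke", "pizza chips soda",
        "salad kale quinoa", "salad broccoli kale"]
y = np.array([-1, -1, 1, 1])

# Hash non-negative token counts into 1000 buckets, then reweight.
counts = HashingVectorizer(n_features=1000, alternate_sign=False,
                           norm=None).transform(docs)
X = TfidfTransformer().fit_transform(counts)

# Train on the middle two documents, predict the held-out ones; they
# share tokens with their class, so [-1] and [1] are expected.
clf = LogisticRegression().fit(X[1:-1], y[1:-1])
print(clf.predict(X[:1]), clf.predict(X[-1:]))
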