# Assumed import: LDA here matches sklearn's LatentDirichletAllocation
# (learning_method and random_state are its parameters); the remaining
# helpers are project-level.
from sklearn.decomposition import LatentDirichletAllocation as LDA


def lda():
    dictionary = load_dictionary()
    documents = load_documents(dictionary=dictionary)
    matrix = document_matrix(documents)
    # sklearn renamed n_topics to n_components: 16 topics, batch variational
    # inference, fixed seed for reproducibility.
    lda = LDA(n_components=16, learning_method='batch', random_state=SEED)
    lda.fit(matrix)
    for topic in top_words(lda, dictionary, 5):
        print_row([', '.join(topic)])
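# A minimal sketch of the top_words helper called above; an assumption, not
# the original implementation. It reads the topic-word weights from a fitted
# LatentDirichletAllocation's components_ attribute and yields the n
# highest-weighted words per topic via the dictionary's index-to-word mapping.
def top_words(lda, dictionary, n):
    for component in lda.components_:
        top_indexes = component.argsort()[:-n - 1:-1]
        yield [dictionary[i] for i in top_indexes]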
def tfidf_threshold(minimum, path):
    dictionary = load_dictionary()
    documents = load_documents('train', dictionary=dictionary)
    word_freq = get_word_freq(documents)
    # Zero out every word that occurs in fewer than `minimum` documents.
    below_threshold = word_freq < minimum
    for doc in documents:
        doc.vector[below_threshold] = 0
    write_documents(documents, path)
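# A hedged sketch of get_word_freq, inferred from its use here and in
# explore() below: it returns, per word index, the number of documents whose
# vector has a nonzero entry for that word. It assumes doc.vector is a dense
# numpy array; the real helper may differ.
import numpy as np


def get_word_freq(documents):
    counts = np.zeros(len(documents[0].vector), dtype=int)
    for doc in documents:
        # Count each word at most once per document (document frequency).
        counts += doc.vector > 0
    return counts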
def explore():
    dictionary = load_dictionary()
    documents = load_documents(dictionary=dictionary)
    # word_freq[i] is the number of documents that contain word i.
    word_freq = get_word_freq(documents)

    def print_words(indexes):
        for i in indexes:
            print_row([dictionary[i], word_freq[i],
                       '{:.5}%'.format(100 * word_freq[i] / len(documents))])

    n = 10
    word_freq_sort_index = np.argsort(word_freq)
    print('least common words')
    print_words(word_freq_sort_index[:n])
    print('most common words')
    print_words(word_freq_sort_index[:-n - 1:-1])
    print('median common words')
    l = len(word_freq_sort_index) // 2
    print_words(word_freq_sort_index[l - (n // 2):l + (n // 2)])
    print('median word frequency: {}'.format(np.median(word_freq)))
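# print_row (also used in lda() above) is assumed to be a small row
# formatter; a minimal sketch consistent with how it is called:
def print_row(cells):
    print('\t'.join(str(cell) for cell in cells))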
import os

from download import download_wikipedia_abstracts
from load import load_documents
from search.timing import timing
from search.index import Index


@timing
def index_documents(documents, index):
    for i, document in enumerate(documents):
        index.index_document(document)
        if i % 5000 == 0:
            print(f'Indexed {i} documents', end='\r')
    return index


if __name__ == '__main__':
    # this will only download the xml dump if you don't have a copy already;
    # just delete the file if you want a fresh copy
    if not os.path.exists('data/enwiki.latest-abstract.xml.gz'):
        download_wikipedia_abstracts()

    index = index_documents(load_documents(), Index())
    print(f'Index contains {len(index.documents)} documents')

    index.search('London Beer Flood', search_type='AND')
    index.search('London Beer Flood', search_type='OR')
    index.search('London Beer Flood', search_type='AND', rank=True)
    index.search('London Beer Flood', search_type='OR', rank=True)
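# A minimal sketch of the timing decorator imported from search.timing,
# assuming it simply reports the wrapped function's wall-clock runtime; the
# real module may format its output differently.
import time
from functools import wraps


def timing(func):
    @wraps(func)
    def timed(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        print(f'{func.__name__} took {time.time() - start:.3f} seconds')
        return result
    return timed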
import matplotlib.pyplot as plt

from load import load_documents
from k_means import k_means_plus_plus, cluster, random_documents


def distortion(clusters):
    # Total within-cluster distortion across all clusters.
    return sum(cluster.distortion() for cluster in clusters)


def distortion_for_seed(seed):
    centroids = k_means_plus_plus(16, documents, seed=seed)
    # Alternative: uniformly random initial centroids.
    # centroids = random_documents(16, documents, seed=seed)
    clusters = cluster(16, documents, centroids=centroids)
    return distortion(clusters)


documents = load_documents('train100')
distortions = [distortion_for_seed(i) for i in range(100)]

# Histogram of final distortion over 100 different random initialisations.
plt.title('Distortion for Different Random Seeds')
plt.xlabel('Distortion')
plt.hist(distortions, bins=20)
plt.tight_layout()
plt.savefig('writeup/images/different_init.png')
plt.show()
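# A hedged sketch of k-means++ seeding as it is commonly implemented,
# assuming each document exposes a dense numpy vector (doc.vector); the
# imported k_means_plus_plus may differ in detail. Each new centroid is
# sampled with probability proportional to its squared distance from the
# nearest centroid chosen so far.
import numpy as np


def k_means_plus_plus_sketch(k, documents, seed=0):
    rng = np.random.default_rng(seed)
    vectors = np.stack([doc.vector for doc in documents])
    # First centroid: a uniformly random point.
    centroids = [vectors[rng.integers(len(vectors))]]
    while len(centroids) < k:
        # Squared distance from each point to its nearest chosen centroid.
        dists = np.min(
            [np.sum((vectors - c) ** 2, axis=1) for c in centroids], axis=0)
        centroids.append(
            vectors[rng.choice(len(vectors), p=dists / dists.sum())])
    return centroids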
from load import load_documents, write_documents

docs = load_documents()
write_documents(docs, 'asd')