Example #1
def lda():
    dictionary = load_dictionary()
    documents = load_documents(dictionary=dictionary)
    matrix = document_matrix(documents)
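    # Fit a 16-topic LDA model with a fixed seed so runs are reproducible.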
    model = LDA(n_topics=16, learning_method='batch', random_state=SEED)
    model.fit(matrix)
    for topic in top_words(model, dictionary, 5):
        print_row([', '.join(topic)])
Example #2
def tfidf_threshold(minimum, path):
    dictionary = load_dictionary()
    documents = load_documents('train', dictionary=dictionary)
    word_freq = get_word_freq(documents)
    below_threshold = word_freq < minimum

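    # Zero out the below-threshold dimensions in every document vector.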
    for doc in documents:
        doc.vector[below_threshold] = 0

    write_documents(documents, path)
Example #3
def explore():
    dictionary = load_dictionary()
    documents = load_documents(dictionary=dictionary)
    # word_freq holds, for each word, the number of documents that contain it.
    word_freq = get_word_freq(documents)

    def print_words(indexes):
        for i in indexes:
            # Raw string keeps the literal \% (apparently LaTeX escaping)
            # without Python's invalid-escape warning.
            print_row([dictionary[i], word_freq[i],
                       r'{:.5}\%'.format(100 * word_freq[i] / len(documents))])

    n = 10
    word_freq_sort_index = np.argsort(word_freq)

    print('least common words')
    print_words(word_freq_sort_index[:n])
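    # The reverse slice [:-n-1:-1] takes the last n entries in descending order.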
    print('most common words')
    print_words(word_freq_sort_index[:-n-1:-1])
    print('median common words')
    mid = len(word_freq_sort_index) // 2
    print_words(word_freq_sort_index[mid-(n//2):mid+(n//2)])
    print("median word frequency: {}".format(np.median(word_freq)))
Example #4
import os

from download import download_wikipedia_abstracts
from load import load_documents
from search.timing import timing
from search.index import Index


@timing
def index_documents(documents, index):
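    # Feed each document into the inverted index, printing a progress
    # line every 5000 documents.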
    for i, document in enumerate(documents):
        index.index_document(document)
        if i % 5000 == 0:
            print(f'Indexed {i} documents', end='\r')
    return index


if __name__ == '__main__':
    # this will only download the xml dump if you don't have a copy already;
    # just delete the file if you want a fresh copy
    if not os.path.exists('data/enwiki.latest-abstract.xml.gz'):
        download_wikipedia_abstracts()

    index = index_documents(load_documents(), Index())
    print(f'Index contains {len(index.documents)} documents')

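    # Compare boolean AND vs OR matching, with and without ranking.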
    index.search('London Beer Flood', search_type='AND')
    index.search('London Beer Flood', search_type='OR')
    index.search('London Beer Flood', search_type='AND', rank=True)
    index.search('London Beer Flood', search_type='OR', rank=True)
Example #5
import matplotlib.pyplot as plt

from load import load_documents
from k_means import k_means_plus_plus, cluster, random_documents


def distortion(clusters):
    return sum(cluster.distortion() for cluster in clusters)


def distortion_for_seed(seed):
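    # Initialise with k-means++; the commented-out line below swaps in
    # uniformly random documents as starting centroids for comparison.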
    centroids = k_means_plus_plus(16, documents, seed=seed)
    # centroids = random_documents(16, documents, seed=seed)
    clusters = cluster(16, documents, centroids=centroids)
    return distortion(clusters)


documents = load_documents('train100')
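# Run the clustering once per seed and record each run's total distortion.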
distortions = [distortion_for_seed(i) for i in range(100)]

plt.title('Distortion for Different Random Seeds')
plt.xlabel('Distortion')
plt.ylabel('Number of Runs')
plt.hist(distortions, bins=20)
plt.tight_layout()
plt.savefig('writeup/images/different_init.png')
plt.show()
Example #6
from load import load_documents, write_documents

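# Round-trip the corpus: load every document and write it straight back out.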
docs = load_documents()
write_documents(docs, 'asd')