# Assumed import: LDA here matches sklearn's LatentDirichletAllocation
# (learning_method and random_state are its parameters); the remaining
# helpers are project-level.
from sklearn.decomposition import LatentDirichletAllocation as LDA


def lda():
    dictionary = load_dictionary()
    documents = load_documents(dictionary=dictionary)
    matrix = document_matrix(documents)
    # sklearn renamed n_topics to n_components: 16 topics, batch variational
    # inference, fixed seed for reproducibility.
    lda = LDA(n_components=16, learning_method='batch', random_state=SEED)
    lda.fit(matrix)
    for topic in top_words(lda, dictionary, 5):
        print_row([', '.join(topic)])
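# A minimal sketch of the top_words helper called above; an assumption, not
# the original implementation. It reads the topic-word weights from a fitted
# LatentDirichletAllocation's components_ attribute and yields the n
# highest-weighted words per topic via the dictionary's index-to-word mapping.
def top_words(lda, dictionary, n):
    for component in lda.components_:
        top_indexes = component.argsort()[:-n - 1:-1]
        yield [dictionary[i] for i in top_indexes]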
def tfidf_threshold(minimum, path):
    dictionary = load_dictionary()
    documents = load_documents('train', dictionary=dictionary)
    word_freq = get_word_freq(documents)
    # Zero out every word that occurs in fewer than `minimum` documents.
    below_threshold = word_freq < minimum
    for doc in documents:
        doc.vector[below_threshold] = 0
    write_documents(documents, path)
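# A hedged sketch of get_word_freq, inferred from its use here and in
# explore() below: it returns, per word index, the number of documents whose
# vector has a nonzero entry for that word. It assumes doc.vector is a dense
# numpy array; the real helper may differ.
import numpy as np


def get_word_freq(documents):
    counts = np.zeros(len(documents[0].vector), dtype=int)
    for doc in documents:
        # Count each word at most once per document (document frequency).
        counts += doc.vector > 0
    return counts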
def explore():
    dictionary = load_dictionary()
    documents = load_documents(dictionary=dictionary)
    # word_freq[i] is the number of documents that contain word i.
    word_freq = get_word_freq(documents)

    def print_words(indexes):
        for i in indexes:
            print_row([dictionary[i], word_freq[i],
                       '{:.5}%'.format(100 * word_freq[i] / len(documents))])

    n = 10
    word_freq_sort_index = np.argsort(word_freq)
    print('least common words')
    print_words(word_freq_sort_index[:n])
    print('most common words')
    print_words(word_freq_sort_index[:-n - 1:-1])
    print('median common words')
    l = len(word_freq_sort_index) // 2
    print_words(word_freq_sort_index[l - (n // 2):l + (n // 2)])
    print('median word frequency: {}'.format(np.median(word_freq)))
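# print_row (also used in lda() above) is assumed to be a small row
# formatter; a minimal sketch consistent with how it is called:
def print_row(cells):
    print('\t'.join(str(cell) for cell in cells))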
import os

from download import download_wikipedia_abstracts
from load import load_documents
from search.timing import timing
from search.index import Index


@timing
def index_documents(documents, index):
    for i, document in enumerate(documents):
        index.index_document(document)
        if i % 5000 == 0:
            print(f'Indexed {i} documents', end='\r')
    return index


if __name__ == '__main__':
    # this will only download the xml dump if you don't have a copy already;
    # just delete the file if you want a fresh copy
    if not os.path.exists('data/enwiki.latest-abstract.xml.gz'):
        download_wikipedia_abstracts()

    index = index_documents(load_documents(), Index())
    print(f'Index contains {len(index.documents)} documents')

    index.search('London Beer Flood', search_type='AND')
    index.search('London Beer Flood', search_type='OR')
    index.search('London Beer Flood', search_type='AND', rank=True)
    index.search('London Beer Flood', search_type='OR', rank=True)
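# A minimal sketch of the timing decorator imported from search.timing,
# assuming it simply reports the wrapped function's wall-clock runtime; the
# real module may format its output differently.
import time
from functools import wraps


def timing(func):
    @wraps(func)
    def timed(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        print(f'{func.__name__} took {time.time() - start:.3f} seconds')
        return result
    return timed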
import matplotlib.pyplot as plt

from load import load_documents
from k_means import k_means_plus_plus, cluster, random_documents


def distortion(clusters):
    # Total within-cluster distortion across all clusters.
    return sum(cluster.distortion() for cluster in clusters)


def distortion_for_seed(seed):
    centroids = k_means_plus_plus(16, documents, seed=seed)
    # Alternative: uniformly random initial centroids.
    # centroids = random_documents(16, documents, seed=seed)
    clusters = cluster(16, documents, centroids=centroids)
    return distortion(clusters)


documents = load_documents('train100')
distortions = [distortion_for_seed(i) for i in range(100)]

# Histogram of final distortion over 100 different random initialisations.
plt.title('Distortion for Different Random Seeds')
plt.xlabel('Distortion')
plt.hist(distortions, bins=20)
plt.tight_layout()
plt.savefig('writeup/images/different_init.png')
plt.show()
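# A hedged sketch of k-means++ seeding as it is commonly implemented,
# assuming each document exposes a dense numpy vector (doc.vector); the
# imported k_means_plus_plus may differ in detail. Each new centroid is
# sampled with probability proportional to its squared distance from the
# nearest centroid chosen so far.
import numpy as np


def k_means_plus_plus_sketch(k, documents, seed=0):
    rng = np.random.default_rng(seed)
    vectors = np.stack([doc.vector for doc in documents])
    # First centroid: a uniformly random point.
    centroids = [vectors[rng.integers(len(vectors))]]
    while len(centroids) < k:
        # Squared distance from each point to its nearest chosen centroid.
        dists = np.min(
            [np.sum((vectors - c) ** 2, axis=1) for c in centroids], axis=0)
        centroids.append(
            vectors[rng.choice(len(vectors), p=dists / dists.sum())])
    return centroids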
from load import load_documents, write_documents

docs = load_documents()
write_documents(docs, 'asd')