def plot_top_words():
    """Plot a bar chart of the 50 most frequent terms in the corpus.

    Reads all books from MongoDB, builds a bag-of-words matrix over the
    corpus, picks the top-n entries of the matrix and shows them as a
    labelled matplotlib bar chart.

    :return: None (displays the plot with plt.show()).
    """
    books = mongo_handler.query_books()
    corpus = collection_reader.extract_corpus(books)
    n = 50
    X, vectorizer = create_bag_of_words(corpus)
    # max_n also returns a row/position index; it is not needed here.
    top_values, top_indices, _ = max_n(X.data, X.indices, n)
    feature_names = vectorizer.get_feature_names()
    # Map the matrix column indices of the top entries back to terms.
    top_terms = [feature_names[idx] for idx in top_indices]

    index = np.arange(len(top_values))
    bar_width = 0.8
    plt.bar(index, top_values, width=bar_width, alpha=0.4, color='b',
            label='Top words')
    plt.xlabel('Terms')
    plt.ylabel('Frequency')
    plt.title('Most frequent words')
    plt.xticks(index, top_terms, rotation=90)
    plt.tight_layout()
    plt.show()
def custom_dendrogram(label_type='titles', linkage_method='ward'):
    """Plot a hierarchical-clustering dendrogram of the book corpus.

    Documents are vectorized with TF-IDF, compared via cosine similarity,
    and the resulting distance matrix is fed to the chosen scipy linkage.

    :param label_type: {'titles', 'ids'} -- leaf labels are either
        "(id) Truncated title..." or "(id)" alone.
    :param linkage_method: {'ward', 'average', 'complete'}
    :raises ValueError: if linkage_method is not one of the above.
    :return: None (displays the plot with plt.show()).
    """
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)

    # Leaf labels for the dendrogram.
    if label_type == 'titles':
        labels = [
            "(" + book["book_id3"] + ") " + book["title"][:25]
            + ("..." if len(book["title"]) > 25 else "")
            for book in books
        ]
    else:
        labels = ["(" + book["book_id3"] + ")" for book in books]

    # Create term-document representation
    vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.7, use_idf=True)
    X = vectorizer.fit_transform(documents)

    # Cosine distance matrix (1 - similarity).
    dist = 1 - cosine_similarity(X)

    # Define the linkage_matrix using the pre-computed distances.
    if linkage_method == 'ward':
        linkage_matrix = ward(dist)
    elif linkage_method == 'average':
        linkage_matrix = average(dist)
    elif linkage_method == 'complete':
        linkage_matrix = complete(dist)
    else:
        # ValueError subclasses Exception, so existing broad handlers
        # still catch it.
        raise ValueError("Parameter linkage_method is not recognized!")

    # Plot dendrogram
    plt.subplots(figsize=(5, 5))  # set size
    ax = dendrogram(linkage_matrix, orientation="right", labels=labels)
    # tick_params expects booleans; the previous string 'off' is truthy
    # and therefore did NOT disable the ticks in modern matplotlib.
    plt.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off
    print(ax["leaves"])
    print(ax["ivl"])

    plt.show()
def discover_top_frequent_words():
    """Compute the top-10 words of every book and persist them to Mongo.

    Builds a bag-of-words matrix over the whole corpus, extracts the N
    most frequent terms of each document, attaches them to the book dict
    under "top10words", then rewrites the secondary book collection.
    """
    books = mongo_handler.query_books()
    corpus = collection_reader.extract_corpus(books)
    X, vectorizer = create_bag_of_words(corpus)

    print("Term-document matrix: ")
    print("Shape {} x {}".format(X.shape[0], X.shape[1]))
    print("Number of Feature names: {}".format(len(vectorizer.get_feature_names())))

    # Per-document top terms; one matrix row per corpus document.
    N = 10
    print("Top {} words per document:".format(N))
    for j in range(len(corpus)):
        book = books[j]
        title = book["title"]
        if len(title) > 75:
            title = title[:75] + '...'
        print("Book {}: {}".format(book['book_id3'], title))
        top_words = calc_top_words_per_document(X.getrow(j), vectorizer, N)
        print(top_words)
        book["top10words"] = top_words

    # Replace the secondary collection with the annotated books.
    mongo_handler.remove_book_2_collection()
    mongo_handler.insert_books_2(books)
import matplotlib.pyplot as plt from time import time from sklearn.cluster import AgglomerativeClustering from sklearn.metrics.pairwise import cosine_similarity from util import plot_util, preprocessing_util, benchmark, collection_reader if __name__ == "__main__": # Read data books = collection_reader.read_books_from_mongo() documents = collection_reader.extract_corpus(books) print("{} books:".format(len(documents))) print([book["book_id3"] for book in books]) print() # Create term-document representation X = preprocessing_util.convert_to_term_document(documents, min_df=0.1, max_df=0.9) for feature_number in range(10,24,4): print("Features: {}".format(feature_number)) # SVD Y = preprocessing_util.apply_svd(X, feature_number) # Cosine similarity matrix dist = 1 - cosine_similarity(Y) ############################################################################### # Do the actual clustering k = 4 ac = AgglomerativeClustering(linkage="average", n_clusters=k, affinity="cosine")