import numpy as np
import matplotlib.pyplot as plt

# mongo_handler, collection_reader, create_bag_of_words and max_n are
# project-local helpers assumed to be available in this module.


def plot_top_words():
    books = mongo_handler.query_books()
    corpus = collection_reader.extract_corpus(books)
    n = 50

    X, vectorizer = create_bag_of_words(corpus)

    top_values, top_indices, i = max_n(X.data, X.indices, n)

    # get_feature_names() was removed in scikit-learn 1.2; use the newer API
    feature_names = vectorizer.get_feature_names_out()

    top_terms = [feature_names[idx] for idx in top_indices]

    index = np.arange(len(top_values))
    bar_width = 0.8
    plt.bar(index, top_values, width=bar_width,
            alpha=0.4, color='b', label='Top words')
    plt.xlabel('Terms')
    plt.ylabel('Frequency')
    plt.title('Most frequent words')
    plt.xticks(index, top_terms, rotation=90)
    plt.tight_layout()
    plt.show()
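

# The two helpers used above (create_bag_of_words, max_n) are not defined in
# this snippet; the versions below are minimal sketches of what they plausibly
# do, inferred from the call sites. They are assumptions, not the project's
# actual implementations.
from sklearn.feature_extraction.text import CountVectorizer


def create_bag_of_words(corpus):
    # Sketch: raw term counts per document, plus the fitted vectorizer
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    return X, vectorizer


def max_n(data, indices, n):
    # Sketch: the n largest values in a CSR matrix's .data array, with the
    # matching column (feature) indices and their positions in .data
    i = np.argsort(data)[-n:][::-1]
    return data[i], indices[i], i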
Example #2
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import average, complete, dendrogram, ward
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from util import collection_reader


def custom_dendrogram(label_type='titles', linkage_method='ward'):
    """
    Plots a dendrogram of the book corpus, using cosine distances
    between tf-idf vectors.
    :param label_type: {'titles', 'ids'}
    :param linkage_method: {'ward', 'average', 'complete'}
    :return: None
    """

    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)

    # Labels
    if label_type == 'titles':
        labels = [
            "(" + book["book_id3"] + ") " + book["title"][:25] +
            ("..." if len(book["title"]) > 25 else "") for book in books
        ]
    else:
        labels = ["(" + book["book_id3"] + ")" for book in books]

    # Create term-document representation
    vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.7, use_idf=True)
    X = vectorizer.fit_transform(documents)

    # Cosine similarity matrix
    dist = 1 - cosine_similarity(X)

    # Build the linkage matrix from the pre-computed distance matrix
    if linkage_method == 'ward':
        linkage_matrix = ward(dist)
    elif linkage_method == 'average':
        linkage_matrix = average(dist)
    elif linkage_method == 'complete':
        linkage_matrix = complete(dist)
    else:
        raise ValueError("Unrecognized linkage_method: {}".format(linkage_method))

    # Plot dendrogram
    plt.subplots(figsize=(5, 5))  # set size
    # dendrogram() returns a dict of plotting data, not an Axes object
    dendro = dendrogram(linkage_matrix, orientation="right", labels=labels)

    plt.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)

    print(dendro["leaves"])  # leaf indices in plotted order
    print(dendro["ivl"])     # the corresponding labels

    # plt.tight_layout()  # show plot with tight layout
    plt.show()
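

# Sketch (an assumption, not part of the original module): the same linkage
# matrix can be cut into k flat clusters with scipy's fcluster, e.g. to
# inspect the groupings behind the dendrogram.
from scipy.cluster.hierarchy import fcluster


def cut_into_flat_clusters(linkage_matrix, k=4):
    # 'maxclust' yields exactly k cluster labels (1..k), one per document
    return fcluster(linkage_matrix, t=k, criterion='maxclust')
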
def discover_top_frequent_words():
    books = mongo_handler.query_books()
    corpus = collection_reader.extract_corpus(books)

    X, vectorizer = create_bag_of_words(corpus)

    print("Term-document matrix: ")
    print("Shape {} x {}".format(X.shape[0], X.shape[1]))
    print("Number of Feature names: {}".format(len(vectorizer.get_feature_names())))

    # Obtain the most frequent words per book
    N = 10
    print("Top {} words per document:".format(N))

    for j in range(len(corpus)):
        title = books[j]["title"]
        book_title = (title[:75] + '...') if len(title) > 75 else title
        print("Book {}: {}".format(books[j]['book_id3'], book_title))
        top_words = calc_top_words_per_document(X.getrow(j), vectorizer, N)
        print(top_words)
        books[j]["top10words"] = top_words

    mongo_handler.remove_book_2_collection()
    mongo_handler.insert_books_2(books)
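

# calc_top_words_per_document() is not shown in this snippet; below is a
# minimal sketch of a compatible implementation (an assumption, inferred from
# the call site), returning the n highest-count terms of a single CSR row.
import numpy as np


def calc_top_words_per_document(row, vectorizer, n):
    feature_names = vectorizer.get_feature_names_out()
    order = np.argsort(row.data)[-n:][::-1]  # positions of the n largest counts
    return [feature_names[row.indices[k]] for k in order]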
Example #4
import matplotlib.pyplot as plt
from time import time
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from util import plot_util, preprocessing_util, benchmark, collection_reader

if __name__ == "__main__":
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents, min_df=0.1, max_df=0.9)

    for feature_number in range(10, 24, 4):
        print("Features: {}".format(feature_number))

        # SVD
        Y = preprocessing_util.apply_svd(X, feature_number)

        # Cosine similarity matrix
        dist = 1 - cosine_similarity(Y)

        ###############################################################################
        # Do the actual clustering
        k = 4
        # 'affinity' was renamed to 'metric' in scikit-learn 1.2 and removed in 1.4
        ac = AgglomerativeClustering(linkage="average", n_clusters=k, metric="cosine")
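
# preprocessing_util.convert_to_term_document and preprocessing_util.apply_svd
# are project helpers not shown here; the sketches below are plausible
# reconstructions based on their call sites, not the actual implementations.
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer


def convert_to_term_document(documents, min_df=0.1, max_df=0.9):
    # Sketch: tf-idf weighted term-document matrix for the corpus
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, use_idf=True)
    return vectorizer.fit_transform(documents)


def apply_svd(X, n_components):
    # Sketch: dense low-rank LSA projection via truncated SVD
    svd = TruncatedSVD(n_components=n_components)
    return svd.fit_transform(X)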