Example #1
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents, min_df=0.1, max_df=0.9)

    for feature_number in range(10,24,4):
        print("Features: {}".format(feature_number))

        # SVD
        Y = preprocessing_util.apply_svd(X, feature_number)

        # Cosine distance matrix (1 - cosine similarity)
        dist = 1 - cosine_similarity(Y)

        ###############################################################################
        # Do the actual clustering
        k = 4
        # Cluster using the precomputed cosine-distance matrix
        ac = AgglomerativeClustering(linkage="average", n_clusters=k, affinity="precomputed")

        print("Clustering sparse data with {}".format(ac))
        t0 = time()
        ac.fit(dist)
        print("done in {}".format(time() - t0))
        print()

        # Create a 3d scatter plot of the corpus
        plot_util.create_3d_plot_for_sparse_matrix(Y, ac.labels_, show=False)

    plt.show()
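
The preprocessing_util helpers called above are not shown on this page. The following is a minimal, hypothetical sketch of what convert_to_term_document and apply_svd might look like, assuming a TF-IDF document-term matrix and scikit-learn's TruncatedSVD for the latent projection; the actual module may differ.

# preprocessing_util.py -- hypothetical sketch, not the original module
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer


def convert_to_term_document(documents, min_df=0.1, max_df=0.9):
    """Vectorize raw document strings into a sparse TF-IDF matrix.

    Terms appearing in fewer than min_df or more than max_df of the
    documents are dropped, matching the keyword arguments used above.
    """
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
    return vectorizer.fit_transform(documents)


def apply_svd(X, n_components):
    """Project the matrix onto n_components latent dimensions (LSA).

    TruncatedSVD requires strictly fewer components than features, so the
    requested number is clamped here; the real helper may handle this
    differently.
    """
    n_components = min(n_components, X.shape[1] - 1)
    svd = TruncatedSVD(n_components=n_components)
    return svd.fit_transform(X)
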
Example #2
from sklearn.cluster import Birch

from util import plot_util, preprocessing_util, benchmark, collection_reader

if __name__ == "__main__":
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents,
                                                    min_df=0.1,
                                                    max_df=0.9)

    # SVD
    X = preprocessing_util.apply_svd(X, min(X.shape))

    ###############################################################################
    # Do the actual clustering
    print("Clustering data")
    k = 5

    model = Birch(branching_factor=8,
                  n_clusters=k,
                  threshold=0.5,
                  compute_labels=True)
    model.fit(X)

    # Metrics (predict once and reuse the labels)
    labels = model.predict(X)
    benchmark.clustering_metrics(X, labels)

    # Create a 3d scatter plot of the corpus
    plot_util.create_3d_plot_for_sparse_matrix(X, labels)
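
The benchmark.clustering_metrics helper is also external to these snippets. A plausible sketch is given below, assuming it prints a few label-free clustering scores and returns them (Example #3 collects the return values in a list and prints them again at the end); the real implementation may compute different metrics.

# benchmark.py -- hypothetical sketch, not the original module
from sklearn import metrics


def clustering_metrics(X, labels):
    """Compute internal clustering metrics, print them, and return them as a dict."""
    results = {
        "silhouette": metrics.silhouette_score(X, labels),
        "calinski_harabasz": metrics.calinski_harabasz_score(X, labels),
        "davies_bouldin": metrics.davies_bouldin_score(X, labels),
    }
    for name, value in results.items():
        print("{}: {:.3f}".format(name, value))
    return results
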
Example #3
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN, KMeans

from util import plot_util, preprocessing_util, benchmark, collection_reader

if __name__ == "__main__":
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents,
                                                    min_df=0.1,
                                                    max_df=0.9)

    # SVD
    X = preprocessing_util.apply_svd(X, min(X.shape))

    ###############################################################################
    # Do the actual clustering with DBSCAN
    print("Clustering data")
    k = 4

    method = DBSCAN(eps=0.8, min_samples=1).fit(X)

    # Metrics
    benchmark.clustering_metrics(X, method.labels_)

    # Create a 3d scatter plot of the corpus
    plot_util.create_3d_plot_for_sparse_matrix(X, method.labels_)
    ###############################################################################
    # Do the actual clustering with KMeans
    k = 5

    the_metrics = []
    for i in range(1):
        km = KMeans(n_clusters=k, verbose=False)
        #print("Clustering sparse data with {}".format(km))
        #t0 = time()
        km.fit(X)
        #print("done in %0.3fs" % (time() - t0))
        #print()

        # Metrics
        the_metrics.append(benchmark.clustering_metrics(X, km.labels_))

        # Plot:
        # create_2d_plot_for_sparse_matrix(X, km.labels_)
        plot_util.create_3d_plot_for_sparse_matrix(X, km.labels_, block=False)

    print("All the metrics: ")
    for a in the_metrics:
        print(a)

    plt.show()
    print("Done!")