Example #1
from sklearn.cluster import DBSCAN

from util import plot_util, preprocessing_util, benchmark, collection_reader

if __name__ == "__main__":
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents,
                                                    min_df=0.1,
                                                    max_df=0.9)

    # SVD
    X = preprocessing_util.apply_svd(X, min(X.shape))

    ###############################################################################
    # Do the actual clustering
    print("Clustering data")
    k = 4  # unused by DBSCAN; the number of clusters is determined by eps and min_samples

    method = DBSCAN(eps=0.8, min_samples=1).fit(X)

    # Metrics
    benchmark.clustering_metrics(X, method.labels_)

    # Create a 3d scatter plot of the corpus
    plot_util.create_3d_plot_for_sparse_matrix(X, method.labels_)
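Note: the helpers in the project's util package are not shown on this page. Below is a minimal sketch of what convert_to_term_document and apply_svd might look like, assuming a TF-IDF term-document matrix and scikit-learn's TruncatedSVD; the real implementations may differ.

# Sketch of the assumed preprocessing helpers (not part of the original examples).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


def convert_to_term_document(documents, min_df=0.1, max_df=0.9):
    # Build a TF-IDF term-document matrix, dropping terms that occur in fewer
    # than min_df or more than max_df of the documents.
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
    return vectorizer.fit_transform(documents)


def apply_svd(X, n_components):
    # Reduce the term-document matrix to n_components latent dimensions (LSA).
    svd = TruncatedSVD(n_components=n_components)
    return svd.fit_transform(X)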
Example #2

from time import time

from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics.pairwise import cosine_similarity

from util import plot_util, preprocessing_util, benchmark, collection_reader

if __name__ == "__main__":
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents,
                                                    min_df=0.1,
                                                    max_df=0.9)

    # SVD
    X = preprocessing_util.apply_svd(X, min(X.shape))

    # Cosine similarity matrix
    dist = 1 - cosine_similarity(X)

    ###############################################################################
    # Do the actual clustering (hierarchical / agglomerative)
    k = 5

    # linkage: ward, average, complete
    # affinity: cosine, euclidean, cityblock
    ac = AgglomerativeClustering(linkage="average",
                                 n_clusters=k,
                                 affinity="cosine")

    print("Clustering sparse data with {}".format(ac))
    t0 = time()
    ac.fit(dist)
    print("done in {}".format(time() - t0))
    print()

    # Metrics
    benchmark.clustering_metrics(X, ac.labels_)

    # Create a 3d scatter plot of the corpus
    plot_util.create_3d_plot_for_sparse_matrix(X, ac.labels_)
    ###############################################################################
    # Do the actual clustering (KMeans)
    k = 5

    import matplotlib.pyplot as plt

    the_metrics = []
    for i in range(1):  # single run; raise the range to compare several KMeans initialisations
        km = KMeans(n_clusters=k, verbose=False)
        #print("Clustering sparse data with {}".format(km))
        #t0 = time()
        km.fit(X)
        #print("done in %0.3fs" % (time() - t0))
        #print()

        # Metrics
        the_metrics.append(benchmark.clustering_metrics(X, km.labels_))

        # Plot:
        # create_2d_plot_for_sparse_matrix(X, km.labels_)
        plot_util.create_3d_plot_for_sparse_matrix(X, km.labels_, block=False)

    print("All the metrics: ")
    for a in the_metrics:
        print(a)

    plt.show()
    print("Done!")

Example #4
from sklearn.cluster import Birch

from util import plot_util, preprocessing_util, benchmark, collection_reader

if __name__ == "__main__":
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents,
                                                    min_df=0.1,
                                                    max_df=0.9)

    # SVD
    X = preprocessing_util.apply_svd(X, min(X.shape))

    ###############################################################################
    # Do the actual clustering
    print("Clustering data")
    k = 5

    model = Birch(branching_factor=8,
                  n_clusters=k,
                  threshold=0.5,
                  compute_labels=True)
    model.fit(X)

    # Metrics
    benchmark.clustering_metrics(X, model.predict(X))

    # Create a 3d scatter plot of the corpus
    plot_util.create_3d_plot_for_sparse_matrix(X, model.predict(X))
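benchmark.clustering_metrics is likewise project-local. A minimal sketch of such a helper, assuming it prints and returns a few internal validation scores from scikit-learn; the real project may report different metrics.

# Sketch of an assumed clustering_metrics helper (not from the original project).
from sklearn import metrics


def clustering_metrics(X, labels):
    # Internal validation scores: higher silhouette / Calinski-Harabasz is better,
    # lower Davies-Bouldin is better.
    scores = {
        "silhouette": metrics.silhouette_score(X, labels),
        "calinski_harabasz": metrics.calinski_harabasz_score(X, labels),
        "davies_bouldin": metrics.davies_bouldin_score(X, labels),
    }
    for name, value in scores.items():
        print("{}: {:.3f}".format(name, value))
    return scores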
Example #5

from sklearn import mixture

from util import plot_util, preprocessing_util, benchmark, collection_reader

if __name__ == "__main__":
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents, min_df=0.1, max_df=0.9)

    # SVD
    X = preprocessing_util.apply_svd(X, min(X.shape))

    ###############################################################################
    # Do the actual clustering
    print("Clustering data")
    k = 5

    method = mixture.GaussianMixture(n_components=k, covariance_type='full').fit(X)

    # Metrics
    benchmark.clustering_metrics(X, method.predict(X))

    # Create a 3d scatter plot of the corpus
    plot_util.create_3d_plot_for_sparse_matrix(X, method.predict(X))
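plot_util.create_3d_plot_for_sparse_matrix is also part of the project's util package. A rough sketch, assuming it scatters the first three SVD components coloured by cluster label; the original helper may differ.

# Sketch of an assumed 3D plotting helper (not from the original project).
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the "3d" projection)


def create_3d_plot_for_sparse_matrix(X, labels, block=True):
    # Scatter the first three (SVD) components, coloured by cluster label.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=labels)
    ax.set_xlabel("component 1")
    ax.set_ylabel("component 2")
    ax.set_zlabel("component 3")
    plt.show(block=block)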