"""Cluster the book corpus with DBSCAN and plot the result in 3D.

Pipeline: read books from MongoDB -> term-document matrix -> SVD ->
DBSCAN -> metrics -> 3D scatter plot.
"""
from sklearn.cluster import DBSCAN

from util import plot_util, preprocessing_util, benchmark, collection_reader

if __name__ == "__main__":
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    # NOTE(review): the key "book_id3" looks odd (siblings use it too) —
    # confirm it is the intended field name in the Mongo documents.
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation; drop terms appearing in fewer
    # than 10% or more than 90% of documents.
    X = preprocessing_util.convert_to_term_document(documents, min_df=0.1, max_df=0.9)

    # Reduce dimensionality with SVD (keep as many components as possible)
    X = preprocessing_util.apply_svd(X, min(X.shape))

    ###########################################################################
    # Do the actual clustering.  DBSCAN chooses the number of clusters
    # itself, so no cluster count is supplied.
    print("Clustering data")
    method = DBSCAN(eps=0.8, min_samples=1).fit(X)

    # Metrics
    benchmark.clustering_metrics(X, method.labels_)

    # Create a 3d scatter plot of the corpus
    plot_util.create_3d_plot_for_sparse_matrix(X, method.labels_)
"""Cluster the book corpus with agglomerative (hierarchical) clustering.

Pipeline: read books from MongoDB -> term-document matrix -> SVD ->
cosine-distance matrix -> average-linkage agglomerative clustering ->
metrics -> 3D scatter plot.
"""
from time import time

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity

from util import plot_util, preprocessing_util, benchmark, collection_reader

if __name__ == "__main__":
    # NOTE(review): this chunk's original text began mid-statement; the data
    # loading head below is reconstructed from the identical sibling scripts
    # in this file — verify against version control.
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents, min_df=0.1, max_df=0.9)

    # SVD
    X = preprocessing_util.apply_svd(X, min(X.shape))

    # Cosine similarity matrix turned into a distance matrix
    dist = 1 - cosine_similarity(X)

    ###########################################################################
    # Do the actual clustering
    k = 5
    # linkage: ward, average, complete
    # affinity: cosine, euclidean, cityblock
    # NOTE(review): `affinity` was renamed to `metric` in scikit-learn 1.2
    # and removed in 1.4 — update the keyword if the project upgrades.
    ac = AgglomerativeClustering(linkage="average", n_clusters=k, affinity="cosine")
    print("Clustering sparse data with {}".format(ac))
    t0 = time()
    # Fit on the precomputed distance matrix (rows of `dist` act as features
    # here; the metrics and plot below still use the SVD space X).
    ac.fit(dist)
    print("done in {}".format(time() - t0))
    print()

    # Metrics
    benchmark.clustering_metrics(X, ac.labels_)

    # Create a 3d scatter plot of the corpus
    plot_util.create_3d_plot_for_sparse_matrix(X, ac.labels_)
"""Cluster the book corpus with K-Means and plot the result in 3D.

Pipeline: read books from MongoDB -> term-document matrix -> SVD ->
K-Means -> metrics -> 3D scatter plot(s), shown together at the end.
"""
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from util import plot_util, preprocessing_util, benchmark, collection_reader

if __name__ == "__main__":
    # NOTE(review): this chunk's original text began mid-script; the data
    # loading head below is reconstructed from the identical sibling scripts
    # in this file — verify against version control.
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents, min_df=0.1, max_df=0.9)

    # SVD
    X = preprocessing_util.apply_svd(X, min(X.shape))

    ###########################################################################
    # Do the actual clustering
    k = 5
    the_metrics = []
    # Single run; bump the range to compare metrics across repeated
    # random initialisations.
    for _ in range(1):
        km = KMeans(n_clusters=k, verbose=False)
        km.fit(X)

        # Metrics
        the_metrics.append(benchmark.clustering_metrics(X, km.labels_))

        # Plot without blocking so all figures stay open until plt.show()
        plot_util.create_3d_plot_for_sparse_matrix(X, km.labels_, block=False)

    print("All the metrics: ")
    for a in the_metrics:
        print(a)

    # Block here so every non-blocking figure above is displayed
    plt.show()
    print("Done!")
"""Cluster the book corpus with BIRCH and plot the result in 3D.

Pipeline: read books from MongoDB -> term-document matrix -> SVD ->
Birch -> metrics -> 3D scatter plot.
"""
from sklearn.cluster import Birch

from util import plot_util, preprocessing_util, benchmark, collection_reader

if __name__ == "__main__":
    # NOTE(review): this chunk's original text started at "# Read data" with
    # no imports or __main__ guard visible; the head is reconstructed from
    # the identical sibling scripts in this file — verify against VCS.
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents, min_df=0.1, max_df=0.9)

    # SVD
    X = preprocessing_util.apply_svd(X, min(X.shape))

    ###########################################################################
    # Do the actual clustering
    print("Clustering data")
    k = 5
    model = Birch(branching_factor=8, n_clusters=k, threshold=0.5, compute_labels=True)
    model.fit(X)

    # Predict once and reuse (the original called predict(X) twice)
    labels = model.predict(X)

    # Metrics
    benchmark.clustering_metrics(X, labels)

    # Create a 3d scatter plot of the corpus
    plot_util.create_3d_plot_for_sparse_matrix(X, labels)
"""Cluster the book corpus with a Gaussian mixture model and plot in 3D.

Pipeline: read books from MongoDB -> term-document matrix -> SVD ->
GaussianMixture -> metrics -> 3D scatter plot.
"""
from sklearn import mixture

from util import plot_util, preprocessing_util, benchmark, collection_reader

if __name__ == "__main__":
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents, min_df=0.1, max_df=0.9)

    # SVD
    X = preprocessing_util.apply_svd(X, min(X.shape))

    ###########################################################################
    # Do the actual clustering
    print("Clustering data")
    k = 5
    method = mixture.GaussianMixture(n_components=k, covariance_type='full').fit(X)

    # Predict once and reuse (the original called predict(X) twice)
    labels = method.predict(X)

    # Metrics
    benchmark.clustering_metrics(X, labels)

    # Create a 3d scatter plot of the corpus
    plot_util.create_3d_plot_for_sparse_matrix(X, labels)