# Cluster sentence vectors two ways and print each cluster's member sentences.
# Relies on names defined elsewhere in the file: cluster_sentences, infvec, doc1.
import sys

import numpy as np
from nltk.cluster import KMeansClusterer, cosine_distance

# BUG FIX: sys.argv entries are strings; range(nclusters) below needs an int.
nclusters = int(sys.argv[2])
clusters = cluster_sentences(infvec, nclusters)
for cluster in range(nclusters):
    print("cluster ", cluster, ":")
    for i, sentence in enumerate(clusters[cluster]):
        print("\tsentence ", sentence, ": ", doc1[sentence], "\n", "=" * 90)

# kmeans from nltk library, cosine distance.
# BUG FIX: the clusterer was built with 200 clusters but the bucket list and
# the print loop used 50, so classify_vectorspace() results >= 50 raised
# IndexError. One constant now drives all three places.
NLTK_NUM_CLUSTERS = 200
infvec = np.array(infvec)
clusterer = KMeansClusterer(NLTK_NUM_CLUSTERS, cosine_distance, repeats=20,
                            avoid_empty_clusters=True)
clusters = clusterer.cluster(infvec, True, trace=True)

# Bucket each vector's index by its assigned cluster id.
ans = [[] for _ in range(NLTK_NUM_CLUSTERS)]
for i in range(len(infvec)):
    ans[clusterer.classify_vectorspace(infvec[i])].append(i)

# Print the clusters
for i in range(NLTK_NUM_CLUSTERS):
    print('cluster', i)
    for j in ans[i]:
        print(doc1[j], '\n', "=" * 90)
#1  Football wins data: column 0 = # wins 2016, column 1 = # wins 2017.
football = np.array([[3, 5], [3, 4], [2, 8], [2, 3], [6, 2],
                     [6, 4], [7, 3], [7, 4], [8, 5], [7, 6]], np.int64)
init = [[4, 6], [5, 4]]

# sklearn baseline: its inertia_ is the true within-cluster sum of squares.
kmeans = KMeans(n_clusters=2).fit(football)
print(kmeans.inertia_)

# nltk k-means with Euclidean distance.
clusterer = KMeansClusterer(2, distance=euclidean_distance)
clusters = clusterer.cluster(football, True, trace=True)

# Sum each point's distance to its assigned cluster mean.
# NOTE(review): this sums *unsquared* distances, so it is not directly
# comparable to sklearn's inertia_ (sum of squared distances) — confirm intent.
sse = 0
for i in range(len(football)):  # BUG FIX: was hard-coded range(0, 10)
    #print(clusterer.classify_vectorspace(football[i]))
    dist = euclidean_distance(
        football[i],
        clusterer.means()[clusterer.classify_vectorspace(football[i])])
    sse = sse + dist
print("SSE: ", sse)
print('Clustered:', football)
print('As:', clusters)
print('Means:', clusterer.means())
print()

# Scatter plot colored by cluster assignment.
# NOTE(review): the title says "Manhattan Distance" but `clusters` here came
# from the *euclidean* clusterer above; the cityblock run happens only after
# this plot — confirm whether the plot belongs after the next cluster() call.
plt.figure(0)
plt.title('Manhattan Distance with Centroids of (4, 6), (5, 4)')
plt.xlabel('# of wins 2016')
plt.ylabel('# of wins 2017')
plt.scatter(football[:, 0], football[:, 1], c=clusters, cmap='rainbow')
#plt.scatter(football[:,0],football[:,1], c=kmeans.labels_, cmap='rainbow')

# nltk k-means with Manhattan (cityblock) distance and fixed initial means.
clusterer = KMeansClusterer(2, distance=cityblock, initial_means=init)
clusters = clusterer.cluster(football, True, trace=True)
# Clustering: fit k-means (cosine distance) on the training vectors, then
# tally which cluster each dev vector falls into.
from nltk.cluster import KMeansClusterer
import nltk

NUM_CLUSTERS = 3
clustermodel = KMeansClusterer(NUM_CLUSTERS,
                               distance=nltk.cluster.util.cosine_distance,
                               repeats=25)
assigned_clusters = clustermodel.cluster(X_train, assign_clusters=True)
print(assigned_clusters)

# Classify every dev vector, then count occurrences of each cluster id.
cluster_numbers = [clustermodel.classify_vectorspace(vec) for vec in X_dev1]
count0 = sum(1 for label in cluster_numbers if label == 0)
count1 = sum(1 for label in cluster_numbers if label == 1)
count2 = sum(1 for label in cluster_numbers if label == 2)
print("No change: 0: {}, 1:{}, 2:{}".format(count0, count1, count2))

# Reset the tallies for the next pass (continues below this chunk).
cluster_numbers = []
count0 = 0
count1 = 0
count2 = 0
repeats=epochs)  # NOTE(review): tail of a clusterer constructor call that begins above this chunk
# Fit the nltk k-means model, persist it, and export text/cluster pairs to CSV.
clusters = model.cluster(vectors, assign_clusters=True)
dump(model, '../data/advanced_nltk_kmeans.joblib')
# Just cluster
data['cluster'] = pd.DataFrame(clusters)
data[['text', 'cluster']].to_csv('../data/text_clustered_nltk_kmeans.csv',
                                 index=True, quoting=csv.QUOTE_ALL)
#nltk GAAClusterer
# Fit agglomerative clustering, then re-classify each vector to get its label.
model = GAAClusterer(num_clusters=cluster_number)
model.cluster(vectors, assign_clusters=True)
clusters = [model.classify_vectorspace(vector.tolist()) for vector in vectors]
data['cluster'] = pd.DataFrame(clusters)
data[['text', 'cluster']].to_csv('../data/text_clustered_nltk_gaac.csv',
                                 index=True, quoting=csv.QUOTE_ALL)
#sklearn means
# NOTE(review): the n_jobs parameter was removed from sklearn's KMeans in
# scikit-learn 1.0 — confirm the pinned sklearn version supports it.
model = KMeans(n_clusters=cluster_number, max_iter=epochs, n_jobs=8)
model.fit(vectors)
dump(model, '../data/advanced_sklearn_kmeans.joblib')
data['cluster'] = pd.DataFrame(model.labels_)
data[['text', 'cluster']].to_csv('../data/text_clustered_sklearn_kmeans.csv',
                                 index=True, quoting=csv.QUOTE_ALL)