Example #1

import sys

# Number of clusters is read from the command line (argv entries are strings)
nclusters = int(sys.argv[2])
clusters = cluster_sentences(infvec, nclusters)
for cluster in range(nclusters):
    print("cluster ", cluster, ":")
    for sentence in clusters[cluster]:
        print("\tsentence ", sentence, ": ", doc1[sentence], "\n", "=" * 90)

# k-means from the nltk library
import numpy as np
from nltk.cluster import KMeansClusterer, cosine_distance

infvec = np.array(infvec)

num_means = 200  # total number of k-means clusters
clusterer = KMeansClusterer(num_means,
                            cosine_distance,
                            repeats=20,
                            avoid_empty_clusters=True)
clusters = clusterer.cluster(infvec, True, trace=True)

ans = [[] for _ in range(num_means)]  # sentence indices grouped by assigned cluster
for i in range(len(infvec)):
    ans[clusterer.classify_vectorspace(infvec[i])].append(i)

# Print the clusters
for i in range(num_means):
    print('cluster', i)
    for j in ans[i]:
        print(doc1[j], '\n', "=" * 90)
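For context, a minimal sketch of how doc1 (the list of sentences) and infvec (one vector per sentence) might be prepared before the snippet above runs; the original source does not show this step, so the TF-IDF vectorizer and the sample sentences are assumptions.

# Hypothetical preparation of doc1 and infvec (not part of the original code):
# doc1 holds the raw sentences, infvec one dense TF-IDF vector per sentence.
from sklearn.feature_extraction.text import TfidfVectorizer

doc1 = [
    "K-means partitions the sentence vectors into k clusters.",
    "Each vector is assigned to the cluster with the nearest mean.",
    "The means are recomputed until the assignments stop changing.",
]

vectorizer = TfidfVectorizer()
infvec = vectorizer.fit_transform(doc1).toarray()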
Example #2
File: main.py Project: afzm4/cs3001hw7
# Part 1: k-means on team win counts (2016 vs. 2017)
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cityblock
from sklearn.cluster import KMeans
from nltk.cluster import KMeansClusterer, euclidean_distance

football = np.array([[3, 5], [3, 4], [2, 8], [2, 3], [6, 2], [6, 4], [7, 3],
                     [7, 4], [8, 5], [7, 6]], np.int64)

init = [[4, 6], [5, 4]]

# scikit-learn k-means for comparison; inertia_ is the within-cluster SSE
kmeans = KMeans(n_clusters=2).fit(football)
print(kmeans.inertia_)
clusterer = KMeansClusterer(2, distance=euclidean_distance)
clusters = clusterer.cluster(football, True, trace=True)
sse = 0
for i in range(len(football)):
    # squared distance from each point to the mean of its assigned cluster
    dist = euclidean_distance(
        football[i],
        clusterer.means()[clusterer.classify_vectorspace(football[i])])
    sse = sse + dist ** 2
print("SSE: ", sse)
print('Clustered:', football)
print('As:', clusters)
print('Means:', clusterer.means())
print()
plt.figure(0)
plt.title('Euclidean Distance Clustering')
plt.xlabel('# of wins 2016')
plt.ylabel('# of wins 2017')
plt.scatter(football[:, 0], football[:, 1], c=clusters, cmap='rainbow')
#plt.scatter(football[:,0],football[:,1], c=kmeans.labels_, cmap='rainbow')

clusterer = KMeansClusterer(2, distance=cityblock, initial_means=init)
clusters = clusterer.cluster(football, True, trace=True)
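As a follow-up (not in the original snippet), the Manhattan-distance clustering computed just above can be plotted together with its fitted means; the second figure index, the marker styling, and the plt.show() call are assumptions.

# Sketch: visualize the Manhattan-distance clustering from the lines above;
# the fitted means are drawn as black crosses. Not part of the original source.
means = np.array(clusterer.means())
plt.figure(1)
plt.title('Manhattan Distance with Initial Centroids (4, 6) and (5, 4)')
plt.xlabel('# of wins 2016')
plt.ylabel('# of wins 2017')
plt.scatter(football[:, 0], football[:, 1], c=clusters, cmap='rainbow')
plt.scatter(means[:, 0], means[:, 1], marker='x', s=100, c='black')
plt.show()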
Example #3
# Clustering
from nltk.cluster import KMeansClusterer
import nltk
NUM_CLUSTERS=3
clustermodel = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = clustermodel.cluster(X_train, assign_clusters=True)
print(assigned_clusters)

cluster_numbers = []
count0 = 0
count1 = 0
count2 = 0

# Assign each development vector to its nearest learned cluster
for vec in X_dev1:
    cluster_numbers.append(clustermodel.classify_vectorspace(vec))

# Tally how many development vectors land in each cluster
for label in cluster_numbers:
    if label == 0:
        count0 += 1
    elif label == 1:
        count1 += 1
    elif label == 2:
        count2 += 1
print("No change: 0: {}, 1: {}, 2: {}".format(count0, count1, count2))

# Reset the tallies for the next evaluation split
cluster_numbers = []
count0 = 0
count1 = 0
count2 = 0
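The manual count0/count1/count2 tally above can also be written with collections.Counter; a sketch under the same assumption that X_dev1 is an iterable of vectors.

# Equivalent tally using collections.Counter (sketch, not from the original code)
from collections import Counter

counts = Counter(clustermodel.classify_vectorspace(vec) for vec in X_dev1)
print("No change: 0: {}, 1: {}, 2: {}".format(counts[0], counts[1], counts[2]))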
Example #4
# nltk KMeansClusterer
# NOTE: the call below is reconstructed from a truncated snippet; the cosine
# distance metric is an assumption.
import csv
import pandas as pd
from joblib import dump
from nltk.cluster import KMeansClusterer, GAAClusterer, cosine_distance
from sklearn.cluster import KMeans

model = KMeansClusterer(cluster_number,
                        distance=cosine_distance,
                        repeats=epochs)
clusters = model.cluster(vectors, assign_clusters=True)

dump(model, '../data/advanced_nltk_kmeans.joblib')

# Save each text with its k-means cluster label
data['cluster'] = pd.DataFrame(clusters)
data[['text', 'cluster']].to_csv('../data/text_clustered_nltk_kmeans.csv',
                                 index=True,
                                 quoting=csv.QUOTE_ALL)

# nltk GAAClusterer (group-average agglomerative clustering)
model = GAAClusterer(num_clusters=cluster_number)
model.cluster(vectors, assign_clusters=True)

clusters = [model.classify_vectorspace(vector.tolist()) for vector in vectors]

data['cluster'] = pd.DataFrame(clusters)
data[['text', 'cluster']].to_csv('../data/text_clustered_nltk_gaac.csv',
                                 index=True,
                                 quoting=csv.QUOTE_ALL)

# sklearn KMeans (note: the n_jobs argument was removed in scikit-learn 1.0)
model = KMeans(n_clusters=cluster_number, max_iter=epochs, n_jobs=8)
model.fit(vectors)
dump(model, '../data/advanced_sklearn_kmeans.joblib')

data['cluster'] = pd.DataFrame(model.labels_)
data[['text', 'cluster']].to_csv('../data/text_clustered_sklearn_kmeans.csv',
                                 index=True,
                                 quoting=csv.QUOTE_ALL)
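For reference, a minimal sketch of how data, vectors, cluster_number, and epochs might be prepared before this snippet runs; the TF-IDF vectorizer, the sample texts, and the parameter values are assumptions rather than part of the original project.

# Hypothetical setup for the snippet above (all concrete values are assumptions):
# data carries the raw text, vectors the dense TF-IDF matrix fed to the clusterers.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

data = pd.DataFrame({'text': ['first sample document',
                              'second sample document',
                              'a third, rather different one']})
vectors = TfidfVectorizer().fit_transform(data['text']).toarray()

cluster_number = 2  # number of clusters to fit
epochs = 10         # reused as repeats (nltk) and max_iter (sklearn) above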