def demo():
    """
    Run a non-interactive GAAC clustering walkthrough on six 2-D points.

    Clusters the points into four groups, prints the clusterer, the input
    vectors and their assignments, displays the dendrogram, and finally
    classifies one extra vector.
    """
    from nltk.cluster import GAAClusterer

    # Six sample points in the plane, wrapped as numpy arrays.
    points = [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]
    vectors = list(map(numpy.array, points))

    # Fit the agglomerative clusterer and ask for the assignments back.
    gaac = GAAClusterer(4)
    assignments = gaac.cluster(vectors, True)

    print("Clusterer:", gaac)
    print("Clustered:", vectors)
    print("As:", assignments)
    print()

    # Display the merge tree built during clustering.
    tree = gaac.dendrogram()
    tree.show()

    # Assign a vector to one of the clusters found above.
    query = numpy.array([3, 3])
    print("classify(%s):" % query, end=" ")
    print(gaac.classify(query))
    print()
def Clustering(orig, minclusters, maxclusters):
    '''Cluster ``orig`` with GAAC and choose a cluster count.

    For every k from max(minclusters, 2) to maxclusters inclusive, each
    vector is labelled with the centroid giving the largest
    ``cosine_distance`` value, and the ratio
    (wb / (k - 1)) / (ww / (len(orig) - k)) is recorded. The k whose ratio
    has the smallest second difference against its two neighbours wins.

    returns (score, number of clusters, cluster assignment); the score is
    that second difference, or infinity when fewer than three values of k
    were evaluated (the first candidate is then returned).
    '''
    model = GAAClusterer()
    model.cluster(orig)
    n_vectors = len(orig)

    # Between-group term: the vector count minus each vector's
    # cosine_distance to the overall mean vector.
    overall_centroid = numpy.mean(orig, axis=0)
    between = n_vectors
    for vec in orig:
        between -= cosine_distance(vec, overall_centroid)

    candidates = []
    for k in range(max(minclusters, 2), maxclusters + 1):
        model.update_clusters(k)
        labels = []
        within = n_vectors
        for vec in orig:
            # NOTE(review): the largest value is kept, which treats
            # cosine_distance as a similarity -- confirm against the
            # cosine_distance implementation in use.
            best_score = None
            best_label = None
            for label in range(k):
                score = cosine_distance(vec, model._centroids[label])
                if best_label is None or score > best_score:
                    best_score, best_label = score, label
            within -= best_score
            labels.append(best_label)
        ratio = (between / (k - 1)) / (within / (n_vectors - k))
        candidates.append((ratio, k, labels))

    # Default to the first candidate, then prefer any interior candidate
    # with a smaller second difference of the ratio.
    _, first_k, first_labels = candidates[0]
    chosen = (float("inf"), first_k, first_labels)
    for before, current, after in zip(candidates, candidates[1:], candidates[2:]):
        second_diff = (after[0] - current[0]) - (current[0] - before[0])
        if second_diff < chosen[0]:
            chosen = (second_diff, current[1], current[2])
    return chosen
def cluster_texts(texts, clustersNumber, distance):
    """Cluster ``texts`` with GAAC over per-text term vectors.

    Args:
        texts: the texts to cluster; iterated once to build the collection
            and once more to build the vectors.
        clustersNumber: number of clusters requested from the clusterer.
        distance: accepted but not used by this implementation.

    Returns:
        Whatever ``GAAClusterer.cluster(vectors, True)`` returns for the
        texts, in input order.
    """
    # Turn the list of texts into a single TextCollection.
    corpus = nltk.TextCollection(texts)
    print("Created a collection of", len(corpus), "terms.")

    # The unique terms of the collection define the vector dimensions.
    vocabulary = list(set(corpus))
    print("Unique terms found: ", len(vocabulary))

    # One vector per text via the TF helper. Per the original notes, TF
    # measures how often each unique term occurs in the document itself,
    # not in the whole collection; measures such as TF-IDF also take the
    # collection into account and are more precise.
    term_vectors = []
    for text in texts:
        term_vectors.append(numpy.array(TF(text, vocabulary, corpus)))
    print("Vectors created.")
    print(term_vectors)

    # Group-average agglomerative clustering; True requests the per-vector
    # assignments as the return value. (The original notes mention
    # scikit-learn's AgglomerativeClustering with average linkage as an
    # equivalent alternative.)
    return GAAClusterer(clustersNumber).cluster(term_vectors, True)
def Gaaclusterer_experiment(samples, k_cluster):
    """Sweep GAAC cluster counts and plot two quality scores for each.

    For every cluster count ``i`` in ``range(2, k_cluster)`` a fresh
    ``GAAClusterer`` is fitted on ``samples``. The silhouette score and the
    Davies-Bouldin score of each labelling are collected, and both curves
    are plotted against the cluster count and shown via ``plt``.

    Args:
        samples: vectors to cluster; passed unchanged to the clusterer and
            to both scoring functions.
        k_cluster: exclusive upper bound of the cluster counts tried. Must
            be greater than 2 so that at least one count is evaluated.

    Returns:
        The cluster assignment produced for the last count tried
        (``k_cluster - 1``).

    Raises:
        ValueError: if ``k_cluster`` is 2 or less. Previously this case drew
            an empty plot and then failed with ``UnboundLocalError`` because
            no assignment had ever been computed.
    """
    if k_cluster <= 2:
        raise ValueError(
            "k_cluster must be greater than 2 so that at least one "
            "cluster count is evaluated, got %r" % (k_cluster,)
        )

    silhouette = []
    devis_boldin = []
    assigned_cluster = None
    for i in range(2, k_cluster):
        gaaclusterer = GAAClusterer(num_clusters=i)
        assigned_cluster = gaaclusterer.cluster(samples, True)
        silhouette.append(
            metrics.silhouette_score(X=samples, labels=np.array(assigned_cluster))
        )
        devis_boldin.append(davies_bouldin_score(samples, assigned_cluster))

    # Both curves share the same x axis: the cluster counts that were tried.
    cluster_counts = np.arange(2, k_cluster)
    plt.plot(cluster_counts, silhouette, c='r', label='silhouette')
    plt.plot(cluster_counts, devis_boldin, c='g', label='devis_bouldin')
    plt.xlabel('number of cluster')
    plt.ylabel('Score')
    plt.title('GAACluster')
    plt.legend()
    plt.show()
    return assigned_cluster
def get_word_clusters():
    """Cluster the stored tweets into five GAAC clusters and print the result.

    Reads every document from the module-level ``tweets`` collection, builds
    the vocabulary of all words returned by ``get_words`` for the tweet
    texts, vectorises each text against that vocabulary with
    ``vectorspaced``, clusters the vectors, and prints one
    ``cluster_id title`` line per pair in sorted order.
    """
    # Query the collection once so the vocabulary, the training vectors and
    # the classified vectors all come from the same set of tweets.
    texts = [tweet['text'] for tweet in tweets.find()]

    all_words = set()
    for text in texts:
        all_words.update(get_words(text))
    all_words = tuple(all_words)

    # Vectorise each text once and reuse the vectors for both steps.
    vectors = [vectorspaced(text, all_words) for text in texts]

    cluster = GAAClusterer(5)
    cluster.cluster(vectors)
    classified_examples = [cluster.classify(vector) for vector in vectors]

    # NOTE(review): `job_titles` is not defined in this function, while the
    # cluster ids above belong to tweets -- pairing with the tweet texts may
    # have been intended; confirm against the rest of the module.
    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        # Single formatted string: prints identically whether `print` is the
        # Python 2 statement or the print function.
        print("%s %s" % (cluster_id, title))
def cluster_texts(texts, clustersNumber, distance):
    """Group ``texts`` into ``clustersNumber`` clusters using GAAC.

    Each text is represented by the vector that the ``TF`` helper produces
    over the unique terms of the whole collection. ``distance`` is accepted
    but not used here.

    Returns the per-text cluster assignments from
    ``GAAClusterer.cluster(vectors, True)``.
    """
    # Wrap the texts in a TextCollection.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    # Unique terms of the collection become the vector dimensions.
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))

    def as_vector(text):
        # One vector per text, built by the TF helper.
        return numpy.array(TF(text, unique_terms, collection))

    vectors = [as_vector(text) for text in texts]
    print("Vectors created.")

    # Fit the agglomerative clusterer; True returns the assignments.
    gaac = GAAClusterer(clustersNumber)
    return gaac.cluster(vectors, True)
title_components = [normalize_word(word) for word in title.split()] return numpy.array( [word in title_components and not word in stopwords for word in words], numpy.short) if __name__ == '__main__': filename = 'example.txt' if len(sys.argv) == 2: filename = sys.argv[1] with open(filename) as title_file: job_titles = [line.strip() for line in title_file.readlines()] words = get_words(job_titles) # cluster = KMeansClusterer(5, euclidean_distance) cluster = GAAClusterer(5) cluster.cluster([vectorspaced(title) for title in job_titles if title]) # NOTE: This is inefficient, cluster.classify should really just be # called when you are classifying previously unseen examples! classified_examples = [ cluster.classify(vectorspaced(title)) for title in job_titles ] for cluster_id, title in sorted(zip(classified_examples, job_titles)): print cluster_id, title
def cluster3(index, k):
    """Cluster the vectors in ``index`` into ``k`` groups with nltk's GAAC.

    Returns the value of ``GAAClusterer(k).cluster(index, True)``, i.e. the
    cluster assignments requested by the ``True`` flag.
    """
    from nltk.cluster import GAAClusterer

    return GAAClusterer(k).cluster(index, True)
silhouette_score(tfidf, array1, metric='euclidean', sample_size=None, random_state=None) #0.031277350000072916 ## clustering only on the training data km2 = KMeans(n_clusters=num_clusters, random_state=42) km2.fit(X_all) clusters2 = km2.labels_.tolist() array2 = np.array(clusters2) silhouette_score(X_all, array2, metric='euclidean', sample_size=None, random_state=None) #0.037444797109297122 from nltk.cluster import GAAClusterer clusterer = GAAClusterer(4) clusters_agg = clusterer.cluster(X_all.toarray(), True) array3 = np.array(clusters_agg) # EValuating the nltk Agglomerative clustering silhouette_score(X_all, array3, metric='cosine', sample_size=None, random_state=None)
return numpy.array( [word in title_components and not word in stopwords for word in words], numpy.short) if __name__ == '__main__': filename = 'CSV/pridected_true_text_alldata.csv' if len(sys.argv) == 2: filename = sys.argv[1] with open(filename) as title_file: job_titles = [line.strip() for line in title_file.readlines()] words = get_words(stemmer, job_titles) # cluster = KMeansClusterer(5, euclidean_distance) cluster = GAAClusterer(30) cluster.cluster( [vectorspaced(stemmer, title) for title in job_titles if title]) # NOTE: This is inefficient, cluster.classify should really just be # called when you are classifying previously unseen examples! classified_examples = [ cluster.classify(vectorspaced(stemmer, title)) for title in job_titles ] for cluster_id, title in sorted(zip(classified_examples, job_titles)): print cluster_id, title
#nltk kmeans model = KMeansClusterer(cluster_number, distance=cosine_distance, repeats=epochs) clusters = model.cluster(vectors, assign_clusters=True) dump(model, '../data/advanced_nltk_kmeans.joblib') # Just cluster data['cluster'] = pd.DataFrame(clusters) data[['text', 'cluster']].to_csv('../data/text_clustered_nltk_kmeans.csv', index=True, quoting=csv.QUOTE_ALL) #nltk GAAClusterer model = GAAClusterer(num_clusters=cluster_number) model.cluster(vectors, assign_clusters=True) clusters = [model.classify_vectorspace(vector.tolist()) for vector in vectors] data['cluster'] = pd.DataFrame(clusters) data[['text', 'cluster']].to_csv('../data/text_clustered_nltk_gaac.csv', index=True, quoting=csv.QUOTE_ALL) #sklearn means model = KMeans(n_clusters=cluster_number, max_iter=epochs, n_jobs=8) model.fit(vectors) dump(model, '../data/advanced_sklearn_kmeans.joblib') data['cluster'] = pd.DataFrame(model.labels_)