def grouper(filename):
    stemmer_func = nltk.stem.snowball.EnglishStemmer().stem

    @decorators.memoize
    def normalize_word(word):
        return stemmer_func(word.lower())

    def get_words(titles):
        words = set()
        for title in titles:
            for word in title.split():
                words.add(normalize_word(word))
        return list(words)

    @decorators.memoize
    def vectorspaced(title):
        title_components = [normalize_word(word) for word in title.split()]
        return numpy.array([
            word in title_components
            for word in words], numpy.short)

    with open(filename) as title_file:
        job_titles = [line.decode('utf-8').strip() for line in title_file.readlines()]
        #name = Data(keyword = job_titles)
        #db.session.add(name)
        #db.session.commit()
        words = get_words(job_titles)

        # pick k from the vocabulary size (thresholds now cover the full range;
        # the original skipped sizes between 1000 and 1500)
        if len(words) >= 1500:
            k = 75
        elif len(words) >= 500:
            k = 55
        elif len(words) > 200:
            k = 30
        else:
            k = 15

        cluster = KMeansClusterer(k, euclidean_distance, avoid_empty_clusters=True)
        cluster.cluster([vectorspaced(title) for title in job_titles if title])
        classified_examples = [
            cluster.classify(vectorspaced(title)) for title in job_titles
        ]

        global gen_file
        gen_file = str(uuid.uuid4()) + ".csv"
        f = open("/home/ubuntu/downloads/" + gen_file, 'wb')
        try:
            w = csv.writer(f)
            w.writerow(('Search Terms', 'GroupID'))
            for cluster_id, title in sorted(zip(classified_examples, job_titles)):
                w.writerow((title.encode('utf-8'), cluster_id))
            #print "done"
        finally:
            f.close()

        f1 = open("/home/ubuntu/time/" + gen_file + ".txt", 'wb')
        try:
            t = (time.time() - start_time)
            f1.write(str(t))
        finally:
            f1.close()  # was f.close(), which closed the wrong handle
def get_clusters(txt):
    clusters = {}
    num_clusters = len(txt) / 4
    if num_clusters < 2:
        num_clusters = 2
    if num_clusters > 5:
        num_clusters = 5
    #txt = [''.join([l for l in txt])]
    #print txt
    responses = [line.strip() for line in txt]
    words = get_words(responses)
    cluster = KMeansClusterer(num_clusters, euclidean_distance,
                              repeats=100, avoid_empty_clusters=True)
    cluster.cluster(
        [vectorspaced(response, words) for response in responses if response])
    classified_examples = [
        cluster.classify(vectorspaced(response, words)) for response in responses
    ]
    for cluster_id, title in sorted(zip(classified_examples, responses)):
        if cluster_id not in clusters:
            clusters[cluster_id] = [title]
        else:
            clusters[cluster_id].append(title)
    return clusters
def demo():
    # example from figure 14.9, page 517, Manning and Schutze
    from nltk.cluster import KMeansClusterer, euclidean_distance

    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds
    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    # classify a new vector
    vector = numpy.array([3, 3])
    print('classify(%s):' % vector, end=' ')
    print(clusterer.classify(vector))
    print()
def my_demo_main(file_list_name, tokenizer_num=0):
    from mmseg import seg_txt
    from nltk.cluster import KMeansClusterer, euclidean_distance
    from nltk.cluster import GAAClusterer

    tokenizer_list = [seg_txt, ]
    file_list = open(file_list_name)
    tokenizer = tokenizer_list[tokenizer_num]
    texts = [[term for term in tokenizer(open('pos/' + str(file_name.strip())).read())]
             for file_name in file_list]
    data = TF_IDF(texts)
    vectors = []
    file_count = 1
    feature_set = set()

    # keep the top 15% of terms per document (by tf-idf) as the feature set
    for text in data.texts:
        vector = list()
        for term in set(text):
            vector.append((data.tf_idf(term, text), term))
        vector.sort(key=lambda x: x[0], reverse=True)
        for term in vector[:int(len(vector) * 0.15) + 1]:
            feature_set.add(term[1])
    print feature_set
    print len(feature_set)

    # build a length-normalized tf-idf vector per document over the feature set
    for text in data.texts:
        vector = list()
        for term in feature_set:
            if term in text:
                vector.append(data.tf_idf(term, text))
            else:
                vector.append(0)
        square_sum = map(lambda x: x * x, vector)
        square_sum = math.sqrt(sum(square_sum))
        vector = map(lambda x: x / square_sum, vector)
        vectors += [numpy.array(vector)]
        print file_count
        file_count += 1

    means = find_max_density(vectors, euclidean_distance)
    print 'means', len(means)

    f = open('result.txt', 'w')
    clusterer = KMeansClusterer(len(means), euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, True)
    print 'km1', clusters
    f.write('km1: ' + str(clusters) + '\n')

    clusterer = KMeansClusterer(len(vectors) / 10, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True, True)
    print 'km2', clusters
    f.write('km2: ' + str(clusters) + '\n')

    clusterer = GAAClusterer(len(vectors) / 10)
    clusters = clusterer.cluster(vectors, True)
    print 'gaac', clusters
    f.write('gaac: ' + str(clusters) + '\n')
    f.close()
def get_word_clusters(tweets):
    ListTweets = get_all_text(tweets)
    ListTweets = list(ListTweets)

    # Project tweet text onto a vector space
    vs_tweets = list(TweetVectors(tweets))

    cluster = KMeansClusterer(10, euclidean_distance, avoid_empty_clusters=True)
    cluster.cluster(vs_tweets)
    classified_examples = [cluster.classify(tweet) for tweet in vs_tweets]

    for cluster_id, tweet in sorted(zip(classified_examples, ListTweets)):
        print cluster_id, tweet
class KMeansTopics(BaseEstimator, TransformerMixin):
    def __init__(self, k=10):
        """
        :param k: number of clusters (int)
        """
        cosine = nltk.cluster.util.cosine_distance
        self.model = KMeansClusterer(
            k, distance=cosine, avoid_empty_clusters=True)

    def fit(self, sents):
        return self

    def transform(self, sents):
        # return the cluster assignment for each sentence vector
        # (the original called cluster() but returned nothing)
        return self.model.cluster(sents, assign_clusters=True)
def main():
    tracknames = get_tracknames()
    #title_file = open("example_jobs.txt", 'r')
    #job_titles = [line.strip() for line in title_file.readlines()]
    words = get_words(tracknames)
    cluster = KMeansClusterer(20, euclidean_distance, avoid_empty_clusters=True)
    cluster.cluster([vectorspaced(trackname, words)
                     for trackname in tracknames if trackname])
    classified_examples = [cluster.classify(vectorspaced(trackname, words))
                           for trackname in tracknames]
    for cluster_id, title in sorted(zip(classified_examples, tracknames)):
        print cluster_id, title
def clusterize(self, noClusters, noNounsToKeep, **kwargs):
    """ """
    root = getUtility(ISiteRoot)
    catalog = getToolByName(root, 'portal_catalog')
    nounTermsIndex = catalog._catalog.getIndex('noun_terms')
    uidTermsIndex = catalog._catalog.getIndex('UID')
    nounTermsIndexIds = []
    allNouns = set()
    docnouns = []
    vectors = []
    for key in nounTermsIndex._unindex.keys():
        importantNouns = nounTermsIndex._unindex[key][:noNounsToKeep]
        if importantNouns:
            nounTermsIndexIds.append(key)
            docnouns.append(importantNouns)
            allNouns = allNouns.union(importantNouns)
    for nouns in docnouns:
        vector = [(noun in nouns and 1 or 0) for noun in allNouns]
        vectors.append(numpy.array(vector))
    clusterer = KMeansClusterer(noClusters, pearson, **kwargs)
    clusters = clusterer.cluster(vectors, True)
    result = {}
    for i in range(noClusters):
        result[i] = []
    for i in range(len(nounTermsIndexIds)):
        docid = nounTermsIndexIds[i]
        uid = uidTermsIndex._unindex[docid]
        result[clusters[i]] = result[clusters[i]] + [uid]
    return result
def cluster(self, vectors):
    if self.clustering_params['method'] == "KMeans_NLTK":
        kmeans = KMeansClusterer(num_means=20,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25,
                                 avoid_empty_clusters=True)
        self.dataset['cluster'] = \
            kmeans.cluster(vectors, assign_clusters=True)
    elif self.clustering_params['method'] == "KMeans":
        kmeans = KMeans(n_clusters=eval(self.clustering_params['n_clusters']))
        kmeans.fit(vectors)
        clusters = kmeans.cluster_centers_
        self.dataset['cluster'] = kmeans.predict(self.vectors)

    print(self.dataset[['cluster', 'id', 'text']])
    self.clustered_filename = f'{self.disaster_name}_{self.clustering_params["method"]}' + \
        f'_{self.clustering_params["n_clusters"]}'.replace(" ", "_")
    current_time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    self.dataset.to_csv(
        f"{self.disaster_dir}/kmeans/{self.clustering_params['method']}_{self.clustering_params['n_clusters']}_{current_time}.csv",
        index=False)
    filename = f"{self.disaster_dir}/kmeans/{self.clustering_params['method']}_{self.clustering_params['n_clusters']}_{current_time}"
    # the with-blocks close the files; the explicit close() calls were redundant
    with open(filename + '.pkl', 'wb') as file:
        pickle.dump(kmeans, file)
    with open(filename + '.vec', 'wb') as file:
        pickle.dump(self.vectors, file)
    return self.dataset, filename + '.pkl'
def test_knn_result():
    model = gensim.models.Word2Vec.load(
        '/Users/holly/Desktop/毕设/Data/(旧)PlatformsComments/p2p.word2vec.model'
    )
    X = model[model.wv.vocab]

    from nltk.cluster import KMeansClusterer
    import nltk
    NUM_CLUSTERS = 5
    kclusterer = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

    words = list(model.wv.vocab)
    cluster_dict = {0: [], 1: [], 2: [], 3: [], 4: []}
    for i, word in enumerate(words):
        cluster_dict[assigned_clusters[i]].append(word)

    # write one file per cluster; the with-block closes each file
    for j in range(5):
        with open(
                os.path.join(
                    '/Users/holly/Desktop/毕设/Data/(旧)PlatformsComments/result',
                    str(j) + '.txt'), 'w') as f:
            for word in cluster_dict[j]:
                f.write("%s\n" % word)
def clustering(self, modelpath, hashtag_cluster_path, num_clusters):
    # load the word2vec model
    model = Word2Vec.load(modelpath)
    X = model.wv.vectors

    # clustering
    num_clusters = int(num_clusters)
    kclusterer = KMeansClusterer(num_clusters,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

    # distribute hashtags into their respective clusters
    # (setdefault replaces the original bare try/except)
    words = list(model.wv.vocab)
    cluster_distribution = {}
    for i, word in enumerate(words):
        cluster_distribution.setdefault(str(assigned_clusters[i]), []).append(word)

    # save the cluster distribution
    with open(hashtag_cluster_path, "w") as write_file:
        json.dump(cluster_distribution, write_file)
    print('saved hashtag cluster.')
def cluster(self, embedding, NUM_CLUSTERS):
    kclusterer = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
    assigned_clusters = kclusterer.cluster(embedding, assign_clusters=True)
    return assigned_clusters
class PartionalNltk():
    def __init__(self):
        self.clf = KMeansClusterer(2, cosine_distance, repeats=30,
                                   avoid_empty_clusters=True)

    def cluster(self, data):
        clusters = self.clf.cluster(data.toarray(), True)
        return np.array(clusters)

    def f_score(self, cluster, f_score_dict):
        for cl in f_score_dict:
            docs = np.array(f_score_dict[cl]['docs'])
            nri = np.intersect1d(cluster, docs).shape[0]
            nr = docs.shape[0]
            ni = cluster.shape[0]
            #print nri, nr, ni
            try:
                recall = float(nri) / float(nr)
                precision = float(nri) / float(ni)
                f_score = (2 * precision * recall) / (precision + recall)
                # keep the best f-score seen so far for this class
                f_score_dict[cl]['fscore'] = f_score if (
                    f_score > f_score_dict[cl]['fscore']
                    or not f_score_dict[cl]['fscore']) else f_score_dict[cl]['fscore']
            except ZeroDivisionError, e:
                #print e
                pass
        return f_score_dict
def clustering(data, cluster, n_classes):
    print('\n------------------GMM\n')
    assigned_clusters = mixture.GaussianMixture(
        n_components=n_classes, covariance_type='tied').fit_predict(data)
    print('Mutual_info_score =',
          mutual_info_score(cluster - 1, assigned_clusters))
    print('Adjusted_mutual_info_score =',
          adjusted_mutual_info_score(cluster - 1, assigned_clusters,
                                     average_method='min'))
    print('Adjusted_rand_score =',
          adjusted_rand_score(cluster - 1, assigned_clusters))

    print('\nK_MEANS')
    kclusterer = KMeansClusterer(num_means=n_classes,
                                 distance=nltk.cluster.util.cosine_distance)
    assigned_clusters = kclusterer.cluster(data, assign_clusters=True)
    print('Mutual_info_score =',
          mutual_info_score(cluster - 1, assigned_clusters))
    print('Adjusted_mutual_info_score =',
          adjusted_mutual_info_score(cluster - 1, assigned_clusters,
                                     average_method='min'))
    print('Adjusted_rand_score =',
          adjusted_rand_score(cluster - 1, assigned_clusters))
class KMeansClusters(BaseEstimator, TransformerMixin):
    """
    Cluster text data using k-means. Makes use of nltk k-means clustering.
    Allows for alternative distance measures.
    """

    def __init__(self, k=7):
        self.k = k
        self.distance = nltk.cluster.util.cosine_distance
        self.model = KMeansClusterer(self.k, self.distance,
                                     avoid_empty_clusters=True)

    def fit(self, documents, labels=None):
        return self

    def transform(self, documents):
        """
        Fits the K-means model to the given documents.

        Parameters
        ----------
        documents : vectorized documents, one numeric vector per document.

        Returns
        -------
        array of cluster assignments, one per document.
        """
        return np.array(self.model.cluster(documents, assign_clusters=True))
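# A minimal usage sketch for the KMeansClusters transformer above, assuming the
# documents have already been vectorized upstream. The tiny one-hot rows below
# are hypothetical, for illustration only.
import numpy as np

docs = np.array([
    [1, 0, 1, 0],
    [1, 0, 1, 1],
    [0, 1, 0, 0],
], dtype=np.short)

km = KMeansClusters(k=2)
assignments = km.fit(docs).transform(docs)
print(assignments)  # three cluster labels in {0, 1}; the grouping varies run to run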
def word2vec_cluster(in_file, out_file):
    sentences = []
    # read the corpus (handles renamed so they no longer shadow the parameters)
    with codecs.open(in_file, 'r', encoding='utf-8', errors='ignore') as fin:
        corpus = fin.readlines()
    for line in corpus:
        line = line.strip('\n')
        if not line:
            continue
        line = line.lower()
        line = line.split(" ")
        sentences.append(line)

    print("training model...")
    model = Word2Vec(sentences, min_count=2)

    print("get vector data...")
    X = model[model.wv.vocab]

    NUM_CLUSTERS = 50
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 distance=nltk.cluster.util.cosine_distance,
                                 avoid_empty_clusters=True, repeats=30)
    print("assigning cluster..")
    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

    words = list(model.wv.vocab)
    with open(out_file, 'a') as fout:
        for i, word in enumerate(words):
            fout.write(word + ":" + str(assigned_clusters[i]) + '\n')
def clusterize(self, noClusters, noNouranksToKeep, **kwargs):
    """ """
    storage = getUtility(INounPhraseStorage)
    docids = storage.rankedNouns.keys()
    docnouns = []
    allNouns = set()
    vectors = []
    for key in docids:
        importantNouns = storage.getNounTerms(key, noNouranksToKeep)
        docnouns.append(importantNouns)
        allNouns = allNouns.union(importantNouns)
    for nouns in docnouns:
        vector = [(noun in nouns and 1 or 0) for noun in allNouns]
        vectors.append(numpy.array(vector))
    clusterer = KMeansClusterer(noClusters, pearson, **kwargs)
    clusters = clusterer.cluster(vectors, True)
    result = {}
    for i in range(noClusters):
        result[i] = []
    for index, docid in enumerate(docids):
        result[clusters[index]] = result[clusters[index]] + [docid]
    return result
def main():
    NUM_CLUSTERS = 3
    model = Word2Vec.load("test_word2vec_1.model")
    model_data = model[
        model.wv.vocab]  # convert the word2vec model into data for k-means clustering

    # Check that the clustering works ---------------------------------------------
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(model_data, assign_clusters=True)
    print(assigned_clusters)
    words = list(model.wv.vocab)
    for i, word in enumerate(words):
        print(word + ":" + str(assigned_clusters[i]))
    # ------------------------------------------------------------------------------

    # model_data -> get vector data
    # clustering -------------------------------------------------------------------
    kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
    kmeans.fit(model_data)  # clustering
    labels = kmeans.labels_  # label values for each data point
    centroids = kmeans.cluster_centers_  # coordinates of each cluster's centroid

    # Check the results ------------------------------------------------------------
    print("Cluster id labels for inputted data")
    print(labels)
    print("Centroids data")
    print(centroids)
def __kmeans(self, aspect, vectors, id_sentences, k=50):
    ''' Cluster sentences using the K-Means Algorithm '''
    k = min(k, len(vectors))
    vectors = [array(v) for v in vectors]
    means = vectors[:k]
    clusterer = KMeansClusterer(k, euclidean_distance, initial_means=means,
                                avoid_empty_clusters=True)
    with utils.Capturing() as output:
        clusters = clusterer.cluster(vectors, True)
    for id_cluster in range(k):
        self.__clusters[aspect][id_cluster] = {
            'importance': 0,
            'sentences': [],
            'representative_words': [],
            'max_sentence': None
        }
    for index, id_cluster in enumerate(clusters):
        self.__clusters[aspect][id_cluster]['sentences'].append(
            id_sentences[index])
    for id_cluster in range(k):
        # Delete empty clusters
        if len(self.__clusters[aspect][id_cluster]['sentences']) == 0:
            self.__clusters[aspect].pop(id_cluster)
    self.__search_representative_words(aspect)
def cluster(abstracts, mode, metric, debug=False, repeats=10):
    '''
    K-means clustering with evaluation metrics, using a custom distance
    function and the provided abstracts.
    '''
    labels = []
    vectors = []

    # create vectors and labels; k will be the number of ground-truth labels
    construct(abstracts, vectors, mode)
    k = label(abstracts, labels)

    # cluster
    clusterer = KMeansClusterer(k, metric, repeats=repeats, normalise=True,
                                avoid_empty_clusters=True)
    clusters = clusterer.cluster(vectors, assign_clusters=True, trace=debug)
    means = clusterer.means()

    print
    print "EVALUATION:"

    # compute evaluation metrics
    dist = sumdistance(vectors, clusters, means)
    pure = purity(clusters, labels, k)
    entr = entropy(clusters, labels, k)
    f, rand = f1(clusters, labels, k)

    print "Sum of distances: %f" % dist
    print "Purity: %f" % pure
    print "Entropy: %f" % entr
    print "Rand index: %f" % rand
    print "F1 measure: %f" % f
def evaluate_clustering(self):
    data = pd.read_csv(self.clustering_data, names=['Word', 'Cat'])
    full = pd.concat([data,
                      pd.DataFrame({'v' + str(i): [self.model.wv[word][i]
                                                   for word in data['Word']]
                                    for i in range(self.model.vector_size)})],
                     axis=1)
    ncats = len(set(full['Cat']))

    kclusterer = KMeansClusterer(ncats,
                                 distance=nltk.cluster.util.euclidean_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(
        np.array(full[['v' + str(i) for i in range(self.model.vector_size)]]),
        assign_clusters=True)
    data['Euclidean'] = assigned_clusters

    kclusterer = KMeansClusterer(ncats,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(
        np.array(full[['v' + str(i) for i in range(self.model.vector_size)]]),
        assign_clusters=True)
    data['Cosine'] = assigned_clusters

    self.clustering_evaluation_dataset = data
    self.summary['Clustering Purity (Euclidean)'] = self.purity('Euclidean', ncats)
    self.summary['Clustering Purity (Cosine)'] = self.purity('Cosine', ncats)
    self.summary['Clustering Entropy (Euclidean)'] = self.entropy('Euclidean')
    self.summary['Clustering Entropy (Cosine)'] = self.entropy('Cosine')
    self.summary['Clustering Rand Index (Euclidean)'] = self.rand_index('Euclidean')
    self.summary['Clustering Rand Index (Cosine)'] = self.rand_index('Cosine')
def clusterize(data, repeats=50):
    clusterer = KMeansClusterer(5, iou_dist_function, repeats=repeats,
                                avoid_empty_clusters=True)
    clusters = clusterer.cluster(data, True)  # was `vectors`, which is undefined here
    #print(clusters)
    anchors = clusterer.means()
    return anchors
def kmeans_test(model, documents):
    count = len(documents)
    vectors = []
    print("done")
    kclusterer = KMeansClusterer(20,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(model.docvecs, assign_clusters=True)
def cluster_kmean(train_file, test_file):
    """Load train and test data into data frames."""
    f_train = open(train_file, encoding="utf-8")
    train_data = json.load(f_train)
    df_train = pd.DataFrame(train_data, columns=['text'])
    f_train.close()

    f_test = open(test_file, encoding='utf-8')
    test_data = json.load(f_test)
    df_test = pd.DataFrame(test_data, columns=['text', 'labels'])
    f_test.close()

    labels = df_test.labels
    labels = list(set(sum(labels, [])))[:3]

    # Initialize the TF-IDF vectorizer
    tfidf_vect = TfidfVectorizer(stop_words="english", min_df=5)
    dtm = tfidf_vect.fit_transform(df_train['text'])

    num_clusters = 3

    # Initialize clustering
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=20)
    clusters = clusterer.cluster(dtm.toarray(), assign_clusters=True)

    centroids = np.array(clusterer.means())
    sorted_centroids = centroids.argsort()[:, ::-1]
    voc_lookup = tfidf_vect.get_feature_names()

    test_dtm = tfidf_vect.transform(df_test.text)
    predicted = [clusterer.classify(v) for v in test_dtm.toarray()]

    df_test['label_test'] = df_test['labels'].apply(lambda x: x[0])
    confusion_df = pd.DataFrame(
        list(zip(df_test["label_test"].values, predicted)),
        columns=["actual_class", "cluster"])
    df_result = pd.crosstab(index=confusion_df.cluster,
                            columns=confusion_df.actual_class)
    print(df_result)

    # map each cluster to its majority ground-truth topic
    df_clusterLabelsPredicted = list(df_result.apply(lambda x: x.idxmax(), axis=1))
    cluster_dict = dict((i, j) for i, j in enumerate(df_clusterLabelsPredicted))
    predicted_target = [cluster_dict[i] for i in predicted]
    print(metrics.classification_report(df_test["label_test"], predicted_target))
    for i in cluster_dict:
        print("Cluster %d : Topic %s" % (i, cluster_dict[i]))
def KmeansClustering(trainX, numberOfClusters, numberOfRepeats):
    # init cluster with trainX
    # example taken from https://www.nltk.org/_modules/nltk/cluster/kmeans.html#demo
    clusterer = KMeansClusterer(numberOfClusters, cosine_distance,
                                initial_means=None, repeats=numberOfRepeats)
    assigned_clusters = clusterer.cluster(trainX, assign_clusters=True)
    return clusterer, assigned_clusters
class KMeansTopics(object):

    def __init__(self, corpus, k=10):
        """
        corpus is a corpus object, e.g. an HTMLCorpusReader()
        or an HTMLPickledCorpusReader() object

        k is the number of clusters
        """
        self.k = k
        self.model = None
        self.vocab = list(
            set(normalize(corpus.words(categories=['news'])))
        )

    def vectorize(self, document):
        """
        Vectorizes a document consisting of a list of part of speech
        tagged tokens using the segmentation and tokenization methods.

        One-hot encode the set of documents
        """
        features = set(normalize(document))
        return np.array([
            token in features for token in self.vocab], np.short)

    def cluster(self, corpus):
        """
        Fits the K-Means model to the given data.
        """
        cosine = nltk.cluster.util.cosine_distance
        self.model = KMeansClusterer(
            self.k, distance=cosine, avoid_empty_clusters=True)
        self.model.cluster([
            self.vectorize(corpus.words(fileid))
            for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Pass through to the internal model classify
        """
        return self.model.classify(self.vectorize(document))
def cluster_kmean(train_file, test_file):
    data = pd.read_json(train_file, orient='columns')
    data.columns = ["text"]

    tfidf_vect = TfidfVectorizer(min_df=5, stop_words='english')
    dtm = tfidf_vect.fit_transform(data["text"])

    num_clusters = 3
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=5)
    clusters = clusterer.cluster(dtm.toarray(), assign_clusters=True)

    test = pd.read_json(test_file, orient='columns')
    test.columns = ["text", "label"]

    # convert a dataframe with multiple targets to the first target
    x = test["label"]
    truth = []
    for item in x:
        truth.append(item[0])
    test["label"] = truth

    test_dtm = tfidf_vect.transform(test["text"])
    predicted = [clusterer.classify(v) for v in test_dtm.toarray()]
    confusion_df = pd.DataFrame(list(zip(test["label"].values, predicted)),
                                columns=["label", "cluster"])
    crosstab = pd.crosstab(index=confusion_df.cluster,
                           columns=confusion_df.label)
    print("using cosine: ")
    print(crosstab)
    dfmax = crosstab.idxmax(axis=1)
    print(dfmax)
    cluster_dict = {0: dfmax[0], 1: dfmax[1], 2: dfmax[2]}
    predicted_target = [cluster_dict[i] for i in predicted]
    print(metrics.classification_report(test["label"], predicted_target))

    # KMeans with 20 different centroid seeds
    num_clusters = 3
    km = KMeans(n_clusters=num_clusters, n_init=20).fit(dtm)
    clusters = km.labels_.tolist()
    predicted2 = km.predict(test_dtm)
    confusion_df2 = pd.DataFrame(list(zip(test["label"].values, predicted2)),
                                 columns=["label", "cluster"])
    crosstab2 = pd.crosstab(index=confusion_df2.cluster,
                            columns=confusion_df2.label)
    print("using Euclidean distance")
    print(crosstab2)
    dfmax = crosstab2.idxmax(axis=1)
    print(dfmax)
    cluster_dict = {0: dfmax[0], 1: dfmax[1], 2: dfmax[2]}
    predicted_target2 = [cluster_dict[i] for i in predicted2]
    print(metrics.classification_report(test["label"], predicted_target2))
    return None
def get_clusters(self, vectors):
    vectors = [numpy.array(v) for v in vectors]
    init_means = [copy(vectors[i]) for i in range(self.num_clusters)]
    clusterer = KMeansClusterer(self.num_clusters, euclidean_distance,
                                initial_means=init_means,
                                avoid_empty_clusters=True)
    clusters = clusterer.cluster(vectors, True)
    return clusters
def clustering_on_wordvecs(word_vectors, num_clusters):
    # Initialize a k-means object and use it to extract centroids
    #kmeans_clustering = KMeans(n_clusters=num_clusters, init='k-means++')
    #idx = kmeans_clustering.fit_predict(word_vectors)
    kclusterer = KMeansClusterer(num_clusters,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    # cluster the vectors passed in (was `X`, which is undefined here)
    assigned_clusters = kclusterer.cluster(word_vectors, assign_clusters=True)
    return assigned_clusters
def clustering_question(sents, sents_word2vec, NUM_CLUSTERS=15):
    kclusterer = KMeansClusterer(
        NUM_CLUSTERS,
        distance=nltk.cluster.util.cosine_distance,
        repeats=25,
        avoid_empty_clusters=True)
    assigned_clusters = kclusterer.cluster(sents_word2vec, assign_clusters=True)

    data = pd.DataFrame([], columns=['text', 'cluster', 'centroid'])
    data.loc[:, 'text'] = sents
    data.loc[:, 'cluster'] = pd.Series(assigned_clusters, index=data.index)
    data.loc[:, 'centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])

    return data, assigned_clusters
def kgen(self, num):
    num = int(num)
    clusterer = KMeansClusterer(num, distance=cosine_distance, repeats=20)
    vecs = self.model.wv[self.model.wv.vocab]
    assignments = clusterer.cluster(vecs, assign_clusters=True)
    self.vocab_to_cluster_map = dict(zip(self.model.wv.vocab, assignments))
    self.clusters = dict()
    for word, index in self.vocab_to_cluster_map.items():
        if index in self.clusters:
            # was `+= word`, which extends the list with the word's characters
            self.clusters[index].append(word)
        else:
            self.clusters[index] = [word]
def cosine_cluster(num_clusters, matrix):
    print("Running k-means using cosine distance...\n")
    matrix = np.asanyarray(matrix)
    k_means = KMeansClusterer(num_clusters, cosine_distance,
                              avoid_empty_clusters=True)
    clusters = k_means.cluster(matrix, assign_clusters=True, trace=False)
    print("Successfully found %d clusters in %d dimensions \n"
          % (num_clusters, matrix.shape[1]))
    return clusters
def get_kmeans_predicted_clusters(word_representions, Num_clusters):
    # from dictionary type to transposed dataframe
    Y = pd.DataFrame(data=word_representions).T
    X = Y.values
    # Cluster the data with NLTK's k-means (the original comment said sklearn)
    kclusterer = KMeansClusterer(Num_clusters,
                                 distance=nltk.cluster.util.euclidean_distance,
                                 repeats=25,
                                 avoid_empty_clusters=False)
    predicted_clusters = kclusterer.cluster(X, assign_clusters=True)
    return predicted_clusters
def demo_1():
    urls = [
        "www.ai-one.com",
        "http://en.wikipedia.org/wiki/Albert_Einstein",
        "http://en.wikipedia.org/wiki/USA",
        "http://en.wikipedia.org/wiki/Microsoft"
    ]
    keywords = [get_keywords(url) for url in urls]
    all_words = set(chain(*keywords))
    vectors = [vector_from_keywords(kw, all_words) for kw in keywords]
    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
def k_means_experiment(sample,
                       distance: ['euclidean_distance', 'cosine_distance'] = 'cosine_distance',
                       min_k=2, max_k=50):
    score = []
    silhouette = []
    k_assigment_cluster = []
    if distance == 'euclidean_distance':
        for k in range(min_k, max_k):
            kclusterer = KMeansClusterer(num_means=k,
                                         distance=nltk.cluster.util.euclidean_distance,
                                         avoid_empty_clusters=True, repeats=1)
            assigned_clusters = kclusterer.cluster(sample, assign_clusters=True)
            # kmeans = KMeans(n_clusters=k,random_state=5,n_jobs=-1,n_init=20,max_iter=500).fit(tickets_vec)
            # labels = np.array(kmeans.labels_)
            silhouette.append(silhouette_score(X=sample, labels=np.array(assigned_clusters)))
            score.append(davies_bouldin_score(sample, assigned_clusters))
            k_assigment_cluster.append(assigned_clusters)
    else:
        for k in range(min_k, max_k):
            #kmeans = KMeans(n_clusters=k,random_state=5,n_jobs=-1,n_init=20,max_iter=500).fit(tickets_vec)
            #assigned_clusters = kmeans.labels_
            kclusterer = KMeansClusterer(num_means=k,
                                         distance=nltk.cluster.util.cosine_distance,
                                         avoid_empty_clusters=True, repeats=1)
            assigned_clusters = kclusterer.cluster(sample, assign_clusters=True)
            silhouette.append(silhouette_score(X=sample, labels=np.array(assigned_clusters)))
            score.append(davies_bouldin_score(sample, assigned_clusters))
            k_assigment_cluster.append(assigned_clusters)

    plt.plot(np.arange(min_k, max_k), np.array(score), label='Davies-Bouldin score')
    plt.plot(np.arange(min_k, max_k), np.array(silhouette), label='silhouette score')
    plt.xlabel('number of clusters')
    plt.ylabel('score')
    plt.title('K-means Cluster Scoring')
    plt.legend()
    plt.show()
    return kclusterer, k_assigment_cluster
def nltk_kmeans(word_vectors, k):
    from nltk.cluster import KMeansClusterer
    import nltk

    #word_vectors.init_sims()
    norm_vectors = word_vectors.syn0
    kmeans = KMeansClusterer(k, nltk.cluster.util.cosine_distance, repeats=25)
    assigned_clusters = kmeans.cluster(norm_vectors, assign_clusters=True)

    clusters = defaultdict(list)
    for idx in range(0, len(word_vectors.index2word)):
        clusters[assigned_clusters[idx]].append(word_vectors.index2word[idx])
    return (clusters, kmeans)
def train(X, y, train_ratio):
    from sklearn.cluster import KMeans
    from sklearn.linear_model import LogisticRegression
    from sklearn import svm
    from sklearn.metrics import precision_score, recall_score, f1_score
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    test_ratio = 1 - train_ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=36)

    """ Classification """
    # clf = LogisticRegression(C=1000.0, random_state=0).fit(X_train, y_train)
    # clf = svm.SVC(kernel='linear', C=1e30).fit(X_train, y_train)
    # y_pred = clf.predict(X_test)
    # print(y_test)
    # print(y_pred)
    # print("accuracy: %.2f" %accuracy_score(y_test, y_pred))
    # print("Precision : %.3f" % precision_score(y_test, y_pred))
    # print("Recall : %.3f" % recall_score(y_test, y_pred))
    # print("F1-micro : %.3f" % f1_score(y_test, y_pred, average='micro'))
    # print("F1-macro : %.3f" % f1_score(y_test, y_pred, average='macro'))
    # f1_micro = f1_score(y_test, y_pred, average='micro')
    # f1_macro = f1_score(y_test, y_pred, average='macro')
    # print("F1-macro")
    # print(f1_macro)
    # print("F1-micro")
    # print(f1_micro)

    """ Clustering """
    from sklearn.metrics.cluster import normalized_mutual_info_score
    from nltk.cluster import KMeansClusterer
    import nltk
    NUM_CLUSTERS = 8
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=100, normalise=True,
                                 avoid_empty_clusters=True)
    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
    nmi = normalized_mutual_info_score(assigned_clusters, y)
    return nmi
def cluster(self, k=5, repeats=1):
    '''
    Cluster documents into k clusters using the NLTK implementation
    of K-Means clustering. The frequency of each unique word across
    an article serves as its feature vector.
    '''
    article_freq_count = {}  # frequency of each unique word in a given article
    for article in self.testing_articles:
        article_freq_count[article.id] = []
        for unique_word in self.keywords:
            # count frequency of word in article, add to frequency list
            article_freq_count[article.id].append(article.content.count(unique_word))

    # nltk k-means requires numpy array-like objects
    vectors = [array(article_freq_count[article]) for article in article_freq_count]
    clusterer = KMeansClusterer(k, cosine_distance, repeats=repeats)
    clusters = clusterer.cluster(vectors, True, trace=False)
    groups = [[] for _ in xrange(k)]

    # vector positions need to be converted back to article IDs,
    # because IDs are stripped during vector construction.
    vector_ids = {}  # maps positions in the vector to article IDs
    f = article_freq_count.copy()
    for pos in xrange(len(vectors)):
        for id in f.keys():
            # equivalent to 'if article_freq_count[id] == vectors[pos]',
            # but numpy equivalence checking is weird
            t = article_freq_count[id] == vectors[pos]
            if not False in t:
                vector_ids[pos] = id
                f.pop(id)
    for i in xrange(len(clusters)):
        groups[clusters[i]].append(vector_ids[i])
    return groups
######################################
# Cluster a BOW vector in 4 clusters #
#                                    #
# Requirements: clusterVectors       #
# Usage : %loadpy cluster.py         #
######################################
import nltk
from nltk import cluster
from nltk.cluster import cosine_distance
from nltk.cluster import KMeansClusterer

numClusters = 4

print "KMeans Clustering with %d means and using cosine distance" % numClusters
clusterer = KMeansClusterer(numClusters, cosine_distance)
clusters = clusterer.cluster(clusterVectors, assign_clusters=True, trace=False)
means = clusterer.means()
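# The script above expects a `clusterVectors` variable to already exist in the
# session (hence the %loadpy usage note). A minimal sketch of one way such a
# bag-of-words matrix might be prepared; the documents and vocabulary here are
# hypothetical, for illustration only.
import numpy as np

docs = ["the cat sat", "the dog sat", "stocks fell today", "stocks rose today"]
vocab = sorted({w for d in docs for w in d.split()})

# one row of word counts per document
clusterVectors = [np.array([d.split().count(w) for w in vocab]) for d in docs]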
exit()

with open(filename) as title_file:
    print "Reading Files"
    job_titles = [unicode(line.strip(), "utf-8") for line in title_file.readlines()]

    print "Parsing Words"
    words = get_words(job_titles)

    print "Creating Cluster Instance"
    cluster = KMeansClusterer(10, euclidean_distance, 5)
    # Alternative Clusterer - Less accurate for my use
    #cluster = GAAClusterer(20)

    print "Clustering"
    cluster.cluster([vectorspaced(title) for title in job_titles if title])

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    print "Classifying"
    classified_examples = [
        cluster.classify(vectorspaced(title)) for title in job_titles
    ]

    print "Saving results"
    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        filename = "results/" + str(cluster_id) + ".txt"
        out = codecs.open(filename, "a", "utf-8")  # renamed from `list`, which shadowed the builtin
        out.write(title + "\n")
        out.close()  # the original leaked one handle per title
# ### k-Means Clustering

# [Clustering](http://www.nltk.org/api/nltk.cluster.html) groups similar items together.

# The K-means clusterer starts with k arbitrarily chosen means (or centroids), then
# assigns each vector to the cluster with the closest mean. It then recalculates the
# means of each cluster as the centroid of its vector members. This process repeats
# until the cluster memberships stabilize.
# [NLTK docs on this example](https://www.nltk.org/_modules/nltk/cluster/kmeans.html)

# This example clusters int vectors, which you can think of as points on a plane.
# But you could also use clustering to group similar documents by vocabulary/topic.

# In[80]:

import numpy as np
from nltk.cluster import KMeansClusterer, euclidean_distance

vectors = [np.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
means = [[4, 3], [5, 5]]

clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
clusters = clusterer.cluster(vectors, True, trace=True)

print('Clustered:', vectors)
print('As:', clusters)
print('Means:', clusterer.means())

# **k-Means Clustering, Example-2**

# In this example we cluster an array of 6 points into 2 clusters.
# The initial centroids are randomly chosen by the clusterer, and it repeats the
# clustering 10 times with random seeds, keeping the best grouping it finds.

# In[103]:

vectors = [np.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
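# As in the NLTK k-means demo, cluster with 2 means, repeating 10 times with random seeds:
clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
clusters = clusterer.cluster(vectors, True)
print('Clustered:', vectors)
print('As:', clusters)
print('Means:', clusterer.means())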
def get_cluster(k=K):
    cluster = KMeansClusterer(k, euclidean_distance)
    cluster.cluster([vectorspaced(corpus.words(fileid))
                     for fileid in corpus.fileids()])
    return cluster
    return freq_lore.items()

num_clusters = 20
vec_len = len(common_words)
vector_words = common_words[:vec_len]
word_freqs = [get_word_freq(text) for [text, a, b] in annotated]

# build one flat frequency vector over vector_words for every champion
tmp_vector = []
for champ_freq in word_freqs:
    for word in vector_words:
        appendable = None
        for (aword, afreq) in champ_freq:
            if word == aword:
                appendable = afreq
        if appendable is None:
            tmp_vector.append(0)
        else:
            tmp_vector.append(appendable)

# slice the flat vector back into one row per champion
vector_list = [tmp_vector[i:i + vec_len] for i in range(0, len(tmp_vector), vec_len)]
word_array = numpy.array(vector_list)

clusterer = KMeansClusterer(num_clusters, euclidean_distance, repeats=10)
clusters = clusterer.cluster(word_array, True)

enum_clusters = list(enumerate(clusters))
enum_clusters.sort(key=lambda x: x[1])
clustered_champs = [(annotated[index][0], clus_num)
                    for (index, clus_num) in enum_clusters]
print('clustered_champs', clustered_champs)
    0's are inserted otherwise.

    @param response The survey response to generate a vector for
    '''
    response_components = [normalize_word(word) for word in response.split()]
    return numpy.array([
        word in response_components and not word in stopwords
        for word in words], numpy.short)


if __name__ == '__main__':
    num_clusters = DEFAULT_NUM_CLUSTERS
    if len(sys.argv) == 2:
        num_clusters = int(sys.argv[1])

    with open("reviews.txt") as survey_file:
        responses = [line.strip() for line in survey_file.readlines()]
        words = get_words(responses)

        cluster = KMeansClusterer(num_clusters, euclidean_distance,
                                  repeats=100, avoid_empty_clusters=True)
        cluster.cluster([vectorspaced(response) for response in responses if response])
        classified_examples = [
            cluster.classify(vectorspaced(response)) for response in responses
        ]
        for cluster_id, title in sorted(zip(classified_examples, responses)):
            print cluster_id, title
def kmeans_cluster(datamatrix, numofclusters=3):
    clusterer = KMeansClusterer(numofclusters, euclidean_distance)
    groups = clusterer.cluster(datamatrix, assign_clusters=True, trace=True)
    means = clusterer.means()
    return groups, means