def demo():
    # example from figure 14.9, page 517, Manning and Schutze
    import numpy
    from nltk.cluster import KMeansClusterer, euclidean_distance

    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds
    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    # classify a new vector
    vector = numpy.array([3, 3])
    print('classify(%s):' % vector, end=' ')
    print(clusterer.classify(vector))
    print()
def grouper(filename):
    stemmer_func = nltk.stem.snowball.EnglishStemmer().stem

    @decorators.memoize
    def normalize_word(word):
        return stemmer_func(word.lower())

    def get_words(titles):
        words = set()
        for title in titles:
            for word in title.split():
                words.add(normalize_word(word))
        return list(words)

    @decorators.memoize
    def vectorspaced(title):
        title_components = [normalize_word(word) for word in title.split()]
        return numpy.array([word in title_components for word in words], numpy.short)

    with open(filename, encoding='utf-8') as title_file:
        job_titles = [line.strip() for line in title_file]

    #name = Data(keyword = job_titles)
    #db.session.add(name)
    #db.session.commit()

    words = get_words(job_titles)

    # choose k based on vocabulary size
    # (the original ranges left 1000-1499 falling through to the else branch)
    if len(words) >= 1500:
        k = 75
    elif len(words) >= 500:
        k = 55
    elif len(words) > 200:
        k = 30
    else:
        k = 15

    cluster = KMeansClusterer(k, euclidean_distance, avoid_empty_clusters=True)
    cluster.cluster([vectorspaced(title) for title in job_titles if title])
    classified_examples = [cluster.classify(vectorspaced(title)) for title in job_titles]

    global gen_file
    gen_file = str(uuid.uuid4()) + ".csv"
    with open("/home/ubuntu/downloads/" + gen_file, 'w', newline='') as f:
        w = csv.writer(f)
        w.writerow(('Search Terms', 'GroupID'))
        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            w.writerow((title, cluster_id))

    with open("/home/ubuntu/time/" + gen_file + ".txt", 'w') as f1:
        t = time.time() - start_time
        f1.write(str(t))
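A minimal driver sketch for grouper() above, assuming the module-level imports and output directories from the snippet already exist; the input filename is hypothetical.

import time

start_time = time.time()   # grouper() reads this global to log elapsed time
grouper('job_titles.txt')  # hypothetical input file, one title per line
print('wrote', gen_file)   # grouper() stores the generated CSV name in this global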
def get_clusters(txt):
    clusters = {}
    num_clusters = len(txt) // 4
    if num_clusters < 2:
        num_clusters = 2
    if num_clusters > 5:
        num_clusters = 5

    responses = [line.strip() for line in txt]
    words = get_words(responses)

    cluster = KMeansClusterer(num_clusters, euclidean_distance,
                              repeats=100, avoid_empty_clusters=True)
    cluster.cluster([vectorspaced(response, words) for response in responses if response])
    classified_examples = [cluster.classify(vectorspaced(response, words))
                           for response in responses]

    for cluster_id, title in sorted(zip(classified_examples, responses)):
        if cluster_id not in clusters:
            clusters[cluster_id] = [title]
        else:
            clusters[cluster_id].append(title)
    return clusters
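A short usage sketch for get_clusters(); the get_words and vectorspaced helpers are assumed to be the ones defined elsewhere in this listing, and the sample responses are made up.

sample = [
    "great customer service",
    "service was slow",
    "friendly and helpful staff",
    "slow shipping",
    "helpful support team",
    "shipping took weeks",
    "support resolved my issue",
    "staff went above and beyond",
]
# eight responses -> num_clusters = 8 // 4 = 2
for cluster_id, members in get_clusters(sample).items():
    print(cluster_id, members)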
def cluster_kmean(train_file, test_file):
    """Load train and test data into data frames"""
    f_train = open(train_file, encoding="utf-8")
    train_data = json.load(f_train)
    df_train = pd.DataFrame(train_data, columns=['text'])
    f_train.close()

    f_test = open(test_file, encoding='utf-8')
    test_data = json.load(f_test)
    df_test = pd.DataFrame(test_data, columns=['text', 'labels'])
    f_test.close()

    labels = df_test.labels
    labels = list(set(sum(labels, [])))[:3]

    """Initialize TF-IDF vectorizer"""
    tfidf_vect = TfidfVectorizer(stop_words="english", min_df=5)
    dtm = tfidf_vect.fit_transform(df_train['text'])

    num_clusters = 3

    """Initialize clustering"""
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=20)
    clusters = clusterer.cluster(dtm.toarray(), assign_clusters=True)

    centroids = np.array(clusterer.means())
    sorted_centroids = centroids.argsort()[:, ::-1]
    voc_lookup = tfidf_vect.get_feature_names()

    test_dtm = tfidf_vect.transform(df_test.text)
    predicted = [clusterer.classify(v) for v in test_dtm.toarray()]

    df_test['label_test'] = df_test['labels'].apply(lambda x: x[0])

    confusion_df = pd.DataFrame(list(zip(df_test["label_test"].values, predicted)),
                                columns=["actual_class", "cluster"])
    df_result = pd.crosstab(index=confusion_df.cluster, columns=confusion_df.actual_class)
    print(df_result)

    # map each cluster to its majority class
    df_clusterLabelsPredicted = list(df_result.apply(lambda x: x.idxmax(), axis=1))
    cluster_dict = dict((i, j) for i, j in enumerate(df_clusterLabelsPredicted))
    predicted_target = [cluster_dict[i] for i in predicted]

    print(metrics.classification_report(df_test["label_test"], predicted_target))
    for i in cluster_dict:
        print("Cluster %d : Topic %s" % (i, cluster_dict[i]))
def cluster_kmean(train_file, test_file):
    data = pd.read_json(train_file, orient='columns')
    data.columns = ["text"]

    tfidf_vect = TfidfVectorizer(min_df=5, stop_words='english')
    dtm = tfidf_vect.fit_transform(data["text"])

    num_clusters = 3
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=5)
    clusters = clusterer.cluster(dtm.toarray(), assign_clusters=True)

    test = pd.read_json(test_file, orient='columns')
    test.columns = ["text", "label"]

    # keep only the first label from each document's list of targets
    truth = [item[0] for item in test["label"]]
    test["label"] = truth

    test_dtm = tfidf_vect.transform(test["text"])
    predicted = [clusterer.classify(v) for v in test_dtm.toarray()]

    confusion_df = pd.DataFrame(list(zip(test["label"].values, predicted)),
                                columns=["label", "cluster"])
    crosstab = pd.crosstab(index=confusion_df.cluster, columns=confusion_df.label)
    print("using cosine:")
    print(crosstab)

    dfmax = crosstab.idxmax(axis=1)
    print(dfmax)
    cluster_dict = {0: dfmax[0], 1: dfmax[1], 2: dfmax[2]}
    predicted_target = [cluster_dict[i] for i in predicted]
    print(metrics.classification_report(test["label"], predicted_target))

    # KMeans with 20 different centroid seeds
    num_clusters = 3
    km = KMeans(n_clusters=num_clusters, n_init=20).fit(dtm)
    clusters = km.labels_.tolist()

    predicted2 = km.predict(test_dtm)
    confusion_df2 = pd.DataFrame(list(zip(test["label"].values, predicted2)),
                                 columns=["label", "cluster"])
    crosstab2 = pd.crosstab(index=confusion_df2.cluster, columns=confusion_df2.label)
    print("using Euclidean distance")
    print(crosstab2)

    dfmax = crosstab2.idxmax(axis=1)
    print(dfmax)
    cluster_dict = {0: dfmax[0], 1: dfmax[1], 2: dfmax[2]}
    predicted_target2 = [cluster_dict[i] for i in predicted2]
    print(metrics.classification_report(test["label"], predicted_target2))
    return None
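Several examples in this listing map cluster ids to topic labels by majority vote via pd.crosstab(...).idxmax(axis=1). A self-contained sketch of just that step on toy data (the label names here are made up):

import pandas as pd

# toy ground-truth labels and cluster assignments
labels = ["sports", "sports", "politics", "politics", "tech", "tech", "tech"]
clusters = [0, 0, 1, 1, 2, 2, 1]

crosstab = pd.crosstab(index=pd.Series(clusters, name="cluster"),
                       columns=pd.Series(labels, name="label"))
cluster_dict = crosstab.idxmax(axis=1).to_dict()  # cluster id -> majority label
print(cluster_dict)                               # {0: 'sports', 1: 'politics', 2: 'tech'}

# relabel each prediction with its cluster's majority class
predicted_target = [cluster_dict[c] for c in clusters]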
def get_word_clusters(tweets):
    ListTweets = get_all_text(tweets)
    ListTweets = list(ListTweets)

    # Project tweet text onto a vector space
    vs_tweets = list(TweetVectors(tweets))

    cluster = KMeansClusterer(10, euclidean_distance, avoid_empty_clusters=True)
    cluster.cluster(vs_tweets)
    classified_examples = [cluster.classify(tweet) for tweet in vs_tweets]

    for cluster_id, tweet in sorted(zip(classified_examples, ListTweets)):
        print(cluster_id, tweet)
def main():
    tracknames = get_tracknames()
    words = get_words(tracknames)

    cluster = KMeansClusterer(20, euclidean_distance, avoid_empty_clusters=True)
    cluster.cluster([vectorspaced(trackname, words) for trackname in tracknames if trackname])
    classified_examples = [cluster.classify(vectorspaced(trackname, words))
                           for trackname in tracknames]

    for cluster_id, title in sorted(zip(classified_examples, tracknames)):
        print(cluster_id, title)
class KMeansTopics(object):

    def __init__(self, corpus, k=10):
        """
        corpus is a corpus object, e.g. an HTMLCorpusReader()
        or an HTMLPickledCorpusReader() object

        k is the number of clusters
        """
        self.k = k
        self.model = None
        self.vocab = list(set(normalize(corpus.words(categories=['news']))))

    def vectorize(self, document):
        """
        Vectorizes a document consisting of a list of part of speech
        tagged tokens using the segmentation and tokenization methods.

        One-hot encode the set of documents
        """
        features = set(normalize(document))
        return np.array([token in features for token in self.vocab], np.short)

    def cluster(self, corpus):
        """
        Fits the K-Means model to the given data.
        """
        cosine = nltk.cluster.util.cosine_distance
        self.model = KMeansClusterer(self.k, distance=cosine, avoid_empty_clusters=True)
        self.model.cluster([
            self.vectorize(corpus.words(fileid))
            for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Pass through to the internal model classify
        """
        return self.model.classify(self.vectorize(document))
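KMeansTopics relies on a normalize helper that is not shown in this listing. A plausible sketch under the usual assumptions (lowercasing, punctuation and stopword removal, lemmatization); the original may well differ.

import string
from nltk.corpus import stopwords as sw
from nltk.stem.wordnet import WordNetLemmatizer

def normalize(words):
    # Lowercase, drop punctuation/stopwords, and lemmatize a token stream.
    # Requires the nltk 'stopwords' and 'wordnet' data packages.
    stopwords = set(sw.words('english'))
    lemmatizer = WordNetLemmatizer()
    for word in words:
        token = word.lower()
        if token in stopwords or all(ch in string.punctuation for ch in token):
            continue
        yield lemmatizer.lemmatize(token)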
def cluster_kmean(train_file, test_file):
    with open(train_file, 'r', encoding='utf-8') as f:
        jayson_train = json.load(f)
    with open(test_file, 'r', encoding='utf-8') as f:
        jayson_test = json.load(f)

    train = pd.DataFrame(jayson_train)

    # Initialize the TfidfVectorizer with a min document frequency of 5
    tfidf_vect = TfidfVectorizer(stop_words="english", min_df=5)
    dtm = tfidf_vect.fit_transform(train[0])

    # set number of clusters
    num_clusters = 3
    clusterer_Cos = KMeansClusterer(num_clusters, distance=cosine_distance, repeats=20)
    clusterer_Euc = KMeansClusterer(num_clusters, distance=euclidean_distance, repeats=20)
    clusters_cos = clusterer_Cos.cluster(dtm.toarray(), assign_clusters=True)
    clusters_Euc = clusterer_Euc.cluster(dtm.toarray(), assign_clusters=True)

    test = pd.DataFrame(jayson_test)

    # Use the first label in the ground-truth label list of each test document
    test['label'] = [i[0] for i in test[1]]

    # Make predictions on the test sample
    test_dtm = tfidf_vect.transform(test[0])
    predicted_cos = [clusterer_Cos.classify(v) for v in test_dtm.toarray()]
    predicted_Euc = [clusterer_Euc.classify(v) for v in test_dtm.toarray()]

    # Create dataframes with cluster id and ground-truth label
    confusion_df_cos = pd.DataFrame(list(zip(test['label'].values, predicted_cos)),
                                    columns=["label", "cluster"])
    confusion_df_Euc = pd.DataFrame(list(zip(test['label'].values, predicted_Euc)),
                                    columns=["label", "cluster"])

    # Cross-tabulate clusters against true labels
    crosstab_cos = pd.crosstab(index=confusion_df_cos['cluster'], columns=confusion_df_cos['label'])
    crosstab_Euc = pd.crosstab(index=confusion_df_Euc['cluster'], columns=confusion_df_Euc['label'])

    # Majority vote per cluster, as a dict mapping cluster id -> label
    majority_vote_cos = crosstab_cos.idxmax(axis=1, skipna=True).to_dict()
    majority_vote_Euc = crosstab_Euc.idxmax(axis=1, skipna=True).to_dict()

    # Map each predicted cluster id to its majority label
    predicted_target_cos = [majority_vote_cos[i] for i in predicted_cos]
    predicted_target_Euc = [majority_vote_Euc[i] for i in predicted_Euc]

    # Precision/recall/f-score for each label
    result_cos = metrics.classification_report(test["label"], predicted_target_cos)
    result_Euc = metrics.classification_report(test["label"], predicted_target_Euc)

    # Print out the results
    print('cosine')
    print(crosstab_cos)
    for i in majority_vote_cos:
        print('Cluster %d: Topic %s' % (i, majority_vote_cos[i]))
    print(result_cos)

    print('\nL2')
    print(crosstab_Euc)
    for i in majority_vote_Euc:
        print('Cluster %d: Topic %s' % (i, majority_vote_Euc[i]))
    print(result_Euc)
    return None
from nltk.cluster import KMeansClusterer, euclidean_distance
import nltk.stem
import numpy

stemmer_func = nltk.stem.snowball.SnowballStemmer("english").stem
stopwords = set(nltk.corpus.stopwords.words('english'))

def normalize_word(word):
    return stemmer_func(word.lower())

def get_words(posts):
    words = set()
    for post in posts:
        for word in post.split():
            words.add(normalize_word(word))
    return list(words)

def vectorspaced(post):
    # 1 where a (non-stopword) vocabulary word appears in the post, 0 otherwise;
    # the original compared vocabulary words against themselves instead of the post
    post_components = [normalize_word(word) for word in post.split()]
    return numpy.array([word in post_components and word not in stopwords
                        for word in words], numpy.short)

words = get_words(blog_data)

cluster = KMeansClusterer(7, euclidean_distance)
cluster.cluster([vectorspaced(post) for post in blog_data if post])
classified_examples = [cluster.classify(vectorspaced(post)) for post in blog_data]

for cluster_id, post in sorted(zip(classified_examples, blog_data)):
    print(cluster_id, post)
def cluster_kmean(train_file, test_file):
    with open(train_file) as json_train_file:
        train_json_data = json.load(json_train_file)
    train_json_dataframe = pd.DataFrame(train_json_data)
    train_json_dataframe.columns = ['Text']

    with open(test_file) as json_test_file:
        test_json_data = json.load(json_test_file)
    test_json_dataframe = pd.DataFrame(test_json_data)
    test_json_dataframe.columns = ['Text', 'Labels']
    test_json_dataframe['First'] = [x[0] for x in test_json_dataframe.Labels]

    # generate the tfidf matrix, with the min document frequency set to 5
    tfidf_vect = TfidfVectorizer(stop_words="english", min_df=5)
    dtm = tfidf_vect.fit_transform(train_json_dataframe['Text'])

    # set number of clusters
    num_clusters = 3

    # initialize the clustering model using cosine distance;
    # clustering repeats 20 times, each with different initial centroids
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=20)

    # samples are assigned to cluster labels starting from 0
    clusters = clusterer.cluster(dtm.toarray(), assign_clusters=True)

    # note: transform (not fit_transform) is used on the test set
    test_dtm = tfidf_vect.transform(test_json_dataframe["Text"])
    predicted = [clusterer.classify(v) for v in test_dtm.toarray()]

    # determine cluster labels and calculate precision and recall:
    # create a dataframe with cluster id and ground-truth label
    confusion_df = pd.DataFrame(list(zip(test_json_dataframe['First'].values, predicted)),
                                columns=["label", "cluster"])

    # generate crosstab between clusters and true labels
    crosstab = pd.crosstab(index=confusion_df.cluster, columns=confusion_df.label)
    print(crosstab)

    # Map cluster id to true label by "majority vote"
    # (the original enumerated the unique labels, which is not a majority vote)
    cluster_dict = crosstab.idxmax(axis=1).to_dict()
    print(cluster_dict)

    # Map each predicted cluster id to its majority label
    predicted_target = [cluster_dict[i] for i in predicted]
    print(metrics.classification_report(test_json_dataframe['First'], predicted_target))
tfidf = transformer.fit_transform(vectorizer.fit_transform(articals))
print(tfidf.toarray())
print(tfidf)
dtm = tfidf.toarray()

# Cluster the chapters using the TF-IDF matrix.
# k-means with cosine distance: the closer the cosine is to 1,
# the smaller the angle and the more similar the chapters.
kmeans = KMeansClusterer(
    num_means=3,                                 # number of clusters
    distance=nltk.cluster.util.cosine_distance,  # cosine distance
)
kmeans.cluster(dtm)

# cluster label obtained for each chapter
labpre = [kmeans.classify(i) for i in dtm]
kmeanlab = Red_df[["ChapName", "Chapter"]]
kmeanlab["cosd_pre"] = labpre
print(kmeanlab)

# count how many chapters fall into each cluster
count = kmeanlab.groupby("cosd_pre").count()

# visualize the cluster sizes
count = count.reset_index()
count.plot(kind="barh", figsize=(6, 5), x="cosd_pre", y="ChapName", legend=False)
for xx, yy, s in zip(count.cosd_pre, count.ChapName, count.ChapName):
    # annotate each bar with its count (assumed body; the original is truncated here)
    plt.text(yy + 0.5, xx, s)
def vectorspaced(title):
    # (header reconstructed; the listing is truncated here, and this matches
    # the vectorspaced helper used by the other examples in this file)
    title_components = [normalize_word(word) for word in title.split()]
    return numpy.array([word in title_components
                        for word in words], numpy.short)

if __name__ == '__main__':
    filename = r'C:\Users\Shravya.Shanmukh\AppData\Roaming\Microsoft\Windows\Start Menu\Programs\Python 3.6\Cars.txt'
    with open(filename) as title_file:
        print("Reading Files")
        job_titles = [line.strip() for line in title_file]

        print("Parsing Words")
        words = get_words(job_titles)

        print("Creating Cluster Instance")
        cluster = KMeansClusterer(4, euclidean_distance, repeats=5)

        print("Clustering")
        cluster.cluster([vectorspaced(title) for title in job_titles if title])

        print("Classifying")
        classified_examples = [cluster.classify(vectorspaced(title)) for title in job_titles]

        print("Saving results")
        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            filename = r'C:\Users\Shravya.Shanmukh\AppData\Roaming\Microsoft\Windows\Start Menu\Programs\Python 3.6\results' + str(cluster_id) + ".csv"
            with codecs.open(filename, "a", "utf-8") as out:
                out.write(title + "\n")
exit()

with open(filename, encoding="utf-8") as title_file:
    print("Reading Files")
    job_titles = [line.strip() for line in title_file]

    print("Parsing Words")
    words = get_words(job_titles)

    print("Creating Cluster Instance")
    cluster = KMeansClusterer(10, euclidean_distance, repeats=5)
    # Alternative Clusterer - Less accurate for my use
    #cluster = GAAClusterer(20)

    print("Clustering")
    cluster.cluster([vectorspaced(title) for title in job_titles if title])

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    print("Classifying")
    classified_examples = [cluster.classify(vectorspaced(title)) for title in job_titles]

    print("Saving results")
    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        filename = "results/" + str(cluster_id) + ".txt"
        with codecs.open(filename, "a", "utf-8") as out:
            out.write(title + "\n")
    # (inside the per-cluster loop over i; the listing is truncated above)
    clustered: pd.DataFrame = vwp[vwp["cluster"] == i]
    print("Sample:")
    for index, row in clustered.sample(5, random_state=222).iterrows():
        # print 5 sample paragraphs from the cluster
        content: str = recover_raw_paragraph(vwpr, row["letter"], row["offset"])
        print(f"[{index}] letter {row['letter']}, paragraph {row['offset']}: {content}, "
              f"embedded words: {row['chosen_words']}")

    print(f"Paragraphs most similar to mean vector of cluster {i}:")
    similars: pd.DataFrame = most_similar_paragraphs(vwp, means[i])
    for index, row in similars.head(3).iterrows():
        content: str = recover_raw_paragraph(vwpr, row["letter"], row["offset"])
        in_cluster: str = "YES" if int(vwp.at[index, "cluster"]) == i else "NO"
        print(f"[{index}] letter {row['letter']}, paragraph {row['offset']} "
              f"(similarity: {row['similarity']}) (in cluster {i}? {in_cluster}):\n{content}")
    print("\n\n")

for i in range(10):  # tests
    test = np.random.choice(vwp.index)
    classification = kmeans.classify(np.array(vwp.at[test, "embedding"]))
    print(f"TEST index: {test}, classification: {classification}")
    print(f"CONTENT: {vwpr.at[test, 'text']}")

# save dataframe after clustering
vwp.to_json(VWP_CLUSTERED, orient="index")
def vectorspaced(response):
    '''
    Generate a binary vector over the vocabulary: 1's are inserted where a
    vocabulary word appears in the response, 0's are inserted otherwise.
    (Header reconstructed; the listing is truncated here.)

    @param response The survey response to generate a vector for
    '''
    response_components = [normalize_word(word) for word in response.split()]
    return numpy.array([word in response_components and word not in stopwords
                        for word in words], numpy.short)

if __name__ == '__main__':
    num_clusters = DEFAULT_NUM_CLUSTERS
    if len(sys.argv) == 2:
        num_clusters = int(sys.argv[1])

    with open("reviews.txt") as survey_file:
        responses = [line.strip() for line in survey_file]
        words = get_words(responses)

        cluster = KMeansClusterer(num_clusters, euclidean_distance,
                                  repeats=100, avoid_empty_clusters=True)
        cluster.cluster([vectorspaced(response) for response in responses if response])
        classified_examples = [cluster.classify(vectorspaced(response)) for response in responses]

        for cluster_id, title in sorted(zip(classified_examples, responses)):
            print(cluster_id, title)
vectors = [np.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

# test k-means using 2 means, euclidean distance, and 10 trial clustering
# repetitions with random seeds
clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
clusters = clusterer.cluster(vectors, True)
centroids = clusterer.means()
print('Clustered:', vectors)
print('As:', clusters)
print('Means:', centroids)

# classify a new vector
vector = np.array([2, 2])
print('classify(%s):' % vector, end=' ')
print(clusterer.classify(vector))

# **Plot a Chart of the Clusters in Example-2**
# Make a scatter plot of the two clusters using matplotlib.pyplot:
# all the points in cluster-0 blue, all the points in cluster-1 red,
# and the two centroids orange. List comprehensions create new lists
# for the x0, y0, x1 and y1 values.
import matplotlib.pyplot as plt

x0 = np.array([x[0] for idx, x in enumerate(vectors) if clusters[idx] == 0])
y0 = np.array([x[1] for idx, x in enumerate(vectors) if clusters[idx] == 0])
plt.scatter(x0, y0, color='blue')
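The narration above describes plotting cluster-1 red and the centroids orange, but the listing stops after cluster-0; a sketch completing the plot along the same lines:

# points in cluster-1, plotted red (continuation of the snippet above)
x1 = np.array([x[0] for idx, x in enumerate(vectors) if clusters[idx] == 1])
y1 = np.array([x[1] for idx, x in enumerate(vectors) if clusters[idx] == 1])
plt.scatter(x1, y1, color='red')

# the two centroids, plotted orange
cx = np.array([c[0] for c in centroids])
cy = np.array([c[1] for c in centroids])
plt.scatter(cx, cy, color='orange')
plt.show()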
# Step 5: k-means clustering
vectors = [array(f) for f in doc_lda]
clusterer = KMeansClusterer(num_topics, euclidean_distance,
                            repeats=100, avoid_empty_clusters=True)
clusterer.cluster(vectors, True)

apps_per_topic = []
for x in range(num_topics):
    apps_per_topic.append([])

# classify each document vector
apk_names = list(name_desc_pairs.keys())
for i, doc in enumerate(doc_lda):
    topic_id = clusterer.classify(array(doc))
    apps_per_topic[topic_id].append(apk_names[i])

# Step 6: make text for each topic
text_for_topics = []
for x in range(num_topics):
    text_for_topics.append('')

apkname_stem_pairs = dict(zip(name_desc_pairs.keys(), processed))
for topic_id, names in enumerate(apps_per_topic):
    for name in names:
        # FIXME: there are two options for the word cloud:
        # 1) pure descriptions  2) the stem-processed text
        # text_for_topics[topic_id] = text_for_topics[topic_id] + " " + name_desc_pairs[name]
        text = " ".join(apkname_stem_pairs[name])
        text_for_topics[topic_id] = text_for_topics[topic_id] + text
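The text_for_topics strings above are presumably fed into a word-cloud generator; a minimal sketch using the third-party wordcloud package (an assumption, since the original generator is not shown):

from wordcloud import WordCloud
import matplotlib.pyplot as plt

for topic_id, text in enumerate(text_for_topics):
    if not text:
        continue  # skip topics with no assigned apps
    wc = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title('Topic %d' % topic_id)
plt.show()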
vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
means = [[4, 3], [5, 5]]

clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
clusters = clusterer.cluster(vectors, True, trace=True)

print('Clustered:', vectors)
print('As:', clusters)
print('Means:', clusterer.means())
print()

vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

# test k-means using the euclidean distance metric, 2 means and repeat
# clustering 10 times with random seeds
clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
clusters = clusterer.cluster(vectors, True)
print('Clustered:', vectors)
print('As:', clusters)
print('Means:', clusterer.means())
print()

# classify a new vector
vector = numpy.array([3, 3])
print('classify(%s):' % vector, end=' ')
print(clusterer.classify(vector))
print()
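For contrast with the Euclidean demo above, a minimal sketch of the same toy vectors clustered with cosine distance, which several of the TF-IDF examples in this listing use:

import numpy
from nltk.cluster import KMeansClusterer, cosine_distance

vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
clusterer = KMeansClusterer(2, cosine_distance, repeats=10, avoid_empty_clusters=True)
clusters = clusterer.cluster(vectors, assign_clusters=True)
print('As:', clusters)
print('classify([2, 2]):', clusterer.classify(numpy.array([2, 2])))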