def main(): """ currently using kmeans to cluster the data, need to be able to find k as the data gathered will be dynamic maybe only calculate on batch time also, look into using EM instead of kmeans """ filename = "data" fd = open(filename, "r") questions = [line.strip() for line in fd] fd.close() data = Dataset() features = data.populate(questions) k = 2 cluster_ids, centroids = milk.kmeans(features, k) # using unigrams successfully classifies every question print cluster_ids while True: raw = raw_input("ask a question?") if raw == "x" or raw == "q": break if raw == "": continue f = data.featurevector(raw) # wont update the data used initially to find unigrams query = features + [f] cluster_ids, centroids = milk.kmeans(query, k + 1) print "***** did you mean ******" results = filter(lambda zipped: zipped[0] == cluster_ids[-1], zip(cluster_ids[:-1], questions)) if len(results) == 0: print "** no similar questions have been asked **" for index, result in results[:5]: print result print "*************************"
def RunKMeansMilk():
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the centroids
    # file.
    Log.Info("Loading dataset", self.verbose)
    if len(self.dataset) == 2:
        data = np.genfromtxt(self.dataset[0], delimiter=',')
        centroids = np.genfromtxt(self.dataset[1], delimiter=',')
    else:
        data = np.genfromtxt(self.dataset, delimiter=',')

    # Gather parameters.
    clusters = None
    if "clusters" in options:
        clusters = options.pop("clusters")
    maxIterations = None
    if "max_iterations" in options:
        maxIterations = options.pop("max_iterations")
    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    # Now do validation of options.
    if not clusters and len(self.dataset) != 2:
        Log.Fatal("Required option: Number of clusters or cluster locations.")
        return -1
    elif (not clusters or int(clusters) < 1) and len(self.dataset) != 2:
        Log.Fatal("Invalid number of clusters requested! Must be greater than"
                  + " or equal to 1.")
        return -1

    m = 1000 if not maxIterations else int(maxIterations)

    try:
        # Create the KMeans object and perform K-Means clustering.
        with totalTimer:
            if len(self.dataset) == 2:
                assignments = kmeans(data, int(clusters), max_iter=m,
                                     centroids=centroids, return_centroids=False)
            else:
                assignments, centroids = kmeans(data, int(clusters), max_iter=m)
    except Exception as e:
        Log.Fatal("Exception: " + str(e))
        return -1

    time = totalTimer.ElapsedTime()
    return time
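Outside the benchmark harness above, the same call pattern can be exercised on its own. A hedged sketch where the file name 'data.csv' and the cluster count are placeholders, not values from the original code:

import numpy as np
from milk import kmeans

data = np.genfromtxt('data.csv', delimiter=',')  # hypothetical input file
assignments, centroids = kmeans(data, 3, max_iter=1000)
print assignments[:10]
print centroids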
def execute(self):
    import milk
    while True:
        k, ri = self.inq.get()
        if k == 'shutdown':
            return
        _, centroids = milk.kmeans(self.features, k=k, R=(k*1024+ri))
        self.outq.put(centroids)
def train(self, features, labels, **kwargs):
    from milk.supervised.gridsearch import gridminimise
    from milk.supervised import svm
    c_features = np.concatenate([f for f, _ in features if f.size])
    c_features = c_features[::self.sample]

    learner = milk.defaultlearner()
    k = (self.k if self.k is not None else len(features)//self.kfrac)
    _, codebook = milk.kmeans(c_features, k=k, R=123)
    features = project.f(features, codebook)
    model = learner.train(features, labels)
    return codebook_model(codebook, model)
def main():
    filename = 'data'
    fd = open(filename, 'r')
    questions = [line.strip() for line in fd]
    fd.close()

    data = Clustering()
    features = data.populate(questions)
    k = 2
    cluster_ids, centroids = milk.kmeans(features, k)

    while True:
        raw = raw_input('Type in your question?')
        if raw == 'x' or raw == 'q':
            break
        if raw == '':
            continue
        f = data.featurevector(raw)
        query = features + [f]
        cluster_ids, centroids = milk.kmeans(query, k + 1)
        results = filter(lambda zipped: zipped[0] == cluster_ids[-1],
                         zip(cluster_ids[:-1], questions))
        if len(results) == 0:
            print '** no similar questions have been asked **'
        for index, result in results[:5]:
            print result
        print '*************************'
def kmeans_select_best(features, ks, repeats=1, method='AIC', R=None, **kwargs):
    '''
    assignments_centroids = kmeans_select_best(features, ks, repeats=1, method='AIC', R=None, **kwargs)

    Perform ``repeats`` calls to ``kmeans`` for each ``k`` in ``ks``, select
    the best one according to ``method.``

    Note that, unlike a raw ``kmeans`` call, this is *always deterministic*
    even if ``R=None`` (which is interpreted as being equivalent to setting it
    to a fixed value). Otherwise, the jug paradigm would be broken as
    different runs would give different results.

    Parameters
    ----------
    features : array-like
        2D array
    ks : sequence of integers
        These will be the values of ``k`` to try
    repeats : integer, optional
        How many times to attempt each k (default: 1).
    method : str, optional
        Which method to use. Must be one of 'AIC' (default) or 'BIC'.
    R : random number source, optional
        Even if you do not pass a value, the result will be deterministic.
        This is different from the typical behaviour of ``R``, but, when
        using jug, reproducibility is often a desired feature.
    kwargs : other options
        These are passed transparently to ``kmeans``

    Returns
    -------
    assignments_centroids : jug.Task
        jug.Task which is the result of the best (as measured by ``method``)
        kmeans clustering.
    '''
    from milk import kmeans
    from milk.utils import get_pyrandom
    kmeans = TaskGenerator(kmeans)
    if R is not None:
        start = get_pyrandom(R).randint(0, 1024*1024)
    else:
        start = 7
    results = []
    for ki, k in enumerate(ks):
        for i in xrange(repeats):
            results.append(kmeans(features, k, R=(start+7*repeats*ki+i), **kwargs))
    return _select_best(features, results, method)[1]
def kmeans_select_best(features, ks, repeats=1, method='AIC', R=None, **kwargs):
    '''
    assignments_centroids = kmeans_select_best(features, ks, repeats=1, method='AIC', R=None, **kwargs)

    Perform ``repeats`` calls to ``kmeans`` for each ``k`` in ``ks``, select
    the best one according to ``method.``

    Note that, unlike a raw ``kmeans`` call, this is *always deterministic*
    even if ``R=None``.

    Parameters
    ----------
    features : array-like
        2D array
    ks : sequence of integers
        These will be the values of ``k`` to try
    repeats : integer, optional
        How many times to attempt each k (default: 1).
    method : str, optional
        Which method to use. Must be one of 'AIC' (default) or 'BIC'.
    R : random number source, optional
        If you do not pass a value, the result will be deterministic
    kwargs : other options
        These are passed transparently to ``kmeans``

    Returns
    -------
    assignments_centroids : jug.Task
        jug.Task which is the result of the best (as measured by ``method``)
        kmeans clustering.
    '''
    from milk import kmeans
    from milk.utils import get_pyrandom
    kmeans = TaskGenerator(kmeans)
    if R is not None:
        start = get_pyrandom(R).randint(0, 1024*1024)
    else:
        start = 7
    results = []
    for ki, k in enumerate(ks):
        for i in xrange(repeats):
            results.append(kmeans(features, k, R=(start+7*repeats*ki+i), **kwargs))
    return _select_best(features, results, method)
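A hedged usage sketch for the helper above, assuming it is defined in a jug-enabled module; the random data is purely illustrative:

import numpy as np

features = np.random.random((1000, 24))  # stand-in for real features
# try k = 2..8, three restarts each, keep the run that scores best under AIC
best = kmeans_select_best(features, ks=range(2, 9), repeats=3, method='AIC')
# `best` is a jug task; after `jug execute`, its result is the winning clustering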
def train_all(self, dataset, mu=None):
    """
    Process kmeans algorithm on the input to localize clusters.
    """
    #TODO-- why does this sometimes return X and sometimes return nothing?

    X = dataset.get_design_matrix()

    n, m = X.shape
    k = self.k

    if milk is not None:
        #use the milk implementation of k-means if it's available
        cluster_ids, mu = milk.kmeans(X, k)
    else:
        #our own implementation

        # taking random inputs as initial clusters if user does not provide
        # them.
        if mu is not None:
            if not len(mu) == k:
                raise Exception('You gave %i clusters, but k=%i were expected'
                                % (len(mu), k))
        else:
            indices = numpy.random.randint(X.shape[0], size=k)
            mu = X[indices]

        try:
            dists = numpy.zeros((n, k))
        except MemoryError:
            print ("dying trying to allocate dists matrix ",
                   "for %d examples and %d means" % (n, k))
            raise

        old_kills = {}

        iter = 0
        mmd = prev_mmd = float('inf')
        while True:
            if self.verbose:
                print 'kmeans iter ' + str(iter)

            #print 'iter:',iter,' conv crit:',abs(mmd-prev_mmd)
            #if numpy.sum(numpy.isnan(mu)) > 0:
            if numpy.any(numpy.isnan(mu)):
                print 'nan found'
                return X

            #computing distances
            for i in xrange(k):
                dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)

            if iter > 0:
                prev_mmd = mmd

            min_dists = dists.min(axis=1)

            #mean minimum distance:
            mmd = min_dists.mean()

            print 'cost: ', mmd

            if iter > 0 and (iter >= self.max_iter or \
                    abs(mmd - prev_mmd) < self.convergence_th):
                #converged
                break

            #finding minimum distances
            min_dist_inds = dists.argmin(axis=1)

            #computing means
            i = 0
            blacklist = []
            new_kills = {}
            while i < k:
                b = min_dist_inds == i
                if not numpy.any(b):
                    killed_on_prev_iter = True
                    #initializes empty cluster to be the mean of the d data
                    #points farthest from their corresponding means
                    if i in old_kills:
                        d = old_kills[i] - 1
                        if d == 0:
                            d = 50
                        new_kills[i] = d
                    else:
                        d = 5
                    mu[i, :] = 0
                    for j in xrange(d):
                        idx = numpy.argmax(min_dists)
                        min_dists[idx] = 0
                        #chose point idx
                        mu[i, :] += X[idx, :]
                        blacklist.append(idx)
                    mu[i, :] /= float(d)
                    #cluster i was empty, reset it to d far out data points
                    #recomputing distances for this cluster
                    dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)
                    min_dists = dists.min(axis=1)
                    for idx in blacklist:
                        min_dists[idx] = 0
                    min_dist_inds = dists.argmin(axis=1)
                    #done
                    i += 1
                else:
                    mu[i, :] = numpy.mean(X[b, :], axis=0)
                    if numpy.any(numpy.isnan(mu)):
                        print 'nan found at', i
                        return X
                    i += 1

            old_kills = new_kills

            iter += 1
    self.mu = sharedX( mu )
    self._params = [ self.mu ]
    return True
def word_net(weibo, weibo_dict, lable, flag, k_cluster):  # word-frequency word network
    black = load_black_words()
    sw = load_scws()
    n = 0
    ts = time.time()
    f_dict = dict()  # word-count dictionary
    total = 0  # total number of words
    weibo_word = []
    weibo_text = dict()
    weibo_mid = []
    for i in range(0, len(weibo)):
        mid = weibo[i]
        text = weibo_dict[weibo[i]][1]
        if lable[i] == 0:
            words = sw.participle(text)
            row = []
            for word in words:
                if (word[1] in cx_dict) and (3 < len(word[0]) < 30 or word[0] in single_word_whitelist) and (word[0] not in black):
                    # keep nouns, verbs and adjectives from the segmentation result and drop single characters
                    total = total + 1
                    if f_dict.has_key(str(word[0])):
                        f_dict[str(word[0])] = f_dict[str(word[0])] + 1
                    else:
                        f_dict[str(word[0])] = 1
                    row.append(word[0])
            weibo_word.append(row)
            weibo_mid.append(str(mid))
            weibo_text[str(mid)] = str(text)
        n = n + 1
        if n % 10000 == 0:
            end = time.time()
            print '%s weibo takes %s s' % (n, (end-ts))
            ts = end

    #top_k = int(total*0.175) + 1  # number of keywords
    keyword = TopkHeap(300)
    ts = time.time()
    print 'start to calculate information counting'
    n = 0
    for k, v in f_dict.iteritems():  # compute the information value of each word
        if v >= 2 and (float(v)/float(total)) <= 0.8:  # drop rare words (count below 2) and words whose frequency exceeds 80%
            p = v  # 0 - math.log(v, 2)  # information value
            keyword.Push((p, k))  # ranking
        n = n + 1
        if n % 10000 == 0:
            end = time.time()
            print '%s weibo takes %s s' % (n, (end-ts))
            ts = end
    keyword_data = keyword.TopK()  # take the top high-frequency words as vertices

    ts = time.time()
    keyword = []
    k_value = dict()
    for i in range(0, len(keyword_data)):
        keyword.append(keyword_data[i][1])
        k_value[str(keyword_data[i][1])] = float(keyword_data[i][0])/float(total)
    word_net = dict()  # word-net dictionary
    for i in range(0, len(weibo_word)):
        row = weibo_word[i]
        for j in range(0, len(row)):
            if row[j] in keyword:
                if j-1 >= 0 and row[j] != row[j-1]:
                    if word_net.has_key(str(row[j]+'_'+row[j-1])):
                        word_net[str(row[j]+'_'+row[j-1])] = word_net[str(row[j]+'_'+row[j-1])] + 1
                    elif word_net.has_key(str(row[j-1]+'_'+row[j])):
                        word_net[str(row[j-1]+'_'+row[j])] = word_net[str(row[j-1]+'_'+row[j])] + 1
                    else:
                        word_net[str(row[j-1]+'_'+row[j])] = 1
                if j+1 < len(row) and row[j] != row[j+1]:
                    if word_net.has_key(str(row[j]+'_'+row[j+1])):
                        word_net[str(row[j]+'_'+row[j+1])] = word_net[str(row[j]+'_'+row[j+1])] + 1
                    elif word_net.has_key(str(row[j+1]+'_'+row[j])):
                        word_net[str(row[j+1]+'_'+row[j])] = word_net[str(row[j+1]+'_'+row[j])] + 1
                    else:
                        word_net[str(row[j]+'_'+row[j+1])] = 1
    end = time.time()
    print 'net use %s s' % (end-ts)

    weight = TopkHeap(500)
    for k, v in word_net.iteritems():  # compute edge weights
        k1, k2 = k.split('_')
        if not k_value.has_key(k1):
            k_value[k1] = 0
        if not k_value.has_key(k2):
            k_value[k2] = 0
        if k_value[k1] > k_value[k2]:
            p = v*k_value[k1]
        else:
            p = v*k_value[k2]
        weight.Push((p, k))  # ranking
    data = weight.TopK()
    word = []
    for i in range(0, len(data)):
        if data[i][1] not in word:
            word.append(data[i])
        if len(word) == 300:  # keep the top 300 word pairs
            break

    # clustering
    feature = []
    for w in word:
        k1, k2 = w[1].split('_')
        c = []
        for i in range(0, len(weibo_word)):
            n1 = str(weibo_text[str(weibo_mid[i])]).count(str(k1))
            n2 = str(weibo_text[str(weibo_mid[i])]).count(str(k2))
            n = n1 + n2
            c.append(n)
        feature.append(c)
    features = np.array(feature)
    cluster_ids = milk.kmeans(features, k_cluster)

    return cluster_ids, word
@author: juanibraun
"""
import milk
import sys
import csv

input = sys.argv[-2]
out_name = sys.argv[-1]

data = []
i = 0
nombres = []

# read the input file
with open(input, 'rb') as g:
    reader = csv.reader(g, delimiter=';')
    for row in reader:
        data.append(row)

# apply kMeans and generate the output
k = 1000
cluster_ids, centroids = milk.kmeans(data, k)
with open(out_name, 'wb') as f:
    writer = csv.writer(f, delimiter=';')
    writer.writerows(centroids)

print cluster_ids
print centroids
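One caveat worth flagging for the script above: milk.kmeans expects a numeric 2D array, while csv.reader yields lists of strings. A hypothetical conversion step, assuming every column of the input file is numeric:

import numpy as np

features = np.array([[float(x) for x in row] for row in data])  # cast string cells to floats
cluster_ids, centroids = milk.kmeans(features, k)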
from __future__ import print_function
import numpy as np
import mahotas as mh
from mahotas.features import surf
from pylab import *
from os import path

f = mh.demos.load('luispedro', as_grey=True)
f = f.astype(np.uint8)
spoints = surf.surf(f, 4, 6, 2)
print("Nr points:", len(spoints))

try:
    import milk
    descrs = spoints[:, 5:]
    k = 5
    values, _ = milk.kmeans(descrs, k)
    colors = np.array([(255-52*i, 25+52*i, 37**i % 101) for i in range(k)])
except:
    values = np.zeros(100)
    colors = np.array([(255, 0, 0)])

f2 = surf.show_surf(f, spoints[:100], values, colors)
imshow(f2)
show()
def train_all(self, dataset, mu=None):
    """
    Process kmeans algorithm on the input to localize clusters.

    Parameters
    ----------
    dataset : WRITEME
    mu : WRITEME

    Returns
    -------
    rval : bool
        WRITEME
    """
    #TODO-- why does this sometimes return X and sometimes return nothing?

    X = dataset.get_design_matrix()

    n, m = X.shape
    k = self.k

    if milk is not None:
        #use the milk implementation of k-means if it's available
        cluster_ids, mu = milk.kmeans(X, k)
    else:
        #our own implementation

        # taking random inputs as initial clusters if user does not provide
        # them.
        if mu is not None:
            if not len(mu) == k:
                raise Exception(
                    'You gave %i clusters, but k=%i were expected'
                    % (len(mu), k))
        else:
            indices = numpy.random.randint(X.shape[0], size=k)
            mu = X[indices]

        try:
            dists = numpy.zeros((n, k))
        except MemoryError:
            raise TypicalMemoryError("dying trying to allocate dists "
                                     "matrix for {0} examples and {1} "
                                     "means".format(n, k))

        old_kills = {}

        iter = 0
        mmd = prev_mmd = float('inf')
        while True:
            if self.verbose:
                logger.info('kmeans iter {0}'.format(iter))

            #print 'iter:',iter,' conv crit:',abs(mmd-prev_mmd)
            #if numpy.sum(numpy.isnan(mu)) > 0:
            if numpy.any(numpy.isnan(mu)):
                logger.info('nan found')
                return X

            #computing distances
            for i in xrange(k):
                dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)

            if iter > 0:
                prev_mmd = mmd

            min_dists = dists.min(axis=1)

            #mean minimum distance:
            mmd = min_dists.mean()

            logger.info('cost: {0}'.format(mmd))

            if iter > 0 and (iter >= self.max_iter or \
                    abs(mmd - prev_mmd) < self.convergence_th):
                #converged
                break

            #finding minimum distances
            min_dist_inds = dists.argmin(axis=1)

            #computing means
            i = 0
            blacklist = []
            new_kills = {}
            while i < k:
                b = min_dist_inds == i
                if not numpy.any(b):
                    killed_on_prev_iter = True
                    #initializes empty cluster to be the mean of the d data
                    #points farthest from their corresponding means
                    if i in old_kills:
                        d = old_kills[i] - 1
                        if d == 0:
                            d = 50
                        new_kills[i] = d
                    else:
                        d = 5
                    mu[i, :] = 0
                    for j in xrange(d):
                        idx = numpy.argmax(min_dists)
                        min_dists[idx] = 0
                        #chose point idx
                        mu[i, :] += X[idx, :]
                        blacklist.append(idx)
                    mu[i, :] /= float(d)
                    #cluster i was empty, reset it to d far out data points
                    #recomputing distances for this cluster
                    dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)
                    min_dists = dists.min(axis=1)
                    for idx in blacklist:
                        min_dists[idx] = 0
                    min_dist_inds = dists.argmin(axis=1)
                    #done
                    i += 1
                else:
                    mu[i, :] = numpy.mean(X[b, :], axis=0)
                    if numpy.any(numpy.isnan(mu)):
                        logger.info('nan found at {0}'.format(i))
                        return X
                    i += 1

            old_kills = new_kills

            iter += 1
    self.mu = sharedX(mu)
    self._params = [self.mu]
    return True
def word_net(weibo, weibo_dict, lable, flag, k_cluster):  # word-frequency word network
    black = load_black_words()
    sw = load_scws()
    n = 0
    ts = time.time()
    f_dict = dict()  # word-count dictionary
    total = 0  # total number of words
    weibo_word = []
    weibo_text = dict()
    weibo_mid = []
    for i in range(0, len(weibo)):
        mid = weibo[i]
        text = weibo_dict[weibo[i]]
        if lable[i] == 0:
            words = sw.participle(text)
            row = []
            for word in words:
                if (word[1] in cx_dict) and (3 < len(word[0]) < 30 or word[0] in single_word_whitelist) and (word[0] not in black):
                    # keep nouns, verbs and adjectives from the segmentation result and drop single characters
                    total = total + 1
                    if f_dict.has_key(str(word[0])):
                        f_dict[str(word[0])] = f_dict[str(word[0])] + 1
                    else:
                        f_dict[str(word[0])] = 1
                    row.append(word[0])
            weibo_word.append(row)
            weibo_mid.append(str(mid))
            weibo_text[str(mid)] = str(text)
        n = n + 1
        if n % 10000 == 0:
            end = time.time()
            print '%s weibo takes %s s' % (n, (end-ts))
            ts = end

    #top_k = int(total*0.175) + 1  # number of keywords
    keyword = TopkHeap(300)
    ts = time.time()
    print 'start to calculate information counting'
    n = 0
    for k, v in f_dict.iteritems():  # compute the information value of each word
        if v >= 2 and (float(v)/float(total)) <= 0.8:  # drop rare words (count below 2) and words whose frequency exceeds 80%
            p = v  # 0 - math.log(v, 2)  # information value
            keyword.Push((p, k))  # ranking
        n = n + 1
        if n % 10000 == 0:
            end = time.time()
            print '%s weibo takes %s s' % (n, (end-ts))
            ts = end
    keyword_data = keyword.TopK()  # take the top high-frequency words as vertices

    ts = time.time()
    keyword = []
    k_value = dict()
    for i in range(0, len(keyword_data)):
        keyword.append(keyword_data[i][1])
        k_value[str(keyword_data[i][1])] = float(keyword_data[i][0])/float(total)
    word_net = dict()  # word-net dictionary
    for i in range(0, len(weibo_word)):
        row = weibo_word[i]
        for j in range(0, len(row)):
            if row[j] in keyword:
                if j-1 >= 0 and row[j] != row[j-1]:
                    if word_net.has_key(str(row[j]+'_'+row[j-1])):
                        word_net[str(row[j]+'_'+row[j-1])] = word_net[str(row[j]+'_'+row[j-1])] + 1
                    elif word_net.has_key(str(row[j-1]+'_'+row[j])):
                        word_net[str(row[j-1]+'_'+row[j])] = word_net[str(row[j-1]+'_'+row[j])] + 1
                    else:
                        word_net[str(row[j-1]+'_'+row[j])] = 1
                if j+1 < len(row) and row[j] != row[j+1]:
                    if word_net.has_key(str(row[j]+'_'+row[j+1])):
                        word_net[str(row[j]+'_'+row[j+1])] = word_net[str(row[j]+'_'+row[j+1])] + 1
                    elif word_net.has_key(str(row[j+1]+'_'+row[j])):
                        word_net[str(row[j+1]+'_'+row[j])] = word_net[str(row[j+1]+'_'+row[j])] + 1
                    else:
                        word_net[str(row[j]+'_'+row[j+1])] = 1
    end = time.time()
    print 'net use %s s' % (end-ts)

    weight = TopkHeap(500)
    for k, v in word_net.iteritems():  # compute edge weights
        k1, k2 = k.split('_')
        if not k_value.has_key(k1):
            k_value[k1] = 0
        if not k_value.has_key(k2):
            k_value[k2] = 0
        if k_value[k1] > k_value[k2]:
            p = v*k_value[k1]
        else:
            p = v*k_value[k2]
        weight.Push((p, k))  # ranking
    data = weight.TopK()
    word = []
    for i in range(0, len(data)):
        if data[i][1] not in word:
            word.append(data[i])
        if len(word) == 300:  # keep the top 300 word pairs
            break

    # clustering
    feature = []
    for w in word:
        k1, k2 = w[1].split('_')
        c = []
        for i in range(0, len(weibo_word)):
            n1 = str(weibo_text[str(weibo_mid[i])]).count(str(k1))
            n2 = str(weibo_text[str(weibo_mid[i])]).count(str(k2))
            n = n1 + n2
            c.append(n)
        feature.append(c)
    features = np.array(feature)
    cluster_ids = milk.kmeans(features, k_cluster)

    return cluster_ids, word
import numpy as np
import mahotas
from mahotas.features import surf
from pylab import *
from os import path

try:
    luispedro_image = path.join(path.dirname(path.abspath(__file__)),
                                'data', 'luispedro.jpg')
except NameError:
    luispedro_image = 'data/luispedro.jpg'

f = mahotas.imread(luispedro_image, as_grey=True)
f = f.astype(np.uint8)
spoints = surf.surf(f, 4, 6, 2)
print("Nr points:", len(spoints))

try:
    import milk
    descrs = spoints[:, 5:]
    k = 5
    values, _ = milk.kmeans(descrs, k)
    colors = np.array([(255 - 52 * i, 25 + 52 * i, 37**i % 101)
                       for i in range(k)])
except:
    values = np.zeros(100)
    colors = np.array([(255, 0, 0)])

f2 = surf.show_surf(f, spoints[:100], values, colors)
imshow(f2)
show()
def cut_word(flag, cluster):  # classify by title
    title = dict()
    title_count = dict()
    weibo_word = []
    black = load_black_words()
    sw = load_scws()
    word_count = []
    reader = csv.reader(file('./comment/data%s.csv' % flag, 'rb'))
    for mid, url, t, c, author, publish, site, board in reader:  # group by title
        if title_count.has_key(str(t)):
            item = title_count[str(t)]
            item.append(mid)
            title_count[str(t)] = item
        else:
            item = []
            item.append(mid)
            title_count[str(t)] = item
        string = t + '_' + c
        words = sw.participle(string)
        for word in words:
            if (word[1] in cx_dict) and (3 < len(word[0]) < 30 or word[0] in single_word_whitelist) and (word[0] not in black):
                if word[0] not in weibo_word:
                    weibo_word.append(word[0])
                    word_count.append(0)
        title[str(mid)] = [str(t), str(c)]

    # keep news titles with at least 20 texts
    lable = dict()
    n = 0
    for k, v in title_count.iteritems():
        if len(v) >= 20:
            n = n + 1
            lable[str(n)] = v
    big_data = []  # weibo to be classified
    big_lable = []  # classification labels
    for k, v in lable.iteritems():
        for i in v:
            big_data.append([i, title[str(i)][0], title[str(i)][1]])
            big_lable.append((int(k)+cluster-1))
            del title[str(i)]

    # count the value of each attribute
    for k, v in title.iteritems():
        string = v[0] + '_' + v[1]
        for i in range(0, len(weibo_word)):
            if weibo_word[i] in string:
                word_count[i] = word_count[i] + 1
    new_weibo = []
    for i in range(0, len(word_count)):
        if word_count[i] >= 5:
            new_weibo.append(weibo_word[i])
    notin = []
    data = dict()
    for k, v in title.iteritems():
        f = 0
        row = []
        string = v[0] + '_' + v[1]
        for i in new_weibo:
            if i in string:
                n = string.count(i)
                row.append(n)
                f = 1
            else:
                row.append(0)
        if f == 1:
            data[k] = row
        else:
            notin.append(k)

    # clustering
    feature = []
    word = []
    for k, v in data.iteritems():
        word.append([k, title[k][0], title[k][1]])
        feature.append((v))
    features = np.array(feature)
    cluster_ids = milk.kmeans(features, cluster)

    return word, cluster_ids, big_data, big_lable  # news, cluster labels, classified weibo, classification labels