def clusterClubs(numClust=5):
    """Cluster the points listed in ``places.txt`` and plot them over a map image.

    Each non-blank line of ``places.txt`` is split on tabs; field 4 becomes
    the first (x) coordinate and field 3 the second (y) coordinate of a point.
    (Presumably longitude and latitude -- confirm against the data file.)
    The points are clustered with ``kmeans.biKmeans`` using ``distSLC`` as the
    distance measure, then drawn on a transparent axes laid over
    ``Portland.png``, one marker style per cluster, with the centroids drawn
    as large ``+`` markers.

    Args:
        numClust: Number of clusters requested from ``kmeans.biKmeans``.

    Raises:
        IndexError: If a non-blank line has fewer than five tab-separated
            fields.
        ValueError: If field 3 or 4 of a line is not a valid float.
    """
    datList = []
    # A context manager guarantees the file handle is closed even when a
    # malformed line raises part-way through parsing.
    with open('places.txt') as placesFile:
        for line in placesFile:
            if not line.strip():
                # Tolerate blank / trailing empty lines instead of crashing.
                continue
            lineArr = line.split('\t')
            datList.append([float(lineArr[4]), float(lineArr[3])])
    datMat = mat(datList)

    myCentroids, clustAssing = kmeans.biKmeans(datMat, numClust,
                                               distMeas=distSLC)

    fig = plt.figure()
    rect = [0.1, 0.1, 0.8, 0.8]
    scatterMarkers = ['s', 'o', '^', '8', 'p',
                      'd', 'v', 'h', '>', '<']

    # Bottom axes: the background image, with ticks suppressed.
    axprops = dict(xticks=[], yticks=[])
    ax0 = fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread('Portland.png')
    ax0.imshow(imgP)

    # Top axes: same rectangle, no frame, so the scatter overlays the image.
    ax1 = fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        # Rows whose assigned cluster index (column 0 of clustAssing) is i.
        ptsInCurrCluster = datMat[nonzero(clustAssing[:, 0].A == i)[0], :]
        # Wrap around so more clusters than marker styles still works.
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:, 0].flatten().A[0],
                    ptsInCurrCluster[:, 1].flatten().A[0],
                    marker=markerStyle, s=90)
    ax1.scatter(myCentroids[:, 0].flatten().A[0],
                myCentroids[:, 1].flatten().A[0],
                marker='+', s=300)
    plt.show()
def train(tfidf, word_id, k):
    """Build dense rows from sparse tf-idf data and cluster them with biKmeans.

    For every document ``i`` a row of ``len(vocabulary)`` zeros is created
    (``vocabulary`` is read from module scope) and, for each position ``j``,
    ``tfidf[i][j]`` is written at column ``word_id[i][j]``.  The rows are
    wrapped in ``np.mat`` and passed to ``kmeans.biKmeans`` together with
    ``k``.

    Args:
        tfidf: Per-document sequences of weights.
        word_id: Per-document sequences of column indices, parallel to
            ``tfidf``.
        k: Cluster count forwarded to ``kmeans.biKmeans``.

    Returns:
        The first element of the pair returned by ``kmeans.biKmeans``.
    """
    rows = []
    for doc_idx, weights in enumerate(tfidf):
        dense = [0] * len(vocabulary)
        for pos, weight in enumerate(weights):
            dense[word_id[doc_idx][pos]] = weight
        rows.append(dense)

    outcome = kmeans.biKmeans(np.mat(rows), k)
    # NOTE(review): other callers in this file bind the first element of the
    # biKmeans result to the centroids (myCentroids / centList) -- confirm
    # that this is the value callers of train() expect.
    clust, _cent = outcome
    return clust
def test2():
    """Run biKmeans with 3 clusters on ``testSet2.txt`` and print the result.

    The data is loaded through ``kmeans.loadData``, wrapped in ``mat`` and
    handed to ``kmeans.biKmeans``; both returned values are printed.
    """
    samples = mat(kmeans.loadData('testSet2.txt'))
    outcome = kmeans.biKmeans(samples, 3)
    centers, assignments = outcome
    print("centList:", centers, "newAssment:", assignments)
# coding: utf-8 # kmeans/test_bi_kmeans.py import kmeans import numpy as np import matplotlib.pyplot as plt if __name__ == "__main__": dataMat = np.mat(kmeans.loadDataSet('data/testSet2.txt')) centroids, clusterAssment = kmeans.biKmeans(dataMat, 3) clusterCount = centroids.shape[0] m = dataMat.shape[0] # 绘制散点图 patterns = ['o', 'D', '^'] colors = ['b', 'g', 'y'] fig = plt.figure() title = 'bi-kmeans with k=3' ax = fig.add_subplot(111, title=title) for k in range(clusterCount): # 绘制聚类中心 ax.scatter(centroids[k, 0], centroids[k, 1], color='r', marker='+', linewidth=20) for i in range(m): # 绘制属于该聚类中心的样本 ptsInCluster = dataMat[np.nonzero(clusterAssment[:, 0].A == k)[0]] ax.scatter(ptsInCluster[:, 0].flatten().A[0], ptsInCluster[:, 1].flatten().A[0], marker=patterns[k],
data = dat.transpose()/order_pivot.diagonal() #item by item divided by diagonal second time RR = data*d.shape[1] RRdist = empty((len(RR),len(RR)),float) for s in range(0,len(RR)): for t in range(0,len(RR)): RRdist[s,t] = float(RR[s,t]) numpy.savetxt('RR.txt',RRdist,delimiter='\t') print "C" print strftime("%Y-%m-%d %H:%M:%S", gmtime()) for i in range(low,up): cur.execute("delete from orderos_%s"%tname) try: mycentroids, myclusters = kmeans.biKmeans(RRdist,i) print "CD" clusterassn = myclusters[0:,[0]] cluster = squeeze(asarray(clusterassn)) orderitem = squeeze(asarray(o)) m = vstack([cluster,orderitem]) orderos = m.transpose() # orderitem, cluster (1185,2) for r in range(0,len(orderos)): #print "AAA--- insert into orderos_%s"%tname + " values(%s,%s)"%list(orderos[r]) sql="insert into orderos_%s "% tname cur.execute(sql+"values(%s,%s)",list(orderos[r])) cost = costCalculator.exp(tname) if cost<yoyocost: yoyocost=cost
def run(self,optim_para): print "Cluster running with params: " , optim_para tname=optim_para['diagnosis'] s=int(optim_para['interval_from']) j=int(optim_para['interval_to']) interval=int(optim_para['interval_min']) low= int(optim_para['orderset_from']) up= int(optim_para['orderset_to']) cur = self.conn.cursor() ran=24 #mincost_stat = "" #mincost_end = "" with self.conn: timestart = s start = s #end = '' orderosdict=dict() while start <=j-2: mincost=1000000 mincost_start=0 mincost_end=0 costdic=dict() origcostdic=dict() original_cost=-999999 tcost=1000000; for end in range(start+interval,min(start+ran,j+1)): print "start vs end: %i, %i" %(start,end) score=list() print "A", strftime("%Y-%m-%d %H:%M:%S", gmtime()) cur.execute("drop view if exists attribute_pivot_%s"%tname) cur.execute("delete from orderbytime_%s"%tname) cur.execute("call timeinterval_%s(%i,%i)"%(tname,start,end)) cur.execute("call createpivot_%s()"%tname) print "B" , strftime("%Y-%m-%d %H:%M:%S", gmtime()) datamat=[] ordermat=[] cur.execute("select * from attribute_pivot_%s"%tname) d=cur.fetchall() for resd in d: datamat.append(resd) d = mat(datamat) cur.execute("select distinct itemnum from %s where Otime>= %i and Otime < %i order by itemnum"%(tname,start,end)) o=cur.fetchall() for reso in o: ordermat.append(reso) o = mat(ordermat) order_pivot = d*d.transpose() #item by item dat=order_pivot/order_pivot.diagonal() #item by item divided by diagonal first time data = dat.transpose()/order_pivot.diagonal() #item by item divided by diagonal second time RR = data*d.shape[1] RRdist = empty((len(RR),len(RR)),float) for s in range(0,len(RR)): for t in range(0,len(RR)): RRdist[s,t] = float(RR[s,t]) numpy.savetxt('RR.txt',RRdist,delimiter='\t') print "C" ,strftime("%Y-%m-%d %H:%M:%S", gmtime()) for i in range(low,up): cur.execute("delete from orderos_%s"%tname) try: mycentroids, myclusters = kmeans.biKmeans(RRdist,i) clusterassn = myclusters[0:,[0]] cluster = squeeze(asarray(clusterassn)) orderitem = 
squeeze(asarray(o)) m = vstack([cluster,orderitem]) orderos = m.transpose() # orderitem, cluster (1185,2) for r in range(0,len(orderos)): #print "AAA--- insert into orderos_%s"%tname + " values(%s,%s)"%list(orderos[r]) sql="insert into orderos_%s "% tname cur.execute(sql+"values(%s,%s)",list(orderos[r])) if (optim_para['cost_func'] == "mcc") : cost = costCalculator.exp(tname,self.conn) else: cost = costCalculator_CCC.cccexp(tname,self.conn) if cost<tcost: tcost=cost print "cost=%i at %i" %(cost,i) cur.execute("select * from orderos_%s"%tname) orderostable = cur.fetchall() orderosmat=[] for l in orderostable: orderosmat.append(l) orderosdict[tcost]=orderosmat print "tcost=%i" %(tcost) print strftime("%Y-%m-%d %H:%M:%S", gmtime()) except ValueError: print "Kmeans not successful. Move on." if tcost<mincost: costdic[tcost]=i mincost=tcost mincost_start=start mincost_end=end if (optim_para['cost_func'] == "mcc") : original_cost = costCalculator.orig_exp(tname,self.conn) else: original_cost = costCalculator_CCC.orig_cccexp(tname,self.conn) origcostdic[original_cost]=i print "compare against original cost = %i" %(original_cost) print "mincost change: cost=%i, start=%i, end=%i" %(mincost, start,end) print "####################################################################" start=mincost_end result=dict() if tcost<100000: result['tstart'] = mincost_start result['tend'] = mincost_end lKey = original_cost result['cost_orig'] = lKey print "lowest cost %i compared to original cost %i obtained at %i" % (min(costdic), lKey , costdic.get(min(costdic))) print "lowest cost for start: %i end: %i is: %i" % (mincost_start,mincost_end,mincost) result['cost'] = min(costdic) result['cost_index'] = costdic.get(min(costdic)) result['isSuccess']="success" else: result['tstart'] = timestart result['tend'] = j result['cost']=-99999 result['isSuccess']="fail" result['cost_index']=-99999 result['cost_orig']=-99999 #################Save to Database cur2 = self.conn2.cursor() with self.conn2: 
cur2.execute("select max(sessionID) from session") maxsessionID=cur2.fetchall()[0][0] sessionID=maxsessionID+1 if result['isSuccess'] == "success": for r in range(0,len(orderosdict[min(costdic)])): cur2.execute("insert into orderos (os,item) values(%s,%s)" ,orderosdict[min(costdic)][r]) cur2.execute("update orderos set sessionID = %s where sessionID is null" %sessionID) print "insert into session values(%s,%s,%s,%s,%s,%s,%s,%s,%s,\"%s\",\"%s\",\"%s\",\"%s\",\"%s\")" %(sessionID,mincost_start,mincost_end,result['cost_index'],result['cost'],result['cost_orig'],low,up,interval,tname,optim_para['cost_func'],strftime('%Y-%m-%d %H:%M:%S'),strftime('%Y-%m-%d %H:%M:%S'),result['isSuccess']) cur2.execute("insert into session values(%s,%s,%s,%s,%s,%s,%s,%s,%s,\"%s\",\"%s\",\"%s\",\"%s\",\"%s\")" %(sessionID,result['tstart'],result['tend'],result['cost_index'],result['cost'],result['cost_orig'],low,up,interval,tname,optim_para['cost_func'],strftime('%Y-%m-%d %H:%M:%S'),strftime('%Y-%m-%d %H:%M:%S'),result['isSuccess']))