Exemplo n.º 1
0
def clusterClubs(numClust=5):
    """Cluster the points listed in ``places.txt`` and plot them over a map image.

    Each line of ``places.txt`` is split on tabs; fields 4 and 3 (in that
    order) are parsed as floats and become the two coordinates of a point
    (presumably longitude/latitude -- confirm against the data file).
    The points are clustered with ``kmeans.biKmeans`` using ``distSLC`` as
    the distance measure, then drawn on top of ``Portland.png``.

    Args:
        numClust: number of clusters requested from ``kmeans.biKmeans`` and
            the number of cluster indices that are plotted.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    datList = []
    # Read inside a ``with`` block so the file handle is always closed
    # (the previous ``open(...).readlines()`` left it to the garbage collector).
    with open('places.txt') as placesFile:
        for line in placesFile:
            lineArr = line.split('\t')
            datList.append([float(lineArr[4]), float(lineArr[3])])
    datMat = mat(datList)
    myCentroids, clustAssing = kmeans.biKmeans(datMat,
                                               numClust,
                                               distMeas=distSLC)
    fig = plt.figure()
    rect = [0.1, 0.1, 0.8, 0.8]
    scatterMarkers = ['s', 'o', '^', '8', 'p',
                      'd', 'v', 'h', '>', '<']
    # ax0 carries the background image and has its ticks suppressed.
    axprops = dict(xticks=[], yticks=[])
    ax0 = fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread('Portland.png')
    ax0.imshow(imgP)
    # ax1 occupies the same rectangle without a frame, so the scatter
    # markers are drawn over the image.
    ax1 = fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        # Column 0 of the assignment matrix is compared with the cluster
        # index to select the rows belonging to cluster ``i``.
        ptsInCurrCluster = datMat[nonzero(clustAssing[:, 0].A == i)[0], :]
        # Wrap around the marker list when there are more clusters than markers.
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:, 0].flatten().A[0],
                    ptsInCurrCluster[:, 1].flatten().A[0],
                    marker=markerStyle,
                    s=90)
    # Centroids are drawn last, as large '+' markers.
    ax1.scatter(myCentroids[:, 0].flatten().A[0],
                myCentroids[:, 1].flatten().A[0],
                marker='+',
                s=300)
    plt.show()
Exemplo n.º 2
0
def train(tfidf, word_id, k):
    """Run bisecting k-means over dense rows built from sparse TF-IDF data.

    For every document ``i`` a row of ``len(vocabulary)`` zeros is created
    (``vocabulary`` is read from module scope) and, for each position ``j``
    of ``tfidf[i]``, the weight ``tfidf[i][j]`` is stored at column
    ``word_id[i][j]``. The rows are wrapped in ``np.mat`` and handed to
    ``kmeans.biKmeans`` together with ``k``.

    Returns:
        The first element of the pair returned by ``kmeans.biKmeans``.
        NOTE(review): confirm whether that element is the centroids or the
        assignments for the ``kmeans`` module in use.
    """
    dense_rows = []
    for doc_idx, weights in enumerate(tfidf):
        row = [0] * len(vocabulary)
        for pos, weight in enumerate(weights):
            row[word_id[doc_idx][pos]] = weight
        dense_rows.append(row)
    first, _second = kmeans.biKmeans(np.mat(dense_rows), k)
    return first
Exemplo n.º 3
0
def test2():
    """Load ``testSet2.txt``, split it into 3 clusters with biKmeans, print both results."""
    points = mat(kmeans.loadData('testSet2.txt'))
    outcome = kmeans.biKmeans(points, 3)
    # biKmeans is expected to return exactly two values; unpack them for printing.
    centroids, assignments = outcome
    print("centList:", centroids, "newAssment:", assignments)
Exemplo n.º 4
0
# coding: utf-8
# kmeans/test_bi_kmeans.py

import kmeans
import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":
    # Load the sample points and cluster them into 3 groups with bisecting k-means.
    dataMat = np.mat(kmeans.loadDataSet('data/testSet2.txt'))
    centroids, clusterAssment = kmeans.biKmeans(dataMat, 3)
    clusterCount = centroids.shape[0]
    m = dataMat.shape[0]
    # Draw the scatter plot
    patterns = ['o', 'D', '^']
    colors = ['b', 'g', 'y']
    fig = plt.figure()
    title = 'bi-kmeans with k=3'
    ax = fig.add_subplot(111, title=title)
    for k in range(clusterCount):
        # Draw the cluster centroid
        ax.scatter(centroids[k, 0],
                   centroids[k, 1],
                   color='r',
                   marker='+',
                   linewidth=20)
        # NOTE(review): `i` is not used in the visible part of this loop body,
        # so the same cluster-k points appear to be re-plotted m times --
        # confirm against the remainder of the (truncated) statement.
        for i in range(m):
            # Draw the samples that belong to this cluster centroid
            ptsInCluster = dataMat[np.nonzero(clusterAssment[:, 0].A == k)[0]]
            ax.scatter(ptsInCluster[:, 0].flatten().A[0],
                       ptsInCluster[:, 1].flatten().A[0],
                       marker=patterns[k],
Exemplo n.º 5
0
 # Fragment: the enclosing function header and the end of the `try` block
 # are outside this excerpt.
 data = dat.transpose()/order_pivot.diagonal() #item by item divided by diagonal second time
 # Scale by the number of columns of d.
 RR = data*d.shape[1]
 RRdist = empty((len(RR),len(RR)),float)
 
 # Copy RR element by element into the plain float array RRdist.
 for s in range(0,len(RR)):
     for t in range(0,len(RR)):
         RRdist[s,t] = float(RR[s,t])
 # Dump the matrix to RR.txt (tab separated); overwritten on every pass.
 numpy.savetxt('RR.txt',RRdist,delimiter='\t')
 print "C"
 print strftime("%Y-%m-%d %H:%M:%S", gmtime())
     
 # Try every cluster count in [low, up).
 for i in range(low,up):
     # Clear the per-diagnosis assignment table before each attempt.
     cur.execute("delete from orderos_%s"%tname)
     
     try:
         mycentroids, myclusters = kmeans.biKmeans(RRdist,i)
         print "CD"
         # Keep only column 0 of the biKmeans assignment output.
         clusterassn = myclusters[0:,[0]]
         cluster = squeeze(asarray(clusterassn))
         orderitem = squeeze(asarray(o))
         m = vstack([cluster,orderitem])
         orderos = m.transpose() # columns: cluster, orderitem (original note gave shape (1185,2))
                 
         # Insert one (cluster, orderitem) row per item.
         for r in range(0,len(orderos)):
             #print "AAA--- insert into orderos_%s"%tname + " values(%s,%s)"%list(orderos[r])
             # NOTE(review): the table name is interpolated into the SQL text;
             # only the two values are passed as bound parameters.
             sql="insert into orderos_%s "% tname
             cur.execute(sql+"values(%s,%s)",list(orderos[r]))
         cost = costCalculator.exp(tname)
         # Track the lowest cost seen so far.
         if cost<yoyocost:
             yoyocost=cost
             
Exemplo n.º 6
0
  def run(self,optim_para):
    """Search time windows and cluster counts for the lowest-cost order sets.

    Reads these keys from ``optim_para``: 'diagnosis' (used as a table-name
    suffix), 'interval_from', 'interval_to', 'interval_min',
    'orderset_from', 'orderset_to' (all converted with ``int``) and
    'cost_func' ("mcc" selects ``costCalculator``; anything else selects
    ``costCalculator_CCC``).

    For each window start, every candidate ``end`` in
    ``range(start+interval, min(start+ran, j+1))`` is evaluated: the pivot
    data is rebuilt through SQL on ``self.conn``, a matrix ``RRdist`` is
    derived from it, and ``kmeans.biKmeans`` is run for every cluster count
    in ``range(low, up)``. The cheapest assignment found is remembered and,
    once per window, a row is written to the ``session`` table (and, on
    success, rows to ``orderos``) through ``self.conn2``.

    No value is returned in the visible code; results are only persisted to
    the database and printed.
    """
    print "Cluster running with params: " , optim_para
  
    tname=optim_para['diagnosis']
    s=int(optim_para['interval_from'])
    j=int(optim_para['interval_to'])
    interval=int(optim_para['interval_min'])
    low= int(optim_para['orderset_from'])
    up= int(optim_para['orderset_to'])
    cur = self.conn.cursor()
    # Upper bound on how far `end` may run past `start` (see the `for end` loop).
    ran=24
    #mincost_stat = ""
    #mincost_end = ""

    with self.conn:
      timestart = s
      start = s
      #end = ''
      # Maps a cost value to the orderos rows that produced it; it is never
      # cleared, so entries persist across windows.
      orderosdict=dict()
      
      while start <=j-2:
      
        # Per-window bookkeeping. Note tcost is reset here, not per `end`, so
        # it is the minimum over every `end` and cluster count in this window.
        mincost=1000000
        mincost_start=0
        mincost_end=0
        costdic=dict()
        origcostdic=dict()
        original_cost=-999999
        tcost=1000000;

        for end in range(start+interval,min(start+ran,j+1)):
          print "start vs end: %i, %i" %(start,end)
          score=list()
          print "A", strftime("%Y-%m-%d %H:%M:%S", gmtime())
          # Rebuild the per-diagnosis pivot for the [start, end) window.
          # NOTE(review): tname is interpolated directly into the SQL text
          # throughout this method -- confirm it never carries untrusted input.
          cur.execute("drop view if exists attribute_pivot_%s"%tname)
          cur.execute("delete from orderbytime_%s"%tname)
          cur.execute("call timeinterval_%s(%i,%i)"%(tname,start,end))
          cur.execute("call createpivot_%s()"%tname)
          print "B" , strftime("%Y-%m-%d %H:%M:%S", gmtime())
          datamat=[]
          ordermat=[]
          # Load the pivot rows into matrix d.
          cur.execute("select * from attribute_pivot_%s"%tname)
          d=cur.fetchall()
          for resd in d:
              datamat.append(resd)
          d = mat(datamat)
          # Load the distinct item numbers seen in the window into matrix o.
          cur.execute("select distinct itemnum from %s where Otime>= %i and Otime < %i order by itemnum"%(tname,start,end))
          o=cur.fetchall()
          for reso in o:
              ordermat.append(reso)
          o = mat(ordermat)
          order_pivot = d*d.transpose() #item by item
          dat=order_pivot/order_pivot.diagonal() #item by item divided by diagonal first time
          data = dat.transpose()/order_pivot.diagonal() #item by item divided by diagonal second time
          # Scale by the number of columns of d.
          RR = data*d.shape[1]
          RRdist = empty((len(RR),len(RR)),float)
          
          # Copy RR element by element into the plain float array RRdist.
          # NOTE(review): this rebinds `s` (the interval_from value); `s` is
          # not read again below in the visible code.
          for s in range(0,len(RR)):
              for t in range(0,len(RR)):
                  RRdist[s,t] = float(RR[s,t])
          # Dump the matrix to RR.txt (tab separated); overwritten on every pass.
          numpy.savetxt('RR.txt',RRdist,delimiter='\t')
          print "C" ,strftime("%Y-%m-%d %H:%M:%S", gmtime())
              
          # Try every cluster count in [low, up).
          for i in range(low,up):
              # Clear the per-diagnosis assignment table before each attempt.
              cur.execute("delete from orderos_%s"%tname)
              
              try:
                  mycentroids, myclusters = kmeans.biKmeans(RRdist,i)
                  # Keep only column 0 of the biKmeans assignment output.
                  clusterassn = myclusters[0:,[0]]
                  cluster = squeeze(asarray(clusterassn))
                  orderitem = squeeze(asarray(o))
                  m = vstack([cluster,orderitem])
                  orderos = m.transpose() # columns: cluster, orderitem (original note gave shape (1185,2))
                          
                  for r in range(0,len(orderos)):
                      #print "AAA--- insert into orderos_%s"%tname + " values(%s,%s)"%list(orderos[r])
                      sql="insert into orderos_%s "% tname
                      cur.execute(sql+"values(%s,%s)",list(orderos[r]))
                  # Pick the cost function requested by the caller.
                  if (optim_para['cost_func'] == "mcc") :
                    cost = costCalculator.exp(tname,self.conn)
                  else:
                    cost = costCalculator_CCC.cccexp(tname,self.conn)
                  if cost<tcost:
                      tcost=cost
                      
                      print "cost=%i at %i" %(cost,i)
                      # Snapshot the assignment rows that achieved this cost,
                      # keyed by the cost value itself.
                      cur.execute("select * from orderos_%s"%tname)
                      orderostable = cur.fetchall()
                      orderosmat=[]
                      for l in orderostable:
                          orderosmat.append(l)
                      orderosdict[tcost]=orderosmat
                      
                  print "tcost=%i" %(tcost)
                  print strftime("%Y-%m-%d %H:%M:%S", gmtime())
              except ValueError:
                  # A ValueError anywhere in the attempt skips this cluster count.
                  print "Kmeans not successful. Move on."
           
          
          if tcost<mincost:
              # NOTE(review): `i` here is whatever value the `for i` loop above
              # finished on, not necessarily the cluster count that produced
              # tcost; it is also unbound if range(low, up) is empty -- confirm.
              costdic[tcost]=i
              mincost=tcost
              mincost_start=start
              mincost_end=end
              if (optim_para['cost_func'] == "mcc") :
                original_cost = costCalculator.orig_exp(tname,self.conn)
              else:
                original_cost = costCalculator_CCC.orig_cccexp(tname,self.conn)
              origcostdic[original_cost]=i
              print "compare against original cost = %i" %(original_cost)
              print "mincost change: cost=%i, start=%i, end=%i" %(mincost, start,end)

          print "####################################################################"
        # Advance to the end of the best window.
        # NOTE(review): if no `end` improved on mincost, mincost_end is still 0
        # and `start` is reset to 0 -- confirm the loop still terminates.
        start=mincost_end

        result=dict()


        # NOTE(review): the success threshold is 100000 while tcost/mincost
        # start at 1000000 -- confirm the differing magnitudes are intended.
        if tcost<100000:
            result['tstart'] = mincost_start
            result['tend'] = mincost_end
            lKey = original_cost
            result['cost_orig'] = lKey
            print "lowest cost %i compared to original cost %i obtained at %i" % (min(costdic), lKey , costdic.get(min(costdic)))
            print "lowest cost for start: %i end: %i is: %i" % (mincost_start,mincost_end,mincost)
            result['cost'] = min(costdic)
            result['cost_index'] = costdic.get(min(costdic))
            result['isSuccess']="success"

        else:
            # Failure record: the full requested range with sentinel values.
            result['tstart'] = timestart
            result['tend'] = j
            result['cost']=-99999
            result['isSuccess']="fail"
            result['cost_index']=-99999
            result['cost_orig']=-99999

    #################Save to Database
        # This section is indented inside the `while` loop, so one session row
        # is written per window.

        cur2 = self.conn2.cursor()
        with self.conn2:
            # Next session id = current maximum + 1.
            # NOTE(review): presumably max(sessionID) is NULL/None on an empty
            # table, in which case the addition below would fail -- confirm.
            cur2.execute("select max(sessionID) from session")
            maxsessionID=cur2.fetchall()[0][0]
            sessionID=maxsessionID+1
            if result['isSuccess'] == "success":
                # Persist the rows of the cheapest assignment, then tag the
                # rows that have no sessionID yet with this session.
                for r in range(0,len(orderosdict[min(costdic)])):
                    cur2.execute("insert into orderos (os,item) values(%s,%s)" ,orderosdict[min(costdic)][r])
                cur2.execute("update orderos set sessionID = %s where sessionID is null" %sessionID)

            # NOTE(review): the printed statement uses mincost_start/mincost_end
            # while the executed one uses result['tstart']/result['tend']; the
            # two differ in the "fail" case.
            print "insert into session values(%s,%s,%s,%s,%s,%s,%s,%s,%s,\"%s\",\"%s\",\"%s\",\"%s\",\"%s\")" %(sessionID,mincost_start,mincost_end,result['cost_index'],result['cost'],result['cost_orig'],low,up,interval,tname,optim_para['cost_func'],strftime('%Y-%m-%d %H:%M:%S'),strftime('%Y-%m-%d %H:%M:%S'),result['isSuccess'])
            cur2.execute("insert into session values(%s,%s,%s,%s,%s,%s,%s,%s,%s,\"%s\",\"%s\",\"%s\",\"%s\",\"%s\")" %(sessionID,result['tstart'],result['tend'],result['cost_index'],result['cost'],result['cost_orig'],low,up,interval,tname,optim_para['cost_func'],strftime('%Y-%m-%d %H:%M:%S'),strftime('%Y-%m-%d %H:%M:%S'),result['isSuccess']))