def test1():
    """Exercise the kMeans helpers on 'testSet.txt' and plot a bisecting split.

    Prints the minimum over column 0, two random centroids, the distance
    between the first two rows, and the centroids found by
    ``kMeans.biKmeans`` with k=4, then hands the data and centroids to
    ``kMeans.plot1``.
    """
    samples = mat(kMeans.loadDataSet('testSet.txt'))

    print(min(samples[:, 0]))
    print(kMeans.randCent(samples, 2))
    print(kMeans.distEclud(samples[0], samples[1]))

    # Plain k-means alternative, kept for reference:
    # centroids, assignments = kMeans.kMeans(samples, 3)
    centroids, assignments = kMeans.biKmeans(samples, 4)
    print(centroids)

    kMeans.plot1(samples, centroids)
def main(dataDirectory='D:/PycharmProjects/ZhiHuKanShan/zhihukanshan/data'):
    """Convert the word-embedding text file into an HDF5 store.

    Reads ``<dataDirectory>/rem_word_embedding.txt`` through
    ``kMeans.loadDataSet``, wraps the result in a ``pandas.DataFrame`` and
    writes it to ``rem_word_embedding.h5`` (in the current working
    directory) under the key ``'df'``.

    Args:
        dataDirectory: Directory containing ``rem_word_embedding.txt``.
            Defaults to the path that used to be hard-coded here, so
            existing ``main()`` calls behave as before.
    """
    list_DataMat_wordVectorSet = kMeans.loadDataSet(
        dataDirectory + '/rem_word_embedding.txt')

    matDataMat_wordVectorSet = pd.DataFrame(list_DataMat_wordVectorSet)

    # ``key`` is passed by keyword: positional use is deprecated since
    # pandas 2.1 and rejected by pandas 3.0.
    matDataMat_wordVectorSet.to_hdf('rem_word_embedding.h5', key='df')
import matplotlib.pyplot as plt


def showCluster(dataSet, k, centroids, clusterAssment):
    """Plot 2-D samples styled by cluster index, then overlay the centroids.

    Prints a message and returns 1 when the data does not have exactly two
    columns or when ``k`` exceeds the number of available marker styles.
    Otherwise shows the figure and returns None.
    """
    sampleCount, columnCount = shape(dataSet)
    if columnCount != 2:
        print("Sorry! i can not draw because the dimension of data is not 2!")
        return 1

    sampleStyles = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(sampleStyles):
        print("Sorry! Your k is too large!")
        return 1

    # Draw every sample with the style of the cluster it was assigned to
    # (column 0 of clusterAssment holds the cluster index).
    for row in range(sampleCount):
        clusterIndex = int(clusterAssment[row, 0])
        plt.plot(dataSet[row, 0], dataSet[row, 1], sampleStyles[clusterIndex])

    centroidStyles = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    # Draw the centroids; the marker keyword picks the centroid glyph, and
    # color / markersize pick its colour and size.
    for c in range(k):
        plt.plot(centroids[c, 0], centroids[c, 1], centroidStyles[c],
                 marker='+', color='red', markersize=18)

    plt.show()


# Cluster the sample file into four groups and display the result.
datMat = mat(kMeans.loadDataSet('../data/kMeans_testSet.txt'))
clusterCenters, clusterAssment = kMeans.kMeans(datMat, 4)
showCluster(datMat, 4, clusterCenters, clusterAssment)
import kMeans
from numpy import *

# Load the drop-ship records and the seven "others" partitions.
# dat_set = mat(kMeans.loadDataSet('ds_hash.txt'))
dat_dropship = kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_DROPSHIP.txt')
dat_other_0 = kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_0.txt')
dat_other_1 = kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_1.txt')
dat_other_2 = kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_2.txt')
dat_other_3 = kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_3.txt')
dat_other_4 = kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_4.txt')
dat_other_5 = kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_5.txt')
dat_other_6 = kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_6.txt')

# Single-argument print(...) produces the same output on Python 2 and 3;
# the bare print statement used before is a SyntaxError on Python 3.
print("origin date")
print(len(dat_dropship))
print(len(dat_other_0))

# Append every drop-ship record to each "others" partition (in place).
for dat_other in (dat_other_0, dat_other_1, dat_other_2, dat_other_3,
                  dat_other_4, dat_other_5, dat_other_6):
    dat_other.extend(dat_dropship)

print(len(dat_other_0))
print(len(dat_other_1))
print(len(dat_other_2))
print(len(dat_other_3))
print(len(dat_other_4))
'''
Created on 2016. 2. 9.

@author: TaijinKim
'''
import kMeans
from numpy import *

# Load the sample points as a numpy matrix.
dataMat = mat(kMeans.loadDataSet('../data/testSet.txt'))

# Disabled experiments (column extrema, random centroids, distance, k-means):
# print(min(dataMat[:, 0]))
# print(min(dataMat[:, 1]))
# print(max(dataMat[:, 1]))
# print(max(dataMat[:, 0]))
# print(kMeans.randCent(dataMat, 2))
#
# print(kMeans.distEclud(dataMat[0], dataMat[1]))
# myCentroids, clustAssing = kMeans.kMeans(dataMat, 4)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'10.3'
__author__ = 'lxp'

import kMeans
import numpy as np

# Run kMeans.biKMeans with k=3 on 'testSet2.txt' and print the centroids.
datMat3 = np.mat(kMeans.loadDataSet('testSet2.txt'))
centList, myNewAssment = kMeans.biKMeans(datMat3, 3)
print(centList)
# print(myNewAssment)
import kMeans
import numpy as np
import matplotlib.pyplot as plt

# Plain k-means (k=4) on the first sample file.
dataMat = np.mat(kMeans.loadDataSet('testSet.txt'))
# print(kMeans.randCent(dataMat,2))
myCentroids, clustAssing = kMeans.kMeans(dataMat, 4)
# print(myCentroids,clustAssing)
datalist = dataMat.tolist()
# print([x[0] for x in datalist])

# Disabled scatter plot of the samples and the k-means centroids:
# plt.figure()
# plt.scatter([x[0] for x in datalist],[x[1] for x in datalist])
# plt.scatter([x[0] for x in myCentroids.tolist()],[x[1] for x in myCentroids.tolist()])
# plt.title('kmeans')
# plt.show()

# Bisecting k-means (k=3) on the second sample file.  The loaded data is
# wrapped in np.mat like the first call above; previously the raw
# loadDataSet() result was passed straight through.
# NOTE(review): assumes kMeans.biKmeans needs matrix input like
# kMeans.kMeans does - confirm against the kMeans module.
dataMat = np.mat(kMeans.loadDataSet("testSet2.txt"))
centList, clusteAssment = kMeans.biKmeans(dataMat, 3)
print(centList)
import kMeans
import ProbIN
from numpy import *
import subprocess
import numpy as np

# Bisecting k-means with 12 clusters on the motion training data; the
# returned centroids/assignments are not used here.
datMat = mat(kMeans.loadDataSet('motionData_Training.txt'))
kMeans.biKmeans(datMat, 12)

# Disabled: the same run for the GPS data with 7 clusters.
# datMat2 = mat(kMeans.loadDataSet('GPS_1Hz_training.txt'))
# kMeans.biKmeans(datMat2,7)
import kMeans
import os
import sys
from numpy import *

# Resolve the sample file relative to this script's own directory so the
# script does not depend on the current working directory.
project_path = os.path.abspath(os.path.dirname(__file__))
text_path = os.path.join(project_path, "../chapter10/testSet.txt")

# Cluster the samples into four groups.
datMat = mat(kMeans.loadDataSet(text_path))
myCentroids, clustAssing = kMeans.kMeans(datMat, 4)
import kMeans from numpy import * import time # dat_set = mat(kMeans.loadDataSet('ds_hash.txt')) dat_dropship = (kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_DROPSHIP.txt')) dat_other_0 = (kMeans.loadDataSet('DEALED_D_CUSTORMER_SHIPMENT_ITEMS_OTHERS_0.txt')) print "origin date" print len(dat_dropship) print len(dat_other_0) for i in dat_dropship: dat_other_0.append(i) print len(dat_other_0) dat_other_0 = mat(dat_other_0) # dat_dropship_0 = mat(dat_dropship) def biKmeans_func(data_set, k, cent_file="", clus_file=""): print "kMeans : " + cent_file cent, clus = kMeans.biKmeans(data_set, k) # print cent # print clus kmean_res_cent_file = open(cent_file, 'w') for item in cent.A: item_str = ""
#!/usr/bin/python2.7 # _*_ coding: utf-8 _*_ """ @Author: MarkLiu """ import numpy as np import kMeans import matplotlib.pyplot as plt dataArr = kMeans.loadDataSet('datasets/testSet2.txt') dataMat = np.matrix(dataArr) k = 3 centroids, clusterAssment = kMeans.biKmeans(dataMat, k) # centroids, clusterAssment = kMeans.kMeans(dataMat, k) # 计算原始数据加上中心数据,将数据分离 m = np.shape(dataMat)[0] # 分离出不同簇的x,y坐标 xPoint_0 = [] yPoint_0 = [] xPoint_1 = [] yPoint_1 = [] xPoint_2 = [] yPoint_2 = [] xPoint_3 = [] yPoint_3 = [] for i in range(m): if int(clusterAssment[i, 0]) == 0:
import kMeans
import numpy as np
import matplotlib.pyplot as plt

# Plain k-means with k=4 (results are only used by the disabled plot below).
dataMat = np.mat(kMeans.loadDataSet('kMeans/testSet2.txt'))
myCentroid, clustAssing = kMeans.kMeans(dataMat, 4)
# plt.plot(dataMat[:,0],dataMat[:,1], 'ro')
# plt.plot(myCentroid[:,0], myCentroid[:,1], 'gs')

# Bisecting k-means with k=3 on a fresh copy of the same file.
dataMat2 = np.mat(kMeans.loadDataSet('kMeans/testSet2.txt'))
centList, myNewAssments = kMeans.biKMeans(dataMat2, 3)
centList = np.mat(centList)

# Samples as red circles, centroids as green squares.
plt.plot(dataMat2[:, 0], dataMat2[:, 1], 'ro')
plt.plot(centList[:, 0], centList[:, 1], 'gs')
plt.show()
# -*- coding:utf-8 -*-
import kMeans
from numpy import *

datMat = mat(kMeans.loadDataSet("testSet.txt"))

# Disabled: plain k-means on the first data set.
# myCentroids,clusterAssing = kMeans.kMeans(datMat, 4)
# print("myCentroids is %s " % myCentroids)
# print("clusterAssing is %s " % clusterAssing)

# kMeans test example two: bisecting k-means with k=3.
dataMat2 = mat(kMeans.loadDataSet('testSet2.txt'))
centList, myNewAssment = kMeans.biKmeans(dataMat2, 3)
print(centList)

# geoResult = kMeans.geoGrab('1 VA Center', 'Augusta,ME')
def test1():
    """Print two random centroids and the distance between the first two rows.

    Loads 'testSet.txt' via ``kMeans.loadDataSet`` and passes it to
    ``kMeans.randCent`` and ``kMeans.distEclud``.
    """
    dataMat = np.mat(kMeans.loadDataSet('testSet.txt'))
    # print(...) with one argument behaves the same on Python 2 and 3; the
    # bare print statement is a SyntaxError on Python 3.
    print(kMeans.randCent(dataMat, 2))
    print(kMeans.distEclud(dataMat[0], dataMat[1]))
def test4():
    """Run bisecting k-means (k=3) on 'testSet2.txt', plot it, print centroids."""
    dataMat = np.mat(kMeans.loadDataSet('testSet2.txt'))
    centList, myNewAssments = kMeans.biKmeans(dataMat, 3)
    kMeans.plotScatter(dataMat, centList, myNewAssments)
    # print(...) with one argument behaves the same on Python 2 and 3; the
    # bare print statement is a SyntaxError on Python 3.
    print(centList)
def test3():
    """Run bisecting k-means with k=4 on 'testSet.txt', discarding the result."""
    samples = np.mat(kMeans.loadDataSet('testSet.txt'))
    kMeans.biKmeans(samples, 4)
#将当前簇 i 进行二分kMeans处理 centroidMat, splitClustAss = kMeans.kMeans(ptsInCurrCluster, 2, distMeas) #将二分 kMeans 结果中的平方和的距离进行求和 sseSplit = sum(splitClustAss[:,1]) #将未参与二分 kMeans 分配结果中的平方和的距离进行求和 sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1]) print("sseSplit, and notSplit: ",sseSplit,sseNotSplit) #计算拆分后与未拆分时的误差和,误差和越小,划分的结果就越好。 if (sseSplit + sseNotSplit) < lowestSSE: bestCentToSplit = i bestNewCents = centroidMat bestClustAss = splitClustAss.copy() lowestSSE = sseSplit + sseNotSplit #找出最好的簇分配结果??? bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList) #当使用kMeans()函数并指定簇数为2时,会得到两个编号为0和1的结果簇。需要将这些簇编号修改为划分簇及新加簇的编号。 bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit # 更新为最佳质心 print('the bestCentToSplit is: ',bestCentToSplit) print('the len of bestClustAss is: ', len(bestClustAss)) #更新质心列表??? centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0] #更新原质心 list 中的第 i 个质心为使用二分 kMeans 后 bestNewCents 的第一个质心 centList.append(bestNewCents[1,:].tolist()[0]) # 添加 bestNewCents 的第二个质心 # 重新分配最好簇下的数据(质心)以及SSE clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss return mat(centList), clusterAssment if __name__ == '__main__': #测试二分K-Means聚类算法 myDat = kMeans.loadDataSet(r'C:\Users\v_wangdehong\PycharmProjects\MachineLearning_V\9.K-Means\data\testSet2.txt') myMat = mat(myDat) centList,myNewAssments = biKmeans(myMat,3) print(centList)
"M11": [0.839471456333, 0.385856421, 0.983790752333], "M12": [0.957817317, 0.3012502055, 0.7800295435], } dispLabelDic = { "D1": [0.47949965, 0], "D2": [0.24391939, 0.65005331], "D3": [0.65160991, 1], "D4": [0.66235972, 0.59802129], "D5": [0.61647991, 0.53295326], "D6": [0.64551821, 0.67475389], "D7": [0, 0.3630064], } motionDataMat = mat(kMeans.loadDataSet("motionData_Training.txt")) dispDataMat = mat(kMeans.loadDataSet("GPS_1Hz_training.txt")) print motionDataMat[0] # print ProbIN.classifyMotionLabel(motionDataMat[0]) == 'M4' print dispDataMat[0] print ProbIN.classifyDispLabel(dispDataMat[0]) f = open("MD_pair_1Hz_for_Training.txt", "w") for i in range(len(dispDataMat)): print >> f, ProbIN.classifyMotionLabel(motionDataMat[i]), "\t", ProbIN.classifyDispLabel(dispDataMat[i])
import kMeans
from numpy import *

# The ``imp`` module was removed in Python 3.12.  Prefer importlib.reload
# and fall back to ``imp`` only on interpreters whose importlib lacks it.
try:
    from importlib import reload
except ImportError:
    from imp import reload

# Disabled experiments with the plain k-means helpers:
# dataMat = mat(kMeans.loadDataSet('./testSet.txt'))
# kMeans.randCent(dataMat, 2)
# kMeans.distEclud(dataMat[0], dataMat[1])
# myCentroids, clustAssing = kMeans.kMeans(dataMat, 4)

# Bisecting k-means with k=3 on the second sample file.
datMat3 = mat(kMeans.loadDataSet('./testSet2.txt'))
centList, myNewAssments = kMeans.biKmeans(datMat3, 3)
# coding:utf-8
import kMeans
from numpy import *

# The bare print statements used before are a SyntaxError on Python 3.
# Every print below is a single-argument call, which produces the same
# output on Python 2 and Python 3.

# Plain k-means with k=4 on the first sample file.
datMat = mat(kMeans.loadDataSet('testSet.txt'))
print(datMat[1:5, :])
myCentroids, clustAssing = kMeans.kMeans(datMat, 4)
print(myCentroids)
print(' ')
print(clustAssing)

# Bisecting k-means with k=3 on the second sample file.
datMat3 = mat(kMeans.loadDataSet('testSet2.txt'))
centList, myNewAssments = kMeans.biKmeans(datMat3, 3)
# Space-separated, matching what ``print a, b`` emitted.
print('%s %s' % (centList, myNewAssments))
"""用matplotlib展示划分过程及结果""" for t in range(len(centList)): #遍历当前的每一个簇 ptsInCurrCluster = dataSet[nonzero( clusterAssment[:, 0].A == t)[0], :] #过滤出属于这一簇的数据 x = flatten(ptsInCurrCluster[:, 0].tolist()) y = flatten(ptsInCurrCluster[:, 1].tolist()) plt.scatter(x, y) #将点展示出来 xx = [] yy = [] for cent in centList: #标注质心 xx.append(cent[0, 0]) yy.append(cent[0, 1]) plt.scatter(xx, yy, marker='*') plt.show() return centList, clusterAssment if __name__ == '__main__': test = loadDataSet('./testSet2.txt') x = [] y = [] for t in test: x.append(t[0]) y.append(t[1]) plt.scatter(x, y) plt.show() #得到原始数据的散点图 dataSet = mat(test) #print dataSet,dataSet[0,:] biKmeans(dataSet, 3)
def showFigure(dataMat, k, clusterAssment):
    """Plot each cluster's first column against its index and export the rows.

    For every cluster index below ``k`` the first data column is plotted
    against a constant equal to the cluster index.  After the figure is
    shown, one spreadsheet row per sample is written to the module-level
    ``sheet1``: the first data column, then the cluster's style tag.
    """
    styles = ['go', 'or', 'yo', 'ko', 'bo', 'mo']

    for cluster in range(k):
        members = dataMat[nonzero(clusterAssment[:, 0].A == cluster)[0]]
        # Constant column holding the cluster index, used as the y value.
        level = mat(cluster * ones((len(members), 1)))
        pylab.plot(members[:, 0], level, styles[cluster])
    pylab.show()

    nextRow = 0
    for cluster in range(k):
        members = dataMat[nonzero(clusterAssment[:, 0].A == cluster)[0]]
        for idx in range(len(members)):
            sheet1.write(nextRow, 0, members[idx, 0])
            # sheet1.write(nextRow, 1, members[idx, 1])
            sheet1.write(nextRow, 1, styles[cluster])
            nextRow += 1


if __name__ == '__main__':
    outputfilename = 'D:\\code\\KM\\res.xls'
    outputfile = xlwt.Workbook()
    # showFigure writes into this sheet through the module-level name.
    sheet1 = outputfile.add_sheet('sheet1', cell_overwrite_ok=True)

    k = 6
    dataMat = mat(kMeans.loadDataSet('D:\\code\\KM\\site.txt'))
    myCentroids, clusterAssment = kMeans.kMeans(dataMat, k)

    showFigure(dataMat, k, clusterAssment)
    outputfile.save(outputfilename)
from sklearn.cluster import KMeans
from numpy import *
import kMeans

# Compare scikit-learn's KMeans centroids with the local implementation's
# on the same data, both with four clusters.
X = kMeans.loadDataSet('testSet.txt')

kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
print("sklearn实现质心列表为:", kmeans.cluster_centers_)

centroids, _ = kMeans.kMeans(mat(X), 4)
print("python实现质心列表为:", centroids)
#!/usr/bin/env python
#-*- coding: UTF-8 -*-
import kMeans
from numpy import *

dataMat = mat(kMeans.loadDataSet('testSet.txt'))

# Two random centres.
kMeansRandCenter = kMeans.randCent(dataMat, 2)
print(kMeansRandCenter)

centroids, clusterAssment = kMeans.kMeans(dataMat, 5)

import matplotlib.pyplot as plt

# Figure 1: k-means centroids (red) and samples (blue); display is disabled.
fig = plt.figure(1)
plt.plot(centroids[:, 0], centroids[:, 1], 'ro')
plt.plot(dataMat[:, 0], dataMat[:, 1], 'bo')
plt.axis([-8, 8, -8, 8])
# plt.show()

# Result of this first call is discarded.
kMeans.binaryKeans(dataMat, 3)

dataMat3 = mat(kMeans.loadDataSet('testSet2.txt'))
centList, Assments = kMeans.binaryKeans(dataMat3, 3)
print("centList:", centList)
print("Assments:", Assments)

# Figure 2: samples (blue) and centroids (red); display is disabled.
fig = plt.figure(2)
plt.plot(dataMat3[:, 0], dataMat3[:, 1], 'bo')
plt.plot(centList[:, 0], centList[:, 1], 'ro')
plt.axis([-10, 10, -10, 10])
# plt.show()
import xlwt


def showFigure(dataMat, k, clusterAssment):
    """Plot every cluster in its own style and export the rows to ``sheet1``.

    For each cluster index below ``k`` the first two data columns are
    plotted against each other.  After the figure is shown, one spreadsheet
    row per sample is written to the module-level ``sheet1``: column 0,
    column 1, then the cluster's style tag.
    """
    styles = ['go', 'or', 'yo', 'ko', 'bo', 'mo']

    for cluster in range(k):
        members = dataMat[nonzero(clusterAssment[:, 0].A == cluster)[0]]
        pylab.plot(members[:, 0], members[:, 1], styles[cluster])
    pylab.show()

    nextRow = 0
    for cluster in range(k):
        members = dataMat[nonzero(clusterAssment[:, 0].A == cluster)[0]]
        for idx in range(len(members)):
            sheet1.write(nextRow, 0, members[idx, 0])
            sheet1.write(nextRow, 1, members[idx, 1])
            sheet1.write(nextRow, 2, styles[cluster])
            nextRow += 1


if __name__ == '__main__':
    outputfilename = 'D:\\code\\team\\res.xls'
    outputfile = xlwt.Workbook()
    # showFigure writes into this sheet through the module-level name.
    sheet1 = outputfile.add_sheet('sheet1', cell_overwrite_ok=True)

    k = 6
    dataMat = mat(kMeans.loadDataSet('D:\\code\\team\\data.txt'))
    myCentroids, clusterAssment = kMeans.kMeans(dataMat, k)

    showFigure(dataMat, k, clusterAssment)
    outputfile.save(outputfilename)
import kMeans
from numpy import*
import matplotlib
import matplotlib.pyplot as plt

# Bisecting k-means with k clusters on the sample file.
k = 4
datmat = array(kMeans.loadDataSet('testSet.txt'))
centerList, clusterAssment = kMeans.biKmeans(datmat, k)
# Single-argument print(...) gives the same output on Python 2 and 3; the
# bare print statement is a SyntaxError on Python 3.
print('The cendroids is: %s' % (centerList,))

fig = plt.figure()
fig.add_subplot(111)
colorList = ['b', 'c', 'g', 'k', 'r', 'y']
makerList = ['.', '^', '*', 'o', '+']

# One scatter series per cluster, each with its own colour and marker.
for i in range(k):
    memberRows = nonzero(clusterAssment[:, 0].A == i)[0]
    ax = plt.scatter(datmat[memberRows, 0], datmat[memberRows, 1],
                     c=colorList[i], marker=makerList[i])

# Centroids drawn as red circles.
ax = plt.scatter(array(centerList[:, 0]), array(centerList[:, 1]),
                 c=colorList[4], marker=makerList[3])
plt.title('Graph of k_Means ',)
plt.xlabel('x')
plt.ylabel('y')
plt.show()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author:yiluzhang
import kMeans
import numpy as np

if __name__ == '__main__':
    # Cluster 'testSet.txt' into four groups and print the centroids.
    data_set = np.mat(kMeans.loadDataSet('testSet.txt'))
    cent, clus = kMeans.kMeans(data_set, 4)
    print(cent)
    # print(clus)
#!/usr/bin/env python
__coding__ = "utf-8"
__author__ = "Ng WaiMing"

from kMeans import kMeans
from kMeans import loadDataSet
from kMeans import randCent
from kMeans import distEclud
from kMeans import biKmeans
from numpy import *

if __name__ == '__main__':
    dataMat = mat(loadDataSet('testSet.txt'))

    # Extrema of the first two columns.
    print('min(dataMat[:, 0])', min(dataMat[:, 0]), '\n')
    print('min(dataMat[:, 1])', min(dataMat[:, 1]), '\n')
    print('max(dataMat[:, 0])', max(dataMat[:, 0]), '\n')
    print('max(dataMat[:, 1])', max(dataMat[:, 1]), '\n')

    # Helper checks: random centroids and the distance between two rows.
    print(randCent(dataMat, 2), '\n')
    print(distEclud(dataMat[0], dataMat[1]))

    # Plain k-means with k=4.
    centroids, clusterAssment = kMeans(dataMat, 4)
    print('centroids:\n', centroids, '\n')
    print('clusterAssment:\n', clusterAssment, '\n')

    # Bisecting k-means with k=3 on the second sample file.
    dataMat3 = mat(loadDataSet('testSet2.txt'))
    centList, myNewAssments = biKmeans(dataMat3, 3)
    print('centList: \n', centList, '\n')

    # fileName = '../../../../data/k-means/places.txt'
    # imgName = '../../../../data/k-means/Portland.png'
    # kMeans.clusterClubs(fileName=fileName, imgName=imgName, numClust=5)
import kMeans
from numpy import *

# Load the txt data.
datMat = mat(kMeans.loadDataSet('data2.txt'))

# Columns 2-4 of datMat hold the semi-major axis, eccentricity and orbital
# inclination respectively (per the original author's note).
datMat[0, 2:5]

# Distance between the first two rows under the delta-v measure.
delta_v = kMeans.distdeltaV(datMat[0, 2:5], datMat[1, 2:5])

# Randomly generate k centroids.
centroids = kMeans.randCent(datMat[:, 2:5], 4)

# k-means clustering.
myCentroids, clustAssing = kMeans.kMeans(datMat[:, 2:5], 5, kMeans.distdeltaV)

# Bisecting k-means clustering.
centList, myNewAssments = kMeans.biKmeans(datMat[:, 2:5], 5, kMeans.distdeltaV)

# Plot the bisecting result.
kMeans.showCluster_SRQ(datMat[:, 2:5], myNewAssments)
# -*- coding: UTF-8 -*-
# kMeans algorithm test
# Runtime environment: python3
from numpy import *
import kMeans

print("loading data...")
dataSet = mat(kMeans.loadDataSet('testSetForKMeans.txt'))

# Cluster into k groups.
k = 4
centroids, clusterAssment = kMeans.kMeans(dataSet, k)

print("show the result...")
kMeans.showCluster(dataSet, k, centroids, clusterAssment)
def plotResult():
    """Run k-means with k=4 on 'testSet.txt'.

    Only the clustering call is visible here; its results are bound to
    locals and not used further.
    """
    samples = mat(kMeans.loadDataSet('testSet.txt'))
    centroids, assignments = kMeans.kMeans(samples, 4)
import kMeans
from numpy import *

dataMat = mat(kMeans.loadDataSet('testSet.txt'))

# Disabled helper checks:
# print min(dataMat[:,0])
#
# print(kMeans.randCent(dataMat,2))
#
# print(kMeans.distEclud(dataMat[0],dataMat[1]))

# Cluster into four groups and print the centroids.  print(...) with one
# argument behaves the same on Python 2 and 3; the bare print statement is
# a SyntaxError on Python 3.
myCentroids, clustAssing = kMeans.kMeans(dataMat, 4)
print(myCentroids)
def test2():
    """Run k-means with k=4 on 'testSet.txt' and print the assignment matrix."""
    samples = np.mat(kMeans.loadDataSet('testSet.txt'))
    centroids, assignments = kMeans.kMeans(samples, 4)
    print(assignments)