def kmedoidsWithScores(filenameData, filenameSilhMean, nameDBS, nameCHS, kClusters, measure):
    path = pathlib.Path(str(root) + '\\' + filenameData)
    if path.is_file():
        data = read_sample(path)
        clusters, predicted = kmedoidsRun(data, kClusters, measure)
        meanSilhouetteScore = meanSilh(data, clusters)
        witTXT(meanSilhouetteScore, filenameSilhMean, filepath=root,
               note=filenameData + " k: " + str(kClusters))
        dbsScore = dbs(data, predicted)
        witTXT(dbsScore, nameDBS, filepath=root,
               note=filenameData + " k: " + str(kClusters))
        chsScore = chs(data, predicted)
        witTXT(chsScore, nameCHS, filepath=root,
               note=filenameData + " k: " + str(kClusters))
def kmedoidsWithScores(filenameData, filenameSilhMean, filenameDBS, filenameCHS, kClusters):
    data = read_sample(str(root) + '\\' + filenameData)
    # kClusters = canoc(data, kmin, kmax)
    initial_medoids = randomCenters(len(data), kClusters)
    kmedoids_instance = kmedoids(data, initial_medoids, metric=metricResearch)
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    predicted = kmedoids_instance.predict(data)
    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)
    witTXT(meanSilhouetteScore, filenameSilhMean, filepath=root, note='k: ' + str(kClusters))
    dbsScore = dbs(data, predicted)
    witTXT(dbsScore, filenameDBS, filepath=root, note='k: ' + str(kClusters))
    chsScore = chs(data, predicted)
    witTXT(chsScore, filenameCHS, filepath=root, note='k: ' + str(kClusters))
def kFun(D, X):
    m, n = np.shape(D)
    K = 0
    # decode the binary vector X (most significant bit first) into an integer K
    for i in range(len(X)):
        K += math.pow(2, i) * X[len(X) - 1 - i]
    K = int(K) + 1
    initSet = set()
    curK = K
    if K == 1:
        return 2
    while curK > 0:
        # randomly pick K distinct samples as initial centroids
        randomInt = random.randint(0, m - 1)
        if randomInt not in initSet:
            curK -= 1
            initSet.add(randomInt)
    U = D[list(initSet), :]  # mean vectors, i.e. the centroids
    C = np.zeros(m)
    # compute the distance of every sample to each mean vector
    for i in range(m):
        p = 0
        minDistance = distance(D[i], U[0])
        for j in range(1, K):
            if distance(D[i], U[j]) < minDistance:
                p = j
                minDistance = distance(D[i], U[j])
        C[i] = p
    a = dbs(D, C)
    a = a if a > 0 else 2
    return a
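# A worked sketch of the binary decoding step in kFun above (assumption:
# X is a 0/1 chromosome, most significant bit first, and the cluster count
# is the decoded integer plus one). X_example is a hypothetical input.
import math

X_example = [1, 0, 1]
K_example = 0
for i in range(len(X_example)):
    K_example += math.pow(2, i) * X_example[len(X_example) - 1 - i]
print(int(K_example) + 1)  # 1*4 + 0*2 + 1*1 = 5, so K = 6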
def kmediansWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS, k_clusters, measure, kmin, kmax):
    data = read_sample(str(root) + '\\' + nameData)
    initial_medians = kppi(data, k_clusters).initialize()
    kmedians_instance = kmedians(data, initial_medians)
    kmedians_instance.process()
    clusters = kmedians_instance.get_clusters()
    # final_medians = kmedians_instance.get_medians()
    predicted = kmedians_instance.predict(data)
    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)
    # wlitCSV(silhouetteScore, filenameSilhouette, '', root)
    # witCSV(meanSilhouetteScore, nameSilhouetteMean, '', root)
    dbsScore = dbs(data, predicted)
    # witCSV(dbsScore, nameDBS, '', root)
    chsScore = chs(data, predicted)
    # witCSV(chsScore, nameCHS, '', root)
    elbow_instance = elbow(data, kmin, kmax)
    elbow_instance.process()
    amount_clusters = elbow_instance.get_amount()  # most probable amount of clusters
    wce = elbow_instance.get_wce()
def averFitness(func, X, K, number, maxIter):
    s = []
    for i in range(number):
        # U, C, iter, cluster, dbsLists = func(X, K, maxIter)
        U, C, iter = func(X, K, maxIter)
        s.append(dbs(X, C))
    return max(s), min(s), sum(s) / number
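# Minimal usage sketch for averFitness (assumptions: dbs is bound to
# sklearn's davies_bouldin_score, and the clustering routine passed in
# returns (centroids, labels, iterations) like the Kmeans/kmeans functions
# in this file; the sklearn-based stub below is only a hypothetical stand-in).
import numpy as np
from sklearn.cluster import KMeans as _SKKMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import davies_bouldin_score as dbs

def _sk_kmeans(X, K, maxIter):
    # hypothetical stand-in with the same (U, C, iter) return shape
    model = _SKKMeans(n_clusters=K, max_iter=maxIter, n_init=10).fit(X)
    return model.cluster_centers_, model.labels_, model.n_iter_

X_demo, _ = make_blobs(n_samples=300, centers=4, random_state=0)
worst, best, mean = averFitness(_sk_kmeans, X_demo, 4, number=5, maxIter=100)
print('DBS worst/best/mean:', worst, best, mean)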
def kmeans(data, k, maxIter):
    def _distance(p1, p2):
        """Return the Euclidean distance between two points.
        p1 = np.array([0,0]), p2 = np.array([1,1]) => 1.414
        """
        tmp = np.sum((p1 - p2)**2)
        return np.sqrt(tmp)

    def _rand_center(data, k):
        """Generate k centers within the range of the data set."""
        n = data.shape[1]  # features
        centroids = np.zeros((k, n))  # init with (0,0)...
        for i in range(n):
            dmin, dmax = np.min(data[:, i]), np.max(data[:, i])
            centroids[:, i] = dmin + (dmax - dmin) * np.random.rand(k)
        return centroids

    def _converged(centroids1, centroids2):
        # if the centroids did not change, we say 'converged'
        set1 = set([tuple(c) for c in centroids1])
        set2 = set([tuple(c) for c in centroids2])
        return set1 == set2

    dbsList = [float('inf')]
    n = data.shape[0]  # number of entries
    centroids = _rand_center(data, k)
    label = np.zeros(n, dtype=int)  # track the nearest centroid
    assement = np.zeros(n)  # for the assessment of our model
    converged = False
    curIter = 0
    while not converged:
        curIter += 1
        old_centroids = np.copy(centroids)
        for i in range(n):
            # determine the nearest centroid and track it with label
            min_dist, min_index = np.inf, -1
            for j in range(k):
                dist = _distance(data[i], centroids[j])
                if dist < min_dist:
                    min_dist, min_index = dist, j
                    label[i] = j
            assement[i] = _distance(data[i], centroids[label[i]])**2
        # update centroid
        dbsList.append(dbs(data, label))
        new_centroids = []
        for m in range(k):
            if len(data[label == m]) == 0:
                k -= 1
            else:
                centroids[m] = np.mean(data[label == m], axis=0)
                new_centroids.append(centroids[m])
        centroids = new_centroids
        converged = _converged(old_centroids, centroids)
    dbsList = dbsList + [dbsList[len(dbsList) - 1] for i in range(100 - len(dbsList))]
    print('dbsList', dbsList)
    return centroids, label, dbsList
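# Hypothetical usage of the kmeans above (assumptions: dbs is sklearn's
# davies_bouldin_score, which the loop calls every iteration, and the data
# is a NumPy array; maxIter is accepted but the loop runs to convergence).
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import davies_bouldin_score as dbs

X_blobs, _ = make_blobs(n_samples=300, centers=3, n_features=2, random_state=0)
centroids, label, dbsList = kmeans(X_blobs, 3, 100)
print('final DBS:', dbsList[-1])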
def kcluster(rows, k, maxIter):
    m, n = np.shape(rows)
    # # determine the min and max of every feature to bound the random initialisation
    # ranges = [(min([row[i] for row in rows]), max([row[i] for row in rows]))
    #           for i in range(len(rows[0]))]
    m, dim = np.shape(rows)
    GbestScore, GbestPositon, Curve = BOAK(pop, k, rows)
    U = GbestPositon[0]
    # randomly create k center points
    # clusters = [[random.random() * (ranges[i][1] - ranges[i][0]) + ranges[i][0]
    #              for i in range(len(rows[0]))] for j in range(k)]
    clusters = np.zeros([k, dim])
    for i in range(k):
        clusters[i] = U[i * dim:(i + 1) * dim]
    print('clusters', clusters)
    lastmatches = None
    # iterate up to 100 times; adjust the count to the size of your data
    dbsList = [float('inf')]
    C = np.zeros(m)
    for t in range(100):
        bestmatches = [[] for i in range(k)]
        # find the closest center for every row
        for j in range(len(rows)):
            row = rows[j]
            bestmatch = 0
            for i in range(k):
                d = distance(clusters[i], row)
                if d < distance(clusters[bestmatch], row):
                    bestmatch = i
            C[j] = bestmatch
            bestmatches[bestmatch].append(j)
        # if the assignment is identical to the previous iteration, stop
        if bestmatches == lastmatches:
            break
        lastmatches = bestmatches
        dbsList.append(dbs(rows, C))
        # move each center to the mean position of its members
        for i in range(k):
            avgs = [0.0] * len(rows[0])
            if len(bestmatches[i]) > 0:
                for rowid in bestmatches[i]:
                    for m in range(len(rows[rowid])):
                        avgs[m] += rows[rowid][m]
                for j in range(len(avgs)):
                    avgs[j] /= len(bestmatches[i])
                clusters[i] = avgs
    dbsList = dbsList + [
        dbsList[len(dbsList) - 1] for i in range(100 - len(dbsList))
    ]
    print('dbsList', dbsList)
    return bestmatches, C, dbsList
def run_trial(X, labels, k):
    errors = '"'
    # Run k-means
    start = time()
    """
    if metric == 'seuclidean':
        db = KMeans(eps, minPts, metric=metric, metric_params={'V': V})
    else:
        db = kmean(, minPts, metric=metric)
    """
    db = KMeans(k, n_jobs=12)
    pred_labels = db.fit_predict(X)
    elapsed = time() - start
    try:
        ari_score = ari(pred_labels, labels)
    except Exception as e:
        errors += str(e) + '; '
        ari_score = np.nan
    try:
        nmi_score = nmi(pred_labels, labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nmi_score = np.nan
    try:
        ss_score = ss(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        ss_score = np.nan
    try:
        vrc_score = vrc(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        vrc_score = np.nan
    try:
        dbs_score = dbs(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        dbs_score = np.nan
    errors += '"'
    return [
        k, elapsed, ari_score, nmi_score, ss_score, vrc_score, dbs_score,
        errors
    ]
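# Usage sketch for the KMeans run_trial above (assumptions: the aliases
# below mirror the ones the function expects -- ari/nmi/ss/vrc/dbs are the
# sklearn metrics -- and the installed scikit-learn still accepts the
# n_jobs argument used in KMeans(k, n_jobs=12); drop it on newer versions).
import numpy as np
from time import time
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import (adjusted_rand_score as ari,
                             normalized_mutual_info_score as nmi,
                             silhouette_score as ss,
                             calinski_harabasz_score as vrc,
                             davies_bouldin_score as dbs)

X_trial, y_trial = make_blobs(n_samples=500, centers=5, random_state=0)
for k in range(2, 9):
    print(run_trial(X_trial, y_trial, k))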
def kFun(D, X, K):
    m, dim = np.shape(D)
    result = 0
    # unpack the flat position vector X into K centroid rows
    U = np.zeros([K, dim])
    for i in range(K):
        U[i] = X[i * dim:(i + 1) * dim]
    C = np.zeros(m)
    # compute the distance of every sample to each mean vector
    for i in range(m):
        p = 0
        minDistance = distance(D[i], U[0])
        for j in range(1, K):
            if distance(D[i], U[j]) < minDistance:
                p = j
                minDistance = distance(D[i], U[j])
        C[i] = p
        # result += minDistance
    # return result
    if len(set(C)) == 1:
        return float('inf')
    return dbs(D, C)
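# Minimal sketch of evaluating one flat "position" vector with kFun above
# (assumptions: distance is a Euclidean helper, as it is used throughout
# this file, and dbs is sklearn's davies_bouldin_score).
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import davies_bouldin_score as dbs

def distance(p1, p2):
    # assumed Euclidean distance helper
    return np.sqrt(np.sum((np.array(p1) - np.array(p2)) ** 2))

D_demo, _ = make_blobs(n_samples=200, centers=3, n_features=2, random_state=1)
K_demo = 3
# a candidate position: K centroids flattened into one vector of length K * dim
position = D_demo[np.random.choice(len(D_demo), K_demo, replace=False)].flatten()
print('DBS fitness:', kFun(D_demo, position, K_demo))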
except Exception as e:
    print(e)
    ss_seu = str(np.nan)
try:
    ss_cor = str(ss(X, labels, metric='correlation'))
except Exception as e:
    print(e)
    ss_cor = str(np.nan)
try:
    ss_cos = str(ss(X, labels, metric='cosine'))
except Exception as e:
    print(e)
    ss_cos = str(np.nan)
try:
    vrc_score = str(vrc(X, labels))
except Exception as e:
    print(e)
    vrc_score = str(np.nan)
try:
    dbs_score = str(dbs(X, labels))
except Exception as e:
    print(e)
    dbs_score = str(np.nan)
print(','.join(
    [sys.argv[1], ss_euc, ss_seu, ss_cor, ss_cos, vrc_score, dbs_score]))
        # check whether the centroids changed; if they did, keep iterating, otherwise stop
        for i in range(K):
            newU[i] /= cnt[i]
            for j in range(n):
                if U[i, j] != newU[i, j]:
                    changed = 1
                    U[i, j] = newU[i, j]
        if changed == 0:
            return U, C, maxIter - curIter
    return U, C, maxIter - curIter


U, C, iter = Kmeans(data, 3, 4)
f1 = plt.figure(1)
plt.title('watermelon_4')
plt.xlabel('density')
plt.ylabel('ratio')
plt.scatter(data[:, 0], data[:, 1], marker='o', color='g', s=50)
plt.scatter(U[:, 0], U[:, 1], marker='o', color='r', s=100)
m, n = np.shape(data)
for i in range(m):
    plt.plot([data[i, 0], U[int(C[i]), 0]], [data[i, 1], U[int(C[i]), 1]],
             "c--", linewidth=0.3)
plt.show()

from sklearn.metrics import davies_bouldin_score as dbs
print(dbs(data, C))
# %%
def kmeans(data, K, maxIter):
    m, dim = np.shape(data)
    k = K
    GbestScore, GbestPositon, Curve = BOAK(pop, k, data)
    # GbestPositon = [[4.3, 2, 3.10221011, 0.1, 4.3, 2, 1, 0.1, 4.3, 2, 1, 0.1]]
    U = GbestPositon[0]

    def _distance(p1, p2):
        """Return the Euclidean distance between two points.
        p1 = np.array([0,0]), p2 = np.array([1,1]) => 1.414
        """
        return np.sqrt(np.sum(np.square(np.array(p1) - np.array(p2))))

    def _rand_center(data, k):
        """Generate k centers within the range of the data set."""
        n = data.shape[1]  # features
        centroids = np.zeros((k, n))  # init with (0,0)...
        for i in range(n):
            dmin, dmax = np.min(data[:, i]), np.max(data[:, i])
            centroids[:, i] = dmin + (dmax - dmin) * np.random.rand(k)
        return centroids

    def _converged(centroids1, centroids2):
        # if the centroids did not change, we say 'converged'
        set1 = set([tuple(c) for c in centroids1])
        set2 = set([tuple(c) for c in centroids2])
        return set1 == set2

    dbsList = [float('inf')]
    n = data.shape[0]  # number of entries
    centroids = np.zeros([k, dim])
    for i in range(k):
        centroids[i] = U[i * dim:(i + 1) * dim]
    label = np.zeros(n, dtype=int)  # track the nearest centroid
    assement = np.zeros(n)  # for the assessment of our model
    converged = False
    old_centroids = np.copy(centroids)
    for i in range(n):
        # determine the nearest centroid and track it with label
        min_dist, min_index = np.inf, -1
        for j in range(k):
            dist = _distance(data[i], centroids[j])
            if dist < min_dist:
                min_dist, min_index = dist, j
                label[i] = j
        assement[i] = _distance(data[i], centroids[label[i]])**2
    # update centroid
    dbsList.append(dbs(data, label))
    new_centroids = []
    for m in range(k):
        if len(data[label == m]) == 0:
            k -= 1
        else:
            centroids[m] = np.mean(data[label == m], axis=0)
            new_centroids.append(centroids[m])
    centroids = new_centroids
    converged = _converged(old_centroids, centroids)
    # while not converged:
    #     old_centroids = np.copy(centroids)
    #     for i in range(n):
    #         # determine the nearest centroid and track it with label
    #         min_dist, min_index = np.inf, -1
    #         for j in range(k):
    #             dist = _distance(data[i], centroids[j])
    #             if dist < min_dist:
    #                 min_dist, min_index = dist, j
    #                 label[i] = j
    #         assement[i] = _distance(data[i], centroids[label[i]])**2
    #     # update centroid
    #     dbsList.append(dbs(data, label))
    #     new_centroids = []
    #     for m in range(k):
    #         if len(data[label == m]) == 0:
    #             k -= 1
    #         else:
    #             centroids[m] = np.mean(data[label == m], axis=0)
    #             new_centroids.append(centroids[m])
    #     centroids = new_centroids
    #     converged = _converged(old_centroids, centroids)
    # dbsList = dbsList + [dbsList[len(dbsList) - 1] for i in range(100 - len(dbsList))]
    print('dbsList', dbsList)
    return centroids, label, dbsList
def run_trial(X, labels, eps, minPts, metric, V):
    errors = '"'
    # Run our dbscan
    start = time()
    if metric == 'seuclidean':
        db = DBSCAN(eps, minPts, metric=metric, metric_params={'V': V}, n_jobs=6)
    else:
        db = DBSCAN(eps, minPts, metric=metric, n_jobs=6)
    pred_labels = db.fit_predict(X)
    elapsed = time() - start
    perc_noise = np.sum(pred_labels == -1) / len(pred_labels)
    n_clust = pred_labels.max()
    # Remove noisy points
    clean_idx = np.where(pred_labels != -1)
    nn_preds = pred_labels[clean_idx]
    nn_labels = labels[clean_idx]
    nn_X = X[clean_idx]
    try:
        ari_score = ari(pred_labels, labels)
    except Exception as e:
        errors += str(e) + '; '
        ari_score = np.nan
    try:
        nmi_score = nmi(pred_labels, labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nmi_score = np.nan
    try:
        if metric == 'seuclidean':
            ss_score = ss(X, pred_labels, metric=metric, V=V)
        else:
            ss_score = ss(X, pred_labels, metric=metric)
    except Exception as e:
        errors += str(e) + '; '
        ss_score = np.nan
    try:
        vrc_score = vrc(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        vrc_score = np.nan
    try:
        dbs_score = dbs(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        dbs_score = np.nan
    try:
        nn_ari_score = ari(nn_preds, nn_labels)
    except Exception as e:
        errors += str(e) + '; '
        nn_ari_score = np.nan
    try:
        nn_nmi_score = nmi(nn_preds, nn_labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nn_nmi_score = np.nan
    try:
        if metric == 'seuclidean':
            nn_ss_score = ss(nn_X, nn_preds, metric=metric, V=V)
        else:
            nn_ss_score = ss(nn_X, nn_preds, metric=metric)
    except Exception as e:
        errors += str(e) + '; '
        nn_ss_score = np.nan
    try:
        nn_vrc_score = vrc(nn_X, nn_preds)
    except Exception as e:
        errors += str(e) + '; '
        nn_vrc_score = np.nan
    try:
        nn_dbs_score = dbs(nn_X, nn_preds)
    except Exception as e:
        errors += str(e) + '; '
        nn_dbs_score = np.nan
    errors += '"'
    return [
        metric, eps, minPts, n_clust, perc_noise, elapsed, ari_score,
        nn_ari_score, nmi_score, nn_nmi_score, ss_score, nn_ss_score,
        vrc_score, nn_vrc_score, dbs_score, nn_dbs_score, errors
    ]
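# Usage sketch for the DBSCAN run_trial above (assumptions: V is the
# per-feature variance vector that the 'seuclidean' metric expects, and the
# installed scikit-learn still accepts the positional eps/min_samples
# arguments passed to DBSCAN in the function above).
import numpy as np
from time import time
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import (adjusted_rand_score as ari,
                             normalized_mutual_info_score as nmi,
                             silhouette_score as ss,
                             calinski_harabasz_score as vrc,
                             davies_bouldin_score as dbs)

X_db, y_db = make_blobs(n_samples=500, centers=5, random_state=0)
V_db = np.var(X_db, axis=0)  # variance vector for the standardized Euclidean metric
print(run_trial(X_db, y_db, eps=1.0, minPts=5, metric='euclidean', V=V_db))
print(run_trial(X_db, y_db, eps=1.0, minPts=5, metric='seuclidean', V=V_db))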
def kmedoidsWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS, k_clusters, measure, kmin, kmax):
    data = read_sample(str(root) + '\\' + nameData)
    kClusters = canoc(data, kmin, kmax)
    initial_medoids = rci(data, kClusters).initialize()
    kmedoids_instance = kmedoids(data, initial_medoids)
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    predicted = kmedoids_instance.predict(data)
    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)
    # wlitCSV(silhouetteScore, filenameSilhouette, '', root)
    # witCSV(meanSilhouetteScore, nameSilhouetteMean, '', root)
    dbsScore = dbs(data, predicted)
    # witCSV(dbsScore, nameDBS, '', root)
    chsScore = chs(data, predicted)
    # witCSV(chsScore, nameCHS, '', root)
    # elbow_instance = elbow(data, kmin, kmax)
    # elbow_instance.process()
    # amount_clusters = elbow_instance.get_amount()  # most probable amount of clusters
    # wce = elbow_instance.get_wce()


kmedoidsWithScore(filenameData, filenameSilhouetteMean, filenameDBS, filenameCHS, k, metric, k_min, k_max)
def Kmeans(D, K, maxIter):
    m, n = np.shape(D)
    if K >= m:
        return D

    def _rand_center(data, k):
        n = data.shape[1]  # features
        centroids = np.zeros((k, n))  # init with (0,0)...
        for i in range(n):
            dmin, dmax = np.min(data[:, i]), np.max(data[:, i])
            centroids[:, i] = dmin + (dmax - dmin) * np.random.rand(k)
        return centroids

    U = _rand_center(D, K)
    # initSet = set()
    # curK = K
    # while curK > 0:
    #     # randomly pick K distinct samples
    #     randomInt = random.randint(0, m - 1)
    #     if randomInt not in initSet:
    #         curK -= 1
    #         initSet.add(randomInt)
    C = np.zeros(m)
    curIter = maxIter  # maximum number of iterations
    dbsList = [float('inf')]
    while curIter > 0:
        curIter -= 1
        # compute the distance of every sample to each mean vector
        for i in range(m):
            p = 0
            minDistance = distance(D[i], U[0])
            for j in range(1, K):
                if distance(D[i], U[j]) < minDistance:
                    p = j
                    minDistance = distance(D[i], U[j])
            C[i] = p
        newU = np.zeros((K, n))
        cnt = np.zeros(K)
        for i in range(m):
            newU[int(C[i])] = newU[int(C[i])] + D[i]
            cnt[int(C[i])] += 1
        dbsList.append(dbs(D, C))
        changed = 0
        print('newU', newU)
        print('cnt', cnt)
        # check whether the centroids changed; if they did, keep iterating, otherwise stop
        for i in range(K):
            newU[i] /= cnt[i]
            for j in range(n):
                if U[i, j] != newU[i, j]:
                    changed = 1
                    U[i, j] = newU[i, j]
        if changed == 0:
            cluster = [[D[i] for i, j in enumerate(C) if j == k] for k in range(K)]
            # indexCluster = [[i + 1 for i, j in enumerate(C) if j == k] for k in range(K)]
            lastList = [dbsList[len(dbsList) - 1] for i in range(curIter)]
            dbsList = dbsList + lastList
            return U, C, maxIter - curIter
    cluster = [[D[i] for i, j in enumerate(C) if j == k] for k in range(K)]
    # indexCluster = [[i + 1 for i, j in enumerate(C) if j == k] for k in range(K)]
    return U, C, maxIter - curIter