def kmeans(samples, k, cutoff):
    """Group ``samples`` into ``k`` clusters by iterative centroid refinement.

    Each round assigns every sample to the cluster whose centroid is nearest
    (as measured by ``get_distance``) and then asks every cluster to update
    itself from its newly assigned samples. Iteration stops once no centroid
    moved by ``cutoff`` or more, or once no centroid moved at all.

    Args:
        samples: Sequence of sample points to cluster.
        k: Number of clusters; must satisfy ``1 <= k <= len(samples)``.
        cutoff: Convergence threshold on the largest centroid movement
            observed in a single round.

    Returns:
        The list of ``k`` ``Cluster`` objects in their final state.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            samples.

    Side effects:
        Prints the number of rounds that were needed once clustering is
        stable.
    """
    n_samples = len(samples)
    if not 1 <= k <= n_samples:
        raise ValueError(
            "k must be between 1 and the number of samples ({}), got {!r}".format(
                n_samples, k
            )
        )

    # Seed every cluster with one randomly drawn sample (drawn without
    # replacement, so the k seeds come from k different positions).
    clusters = [Cluster([seed]) for seed in random.sample(samples, k)]

    n_loop = 0
    while True:
        n_loop += 1

        # One bucket per cluster, collecting the samples assigned this round.
        assigned = [[] for _ in clusters]
        for sample in samples:
            distances = [
                get_distance(sample, cluster.centroid) for cluster in clusters
            ]
            # index() returns the first match, so a tie is resolved in favour
            # of the lowest cluster index.
            assigned[distances.index(min(distances))].append(sample)

        # NOTE(review): a cluster can end up with no samples; update() is then
        # called with an empty list. How Cluster handles that is not visible
        # here -- confirm.
        biggest_shift = 0.0
        for cluster, members in zip(clusters, assigned):
            # update() is expected to return how far the centroid moved.
            biggest_shift = max(biggest_shift, cluster.update(members))

        # biggest_shift is never negative, so comparing against cutoff alone
        # would loop forever for cutoff <= 0. A shift of exactly zero means the
        # centroids no longer move, so stopping there is always safe.
        if biggest_shift < cutoff or biggest_shift == 0:
            print("第{}次迭代后,聚类稳定。".format(n_loop))
            break

    return clusters
def kmeans(samples, k, cutoff):
    """Run k-means clustering over ``samples`` and return the clusters.

    The function repeats two steps until the centroids settle: assign each
    sample to the cluster with the closest centroid (using ``get_distance``),
    then let every cluster update itself from the samples it received.

    Args:
        samples: Sequence of sample points to cluster.
        k: Number of clusters to build; must satisfy
            ``1 <= k <= len(samples)``.
        cutoff: Convergence threshold. Iteration ends when the largest
            centroid movement of a round is below this value, or when no
            centroid moved at all.

    Returns:
        A list with the ``k`` ``Cluster`` objects after the final round.

    Raises:
        ValueError: If ``k`` is below 1 or exceeds the number of samples.

    Side effects:
        Prints how many rounds were executed when the clustering is stable.
    """
    if k < 1 or k > len(samples):
        raise ValueError(
            "k must be in the range 1..{} (number of samples), got {!r}".format(
                len(samples), k
            )
        )

    # Pick k random samples; each one becomes the initial content of a cluster.
    init_samples = random.sample(samples, k)
    clusters = [Cluster([sample]) for sample in init_samples]

    n_loop = 0
    while True:
        n_loop += 1

        # Fresh per-cluster sample lists for this round.
        lists = [[] for _ in clusters]

        for sample in samples:
            cluster_index = 0
            smallest_distance = None
            for index, cluster in enumerate(clusters):
                distance = get_distance(sample, cluster.centroid)
                # Strict comparison: on a tie the earlier cluster is kept.
                if smallest_distance is None or distance < smallest_distance:
                    smallest_distance = distance
                    cluster_index = index
            lists[cluster_index].append(sample)

        # Track the largest centroid movement reported by any cluster.
        # NOTE(review): an entry of `lists` may be empty when no sample is
        # closest to that cluster; what Cluster.update does then cannot be
        # determined from this function -- confirm.
        biggest_shift = 0.0
        for cluster, members in zip(clusters, lists):
            shift = cluster.update(members)
            biggest_shift = max(biggest_shift, shift)

        # `biggest_shift < cutoff` can never hold for cutoff <= 0 because the
        # shift is never negative; treating "nothing moved" as converged keeps
        # such calls from looping forever.
        if biggest_shift < cutoff or biggest_shift == 0:
            print("{} iterate, stable.".format(n_loop))
            break

    return clusters