def kmeans_euc(k, news):
    """Run ten fixed rounds of clustering over ``news`` and emit the result.

    ``tfidf(news)`` supplies the matrix and ``start(k, matrix)`` the initial
    centroids. Each round reassigns with ``kernel_euc``, prints
    ``rss(cluster, centroid)`` measured against the centroids used for that
    assignment, and then replaces the centroids with ``new(cluster)``.
    After the last round the clusters are handed to ``output`` together
    with ``token__.labelize(news)``.

    Args:
        k: Forwarded to ``start`` and ``kernel_euc`` (presumably the number
            of clusters -- confirm against those helpers).
        news: Forwarded to ``tfidf`` and ``token__.labelize``.
    """
    matrix = tfidf(news)
    centroid = start(k, matrix)

    # The round count is fixed; there is no convergence test.
    for _ in range(10):
        cluster = kernel_euc(centroid, matrix, k)
        print(rss(cluster, centroid))
        centroid = new(cluster)

    # Only the assignment produced by the final round is reported.
    out = token__.labelize(news)
    output(cluster, matrix, out)
def kernel(centroid, matrix, news):
    """Assign every entry of ``matrix`` to clusters, then print and output.

    Each cluster starts out containing only its own centroid. Entries are
    processed one at a time, and ``centroid`` is recomputed with
    ``new(cluster)`` whenever an entry is added, so later entries are
    measured against the updated centroids.

    NOTE(review): the nesting below was reconstructed from a source whose
    whitespace had been collapsed -- confirm it against the original layout.

    Args:
        centroid: Sequence of starting centroids; rebound locally as the
            clusters grow.
        matrix: Iterated for its keys, each of which is looked up again as
            ``matrix[key]``. The values must be hashable on the
            zero-distance path, where ``set`` is applied to a cluster.
        news: Forwarded to ``token__.labelize`` to build the labels that
            are printed and passed to ``output``.
    """
    cluster = [[seed] for seed in centroid]

    for key in matrix:
        point = matrix[key]
        distance = [euclidean(point, center) for center in centroid]

        if sum(distance) == 0:
            # Every distance sums to zero, so no centroid is preferable:
            # pick a cluster at random. The set round-trip removes
            # duplicates and also discards the member order.
            pick = random.randint(0, len(centroid) - 1)
            cluster[pick].append(point)
            cluster[pick] = list(set(cluster[pick]))
            centroid = new(cluster)
            continue

        # Every centroid tying for the smallest distance receives the point,
        # unless that cluster already contains it.
        nearest = min(distance)
        for idx, dist in enumerate(distance):
            if dist == nearest and point not in cluster[idx]:
                cluster[idx].append(point)
                centroid = new(cluster)

    print(rss(cluster, centroid))
    out = token__.labelize(news)
    print(out)
    output(cluster, matrix, out)