def streaming_kmeans(points, k=10, num_iters=10, num_ballkmeans_runs=2, trim_factor=0.9,
                     test_probability=0.1, correct_weight=False):
    '''
    Cluster data points using the streaming kmeans method.

    Each tile of ``points`` is first summarized by ``_streaming_mapper``; the
    per-tile centroids are gathered into one list, reduced to ``k`` centroids
    by ``ball_kmeans``, and finally every point is processed by
    ``_cluster_mapper`` against the resulting centers.

    NOTE(review): a function with this same name is defined again later in
    this file; if both stay, the later definition is the one callers get.

    Args:
      points(DistArray): data points to be clustered.
      k(int): the final number of clusters.
      num_iters(int): the number of iterations to run in each ball kmeans run.
      num_ballkmeans_runs(int): the number of ball kmeans to run.
      trim_factor(float): the ball kmeans parameter to separate the nearest
        points and distant points.
      test_probability(float): the percentage of points to be chosen as test set.
      correct_weight(bool): whether to correct the weights of the centroids.

    Returns:
      The result of ``expr.shuffle`` over ``points`` with ``_cluster_mapper``,
      hinted to have shape ``(points.shape[0],)`` -- presumably one cluster
      label per point; confirm against ``_cluster_mapper``.
    '''
    per_tile = expr.tile_operation(points, _streaming_mapper, kw={'k': k}).evaluate()

    # Flatten tile -> list of centroid groups -> centroids into one flat list.
    candidates = [
        centroid
        for tile_result in per_tile.values()
        for centroid_group in tile_result
        for centroid in centroid_group
    ]

    final_centroids = ball_kmeans(candidates, k, num_iters, num_ballkmeans_runs,
                                  trim_factor, test_probability, correct_weight)

    # One row per final cluster, one column per feature of the input points.
    num_features = points.shape[1]
    centers = np.zeros((k, num_features))
    for row in range(k):
        centers[row] = final_centroids[row].get_center()

    num_points = points.shape[0]
    return expr.shuffle(points, _cluster_mapper, kw={'centers': centers},
                        shape_hint=(num_points,))
def canopy_cluster(points, t1=0.1, t2=0.1, cf=1):
    '''
    A simple implementation of the canopy clustering method.

    ``_canopy_mapper`` is run over every tile of ``points``, the per-tile
    results are combined into centers by ``find_centers``, and the points are
    then processed by ``_cluster_mapper`` against those centers.

    NOTE(review): a function with this same name is defined again later in
    this file; if both stay, the later definition is the one callers get.

    Args:
      points(Expr or DistArray): the input data points matrix.
      t1(float): distance threshold between a center point and the points
        within a canopy.
      t2(float): second distance threshold, forwarded alongside ``t1``.
        NOTE(review): how ``t1`` and ``t2`` differ is decided inside
        ``_canopy_mapper`` / ``find_centers`` and is not visible here.
      cf(int): the minimum canopy size.

    Returns:
      The result of ``expr.shuffle`` over ``points`` with ``_cluster_mapper``,
      hinted to have shape ``(points.shape[0],)``.
    '''
    canopy_kw = {'t1': t1, 't2': t2, 'cf': cf}
    tile_canopies = expr.tile_operation(points, _canopy_mapper, kw=canopy_kw).evaluate()

    centers = find_centers(tile_canopies.values(), t1, t2, cf)

    return expr.shuffle(points, _cluster_mapper, kw={'centers': centers},
                        shape_hint=(points.shape[0],))
def streaming_kmeans(points, k=10, num_iters=10, num_ballkmeans_runs=2, trim_factor=0.9,
                     test_probability=0.1, correct_weight=False):
    '''
    Cluster data points using the streaming kmeans method.

    The work happens in three stages:
      1. ``_streaming_mapper`` is applied to each tile of ``points`` and the
         resulting per-tile centroids are collected into a single list.
      2. ``ball_kmeans`` reduces the collected centroids to ``k`` centroids.
      3. ``_cluster_mapper`` is shuffled over ``points`` with the centers of
         those ``k`` centroids.

    Args:
      points(DistArray): data points to be clustered.
      k(int): the final number of clusters.
      num_iters(int): the number of iterations to run in each ball kmeans run.
      num_ballkmeans_runs(int): the number of ball kmeans to run.
      trim_factor(float): the ball kmeans parameter to separate the nearest
        points and distant points.
      test_probability(float): the percentage of points to be chosen as test set.
      correct_weight(bool): whether to correct the weights of the centroids.

    Returns:
      The result of ``expr.shuffle`` over ``points`` with ``_cluster_mapper``,
      hinted to have shape ``(points.shape[0],)`` -- presumably one cluster
      label per point; confirm against ``_cluster_mapper``.
    '''
    # Stage 1: summarize every tile, then gather all tile centroids together.
    mapper_kw = {'k': k}
    tile_results = expr.tile_operation(points, _streaming_mapper, kw=mapper_kw).evaluate()

    gathered = []
    for groups in tile_results.values():
        for group in groups:
            gathered.extend(group)

    # Stage 2: collapse the gathered centroids down to the final clusters.
    reduced = ball_kmeans(gathered, k, num_iters, num_ballkmeans_runs,
                          trim_factor, test_probability, correct_weight)

    # Stage 3: pack the centers into a (k, n_features) matrix for the mapper.
    centers = np.zeros((k, points.shape[1]))
    for idx in range(k):
        centers[idx, :] = reduced[idx].get_center()

    return expr.shuffle(points,
                        _cluster_mapper,
                        kw={'centers': centers},
                        shape_hint=(points.shape[0], ))
def canopy_cluster(points, t1=0.1, t2=0.1, cf=1):
    '''
    A simple implementation of the canopy clustering method.

    Runs ``_canopy_mapper`` on each tile of ``points`` (materialized with
    ``force()``), derives the centers from the per-tile results with
    ``find_centers``, and then shuffles ``_cluster_mapper`` over ``points``
    using those centers.

    Args:
      points(Expr or DistArray): the input data points matrix.
      t1(float): distance threshold between a center point and the points
        within a canopy.
      t2(float): second distance threshold, forwarded alongside ``t1``.
        NOTE(review): how ``t1`` and ``t2`` differ is decided inside
        ``_canopy_mapper`` / ``find_centers`` and is not visible here.
      cf(int): the minimum canopy size.

    Returns:
      The result of ``expr.shuffle`` over ``points`` with ``_cluster_mapper``,
      hinted to have shape ``(points.shape[0],)``.
    '''
    mapped = expr.tile_operation(
        points,
        _canopy_mapper,
        kw={'t1': t1, 't2': t2, 'cf': cf},
    )
    per_tile_points = mapped.force()

    tile_values = per_tile_points.values()
    centers = find_centers(tile_values, t1, t2, cf)

    output_shape = (points.shape[0], )
    labels = expr.shuffle(points, _cluster_mapper, kw={'centers': centers},
                          shape_hint=output_shape)
    return labels