def test_bin_seeds():
    """Check the bin seeding helper that mean shift can use to pick seeds."""
    # The data set is six points in the plane.
    X = np.array([
        [1., 1.],
        [1.5, 1.5],
        [1.8, 1.2],
        [2., 1.],
        [2.1, 1.1],
        [0., 0.],
    ])

    # Bin coarseness 1.0 with min_bin_freq 1: three bins are expected.
    expected = {(1., 1.), (2., 1.), (0., 0.)}
    seeds = get_bin_seeds(X, 1, 1)
    found = {tuple(seed) for seed in seeds}
    assert_true(not (expected ^ found))

    # Bin coarseness 1.0 with min_bin_freq 2: two bins are expected.
    expected = {(1., 1.), (2., 1.)}
    seeds = get_bin_seeds(X, 1, 2)
    found = {tuple(seed) for seed in seeds}
    assert_true(not (expected ^ found))

    # Bin size 0.01 with min_bin_freq 1: six distinct bins are expected.
    seeds = get_bin_seeds(X, 0.01, 1)
    found = {tuple(seed) for seed in seeds}
    assert_true(len(found) == 6)
def test_bin_seeds():
    # Exercise the bin seeding technique that the mean shift algorithm can
    # use to choose its seeds.

    # The data set is six points in the plane.
    X = np.array([
        [1., 1.],
        [1.4, 1.4],
        [1.8, 1.2],
        [2., 1.],
        [2.1, 1.1],
        [0., 0.],
    ])

    # Bin coarseness 1.0 and min_bin_freq 1: 3 bins should be found.
    expected = {(1., 1.), (2., 1.), (0., 0.)}
    found = set(map(tuple, get_bin_seeds(X, 1, 1)))
    assert not expected.symmetric_difference(found)

    # Bin coarseness 1.0 and min_bin_freq 2: 2 bins should be found.
    expected = {(1., 1.), (2., 1.)}
    found = set(map(tuple, get_bin_seeds(X, 1, 2)))
    assert not expected.symmetric_difference(found)

    # Bin size 0.01 and min_bin_freq 1: 6 bins should be found; the seeds
    # are expected to be the whole data. Warnings raised by the call are
    # recorded instead of being shown.
    with warnings.catch_warnings(record=True):
        all_points = get_bin_seeds(X, 0.01, 1)
    assert_array_almost_equal(all_points, X)

    # Tight clusters around [0, 0] and [1, 1] should yield only two bins.
    blobs, _ = make_blobs(
        n_samples=100,
        n_features=2,
        centers=[[0, 0], [1, 1]],
        cluster_std=0.1,
        random_state=0,
    )
    blob_seeds = get_bin_seeds(blobs, 1)
    assert_array_equal(blob_seeds, [[0, 0], [1, 1]])
def test_bin_seeds():
    # Verify the bin seeding technique usable by the mean shift algorithm.

    def seeds_as_set(data, bin_size, min_bin_freq):
        # Collect the seeds returned by get_bin_seeds as a set of tuples.
        return {tuple(seed) for seed in get_bin_seeds(data, bin_size,
                                                      min_bin_freq)}

    # Six points in the plane make up the data.
    X = np.array([[1., 1.], [1.4, 1.4], [1.8, 1.2],
                  [2., 1.], [2.1, 1.1], [0., 0.]])

    # Coarseness 1.0, min_bin_freq 1 -> the 3 bins below.
    three_bins = {(1., 1.), (2., 1.), (0., 0.)}
    assert seeds_as_set(X, 1, 1) == three_bins

    # Coarseness 1.0, min_bin_freq 2 -> the 2 bins below.
    two_bins = {(1., 1.), (2., 1.)}
    assert seeds_as_set(X, 1, 2) == two_bins

    # Bin size 0.01, min_bin_freq 1 -> 6 bins should be found; the result
    # is expected to be the whole data set. Warnings are recorded rather
    # than emitted while calling get_bin_seeds.
    with warnings.catch_warnings(record=True):
        fallback_seeds = get_bin_seeds(X, 0.01, 1)
    assert_array_almost_equal(fallback_seeds, X)

    # Two tight clusters around [0, 0] and [1, 1] give exactly two bins.
    centers = [[0, 0], [1, 1]]
    samples, _ = make_blobs(n_samples=100, n_features=2, centers=centers,
                            cluster_std=0.1, random_state=0)
    assert_array_equal(get_bin_seeds(samples, 1), [[0, 0], [1, 1]])
def test_mean_shift_zero_bandwidth():
    # Mean shift must still work when the estimated bandwidth is 0.
    X = np.array([1, 1, 1, 2, 2, 2, 3, 3]).reshape(-1, 1)

    # With default arguments, estimate_bandwidth yields 0 for this data.
    bandwidth = estimate_bandwidth(X)
    assert bandwidth == 0

    # A bin_size of 0 should make get_bin_seeds hand back the very same
    # array object it was given.
    seeds = get_bin_seeds(X, bin_size=bandwidth)
    assert seeds is X

    # Binning combined with a 0 estimated bandwidth should be equivalent to
    # running without binning.
    ms_binning = MeanShift(bin_seeding=True, bandwidth=None).fit(X)
    ms_nobinning = MeanShift(bin_seeding=False).fit(X)

    expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2])
    for fitted in (ms_binning, ms_nobinning):
        assert v_measure_score(fitted.labels_, expected_labels) == 1

    assert_allclose(
        ms_binning.cluster_centers_, ms_nobinning.cluster_centers_
    )
from collections import defaultdict

import numpy as np
from numpy import genfromtxt
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import get_bin_seeds
from sklearn.datasets.samples_generator import make_blobs


def bin_points(X, bin_size, min_bin_freq):
    """Bin the rows of X on a grid and return the well-populated cells.

    Each row is divided by ``bin_size`` and converted to ``int32`` (which
    truncates toward zero) to obtain the index of its grid cell; the number
    of rows per cell is counted.

    Parameters
    ----------
    X : iterable of array-like rows
        Points to bin. Each row must support division by ``bin_size``.
    bin_size : number
        Edge length of a grid cell.
    min_bin_freq : int
        Minimum number of points a cell must hold to be returned.

    Returns
    -------
    numpy.ndarray
        One row per retained cell: the integer cell index, built as a
        ``float32`` array, multiplied by ``bin_size``.
    """
    bin_sizes = defaultdict(int)
    for point in X:
        # np.cast was removed in NumPy 2.0; asarray(...).astype(...) is the
        # equivalent conversion and works on older releases too.
        binned_point = np.asarray(point / bin_size).astype(np.int32)
        bin_sizes[tuple(binned_point)] += 1

    # dict.items() exists on both Python 2 and 3 (iteritems() is 2-only).
    bin_seeds = np.array(
        [point for point, freq in bin_sizes.items() if freq >= min_bin_freq],
        dtype=np.float32)
    return bin_seeds * bin_size


# Load the points from a comma-separated file and seed on a grid of size 1,
# keeping every bin that holds at least one point.
X = genfromtxt('results.csv', delimiter=',')
seeds = get_bin_seeds(X, 1, 1)
print(seeds)
# NOTE(review): mean_shift and gaussian_kernel_update are neither defined nor
# imported in the visible part of this file -- confirm where they come from.
print(mean_shift(X, 0.01, seeds, gaussian_kernel_update))
def cluster(points: List[Point], *, bandwidth: float = 2,
            bin_size: float = 0.011):
    """Cluster points with mean shift on their (lon, lat) coordinates.

    The coordinates are binned with ``get_bin_seeds`` to obtain the initial
    seeds, then a ``MeanShift`` model is fitted on all coordinates.

    Parameters
    ----------
    points : list of Point
        Objects exposing ``lon`` and ``lat`` attributes.
    bandwidth : float, keyword-only, default 2
        Bandwidth passed to ``MeanShift``, in the units of the coordinates.
    bin_size : float, keyword-only, default 0.011
        Bin size passed to ``get_bin_seeds``, in the units of the
        coordinates.

    Returns
    -------
    zip
        Pairs ``(label, point)`` that match each entry of the fitted
        model's ``labels_`` with the input point at the same position.
    """
    pts = np.array([[point.lon, point.lat] for point in points])
    seeds = get_bin_seeds(pts, bin_size)
    clustering = MeanShift(bandwidth=bandwidth, seeds=seeds).fit(pts)
    return zip(clustering.labels_, points)