def joint_information(x, y, n_neighbors=3, random_noise=0.3):
    """Estimate the mutual information between two 1-D samples.

    Implements a k-nearest-neighbor (Kraskov-style) estimator: the
    Chebyshev distance to the k-th neighbor in the joint space defines a
    per-point radius, and the number of marginal neighbors within that
    radius feeds the digamma-based estimate.

    Parameters
    ----------
    x, y : ndarray
        One-dimensional samples of equal size.
    n_neighbors : int
        Number of neighbors defining the joint-space radius.
    random_noise : float
        If truthy, amplitude of white noise added to both samples.

    Returns
    -------
    float
        Non-negative mutual-information estimate.
    """
    n_samples = x.size

    # Jitter the samples to break exact ties between repeated values.
    if random_noise:
        x = with_added_white_noise(x, random_noise)
        y = with_added_white_noise(y, random_noise)

    x = x.reshape((-1, 1))
    y = y.reshape((-1, 1))
    joint = np.hstack((x, y))

    # Here we rely on NearestNeighbors to select the fastest algorithm.
    nn = NearestNeighbors(metric='chebyshev', n_neighbors=n_neighbors)
    nn.fit(joint)
    distances = nn.kneighbors()[0]
    # Step the k-th neighbor distance one ulp toward zero so the radius
    # queries below count strictly closer points only.
    radius = np.nextafter(distances[:, -1], 0)

    # Algorithm is selected explicitly to allow passing an array as radius
    # later (not all algorithms support this).
    nn.set_params(algorithm='kd_tree')

    nn.fit(x)
    neighborhoods = nn.radius_neighbors(radius=radius, return_distance=False)
    nx = np.array([hood.size for hood in neighborhoods])

    nn.fit(y)
    neighborhoods = nn.radius_neighbors(radius=radius, return_distance=False)
    ny = np.array([hood.size for hood in neighborhoods])

    mi = (digamma(n_samples) + digamma(n_neighbors)
          - np.mean(digamma(nx + 1))
          - np.mean(digamma(ny + 1)))
    # The estimator can dip slightly below zero; clamp it.
    return max(0, mi)
def kmeanspp(X, k, seed):
    """Select ``k`` initial center *indices* from ``X`` via k-means++.

    Runs scikit-learn's k-means++ seeding (which returns centroid
    coordinates coinciding with data points) and maps each centroid back
    to the index of a row of ``X`` at distance 0 from it.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data matrix.
    k : int
        Number of centers to select.
    seed : int or RandomState
        Random state forwarded to the k-means++ initialisation.

    Returns
    -------
    ndarray
        Sorted array of ``k`` distinct row indices into ``X``.

    Raises
    ------
    Exception
        If a centroid cannot be matched to a not-yet-used row index
        (theoretically impossible; indicates a bug).
    """
    # That we need to do this is a bug in _init_centroids
    x_squared_norms = row_norms(X, squared=True)
    # Use k-means++ to initialise the centroids
    centroids = _init_centroids(X, k, 'k-means++', random_state=seed,
                                x_squared_norms=x_squared_norms)
    # OK, we should just short-circuit and get these from k-means++...
    # quick and dirty solution: find, for each centroid, every data
    # point lying exactly on it (distance 0).
    nns = NearestNeighbors()
    nns.fit(X)
    candidate_lists = nns.radius_neighbors(X=centroids, radius=0,
                                           return_distance=False)
    # Account for "degenerate" solutions: several voxels at distance 0,
    # each becoming a centroid.  Greedily give each centroid an index
    # not already claimed by a previous one.
    indices = set()
    for candidates in candidate_lists:
        available = set(candidates) - indices
        if not available:
            # Fixed message: the original implicit concatenation lacked a
            # separating space ("centers;theoretically").
            raise Exception('Cannot get an unambiguous set of centers; '
                            'theoretically this cannot happen, '
                            'so check for bugs')
        indices.add(available.pop())
    return np.array(sorted(indices))