def test_k_and_radius_neighbors_duplicates():
    # Test behavior of kneighbors when duplicates are present in query
    for algorithm in ALGORITHMS:
        nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm)
        nn.fit([[0], [1]])

        # Do not do anything special to duplicates.
        kng = nn.kneighbors_graph([[0], [1]], mode='distance')
        assert_array_equal(kng.A, np.array([[0., 0.], [0., 0.]]))
        assert_array_equal(kng.data, [0., 0.])
        assert_array_equal(kng.indices, [0, 1])

        dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5)
        check_object_arrays(dist, [[0, 1], [1, 0]])
        check_object_arrays(ind, [[0, 1], [0, 1]])

        rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5)
        assert_array_equal(rng.A, np.ones((2, 2)))

        rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5,
                                        mode='distance')
        assert_array_equal(rng.A, [[0, 1], [1, 0]])
        assert_array_equal(rng.indices, [0, 1, 0, 1])
        assert_array_equal(rng.data, [0, 1, 1, 0])

        # Mask the first duplicates when n_duplicates > n_neighbors.
        X = np.ones((3, 1))
        nn = neighbors.NearestNeighbors(n_neighbors=1)
        nn.fit(X)
        dist, ind = nn.kneighbors()
        assert_array_equal(dist, np.zeros((3, 1)))
        assert_array_equal(ind, [[1], [0], [1]])

        # Test that zeros are explicitly marked in kneighbors_graph.
        kng = nn.kneighbors_graph(mode='distance')
        assert_array_equal(kng.A, np.zeros((3, 3)))
        assert_array_equal(kng.data, np.zeros(3))
        assert_array_equal(kng.indices, [1., 0., 1.])
        assert_array_equal(
            nn.kneighbors_graph().A,
            np.array([[0., 1., 0.], [1., 0., 0.], [0., 1., 0.]]))
def test_callable_metric():

    def custom_metric(x1, x2):
        return np.sqrt(np.sum(x1 ** 2 + x2 ** 2))

    X = np.random.RandomState(42).rand(20, 2)
    nbrs1 = neighbors.NearestNeighbors(3, algorithm='auto',
                                       metric=custom_metric)
    nbrs2 = neighbors.NearestNeighbors(3, algorithm='brute',
                                       metric=custom_metric)

    nbrs1.fit(X)
    nbrs2.fit(X)
    dist1, ind1 = nbrs1.kneighbors(X)
    dist2, ind2 = nbrs2.kneighbors(X)

    assert_array_almost_equal(dist1, dist2)
def __init__(self, dataset, soft_encoding=True):
    self.dataset = dataset
    self.kernels = bins_centers
    self.L_normalize = 100
    self.kernel_normalize = np.max(np.abs(self.kernels))
    self.num_bins = len(self.kernels)
    self.neighborhood = knn.NearestNeighbors(n_neighbors=5).fit(
        self.kernels / self.kernel_normalize)
    self.soft_encoding = soft_encoding
    self.device = torch.device('cuda:0')
def nearest_neighbors(src, dst):
    model = neighbors.NearestNeighbors(n_neighbors=1)
    # feed dst points to kd-tree
    model.fit(dst)
    # search nearest points of src in dst
    distances, indices = model.kneighbors(src)
    return distances, indices
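A minimal usage sketch for the correspondence helper above; the toy `src`/`dst` arrays are my own illustration, and the function's module-level imports (`numpy as np`, `sklearn.neighbors`) are assumed in scope.

import numpy as np

# Match each src point to its single closest dst point (e.g. for ICP-style
# correspondence search).
rng = np.random.RandomState(0)
dst = rng.rand(100, 3)       # reference point cloud
src = dst[:10] + 0.01        # slightly perturbed query points
distances, indices = nearest_neighbors(src, dst)
print(distances.shape, indices.shape)  # both (10, 1)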
def apply_mask_and_get_affinity(seeds, niimg, radius, allow_overlap,
                                n_jobs=1, mask_img=None):
    seeds = list(seeds)
    affine = niimg.affine

    # Compute world coordinates of all in-mask voxels.
    mask_img = check_niimg_3d(mask_img)
    mask_img = image.resample_img(mask_img, target_affine=affine,
                                  target_shape=niimg.shape[:3],
                                  interpolation='nearest')
    mask, _ = masking._load_mask_img(mask_img)
    mask_coords = list(zip(*np.where(mask != 0)))
    X = masking._apply_mask_fmri(niimg, mask_img)

    # For each seed, get coordinates of the nearest voxel
    nearests = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(seed_nearest)(seed_chunk, affine, mask_coords)
        for thread_id, seed_chunk in enumerate(np.array_split(seeds, n_jobs)))
    nearests = [i for j in nearests for i in j]

    mask_coords = np.asarray(list(zip(*mask_coords)))
    mask_coords = coord_transform(mask_coords[0], mask_coords[1],
                                  mask_coords[2], affine)
    mask_coords = np.asarray(mask_coords).T

    clf = neighbors.NearestNeighbors(radius=radius)
    A = clf.fit(mask_coords).radius_neighbors_graph(seeds)
    A = A.tolil()
    for i, nearest in enumerate(nearests):
        if nearest is None:
            continue
        A[i, nearest] = True

    # Include the voxel containing the seed itself if not masked
    mask_coords = mask_coords.astype(int).tolist()
    for i, seed in enumerate(seeds):
        try:
            A[i, mask_coords.index(seed)] = True
        except ValueError:
            # seed is not in the mask
            pass

    if not allow_overlap:
        if np.any(A.sum(axis=0) >= 2):
            raise ValueError('Overlap detected between spheres')

    return X, A
def get_transition_matrix(self, k=10, ann=False):
    """
    Implementation of the transition matrix.

    :param k: number of neighbors for each node.
    :param ann: if True, use approximate nearest neighbors (LSH).
    """
    # kNN
    if not ann:
        nbrs = neighbors.NearestNeighbors(n_neighbors=k, metric='euclidean',
                                          n_jobs=-1).fit(self.data)
        distances, indices = nbrs.kneighbors(self.data)
    else:
        # needs improvement
        lshf = neighbors.LSHForest(n_neighbors=3 * k).fit(self.data)
        distances, indices = lshf.kneighbors(self.data, n_neighbors=k)

    N = self.shape[0]
    sqdistances = np.square(distances)
    sigmas = distances[:, -1]
    self.sigmas = sigmas

    # kernel matrix
    sigs_mul = np.multiply.outer(sigmas, sigmas)
    kernel_matrix = np.zeros((N, k))
    for i in range(N):
        kernel_matrix[i, :] = np.exp(
            -np.divide(sqdistances[i, :], 2 * (sigs_mul[i, indices[i, :]])))
    weights = kernel_matrix

    indptr = range(0, (N + 1) * k, k)
    weight_matrix = sparse.csr_matrix(
        (weights.flatten(), indices.flatten(), indptr),
        shape=(N, N)).toarray()

    # symmetrize
    for i, col in enumerate(indices):
        for j in col:
            if i not in set(indices[j]):
                weight_matrix[j, i] = weight_matrix[i, j]

    weight_sum = np.power(weight_matrix.sum(axis=0)[:, None],
                          -1 / 2).flatten()
    weight_sum = np.diag(weight_sum)
    weight_matrix = weight_sum @ weight_matrix @ weight_sum
    M = np.divide(weight_matrix, weight_matrix.sum(axis=1)[:, None])

    mevals, mevecs = sp.linalg.eigh(M)
    self.M = M
    self.mevals = mevals
    self.mevecs = mevecs
    self.indices = indices
    return M
def test_valid_brute_metric_for_auto_algorithm():
    X = rng.rand(12, 12)
    Xcsr = csr_matrix(X)

    # check that there is a metric that is valid for brute
    # but not ball_tree (so we actually test something)
    assert_in("cosine", VALID_METRICS['brute'])
    assert_false("cosine" in VALID_METRICS['ball_tree'])

    # Metrics which require additional parameters
    require_params = ['mahalanobis', 'wminkowski', 'seuclidean']
    for metric in VALID_METRICS['brute']:
        if metric != 'precomputed' and metric not in require_params:
            nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                            metric=metric).fit(X)
            nn.kneighbors(X)
        elif metric == 'precomputed':
            X_precomputed = rng.random_sample((10, 4))
            Y_precomputed = rng.random_sample((3, 4))
            DXX = metrics.pairwise_distances(X_precomputed,
                                             metric='euclidean')
            DYX = metrics.pairwise_distances(Y_precomputed, X_precomputed,
                                             metric='euclidean')
            nb_p = neighbors.NearestNeighbors(n_neighbors=3)
            nb_p.fit(DXX)
            nb_p.kneighbors(DYX)

    for metric in VALID_METRICS_SPARSE['brute']:
        if metric != 'precomputed' and metric not in require_params:
            nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                            metric=metric).fit(Xcsr)
            nn.kneighbors(Xcsr)

    # Metrics with a parameter
    VI = np.dot(X, X.T)
    list_metrics = [('seuclidean', dict(V=rng.rand(12))),
                    ('wminkowski', dict(w=rng.rand(12))),
                    ('mahalanobis', dict(VI=VI))]
    for metric, params in list_metrics:
        nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                        metric=metric,
                                        metric_params=params).fit(X)
        nn.kneighbors(X)
def test_non_euclidean_kneighbors():
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)

    # Find a reasonable radius.
    dist_array = pairwise_distances(X).flatten()
    dist_array.sort()  # sort in place; np.sort alone would discard the result
    radius = dist_array[15]

    # Test kneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.kneighbors_graph(
            X, 3, metric=metric, mode='connectivity',
            include_self=True).toarray()
        nbrs1 = neighbors.NearestNeighbors(3, metric=metric).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray())

    # Test radius_neighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.radius_neighbors_graph(
            X, radius, metric=metric, mode='connectivity',
            include_self=True).toarray()
        nbrs1 = neighbors.NearestNeighbors(metric=metric,
                                           radius=radius).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).A)

    # Raise error when wrong parameters are supplied
    X_nbrs = neighbors.NearestNeighbors(3, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3,
                  metric='euclidean')
    X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs,
                  radius, metric='euclidean')
def median_kneighbour_distance(X, k=5):
    """Calculate the median distance between a set of random datapoints and
    their kth nearest neighbours. This is a heuristic for setting the
    kernel length scale."""
    N_all = X.shape[0]
    N_subset = min(N_all, 2000)
    sample_idx_train = np.random.permutation(N_all)[:N_subset]
    nn = neighbors.NearestNeighbors(k)
    nn.fit(X[sample_idx_train, :])
    d, idx = nn.kneighbors(X[sample_idx_train, :])
    return np.median(d[:, -1])
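The docstring above frames this as a length-scale heuristic. A hedged sketch of that use, plugging the median into scikit-learn's RBF kernel; the RBF import and toy data are my illustration, not from the original source.

import numpy as np
from sklearn.gaussian_process.kernels import RBF

X = np.random.RandomState(0).rand(500, 3)
length_scale = median_kneighbour_distance(X, k=5)  # function defined above
kernel = RBF(length_scale=length_scale)
K = kernel(X[:10])  # 10x10 Gram matrix built with the heuristic scale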
def __init__(self, NN, sigma, km_filepath='', cc=-1):
    if check_value(cc, -1):
        self.cc = np.load(km_filepath)
    else:
        self.cc = cc
    self.K = self.cc.shape[0]
    self.NN = int(NN)
    self.sigma = sigma
    self.nbrs = nn.NearestNeighbors(
        n_neighbors=NN, algorithm='ball_tree').fit(self.cc)
    self.alreadyUsed = False
def finalize(self):
    if self.nn is None:
        if not self.observations:
            raise Exception('no observations')
        if not self.observations[0][0]:
            # 0-length vectors are not allowed
            xs = [[0] for _ in self.observations]
        else:
            xs = [x for (x, _) in self.observations]
        self.nn = neighbors.NearestNeighbors()
        self.nn.fit(xs)
def __init__(self, split):
    self.split = split
    self.folder = 'data/{}'.format(split)
    self.files = [f for f in os.listdir(self.folder)
                  if f.lower().endswith('.jpeg')]
    # Load the array of quantized ab values
    q_ab = np.load("data/pts_in_hull.npy")
    self.nb_q = q_ab.shape[0]
    # Fit a NN to q_ab
    self.nn_finder = nn.NearestNeighbors(
        n_neighbors=nb_neighbors, algorithm='ball_tree').fit(q_ab)
def buildTree(self):
    arr = np.array(self.values)
    if self.se2:
        arr = np.reshape(arr, [-1, 6])
    else:
        arr = np.reshape(arr, [-1, 7])
    self.nn = sk.NearestNeighbors(n_jobs=-1, algorithm='brute',
                                  leaf_size=100, metric=self.metric)
    self.nn.fit(arr)
def knn_diversity_stats(training_set, generated_imgs, k=3):
    """
    Find the k nearest neighbours (k=3 by default) of each generated image
    in the training set and return the average distance.
    :param array training_set: the training set of images in which we
        search for the nearest neighbours
    :param array generated_imgs: the images whose nearest neighbours we
        wish to find
    """
    knn = neighbors.NearestNeighbors(n_neighbors=k)
    knn.fit(training_set, y=np.zeros(shape=(len(training_set),)))
    dists, idxs = knn.kneighbors(generated_imgs)
    return np.average(dists)
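A toy call of the diversity statistic above, with random vectors standing in for flattened images; the data here is purely illustrative.

import numpy as np

train = np.random.RandomState(0).rand(200, 64)     # 200 flattened 'images'
generated = np.random.RandomState(1).rand(20, 64)  # 20 generated samples
score = knn_diversity_stats(train, generated, k=3)
print(score)  # larger values mean samples sit farther from the training set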
def __k_neighbor(k_n, algm, feature):
    """
    k neighbor will compute the k-Nearest Neighbors sklearn algorithm
    :param k_n: int, number of nearest neighbors; '2' set as default
    :param algm: str, name of k-NN's algorithm choice; 'auto' set as default
    :param feature: np.array, np.array object with column features
    :return: tuple, (distances, indices) numpy arrays, one row per sample
    """
    n_neighbor = neighbors.NearestNeighbors(n_neighbors=k_n, algorithm=algm)
    model_fit = n_neighbor.fit(feature)
    return model_fit.kneighbors(feature)
def quantize(inputs, to_points, to_points_one_hot_encoded):
    nbrs = nn.NearestNeighbors(n_neighbors=10,
                               algorithm='auto').fit(to_points)
    dists, indices = nbrs.kneighbors(inputs)
    end_points = np.zeros((inputs.shape[0], to_points.shape[0]))
    sigma = 5.0
    # Gaussian-weighted soft assignment over the 10 nearest points
    wts = np.exp(-dists ** 2 / (2 * sigma ** 2))
    wts = wts / np.sum(wts, axis=1)[:, np.newaxis]
    end_points[np.arange(0, inputs.shape[0], dtype='int')[:, np.newaxis],
               indices] = wts
    return end_points
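A small sketch exercising quantize on a 2-D grid; note the third parameter is never read in the body above, so None is passed here. The grid and query points are my own toy construction.

import numpy as np
from sklearn import neighbors as nn  # matching the alias used above

grid = np.stack(np.meshgrid(np.arange(5.), np.arange(5.)), -1).reshape(-1, 2)
pts = np.random.RandomState(0).rand(7, 2) * 4
enc = quantize(pts, grid, None)
print(enc.shape)        # (7, 25)
print(enc.sum(axis=1))  # each row sums to 1: a soft one-hot encoding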
def knnThing(matrix, labels):
    animals = []
    print "\033[1mYou are going to pick 5 animals that are similar to your chosen animal. Pick different animals each time and do not pick your chosen animal itself.\033[0m\n"
    inp = -1
    while len(animals) < 5:
        while inp < 0 or inp >= 50 or inp in animals:
            user_input = raw_input(
                "\nGive the index of one of the following animals:\n"
                + printAnimalOptions(labels, animals)) + "\n"
            try:
                inp = int(user_input)
                if inp < 0 or inp >= 50:
                    print "Please give a value between 0 and 49."
                elif inp in animals:
                    print labels[inp] + " has already been selected!"
            except ValueError:
                print "Please enter only an integer."
        print "Ok, " + labels[inp] + " saved."
        animals += [inp]
        inp = -1

    counts = dict()
    animals.sort(reverse=True)
    animalRows = []
    for animal in animals:
        animalRows += [[matrix[animal]]]
    matrix = np.delete(matrix, animals, axis=0)
    labels = np.delete(labels, animals, axis=0)

    friendos = neighbors.NearestNeighbors(n_neighbors=5)
    friendos.fit(matrix)
    for animalRow in animalRows:
        fiveClosest = friendos.kneighbors(animalRow,
                                          return_distance=False)[0]
        for closeAnimal in fiveClosest:
            if closeAnimal not in counts:
                counts[closeAnimal] = 1
            else:
                counts[closeAnimal] += 1
    maxInd = max(counts, key=counts.get)
    print "We predict your animal was " + labels[maxInd] + "."
    return
def split_with_wasserstein(texts, test_set_size, no_of_trials, min_df,
                           leaf_size):
    """Finds test sets by maximizing Wasserstein distances among the given texts.

    This separates the given texts into training/dev and test sets based on
    an approximate Wasserstein method. First all texts are indexed in a
    nearest neighbors structure. Then a new test centroid is sampled
    randomly, from which the nearest neighbors in Wasserstein space are
    extracted. Those constitute the new test set. Similarity is computed
    based on document-term counts.

    Args:
      texts: Texts to split into training/dev and test sets.
      test_set_size: Number of elements the new test set should contain.
      no_of_trials: Number of test sets requested.
      min_df: Mainly for speed-up and memory efficiency. All tokens must
        occur at least this many times to be considered in the Wasserstein
        computation.
      leaf_size: Leaf size parameter of the nearest neighbor search. Set
        high values for slower, but less memory-heavy computation.

    Returns:
      A list of test set indices, one for each trial. The indices
      correspond to the items in `texts` that should be part of the test
      set.
    """
    vectorizer = feature_extraction.text.CountVectorizer(
        dtype=np.int8, min_df=min_df)
    logging.info('Creating count vectors.')
    text_counts = vectorizer.fit_transform(texts)
    text_counts = text_counts.todense()
    logging.info('Count vector shape %s.', text_counts.shape)
    logging.info('Creating tree structure.')
    nn_tree = neighbors.NearestNeighbors(
        n_neighbors=test_set_size,
        algorithm='ball_tree',
        leaf_size=leaf_size,
        metric=stats.wasserstein_distance)
    nn_tree.fit(text_counts)
    logging.info('Sampling test sets.')
    test_set_indices = []
    for trial in range(no_of_trials):
        logging.info('Trial set: %d.', trial)
        # Sample a random test centroid.
        sampled_point = np.random.randint(
            text_counts.max().max() + 1, size=(1, text_counts.shape[1]))
        nearest_neighbors = nn_tree.kneighbors(sampled_point,
                                               return_distance=False)
        # We queried for only one datapoint.
        nearest_neighbors = nearest_neighbors[0]
        logging.info(nearest_neighbors[:10])
        test_set_indices.append(nearest_neighbors)
    return test_set_indices
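An illustrative invocation of the splitter above on a toy corpus; parameter values are arbitrary, and the function's own module-level imports (numpy, scipy.stats, sklearn.feature_extraction, logging) are assumed in scope.

texts = ['the cat sat', 'the dog ran', 'a cat and a dog', 'birds fly high',
         'the cat ran', 'dogs and cats'] * 10
trials = split_with_wasserstein(
    texts, test_set_size=5, no_of_trials=2, min_df=1, leaf_size=40)
test_sets = [[texts[i] for i in test_idx] for test_idx in trials]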
def dist(self, X, Y=None):
    if Y is X or Y is None:
        d = neighbors.kneighbors_graph(X, self.k, mode='distance',
                                       metric=self.metric)
    else:
        n = neighbors.NearestNeighbors(metric=self.metric)
        n.fit(Y)
        d = n.kneighbors_graph(X, self.k, mode='distance')
    return d.toarray()  # since we can't deal with sparse matrices so far
def fit_history(self, udf_metric, n_cases):
    """
    Args:
        udf_metric: user-defined metric object
        n_cases: int
    """
    self.neigh = neighbors.NearestNeighbors(n_neighbors=n_cases,
                                            metric=udf_metric)
    self.neigh.fit(self.normalized_history)
    self.indices = self.neigh.kneighbors(self.normalized_current,
                                         return_distance=False)[0]
def laplacian_matrix(data, k):
    """
    :param data: array containing the data points
    :param k: the number of neighbors considered (the distance metric is
        cosine, and the weights are measured by cosine similarity)
    :return: the symmetric normalized graph Laplacian
    """
    # Alternative implementation calling MATLAB's lapgraph:
    # import matlab.engine as ME
    # import matlab
    # engine = ME.start_matlab()
    # options = dict()
    # options.update({'k': 10})
    # options.update({'NeighborMode': 'KNN'})
    # # options.update({'Metric': 'Cosine'})
    # options.update({'WeightMode': 'Cosine'})
    # options.update({'Metric': 'Euclidean'})
    # options.update({'WeightMode': 'HeatKernel'})
    # options.update({'t': 1.0})
    # sim = np.array(engine.lapgraph(matlab.double(data.tolist()), options))
    # S = [np.sum(row) for row in sim]
    # for i in range(len(sim)):
    #     sim[i] = [sim[i][j] / (S[i] * S[j]) ** 0.5 for j in range(len(sim))]
    # L = np.identity(len(sim)) - sim
    # return L

    nn_model = neighbors.NearestNeighbors(n_neighbors=k, algorithm='brute',
                                          metric='cosine')
    nn_model.fit(data)
    dist, nn_indices = nn_model.kneighbors(return_distance=True)

    # cosine similarity = 1 - cosine distance, symmetrized over the kNN graph
    sim = np.zeros((len(data), len(data)))
    for ins_index in range(len(sim)):
        dist_row = dist[ins_index]
        nn_row = nn_indices[ins_index]
        for dist_value, ind_index in zip(dist_row, nn_row):
            sim[ins_index][ind_index] = 1.0 - dist_value
            sim[ind_index][ins_index] = 1.0 - dist_value
    for i in range(len(sim)):
        sim[i][i] = 1.0

    # symmetric normalization: sim_ij / sqrt(S_i * S_j)
    S = [np.sum(row) for row in sim]
    for i in range(len(sim)):
        sim[i] = [sim[i][j] / (S[i] * S[j]) ** 0.5 for j in range(len(sim))]
    L = np.identity(len(sim)) - sim
    return L
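A hedged follow-on sketch: since the function returns the symmetric normalized Laplacian L = I - D^(-1/2) W D^(-1/2), its bottom eigenvectors give a spectral embedding. The eigensolver call and the choice to skip the first eigenvector are my illustration, not part of the original source.

import numpy as np

data = np.random.RandomState(0).rand(60, 8)
L = laplacian_matrix(data, k=5)
evals, evecs = np.linalg.eigh(L)   # eigenvalues in ascending order
embedding = evecs[:, 1:4]          # skip the near-constant first eigenvector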
def predict_NearestNeighbors(train_data, train_labels, test_data,
                             nb_neighbors=5):
    print("Starting to compute Nearest Neighbors")
    computeStart = time.time()
    convert_to_minutes = False

    # Initialization
    neighbors_kdtree = neighbors.NearestNeighbors(n_neighbors=nb_neighbors,
                                                  algorithm='kd_tree')
    # Training
    print("Nearest Neighbors : starting training")
    neighbors_kdtree.fit(train_data, train_labels)

    # Predictions on test_data
    print("Nearest Neighbors : starting predictions")
    distances, indices = neighbors_kdtree.kneighbors(test_data)
    predicted_labels = np.ndarray(shape=(len(test_data)))
    for i in range(len(test_data)):
        if nb_neighbors == 1:
            # use the label of the single nearest neighbour
            predicted_labels[i] = train_labels[indices[i][0]]
        else:
            # predict the label with the smallest mean distance
            mean_distances = {}
            for nb in range(nb_neighbors):
                actual_label = train_labels[indices[i][nb]]
                actual_distance = distances[i][nb]
                if actual_label not in mean_distances:
                    mean_distances[actual_label] = []
                mean_distances[actual_label].append(actual_distance)
            md_keys = list(mean_distances.keys())
            min_label = md_keys[0]
            min_distance = np.mean(mean_distances[min_label])
            for label in md_keys:
                if label != min_label:
                    actual_distance = np.mean(mean_distances[label])
                    if actual_distance < min_distance:
                        min_label = label
                        min_distance = actual_distance
            predicted_labels[i] = min_label

    computeDuration = time.time() - computeStart
    if computeDuration > 60:
        computeDuration = computeDuration / 60
        convert_to_minutes = True
    unit = " min" if convert_to_minutes else " s"
    print("Nearest Neighbors finished, computing duration = "
          + str(computeDuration) + unit)
    return predicted_labels
def analyze(self, data):
    X = data["value"]
    Y = data["geofips"].astype(object)
    knn = neighbors.KNeighborsClassifier()
    neighbor = neighbors.NearestNeighbors(n_neighbors=7, algorithm="brute")
    fit = knn.fit(X.to_frame(), Y.to_frame().values.ravel())
    p = neighbor.fit(X.to_frame())
    pred = p.kneighbors(X.to_frame())
    predicted = pred[1]
    cols = ["i", "1", "2", "3", "4", "5", "6"]
    df = pd.DataFrame(predicted, columns=cols)
    df_melt = pd.melt(df, id_vars=["i"])
    return df_melt, data
def test_radius_neighbors_boundary_handling():
    """Test whether points lying on boundary are handled consistently"""
    X = np.array([[1.5], [3.0], [3.01]])
    radius = 3.0

    for algorithm in ALGORITHMS:
        nbrs = neighbors.NearestNeighbors(radius=radius,
                                          algorithm=algorithm).fit(X)
        results = nbrs.radius_neighbors([[0.0]], return_distance=False)
        assert_equal(results.shape, (1,))
        assert_equal(results.dtype, object)
        assert_array_equal(results[0], [0, 1])
def build(self, data, k):
    self.check_metric(self.metric)
    self.index = neighbors.NearestNeighbors(
        algorithm="ball_tree",
        metric=self.metric,
        metric_params=self.metric_params,
        n_jobs=self.n_jobs,
    )
    self.index.fit(data)

    # Return the nearest neighbors in the training set
    distances, indices = self.index.kneighbors(n_neighbors=k)
    return indices, distances
def test_knn_distance(self):
    mapper = KeplerMapper()
    data = np.random.rand(100, 5)
    lens = mapper.project(data, projection="knn_distance_4", scaler=None)

    nn = neighbors.NearestNeighbors(n_neighbors=4)
    nn.fit(data)
    lens_confirm = np.sum(
        nn.kneighbors(data, n_neighbors=4, return_distance=True)[0],
        axis=1).reshape((-1, 1))
    assert lens.shape == (100, 1)
    np.testing.assert_array_equal(lens, lens_confirm)
def get_transition_matrix2(self, k=10):
    """
    Implementation of the transition matrix of DPT.

    :param k: number of neighbors for each node.
    """
    # kNN
    N = self.shape[0]
    nbrs = neighbors.NearestNeighbors(n_neighbors=k,
                                      metric='euclidean').fit(self.data)
    distances, indices = nbrs.kneighbors(self.data)
    sqdistances = np.square(distances)
    sigmas = distances[:, -1] / 2

    # kernel matrix
    sigs_sum = np.add.outer(sigmas ** 2, sigmas ** 2)
    sig_mul = np.multiply.outer(sigmas, sigmas)
    kernel_matrix = np.zeros((N, k))
    for i in range(N):
        para = np.sqrt(
            np.divide(2 * sig_mul[i, indices[i, :]],
                      sigs_sum[i, indices[i, :]]))
        kern = np.exp(-np.divide(sqdistances[i, :],
                                 (sigs_sum[i, indices[i, :]])))  # not *2
        kernel_matrix[i, :] = np.multiply(para, kern)
    weights = kernel_matrix

    indptr = range(0, (N + 1) * k, k)
    weight_matrix = sparse.csr_matrix(
        (weights.flatten(), indices.flatten(), indptr),
        shape=(N, N)).toarray()

    # symmetrize
    for i, row in enumerate(indices):
        for j in row:
            if i not in set(indices[j]):
                weight_matrix[j, i] = weight_matrix[i, j]

    # normalization
    weight_sum = np.power(weight_matrix.sum(axis=0)[:, None],
                          -1 / 2).flatten()
    weight_sum = np.diag(weight_sum)
    M = weight_sum @ weight_matrix @ weight_sum

    mevals, mevecs = sp.linalg.eigh(M)
    self.M = M
    self.mevals = mevals
    self.mevecs = mevecs
    self.indices = indices
    return M
def _apply_mask_and_get_affinity(seeds, niimg, radius, allow_overlap,
                                 mask_img=None):
    seeds = list(seeds)
    affine = niimg.get_affine()

    # Compute world coordinates of all in-mask voxels.
    if mask_img is not None:
        mask_img = check_niimg_3d(mask_img)
        mask_img = image.resample_img(mask_img, target_affine=affine,
                                      target_shape=niimg.shape[:3],
                                      interpolation='nearest')
        mask, _ = masking._load_mask_img(mask_img)
        mask_coords = list(np.where(mask != 0))
        X = masking._apply_mask_fmri(niimg, mask_img)
    else:
        mask_coords = list(zip(*np.ndindex(niimg.shape[:3])))
        X = niimg.get_data().reshape([-1, niimg.shape[3]]).T

    mask_coords = np.asarray(mask_coords)
    mask_coords = coord_transform(mask_coords[0], mask_coords[1],
                                  mask_coords[2], affine)
    mask_coords = np.asarray(mask_coords).T

    if (radius is not None and
            LooseVersion(sklearn.__version__) < LooseVersion('0.16')):
        # Fix for scikit-learn versions below 0.16. See
        # https://github.com/scikit-learn/scikit-learn/issues/4072
        radius += 1e-6

    clf = neighbors.NearestNeighbors(radius=radius)
    A = clf.fit(mask_coords).radius_neighbors_graph(seeds)
    A = A.tolil()

    # Include the voxel containing the seed itself
    mask_coords = mask_coords.astype(int).tolist()
    for i, seed in enumerate(seeds):
        try:
            A[i, mask_coords.index(seed)] = True
        except ValueError:
            # seed is not in the mask
            pass

    if not allow_overlap:
        if np.any(A.sum(axis=0) >= 2):
            raise ValueError('Overlap detected between spheres')

    return X, A
def get_positive_neighbors_counts(X, y, k=None, radius=None,
                                  positive_label=1):
    assert bool(k) ^ bool(radius)
    if k:
        assert type(k) is int
        nn = neighbors.NearestNeighbors(n_neighbors=k + 1,
                                        algorithm="auto").fit(X)
    else:
        assert type(radius) is float
        nn = neighbors.NearestNeighbors(radius=radius,
                                        algorithm="auto").fit(X)

    pos_X_indices = np.where(y == positive_label)[0]
    positive_X = X[pos_X_indices]
    if k:
        neigh = nn.kneighbors(positive_X, return_distance=False)
        neigh = neigh[:, 1:]  # remove self from neighbors
        neigh_targets = np.vectorize(lambda x: y[x])(neigh)
        neigh_shape = np.shape(neigh_targets)
        neigh_counts = np.full((neigh_shape[0],), np.float(neigh_shape[1]),
                               dtype=np.float)
        pos_neigh_counts = np.sum(neigh_targets == positive_label, axis=1)
    else:
        neigh = nn.radius_neighbors(positive_X, return_distance=False)
        # remove self from neighbors
        neigh = map(lambda tup: np.delete(
            tup[1], np.where(tup[1] == pos_X_indices[tup[0]])),
            enumerate(neigh))
        neigh_targets = map(
            lambda n: np.vectorize(lambda x: y[x])(n) if n.size
            else np.array([]), neigh)
        neigh_counts = np.array(
            map(lambda n: np.float(np.shape(n)[0]), neigh_targets),
            dtype=np.float)
        pos_neigh_counts = np.array(
            map(lambda n: np.sum(n == positive_label), neigh_targets))

    pos_neigh_proportions = pos_neigh_counts / neigh_counts
    # empty neighborhoods to 0
    pos_neigh_proportions = np.nan_to_num(pos_neigh_proportions)

    # # remove empty neighborhoods
    # not_nan_indices = ~np.isnan(pos_neigh_proportions)
    # avg_pos_neigh_count = np.average(pos_neigh_counts[not_nan_indices])
    # avg_pos_neigh_prop = np.average(pos_neigh_proportions[not_nan_indices])
    avg_pos_neigh_count = np.average(pos_neigh_counts)
    avg_pos_neigh_prop = np.average(pos_neigh_proportions)

    return (itertools.izip(pos_X_indices, pos_neigh_counts,
                           pos_neigh_proportions),
            avg_pos_neigh_count, avg_pos_neigh_prop)
def simplify_co_occurrence(co, nn=3):
    # quintile breaks (crude, still need to account for upper/diagonals)
    q = ps.Quantiles(co).yb
    q = np.reshape(q, co.shape)
    # nearest neighbors graph
    knn = neighbors.NearestNeighbors(n_neighbors=nn)
    neigh = knn.fit(co)
    knn_mat = neigh.kneighbors_graph(co).toarray()
    out = np.multiply(q, knn_mat)
    return out