def get_nearest(src_points, candidates, k_neighbors=1):
    """Query a haversine BallTree of ``candidates`` for every source point.

    Both result arrays are transposed before being returned, so row ``j``
    holds the (j+1)-th closest candidate of each source point.

    Returns:
        tuple: ``(indices, distances)``, each transposed relative to what
        ``BallTree.query`` returns.
    """
    # Index the candidates once; all source points are queried against it.
    ball_tree = BallTree(candidates, leaf_size=15, metric='haversine')
    dist, idx = ball_tree.query(src_points, k=k_neighbors)

    # Every neighbour row is handed back, not only row 0 (the single closest).
    return (idx.transpose(), dist.transpose())
def get_nearest(src_points, candidates, k_neighbors=1):
    """Return the closest candidate index and distance for each source point.

    Adapted from:
    https://automating-gis-processes.github.io/site/notebooks/L3/nearest-neighbor-faster.html

    Returns:
        tuple: ``(closest, closest_dist)`` - the first row of the transposed
        index and distance arrays returned by ``BallTree.query``.
    """
    print('balltree get nearest function - hsl')

    # Index the candidates once; all source points are queried against it.
    ball_tree = BallTree(candidates, leaf_size=15, metric='haversine')
    dist, idx = ball_tree.query(src_points, k=k_neighbors)

    # After transposing, row 0 is the nearest neighbour of every source point
    # (row 1 would be the second nearest, and so on).
    nearest_idx = idx.transpose()[0]
    nearest_dist = dist.transpose()[0]
    return (nearest_idx, nearest_dist)
def train(self):
    """Update the map weights from ``self.data``, then label every node.

    For each iteration and each sample, the winning node and its neighbours
    within the current radius are pulled towards the sample. Afterwards each
    node is assigned the most frequent label among the 10 data points nearest
    to its weight vector, and the result is written to ``self.output``.
    """
    radius = (self.m**2 + self.n**2)**0.5
    for epoch in range(self.max_iter):
        for sample in self.data:
            win_m, win_n, _ = self.find_winner(sample)
            for entry in self.find_neighbor(win_m, win_n, radius):
                mw, nw, rw = entry[0], entry[1], entry[2]
                step = self.learning_rate(epoch, rw)
                self.weights[mw, nw] += step * (sample - self.weights[mw, nw])
        # Shrink the neighbourhood radius once per iteration.
        radius *= 1 - (epoch + 1) / self.max_iter

    data_tree = BallTree(self.data)
    for mi in range(self.m):
        for ni in range(self.n):
            _, idx = data_tree.query([self.weights[mi, ni]], k=10)
            tally = Counter(self.labels[i] for i in idx.reshape(-1))
            # max() keeps the first label encountered among equal counts.
            winner = max(tally.items(), key=lambda pair: pair[1])[0]
            self.output[mi, ni] = int(winner)
def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='euclidean', **kwargs):
    """Build a ``SingleLinkageTree`` from BallTree core distances.

    The core distance of a point is its distance to the k-th entry returned
    by ``BallTree.query`` on the training data itself; ``k`` is capped at
    ``n_samples - 1``. Extra ``kwargs`` are forwarded to both ``BallTree``
    and ``DistanceMetric.get_metric``.
    """
    # The Cython routines used below require contiguous arrays.
    if not X.flags['C_CONTIGUOUS']:
        X = np.array(X, dtype=np.double, order='C')

    n_samples = X.shape[0]
    k = min(n_samples - 1, k)

    ball_tree = BallTree(X, metric=metric, **kwargs)
    dist_metric = DistanceMetric.get_metric(metric, **kwargs)

    neighbour_dists = ball_tree.query(X, k=k)[0]
    # Last column = distance to the k-th neighbour; copy to get a contiguous array.
    core_distances = neighbour_dists[:, -1].copy(order='C')

    mst = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)
    return SingleLinkageTree(label(mst))
def estimate_bayes_factor(trace, r=0.05, return_list=False):
    """Estimate the Bayes factor from the local density of trace points.

    The number of samples within radius ``r`` of each sample is used as a
    density estimate. The per-sample value is computed as a sum of logs:
    ``model_logp + log(N) + log(V_r) - log(count)``.

    Returns:
        The per-sample array when ``return_list`` is true; otherwise a tuple
        ``(median, 0.7413 * interquartile range)`` of that array.
    """
    # Stack every variable except the transformed "_interval__" ones;
    # after the transpose, rows are iterations and columns are variables.
    samples = np.array(
        [trace[name] for name in trace.varnames if "_interval__" not in name]
    ).T
    n_iter, n_dim = samples.shape

    # Volume of an n_dim-dimensional sphere of radius r.
    sphere_volume = np.pi ** (0.5 * n_dim) / gamma(0.5 * n_dim + 1) * (r ** n_dim)

    neighbour_count = BallTree(samples).query_radius(samples, r=r, count_only=True)
    estimate = (trace.model_logp + np.log(n_iter) + np.log(sphere_volume)
                - np.log(neighbour_count))

    if return_list:
        return estimate
    q25, q50, q75 = np.percentile(estimate, [25, 50, 75])
    return q50, 0.7413 * (q75 - q25)
def make_nearest_surf(center, radius, rotation, contour_pts, psize=20, qsize=8, vis=False, seg=None):
    """Approximate a surface through the contour points nearest to an ellipsoid.

    A ``psize`` x ``psize`` grid is sampled on an ellipsoid with semi-axes
    ``radius``, rotated by ``rotation`` and shifted by ``center``. Each grid
    point is replaced by its nearest neighbour in ``contour_pts``, a small
    random jitter is added, and the result is passed to
    ``approximate_surface``. When ``vis`` is true the selected points are
    drawn on ``seg`` and displayed.
    """
    u_values = np.linspace(0, 2 * math.pi, num=psize)
    # The poles are avoided by a 0.01 rad margin.
    v_values = np.linspace(-math.pi / 2 + 0.01, math.pi / 2 - 0.01, num=psize)
    grid = np.array([
        [radius[0] * math.cos(u) * math.cos(v),
         radius[1] * math.cos(v) * math.sin(u),
         radius[2] * math.sin(v)]
        for u in u_values
        for v in v_values
    ])

    for row, vertex in enumerate(grid):
        grid[row] = np.dot(vertex, rotation)
    grid += center

    # Snap every ellipsoid sample onto its closest contour point.
    _, nearest = BallTree(contour_pts).query(grid, k=1)
    nearest = nearest.reshape(nearest.shape[0])
    surface_pts = contour_pts[nearest, :].astype(np.float64)

    jitter = 0.001
    surface_pts += np.random.rand(surface_pts.shape[0], surface_pts.shape[1]) * jitter

    if vis:
        mask = get_image_mask_points(seg, surface_pts)
        show_ct_image(draw_segmentation(seg, mask, mark_val=255))

    return approximate_surface(surface_pts.tolist(), psize, psize, 3, 3,
                               ctrlpts_size_u=qsize, ctrlpts_size_v=qsize)
def calc_nearest_site():
    """Attach the nearest port to every cluster center in ``df_centers``.

    Uses a haversine ``BallTree`` built from the ports returned by
    ``get_sites(engine)`` and adds two columns to the module-level
    ``df_centers`` frame: ``nearest_site_id`` (the port's ``port_id``) and
    ``nearest_port_dist`` (the haversine distance multiplied by 6371.0088).

    Returns:
        The merged ``df_centers`` frame (also rebound at module level).
    """
    # df_centers is rebound at the end of this function; without the global
    # declaration Python treats it as a local and the first read below
    # raises UnboundLocalError.
    global df_centers

    # The ports must be loaded before they are used to build the tree.
    ports_wpi = get_sites(engine)

    points_of_int = np.radians(
        df_centers.loc[:, ['average_lat', 'average_lon']].values)
    candidates = np.radians(ports_wpi.loc[:, ['lat', 'lon']].values)
    tree = BallTree(candidates, leaf_size=30, metric='haversine')

    # One batched query instead of one query per center.
    dist, ind = tree.query(points_of_int, k=1)
    df_nearest = pd.DataFrame({
        clust_id_value: df_centers[clust_id_value].values,
        'nearest_site_id': ports_wpi['port_id'].values[ind[:, 0]],
        'nearest_port_dist': dist[:, 0] * 6371.0088,
    })

    df_centers = pd.merge(df_centers, df_nearest, how='left', on=clust_id_value)
    return df_centers
def test_index():
    """Fit a PrioritizedDynamicContinuousIndex and run one k-NN query.

    A BallTree query on the same data is computed as a baseline, but the
    comparison against it is currently disabled. The index file is removed
    afterwards whether or not the test body succeeds.
    """
    xs = rand(1000, 100, random_state=42).toarray()
    try:
        index = PrioritizedDynamicContinuousIndex(
            SQLiteIndexer(index_path=INDEX_PATH),
            composite_indices=2,
            simple_indices=50)
        index.fit(xs)

        k = 10
        query_point = xs[0:1]
        baseline_dist, baseline_idx = BallTree(xs).query(query_point, k=k)
        dist, idx = index.query(query_point, k=k)
        # Disabled check: np.testing.assert_equal(baseline_idx[0], idx)
    finally:
        if os.path.exists(INDEX_PATH):
            os.remove(INDEX_PATH)
def nne(dim_red, true_labels):
    """
    Calculates the nearest neighbor accuracy (basically leave-one-out cross
    validation with a 1NN classifier).

    Args:
        dim_red (array): dimensions (k, cells)
        true_labels (array): 1d array of integers

    Returns:
        Nearest neighbor accuracy - fraction of points for which the
        1NN classifier returns the correct value.
    """
    bt = BallTree(dim_red.T)

    def nearest_other_cell(cell):
        # k=2: the second result is taken as the closest *other* cell; this
        # assumes the first result is the query cell itself.
        _, neighbours = bt.query([dim_red[:, cell]], k=2)
        return neighbours[0, 1]

    hits = sum(
        1
        for cell, label in enumerate(true_labels)
        if true_labels[nearest_other_cell(cell)] == label
    )
    return float(hits) / len(true_labels)
def get_score_for_ideal_points(c, ideal_points, IDEAL_RADIUS, IDEAL_HEIGHT):
    """Score how well the aligned cameras of ``c`` match ``ideal_points``.

    The normalized camera points are registered non-rigidly onto the ideal
    points. For each ideal point, if exactly one registered point lies within
    the search radius, the distance to it is recorded; otherwise a penalty of
    1000 is recorded. The score is the mean of these values (lower is better).

    Returns:
        float: the mean of the recorded distances/penalties.
    """
    unmatched_penalty = 1000

    rename_cameras(c)
    # Normalized points of the cameras that are currently aligned.
    points = get_normalized_points(c, IDEAL_RADIUS)
    model, scene, after_tps = cca.non_rigid_registration(points, ideal_points)

    ball_tree = BallTree(after_tps)

    # For dooblicator v1 (46 cameras) the minimum distance between cameras
    # is height/2, so a smaller search radius is used in that range.
    if 41 <= len(c.cameras) <= 51:
        radius = 2 * (IDEAL_HEIGHT / 2) / 3
    else:
        radius = 2 * IDEAL_HEIGHT / 3

    distances_array = []
    for point in ideal_points:
        # query_radius expects a 2-D (n_queries, n_features) array; a bare
        # 1-D point is rejected by scikit-learn's input validation.
        matches = ball_tree.query_radius(np.atleast_2d(point), radius)[0]
        if len(matches) == 1:
            distances_array.append(
                np.linalg.norm(point - after_tps[matches[0]]))
        else:
            distances_array.append(unmatched_penalty)

    score = np.mean(distances_array)
    print("SCORE: ", score)
    return score
def image_retrieval():
    """Evaluate top-K label accuracy of autoencoder-based image retrieval.

    Noisy training images are encoded with the autoencoder's
    ``encoding_layer``, flattened, L2-normalized and indexed in a BallTree.
    Each encoded test image retrieves its ``topK`` nearest training codes;
    the fraction sharing the query label is averaged over the test set and
    printed.
    """
    topK = 10
    avg_acc = 0
    x_train_noisy, x_test_noisy, y_train, y_test, x_train, x_test = preprocess()

    autoencoder = load_model('../working/autoencoder.h5')
    print(autoencoder.summary())
    encoder = Model(autoencoder.input,
                    autoencoder.get_layer('encoding_layer').output)

    def encode(images):
        # Flatten the three trailing axes of the encoding, then L2-normalize.
        codes = encoder.predict(images)
        flat_dim = codes.shape[1] * codes.shape[2] * codes.shape[3]
        codes = codes.reshape(codes.shape[0], flat_dim)
        return preprocessing.normalize(codes, norm='l2')

    coded_train = encode(x_train_noisy)
    tree = BallTree(coded_train, leaf_size=200)

    # Extract features from the test set.
    coded_test = encode(x_test_noisy)
    n_queries = coded_test.shape[0]

    for i, query_code in enumerate(coded_test):
        query_label = y_test[i]
        dists, ids = tree.query([query_code], k=topK)
        labels = np.array([y_train[neighbour] for neighbour in ids[0]])
        acc = (labels == query_label).astype(int).sum() / topK
        avg_acc += acc
        if i % 1000 == 0:
            print('{} / {}: {}'.format(i, n_queries, acc))

    avg_acc /= n_queries
    print("The average top K accuracy is: {}".format(avg_acc))
def test_barnes_hut_angle():
    """Check that Barnes-Hut with angle=0 agrees with the exact method.

    For 2 and 3 output components, the joint probabilities and the KL
    divergence computed by the nearest-neighbour/Barnes-Hut code path (using
    all ``n_samples - 1`` neighbours) are compared with the exact ones.
    """
    # When Barnes-Hut's angle=0 this corresponds to the exact method.
    angle = 0.0
    perplexity = 10
    n_samples = 100
    n_features = 5
    for n_components in [2, 3]:
        degrees_of_freedom = float(n_components - 1.0)

        # Draw order matters: the data first, then the embedding parameters.
        random_state = check_random_state(0)
        distances = random_state.randn(n_samples, n_features).astype(np.float32)
        distances = abs(distances.dot(distances.T))
        np.fill_diagonal(distances, 0.0)
        params = random_state.randn(n_samples, n_components)

        P = _joint_probabilities(distances, perplexity, verbose=0)
        kl_exact, grad_exact = _kl_divergence(
            params, P, degrees_of_freedom, n_samples, n_components)

        k = n_samples - 1
        bt = BallTree(distances)
        distances_nn, neighbors_nn = bt.query(distances, k=k + 1)
        # Drop the first column of the query result.
        neighbors_nn = neighbors_nn[:, 1:]
        distances_nn = np.array(
            [row[nbrs] for row, nbrs in zip(distances, neighbors_nn)])
        first_row = distances[0, neighbors_nn[0]]
        assert np.all(first_row == distances_nn[0]), \
            abs(first_row - distances_nn[0])

        P_bh = _joint_probabilities_nn(distances_nn, neighbors_nn,
                                       perplexity, verbose=0)
        kl_bh, grad_bh = _kl_divergence_bh(
            params, P_bh, degrees_of_freedom, n_samples, n_components,
            angle=angle, skip_num_points=0, verbose=0)

        P = squareform(P)
        P_bh = P_bh.toarray()
        assert_array_almost_equal(P_bh, P, decimal=5)
        assert_almost_equal(kl_exact, kl_bh, decimal=3)
def put_zalando_stuff_in_db():
    """Load the downloaded Zalando products into the database and index them.

    For every product directory under ``TRAIN_IMG_PATH`` the ``data.txt``
    JSON is read, a ``Product`` row is saved, and each listed image is
    loaded, resized to 227x227, run through the feature extractor and stored
    as a picture with its L2-normalized feature vector. Finally a BallTree
    over all feature vectors and the list of
    ``(external_id, feature, img_path)`` tuples are pickled under
    ``NEAREST_NEIGH_PATH``.
    """
    feat_extr = FeatureExtr()
    pic_list = []
    X = []
    for entry in os.listdir(TRAIN_IMG_PATH):
        product_dir = TRAIN_IMG_PATH + entry
        if not os.path.isdir(product_dir):
            continue

        with open(product_dir + "/data.txt") as data_file:
            prod_data = json.load(data_file)

        prd = Product(name=prod_data['name'],
                      brand=prod_data['brand']["name"],
                      external_id=prod_data['id'],
                      price=prod_data["units"][0]["price"]["value"],
                      display_img_path="")
        prd.save()

        for img_file in prod_data['media']['images']:
            img_name = img_file['mediumUrl'].split('/')[-1]
            img_path = product_dir + "/" + img_name
            # Keep only the first three channels.
            img_orig = (imread(img_path)[:, :, :3]).astype(np.float32)
            img_resize = imresize(img_orig, (227, 227))
            img_feat = feat_extr.get_features([img_resize])[0]
            norm_img_feat = img_feat / np.linalg.norm(img_feat)
            prd.picture_set.create(img_type=img_file['type'],
                                   img_path=img_path,
                                   feature_array=norm_img_feat)
            pic_list.append((prd.external_id, norm_img_feat, img_path))
            X.append(norm_img_feat)

    kdt = BallTree(X, leaf_size=30, metric='euclidean')

    # Context managers make sure both pickle files are flushed and closed.
    with open(NEAREST_NEIGH_PATH + "pic_list.p", "wb") as pic_list_file:
        pickle.dump(pic_list, pic_list_file)
    with open(NEAREST_NEIGH_PATH + "tree.p", "wb") as tree_file:
        pickle.dump(kdt, tree_file)
def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0, metric='minkowski', p=2, leaf_size=40, approx_min_span_tree=True, gen_min_span_tree=False, core_dist_n_jobs=4, **kwargs):
    """Compute the single linkage tree with the BallTree Boruvka algorithm.

    ``leaf_size`` is raised to at least 3, and a ``core_dist_n_jobs`` below 1
    is converted to ``max(cpu_count() + 1 + core_dist_n_jobs, 1)``.
    ``alpha`` and ``p`` are accepted but not used in this function body.

    Returns:
        tuple: ``(single_linkage_tree, min_spanning_tree)``; the second item
        is ``None`` unless ``gen_min_span_tree`` is true.
    """
    leaf_size = max(leaf_size, 3)

    if core_dist_n_jobs < 1:
        core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1)

    if X.dtype != np.float64:
        X = X.astype(np.float64)

    ball_tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
    boruvka = BallTreeBoruvkaAlgorithm(
        ball_tree, min_samples, metric=metric,
        leaf_size=leaf_size // 3,
        approx_min_span_tree=approx_min_span_tree,
        n_jobs=core_dist_n_jobs, **kwargs)

    # Order the spanning-tree edges by weight (third column).
    edges = boruvka.spanning_tree()
    edges = edges[np.argsort(edges.T[2]), :]

    # Convert the edge list into standard hierarchical clustering format.
    single_linkage_tree = label(edges)

    return single_linkage_tree, (edges if gen_min_span_tree else None)
def __init__(self, pointings, raCol='_ra', decCol='_dec', indexCol='obsHistID', leafSize=50):
    """
    Create a tree of pointings

    Parameters
    ----------
    pointings : `pd.dataFrame` of pointings with unique index values as
        the index column. A column ``intindex`` is added to it in place.
    raCol : string
        column name for a column holding ra values in radians
    decCol : string
        column name for a column holding dec values in radians
    indexCol : string
        not used in this constructor
    leafSize : int
        ``leaf_size`` passed to the `BallTree`

    Raises
    ------
    ValueError
        if ``validatePointings`` rejects the pointings / column names.

    .. note : raCol and decCol are assumed to hold ra and dec in units of
        radians
    """
    self.pointings = pointings

    if not self.validatePointings(pointings, raCol, decCol):
        raise ValueError('pointings, and the provided values of raCol, decCol {0}, {1} are incompatible'.format(raCol, decCol))
    self.raCol = raCol
    self.decCol = decCol

    # Keep a mapping from the tree's integer positions back to the frame's
    # index values. ``np.int`` was removed in NumPy 1.24; the builtin ``int``
    # selects the same dtype.
    pointings.loc[:, 'intindex'] = np.arange(len(pointings)).astype(int)
    self.indMapping = pointings['intindex'].reset_index().set_index('intindex')

    # Build the tree on (dec, ra) pairs.
    self.tree = BallTree(pointings[[decCol, raCol]].values,
                         leaf_size=leafSize,
                         metric='haversine')
def nn_search(self, tree_features, query_features, metric='haversine', convert_radians=False):
    '''
    Build a BallTree for nearest neighbor search based on haversine distance.

    Parameters
    ----------
    tree_features: array_like
        Input features to create the search tree. Features are in lat, lon
        format, in radians (or in degrees when ``convert_radians`` is True)
    query_features: array_like
        Points to which calculate the nearest neighbor within the tree.
        latlon coordinates expected in radians for distance calculation
        (or in degrees when ``convert_radians`` is True)
    metric: str
        Distance metric for neighborhood search. Default haversine for
        latlon coordinates.
    convert_radians: bool
        Flag in case features are not in radians and need to be converted.
        When True, both inputs are converted from degrees to radians.

    Returns
    -------
    distances: array_like
        Array with the tree's nearest-neighbor distance for each query
        point, scaled by the earth radius (6371 km). The scaling is applied
        regardless of ``metric``.
    '''
    if convert_radians:
        # Local import so this method does not depend on a module-level alias.
        import numpy as np
        # Previously this flag was accepted but silently ignored.
        tree_features = np.radians(tree_features)
        query_features = np.radians(query_features)

    tree = BallTree(tree_features, metric=metric)
    # Earth radius in metres, divided by 1000 to give kilometres.
    return tree.query(query_features)[0] * 6371000 / 1000
def distance_to_port(lon, lat, ports):
    '''
    Return the distance (km) from each ship location to its closest port.

    This uses a ball tree search in radians with the Haversine metric, which
    expects coordinate pairs in (lat, long) order.

    Arguments:
        lon, lat: longitudes and latitudes of the ship locations, in degrees.
            They are combined with ``pd.concat``, so pandas objects are
            expected.
        ports: port coordinates in degrees; the columns are flipped before
            use, so presumably they arrive in (lon, lat) order - confirm
            against the caller.

    Returns:
        Pandas Series named 'distance_to_port' (the index of the closest port
        is computed but not returned).
    '''
    ports_flip = np.flip(ports, axis=1)
    coords = pd.concat([np.radians(lat), np.radians(lon)], axis=1)

    port_tree = BallTree(np.radians(ports_flip), metric='haversine')
    nearest_dist, _ = port_tree.query(coords, k=1)

    earth_radius_km = 6371
    return pd.Series(nearest_dist.flatten() * earth_radius_km,
                     name='distance_to_port')
def __init__(
        self,
        k,
        train_X,
        train_Y,
        n_components=5,
        weights=None,
        threshold=0.9):
    """Index the training data and derive the outlier-score threshold.

    Args:
        k: stored as ``self.k``.
        train_X: training features; converted to an array and indexed in a
            BallTree.
        train_Y: training labels; converted to an integer array.
        n_components: not used in this constructor.
        weights: stored as ``self.weights``; defaults to a fresh ``[1, 1]``.
            NOTE(review): the original inline comment said "must be above 1"
            - confirm which constraint is intended.
        threshold: passed to ``gamma_threshold`` together with the training
            scores.
    """
    self.train_X = np.asarray(train_X)

    # Use BallTree to optimize neighbour searches:
    # O(m log n) instead of O(m n).
    self.tree = BallTree(self.train_X)

    self.train_Y = np.asarray(train_Y).astype(int)
    self.k = k
    # A list literal as the default would be shared (and mutable) across
    # every instance, so the default is created per call instead.
    self.weights = [1, 1] if weights is None else weights

    print("scoring training data")
    self.train_scores = self.outlier_score(self.train_X)
    print("thresholding training scores")
    self.threshold, self.p = gamma_threshold(self.train_scores, threshold)
def fit(self, X):
    """Run the mean-shift iterations from every seed and label the data.

    Each seed is repeatedly moved to the kernel-weighted mean of the points
    within ``self.bandwidth`` until it moves by less than
    ``self.tol * self.bandwidth`` or ``self.max_iterations`` is reached.
    Sets ``self.centroids_`` and ``self.labels_`` and returns ``self``.
    """
    centroids_dict = defaultdict(int)
    seeds = self.init_seeds(X)
    ball_tree = BallTree(X)

    for weighted_mean in seeds:
        for _ in range(self.max_iterations):
            previous = weighted_mean
            in_window = ball_tree.query_radius([previous], self.bandwidth)[0]
            points_within = X[in_window]
            weighted_mean = self.update_kernel_fn(
                previous, points_within, self.bandwidth)
            shift = np.linalg.norm(weighted_mean - previous)
            if shift < self.tol * self.bandwidth:
                break
        # Record the final window position with the size of its last window.
        centroids_dict[tuple(weighted_mean)] = len(points_within)

    self.centroids_ = self._remove_overlapping_windows(centroids_dict)
    self.labels_ = np.array([self._closest_centroid(point) for point in X])
    return self
def associate(rad_1, rad_2, k_nn=1):
    """
    Given two grids rad_1 and rad_2, associate each point in rad_2 with its
    k nearest neighbours in rad_1 (haversine metric).

    Points are pairs of the form [latitude, longitude].

    Returns the ``(distances, indices)`` arrays from the tree query, one row
    per point of rad_2.
    """
    # Room for improvement:
    #  - Build the ball tree on the smaller of the two grids
    #  - Use something more efficient than a ball tree, like a binary search.
    tree = BallTree(rad_1, metric='haversine')

    distances, indices = tree.query(
        rad_2,
        k=k_nn,
        return_distance=True,
        breadth_first=True,
    )

    assert rad_2.shape[0] == indices.shape[0]
    return distances, indices
def _get_unassigned_balltree(self):
    """Use a BallTree over the cluster centers to assign nearby data points.

    Every data point with at least one center within ``self.cutoff`` is
    labelled with the first center of its distance-sorted result. With a
    single cluster the parent implementation is used instead.

    Returns:
        Indices of the data points that have no center within the cutoff.
    """
    k = self.clusters.k
    if k == 1:
        return super(FastDPMeans, self).get_unassigned()

    center_tree = BallTree(self.centers, leaf_size=k + 1)
    # sort_results=True orders each neighbour list by distance.
    neigh, _ = center_tree.query_radius(
        self.data, self.cutoff, return_distance=True, sort_results=True)

    n_neigh = np.array([len(hits) for hits in neigh])
    assigned = np.nonzero(n_neigh > 0)[0]
    unassigned = np.nonzero(n_neigh == 0)[0]

    self.clusters.labels[assigned] = [neigh[i][0] for i in assigned]
    return unassigned
def particle_position(self, position=None, leaf_size=None, metric=None, position_min=None, position_max=None):
    """Store particle positions and build a BallTree on the rescaled values.

    ``leaf_size`` and ``metric`` overwrite the stored settings when given.
    A DataFrame input is read from its ``x``, ``y`` and ``z`` columns; any
    other input is used as-is. Positions are rescaled with
    ``(X - position_min) / (position_max - position_min)``, where the bounds
    default to the global minimum and maximum of the positions.

    Returns None; prints a hint and does nothing else when ``position`` is
    missing.
    """
    if leaf_size is not None:
        self.leaf_size = leaf_size
    if metric is not None:
        self.metric = metric

    if position is None:
        print('Input particle positions using particle_position().')
        return None

    self.position = position
    if isinstance(self.position, (pd.core.frame.DataFrame)):
        X = np.vstack(
            (self.position.x, self.position.y, self.position.z)).T
    else:
        X = self.position

    if position_min is None:
        position_min = X.min()
    if position_max is None:
        position_max = X.max()
    X = (X-position_min)/(position_max-position_min)

    print('Building tree...')
    self.tree = BallTree(X, leaf_size=self.leaf_size, metric=self.metric)
    print('Tree built with the positions.')
def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=2):
    """Build a ``SingleLinkageTree`` from BallTree core distances.

    The core distance of a point is its distance to the k-th entry returned
    by ``BallTree.query`` on the training data itself; ``k`` is capped at
    ``n_samples - 1``.

    Args:
        X: data array; ``X.shape[0]`` is taken as the number of samples.
        k: neighbour rank used for the core distance.
        alpha: passed through to ``mst_linkage_core_vector``.
        metric: metric name for ``BallTree`` and ``DistanceMetric``.
        p: Minkowski power; only used when ``metric == 'minkowski'``.

    Raises:
        TypeError: Minkowski metric requested with ``p=None``.
        ValueError: Minkowski metric requested with a negative ``p``.
    """
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
        # Previously p was validated but never forwarded, so the tree and the
        # distance metric always ran with their default power.
        metric_kwargs = {'p': p}
    else:
        # p is irrelevant for every other metric.
        metric_kwargs = {}

    dim = X.shape[0]
    k = min(dim - 1, k)

    tree = BallTree(X, metric=metric, **metric_kwargs)
    dist_metric = DistanceMetric.get_metric(metric, **metric_kwargs)

    # The last column is the distance to the k-th neighbour. A column slice
    # is strided, so take a contiguous copy before passing it on
    # (presumably required by the compiled MST routine - confirm).
    core_distances = tree.query(X, k=k)[0][:, -1].copy(order='C')
    min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)

    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)
    return single_linkage_tree
def _remove_overlapping_windows(self, centroids_dict):
    '''
    Removes overlapping windows, keeping the most populated one of each
    overlapping group.

    :param centroids_dict: Dictionary mapping window positions to the number
        of points each window has.
    :return: Filtered window positions (windows smaller than
        ``self.min_bin_size`` are discarded first).
    '''
    ranked = sorted(centroids_dict.items(),
                    key=lambda tup: tup[1],
                    reverse=True)
    centroids = np.array(
        [position for position, size in ranked if size >= self.min_bin_size])

    unique = np.ones(len(centroids), dtype=bool)
    nbrs = BallTree(centroids)
    for position_ind, position in enumerate(centroids):
        if not unique[position_ind]:
            continue
        # Everything within one bandwidth of a kept window is dropped,
        # except the window itself.
        overlapping = nbrs.query_radius([position], self.bandwidth)[0]
        unique[overlapping] = False
        unique[position_ind] = True
    return centroids[unique]
def find_partial_connected_components(data,cutoff=15.0):
    """Yield the connected components of the radius-neighbour graph of ``data``.

    Two points are linked when they lie within ``cutoff`` of each other.
    Only pairs with ``source index < destination index`` become edges, so a
    point without any other point in range does not appear in the graph.

    Yields:
        One-element lists, each holding the node set of one component.
    """
    import networkx as nx
    import numpy as np
    from sklearn.neighbors import BallTree

    tree = BallTree(data, leaf_size=40)
    neighbours = tree.query_radius(data, cutoff)

    # Flatten the per-point neighbour lists into (source, destination) rows.
    pairs = np.array([
        [src, dst]
        for src, dsts in enumerate(neighbours)
        for dst in dsts
    ])
    # Keep each undirected edge once and drop the self-matches.
    forward_pairs = pairs[pairs[:, 0] < pairs[:, 1], :]

    graph = nx.from_edgelist(forward_pairs)
    for component in nx.connected_components(graph):
        yield [component]
def _rsl_boruvka_balltree(X, k=5, alpha=1.0, metric='euclidean', leaf_size=40, **kwargs):
    """Build a ``SingleLinkageTree`` with the BallTree Boruvka algorithm.

    ``k`` is capped at ``n_samples - 1`` and used as the algorithm's
    ``min_samples``. Extra ``kwargs`` are forwarded to both ``BallTree`` and
    ``BallTreeBoruvkaAlgorithm``.
    """
    n_samples = X.shape[0]
    min_samples = min(n_samples - 1, k)

    ball_tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
    boruvka = BallTreeBoruvkaAlgorithm(
        ball_tree, min_samples,
        metric=metric, alpha=alpha, leaf_size=leaf_size, **kwargs)

    spanning_edges = boruvka.spanning_tree()
    return SingleLinkageTree(label(spanning_edges))
def get_nearest(src_points, candidates, k_neighbors=2):
    """
    Query a haversine (great-circle) BallTree of ``candidates`` and return
    the two closest candidates of every source point.

    Returns:
        tuple: ``(closest, closest_dist)`` - the first two rows of the
        transposed index and distance arrays from ``BallTree.query``.
    """
    ball_tree = BallTree(candidates, leaf_size=20, metric='haversine')
    dist, idx = ball_tree.query(src_points, k=k_neighbors)

    # After transposing, row 0 is the closest candidate of each source point
    # and row 1 the second closest.
    closest = idx.transpose()[:2]
    closest_dist = dist.transpose()[:2]
    return (closest, closest_dist)
def find_hits_for_targets(
    *,
    targets: List[Tuple[float, ...]],
    predictions: List[Tuple[float, ...]],
    radius: float,
) -> List[Tuple[int, ...]]:
    """
    Find, for every target, the predicted points that lie within ``radius``.

    The indices are returned in sorted order, from the closest to the
    farthest point.

    Parameters
    ----------
    targets
        A list of target points
    predictions
        A list of predicted points
    radius
        The maximum distance that two points can be apart for them to be
        considered a hit

    Returns
    -------
    A sequence with one entry per target. Each entry contains the indices of
    the predictions that are considered hits for that target.
    """
    tree = BallTree(predictions)
    # return_distance is required for sort_results; only the indices are kept.
    return tree.query_radius(
        X=targets,
        r=radius,
        sort_results=True,
        return_distance=True,
    )[0]
def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0, metric='minkowski', p=2, leaf_size=40, approx_min_span_tree=True, gen_min_span_tree=False, core_dist_n_jobs=4, **kwargs):
    """Compute the single linkage tree with the BallTree Boruvka algorithm.

    ``leaf_size`` is raised to at least 3. ``alpha`` and ``p`` are accepted
    but not used in this function body.

    Returns:
        tuple: ``(single_linkage_tree, min_spanning_tree)``; the second item
        is ``None`` unless ``gen_min_span_tree`` is true.

    Raises:
        ValueError: ``core_dist_n_jobs`` is below 1.
    """
    leaf_size = max(leaf_size, 3)

    if core_dist_n_jobs < 1:
        raise ValueError(
            'Parallel core distance computation requires 1 or more jobs!')

    ball_tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
    boruvka = BallTreeBoruvkaAlgorithm(
        ball_tree, min_samples, metric=metric,
        leaf_size=leaf_size // 3,
        approx_min_span_tree=approx_min_span_tree,
        n_jobs=core_dist_n_jobs, **kwargs)

    # Order the spanning-tree edges by weight (third column).
    edges = boruvka.spanning_tree()
    edges = edges[np.argsort(edges.T[2]), :]

    # Convert the edge list into standard hierarchical clustering format.
    single_linkage_tree = label(edges)

    return single_linkage_tree, (edges if gen_min_span_tree else None)
def index_nn_haversine(centroids, coordinates, threshold=THRESHOLD):
    """Compute the nearest centroid for each coordinate using a Ball tree
    with haversine distance.

    Parameters:
        centroids (2d array): First column contains latitude, second column
            contains longitude. Each row is a geographic point
        coordinates (2d array): First column contains latitude, second
            column contains longitude. Each row is a geographic point
        threshold (float): distance threshold in km over which no neighbor
            will be found. Those are assigned with a -1 index

    Returns:
        array with as many rows as coordinates containing the centroid
        indexes
    """
    centroid_tree = BallTree(np.radians(centroids), metric='haversine')

    # Query only the distinct coordinate rows; ``inv`` maps them back.
    _, first_rows, inv = np.unique(
        coordinates, axis=0, return_index=True, return_inverse=True)

    dist, assigned = centroid_tree.query(
        np.radians(coordinates[first_rows]),
        k=1,
        return_distance=True,
        dualtree=True,
        breadth_first=False)

    # Warn about matches farther away than the threshold and mark them
    # with the invalid index -1.
    too_far = dist * EARTH_RADIUS_KM > threshold
    num_warn = np.sum(too_far)
    if num_warn:
        LOGGER.warning('Distance to closest centroid is greater than %s'
                       'km for %s coordinates.', threshold, num_warn)
        assigned[too_far] = -1

    # Expand the result back to all input rows.
    return np.squeeze(assigned[inv])