def neighbor_cluster(AMDs_train, AMDs_test, energy_train, energy_test, type):
    energy_test_pred = []
    N = 15
    if type == "chebyshev":
        tree = BallTree(AMDs_train, metric='chebyshev')
    elif type == "euclidean":
        tree = BallTree(AMDs_train, metric='euclidean')
    elif type == "minkowski":
        tree = BallTree(AMDs_train, metric='minkowski')
    elif type == "manhattan":
        tree = BallTree(AMDs_train, metric='manhattan')
    else:
        return
    dist, inds = tree.query(AMDs_test, k=N)
    # Predict each test energy as the mean energy of its N nearest training neighbors
    for ind in inds:
        total = 0
        for i in ind:
            total += energy_train[i]
        ave = total / N
        energy_test_pred.append(ave)
    fig, ax = plt.subplots()
    print("R^2 score of KNN is: ", r2_score(energy_test, energy_test_pred))
    print("RMSE of KNN is: ", math.sqrt(mean_squared_error(energy_test, energy_test_pred)))
    ax.scatter(energy_test, energy_test_pred)
    ax.plot([np.min(energy_test), np.max(energy_test)],
            [np.min(energy_test), np.max(energy_test)], 'k--', lw=4)
    ax.set_xlabel('Given')
    ax.set_ylabel('Predicted')
    plt.savefig('./image/knn_' + type + '.jpg')
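
# --- Illustrative sketch (not part of the original code) ---
# A minimal, self-contained version of the k-NN averaging idea used in
# neighbor_cluster above: build a BallTree on training descriptors, query the
# k nearest neighbours of each test point, and predict the mean of their
# targets. All data below is synthetic and the variable names are assumptions.
import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.RandomState(0)
X_train = rng.random_sample((100, 8))
y_train = rng.random_sample(100)
X_test = rng.random_sample((10, 8))

tree = BallTree(X_train, metric='euclidean')
_, inds = tree.query(X_test, k=15)
y_pred = y_train[inds].mean(axis=1)  # average the targets of the 15 neighbours
# --- End of sketch ---
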
def test_query_haversine():
    rng = check_random_state(0)
    X = 2 * np.pi * rng.random_sample((40, 2))
    bt = BallTree(X, leaf_size=1, metric="haversine")
    dist1, ind1 = bt.query(X, k=5)
    dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine")

    assert_array_almost_equal(dist1, dist2)
    assert_array_almost_equal(ind1, ind2)
def NCM(o, Z_T):
    Y = np.c_[o, Z_T]
    # print(Y.T[:1])
    tree = BallTree(Y.T, leaf_size=3)
    dist, ind = tree.query(Y.T[:1], k=self.k + 1)
    # print(ind)   # indices of k closest neighbors
    # print(dist)  # distances to k closest neighbors
    # print(dist.sum())
    return dist.sum()
def find_target_neighbors(X, labels, K, n_classes):
    N, D = X.shape
    targets_ind = np.zeros((N, K), dtype=int)
    for i in range(n_classes):
        jj, = np.where(labels == i)  # Samples of the class i
        Xu = X[jj]
        kdt = BallTree(Xu, leaf_size=50, metric='euclidean')
        targets = kdt.query(Xu, k=K + 1, return_distance=False)
        targets_ind[jj] = jj[targets[:, 1:]]
    return targets_ind
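
# --- Illustrative sketch (not part of the original code) ---
# The k+1 trick used in find_target_neighbors above: when the query points are
# the same points the tree was built on, column 0 of the result is (barring
# exact ties) each point itself at distance 0, so it is dropped. Synthetic
# data; names are assumptions.
import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.RandomState(0)
Xu = rng.random_sample((30, 4))
tree = BallTree(Xu, leaf_size=50, metric='euclidean')
K = 3
neighbors = tree.query(Xu, k=K + 1, return_distance=False)[:, 1:]  # drop the self-match
# --- End of sketch ---
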
def get_closest_locations(data, query_lon, query_lat, query_cat=None,
                          query_subcat=None, num_locs=10):
    bt_lons = []
    bt_lats = []
    bt_indices = []
    for n, entry in enumerate(data):
        valid = True
        if query_cat is not None and not (
                query_cat.lower().strip()
                in entry["mapping"]["top_category"].lower().strip()):
            valid = False
        if query_subcat is not None and not (
                query_subcat.lower().strip()
                in entry["mapping"]["sub_category"].lower().strip()):
            valid = False
        if not valid:
            continue  # skip entries that do not match the category filters
        lon = float(entry["mapping"]["longitude"])
        lat = float(entry["mapping"]["latitude"])
        bt_lons.append(lon)
        bt_lats.append(lat)
        bt_indices.append(n)
    bt_lons = np.array(bt_lons)
    bt_lats = np.array(bt_lats)
    bt_indices = np.array(bt_indices)
    num_locs = min(num_locs, len(bt_indices))
    if num_locs == 0:
        return []
    records = pd.DataFrame(data={'lon': bt_lons, 'lat': bt_lats, 'index': bt_indices})
    bt = BallTree(np.deg2rad(records[['lat', 'lon']].values), metric='haversine')
    distances, indices = bt.query(np.deg2rad(np.c_[query_lat, query_lon]), num_locs)
    data_indices = bt_indices[indices[0]].tolist()
    return data_indices
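
# --- Illustrative sketch (not part of the original code) ---
# The haversine query above expects (lat, lon) in radians and returns
# great-circle distances in radians; multiplying by the Earth's mean radius
# (~6371 km) converts them to kilometres. The city coordinates below are
# only an example, not from the original data.
import numpy as np
from sklearn.neighbors import BallTree

coords_deg = np.array([[40.7128, -74.0060],   # New York
                       [34.0522, -118.2437],  # Los Angeles
                       [41.8781, -87.6298]])  # Chicago
tree = BallTree(np.deg2rad(coords_deg), metric='haversine')
dist_rad, ind = tree.query(np.deg2rad([[42.3601, -71.0589]]), k=2)  # query: Boston
dist_km = dist_rad * 6371.0
# --- End of sketch ---
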
def test_ball_tree_kde(kernel, h, rtol, atol, breadth_first, n_samples=100,
                       n_features=3):
    rng = np.random.RandomState(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    bt = BallTree(X, leaf_size=10)

    dens_true = compute_kernel_slow(Y, X, kernel, h)

    dens = bt.kernel_density(Y, h, atol=atol, rtol=rtol, kernel=kernel,
                             breadth_first=breadth_first)
    assert_allclose(dens, dens_true, atol=atol, rtol=max(rtol, 1e-7))
def test_ball_tree_query_metrics(metric):
    rng = check_random_state(0)
    if metric in BOOLEAN_METRICS:
        X = rng.random_sample((40, 10)).round(0)
        Y = rng.random_sample((10, 10)).round(0)
    elif metric in DISCRETE_METRICS:
        X = (4 * rng.random_sample((40, 10))).round(0)
        Y = (4 * rng.random_sample((10, 10))).round(0)

    k = 5

    bt = BallTree(X, leaf_size=1, metric=metric)
    dist1, ind1 = bt.query(Y, k)
    dist2, ind2 = brute_force_neighbors(X, Y, k, metric)
    assert_array_almost_equal(dist1, dist2)
def test_gaussian_kde(n_samples=1000):
    # Compare gaussian KDE results to scipy.stats.gaussian_kde
    from scipy.stats import gaussian_kde

    rng = check_random_state(0)
    x_in = rng.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    for h in [0.01, 0.1, 1]:
        bt = BallTree(x_in[:, None])
        gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in))

        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_array_almost_equal(dens_bt, dens_gkde, decimal=3)
def test_bad_pyfunc_metric():
    def wrong_returned_value(x, y):
        return "1"

    def one_arg_func(x):
        return 1.0  # pragma: no cover

    X = np.ones((5, 2))
    msg = "Custom distance function must accept two vectors and return a float."
    with pytest.raises(TypeError, match=msg):
        BallTree(X, metric=wrong_returned_value)

    msg = "takes 1 positional argument but 2 were given"
    with pytest.raises(TypeError, match=msg):
        BallTree(X, metric=one_arg_func)
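
# --- Illustrative sketch (not part of the original test) ---
# Counterpart to the invalid callables above: a custom metric passed to
# BallTree must accept two 1-D vectors and return a float. The function name
# below is an assumption; note that Python-level metrics are much slower than
# the built-in ones.
import numpy as np
from sklearn.neighbors import BallTree

def manhattan_pyfunc(x, y):
    # Valid custom metric: two vectors in, one float out
    return float(np.sum(np.abs(x - y)))

X = np.ones((5, 2))
tree = BallTree(X, metric=manhattan_pyfunc)
# --- End of sketch ---
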
def find_impostors(pred, labels, n_classes, no_potential_impo):
    N = len(pred)
    active = np.zeros((N, no_potential_impo), dtype=int)
    for i in range(n_classes):
        ii, = np.where(labels == i)
        pi = pred[ii]
        jj, = np.where(labels != i)
        pj = pred[jj]
        # Find the nearest neighbors using a BallTree
        kdt = BallTree(pj, leaf_size=50, metric='euclidean')
        hardest_examples = kdt.query(pi, k=no_potential_impo, return_distance=False)
        active[ii] = jj[hardest_examples]
    return active
def test_ball_tree_query_radius(n_samples=100, n_features=10):
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    bt = BallTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind = bt.query_radius([query_pt], r + eps)[0]
        i = np.where(rad <= r + eps)[0]

        ind.sort()
        i.sort()

        assert_array_almost_equal(i, ind)
def test_ball_tree_query_radius_distance(n_samples=100, n_features=10):
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    bt = BallTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind, dist = bt.query_radius([query_pt], r + eps, return_distance=True)

        ind = ind[0]
        dist = dist[0]

        d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))

        assert_array_almost_equal(d, dist)
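
# --- Illustrative sketch (not part of the original tests) ---
# query_radius, exercised by the two radius tests above, can also return only
# the number of neighbours within r via count_only=True. Synthetic data.
import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.RandomState(0)
X = rng.random_sample((100, 3))
tree = BallTree(X, leaf_size=5)
counts = tree.query_radius(X[:5], r=0.3, count_only=True)  # one count per query point
# --- End of sketch ---
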
def knn_error_score(L, x_train, y_train, x_test, y_test, k, tree_size=15):
    """
    Measures the training and testing errors of a kNN classifier implemented
    using BallTree.

    :param L: linear transformation
    :param x_train: training vectors (each row is an instance)
    :param y_train: training labels (1-D array)
    :param x_test: test vectors
    :param y_test: test labels
    :param k: number of nearest neighbors
    :return: training and testing error of the k-NN classifier.
    """
    assert y_train.ndim == 1
    assert y_test.ndim == 1
    assert x_train.shape[0] == len(y_train)
    assert x_test.shape[0] == len(y_test)
    assert isinstance(k, (int, np.int32, np.int64)) and k > 0

    if len(L) != 0:
        # L is the initial linear projection, for example PCA or LDA
        x_train = x_train @ L.T
        x_test = x_test @ L.T

    tree = BallTree(x_train, leaf_size=tree_size, metric='euclidean')
    MM = np.append(y_train, y_test).min()
    NTr, NTe = x_train.shape[0], x_test.shape[0]

    # Use the tree to compute the distance between the testing and training points
    # iTe: indices of the testing elements in the training set
    dists, iTe = tree.query(x_test, k=k, return_distance=True)
    # Labels of the testing elements in the training set
    lTe2 = LSKnn2(y_train[iTe], k, MM)
    # Compute the error for each k
    test_error = np.sum(lTe2 != np.repeat(y_test, k, axis=0), axis=1) / NTe

    # Use the tree to compute the distance between the training points,
    # querying k + 1 neighbours and dropping the self-match in column 0
    dists, iTr = tree.query(x_train, k=k + 1, return_distance=True)
    iTr = iTr[:, 1:]
    lTr2 = LSKnn2(y_train[iTr], k, MM)
    training_error = np.sum(lTr2 != np.repeat(y_train, k, axis=0), axis=1) / NTr

    return float(training_error), float(test_error)
def test_ball_tree_two_point(n_samples=100, n_features=3):
    rng = check_random_state(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    r = np.linspace(0, 1, 10)
    bt = BallTree(X, leaf_size=10)

    D = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    counts_true = [(D <= ri).sum() for ri in r]

    def check_two_point(r, dualtree):
        counts = bt.two_point_correlation(Y, r=r, dualtree=dualtree)
        assert_array_almost_equal(counts, counts_true)

    for dualtree in (True, False):
        check_two_point(r, dualtree)
def test_array_object_type():
    """Check that we do not accept object dtype array."""
    X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object)
    with pytest.raises(ValueError, match="setting an array element with a sequence"):
        BallTree(X)
def fit(self, X, y):
    if Version(sklearn_version) >= Version("1.0"):
        self._check_feature_names(X, reset=True)
    if self.metric_params is not None and 'p' in self.metric_params:
        if self.p is not None:
            warnings.warn(
                "Parameter p is found in metric_params. "
                "The corresponding parameter from __init__ "
                "is ignored.", SyntaxWarning, stacklevel=2)
        self.effective_metric_params_ = self.metric_params.copy()
        effective_p = self.metric_params["p"]
    else:
        self.effective_metric_params_ = {}
        effective_p = self.p

    if self.metric in ["minkowski"]:
        if effective_p < 1:
            raise ValueError(
                "p must be greater or equal to one for minkowski metric")
        self.effective_metric_params_["p"] = effective_p

    self.effective_metric_ = self.metric
    # For minkowski distance, use more efficient methods where available
    if self.metric == "minkowski":
        p = self.effective_metric_params_.pop("p", 2)
        if p < 1:
            raise ValueError(
                "p must be greater or equal to one for minkowski metric")
        if p == 1:
            self.effective_metric_ = "manhattan"
        elif p == 2:
            self.effective_metric_ = "euclidean"
        elif p == np.inf:
            self.effective_metric_ = "chebyshev"
        else:
            self.effective_metric_params_["p"] = p

    if self.metric == "manhattan":
        self.p = 1

    if not isinstance(X, (KDTree, BallTree, sklearn_NeighborsBase)):
        self._fit_X = _check_array(
            X, dtype=[np.float64, np.float32], accept_sparse=True)
        self.n_samples_fit_ = _num_samples(self._fit_X)
        self.n_features_in_ = _num_features(self._fit_X)

        if self.algorithm == "auto":
            # A tree approach is better for small number of neighbors or small
            # number of features, with KDTree generally faster when available
            is_n_neighbors_valid_for_brute = self.n_neighbors is not None and \
                self.n_neighbors >= self._fit_X.shape[0] // 2
            if self._fit_X.shape[1] > 15 or is_n_neighbors_valid_for_brute:
                self._fit_method = "brute"
            else:
                if self.effective_metric_ in VALID_METRICS["kd_tree"]:
                    self._fit_method = "kd_tree"
                elif callable(self.effective_metric_) or \
                        self.effective_metric_ in VALID_METRICS["ball_tree"]:
                    self._fit_method = "ball_tree"
                else:
                    self._fit_method = "brute"
        else:
            self._fit_method = self.algorithm

    if hasattr(self, '_onedal_estimator'):
        delattr(self, '_onedal_estimator')
    # To cover test case when we pass patched
    # estimator as an input for other estimator
    if isinstance(X, sklearn_NeighborsBase):
        self._fit_X = X._fit_X
        self._tree = X._tree
        self._fit_method = X._fit_method
        self.n_samples_fit_ = X.n_samples_fit_
        self.n_features_in_ = X.n_features_in_
        if hasattr(X, '_onedal_estimator'):
            if self._fit_method == "ball_tree":
                X._tree = BallTree(
                    X._fit_X,
                    self.leaf_size,
                    metric=self.effective_metric_,
                    **self.effective_metric_params_,
                )
            elif self._fit_method == "kd_tree":
                X._tree = KDTree(
                    X._fit_X,
                    self.leaf_size,
                    metric=self.effective_metric_,
                    **self.effective_metric_params_,
                )
            elif self._fit_method == "brute":
                X._tree = None
            else:
                raise ValueError(
                    "algorithm = '%s' not recognized" % self.algorithm)
    elif isinstance(X, BallTree):
        self._fit_X = X.data
        self._tree = X
        self._fit_method = 'ball_tree'
        self.n_samples_fit_ = X.data.shape[0]
        self.n_features_in_ = X.data.shape[1]
    elif isinstance(X, KDTree):
        self._fit_X = X.data
        self._tree = X
        self._fit_method = 'kd_tree'
        self.n_samples_fit_ = X.data.shape[0]
        self.n_features_in_ = X.data.shape[1]

    dispatch(self, 'neighbors.KNeighborsClassifier.fit', {
        'onedal': self.__class__._onedal_fit,
        'sklearn': sklearn_KNeighborsClassifier.fit,
    }, X, y)
    return self
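
# --- Illustrative sketch (not part of the original code) ---
# A generic scikit-learn usage example of the kind of classifier whose fit()
# is shown above (it appears to be a patched KNeighborsClassifier with the
# same public interface as stock scikit-learn; this is an assumption). With
# metric="minkowski" and p=2 the effective metric resolves to "euclidean",
# and algorithm="ball_tree" forces the BallTree backend. Synthetic data.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.RandomState(0)
X = rng.random_sample((200, 5))
y = (X[:, 0] > 0.5).astype(int)

clf = KNeighborsClassifier(n_neighbors=3, algorithm="ball_tree",
                           metric="minkowski", p=2)
clf.fit(X, y)
print(clf.predict(X[:5]))
# --- End of sketch ---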