Example #1
import math

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import BallTree


def neighbor_cluster(AMDs_train, AMDs_test, energy_train, energy_test, metric):
    """Predict each test energy as the mean energy of its N nearest training samples."""
    N = 15
    if metric not in ("chebyshev", "euclidean", "minkowski", "manhattan"):
        return
    tree = BallTree(AMDs_train, metric=metric)

    dist, inds = tree.query(AMDs_test, k=N)
    # Average the energies of the N nearest training neighbors
    energy_test_pred = [np.mean([energy_train[i] for i in ind]) for ind in inds]

    fig, ax = plt.subplots()
    print("R^2 score of KNN is: ", r2_score(energy_test, energy_test_pred))
    print("RMSE of KNN is: ",
          math.sqrt(mean_squared_error(energy_test, energy_test_pred)))
    ax.scatter(energy_test, energy_test_pred)
    # Identity line: perfect predictions would fall on y = x
    ax.plot([np.min(energy_test), np.max(energy_test)],
            [np.min(energy_test), np.max(energy_test)],
            'k--',
            lw=4)
    ax.set_xlabel('Given')
    ax.set_ylabel('Predicted')
    plt.savefig('./image/knn_' + metric + '.jpg')
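A minimal invocation sketch with synthetic data (the shapes and value ranges here are illustrative assumptions, and the './image' directory must already exist for savefig to succeed):

rng = np.random.default_rng(0)
AMDs_train = rng.random((200, 100))   # hypothetical AMD feature vectors
AMDs_test = rng.random((50, 100))
energy_train = rng.random(200)
energy_test = rng.random(50)
neighbor_cluster(AMDs_train, AMDs_test, energy_train, energy_test, "euclidean")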
Example #2
def test_query_haversine():
    rng = check_random_state(0)
    X = 2 * np.pi * rng.random_sample((40, 2))
    bt = BallTree(X, leaf_size=1, metric="haversine")
    dist1, ind1 = bt.query(X, k=5)
    dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine")

    assert_array_almost_equal(dist1, dist2)
    assert_array_almost_equal(ind1, ind2)
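The haversine metric expects (latitude, longitude) pairs in radians and returns central angles, so real distances need a multiplication by the sphere's radius. A small sketch (the city coordinates are approximate):

import numpy as np
from sklearn.neighbors import BallTree

coords = np.deg2rad([[52.52, 13.405],     # Berlin (lat, lon) in degrees
                     [48.8566, 2.3522]])  # Paris
tree = BallTree(coords, metric="haversine")
dist, ind = tree.query(coords[:1], k=2)
print(dist[0, 1] * 6371.0)  # scale by Earth's radius in km -> roughly 880 km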
Example #3
def NCM(self, o, Z_T):
    """k-NN nonconformity score: the sum of distances from o to its k nearest neighbors."""
    Y = np.c_[o, Z_T]  # o and the reference points Z_T, one sample per column
    tree = BallTree(Y.T, leaf_size=3)
    # Query the first row, which is o itself; ask for k + 1 neighbors because
    # o matches itself at distance zero, leaving k true neighbors in the sum.
    dist, ind = tree.query(Y.T[:1], k=self.k + 1)
    return dist.sum()
Example #4
def find_target_neighbors(X, labels, K, n_classes):
    N, D = X.shape
    targets_ind = np.zeros((N, K), dtype=int)
    for i in range(n_classes):
        jj, = np.where(labels == i)
        # Samples of the class i
        Xu = X[jj]
        kdt = BallTree(Xu, leaf_size=50, metric='euclidean')
        # Query K + 1 neighbors: the nearest hit is the point itself, so the
        # first column is dropped to keep the K true same-class neighbors.
        targets = kdt.query(Xu, k=K + 1, return_distance=False)
        targets_ind[jj] = jj[targets[:, 1:]]

    return targets_ind
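A quick sanity check on toy data (the class count and shapes are illustrative assumptions):

import numpy as np

rng = np.random.default_rng(0)
X = rng.random((12, 4))
labels = np.repeat([0, 1, 2], 4)  # three classes, four samples each
targets = find_target_neighbors(X, labels, K=2, n_classes=3)
assert targets.shape == (12, 2)
assert np.all(labels[targets] == labels[:, None])  # neighbors share the class label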
Example #5
def get_closest_locations(data,
                          query_lon,
                          query_lat,
                          query_cat=None,
                          query_subcat=None,
                          num_locs=10):
    bt_lons = []
    bt_lats = []
    bt_indices = []

    for n, entry in enumerate(data):
        valid = True
        if (query_cat is not None and query_cat.lower().strip()
                not in entry["mapping"]["top_category"].lower().strip()):
            valid = False
        if (query_subcat is not None and query_subcat.lower().strip()
                not in entry["mapping"]["sub_category"].lower().strip()):
            valid = False

        # Skip non-matching entries; `break` here would stop the scan at the
        # first mismatch instead of filtering the rest of the data.
        if not valid:
            continue

        lon = float(entry["mapping"]["longitude"])
        lat = float(entry["mapping"]["latitude"])
        bt_lons.append(lon)
        bt_lats.append(lat)
        bt_indices.append(n)

    bt_lons = np.array(bt_lons)
    bt_lats = np.array(bt_lats)
    bt_indices = np.array(bt_indices)

    num_locs = min(num_locs, len(bt_indices))
    if num_locs == 0:
        return []

    records = pd.DataFrame(data={
        'lon': bt_lons,
        'lat': bt_lats,
        'index': bt_indices
    })

    bt = BallTree(np.deg2rad(records[['lat', 'lon']].values),
                  metric='haversine')
    distances, indices = bt.query(np.deg2rad(np.c_[query_lat, query_lon]),
                                  num_locs)

    data_indices = bt_indices[indices[0]].tolist()

    return data_indices
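The expected structure of `data` can be inferred from the field accesses above; a minimal illustrative call (the entries are made up):

data = [
    {"mapping": {"top_category": "Restaurants", "sub_category": "Pizza",
                 "longitude": "13.40", "latitude": "52.52"}},
    {"mapping": {"top_category": "Museums", "sub_category": "Art",
                 "longitude": "2.35", "latitude": "48.86"}},
]
closest = get_closest_locations(data, query_lon=13.0, query_lat=52.0,
                                query_cat="restaurants", num_locs=1)
print(closest)  # -> [0], the matching restaurant entry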
Example #6
def test_ball_tree_kde(kernel, h, rtol, atol, breadth_first, n_samples=100,
                       n_features=3):
    rng = np.random.RandomState(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    bt = BallTree(X, leaf_size=10)

    dens_true = compute_kernel_slow(Y, X, kernel, h)

    dens = bt.kernel_density(Y, h, atol=atol, rtol=rtol,
                             kernel=kernel,
                             breadth_first=breadth_first)
    assert_allclose(dens, dens_true,
                    atol=atol, rtol=max(rtol, 1e-7))
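compute_kernel_slow is a helper from the test module and is not shown here. For the gaussian kernel it plausibly amounts to a brute-force normalized KDE, roughly as sketched below; the normalization convention is an assumption, consistent with Example #8, where kernel_density(...) / n_samples is compared against a proper density:

import numpy as np

def gaussian_kernel_slow(Y, X, h):
    # Squared Euclidean distances between every query row and training row
    d2 = ((Y[:, None, :] - X[None, :, :]) ** 2).sum(-1)
    # d-dimensional gaussian normalization, summed over training points
    norm = (2 * np.pi * h ** 2) ** (-X.shape[1] / 2.0)
    return norm * np.exp(-0.5 * d2 / h ** 2).sum(-1)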
Example #7
def test_ball_tree_query_metrics(metric):
    rng = check_random_state(0)
    if metric in BOOLEAN_METRICS:
        X = rng.random_sample((40, 10)).round(0)
        Y = rng.random_sample((10, 10)).round(0)
    elif metric in DISCRETE_METRICS:
        X = (4 * rng.random_sample((40, 10))).round(0)
        Y = (4 * rng.random_sample((10, 10))).round(0)

    k = 5

    bt = BallTree(X, leaf_size=1, metric=metric)
    dist1, ind1 = bt.query(Y, k)
    dist2, ind2 = brute_force_neighbors(X, Y, k, metric)
    assert_array_almost_equal(dist1, dist2)
Example #8
def test_gaussian_kde(n_samples=1000):
    # Compare gaussian KDE results to scipy.stats.gaussian_kde
    from scipy.stats import gaussian_kde
    rng = check_random_state(0)
    x_in = rng.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    for h in [0.01, 0.1, 1]:
        bt = BallTree(x_in[:, None])
        gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in))

        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_array_almost_equal(dens_bt, dens_gkde, decimal=3)
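Two conversions make this comparison line up: scipy's gaussian_kde scales a scalar bw_method by the sample standard deviation, so passing h / np.std(x_in) gives both estimators the same bandwidth h, and BallTree.kernel_density returns a sum of kernel values rather than an average, hence the division by n_samples.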
Example #9
def test_bad_pyfunc_metric():
    def wrong_returned_value(x, y):
        return "1"

    def one_arg_func(x):
        return 1.0  # pragma: no cover

    X = np.ones((5, 2))
    msg = "Custom distance function must accept two vectors and return a float."
    with pytest.raises(TypeError, match=msg):
        BallTree(X, metric=wrong_returned_value)

    msg = "takes 1 positional argument but 2 were given"
    with pytest.raises(TypeError, match=msg):
        BallTree(X, metric=one_arg_func)
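For contrast, a callable that does satisfy the contract (two 1-D vectors in, one float out) is accepted as a custom metric, though it runs far slower than the built-in metric strings:

import numpy as np
from sklearn.neighbors import BallTree

def manhattan(x, y):
    # Valid pyfunc metric: takes two vectors, returns a float
    return float(np.abs(x - y).sum())

X = np.random.random((20, 3))
tree = BallTree(X, metric=manhattan)
dist, ind = tree.query(X[:1], k=3)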
Example #10
def find_impostors(pred, labels, n_classes, no_potential_impo):
    N = len(pred)
    active = np.zeros((N, no_potential_impo), dtype=int)
    for i in range(n_classes):
        ii, = np.where(labels == i)
        pi = pred[ii]
        jj, = np.where(labels != i)
        pj = pred[jj]
        # For each class-i point, its nearest differently-labeled neighbors
        # (found with a BallTree) are its potential impostors
        kdt = BallTree(pj, leaf_size=50, metric='euclidean')
        hardest_examples = kdt.query(pi,
                                     k=no_potential_impo,
                                     return_distance=False)
        active[ii] = jj[hardest_examples]

    return active
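A small consistency check (shapes are illustrative): every returned impostor must carry a different label than its anchor point.

import numpy as np

rng = np.random.default_rng(0)
pred = rng.random((9, 2))
labels = np.repeat([0, 1, 2], 3)
active = find_impostors(pred, labels, n_classes=3, no_potential_impo=2)
assert not np.any(labels[active] == labels[:, None])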
Example #11
def test_ball_tree_query_radius(n_samples=100, n_features=10):
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    bt = BallTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt)**2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind = bt.query_radius([query_pt], r + eps)[0]
        i = np.where(rad <= r + eps)[0]

        ind.sort()
        i.sort()

        assert_array_almost_equal(i, ind)
Example #12
def test_ball_tree_query_radius_distance(n_samples=100, n_features=10):
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    bt = BallTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt)**2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind, dist = bt.query_radius([query_pt], r + eps, return_distance=True)

        ind = ind[0]
        dist = dist[0]

        d = np.sqrt(((query_pt - X[ind])**2).sum(1))

        assert_array_almost_equal(d, dist)
Example #13
def knn_error_score(L, x_train, y_train, x_test, y_test, k, tree_size=15):
    """
    Measures the training and testing errors of a kNN classifier implemented using BallTree.
    :param L: linear transformation
    :param x_train: training vectors (each column is an instance)
    :param y_train: training labels  (row vector!!)
    :param x_test: test vectors
    :param y_test: test labels
    :param k: number of nearest neighbors
    :return: training and testing error in k-NN problem.
    """
    assert y_train.ndim == 1, y_test.ndim == 1
    assert x_train.shape[0] == len(y_train)
    assert x_test.shape[0] == len(y_test)
    assert isinstance(k, (int, np.int32, np.int64)) and k > 0

    if len(L) != 0:
        # L is an initial linear projection, e.g. from PCA or LDA
        x_train = x_train @ L.T
        x_test = x_test @ L.T

    tree = BallTree(x_train, leaf_size=tree_size, metric='euclidean')

    MM = np.append(y_train, y_test).min()
    NTr, NTe = x_train.shape[0], x_test.shape[0]

    # Use the tree to compute the distance between the testing and training points
    # iTe: indices of the testing elements in the training set
    dists, iTe = tree.query(x_test, k=k, return_distance=True)

    # Labels of the testing elements in the training set
    lTe2 = LSKnn2(y_train[iTe], k, MM)
    # Compute the error for each k
    test_error = np.sum(lTe2 != np.repeat(y_test, k, axis=0), axis=1) / NTe

    # Use the tree to compute the distance between the training points
    dists, iTr = tree.query(x_train, k=k + 1, return_distance=True)
    iTr = iTr[:, 1:]
    lTr2 = LSKnn2(y_train[iTr], k, MM)
    training_error = np.sum(lTr2 != np.repeat(y_train, k, axis=0), axis=1) / NTr

    return float(training_error), float(test_error)
Example #14
def test_ball_tree_two_point(n_samples=100, n_features=3):
    rng = check_random_state(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    r = np.linspace(0, 1, 10)
    bt = BallTree(X, leaf_size=10)

    D = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    counts_true = [(D <= ri).sum() for ri in r]

    def check_two_point(r, dualtree):
        counts = bt.two_point_correlation(Y, r=r, dualtree=dualtree)
        assert_array_almost_equal(counts, counts_true)

    for dualtree in (True, False):
        check_two_point(r, dualtree)
Example #15
def test_array_object_type():
    """Check that we do not accept object dtype array."""
    X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object)
    with pytest.raises(ValueError,
                       match="setting an array element with a sequence"):
        BallTree(X)
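The rows above have different lengths (3, 2, and 4 elements), so NumPy can only hold them as an object array; BallTree needs a rectangular 2-D float array, and the coercion attempt surfaces NumPy's "setting an array element with a sequence" error.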
Example #16
    def fit(self, X, y):
        if Version(sklearn_version) >= Version("1.0"):
            self._check_feature_names(X, reset=True)
        if self.metric_params is not None and 'p' in self.metric_params:
            if self.p is not None:
                warnings.warn(
                    "Parameter p is found in metric_params. "
                    "The corresponding parameter from __init__ "
                    "is ignored.",
                    SyntaxWarning,
                    stacklevel=2)
            self.effective_metric_params_ = self.metric_params.copy()
            effective_p = self.metric_params["p"]
        else:
            self.effective_metric_params_ = {}
            effective_p = self.p

        if self.metric in ["minkowski"]:
            if effective_p < 1:
                raise ValueError(
                    "p must be greater or equal to one for minkowski metric")
            self.effective_metric_params_["p"] = effective_p

        self.effective_metric_ = self.metric
        # For minkowski distance, use more efficient methods where available
        if self.metric == "minkowski":
            p = self.effective_metric_params_.pop("p", 2)
            if p < 1:
                raise ValueError(
                    "p must be greater or equal to one for minkowski metric")
            if p == 1:
                self.effective_metric_ = "manhattan"
            elif p == 2:
                self.effective_metric_ = "euclidean"
            elif p == np.inf:
                self.effective_metric_ = "chebyshev"
            else:
                self.effective_metric_params_["p"] = p

        if self.metric == "manhattan":
            self.p = 1

        if not isinstance(X, (KDTree, BallTree, sklearn_NeighborsBase)):
            self._fit_X = _check_array(X,
                                       dtype=[np.float64, np.float32],
                                       accept_sparse=True)
            self.n_samples_fit_ = _num_samples(self._fit_X)
            self.n_features_in_ = _num_features(self._fit_X)

            if self.algorithm == "auto":
                # A tree approach is better for small number of neighbors or small
                # number of features, with KDTree generally faster when available
                is_n_neighbors_valid_for_brute = self.n_neighbors is not None and \
                    self.n_neighbors >= self._fit_X.shape[0] // 2
                if self._fit_X.shape[1] > 15 or is_n_neighbors_valid_for_brute:
                    self._fit_method = "brute"
                else:
                    if self.effective_metric_ in VALID_METRICS["kd_tree"]:
                        self._fit_method = "kd_tree"
                    elif callable(self.effective_metric_) or \
                        self.effective_metric_ in \
                            VALID_METRICS["ball_tree"]:
                        self._fit_method = "ball_tree"
                    else:
                        self._fit_method = "brute"
            else:
                self._fit_method = self.algorithm

        if hasattr(self, '_onedal_estimator'):
            delattr(self, '_onedal_estimator')
        # To cover test case when we pass patched
        # estimator as an input for other estimator
        if isinstance(X, sklearn_NeighborsBase):
            self._fit_X = X._fit_X
            self._tree = X._tree
            self._fit_method = X._fit_method
            self.n_samples_fit_ = X.n_samples_fit_
            self.n_features_in_ = X.n_features_in_
            if hasattr(X, '_onedal_estimator'):
                if self._fit_method == "ball_tree":
                    X._tree = BallTree(
                        X._fit_X,
                        self.leaf_size,
                        metric=self.effective_metric_,
                        **self.effective_metric_params_,
                    )
                elif self._fit_method == "kd_tree":
                    X._tree = KDTree(
                        X._fit_X,
                        self.leaf_size,
                        metric=self.effective_metric_,
                        **self.effective_metric_params_,
                    )
                elif self._fit_method == "brute":
                    X._tree = None
                else:
                    raise ValueError("algorithm = '%s' not recognized" %
                                     self.algorithm)

        elif isinstance(X, BallTree):
            self._fit_X = X.data
            self._tree = X
            self._fit_method = 'ball_tree'
            self.n_samples_fit_ = X.data.shape[0]
            self.n_features_in_ = X.data.shape[1]

        elif isinstance(X, KDTree):
            self._fit_X = X.data
            self._tree = X
            self._fit_method = 'kd_tree'
            self.n_samples_fit_ = X.data.shape[0]
            self.n_features_in_ = X.data.shape[1]

        dispatch(
            self, 'neighbors.KNeighborsClassifier.fit', {
                'onedal': self.__class__._onedal_fit,
                'sklearn': sklearn_KNeighborsClassifier.fit,
            }, X, y)
        return self
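The Minkowski remapping above (p=1 -> manhattan, p=2 -> euclidean, p=inf -> chebyshev) is easy to confirm with DistanceMetric; a quick sketch for the finite cases (note that DistanceMetric moved from sklearn.neighbors to sklearn.metrics in recent sklearn versions):

import numpy as np
from sklearn.metrics import DistanceMetric

X = np.random.random((5, 3))
for p, name in [(1, "manhattan"), (2, "euclidean")]:
    d_mink = DistanceMetric.get_metric("minkowski", p=p).pairwise(X)
    d_named = DistanceMetric.get_metric(name).pairwise(X)
    np.testing.assert_allclose(d_mink, d_named)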