Пример #1
0
def test_warning_flag(n_samples=100, n_features=3, k=3):
    """Check that ``warning_flag`` is raised only for ambiguous distance ties.

    Duplicating the k-th (furthest returned) neighbor must set the flag.
    Duplicating the closest neighbor must not, as long as ``k > 1``.
    """
    X = rng.random_sample(size=(n_samples, n_features))
    q = rng.random_sample(size=n_features)

    # Reference query against every point except the last one.
    tree = neighbors.BallTree(X[:-1], leaf_size=5)
    _, nbr_idx = tree.query(q, k=k)

    # Overwrite the last point with a copy of the furthest returned neighbor;
    # the tie at rank k is expected to set warning_flag.
    X[-1:] = X[nbr_idx[0, k - 1]]
    tree = neighbors.BallTree(X, leaf_size=5)
    _, nbr_idx = tree.query(q, k=k)
    assert tree.warning_flag

    # The second scenario is skipped when only one neighbor is requested.
    if k <= 1:
        return

    # A copy of the closest neighbor ties at rank 1, which leaves the set of
    # k results unambiguous, so no warning is expected.
    X[-1:] = X[nbr_idx[0, 0]]
    tree = neighbors.BallTree(X, leaf_size=5)
    tree.query(q, k=k)
    assert not tree.warning_flag
Пример #2
0
 def fits_coordinate_to_accident_nodes_within_its_radius(self,police_data):
     """Write ``AccRadius.csv`` with one row per (junction, nearby accident).

     For every name in ``police_data`` the file ``acc/<name>.csv`` is read
     twice. One copy is trimmed to the columns fed to the BallTree, the
     other to the columns copied into the output. Rows with a missing ``X``
     value are dropped from both copies, so both keep the same row order.
     Each row of ``junctions.csv`` is then matched against every tree point
     within ``r=1000``.
     """
     rows = [['x', 'y', 'year', 'month', 'day', 'humra']]
     for acc_name in police_data:
         csv_path = 'acc/' + acc_name + '.csv'
         details = pd.read_csv(csv_path)
         coords = pd.read_csv(csv_path)

         # Tree input: drop the block of columns 'pk_teuna_fikt'..'STATUS_IGUN'.
         coords.drop(coords.loc[:, 'pk_teuna_fikt':'STATUS_IGUN'].columns, axis=1, inplace=True)
         coords['X'].replace('', np.nan, inplace=True)
         coords.dropna(subset=['X'], inplace=True)
         coords = coords.values

         # Output source: drop three column ranges, keep the rest.
         for first, last in (('sug_tik', 'YEHIDA'),
                             ('SHAA', 'RAMZOR'),
                             ('SUG_TEUNA', 'STATUS_IGUN')):
             details.drop(details.loc[:, first:last].columns, axis=1, inplace=True)
         details['X'].replace('', np.nan, inplace=True)
         details.dropna(subset=['X'], inplace=True)
         details = details.values

         junctions = pd.read_csv("junctions.csv").values
         tree = neighbors.BallTree(coords, leaf_size=2)
         for junction in junctions:
             nearby = tree.query_radius([junction], r=1000)[0]
             for j in nearby:
                 accident = details[j]
                 # Columns 1-4 of the trimmed table land under the
                 # year/month/day/humra headers; the last is stored as 4 - value.
                 rows.append([junction[0], junction[1],
                              int(accident[1]), int(accident[2]),
                              int(accident[3]), int(4 - accident[4])])
     with open("AccRadius.csv", "w", newline="") as f:
         csv.writer(f).writerows(rows)
Пример #3
0
    def _kNN__ball_tree(self, d):
        """Build a BallTree over the manifold columns and append kNN distances.

        The defaults ``leaf_size=40`` and ``metric='minkowski'`` are overlaid
        with ``d['neighbors']['args']``. The merged settings are written back
        into that mapping. The distances to the ``self.kNN`` nearest rows are
        appended to ``self.data.df`` as columns ``NN_0`` .. ``NN_<kNN-1>``.
        """
        user_args = d['neighbors']['args']

        kwargs = OrderedDict([('leaf_size', 40), ('metric', 'minkowski')])
        kwargs.update(user_args.items())

        # Record the settings actually used, defaults included.
        for name, value in kwargs.items():
            user_args[name] = value

        manifold = self.data.df[self.manifold_names]
        self.kNN_tree = neighbors.BallTree(manifold, **kwargs)

        self.kNN_names = ['NN_{}'.format(i) for i in range(self.kNN)]

        distances = self.kNN_tree.query(X=manifold,
                                        k=self.kNN,
                                        return_distance=True)[0]
        distance_frame = pd.DataFrame(data=distances, columns=self.kNN_names)
        self.data.df = pd.concat([self.data.df, distance_frame], axis=1)
def dist(i, arr):
    """Return the distances from row ``i`` of ``arr`` to its 2 nearest rows.

    The tree is built from ``arr`` itself, so the query row is among the
    candidates. The result is the distance array returned by
    ``BallTree.query`` for a single query point with ``k=2``.
    """
    ball = neighbors.BallTree(arr, leaf_size=2)
    distances, _ = ball.query(arr[[i]], k=2)
    return distances
Пример #5
0
  def add_feature_to_coordinates(self,feature,type):
      """Aggregate feature points around every junction and merge into ``Final``.

      For each name in ``feature`` the file ``features/<name>.csv`` is loaded
      and a BallTree is queried around every row of ``junctions.csv`` with
      radius ``self.radius * 1000``. The per-junction value depends on
      ``type``:

      * ``'count'`` - number of feature rows within the radius;
      * ``'sum'``   - sum of column 2 of the feature rows within the radius;
      * ``'avg'``   - that sum divided by the count (0 when nothing is near).

      The resulting column is merged into the module-level ``Final`` frame on
      ``('x', 'y')`` and min-max scaled to three decimals.
      """
      global Final
      for file in feature:
          X = pd.read_csv('features/' + file + '.csv').values
          Y = pd.read_csv("junctions.csv").values
          New = [['x', 'y', file]]

          if type == 'count':
              ball_tree = neighbors.BallTree(X, leaf_size=2)
          if type == 'sum' or type == 'avg':
              # Column 2 holds the value being aggregated, so it is removed
              # before building the tree. Row order is unchanged, so tree
              # indices still address rows of X.
              Z = pd.read_csv('features/' + file + '.csv')
              Z.drop(Z.columns[[2]], axis=1, inplace=True)
              ball_tree = neighbors.BallTree(Z, leaf_size=2)

          search_radius = self.radius * 1000
          # Fallback row, kept for an unrecognised `type` (as before).
          x = [Y[0][0], Y[0][1], 0]
          for i in range(len(Y)):
              if type == 'count' or type == 'avg':
                  count = int(ball_tree.query_radius([Y[i]], r=search_radius,
                                                     count_only=True)[0])
                  x = [Y[i][0], Y[i][1], count]
              if type == 'sum' or type == 'avg':
                  ind = ball_tree.query_radius([Y[i]], r=search_radius)[0]
                  # Fix: sum the rows the query returned. The previous code
                  # indexed X with 0..len(ind)-1, i.e. always the first rows.
                  total = 0
                  for idx in ind:
                      total = total + X[idx][2]
                  if type == 'sum':
                      x = [Y[i][0], Y[i][1], total]
                  if type == 'avg':
                      if count == 0:
                          x = [Y[i][0], Y[i][1], 0]
                      else:
                          x = [Y[i][0], Y[i][1], total / count]

              New.append(x)

          # Round-trip through a temporary CSV before merging, as before.
          with open("out" + file + '.csv', "w", newline="") as f:
              writer = csv.writer(f)
              writer.writerows(New)
          b = pd.read_csv("out" + file + '.csv')
          Final = Final.merge(b, on=('x', 'y'))
          lo, hi = Final[file].min(), Final[file].max()
          # NOTE(review): a constant column (hi == lo) divides by zero here.
          Final[file] = round((Final[file] - lo) / (hi - lo), 3)
          os.remove("out" + file + '.csv')
def find_knn(curr_matrix, ref_matrix, knn):
    """Return flattened indices of the ``knn`` nearest reference rows.

    A BallTree is built on ``ref_matrix`` and queried with every row of
    ``curr_matrix``. The per-row index lists are concatenated into one
    integer array of length ``curr_matrix.shape[0] * knn``.
    """
    tree = sk_neighbors.BallTree(ref_matrix, leaf_size=knn)
    neighbor_rows = tree.query(curr_matrix, k=knn, return_distance=False)
    flat = neighbor_rows.ravel()
    return flat.astype(int)
Пример #7
0
    def ball_tree_for_one_image(self):
        """Time a BallTree build and a single query, printing both durations.

        The tree is built from the training images and queried with the first
        test image reshaped to a single row. Nothing is returned.
        """
        # NOTE(review): `expected` and `predicted` are computed but never
        # compared or returned.
        expected = self.__test_label_list[0]

        # NOTE(review): time.clock() was removed in Python 3.8; it is kept
        # because no replacement exists on Python 2.
        start = time.clock()
        tree = sn.BallTree(self.__train_image_list)
        # Parenthesised single-argument print is valid on Python 2 and 3.
        print(time.clock() - start)

        start = time.clock()
        query_image = self.__test_image_list[0]
        predicted = tree.query(query_image.reshape(1, len(query_image)))
        print(time.clock() - start)
Пример #8
0
def test_ball_tree_pickle():
    """Round-trip a BallTree through pickle protocols 0-2 and compare queries."""
    import pickle
    X = rng.random_sample(size=(10, 3))
    original = neighbors.BallTree(X, leaf_size=1)
    # query() returns (distances, indices). The first element is compared
    # exactly and the second approximately, as the test has always done.
    dist_ref, ind_ref = original.query(X)
    for protocol in (0, 1, 2):
        restored = pickle.loads(pickle.dumps(original, protocol=protocol))
        dist_out, ind_out = restored.query(X)
        assert np.all(dist_ref == dist_out)
        assert_array_almost_equal(ind_ref, ind_out)
Пример #9
0
def test_ball_tree_p_distance():
    """Compare BallTree p-norm distances with scipy's cKDTree for several p."""
    X = rng.random_sample(size=(100, 5))

    for p in (1, 2, 3, 4, np.inf):
        ball_dist, _ = neighbors.BallTree(X, leaf_size=10, p=p).query(X, k=5)
        kd_dist, _ = cKDTree(X, leafsize=10).query(X, k=5, p=p)

        # Only the distances are compared, not the neighbor indices.
        assert_array_almost_equal(ball_dist, kd_dist)
Пример #10
0
def classify_nearest_neighbor_ball_tree(k):
    """Classify songs by genre with a k-nearest-neighbor vote and print accuracy.

    The first half of each genre's songs (as returned by ``load_labels()``)
    is indexed sample-by-sample in a BallTree. Each song in the second half
    is split into 5 sections. The column-wise mean of every section is
    queried for its ``k`` nearest indexed samples, and the most frequent
    genre among all returned neighbors is the prediction.

    Parameters
    ----------
    k : int
        Number of neighbors requested per section.
    """
    labels = load_labels()

    song_samples = []
    indexed_genres = []

    for genre, song_genres_ids in labels.groupby('category'):
        print('Indexing genre: {}'.format(genre))
        rows = song_genres_ids.values
        for row in rows[:len(rows) // 2]:
            song = pd.read_csv('song_data/training/{}'.format(row[0]),
                               header=None)
            for sample in song.values:
                song_samples.append(sample)
                indexed_genres.append(genre)

    ball_tree = nb.BallTree(np.vstack(song_samples))

    total_count = 0
    match_count = 0
    for expected_genre, song_genres_ids in labels.groupby('category'):
        print('Expected genre: {}'.format(expected_genre))
        rows = song_genres_ids.values
        for row in rows[len(rows) // 2:]:
            song = pd.read_csv('song_data/training/{}'.format(row[0]),
                               header=None)
            genre_freqs = {}

            # Split song into sections and vote with each section's mean.
            for section in np.array_split(song, 5, axis=0):
                # Explicit axis: np.mean(DataFrame) no longer reliably means
                # "per column" across pandas versions.
                section_mean = section.mean(axis=0)
                neighbor_indices = ball_tree.query([section_mean],
                                                   k,
                                                   return_distance=False)
                for index in neighbor_indices[0]:
                    # Fix: use a separate name so the expected genre of the
                    # outer loop is not overwritten by the neighbor's genre.
                    neighbor_genre = indexed_genres[index]
                    genre_freqs[neighbor_genre] = (
                        genre_freqs.get(neighbor_genre, 0) + 1)

            predicted_genre = max(genre_freqs, key=genre_freqs.get)
            print('Predicted genre: {}'.format(predicted_genre))
            total_count += 1
            if expected_genre == predicted_genre:
                match_count += 1

    # Guard against an empty evaluation set instead of dividing by zero.
    accuracy = (match_count / total_count) * 100 if total_count else 0.0
    print('Matched {} out of {} songs: {}%'.format(
        match_count, total_count, accuracy))
Пример #11
0
def find_clusters(geos, tol):
    """Greedily pick cluster centers so each point is within ``tol`` of one.

    Points are visited in order. An unvisited point becomes a center, and
    every point within the haversine radius ``tol / 6371.0`` of it is marked
    as visited. Each returned row is ``[index, geos[index][0],
    geos[index][1]]``.
    """
    # NOTE(review): 6371.0 looks like the Earth radius in km, which would
    # make `tol` kilometres - confirm with callers.
    hav_tol = tol / 6371.0
    visited = [False] * len(geos)
    tree = nn.BallTree(np.radians(geos), metric="haversine")
    centers = []
    for i in xrange(len(geos)):
        if visited[i]:
            continue
        loc = geos[i]
        centers.append(np.array([i, loc[0], loc[1]]))
        for member in tree.query_radius([np.radians(loc)], hav_tol)[0]:
            visited[member] = True
    return np.array(centers)
Пример #12
0
def test_ball_tree_query_radius(n_samples=100, n_features=10):
    """Check ``query_radius`` indices against a brute-force distance filter."""
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    # Small slack so round-off at the boundary cannot fail the test.
    eps = 1E-15
    tree = neighbors.BallTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt)**2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        found = np.sort(tree.query_radius(query_pt, r + eps)[0])
        expected = np.sort(np.where(rad <= r + eps)[0])

        assert np.all(expected == found)
Пример #13
0
def test_ball_tree_query_radius_distance(n_samples=100, n_features=10):
    """Check distances from ``query_radius`` against directly computed ones."""
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    # Small slack so round-off at the boundary cannot fail the test.
    eps = 1E-15
    tree = neighbors.BallTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt)**2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        all_ind, all_dist = tree.query_radius(query_pt, r + eps,
                                              return_distance=True)
        hits = all_ind[0]

        # Recompute the Euclidean distance to each returned point.
        expected = np.sqrt(((query_pt - X[hits])**2).sum(1))

        assert_array_almost_equal(expected, all_dist[0])
Пример #14
0
def test_unsupervised_inputs():
    """Check that NearestNeighbors can be fit from several input types.

    Fitting from a fitted NearestNeighbors, a BallTree or a KDTree must give
    the same 1-neighbor result as fitting from the raw data.
    """
    X = rng.random_sample((10, 3))

    reference = neighbors.NearestNeighbors(n_neighbors=1)
    reference.fit(X)
    dist1, ind1 = reference.kneighbors(X)

    nbrs = neighbors.NearestNeighbors(n_neighbors=1)

    sources = (reference, neighbors.BallTree(X), neighbors.KDTree(X))
    for source in sources:
        nbrs.fit(source)
        dist2, ind2 = nbrs.kneighbors(X)

        assert_array_almost_equal(dist1, dist2)
        assert_array_almost_equal(ind1, ind2)
Пример #15
0
def LaplacianEigenmaps(data, numNeigh=5, heatKernel=False, heatSigma=1.0):
	'''Return eigenvectors of a graph Laplacian built from ``data``.

	W is weight/distance/kernel, D is diagonal, L is the Laplacian.

	Parameters
	----------
	data : array-like, one observation per row (passed to ``pdist``).
	numNeigh : threshold applied to the argsort of the distance matrix.
	heatKernel : if True, apply the heat-kernel branch instead of the
		binary symmetrisation.
	heatSigma : bandwidth used by the heat-kernel branch.

	Returns
	-------
	Eigenvectors (as columns) whose eigenvalue exceeds 0.01, taken from the
	smallest-magnitude eigenpairs computed by ``eigsh``.
	'''
	# The pairwise distances are used only through their per-row ordering.
	pairDists = distance.squareform(distance.pdist(data))
	inds = np.argsort(pairDists, 1)
	# NOTE(review): `inds` holds point indices in sorted order, not ranks, so
	# this marks positions whose sorted index is <= numNeigh. Kept as is.
	W = inds <= numNeigh

	if not heatKernel:
		# Binary representation
		W = np.maximum(W, W.T)
	else:
		# Heat kernel based on distances. Ranges between 0-1
		# NOTE(review): W is boolean here, so the in-place division below
		# looks like it cannot succeed - confirm before relying on it.
		W = W**2
		W /= np.max(np.max(W))
		W = np.maximum(W, W.T)
		W[W!=0] = np.exp(-W[W!=0] / (2*heatSigma**2))

	diag_ = np.diag(np.sum(W,1))

	#Calc Laplacian
	L = diag_-W
	# Builtin float replaces the np.float alias, which was removed in
	# NumPy 1.24; both name the same dtype.
	vals, vecs = sparse.linalg.eigsh(np.asarray(L, dtype=float), which='SM') # "LM"

	# Only keep positive eigenvals
	posInds = np.nonzero(vals>0.01)[0]
	posVecs = vecs[:,posInds]

	return posVecs
Пример #16
0
def compute_spec_LLE(n_neighbors=10, out_dim=3):
    """Project the module-level ``spec`` with modified LLE and drop outliers.

    After the embedding, each point's distance to the last of its
    ``n_neighbors`` nearest neighbors is mean-centred. Points whose centred
    distance exceeds 0.25 standard deviations are removed.

    Returns the filtered embedding and the matching entries of the
    module-level ``color``.
    """
    # Compute the LLE projection
    LLE = manifold.LocallyLinearEmbedding(n_neighbors, out_dim,
                                          method='modified',
                                          eigen_solver='dense')
    Y_LLE = LLE.fit_transform(spec)
    # Parenthesised single-argument print is valid on Python 2 and 3.
    print(" - finished LLE projection")

    # remove outliers for the plot
    BT = neighbors.BallTree(Y_LLE)
    dist, ind = BT.query(Y_LLE, n_neighbors)
    dist_to_n = dist[:, -1]
    dist_to_n -= dist_to_n.mean()
    std = np.std(dist_to_n)
    flag = (dist_to_n > 0.25 * std)
    print(" - removing %i outliers for plot" % flag.sum())

    return Y_LLE[~flag], color[~flag]
Пример #17
0
    def __init__(self, table, filename=None, rakey="RA", deckey="DEC"):
        """Build, or load from cache, a haversine BallTree over sky positions.

        Parameters
        ----------
        table : mapping indexable by ``rakey`` and ``deckey``; values are
            converted from degrees to radians.
        filename : optional pickle cache path. If the file exists the tree
            is loaded from it; otherwise the tree is built and written there.
        rakey, deckey : keys of the right-ascension and declination columns.
        """
        # Prep data: rows of (dec, ra) in radians.
        self.table = table
        x = np.array(np.deg2rad([self.table[deckey],
                                 self.table[rakey]])).transpose()
        x = x.reshape(-1, 2)

        cache_exists = filename is not None and os.path.exists(filename)
        if cache_exists:
            # NOTE: unpickling runs arbitrary code; only use trusted caches.
            with open(filename, "rb") as cache_file:
                self.rdtree = pickle.load(cache_file)
        else:
            self.rdtree = skn.BallTree(
                x,
                metric="haversine",
            )
            # No cache file yet: write one if a path was given.
            if filename is not None:
                with open(filename, "wb") as cache_file:
                    pickle.dump(self.rdtree, cache_file)
Пример #18
0
 def __init__(self, k_value, points, labels):
     '''
     Store the outlier-detection inputs and index the points in a BallTree.

     k_value : parameter governing the k-th distance from a point.
     points  : dataset on which the outlier detection is performed.
     labels  : actual labels of the points, kept for accuracy checks.
     '''
     self.k_value, self.points = k_value, points
     # A cKDTree was tried here previously; a BallTree is used instead.
     print("Creating the Ball tree")
     self.ball_tree = neighbors.BallTree(points, leaf_size=4)
     self.labels = labels
Пример #19
0
def knn_graph(X, k, method='brute_force', leaf_size=30, metric='euclidean'):
    """Compute the k nearest neighbors of every row of ``X``.

    Parameters
    ----------
    X : 2-D numpy array, one observation per row.
    k : number of neighbors per observation (the point itself included).
    method : 'kd_tree', 'ball_tree', or anything else for brute force.
    leaf_size : leaf size for the tree methods; ignored by brute force.
    metric : distance metric passed to the tree or to ``pdist``.

    Returns
    -------
    neighbors : array of row indices, one row per observation.
    radii : distance from each observation to its k-th neighbor.

    Raises
    ------
    ImportError
        If the library required by the chosen method could not be loaded.
    """
    n, _ = X.shape

    if method == 'kd_tree':
        if not _HAS_SKLEARN:
            raise ImportError("The scikit-learn library could not be loaded." +
                              " It is required for the 'kd-tree' method.")
        kdtree = _sknbr.KDTree(X, leaf_size=leaf_size, metric=metric)
        distances, neighbors = kdtree.query(X,
                                            k=k,
                                            return_distance=True,
                                            sort_results=True)
        radii = distances[:, -1]

    # Fix: this must be `elif`. With a plain `if`, method='kd_tree' fell
    # through to the brute-force branch and overwrote the tree result.
    elif method == 'ball_tree':
        if not _HAS_SKLEARN:
            raise ImportError("The scikit-learn library could not be loaded." +
                              " It is required for the 'ball-tree' method.")
        btree = _sknbr.BallTree(X, leaf_size=leaf_size, metric=metric)
        distances, neighbors = btree.query(X,
                                           k=k,
                                           return_distance=True,
                                           sort_results=True)
        radii = distances[:, -1]

    else:  # assume brute-force
        if not _HAS_SCIPY:
            raise ImportError("The 'scipy' module could not be loaded. " +
                              "It is required for the 'brute_force' method " +
                              "for building a knn similarity graph.")

        d = _spd.pdist(X, metric=metric)
        D = _spd.squareform(d)
        rank = np.argsort(D, axis=1)
        neighbors = rank[:, 0:k]
        k_nbr = neighbors[:, -1]
        radii = D[np.arange(n), k_nbr]

    return neighbors, radii
def batch_effect_score(dimred_matrix, batch_ids, knn=100, subsample=0.1):
    """Score how well batches are mixed in the reduced-dimension space.

    A random subset of cells is drawn (seeded with 0, each cell kept with
    probability ``subsample``). For each one, the fraction of its ``knn``
    nearest neighbors that share its batch is divided by that batch's share
    of all cells. The mean of these ratios is returned; a value close to 1
    means no batch effect.
    """
    num_bcs = dimred_matrix.shape[0]
    assert num_bcs == len(batch_ids)

    # Share of all cells that belongs to each batch.
    counter = Counter(batch_ids)
    total_cells = sum(counter.values())
    batch_to_percentage = {}
    for batch, count in counter.iteritems():
        batch_to_percentage[batch] = count * 1.0 / total_cells

    # BallTree for KNN
    balltree = sk_neighbors.BallTree(dimred_matrix, leaf_size=knn)

    # One uniform draw per cell, in cell order, after seeding.
    np.random.seed(0)
    draws = [np.random.uniform() for _ in range(num_bcs)]
    select_bc_idx = np.array(
        [i for i, draw in enumerate(draws) if draw < subsample])

    # k = knn + 1 because the first neighbor is skipped below.
    knn_idx = balltree.query(dimred_matrix[select_bc_idx],
                             k=knn + 1,
                             return_distance=False)

    same_batch_ratio = []
    for bc, neighbors in izip(select_bc_idx, knn_idx):
        batch_id = batch_ids[bc]
        same_batch = sum(1 for i in neighbors[1:] if batch_ids[i] == batch_id)
        observed_share = same_batch * 1.0 / knn
        same_batch_ratio.append(observed_share / batch_to_percentage[batch_id])

    return np.mean(same_batch_ratio)
def nearest_indices_2D(mod_lon, mod_lat, new_lon, new_lat,
                       mask = None):
    '''
    Find, for each requested location, the 2D indices of the nearest model
    grid point, using a BallTree with the haversine metric.

    Inputs must not contain NaNs (or they must be excluded via ``mask``).

    Example usage
    -------------
    ind_x, ind_y = nearest_indices_2D(model_lon, model_lat, obs_lon, obs_lat)
    # ind_x / ind_y can then index the model's x / y dimensions.

    Parameters
    ----------
    mod_lon (2D array): Model longitude (degrees)
    mod_lat (2D array): Model latitude (degrees)
    new_lon (1D array): Longitudes (degrees) to look up
    new_lat (1D array): Latitudes (degrees) to look up
    mask (2D array): Optional. Where True (or 1), model points are excluded
                     from the search, e.g. to keep land points from being
                     returned as the nearest point.

    Returns
    -------
    Array of x indices, Array of y indices
    '''
    # Cast everything to plain numpy arrays in case xarray objects came in.
    new_lon, new_lat = np.array(new_lon), np.array(new_lat)
    mod_lon, mod_lat = np.array(mod_lon), np.array(mod_lat)
    original_shape = mod_lon.shape
    use_mask = mask is not None

    if use_mask:
        mod_lon = remove_indices_by_mask(mod_lon, mask)
        mod_lat = remove_indices_by_mask(mod_lat, mask)
        # Masked points are removed, so keep each surviving point's original
        # column/row index to translate the tree's answers back.
        cols, rows = np.meshgrid(np.arange(0, original_shape[1]),
                                 np.arange(0, original_shape[0]))
        cols = remove_indices_by_mask(cols, mask)
        rows = remove_indices_by_mask(rows, mask)
    else:
        mod_lon = mod_lon.flatten()
        mod_lat = mod_lat.flatten()

    # BallTree wants one [lat, lon] row per point, in radians.
    mod_loc = np.radians(np.vstack((mod_lat, mod_lon)).transpose())
    new_loc = np.radians(np.vstack((new_lat, new_lon)).transpose())

    # Nearest-neighbour lookup (only the indices are used).
    tree = nb.BallTree(mod_loc, leaf_size=5, metric='haversine')
    _, ind_1d = tree.query(new_loc, k=1)

    if use_mask:
        ind_y, ind_x = rows[ind_1d], cols[ind_1d]
    else:
        # Convert flat indices back to 2D grid indices.
        ind_y, ind_x = np.unravel_index(ind_1d, original_shape)

    ind_x = xr.DataArray(ind_x.squeeze())
    ind_y = xr.DataArray(ind_y.squeeze())
    return ind_x, ind_y
Пример #22
0
def knn_graph(X, k, method='brute_force', leaf_size=30):
    """
    Compute the k-nearest neighbors of every row of 'X' under the Euclidean
    distance metric.

    Parameters
    ----------
    X : numpy array
        Data points, with each row as an observation.

    k : int
        The number of points to consider as neighbors of any given observation.

    method : {'brute-force', 'kd-tree', 'ball-tree'}, optional
        Computing method. Hyphens and underscores are interchangeable
        ('kd-tree' and 'kd_tree' select the same method). Any unrecognized
        value selects 'brute-force'.

        - 'brute-force': computes the (Euclidean) distance between all O(n^2)
          pairs of rows in 'X', then for every point finds the k-nearest. It is
          limited to tens of thousands of observations (depending on available
          RAM). Requires scipy.

        - 'kd-tree': partitions the data into axis-aligned rectangles to avoid
          computing all O(n^2) pairwise distances. Much faster than
          'brute-force', but only works for data in fewer than about 20
          dimensions. Requires the scikit-learn library.

        - 'ball-tree': partitions the data into balls and uses the metric
          property of euclidean distance to avoid computing all O(n^2)
          distances. Typically much faster than 'brute-force', and works with
          up to a few hundred dimensions. Requires the scikit-learn library.

    leaf_size : int, optional
        For the 'kd-tree' and 'ball-tree' methods, the number of observations
        in the leaf nodes. Leaves are not split further, so distance
        computations within leaf nodes are done by brute force. 'leaf_size' is
        ignored for the 'brute-force' method.

    Returns
    -------
    neighbors : numpy array
        Each row contains the nearest neighbors of the corresponding row in
        'X', indicated by row indices.

    radii : numpy array
        For each row of 'X' the distance to its k'th nearest neighbor
        (including itself).

    Raises
    ------
    ImportError
        If the library required by the chosen method could not be loaded.

    See Also
    --------
    epsilon_graph

    Examples
    --------
    >>> X = numpy.random.rand(100, 2)
    >>> knn, radii = debacl.utils.knn_graph(X, k=8, method='kd-tree')
    """

    n, _ = X.shape

    # The documentation spells the methods with hyphens while the code
    # compared against underscores, so 'kd-tree' silently ran brute force.
    method = str(method).replace('-', '_')

    if method == 'kd_tree':
        if not _HAS_SKLEARN:
            raise ImportError("The scikit-learn library could not be loaded." +
                              " It is required for the 'kd-tree' method.")
        kdtree = _sknbr.KDTree(X, leaf_size=leaf_size, metric='euclidean')
        distances, neighbors = kdtree.query(X,
                                            k=k,
                                            return_distance=True,
                                            sort_results=True)
        radii = distances[:, -1]

    # Fix: this must be `elif`. With a plain `if`, the kd-tree result was
    # always overwritten by the brute-force branch below.
    elif method == 'ball_tree':
        if not _HAS_SKLEARN:
            raise ImportError("The scikit-learn library could not be loaded." +
                              " It is required for the 'ball-tree' method.")
        btree = _sknbr.BallTree(X, leaf_size=leaf_size, metric='euclidean')
        distances, neighbors = btree.query(X,
                                           k=k,
                                           return_distance=True,
                                           sort_results=True)
        radii = distances[:, -1]

    else:  # assume brute-force
        if not _HAS_SCIPY:
            raise ImportError("The 'scipy' module could not be loaded. " +
                              "It is required for the 'brute_force' method " +
                              "for building a knn similarity graph.")

        d = _spd.pdist(X, metric='euclidean')
        D = _spd.squareform(d)
        rank = _np.argsort(D, axis=1)
        neighbors = rank[:, 0:k]

        k_nbr = neighbors[:, -1]
        radii = D[_np.arange(n), k_nbr]

    return neighbors, radii
Пример #23
0
def build_neighbor_index(x, leaf_size):
    """Return a BallTree built over ``x`` with the given ``leaf_size``."""
    index = sk_neighbors.BallTree(x, leaf_size=leaf_size)
    return index
Пример #24
0
    exit(0)
    """

    allDicVecs = np.array(dicVec2Word.keys())

    #spellPairs = getSpellPairs('../data/misspeltCorrectPairs.txt')
    spellPairs = getSpellPairs('../data/misspeltCorrectPairsEditDist4.txt')

    #metricArr = ['euclidean', 'braycurtis', 'russellrao', 'cityblock', 'manhattan', 'infinity', 'jaccard', 'seuclidean', 'sokalsneath', 'kulsinski', 'minkowski', 'mahalanobis', 'p', 'l2', 'hamming', 'l1', 'wminkowski', 'pyfunc']
    metricArr = ['euclidean']
    for metr in metricArr:
        errorFile = open('../data/errorPairs.txt', 'w')
        noErrorFile = open('../data/noErrorPairs.txt', 'w')

        print "metric: ", metr
        tree = sn.BallTree(allDicVecs, metric=metr)
        counter = 0

        oldtime = time.time()

        for incorr, corr in spellPairs.items():
            incorrVec = getVec(charVecs, incorr)
            neigborsInds = tree.query([incorrVec],
                                      k=numNeighbors,
                                      return_distance=False)
            #print counter
            found = False
            for ind in neigborsInds[0]:
                if corr in dicVec2Word[tuple(allDicVecs[ind])]:
                    counter += 1
                    found = True
Пример #25
0
import geopy.distance
import numpy as np
from sklearn import neighbors

# load geojson data for manhattan
# Load geojson data for Manhattan; the context manager closes the file
# handle that the bare open() call used to leak.
with open("nycpluto_manhattan.geojson") as geojson_file:
    nycmap = json.load(geojson_file)

# Load library data from csv file, convert coordinates to radians, and create
# (latitude, longitude) pairs.
libs = pd.read_csv('manhattanlibraries.csv', usecols=['facname', 'latitude', 'longitude'])
libs['latitude'] = libs['latitude'].apply(func=math.radians)
libs['longitude'] = libs['longitude'].apply(func=math.radians)
libs['coord'] = list(zip(libs['latitude'], libs['longitude']))

# Load library coordinates into a haversine BallTree.
libcoords = np.asarray(list(libs['coord']))
tree = neighbors.BallTree(libcoords, metric="haversine")

# Load lot data from csv file, drop rows missing any required field, convert
# coordinates to radians, and create (latitude, longitude) pairs.
df = pd.read_csv('pluto_small.csv')
df = df.dropna(subset=['assesstot', 'bldgarea', 'lotarea', 'latitude', 'longitude'])
df['latitude'] = df['latitude'].apply(func=math.radians)
df['longitude'] = df['longitude'].apply(func=math.radians)
df['coord'] = list(zip(df['latitude'], df['longitude']))

# Query the BallTree for each lot's nearest library and save the distance
# back in df.
lotcoords = np.asarray(list(df['coord']))
dist, _ = tree.query(X=lotcoords, k=1)
df['dist'] = dist
# NOTE(review): 3960 is presumably the Earth radius in miles, converting the
# haversine result from radians to miles - confirm.
df['dist'] = df['dist'].apply(lambda x: x*3960)

# use Plotly express function to create a choropleth map
Пример #26
0
# Collect the vector of every word in `words` into one numpy array
# (row i corresponds to words[i]).
print("Total number of wordvectors=", len(nlp.vocab.vectors))
print("Getting wordvectors...")
wordvecs = numpy.array([nlp.vocab.get_vector(word) for word in words])
print("Retrieved=", len(wordvecs), "wordvectors.")

# Ensure the list of words corresponds to the list of wordvectors: same
# length, and one randomly chosen word must reproduce its stored vector.
assert len(words) == len(wordvecs)
spot_check = random.choice(range(0, len(words)))
assert numpy.array_equal(nlp(words[spot_check]).vector, wordvecs[spot_check])
print("Spot check passed.")

# Pickle the entire vocab.
# pickle.HIGHEST_PROTOCOL depends on the Python version, so protocol 4 is
# pinned explicitly.
with open('vocab.pkl', 'wb') as f:
    pickle.dump(words, f, protocol=4)
print("Dumped vocab words to pickle file vocab.pkl")

# Place all wordvectors in a BallTree, and pickle the entire tree.
tree = nbs.BallTree(wordvecs)
with open('balltree.pkl', 'wb') as f:
    pickle.dump(tree, f, protocol=4)
print("Dumped wordvector BallTree to pickle file balltree.pkl")

# Create the word -> vector mapping and pickle it.
# NOTE(review): this rebinds the builtin name `dict` at module level; any
# later call to dict(...) in this module will fail.
dict = dict(zip(words, wordvecs))
with open('dict.pkl', 'wb') as f:
    pickle.dump(dict, f, protocol=4)
print("Dumped word2vec dictionary in dict.pkl")
Пример #27
0
import os

ipAddress = "192.168.1.103"  # Define IP address according to IP address of external device
# The bare string below is an expression statement used as a block comment.
'''
# Load all the models we need: a face detector, a shape predictor for face landmarks,
# and the face recognition model (using distances between face landmarks)
'''
# The two .dat model files are loaded from the current working directory.
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
facerec = dlib.face_recognition_model_v1(
    "dlib_face_recognition_resnet_model_v1.dat")

# Load trained face descriptors, and put them in a tree to find the closest one faster
# NOTE(review): presumably trainingLabels[i] labels trainingVects[i] - confirm.
trainingVects = np.load("trainingVects.npy")
trainingLabels = np.load("trainingLabels.npy")
tree = neighbors.BallTree(trainingVects, leaf_size=2)

loop = True  # True until one presses "q"
searchedPerson = 0  # Index of the person to be searched
ratio = 1.2  # Ratio used to resize received image
'''
# Function to display result on pc
'''


def display_result(frame, res):
    if len(res) > 0:
        for i in range((len(res) // 5)):
            l, t, r, b = int(res[i * 5 + 1] / ratio), int(
                res[i * 5 + 2] / ratio), int(res[i * 5 + 3] / ratio), int(
                    res[i * 5 + 4] / ratio)
Пример #28
0
# (dec, ra) pairs in radians, one row per object, for the data and randoms.
d1rad = np.array([data1DEC * np.pi / 180., data1RA * np.pi / 180.]).transpose()
r1rad = np.array([rand1DEC * np.pi / 180., rand1RA * np.pi / 180.]).transpose()
print("Loaded data")

# Base HEALPix resolution. At nside=256 this produces a large number of
# histograms; sparse matrices or plain lists were considered to speed it up.
nside_base = 256

d1pix = hp.ang2pix(nside_base, data1RA, data1DEC, nest=False, lonlat=True)
r1pix = hp.ang2pix(nside_base, rand1RA, rand1DEC, nest=False, lonlat=True)
print("Computed healpixels")

# Either build both haversine trees and cache them as pickles, or load the
# cached pickles. Files are opened with context managers so the handles are
# closed (the bare open() calls used to leak them).
if not ns.loadtree:
    t0 = time.time()
    d1tree = neighbors.BallTree(d1rad, metric='haversine')
    with open('%s-d1tree.p' % (truncate(ns.phot_name)), 'wb') as tree_file:
        pickle.dump(d1tree, tree_file)
    print(time.time() - t0)  # 5x as long as flatsky case

    r1tree = neighbors.BallTree(r1rad, metric='haversine')
    with open('%s-r1tree.p' % (truncate(ns.phot_name_randoms)),
              'wb') as tree_file:
        pickle.dump(r1tree, tree_file)
else:
    # NOTE: unpickling runs arbitrary code; only load trusted cache files.
    with open('%s-d1tree.p' % (truncate(ns.phot_name)), 'rb') as tree_file:
        d1tree = pickle.load(tree_file)
    print("Loaded data tree")
    with open('%s-r1tree.p' % (truncate(ns.phot_name_randoms)),
              'rb') as tree_file:
        r1tree = pickle.load(tree_file)
    print("Loaded random tree")

#zs = np.arange(zmin,zmax+deltaz,deltaz)
zs = np.linspace(zmin, zmax + deltaz,
def cluster_finder(parcels_series,
                   final_eps,
                   threshold=5,
                   verbose=False,
                   point_groups=None):
    """Build a cluster hierarchy over parcels by growing a distance threshold.

    Every parcel starts as its own leaf ``Cluster``. For ``eps`` from 1 to
    ``final_eps``, road points of different graph components whose
    (integer-truncated) distance falls in ``(eps - 1, eps]`` are connected.
    Each newly merged component gets a parent ``Cluster`` whose
    ``sub_clusters`` are the clusters it absorbed; their ``death`` is set to
    ``eps``.

    Parameters
    ----------
    parcels_series : iterable of parcels exposing ``road_points`` and
        ``np_road_points``.
    final_eps : int, largest radius considered (also the query radius).
    threshold : forwarded to every ``Cluster`` that is created.
    verbose : print progress and timings.
    point_groups : unused.

    Returns
    -------
    The single root ``Cluster`` if everything merged, otherwise the set of
    remaining clusters.
    """
    # NOTE(review): parcels with falsy `road_points` are skipped here but not
    # in the loop below - assumes their np_road_points has zero rows.
    points = np.concatenate([
        parcel.np_road_points for parcel in parcels_series
        if parcel.road_points
    ])

    tree = neighbors.BallTree(points)
    # neigh[0] holds neighbor indices per point, neigh[1] the distances.
    neigh = tree.query_radius(points, final_eps, return_distance=True)
    G = gt.Graph()
    G.add_vertex(points.shape[0])

    if verbose: print('created ball tree, neighbors, init graph')

    ## create initial cluster map and prune grouped points from neighbors to create idx_dists
    cluster_map = np.array([None] * points.shape[0])
    # Per point: a 2-row array, row 0 = neighbor index, row 1 = distance
    # truncated to int.
    idx_dists = [
        np.vstack([x, y.astype(int)]) for x, y in zip(neigh[0], neigh[1])
    ]
    base_idx = 0
    for parcel in parcels_series:
        # Fix: honour the `threshold` argument for leaf clusters too; it was
        # hard-coded to 5 here while merged clusters used the argument.
        cluster = Cluster(np.inf,
                          None,
                          parcels=[parcel],
                          sub_clusters=None,
                          threshold=threshold)
        group = np.arange(base_idx, base_idx + parcel.np_road_points.shape[0])
        for idx in group:
            # Connect every point of the parcel to the parcel's first point.
            if idx != base_idx:
                G.add_edge(base_idx, idx)
                G.add_edge(idx, base_idx)
            cluster_map[idx] = cluster
            # Drop neighbors that belong to the same parcel.
            mask = ~np.isin(idx_dists[idx][0], group)
            idx_dists[idx] = idx_dists[idx][:, mask]

        base_idx += group.shape[0]

    if verbose:
        print(
            'created cluster_map array thing, initialized graph and pruned neighbors distances from point groups'
        )

    ## create dendogram
    if verbose: print(len(set(cluster_map)), len(cluster_map))
    comp, hist = gt.topology.label_components(G)
    for eps in range(1, final_eps + 1):
        try:
            start = time.time()
            new_edge_set = set()
            for i, x in enumerate(idx_dists):
                # Neighbors whose truncated distance lies in (eps - 1, eps].
                mask = (x[1] <= eps) & (x[1] > (eps - 1))
                idxs = list(x[0, mask])

                for idx in idxs:
                    # Only link points that are in different components.
                    if comp.a[i] != comp.a[idx]:
                        G.add_edge(i, idx)
                        new_edge_set.add(idx)
            if verbose: print(time.time() - start)

            if len(new_edge_set) > 0:
                comp, hist = gt.topology.label_components(G)
                for component in set(comp.a[list(new_edge_set)]):
                    scs = []
                    for sc in set(cluster_map[comp.a == component]):
                        sc.death = eps  # TODO: should this be eps or eps - 1
                        scs.append(sc)

                    cluster_map[comp.a == component] = Cluster(
                        np.inf,
                        None,
                        parcels=None,
                        sub_clusters=scs,
                        threshold=threshold)

            if verbose: print(time.time() - start)
            curr_num_clusters = len(set(cluster_map))
            if curr_num_clusters == 1:
                break
            if verbose:
                print('%s clusters at eps=%s in %ss' %
                      (curr_num_clusters, eps, time.time() - start))
                print('---')
        except KeyboardInterrupt:
            # Allow aborting a long run while keeping the partial result.
            print(
                'interupted during graph part @ eps=%s. you should probably ignore clusters born at this eps'
                % eps)
            break

    clusters = set(cluster_map)
    if len(clusters) == 1:
        return clusters.pop()
    return clusters
import numpy as np
import pandas as pd
import csv


def Union(lst1, lst2):
    """Return a list of the distinct elements found in either input."""
    merged = set(lst1).union(set(lst2))
    return list(merged)


from sklearn import neighbors
# Load both point sets as plain arrays. `.values` replaces
# DataFrame.as_matrix(), which was removed in pandas 1.0; both return the
# same ndarray.
X = pd.read_csv("traffic_signals.csv")
X = X.values
print(type(X))
# NOTE(review): "traffix_signals2.csv" looks like a typo for "traffic_..."
# but is a runtime path, so it is left unchanged - confirm the file name.
Y = pd.read_csv("traffix_signals2.csv")
Y = Y.values
New = []   # tree indices already covered by a kept point
nnn = []   # indices of the points that are kept
ball_tree = neighbors.BallTree(X, leaf_size=2)
for i in range(len(Y)):
    if i not in New:
        # Keep this point and mark everything within r=50 of it as covered.
        nnn = Union(nnn, [i])
        ind = ball_tree.query_radius([Y[i]], r=50)
        a = ind[0].tolist()
        New = Union(New, a)
# NOTE(review): `i` indexes Y while the covered indices and the lookup below
# index X; this assumes both files list the same points in the same order.
X_New = []
for i in range(len(nnn)):
    X_New.append(X[nnn[i]])
np.savetxt('2darray.csv', X_New, delimiter=',', fmt='%d')