def test_warning_flag(n_samples=100, n_features=3, k=3):
    """test that discarding identical distances triggers warning flag"""
    X = rng.random_sample(size=(n_samples, n_features))
    q = rng.random_sample(size=n_features)

    bt = neighbors.BallTree(X[:-1], leaf_size=5)
    dist, ind = bt.query(q, k=k)

    # make the last point identical to the furthest neighbor
    # querying this should set warning_flag to True
    X[-1:] = X[ind[0, k - 1]]
    bt = neighbors.BallTree(X, leaf_size=5)
    dist, ind = bt.query(q, k=k)
    assert bt.warning_flag

    # make the last point identical to the closest neighbor
    # though the distance is identical, there is no ambiguity, so there
    # should be no warning.  If k == 1, this should not be done
    if k > 1:
        X[-1:] = X[ind[0, 0]]
        bt = neighbors.BallTree(X, leaf_size=5)
        dist, ind = bt.query(q, k=k)
        assert not bt.warning_flag
def fits_coordinate_to_accident_nodes_within_its_radius(self, police_data):
    output = []
    title = ['x', 'y', 'year', 'month', 'day', 'humra']
    output.append(title)
    for fileAcc in police_data:
        X = pd.read_csv('acc/' + fileAcc + '.csv')
        C = pd.read_csv('acc/' + fileAcc + '.csv')
        C.drop(C.loc[:, 'pk_teuna_fikt':'STATUS_IGUN'].columns, axis=1, inplace=True)
        C['X'].replace('', np.nan, inplace=True)
        C.dropna(subset=['X'], inplace=True)
        C = C.values
        X.drop(X.loc[:, 'sug_tik':'YEHIDA'].columns, axis=1, inplace=True)
        X.drop(X.loc[:, 'SHAA':'RAMZOR'].columns, axis=1, inplace=True)
        X.drop(X.loc[:, 'SUG_TEUNA':'STATUS_IGUN'].columns, axis=1, inplace=True)
        X['X'].replace('', np.nan, inplace=True)
        X.dropna(subset=['X'], inplace=True)
        X = X.values
        Y = pd.read_csv("junctions.csv")
        Y = Y.values
        ball_tree = neighbors.BallTree(C, leaf_size=2)
        for i in range(len(Y)):
            ind = ball_tree.query_radius([Y[i]], r=1000)
            for j in ind[0]:
                x = [Y[i][0], Y[i][1], int(X[j][1]), int(X[j][2]),
                     int(X[j][3]), int(4 - X[j][4])]
                output.append(x)
    with open("AccRadius.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(output)
def _kNN__ball_tree(self, d):
    kwargs = OrderedDict()
    kwargs['leaf_size'] = 40
    kwargs['metric'] = 'minkowski'
    for k, v in d['neighbors']['args'].items():
        kwargs[k] = v
    for k, v in kwargs.items():
        d['neighbors']['args'][k] = v

    self.kNN_tree = neighbors.BallTree(self.data.df[self.manifold_names], **kwargs)
    self.kNN_names = ['NN_{}'.format(i) for i in range(self.kNN)]
    self.data.df = pd.concat([
        self.data.df,
        pd.DataFrame(data=self.kNN_tree.query(X=self.data.df[self.manifold_names],
                                              k=self.kNN,
                                              return_distance=True)[0],
                     columns=self.kNN_names)
    ], axis=1)
def dist(i, arr):
    # use a sklearn BallTree to find the distance from object i to its nearest
    # neighbour in arr (k=2 because the closest match is the point itself)
    tree = neighbors.BallTree(arr, leaf_size=2)
    dist, ind = tree.query(arr[[i]], k=2)
    return dist
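# Usage sketch for dist() above -- not part of the original source.  It assumes
# sklearn is available (`from sklearn import neighbors`) and uses a small,
# made-up coordinate array.  The first returned distance is always 0.0 (the
# query point itself); the second is the distance to its nearest other point.
import numpy as np
from sklearn import neighbors

arr = np.array([[0.0, 0.0], [1.0, 0.0], [5.0, 5.0]])
print(dist(0, arr))  # expected: [[0. 1.]]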
def add_feature_to_coordinates(self, feature, type):
    global Final
    m = 0
    for file in feature:
        X = pd.read_csv('features/' + file + '.csv')
        X = X.values
        Y = pd.read_csv("junctions.csv")
        Y = Y.values
        New = []
        New.append(['x', 'y', file])
        if type == 'count':
            ball_tree = neighbors.BallTree(X, leaf_size=2)
        if type == 'sum' or type == 'avg':
            Z = pd.read_csv('features/' + file + '.csv')
            Z.drop(Z.columns[[2]], axis=1, inplace=True)
            ball_tree = neighbors.BallTree(Z, leaf_size=2)
        x = [Y[0][0], Y[0][1], 0]
        for i in range(len(Y)):
            if type == 'count' or type == 'avg':
                count = ball_tree.query_radius([Y[i]], r=self.radius * 1000,
                                               count_only=True)
                x = [Y[i][0], Y[i][1], int(count)]
            if type == 'sum' or type == 'avg':
                total = 0
                ind = ball_tree.query_radius([Y[i]], r=self.radius * 1000)
                for j in range(len(ind[0])):
                    # accumulate the feature value (column 2) of each neighbour
                    # returned by the radius query
                    total = total + X[ind[0][j]][2]
                if type == 'sum':
                    x = [Y[i][0], Y[i][1], total]
                if type == 'avg':
                    if int(count) == 0:
                        x = [Y[i][0], Y[i][1], 0]
                    else:
                        x = [Y[i][0], Y[i][1], total / int(count)]
            New.append(x)
        # New = self.normalize(New)
        with open("out" + file + '.csv', "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(New)
        b = pd.read_csv("out" + file + '.csv')
        Final = Final.merge(b, on=('x', 'y'))
        x, y = Final[file].min(), Final[file].max()
        Final[file] = round((Final[file] - x) / (y - x), 3)
        os.remove("out" + file + '.csv')
        m = m + 1
def find_knn(curr_matrix, ref_matrix, knn):
    """
    For each row in curr_matrix, find the k nearest neighbors in ref_matrix.
    Returns an array of shape [curr_matrix.shape[0] * knn, ] storing the
    indices of the nearest neighbors in ref_matrix.
    """
    balltree = sk_neighbors.BallTree(ref_matrix, leaf_size=knn)
    nn_idx = balltree.query(curr_matrix, k=knn, return_distance=False)
    return nn_idx.ravel().astype(int)
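# Usage sketch for find_knn() above -- not part of the original source.  It
# assumes `sk_neighbors` is the alias used for sklearn.neighbors
# (`from sklearn import neighbors as sk_neighbors`) and uses made-up data.
import numpy as np
from sklearn import neighbors as sk_neighbors

ref = np.random.RandomState(0).rand(50, 4)   # reference points
cur = np.random.RandomState(1).rand(5, 4)    # query points
idx = find_knn(cur, ref, knn=3)
print(idx.shape)  # (15,) -- 5 query rows * 3 neighbours each, flattened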
def ball_tree_for_one_image(self):
    expected = self.__test_label_list[0]

    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    tree = sn.BallTree(self.__train_image_list)
    print(time.perf_counter() - start)

    start = time.perf_counter()
    predicted = tree.query(
        self.__test_image_list[0].reshape(1, len(self.__test_image_list[0])))
    print(time.perf_counter() - start)
def test_ball_tree_pickle():
    import pickle
    X = rng.random_sample(size=(10, 3))
    bt1 = neighbors.BallTree(X, leaf_size=1)
    # query() returns (distances, indices) when return_distance=True (default)
    dist1, ind1 = bt1.query(X)
    for protocol in (0, 1, 2):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)
        dist2, ind2 = bt2.query(X)
        assert np.all(ind1 == ind2)
        assert_array_almost_equal(dist1, dist2)
def test_ball_tree_p_distance():
    X = rng.random_sample(size=(100, 5))

    for p in (1, 2, 3, 4, np.inf):
        bt = neighbors.BallTree(X, leaf_size=10, p=p)
        kdt = cKDTree(X, leafsize=10)

        dist_bt, ind_bt = bt.query(X, k=5)
        dist_kd, ind_kd = kdt.query(X, k=5, p=p)

        assert_array_almost_equal(dist_bt, dist_kd)
def classify_nearest_neighbor_ball_tree(k):
    labels = load_labels()
    song_samples = []
    indexed_genres = []

    # Index the first half of each genre's songs
    for genre, song_genres_ids in labels.groupby('category'):
        print('Indexing genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2)):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id), header=None)
            for val in song.values:
                song_samples.append(val)
                indexed_genres.append(genre)

    ball_tree = nb.BallTree(np.vstack(song_samples))

    total_count = 0
    match_count = 0
    # Classify the second half of each genre's songs
    for genre, song_genres_ids in labels.groupby('category'):
        print('Expected genre: {}'.format(genre))
        num_values = len(song_genres_ids.values)
        for i in range(int(num_values / 2), num_values):
            val = song_genres_ids.values[i]
            song_id = val[0]
            song = pd.read_csv('song_data/training/{}'.format(song_id), header=None)
            genre_freqs = {}
            split_song = np.array_split(song, 5, axis=0)  # Split song into sections
            for s in split_song:
                avg_song_val = np.mean(s)  # Take average of each section
                genre_indices = ball_tree.query([avg_song_val], k, return_distance=False)
                for index in genre_indices[0]:
                    # use a separate name so the expected genre is not overwritten
                    neighbor_genre = indexed_genres[index]
                    genre_freqs[neighbor_genre] = genre_freqs.get(neighbor_genre, 0) + 1
            actual_genre = max(genre_freqs, key=genre_freqs.get)
            print('Predicted genre: {}'.format(actual_genre))
            total_count += 1
            if genre == actual_genre:
                match_count += 1

    print('Matched {} out of {} songs: {}%'.format(
        match_count, total_count, (match_count / total_count) * 100))
def find_clusters(geos, tol):
    hav_tol = tol / 6371.0  # convert a tolerance in km to radians on the unit sphere
    used = [False] * len(geos)
    ball_tree = nn.BallTree(np.radians(geos), metric="haversine")
    centers = list()
    for i in range(len(geos)):
        if not used[i]:
            loc = geos[i]
            st = np.array([i, loc[0], loc[1]])
            centers.append(st)
            nearest = ball_tree.query_radius([np.radians(loc)], hav_tol)[0]
            for j in nearest:
                used[j] = True
    return np.array(centers)
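# Usage sketch for find_clusters() above -- not part of the original source.
# It assumes `nn` is sklearn.neighbors (`from sklearn import neighbors as nn`),
# that `geos` holds [lat, lon] pairs in degrees, and that `tol` is a radius in
# kilometres (hence the division by the Earth radius, 6371 km).
import numpy as np
from sklearn import neighbors as nn

geos = np.array([[40.7128, -74.0060],    # New York
                 [40.7130, -74.0055],    # a few tens of metres away, same cluster
                 [34.0522, -118.2437]])  # Los Angeles, its own cluster
centers = find_clusters(geos, tol=1.0)   # 1 km clustering radius
print(centers)  # two cluster centres, seeded by points 0 and 2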
def test_ball_tree_query_radius(n_samples=100, n_features=10):
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    bt = neighbors.BallTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind = bt.query_radius(query_pt, r + eps)[0]
        i = np.where(rad <= r + eps)[0]

        ind.sort()
        i.sort()

        assert np.all(i == ind)
def test_ball_tree_query_radius_distance(n_samples=100, n_features=10):
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    bt = neighbors.BallTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind, dist = bt.query_radius(query_pt, r + eps, return_distance=True)

        ind = ind[0]
        dist = dist[0]

        d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))

        assert_array_almost_equal(d, dist)
def test_unsupervised_inputs():
    """test the types of valid input into NearestNeighbors"""
    X = rng.random_sample((10, 3))

    nbrs_fid = neighbors.NearestNeighbors(n_neighbors=1)
    nbrs_fid.fit(X)

    dist1, ind1 = nbrs_fid.kneighbors(X)

    nbrs = neighbors.NearestNeighbors(n_neighbors=1)

    for input in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)):
        nbrs.fit(input)
        dist2, ind2 = nbrs.kneighbors(X)

        assert_array_almost_equal(dist1, dist2)
        assert_array_almost_equal(ind1, ind2)
def LaplacianEigenmaps(data, numNeigh=5, heatKernel=False, heatSigma=1.0):
    '''
    W is the weight/distance/kernel matrix, D is the diagonal degree matrix,
    L is the graph Laplacian.
    '''
    if 1:
        # Neighborhood graph from the full pairwise distance matrix
        W = distance.squareform(distance.pdist(data))
        inds = np.argsort(W, 1)
        W[inds > numNeigh] = 0
        W = inds <= numNeigh
    if 0:
        # Alternative: neighborhood graph built from a BallTree query
        W = np.zeros([len(data), len(data)])
        Ball = neighbors.BallTree(data)
        dists, nn = Ball.query(data, numNeigh)
        k = numNeigh
        for di in range(len(dists)):
            for ki in range(k):
                # W[di, nn[di,ki]] = dists[di,ki]
                W[di, nn[di, ki]] = 1.0  # dists[di,ki]

    if not heatKernel:
        # Binary representation
        W = np.maximum(W, W.T)
    else:
        # Heat kernel based on distances. Ranges between 0-1
        W = W ** 2
        W /= np.max(np.max(W))
        W = np.maximum(W, W.T)
        W[W != 0] = np.exp(-W[W != 0] / (2 * heatSigma ** 2))

    diag_ = np.diag(np.sum(W, 1))

    # Calc Laplacian
    L = diag_ - W

    # vals, vecs = np.linalg.eigh(L)
    # note: np.float is deprecated, use the builtin float instead
    vals, vecs = sparse.linalg.eigsh(np.asarray(L, dtype=float), which='SM')  # "LM"

    # Only keep positive eigenvals
    posInds = np.nonzero(vals > 0.01)[0]
    posVecs = vecs[:, posInds]
    # posVecs = vecs

    return posVecs
def compute_spec_LLE(n_neighbors=10, out_dim=3):
    # Compute the LLE projection
    LLE = manifold.LocallyLinearEmbedding(n_neighbors, out_dim,
                                          method='modified',
                                          eigen_solver='dense')
    Y_LLE = LLE.fit_transform(spec)
    print(" - finished LLE projection")

    # remove outliers for the plot
    BT = neighbors.BallTree(Y_LLE)
    dist, ind = BT.query(Y_LLE, n_neighbors)
    dist_to_n = dist[:, -1]
    dist_to_n -= dist_to_n.mean()
    std = np.std(dist_to_n)
    flag = (dist_to_n > 0.25 * std)
    print(" - removing %i outliers for plot" % flag.sum())

    return Y_LLE[~flag], color[~flag]
def __init__(self, table, filename=None, rakey="RA", deckey="DEC"):
    # Prep data
    self.table = table
    x = np.array(np.deg2rad([self.table[deckey], self.table[rakey]])).transpose()
    x = x.reshape(-1, 2)

    # Test if cache file exists. If not, make one
    if filename is not None and os.path.exists(filename):
        self.rdtree = pickle.load(open(filename, "rb"))
    else:
        self.rdtree = skn.BallTree(x, metric="haversine")

    # Cache file doesn't exist. Write it
    if filename is not None and not os.path.exists(filename):
        pickle.dump(self.rdtree, open(filename, "wb"))
def __init__(self, k_value, points, labels):
    '''
    This function will initialize the attributes to be used for finding the
    outliers, like the hyperparameters of the algorithm.
    '''
    # Setting the parameter governing the kth distance from a point
    self.k_value = k_value

    # The dataset points on which to perform the outlier detection
    self.points = points

    # Creating the kDTree from the points
    # print("Creating the KD Tree")
    # self.kdTree = spatial.cKDTree(points,
    #                               leafsize=300000,
    #                               compact_nodes=True,
    #                               balanced_tree=True)

    # Creating the ball tree
    print("Creating the Ball tree")
    self.ball_tree = neighbors.BallTree(points, leaf_size=4)

    # Actual labels of the points to check the accuracy of the method
    self.labels = labels
def knn_graph(X, k, method='brute_force', leaf_size=30, metric='euclidean'):
    n, p = X.shape

    if method == 'kd_tree':
        if _HAS_SKLEARN:
            kdtree = _sknbr.KDTree(X, leaf_size=leaf_size, metric=metric)
            distances, neighbors = kdtree.query(X, k=k, return_distance=True,
                                                sort_results=True)
            radii = distances[:, -1]
        else:
            raise ImportError("The scikit-learn library could not be loaded." +
                              " It is required for the 'kd-tree' method.")

    elif method == 'ball_tree':
        if _HAS_SKLEARN:
            btree = _sknbr.BallTree(X, leaf_size=leaf_size, metric=metric)
            distances, neighbors = btree.query(X, k=k, return_distance=True,
                                               sort_results=True)
            radii = distances[:, -1]
        else:
            raise ImportError("The scikit-learn library could not be loaded." +
                              " It is required for the 'ball-tree' method.")

    else:  # assume brute-force
        if not _HAS_SCIPY:
            raise ImportError("The 'scipy' module could not be loaded. " +
                              "It is required for the 'brute_force' method " +
                              "for building a knn similarity graph.")
        d = _spd.pdist(X, metric=metric)
        D = _spd.squareform(d)
        rank = np.argsort(D, axis=1)
        neighbors = rank[:, 0:k]
        k_nbr = neighbors[:, -1]
        radii = D[np.arange(n), k_nbr]

    return neighbors, radii
def batch_effect_score(dimred_matrix, batch_ids, knn=100, subsample=0.1):
    """
    For each cell, search the KNN and calculate the proportion of cells from
    the same batch.  Then compute the ratio of that proportion to the
    percentage (number of cells) of this batch.  The batch_effect_score is
    defined as the average of the ratio.  Closer to 1 means no batch effect.
    """
    num_bcs = dimred_matrix.shape[0]
    assert num_bcs == len(batch_ids)

    # batch percentage
    counter = Counter(batch_ids)
    batch_to_percentage = {
        batch: count * 1.0 / sum(counter.values())
        for batch, count in counter.items()
    }

    # BallTree for KNN
    balltree = sk_neighbors.BallTree(dimred_matrix, leaf_size=knn)

    np.random.seed(0)
    select_bc_idx = np.array(
        [i for i in range(num_bcs) if np.random.uniform() < subsample])
    knn_idx = balltree.query(dimred_matrix[select_bc_idx], k=knn + 1,
                             return_distance=False)

    same_batch_ratio = []
    for bc, neighbors in zip(select_bc_idx, knn_idx):
        batch_id = batch_ids[bc]
        # skip the first neighbour, which is the cell itself
        same_batch = len(
            [i for i in neighbors[1:] if batch_ids[i] == batch_id])
        same_batch_ratio.append(
            (same_batch * 1.0 / knn) / batch_to_percentage[batch_id])

    return np.mean(same_batch_ratio)
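# Usage sketch for batch_effect_score() above -- not part of the original
# source.  It assumes `sk_neighbors` is sklearn.neighbors and `Counter` comes
# from collections, and it uses made-up data: two batches drawn from the same
# distribution should give a score close to 1 (i.e. no detectable batch effect).
import numpy as np
from collections import Counter
from sklearn import neighbors as sk_neighbors

rng_demo = np.random.RandomState(0)
dimred = rng_demo.normal(size=(2000, 10))   # fake dimensionality-reduced coordinates
batches = np.array(['A', 'B'] * 1000)       # two perfectly interleaved batches
print(batch_effect_score(dimred, batches, knn=50, subsample=0.2))  # ~1.0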
def nearest_indices_2D(mod_lon, mod_lat, new_lon, new_lat, mask=None):
    '''
    Obtains the 2-dimensional indices of the nearest model points to specified
    lists of longitudes and latitudes. Makes use of sklearn.neighbors and its
    BallTree haversine method. Ensure there are no NaNs in the input
    longitude/latitude arrays (or mask them using "mask").

    Example Usage
    ----------
    # Get indices of model points closest to altimetry points
    ind_x, ind_y = nemo.nearest_indices(altimetry.dataset.longitude,
                                        altimetry.dataset.latitude)
    # Nearest neighbour interpolation of model dataset to these points
    interpolated = nemo.dataset.isel(x_dim=ind_x, y_dim=ind_y)

    Parameters
    ----------
    mod_lon (2D array): Model longitude (degrees) array (2-dimensional)
    mod_lat (2D array): Model latitude (degrees) array (2-dimensional)
    new_lon (1D array): Array of longitudes (degrees) to compare with model
    new_lat (1D array): Array of latitudes (degrees) to compare with model
    mask (2D array): Mask array. Where True (or 1), elements of the array will
        not be included. For example, use to mask out land in case it ends up
        as the nearest point.

    Returns
    -------
    Array of x indices, Array of y indices
    '''
    # Cast lat/lon to numpy arrays in case xarray objects were passed in
    new_lon = np.array(new_lon)
    new_lat = np.array(new_lat)
    mod_lon = np.array(mod_lon)
    mod_lat = np.array(mod_lat)
    original_shape = mod_lon.shape

    # If a mask is supplied, remove masked indices from the arrays.
    if mask is None:
        mod_lon = mod_lon.flatten()
        mod_lat = mod_lat.flatten()
    else:
        mod_lon = remove_indices_by_mask(mod_lon, mask)
        mod_lat = remove_indices_by_mask(mod_lat, mask)
        # If we are masking, we want to preserve the original indices so that
        # we can get them back at the end (since masked points are removed).
        cc, rr = np.meshgrid(np.arange(0, original_shape[1]),
                             np.arange(0, original_shape[0]))
        cc = remove_indices_by_mask(cc, mask)
        rr = remove_indices_by_mask(rr, mask)

    # Put lons and lats into 2D location arrays for BallTree: [lat, lon]
    mod_loc = np.vstack((mod_lat, mod_lon)).transpose()
    new_loc = np.vstack((new_lat, new_lon)).transpose()

    # Convert lat/lon to radians for BallTree
    mod_loc = np.radians(mod_loc)
    new_loc = np.radians(new_loc)

    # Do nearest neighbour interpolation using BallTree (gets indices)
    tree = nb.BallTree(mod_loc, leaf_size=5, metric='haversine')
    _, ind_1d = tree.query(new_loc, k=1)

    if mask is None:
        # Get 2D indices from 1D index output from BallTree
        ind_y, ind_x = np.unravel_index(ind_1d, original_shape)
    else:
        ind_y = rr[ind_1d]
        ind_x = cc[ind_1d]

    ind_x = xr.DataArray(ind_x.squeeze())
    ind_y = xr.DataArray(ind_y.squeeze())
    return ind_x, ind_y
def knn_graph(X, k, method='brute_force', leaf_size=30):
    """
    Compute the symmetric k-nearest neighbor graph for a set of points.
    Assumes a Euclidean distance metric.

    Parameters
    ----------
    X : numpy array | list [numpy arrays]
        Data points, with each row as an observation.

    k : int
        The number of points to consider as neighbors of any given
        observation.

    method : {'brute-force', 'kd-tree', 'ball-tree'}, optional
        Computing method.

        - 'brute-force': computes the (Euclidean) distance between all O(n^2)
          pairs of rows in 'X', then for every point finds the k-nearest. It
          is limited to tens of thousands of observations (depending on
          available RAM).

        - 'kd-tree': partitions the data into axis-aligned rectangles to avoid
          computing all O(n^2) pairwise distances. Much faster than
          'brute-force', but only works for data in fewer than about 20
          dimensions. Requires the scikit-learn library.

        - 'ball-tree': partitions the data into balls and uses the metric
          property of euclidean distance to avoid computing all O(n^2)
          distances. Typically much faster than 'brute-force', and works with
          up to a few hundred dimensions. Requires the scikit-learn library.

    leaf_size : int, optional
        For the 'kd-tree' and 'ball-tree' methods, the number of observations
        in the leaf nodes. Leaves are not split further, so distance
        computations within leaf nodes are done by brute force. 'leaf_size' is
        ignored for the 'brute-force' method.

    Returns
    -------
    neighbors : numpy array
        Each row contains the nearest neighbors of the corresponding row in
        'X', indicated by row indices.

    radii : list[float]
        For each row of 'X' the distance to its k'th nearest neighbor
        (including itself).

    See Also
    --------
    epsilon_graph

    Examples
    --------
    >>> X = numpy.random.rand(100, 2)
    >>> knn, radii = debacl.utils.knn_graph(X, k=8, method='kd-tree')
    """
    n, p = X.shape

    if method == 'kd_tree':
        if _HAS_SKLEARN:
            kdtree = _sknbr.KDTree(X, leaf_size=leaf_size, metric='euclidean')
            distances, neighbors = kdtree.query(X, k=k, return_distance=True,
                                                sort_results=True)
            radii = distances[:, -1]
        else:
            raise ImportError("The scikit-learn library could not be loaded." +
                              " It is required for the 'kd-tree' method.")

    elif method == 'ball_tree':
        if _HAS_SKLEARN:
            btree = _sknbr.BallTree(X, leaf_size=leaf_size, metric='euclidean')
            distances, neighbors = btree.query(X, k=k, return_distance=True,
                                               sort_results=True)
            radii = distances[:, -1]
        else:
            raise ImportError("The scikit-learn library could not be loaded." +
                              " It is required for the 'ball-tree' method.")

    else:  # assume brute-force
        if not _HAS_SCIPY:
            raise ImportError("The 'scipy' module could not be loaded. " +
                              "It is required for the 'brute_force' method " +
                              "for building a knn similarity graph.")
        d = _spd.pdist(X, metric='euclidean')
        D = _spd.squareform(d)
        rank = _np.argsort(D, axis=1)
        neighbors = rank[:, 0:k]
        k_nbr = neighbors[:, -1]
        radii = D[_np.arange(n), k_nbr]

    return neighbors, radii
def build_neighbor_index(x, leaf_size):
    return sk_neighbors.BallTree(x, leaf_size=leaf_size)
exit(0)
"""

allDicVecs = np.array(list(dicVec2Word.keys()))

#spellPairs = getSpellPairs('../data/misspeltCorrectPairs.txt')
spellPairs = getSpellPairs('../data/misspeltCorrectPairsEditDist4.txt')

#metricArr = ['euclidean', 'braycurtis', 'russellrao', 'cityblock', 'manhattan', 'infinity', 'jaccard', 'seuclidean', 'sokalsneath', 'kulsinski', 'minkowski', 'mahalanobis', 'p', 'l2', 'hamming', 'l1', 'wminkowski', 'pyfunc']
metricArr = ['euclidean']

for metr in metricArr:
    errorFile = open('../data/errorPairs.txt', 'w')
    noErrorFile = open('../data/noErrorPairs.txt', 'w')
    print("metric: ", metr)
    tree = sn.BallTree(allDicVecs, metric=metr)
    counter = 0
    oldtime = time.time()
    for incorr, corr in spellPairs.items():
        incorrVec = getVec(charVecs, incorr)
        neigborsInds = tree.query([incorrVec], k=numNeighbors,
                                  return_distance=False)
        #print counter
        found = False
        for ind in neigborsInds[0]:
            if corr in dicVec2Word[tuple(allDicVecs[ind])]:
                counter += 1
                found = True
import json
import math

import geopy.distance
import numpy as np
import pandas as pd
from sklearn import neighbors

# load geojson data for manhattan
nycmap = json.load(open("nycpluto_manhattan.geojson"))

# load library data from csv file, convert coordinates to radians, and create coordinate pairs
libs = pd.read_csv('manhattanlibraries.csv', usecols=['facname', 'latitude', 'longitude'])
libs['latitude'] = libs['latitude'].apply(func=math.radians)
libs['longitude'] = libs['longitude'].apply(func=math.radians)
libs['coord'] = list(zip(libs['latitude'], libs['longitude']))

# load library data into BallTree
libcoords = np.asarray(list(libs['coord']))
tree = neighbors.BallTree(libcoords, metric="haversine")

# load lot data from csv file, convert coordinates to radians, and create coordinate pairs
df = pd.read_csv('pluto_small.csv')
df = df.dropna(subset=['assesstot', 'bldgarea', 'lotarea', 'latitude', 'longitude'])
df['latitude'] = df['latitude'].apply(func=math.radians)
df['longitude'] = df['longitude'].apply(func=math.radians)
df['coord'] = list(zip(df['latitude'], df['longitude']))

# query the BallTree and save results back in df
lotcoords = np.asarray(list(df['coord']))
dist, _ = tree.query(X=lotcoords, k=1)
df['dist'] = dist
df['dist'] = df['dist'].apply(lambda x: x * 3960)  # haversine radians -> miles (Earth radius ~3960 mi)

# use Plotly express function to create a choropleth map
# get wordvectors for all words as numpy array
print("Total number of wordvectors=", len(nlp.vocab.vectors))
print("Getting wordvectors...")
wordvecs = numpy.array([nlp.vocab.get_vector(word) for word in words])
print("Retrieved=", len(wordvecs), "wordvectors.")

# ensure the list of words corresponds to the list of wordvectors
assert len(words) == len(wordvecs)
spot_check = random.choice(range(0, len(words)))
assert numpy.array_equal(nlp(words[spot_check]).vector, wordvecs[spot_check])
print("Spot check passed.")

# pickle the entire vocab
# pickle.HIGHEST_PROTOCOL depends on Python version
with open('vocab.pkl', 'wb') as f:
    pickle.dump(words, f, protocol=4)
print("Dumped vocab words to pickle file vocab.pkl")

# place all wordvectors in balltree, and pickle entire tree
tree = nbs.BallTree(wordvecs)
with open('balltree.pkl', 'wb') as f:
    pickle.dump(tree, f, protocol=4)
print("Dumped wordvector BallTree to pickle file balltree.pkl")

# create word:vector dict and pickle it (avoid shadowing the built-in `dict`)
word2vec = dict(zip(words, wordvecs))
with open('dict.pkl', 'wb') as f:
    pickle.dump(word2vec, f, protocol=4)
print("Dumped word2vec dictionary in dict.pkl")
import os

ipAddress = "192.168.1.103"  # Define IP address according to IP address of external device

'''
# Load all the models we need: a face detector, a shape predictor for face landmarks,
# and the face recognition model (using distances between face landmarks)
'''
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
facerec = dlib.face_recognition_model_v1(
    "dlib_face_recognition_resnet_model_v1.dat")

# Load trained face descriptors, and put them in a tree to find the closest one faster
trainingVects = np.load("trainingVects.npy")
trainingLabels = np.load("trainingLabels.npy")
tree = neighbors.BallTree(trainingVects, leaf_size=2)

loop = True         # True until one presses "q"
searchedPerson = 0  # Index of the person to be searched
ratio = 1.2         # Ratio used to resize received image

'''
# Function to display result on pc
'''
def display_result(frame, res):
    if len(res) > 0:
        for i in range(len(res) // 5):
            l, t, r, b = (int(res[i * 5 + 1] / ratio), int(res[i * 5 + 2] / ratio),
                          int(res[i * 5 + 3] / ratio), int(res[i * 5 + 4] / ratio))
d1rad = np.array([data1DEC * np.pi / 180., data1RA * np.pi / 180.]).transpose()
r1rad = np.array([rand1DEC * np.pi / 180., rand1RA * np.pi / 180.]).transpose()

print("Loaded data")

nside_base = 256
# Let's see if I can get the stupid thing to work for nside=256, i.e. a whole bunch of histograms
# Maybe represent them as sparse matrices or something to speed stuff up?
# Actually, maybe it's easier to just write a whole bunch of lists of things.
d1pix = hp.ang2pix(nside_base, data1RA, data1DEC, nest=False, lonlat=True)
r1pix = hp.ang2pix(nside_base, rand1RA, rand1DEC, nest=False, lonlat=True)

print("Computed healpixels")

if not ns.loadtree:
    t0 = time.time()
    d1tree = neighbors.BallTree(d1rad, metric='haversine')
    pickle.dump(d1tree, open('%s-d1tree.p' % (truncate(ns.phot_name)), 'wb'))
    print(time.time() - t0)  # 5x as long as flatsky case
    r1tree = neighbors.BallTree(r1rad, metric='haversine')
    pickle.dump(r1tree, open('%s-r1tree.p' % (truncate(ns.phot_name_randoms)), 'wb'))
else:
    d1tree = pickle.load(open('%s-d1tree.p' % (truncate(ns.phot_name)), 'rb'))
    print("Loaded data tree")
    r1tree = pickle.load(
        open('%s-r1tree.p' % (truncate(ns.phot_name_randoms)), 'rb'))
    print("Loaded random tree")

#zs = np.arange(zmin,zmax+deltaz,deltaz)
zs = np.linspace(zmin, zmax + deltaz,
def cluster_finder(parcels_series, final_eps, threshold=5, verbose=False, point_groups=None):
    points = np.concatenate([
        parcel.np_road_points for parcel in parcels_series if parcel.road_points
    ])
    tree = neighbors.BallTree(points)
    neigh = tree.query_radius(points, final_eps, return_distance=True)
    G = gt.Graph()
    G.add_vertex(points.shape[0])
    if verbose:
        print('created ball tree, neighbors, init graph')

    ## create initial cluster map and prune grouped points from neighbors to create idx_dists
    cluster_map = np.array([None] * points.shape[0])
    idx_dists = [
        np.vstack([x, y.astype(int)]) for x, y in zip(neigh[0], neigh[1])
    ]
    base_idx = 0
    for parcel in parcels_series:
        cluster = Cluster(np.inf, None, parcels=[parcel], sub_clusters=None,
                          threshold=5)
        group = np.arange(base_idx, base_idx + parcel.np_road_points.shape[0])
        for idx in group:
            if idx != base_idx:
                G.add_edge(base_idx, idx)
                G.add_edge(idx, base_idx)
            cluster_map[idx] = cluster
            mask = ~np.isin(idx_dists[idx][0], group)
            idx_dists[idx] = idx_dists[idx][:, mask]
        base_idx += group.shape[0]
    if verbose:
        print('created cluster_map array thing, initialized graph and pruned '
              'neighbors distances from point groups')

    ## create dendrogram
    if verbose:
        print(len(set(cluster_map)), len(cluster_map))
    comp, hist = gt.topology.label_components(G)
    for eps in range(1, final_eps + 1):
        try:
            start = time.time()
            new_edge_set = set()
            for i, x in enumerate(idx_dists):
                mask = (x[1] <= eps) & (x[1] > (eps - 1))
                idxs = list(x[0, mask])
                for idx in idxs:
                    if comp.a[i] != comp.a[idx]:
                        G.add_edge(i, idx)
                        new_edge_set.add(idx)
            if verbose:
                print(time.time() - start)
            if len(new_edge_set) > 0:
                comp, hist = gt.topology.label_components(G)
                for component in set(comp.a[list(new_edge_set)]):
                    scs = []
                    for sc in set(cluster_map[comp.a == component]):
                        sc.death = eps  # TODO: should this be eps or eps - 1
                        scs.append(sc)
                    cluster_map[comp.a == component] = Cluster(
                        np.inf, None, parcels=None, sub_clusters=scs,
                        threshold=threshold)
            if verbose:
                print(time.time() - start)
            curr_num_clusters = len(set(cluster_map))
            if curr_num_clusters == 1:
                break
            if verbose:
                print('%s clusters at eps=%s in %ss' %
                      (curr_num_clusters, eps, time.time() - start))
                print('---')
        except KeyboardInterrupt:
            print('interrupted during graph part @ eps=%s. you should probably '
                  'ignore clusters born at this eps' % eps)
            break

    clusters = set(cluster_map)
    if len(clusters) == 1:
        return clusters.pop()
    return clusters
import csv

import numpy as np
import pandas as pd
from sklearn import neighbors


def Union(lst1, lst2):
    final_list = list(set(lst1) | set(lst2))
    return final_list


X = pd.read_csv("traffic_signals.csv")
X = X.values  # DataFrame.as_matrix() was removed; .values gives the same array
print(type(X))
Y = pd.read_csv("traffix_signals2.csv")
Y = Y.values

New = []
nnn = []
ball_tree = neighbors.BallTree(X, leaf_size=2)
for i in range(len(Y)):
    if i not in New:
        nnn = Union(nnn, [i])
        ind = ball_tree.query_radius([Y[i]], r=50)
        a = ind[0].tolist()
        New = Union(New, a)

X_New = []
for i in range(len(nnn)):
    X_New.append(X[nnn[i]])

np.savetxt('2darray.csv', X_New, delimiter=',', fmt='%d')