import numpy as np
from scipy.stats import norm
from sklearn.neighbors import NearestNeighbors


def prepare_velocity_grid_data(
    X_emb,
    xy_grid_nums,
    density=None,
    smooth=None,
    n_neighbors=None,
):
    n_obs, n_dim = X_emb.shape
    density = 1 if density is None else density
    smooth = 0.5 if smooth is None else smooth

    # Build an evenly spaced grid over each embedding dimension, padded by 1%
    grs, scale = [], 0
    for dim_i in range(n_dim):
        m, M = np.min(X_emb[:, dim_i]), np.max(X_emb[:, dim_i])
        m = m - 0.01 * np.abs(M - m)
        M = M + 0.01 * np.abs(M - m)
        gr = np.linspace(m, M, int(xy_grid_nums[dim_i] * density))
        scale += gr[1] - gr[0]
        grs.append(gr)
    scale = scale / n_dim * smooth

    meshes_tuple = np.meshgrid(*grs)
    X_grid = np.vstack([i.flat for i in meshes_tuple]).T

    # estimate grid velocities
    if n_neighbors is None:
        n_neighbors = np.max([10, int(n_obs / 50)])
    if X_emb.shape[0] > 200000 and X_emb.shape[1] > 2:
        # Approximate search scales better for very large, higher-dimensional data
        from pynndescent import NNDescent

        nn = NNDescent(
            X_emb,
            metric="euclidean",
            n_neighbors=n_neighbors,
            n_jobs=-1,
            random_state=19491001,
        )
        neighs, dists = nn.query(X_grid, k=n_neighbors)
    else:
        alg = "ball_tree" if X_emb.shape[1] > 10 else "kd_tree"
        nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=-1, algorithm=alg)
        nn.fit(X_emb)
        dists, neighs = nn.kneighbors(X_grid)

    # Gaussian kernel weights of each grid point's neighbours
    weight = norm.pdf(x=dists, scale=scale)
    p_mass = weight.sum(1)

    return X_grid, p_mass, neighs, weight
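# Usage sketch (illustrative, not part of the original source): calling
# prepare_velocity_grid_data on a random 2-D embedding. The array sizes and
# grid resolution below are arbitrary assumptions chosen for demonstration.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_emb = rng.normal(size=(1000, 2))  # stand-in for a 2-D embedding (e.g. UMAP)
    X_grid, p_mass, neighs, weight = prepare_velocity_grid_data(
        X_emb, xy_grid_nums=[50, 50], density=1, smooth=0.5
    )
    # One row per point of the 50x50 mesh; p_mass estimates how much data
    # density surrounds each grid point.
    print(X_grid.shape, p_mass.shape)  # (2500, 2) (2500,)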
import pickle
import random

import numpy as np
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors

# Backend-specific dependencies; each is only required for the matching algorithm.
from datasketch import MinHash, MinHashLSHForest
from annoy import AnnoyIndex
from pynndescent import NNDescent


class KNNSearch:
    def __init__(self, features, kwargs):
        self.org_features = features
        if kwargs["normalize"]:
            self.features = preprocessing.normalize(features, norm="l2")
        else:
            self.features = features
        self.kwargs = kwargs
        self.predictor = None

    def fit(self):
        if self.kwargs["algorithm"] == "datasketch":
            self.__datasketch_fit()
        elif self.kwargs["algorithm"] == "annoy":
            self.__annoy_fit()
        elif self.kwargs["algorithm"] == "exact":
            self.__exhaustive_fit()
        elif self.kwargs["algorithm"] == "falconn":
            self.__falconn_fit()
        elif self.kwargs["algorithm"] == "descent":
            self.__descent_fit()
        elif self.kwargs["algorithm"] == "random":
            self.__random_fit()
        else:
            raise Exception("Algorithm=[{}] not yet implemented".format(
                self.kwargs["algorithm"]))

    def predict(self, input, k):
        if self.kwargs["algorithm"] == "datasketch":
            return self.__datasketch_predict(input, k)
        elif self.kwargs["algorithm"] == "annoy":
            return self.__annoy_predict(input, k)
        elif self.kwargs["algorithm"] == "exact":
            return self.__exhaustive_predict(input, k)
        elif self.kwargs["algorithm"] == "falconn":
            return self.__falconn_predict(input, k)
        elif self.kwargs["algorithm"] == "descent":
            return self.__descent_predict(input, k)
        elif self.kwargs["algorithm"] == "random":
            return self.__random_predict(input, k)
        else:
            raise Exception("Algorithm=[{}] not yet implemented".format(
                self.kwargs["algorithm"]))

    def __datasketch_fit(self):
        if self.kwargs["create"]:
            # Create one MinHash per sample and index them all in an LSH forest
            min_hash_obj_list = []
            forest = MinHashLSHForest(num_perm=self.kwargs["num_perm"])
            for i in range(len(self.features)):
                min_hash_obj_list.append(MinHash(num_perm=self.kwargs["num_perm"]))
                for d in self.features[i]:
                    min_hash_obj_list[i].update(d)
                forest.add(i, min_hash_obj_list[i])
            # IMPORTANT: must call index(), otherwise the keys won't be searchable
            forest.index()
            with open(self.kwargs["file_path"], "wb") as f:
                pickle.dump(forest, f)
                pickle.dump(min_hash_obj_list, f)
            self.predictor = [forest, min_hash_obj_list]
        else:
            with open(self.kwargs["file_path"], "rb") as f:
                forest = pickle.load(f)
                min_hash_obj_list = pickle.load(f)
            self.predictor = [forest, min_hash_obj_list]

    def __datasketch_predict(self, input, k):
        forest, min_hash_obj_list = self.predictor
        if isinstance(input, int):
            # An int is interpreted as the index of an already hashed sample
            return forest.query(min_hash_obj_list[input], k)
        else:
            min_hash_obj = MinHash(num_perm=self.kwargs["num_perm"])
            for d in input:
                min_hash_obj.update(d)
            return forest.query(min_hash_obj, k)

    def __annoy_fit(self):
        if self.kwargs["create"]:
            indexer = AnnoyIndex(self.features.shape[1], self.kwargs["metric"])
            for i, f in enumerate(self.features):
                indexer.add_item(i, f)
            indexer.build(self.kwargs["num_trees"])
            indexer.save(self.kwargs["file_path"])
            self.predictor = indexer
        else:
            forest = AnnoyIndex(self.features.shape[1], self.kwargs["metric"])
            forest.load(self.kwargs["file_path"])
            self.predictor = forest

    def __annoy_predict(self, input, k):
        annoy_forest = self.predictor
        if isinstance(input, int):
            return annoy_forest.get_nns_by_item(input, k, search_k=-1,
                                                include_distances=False)
        else:
            return annoy_forest.get_nns_by_vector(input, k, search_k=-1,
                                                  include_distances=False)

    def __exhaustive_fit(self):
        self.predictor = NearestNeighbors(algorithm="ball_tree")
        self.predictor.fit(self.features)

    def __exhaustive_predict(self, input, k):
        if isinstance(input, int):
            return self.predictor.kneighbors(self.features[input].reshape(1, -1),
                                             n_neighbors=k,
                                             return_distance=False)[0]
        else:
            return self.predictor.kneighbors(input.reshape(1, -1),
                                             n_neighbors=k,
                                             return_distance=False)[0]

    def __falconn_fit(self):
        """Initializes locality-sensitive hashing with FALCONN to find
        nearest neighbors in the training data."""
        import falconn

        dimension = self.features.shape[1]
        nb_tables = self.kwargs["nb_tables"]
        number_bits = self.kwargs["number_bits"]

        # LSH parameters
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = dimension
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = nb_tables
        params_cp.num_rotations = 2  # 1 for dense data; 2 for sparse data
        params_cp.seed = 5721840
        params_cp.num_setup_threads = 0  # use all available threads for setup
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        # Build number_bits-bit hashes so that each table has 2^number_bits bins;
        # a rule of thumb is to have the number of bins be of the same order of
        # magnitude as the number of data points.
        falconn.compute_number_of_hash_functions(number_bits, params_cp)

        self._falconn_table = falconn.LSHIndex(params_cp)
        self._falconn_query_object = None
        self._FALCONN_NB_TABLES = nb_tables

        # Center the dataset and the queries: this improves the performance of LSH quite a bit.
        self.center = np.mean(self.features, axis=0)
        self.features -= self.center

        # Add the (centered) features to the FALCONN table
        self._falconn_table.setup(self.features)

    def __falconn_predict(self, input, k):
        if isinstance(input, int):
            # Indexed samples are already normalized and centered
            input = self.features[input]
        else:
            # Normalize the query if cosine similarity is what matters
            if self.kwargs["normalize"]:
                input /= np.linalg.norm(input)
            # Center the query the same way as the dataset: this improves
            # the performance of LSH quite a bit.
            input -= self.center

        # Construct the query object lazily; constructing it before
        # table.setup() has been called would be an error.
        if self._falconn_query_object is None:
            self._falconn_query_object = self._falconn_table.construct_query_object()
            self._falconn_query_object.set_num_probes(self._FALCONN_NB_TABLES)

        return self._falconn_query_object.find_k_nearest_neighbors(input, k)

    def __descent_fit(self):
        self.predictor = NNDescent(data=self.features, metric=self.kwargs["metric"])

    def __descent_predict(self, input, k):
        # NNDescent expects an array of query points
        input = np.expand_dims(input, axis=0)
        index = self.predictor
        # query() returns (indices, distances); keep the indices of the single query
        return index.query(input, k)[0][0]

    def __random_fit(self):
        pass

    def __random_predict(self, input, k):
        # Baseline: k indices drawn uniformly at random
        return [random.randint(0, len(self.features) - 1) for _ in range(k)]
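# Usage sketch (an assumption, not from the original source): building an exact
# index over random vectors and querying it by index and by vector. The kwargs
# keys mirror the ones the class reads ('normalize', 'algorithm'); other
# backends additionally expect keys such as 'create', 'file_path', 'num_perm',
# 'num_trees', 'metric', 'nb_tables', or 'number_bits'.
if __name__ == "__main__":
    features = np.random.rand(500, 64).astype(np.float32)
    searcher = KNNSearch(features, {"normalize": True, "algorithm": "exact"})
    searcher.fit()
    print(searcher.predict(0, k=5))                # neighbours of indexed point 0
    query = np.random.rand(64).astype(np.float32)  # an out-of-index query vector
    print(searcher.predict(query, k=5))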
    # Note: relies on NNDescent / NearestNeighbors plus the module-level helpers
    # remove_self_neighbors and distance_SNN, and on self.query_wrapper_.
    def build_knn_index(self, data, min_n_neighbors=20, rho=0.5):
        """
        Build a KNN index for the given data set. There will be two KNN indices if the
        shared nearest neighbor (SNN) distance is used.

        :param data: numpy data array of shape `(N, d)`, where `N` is the number of
                     samples and `d` is the number of dimensions (features).
        :param min_n_neighbors: minimum number of nearest neighbors to use for the
                                `NN-descent` method.
        :param rho: `rho` parameter used by the `NN-descent` method.

        :return: A list with one or two KNN indices.
        """
        # Add one extra neighbor because querying the points that are part of the KNN
        # index returns a neighbor set containing the queried point itself, which is
        # removed from the query result afterwards.
        if self.shared_nearest_neighbors:
            k = max(1 + self.n_neighbors_snn, min_n_neighbors)
        else:
            k = max(1 + self.n_neighbors, min_n_neighbors)

        # KNN index based on the primary distance metric
        if self.approx_nearest_neighbors:
            params = {
                'metric': self.metric,
                'metric_kwds': self.metric_kwargs,
                'n_neighbors': k,
                'rho': rho,
                'random_state': self.seed_rng,
                'n_jobs': self.n_jobs
            }
            index_knn_primary = NNDescent(data, **params)
        else:
            # Exact KNN graph
            index_knn_primary = NearestNeighbors(
                n_neighbors=k,
                algorithm='brute',
                metric=self.metric,
                metric_params=self.metric_kwargs,
                n_jobs=self.n_jobs
            )
            index_knn_primary.fit(data)

        if self.shared_nearest_neighbors:
            # Construct a second KNN index that uses the shared nearest neighbor distance
            data_neighbors, _ = remove_self_neighbors(
                *self.query_wrapper_(data, index_knn_primary, self.n_neighbors_snn + 1)
            )
            if self.approx_nearest_neighbors:
                params = {
                    'metric': distance_SNN,
                    'n_neighbors': max(1 + self.n_neighbors, min_n_neighbors),
                    'rho': rho,
                    'random_state': self.seed_rng,
                    'n_jobs': self.n_jobs
                }
                index_knn_secondary = NNDescent(data_neighbors, **params)
            else:
                index_knn_secondary = NearestNeighbors(
                    n_neighbors=(1 + self.n_neighbors),
                    algorithm='brute',
                    metric=distance_SNN,
                    n_jobs=self.n_jobs
                )
                index_knn_secondary.fit(data_neighbors)

            index_knn = [index_knn_primary, index_knn_secondary]
        else:
            index_knn = [index_knn_primary]

        return index_knn
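# For context, a minimal sketch of the shared nearest neighbor (SNN) distance
# that `distance_SNN` above presumably implements: two points count as close
# when their k-NN index sets overlap heavily. `snn_distance_sketch` is a
# hypothetical helper, written only to illustrate the idea.
import numpy as np

def snn_distance_sketch(neighbors_a, neighbors_b):
    """Distance derived from the overlap of two arrays of neighbor indices."""
    overlap = np.intersect1d(neighbors_a, neighbors_b, assume_unique=True).size
    return 1.0 - overlap / len(neighbors_a)  # 0 if the sets coincide, 1 if disjoint

# e.g. snn_distance_sketch(np.array([1, 2, 3, 4]), np.array([2, 3, 5, 6])) -> 0.5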
import logging
from typing import Any, Tuple

import numpy as np
from scipy import sparse
from sklearn.neighbors import NearestNeighbors as _NearestNeighbors
from pynndescent import NNDescent


class NearestNeighbors:
    """Greedy algorithm to balance a K-nearest neighbour graph

    It has an API similar to scikit-learn

    Parameters
    ----------
    k : int (default=50)
        the number of neighbours in the final graph
    sight_k : int (default=100)
        the number of neighbours in the initialization graph.
        It corresponds to the farthest neighbour that a sample is allowed to
        connect to when none of its closer neighbours are allowed. If sight_k
        is reached, the matrix is filled with the sample itself
    maxl : int (default=200)
        max degree of connectivity allowed. Avoids the presence of hubs in the
        graph; it is the maximum number of neighbours that are allowed to
        contact a node before the node is blocked
    mode : str (default="distance")
        decides which kind of output, "distance" or "connectivity"
    metric : str (default="euclidean")
        metric used for the KNN search; "js" switches to an NN-descent index
        with the Jensen-Shannon distance
    minkowski_p : int (default=20)
        p parameter passed to the scikit-learn KNN search
    n_jobs : int (default=-1)
        parallelization of the standard KNN search performed at initialization
    """

    def __init__(self, k: int = 50, sight_k: int = 100, maxl: int = 200,
                 mode: str = "distance", metric: str = "euclidean",
                 minkowski_p: int = 20, n_jobs: int = -1) -> None:
        # input parameters
        self.k = k
        self.sight_k = sight_k
        self.maxl = maxl
        self.mode = mode
        self.metric = metric
        self.minkowski_p = minkowski_p
        self.n_jobs = n_jobs
        # NN graphs
        self.data = None
        self._nn = None    # raw KNN
        self.bknn = None   # balanced KNN
        self.dist = None   # balanced KNN distances
        self.dsi = None    # balanced KNN neighbor index
        self.l = None      # balanced KNN degree of connectivity
        self.mknn = None   # mutual KNN based on bknn
        self.rnn = None    # radius NN based on mknn

    @property
    def n_samples(self) -> int:
        return self.data.shape[0]

    def fit(self, data: np.ndarray, sight_k: int = None) -> Any:
        """Fits the model

        data: np.ndarray (samples, features)
        sight_k: int
            the farthest point that a node is allowed to connect to
            when its closest neighbours are not allowed
        """
        self.data = data
        if sight_k is not None:
            self.sight_k = sight_k
        logging.debug(
            f"First search the {self.sight_k} nearest neighbours for {self.n_samples}")
        np.random.seed(13)
        if self.metric == "correlation":
            self._nn = _NearestNeighbors(n_neighbors=self.sight_k + 1,
                                         metric=self.metric,
                                         p=self.minkowski_p,
                                         n_jobs=self.n_jobs,
                                         algorithm="brute")
            self._nn.fit(self.data)
        elif self.metric == "js":
            self._nn = NNDescent(data=self.data, metric=jensen_shannon_distance)
        else:
            self._nn = _NearestNeighbors(n_neighbors=self.sight_k + 1,
                                         metric=self.metric,
                                         p=self.minkowski_p,
                                         n_jobs=self.n_jobs,
                                         leaf_size=30)
            self._nn.fit(self.data)
        # call this to calculate bknn
        self.kneighbors_graph(mode='distance')
        return self

    def kneighbors(self, X: np.ndarray = None, maxl: int = None,
                   mode: str = "distance"
                   ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Finds the K-neighbors of a point.

        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : array-like, shape (n_query, n_features)
            The query point or points.
            If not provided, neighbors of each indexed point are returned.
            In this case, the query point is not considered its own neighbor.
        maxl : int
            max degree of connectivity allowed
        mode : "distance" or "connectivity"
            Decides the kind of output

        Returns
        -------
        dist_new : np.ndarray (samples, k+1)
            distances to the NN
        dsi_new : np.ndarray (samples, k+1)
            indexes of the NN, first column is the sample itself
        l : np.ndarray (samples)
            l[i] is the number of connections from other samples to the sample i

        NOTE: the first column (0) corresponds to the sample itself; the nearest
        neighbour is in the second column (1)
        """
        if self._nn is None:
            raise ValueError("must fit() before generating kneighbors graphs")
        if X is not None:
            self.data = X
        if maxl is not None:
            self.maxl = maxl
        if mode == "distance":
            if self.metric == "js":
                self.dsi, self.dist = self._nn.query(self.data, k=self.sight_k + 1)
            else:
                self.dist, self.dsi = self._nn.kneighbors(self.data,
                                                          return_distance=True)
        else:
            if self.metric == "js":
                self.dsi, _ = self._nn.query(self.data, k=self.sight_k + 1)
            else:
                self.dsi = self._nn.kneighbors(self.data, return_distance=False)
            self.dist = np.ones_like(self.dsi, dtype='float64')
            self.dist[:, 0] = 0
        logging.debug(
            f"Using the initialization network to find a {self.k}-NN "
            f"graph with maximum connectivity of {self.maxl}")
        self.dist, self.dsi, self.l = knn_balance(self.dsi, self.dist,
                                                  maxl=self.maxl, k=self.k)
        return self.dist, self.dsi, self.l

    def kneighbors_graph(self, X: np.ndarray = None, maxl: int = None,
                         mode: str = "distance") -> sparse.csr_matrix:
        """Return the K-neighbors graph as a sparse csr matrix

        Parameters
        ----------
        X : array-like, shape (n_query, n_features)
            The query point or points.
            If not provided, neighbors of each indexed point are returned.
            In this case, the query point is not considered its own neighbor.
        maxl : int
            max degree of connectivity allowed
        mode : "distance" or "connectivity"
            Decides the kind of output

        Returns
        -------
        neighbor_graph : scipy.sparse.csr_matrix
            The values are either distances or connectivity depending on the
            mode parameter

        NOTE: the diagonal will be zero even though the value 0 is actually stored
        """
        dist_new, dsi_new, _ = self.kneighbors(X=X, maxl=maxl, mode=mode)
        logging.debug("Returning sparse matrix")
        # Build the csr matrix directly from (data, indices, indptr)
        self.bknn = sparse.csr_matrix(
            (np.ravel(dist_new),
             np.ravel(dsi_new),
             np.arange(0, dist_new.shape[0] * dist_new.shape[1] + 1,
                       dist_new.shape[1])),
            (self.n_samples, self.n_samples))
        self.bknn.eliminate_zeros()
        return self.bknn

    def mnn_graph(self):
        """get mutual nearest neighbor graph from bknn"""
        if self.mknn is None:
            if self.bknn is None:
                raise ValueError("must fit() before generating kneighbors graphs")
            # element-wise minimum between bknn and bknn.T, so any non-mutual
            # value becomes 0
            self.mknn = self.bknn.minimum(self.bknn.transpose())
        return self.mknn

    def rnn_graph(self):
        """get rnn from mknn, return a sparse binary matrix"""
        # Convert distances to similarities
        if self.mknn is None:
            self.mnn_graph()
        mknn_sim = self.mknn.copy()
        bknn_sim = self.bknn.copy()
        max_d = self.bknn.data.max()
        bknn_sim.data = (max_d - bknn_sim.data) / max_d
        mknn_sim.data = (max_d - mknn_sim.data) / max_d
        mknn_sim = mknn_sim.tocoo()
        mknn_sim.setdiag(0)

        # Compute the effective resolution
        d = 1 - bknn_sim.data
        radius = np.percentile(d, 90)
        logging.info(f" 90th percentile radius: {radius:.02}")
        inside = mknn_sim.data > 1 - radius
        self.rnn = sparse.coo_matrix(
            (mknn_sim.data[inside],
             (mknn_sim.row[inside], mknn_sim.col[inside])),
            shape=mknn_sim.shape)
        return self.rnn
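# Usage sketch (assumes the module-level `knn_balance` helper used inside
# `kneighbors` is available alongside the class; data sizes are arbitrary):
# build the balanced graph, then derive the mutual-KNN and radius-pruned
# graphs from it.
if __name__ == "__main__":
    data = np.random.rand(300, 10)
    bknn_builder = NearestNeighbors(k=10, sight_k=30, maxl=40)
    bknn_builder.fit(data)           # computes the balanced KNN graph internally
    bknn = bknn_builder.bknn         # csr_matrix of balanced-KNN distances
    mknn = bknn_builder.mnn_graph()  # keep only mutual edges
    rnn = bknn_builder.rnn_graph()   # prune mutual edges by the 90th-percentile radius
    print(bknn.shape, mknn.nnz, rnn.nnz)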