def _construct_mnn(t1_cells, t2_cells, data_df, n_neighbors, device, n_jobs=-2):
    # Construct mutually nearest neighbors between the cells of two timepoints
    if device == "gpu":
        from cuml import NearestNeighbors
        nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    elif device == "cpu":
        from sklearn.neighbors import NearestNeighbors
        nbrs = NearestNeighbors(n_neighbors=n_neighbors,
                                metric='euclidean', n_jobs=n_jobs)

    print('t+1 neighbors of t...')
    nbrs.fit(data_df.loc[t1_cells, :].values)
    t1_nbrs = nbrs.kneighbors_graph(
        data_df.loc[t2_cells, :].values, mode='distance')

    print('t neighbors of t+1...')
    nbrs.fit(data_df.loc[t2_cells, :].values)
    t2_nbrs = nbrs.kneighbors_graph(
        data_df.loc[t1_cells, :].values, mode='distance')

    # Mutually nearest neighbors: the element-wise product is nonzero only where an edge
    # exists in both directions; sqrt recovers the distance scale from the product of the
    # two (near-identical) directed distances
    mnn = t2_nbrs.multiply(t1_nbrs.T)
    mnn = mnn.sqrt()
    return mnn
def find_similar_image():
    # operates on module-level `test`, `image_embeddings` and `embeds`; the names deleted
    # below must be declared global for the function body to be valid
    global image_embeddings, embeds

    KNN = 50
    if len(test) == 3:
        KNN = 2
    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(image_embeddings)

    preds = []
    CHUNK = 1024 * 4

    print('Finding similar images...')
    CTS = len(image_embeddings) // CHUNK
    if len(image_embeddings) % CHUNK != 0:
        CTS += 1
    for j in range(CTS):
        a = j * CHUNK
        b = (j + 1) * CHUNK
        b = min(b, len(image_embeddings))
        print('chunk', a, 'to', b)
        distances, indices = model.kneighbors(image_embeddings[a:b, ])
        for k in range(b - a):
            IDX = np.where(distances[k, ] < 6.0)[0]
            IDS = indices[k, IDX]
            o = test.iloc[IDS].posting_id.values
            preds.append(o)

    del model, distances, indices, image_embeddings, embeds
    _ = gc.collect()

    test['preds2'] = preds
    test.head()
def get_image_predictions(df, embeddings, threshold=0.0):
    if len(df) > 3:
        KNN = 50
    else:
        KNN = 3
    model = NearestNeighbors(n_neighbors=KNN, metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k, ] < threshold)[0]
        ids = indices[k, idx]
        posting_ids = df['posting_id'].iloc[ids].values

        if len(posting_ids) >= 2:
            # re-check with a slightly tighter threshold; keep the tighter result only if
            # it still yields at least one match besides the query itself
            idx_s = np.where(distances[k, ] < threshold - 0.08888)[0]
            ids_s = indices[k, idx_s]
            posting_ids_b = df['posting_id'].iloc[ids_s].values
            if len(posting_ids_b) >= 2:
                predictions.append(posting_ids_b)
            else:
                predictions.append(posting_ids)
        else:
            # nothing matched under the main threshold: relax it and keep at most two ids
            idx = np.where(distances[k, ] < 0.51313)[0]
            ids = indices[k, idx]
            posting_ids = df['posting_id'].iloc[ids].values
            predictions.append(posting_ids[:2])

    del model, distances, indices
    gc.collect()
    return predictions
def find_matching_image_with_rapids():
    model = EfficientNetB0(weights='imagenet', include_top=False,
                           pooling='avg', input_shape=None)
    train_gen = DataGenerator(train, batch_size=128)
    image_embeddings = model.predict(train_gen, verbose=1)
    print('image embeddings shape is', image_embeddings.shape)

    # After fitting KNN, we will display some example rows of train and their 8 closest
    # other images in train (based on EffNetB0 image embeddings).
    KNN = 50
    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(image_embeddings)
    distances, indices = model.kneighbors(image_embeddings)

    for k in range(180, 190):
        plt.figure(figsize=(20, 3))
        plt.plot(np.arange(50), cupy.asnumpy(distances[k, ]), 'o-')
        plt.title('Image Distance From Train Row %i to Other Train Rows' % k, size=16)
        plt.ylabel('Distance to Train Row %i' % k, size=14)
        plt.xlabel('Index Sorted by Distance to Train Row %i' % k, size=14)
        plt.show()

        cluster = train.loc[cupy.asnumpy(indices[k, :8])]
        displayDF(cluster, random=False, ROWS=2, COLS=4)
def get_preds(embs_path, threshold):
    image_embeddings = np.load(embs_path)['embeddings']

    KNN = 50
    if len(df) == 3:
        KNN = 2
    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(image_embeddings)

    image_embeddings = cupy.array(image_embeddings)
    preds = []
    CHUNK = 1024 * 4

    # print('Finding similar images...')
    CTS = len(image_embeddings) // CHUNK
    if len(image_embeddings) % CHUNK != 0:
        CTS += 1
    for j in range(CTS):
        a = j * CHUNK
        b = (j + 1) * CHUNK
        b = min(b, len(image_embeddings))
        cts = cupy.matmul(image_embeddings, image_embeddings[a:b].T).T
        for k in range(b - a):
            IDX = cupy.where(cts[k, ] > threshold)[0]
            IDX = cupy.asnumpy(IDX)
            o = cpu_df.iloc[IDX].posting_id.values
            preds.append(o)
    return preds
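# Note on get_preds above: the chunked cupy.matmul treats the dot product as a similarity
# score, which matches cosine similarity only when the embeddings are L2-normalized (as is
# done with sklearn's normalize() in the notebook-style snippet further below). A minimal
# CPU-only sketch of the same pattern on synthetic toy embeddings (all names hypothetical):
import numpy as np
from sklearn.preprocessing import normalize

toy_emb = normalize(np.random.rand(6, 4).astype(np.float32))  # unit-length rows
CHUNK, threshold = 4, 0.9
matches = []
for a in range(0, len(toy_emb), CHUNK):
    b = min(a + CHUNK, len(toy_emb))
    sims = (toy_emb @ toy_emb[a:b].T).T  # one row of similarities per query in the chunk
    for k in range(b - a):
        matches.append(np.where(sims[k] > threshold)[0])  # indices above the similarity threshold
print(matches)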
def compute_neighbors_rapids(X: np.ndarray, n_neighbors: int, metric: _Metric = 'euclidean'):
    """Compute nearest neighbors using RAPIDS cuml.

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        The data to compute nearest neighbors for.
    n_neighbors
        The number of neighbors to use.
    metric
        The metric to use to compute distances in high dimensional space.
        This string must match a valid predefined metric in RAPIDS cuml.

    Returns
    -------
    **knn_indices**, **knn_dists** : np.arrays of shape (n_observations, n_neighbors)
    """
    from cuml.neighbors import NearestNeighbors
    nn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
    X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
    nn.fit(X_contiguous)
    knn_dist, knn_indices = nn.kneighbors(X_contiguous)
    return knn_indices, knn_dist
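# A minimal usage sketch for compute_neighbors_rapids above; it assumes a CUDA-capable GPU
# with RAPIDS cuML installed, and the input array below is synthetic, purely for illustration.
import numpy as np

X_demo = np.random.rand(1000, 32).astype(np.float32)
knn_indices, knn_dists = compute_neighbors_rapids(X_demo, n_neighbors=15)
print(knn_indices.shape, knn_dists.shape)  # both (1000, 15); each row's first neighbor is the point itself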
def get_image_neighbors(df, embeddings, threshold=args.threshold):
    n_neighbors = args.n_neighbors_max if len(df) > 3 else args.n_neighbors_min
    model_nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors)
    model_nearest_neighbors.fit(embeddings)
    distances, indices = model_nearest_neighbors.kneighbors(embeddings)

    predictions = []
    for k in range(embeddings.shape[0]):
        idx = np.where(distances[k] < threshold)[0]
        ids = indices[k, idx]
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)

    del model_nearest_neighbors, distances, indices
    gc.collect()
    return predictions
def _find_distance_threshold(
    self,
    features,
    posting_ids: np.ndarray,
    thresholds: List[float],
) -> Tuple[float, float, List[List[str]]]:
    features = F.normalize(torch.from_numpy(features)).numpy()
    with TimeUtil.timer("nearest neighbor search"):
        model = NearestNeighbors(n_neighbors=len(self.valid_df), n_jobs=32)
        model.fit(features)
        distances, indices = model.kneighbors(features)

    FileUtil.save_npy(
        distances,
        self.config.dir_config.output_dir
        / f"distances_{self.fold}_{self.current_epoch:02d}.npy",
    )
    FileUtil.save_npy(
        indices,
        self.config.dir_config.output_dir
        / f"indices_{self.fold}_{self.current_epoch:02d}.npy",
    )

    best_score = 0
    best_threshold = -1
    best_y_pred: List[List[str]] = []
    for threshold in thresholds:
        y_pred = []
        for i in range(len(distances)):
            IDX = np.where(distances[i] < threshold)[0]
            if len(IDX) < self.config.inference_config.min_indices:
                IDX = list(range(self.config.inference_config.min_indices))
            idxs = indices[i, IDX]
            y_pred.append(posting_ids[idxs])

        scores = MetricUtil.f1_scores(self.valid_df["target"].tolist(), y_pred)
        precisions, recalls = MetricUtil.precision_recall(
            self.valid_df["target"].tolist(), y_pred
        )
        self.valid_df["score"] = scores
        self.valid_df["precision"] = precisions
        self.valid_df["recall"] = recalls
        selected_score = self.valid_df["score"].mean()
        _p_mean = self.valid_df["precision"].mean()
        _r_mean = self.valid_df["recall"].mean()
        print(
            f"----------- valid f1: {selected_score} precision: {_p_mean} "
            f"recall: {_r_mean} threshold: {threshold} ------------"
        )
        if selected_score > best_score:
            best_score = selected_score
            best_threshold = threshold
            best_y_pred = y_pred
    return best_score, best_threshold, best_y_pred
def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors,
                              precomputed_nearest_neighbors):
    n_clusters = 30
    random_state = 42
    metric = 'euclidean'

    X, _ = make_blobs(n_samples=n_rows, centers=n_clusters,
                      n_features=n_features, random_state=random_state)

    if precomputed_nearest_neighbors:
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
        nn.fit(X)
        knn_dists, knn_indices = nn.kneighbors(X, n_neighbors,
                                               return_distance=True)
        cu_fss_graph = cu_fuzzy_simplicial_set(X, n_neighbors, random_state, metric,
                                               knn_indices=knn_indices,
                                               knn_dists=knn_dists)
        knn_indices = knn_indices.get()
        knn_dists = knn_dists.get()
        ref_fss_graph = ref_fuzzy_simplicial_set(X, n_neighbors, random_state, metric,
                                                 knn_indices=knn_indices,
                                                 knn_dists=knn_dists)[0].tocoo()
    else:
        cu_fss_graph = cu_fuzzy_simplicial_set(X, n_neighbors, random_state, metric)
        X = X.get()
        ref_fss_graph = ref_fuzzy_simplicial_set(X, n_neighbors, random_state,
                                                 metric)[0].tocoo()

    cu_fss_graph = cu_fss_graph.todense()
    ref_fss_graph = cp.sparse.coo_matrix(ref_fss_graph).todense()
    assert correctness_sparse(ref_fss_graph,
                              cu_fss_graph,
                              atol=0.1,
                              rtol=0.2,
                              threshold=0.95)
def get_image_neighbors(df, embeddings, KNN=50):
    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    threshold = 4.5
    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k, ] < threshold)[0]
        ids = indices[k, idx]
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)

    del model, distances, indices
    gc.collect()
    return df, predictions
def reduce_dimensionality(self, embeddings):
    """Reduce dimensionality of embeddings using UMAP and train a UMAP model

    Args:
        embeddings (cupy.ndarray): The extracted embeddings using the sentence transformer module.

    Returns:
        umap_embeddings: The reduced embeddings
    """
    m_cos = NearestNeighbors(n_neighbors=15, metric="cosine")
    m_cos.fit(embeddings)
    knn_graph_cos = m_cos.kneighbors_graph(embeddings, mode="distance")
    u1 = UMAP(n_neighbors=15, n_components=5, min_dist=0.0)
    umap_embeddings = u1.fit_transform(embeddings, knn_graph=knn_graph_cos)
    return umap_embeddings
def get_image_neighbors(df, embeddings, KNN=50):
    model = NearestNeighbors(n_neighbors=KNN)  # create the KNN model
    model.fit(embeddings)                      # fit on the image features
    # get the pairwise distances (i.e. similarities) between images
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(embeddings.shape[0])):  # compare every image against the others
        # a distance threshold controls how strict the matching is
        idx = np.where(distances[k, ] < CFG.img_thres)[0]
        # for images that matched nothing else, relax the threshold and try again
        if len(idx) == 1:
            idx = np.where(distances[k, ] < (CFG.img_thres + CFG.addition))[0]
        ids = indices[k, idx]
        posting_ids = df['posting_id'].iloc[ids].values  # the matched postings
        predictions.append(posting_ids)

    del model, distances, indices
    gc.collect()
    return predictions
def compute_neighbors_rapids(X: np.ndarray, n_neighbors: int):
    """Compute nearest neighbors using RAPIDS cuml.

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        The data to compute nearest neighbors for.
    n_neighbors
        The number of neighbors to use.

    Returns
    -------
    **knn_indices**, **knn_dists** : np.arrays of shape (n_observations, n_neighbors)
    """
    from cuml.neighbors import NearestNeighbors
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
    nn.fit(X_contiguous)
    knn_distsq, knn_indices = nn.kneighbors(X_contiguous)
    # cuml uses the sqeuclidean metric, so take the sqrt to get euclidean distances
    return knn_indices, np.sqrt(knn_distsq)
def setNodeData(self, nodes_data: xa.DataArray, **kwargs):
    print(f"{self.__class__.__name__}[{hex(id(self))}].setNodeData: input shape = {nodes_data.shape}")
    if self.reset or (self.nodes is None):
        if (nodes_data.size > 0):
            t0 = time.time()
            self.nodes = cudf.DataFrame({
                icol: nodes_data[:, icol]
                for icol in range(nodes_data.shape[1])
            })
            self.nnd = NearestNeighbors(n_neighbors=self.nneighbors)
            self.nnd.fit(self.nodes)
            self.D, self.I = self.nnd.kneighbors(self.nodes, return_distance=True)
            dt = (time.time() - t0)
            print(f"Computed NN Graph with {self.nnd.n_neighbors} neighbors and "
                  f"{nodes_data.shape[0]} verts in {dt} sec ({dt/60} min)")
            print(f" ---> Indices shape = {self.I.shape}, Distances shape = {self.D.shape} ")
        else:
            print("No data available for this block")
def compute_neighbors_sklearn(X: np.ndarray, n_neighbors: int):
    """Compute nearest neighbors using sklearn

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        The data to compute nearest neighbors for.
    n_neighbors
        The number of neighbors to use.

    Returns
    -------
    **knn_indices**, **knn_dists** : np.arrays of shape (n_observations, n_neighbors)
    """
    from sklearn.neighbors import NearestNeighbors
    import time

    t0 = time.time()
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
    nn.fit(X_contiguous)
    knn_dist, knn_indices = nn.kneighbors(X_contiguous)
    print("sklearn kNN time:", time.time() - t0)
    return knn_indices, knn_dist
def find_similar_titles_with_rapids_knn():
    """
    First we will extract text embeddings using RAPIDS cuML's TfidfVectorizer.
    This will turn every title into a one-hot-encoding of the words present.
    We will then compare one-hot-encodings with RAPIDS cuML KNN to find titles that are similar.
    :return:
    """
    # LOAD TRAIN ONTO THE GPU WITH CUDF
    train_gf = cudf.read_csv('../input/shopee-product-matching/train.csv')
    print('train shape is', train_gf.shape)
    train_gf.head()

    # Extract text embeddings with RAPIDS TfidfVectorizer.
    # TfidfVectorizer returns a cupy sparse matrix; we convert it to a dense cupy matrix
    # and feed that into RAPIDS cuML KNN.
    model = TfidfVectorizer(stop_words='english', binary=True)
    text_embeddings = model.fit_transform(train_gf.title).toarray()
    print('text embeddings shape is', text_embeddings.shape)

    # After fitting KNN, we will display some example rows of train and their 10 closest
    # other titles in train (based on the word-count one-hot-encoding).
    KNN = 50
    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(text_embeddings)
    distances, indices = model.kneighbors(text_embeddings)

    for k in range(5):
        plt.figure(figsize=(20, 3))
        plt.plot(np.arange(50), cupy.asnumpy(distances[k, ]), 'o-')
        plt.title('Text Distance From Train Row %i to Other Train Rows' % k, size=16)
        plt.ylabel('Distance to Train Row %i' % k, size=14)
        plt.xlabel('Index Sorted by Distance to Train Row %i' % k, size=14)
        plt.show()
        print(train_gf.loc[cupy.asnumpy(indices[k, :10]), ['title', 'label_group']])
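# A CPU-only sketch of the TF-IDF idea described in the docstring above, with scikit-learn's
# TfidfVectorizer swapped in for the cuML one; binary=True makes term counts 0/1 before
# TF-IDF weighting. The titles below are made up for illustration.
from sklearn.feature_extraction.text import TfidfVectorizer

titles = ['blue cotton shirt', 'cotton shirt blue', 'stainless steel bottle']
vec = TfidfVectorizer(stop_words='english', binary=True)
emb = vec.fit_transform(titles).toarray()
print(emb.shape)  # (3, vocabulary size); the first two rows are identical bags of words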
def computeGrid(img_collection, X_2d, out_res, out_dim):
    #grid = np.zeros(())
    out = np.ones((out_dim * out_res, out_dim * out_res, 4), dtype=np.uint8)
    nn = NearestNeighbors(n_neighbors=1)
    d = {}  # TODO: assumes unique
    #for i, xy in enumerate(X_2d):
    #    d[tuple(xy)] = i

    def build(remaining):
        xs = sorted(remaining, key=lambda x: x[0])
        ys = sorted(remaining, key=lambda x: x[1])
        X = np.zeros((out_dim * out_dim, 2), np.float32)
        for x in range(out_dim):
            for y in range(out_dim):
                X[y * out_dim + x] = np.array([
                    xs[int(x * len(xs) / out_dim)][0],
                    ys[int(y * len(ys) / out_dim)][1]
                ])
        #X = np.array(X)
        #print(X.shape)
        nn.fit(remaining)
        X_cudf = cudf.DataFrame(X)
        distances, indices = nn.kneighbors(X_cudf)
        return indices

    #remaining = X_2d
    indices = build(X_2d)
    seen = {}
    #for i, x in enumerate(xs):
    #    for j, y in enumerate(ys):
    for x in range(out_dim):
        for y in range(out_dim):
            done = False
            e = 0
            while not done:
                #X = (xs[int(x * len(xs) / out_dim)], ys[int(y * len(ys) / out_dim)])
                #X_cudf = cudf.DataFrame(X)
                #distances, indices = nn.kneighbors(X_cudf)
                p = nearest = indices[(y * out_dim + x + e) % len(indices)]
                if p in seen:
                    e += 1
                    #print("rebuild")
                    #remaining = [X_2d[x] for i, x in enumerate(X_2d) if not i in seen]
                    #indices = build(remaining)
                else:
                    seen[p] = True
                    done = True
            #print("nearest", nearest)
            #p = d[tuple(nearest)]
            #grid[i][j] = img_collection[p]
            pos = (x, y)
            print("xyp", x, y, p)
            img = img_collection[p]
            h_range = x * out_res
            w_range = y * out_res
            #print("range", h_range, h_range + out_res, w_range, w_range + out_res)
            out[h_range:h_range + out_res, w_range:w_range + out_res, :] = readImage(img, out_res)

    #im = image.array_to_img(out)
    im = Image.fromarray(out)
    im.save(out_dir + out_name, quality=100)
def spreadXY(X_2d, threshold, speed):
    'spreads items until distance is greater than threshold'
    import cudf
    from cuml.neighbors import NearestNeighbors

    def kernel(x, y, outx, outy, threshold2):
        for i, (x2, y2) in enumerate(zip(x, y)):
            d = math.sqrt(x2 * x2 + y2 * y2)
            if 0 < d <= threshold2:
                outx[i] = x2 / d
                outy[i] = y2 / d
            else:
                outx[i] = 0
                outy[i] = 0

    print('spreadXY')
    length = len(X_2d)
    X = cudf.DataFrame()
    X['x'] = X_2d[0:length, 0]
    X['y'] = X_2d[0:length, 1]
    k = 8
    scale = 10000
    threshold *= scale
    speed *= scale
    X = X.mul(scale)
    #X = np.copy(X_2d[:length])

    for i in range(20):
        nn = NearestNeighbors(n_neighbors=k)
        nn.fit(X)
        distances, indices = nn.kneighbors(X)
        #print(distances.shape)
        joins = []
        s = X.sum()
        print("iteration", i, "sum dist", s)
        newX = X
        for j in range(k):
            join = indices.drop([x for x in range(k) if x != j])  #.rename(mapper={j: 'x'}, columns=[j])
            join = join.merge(X, how='left', left_on=[j], right_index=True)
            join = join.drop(j)
            v = join.sub(X)
            v = v.apply_rows(kernel,
                             incols=['x', 'y'],
                             outcols=dict(outx=np.float32, outy=np.float32),
                             kwargs=dict(threshold2=threshold))
            v = v.drop(['x', 'y'])
            v = v.rename(columns={'outx': 'x', 'outy': 'y'})
            newX = newX.sub(v.mul(speed))
            #newX = newX.add(1)
            #v = v.query('x * x + y * y <= ' + str(threshold * threshold))
        #print("newX")
        #print(newX)
        X = newX
        s = X.sum()
        print("iteration", i, "sum dist", s)

    X = X.truediv(scale)
    X = np.array(X.as_matrix())
    print(X.shape)
    return X
def augmented_affinity_matrix(
    data_df,
    timepoints,
    timepoint_connections,
    n_neighbors=30,
    n_jobs=-2,
    pc_components=1000,
    device="cpu",
):
    """Function to construct an affinity matrix augmented with mutually nearest neighbors
    across timepoints

    :param data_df: Normalized data frame. Data frame should be sorted according to the timepoints
    :param timepoints: Pandas series indicating timepoints for each cell in data_df
    :param timepoint_connections: Links between timepoints
    :param n_neighbors: Number of nearest neighbors for graph construction
    :param n_jobs: Nearest Neighbors will be computed in parallel using n_jobs.
    :param pc_components: Minimum number of principal components to use. Specify `None` to use pre-computed components
    :return: Affinity matrix augmented to mutually nearest neighbors
    """

    # Timepoints and data_df should be in the same order
    timepoints = timepoints[data_df.index]
    cell_order = data_df.index

    # Time point cells and indices
    tp_cells = pd.Series()
    tp_offset = pd.Series()
    offset = 0
    for i in timepoints.unique():
        tp_offset[i] = offset
        tp_cells[i] = list(timepoints.index[timepoints == i])
        offset += len(tp_cells[i])

    # Run PCA to denoise the dropouts
    if pc_components is None:
        pca_projections = data_df
    else:
        pca_projections, _ = utils.run_pca(data_df, device, n_components=pc_components)

    # Nearest neighbor graph construction and affinity matrix
    print('Nearest neighbor computation...')

    # --------------------------------------------------------------------------
    # nbrs = NearestNeighbors(n_neighbors=n_neighbors,
    #                         metric='euclidean', n_jobs=-2)
    # nbrs.fit(pca_projections.values)
    # dists, _ = nbrs.kneighbors(pca_projections.values)
    # adj = nbrs.kneighbors_graph(pca_projections.values, mode='distance')
    # # Scaling factors for affinity matrix construction
    # ka = np.int(n_neighbors / 3)
    # scaling_factors = pd.Series(dists[:, ka], index=cell_order)
    # # Affinity matrix
    # nn_aff = _convert_to_affinity(adj, scaling_factors, True)
    # --------------------------------------------------------------------------

    if device == "cpu":
        temp = sc.AnnData(data_df.values)
        sc.pp.neighbors(temp, n_pcs=0, n_neighbors=n_neighbors)
        # maintaining backwards compatibility to Scanpy `sc.pp.neighbors`
        try:
            kNN = temp.uns['neighbors']['distances']
        except KeyError:
            kNN = temp.obsp['distances']
    elif device == "gpu":
        from cuml.neighbors import NearestNeighbors
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean")
        X_contiguous = np.ascontiguousarray(data_df.values)
        nn.fit(X_contiguous)
        kNN = nn.kneighbors_graph(X_contiguous, mode="distance")
        kNN.setdiag(0)
        kNN.eliminate_zeros()

    # Adaptive k
    adaptive_k = int(np.floor(n_neighbors / 3))
    scaling_factors = np.zeros(data_df.shape[0])
    for i in np.arange(len(scaling_factors)):
        scaling_factors[i] = np.sort(kNN.data[kNN.indptr[i]:kNN.indptr[i + 1]])[adaptive_k - 1]
    scaling_factors = pd.Series(scaling_factors, index=cell_order)

    # Affinity matrix
    nn_aff = _convert_to_affinity(kNN, scaling_factors, device, True)

    # Mutually nearest neighbor affinity matrix
    # Initialize mnn affinity matrix
    N = len(cell_order)
    full_mnn_aff = csr_matrix(([0], ([0], [0])), [N, N])
    for i in timepoint_connections.index:
        t1, t2 = timepoint_connections.loc[i, :].values
        print(f'Constructing affinities between {t1} and {t2}...')

        # MNN matrix and distance to the ka-th neighbor
        t1_cells = tp_cells[t1]
        t2_cells = tp_cells[t2]
        mnn = _construct_mnn(t1_cells, t2_cells, pca_projections, n_neighbors, device, n_jobs)

        # MNN scaling factors
        # Distance to the adaptive neighbor
        ka_dists = pd.Series(0.0, index=t1_cells + t2_cells)
        # T1 scaling factors
        ka_dists[t1_cells] = _mnn_ka_distances(mnn, n_neighbors)
        # T2 scaling factors
        ka_dists[t2_cells] = _mnn_ka_distances(mnn.T, n_neighbors)

        # Scaling factors
        mnn_scaling_factors = pd.Series(0.0, index=cell_order)
        mnn_scaling_factors[t1_cells] = _mnn_scaling_factors(
            ka_dists[t1_cells], scaling_factors, device)
        mnn_scaling_factors[t2_cells] = _mnn_scaling_factors(
            ka_dists[t2_cells], scaling_factors, device)

        # MNN affinity matrix
        full_mnn_aff = full_mnn_aff + \
            _mnn_affinity(mnn, mnn_scaling_factors, tp_offset[t1], tp_offset[t2], device)

    # Symmetrize the affinity matrix and return
    aff = nn_aff + nn_aff.T + full_mnn_aff + full_mnn_aff.T
    return aff, nn_aff + nn_aff.T
def mk_nnmdl(feats, n_nbrs=N_NBRS):
    # use the n_nbrs argument (not the module-level N_NBRS constant) and pass it by keyword
    nnmdl = NearestNeighbors(n_neighbors=n_nbrs, metric="cosine")
    nnmdl.fit(feats)
    return nnmdl
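# Hypothetical usage of mk_nnmdl: fit on an (n_items, n_dims) feature matrix, then query the
# fitted model for each row's nearest neighbours by cosine distance. Names are illustrative.
import numpy as np

feats_demo = np.random.rand(100, 64).astype(np.float32)
nn_model = mk_nnmdl(feats_demo, n_nbrs=10)
dists, idxs = nn_model.kneighbors(feats_demo)  # cosine distances and neighbour indices, shape (100, 10)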
class gpActivationFlow(ActivationFlow):

    def __init__(self, nodes_data: xa.DataArray, n_neighbors: int, **kwargs):
        ActivationFlow.__init__(self, n_neighbors, **kwargs)
        self.I: cudf.DataFrame = None
        self.D: cudf.DataFrame = None
        self.P: cudf.DataFrame = None
        self.C: cudf.DataFrame = None
        self.nodes: cudf.DataFrame = None
        self.setNodeData(nodes_data, **kwargs)

    def setNodeData(self, nodes_data: xa.DataArray, **kwargs):
        print(f"{self.__class__.__name__}[{hex(id(self))}].setNodeData: input shape = {nodes_data.shape}")
        if self.reset or (self.nodes is None):
            if (nodes_data.size > 0):
                t0 = time.time()
                self.nodes = cudf.DataFrame({
                    icol: nodes_data[:, icol]
                    for icol in range(nodes_data.shape[1])
                })
                self.nnd = NearestNeighbors(n_neighbors=self.nneighbors)
                self.nnd.fit(self.nodes)
                self.D, self.I = self.nnd.kneighbors(self.nodes, return_distance=True)
                dt = (time.time() - t0)
                print(f"Computed NN Graph with {self.nnd.n_neighbors} neighbors and "
                      f"{nodes_data.shape[0]} verts in {dt} sec ({dt/60} min)")
                print(f" ---> Indices shape = {self.I.shape}, Distances shape = {self.D.shape} ")
            else:
                print("No data available for this block")

    def getGraph(self):
        return None

    def getConnectionMatrix(self) -> csr_matrix:
        distances = cupy.ravel(cupy.fromDlpack(self.D.to_dlpack()))
        indices = cupy.ravel(cupy.fromDlpack(self.I.to_dlpack()))
        n_samples = indices.shape[0]
        n_nonzero = n_samples * self.nneighbors
        rowptr = cupy.arange(0, n_nonzero + 1, self.nneighbors)
        knn_graph = cupyx.scipy.sparse.csr_matrix((distances, indices, rowptr),
                                                  shape=(n_samples, n_samples))
        print(f"Completed KNN, sparse graph shape = {knn_graph.shape}")
        return knn_graph

    def spread(self, sample_data: np.ndarray, nIter: int = 1, **kwargs) -> Optional[bool]:
        converged = True
        spdf = shortest_path(G, source_pid)
        spdf.sort_by("vertex")
        distances = spdf["distance"]
        self.reset = False
        return converged
import cudf, cuml, cupy, cupyx
from cuml.neighbors import NearestNeighbors

# Using a cudf DataFrame here is not likely to help with performance.
# However, it's a good opportunity to get familiar with the API.
source_df: cudf.DataFrame = cudf.read_csv(
    '/att/nobackup/tpmaxwel/data/fashion-mnist-csv/fashion_train.csv')
data = source_df.loc[:, source_df.columns[:-1]]
target = source_df[source_df.columns[-1]]
n_neighbors = 5

# fit model
model = NearestNeighbors(n_neighbors=5)
model.fit(data)

# get nearest neighbors
dist_mlarr, ind_mlarr = model.kneighbors(data, return_distance=True)

# create sparse matrix
distances = cupy.ravel(cupy.fromDlpack(dist_mlarr.to_dlpack()))
indices = cupy.ravel(cupy.fromDlpack(ind_mlarr.to_dlpack()))
print(f"Computed KNN graph, distances shape = {distances.shape}, indices shape = {indices.shape}, "
      f"distances[0:5]= {distances[0:5]}, indices[0:5]= {indices[0:5]}")

n_samples = indices.shape[0]
n_nonzero = n_samples * n_neighbors
rowptr = cupy.arange(0, n_nonzero + 1, n_neighbors)
knn_graph = cupyx.scipy.sparse.csr_matrix((distances, indices, rowptr),
                                          shape=(n_samples, n_samples))
print(f"Completed KNN, graph shape = {knn_graph.shape}")
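# The manual CSR assembly above can also be obtained in one call; cuML's NearestNeighbors
# exposes kneighbors_graph() (used elsewhere in this collection), which returns the kNN
# graph as a sparse matrix directly. A minimal sketch reusing the fitted `model` and `data`:
knn_graph_direct = model.kneighbors_graph(data, mode='distance')
print(f"kneighbors_graph shape = {knn_graph_direct.shape}")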
imagefeat.append(feat)


# In[11]:

from sklearn.preprocessing import normalize

# L2-normalize so that the similarities fall in the 0-1 range
imagefeat = np.vstack(imagefeat)
imagefeat = normalize(imagefeat)


# In[12]:

KNN = 50
if len(test) == 3:
    KNN = 2
model = NearestNeighbors(n_neighbors=KNN)
model.fit(imagefeat)


# In[13]:

preds = []
CHUNK = 1024 * 4

imagefeat = cupy.array(imagefeat)

print('Finding similar images...')
CTS = len(imagefeat) // CHUNK
if len(imagefeat) % CHUNK != 0:
    CTS += 1
for j in range(CTS):
    a = j * CHUNK
def diffusion(
    adata: AnnData,
    n_components=10,
    knn=30,
    alpha=0,
    multiscale: bool = True,
    n_eigs: int = None,
    device="cpu",
    n_pcs=50,
    copy=False,
):
    """\
    Wrapper to generate diffusion maps using Palantir.

    Parameters
    ----------
    adata
        Annotated data matrix.
    use_highly_variable
        Use only variable genes for calculating PC components.
    n_components
        Number of diffusion components.
    knn
        Number of nearest neighbors for graph construction.
    alpha
        Normalization parameter for the diffusion operator.
    multiscale
        Whether to get multiscale diffusion space (calls palantir.utils.determine_multiscale_space).
    n_eigs
        If multiscale is True, how many components to retain.
    device
        Run method on either `cpu` or on `gpu`.
    do_PCA
        Whether to perform PCA or not.
    n_pcs
        Number of PC components.
    seed
        Get reproducible results for the GPU implementation.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    adata : anndata.AnnData
        if `copy=True` it returns AnnData, else it updates these fields of `adata`:

        `.obsm['X_diffusion']`
            if `multiscale = False`, diffusion space.
        `.obsm['X_diffusion_multiscale']`
            if `multiscale = True`, multiscale diffusion space.
        `.uns['diffusion']`
            dict containing results from Palantir.
    """
    logg.info("Running Diffusion maps ", reset=True)

    data_df = pd.DataFrame(adata.obsm["X_pca"], index=adata.obs_names)

    if device == "cpu":
        from palantir.utils import run_diffusion_maps
        res = run_diffusion_maps(data_df, n_components=n_components, knn=knn, alpha=alpha)
    # code converted to GPU, not reproducible!
    elif device == "gpu":
        logg.warn(
            "GPU implementation uses eigsh from cupy.sparse, which is not currently "
            "reproducible and can give unstable results!"
        )
        import cupy as cp
        from cupyx.scipy.sparse import csr_matrix as csr_matrix_gpu
        from cupyx.scipy.sparse.linalg import eigsh

        # Determine the kernel
        N = data_df.shape[0]
        if not issparse(data_df):
            from cuml.neighbors import NearestNeighbors

            nn = NearestNeighbors(n_neighbors=knn, metric="euclidean")
            X_contiguous = np.ascontiguousarray(data_df.values)
            nn.fit(X_contiguous)
            kNN = nn.kneighbors_graph(X_contiguous, mode="distance")
            kNN.setdiag(0)
            kNN.eliminate_zeros()

            # Adaptive k
            adaptive_k = int(np.floor(knn / 3))
            adaptive_std = np.zeros(N)
            for i in np.arange(len(adaptive_std)):
                adaptive_std[i] = np.sort(kNN.data[kNN.indptr[i]:kNN.indptr[i + 1]])[adaptive_k - 1]

            # Kernel
            x, y, dists = find(kNN)

            # X, y specific stds
            dists = dists / adaptive_std[x]
            W = csr_matrix((np.exp(-dists), (x, y)), shape=[N, N])

            # Diffusion components
            kernel = W + W.T
        else:
            kernel = data_df

        # Markov
        D = np.ravel(kernel.sum(axis=1))
        if alpha > 0:
            # L_alpha
            D[D != 0] = D[D != 0] ** (-alpha)
            mat = csr_matrix((D, (range(N), range(N))), shape=[N, N])
            kernel = mat.dot(kernel).dot(mat)
            D = np.ravel(kernel.sum(axis=1))
        D[D != 0] = 1 / D[D != 0]

        kernel = csr_matrix_gpu(kernel)
        D = csr_matrix_gpu((cp.array(D), (cp.arange(N), cp.arange(N))), shape=(N, N))
        T = D.dot(kernel)

        # Eigenvalue decomposition
        D, V = eigsh(T, n_components, tol=1e-4, maxiter=1000)
        D, V = D.get(), V.get()

        inds = np.argsort(D)[::-1]
        D = D[inds]
        V = V[:, inds]

        # Normalize
        for i in range(V.shape[1]):
            V[:, i] = V[:, i] / np.linalg.norm(V[:, i])

        # Create the results dictionary
        res = {"T": T.get(), "EigenVectors": V, "EigenValues": D}
        res["EigenVectors"] = pd.DataFrame(res["EigenVectors"])
        if not issparse(data_df):
            res["EigenVectors"].index = data_df.index
        res["EigenValues"] = pd.Series(res["EigenValues"])
        res["kernel"] = kernel.get()

    if multiscale:
        logg.info("    determining multiscale diffusion space")
        from palantir.utils import determine_multiscale_space
        adata.obsm["X_diffusion_multiscale"] = determine_multiscale_space(
            res, n_eigs=n_eigs).values
        logstr = "    .obsm['X_diffusion_multiscale'], multiscale diffusion space.\n"
    else:
        adata.obsm["X_diffusion"] = res["EigenVectors"].iloc[:, 1:].values
        logstr = "    .obsm['X_diffusion'], diffusion space.\n"

    adata.uns["diffusion"] = res

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint("added \n" + logstr +
              "    .uns['diffusion'] dict containing diffusion maps results.")
def fast_knn(
    X,
    *,
    n_clusters: int = 5,
    n_neighbors: Optional[int] = None,
    graph_mode='distance',
    cluster_mode='spectral',
    algorithm='brute',
    n_jobs: Optional[int] = None,
    random_state: int = 1,
    framework: Literal['auto', 'cuml', 'sklearn'] = 'auto',
) -> NearestNeighbors:
    """
    Parameters
    ----------
    X : `ndarray` or tuple of (X, y)
    n_neighbors: int (default = 5)
        The top K closest datapoints you want the algorithm to return.
        Currently, this value must be < 1024.
    graph_mode : {'distance', 'connectivity'}, default='distance'
        This mode decides which values `kneighbors_graph` will return:
        - 'connectivity' : will return the connectivity matrix with ones and
          zeros (for 'SpectralClustering').
        - 'distance' : will return the distances between neighbors according
          to the given metric (for 'DBSCAN').
    cluster_mode : {'dbscan', 'spectral', 'isomap', 'kmeans'}, default='spectral'
        This mode decides how to generate cluster predictions from the
        neighbors graph:
        - 'dbscan' :
        - 'spectral' :
        - 'isomap' :
        - 'kmeans' :
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:
        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm based
          on the values passed to :meth:`fit` method.
        Note: fitting on sparse input will override the setting of this
        parameter, using brute force.
    """
    kwargs = dict(locals())
    X = kwargs.pop('X')
    framework = kwargs.pop('framework')
    random_state = kwargs.pop('random_state')
    n_clusters = int(kwargs.pop('n_clusters'))
    if n_neighbors is None:
        kwargs['n_neighbors'] = n_clusters
        n_neighbors = n_clusters
    ## graph mode
    graph_mode = str(kwargs.pop('graph_mode')).strip().lower()
    assert graph_mode in ('distance', 'connectivity')
    ## cluster mode
    cluster_mode = str(kwargs.pop('cluster_mode')).strip().lower()
    ## fine-tuning the kwargs
    use_cuml = _check_cuml(framework)
    if use_cuml:
        from cuml.neighbors import NearestNeighbors as KNN
        kwargs.pop('n_jobs')
        kwargs.pop('algorithm')
    else:
        KNN = NearestNeighbors
    ## fitting
    knn = KNN(**kwargs)
    knn.fit(X)
    knn._fitid = id(X)
    ## Transform mode
    knn._random_state = random_state
    knn._n_clusters = n_clusters
    knn._graph_mode = graph_mode
    knn._cluster_mode = cluster_mode
    if use_cuml:
        knn.n_samples_fit_ = X.shape[0]
        knn.kneighbors_graph = types.MethodType(nn_kneighbors_graph, knn)
    knn.transform = types.MethodType(nn_transform, knn)
    knn.fit_transform = types.MethodType(nn_fit_transform, knn)
    knn.predict = types.MethodType(nn_predict, knn)
    return knn
def KNN_predict(df, embeddings, KNN=50, thresh=None, thresh_range=None):
    '''
    thresh_range: np.arange of candidate thresholds for threshold selection
                  image: list(np.arange(2, 10, 0.5))
                  text : list(np.arange(0.1, 1, 0.1))
    thresh: fixed distance threshold for result matching
            image: 2.7, tfidf: 0.6
    '''
    assert (thresh is None) or (thresh_range is None), \
        "Provide either `thresh` or `thresh_range`, not both"
    if thresh_range is not None:
        assert 'matches' in df.columns, "Cannot perform threshold selection on testing data"

    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Iterate through different thresholds to maximize CV; run this in interactive mode,
    # then replace the sweep with a solid threshold
    thresholds_scores = None
    if thresh is None:
        thresholds = thresh_range
        scores = []
        recalls = []
        precisions = []
        for threshold in thresholds:
            predictions = []
            for k in range(embeddings.shape[0]):
                idx = np.where(distances[k, ] < threshold)[0]
                ids = indices[k, idx]
                posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
                predictions.append(posting_ids)
            df['pred_matches'] = predictions
            f1, precision, recall = f1_score(df['matches'], df['pred_matches'])
            print(f'Threshold {threshold:.2f}: F1 {f1.mean():.4f} '
                  f'Precision {precision.mean():.4f} Recall {recall.mean():.4f}')
            scores.append(f1.mean())
            recalls.append(recall.mean())
            precisions.append(precision.mean())
        thresholds_scores = pd.DataFrame({
            'thresholds': thresholds,
            'scores': scores,
            'recalls': recalls,
            'precisions': precisions
        })
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')
        thresh = best_threshold

    # Because we are predicting a test set with ~70K images and different label groups,
    # confidence should be smaller
    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k, ] < thresh)[0]
        ids = indices[k, idx]
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)

    del model, distances, indices
    gc.collect()
    return df, predictions, thresholds_scores
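# A minimal sketch of calling KNN_predict with a fixed threshold, assuming the module-level
# imports it relies on (NearestNeighbors, np, pd, tqdm, gc) are present; the threshold-sweep
# branch additionally needs a 'matches' column and the external f1_score helper. The data
# below is synthetic and the threshold value is illustrative only.
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize

demo_df = pd.DataFrame({'posting_id': [f'p{i}' for i in range(20)]})
demo_emb = normalize(np.random.rand(20, 16).astype(np.float32))
demo_df, demo_preds, _ = KNN_predict(demo_df, demo_emb, KNN=5, thresh=0.6)
print(demo_preds[:3])  # arrays of posting_ids within euclidean distance 0.6 of each item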