def find_matching_image_with_rapids():
    """Embed the training images and plot neighbour distances for a few rows.

    Image embeddings are produced by an ImageNet-pretrained EfficientNetB0
    (average pooled, no classification head), a nearest-neighbour index is
    fitted on them, and for train rows 180..189 the sorted neighbour distances
    are plotted and the 8 closest rows are shown via ``displayDF``.

    Reads the module-level ``train`` frame; returns nothing.
    """
    backbone = EfficientNetB0(weights='imagenet', include_top=False,
                              pooling='avg', input_shape=None)
    image_embeddings = backbone.predict(DataGenerator(train, batch_size=128),
                                        verbose=1)
    print('image embeddings shape is', image_embeddings.shape)

    # After fitting KNN, display some example rows of train and their
    # 8 closest other images in train (based on EffNetB0 image embeddings).
    n_neighbors = 50
    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(image_embeddings)
    distances, indices = knn.kneighbors(image_embeddings)

    rank_axis = np.arange(n_neighbors)
    for row in range(180, 190):
        plt.figure(figsize=(20, 3))
        plt.plot(rank_axis, cupy.asnumpy(distances[row, ]), 'o-')
        plt.title('Image Distance From Train Row %i to Other Train Rows' % row,
                  size=16)
        plt.ylabel('Distance to Train Row %i' % row, size=14)
        plt.xlabel('Index Sorted by Distance to Train Row %i' % row, size=14)
        plt.show()

        closest_rows = cupy.asnumpy(indices[row, :8])
        displayDF(train.loc[closest_rows], random=False, ROWS=2, COLS=4)
def get_image_predictions(df, embeddings, threshold=0.0):
    """Return, for every row, the posting ids of its cosine-near neighbours.

    For each row the neighbours closer than ``threshold`` are collected.
    * If at least two are found, a stricter cut (``threshold - 0.08888``) is
      tried and used when it still yields at least two matches; otherwise the
      looser set is kept.
    * If fewer than two are found, the fixed cut ``0.51313`` is applied and at
      most the first two matches are kept.

    ``df`` must provide a ``posting_id`` column aligned with ``embeddings``.
    Returns a list with one array of posting ids per row.
    """
    n_neighbors = 50 if len(df) > 3 else 3
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    knn.fit(embeddings)
    distances, indices = knn.kneighbors(embeddings)

    predictions = []
    for row in tqdm(range(embeddings.shape[0])):
        loose_hits = np.where(distances[row, ] < threshold)[0]
        loose_ids = df['posting_id'].iloc[indices[row, loose_hits]].values

        if len(loose_ids) < 2:
            # Nothing but (at most) the row itself: fall back to a fixed cut
            # and keep no more than two candidates.
            fallback_hits = np.where(distances[row, ] < 0.51313)[0]
            fallback_ids = df['posting_id'].iloc[indices[row, fallback_hits]].values
            predictions.append(fallback_ids[:2])
            continue

        strict_hits = np.where(distances[row, ] < threshold - 0.08888)[0]
        strict_ids = df['posting_id'].iloc[indices[row, strict_hits]].values
        predictions.append(strict_ids if len(strict_ids) >= 2 else loose_ids)

    del knn, distances, indices
    gc.collect()
    return predictions
def find_similar_image():
    """Attach nearest-neighbour image matches to the global ``test`` frame.

    A nearest-neighbour index is fitted on the module-level
    ``image_embeddings`` and queried in chunks of 4096 rows.  For every row,
    the ``posting_id`` values of neighbours closer than 6.0 are collected and
    the resulting list is stored in ``test['preds2']``.

    Side effects: deletes the module-level ``image_embeddings`` and ``embeds``
    to release memory, and adds the ``preds2`` column to ``test``.
    """
    # `del` is a binding operation: without this declaration the names below
    # would be compiled as locals and the first read of `image_embeddings`
    # would raise UnboundLocalError.
    global image_embeddings, embeds

    n_neighbors = 2 if len(test) == 3 else 50
    model = NearestNeighbors(n_neighbors=n_neighbors)
    model.fit(image_embeddings)

    preds = []
    chunk_size = 1024 * 4
    total_rows = len(image_embeddings)
    # Pre-bind so the `del` below is valid even if no chunk is processed.
    distances = indices = None

    print('Finding similar images...')
    for start in range(0, total_rows, chunk_size):
        stop = min(start + chunk_size, total_rows)
        print('chunk', start, 'to', stop)
        distances, indices = model.kneighbors(image_embeddings[start:stop, ])
        for k in range(stop - start):
            close = np.where(distances[k, ] < 6.0)[0]
            neighbor_rows = indices[k, close]
            preds.append(test.iloc[neighbor_rows].posting_id.values)

    del model, distances, indices, image_embeddings, embeds
    _ = gc.collect()
    test['preds2'] = preds
def compute_neighbors_rapids(X: np.ndarray, n_neighbors: int, metric: _Metric = 'euclidean'):
    """Find each sample's nearest neighbors with RAPIDS cuml.

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        Samples to index and query; converted to a C-contiguous float32
        array before being handed to cuml.
    n_neighbors
        How many neighbors to retrieve per sample.
    metric
        Distance metric name, forwarded unchanged to cuml's
        ``NearestNeighbors``; it must be one that cuml recognises.

    Returns
    -------
    **knn_indices**, **knn_dists** : arrays of shape (n_observations, n_neighbors)
    """
    from cuml.neighbors import NearestNeighbors

    data = np.ascontiguousarray(X, dtype=np.float32)
    searcher = NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
    searcher.fit(data)
    dists, neighbor_idx = searcher.kneighbors(data)
    # Callers expect (indices, distances); cuml returns them the other way round.
    return neighbor_idx, dists
def get_image_neighbors(df, embeddings, threshold=args.threshold):
    """Return, per row, the posting ids of neighbours closer than ``threshold``.

    The neighbour count comes from ``args.n_neighbors_max`` when ``df`` has
    more than three rows and from ``args.n_neighbors_min`` otherwise.  ``df``
    must provide a ``posting_id`` column aligned with ``embeddings``.
    """
    if len(df) > 3:
        neighbor_count = args.n_neighbors_max
    else:
        neighbor_count = args.n_neighbors_min

    knn = NearestNeighbors(n_neighbors=neighbor_count)
    knn.fit(embeddings)
    distances, indices = knn.kneighbors(embeddings)

    predictions = []
    for row in range(embeddings.shape[0]):
        within = np.where(distances[row] < threshold)[0]
        predictions.append(df['posting_id'].iloc[indices[row, within]].values)

    del knn, distances, indices
    gc.collect()
    return predictions
def _find_distance_threshold(
    self,
    features,
    posting_ids: np.ndarray,
    thresholds: List[float],
) -> Tuple[float, float, List[List[str]]]:
    """Pick the distance threshold that maximises mean F1 on ``self.valid_df``.

    The features are normalised, a full nearest-neighbour search
    (``n_neighbors == len(self.valid_df)``) is run, and the resulting distance
    and index arrays are saved under the configured output directory.  Every
    candidate threshold is then scored; rows with fewer than
    ``min_indices`` matches fall back to their first ``min_indices``
    neighbours.

    Side effect: ``self.valid_df`` gets ``score``, ``precision`` and
    ``recall`` columns overwritten on every threshold (the values of the last
    threshold evaluated remain).

    Returns ``(best_score, best_threshold, best_y_pred)``; if no threshold
    scores above 0 the result is ``(0, -1, [])``.
    """
    features = F.normalize(torch.from_numpy(features)).numpy()

    with TimeUtil.timer("nearest neighbor search"):
        knn = NearestNeighbors(n_neighbors=len(self.valid_df), n_jobs=32)
        knn.fit(features)
        distances, indices = knn.kneighbors(features)

    FileUtil.save_npy(
        distances,
        self.config.dir_config.output_dir
        / f"distances_{self.fold}_{self.current_epoch:02d}.npy",
    )
    FileUtil.save_npy(
        indices,
        self.config.dir_config.output_dir
        / f"indices_{self.fold}_{self.current_epoch:02d}.npy",
    )

    best_score, best_threshold = 0, -1
    best_y_pred: List[List[str]] = []

    for threshold in thresholds:
        y_pred = []
        for row in range(len(distances)):
            chosen = np.where(distances[row] < threshold)[0]
            if len(chosen) < self.config.inference_config.min_indices:
                # Too few matches: take the leading neighbours instead.
                chosen = list(range(self.config.inference_config.min_indices))
            y_pred.append(posting_ids[indices[row, chosen]])

        f1_per_row = MetricUtil.f1_scores(self.valid_df["target"].tolist(), y_pred)
        precision_per_row, recall_per_row = MetricUtil.precision_recall(
            self.valid_df["target"].tolist(), y_pred
        )
        self.valid_df["score"] = f1_per_row
        self.valid_df["precision"] = precision_per_row
        self.valid_df["recall"] = recall_per_row

        mean_f1 = self.valid_df["score"].mean()
        mean_precision = self.valid_df["precision"].mean()
        mean_recall = self.valid_df["recall"].mean()
        print(
            f"----------- valid f1: {mean_f1} precision: {mean_precision} recall: {mean_recall} threshold: {threshold} ------------"
        )

        if mean_f1 > best_score:
            best_score, best_threshold, best_y_pred = mean_f1, threshold, y_pred

    return best_score, best_threshold, best_y_pred
def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors, precomputed_nearest_neighbors):
    """Compare the cuml fuzzy simplicial set against the reference one.

    Blobs are generated with 30 centres and a fixed seed; the graph is built
    either from scratch or from precomputed KNN results, and the dense forms
    of both graphs must agree according to ``correctness_sparse``.
    """
    seed = 42
    distance_metric = 'euclidean'
    X, _ = make_blobs(n_samples=n_rows, centers=30,
                      n_features=n_features, random_state=seed)

    if not precomputed_nearest_neighbors:
        cu_fss_graph = cu_fuzzy_simplicial_set(X, n_neighbors, seed,
                                               distance_metric)
        # The reference implementation is fed the result of `.get()`.
        X = X.get()
        ref_result = ref_fuzzy_simplicial_set(X, n_neighbors, seed,
                                              distance_metric)
        ref_fss_graph = ref_result[0].tocoo()
    else:
        searcher = NearestNeighbors(n_neighbors=n_neighbors,
                                    metric=distance_metric)
        searcher.fit(X)
        knn_dists, knn_indices = searcher.kneighbors(X, n_neighbors,
                                                     return_distance=True)
        cu_fss_graph = cu_fuzzy_simplicial_set(X, n_neighbors, seed,
                                               distance_metric,
                                               knn_indices=knn_indices,
                                               knn_dists=knn_dists)
        knn_indices = knn_indices.get()
        knn_dists = knn_dists.get()
        ref_result = ref_fuzzy_simplicial_set(X, n_neighbors, seed,
                                              distance_metric,
                                              knn_indices=knn_indices,
                                              knn_dists=knn_dists)
        ref_fss_graph = ref_result[0].tocoo()

    cu_dense = cu_fss_graph.todense()
    ref_dense = cp.sparse.coo_matrix(ref_fss_graph).todense()
    assert correctness_sparse(ref_dense, cu_dense,
                              atol=0.1, rtol=0.2, threshold=0.95)
def get_image_neighbors(df, embeddings, KNN=50, threshold=4.5):
    """Return ``df`` and, per row, the posting ids of its close neighbours.

    Parameters
    ----------
    df
        Frame with a ``posting_id`` column aligned with ``embeddings``.
    embeddings
        2-D array of embeddings, one row per entry of ``df``.
    KNN
        Maximum number of neighbours to retrieve per row.  It is clamped to
        the number of rows so that inputs smaller than ``KNN`` do not make
        the neighbour search fail.
    threshold
        Neighbours with a distance strictly below this value count as
        matches.  Defaults to 4.5, the value that used to be hard-coded.

    Returns
    -------
    (df, predictions)
        ``df`` unchanged, and a list holding one array of posting ids per row.
    """
    n_rows = embeddings.shape[0]
    # Asking for more neighbours than fitted samples is an error in the
    # NearestNeighbors implementations this code is used with.
    n_neighbors = min(KNN, n_rows)

    model = NearestNeighbors(n_neighbors=n_neighbors)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(n_rows)):
        close = np.where(distances[k, ] < threshold)[0]
        predictions.append(df['posting_id'].iloc[indices[k, close]].values)

    del model, distances, indices
    gc.collect()
    return df, predictions
def compute_neighbors_rapids(X: np.ndarray, n_neighbors: int):
    """Find each sample's nearest neighbors with RAPIDS cuml.

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        Samples to index and query; converted to a C-contiguous float32
        array before being handed to cuml.
    n_neighbors
        How many neighbors to retrieve per sample.

    Returns
    -------
    **knn_indices**, **knn_dists** : np.arrays of shape (n_observations, n_neighbors)
    """
    from cuml.neighbors import NearestNeighbors

    data = np.ascontiguousarray(X, dtype=np.float32)
    searcher = NearestNeighbors(n_neighbors=n_neighbors)
    searcher.fit(data)
    squared_dists, neighbor_idx = searcher.kneighbors(data)
    # The values returned by cuml are treated as squared Euclidean distances
    # (per the original author's note), hence the square root.
    # NOTE(review): confirm this still holds for the installed cuml version.
    dists = np.sqrt(squared_dists)
    return neighbor_idx, dists
def get_image_neighbors(df, embeddings, KNN=50):
    """Return, per row, the posting ids of images close to that row.

    Matching uses ``CFG.img_thres`` as the distance cut.  When a row matches
    exactly one index under that cut, the cut is relaxed by ``CFG.addition``
    and the row is matched once more.  ``df`` must provide a ``posting_id``
    column aligned with ``embeddings``.
    """
    # Build the KNN model and fit it on the embedding features.
    knn = NearestNeighbors(n_neighbors=KNN)
    knn.fit(embeddings)
    # Distances between images act as the (inverse) similarity measure.
    distances, indices = knn.kneighbors(embeddings)

    predictions = []
    # Compare every image against its retrieved neighbours.
    for row in tqdm(range(embeddings.shape[0])):
        # The threshold controls how strict the matching is.
        hits = np.where(distances[row, ] < CFG.img_thres)[0]
        if len(hits) == 1:
            # No other image was matched: relax the threshold and retry once.
            relaxed = CFG.img_thres + CFG.addition
            hits = np.where(distances[row, ] < relaxed)[0]

        # Posting ids of the matched images.
        matched_rows = indices[row, hits]
        predictions.append(df['posting_id'].iloc[matched_rows].values)

    del knn, distances, indices
    gc.collect()
    return predictions
def compute_neighbors_sklearn(X: np.ndarray, n_neighbors: int):
    """Find each sample's nearest neighbors with scikit-learn.

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        Samples to index and query; converted to a C-contiguous float32
        array before fitting.
    n_neighbors
        How many neighbors to retrieve per sample.

    Returns
    -------
    **knn_indices**, **knn_dists** : np.arrays of shape (n_observations, n_neighbors)

    Notes
    -----
    The elapsed wall-clock time of the search is printed to stdout.
    """
    import time
    from sklearn.neighbors import NearestNeighbors

    started = time.time()
    searcher = NearestNeighbors(n_neighbors=n_neighbors)
    data = np.ascontiguousarray(X, dtype=np.float32)
    searcher.fit(data)
    dists, neighbor_idx = searcher.kneighbors(data)
    # NOTE(review): looks like leftover diagnostic output — confirm whether
    # callers rely on it before removing.
    print("Here", time.time() - started)
    return neighbor_idx, dists
def spreadXY(X_2d, threshold, speed):
    """Spread items until distance is greater than threshold.

    Runs 20 fixed iterations.  In each one, the 8 nearest neighbours of every
    point are looked up with cuml and, for each neighbour slot, the point is
    moved by ``speed`` along the unit vector pointing away from that neighbour
    whenever the neighbour lies within ``threshold``.

    Parameters
    ----------
    X_2d
        Indexable as ``X_2d[a:b, col]``; columns 0 and 1 are read as the x
        and y coordinates.
    threshold
        Distance under which a neighbour pushes a point away.
    speed
        Step length applied per neighbour and iteration.

    Returns
    -------
    The moved coordinates wrapped in ``np.array``.

    Notes
    -----
    Coordinates, ``threshold`` and ``speed`` are all multiplied by 10000 for
    the computation and the coordinates are divided back before returning.
    Progress is printed to stdout.
    """
    import cudf
    from cuml.neighbors import NearestNeighbors

    def kernel(x, y, outx, outy, threshold2):
        # Row kernel for `apply_rows`: turn each (x, y) offset into a unit
        # vector when its length is in (0, threshold2], otherwise into zero.
        for i, (x2, y2) in enumerate(zip(x, y)):
            d = math.sqrt(x2 * x2 + y2 * y2)
            if 0 < d <= threshold2:
                outx[i] = x2 / d
                outy[i] = y2 / d
            else:
                outx[i] = 0
                outy[i] = 0

    print('spreadXY')
    length = len(X_2d)
    X = cudf.DataFrame()
    X['x'] = X_2d[0:length, 0]
    X['y'] = X_2d[0:length, 1]
    k = 8  # number of neighbours queried per point
    scale = 10000
    # Work in scaled units; the result is scaled back before returning.
    threshold *= scale
    speed *= scale
    X = X.mul(scale)
    #X = np.copy(X_2d[:length])
    for i in range(20):
        nn = NearestNeighbors(n_neighbors=k)
        nn.fit(X)
        distances, indices = nn.kneighbors(X)
        #print(distances.shape)
        joins = []  # NOTE(review): never used afterwards
        s = X.sum()
        print("iteration", i, "sum dist", s)
        newX = X
        for j in range(k):
            # Keep only the j-th neighbour-index column ...
            join = indices.drop([x for x in range(k) if x != j ])
            #.rename(mapper={j: 'x'}, columns=[j])
            # ... and look up that neighbour's coordinates in X.
            join = join.merge(X, how='left', left_on=[j], right_index=True)
            join = join.drop(j)
            # Offset from each point to its j-th neighbour.
            # NOTE(review): assumes the merge keeps rows aligned with X — confirm.
            v = join.sub(X)
            v = v.apply_rows(kernel, incols=['x', 'y'], outcols=dict(outx=np.float32, outy=np.float32), kwargs=dict(threshold2=threshold))
            v = v.drop(['x', 'y'])
            v = v.rename(columns={'outx': 'x', 'outy': 'y'})
            # Step away from the neighbour (zero step if it was out of range).
            newX = newX.sub(v.mul(speed))
            #newX = newX.add(1)
            #v = v.query('x * x + y * y <= ' + str(threshold * threshold))
            #print("newX")
            #print(newX)
        X = newX
    s = X.sum()
    print("iteration", i, "sum dist", s)
    X = X.truediv(scale)
    X = np.array(X.as_matrix())
    print(X.shape)
    return X
from cuml.neighbors import NearestNeighbors

# Using cudf Dataframe here is not likely to help with performance
# However, it's a good opportunity to get familiar with the API
source_df: cudf.DataFrame = cudf.read_csv(
    '/att/nobackup/tpmaxwel/data/fashion-mnist-csv/fashion_train.csv')
# Every column but the last is a feature; the last column is the target.
data = source_df.loc[:, source_df.columns[:-1]]
target = source_df[source_df.columns[-1]]
n_neighbors = 5

# fit model
model = NearestNeighbors(n_neighbors=n_neighbors)
model.fit(data)

# get nearest neighbors
dist_mlarr, ind_mlarr = model.kneighbors(data, return_distance=True)

# create sparse matrix: flatten the per-sample neighbour lists
distances = cupy.ravel(cupy.fromDlpack(dist_mlarr.to_dlpack()))
indices = cupy.ravel(cupy.fromDlpack(ind_mlarr.to_dlpack()))
print(
    f"Computed KNN graph, distances shape = {distances.shape}, indices shape = {indices.shape}, distances[0:5]= {distances[0:5]}, indices[0:5]= {indices[0:5]}"
)

# `indices` is flattened, so its length is the number of stored entries
# (n_samples * n_neighbors), not the number of samples.
n_nonzero = indices.shape[0]
n_samples = n_nonzero // n_neighbors
# Each CSR row holds exactly `n_neighbors` entries.
rowptr = cupy.arange(0, n_nonzero + 1, n_neighbors)
knn_graph = cupyx.scipy.sparse.csr_matrix((distances, indices, rowptr),
                                          shape=(n_samples, n_samples))
print(f"Completed KNN, graph shape = {knn_graph.shape}")
class gpActivationFlow(ActivationFlow):
    """Activation flow backed by a cuml nearest-neighbour graph.

    On construction (and whenever ``setNodeData`` is called while a reset is
    pending or no nodes are loaded) the node data is copied into a cudf frame
    and its k-nearest-neighbour distances/indices are computed and cached.
    """

    def __init__(self, nodes_data: xa.DataArray, n_neighbors: int, **kwargs):
        """Initialise the base class, clear cached state and load ``nodes_data``."""
        ActivationFlow.__init__(self, n_neighbors, **kwargs)
        self.I: cudf.DataFrame = None  # neighbour indices from kneighbors
        self.D: cudf.DataFrame = None  # neighbour distances from kneighbors
        self.P: cudf.DataFrame = None
        self.C: cudf.DataFrame = None
        self.nodes: cudf.DataFrame = None  # node data, one column per feature
        self.setNodeData(nodes_data, **kwargs)

    def setNodeData(self, nodes_data: xa.DataArray, **kwargs):
        """(Re)build the nearest-neighbour graph from ``nodes_data``.

        Nothing is computed unless ``self.reset`` is truthy or no nodes have
        been loaded yet.  Empty input only prints a notice.
        """
        print(
            f"{self.__class__.__name__}[{hex(id(self))}].setNodeData: input shape = {nodes_data.shape}"
        )
        if self.reset or (self.nodes is None):
            if nodes_data.size > 0:
                t0 = time.time()
                # One cudf column per input feature column.
                self.nodes = cudf.DataFrame({
                    icol: nodes_data[:, icol]
                    for icol in range(nodes_data.shape[1])
                })
                self.nnd = NearestNeighbors(n_neighbors=self.nneighbors)
                self.nnd.fit(self.nodes)
                self.D, self.I = self.nnd.kneighbors(self.nodes,
                                                     return_distance=True)
                dt = (time.time() - t0)
                print(
                    f"Computed NN Graph with {self.nnd.n_neighbors} neighbors and {nodes_data.shape[0]} verts in {dt} sec ({dt/60} min)"
                )
                print(
                    f"  ---> Indices shape = {self.I.shape}, Distances shape = {self.D.shape} "
                )
            else:
                print("No data available for this block")

    def getGraph(self):
        """No graph object is provided by this implementation."""
        return None

    def getConnectionMatrix(self) -> csr_matrix:
        """Return the cached KNN graph as a square sparse CSR matrix.

        Row ``i`` holds the distances from sample ``i`` to its
        ``self.nneighbors`` neighbours, stored at the neighbours' column
        indices.
        """
        distances = cupy.ravel(cupy.fromDlpack(self.D.to_dlpack()))
        indices = cupy.ravel(cupy.fromDlpack(self.I.to_dlpack()))
        # `indices` is flattened, so its length is the number of stored
        # entries (n_samples * nneighbors), not the number of samples.
        n_nonzero = indices.shape[0]
        n_samples = n_nonzero // self.nneighbors
        # Each CSR row holds exactly `nneighbors` entries.
        rowptr = cupy.arange(0, n_nonzero + 1, self.nneighbors)
        knn_graph = cupyx.scipy.sparse.csr_matrix((distances, indices, rowptr),
                                                  shape=(n_samples, n_samples))
        print(f"Completed KNN, sparse graph shape = {knn_graph.shape}")
        return knn_graph

    def spread(self, sample_data: np.ndarray, nIter: int = 1, **kwargs) -> Optional[bool]:
        """Run one spread step; clears ``self.reset`` and returns ``True``.

        NOTE(review): ``G`` and ``source_pid`` are not defined anywhere in
        this class, and ``sample_data``/``nIter`` are unused — this method
        looks unfinished and needs its inputs wired up.
        """
        converged = True
        spdf = shortest_path(G, source_pid)
        spdf.sort_by("vertex")
        distances = spdf["distance"]
        self.reset = False
        return converged
def KNN_predict(df, embeddings, KNN=50, thresh=None, thresh_range=None):
    '''
    Match every row of `df` to its nearest neighbours in `embeddings`.

    Exactly one of `thresh` / `thresh_range` must be given:

    thresh: fixed distance threshold for result matching
        (suggested by the original author - image: 2.7, tfidf: 0.6)
    thresh_range: candidate thresholds (e.g. from np.arange) for threshold
        selection; the one with the best mean F1 against `df['matches']` is
        then used for the final predictions
        (image: list(np.arange(2,10,0.5)), text: list(np.arange(0.1, 1, 0.1)))

    Threshold selection overwrites `df['pred_matches']` as a side effect.

    Returns (df, predictions, thresholds_scores): `predictions` holds one
    array of posting ids per row; `thresholds_scores` is a DataFrame of the
    per-threshold metrics, or None when a fixed `thresh` was given.
    '''
    # Exactly one must be set: with neither there is nothing to search over,
    # with both it is ambiguous which one wins.
    assert (thresh is None) != (thresh_range is None), "Must provide either `thresh` or `thresh_range`"
    if thresh_range is not None:
        assert 'matches' in df.columns, "Cannot perform threshold selection on testing data"

    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Iterate through different thresholds to maximize cv; run this in
    # interactive mode, then pass the chosen value as a fixed `thresh`.
    thresholds_scores = None
    if thresh is None:
        thresholds = thresh_range
        scores = []
        recalls = []
        precisions = []
        for threshold in thresholds:
            predictions = []
            for k in range(embeddings.shape[0]):
                idx = np.where(distances[k, ] < threshold)[0]
                ids = indices[k, idx]
                # The scorer receives space-separated posting ids.
                predictions.append(' '.join(df['posting_id'].iloc[ids].values))
            df['pred_matches'] = predictions
            f1, precision, recall = f1_score(df['matches'], df['pred_matches'])
            print(f'Threshold {threshold:.2f}: F1 {f1.mean():.4f} Precision {precision.mean():.4f} Recall {recall.mean():.4f}')
            scores.append(f1.mean())
            recalls.append(recall.mean())
            precisions.append(precision.mean())

        thresholds_scores = pd.DataFrame({
            'thresholds': thresholds,
            'scores': scores,
            'recalls': recalls,
            'precisions': precisions
        })
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')
        thresh = best_threshold

    # Because we are predicting the test set that have 70K images and
    # different label groups, confidence should be smaller
    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k, ] < thresh)[0]
        ids = indices[k, idx]
        predictions.append(df['posting_id'].iloc[ids].values)

    del model, distances, indices
    gc.collect()
    return df, predictions, thresholds_scores