Example #1
def _construct_mnn(t1_cells, t2_cells, data_df, n_neighbors, device, n_jobs=-2):
    # Function to construct mutually nearest neighbors between two sets of cells
    
    if device == "gpu":
        from cuml import NearestNeighbors
        nbrs = NearestNeighbors(n_neighbors=n_neighbors,
                                metric='euclidean')
    elif device == "cpu":
        from sklearn.neighbors import NearestNeighbors
        nbrs = NearestNeighbors(n_neighbors=n_neighbors,
                                metric='euclidean', n_jobs=n_jobs)
    
    print(f't+1 neighbors of t...')
    nbrs.fit(data_df.loc[t1_cells, :].values)
    t1_nbrs = nbrs.kneighbors_graph(
        data_df.loc[t2_cells, :].values, mode='distance')

    print(f't neighbors of t+1...')
    nbrs.fit(data_df.loc[t2_cells, :].values)
    t2_nbrs = nbrs.kneighbors_graph(
        data_df.loc[t1_cells, :].values, mode='distance')

    # Mutually nearest neighbors
    mnn = t2_nbrs.multiply(t1_nbrs.T)
    mnn = mnn.sqrt()
    return mnn
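
A minimal usage sketch for the helper above. The cell names, sizes, and the CPU path are illustrative assumptions, not part of the original.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data_df = pd.DataFrame(rng.normal(size=(200, 30)),
                       index=[f"cell_{i}" for i in range(200)])
t1_cells = list(data_df.index[:100])   # cells from timepoint t
t2_cells = list(data_df.index[100:])   # cells from timepoint t+1

# CPU path; pass device="gpu" to route through cuml instead of sklearn.
mnn = _construct_mnn(t1_cells, t2_cells, data_df, n_neighbors=10, device="cpu")
print(mnn.shape, mnn.nnz)  # sparse matrix of mutual-nearest-neighbor distances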
Example #2
def find_similar_image():
    KNN = 50
    if len(test) == 3:
        KNN = 2

    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(image_embeddings)
    preds = []
    CHUNK = 1024 * 4

    print('Finding similar images...')
    CTS = len(image_embeddings) // CHUNK
    if len(image_embeddings) % CHUNK != 0:
        CTS += 1
    for j in range(CTS):

        a = j * CHUNK
        b = (j + 1) * CHUNK
        b = min(b, len(image_embeddings))
        print('chunk', a, 'to', b)
        distances, indices = model.kneighbors(image_embeddings[a:b, ])

        for k in range(b - a):
            IDX = np.where(distances[k,] < 6.0)[0]
            IDS = indices[k, IDX]
            o = test.iloc[IDS].posting_id.values
            preds.append(o)

    del model, distances, indices, image_embeddings, embeds
    _ = gc.collect()

    test['preds2'] = preds
    test.head()
Example #3
def get_image_predictions(df, embeddings, threshold=0.0):
    if len(df) > 3:
        KNN = 50
    else:
        KNN = 3

    model = NearestNeighbors(n_neighbors=KNN, metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k, ] < threshold)[0]
        ids = indices[k, idx]
        posting_ids = df['posting_id'].iloc[ids].values
        if len(posting_ids) >= 2:
            idx_s = np.where(distances[k, ] < threshold - 0.08888)[0]
            ids_s = indices[k, idx_s]
            posting_ids_b = df['posting_id'].iloc[ids_s].values
            if len(posting_ids_b) >= 2:
                predictions.append(posting_ids_b)
            else:
                predictions.append(posting_ids)
        else:
            idx = np.where(distances[k, ] < 0.51313)[0]
            ids = indices[k, idx]
            posting_ids = df['posting_id'].iloc[ids].values
            predictions.append(posting_ids[:2])

    del model, distances, indices
    gc.collect()
    return predictions
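
A hedged call sketch for get_image_predictions. The stand-in frame, embedding shape, and threshold value below are illustrative, and the snippet assumes the example's own imports (NearestNeighbors, tqdm, np, gc) are in scope.

from sklearn.preprocessing import normalize
import numpy as np
import pandas as pd

df = pd.DataFrame({'posting_id': [f'p{i}' for i in range(1000)]})  # stand-in
embeddings = normalize(np.random.default_rng(0).normal(size=(1000, 512)))
predictions = get_image_predictions(df, embeddings, threshold=0.36)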
Example #4
def find_matching_image_with_rapids():

    model = EfficientNetB0(weights='imagenet',
                           include_top=False,
                           pooling='avg',
                           input_shape=None)
    train_gen = DataGenerator(train, batch_size=128)
    image_embeddings = model.predict(train_gen, verbose=1)
    print('image embeddings shape is', image_embeddings.shape)

    # After fitting KNN, we will display some example rows of train and their 8 closest other images in train (based on EffNetB0 image embeddings).

    KNN = 50
    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(image_embeddings)
    distances, indices = model.kneighbors(image_embeddings)

    for k in range(180, 190):
        plt.figure(figsize=(20, 3))
        plt.plot(np.arange(50), cupy.asnumpy(distances[k, ]), 'o-')
        plt.title('Image Distance From Train Row %i to Other Train Rows' % k,
                  size=16)
        plt.ylabel('Distance to Train Row %i' % k, size=14)
        plt.xlabel('Index Sorted by Distance to Train Row %i' % k, size=14)
        plt.show()

        cluster = train.loc[cupy.asnumpy(indices[k, :8])]
        displayDF(cluster, random=False, ROWS=2, COLS=4)
Example #5
def get_preds(embs_path, threshold):
    image_embeddings = np.load(embs_path)['embeddings']

    KNN = 50
    if len(df) == 3:
        KNN = 2
    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(image_embeddings)

    image_embeddings = cupy.array(image_embeddings)

    preds = []
    CHUNK = 1024 * 4

    # print('Finding similar images...')
    CTS = len(image_embeddings) // CHUNK
    if len(image_embeddings) % CHUNK != 0:
        CTS += 1

    for j in range(CTS):

        a = j * CHUNK
        b = (j + 1) * CHUNK
        b = min(b, len(image_embeddings))

        cts = cupy.matmul(image_embeddings, image_embeddings[a:b].T).T

        for k in range(b - a):
            IDX = cupy.where(cts[k, ] > threshold)[0]
            IDX = cupy.asnumpy(IDX)
            o = cpu_df.iloc[IDX].posting_id.values
            preds.append(o)

    return preds
Example #6
def compute_neighbors_rapids(X: np.ndarray,
                             n_neighbors: int,
                             metric: _Metric = 'euclidean'):
    """Compute nearest neighbors using RAPIDS cuml.

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        The data to compute nearest neighbors for.
    n_neighbors
        The number of neighbors to use.
    metric
        The metric to use to compute distances in high dimensional space.
        This string must match a valid predefined metric in RAPIDS cuml.

    Returns
    -------
    **knn_indices**, **knn_dists** : np.arrays of shape (n_observations, n_neighbors)
    """
    from cuml.neighbors import NearestNeighbors

    nn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
    X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
    nn.fit(X_contiguous)
    knn_dist, knn_indices = nn.kneighbors(X_contiguous)
    return knn_indices, knn_dist
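
A quick smoke test for compute_neighbors_rapids (illustrative; the shapes are arbitrary, and a GPU with RAPIDS cuml installed is required):

import numpy as np

X = np.random.default_rng(0).normal(size=(1000, 50)).astype(np.float32)
knn_indices, knn_dists = compute_neighbors_rapids(X, n_neighbors=15)
assert knn_indices.shape == (1000, 15) and knn_dists.shape == (1000, 15)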
Example #7
def get_image_neighbors(df, embeddings, threshold=args.threshold):
    n_neighbors = args.n_neighbors_max if len(df) > 3 else args.n_neighbors_min
    model_nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors)
    model_nearest_neighbors.fit(embeddings)
    distances, indices = model_nearest_neighbors.kneighbors(embeddings)
    predictions = []
    for k in range(embeddings.shape[0]):
        idx = np.where(distances[k] < threshold)[0]
        ids = indices[k, idx]
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)
    del model_nearest_neighbors, distances, indices
    gc.collect()
    return predictions
Example #8
 def _find_distance_threshold(
     self,
     features,
     posting_ids: np.ndarray,
     thresholds: List[float],
 ) -> Tuple[float, float, List[List[str]]]:
     features = F.normalize(torch.from_numpy(features)).numpy()
     with TimeUtil.timer("nearest neighbor search"):
         model = NearestNeighbors(n_neighbors=len(self.valid_df), n_jobs=32)
         model.fit(features)
         distances, indices = model.kneighbors(features)
         FileUtil.save_npy(
             distances,
             self.config.dir_config.output_dir
             / f"distances_{self.fold}_{self.current_epoch:02d}.npy",
         )
         FileUtil.save_npy(
             indices,
             self.config.dir_config.output_dir
             / f"indices_{self.fold}_{self.current_epoch:02d}.npy",
         )
     best_score = 0
     best_threshold = -1
     best_y_pred: List[List[str]] = []
     for threshold in thresholds:
         y_pred = []
         for i in range(len(distances)):
             IDX = np.where(distances[i] < threshold)[0]
             if len(IDX) < self.config.inference_config.min_indices:
                 IDX = list(range(self.config.inference_config.min_indices))
             idxs = indices[i, IDX]
             y_pred.append(posting_ids[idxs])
         scores = MetricUtil.f1_scores(self.valid_df["target"].tolist(), y_pred)
         precisions, recalls = MetricUtil.precision_recall(
             self.valid_df["target"].tolist(), y_pred
         )
         self.valid_df["score"] = scores
         self.valid_df["precision"] = precisions
         self.valid_df["recall"] = recalls
         selected_score = self.valid_df["score"].mean()
         _p_mean = self.valid_df["precision"].mean()
         _r_mean = self.valid_df["recall"].mean()
         print(
             f"----------- valid f1: {selected_score} precision: {_p_mean} recall: {_r_mean} threshold: {threshold} ------------"
         )
         if selected_score > best_score:
             best_score = selected_score
             best_threshold = threshold
             best_y_pred = y_pred
     return best_score, best_threshold, best_y_pred
Example #9
def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors,
                              precomputed_nearest_neighbors):
    n_clusters = 30
    random_state = 42
    metric = 'euclidean'

    X, _ = make_blobs(n_samples=n_rows,
                      centers=n_clusters,
                      n_features=n_features,
                      random_state=random_state)

    if precomputed_nearest_neighbors:
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
        nn.fit(X)
        knn_dists, knn_indices = nn.kneighbors(X,
                                               n_neighbors,
                                               return_distance=True)
        cu_fss_graph = cu_fuzzy_simplicial_set(X,
                                               n_neighbors,
                                               random_state,
                                               metric,
                                               knn_indices=knn_indices,
                                               knn_dists=knn_dists)

        knn_indices = knn_indices.get()
        knn_dists = knn_dists.get()
        ref_fss_graph = ref_fuzzy_simplicial_set(
            X,
            n_neighbors,
            random_state,
            metric,
            knn_indices=knn_indices,
            knn_dists=knn_dists)[0].tocoo()
    else:
        cu_fss_graph = cu_fuzzy_simplicial_set(X, n_neighbors, random_state,
                                               metric)

        X = X.get()
        ref_fss_graph = ref_fuzzy_simplicial_set(X, n_neighbors, random_state,
                                                 metric)[0].tocoo()

    cu_fss_graph = cu_fss_graph.todense()
    ref_fss_graph = cp.sparse.coo_matrix(ref_fss_graph).todense()
    assert correctness_sparse(ref_fss_graph,
                              cu_fss_graph,
                              atol=0.1,
                              rtol=0.2,
                              threshold=0.95)
Example #10
def get_image_neighbors(df, embeddings, KNN=50):
    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    threshold = 4.5
    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k, ] < threshold)[0]
        ids = indices[k, idx]
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)

    del model, distances, indices
    gc.collect()
    return df, predictions
Example #11
    def reduce_dimensionality(self, embeddings):
        """Reduce dimensionality of embeddings using UMAP and train a UMAP model

        Args:
            embeddings (cupy.ndarray): The extracted embeddings using the
            sentence transformer module.

        Returns:
            umap_embeddings: The reduced embeddings
        """
        m_cos = NearestNeighbors(n_neighbors=15, metric="cosine")
        m_cos.fit(embeddings)
        knn_graph_cos = m_cos.kneighbors_graph(embeddings, mode="distance")
        u1 = UMAP(n_neighbors=15, n_components=5, min_dist=0.0)
        umap_embeddings = u1.fit_transform(embeddings, knn_graph=knn_graph_cos)

        return umap_embeddings
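
The same precomputed-graph pattern outside the class, as a standalone sketch. The stand-in embeddings and their shape are assumptions; passing knn_graph to cuml's UMAP is what the method above already relies on.

import cupy as cp
from cuml.neighbors import NearestNeighbors
from cuml.manifold import UMAP

emb = cp.random.random((5000, 384), dtype=cp.float32)  # stand-in embeddings
nn = NearestNeighbors(n_neighbors=15, metric="cosine")
nn.fit(emb)
graph = nn.kneighbors_graph(emb, mode="distance")  # precomputed cosine kNN graph
reduced = UMAP(n_neighbors=15, n_components=5,
               min_dist=0.0).fit_transform(emb, knn_graph=graph)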
Example #12
def get_image_neighbors(df, embeddings, KNN=50):
    model = NearestNeighbors(n_neighbors=KNN)  # build the kNN model
    model.fit(embeddings)  # fit on the image features
    distances, indices = model.kneighbors(embeddings)  # pairwise distances (similarities) between images

    predictions = []
    for k in tqdm(range(embeddings.shape[0])):  # compare every image against all the others
        idx = np.where(
            distances[k, ] < CFG.img_thres)[0]  # the threshold sets how strict the matching is
        # for images that matched nothing else, relax the threshold and match again
        if len(idx) == 1:
            idx = np.where(distances[k, ] < (CFG.img_thres + CFG.addition))[0]
        ids = indices[k, idx]
        posting_ids = df['posting_id'].iloc[ids].values  # the matched postings
        predictions.append(posting_ids)

    del model, distances, indices
    gc.collect()
    return predictions
Example #13
File: __init__.py Project: hkailee/quanp
def compute_neighbors_rapids(X: np.ndarray, n_neighbors: int):
    """Compute nearest neighbors using RAPIDS cuml.
    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        The data to compute nearest neighbors for.
    n_neighbors
        The number of neighbors to use.
        Returns
    -------
    **knn_indices**, **knn_dists** : np.arrays of shape (n_observations, n_neighbors)
    """
    from cuml.neighbors import NearestNeighbors
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
    nn.fit(X_contiguous)
    knn_distsq, knn_indices = nn.kneighbors(X_contiguous)
    # cuml uses the sqeuclidean metric, so take the sqrt
    return knn_indices, np.sqrt(knn_distsq)
Example #14
 def setNodeData(self, nodes_data: xa.DataArray, **kwargs):
     print(
         f"{self.__class__.__name__}[{hex(id(self))}].setNodeData: input shape = {nodes_data.shape}"
     )
     if self.reset or (self.nodes is None):
         if (nodes_data.size > 0):
             t0 = time.time()
             self.nodes = cudf.DataFrame({
                 icol: nodes_data[:, icol]
                 for icol in range(nodes_data.shape[1])
             })
             self.nnd = NearestNeighbors(n_neighbors=self.nneighbors)
             self.nnd.fit(self.nodes)
             self.D, self.I = self.nnd.kneighbors(self.nodes,
                                                  return_distance=True)
             dt = (time.time() - t0)
             print(
                 f"Computed NN Graph with {self.nnd.n_neighbors} neighbors and {nodes_data.shape[0]} verts in {dt} sec ({dt/60} min)"
             )
             print(
                 f"  ---> Indices shape = {self.I.shape}, Distances shape = {self.D.shape} "
             )
         else:
             print("No data available for this block")
Example #15
def compute_neighbors_sklearn(X: np.ndarray, n_neighbors: int):
    """Compute nearest neighbors using sklearn

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        The data to compute nearest neighbors for.
    n_neighbors
        The number of neighbors to use.

    Returns
    -------
    **knn_indices**, **knn_dists** : np.arrays of shape (n_observations, n_neighbors)
    """
    from sklearn.neighbors import NearestNeighbors
    import time

    t0 = time.time()
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
    nn.fit(X_contiguous)
    knn_dist, knn_indices = nn.kneighbors(X_contiguous)
    print("Here", time.time() - t0)
    return knn_indices, knn_dist
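
Since the two helpers return the same (indices, distances) pair, a small dispatcher (illustrative, not from the original) can pick the GPU path when cuml is importable:

def compute_neighbors(X, n_neighbors):
    # Prefer the RAPIDS path when cuml is available; fall back to sklearn.
    try:
        import cuml  # noqa: F401
        return compute_neighbors_rapids(X, n_neighbors)
    except ImportError:
        return compute_neighbors_sklearn(X, n_neighbors)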
Example #16
def find_similar_titles_with_rapids_knn():
    """
    First we will extract text embeddings using RAPIDS cuML's TfidfVectorizer.
    This will turn every title into a one-hot-encoding of the words present.
    We will then compare one-hot-encodings with RAPIDS cuML KNN to find titles that are similar.
    :return:
    """
    # LOAD TRAIN ONTO THE GPU WITH CUDF
    train_gf = cudf.read_csv('../input/shopee-product-matching/train.csv')
    print('train shape is', train_gf.shape)
    train_gf.head()

    # Extract Text Embeddings with RAPIDS TfidfVectorizer
    # TfidfVectorizer returns a cupy sparse matrix.
    # Afterward we convert to a cupy dense matrix and feed that into RAPIDS cuML KNN.

    model = TfidfVectorizer(stop_words='english', binary=True)
    text_embeddings = model.fit_transform(train_gf.title).toarray()
    print('text embeddings shape is', text_embeddings.shape)

    # After fitting KNN, we will display some example rows of train and their 10 closest other titles in train (based on word count one-hot-encoding).

    KNN = 50
    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(text_embeddings)
    distances, indices = model.kneighbors(text_embeddings)

    for k in range(5):
        plt.figure(figsize=(20, 3))
        plt.plot(np.arange(50), cupy.asnumpy(distances[k, ]), 'o-')
        plt.title('Text Distance From Train Row %i to Other Train Rows' % k,
                  size=16)
        plt.ylabel('Distance to Train Row %i' % k, size=14)
        plt.xlabel('Index Sorted by Distance to Train Row %i' % k, size=14)
        plt.show()

        print(train_gf.loc[cupy.asnumpy(indices[k, :10]),
                           ['title', 'label_group']])
Example #17
def computeGrid(img_collection, X_2d, out_res, out_dim):
    #grid = np.zeros(())
    out = np.ones((out_dim * out_res, out_dim * out_res, 4), dtype=np.uint8)
    nn = NearestNeighbors(n_neighbors=1)
    d = {}

    # TODO: assumes unique
    #for i, xy in enumerate(X_2d):
    #    d[tuple(xy)] = i

    def build(remaining):
        xs = sorted(remaining, key=lambda x: x[0])
        ys = sorted(remaining, key=lambda x: x[1])
        X = np.zeros((out_dim * out_dim, 2), np.float32)
        for x in range(out_dim):
            for y in range(out_dim):
                X[y * out_dim + x] = np.array([
                    xs[int(x * len(xs) / out_dim)][0],
                    ys[int(y * len(ys) / out_dim)][1]
                ])
        #X = np.array(X)
        #print(X.shape)
        nn.fit(remaining)
        X_cudf = cudf.DataFrame(X)
        distances, indices = nn.kneighbors(X_cudf)
        return indices

    #remaining = X_2d
    indices = build(X_2d)

    seen = {}
    #for i, x in enumerate(xs):
    #for j, y in enumerate(ys):
    for x in range(out_dim):
        for y in range(out_dim):
            done = False
            e = 0
            while not done:
                #X = (xs[int(x * len(xs) / out_dim)], ys[int(y * len(ys) / out_dim)])
                #X_cudf = cudf.DataFrame(X)
                #distances, indices = nn.kneighbors(X_cudf)
                p = nearest = indices[(y * out_dim + x + e) % len(indices)]
                if p in seen:
                    e += 1
                    #print("rebuild")
                    #remaining = [X_2d[x] for i, x in enumerate(X_2d) if not i in seen]
                    #indices = build(remaining)
                else:
                    seen[p] = True
                    done = True
            #print("nearest", nearest)
            #p = d[tuple(nearest)]
            #grid[i][j] = img_collection[p]
            pos = (x, y)
            print("xyp", x, y, p)
            img = img_collection[p]
            h_range = x * out_res
            w_range = y * out_res
            #print("range", h_range, h_range + out_res, w_range, w_range + out_res)
            out[h_range:h_range + out_res,
                w_range:w_range + out_res, :] = readImage(img, out_res)

            #break
        #break

    #im = image.array_to_img(out)
    im = Image.fromarray(out)
    im.save(out_dir + out_name, quality=100)
Example #18
def spreadXY(X_2d, threshold, speed):
    'spreads items until distance is greater than threshold'

    import cudf
    from cuml.neighbors import NearestNeighbors

    def kernel(x, y, outx, outy, threshold2):
        for i, (x2, y2) in enumerate(zip(x, y)):
            d = math.sqrt(x2 * x2 + y2 * y2)
            if 0 < d <= threshold2:
                outx[i] = x2 / d
                outy[i] = y2 / d
            else:
                outx[i] = 0
                outy[i] = 0

    print('spreadXY')
    length = len(X_2d)
    X = cudf.DataFrame()
    X['x'] = X_2d[0:length, 0]
    X['y'] = X_2d[0:length, 1]
    k = 8
    scale = 10000
    threshold *= scale
    speed *= scale
    X = X.mul(scale)
    #X = np.copy(X_2d[:length])
    for i in range(20):
        nn = NearestNeighbors(n_neighbors=k)
        nn.fit(X)
        distances, indices = nn.kneighbors(X)
        #print(distances.shape)
        joins = []

        s = X.sum()
        print("iteration", i, "sum dist", s)

        newX = X
        for j in range(k):
            join = indices.drop([x for x in range(k) if x != j
                                 ])  #.rename(mapper={j: 'x'}, columns=[j])
            join = join.merge(X, how='left', left_on=[j], right_index=True)
            join = join.drop(j)
            v = join.sub(X)
            v = v.apply_rows(kernel,
                             incols=['x', 'y'],
                             outcols=dict(outx=np.float32, outy=np.float32),
                             kwargs=dict(threshold2=threshold))
            v = v.drop(['x', 'y'])
            v = v.rename(columns={'outx': 'x', 'outy': 'y'})
            newX = newX.sub(v.mul(speed))
            #newX = newX.add(1)
            #v = v.query('x * x + y * y <= ' + str(threshold * threshold))
        #print("newX")
        #print(newX)
        X = newX

        s = X.sum()
        print("iteration", i, "sum dist", s)
    X = X.truediv(scale)
    X = X.to_numpy()  # as_matrix() was removed from cudf; to_numpy() returns a host ndarray
    print(X.shape)
    return X
Example #19
def augmented_affinity_matrix(
    data_df,
    timepoints,
    timepoint_connections,
    n_neighbors=30,
    n_jobs=-2,
    pc_components=1000,
    device="cpu",
):
    """Function for max min sampling of waypoints

    :param data_df: Normalized data frame. Data frame should be sorted according to the timepoints
    :param timepoints: Panadas series indicating timepoints for each cell in data_df
    :param timepoint_connections: Links between timepoints
    :param n_neighbors: Number of nearest neighbors for graph construction
    :param n_jobs: Nearest Neighbors will be computed in parallel using n_jobs.
    :param pc_components: Minimum number of principal components to use. Specify `None` to use pre-computed components
    :return: Affinity matrix  augmented to mutually nearest neighbors
    """

    # Timepoints nad data_df should in same order
    timepoints = timepoints[data_df.index]
    cell_order = data_df.index

    # Time point cells and indices
    tp_cells = pd.Series()
    tp_offset = pd.Series()
    offset = 0
    for i in timepoints.unique():
        tp_offset[i] = offset
        tp_cells[i] = list(timepoints.index[timepoints == i])
        offset += len(tp_cells[i])

    # Run PCA to denoise the dropouts
    if pc_components is None:
        pca_projections = data_df
    else:
        pca_projections, _ = utils.run_pca(data_df, device, n_components=pc_components)

    # Nearest neighbor graph construction and affinity matrix
    print('Nearest neighbor computation...')

    # --------------------------------------------------------------------------
    # nbrs = NearestNeighbors(n_neighbors=n_neighbors,
    #                         metric='euclidean', n_jobs=-2)
    # nbrs.fit(pca_projections.values)
    # dists, _ = nbrs.kneighbors(pca_projections.values)
    # adj = nbrs.kneighbors_graph(pca_projections.values, mode='distance')
    # # Scaling factors for affinity matrix construction
    # ka = np.int(n_neighbors / 3)
    # scaling_factors = pd.Series(dists[:, ka], index=cell_order)
    # # Affinity matrix
    # nn_aff = _convert_to_affinity(adj, scaling_factors, True)
    # --------------------------------------------------------------------------
    
    if device == "cpu":
        temp = sc.AnnData(data_df.values)
        sc.pp.neighbors(temp, n_pcs=0, n_neighbors=n_neighbors)
        # maintaining backwards compatibility to Scanpy `sc.pp.neighbors`
        try:
            kNN = temp.uns['neighbors']['distances']
        except KeyError:
            kNN = temp.obsp['distances']
    elif device == "gpu":
        from cuml.neighbors import NearestNeighbors
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean")
        X_contiguous = np.ascontiguousarray(data_df.values)
        nn.fit(X_contiguous)

        kNN = nn.kneighbors_graph(X_contiguous, mode="distance")
        kNN.setdiag(0)
        kNN.eliminate_zeros()

    # Adaptive k
    adaptive_k = int(np.floor(n_neighbors / 3))
    scaling_factors = np.zeros(data_df.shape[0])

    for i in np.arange(len(scaling_factors)):
        scaling_factors[i] = np.sort(kNN.data[kNN.indptr[i]:kNN.indptr[i + 1]])[adaptive_k - 1]

    scaling_factors = pd.Series(scaling_factors, index=cell_order)

    # Affinity matrix
    nn_aff = _convert_to_affinity(kNN, scaling_factors, device, True)

    # Mutually nearest neighbor affinity matrix
    # Initialize mnn affinity matrix
    N = len(cell_order)
    full_mnn_aff = csr_matrix(([0], ([0], [0])), [N, N])
    for i in timepoint_connections.index:
        t1, t2 = timepoint_connections.loc[i, :].values
        print(f'Constructing affinities between {t1} and {t2}...')

        # MNN matrix  and distance to ka the distance
        t1_cells = tp_cells[t1]
        t2_cells = tp_cells[t2]
        mnn = _construct_mnn(t1_cells, t2_cells, pca_projections,
                             n_neighbors, device, n_jobs)

        # MNN Scaling factors
        # Distance to the adaptive neighbor
        ka_dists = pd.Series(0.0, index=t1_cells + t2_cells)
        # T1 scaling factors
        ka_dists[t1_cells] = _mnn_ka_distances(mnn, n_neighbors)
        # T2 scaling factors
        ka_dists[t2_cells] = _mnn_ka_distances(mnn.T, n_neighbors)

        # Scaling factors
        mnn_scaling_factors = pd.Series(0.0, index=cell_order)
        mnn_scaling_factors[t1_cells] = _mnn_scaling_factors(
            ka_dists[t1_cells], scaling_factors, device)
        mnn_scaling_factors[t2_cells] = _mnn_scaling_factors(
            ka_dists[t2_cells], scaling_factors, device)

        # MNN affinity matrix
        full_mnn_aff = full_mnn_aff + \
            _mnn_affinity(mnn, mnn_scaling_factors,
                          tp_offset[t1], tp_offset[t2], device)

    # Symmetrize the affinity matrix and return
    aff = nn_aff + nn_aff.T + full_mnn_aff + full_mnn_aff.T
    return aff, nn_aff + nn_aff.T
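
A hedged call sketch for the function above: data_df and timepoints are hypothetical inputs shaped as the docstring describes, and the timepoint labels are illustrative.

import pandas as pd

# Two links, one row per connection: t1 -> t2 and t2 -> t3.
timepoint_connections = pd.DataFrame([["t1", "t2"], ["t2", "t3"]])
aff, nn_aff = augmented_affinity_matrix(data_df, timepoints,
                                        timepoint_connections,
                                        n_neighbors=30, device="cpu")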
Example #20
def mk_nnmdl(feats, n_nbrs=N_NBRS):
    nnmdl = NearestNeighbors(n_neighbors=n_nbrs, metric="cosine")
    nnmdl.fit(feats)
    return nnmdl
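
An illustrative query against the fitted model, assuming `feats` is any (n_samples, n_features) float array and N_NBRS is defined by the surrounding module:

nnmdl = mk_nnmdl(feats)
dists, idxs = nnmdl.kneighbors(feats[:5])  # 5 query rows, n_nbrs hits each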
Example #21
class gpActivationFlow(ActivationFlow):
    def __init__(self, nodes_data: xa.DataArray, n_neighbors: int, **kwargs):
        ActivationFlow.__init__(self, n_neighbors, **kwargs)
        self.I: cudf.DataFrame = None
        self.D: cudf.DataFrame = None
        self.P: cudf.DataFrame = None
        self.C: cudf.DataFrame = None
        self.nodes: cudf.DataFrame = None
        self.setNodeData(nodes_data, **kwargs)

    def setNodeData(self, nodes_data: xa.DataArray, **kwargs):
        print(
            f"{self.__class__.__name__}[{hex(id(self))}].setNodeData: input shape = {nodes_data.shape}"
        )
        if self.reset or (self.nodes is None):
            if (nodes_data.size > 0):
                t0 = time.time()
                self.nodes = cudf.DataFrame({
                    icol: nodes_data[:, icol]
                    for icol in range(nodes_data.shape[1])
                })
                self.nnd = NearestNeighbors(n_neighbors=self.nneighbors)
                self.nnd.fit(self.nodes)
                self.D, self.I = self.nnd.kneighbors(self.nodes,
                                                     return_distance=True)
                dt = (time.time() - t0)
                print(
                    f"Computed NN Graph with {self.nnd.n_neighbors} neighbors and {nodes_data.shape[0]} verts in {dt} sec ({dt/60} min)"
                )
                print(
                    f"  ---> Indices shape = {self.I.shape}, Distances shape = {self.D.shape} "
                )
            else:
                print("No data available for this block")

    def getGraph(self):
        return None

    def getConnectionMatrix(self) -> csr_matrix:
        distances = cupy.ravel(cupy.fromDlpack(self.D.to_dlpack()))
        indices = cupy.ravel(cupy.fromDlpack(self.I.to_dlpack()))
        n_samples = indices.shape[0]
        n_nonzero = n_samples * self.nneighbors
        rowptr = cupy.arange(0, n_nonzero + 1, self.nneighbors)
        knn_graph = cupyx.scipy.sparse.csr_matrix((distances, indices, rowptr),
                                                  shape=(n_samples, n_samples))
        print(f"Completed KNN, sparse graph shape = {knn_graph.shape}")
        return knn_graph

    def spread(self,
               sample_data: np.ndarray,
               nIter: int = 1,
               **kwargs) -> Optional[bool]:
        converged = True

        spdf = shortest_path(G, source_pid)  # G and source_pid are expected from context
        spdf = spdf.sort_values("vertex")  # cudf DataFrames sort with sort_values
        distances = spdf["distance"]

        self.reset = False
        return converged
Example #22
import cudf, cuml, cupy, cupyx
from cuml.neighbors import NearestNeighbors

# Using cudf Dataframe here is not likely to help with performance
# However, it's a good opportunity to get familiar with the API
source_df: cudf.DataFrame = cudf.read_csv(
    '/att/nobackup/tpmaxwel/data/fashion-mnist-csv/fashion_train.csv')
data = source_df.loc[:, source_df.columns[:-1]]
target = source_df[source_df.columns[-1]]
n_neighbors = 5

# fit model
model = NearestNeighbors(n_neighbors=5)
model.fit(data)

# get nearest neighbors
dist_mlarr, ind_mlarr = model.kneighbors(data, return_distance=True)

# create sparse matrix
distances = cupy.ravel(cupy.fromDlpack(dist_mlarr.to_dlpack()))
indices = cupy.ravel(cupy.fromDlpack(ind_mlarr.to_dlpack()))
print(
    f"Computed KNN graph, distances shape = {distances.shape}, indices shape = {indices.shape}, distances[0:5]= {distances[0:5]}, indices[0:5]= {indices[0:5]}"
)
n_samples = indices.shape[0]
n_nonzero = n_samples * n_neighbors
rowptr = cupy.arange(0, n_nonzero + 1, n_neighbors)
knn_graph = cupyx.scipy.sparse.csr_matrix((distances, indices, rowptr),
                                          shape=(n_samples, n_samples))

print(f"Completed KNN, graph shape = {knn_graph.shape}")
Example #23
        imagefeat.append(feat)

# In[11]:

from sklearn.preprocessing import normalize

# L2-normalize so all cosine similarities land in the 0-1 range
imagefeat = np.vstack(imagefeat)
imagefeat = normalize(imagefeat)

# In[12]:

KNN = 50
if len(test) == 3: KNN = 2
model = NearestNeighbors(n_neighbors=KNN)
model.fit(imagefeat)

# In[13]:

preds = []
CHUNK = 1024 * 4

imagefeat = cupy.array(imagefeat)

print('Finding similar images...')
CTS = len(imagefeat) // CHUNK
if len(imagefeat) % CHUNK != 0: CTS += 1
for j in range(CTS):

    a = j * CHUNK
Example #24
def diffusion(
    adata: AnnData,
    n_components=10,
    knn=30,
    alpha=0,
    multiscale: bool = True,
    n_eigs: int = None,
    device="cpu",
    n_pcs=50,
    copy=False,
):
    """\
    Wrapper to generate diffusion maps using Palantir.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_components
        Number of diffusion components.
    knn
        Number of nearest neighbors for graph construction.
    alpha
        Normalization parameter for the diffusion operator.
    multiscale
        Whether to get multiscale diffusion space
        (calls palantir.utils.determine_multiscale_space).
    n_eigs
        if multiscale is True, how many components to retain.
    device
        Run method on either `cpu` or on `gpu`.
    n_pcs
        Number of PC components.
    copy
        Return a copy instead of writing to adata.
    Returns
    -------
    adata : anndata.AnnData
        if `copy=True` it returns AnnData, else it updates fields of `adata`:

        `.obsm['X_diffusion']`
            if `multiscale = False`, diffusion space.
        `.obsm['X_diffusion_multiscale']`
            if `multiscale = True`, multiscale diffusion space.
        `.uns['diffusion']`
            dict containing results from Palantir.
    """

    logg.info("Running Diffusion maps ", reset=True)

    data_df = pd.DataFrame(adata.obsm["X_pca"], index=adata.obs_names)

    if device == "cpu":
        from palantir.utils import run_diffusion_maps

        res = run_diffusion_maps(data_df,
                                 n_components=n_components,
                                 knn=knn,
                                 alpha=alpha)
    # code ported to GPU; results are not reproducible!
    elif device == "gpu":
        logg.warn(
            "GPU implementation uses eigsh from cupy.sparse, which is not currently reproducible and can give unstable results!"
        )
        import cupy as cp
        from cupyx.scipy.sparse import csr_matrix as csr_matrix_gpu
        from cupyx.scipy.sparse.linalg import eigsh

        # Determine the kernel
        N = data_df.shape[0]
        if not issparse(data_df):
            from cuml.neighbors import NearestNeighbors

            nn = NearestNeighbors(n_neighbors=knn, metric="euclidean")
            X_contiguous = np.ascontiguousarray(data_df.values)
            nn.fit(X_contiguous)

            kNN = nn.kneighbors_graph(X_contiguous, mode="distance")
            kNN.setdiag(0)
            kNN.eliminate_zeros()

            # Adaptive k
            adaptive_k = int(np.floor(knn / 3))
            adaptive_std = np.zeros(N)

            for i in np.arange(len(adaptive_std)):
                adaptive_std[i] = np.sort(
                    kNN.data[kNN.indptr[i]:kNN.indptr[i + 1]])[adaptive_k - 1]

            # Kernel
            x, y, dists = find(kNN)

            # X, y specific stds
            dists = dists / adaptive_std[x]
            W = csr_matrix((np.exp(-dists), (x, y)), shape=[N, N])

            # Diffusion components
            kernel = W + W.T
        else:
            kernel = data_df

        # Markov
        D = np.ravel(kernel.sum(axis=1))

        if alpha > 0:
            # L_alpha
            D[D != 0] = D[D != 0]**(-alpha)
            mat = csr_matrix((D, (range(N), range(N))), shape=[N, N])
            kernel = mat.dot(kernel).dot(mat)
            D = np.ravel(kernel.sum(axis=1))

        D[D != 0] = 1 / D[D != 0]
        kernel = csr_matrix_gpu(kernel)
        D = csr_matrix_gpu((cp.array(D), (cp.arange(N), cp.arange(N))),
                           shape=(N, N))
        T = D.dot(kernel)
        # Eigenvalue decomposition
        D, V = eigsh(T, n_components, tol=1e-4, maxiter=1000)
        D, V = D.get(), V.get()

        inds = np.argsort(D)[::-1]
        D = D[inds]
        V = V[:, inds]

        # Normalize
        for i in range(V.shape[1]):
            V[:, i] = V[:, i] / np.linalg.norm(V[:, i])

        # Create the results dictionary
        res = {"T": T.get(), "EigenVectors": V, "EigenValues": D}
        res["EigenVectors"] = pd.DataFrame(res["EigenVectors"])
        if not issparse(data_df):
            res["EigenVectors"].index = data_df.index
        res["EigenValues"] = pd.Series(res["EigenValues"])
        res["kernel"] = kernel.get()

    if multiscale:
        logg.info("    determining multiscale diffusion space")
        from palantir.utils import determine_multiscale_space

        adata.obsm["X_diffusion_multiscale"] = determine_multiscale_space(
            res, n_eigs=n_eigs).values
        logstr = "    .obsm['X_diffusion_multiscale'], multiscale diffusion space.\n"
    else:
        adata.obsm["X_diffusion"] = res["EigenVectors"].iloc[:, 1:].values
        logstr = "    .obsm['X_diffusion'], diffusion space.\n"

    adata.uns["diffusion"] = res

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint("added \n" + logstr +
              "    .uns['diffusion'] dict containing diffusion maps results.")
Example #25
def fast_knn(
    X,
    *,
    n_clusters: int = 5,
    n_neighbors: Optional[int] = None,
    graph_mode='distance',
    cluster_mode='spectral',
    algorithm='brute',
    n_jobs: Optional[int] = None,
    random_state: int = 1,
    framework: Literal['auto', 'cuml', 'sklearn'] = 'auto',
) -> NearestNeighbors:
    """
  Parameters
  ----------
  X : `ndarray` or tuple of (X, y)
  n_neighbors: int (default = 5)
    The top K closest datapoints you want the algorithm to return.
    Currently, this value must be < 1024.
  graph_mode : {'distance', 'connectivity'}, default='distance'
    This mode decides which values `kneighbors_graph` will return:
      - 'connectivity' : will return the connectivity matrix with ones and
        zeros (for 'SpectralClustering').
      - 'distance' : will return the distances between neighbors according
        to the given metric (for 'DBSCAN').
  cluster_mode: {'vote', 'spectral', 'isomap'}, default='vote'
      This mode decides how to generate cluster prediction from the
      neighbors graph:
      - 'dbscan' :
      - 'spectral' :
      - 'isomap' :
      - 'kmeans' :
  algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
      Algorithm used to compute the nearest neighbors:
      - 'ball_tree' will use :class:`BallTree`
      - 'kd_tree' will use :class:`KDTree`
      - 'brute' will use a brute-force search.
      - 'auto' will attempt to decide the most appropriate algorithm
        based on the values passed to :meth:`fit` method.
      Note: fitting on sparse input will override the setting of
      this parameter, using brute force.
  """
    kwargs = dict(locals())
    X = kwargs.pop('X')
    framework = kwargs.pop('framework')
    random_state = kwargs.pop('random_state')
    n_clusters = int(kwargs.pop('n_clusters'))
    if n_neighbors is None:
        kwargs['n_neighbors'] = n_clusters
        n_neighbors = n_clusters
    ## graph mode
    graph_mode = str(kwargs.pop('graph_mode')).strip().lower()
    assert graph_mode in ('distance', 'connectivity')
    ## cluster mode
    cluster_mode = str(kwargs.pop('cluster_mode')).strip().lower()
    ## fine-tuning the kwargs
    use_cuml = _check_cuml(framework)
    if use_cuml:
        from cuml.neighbors import NearestNeighbors as KNN
        kwargs.pop('n_jobs')
        kwargs.pop('algorithm')
    else:
        KNN = NearestNeighbors
    ## fitting
    knn = KNN(**kwargs)
    knn.fit(X)
    knn._fitid = id(X)
    ## Transform mode
    knn._random_state = random_state
    knn._n_clusters = n_clusters
    knn._graph_mode = graph_mode
    knn._cluster_mode = cluster_mode
    if use_cuml:
        knn.n_samples_fit_ = X.shape[0]
    knn.kneighbors_graph = types.MethodType(nn_kneighbors_graph, knn)
    knn.transform = types.MethodType(nn_transform, knn)
    knn.fit_transform = types.MethodType(nn_fit_transform, knn)
    knn.predict = types.MethodType(nn_predict, knn)
    return knn
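
A usage sketch under the sklearn fallback (the data and shapes are arbitrary, and the module's patched helpers such as nn_predict are assumed importable as in the original):

import numpy as np

X = np.random.default_rng(1).normal(size=(500, 16)).astype(np.float32)
knn = fast_knn(X, n_clusters=5, framework='sklearn')
dists, idxs = knn.kneighbors(X[:10])  # plain NearestNeighbors query still works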
Example #26
File: knn.py Project: ckhui/DL-shopee
def KNN_predict(df, embeddings, KNN=50, thresh=None, thresh_range=None):
    '''
    thresh_range: np.arange for threshold selection
    thresh: distance threshold for result matching

    image: 2.7, tfidf: 0.6
    image: list(np.arange(2, 10, 0.5))
    text : list(np.arange(0.1, 1, 0.1))
    '''
    assert ((thresh is None) or (thresh_range is None)), "Provide only one of `thresh` or `thresh_range`, not both"
    if thresh_range is not None:
        assert 'matches' in df.columns, "Cannot perform threshold selection on testing data"

    model = NearestNeighbors(n_neighbors = KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    
    # Iterate through different thresholds to maximize CV; run this in interactive mode, then replace the sweep with a fixed threshold
    thresholds_scores = None
    if thresh is None:
        thresholds = thresh_range

        scores = []
        recalls = []
        precisions = []
        for threshold in thresholds:
            predictions = []
            for k in range(embeddings.shape[0]):
                idx = np.where(distances[k,] < threshold)[0]
                ids = indices[k,idx]
                posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
                predictions.append(posting_ids)
            df['pred_matches'] = predictions
            f1, precision, recall = f1_score(df['matches'], df['pred_matches'])
            print(f'Threshold {threshold:.2f}: F1 {f1.mean():.4f} Precision {precision.mean():.4f} Recall {recall.mean():.4f}')
            scores.append(f1.mean())
            recalls.append(recall.mean())
            precisions.append(precision.mean())
        thresholds_scores = pd.DataFrame({
            'thresholds': thresholds, 
            'scores': scores, 
            'recalls': recalls, 
            'precisions': precisions
            })
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')
    
        thresh = best_threshold
    # Because we are predicting the test set, which has 70K images and different label groups, confidence should be smaller
    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k,] < thresh)[0]
        ids = indices[k,idx]
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)
        
    del model, distances, indices
    gc.collect()
    return df, predictions, thresholds_scores
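
Illustrative calls matching the docstring's suggested values (valid_df/test_df and their embeddings are hypothetical; the sweep range and the 2.7 image threshold come from the docstring above):

import numpy as np

# Threshold selection on labeled data (needs a `matches` column)...
df, preds, scores = KNN_predict(valid_df, valid_emb, KNN=50,
                                thresh_range=list(np.arange(2, 10, 0.5)))
# ...then a fixed threshold on the test set.
df, preds, _ = KNN_predict(test_df, test_emb, KNN=50, thresh=2.7)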