Example #1
File: seurat.py  Project: jksr/ALLCools
def find_neighbor(cc1, cc2, k, random_state=0):
    """
    Find all four sets of neighbors (within and across) for two datasets.

    Parameters
    ----------
    cc1
        cc for dataset 1
    cc2
        cc for dataset 2
    k
        number of neighbors

    Returns
    -------
    G11, G12, G21, G22 neighbor matrices, each of shape (n_cell, k)
    """
    index = pynndescent.NNDescent(cc1,
                                  metric='euclidean',
                                  n_neighbors=k + 1,
                                  random_state=random_state)
    G11 = index.neighbor_graph[0][:, 1:k + 1]
    G21 = index.query(cc2, k=k)[0]
    index = pynndescent.NNDescent(cc2,
                                  metric='euclidean',
                                  n_neighbors=k + 1,
                                  random_state=random_state)
    G22 = index.neighbor_graph[0][:, 1:k + 1]
    G12 = index.query(cc1, k=k)[0]
    return G11, G12, G21, G22
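
A minimal usage sketch for find_neighbor above, with random matrices standing in for real canonical-correlation (CC) coordinates; the array shapes and k are placeholders:

import numpy as np

# Hypothetical CC coordinates of two datasets sharing a 30-dimensional space.
rng = np.random.default_rng(0)
cc1 = rng.normal(size=(500, 30))
cc2 = rng.normal(size=(300, 30))

G11, G12, G21, G22 = find_neighbor(cc1, cc2, k=10)
print(G11.shape, G12.shape, G21.shape, G22.shape)
# (500, 10) (500, 10) (300, 10) (300, 10)
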
Example #2
    def fit(self, X):
        if self._pynnd_metric == "jaccard":
            # Convert to sparse matrix format
            X = self._sparse_convert_for_fit(X)

        self._index = pynndescent.NNDescent(
            X,
            n_neighbors=self._n_neighbors,
            metric=self._pynnd_metric,
            low_memory=True,
            leaf_size=self._leaf_size,
            pruning_degree_multiplier=self._pruning_degree_multiplier,
            diversify_prob=self._diversify_prob,
            n_search_trees=self._n_search_trees,
            compressed=True,
            verbose=True,
        )
        if hasattr(self._index, "prepare"):
            self._index.prepare()
        else:
            self._index._init_search_graph()
            if self._index._is_sparse:
                if hasattr(self._index, "_init_sparse_search_function"):
                    self._index._init_sparse_search_function()
            else:
                if hasattr(self._index, "_init_search_function"):
                    self._index._init_search_function()
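
The version check above exists because NNDescent.prepare() is only available in newer pynndescent releases; older versions expose private initialisation helpers instead. A standalone sketch of the same compatibility idiom (the helper name is ours, not a library API):

def prepare_index(index):
    """Make a pynndescent.NNDescent index ready for querying, old or new version."""
    if hasattr(index, "prepare"):
        # Current public API.
        index.prepare()
    else:
        # Fallback mirroring the fit() method above for old releases.
        index._init_search_graph()
        if getattr(index, "_is_sparse", False):
            if hasattr(index, "_init_sparse_search_function"):
                index._init_sparse_search_function()
        elif hasattr(index, "_init_search_function"):
            index._init_search_function()
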
Example #3
def create_tree(data, params):
    '''
	Create a faiss/cKDTree/KDTree/annoy/pynndescent index for nearest neighbour lookup.
	All undescribed inputs are as in ``bbknn.bbknn()``. Returns the resulting index.

	Input
	-----
	data : ``numpy.array``
		PCA coordinates of a batch's cells to index.
	params : ``dict``
		A dictionary of arguments used to call ``bbknn.matrix.bbknn()``, plus ['computation']
		storing the knn algorithm to use.
	'''
    if params['computation'] == 'annoy':
        ckd = AnnoyIndex(data.shape[1], metric=params['metric'])
        for i in np.arange(data.shape[0]):
            ckd.add_item(i, data[i, :])
        ckd.build(params['annoy_n_trees'])
    elif params['computation'] == 'pynndescent':
        ckd = pynndescent.NNDescent(
            data,
            metric=params['metric'],
            n_jobs=-1,
            n_neighbors=params['pynndescent_n_neighbors'],
            random_state=params['pynndescent_random_state'])
        ckd.prepare()
    elif params['computation'] == 'faiss':
        ckd = faiss.IndexFlatL2(data.shape[1])
        ckd.add(data)
    elif params['computation'] == 'cKDTree':
        ckd = cKDTree(data)
    elif params['computation'] == 'KDTree':
        ckd = KDTree(data, metric=params['metric'])
    return ckd
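
A rough usage sketch, assuming create_tree above is importable; the params keys mirror the ones the function reads, and the PCA matrix is a random placeholder:

import numpy as np

pca = np.random.random((1000, 50)).astype('float32')
params = {
    'computation': 'pynndescent',
    'metric': 'euclidean',
    'pynndescent_n_neighbors': 30,
    'pynndescent_random_state': 0,
}
ckd = create_tree(pca, params)
# The pynndescent branch returns a prepared index that supports query().
neighbors, distances = ckd.query(pca, k=10)
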
Example #4
    def build(self, data, k):
        # These values were taken from UMAP, which we assume to be sensible defaults
        n_trees = 5 + int(round((data.shape[0])**0.5 / 20))
        n_iters = max(5, int(round(np.log2(data.shape[0]))))

        # Numba takes a while to load up, so there's little point in loading it
        # unless we're actually going to use it
        import pynndescent

        # UMAP uses the "alternative" algorithm, but that sometimes causes
        # memory corruption, so use the standard one, which seems to work fine
        self.index = pynndescent.NNDescent(
            data,
            n_neighbors=15,
            metric=self.metric,
            metric_kwds=self.metric_params,
            random_state=self.random_state,
            n_trees=n_trees,
            n_iters=n_iters,
            algorithm="standard",
            max_candidates=60,
            n_jobs=self.n_jobs,
        )

        indices, distances = self.index.query(data, k=k + 1)
        return indices[:, 1:], distances[:, 1:]
Example #5
File: seurat.py  Project: jksr/ALLCools
def find_nearest_anchor(data,
                        anchor_all,
                        data_qry,
                        ref,
                        qry,
                        key_correct='X_pca',
                        npc=30,
                        kweight=100,
                        sd=1,
                        random_state=0):
    print('Initialize')
    cum_ref, cum_qry = [0], [0]
    for xx in ref:
        cum_ref.append(cum_ref[-1] + data[xx].shape[0])
    for xx in qry:
        cum_qry.append(cum_qry[-1] + data[xx].shape[0])

    anchor = []
    for i, xx in enumerate(ref):
        for j, yy in enumerate(qry):
            if xx < yy:
                tmp = anchor_all[(xx, yy)].copy()
            else:
                tmp = anchor_all[(yy, xx)].copy()
                tmp[['x1', 'x2']] = tmp[['x2', 'x1']]
            tmp['x1'] += cum_ref[i]
            tmp['x2'] += cum_qry[j]
            anchor.append(tmp)
    anchor = pd.concat(anchor)
    score = anchor['score'].values
    anchor = anchor[['x1', 'x2']].values

    if key_correct == 'X':
        model = PCA(n_components=npc,
                    svd_solver='arpack',
                    random_state=random_state)
        reduce_qry = model.fit_transform(data_qry)
    else:
        reduce_qry = data_qry

    print('Find nearest anchors')
    index = pynndescent.NNDescent(reduce_qry[anchor[:, 1]],
                                  metric='euclidean',
                                  n_neighbors=kweight,
                                  random_state=random_state)
    G, D = index.query(reduce_qry, k=kweight)

    print('Normalize graph')
    cellfilter = (D[:, -1] == 0)
    D = (1 - D / D[:, -1][:, None]) * score[G]
    D[cellfilter] = score[G[cellfilter]]
    D = 1 - np.exp(-D * (sd**2) / 4)
    D = D / (np.sum(D, axis=1) + 1e-6)[:, None]

    return anchor, G, D, cum_qry
Example #6
    def fit(self, X):
        self._index = pynndescent.NNDescent(
            X,
            n_neighbors=self._n_neighbors,
            metric=self._pynnd_metric,
            low_memory=True,
            leaf_size=self._leaf_size,
            pruning_degree_multiplier=self._pruning_degree_multiplier,
            diversify_epsilon=self._diversify_epsilon,
            n_search_trees=self._n_search_trees,
            n_jobs=self._n_jobs)
        self._index._init_search_graph()
Example #7
    def _calculate_local_knn(self):
        """If k_local is provided, we calculate the local knn graph to
        evaluate whether the anchor preserves local structure within the dataset.
        One can use a different obsm with key_local to compute knn for each dataset.
        """
        if self.k_local is not None:
            print('Find neighbors within datasets')
            for adata in self.adata_list:
                index = pynndescent.NNDescent(adata.obsm[self.key_local],
                                              metric='euclidean',
                                              n_neighbors=self.k_local + 1,
                                              random_state=self.random_state)
                self.local_knn.append(index.neighbor_graph[0][:, 1:])
        else:
            self.local_knn = [None for _ in self.adata_list]
Example #8
    def fit(self, X):
        if self._pynnd_metric == 'jaccard':
            # Convert to sparse matrix format
            X = self._sparse_convert_for_fit(X)

        self._index = pynndescent.NNDescent(
            X,
            n_neighbors=self._n_neighbors,
            metric=self._pynnd_metric,
            low_memory=True,
            leaf_size=self._leaf_size,
            pruning_degree_multiplier=self._pruning_degree_multiplier,
            diversify_epsilon=self._diversify_epsilon,
            n_search_trees=self._n_search_trees,
            n_jobs=self._n_jobs)
        self._index._init_search_graph()
Example #9
def build_nndescent_idx(vecs, output_path, n_trees):
    import pynndescent

    start = time.time()
    ret = pynndescent.NNDescent(
        vecs.copy(),
        metric="dot",
        n_neighbors=100,
        n_trees=n_trees,
        diversify_prob=0.5,
        pruning_degree_multiplier=2.0,
        low_memory=False,
    )
    print("first phase done...")
    ret.prepare()
    print("prepare done... writing output...", output_path)
    end = time.time()
    difftime = end - start
    pickle.dump(ret, file=open(output_path, "wb"))
    return difftime
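
A hedged sketch of reloading and querying the pickled index written above; the path, dimensionality, and query vectors are placeholders:

import pickle
import numpy as np

with open("index.pkl", "rb") as f:
    index = pickle.load(f)

# Query vectors must match the dimensionality of the indexed vectors.
queries = np.random.random((5, 256)).astype('float32')
neighbors, distances = index.query(queries, k=10)
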
Example #10
def FM_to_p2p_aux(FM, eigvects1, eigvects2, use_ANN=False):
    """
    Obtain a point to point map from a functional map with another method.
    For each row in Phi2 @ C, looks for the nearest row in Phi1

    Parameters
    --------------------------
    FM        : (k2,k1) functional map in reduced basis
    eigvects1 : (n1,k1') first k' eigenvectors of the first basis  (k1'>k1).
                First dimension can be subsampled.
    eigvects2 : (n2,k2') first k' eigenvectors of the second basis (k2'>k2)
                First dimension can be subsampled.
    use_ANN   : Whether to use approximate nearest neighbors

    Outputs:
    --------------------------
    p2p       : (n2,) match vertex i on shape 2 to vertex p2p[i] on shape 1,
                or equivalent result if the eigenvectors are subsampled.
    """
    if use_ANN and not ANN:
        raise ValueError(
            'Please install pynndescent to use approximate nearest neighbors')

    k2, k1 = FM.shape

    assert k1 <= eigvects1.shape[1], \
        f'At least {k1} eigenvectors should be provided, here only {eigvects1.shape[1]} are given'
    assert k2 <= eigvects2.shape[1], \
        f'At least {k2} eigenvectors should be provided, here only {eigvects2.shape[1]} are given'

    if use_ANN:
        index = pynndescent.NNDescent(eigvects1[:, :k1], n_jobs=8)
        matches, _ = index.query(eigvects2[:, :k2] @ FM, k=1)  # (n2,1)
        matches = matches.flatten()  # (n2,)
    else:
        tree = KDTree(eigvects1[:, :k1])  # Tree on (n1,k1)
        matches = tree.query(eigvects2[:, :k2] @ FM,
                             k=1,
                             return_distance=False).flatten()  # (n2,)

    return matches  # (n2,)
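
A small sketch of calling FM_to_p2p_aux with random spectral data, assuming the function and its module-level imports (KDTree, pynndescent, the ANN flag) are in scope; all shapes are placeholders:

import numpy as np

k1, k2 = 30, 30
eigvects1 = np.random.randn(2000, 50)   # (n1, k1') eigenvectors of shape 1
eigvects2 = np.random.randn(1500, 50)   # (n2, k2') eigenvectors of shape 2
FM = np.random.randn(k2, k1)            # (k2, k1) functional map

# use_ANN=False takes the exact KDTree branch; pass use_ANN=True only if pynndescent is installed.
p2p = FM_to_p2p_aux(FM, eigvects1, eigvects2, use_ANN=False)
# p2p[i] is the vertex of shape 1 matched to vertex i of shape 2, so p2p.shape == (1500,)
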
Example #11
def filter_anchor(anchor,
                  adata_ref=None,
                  adata_qry=None,
                  high_dim_feature=None,
                  k_filter=200,
                  random_state=0):
    """
    Check whether an anchor is still an anchor when only
    the high_dim_feature columns are used to construct the KNN graph.
    If not, remove the anchor.
    """
    ref_data = normalize(adata_ref.X[:, high_dim_feature], axis=1)
    qry_data = normalize(adata_qry.X[:, high_dim_feature], axis=1)
    index = pynndescent.NNDescent(ref_data,
                                  metric='euclidean',
                                  n_neighbors=k_filter,
                                  random_state=random_state)
    G = index.query(qry_data, k=k_filter)[0]
    input_anchors = anchor.shape[0]
    anchor = np.array([xx for xx in anchor if (xx[0] in G[xx[1]])])
    print(f'Anchor selected with high CC feature graph: {anchor.shape[0]} / {input_anchors}')
    return anchor
Example #12
File: seurat.py  Project: jksr/ALLCools
def find_anchor(adata_list,
                k_local=50,
                key_local='X_pca',
                k_anchor=5,
                key_anchor='X',
                dimred='pca',
                max_cc_cell=20000,
                k_score=30,
                k_filter=200,
                scale1=False,
                scale2=False,
                ncc=30,
                n_features=200,
                alignments=None,
                random_state=0):
    nds = len(adata_list)
    ncell = [xx.shape[0] for xx in adata_list]

    # If k_local is provided, we calculate the local knn graph to
    # evaluate whether the anchor preserves local structure within the dataset.
    # One can use a different obsm with key_local to compute knn for each dataset
    if k_local:
        print('Find neighbors within datasets')
        Gp = []
        for i in range(nds):
            index = pynndescent.NNDescent(adata_list[i].obsm[key_local],
                                          metric='euclidean',
                                          n_neighbors=k_local + 1,
                                          random_state=random_state)
            Gp.append(index.neighbor_graph[0][:, 1:])
    else:
        Gp = [None for _ in range(nds)]

    if alignments is not None:
        all_pairs = []
        for pair in alignments:
            for xx in pair[0]:
                for yy in pair[1]:
                    if xx < yy:
                        all_pairs.append(f'{xx}-{yy}')
                    else:
                        all_pairs.append(f'{yy}-{xx}')
        all_pairs = np.unique(all_pairs)
    else:
        all_pairs = np.array([])

    print('Find anchors across datasets')
    anchor = {}
    for i in range(nds - 1):
        for j in range(i + 1, nds):
            if (alignments is not None) and (f'{i}-{j}' not in all_pairs):
                continue
            # run cca between datasets
            print('Run CCA')
            if key_anchor == 'X':
                # in case the adata var is not in the same order
                # select and order the var to make sure it is matched
                if (adata_list[i].shape[1] != adata_list[j].shape[1]) or (
                    (adata_list[i].var.index == adata_list[j].var.index).sum()
                        < adata_list[i].shape[1]):
                    sel_b = adata_list[i].var.index & adata_list[j].var.index
                    U = adata_list[i][:, sel_b].X.copy()
                    V = adata_list[j][:, sel_b].X.copy()
                else:
                    U = adata_list[i].X.copy()
                    V = adata_list[j].X.copy()
            else:
                U = adata_list[i].obsm[key_anchor]
                V = adata_list[j].obsm[key_anchor]

            if dimred == 'pca':
                U, V = cca(U,
                           V,
                           scale1=scale1,
                           scale2=scale2,
                           n_components=ncc)
            elif dimred == 'lsi':
                U, V = lsi_cca(U, V, n_components=ncc, max_cc_cell=max_cc_cell)

            # compute ccv feature loading
            high_dim_feature = np.array([])
            if k_filter:
                mat = np.concatenate([U, V], axis=0).T.dot(
                    np.concatenate([adata_list[i].X, adata_list[j].X], axis=0))
                high_dim_feature = top_features_idx(mat, n_features=n_features)

            # normalize ccv
            U = normalize(U, axis=1)
            V = normalize(V, axis=1)

            # find MNN as anchors
            print('Find Anchors')
            G11, G12, G21, G22 = find_neighbor(
                U, V, k=max([k_anchor, k_local, k_score, 50]))
            raw_anchors = find_mnn(G12, G21, k_anchor)

            # filter anchors by high dimensional neighbors
            if k_filter:
                if ncell[i] >= ncell[j]:
                    raw_anchors = filter_anchor(
                        anchor=raw_anchors,
                        adata_ref=adata_list[i],
                        adata_qry=adata_list[j],
                        high_dim_feature=high_dim_feature,
                        k_filter=k_filter)
                else:
                    raw_anchors = filter_anchor(
                        anchor=raw_anchors[:, ::-1],
                        adata_ref=adata_list[j],
                        adata_qry=adata_list[i],
                        high_dim_feature=high_dim_feature,
                        k_filter=k_filter)[:, ::-1]

            # score anchors with snn and local structure preservation
            print('Score Anchors')
            anchor_df = score_anchor(anchor=raw_anchors,
                                     G11=G11,
                                     G12=G12,
                                     G21=G21,
                                     G22=G22,
                                     k_score=k_score,
                                     k_local=k_local,
                                     Gp1=Gp[i],
                                     Gp2=Gp[j])
            anchor[(i, j)] = anchor_df.copy()

            # distance between datasets
            # dist.append(len(anchor[(i,j)]) / min([ncell[i], ncell[j]]))
            print(
                f'Identified {len(anchor[i, j])} anchors between datasets {i} and {j}.'
            )
    return anchor
Example #13
    def build(data, metadata=None, **kwargs):
        metadata = Index._get_valid_metadata(data, metadata)
        nnd_index = pynndescent.NNDescent(data, **kwargs)
        return NNDescentIndex(nnd_index, metadata)
Example #14
    def fit(self, X, k_NN):

        if self.verbose:
            timer_str = f"Finding {k_NN} approximate nearest neighbors using"
            timer_str += f" NNDescent and the '{self.metric}' metric..."
            timer = utl.Timer(timer_str, verbose=self.verbose)
            timer.__enter__()

        ## Get the data shape
        self.n_samples, self.n_features = X.shape[0], X.shape[1]

        k_NN = self._check_k(k_NN, self.n_samples)

        ## > These values were taken from UMAP, which we assume to be sensible
        ## > defaults [because the UMAP and pynndescent authors are the same.]
        ## - Pavlin Policar
        if self.n_trees is None:
            self.n_trees = 5 + int(round((self.n_samples**0.5) / 20))
        if self.n_iters is None:
            self.n_iters = max(5, int(round(np.log2(self.n_samples))))

        ## If `k_NN` > 15, use just the first 15 NN to build the approximate
        ## NN graph, then use query() to get the rest of the desired neighbors.
        if k_NN <= 15:
            k_build = k_NN + 1
        else:
            k_build = 15

        import pynndescent
        self.index = pynndescent.NNDescent(X,
                                           n_neighbors=k_build,
                                           metric=self.metric,
                                           metric_kwds=self.metric_params,
                                           random_state=self.random_state,
                                           n_trees=self.n_trees,
                                           n_iters=self.n_iters,
                                           n_jobs=self.n_jobs,
                                           verbose=self.verbose,
                                           **self.pynnd_kws)

        ## If k_NN <= 15, we're in the clear!
        NN_idx, distances = self.index.neighbor_graph

        ## ... Except when pynndescent fails, then it puts a -1 in the index.
        n_failures = np.sum(NN_idx == -1)

        ## If k_NN > 15, use query() to get the indices and distances
        if k_NN > 15:
            self.index.prepare()
            NN_idx, distances = self.index.query(X, k=k_NN + 1)

        ## If pynndescent fails to find neighbors for some points, raise ERROR.
        if n_failures > 0:
            err_str = "WARNING: `pynndescent` failed to find neighbors for all"
            err_str += " points in the data."

            if self.verbose >= 4:
                print_opt = np.get_printoptions()
                np.set_printoptions(threshold=np.inf)
                err_str += " The indices of the failed points are: "
                err_str += f"\n{np.where(np.sum(NN_idx == -1, axis=1))[0]}"
                np.set_printoptions(**print_opt)
            else:
                err_str += " Set verbose >= 4 to see the indices of the"
                err_str += " failed points."

            raise ValueError(err_str)

        if self.verbose:
            timer.__exit__()

        # return NN_idx[:, 1:], distances[:, 1:]
        self.kNN_idx = NN_idx[:, 1:]
        self.kNN_dst = distances[:, 1:]

        ## Return the indices of the nearest neighbors and the distances
        ## to those neighbors.
        return self.kNN_idx.copy(), self.kNN_dst.copy()
Example #15
def k_nearest_neighbors(data, k, max_distance=None, verbose=False):
    """Compute k-nearest neighbors for each row in data matrix.

    Computes the k-nearest neighbor graph of data matrix, under
    the Euclidean distance. Each row in the data matrix is treated as an item.

    Arguments
    ---------
    data: {torch.Tensor, np.ndarray, scipy.sparse matrix}(
            shape=(n_items, n_features))
        The data matrix
    k: int
        The number of nearest neighbors per item
    max_distance: float (optional)
        If not None, neighborhoods are restricted to have a radius
        no greater than `max_distance`.
    verbose: bool
        If True, print verbose output.

    Returns
    -------
    pymde.Graph
        a neighborhood graph
    """
    # lazy import, because importing pynndescent takes some time
    import pynndescent

    if isinstance(data, torch.Tensor):
        device = data.device
        data = data.cpu().numpy()
    else:
        device = "cpu"

    n = data.shape[0]
    if n < 10000:
        import sklearn.neighbors

        if verbose:
            problem.LOGGER.info("Exact nearest neighbors by brute force ")
        nn = sklearn.neighbors.NearestNeighbors(
            n_neighbors=k + 1, algorithm="brute"
        )
        nn.fit(data)
        distances, neighbors = nn.kneighbors(data)
    else:
        # TODO default params (n_trees, max_candidates)
        index = pynndescent.NNDescent(
            data,
            n_neighbors=k + 1,
            verbose=verbose,
            max_candidates=60,
        )
        neighbors, distances = index.neighbor_graph
    neighbors = neighbors[:, 1:]
    distances = distances[:, 1:]

    n = data.shape[0]
    items = np.arange(n)
    items = np.repeat(items, k)
    edges = np.stack([items, neighbors.flatten()], axis=1)

    flip_idx = edges[:, 0] > edges[:, 1]
    edges[flip_idx] = np.stack(
        [edges[flip_idx][:, 1], edges[flip_idx][:, 0]], axis=1
    )

    weights = torch.ones(edges.shape[0], device=device, dtype=torch.float)
    if max_distance is not None:
        weights[
            torch.tensor(distances.ravel(), device=device, dtype=torch.float)
            > max_distance
        ] = 0.0

    # weights for duplicated edges will be summed.
    edges = torch.tensor(edges, device=device)
    return Graph.from_edges(edges, weights)
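
A brief usage sketch with random data, assuming the surrounding pymde module (with its Graph class and logger) is importable:

import numpy as np

# More than 10000 rows, so the pynndescent branch above is exercised.
X = np.random.randn(20000, 50).astype('float32')
graph = k_nearest_neighbors(X, k=15)
# graph is a neighborhood graph with one edge per (item, neighbor) pair.
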
Example #16
import numpy as np
from tqdm import tqdm

import pyFM.spectral as spectral

try:
    import pynndescent
    index = pynndescent.NNDescent(np.random.random((100,3)),n_jobs=2)
    del index
    ANN = True
except ImportError:
    ANN = False


def zoomout_iteration(eigvects1, eigvects2, FM, step=1, A2=None, return_p2p=False, use_ANN=False):
    """
    Performs an iteration of ZoomOut.

    Parameters
    --------------------
    eigvects1  : (n1,k1') eigenvectors on source shape with k1' >= k1 + step.
                 Can be a subsample of the original ones on the first dimension.
    eigvects2  : (n2,k2') eigenvectors on target shape with k2' >= k2 + step.
                 Can be a subsample of the original ones on the first dimension.
    FM         : (k2,k1) Functional map from eigvects1[:,:k1] to eigvects2[:,:k2]
    step       : int - step of increase of dimension.
    A2         : (n2,n2) sparse area matrix on target mesh, for vertex to vertex computation.
                 If specified, the eigenvectors can't be subsampled!
    return_p2p : bool - if True returns the vertex to vertex map.
    use_ANN    : bool - if True, uses approximate nearest neighbor
Example #17
    def build(self, data, k):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors using NN descent approximate search using "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        # These values were taken from UMAP, which we assume to be sensible defaults
        n_trees = 5 + int(round((data.shape[0]) ** 0.5 / 20))
        n_iters = max(5, int(round(np.log2(data.shape[0]))))

        # Numba takes a while to load up, so there's little point in loading it
        # unless we're actually going to use it
        import pynndescent

        # Will use query() only for k>15
        if k <= 15:
            n_neighbors_build = k + 1
        else:
            n_neighbors_build = 15

        self.index = pynndescent.NNDescent(
            data,
            n_neighbors=n_neighbors_build,
            metric=self.metric,
            metric_kwds=self.metric_params,
            random_state=self.random_state,
            n_trees=n_trees,
            n_iters=n_iters,
            max_candidates=60,
            n_jobs=self.n_jobs,
            verbose=self.verbose > 1,
        )

        # -1 in indices means that pynndescent failed
        indices, distances = self.index.neighbor_graph
        mask = np.sum(indices == -1, axis=1) > 0

        if k > 15:
            indices, distances = self.index.query(data, k=k + 1)

        # As a workaround, we let the failed points group together
        if np.sum(mask) > 0:
            if self.verbose:
                opt = np.get_printoptions()
                np.set_printoptions(threshold=np.inf)
                warnings.warn(
                    f"`pynndescent` failed to find neighbors for some of the points. "
                    f"As a workaround, openTSNE considers all such points similar to "
                    f"each other, so they will likely form a cluster in the embedding."
                    f"The indices of the failed points are:\n{np.where(mask)[0]}"
                )
                np.set_printoptions(**opt)
            else:
                warnings.warn(
                    f"`pynndescent` failed to find neighbors for some of the points. "
                    f"As a workaround, openTSNE considers all such points similar to "
                    f"each other, so they will likely form a cluster in the embedding. "
                    f"Run with verbose=True, to see indices of the failed points."
                )
            distances[mask] = 1
            rs = check_random_state(self.random_state)
            fake_indices = rs.choice(
                np.sum(mask), size=np.sum(mask) * indices.shape[1], replace=True
            )
            fake_indices = np.where(mask)[0][fake_indices]
            indices[mask] = np.reshape(fake_indices, (np.sum(mask), indices.shape[1]))

        timer.__exit__()

        return indices[:, 1:], distances[:, 1:]
Example #18
    def fit(self, X):
        self._index = pynndescent.NNDescent(X,
                                            n_neighbors=self._n_neighbors,
                                            n_trees=self._n_trees,
                                            leaf_size=self._leaf_size,
                                            metric=self._pynnd_metric)
Example #19
def CorrelateStitchImages(dirname,
                          dirout,
                          stitchchannel,
                          chosenstitchgroup,
                          x1lim=-.05,
                          x2lim=1.05,
                          y1lim=-.05,
                          y2lim=1.05,
                          save_stitched=True,
                          save_multipage=True,
                          constant_size=250000000,
                          roll_ball=True,
                          use_gene_name=True,
                          save_merged=False):
    #dirname = os.path.expanduser('/wynton/group/ye/mtschmitz/images/MacaqueMotorCortex2/P2sagittal1_27_20200828/TR1.2020-09-03-01-35-13/')
    #Test params
    '''dirname = os.path.expanduser('/media/mt/Extreme SSD/MacaqueMotorCortex2/P2_OB_20200805/TR1.2020-08-06-23-10-18')
    dirout =os.path.expanduser('~/tmp/')
    stitchchannel='1'
    chosenstitchgroup='1'
    x1lim=0
    x2lim=1
    y1lim=0
    y2lim=1
    save_merged=False
    save_stitched=False
    save_multipage=True
    use_gene_name=True
    '''

    print(dirname, flush=True)
    ray.shutdown()
    num_cpus = 1  #psutil.cpu_count(logical=False)
    #set number of cores to use
    print('cpus:', num_cpus)
    ray.init(num_cpus=num_cpus)

    x1lim = float(x1lim)
    x2lim = float(x2lim)
    y1lim = float(y1lim)
    y2lim = float(y2lim)
    chosenstitchgroup = re.sub('TR_', "1", chosenstitchgroup)
    chosenstitchgroup = re.sub('TR', "", chosenstitchgroup)

    protocol = [x for x in os.listdir(dirname) if '.scanprotocol' in x][0]
    minoverlap = 0
    with open(os.path.join(dirname, protocol), 'r') as f:
        for line in f:
            try:
                #print(line)
                if 'MinOverlapPixel' in line:
                    minoverlap = float(line.split('>')[1].split('<')[0]) * 1.1
            except:
                pass

    n_locations = 0
    counting = False
    with open(os.path.join(dirname, protocol), 'r') as f:
        for line in f:
            try:
                if 'LocationIds' in line:
                    counting = ~counting
                elif counting:
                    n_locations += 1
            except:
                pass

    n_location = 1
    counting = False
    reference = False
    shapes = defaultdict(list)
    with open(os.path.join(dirname, protocol), 'r') as f:
        for line in f:
            try:
                if '<d2p1:ScanLocation>' in line:
                    counting = ~counting
                if counting and '<d10p1:_x>' in line:
                    x = float(line.split('>')[1].split('<')[0])
                if counting and '<d10p1:_y>' in line:
                    y = float(line.split('>')[1].split('<')[0])
                    shapes[str(n_location)].append((x, y))
                if '<d2p1:ReferencePoint ' in line:
                    counting = False
                    reference = True
                if reference and '<d10p1:_x>' in line:
                    xref = float(line.split('>')[1].split('<')[0])
                if reference and '<d10p1:_y>' in line:
                    yref = float(line.split('>')[1].split('<')[0])
                    reference = False
                    shapes[str(n_location)] = [
                        (x + xref, y + yref)
                        for x, y in shapes[str(n_location)]
                    ]
                    n_location += 1
                    xref = 0
                    yref = 0
            except:
                pass

    tags_to_get = [
        'SizeX', 'SizeY', 'ActualPositionX', 'ActualPositionY', 'Run Index',
        'Index', '"field" Index', 'TheC', 'AreaGrid AreaGridIndex', 'Name',
        '<Image ID="Image:" Name', 'ActualPositionZ'
    ]

    def get_tag(x, tp):
        return (re.search(x + '=' + '"([A-Za-z0-9_\./\\-]*)"', tp).group(1))

    @ray.remote
    def getTiffMetadata(f):
        f = open(f, 'rb')
        # Return Exif tags
        tags = exifread.process_file(f)
        tp = tags['Image ImageDescription'].values
        d = {}
        for tag in tags_to_get:
            d[tag] = get_tag(tag, tp)
        return (d)

    data = []
    for fname in sorted(os.listdir(dirname)):
        if fname.endswith(".TIF"):
            fpath = os.path.join(dirname, fname)
            path = os.path.normpath(dirname)
            d = path.split(os.sep)[-2]
            data.append((fname, d, fpath))
    imageFiles = pd.DataFrame(data, columns=['FileName', 'DirName', 'Path'])

    #normalize positions so starts at 0,0
    #imageFiles['ActualPositionX']=imageFiles['ActualPositionX']-imageFiles['ActualPositionX'].min()
    #imageFiles['ActualPositionY']=imageFiles['ActualPositionY']-imageFiles['ActualPositionY'].min()
    imageFiles = imageFiles.loc[['_R_' in x
                                 for x in imageFiles['FileName']], :]

    l = []
    for i in imageFiles.index:
        f = imageFiles.loc[i, 'Path']
        #d=getTiffMetadata(f)
        l.append(getTiffMetadata.remote(f))
        #print(d)
        #imageFiles.loc[i,d.keys()]=list(d.values())

    metadf = pd.DataFrame(ray.get(l))
    metadf.index = imageFiles.index
    imageFiles = imageFiles.join(metadf)
    ray.shutdown()

    imageFiles.rename(columns={'<Image ID="Image:" Name': 'Channel Name'},
                      inplace=True)

    imageFiles['ActualPositionX'] = imageFiles['ActualPositionX'].astype(float)
    imageFiles['ActualPositionY'] = imageFiles['ActualPositionY'].astype(float)
    #print(imageFiles['ActualPositionX'])
    #print(imageFiles['ActualPositionY'])
    imageFiles['SizeX'] = imageFiles['SizeX'].astype(float)
    imageFiles['SizeY'] = imageFiles['SizeY'].astype(float)
    imageFiles.sort_values(by=['"field" Index', 'Channel Name'], inplace=True)
    print(imageFiles, flush=True)
    #Max size
    chif = imageFiles  #.loc[imageFiles['Channel Name']==stitchchannel,:]
    chif['x1pix'] = 0
    chif['x2pix'] = 0
    chif['y1pix'] = 0
    chif['y2pix'] = 0
    xsize = imageFiles['SizeX'].value_counts().idxmax()
    ysize = imageFiles['SizeY'].value_counts().idxmax()
    xend = len(chif.ActualPositionX.unique()) * xsize
    yend = len(chif.ActualPositionY.unique()) * ysize

    #assume adjacent images are taken sequentially
    xd = []
    yd = []
    for i in range(len(chif['"field" Index'].unique()) - 1):
        indi = chif['"field" Index'] == list(chif['"field" Index'])[i]
        indip1 = chif['"field" Index'] == list(chif['"field" Index'])[i + 1]
        xdiff = np.abs(
            list(chif.loc[indi, 'ActualPositionX'])[0] -
            list(chif.loc[indip1, 'ActualPositionX'])[0])
        ydiff = np.abs(
            list(chif.loc[indi, 'ActualPositionY'])[0] -
            list(chif.loc[indip1, 'ActualPositionY'])[0])
        xd.append(xdiff)
        yd.append(ydiff)
    xd = np.array(xd)
    yd = np.array(yd)
    #nonzero (>10) median is the distance between images
    xmedian = np.median(xd[xd > 10])
    ymedian = np.median(yd[yd > 10])
    #for reference xsize-minoverlap=xmedian
    print('MEDIANS')
    print(xmedian, ymedian)

    from shapely.geometry import Point
    from shapely.geometry.polygon import Polygon
    polygon = Polygon(shapes[chosenstitchgroup])
    #Buffer expands, scale doesn't work for expanding linear regions with no points
    buffgon = Polygon(polygon.buffer(min(xmedian, ymedian)).exterior)
    #polygon=shapely.affinity.scale(polygon,xfact=1.2,yfact=1.2)
    inside = [
        buffgon.contains(Point(x, y))
        for x, y in list(zip(chif['ActualPositionX'], chif['ActualPositionY']))
    ]

    matplotlib.pyplot.scatter(list(chif['ActualPositionX']),
                              list(chif['ActualPositionY']))
    matplotlib.pyplot.scatter([x[0] for x in shapes[chosenstitchgroup]],
                              [x[1] for x in shapes[chosenstitchgroup]])
    matplotlib.pyplot.scatter(buffgon.exterior.coords.xy[0],
                              buffgon.exterior.coords.xy[1])
    matplotlib.pyplot.savefig(os.path.join(dirout, 'BufferPolygon.png'))
    matplotlib.pyplot.close()
    chif = chif.loc[inside, :]

    #normalize positions so starts at 0,0
    chif['ActualPositionX'] = chif['ActualPositionX'] - chif[
        'ActualPositionX'].min()
    chif['ActualPositionY'] = chif['ActualPositionY'] - chif[
        'ActualPositionY'].min()

    xorder = dict(
        zip(np.sort(chif['ActualPositionX'].unique()),
            np.sort(chif['ActualPositionX'].unique().argsort())))
    yorder = dict(
        zip(np.sort(chif['ActualPositionY'].unique()),
            np.sort(chif['ActualPositionY'].unique().argsort())))

    for i in chif.index:
        x = chif.loc[
            i,
            'ActualPositionX'] / xmedian  #xorder[chif.loc[i,'ActualPositionX']]
        y = chif.loc[
            i,
            'ActualPositionY'] / ymedian  #yorder[chif.loc[i,'ActualPositionY']]
        x1 = int((xsize * x) - (x * minoverlap))
        x2 = x1 + int(xsize)
        y1 = int((ysize * y) - (y * minoverlap))
        y2 = y1 + int(ysize)
        chif.loc[i, 'x1pix'] = x1
        chif.loc[i, 'x2pix'] = x2
        chif.loc[i, 'y1pix'] = y1
        chif.loc[i, 'y2pix'] = y2

    cf = chif.loc[chif['TheC'] == stitchchannel, :]
    cf['Image'] = None
    for i in tqdm.tqdm(cf.index):
        img = cv2.imread(cf.loc[i, 'Path'])[:, :, 0].T
        cf.loc[i, 'Image'] = [img]
    print(chif, flush=True)
    #print(cf.loc[2,'Image'].shape)
    #plt.imshow(cf.loc[2,'Image'],origin='lower')

    index = pynndescent.NNDescent(cf.loc[:, ['x1pix', 'y1pix']], n_neighbors=9)
    nn = index.query(cf.loc[:, ['x1pix', 'y1pix']], k=9)[0][:, 1:]

    g = networkx.DiGraph().to_undirected()
    for i, x in enumerate(nn):
        for j in x:
            g.add_edge(i, j)
    g = g.to_undirected()

    x1i = np.where(cf.columns == 'x1pix')[0][0]
    x2i = np.where(cf.columns == 'x2pix')[0][0]
    y1i = np.where(cf.columns == 'y1pix')[0][0]
    y2i = np.where(cf.columns == 'y2pix')[0][0]
    imgi = np.where(cf.columns == 'Image')[0][0]

    #could be modified with inner loop to get
    #average correlation of all channels
    #Downside of course is 4x processing time
    def correlateOffsets(x):
        xoffset, yoffset = x[0], x[1]
        for i in cf.index:
            x = cf.loc[
                i,
                'ActualPositionX'] / xmedian  #xorder[chif.loc[i,'ActualPositionX']]
            y = cf.loc[
                i,
                'ActualPositionY'] / ymedian  #yorder[chif.loc[i,'ActualPositionY']]
            x1 = int((xsize * x) - x * int(xoffset))
            x2 = x1 + int(xsize)
            y1 = int((ysize * y) - y * int(yoffset))
            y2 = y1 + int(ysize)
            cf.loc[i, 'x1pix'] = x1
            cf.loc[i, 'x2pix'] = x2
            cf.loc[i, 'y1pix'] = y1
            cf.loc[i, 'y2pix'] = y2

        i_vect = []
        j_vect = []
        for i, j in list(g.edges):
            ix1i = cf.iloc[i, x1i]
            iy1i = cf.iloc[i, y1i]
            ix2i = cf.iloc[i, x2i]
            iy2i = cf.iloc[i, y2i]
            jx1i = cf.iloc[j, x1i]
            jy1i = cf.iloc[j, y1i]
            jx2i = cf.iloc[j, x2i]
            jy2i = cf.iloc[j, y2i]

            p = Polygon([(ix1i, iy1i), (ix2i, iy1i), (ix2i, iy2i),
                         (ix1i, iy2i)])

            q = Polygon([(jx1i, jy1i), (jx2i, jy1i), (jx2i, jy2i),
                         (jx1i, jy2i)])

            if p.intersects(q):
                pqi = p.intersection(q)
                #x1,y1,x2,y2
                bounds = pqi.bounds
                ix1b, iy1b, ix2b, iy2b = bounds[0] - ix1i, bounds[
                    1] - iy1i, bounds[2] - ix2i, bounds[3] - iy2i
                jx1b, jy1b, jx2b, jy2b = bounds[0] - jx1i, bounds[
                    1] - jy1i, bounds[2] - jx2i, bounds[3] - jy2i

                rxi = np.intersect1d(np.arange(ix1i, ix2i),
                                     np.arange(bounds[0], bounds[2])) - ix1i
                rxj = np.intersect1d(np.arange(jx1i, jx2i),
                                     np.arange(bounds[0], bounds[2])) - jx1i
                ryi = np.intersect1d(np.arange(iy1i, iy2i),
                                     np.arange(bounds[1], bounds[3])) - iy1i
                ryj = np.intersect1d(np.arange(jy1i, jy2i),
                                     np.arange(bounds[1], bounds[3])) - jy1i
                rxi = rxi.astype(int)
                rxj = rxj.astype(int)
                ryi = ryi.astype(int)
                ryj = ryj.astype(int)

                if len(rxi) > 0 and len(rxj) > 0:
                    i_vect.append(
                        list(cf.iloc[i, imgi][rxi, ryi[:,
                                                       np.newaxis]].flatten()))
                    j_vect.append(
                        list(cf.iloc[j, imgi][rxj, ryj[:,
                                                       np.newaxis]].flatten()))
                    #plt.imshow(cf.iloc[i,imgi][rxi,ryi[:,np.newaxis]],origin='lower')
                    #plt.show()
                    #plt.imshow(cf.iloc[j,imgi][rxj,ryj[:,np.newaxis]],origin='lower')
                    #plt.show()
                #x,y = p.exterior.xy
                #plt.plot(x,y)
                #x,y = q.exterior.xy
                #plt.plot(x,y)
                #x,y = pqi.exterior.xy
                #plt.plot(x,y)
                #plt.show()

        i_vect = [item for sublist in i_vect for item in sublist]
        j_vect = [item for sublist in j_vect for item in sublist]
        corr = 1 - np.corrcoef(i_vect, j_vect)[1, 0]
        #print(corr)
        return (corr)

    #opt=scipy.optimize.brute(correlateOffsets,(slice(30,500),slice(30,500)),full_output=True,disp=False,workers=num_cpus)
    opt = scipy.optimize.minimize(correlateOffsets,
                                  (minoverlap * 2.5, minoverlap * 2.5),
                                  method='Nelder-Mead',
                                  options={
                                      'xtol': .9,
                                      'ftol': .05
                                  })

    chif['final_identifier'] = chif['TheC']
    if use_gene_name:
        l = []
        for x in chif['FileName']:
            if x.startswith('L_'):
                l.append('1')
                continue
            if x.startswith('R_'):
                l.append('2')
                continue
            if '_TR_' in x:
                l.append('1')
                continue
            if '_TD_' in x:
                l.append('1')
                continue
            if 'TR' in x or 'TD' in x:
                groupnum = re.sub('TR|TD', '',
                                  re.search('TD[0-9]|TR[0-9]', x).group(0))
                l.append(groupnum)
                continue
            l.append(chosenstitchgroup)

        chif['stitchgroup'] = l
        tmppath = os.path.expanduser('~/imagingmetadata.csv')
        if os.path.exists(tmppath):
            refdf = pd.read_csv(tmppath, sep='\t')
        else:
            refdf = pd.read_csv(
                'https://docs.google.com/spreadsheets/d/e/2PACX-1vSYbvCJpS-GfRKuGgs2IBH7MD1KtDPDqs7ePqQJ1PyrMKp7f7z7ZpY4WtMFGPxU4mWbnRHgBl4PtaeH/pub?output=tsv&gid=1520792104',
                sep='\t')
            refdf.to_csv(tmppath, sep='\t')

        refdf.rename(columns={
            'Channel0': '0',
            'Channel1': '1',
            'Channel2': '2',
            'Channel3': '3'
        },
                     inplace=True)
        chif = pd.merge(chif,
                        refdf,
                        how='left',
                        left_on=['DirName', 'stitchgroup'],
                        right_on=['DirName', 'SlidePosition.1isL'])
        chif['gene'] = list(
            [chif.loc[chif.index[i], x] for i, x in enumerate(chif['TheC'])])
        chif['final_identifier'] = chif['gene']
    else:
        chif['final_identifier'] = chif['TheC']

    xoffset = opt.x[0]
    yoffset = opt.x[1]
    #xoffset=218
    #yoffset=209

    for i in chif.index:
        x = chif.loc[
            i,
            'ActualPositionX'] / xmedian  #xorder[chif.loc[i,'ActualPositionX']]
        y = chif.loc[
            i,
            'ActualPositionY'] / ymedian  #yorder[chif.loc[i,'ActualPositionY']]
        x1 = int((xsize * x) - x * int(xoffset))
        x2 = x1 + int(xsize)
        y1 = int((ysize * y) - y * int(yoffset))
        y2 = y1 + int(ysize)
        chif.loc[i, 'x1pix'] = x1
        chif.loc[i, 'x2pix'] = x2
        chif.loc[i, 'y1pix'] = y1
        chif.loc[i, 'y2pix'] = y2
    print('before min subtract')
    print(chif)
    xmin = chif['x1pix'].min()
    ymin = chif['y1pix'].min()

    chif['x1pix'] = chif['x1pix'] - xmin
    chif['x2pix'] = chif['x2pix'] - xmin
    chif['y1pix'] = chif['y1pix'] - ymin
    chif['y2pix'] = chif['y2pix'] - ymin

    def write_stitchy(chdf, infile, keyname='FileName'):
        with open(infile, 'w') as the_file:
            the_file.write('dim = 2\n')
            for i in chdf.index:
                cur = chdf.loc[i, :]
                the_file.write(cur[keyname] + '; ; (' + str(cur['x1pix']) +
                               ',' + str(cur['y1pix']) + ') \n')

    #for imagej merge on the fly
    for c in chif['final_identifier'].unique():
        cf = chif.loc[chif['final_identifier'] == c, :]
        infile = os.path.join(
            dirout,
            str(chosenstitchgroup) + '_' + stitchchannel + '.stitchy')
        write_stitchy(cf, infile, keyname='FileName')

    print(chif)
    #Or write whole file
    if save_stitched:
        for c in chif['final_identifier'].unique():
            cf = chif.loc[chif['final_identifier'] == c, :]
            cf['Image'] = None
            for i in cf.index:
                img = cv2.imread(cf.loc[i, 'Path'])[:, :, 0].T
                cf.loc[i, 'Image'] = [img]
            newimg = np.zeros(
                (int(np.nanmax(cf['x2pix'])), int(np.nanmax(cf['y2pix']))),
                np.uint8)
            divisor = np.zeros(
                (int(np.nanmax(cf['x2pix'])), int(np.nanmax(cf['y2pix']))),
                np.uint8)
            for i in cf.index:
                x1, x2, y1, y2 = cf.loc[i,
                                        ['x1pix', 'x2pix', 'y1pix', 'y2pix']]
                newimg[x1:x2, y1:y2] += cf.loc[i, 'Image']
                divisor[x1:x2, y1:y2] += 1
            im = np.nan_to_num(np.divide(newimg, divisor).T,
                               nan=0).astype(np.uint8)
            cur_size = im.shape[0] * im.shape[1]
            if constant_size < cur_size:
                scale_percent = np.sqrt(constant_size /
                                        cur_size)  # percent of original size
                width = int(im.shape[1] * scale_percent)
                height = int(im.shape[0] * scale_percent)
                dim = (width, height)
                # resize image
                im = cv2.resize(im, dim, interpolation=cv2.INTER_LINEAR)
            #from PIL import Image
            print('background subbing', flush=True)
            #import skimage
            #from skimage import morphology
            #im=im-skimage.morphology.rolling_ball(im,radius=100)
            #subtract_background(im,radius=100,light_bg=False)
            im = im.astype(np.uint16)
            tifffile.imsave(os.path.join(dirout, c + '_stitched.TIF'),
                            im,
                            compress=6)
            if roll_ball:
                RollingBallIJ(os.path.join(dirout, c + '_stitched.TIF'))
            print('background subbed', flush=True)
    '''
    if save_merged:
        imgs={}
        for c in sorted(chif['final_identifier'].unique()):
            cf=chif.loc[chif['final_identifier']==c,:]
            cf['Image']=None
            for i in tqdm.tqdm(cf.index):
                img=cv2.imread(cf.loc[i,'Path'])[:,:,0].T
                cf.loc[i,'Image']=[img]
            newimg=np.zeros((int(cf['x2pix'].max()),int(cf['y2pix'].max())), np.uint8)
            divisor=np.zeros((int(cf['x2pix'].max()),int(cf['y2pix'].max())), np.uint8)
            for i in cf.index:
                x1,x2,y1,y2=cf.loc[i,['x1pix','x2pix','y1pix','y2pix']]
                newimg[x1:x2,y1:y2]+=cf.loc[i,'Image']
                divisor[x1:x2,y1:y2]+=1
                imgs[c]=np.nan_to_num(np.divide(newimg,divisor).T,nan=0).astype(np.uint8)
        tifffile.imsave(os.path.join(dirout,'merged_stitched.TIF'),list(imgs.values()),metadata={'Test':'YES,No','Value':100},compress=6)
    '''

    if save_multipage:
        l = []
        for f in tqdm.tqdm(cf['"field" Index'].unique()):
            print(f, flush=True)
            cf = chif.loc[chif['"field" Index'] == f, :]
            cf = cf.sort_values(by='final_identifier', axis=0)
            cf['Image'] = None
            for i in cf.index:
                img = cv2.imread(cf.loc[i, 'Path'])[:, :, 0]
                cf.loc[i, 'Image'] = [img]
            #print(cf)
            imgs = {}
            for i in cf.index:
                imgs[cf.loc[i, 'final_identifier']] = cf.loc[i, 'Image']  #
            metadata = {'channel_names': ','.join(list(imgs.keys()))}
            metadata.update(cf.loc[i, [
                'DirName', 'SizeX', 'SizeY', 'ActualPositionX',
                'ActualPositionY', '"field" Index', 'ActualPositionZ', 'x1pix',
                'y1pix'
            ]].astype(str).to_dict())
            tifffile.imsave(os.path.join(dirout, f + '_merged.TIF'),
                            list(imgs.values()),
                            metadata=metadata,
                            compress=6)
            l.append(
                [f + '_merged.TIF', cf.loc[i, 'x1pix'], cf.loc[i, 'y1pix']])
        infile = os.path.join(dirout,
                              '_'.join(list(imgs.keys())) + '_merged.stitchy')
        write_stitchy(pd.DataFrame(l, columns=['FileName', 'x1pix', 'y1pix']),
                      infile,
                      keyname='FileName')
Example #20
def generate_triplets(key,
                      inputs,
                      n_inliers,
                      n_outliers,
                      n_random,
                      weight_temp=0.5,
                      distance='euclidean',
                      verbose=False):
    """Generate triplets.

  Args:
    key: Random key.
    inputs: Input points.
    n_inliers: Number of inliers.
    n_outliers: Number of outliers.
    n_random: Number of random triplets per point.
    weight_temp: Temperature of the log transformation on the weights.
    distance: Distance type.
    verbose: Whether to print progress.

  Returns:
    triplets and weights
  """
    n_points = inputs.shape[0]
    n_extra = min(n_inliers + 50, n_points)
    index = pynndescent.NNDescent(inputs, metric=distance)
    index.prepare()
    neighbors = index.query(inputs, n_extra)[0]
    neighbors = np.concatenate(
        (np.arange(n_points).reshape([-1, 1]), neighbors), 1)
    if verbose:
        logging.info('found nearest neighbors')
    distance_fn = get_distance_fn(distance)
    # compute scaled neighbors and the scale parameter
    knn_distances, neighbors, sig = find_scaled_neighbors(
        inputs, neighbors, distance_fn)
    neighbors = neighbors[:, :n_inliers + 1]
    knn_distances = knn_distances[:, :n_inliers + 1]
    key, use_key = random.split(key)
    triplets = sample_knn_triplets(use_key, neighbors, n_inliers, n_outliers)
    weights = find_triplet_weights(inputs,
                                   triplets,
                                   neighbors[:, 1:n_inliers + 1],
                                   distance_fn,
                                   sig,
                                   distances=knn_distances[:, 1:n_inliers + 1])
    flip = weights < 0
    anchors, pairs = triplets[:, 0].reshape([-1, 1]), triplets[:, 1:]
    pairs = jnp.where(jnp.tile(flip.reshape([-1, 1]), [1, 2]),
                      jnp.fliplr(pairs), pairs)
    triplets = jnp.concatenate((anchors, pairs), 1)

    if n_random > 0:
        key, use_key = random.split(key)
        rand_triplets, rand_weights = sample_random_triplets(
            use_key, inputs, n_random, distance_fn, sig)

        triplets = jnp.concatenate((triplets, rand_triplets), 0)
        weights = jnp.concatenate((weights, 0.1 * rand_weights))

    weights -= jnp.min(weights)
    weights = tempered_log(1. + weights, weight_temp)
    return triplets, weights