def find_neighbor(cc1, cc2, k, random_state=0):
    """
    Find all four sets of neighbors between two datasets.

    Parameters
    ----------
    cc1
        cc for dataset 1
    cc2
        cc for dataset 2
    k
        number of neighbors

    Returns
    -------
    G11, G12, G21, G22
        neighbor matrices in shape (n_cell, k)
    """
    index = pynndescent.NNDescent(cc1,
                                  metric='euclidean',
                                  n_neighbors=k + 1,
                                  random_state=random_state)
    G11 = index.neighbor_graph[0][:, 1:k + 1]
    G21 = index.query(cc2, k=k)[0]
    index = pynndescent.NNDescent(cc2,
                                  metric='euclidean',
                                  n_neighbors=k + 1,
                                  random_state=random_state)
    G22 = index.neighbor_graph[0][:, 1:k + 1]
    G12 = index.query(cc1, k=k)[0]
    return G11, G12, G21, G22
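# Usage sketch (added for illustration, not from the original source): the
# four graphs give within- and cross-dataset neighbours; mutual nearest
# neighbours can then be read off G12/G21. Random data stands in for real
# canonical-correlation coordinates.
def _demo_find_neighbor():
    import numpy as np
    cc1 = np.random.rand(200, 30)
    cc2 = np.random.rand(150, 30)
    G11, G12, G21, G22 = find_neighbor(cc1, cc2, k=10)
    # Each cross graph has shape (n_cells_in_query_set, k)
    assert G12.shape == (200, 10) and G21.shape == (150, 10)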
def fit(self, X): if self._pynnd_metric == "jaccard": # Convert to sparse matrix format X = self._sparse_convert_for_fit(X) self._index = pynndescent.NNDescent( X, n_neighbors=self._n_neighbors, metric=self._pynnd_metric, low_memory=True, leaf_size=self._leaf_size, pruning_degree_multiplier=self._pruning_degree_multiplier, diversify_prob=self._diversify_prob, n_search_trees=self._n_search_trees, compressed=True, verbose=True, ) if hasattr(self._index, "prepare"): self._index.prepare() else: self._index._init_search_graph() if self._index._is_sparse: if hasattr(self._index, "_init_sparse_search_function"): self._index._init_sparse_search_function() else: if hasattr(self._index, "_init_search_function"): self._index._init_search_function()
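# Side sketch (added; not part of the original source): pynndescent's
# 'jaccard' metric operates on sparse binary data, which is why fit() above
# converts first. A minimal version of that conversion with scipy:
def _demo_sparse_jaccard_index(X_binary, n_neighbors=10):
    import pynndescent
    import scipy.sparse
    X_sparse = scipy.sparse.csr_matrix(X_binary)
    index = pynndescent.NNDescent(X_sparse, n_neighbors=n_neighbors,
                                  metric='jaccard', low_memory=True)
    index.prepare()
    return index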
def create_tree(data, params):
    '''
    Create a faiss/cKDTree/KDTree/annoy/pynndescent index for nearest
    neighbour lookup. All undescribed input as in ``bbknn.bbknn()``.
    Returns the resulting index.

    Input
    -----
    data : ``numpy.array``
        PCA coordinates of a batch's cells to index.
    params : ``dict``
        A dictionary of arguments used to call ``bbknn.matrix.bbknn()``,
        plus ['computation'] storing the knn algorithm to use.
    '''
    if params['computation'] == 'annoy':
        ckd = AnnoyIndex(data.shape[1], metric=params['metric'])
        for i in np.arange(data.shape[0]):
            ckd.add_item(i, data[i, :])
        ckd.build(params['annoy_n_trees'])
    elif params['computation'] == 'pynndescent':
        ckd = pynndescent.NNDescent(
            data,
            metric=params['metric'],
            n_jobs=-1,
            n_neighbors=params['pynndescent_n_neighbors'],
            random_state=params['pynndescent_random_state'])
        ckd.prepare()
    elif params['computation'] == 'faiss':
        ckd = faiss.IndexFlatL2(data.shape[1])
        ckd.add(data)
    elif params['computation'] == 'cKDTree':
        ckd = cKDTree(data)
    elif params['computation'] == 'KDTree':
        ckd = KDTree(data, metric=params['metric'])
    return ckd
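# Usage sketch (added for illustration): calling create_tree() with the
# pynndescent backend. The params values below are assumptions standing in
# for what bbknn.matrix.bbknn() would normally pass.
def _demo_create_tree():
    import numpy as np
    pca = np.random.rand(500, 50)
    params = {
        'computation': 'pynndescent',
        'metric': 'euclidean',
        'pynndescent_n_neighbors': 30,
        'pynndescent_random_state': 0,
    }
    ckd = create_tree(pca, params)
    # The prepared index can then be queried for neighbours of any points
    indices, distances = ckd.query(pca[:5], k=10)
    return indices, distances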
def build(self, data, k):
    # These values were taken from UMAP, which we assume to be sensible defaults
    n_trees = 5 + int(round((data.shape[0])**0.5 / 20))
    n_iters = max(5, int(round(np.log2(data.shape[0]))))

    # Numba takes a while to load up, so there's little point in loading it
    # unless we're actually going to use it
    import pynndescent

    # UMAP uses the "alternative" algorithm, but that sometimes causes
    # memory corruption, so use the standard one, which seems to work fine
    self.index = pynndescent.NNDescent(
        data,
        n_neighbors=15,
        metric=self.metric,
        metric_kwds=self.metric_params,
        random_state=self.random_state,
        n_trees=n_trees,
        n_iters=n_iters,
        algorithm="standard",
        max_candidates=60,
        n_jobs=self.n_jobs,
    )

    indices, distances = self.index.query(data, k=k + 1)
    return indices[:, 1:], distances[:, 1:]
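# Standalone sketch (added for illustration, not from the original source):
# the same build step without the surrounding class. NNDescent computes the
# k-NN graph of its training set directly, so neighbor_graph can replace the
# query() round-trip when only the training points' neighbours are needed.
def _demo_nndescent_build(data, k, random_state=None):
    import numpy as np
    import pynndescent
    n_trees = 5 + int(round(data.shape[0] ** 0.5 / 20))
    n_iters = max(5, int(round(np.log2(data.shape[0]))))
    index = pynndescent.NNDescent(data, n_neighbors=k + 1, n_trees=n_trees,
                                  n_iters=n_iters, random_state=random_state)
    indices, distances = index.neighbor_graph
    # Column 0 is each point itself; drop it to mirror build() above
    return indices[:, 1:], distances[:, 1:]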
def find_nearest_anchor(data, anchor_all, data_qry, ref, qry,
                        key_correct='X_pca', npc=30, kweight=100, sd=1,
                        random_state=0):
    print('Initialize')
    cum_ref, cum_qry = [0], [0]
    for xx in ref:
        cum_ref.append(cum_ref[-1] + data[xx].shape[0])
    for xx in qry:
        cum_qry.append(cum_qry[-1] + data[xx].shape[0])

    anchor = []
    for i, xx in enumerate(ref):
        for j, yy in enumerate(qry):
            if xx < yy:
                tmp = anchor_all[(xx, yy)].copy()
            else:
                tmp = anchor_all[(yy, xx)].copy()
                tmp[['x1', 'x2']] = tmp[['x2', 'x1']]
            tmp['x1'] += cum_ref[i]
            tmp['x2'] += cum_qry[j]
            anchor.append(tmp)
    anchor = pd.concat(anchor)
    score = anchor['score'].values
    anchor = anchor[['x1', 'x2']].values

    if key_correct == 'X':
        model = PCA(n_components=npc, svd_solver='arpack',
                    random_state=random_state)
        reduce_qry = model.fit_transform(data_qry)
    else:
        reduce_qry = data_qry

    print('Find nearest anchors')
    index = pynndescent.NNDescent(reduce_qry[anchor[:, 1]],
                                  metric='euclidean',
                                  n_neighbors=kweight,
                                  random_state=random_state)
    G, D = index.query(reduce_qry, k=kweight)

    print('Normalize graph')
    # Guard cells whose k-th anchor distance is zero before dividing by it
    cellfilter = (D[:, -1] == 0)
    D = (1 - D / D[:, -1][:, None]) * score[G]
    D[cellfilter] = score[G[cellfilter]]
    D = 1 - np.exp(-D * (sd**2) / 4)
    D = D / (np.sum(D, axis=1) + 1e-6)[:, None]
    return anchor, G, D, cum_qry
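# Toy sketch (added): the distance-to-weight transform used above, in
# isolation. Distances are rescaled against each cell's k-th anchor distance,
# multiplied by the anchor scores, squashed through 1 - exp(...), and
# row-normalized so each cell's anchor weights sum to ~1.
def _demo_anchor_weights():
    import numpy as np
    D = np.array([[0.0, 1.0, 2.0], [0.5, 1.0, 2.0]])  # distances to k anchors
    G = np.array([[0, 1, 1], [1, 0, 0]])              # anchor indices
    score = np.array([0.9, 0.8])                      # per-anchor scores
    sd = 1
    W = (1 - D / D[:, -1][:, None]) * score[G]
    W = 1 - np.exp(-W * (sd ** 2) / 4)
    W = W / (np.sum(W, axis=1) + 1e-6)[:, None]
    assert np.allclose(W.sum(axis=1), 1, atol=1e-3)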
def fit(self, X):
    self._index = pynndescent.NNDescent(
        X,
        n_neighbors=self._n_neighbors,
        metric=self._pynnd_metric,
        low_memory=True,
        leaf_size=self._leaf_size,
        pruning_degree_multiplier=self._pruning_degree_multiplier,
        diversify_epsilon=self._diversify_epsilon,
        n_search_trees=self._n_search_trees,
        n_jobs=self._n_jobs)
    self._index._init_search_graph()
def _calculate_local_knn(self):
    """If k_local is provided, we calculate the local knn graph to evaluate
    whether the anchor preserves local structure within the dataset.
    One can use a different obsm with key_local to compute knn for each dataset.
    """
    if self.k_local is not None:
        print('Find neighbors within datasets')
        for adata in self.adata_list:
            index = pynndescent.NNDescent(adata.obsm[self.key_local],
                                          metric='euclidean',
                                          n_neighbors=self.k_local + 1,
                                          random_state=self.random_state)
            self.local_knn.append(index.neighbor_graph[0][:, 1:])
    else:
        self.local_knn = [None for _ in self.adata_list]
def fit(self, X):
    if self._pynnd_metric == 'jaccard':
        # Convert to sparse matrix format
        X = self._sparse_convert_for_fit(X)
    self._index = pynndescent.NNDescent(
        X,
        n_neighbors=self._n_neighbors,
        metric=self._pynnd_metric,
        low_memory=True,
        leaf_size=self._leaf_size,
        pruning_degree_multiplier=self._pruning_degree_multiplier,
        diversify_epsilon=self._diversify_epsilon,
        n_search_trees=self._n_search_trees,
        n_jobs=self._n_jobs)
    self._index._init_search_graph()
def build_nndescent_idx(vecs, output_path, n_trees):
    import pickle
    import time

    import pynndescent

    start = time.time()
    ret = pynndescent.NNDescent(
        vecs.copy(),
        metric="dot",
        n_neighbors=100,
        n_trees=n_trees,
        diversify_prob=0.5,
        pruning_degree_multiplier=2.0,
        low_memory=False,
    )
    print("first phase done...")
    ret.prepare()
    print("prepare done... writing output...", output_path)
    end = time.time()
    difftime = end - start
    pickle.dump(ret, file=open(output_path, "wb"))
    return difftime
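# Usage sketch (added; the path and sizes are placeholders): building and
# pickling a "dot"-metric index over random unit-length vectors.
def _demo_build_nndescent_idx(output_path="/tmp/nnd_index.pkl"):
    import numpy as np
    vecs = np.random.rand(1000, 64).astype(np.float32)
    vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)  # normalize for "dot"
    seconds = build_nndescent_idx(vecs, output_path, n_trees=8)
    print(f"index built and written in {seconds:.1f}s")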
def FM_to_p2p_aux(FM, eigvects1, eigvects2, use_ANN=False):
    """
    Obtain a point to point map from a functional map with another method.
    For each row in Phi2 @ C, looks for the nearest row in Phi1.

    Parameters
    --------------------------
    FM        : (k2,k1) functional map in reduced basis
    eigvects1 : (n1,k1') first k' eigenvectors of the first basis (k1' > k1).
                First dimension can be subsampled.
    eigvects2 : (n2,k2') first k' eigenvectors of the second basis (k2' > k2).
                First dimension can be subsampled.
    use_ANN   : whether to use approximate nearest neighbors

    Outputs:
    --------------------------
    p2p : (n2,) match vertex i on shape 2 to vertex p2p[i] on shape 1,
          or equivalent result if the eigenvectors are subsampled.
    """
    if use_ANN and not ANN:
        raise ValueError(
            'Please install pynndescent to use approximate nearest neighbors')

    k2, k1 = FM.shape

    assert k1 <= eigvects1.shape[1], \
        f'At least {k1} eigenvectors should be provided, only {eigvects1.shape[1]} given'
    assert k2 <= eigvects2.shape[1], \
        f'At least {k2} eigenvectors should be provided, only {eigvects2.shape[1]} given'

    if use_ANN:
        index = pynndescent.NNDescent(eigvects1[:, :k1], n_jobs=8)
        matches, _ = index.query(eigvects2[:, :k2] @ FM, k=1)  # (n2,1)
        matches = matches.flatten()  # (n2,)
    else:
        tree = KDTree(eigvects1[:, :k1])  # Tree on (n1,k1)
        matches = tree.query(eigvects2[:, :k2] @ FM, k=1,
                             return_distance=False).flatten()  # (n2,)

    return matches  # (n2,)
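# Usage sketch (added): converting a random functional map into a
# point-to-point map. Real eigenvectors would come from the shapes' Laplacian
# eigendecompositions; use_ANN=False takes the exact KDTree branch, so only
# the module's KDTree import is exercised.
def _demo_FM_to_p2p_aux():
    import numpy as np
    n1, n2, k1, k2 = 500, 400, 20, 20
    eigvects1 = np.random.rand(n1, k1)
    eigvects2 = np.random.rand(n2, k2)
    FM = np.random.rand(k2, k1)
    p2p = FM_to_p2p_aux(FM, eigvects1, eigvects2, use_ANN=False)
    assert p2p.shape == (n2,)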
def filter_anchor(anchor,
                  adata_ref=None,
                  adata_qry=None,
                  high_dim_feature=None,
                  k_filter=200,
                  random_state=0):
    """
    Check if an anchor is still an anchor when only using the high_dim_features
    to construct the KNN graph. If not, remove the anchor.
    """
    ref_data = normalize(adata_ref.X[:, high_dim_feature], axis=1)
    qry_data = normalize(adata_qry.X[:, high_dim_feature], axis=1)
    index = pynndescent.NNDescent(ref_data,
                                  metric='euclidean',
                                  n_neighbors=k_filter,
                                  random_state=random_state)
    G = index.query(qry_data, k=k_filter)[0]
    input_anchors = anchor.shape[0]
    anchor = np.array([xx for xx in anchor if (xx[0] in G[xx[1]])])
    print(f'Anchor selected with high CC feature graph: '
          f'{anchor.shape[0]} / {input_anchors}')
    return anchor
def find_anchor(adata_list,
                k_local=50,
                key_local='X_pca',
                k_anchor=5,
                key_anchor='X',
                dimred='pca',
                max_cc_cell=20000,
                k_score=30,
                k_filter=200,
                scale1=False,
                scale2=False,
                ncc=30,
                n_features=200,
                alignments=None,
                random_state=0):
    nds = len(adata_list)
    ncell = [xx.shape[0] for xx in adata_list]

    # If k_local is provided, we calculate the local knn graph to evaluate
    # whether the anchor preserves local structure within the dataset.
    # One can use a different obsm with key_local to compute knn for each dataset.
    if k_local:
        print('Find neighbors within datasets')
        Gp = []
        for i in range(nds):
            index = pynndescent.NNDescent(adata_list[i].obsm[key_local],
                                          metric='euclidean',
                                          n_neighbors=k_local + 1,
                                          random_state=random_state)
            Gp.append(index.neighbor_graph[0][:, 1:])
    else:
        Gp = [None for _ in range(nds)]

    if alignments is not None:
        all_pairs = []
        for pair in alignments:
            for xx in pair[0]:
                for yy in pair[1]:
                    if xx < yy:
                        all_pairs.append(f'{xx}-{yy}')
                    else:
                        all_pairs.append(f'{yy}-{xx}')
        all_pairs = np.unique(all_pairs)
    else:
        all_pairs = np.array([])

    print('Find anchors across datasets')
    anchor = {}
    for i in range(nds - 1):
        for j in range(i + 1, nds):
            if (alignments is not None) and (f'{i}-{j}' not in all_pairs):
                continue

            # run cca between datasets
            print('Run CCA')
            if key_anchor == 'X':
                # in case the adata var is not in the same order
                # select and order the var to make sure it is matched
                if (adata_list[i].shape[1] != adata_list[j].shape[1]) or (
                        (adata_list[i].var.index ==
                         adata_list[j].var.index).sum() < adata_list[i].shape[1]):
                    sel_b = adata_list[i].var.index.intersection(
                        adata_list[j].var.index)
                    U = adata_list[i][:, sel_b].X.copy()
                    V = adata_list[j][:, sel_b].X.copy()
                else:
                    U = adata_list[i].X.copy()
                    V = adata_list[j].X.copy()
            else:
                U = adata_list[i].obsm[key_anchor]
                V = adata_list[j].obsm[key_anchor]

            if dimred == 'pca':
                U, V = cca(U, V, scale1=scale1, scale2=scale2, n_components=ncc)
            elif dimred == 'lsi':
                U, V = lsi_cca(U, V, n_components=ncc, max_cc_cell=max_cc_cell)

            # compute ccv feature loading
            high_dim_feature = np.array([])
            if k_filter:
                mat = np.concatenate([U, V], axis=0).T.dot(
                    np.concatenate([adata_list[i].X, adata_list[j].X], axis=0))
                high_dim_feature = top_features_idx(mat, n_features=n_features)

            # normalize ccv
            U = normalize(U, axis=1)
            V = normalize(V, axis=1)

            # find MNN as anchors
            print('Find Anchors')
            G11, G12, G21, G22 = find_neighbor(
                U, V, k=max([k_anchor, k_local, k_score, 50]))
            raw_anchors = find_mnn(G12, G21, k_anchor)

            # filter anchors by high dimensional neighbors
            if k_filter:
                if ncell[i] >= ncell[j]:
                    raw_anchors = filter_anchor(
                        anchor=raw_anchors,
                        adata_ref=adata_list[i],
                        adata_qry=adata_list[j],
                        high_dim_feature=high_dim_feature,
                        k_filter=k_filter)
                else:
                    raw_anchors = filter_anchor(
                        anchor=raw_anchors[:, ::-1],
                        adata_ref=adata_list[j],
                        adata_qry=adata_list[i],
                        high_dim_feature=high_dim_feature,
                        k_filter=k_filter)[:, ::-1]

            # score anchors with snn and local structure preservation
            print('Score Anchors')
            anchor_df = score_anchor(anchor=raw_anchors,
                                     G11=G11,
                                     G12=G12,
                                     G21=G21,
                                     G22=G22,
                                     k_score=k_score,
                                     k_local=k_local,
                                     Gp1=Gp[i],
                                     Gp2=Gp[j])
            anchor[(i, j)] = anchor_df.copy()

            # distance between datasets
            # dist.append(len(anchor[(i,j)]) / min([ncell[i], ncell[j]]))
            print(f'Identified {len(anchor[i, j])} anchors between '
                  f'datasets {i} and {j}.')
    return anchor
def build(data, metadata=None, **kwargs):
    metadata = Index._get_valid_metadata(data, metadata)
    nnd_index = pynndescent.NNDescent(data, **kwargs)
    return NNDescentIndex(nnd_index, metadata)
def fit(self, X, k_NN):
    if self.verbose:
        timer_str = f"Finding {k_NN} approximate nearest neighbors using"
        timer_str += f" NNDescent and the '{self.metric}' metric..."
        timer = utl.Timer(timer_str, verbose=self.verbose)
        timer.__enter__()

    ## Get the data shape
    self.n_samples, self.n_features = X.shape[0], X.shape[1]
    k_NN = self._check_k(k_NN, self.n_samples)

    ## > These values were taken from UMAP, which we assume to be sensible
    ## > defaults [because the UMAP and pynndescent authors are the same.]
    ## - Pavlin Policar
    if self.n_trees is None:
        self.n_trees = 5 + int(round((self.n_samples**0.5) / 20))
    if self.n_iters is None:
        self.n_iters = max(5, int(round(np.log2(self.n_samples))))

    ## If `k_NN` > 15, use just the first 15 NN to build the approximate
    ## NN graph, then use query() to get the rest of the desired neighbors.
    if k_NN <= 15:
        k_build = k_NN + 1
    else:
        k_build = 15

    import pynndescent
    self.index = pynndescent.NNDescent(X,
                                       n_neighbors=k_build,
                                       metric=self.metric,
                                       metric_kwds=self.metric_params,
                                       random_state=self.random_state,
                                       n_trees=self.n_trees,
                                       n_iters=self.n_iters,
                                       n_jobs=self.n_jobs,
                                       verbose=self.verbose,
                                       **self.pynnd_kws)

    ## If k_NN <= 15, we're in the clear!
    NN_idx, distances = self.index.neighbor_graph

    ## ... Except when pynndescent fails, then it puts a -1 in the index.
    n_failures = np.sum(NN_idx == -1)

    ## If k_NN > 15, use query() to get the indices and distances
    if k_NN > 15:
        self.index.prepare()
        NN_idx, distances = self.index.query(X, k=k_NN + 1)

    ## If pynndescent fails to find neighbors for some points, raise an error.
    if n_failures > 0:
        err_str = "WARNING: `pynndescent` failed to find neighbors for all"
        err_str += " points in the data."
        if self.verbose >= 4:
            print_opt = np.get_printoptions()
            np.set_printoptions(threshold=np.inf)
            err_str += " The indices of the failed points are: "
            err_str += f"\n{np.where(np.sum(NN_idx == -1, axis=1))[0]}"
            np.set_printoptions(**print_opt)
        else:
            err_str += " Set verbose >= 4 to see the indices of the"
            err_str += " failed points."
        raise ValueError(err_str)

    if self.verbose:
        timer.__exit__()

    # return NN_idx[:, 1:], distances[:, 1:]
    self.kNN_idx = NN_idx[:, 1:]
    self.kNN_dst = distances[:, 1:]

    ## Return the indices of the nearest neighbors and the distances
    ## to those neighbors.
    return self.kNN_idx.copy(), self.kNN_dst.copy()
def k_nearest_neighbors(data, k, max_distance=None, verbose=False):
    """Compute k-nearest neighbors for each row in data matrix.

    Computes the k-nearest neighbor graph of the data matrix, under the
    Euclidean distance. Each row in the data matrix is treated as an item.

    Arguments
    ---------
    data: {torch.Tensor, np.ndarray, scipy.sparse matrix}(
            shape=(n_items, n_features))
        The data matrix.
    k: int
        The number of nearest neighbors per item.
    max_distance: float (optional)
        If not None, neighborhoods are restricted to have a radius
        no greater than `max_distance`.
    verbose: bool
        If True, print verbose output.

    Returns
    -------
    pymde.Graph
        A neighborhood graph.
    """
    # lazy import, because importing pynndescent takes some time
    import pynndescent

    if isinstance(data, torch.Tensor):
        device = data.device
        data = data.cpu().numpy()
    else:
        device = "cpu"

    n = data.shape[0]
    if n < 10000:
        import sklearn.neighbors

        if verbose:
            problem.LOGGER.info("Exact nearest neighbors by brute force ")
        nn = sklearn.neighbors.NearestNeighbors(
            n_neighbors=k + 1, algorithm="brute"
        )
        nn.fit(data)
        distances, neighbors = nn.kneighbors(data)
    else:
        # TODO default params (n_trees, max_candidates)
        index = pynndescent.NNDescent(
            data,
            n_neighbors=k + 1,
            verbose=verbose,
            max_candidates=60,
        )
        neighbors, distances = index.neighbor_graph
    neighbors = neighbors[:, 1:]
    distances = distances[:, 1:]

    items = np.arange(n)
    items = np.repeat(items, k)
    edges = np.stack([items, neighbors.flatten()], axis=1)
    flip_idx = edges[:, 0] > edges[:, 1]
    edges[flip_idx] = np.stack(
        [edges[flip_idx][:, 1], edges[flip_idx][:, 0]], axis=1
    )
    weights = torch.ones(edges.shape[0], device=device, dtype=torch.float)
    if max_distance is not None:
        weights[
            torch.tensor(distances.ravel(), device=device, dtype=torch.float)
            > max_distance
        ] = 0.0

    # weights for duplicated edges will be summed.
    edges = torch.tensor(edges, device=device)
    return Graph.from_edges(edges, weights)
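# Usage sketch (added): below 10000 rows the exact brute-force branch runs;
# pass 10000 rows or more to exercise the pynndescent branch instead.
def _demo_k_nearest_neighbors():
    import numpy as np
    data = np.random.rand(1000, 10)
    graph = k_nearest_neighbors(data, k=15)
    return graph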
import numpy as np
from tqdm import tqdm

import pyFM.spectral as spectral

try:
    import pynndescent
    index = pynndescent.NNDescent(np.random.random((100, 3)), n_jobs=2)
    del index
    ANN = True
except ImportError:
    ANN = False


def zoomout_iteration(eigvects1, eigvects2, FM, step=1, A2=None,
                      return_p2p=False, use_ANN=False):
    """
    Performs an iteration of ZoomOut.

    Parameters
    --------------------
    eigvects1  : (n1,k1') eigenvectors on source shape with k1' >= k1 + step.
                 Can be a subsample of the original ones on the first dimension.
    eigvects2  : (n2,k2') eigenvectors on target shape with k2' >= k2 + step.
                 Can be a subsample of the original ones on the first dimension.
    FM         : (k2,k1) Functional map from eigvects1[:,:k1] to eigvects2[:,:k2]
    step       : int - step of increase of dimension.
    A2         : (n2,n2) sparse area matrix on target mesh, for vertex to
                 vertex computation. If specified, the eigenvectors can't be
                 subsampled!
    return_p2p : bool - if True returns the vertex to vertex map.
    use_ANN    : bool - if True, uses approximate nearest neighbors
    """
def build(self, data, k):
    timer = utils.Timer(
        f"Finding {k} nearest neighbors using NN descent approximate search using "
        f"{self.metric} distance...",
        verbose=self.verbose,
    )
    timer.__enter__()

    # These values were taken from UMAP, which we assume to be sensible defaults
    n_trees = 5 + int(round((data.shape[0]) ** 0.5 / 20))
    n_iters = max(5, int(round(np.log2(data.shape[0]))))

    # Numba takes a while to load up, so there's little point in loading it
    # unless we're actually going to use it
    import pynndescent

    # Will use query() only for k>15
    if k <= 15:
        n_neighbors_build = k + 1
    else:
        n_neighbors_build = 15

    self.index = pynndescent.NNDescent(
        data,
        n_neighbors=n_neighbors_build,
        metric=self.metric,
        metric_kwds=self.metric_params,
        random_state=self.random_state,
        n_trees=n_trees,
        n_iters=n_iters,
        max_candidates=60,
        n_jobs=self.n_jobs,
        verbose=self.verbose > 1,
    )

    # -1 in indices means that pynndescent failed
    indices, distances = self.index.neighbor_graph
    mask = np.sum(indices == -1, axis=1) > 0

    if k > 15:
        indices, distances = self.index.query(data, k=k + 1)

    # As a workaround, we let the failed points group together
    if np.sum(mask) > 0:
        if self.verbose:
            opt = np.get_printoptions()
            np.set_printoptions(threshold=np.inf)
            warnings.warn(
                f"`pynndescent` failed to find neighbors for some of the points. "
                f"As a workaround, openTSNE considers all such points similar to "
                f"each other, so they will likely form a cluster in the embedding. "
                f"The indices of the failed points are:\n{np.where(mask)[0]}"
            )
            np.set_printoptions(**opt)
        else:
            warnings.warn(
                f"`pynndescent` failed to find neighbors for some of the points. "
                f"As a workaround, openTSNE considers all such points similar to "
                f"each other, so they will likely form a cluster in the embedding. "
                f"Run with verbose=True, to see indices of the failed points."
            )
        distances[mask] = 1
        rs = check_random_state(self.random_state)
        fake_indices = rs.choice(
            np.sum(mask), size=np.sum(mask) * indices.shape[1], replace=True
        )
        fake_indices = np.where(mask)[0][fake_indices]
        indices[mask] = np.reshape(fake_indices, (np.sum(mask), indices.shape[1]))

    timer.__exit__()

    return indices[:, 1:], distances[:, 1:]
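# Standalone sketch (added for illustration): the k > 15 strategy used above,
# in isolation. The graph is built with only 15 neighbours (cheap), then
# prepare() + query() retrieve the full k + 1 columns, including self.
def _demo_build_then_query(data, k=50):
    import pynndescent
    index = pynndescent.NNDescent(data, n_neighbors=15)
    index.prepare()
    indices, distances = index.query(data, k=k + 1)
    return indices[:, 1:], distances[:, 1:]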
def fit(self, X):
    self._index = pynndescent.NNDescent(X,
                                        n_neighbors=self._n_neighbors,
                                        n_trees=self._n_trees,
                                        leaf_size=self._leaf_size,
                                        metric=self._pynnd_metric)
def CorrelateStitchImages(dirname, dirout, stitchchannel, chosenstitchgroup,
                          x1lim=-.05, x2lim=1.05, y1lim=-.05, y2lim=1.05,
                          save_stitched=True, save_multipage=True,
                          constant_size=250000000, roll_ball=True,
                          use_gene_name=True, save_merged=False):
    #dirname = os.path.expanduser('/wynton/group/ye/mtschmitz/images/MacaqueMotorCortex2/P2sagittal1_27_20200828/TR1.2020-09-03-01-35-13/')
    #Test params
    '''dirname = os.path.expanduser('/media/mt/Extreme SSD/MacaqueMotorCortex2/P2_OB_20200805/TR1.2020-08-06-23-10-18')
    dirout = os.path.expanduser('~/tmp/')
    stitchchannel = '1'
    chosenstitchgroup = '1'
    x1lim = 0
    x2lim = 1
    y1lim = 0
    y2lim = 1
    save_merged = False
    save_stitched = False
    save_multipage = True
    use_gene_name = True
    '''
    print(dirname, flush=True)
    ray.shutdown()
    num_cpus = 1  #psutil.cpu_count(logical=False)  #set number of cores to use
    print('cpus:', num_cpus)
    ray.init(num_cpus=num_cpus)
    x1lim = float(x1lim)
    x2lim = float(x2lim)
    y1lim = float(y1lim)
    y2lim = float(y2lim)
    chosenstitchgroup = re.sub('TR_', "1", chosenstitchgroup)
    chosenstitchgroup = re.sub('TR', "", chosenstitchgroup)

    protocol = [x for x in os.listdir(dirname) if '.scanprotocol' in x][0]
    minoverlap = 0
    with open(os.path.join(dirname, protocol), 'r') as f:
        for line in f:
            try:
                #print(line)
                if 'MinOverlapPixel' in line:
                    minoverlap = float(line.split('>')[1].split('<')[0]) * 1.1
            except:
                pass

    n_locations = 0
    counting = False
    with open(os.path.join(dirname, protocol), 'r') as f:
        for line in f:
            try:
                if 'LocationIds' in line:
                    counting = ~counting
                elif counting:
                    n_locations += 1
            except:
                pass

    n_location = 1
    counting = False
    reference = False
    shapes = defaultdict(list)
    with open(os.path.join(dirname, protocol), 'r') as f:
        for line in f:
            try:
                if '<d2p1:ScanLocation>' in line:
                    counting = ~counting
                if counting and '<d10p1:_x>' in line:
                    x = float(line.split('>')[1].split('<')[0])
                if counting and '<d10p1:_y>' in line:
                    y = float(line.split('>')[1].split('<')[0])
                    shapes[str(n_location)].append((x, y))
                if '<d2p1:ReferencePoint ' in line:
                    counting = False
                    reference = True
                if reference and '<d10p1:_x>' in line:
                    xref = float(line.split('>')[1].split('<')[0])
                if reference and '<d10p1:_y>' in line:
                    yref = float(line.split('>')[1].split('<')[0])
                    reference = False
                    shapes[str(n_location)] = [
                        (x + xref, y + yref)
                        for x, y in shapes[str(n_location)]
                    ]
                    n_location += 1
                    xref = 0
                    yref = 0
            except:
                pass

    tags_to_get = [
        'SizeX', 'SizeY', 'ActualPositionX', 'ActualPositionY', 'Run Index',
        'Index', '"field" Index', 'TheC', 'AreaGrid AreaGridIndex', 'Name',
        '<Image ID="Image:" Name', 'ActualPositionZ'
    ]

    def get_tag(x, tp):
        return (re.search(x + '=' + '"([A-Za-z0-9_\./\\-]*)"', tp).group(1))

    @ray.remote
    def getTiffMetadata(f):
        f = open(f, 'rb')
        # Return Exif tags
        tags = exifread.process_file(f)
        tp = tags['Image ImageDescription'].values
        d = {}
        for tag in tags_to_get:
            d[tag] = get_tag(tag, tp)
        return (d)

    data = []
    for fname in sorted(os.listdir(dirname)):
        if fname.endswith(".TIF"):
            fpath = os.path.join(dirname, fname)
            path = os.path.normpath(dirname)
            d = path.split(os.sep)[-2]
            data.append((fname, d, fpath))
    imageFiles = pd.DataFrame(data, columns=['FileName', 'DirName', 'Path'])
    #normalize positions so starts at 0,0
    #imageFiles['ActualPositionX']=imageFiles['ActualPositionX']-imageFiles['ActualPositionX'].min()
    #imageFiles['ActualPositionY']=imageFiles['ActualPositionY']-imageFiles['ActualPositionY'].min()
    imageFiles = imageFiles.loc[['_R_' in x for x in imageFiles['FileName']], :]

    l = []
    for i in imageFiles.index:
        f = imageFiles.loc[i, 'Path']
        #d=getTiffMetadata(f)
        l.append(getTiffMetadata.remote(f))
        #print(d)
        #imageFiles.loc[i,d.keys()]=list(d.values())
    metadf = pd.DataFrame(ray.get(l))
    metadf.index = imageFiles.index
    imageFiles = imageFiles.join(metadf)
    ray.shutdown()

    imageFiles.rename(columns={'<Image ID="Image:" Name': 'Channel Name'},
                      inplace=True)
    imageFiles['ActualPositionX'] = imageFiles['ActualPositionX'].astype(float)
    imageFiles['ActualPositionY'] = imageFiles['ActualPositionY'].astype(float)
    #print(imageFiles['ActualPositionX'])
    #print(imageFiles['ActualPositionY'])
    imageFiles['SizeX'] = imageFiles['SizeX'].astype(float)
    imageFiles['SizeY'] = imageFiles['SizeY'].astype(float)
    imageFiles.sort_values(by=['"field" Index', 'Channel Name'], inplace=True)
    print(imageFiles, flush=True)

    #Max size
    chif = imageFiles  #.loc[imageFiles['Channel Name']==stitchchannel,:]
    chif['x1pix'] = 0
    chif['x2pix'] = 0
    chif['y1pix'] = 0
    chif['y2pix'] = 0
    xsize = imageFiles['SizeX'].value_counts().idxmax()
    ysize = imageFiles['SizeY'].value_counts().idxmax()
    xend = len(chif.ActualPositionX.unique()) * xsize
    yend = len(chif.ActualPositionY.unique()) * ysize

    #assume adjacent images are taken sequentially
    xd = []
    yd = []
    for i in range(len(chif['"field" Index'].unique()) - 1):
        indi = chif['"field" Index'] == list(chif['"field" Index'])[i]
        indip1 = chif['"field" Index'] == list(chif['"field" Index'])[i + 1]
        xdiff = np.abs(
            list(chif.loc[indi, 'ActualPositionX'])[0] -
            list(chif.loc[indip1, 'ActualPositionX'])[0])
        ydiff = np.abs(
            list(chif.loc[indi, 'ActualPositionY'])[0] -
            list(chif.loc[indip1, 'ActualPositionY'])[0])
        xd.append(xdiff)
        yd.append(ydiff)
    xd = np.array(xd)
    yd = np.array(yd)
    #nonzero (>10) median is the distance between images
    xmedian = np.median(xd[xd > 10])
    ymedian = np.median(yd[yd > 10])
    #for reference xsize-minoverlap=xmedian
    print('MEDIANS')
    print(xmedian, ymedian)

    from shapely.geometry import Point
    from shapely.geometry.polygon import Polygon
    polygon = Polygon(shapes[chosenstitchgroup])
    #Buffer expands, scale doesn't work for expanding linear regions with no points
    buffgon = Polygon(polygon.buffer(min(xmedian, ymedian)).exterior)
    #polygon=shapely.affinity.scale(polygon,xfact=1.2,yfact=1.2)
    inside = [
        buffgon.contains(Point(x, y)) for x, y in
        list(zip(chif['ActualPositionX'], chif['ActualPositionY']))
    ]
    matplotlib.pyplot.scatter(list(chif['ActualPositionX']),
                              list(chif['ActualPositionY']))
    matplotlib.pyplot.scatter([x[0] for x in shapes[chosenstitchgroup]],
                              [x[1] for x in shapes[chosenstitchgroup]])
    matplotlib.pyplot.scatter(buffgon.exterior.coords.xy[0],
                              buffgon.exterior.coords.xy[1])
    matplotlib.pyplot.savefig(os.path.join(dirout, 'BufferPolygon.png'))
    matplotlib.pyplot.close()
    chif = chif.loc[inside, :]

    #normalize positions so starts at 0,0
    chif['ActualPositionX'] = chif['ActualPositionX'] - chif['ActualPositionX'].min()
    chif['ActualPositionY'] = chif['ActualPositionY'] - chif['ActualPositionY'].min()
    xorder = dict(
        zip(np.sort(chif['ActualPositionX'].unique()),
            np.sort(chif['ActualPositionX'].unique().argsort())))
    yorder = dict(
        zip(np.sort(chif['ActualPositionY'].unique()),
            np.sort(chif['ActualPositionY'].unique().argsort())))
    for i in chif.index:
        x = chif.loc[i, 'ActualPositionX'] / xmedian  #xorder[chif.loc[i,'ActualPositionX']]
        y = chif.loc[i, 'ActualPositionY'] / ymedian  #yorder[chif.loc[i,'ActualPositionY']]
        x1 = int((xsize * x) - (x * minoverlap))
        x2 = x1 + int(xsize)
        y1 = int((ysize * y) - (y * minoverlap))
        y2 = y1 + int(ysize)
        chif.loc[i, 'x1pix'] = x1
        chif.loc[i, 'x2pix'] = x2
        chif.loc[i, 'y1pix'] = y1
        chif.loc[i, 'y2pix'] = y2

    cf = chif.loc[chif['TheC'] == stitchchannel, :]
    cf['Image'] = None
    for i in tqdm.tqdm(cf.index):
        img = cv2.imread(cf.loc[i, 'Path'])[:, :, 0].T
        cf.loc[i, 'Image'] = [img]
    print(chif, flush=True)
    #print(cf.loc[2,'Image'].shape)
    #plt.imshow(cf.loc[2,'Image'],origin='lower')

    index = pynndescent.NNDescent(cf.loc[:, ['x1pix', 'y1pix']], n_neighbors=9)
    nn = index.query(cf.loc[:, ['x1pix', 'y1pix']], k=9)[0][:, 1:]
    g = networkx.DiGraph().to_undirected()
    for i, x in enumerate(nn):
        for j in x:
            g.add_edge(i, j)
    g = g.to_undirected()

    x1i = np.where(cf.columns == 'x1pix')[0][0]
    x2i = np.where(cf.columns == 'x2pix')[0][0]
    y1i = np.where(cf.columns == 'y1pix')[0][0]
    y2i = np.where(cf.columns == 'y2pix')[0][0]
    imgi = np.where(cf.columns == 'Image')[0][0]

    #could be modified with inner loop to get
    #average correlation of all channels
    #Downside of course is 4x processing time
    def correlateOffsets(x):
        xoffset, yoffset = x[0], x[1]
        for i in cf.index:
            x = cf.loc[i, 'ActualPositionX'] / xmedian  #xorder[chif.loc[i,'ActualPositionX']]
            y = cf.loc[i, 'ActualPositionY'] / ymedian  #yorder[chif.loc[i,'ActualPositionY']]
            x1 = int((xsize * x) - x * int(xoffset))
            x2 = x1 + int(xsize)
            y1 = int((ysize * y) - y * int(yoffset))
            y2 = y1 + int(ysize)
            cf.loc[i, 'x1pix'] = x1
            cf.loc[i, 'x2pix'] = x2
            cf.loc[i, 'y1pix'] = y1
            cf.loc[i, 'y2pix'] = y2
        i_vect = []
        j_vect = []
        for i, j in list(g.edges):
            ix1i = cf.iloc[i, x1i]
            iy1i = cf.iloc[i, y1i]
            ix2i = cf.iloc[i, x2i]
            iy2i = cf.iloc[i, y2i]
            jx1i = cf.iloc[j, x1i]
            jy1i = cf.iloc[j, y1i]
            jx2i = cf.iloc[j, x2i]
            jy2i = cf.iloc[j, y2i]
            p = Polygon([(ix1i, iy1i), (ix2i, iy1i), (ix2i, iy2i), (ix1i, iy2i)])
            q = Polygon([(jx1i, jy1i), (jx2i, jy1i), (jx2i, jy2i), (jx1i, jy2i)])
            if p.intersects(q):
                pqi = p.intersection(q)
                #x1,y1,x2,y2
                bounds = pqi.bounds
                ix1b, iy1b, ix2b, iy2b = bounds[0] - ix1i, bounds[1] - iy1i, bounds[2] - ix2i, bounds[3] - iy2i
                jx1b, jy1b, jx2b, jy2b = bounds[0] - jx1i, bounds[1] - jy1i, bounds[2] - jx2i, bounds[3] - jy2i
                rxi = np.intersect1d(np.arange(ix1i, ix2i),
                                     np.arange(bounds[0], bounds[2])) - ix1i
                rxj = np.intersect1d(np.arange(jx1i, jx2i),
                                     np.arange(bounds[0], bounds[2])) - jx1i
                ryi = np.intersect1d(np.arange(iy1i, iy2i),
                                     np.arange(bounds[1], bounds[3])) - iy1i
                ryj = np.intersect1d(np.arange(jy1i, jy2i),
                                     np.arange(bounds[1], bounds[3])) - jy1i
                rxi = rxi.astype(int)
                rxj = rxj.astype(int)
                ryi = ryi.astype(int)
                ryj = ryj.astype(int)
                if len(rxi) > 0 and len(rxj) > 0:
                    i_vect.append(
                        list(cf.iloc[i, imgi][rxi, ryi[:, np.newaxis]].flatten()))
                    j_vect.append(
                        list(cf.iloc[j, imgi][rxj, ryj[:, np.newaxis]].flatten()))
                    #plt.imshow(cf.iloc[i,imgi][rxi,ryi[:,np.newaxis]],origin='lower')
                    #plt.show()
                    #plt.imshow(cf.iloc[j,imgi][rxj,ryj[:,np.newaxis]],origin='lower')
                    #plt.show()
                    #x,y = p.exterior.xy
                    #plt.plot(x,y)
                    #x,y = q.exterior.xy
                    #plt.plot(x,y)
                    #x,y = pqi.exterior.xy
                    #plt.plot(x,y)
                    #plt.show()
        i_vect = [item for sublist in i_vect for item in sublist]
        j_vect = [item for sublist in j_vect for item in sublist]
        corr = 1 - np.corrcoef(i_vect, j_vect)[1, 0]
        #print(corr)
        return (corr)

    #opt=scipy.optimize.brute(correlateOffsets,(slice(30,500),slice(30,500)),full_output=True,disp=False,workers=num_cpus)
    opt = scipy.optimize.minimize(correlateOffsets,
                                  (minoverlap * 2.5, minoverlap * 2.5),
                                  method='Nelder-Mead',
                                  options={
                                      'xtol': .9,
                                      'ftol': .05
                                  })

    chif['final_identifier'] = chif['TheC']
    if use_gene_name:
        l = []
        for x in chif['FileName']:
            if x.startswith('L_'):
                l.append('1')
                continue
            if x.startswith('R_'):
                l.append('2')
                continue
            if '_TR_' in x:
                l.append('1')
                continue
            if '_TD_' in x:
                l.append('1')
                continue
            if 'TR' in x or 'TD' in x:
                groupnum = re.sub('TR|TD', '',
                                  re.search('TD[0-9]|TR[0-9]', x).group(0))
                l.append(groupnum)
                continue
            l.append(chosenstitchgroup)
        chif['stitchgroup'] = l
        tmppath = os.path.expanduser('~/imagingmetadata.csv')
        if os.path.exists(tmppath):
            refdf = pd.read_csv(tmppath, sep='\t')
        else:
            refdf = pd.read_csv(
                'https://docs.google.com/spreadsheets/d/e/2PACX-1vSYbvCJpS-GfRKuGgs2IBH7MD1KtDPDqs7ePqQJ1PyrMKp7f7z7ZpY4WtMFGPxU4mWbnRHgBl4PtaeH/pub?output=tsv&gid=1520792104',
                sep='\t')
            refdf.to_csv(tmppath, sep='\t')
        refdf.rename(columns={
            'Channel0': '0',
            'Channel1': '1',
            'Channel2': '2',
            'Channel3': '3'
        },
                     inplace=True)
        chif = pd.merge(chif,
                        refdf,
                        how='left',
                        left_on=['DirName', 'stitchgroup'],
                        right_on=['DirName', 'SlidePosition.1isL'])
        chif['gene'] = list(
            [chif.loc[chif.index[i], x] for i, x in enumerate(chif['TheC'])])
        chif['final_identifier'] = chif['gene']
    else:
        chif['final_identifier'] = chif['TheC']

    xoffset = opt.x[0]
    yoffset = opt.x[1]
    #xoffset=218
    #yoffset=209
    for i in chif.index:
        x = chif.loc[i, 'ActualPositionX'] / xmedian  #xorder[chif.loc[i,'ActualPositionX']]
        y = chif.loc[i, 'ActualPositionY'] / ymedian  #yorder[chif.loc[i,'ActualPositionY']]
        x1 = int((xsize * x) - x * int(xoffset))
        x2 = x1 + int(xsize)
        y1 = int((ysize * y) - y * int(yoffset))
        y2 = y1 + int(ysize)
        chif.loc[i, 'x1pix'] = x1
        chif.loc[i, 'x2pix'] = x2
        chif.loc[i, 'y1pix'] = y1
        chif.loc[i, 'y2pix'] = y2
    print('before min subtract')
    print(chif)
    xmin = chif['x1pix'].min()
    ymin = chif['y1pix'].min()
    chif['x1pix'] = chif['x1pix'] - xmin
    chif['x2pix'] = chif['x2pix'] - xmin
    chif['y1pix'] = chif['y1pix'] - ymin
    chif['y2pix'] = chif['y2pix'] - ymin

    def write_stitchy(chdf, infile, keyname='FileName'):
        with open(infile, 'w') as the_file:
            the_file.write('dim = 2\n')
            for i in chdf.index:
                cur = chdf.loc[i, :]
                the_file.write(cur[keyname] + '; ; (' + str(cur['x1pix']) +
                               ',' + str(cur['y1pix']) + ') \n')

    #for imagej merge on the fly
    for c in chif['final_identifier'].unique():
        cf = chif.loc[chif['final_identifier'] == c, :]
        infile = os.path.join(
            dirout,
            str(chosenstitchgroup) + '_' + stitchchannel + '.stitchy')
        write_stitchy(cf, infile, keyname='FileName')
    print(chif)

    #Or write whole file
    if save_stitched:
        for c in chif['final_identifier'].unique():
            cf = chif.loc[chif['final_identifier'] == c, :]
            cf['Image'] = None
            for i in cf.index:
                img = cv2.imread(cf.loc[i, 'Path'])[:, :, 0].T
                cf.loc[i, 'Image'] = [img]
            newimg = np.zeros(
                (int(np.nanmax(cf['x2pix'])), int(np.nanmax(cf['y2pix']))),
                np.uint8)
            divisor = np.zeros(
                (int(np.nanmax(cf['x2pix'])), int(np.nanmax(cf['y2pix']))),
                np.uint8)
            for i in cf.index:
                x1, x2, y1, y2 = cf.loc[i, ['x1pix', 'x2pix', 'y1pix', 'y2pix']]
                newimg[x1:x2, y1:y2] += cf.loc[i, 'Image']
                divisor[x1:x2, y1:y2] += 1
            im = np.nan_to_num(np.divide(newimg, divisor).T,
                               nan=0).astype(np.uint8)
            cur_size = im.shape[0] * im.shape[1]
            if constant_size < cur_size:
                scale_percent = np.sqrt(constant_size / cur_size)  # percent of original size
                width = int(im.shape[1] * scale_percent)
                height = int(im.shape[0] * scale_percent)
                dim = (width, height)
                # resize image
                im = cv2.resize(im, dim, interpolation=cv2.INTER_LINEAR)
            #from PIL import Image
            print('background subbing', flush=True)
            #import skimage
            #from skimage import morphology
            #im=im-skimage.morphology.rolling_ball(im,radius=100)
            #subtract_background(im,radius=100,light_bg=False)
            im = im.astype(np.uint16)
            tifffile.imsave(os.path.join(dirout, c + '_stitched.TIF'),
                            im,
                            compress=6)
            if roll_ball:
                RollingBallIJ(os.path.join(dirout, c + '_stitched.TIF'))
            print('background subbed', flush=True)

    '''
    if save_merged:
        imgs={}
        for c in sorted(chif['final_identifier'].unique()):
            cf=chif.loc[chif['final_identifier']==c,:]
            cf['Image']=None
            for i in tqdm.tqdm(cf.index):
                img=cv2.imread(cf.loc[i,'Path'])[:,:,0].T
                cf.loc[i,'Image']=[img]
            newimg=np.zeros((int(cf['x2pix'].max()),int(cf['y2pix'].max())), np.uint8)
            divisor=np.zeros((int(cf['x2pix'].max()),int(cf['y2pix'].max())), np.uint8)
            for i in cf.index:
                x1,x2,y1,y2=cf.loc[i,['x1pix','x2pix','y1pix','y2pix']]
                newimg[x1:x2,y1:y2]+=cf.loc[i,'Image']
                divisor[x1:x2,y1:y2]+=1
            imgs[c]=np.nan_to_num(np.divide(newimg,divisor).T,nan=0).astype(np.uint8)
        tifffile.imsave(os.path.join(dirout,'merged_stitched.TIF'),list(imgs.values()),metadata={'Test':'YES,No','Value':100},compress=6)
    '''

    if save_multipage:
        l = []
        for f in tqdm.tqdm(cf['"field" Index'].unique()):
            print(f, flush=True)
            cf = chif.loc[chif['"field" Index'] == f, :]
            cf = cf.sort_values(by='final_identifier', axis=0)
            cf['Image'] = None
            for i in cf.index:
                img = cv2.imread(cf.loc[i, 'Path'])[:, :, 0]
                cf.loc[i, 'Image'] = [img]
            #print(cf)
            imgs = {}
            for i in cf.index:
                imgs[cf.loc[i, 'final_identifier']] = cf.loc[i, 'Image']
            metadata = {'channel_names': ','.join(list(imgs.keys()))}
            metadata.update(cf.loc[i, [
                'DirName', 'SizeX', 'SizeY', 'ActualPositionX',
                'ActualPositionY', '"field" Index', 'ActualPositionZ',
                'x1pix', 'y1pix'
            ]].astype(str).to_dict())
            tifffile.imsave(os.path.join(dirout, f + '_merged.TIF'),
                            list(imgs.values()),
                            metadata=metadata,
                            compress=6)
            l.append(
                [f + '_merged.TIF', cf.loc[i, 'x1pix'], cf.loc[i, 'y1pix']])
        infile = os.path.join(dirout,
                              '_'.join(list(imgs.keys())) + '_merged.stitchy')
        write_stitchy(pd.DataFrame(l, columns=['FileName', 'x1pix', 'y1pix']),
                      infile,
                      keyname='FileName')
def generate_triplets(key,
                      inputs,
                      n_inliers,
                      n_outliers,
                      n_random,
                      weight_temp=0.5,
                      distance='euclidean',
                      verbose=False):
    """Generate triplets.

    Args:
      key: Random key.
      inputs: Input points.
      n_inliers: Number of inliers.
      n_outliers: Number of outliers.
      n_random: Number of random triplets per point.
      weight_temp: Temperature of the log transformation on the weights.
      distance: Distance type.
      verbose: Whether to print progress.

    Returns:
      triplets and weights
    """
    n_points = inputs.shape[0]
    n_extra = min(n_inliers + 50, n_points)
    index = pynndescent.NNDescent(inputs, metric=distance)
    index.prepare()
    neighbors = index.query(inputs, n_extra)[0]
    neighbors = np.concatenate(
        (np.arange(n_points).reshape([-1, 1]), neighbors), 1)
    if verbose:
        logging.info('found nearest neighbors')
    distance_fn = get_distance_fn(distance)
    # compute scaled neighbors and the scale parameter
    knn_distances, neighbors, sig = find_scaled_neighbors(
        inputs, neighbors, distance_fn)
    neighbors = neighbors[:, :n_inliers + 1]
    knn_distances = knn_distances[:, :n_inliers + 1]
    key, use_key = random.split(key)
    triplets = sample_knn_triplets(use_key, neighbors, n_inliers, n_outliers)
    weights = find_triplet_weights(
        inputs,
        triplets,
        neighbors[:, 1:n_inliers + 1],
        distance_fn,
        sig,
        distances=knn_distances[:, 1:n_inliers + 1])
    flip = weights < 0
    anchors, pairs = triplets[:, 0].reshape([-1, 1]), triplets[:, 1:]
    pairs = jnp.where(
        jnp.tile(flip.reshape([-1, 1]), [1, 2]), jnp.fliplr(pairs), pairs)
    triplets = jnp.concatenate((anchors, pairs), 1)
    if n_random > 0:
        key, use_key = random.split(key)
        rand_triplets, rand_weights = sample_random_triplets(
            use_key, inputs, n_random, distance_fn, sig)
        triplets = jnp.concatenate((triplets, rand_triplets), 0)
        weights = jnp.concatenate((weights, 0.1 * rand_weights))
    weights -= jnp.min(weights)
    weights = tempered_log(1. + weights, weight_temp)
    return triplets, weights
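# Usage sketch (added; untested against the surrounding module's helpers):
# generate_triplets() expects a JAX PRNG key; the sizes below are placeholders.
def _demo_generate_triplets():
    import numpy as np
    from jax import random
    key = random.PRNGKey(0)
    inputs = np.random.rand(500, 10).astype(np.float32)
    triplets, weights = generate_triplets(
        key, inputs, n_inliers=10, n_outliers=5, n_random=3)
    return triplets, weights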