def get_cdist(c1, c2, genome="hg38"):
    """
    Compute every pairwise spatial distance between the reads of two
    clusters (i.e. single chromosome copies).

    Params:
    -------
        c1: first cluster, dataframe
        c2: second cluster, dataframe
        genome: genome type used to look up the dataframe keys, string

    Returns:
    --------
        R_cdist: distance matrix with one row per read of c1 and one column
                 per read of c2
        P1: array of genomic positions for the first cluster
        P2: array of genomic positions for the second cluster
    """
    KEYS = const.get_genome_keys(genome)
    axes = ("x", "y", "z")

    # One row per read, one column per spatial axis
    coords_1 = np.array([c1[KEYS[axis]].values for axis in axes]).T
    coords_2 = np.array([c2[KEYS[axis]].values for axis in axes]).T

    # Genomic coordinates, in the same read order as the rows above
    P1 = np.array(c1[KEYS["pos"]].values)
    P2 = np.array(c2[KEYS["pos"]].values)

    return (distance.cdist(coords_1, coords_2), P1, P2)
def get_hull(cell, dim=3, genome='hg38'):
    """
    Build the convex hull of a cell's reads in `dim`-dimensional space.

    Params:
    -------
        cell: target cell, dataframe
        dim: number of spatial dimensions to use; the first `dim` entries of
             the genome's 'dim' key list select the coordinate columns
        genome: target genome, for constants

    Returns:
    --------
        hull: scipy ConvexHull object
    """
    KEYS = const.get_genome_keys(genome)

    # Stack the selected coordinate columns so each row is one read
    coords = np.array(
        [cell[KEYS['dim'][axis]].values for axis in range(dim)]).T

    return ConvexHull(coords)
def get_pdists(cluster, genome="hg38"):
    """
    Compute all pairwise euclidean distances within a cluster (e.g. single
    chromosome copy, chromosome arm) together with the matching pairwise
    genomic distances.

    Params:
    -------
        cluster: reads of interest, dataframe
        genome: target genome, to retrieve constants, string

    Returns:
    --------
        R_pdist: condensed vector of pairwise euclidean distances
        P_pdist: condensed vector of pairwise genomic distances, in the same
                 pair order as R_pdist
    """
    KEYS = const.get_genome_keys(genome)

    # Spatial coordinates: one row per read, columns x, y, z
    coords = np.array(
        [cluster[KEYS[axis]].values for axis in ("x", "y", "z")]).T

    # Genomic positions as a single-column matrix so pdist accepts them
    positions = np.array(cluster[KEYS["pos"]].values)
    positions_col = positions[:, np.newaxis]

    R_pdist = distance.pdist(coords)
    P_pdist = distance.pdist(positions_col)
    return R_pdist, P_pdist
def make_genome_wide_matrix(cells, resolution=10 * 10**6, genome="hg38"):
    """
    Build a population-ensemble, genome-wide distance matrix from a list of
    single cells.

    For every cell and every chromosome pair (i, j) with i <= j, the matching
    reads are handed to populate_tile, which fills the corresponding tile of
    the matrix. For i == j each cluster of the chromosome (e.g. each homolog)
    is processed separately and only against itself.

    Params:
    -------
        cells: list of cells, dataframe
        resolution: matrix resolution, in basepairs
        genome: target genome, to retrieve constants

    Returns:
    --------
        GM: the genome wide distance matrix; each pixel is a list of
            distances observed at that corresponding pair of genomic
            positions, binned at the input resolution.
    """
    SIZES = const.get_genome_sizes(genome)  # chromosome sizes
    KEYS = const.get_genome_keys(genome)  # dataframe keys
    n_chr = len(SIZES)

    # One square matrix spanning the bins of all chromosomes
    bins_per_chr = get_bin_sizes(resolution, genome)
    GM = init_empty_matrix(np.sum(bins_per_chr))

    # NOTE(review): chromosome indices run from 1 to len(SIZES) - 1; whether
    # that covers every chromosome depends on how get_genome_sizes indexes
    # its result -- confirm.
    # NOTE(review): `genome` is not forwarded to populate_tile -- confirm
    # that its default matches the genome used here.
    for cell in cells:
        for i in range(1, n_chr):
            for j in range(i, n_chr):
                if i != j:
                    # Two different chromosomes: use all reads of each
                    reads_i = cell.loc[cell[KEYS["chr"]] == i]
                    reads_j = cell.loc[cell[KEYS["chr"]] == j]
                    if len(reads_i) == 0 or len(reads_j) == 0:
                        continue
                    GM = populate_tile(GM, i, j, reads_i, reads_j, resolution)
                    continue

                # Same chromosome: treat each cluster (e.g. homolog) on its
                # own and record only intra-cluster distances
                chro = cell.loc[cell[KEYS["chr"]] == i]
                for label in chro[KEYS["cluster"]].unique():
                    copy_reads = chro.loc[chro[KEYS["cluster"]] == label]
                    # Only empty selections are skipped; a cluster with a
                    # single read is still passed on.
                    if len(copy_reads) == 0:
                        continue
                    GM = populate_tile(GM, i, j, copy_reads, copy_reads,
                                       resolution)

    return GM
def get_cell_clusters(cell, chr_nums, genome="hg38"):
    """
    Collect the clusters (i.e. single chromosome copies) of a single cell for
    a list of chromosome numbers.

    For "mm10" every cluster of each requested chromosome is returned. For
    "hg38" a chromosome contributes its two largest clusters when it has two
    or more clusters, and contributes nothing when it has exactly one.

    Params:
    -------
        cell: cell of interest, dataframe
        chr_nums: chromosomes of interest, list of ints
        genome: target genome, to retrieve constants, string

    Returns:
    --------
        cell_clusters: list of dataframes corresponding to single chr copies

    Raises:
    -------
        ValueError: if genome is neither "mm10" nor "hg38" and at least one
                    requested chromosome has reads in the cell
    """
    KEYS = const.get_genome_keys(genome)
    cell_clusters = []

    for chr_num in chr_nums:
        chro = cell.loc[cell[KEYS["chr"]] == chr_num]
        if len(chro) == 0:
            continue  # e.g. x chromosome not present

        cluster_nums = chro[KEYS["cluster"]].unique()
        if genome not in ("mm10", "hg38"):
            raise ValueError("Genome not found.")

        # One dataframe per cluster label, in order of first appearance
        copies = [
            chro.loc[chro[KEYS["cluster"]] == label] for label in cluster_nums
        ]

        if genome == "mm10":
            cell_clusters.extend(copies)
            continue

        # hg38 (fibroblast data): keep only the two largest clusters, the
        # putative chromosome territories; smaller clusters are outliers.
        # A chromosome with a single cluster is dropped entirely.
        if len(copies) > 1:
            by_size = sorted(copies, key=len, reverse=True)
            cell_clusters.extend(by_size[:2])

    return cell_clusters
def center_cell(cell, origin, dim=3, genome='hg38'):
    """
    Translate a cell so that `origin` becomes the new coordinate origin.

    The first `dim` coordinate columns (taken from the genome's 'dim' key
    list) are shifted by the matching component of `origin`. The input
    dataframe is modified in place and also returned.

    Params:
    -------
        cell: target cell, dataframe
        origin: spatial coordinates of the new origin, indexable by axis
        dim: number of dimensions to translate
        genome: target genome to retrieve constants

    Returns:
    --------
        cell: the same dataframe, translated
    """
    KEYS = const.get_genome_keys(genome)

    # Shift whole columns at once instead of writing cell-by-cell while
    # iterating the frame: pandas does not support modifying a frame during
    # iterrows, and label-based scalar writes hit every row sharing a
    # duplicated index label.
    for axis in range(dim):
        column = KEYS['dim'][axis]
        cell[column] = cell[column] - origin[axis]

    return cell
def make_distance_matrix(cluster,
                         resolution=2.5 * 10**6,
                         statistic=np.nanmean,
                         flatten=True,
                         genome="mm10"):
    """
    Generate a binned spatial distance matrix from a chromosome cluster.

    Reads are binned by genomic position at the given resolution and the
    spatial distance of every read pair is appended to the two symmetric
    pixels of its bin pair. When both reads of a pair fall in the same bin,
    the distance is therefore appended twice to that diagonal pixel.

    Params:
    -------
        cluster: pandas dataframe with the reads of one chromosome cluster
        resolution: matrix resolution in base pairs
        statistic: function passed to flatten_matrix to reduce each pixel's
                   list of distances
        flatten: if true, reduce each pixel with `statistic`; if false,
                 return the matrix of variable-length distance lists
        genome: human or mouse genome, string

    Returns:
    --------
        A: distance matrix as defined by the statistic, or the matrix of
           per-pixel distance lists when flatten is false

    Raises:
    -------
        ValueError: if the cluster holds reads from more than one
                    chromosome, or its chromosome label is not an integer
    """
    SIZES = const.get_genome_sizes(genome)
    KEYS = const.get_genome_keys(genome)

    # A cluster must belong to exactly one chromosome. Count the distinct
    # labels: the type of the first label says nothing about how many
    # chromosomes are present.
    chr_nums = cluster[KEYS["chr"]].unique()
    if len(chr_nums) > 1:
        raise ValueError("Cluster includes multiple chromosomes.")
    chr_num = chr_nums[0]
    # Accept any integer width (int32, int64, Python int), not just int64
    if not isinstance(chr_num, (int, np.integer)):
        raise ValueError("Chromosome label must be an integer.")

    num_bins = int(np.ceil(SIZES[chr_num] / resolution))
    A = init_empty_matrix(num_bins)
    bin_edges = np.arange(0, num_bins + 1) * resolution  # in base pairs

    # Get spatial position vector and genomic position vector
    R = np.array([
        cluster[KEYS["x"]].values, cluster[KEYS["y"]].values,
        cluster[KEYS["z"]].values
    ]).T
    P = np.array(cluster[KEYS["pos"]].values)

    # Zero-based bin index of every read
    P_inds = np.digitize(P, bin_edges) - 1

    # Full square matrix of read-to-read spatial distances
    R_sf = distance.squareform(distance.pdist(R))

    # Append each pair's distance to both symmetric pixels of its bin pair
    n_reads = len(P_inds)
    for i in range(n_reads):
        ii = P_inds[i]
        for j in range(i + 1, n_reads):
            jj = P_inds[j]
            dist_ij = R_sf[i][j]
            A[ii][jj].append(dist_ij)
            A[jj][ii].append(dist_ij)

    if flatten:
        # Reduce each pixel's list to a single value
        A = flatten_matrix(A, func=statistic)

    return A