def get_radial_statistic(R, func=np.nanmean, genome='hg38'):
    """
    Reduce each chromosome's radial positions to a single summary value.

    Params:
    -------
        R: lists of radial positions, indexed by chromosome
        func: reducer applied to each chromosome's list of positions
        genome: target genome, used to look up the chromosome size table

    Returns:
    --------
        R_stat: array with one slot per entry of the genome size table;
                slot i holds func(R[i]), and slots past len(R) stay zero
    """
    n_slots = len(const.get_genome_sizes(genome))
    R_stat = np.zeros(n_slots)

    # Silence RuntimeWarnings from the reducer (e.g. np.nanmean on an
    # empty list) for the duration of the reduction.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        for slot, radii in enumerate(R):
            R_stat[slot] = func(radii)

    return R_stat
def get_bin_sizes(resolution, genome="hg38"):
    """
    Helper function to generate the number of bins each chromosome of a
    target genome is discretized into at a given bin resolution.

    Params:
    -------
        resolution: the resolution at which the chromosomes will be binned,
                    in basepairs
        genome: genome type used to look up chromosome sizes

    Returns:
    --------
        bin_sizes_chr: integer array with the same length as the genome size
                       table. Entry 0 is always 0; entry i (i >= 1) is
                       ceil(SIZES[i] / resolution).
    """
    SIZES = const.get_genome_sizes(genome)

    # Entry 0 of the size table is skipped and represented by a zero
    # placeholder, so chromosome i maps to index i of the result.
    bin_counts = [0]
    bin_counts.extend(
        int(np.ceil(SIZES[i] / resolution)) for i in range(1, len(SIZES))
    )

    # Build the array once instead of re-allocating it with np.append on
    # every iteration.
    return np.array(bin_counts)
def get_radial_dists(cells, genome='hg38'):
    """
    Collect the radial positions of all reads, grouped by chromosome.

    Params:
    -------
        cells: a list of single cells, each a dataframe with the columns
               "hg38_chr" and "norm_r_2D"
        genome: target genome, used to look up the chromosome size table

    Returns:
    --------
        R: list with one sub-list per entry of the genome size table;
           R[c] holds the "norm_r_2D" values of every read whose
           "hg38_chr" value is c, pooled over all cells
    """
    n_slots = len(const.get_genome_sizes(genome))
    R = [[] for _ in range(n_slots)]

    # NOTE(review): the column names below are fixed to the hg38 layout no
    # matter which `genome` is requested -- confirm this is intended.
    for cell in cells:
        chr_labels = cell["hg38_chr"].values
        radii = cell["norm_r_2D"].values
        for chr_label, radius in zip(chr_labels, radii):
            R[chr_label].append(radius)

    return R
def make_genome_wide_matrix(cells, resolution=10 * 10**6, genome="hg38"):
    """
    Make a population ensemble genome wide distance matrix given a list of
    single cells. For each cell, iterate over chromosome pairs
    (distinguishing between clusters, e.g. homologs, of the same
    chromosome), and hand the corresponding reads to populate_tile so their
    distances are added to the population matrix.

    Params:
    -------
        cells: list of cells, dataframe
        resolution: matrix resolution, in basepairs
        genome: target genome, to retrieve constants

    Returns:
    --------
        GM: the genome wide distance matrix, as returned by the last
            populate_tile call (or the empty matrix if nothing was added)
    """
    # Init constants
    SIZES = const.get_genome_sizes(genome)  # chromosome sizes
    KEYS = const.get_genome_keys(genome)  # dataframe keys
    chr_count = len(SIZES)
    chr_key, cluster_key = KEYS["chr"], KEYS["cluster"]

    # Total number of bins over all chromosomes
    bin_sizes_chr = get_bin_sizes(resolution, genome)
    total_bins = np.sum(bin_sizes_chr)

    # Init genome wide distance matrix
    GM = init_empty_matrix(total_bins)

    for cell in cells:
        # Split the cell's reads by chromosome once, instead of re-running
        # the same boolean filter for every (i, j) pair.
        # NOTE(review): assumes populate_tile does not modify the read
        # frames it is given -- confirm.
        chr_labels = cell[chr_key]
        reads_by_chr = {
            c: cell.loc[chr_labels == c] for c in range(1, chr_count)
        }

        for i in range(1, chr_count):
            ci = reads_by_chr[i]
            if len(ci) < 1:
                continue  # no reads on chromosome i: nothing to pair

            # Same chromosome: handle each cluster (e.g. homolog) separately
            # and only record intra-cluster distances.
            cluster_labels = ci[cluster_key]
            for idx in cluster_labels.unique():
                members = ci.loc[cluster_labels == idx]
                if len(members) < 1:
                    continue  # skip clusters without any reads
                GM = populate_tile(GM, i, i, members, members, resolution)

            # Different chromosomes: pair all reads of i with all reads of j
            for j in range(i + 1, chr_count):
                cj = reads_by_chr[j]
                if len(cj) < 1:
                    continue
                GM = populate_tile(GM, i, j, ci, cj, resolution)

    return GM
def make_ensemble_matrix(data, chr_num, resolution=2.5 * 10**6,
                         statistic=np.nanmean, genome="mm10"):
    """
    Build an ensemble distance matrix for a particular chromosome given a
    dataframe.

    Params:
    -------
        data: the dataframe (must be prefiltered on stage, parent, and chr);
              read through its "chr" and "cell_index" columns
        chr_num: desired chromosome number for ensemble distance
        resolution: matrix resolution in base pairs
        statistic: function to get distance metric
        genome: species corresponding to the data

    Returns:
    --------
        A_ensemble: the ensemble distance matrix.
    """
    # Collect the reads of the requested chromosome, one frame per cell.
    # Cells are visited in order of first appearance in `data`; cells
    # without reads on this chromosome are dropped.
    chr_data = data.loc[data["chr"] == chr_num]
    clusters = []
    for cell_index in data["cell_index"].unique():
        cluster = chr_data.loc[chr_data["cell_index"] == cell_index]
        if len(cluster) > 0:
            clusters.append(cluster)

    # Get matrix parameters
    SIZES = const.get_genome_sizes(genome)
    num_bins = int(np.ceil(SIZES[chr_num] / resolution))

    # Make the ensemble matrix by concatenating all the single cell matrices
    A_ensemble = init_empty_matrix(num_bins)
    for cluster in clusters:
        # Forward `genome` so the per-cell matrix is sized from the same
        # genome table as A_ensemble.
        A = make_distance_matrix(cluster, resolution=resolution,
                                 flatten=False, genome=genome)
        for i in range(len(A)):
            for j in range(len(A)):
                A_ensemble[i][j] = np.concatenate((A_ensemble[i][j], A[i][j]))

    # Flatten the matrix using distance metric function
    A_ensemble = flatten_matrix(A_ensemble, func=statistic)

    return A_ensemble
def make_distance_matrix(cluster, resolution=2.5 * 10**6,
                         statistic=np.nanmean, flatten=True, genome="mm10"):
    """
    Function to generate a distance matrix from a chromosome cluster.

    Params:
    -------
        cluster: pandas dataframe with chromosome cluster information
        resolution: matrix resolution in base pairs
        statistic: function implementing desired distance matrix
        flatten: If true, apply the distance function, if false, return
                 matrix with variable length
        genome: human or mouse genome, string

    Returns:
    --------
        A: distance matrix as defined by the statistic

    Raises:
    -------
        ValueError: if the cluster's "chr" column holds more than one
                    distinct value, or a non-integer value
        IndexError: if the cluster has no rows
    """
    # Validate on the set of distinct labels: inspecting only the type of
    # the first label cannot reveal a mixed-chromosome cluster.
    chr_values = cluster["chr"].unique()
    if len(chr_values) > 1:
        raise ValueError("Cluster includes multiple chromosomes.")
    chr_num = chr_values[0]
    # Accept any integer type (int32, int64, Python int, ...), not just
    # np.int64.
    if not isinstance(chr_num, (int, np.integer)):
        raise ValueError("Cluster chromosome label must be an integer.")

    SIZES = const.get_genome_sizes(genome)
    KEYS = const.get_genome_keys(genome)

    chr_size = SIZES[chr_num]
    num_bins = int(np.ceil(chr_size / resolution))

    A = init_empty_matrix(num_bins)
    B = np.arange(0, num_bins + 1)  # bin vector

    # Get spatial position vector and genomic position vector
    R = np.array([
        cluster[KEYS["x"]].values,
        cluster[KEYS["y"]].values,
        cluster[KEYS["z"]].values,
    ]).T
    P = np.array(cluster[KEYS["pos"]].values)

    # Bin position vector, then populate binned matrix from unbinned matrix
    # of pdists using binned indices
    P_inds = np.digitize(P, B * resolution) - 1
    R_pdist = distance.pdist(R)
    R_sf = distance.squareform(R_pdist)

    # Do the matrix binning: every unordered read pair is recorded in both
    # (ii, jj) and (jj, ii). A pair falling in one bin (ii == jj) is
    # therefore appended twice to that cell.
    for i in range(len(P_inds)):
        for j in range(i + 1, len(P_inds)):
            ii, jj = P_inds[i], P_inds[j]
            A[ii][jj].append((R_sf[i][j]))
            A[jj][ii].append((R_sf[i][j]))

    if flatten:
        # Now flatten lists into 2D matrix according to some statistic
        A = flatten_matrix(A, func=statistic)

    return A
def draw_genome_wide_matrix(A, xlabel="\nGenomic Coordinate [Mb]",
                            clabel='\nSpatial Distance [um]',
                            resolution=10 * 10**6, q=0.01, genome='hg38'):
    """
    Draw a genome wide distance matrix.

    Params:
    -------
        A: distance matrix
        xlabel, clabel: x-axis and colorbar labels
        resolution: matrix resolution in basepairs
        q: percentile cutoff for clim
        genome: target genome, to retrieve constants

    Returns:
    --------
        fig, ax: the figure and axes where the matrix is drawn
    """
    import copy  # local import: only needed to copy the colormap below

    fig, ax = plt.subplots()

    SIZES = const.get_genome_sizes(genome)  # chromosome sizes, bp
    chr_count = len(SIZES)

    # Bin sizes for matrix per chromosome. Forward `genome` so the layout
    # matches the requested genome rather than get_bin_sizes' default.
    bin_sizes_chr = get_bin_sizes(resolution, genome)
    total_bins = np.sum(bin_sizes_chr)
    sum_sizes = np.cumsum(bin_sizes_chr)  # bin offset where each chr starts

    clim = get_clims([A], q)  # clim = qth and 1-qth percentile values
    clim = (0, clim[1])  # start linear scale at 0

    # Draw outline around chromosome territories
    for i in range(2, chr_count + 1):
        offset0 = sum_sizes[i - 2]  # first bin of the territory
        offset1 = sum_sizes[i - 1]  # first bin after the territory
        ax.hlines(offset0 - 1, offset0, offset1, lw=1, color='black')
        ax.hlines(offset1 - 1, offset0, offset1, lw=1, color='black')
        ax.vlines(offset0, offset0 - 1, offset1 - 1, lw=1, color='black')
        ax.vlines(offset1, offset0 - 1, offset1 - 1, lw=1, color='black')

    # Work on a copy so the globally registered colormap is not modified.
    cmap = copy.copy(plt.get_cmap('seismic_r'))
    cmap.set_bad(color='lightgrey')  # handle unmapped regions

    # Draw the matrix
    cax = ax.imshow(A, cmap=cmap, interpolation='nearest')
    cax.set_clim(clim)
    fig.colorbar(cax, label=clabel)

    # Handle tick labels: chromosomes 1-8, X (23) and Y (24) are labelled,
    # 10 carries an ellipsis on the x axis, all others are left blank.
    x_tick_labels = ["Chr 1 "]
    y_tick_labels = ["1"]
    for i in range(2, chr_count):
        if i == 23:
            x_label = y_label = "X"
        elif i == 24:
            x_label = y_label = "Y"
        elif i < 9:
            x_label = y_label = str(i)
        elif i == 10:
            x_label, y_label = " ⋯ ", ""
        else:
            x_label = y_label = ""
        x_tick_labels.append(x_label)
        y_tick_labels.append(y_label)

    # There is one more tick (the closing edge of the last chromosome) than
    # labels; pad with blanks, since matplotlib rejects a label list whose
    # length differs from the number of fixed tick positions.
    missing = len(sum_sizes) - len(x_tick_labels)
    x_tick_labels.extend([""] * missing)
    y_tick_labels.extend([""] * missing)

    ax.set_xticks(sum_sizes)
    ax.set_yticks(sum_sizes)
    ax.set_xticklabels(x_tick_labels)
    ax.set_yticklabels(y_tick_labels)
    ax.set_xlabel(xlabel)

    ax.set_xlim(0, total_bins)
    ax.set_ylim(total_bins, 0)  # inverted so the first chromosome is on top

    plt.tight_layout()
    plt.show()

    return fig, ax