def fetch_scores(self, genome_range, num_bins, max_try_nums=5):
    """
    Return per-bin scores for a genome range, retrying on read errors.

    On rare occasions pyBigWig may throw an error, apparently caused by
    memory corruption when trackPlot is called from different processors.
    Reloading the file solves the problem, so every failed read re-opens
    the bigwig file before the next attempt.

    Parameters
    ----------
    genome_range : {str, GenomeRange}
        Region to query; normalised with `to_gr`.
    num_bins : int
        Number of bins requested from `self.bw.stats`.
    max_try_nums : int, optional
        Maximum number of read attempts. Default 5.

    Returns
    -------
    scores_per_bin : numpy.ndarray
        Float array of bin scores, or an all-zero array of length
        `num_bins` when no attempt succeeded.
    """
    attempt = 0
    # Fallback result, returned untouched if every attempt fails.
    scores_per_bin = np.zeros(num_bins)
    gr = to_gr(genome_range)
    if gr.chrom not in self.bw.chroms():
        # Chromosome not known to the file: try the alternative naming.
        gr.change_chrom_names()

    while attempt < max_try_nums:
        attempt += 1
        try:
            stats = self.bw.stats(gr.chrom, gr.start, gr.end, nBins=num_bins)
            scores_per_bin = np.array(stats).astype(float)
        except Exception as err:
            # Re-open the file handle before retrying.
            import pyBigWig
            self.bw = pyBigWig.open(self.properties['file'])
            log.warning(
                "error found while reading bigwig scores ({}).\nTrying again. Iter num: {}"
                .format(err, attempt))
            continue

        if attempt > 1:
            log.warning("After {} the scores could be computed".format(attempt))
        break

    return scores_per_bin
def fetch_pixels(self, genome_range, threshold=1e-4, resolution=None):
    """
    Tabulate the upper-triangle matrix entries whose value is <= `threshold`.

    Parameters
    ----------
    genome_range : {str, GenomeRange}
        Region to query.
    threshold : float, optional
        Entries of the fetched matrix at or below this value are kept.
        Default 1e-4.
    resolution : optional
        Forwarded unchanged to `self.fetch_matrix`.

    Returns
    -------
    pandas.DataFrame
        Columns: chrom, start1, end1, start2, end2, value1, value2, qvalue.
        `qvalue` holds the values of the matrix returned by `fetch_matrix`.
    """
    gr = to_gr(genome_range)
    pvals = self.fetch_matrix(genome_range, resolution)
    # Read only after fetch_matrix has run.
    # NOTE(review): presumably fetch_matrix refreshes self.mat1/self.mat2 - confirm.
    first = self.mat1
    second = self.mat2
    binsize = self.properties['hic1'].fetched_binsize

    rows, cols = np.where(pvals <= threshold)
    # Keep the diagonal and everything above it (row index <= column index).
    upper = rows <= cols
    rows = rows[upper]
    cols = cols[upper]

    # Snap the region start down to a multiple of the bin size.
    origin = (gr.start // binsize) * binsize
    start1 = origin + rows * binsize
    start2 = origin + cols * binsize

    return pd.DataFrame({
        'chrom': gr.chrom,
        'start1': start1,
        'end1': start1 + binsize,
        'start2': start2,
        'end2': start2 + binsize,
        'value1': first[rows, cols],
        'value2': second[rows, cols],
        'qvalue': pvals[rows, cols],
    })
def infer_binsize(self, genome_range1, genome_range2=None, **kwargs) -> int:
    """
    Ask a `StrawWrap` over this track's file for the bin size of a region.

    Parameters
    ----------
    genome_range1 : {str, GenomeRange}
        Region used for the inference.
    genome_range2 : optional
        Accepted for signature compatibility; not used here.
    **kwargs
        Only `resolution` is read (default 'auto') and passed as `binsize`.

    Returns
    -------
    The value returned by `StrawWrap.infer_binsize`.
    """
    from coolbox.utilities.hic.wrap import StrawWrap

    wrapper = StrawWrap(
        self.properties['file'],
        normalization=self.balance,
        binsize=kwargs.get('resolution', 'auto'),
    )
    return wrapper.infer_binsize(to_gr(genome_range1))
def fetch_pixels(self, genome_range, genome_range2=None, balance=None, resolution='auto', join=True):
    """
    Fetch the Hi-C pixel table for a region through `CoolerWrap`.

    Parameters
    ----------
    genome_range : {str, GenomeRange}
        Intervals within input chromosome range.
    genome_range2 : {str, GenomeRange}, optional
        Second range; passed through as None when not given.
    balance : bool, optional
        Balance the matrix or not. Falls back to `self.is_balance` when None.
    resolution : {'auto', int}
        Resolution of the data, for example 5000. 'auto' calculates the
        resolution automatically. Default 'auto'.
    join : bool
        Whether to expand the bin ID columns into (chrom, start, end).
        Default True.

    Returns
    -------
    pixels : pandas.core.frame.DataFrame
        Hi-C pixels table. The pixel table contains the non-zero upper
        triangle entries of the contact map.
    """
    from coolbox.utilities.hic.wrap import CoolerWrap

    gr1 = to_gr(genome_range)
    gr2 = None if genome_range2 is None else to_gr(genome_range2)
    cool_path = self.properties['file']
    use_balance = self.is_balance if balance is None else balance
    cooler_wrap = CoolerWrap(cool_path, balance=use_balance, binsize=resolution)
    return cooler_wrap.fetch_pixels(gr1, gr2, join=join)
def plot_joint(self, ax, genome_range1, genome_range2):
    """
    Draw the matrix for two genome ranges on `ax` as an image.

    The fetched matrix is stored on `self.matrix`, shown with
    `ax.matshow`, colour-normalised (log scale when `self.norm == 'log'`,
    linear otherwise), and then the figure adjustment, colour bar and
    label helpers are invoked.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on; also stored as `self.ax`.
    genome_range1 : {str, GenomeRange}
        Range mapped to the horizontal extent of the image.
    genome_range2 : {str, GenomeRange}
        Range mapped to the vertical extent of the image.
    """
    self.ax = ax
    gr1 = to_gr(genome_range1)
    gr2 = to_gr(genome_range2)

    matrix = self.fetch_matrix(gr1, gr2, resolution=self.resolution)
    self.matrix = matrix

    image = ax.matshow(
        matrix,
        cmap=self.__get_cmap(),
        # Vertical extent is given as (end, start).
        extent=(gr1.start, gr1.end, gr2.end, gr2.start),
        aspect='auto',
    )

    # Read after self.matrix has been assigned.
    low, high = self.matrix_val_range
    norm_class = colors.LogNorm if self.norm == 'log' else colors.Normalize
    image.set_norm(norm_class(vmin=low, vmax=high))

    self.__adjust_figure(gr1, gr2)
    self.__draw_cbar(image)
    self.plot_label()
def fetch_data(self, gr: GenomeRange, **kwargs):
    """
    Collect the intervals stored around a genome range.

    The query window is widened by 10000 on both sides of the range.

    Parameters
    ----------
    gr : {str, GenomeRange}
        Region to query; normalised with `to_gr`.
    **kwargs
        Accepted but not used.

    Returns
    -------
    list of tuple
        Sorted `(begin, end, data)` triples, one per interval found.
    """
    gr = to_gr(gr)
    known_chroms = list(self.interval_tree)
    if gr.chrom not in known_chroms:
        # Try the alternative chromosome naming before the lookup.
        gr.change_chrom_names()

    window = self.interval_tree[gr.chrom][gr.start - 10000:gr.end + 10000]
    regions = []
    for itv in sorted(window):
        regions.append((itv.begin, itv.end, itv.data))
    return regions
def fetch_data(self, genome_range):
    """
    Return the intervals stored around `genome_range`.

    The lookup window extends 10000 beyond each end of the range. When
    the chromosome is not a key of `self.interval_tree`, the range's
    chromosome name is switched via `change_chrom_names` first.

    Parameters
    ----------
    genome_range : {str, GenomeRange}
        Region to query.

    Returns
    -------
    list of tuple
        Sorted `(begin, end, data)` triples.
    """
    gr = to_gr(genome_range)
    if gr.chrom not in list(self.interval_tree):
        gr.change_chrom_names()
    hits = self.interval_tree[gr.chrom][gr.start - 10000:gr.end + 10000]
    return [(hit.begin, hit.end, hit.data) for hit in sorted(hits)]
def load_range(self, genome_range):
    """
    Load the intervals of a genome range and record their score range.

    If the first load reports zero valid intervals, the chromosome name
    is switched via `change_chrom_names` and the load is repeated; a
    debug message is logged when the second load is empty as well.

    Parameters
    ----------
    genome_range : {str, GenomeRange}
        Region to load.

    Side effects
    ------------
    Sets `self.score_range` to the `(min_score, max_score)` pair of the
    last load performed.
    """
    gr = to_gr(genome_range)
    n_valid, lowest, highest = self.__load(gr)
    if n_valid == 0:
        gr.change_chrom_names()
        n_valid, lowest, highest = self.__load(gr)
        if n_valid == 0:
            log.debug(
                "No valid intervals were found in file {} within range {}".format(
                    self.properties['file'], str(gr)))
    self.score_range = (lowest, highest)
def fetch_data(self, genome_range):
    """
    Return the start positions of the vertical lines around a range.

    The lookup window extends one position beyond each end of the range.

    Parameters
    ----------
    genome_range : {str, GenomeRange}
        Region to query.

    Returns
    -------
    list
        `begin` of every interval found, in sorted interval order.
    """
    gr = to_gr(genome_range)
    if gr.chrom not in list(self.vlines_intval_tree):
        # Try the alternative chromosome naming before the lookup.
        gr.change_chrom_names()
    hits = self.vlines_intval_tree[gr.chrom][gr.start - 1:gr.end + 1]
    return [hit.begin for hit in sorted(hits)]
def fetch_pixels(self, gr, gr2=None, balance=None, **kwargs):
    """
    Fetch the Hi-C pixel table for a region through `StrawWrap`.

    Parameters
    ----------
    gr : {str, GenomeRange}
        Intervals within input chromosome range.
    gr2 : {str, GenomeRange}, optional
        Intervals within input chromosome range2.
    balance : {bool, 'KR', 'VC', 'VC_SQRT'}, optional
        Matrix balance method. Falls back to `self.is_balance` when None.
    resolution : {'auto', int}
        Resolution of the data, for example 5000. 'auto' calculates the
        resolution automatically. Read from `kwargs`; default 'auto'.

    Returns
    -------
    pixels : pandas.core.frame.DataFrame
        Hi-C pixels table. The pixel table contains the non-zero upper
        triangle entries of the contact map.
    """
    from coolbox.utilities.hic.wrap import StrawWrap

    gr = to_gr(gr)
    if gr2 is not None:
        gr2 = to_gr(gr2)
    path = self.properties['file']
    # `balance` is a named parameter, so it can never show up in `kwargs`;
    # honour the explicit argument and only fall back to the track default.
    if balance is None:
        balance = self.is_balance
    wrap = StrawWrap(path, normalization=balance,
                     binsize=kwargs.get('resolution', 'auto'))
    return wrap.fetch_pixels(gr, gr2)
def fetch_pixels(self, genome_range, genome_range2=None, balance=None, resolution='auto'):
    """
    Fetch the Hi-C pixel table for a region through `StrawWrap`.

    Parameters
    ----------
    genome_range : {str, GenomeRange}
        Intervals within input chromosome range.
    genome_range2 : {str, GenomeRange}, optional
        Second range; passed through as None when not given.
    balance : {bool, 'KR', 'VC', 'VC_SQRT'}, optional
        Matrix balance method. Falls back to `self.balance` when None.
    resolution : {'auto', int}
        Resolution of the data, for example 5000. 'auto' calculates the
        resolution automatically. Default 'auto'.

    Returns
    -------
    pixels : pandas.core.frame.DataFrame
        Hi-C pixels table. The pixel table contains the non-zero upper
        triangle entries of the contact map.
    """
    from coolbox.utilities.hic.wrap import StrawWrap

    gr1 = to_gr(genome_range)
    gr2 = None if genome_range2 is None else to_gr(genome_range2)
    hic_path = self.properties['file']
    normalization = self.balance if balance is None else balance
    straw = StrawWrap(hic_path, normalization=normalization, binsize=resolution)
    return straw.fetch_pixels(gr1, gr2)
def fetch_scores(self, genome_range, num_bins, max_try_nums=5):
    """
    Fetch bin scores within the input chromosome range.

    Parameters
    ----------
    genome_range : {str, GenomeRange}
        Region to query.
    num_bins : int
        Number of bins requested from `self.bw.fetch`.
    max_try_nums : int, optional
        Accepted for signature compatibility; not used here.

    Returns
    -------
    numpy.ndarray
        Float scores, or an all-zero array of length `num_bins` when the
        read raised (the error is logged as a warning).
    """
    fallback = np.zeros(num_bins)
    gr = to_gr(genome_range)
    if gr.chrom not in self.bw.chromsizes:
        # Try the alternative chromosome naming before reading.
        gr.change_chrom_names()
    try:
        return self.bw.fetch(gr.chrom, gr.start, gr.end, num_bins).astype(float)
    except Exception as err:
        log.warning(f"error found while reading bigwig scores: {err}")
        return fallback
def fetch_data(self, genome_range):
    """
    Build a table of the bands overlapping a genome range.

    Parameters
    ----------
    genome_range : {str, GenomeRange}
        Region to query.

    Returns
    -------
    pandas.DataFrame
        Columns: chrom, start, end, name, gieStain. One row per interval
        found, in sorted interval order; `name` and `gieStain` are the
        first two items of the interval's `data`.
    """
    gr = to_gr(genome_range)
    if gr.chrom not in self.interval_tree:
        # Try the alternative chromosome naming before the lookup.
        gr.change_chrom_names()
    chrom = gr.chrom
    overlapping = self.interval_tree[chrom][gr.start:gr.end]

    records = []
    for band in sorted(overlapping):
        name, stain = band.data[:2]
        records.append([chrom, band.begin, band.end, name, stain])

    return pd.DataFrame(
        records, columns=['chrom', 'start', 'end', 'name', 'gieStain'])
def fetch_intervals(self, genome_range, open_query=True):
    """
    Load the rows of a genome range into a typed DataFrame.

    If the first load is empty, the chromosome name is switched via
    `change_chrom_names` and the load is repeated.

    Parameters
    ----------
    genome_range : {str, GenomeRange}
        Region to query.
    open_query : bool, optional
        Forwarded unchanged to `self.load`. Default True.

    Returns
    -------
    pandas.DataFrame
        Columns are `self.fields`, followed by `extra_0`, `extra_1`, ...
        when the rows carry more values than there are field names.
        An empty frame with the `self.fields` columns when nothing loads.
    """
    gr = to_gr(genome_range)
    rows = self.load(gr, open_query)
    # Work on a copy: extending the names below with `+=` must not grow
    # `self.fields` in place, which would leak into later calls.
    fields = list(self.fields)
    if len(rows) == 0:
        gr.change_chrom_names()
        rows = self.load(gr, open_query)
    if len(rows) == 0:
        return pd.DataFrame([], columns=fields)

    first_row = rows[0]
    n_extra = len(first_row) - len(fields)
    if n_extra > 0:
        # Name the columns that have no declared field.
        fields += [f"extra_{i}" for i in range(n_extra)]
    df = pd.DataFrame(rows, columns=fields)
    return self.convert_type(df, first_row)
def fetch_data(self, genome_range):
    """
    Load the rows of a genome range into a DataFrame with named key columns.

    Columns are named `col_<i>` by position, except the positions given
    by the `col_chrom`, `col_pos` and `col_pval` properties, which are
    named "chrom", "position" and "pval". An empty frame is returned
    without renaming.

    Parameters
    ----------
    genome_range : {str, GenomeRange}
        Region to query.

    Returns
    -------
    pandas.DataFrame
    """
    gr = to_gr(genome_range)
    props = self.properties
    ix_chrom = props['col_chrom']
    ix_pos = props['col_pos']
    ix_pval = props['col_pval']

    rows = self.__load(gr)
    if len(rows) == 0:
        # Nothing found: retry with the alternative chromosome naming.
        gr.change_chrom_names()
        rows = self.__load(gr)

    df = pd.DataFrame(rows)
    n_rows, n_cols = df.shape
    if n_rows > 0:
        names = [f'col_{i}' for i in range(n_cols)]
        # Applied in this order, so a later label wins on a shared index.
        for index, label in ((ix_chrom, "chrom"), (ix_pos, "position"), (ix_pval, "pval")):
            names[index] = label
        df.columns = names
    return df
def plot_arcs(self, ax, genome_range, intervals):
    """
    Draw one upper half-ellipse arc per interval on `ax`.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on; its x-limits are set to the genome range.
    genome_range : {str, GenomeRange}
        Region being plotted.
    intervals : List[Tuple(int, int, float)]
        List of intervals (start, end, score). The score selects the
        line width; the interval length sets the arc width.
    """
    gr = to_gr(genome_range)
    max_height = self.properties.get('height', 1.0)
    alpha = self.properties.get('alpha', 1.0)
    self.__adjust_yaxis(ax, max_height)
    ax.set_xlim(gr.start, gr.end)
    color = self.properties['color']

    if len(intervals) == 0:
        return

    # Length of the longest interval; passed to the height helper.
    max_diameter = max(itv[1] - itv[0] for itv in intervals)

    for start, end, score in intervals:
        line_width = self.__get_linewidth(score)
        diameter = end - start
        height = 2 * self.__get_height(max_height, max_diameter, diameter)
        center = (start + end) / 2
        # NOTE(review): plots a single unstyled point; presumably kept so
        # the axes' data limits include it - confirm before removing.
        ax.plot([center], [diameter])
        # angle/theta1/theta2 are passed by keyword: recent matplotlib
        # versions no longer accept them positionally.
        arc = Arc(
            (center, 0), diameter, height,
            angle=0, theta1=0, theta2=180,
            color=color,
            alpha=alpha,
            lw=line_width,
        )
        ax.add_patch(arc)
def fetch_data(self, genome_range):
    """
    Return the chromosome name of `genome_range`.

    Parameters
    ----------
    genome_range : {str, GenomeRange}
        Region to inspect; normalised with `to_gr`.

    Returns
    -------
    The `chrom` attribute of the normalised range.
    """
    gr = to_gr(genome_range)
    return gr.chrom