예제 #1
0
파일: bigwig.py 프로젝트: wkopp/CoolBox
    def fetch_scores(self, genome_range, num_bins, max_try_nums=5):
        """
        Return one score per bin for the requested genomic interval.

        pyBigWig occasionally raises while reading (apparently a memory
        corruption that shows up when trackPlot is driven from several
        processors). Re-opening the file cures it, so every failed read
        re-opens ``self.bw`` and tries again, up to ``max_try_nums`` times.

        Parameters
        ----------
        genome_range : {str, GenomeRange}
            Interval to summarise; converted with ``to_gr``.

        num_bins : int
            Number of bins requested from ``self.bw.stats``.

        max_try_nums : int
            Maximum number of read attempts. default 5

        Returns
        -------
        scores : numpy.ndarray
            Float array of bin scores. If every attempt fails, the
            zero-filled array of length ``num_bins`` is returned instead.
        """
        bin_scores = np.zeros(num_bins)
        region = to_gr(genome_range)
        if region.chrom not in self.bw.chroms():
            # The file does not know this name; let the range switch naming.
            region.change_chrom_names()

        attempt = 0
        while attempt < max_try_nums:
            attempt += 1
            try:
                raw_stats = self.bw.stats(region.chrom, region.start, region.end,
                                          nBins=num_bins)
                bin_scores = np.array(raw_stats).astype(float)
            except Exception as e:
                # Re-open the handle before the next attempt.
                import pyBigWig
                self.bw = pyBigWig.open(self.properties['file'])

                log.warning(
                    "error found while reading bigwig scores ({}).\nTrying again. Iter num: {}"
                    .format(e, attempt))
                continue

            if attempt > 1:
                log.warning("After {} the scores could be computed".format(
                    attempt))
            break
        return bin_scores
예제 #2
0
 def fetch_pixels(self, genome_range, threshold=1e-4, resolution=None):
     """
     Build a table of matrix cells whose fetched value is <= ``threshold``.

     Only cells with row index <= column index are kept. Bin coordinates
     are derived from the range start snapped down to a multiple of the
     fetched bin size.

     Parameters
     ----------
     genome_range : {str, GenomeRange}
         Region passed to ``self.fetch_matrix``.

     threshold : float
         Upper bound (inclusive) on the matrix value. default 1e-4

     resolution : optional
         Forwarded to ``self.fetch_matrix``.

     Returns
     -------
     df : pandas.DataFrame
         Columns: chrom, start1, end1, start2, end2, value1, value2, qvalue.
     """
     region = to_gr(genome_range)
     pval_matrix = self.fetch_matrix(genome_range, resolution)
     # Read only after fetch_matrix has run (presumably it refreshes
     # these attributes -- confirm against fetch_matrix).
     first_mat, second_mat = self.mat1, self.mat2
     bin_width = self.properties['hic1'].fetched_binsize

     rows, cols = np.where(pval_matrix <= threshold)
     in_upper = rows <= cols
     rows = rows[in_upper]
     cols = cols[in_upper]

     origin = (region.start // bin_width) * bin_width
     row_starts = origin + rows * bin_width
     col_starts = origin + cols * bin_width

     table = {
         'chrom': region.chrom,
         'start1': row_starts,
         'end1': row_starts + bin_width,
         'start2': col_starts,
         'end2': col_starts + bin_width,
         'value1': first_mat[rows, cols],
         'value2': second_mat[rows, cols],
         'qvalue': pval_matrix[rows, cols],
     }
     return pd.DataFrame(table)
예제 #3
0
    def infer_binsize(self, genome_range1, genome_range2=None, **kwargs) -> int:
        """
        Ask a ``StrawWrap`` over this track's file for the bin size of a range.

        Parameters
        ----------
        genome_range1 : {str, GenomeRange}
            Range handed to ``StrawWrap.infer_binsize``.

        genome_range2 : optional
            Accepted but not used by this method.

        **kwargs
            Only ``resolution`` is read (default 'auto'); it is passed to
            ``StrawWrap`` as ``binsize``.
        """
        from coolbox.utilities.hic.wrap import StrawWrap

        requested = kwargs.get('resolution', 'auto')
        straw = StrawWrap(self.properties['file'],
                          normalization=self.balance,
                          binsize=requested)
        return straw.infer_binsize(to_gr(genome_range1))
예제 #4
0
    def fetch_pixels(self,
                     genome_range,
                     genome_range2=None,
                     balance=None,
                     resolution='auto',
                     join=True):
        """
        Fetch the Hi-C pixel table of a region through ``CoolerWrap``.

        Parameters
        ----------
        genome_range : {str, GenomeRange}
            Intervals within input chromosome range.

        genome_range2 : {str, GenomeRange}, optional
            Second range; forwarded as-is (``None``) when not given.

        balance : bool, optional
            Balance the matrix or not.
            default `self.is_balance`.

        resolution : {'auto', int}
            Resolution of the data, for example 5000.
            'auto' calculates the resolution automatically.
            default 'auto'

        join : bool
            Whether to expand the bin ID columns
            into (chrom, start, end).
            default True

        Returns
        -------
        pixels : pandas.core.frame.DataFrame
            Hi-C pixels table.
            The pixel table contains the non-zero upper triangle entries of the contact map.
        """
        from coolbox.utilities.hic.wrap import CoolerWrap

        first = to_gr(genome_range)
        second = None if genome_range2 is None else to_gr(genome_range2)

        cool_path = self.properties['file']
        use_balance = self.is_balance if balance is None else balance
        cooler_wrap = CoolerWrap(cool_path, balance=use_balance, binsize=resolution)
        return cooler_wrap.fetch_pixels(first, second, join=join)
예제 #5
0
 def plot_joint(self, ax, genome_range1, genome_range2):
     """
     Draw the matrix between two genomic ranges on ``ax``.

     ``genome_range1`` spans the horizontal extent and ``genome_range2``
     the vertical one (top = start, bottom = end). The fetched matrix is
     stored on ``self.matrix`` and the axes on ``self.ax``.
     """
     self.ax = ax
     x_range = to_gr(genome_range1)
     y_range = to_gr(genome_range2)

     contact_map = self.fetch_matrix(x_range, y_range, resolution=self.resolution)
     # Stored before the colour range is read further down.
     self.matrix = contact_map

     colormap = self.__get_cmap()
     bounds = (x_range.start, x_range.end, y_range.end, y_range.start)
     image = ax.matshow(contact_map, cmap=colormap, extent=bounds, aspect='auto')

     low, high = self.matrix_val_range
     norm_cls = colors.LogNorm if self.norm == 'log' else colors.Normalize
     image.set_norm(norm_cls(vmin=low, vmax=high))

     self.__adjust_figure(x_range, y_range)
     self.__draw_cbar(image)
     self.plot_label()
예제 #6
0
    def fetch_data(self, gr: GenomeRange, **kwargs):
        """
        List the intervals stored near ``gr`` as (begin, end, data) tuples.

        The lookup window is the range widened by 10000 on each side, and
        the result is sorted. ``**kwargs`` is accepted but not used.
        """
        gr = to_gr(gr)
        if gr.chrom not in list(self.interval_tree):
            # Unknown chromosome key: let the range switch naming.
            gr.change_chrom_names()

        chrom_tree = self.interval_tree[gr.chrom]
        hits = chrom_tree[gr.start - 10000 : gr.end + 10000]

        records = []
        for hit in sorted(hits):
            records.append((hit.begin, hit.end, hit.data))
        return records
예제 #7
0
    def fetch_data(self, genome_range):
        """
        Return sorted (begin, end, data) tuples for intervals near a range.

        The query window is ``genome_range`` extended by 10000 on both sides.
        """
        query = to_gr(genome_range)
        if query.chrom not in list(self.interval_tree):
            # Unknown chromosome key: let the range switch naming.
            query.change_chrom_names()

        window = self.interval_tree[query.chrom][query.start - 10000:query.end + 10000]
        return [(itv.begin, itv.end, itv.data) for itv in sorted(window)]
예제 #8
0
 def load_range(self, genome_range):
     """
     Load the intervals of ``genome_range`` and record their score range.

     If the first load finds no valid interval, the range switches its
     chromosome naming and the load is repeated once; a second miss is
     only logged at debug level. ``self.score_range`` is always set to the
     (min, max) pair reported by the last load.
     """
     genome_range = to_gr(genome_range)
     n_valid, lowest, highest = self.__load(genome_range)
     if n_valid == 0:
         genome_range.change_chrom_names()
         n_valid, lowest, highest = self.__load(genome_range)
     # Still zero here only when the retry above also came back empty.
     if n_valid == 0:
         log.debug(
             "No valid intervals were found in file {} within range {}".
             format(self.properties['file'], str(genome_range)))
     self.score_range = (lowest, highest)
예제 #9
0
    def fetch_data(self, genome_range):
        """
        Return the sorted start positions of vertical lines near a range.

        The query window is ``genome_range`` widened by 1 on both sides.
        """
        query = to_gr(genome_range)
        if query.chrom not in list(self.vlines_intval_tree):
            # Unknown chromosome key: let the range switch naming.
            query.change_chrom_names()

        hits = self.vlines_intval_tree[query.chrom][query.start - 1:query.end + 1]
        return [hit.begin for hit in sorted(hits)]
예제 #10
0
    def fetch_pixels(self, gr, gr2=None, balance=None, **kwargs):
        """
        Fetch the Hi-C pixel table of a region through ``StrawWrap``.

        Parameters
        ----------
        gr : {str, GenomeRange}
            Intervals within input chromosome range.

        gr2 : {str, GenomeRange}, optional
            Intervals within input chromosome range2.

        balance : {bool, 'KR', 'VC', 'VC_SQRT'}, optional
            Matrix balance method.
            default `self.is_balance` (used when ``balance`` is None).

        resolution : {'auto', int}
            Passed through ``**kwargs``.
            Resolution of the data, for example 5000.
            'auto' calculates the resolution automatically.
            default 'auto'

        Returns
        -------
        pixels : pandas.core.frame.DataFrame
            Hi-C pixels table.
            The pixel table contains the non-zero upper triangle entries of the contact map.
        """
        from coolbox.utilities.hic.wrap import StrawWrap

        gr = to_gr(gr)
        if gr2 is not None:
            gr2 = to_gr(gr2)

        path = self.properties['file']
        # `balance` is a named parameter, so it can never show up inside
        # **kwargs. Looking it up there always fell back to the track
        # default and silently discarded the caller's choice.
        if balance is None:
            balance = self.is_balance
        wrap = StrawWrap(path,
                         normalization=balance,
                         binsize=kwargs.get('resolution', 'auto'))

        return wrap.fetch_pixels(gr, gr2)
예제 #11
0
    def fetch_pixels(self,
                     genome_range,
                     genome_range2=None,
                     balance=None,
                     resolution='auto'):
        """
        Fetch the Hi-C pixel table of a region through ``StrawWrap``.

        Parameters
        ----------
        genome_range : {str, GenomeRange}
            Intervals within input chromosome range.

        genome_range2 : {str, GenomeRange}, optional
            Second range; forwarded as-is (``None``) when not given.

        balance : {bool, 'KR', 'VC', 'VC_SQRT'}, optional
            Matrix balance method,
            default `self.balance`.

        resolution : {'auto', int}
            Resolution of the data, for example 5000.
            'auto' calculates the resolution automatically.
            default 'auto'

        Returns
        -------
        pixels : pandas.core.frame.DataFrame
            Hi-C pixels table.
            The pixel table contains the non-zero upper triangle entries of the contact map.
        """
        from coolbox.utilities.hic.wrap import StrawWrap

        first = to_gr(genome_range)
        second = None if genome_range2 is None else to_gr(genome_range2)

        hic_path = self.properties['file']
        method = self.balance if balance is None else balance
        straw = StrawWrap(hic_path, normalization=method, binsize=resolution)
        return straw.fetch_pixels(first, second)
예제 #12
0
 def fetch_scores(self, genome_range, num_bins, max_try_nums=5):
     """
     Return one float score per bin for the requested genomic interval.

     A read error is logged as a warning and an all-zero array of length
     ``num_bins`` is returned instead. ``max_try_nums`` is accepted but
     not used by this method.
     """
     result = np.zeros(num_bins)
     region = to_gr(genome_range)
     if region.chrom not in self.bw.chromsizes:
         # Unknown chromosome key: let the range switch naming.
         region.change_chrom_names()
     try:
         fetched = self.bw.fetch(region.chrom, region.start, region.end,
                                 num_bins)
         result = fetched.astype(float)
     except Exception as e:
         log.warning(f"error found while reading bigwig scores: {e}")
     return result
예제 #13
0
 def fetch_data(self, genome_range):
     """
     Return the bands overlapping a range as a DataFrame.

     Columns are chrom, start, end, name and gieStain; the last two come
     from the first two items of each interval's ``data``. Rows follow the
     sorted order of the intervals.
     """
     query = to_gr(genome_range)
     if query.chrom not in self.interval_tree:
         # Unknown chromosome key: let the range switch naming.
         query.change_chrom_names()

     chrom_tree = self.interval_tree[query.chrom]
     records = []
     for band in sorted(chrom_tree[query.start:query.end]):
         name, stain = band.data[:2]
         records.append([query.chrom, band.begin, band.end, name, stain])

     return pd.DataFrame(
         records, columns=['chrom', 'start', 'end', 'name', 'gieStain'])
예제 #14
0
    def fetch_intervals(self, genome_range, open_query=True):
        """
        Load the rows of a genomic range into a typed DataFrame.

        If nothing is found, the range switches its chromosome naming and
        the load is repeated once. Rows wider than ``self.fields`` get
        additional ``extra_<i>`` column names.

        Parameters
        ----------
        genome_range : {str, GenomeRange}
            Range to load; converted with ``to_gr``.

        open_query : bool
            Forwarded unchanged to ``self.load``. default True

        Returns
        -------
        df : pandas.DataFrame
            Empty (with the base field names) when both loads return no
            rows; otherwise the result of ``self.convert_type``.
        """
        gr = to_gr(genome_range)
        rows = self.load(gr, open_query)
        # Work on a copy: the in-place `+=` below would otherwise extend
        # the shared `self.fields` list and leak the extra column names
        # into every later call.
        fields = list(self.fields)
        if len(rows) == 0:
            gr.change_chrom_names()
            rows = self.load(gr, open_query)
        if len(rows) == 0:
            return pd.DataFrame([], columns=fields)

        first_row = rows[0]
        n_extra = len(first_row) - len(fields)
        if n_extra > 0:
            fields += [f"extra_{i}" for i in range(n_extra)]
        df = pd.DataFrame(rows, columns=fields)
        # The first row is handed over alongside the frame, as before.
        return self.convert_type(df, first_row)
예제 #15
0
 def fetch_data(self, genome_range):
     """
     Load the rows of a genomic range into a DataFrame.

     If nothing is found, the range switches its chromosome naming and the
     load is repeated once. A non-empty frame gets generic ``col_<i>``
     names, with the configured chromosome, position and p-value columns
     renamed to chrom, position and pval. An empty frame is returned
     without renaming.
     """
     gr = to_gr(genome_range)
     props = self.properties
     chrom_col = props['col_chrom']
     pos_col = props['col_pos']
     pval_col = props['col_pval']

     records = self.__load(gr)
     if len(records) == 0:
         gr.change_chrom_names()
         records = self.__load(gr)

     table = pd.DataFrame(records)
     if table.shape[0] == 0:
         return table

     names = [f'col_{i}' for i in range(table.shape[1])]
     # Applied in this order so that a later index wins on collision.
     for idx, label in ((chrom_col, "chrom"),
                        (pos_col, "position"),
                        (pval_col, "pval")):
         names[idx] = label
     table.columns = names
     return table
예제 #16
0
    def plot_arcs(self, ax, genome_range, intervals):
        """
        Draw one upper half-ellipse arc per interval on ``ax``.

        Each arc is centred on the interval midpoint at y=0 and is as wide
        as the interval. Its height comes from ``__get_height`` and its
        line width from ``__get_linewidth``. The x-limits are set to the
        genome range even when there is nothing to draw.

        Parameters
        ----------
        ax : matplotlib axes
            Axes to draw on.

        genome_range : {str, GenomeRange}
            Region used for the x-limits.

        intervals : List[Tuple(int, int, float)]
            List of intervals (start, end, score).
        """
        gr = to_gr(genome_range)
        max_height = self.properties.get('height', 1.0)
        alpha = self.properties.get('alpha', 1.0)
        self.__adjust_yaxis(ax, max_height)
        ax.set_xlim(gr.start, gr.end)
        color = self.properties['color']

        if len(intervals) == 0:
            return

        # Width of the widest interval, passed on to __get_height.
        max_diameter = max(itv[1] - itv[0] for itv in intervals)

        for start, end, score in intervals:
            line_width = self.__get_linewidth(score)
            diameter = end - start
            height = 2 * self.__get_height(max_height, max_diameter, diameter)
            center = (start + end) / 2
            # NOTE(review): a single point at (center, diameter), kept
            # from the original; presumably only there to feed the axes'
            # data limits -- confirm before removing.
            ax.plot([center], [diameter])
            # angle/theta1/theta2 are keyword-only in current matplotlib;
            # passing them positionally raises TypeError there. Keywords
            # work on older releases as well.
            arc = Arc(
                (center, 0),
                diameter,
                height,
                angle=0,
                theta1=0,
                theta2=180,
                color=color,
                alpha=alpha,
                lw=line_width,
            )
            ax.add_patch(arc)
예제 #17
0
 def fetch_data(self, genome_range):
     """Return the chromosome name of ``genome_range``."""
     region = to_gr(genome_range)
     return region.chrom