Exemplo n.º 1
0
    def calc_windowed_seg_sites(self, chrom=0, L=1e3, filt_rec=True, mask=None):
        """Calculate windowed estimates of segregating sites.

        Arguments:
            * chrom: identifier for the chromosome
            * L: length of independent locus
            * filt_rec: filter recombination
            * mask: bed file for the underlying mask

        """
        assert self.chrom_pos_dict is not None
        phys_pos = self.chrom_physpos_dict[chrom]
        rec_pos = self.chrom_pos_dict[chrom]
        weights = self.chrom_weight_dict[chrom]
        if filt_rec:
            # Keep only sites whose recombination-map position differs from
            # the next site's (drops runs of duplicated map positions).
            keep = np.flatnonzero(np.abs(rec_pos[:-1] - rec_pos[1:]))
            phys_pos = phys_pos[keep]
            rec_pos = rec_pos[keep]
            weights = weights[keep]
        if mask is not None:
            # Flag masked-out sites by setting their physical position to NaN.
            phys_pos = phys_pos.astype(np.float64)
            bed = pyranges.read_bed(mask)
            sites = PyRanges(chromosomes=chrom, starts=phys_pos, ends=(phys_pos + 1))
            overlap_frac = np.array(
                sites.coverage(bed).FractionOverlaps.astype(np.float32)
            )
            phys_pos[np.where(overlap_frac > 0.0)[0]] = np.nan
        # 1. Bin the (unmasked) physical positions into windows of length L
        valid = ~np.isnan(phys_pos)
        bins = np.arange(np.nanmin(phys_pos), np.nanmax(phys_pos), L)
        windowed_vars, bin_edges = np.histogram(
            phys_pos[valid], bins=bins, weights=weights[valid]
        )
        bin_edges = bin_edges.astype(np.uint32)
        # Interpolate the recombination positions at the physical window midpoints
        phys_to_rec = interpolate.interp1d(phys_pos, rec_pos)
        midpts = bin_edges[:-1] + (bin_edges[1:] - bin_edges[:-1]) / 2
        rec_midpts = phys_to_rec(midpts)
        # Per-window weights correcting for the masked-out fraction
        mask_weights = np.ones(rec_midpts.size)
        if mask is not None:
            # Mask must be a bedfile
            windows = PyRanges(
                chromosomes=chrom, starts=bin_edges[:-1], ends=bin_edges[1:]
            )
            frac_masked = np.array(
                windows.coverage(pyranges.read_bed(mask)).FractionOverlaps.astype(
                    np.float32
                )
            )
            # Scale counts up by the missing fraction; a fully-masked
            # window divides by zero and is marked NaN instead of inf.
            mask_weights = 1.0 / (1.0 - frac_masked)
            mask_weights[np.isinf(mask_weights)] = np.nan

        # Stack everything so downstream consumers get one array per chromosome
        self.chrom_total_dict[chrom] = np.vstack(
            [windowed_vars, bin_edges[1:], rec_midpts, mask_weights]
        )
Exemplo n.º 2
0
    def monte_carlo_corr_SA_SB_v2(
        self, L=1e3, dist=100, nreps=1000, chrom=0, seed=42, filt_rec=True, mask=None
    ):
        """Estimate the correlation using alternative Monte-Carlo Sampling.

        Key: this allows us to test much shorter length scales

        Arguments:
            * L: length (bp) of each sampled window
            * dist: physical gap (bp) between consecutive windows
            * nreps: number of Monte-Carlo resampling draws
            * chrom: identifier for the chromosome
            * seed: random seed (must be > 0)
            * filt_rec: drop sites with duplicated recombination positions
            * mask: bed file for the underlying mask

        Side effects: appends sampled (rec_dist, s1, s2) arrays into
        self.rec_dist / self.s1 / self.s2 keyed by chromosome.
        """
        assert self.chrom_physpos_dict is not None
        assert self.chrom_pos_dict is not None
        assert L > 0
        assert dist > 0
        assert seed > 0
        np.random.seed(seed)
        phys_pos = self.chrom_physpos_dict[chrom]
        rec_pos = self.chrom_pos_dict[chrom]
        weights = self.chrom_weight_dict[chrom]
        if filt_rec:
            # Keep only sites whose recombination position differs from the
            # next site's (removes duplicated map positions).
            diff = np.abs(rec_pos[:-1] - rec_pos[1:])
            idx = np.where(diff != 0)[0]
            phys_pos = phys_pos[idx]
            rec_pos = rec_pos[idx]
            weights = weights[idx]
        if mask is not None:
            # Mark masked-out sites as NaN so they are excluded from counting.
            phys_pos = phys_pos.astype(np.float64)
            df_mask = pyranges.read_bed(mask)
            df_pos = PyRanges(chromosomes=chrom, starts=phys_pos, ends=(phys_pos + 1))
            cov_sites = df_pos.coverage(df_mask)
            sites_idx = np.array(cov_sites.FractionOverlaps.astype(np.float32))
            idx = np.where(sites_idx > 0.0)[0]
            phys_pos[idx] = np.nan
        # 1. Setup bins separated by some distance
        startp = np.nanmin(phys_pos)
        endp = startp + L
        windowed_vars = []
        bins = []
        # Tile non-overlapping windows of length L, each separated by `dist` bp,
        # counting the variants falling inside each window.
        while endp < np.nanmax(phys_pos):
            bins.append((startp, endp))
            start = np.searchsorted(phys_pos[~np.isnan(phys_pos)], startp, "left")
            end = np.searchsorted(phys_pos[~np.isnan(phys_pos)], endp, "right")
            # Append this to actually weight the variants
            windowed_vars.append(end - start)
            startp += L + dist
            endp += L + dist
        windowed_vars = np.array(windowed_vars)
        # bin_edges interleaves window starts and ends:
        # [s0, e0, s1, e1, ...] with length 2 * len(bins).
        bin_edges = np.array(bins).ravel()
        #         print(bin_edges.size)
        #         print(windowed_vars.size, bin_edges.size)
        assert (bin_edges.size / 2) == windowed_vars.size
        # Interpolate the midpoints of the recombination bins
        f = interpolate.interp1d(phys_pos, rec_pos)
        # Genetic distance between the end of window i and the start of
        # window i+1 (strided slices pick alternating interleaved edges).
        rec_dist = f(bin_edges[2:-1:2]) - f(bin_edges[1:-2:2])
        #         print(np.mean(rec_dist))
        # Drop the last window so counts align with the inter-window distances.
        windowed_vars = windowed_vars[:-1]
        #         print(rec_dist.size, windowed_vars.size)

        # Calculate the weightings from the mask as needed ...
        mask_weights = np.ones(windowed_vars.size)
        if mask is not None:
            # Mask must be a bedfile
            # NOTE(review): bin_edges is interleaved starts/ends, so these
            # intervals include the gaps BETWEEN windows, and their count
            # (2*nwin - 1) differs from windowed_vars.size (nwin - 1) — the
            # multiplication below looks like it would fail to broadcast.
            # Confirm against a caller that actually passes a mask.
            df_windows = PyRanges(
                chromosomes=chrom, starts=bin_edges[:-1], ends=bin_edges[1:]
            )
            df_mask = pyranges.read_bed(mask)
            cov = df_windows.coverage(df_mask)
            mask_weights = np.array(cov.FractionOverlaps.astype(np.float32))
            # Set the mask weights to scale up the fraction that may be missing!
            mask_weights = 1.0 / (1.0 - mask_weights)
            mask_weights[np.isinf(mask_weights)] = np.nan

        # Scale raw window counts by the mask correction
        windowed_het_weighted = mask_weights * windowed_vars
        #         print(windowed_het_weighted.size)
        # Pair up alternating windows: s1s[i] and s2s[i] are adjacent windows.
        s1s = windowed_het_weighted[:-2:2]
        s2s = windowed_het_weighted[1:-1:2]
        #         print(rec_dist.size, s1s.size, s2s.size)
        assert s1s.size == s2s.size
        #         assert ((rec_dist.size  / 2) - 1) == s1s.size
        # Perform the Monte-Carlo resampling here
        idx = np.random.randint(s1s.size, size=nreps)
        s1s_samp = s1s[idx]
        s2s_samp = s2s[idx]
        # rec_dist is indexed at 2*idx to match the stride of the s1s slice.
        rec_dist_samp = rec_dist[2 * idx]
        # Lazily initialize the per-chromosome accumulators on first use.
        if self.rec_dist is None:
            self.rec_dist = {}
            self.s1 = {}
            self.s2 = {}
        if chrom in self.rec_dist:
            # Append to any samples accumulated by earlier calls.
            tmp_rec_dist = np.append(self.rec_dist[chrom], rec_dist_samp)
            tmp_s1 = np.append(self.s1[chrom], s1s_samp)
            tmp_s2 = np.append(self.s2[chrom], s2s_samp)
            self.rec_dist[chrom] = tmp_rec_dist
            self.s1[chrom] = tmp_s1
            self.s2[chrom] = tmp_s2
        else:
            self.rec_dist[chrom] = rec_dist_samp
            self.s1[chrom] = s1s_samp
            self.s2[chrom] = s2s_samp
Exemplo n.º 3
0
def compute_peaks_and_zscores(cvg, center, left, right, chip, background_sum,
                              ratios, ratio, args):
    """Call peaks, attach z-score p-values, and filter by local enrichment.

    Arguments:
        * cvg / center / left / right: per-strand ChIP coverage Rle dicts
        * chip: ChIP data passed through to the peak caller
        * background_sum: summed background coverage
        * ratios / ratio: per-strand ChIP/background normalization ratios
        * args: dict of options; reads args["min_enrichment"]

    Returns a dict mapping peak_type (1-based) to the annotated, filtered
    peaks for that type.
    """
    print("peaks and zscores")
    all_peaks, zs = _compute_peaks_and_zscores(cvg, center, left, right, chip,
                                               background_sum, ratios, ratio,
                                               args)
    print("peaks and zscores done")

    min_er = args["min_enrichment"]

    peaks_with_info = {}
    # peak_type is 1-based; zs is the parallel 0-based list of z-scores.
    for peak_type, peaks in enumerate(all_peaks, 1):

        # Max z-score per peak, keyed like zs (chromosome/strand).
        max_zs = {}
        for k, v in zs[peak_type - 1].items():
            max_zs[k] = np.array([max(v2[1]) for v2 in v])

        # Convert max z-scores to negative log10 p-values.
        result = {k: -(pnorm(v) / np.log(10)) for k, v in max_zs.items()}
        peaks.NLP = np.around(
            np.concatenate([result[k] for k in natsorted(result)]), 3)

        # Peak summit = midpoint of the interval. np.int_ replaces the
        # np.long alias (deprecated NumPy 1.20, removed 1.24); np.long was
        # builtin int, whose dtype resolution is exactly np.int_.
        peaks.Location = np.array(np.ceil((peaks.Start + peaks.End) / 2),
                                  dtype=np.int_)

        peaks.Type = peak_type

        # One-bp intervals at each summit, used to probe coverage there.
        peaks_loc = PyRanges(seqnames=peaks.Chromosome,
                             starts=peaks.Location,
                             ends=peaks.Location + 1,
                             strands=peaks.Strand)
        loc_cvg = peaks_loc.coverage()

        chip_cvg = loc_cvg * cvg
        bg_cvg = loc_cvg * background_sum

        # Normalized ChIP signal at the summits, per strand; +1 pseudocount
        # on both numerator and denominator guards against division by zero.
        peak_enrich_cvg_f = 1 + (ratio["+"] * chip_cvg["+"])
        peak_enrich_cvg_r = 1 + (ratio["-"] * chip_cvg["-"])
        # NOTE(review): this relies on .items() returning concatenable
        # lists; dict views would raise on `+` — confirm PyRles.items().
        peak_enrich_cvg = PyRles({
            k: v
            for k, v in list(peak_enrich_cvg_r.items() +
                             peak_enrich_cvg_f.items())
        })

        peak_enrich_ref = 1 + (bg_cvg)
        peak_enrich = peak_enrich_cvg / peak_enrich_ref

        vals_f = np.concatenate(
            [peak_enrich[k].values for k in peak_enrich["+"].keys()])
        vals_r = np.concatenate(
            [peak_enrich[k].values for k in peak_enrich["-"].keys()])
        vals_f = vals_f[np.isfinite(vals_f)]
        vals_r = vals_r[np.isfinite(vals_r)]

        # Only consider positions that are actually enriched (> 1).
        # NOTE(review): this filtering changes the array length; the
        # Enrichment assignments below assume it still matches the number
        # of peaks per strand — verify with real data.
        vals_f = vals_f[vals_f > 1]
        vals_r = vals_r[vals_r > 1]

        # The enrichment cutoff is fixed from the type-1 peaks and reused
        # for all later peak types.
        if peak_type == 1:
            min_er_f = np.percentile(vals_f, min_er * 100)
            min_er_r = np.percentile(vals_r, min_er * 100)

        vals_f = vals_f > min_er_f
        vals_r = vals_r > min_er_r

        peaks["+"].Enrichment = vals_f
        peaks["-"].Enrichment = vals_r

        peaks_loc["+"].Enrichment = vals_f
        peaks_loc["-"].Enrichment = vals_r

        # Keep only sufficiently-enriched peaks, then drop the helper column.
        peaks = peaks.apply(
            lambda df, _: df[df.Enrichment].drop("Enrichment", axis=1))
        peaks_loc = peaks_loc.apply(
            lambda df, _: df[df.Enrichment].drop("Enrichment", axis=1))
        peaks_loc.Start += 1
        peaks_loc.End += 1

        # Raw coverage at each surviving summit: center, left- and
        # right-flank, concatenated across non-empty chromosomes.
        chip_cvg = np.array(np.concatenate([
            cvg[k][peaks[k].Location] for k in cvg.keys()
            if not peaks[k].empty()
        ]),
                            dtype=np.int_)
        left_cvg = np.array(np.concatenate([
            left[k][peaks[k].Location] for k in left.keys()
            if not peaks[k].empty()
        ]),
                            dtype=np.int_)
        right_cvg = np.array(np.concatenate([
            right[k][peaks[k].Location] for k in right.keys()
            if not peaks[k].empty()
        ]),
                             dtype=np.int_)

        peaks.CVG = chip_cvg
        peaks.SURL = left_cvg
        peaks.SURR = right_cvg

        peaks.drop_empty()

        peaks_with_info[peak_type] = peaks

    return peaks_with_info