Пример #1
0
    def setUp(self):
        """
        Prepare the CRAM test fixtures (CRAM rather than BAM inputs) and a
        shared ``CountReadsPerBin`` instance stored in ``self.c``.

        The distribution of reads between the two files is as follows.

        They cover 200 bp::

              0                              100                           200
              |------------------------------------------------------------|
            A                                ==============>
                                                            <==============


            B                 <==============               ==============>
                                             ==============>
                                                            ==============>
        """
        self.chrom = '3R'

        # All fixture paths are built by plain concatenation onto ROOT.
        self.root = ROOT
        self.bamFile1 = self.root + "testA.cram"
        self.bamFile2 = self.root + "testB.cram"
        self.bamFile_PE = self.root + "test_paired2.cram"

        # Bins of 25 bp taken every 50 bp over both single-end files.
        self.c = cr.CountReadsPerBin([self.bamFile1, self.bamFile2],
                                     binLength=25,
                                     stepSize=50)
Пример #2
0
def main(args=None):
    """
    Plot, for every sample, the normalized cumulative sum of its sorted
    per-bin read counts against the normalized bin rank, and optionally
    write the raw counts.

    Parameters
    ----------
    args : list, optional
        Command line arguments handed to ``process_args``.

    Raises
    ------
    SystemExit
        With status 1 when the sampled regions contain no reads at all.
    """
    import sys

    args = process_args(args)

    cr = countR.CountReadsPerBin(args.bamfiles,
                                 args.binSize,
                                 args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude)

    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        # sys.exit is always available; the bare ``exit`` helper is only
        # injected by the ``site`` module.
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    total = len(num_reads_per_bin[:, 0])
    x = np.arange(total).astype('float') / total  # normalize from 0 to 1

    # One curve per sample (column of the count matrix).
    for idx, reads in enumerate(num_reads_per_bin.T):
        count = np.cumsum(np.sort(reads))
        count = count / count[-1]  # normalize y from 0 to 1
        plt.plot(x, count, label=args.labels[idx])

    # The axis labels are the same for every curve, so set them once.
    plt.xlabel('rank')
    plt.ylabel('fraction w.r.t. bin with highest coverage')
    plt.legend(loc='upper left')
    plt.suptitle(args.plotTitle)
    # set the plotFileFormat explicitly to None to trigger the
    # format from the file-extension
    if not args.plotFileFormat:
        args.plotFileFormat = None

    plt.savefig(args.plotFile.name, bbox_inches=0, format=args.plotFileFormat)

    if args.outRawCounts:
        # Header line with the quoted labels, then one tab-separated row of
        # integer counts per bin.
        args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
        fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
        for row in num_reads_per_bin:
            args.outRawCounts.write(fmt % tuple(row))
Пример #3
0
    def test_count_reads_in_region_extension_1(self):
        """
        Count reads with an ``extendReads`` value of 25.

        In this case the read extension is smaller than the read length,
        so extension is turned off and a warning is printed.
        """
        counter = cr.CountReadsPerBin([self.bamFile1, self.bamFile2],
                                      binLength=1,
                                      stepSize=50,
                                      extendReads=25)
        self.c = counter

        # One row per sampled bin, one column per input file.
        expected = np.array([[0, 0.], [0, 1.], [1, 1.], [1, 2.]])

        resp, _ = self.c.count_reads_in_region(self.chrom, 0, 200)

        nt.assert_equal(resp, expected)
Пример #4
0
    def test_bed_file(self):
        """
        Count reads of the second file over three regions given in a
        temporary BED file and compare against the expected counts.
        """
        import os
        import tempfile

        bed = "chr3R\t0\t10\nchr3R\t110\t120\nchr3R\t160\t180"

        # delete=False keeps the file on disk after closing so that it can
        # be handed over by name; the handle is closed on leaving the block.
        with tempfile.NamedTemporaryFile(suffix=".bed",
                                         delete=False,
                                         mode="w") as bed_file:
            bed_file.write(bed)

        try:
            self.c = cr.CountReadsPerBin([self.bamFile2],
                                         bedFile=[bed_file.name])

            resp = self.c.run()
            nt.assert_equal(resp, np.array([[0.], [1.], [2.]]))
        finally:
            # Remove the temporary file even when the run or the assertion
            # fails, so failing tests do not leave files behind.
            os.unlink(bed_file.name)
Пример #5
0
def calculate_frip(bam, peakfile):
    '''Calculate the fraction of reads in peaks (FRiP) for a bam file.

    Parameters
    ----------
    bam : str
        Path of the alignment file.
    peakfile : str
        Path of the peak file, handed to ``CountReadsPerBin`` as ``bedFile``.

    Returns
    -------
    float or str
        Reads counted in the peak regions divided by the number of mapped
        reads of ``bam``, or the string "NA" when the peak file has fewer
        than 10 lines.
    '''
    # Count the lines with a context manager so the handle is closed
    # right away instead of being left to the garbage collector.
    with open(peakfile) as peaks:
        num_lines = sum(1 for _ in peaks)

    if num_lines < 10:
        return "NA"

    # access deeptools function to get reads in peaks
    cr = crpb.CountReadsPerBin([bam], bedFile=peakfile, numberOfProcessors=12)
    rip = cr.run()
    total = rip.sum(axis=0)

    # read alignments with pysam; close the file once the count is known
    with pysam.AlignmentFile(bam) as b1:
        mapped = b1.mapped

    # calculate fraction of reads in peaks
    return float(total[0]) / mapped
Пример #6
0
def estimateScaleFactor(bamFilesList,
                        binLength,
                        numberOfSamples,
                        normalizationLength,
                        avg_method='median',
                        blackListFileName=None,
                        numberOfProcessors=1,
                        verbose=False,
                        chrsToSkip=[]):
    r"""
    Subdivides the genome into chunks to be analyzed in parallel
    using several processors. The code handles the creation of
    workers that compute fragment counts (coverage) for different
    regions and then collect and integrates the results.

    Parameters
    ----------
    bamFilesList : list
        list of bam files to normalize
    binLength : int
        the window size in bp, where reads are going to be
        counted.
    numberOfSamples : int
        number of sites to sample from the genome. For more info see
        the documentation of the CountReadsPerBin class
    normalizationLength : int
        length, in bp, to normalize the data.
        For a value of 1, on average
        1 read per base pair is found
    avg_method : str
        defines how the different values are to be summarized.
        The options are 'mean' and 'median'
    chrsToSkip : list
        name of the chromosomes to be excluded from the
        scale estimation. Usually the chrX is included.
    blackListFileName : str
        BED file containing blacklisted regions
    numberOfProcessors : int
        passed through to CountReadsPerBin
    verbose : bool
        passed through to CountReadsPerBin

    Returns
    -------
    dict
        Dictionary with the following keys::
            'size_factors'
            'size_factors_based_on_mapped_reads'
            'size_factors_SES'
            'size_factors_based_on_mean'
            'size_factors_based_on_median'
            'mean'
            'meanSES'
            'median'
            'reads_per_bin'
            'std'
            'sites_sampled'

    Raises
    ------
    AssertionError
        If ``bamFilesList`` does not contain exactly two files.
    SystemExit
        If counting the reads fails, or if the median coverage of a
        sample is zero.


    Examples
    --------
    >>> test = Tester()
    >>> bin_length = 50
    >>> num_samples = 4
    >>> _dict = estimateScaleFactor([test.bamFile1, test.bamFile2], bin_length, num_samples,  1)
    >>> _dict['size_factors']
    array([ 1. ,  0.5])
    >>> _dict['size_factors_based_on_mean']
    array([ 1. ,  0.5])
    """
    # Imported locally so the function does not rely on the ``exit``
    # helper that is only injected by the ``site`` module.
    import sys

    assert len(
        bamFilesList) == 2, "SES scale factors are only defined for 2 files"

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    mappedReads = [x.mapped for x in bamFilesHandlers]

    # Scale factors relative to the file with the fewest mapped reads.
    sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')

    sizeFactorBasedOnMappedReads = sizeFactorBasedOnMappedReads.min(
    ) / sizeFactorBasedOnMappedReads

    cr = countR.CountReadsPerBin(bamFilesList,
                                 binLength=binLength,
                                 numberOfSamples=numberOfSamples,
                                 extendReads=False,
                                 blackListFileName=blackListFileName,
                                 numberOfProcessors=numberOfProcessors,
                                 verbose=verbose,
                                 chrsToSkip=chrsToSkip)

    try:
        num_reads_per_bin = cr.run()
    except Exception as detail:
        sys.exit("*ERROR*: {}".format(detail))

    sitesSampled = len(num_reads_per_bin)

    # the transpose is taken to easily iterate by columns which are now
    # converted to rows
    num_reads_per_bin = num_reads_per_bin.transpose()
    # size factors based on order statistics
    # see Signal extraction scaling (SES) method in: Diaz et al (2012)
    # Normalization, bias correction, and peak calling for ChIP-seq.
    # Statistical applications in genetics and molecular biology, 11(3).

    # using the same names as in Diaz paper
    # p refers to ChIP, q to input

    p = np.sort(num_reads_per_bin[0, :]).cumsum()
    q = np.sort(num_reads_per_bin[1, :]).cumsum()

    # p[-1] and q[-1] are the last (total) values of the cumulative sums.
    # both p and q are normalized by this value
    diff = np.abs(p / p[-1] - q / q[-1])
    # get the lowest rank for which the difference is the maximum
    maxIndex = np.flatnonzero(diff == diff.max())[0]
    # Take a lower rank to move to a region with probably
    # less peaks and more background.
    maxIndex = int(maxIndex * 0.8)
    while (maxIndex < len(p)):
        # in rare cases the maxIndex maps to a zero value.
        # In such cases, the next index is used until
        # a non zero value appears.
        cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
        if cumSum.min() > 0:
            break
        maxIndex += 1

    # mean of the sorted counts below the selected rank, per file
    meanSES = [
        np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]),
        np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex])
    ]

    # SES factors: cumulative sums at the selected rank, scaled so that the
    # file with the smaller sum gets a factor of 1.
    sizeFactorsSES = cumSum.min() / cumSum
    median = np.median(num_reads_per_bin, axis=1)

    # consider only those read numbers that are below the 90
    # percentile to estimate the
    # mean and std
    mean = []
    std = []
    for values in num_reads_per_bin:
        maxNumReads = (np.percentile(values, 90))
        if maxNumReads == 0:
            maxNumReads = (np.percentile(values, 99))
            if maxNumReads == 0:
                # Both halves of the message are printed; previously the
                # second half was a stand-alone string that did nothing.
                print("all genomic regions sampled from one "
                      "of the bam files have no reads.\n")
                # NOTE(review): this filter only runs in the all-zero
                # branch although the comment above the loop suggests it
                # was meant for every sample. Kept here so that results
                # stay unchanged -- confirm the intended placement.
                values = values[values <= maxNumReads]

        mean.append(np.mean(values))
        std.append(np.std(values))

    mean = np.array(mean)
    readsPerBin = mean if avg_method == 'mean' else median

    if min(median) == 0:
        # report 1-based sample numbers
        idx_zero = [ix + 1 for ix, value in enumerate(median) if value == 0]
        sys.exit(
            "\n*ERROR*: The median coverage computed is zero for sample(s) #{}\n"
            "Try selecting a larger sample size or a region with coverage\n".
            format(idx_zero))

    sizeFactor = sizeFactorsSES
    return {
        'size_factors': sizeFactor,
        'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads,
        'size_factors_SES': sizeFactorsSES,
        'size_factors_based_on_mean': mean.min() / mean,
        'size_factors_based_on_median': median.min() / median,
        'mean': mean,
        'meanSES': meanSES,
        'median': median,
        'reads_per_bin': readsPerBin,
        'std': std,
        'sites_sampled': sitesSampled
    }
Пример #7
0
def main(args=None):
    """
    Compute per-base read coverage for the given bam files, print summary
    statistics per sample and, depending on the options, write coverage
    metrics, raw counts and a two-panel coverage plot.

    Parameters
    ----------
    args : list, optional
        Command line arguments handed to ``process_args``.

    Raises
    ------
    SystemExit
        If none of the output options was given, or if fewer than two
        bins were returned by the counting step.
    """
    args = process_args(args)

    if not args.outRawCounts and not args.plotFile and not args.outCoverageMetrics:
        sys.exit(
            "At least one of --plotFile, --outRawCounts and --outCoverageMetrics are required.\n"
        )

    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 bedFile=bed_regions,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 blackListFileName=args.blackListFileName,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude,
                                 minFragmentLength=args.minFragmentLength,
                                 maxFragmentLength=args.maxFragmentLength,
                                 bed_and_bin=True,
                                 out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = cr.run()

    if args.outCoverageMetrics and args.coverageThresholds:
        args.coverageThresholds.sort(
        )  # Galaxy in particular tends to give things in a weird order
        # The context manager closes the file even if a write fails.
        with open(args.outCoverageMetrics, "w") as of:
            of.write("Sample\tThreshold\tPercent\n")
            nbins = float(num_reads_per_bin.shape[0])
            for thresh in args.coverageThresholds:
                # number of bins per sample with coverage >= thresh
                vals = np.sum(num_reads_per_bin >= thresh, axis=0)
                for lab, val in zip(args.labels, vals):
                    of.write("{}\t{}\t{:6.3f}\n".format(lab, thresh,
                                                        100. * val / nbins))

    if args.outRawCounts:
        # prepend the labels to the generated file: read the content,
        # rewind and write header + content back
        header = "#plotCoverage --outRawCounts\n#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    if num_reads_per_bin.shape[0] < 2:
        sys.exit("ERROR: too few non-zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    if args.plotFile:
        if args.plotFileFormat == 'plotly':
            fig = go.Figure()
            fig['layout']['xaxis1'] = {
                'domain': [0.0, 0.48],
                'anchor': 'x1',
                'title': 'coverage (#reads per base)'
            }
            fig['layout']['xaxis2'] = {
                'domain': [0.52, 1.0],
                'anchor': 'x2',
                'title': 'coverage (#reads per base)'
            }
            fig['layout']['yaxis1'] = {
                'domain': [0.0, 1.0],
                'anchor': 'x1',
                'title': 'fraction of bases sampled'
            }
            fig['layout']['yaxis2'] = {
                'domain': [0.0, 1.0],
                'anchor': 'x2',
                'title': 'fraction of bases sampled >= coverage'
            }
            fig['layout'].update(title=args.plotTitle)
        else:
            fig, axs = plt.subplots(1,
                                    2,
                                    figsize=(args.plotWidth, args.plotHeight))
            plt.suptitle(args.plotTitle)

    # per-sample summary statistics (one value per column)
    num_reads_per_bin = num_reads_per_bin.astype(int)
    sample_mean = num_reads_per_bin.mean(axis=0)
    sample_std = num_reads_per_bin.std(axis=0)
    sample_max = num_reads_per_bin.max(axis=0)
    sample_min = num_reads_per_bin.min(axis=0)
    sample_25 = np.percentile(num_reads_per_bin, 25, axis=0)
    sample_50 = np.percentile(num_reads_per_bin, 50, axis=0)
    sample_75 = np.percentile(num_reads_per_bin, 75, axis=0)

    # use the largest 99th percentile from all samples to set the x_max value
    x_max = np.max(np.percentile(num_reads_per_bin, 99, axis=0))
    # plot coverage
    # print headers for text output
    print("sample\tmean\tstd\tmin\t25%\t50%\t75%\tmax")
    # the determination of a sensible value for y_max of the first plot (fraction of bases sampled vs.
    # coverage) is important because, depending on the data,
    # it becomes very difficult to see the lines in the plot. For example, if the coverage of a sample
    # is a nice gaussian curve with a large mean of 50. Then a sensible range for the y axis (fraction of
    # reads having coverage=x) is (0, 0.02) which nicely shows the coverage curve. If instead the coverage is
    # very poor and centers close to 1 then a good y axis range is (0,1).

    # the current implementation takes, per sample, the largest coverage x for which more than 50% of the
    # bases have coverage >= x, and uses the fraction of bases at that coverage for the y axis range.
    y_max = []
    data = []
    # We need to manually set the line colors so they're shared between the two plots.
    plotly_colors = [
        "#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"
    ]
    plotly_styles = sum([
        6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"],
        6 * ["dashdot"], 6 * ["longdashdot"]
    ], [])
    for idx, col in enumerate(num_reads_per_bin.T):
        if args.plotFile:
            # histogram of coverage values, computed once for both panels
            coverage_counts = np.bincount(col.astype(int))
            frac_reads_per_coverage = coverage_counts.astype(
                float) / num_reads_per_bin.shape[0]
            # reverse cumulative sum: bases with coverage >= x
            csum = coverage_counts[::-1].cumsum()
            csum_frac = csum.astype(float)[::-1] / csum.max()
            if args.plotFileFormat == 'plotly':
                color = plotly_colors[idx % len(plotly_colors)]
                dash = plotly_styles[idx % len(plotly_styles)]
                trace = go.Scatter(x=np.arange(0,
                                               int(x_max) - 1),
                                   y=frac_reads_per_coverage[:int(x_max)],
                                   mode='lines',
                                   xaxis='x1',
                                   yaxis='y1',
                                   line=dict(color=color, dash=dash),
                                   name="{}, mean={:.1f}".format(
                                       args.labels[idx], sample_mean[idx]),
                                   legendgroup="{}".format(idx))
                data.append(trace)
                trace = go.Scatter(x=np.arange(0,
                                               int(x_max) - 1),
                                   y=csum_frac[:int(x_max)],
                                   mode='lines',
                                   xaxis='x2',
                                   yaxis='y2',
                                   line=dict(color=color, dash=dash),
                                   name=args.labels[idx],
                                   showlegend=False,
                                   legendgroup="{}".format(idx))
                data.append(trace)
            else:
                axs[0].plot(frac_reads_per_coverage,
                            label="{}, mean={:.1f}".format(
                                args.labels[idx], sample_mean[idx]))
                axs[1].plot(csum_frac, label=args.labels[idx])
            # find the indexes (i.e. the x values) for which the cumulative distribution 'fraction of bases
            # sampled >= coverage' where fraction of bases sampled = 50%: `np.flatnonzero(csum_frac>0.5)`
            # then find the fraction of bases sampled that have the largest x
            y_max.append(frac_reads_per_coverage[max(
                np.flatnonzero(csum_frac > 0.5))])
        print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(
            args.labels[idx],
            sample_mean[idx],
            sample_std[idx],
            sample_min[idx],
            sample_25[idx],
            sample_50[idx],
            sample_75[idx],
            sample_max[idx],
        ))

    if args.plotFile:
        # Don't clip plots
        y_max = max(y_max)
        if args.plotFileFormat == "plotly":
            fig['data'] = data
            fig['layout']['yaxis1'].update(
                range=[0.0, min(1, y_max + (y_max * 0.10))])
            fig['layout']['yaxis2'].update(range=[0.0, 1.0])
            py.plot(fig, filename=args.plotFile, auto_open=False)
        else:
            axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
            axs[0].set_xlim(0, x_max)
            axs[0].set_xlabel('coverage (#reads per bp)')
            axs[0].legend(fancybox=True, framealpha=0.5)
            axs[0].set_ylabel('fraction of bases sampled')
            # plot cumulative coverage
            axs[1].set_xlim(0, x_max)
            axs[1].set_xlabel('coverage (#reads per bp)')
            axs[1].set_ylabel('fraction of bases sampled >= coverage')
            axs[1].legend(fancybox=True, framealpha=0.5)
            plt.savefig(args.plotFile, format=args.plotFileFormat)
            plt.close()
Пример #8
0
def main(args=None):
    """
    1. get read counts at different positions either
    all of same length or from genomic regions from the BED file

    2. save data for further plotting

    Parameters
    ----------
    args : list, optional
        Command line arguments handed to ``process_args``.

    Raises
    ------
    SystemExit
        If fewer than two bam files are given, or if fewer than two bins
        were returned by the counting step.
    """
    args = process_args(args)

    if len(args.bamfiles) < 2:
        # Function-call form works on both Python 2 and Python 3; the
        # print statement is a syntax error on Python 3.
        print("Please input at least two bam files to compare")
        sys.exit(1)

    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    # distance between the starts of two consecutive bins
    stepsize = args.binSize + args.distanceBetweenBins
    c = countR.CountReadsPerBin(
        args.bamfiles,
        args.binSize,
        numberOfSamples=None,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude,
        stepSize=stepsize,
        zerosToNans=False,
        out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = c.run()

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        sys.exit("ERROR: too few non zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    np.savez_compressed(args.outFileName,
                        matrix=num_reads_per_bin,
                        labels=args.labels)

    if args.outRawCounts:
        # prepend the labels to the generated file
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        # Close the original handle first (closing twice is harmless) so
        # anything still buffered is on disk before the file is re-read
        # through a second handle.
        args.outRawCounts.close()
        with open(args.outRawCounts.name, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)
Пример #9
0
def main(args=None):
    """
    1. get read counts at different positions either
    all of same length or from genomic regions from the BED file

    2. save data for further plotting

    Parameters
    ----------
    args : list, optional
        Command line arguments handed to ``process_args``.

    Raises
    ------
    SystemExit
        If fewer than two bins were returned by the counting step.
    """
    args = process_args(args)

    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    if len(args.bamfiles) == 1 and not args.outRawCounts:
        sys.stderr.write("You've input a single BAM file and not specified "
                         "--outRawCounts. The resulting output will NOT be "
                         "useful with any deepTools program!\n")

    # distance between the starts of two consecutive bins
    stepsize = args.binSize + args.distanceBetweenBins
    c = countR.CountReadsPerBin(args.bamfiles,
                                args.binSize,
                                numberOfSamples=None,
                                numberOfProcessors=args.numberOfProcessors,
                                verbose=args.verbose,
                                region=args.region,
                                bedFile=bed_regions,
                                blackListFileName=args.blackListFileName,
                                extendReads=args.extendReads,
                                minMappingQuality=args.minMappingQuality,
                                ignoreDuplicates=args.ignoreDuplicates,
                                center_read=args.centerReads,
                                samFlag_include=args.samFlagInclude,
                                samFlag_exclude=args.samFlagExclude,
                                minFragmentLength=args.minFragmentLength,
                                maxFragmentLength=args.maxFragmentLength,
                                stepSize=stepsize,
                                zerosToNans=False,
                                out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = c.run(allArgs=args)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        sys.exit("ERROR: too few non zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    # numpy will append .npz to the file name if we don't do this...
    # The context manager closes the file even if saving fails.
    with open(args.outFileName, "wb") as f:
        np.savez_compressed(f, matrix=num_reads_per_bin, labels=args.labels)

    if args.outRawCounts:
        # prepend the labels to the generated file: read the content,
        # rewind and write header + content back
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)
Пример #10
0
def main(args=None):
    """
    Compute per-base read coverage for the given bam files, print summary
    statistics per sample and save a two-panel coverage plot.

    Parameters
    ----------
    args : list, optional
        Command line arguments handed to ``process_args``.

    Raises
    ------
    SystemExit
        If fewer than two bins were returned by the counting step.
    """
    args = process_args(args)
    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 blackListFileName=args.blackListFileName,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude,
                                 minFragmentLength=args.minFragmentLength,
                                 maxFragmentLength=args.maxFragmentLength,
                                 out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = cr.run()

    sys.stderr.write("Number of non zero bins "
                     "used: {}\n".format(num_reads_per_bin.shape[0]))

    if args.outRawCounts:
        # prepend the labels to the generated file: read the content,
        # rewind and write header + content back
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    if num_reads_per_bin.shape[0] < 2:
        sys.exit("ERROR: too few non-zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    plt.suptitle(args.plotTitle)
    # per-sample summary statistics (one value per column)
    num_reads_per_bin = num_reads_per_bin.astype(int)
    sample_mean = num_reads_per_bin.mean(axis=0)
    sample_std = num_reads_per_bin.std(axis=0)
    sample_max = num_reads_per_bin.max(axis=0)
    sample_min = num_reads_per_bin.min(axis=0)
    sample_25 = np.percentile(num_reads_per_bin, 25, axis=0)
    sample_50 = np.percentile(num_reads_per_bin, 50, axis=0)
    sample_75 = np.percentile(num_reads_per_bin, 75, axis=0)

    # use the largest 99th percentile from all samples to set the x_max value
    x_max = np.max(np.percentile(num_reads_per_bin, 99, axis=0))
    # plot coverage
    # print headers for text output
    print("sample\tmean\tstd\tmin\t25%\t50%\t75%\tmax")
    # the determination of a sensible value for y_max of the first plot (fraction of bases sampled vs.
    # coverage) is important because, depending on the data,
    # it becomes very difficult to see the lines in the plot. For example, if the coverage of a sample
    # is a nice gaussian curve with a large mean of 50. Then a sensible range for the y axis (fraction of
    # reads having coverage=x) is (0, 0.02) which nicely shows the coverage curve. If instead the coverage is
    # very poor and centers close to 1 then a good y axis range is (0,1).

    # the current implementation takes, per sample, the largest coverage x for which more than 50% of the
    # bases have coverage >= x, and uses the fraction of bases at that coverage for the y axis range.
    y_max = []
    for idx, col in enumerate(num_reads_per_bin.T):
        # histogram of coverage values, computed once for both panels
        coverage_counts = np.bincount(col.astype(int))
        frac_reads_per_coverage = coverage_counts.astype(
            float) / num_reads_per_bin.shape[0]
        axs[0].plot(frac_reads_per_coverage,
                    label="{}, mean={:.1f}".format(args.labels[idx],
                                                   sample_mean[idx]))
        # reverse cumulative sum: bases with coverage >= x
        csum = coverage_counts[::-1].cumsum()
        csum_frac = csum.astype(float)[::-1] / csum.max()
        axs[1].plot(csum_frac, label=args.labels[idx])
        # find the indexes (i.e. the x values) for which the cumulative distribution 'fraction of bases
        # sampled >= coverage' where fraction of bases sampled = 50%: `np.flatnonzero(csum_frac>0.5)`
        # then find the fraction of bases sampled that have the largest x
        y_max.append(frac_reads_per_coverage[max(
            np.flatnonzero(csum_frac > 0.5))])
        print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(
            args.labels[idx],
            sample_mean[idx],
            sample_std[idx],
            sample_min[idx],
            sample_25[idx],
            sample_50[idx],
            sample_75[idx],
            sample_max[idx],
        ))

    # The 'good' y-axis limit is computed for each sample. The lower value is favored in which
    # distributions with a wider x-range can better be seen.
    y_max = min(y_max)
    axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
    axs[0].set_xlim(0, x_max)
    axs[0].set_xlabel('coverage (#reads per bp)')
    axs[0].legend(fancybox=True, framealpha=0.5)
    axs[0].set_ylabel('fraction of bases sampled')
    # plot cumulative coverage
    axs[1].set_xlim(0, x_max)
    axs[1].set_xlabel('coverage (#reads per bp)')
    axs[1].set_ylabel('fraction of bases sampled >= coverage')
    axs[1].legend(fancybox=True, framealpha=0.5)
    plt.savefig(args.plotFile, format=args.plotFileFormat)
    plt.close()
Пример #11
0
# FRiP (fraction of reads in peaks) per bam file, collected as strings.
frips = []
for idx, bam_file in enumerate(bam_file_list):
    # Default value, kept when the peak file turns out to be empty
    frip = 0

    # Read only the first line to find out whether the peak file has
    # any content (None for an empty file)
    with open(peak_file_list[idx], "r") as file:
        first_line = next(file, None)

    if first_line is not None:
        print("Calculating " + bam_file + " using " + peak_file_list[idx])
        cr = crpb.CountReadsPerBin([bam_file],
                                   bedFile=[peak_file_list[idx]],
                                   numberOfProcessors=int(args.threads))

        # Calc the total number of reads in peaks per bam file
        reads_at_peaks = cr.run()
        total = reads_at_peaks.sum(axis=0)

        # Load up bam file and get the total number of mapped reads; the
        # context manager closes the file instead of leaking one handle
        # per iteration
        with pysam.AlignmentFile(bam_file) as bam:
            # Calc frip
            frip = float(total[0]) / bam.mapped

    frips.append(str(frip))

    # Log
Пример #12
0
]

# Exploratory script: derives per-file size factors from the statistics
# returned by bamHandler.openBam, sets up a CountReadsPerBin sampler, collects
# the chromosome sizes shared by all files and finally prints the unmapped
# flag of the reads found in a fixed window on chr3.
# NOTE(review): `files` and `nThreads` are defined above this fragment.

# Leftover single-file path kept from manual debugging:
# file = "results/mapping/SRX4108929.1.control.final.bam"
mappedReads = []
for file in files:
    # With returnStats=True the call returns a sequence; element 1 is stored
    # here -- presumably the mapped-read count, confirm against bamHandler.
    mappedReads.append(
        bamHandler.openBam(file, returnStats=True, nThreads=nThreads)[1])

sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')

# Scale every file relative to the smallest value: that file gets a factor of
# 1.0 and all others a factor below 1.0.
sizeFactorBasedOnMappedReads = sizeFactorBasedOnMappedReads.min(
) / sizeFactorBasedOnMappedReads

# The sampler is only constructed here; it is not run within this fragment.
cr = countR.CountReadsPerBin(files,
                             binLength=50,
                             numberOfSamples=10000,
                             extendReads=False,
                             numberOfProcessors=nThreads)

# Every file is opened a second time to compare chromosome names; the handles
# created in the list are not closed explicitly.
chromsizes, non_common = deeptools.utilities.getCommonChrNames(
    [bamHandler.openBam(file) for file in files])

# chromsizes is unzipped into two parallel tuples, used as names and lengths.
chrNames, chrLengths = list(zip(*chromsizes))

genomeSize = sum(chrLengths)

# NOTE(review): under Python 3 `file` is the value left behind by the `for`
# loop above (the last entry of `files`); the comprehension variable of the
# same name does not leak into this scope.
bam = bamHandler.openBam(file)

# Debug output: unmapped flag of each read fetched from chr3:9998999-9999999.
for read in bam.fetch("chr3", 9998999, 9999999):
    print(read.is_unmapped)
Пример #13
0
    def get_coverage(self, **kwargs):
        """
        Retrieve coverage for each region specified in the bed file using
        deeptools' CountReadsPerBin.

        Every keyword argument is optional. The names listed in
        ``option_defaults`` below are forwarded to ``crpb.CountReadsPerBin``;
        a name that is not supplied falls back to the default given there and
        any other keyword argument is ignored.

        ``out_file_for_raw_data`` is the tab-separated file handed to
        CountReadsPerBin and read back afterwards. When the caller supplies
        it, the file is kept. Otherwise the scratch file ``tmp_counts.count``
        in the current working directory is used and removed again, even when
        counting or parsing fails.

        Returns a pandas DataFrame with the columns ``chr``, ``start`` and
        ``end`` followed by one column per BAM file, named after the last
        ``/``-separated component of the file's path.
        """
        bamFilesList = _get_all_bams(self.extended_yml)

        keep_raw_file = 'out_file_for_raw_data' in kwargs
        raw_counts_path = kwargs.get('out_file_for_raw_data',
                                     "tmp_counts.count")

        # Built on every call so the list defaults are never shared between
        # invocations.
        option_defaults = {
            'binLength': 50,
            'numberOfSamples': None,
            'numberOfProcessors': 5,
            'verbose': False,
            'region': None,
            'bedFile': None,
            'extendReads': False,
            'genomeChunkSize': None,
            'blackListFileName': None,
            'minMappingQuality': None,
            'ignoreDuplicates': False,
            'chrsToSkip': [],
            'stepSize': None,
            'center_read': False,
            'samFlag_include': None,
            'samFlag_exclude': None,
            'zerosToNans': False,
            'skipZeroOverZero': False,
            'smoothLength': 0,
            'minFragmentLength': 0,
            'maxFragmentLength': 0,
            'bed_and_bin': False,
            'statsList': [],
            'mappedList': [],
        }
        options = {name: kwargs.get(name, default)
                   for name, default in option_defaults.items()}

        cr = crpb.CountReadsPerBin(bamFilesList,
                                   out_file_for_raw_data=raw_counts_path,
                                   **options)

        try:
            # The matrix returned by run() is not needed; the counts are read
            # back from the raw-data file, which also carries the region
            # coordinates.
            cr.run()
            sequencing_depth_df = pd.read_csv(raw_counts_path,
                                              sep="\t",
                                              header=None)
        finally:
            # Only the scratch file is ours to delete; it may not exist if
            # run() failed before anything was written.
            if not keep_raw_file and os.path.exists(raw_counts_path):
                os.remove(raw_counts_path)

        sequencing_depth_df.columns = ["chr", "start", "end"] + [
            sample.split("/")[-1] for sample in bamFilesList
        ]

        return sequencing_depth_df
Пример #14
0
parser.add_argument("-p",
                    "--processors",
                    help="number of processors",
                    type=int)
args = parser.parse_args()

# Do not calculate if the bedfile is empty
with open(args.bed, 'r') as f:
    num_lines = sum(1 for _ in f)

# Calculate Reads in bam file
if num_lines > 0:
    cr = crpb.CountReadsPerBin([args.bam],
                               bedFile=args.bed,
                               numberOfProcessors=args.processors)
    reads_at_peaks = cr.run()

    # Calculate total number of reads in peaks
    total = reads_at_peaks.sum(axis=0)

    # Total number of mapped reads; the context manager closes the BAM handle
    # as soon as the figure has been read.
    with pysam.AlignmentFile(args.bam) as bam:
        mapped_reads = bam.mapped

    # Calculate % of fragments in peaks. A BAM file without mapped reads is
    # reported as 0 instead of failing with ZeroDivisionError.
    frip = float(total[0]) / mapped_reads if mapped_reads else 0.0

    print(str(frip * 100))
else:
    print('0')
Пример #15
0
def main(args=None):
    """
    Sample per-base read counts from the BAM files and plot their coverage.

    Bins of length 1 are sampled with CountReadsPerBin, so every row of the
    resulting matrix is one sampled base and every column one BAM file.
    Two panels are written to ``args.plotFile``: the fraction of sampled
    bases at each coverage value and the fraction of sampled bases with at
    least that coverage. When ``args.outRawCounts`` is set, the count matrix
    is also written to it as tab-separated text with a quoted label header.

    Exits with an error message when fewer than two bins were returned.
    """
    args = process_args(args)
    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude)

    num_reads_per_bin = cr.run()

    sys.stderr.write("Number of non zero bins "
                     "used: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        # sys.exit rather than the interactive-only exit() helper
        sys.exit("ERROR: too few non zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    if args.outRawCounts:
        args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
        fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
        for row in num_reads_per_bin:
            args.outRawCounts.write(fmt % tuple(row))

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    plt.suptitle(args.plotTitle)

    # Show the x-axis up to three standard deviations (largest std of all
    # samples) above the highest sample mean.
    sample_mean = num_reads_per_bin.mean(axis=0)
    largest_std = max(num_reads_per_bin.std(axis=0))
    x_limit = max(sample_mean) + 3 * largest_std

    n_bins = num_reads_per_bin.shape[0]
    for idx, col in enumerate(num_reads_per_bin.T):
        # Histogram of coverage values; shared by both panels
        coverage_hist = np.bincount(col.astype(int))

        # plot coverage
        axs[0].plot(coverage_hist.astype(float) / n_bins,
                    label="{}, mean={:.1f}".format(args.labels[idx],
                                                   sample_mean[idx]))

        # Summing from the high-coverage end gives, for every coverage value,
        # the number of bases with at least that coverage.
        csum = coverage_hist[::-1].cumsum()
        axs[1].plot(csum.astype(float)[::-1] / csum.max(),
                    label=args.labels[idx])

    axs[0].set_xlim(0, x_limit)
    axs[0].set_xlabel('coverage')
    axs[0].legend()
    axs[0].set_ylabel('fraction of bases sampled')
    # plot cumulative coverage
    axs[1].set_xlim(0, x_limit)
    axs[1].set_xlabel('coverage')
    axs[1].set_ylabel('fraction of bases sampled >= coverage')
    axs[1].legend()
    plt.savefig(args.plotFile.name, format=args.plotFileFormat)
    # Release the figure so repeated calls do not accumulate open figures
    plt.close()
Пример #16
0
def main(args=None):
    """
    Plot cumulative, rank-ordered read counts per sample and report metrics.

    Read counts are sampled with CountReadsPerBin. For every sample the
    counts are sorted, accumulated and normalised to end at 1, then plotted
    against the normalised bin rank and saved to ``args.plotFile``.

    Optional outputs:

    * ``args.outRawCounts``: the count matrix as tab-separated text with a
      quoted label header.
    * ``args.outQualityMetrics``: one row per sample with AUC, X-intercept
      and elbow point next to their synthetic expectations and, when
      ``args.JSDsample`` is set, the JS distance and CHANCE statistics.

    Both output handles are closed after writing. Exits with status 1 when
    the sampled bins contain no reads at all.
    """
    args = process_args(args)

    cr = countR.CountReadsPerBin(
        args.bamfiles,
        args.binSize,
        args.numberOfSamples,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude,
        minFragmentLength=args.minFragmentLength,
        maxFragmentLength=args.maxFragmentLength)

    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        import sys
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    total = len(num_reads_per_bin[:, 0])
    x = np.arange(total).astype('float') / total  # normalize from 0 to 1

    # matplotlib won't iterate through line styles by itself. Only real line
    # styles are listed: "." is a marker and is rejected as a linestyle.
    pyplot_line_styles = sum([7 * ["-"], 7 * ["--"], 7 * ["-."], 7 * [":"]], [])
    n_styles = len(pyplot_line_styles)
    for i, reads in enumerate(num_reads_per_bin.T):
        count = np.cumsum(np.sort(reads))
        count = count / float(count[-1])  # to normalize y from 0 to 1
        plt.plot(x, count, label=args.labels[i],
                 linestyle=pyplot_line_styles[i % n_styles])
    plt.xlabel('rank')
    plt.ylabel('fraction w.r.t. bin with highest coverage')
    plt.legend(loc='upper left')
    plt.suptitle(args.plotTitle)
    # set the plotFileFormat explicitly to None to trigger the
    # format from the file-extension
    if not args.plotFileFormat:
        args.plotFileFormat = None

    plt.savefig(args.plotFile.name, bbox_inches=0, format=args.plotFileFormat)
    plt.close()

    if args.outRawCounts:
        args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
        fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
        for row in num_reads_per_bin:
            args.outRawCounts.write(fmt % tuple(row))
        args.outRawCounts.close()

    if args.outQualityMetrics:
        args.outQualityMetrics.write("Sample\tAUC\tSynthetic AUC\tX-intercept\tSynthetic X-intercept\tElbow Point\tSynthetic Elbow Point")
        if args.JSDsample:
            args.outQualityMetrics.write("\tJS Distance\tSynthetic JS Distance\t% genome enriched\tdiff. enrichment\tCHANCE divergence")
        args.outQualityMetrics.write("\n")
        # Straight reference line from 0 to 1 over the bin ranks; the elbow
        # is the rank at which a sample's curve falls furthest below it.
        n_bins = num_reads_per_bin.shape[0]
        diagonal = np.arange(n_bins) / float(n_bins - 1)
        for idx, reads in enumerate(num_reads_per_bin.T):
            counts = np.cumsum(np.sort(reads))
            counts = counts / float(counts[-1])
            AUC = np.sum(counts) / float(len(counts))
            # Rank (as a fraction of all bins) of the first non-empty bin
            XInt = (np.argmax(counts > 0) + 1) / float(counts.shape[0])
            elbow = (np.argmax(diagonal - counts) + 1) / float(counts.shape[0])
            expected = getExpected(np.mean(reads))  # A tuple of expected (AUC, XInt, elbow)
            args.outQualityMetrics.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(args.labels[idx], AUC, expected[0], XInt, expected[1], elbow, expected[2]))
            if args.JSDsample:
                JSD = getJSD(args, idx, num_reads_per_bin)
                syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                CHANCE = getCHANCE(args, idx, num_reads_per_bin)
                args.outQualityMetrics.write("\t{0}\t{1}\t{2}\t{3}\t{4}".format(JSD, syntheticJSD, CHANCE[0], CHANCE[1], CHANCE[2]))
            args.outQualityMetrics.write("\n")
        args.outQualityMetrics.close()