def makePureSignal(row):
    """Write control-subtracted coverage for one signal/control pair.

    ``row`` is read positionally: ``row[0]`` is the signal sample name,
    ``row[1]`` the control sample name and ``row[2]`` the value passed as
    ``scale`` when computing the control coverage.

    Coverage is computed for both BAM files (``bg=True``) and written next to
    them as ``.bdg`` files, the two are combined with ``union_bedgraphs``, and
    the control column is subtracted from the signal column. Negative
    differences are set to 0, rows whose difference is 0 are dropped, and the
    remainder is written to ``../H3K27me3/bw/pure_<signal>.bdg``. The
    intermediate union file is deleted.
    """
    signal, control, norm_factor = row[0], row[1], row[2]

    signal_prefix = '../H3K27me3/bw/' + signal
    control_prefix = '../H3K27me3/control/' + control
    union_path = "../H3K27me3/bw/" + signal + "_" + control + ".bdg"

    # Per-sample coverage; only the control is scaled.
    BedTool(signal_prefix + '.bam').genome_coverage(
        bg=True, output=signal_prefix + ".bdg")
    BedTool(control_prefix + '.bam').genome_coverage(
        bg=True, scale=norm_factor, output=control_prefix + ".bdg")

    # Combine both coverage tracks into a single table on disk.
    signal_bdg = BedTool(signal_prefix + '.bdg')
    control_bdg = BedTool(control_prefix + '.bdg')
    BedTool().union_bedgraphs(i=[signal_bdg.fn, control_bdg.fn],
                              output=union_path)

    merged = pd.read_csv(union_path, sep='\t', index_col=False, header=None)
    # Columns 3 and 4 hold the signal and control values respectively.
    merged["diff"] = (merged[3] - merged[4]).clip(lower=0)
    merged = merged.drop(columns=[3, 4])
    os.remove(union_path)

    # NOTE(review): to_csv is called with its defaults, so the output carries
    # the DataFrame index and a header row -- confirm downstream tools expect
    # that rather than a plain four-column bedGraph.
    merged[merged['diff'] != 0].to_csv(
        "../H3K27me3/bw/pure_" + signal + ".bdg", sep='\t')
Exemplo n.º 2
0
def cov_at_loci(bam, cov_fn):
    """Write mean coverage of merged covered regions, highest coverage first.

    Coverage intervals are computed from ``bam`` with
    ``genome_coverage(bg=True)``. Walking the intervals in table order, an
    interval is folded into the previous region of the same chromosome when
    its start is less than 1000 past that region's end; otherwise it opens a
    new region. For every region the mean of the coverage column over the
    intervals lying entirely inside it is computed.

    Args:
        bam: path handed to ``BedTool`` (the alignment file).
        cov_fn: output path. A tab-separated table with the header
            ``chr``, ``s``, ``e``, ``cov`` is written, sorted by ``cov``
            in descending order.
    """
    bed = BedTool(bam)
    # to_dataframe() labels the bedGraph columns chrom/start/end/name, so the
    # coverage value lives in the 'name' column.
    cov_df = bed.genome_coverage(bg=True).to_dataframe()

    # chromosome -> list of [start, end] merged regions, in encounter order
    inter = {}
    for chrom, s, e in zip(cov_df['chrom'], cov_df['start'], cov_df['end']):
        s = int(s)
        e = int(e)
        regions = inter.setdefault(chrom, [])
        if regions and s - regions[-1][1] < 1000:
            regions[-1][1] = e
        else:
            regions.append([s, e])

    rows = []
    for k, regions in inter.items():  # k is chrom, regions its merged spans
        # Subset once per chromosome instead of scanning the whole table for
        # every region.
        chrom_df = cov_df[cov_df['chrom'] == k]
        for s, e in regions:
            inside = (chrom_df['start'] >= s) & (chrom_df['end'] <= e)
            rows.append((k, s, e, chrom_df.loc[inside, 'name'].mean()))

    # Build the table in memory. The previous implementation wrote a
    # header-less file and re-read it with header=0, which discarded the
    # first region.
    out_df = pd.DataFrame(rows, columns=['chr', 's', 'e', 'cov'])
    out_df = out_df.sort_values('cov', ascending=False)
    out_df.to_csv(cov_fn, sep='\t', index=False)
Exemplo n.º 3
0
def make_bg(files):
    """Write a sorted, strand-labelled interval file for one BAM.

    Args:
        files: indexable pair. ``files[0]`` is the BAM path; ``files[1]`` is a
            string prefix (typically an output directory ending in a path
            separator) to which both the temporary file names and the output
            file name are appended.

    For each strand ('+' then '-') coverage is computed with
    ``genome_coverage(bg=True, strand=...)``, the fourth column of the result
    is replaced by the strand symbol, and the rows are appended to
    ``files[1] + <BAM basename with '.bam' replaced by '.bg'>``. The combined
    file is then sorted in place with ``BedTool.sort``.

    Returns:
        1 on completion.
    """
    bam_path, out_prefix = files[0], files[1]
    out_path = out_prefix + bam_path.split("/")[-1].replace('.bam', '.bg')
    bamf = BedTool(bam_path)

    # newline='' lets pandas control line endings, as it does when it is
    # given a path instead of a handle.
    with open(out_path, 'w', newline='', encoding='utf-8') as out_f:
        for strand in ('+', '-'):
            tmp_path = out_prefix + str(uuid.uuid4())
            try:
                bamf.genome_coverage(bg=True, strand=strand).saveas(tmp_path)
                if os.path.getsize(tmp_path) == 0:
                    # No coverage on this strand; read_csv would raise on an
                    # empty file, so contribute no rows instead.
                    continue
                df = pd.read_csv(tmp_path, sep='\t', header=None,
                                 names=['Chr', 'Start', 'End', 'Strand'],
                                 index_col=None)
            finally:
                # Remove the scratch file even if coverage or parsing failed.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
            # The fourth column arrives holding the coverage value and is
            # deliberately overwritten with the strand symbol.
            df['Strand'] = strand
            # Append directly to the output rather than shelling out to
            # `cat`/`rm`, which broke on paths containing spaces or shell
            # metacharacters.
            df.to_csv(out_f, sep='\t', index=False, header=False)

    BedTool(out_path).sort().saveas(out_path)
    return 1
Exemplo n.º 4
0
def plot_coverage(in_bam,
                  out_plot_file,
                  plot_format,
                  plot_data_style,
                  plot_style,
                  plot_width,
                  plot_height,
                  plot_dpi,
                  plot_title,
                  plot_x_limits,
                  plot_y_limits,
                  base_q_threshold,
                  mapping_q_threshold,
                  max_coverage_depth,
                  read_length_threshold,
                  plot_only_non_duplicates=False,
                  bin_large_plots=False,
                  binning_summary_statistic="max",
                  out_summary=None):
    '''
        Generate a coverage plot from an aligned bam file

        Per-position depth is written to a tab-separated file and then plotted,
        one colour per reference segment, with segments laid end to end on the
        x axis.

        Depth source:
          - plot_only_non_duplicates False: "bedtools genomecov -d" via
            BedTool.genome_coverage(d=True). The quality/depth/length
            thresholds are not applied; a warning is logged for each one set.
          - plot_only_non_duplicates True: reads with flag 1024 are removed
            and "samtools depth -aa" is used with the thresholds passed as
            -q / -Q / -m / -l.

        Parameters:
          in_bam               aligned bam file; must contain mapped reads
          out_plot_file        path the figure is saved to
          plot_format          passed to savefig as format
          plot_data_style      "filled", "line" or "dots"; any other value
                               draws no data
          plot_style           matplotlib style name used as a style context
          plot_width,
          plot_height          figure size; divided by the DPI to get inches
          plot_dpi             DPI; the current figure's DPI is used if falsy
          plot_title           figure title
          plot_x_limits,
          plot_y_limits        (min, max) pairs, or None to leave unset
          base_q_threshold,
          mapping_q_threshold,
          max_coverage_depth,
          read_length_threshold  samtools depth options (see above)
          bin_large_plots      if true, summarise depths in fixed-size bins
                               sized from the plot width in pixels
          binning_summary_statistic  "min", "max", "mean" or "median";
                               unrecognised values fall back to max
          out_summary          path to keep the depth table at; when None a
                               temp file is used and deleted afterwards

        Raises Exception if in_bam has zero mapped reads.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if the plot input bam file contains reads and you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam)

    if out_summary is None:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    # Temp file names are requested only on the paths that actually use them,
    # so no unused temp name is left behind.
    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        samtools.view(["-F", "1024", '-@', '3'], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # only sort if not sorted
    if not util.file.bam_is_sorted(bam_dupe_processed):
        bam_sorted = util.file.mkstempfname('.sorted.bam')
        samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])
        if plot_only_non_duplicates:
            os.unlink(bam_dupe_processed)
        should_remove_sorted = True
    else:
        bam_sorted = bam_dupe_processed
        # When duplicates were not filtered, bam_sorted IS the caller's
        # in_bam and must not be deleted.
        should_remove_sorted = bool(plot_only_non_duplicates)

    # call samtools index
    samtools.index(bam_sorted)

    # Options for samtools depth.
    # Note: "bedtools genomecov" will count depth including duplicates, but does
    # not expose options for filtering by quality. When duplicates
    # are excluded, "samtools depth" is used which does support quality filtering
    # We use either samtools or bedtools, because the former ignores marked duplicates
    # from its depth count while bedtools includes them.
    opts = ['-aa']  # report coverage at "absolutely all" positions
    depth_filters = (
        ("-q", base_q_threshold,
         "'-q' ignored since --plotOnlyNonDuplicates is absent"),
        ("-Q", mapping_q_threshold,
         "'-Q' ignored since --plotOnlyNonDuplicates is absent"),
        ("-m", max_coverage_depth,
         "'-m' ignored since --plotOnlyNonDuplicates is absent"),
        ("-l", read_length_threshold,
         "'-l' ignored since --plotOnlyNonDuplicates is absent"),
    )
    for flag, threshold, ignored_msg in depth_filters:
        if not threshold:
            continue
        if not plot_only_non_duplicates:
            log.warning(ignored_msg)
        opts += [flag, str(threshold)]

    # bedtools reports coverage w/ duplicates, samtools for no-dups
    #
    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        bt = BedTool(bam_sorted)
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        bt.genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)

    # only remove the sorted bam if it is not the original input bam
    # which we use directly in some cases
    if should_remove_sorted:
        os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # segment name -> list of depths, in file order; domain_max counts all
    # positions across segments and is used to size the bins.
    segment_depths = OrderedDict()
    domain_max = 0
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(float(row[2]))
            domain_max += 1

    with matplotlib.pyplot.style.context(plot_style):
        fig = matplotlib.pyplot.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(
            float(plot_width) / float(DPI),
            float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = matplotlib.pyplot.subplot(
        )  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # Binning
        bin_size = 1
        if bin_large_plots:
            # Bin locations and take summary value in each bin
            binning_fn = {
                "min": min,
                "max": max,
                "mean": mean,
                "median": median
            }
            # Fall back to the max *function*. The previous default was the
            # string "max", which raised TypeError as soon as it was called.
            binning_action = binning_fn.get(binning_summary_statistic, max)

            inner_plot_width_inches = ax.get_window_extent().transformed(
                fig.dpi_scale_trans.inverted()).width
            inner_plot_width_px = inner_plot_width_inches * fig.dpi  # width of actual plot (sans whitespace and y axis text)
            bins_per_pixel = 1  # increase to make smaller (but less visible) bins
            bin_size = 1 + int(domain_max /
                               (inner_plot_width_px * bins_per_pixel))

            binned_segment_depths = OrderedDict()
            for segment_name, position_depths in segment_depths.items():
                binned_segment_depths[segment_name] = [
                    binning_action(position_depths[i:i + bin_size])
                    for i in range(0, len(position_depths), bin_size)
                ]
            segment_depths = binned_segment_depths

        # Plotting
        # get the colors for this style (same for every segment)
        colors = list(
            matplotlib.pyplot.rcParams['axes.prop_cycle'].by_key()['color'])

        domain_max = 0
        for segment_num, (segment_name, position_depths) in enumerate(
                segment_depths.items()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            # pick a color, offset by the segment index
            segment_color = colors[segment_num % len(colors)]

            # Scale bin indices back to base-pair positions.
            x_values = [
                x * bin_size for x in range(prior_domain_max, domain_max)
            ]

            if plot_data_style == "filled":
                matplotlib.pyplot.fill_between(x_values,
                                               position_depths,
                                               [0] * len(position_depths),
                                               linewidth=0,
                                               antialiased=True,
                                               color=segment_color)
            elif plot_data_style == "line":
                matplotlib.pyplot.plot(x_values,
                                       position_depths,
                                       antialiased=True,
                                       color=segment_color)
            elif plot_data_style == "dots":
                matplotlib.pyplot.plot(x_values,
                                       position_depths,
                                       'ro',
                                       antialiased=True,
                                       color=segment_color)

        matplotlib.pyplot.title(plot_title, fontsize=font_size * 1.2)
        matplotlib.pyplot.xlabel("bp", fontsize=font_size * 1.1)

        ylabel = "read depth"
        if bin_size > 1:
            ylabel = "read depth ({summary} in {size}-bp bin)".format(
                size=bin_size, summary=binning_summary_statistic)
        matplotlib.pyplot.ylabel(ylabel, fontsize=font_size * 1.1)

        if plot_x_limits is not None:
            x_min, x_max = plot_x_limits
            matplotlib.pyplot.xlim(x_min, x_max)
        if plot_y_limits is not None:
            y_min, y_max = plot_y_limits
            matplotlib.pyplot.ylim(y_min, y_max)

        # to squash a backend renderer error on OSX related to tight layout
        if matplotlib.pyplot.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        matplotlib.pyplot.savefig(out_plot_file, format=plot_format,
                                  dpi=DPI)  #, bbox_inches='tight')
        log.info("Coverage plot saved to: " + out_plot_file)

    # The depth table is a temp file only when the caller did not ask for it.
    if out_summary is None:
        os.unlink(coverage_tsv_file)
Exemplo n.º 5
0
def plot_coverage(in_bam,
                  out_plot_file,
                  plot_format,
                  plot_data_style,
                  plot_style,
                  plot_width,
                  plot_height,
                  plot_dpi,
                  plot_title,
                  base_q_threshold,
                  mapping_q_threshold,
                  max_coverage_depth,
                  read_length_threshold,
                  plot_only_non_duplicates=False,
                  out_summary=None):
    '''
        Generate a coverage plot from an aligned bam file

        The input is (optionally duplicate-filtered, then) sorted and indexed,
        per-position depth is written to a tab-separated file, and the depths
        are plotted one colour per reference segment with segments laid end
        to end on the x axis.

        With plot_only_non_duplicates False, depth comes from
        "bedtools genomecov -d" and the q/Q/m/l thresholds are not applied
        (a warning is logged for each one that is set). With it True, reads
        flagged 1024 are removed and "samtools depth -aa" is run with the
        thresholds.

        plot_data_style is one of "filled", "line" or "dots"; any other value
        draws no data. plot_width and plot_height are divided by the DPI to
        get the figure size in inches. If out_summary is None the depth table
        is a temp file that is deleted before returning.

        Raises Exception if in_bam has zero mapped reads.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # An unaligned bam cannot be plotted; fail early.
    if samtools.count(in_bam, opts=["-F", "4"]) == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam)

    coverage_tsv_file = (util.file.mkstempfname('.summary.tsv')
                         if out_summary is None else out_summary)

    dedup_tmp_bam = util.file.mkstempfname('.dupe_processed.bam')
    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # Drop reads with the 1024 flag set (PCR or optical duplicates).
        samtools.view(["-F", "1024"], in_bam, dedup_tmp_bam)
        bam_to_sort = dedup_tmp_bam
    else:
        bam_to_sort = in_bam

    # Sort into a fresh temp bam, then index it.
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_to_sort, bam_sorted, args=["-O", "bam"])
    if plot_only_non_duplicates:
        os.unlink(dedup_tmp_bam)
    samtools.index(bam_sorted)

    # Options for "samtools depth".
    # "bedtools genomecov" counts depth including duplicates but exposes no
    # quality filters; "samtools depth" ignores marked duplicates and does
    # support them. Hence thresholds only take effect on the samtools path.
    opts = ['-aa']  # report coverage at "absolutely all" positions
    threshold_table = (
        ("-q", base_q_threshold,
         "'-q' ignored since --plotOnlyNonDuplicates is absent"),
        ("-Q", mapping_q_threshold,
         "'-Q' ignored since --plotOnlyNonDuplicates is absent"),
        ("-m", max_coverage_depth,
         "'-m' ignored since --plotOnlyNonDuplicates is absent"),
        ("-l", read_length_threshold,
         "'-l' ignored since --plotOnlyNonDuplicates is absent"),
    )
    for flag, threshold, ignored_msg in threshold_table:
        if not threshold:
            continue
        if not plot_only_non_duplicates:
            log.warning(ignored_msg)
        opts += [flag, str(threshold)]

    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if plot_only_non_duplicates:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)
    else:
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        BedTool(bam_sorted).genome_coverage(d=True).saveas(coverage_tsv_file)
    os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # segment name -> depths, in the order they appear in the table
    segment_depths = OrderedDict()
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(int(row[2]))

    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        dpi_f = float(DPI)
        fig.set_size_inches(float(plot_width) / dpi_f,
                            float(plot_height) / dpi_f)

        font_size = (2.5 * plot_height) / dpi_f

        ax = plt.subplot()  # creates the (empty) axes the data is drawn on

        # Apply the computed font size to every tick label.
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        segment_start = 0
        for segment_num, position_depths in enumerate(
                segment_depths.values()):
            segment_end = segment_start + len(position_depths)

            # colours of the active style, cycled by segment index
            palette = list(plt.rcParams['axes.prop_cycle'].by_key()['color'])
            segment_color = palette[segment_num % len(palette)]

            x_positions = range(segment_start, segment_end)
            if plot_data_style == "filled":
                plt.fill_between(x_positions,
                                 position_depths, [0] * len(position_depths),
                                 linewidth=0,
                                 antialiased=True,
                                 color=segment_color)
            elif plot_data_style == "line":
                plt.plot(x_positions,
                         position_depths,
                         antialiased=True,
                         color=segment_color)
            elif plot_data_style == "dots":
                plt.plot(x_positions,
                         position_depths,
                         'ro',
                         antialiased=True,
                         color=segment_color)

            segment_start = segment_end

        plt.title(plot_title, fontsize=font_size * 1.2)
        plt.xlabel("bp", fontsize=font_size * 1.1)
        plt.ylabel("read depth", fontsize=font_size * 1.1)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format,
                    dpi=DPI)  #, bbox_inches='tight')
        log.info("Coverage plot saved to: " + out_plot_file)

    if not out_summary:
        os.unlink(coverage_tsv_file)