Example #1
def regression_plots():

    from src.regression_compare import plot_compare_r2, load_results
    from src.gp import plot_res_distribution, plot_res_distribution_time
    from src.gp import GP

    gp_dir = "%s/gp" % OUTPUT_DIR

    mkdirs_safe([gp_dir])

    # plot comparison
    plot_compare_r2(gp_dir)
    plt.savefig('%s/compare_gp_r2.pdf' % gp_dir, transparent=True)

    plot_compare_r2(gp_dir, show_legend=True)
    plt.savefig('%s/compare_gp_r2_legend.pdf' % gp_dir, transparent=True)

    results = load_results(gp_dir)
    for name in ['Full']:
        cur = GP(name, results_path='%s/%s_results.csv' % (gp_dir, name))
        plot_res_distribution(cur, selected_genes=selected_genes)
        plt.savefig('%s/%s_predictions.pdf' % (gp_dir, name), transparent=True)

        for time in [7.5, 30, 120]:
            plot_res_distribution_time(cur,
                                       time,
                                       selected_genes=selected_genes)
            plt.savefig('%s/%s_%s.pdf' % (gp_dir, name, time),
                        transparent=True,
                        dpi=100)
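
Every example on this page calls mkdirs_safe (or the singular mkdir_safe, imported from src.utils in Example #6) before writing output. Its implementation is not shown here; a minimal sketch, assuming it simply creates any listed directory that does not yet exist:

import os

def mkdirs_safe(dirs):
    # Create each directory in `dirs` if it is missing; leave existing
    # directories untouched. Sketch only; the src.utils version may differ.
    for d in dirs:
        if not os.path.exists(d):
            os.makedirs(d)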
Example #2
def summary_plots():

    orfs = paper_orfs
    orf_cc = pd.read_hdf(cross_corr_sense_path, 'cross_correlation')

    sum_plotter = SummaryPlotter(datastore, orfs, orf_cc)
    show_saved_plot = False

    custom_lims = {'TAD2': [(-3, 3), (-3, 3)], 'MET31': [(-2.5, 2.5), (-8, 8)]}

    cc_dir = '%s/cc' % OUTPUT_DIR
    lines_dir = '%s/lines' % OUTPUT_DIR

    mkdirs_safe([cc_dir, lines_dir])

    sum_plotter.write_gene_plots(genes,
                                 cc_dir=cc_dir,
                                 lines_dir=lines_dir,
                                 show_plot=show_saved_plot,
                                 custom_lims=custom_lims)

    sum_plotter.write_gene_plots(['HSP26'],
                                 cc_dir=cc_dir,
                                 lines_dir=lines_dir,
                                 show_plot=show_saved_plot,
                                 custom_lims=custom_lims,
                                 suffix='_figure',
                                 large_font=True)
Example #3
def compute_cross_correlations(strand='sense'):

    from src.cross_correlation_kernel import MNaseSeqDensityKernel
    from src.cross_correlation import calculate_cross_correlation_all_chromosomes
    from src.kernel_fitter import compute_triple_kernel

    cc_orfs = paper_orfs
    cc_dir = cc_sense_chrom_dir
    cross_corr_path = cross_corr_sense_path
    if strand == 'antisense': 
        cc_orfs = antisense_orfs
        cc_dir = cc_antisense_chrom_dir
        cross_corr_path = cross_corr_antisense_path

    mkdirs_safe([cc_dir])

    nuc_kernel = MNaseSeqDensityKernel(filepath=nuc_kernel_path)
    sm_kernel = MNaseSeqDensityKernel(filepath=sm_kernel_path)
    triple_kernel = compute_triple_kernel(nuc_kernel)

    print_fl("Cross correlating %d ORFs..." % len(cc_orfs))
    
    cross, summary_cross = calculate_cross_correlation_all_chromosomes(
        all_mnase_data, cc_orfs, nuc_kernel, sm_kernel, triple_kernel,
        save_chrom_dir=cc_dir, timer=timer, log=True,
        find_antisense=(strand == 'antisense'))
    
    cross.to_hdf(cross_corr_path,
        'cross_correlation', mode='w', complevel=9, complib='zlib')
    summary_cross.to_csv('%s/cross_correlation_summary_%s.csv' % 
        (mnase_dir, strand))

    print_fl("Done.")
    timer.print_time()
    print_fl()
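
The heavy lifting above is done by calculate_cross_correlation_all_chromosomes from src.cross_correlation. As a generic illustration of the underlying idea (not the project's implementation), cross correlating a per-bp MNase density track against a fragment kernel is a sliding dot product, which numpy expresses directly:

import numpy as np

def cross_correlate_track(density, kernel):
    # density: 1-D per-bp MNase fragment density around an ORF
    # kernel:  1-D template, e.g. a nucleosomal or small-fragment kernel
    kernel = np.asarray(kernel, dtype=float)
    kernel = kernel / kernel.sum()
    return np.correlate(density, kernel, mode='same')

# Toy check: the score peaks where the signal matches the template
signal = np.exp(-0.5 * ((np.arange(500) - 250) / 30.0) ** 2)
template = np.exp(-0.5 * ((np.arange(101) - 50) / 30.0) ** 2)
print(int(np.argmax(cross_correlate_track(signal, template))))  # ~250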
Example #4
def plot_heatmaps():

    from config import OUTPUT_DIR
    from src.chromatin_metrics_data import pivot_metric
    from src.chromatin_heatmaps import ChromatinHeatmaps

    heatmaps = ChromatinHeatmaps(datastore)

    write_dir = '%s/heatmaps' % OUTPUT_DIR
    mkdirs_safe([write_dir])

    heatmaps.show_xlabels = True
    heatmaps.show_saved_plot = False
    heatmaps.plot_heatmap(write_path=("%s/all.png" % write_dir),
                          aspect_scale=25.,
                          fig_height=20.,
                          lines=[200, -200])

    heatmaps.show_xlabels = False
    small_heatmap_scale = 15.
    heatmaps.plot_heatmap(head=200,
                          write_path=("%s/t100.png" % write_dir),
                          aspect_scale=small_heatmap_scale,
                          lines=[-20])

    heatmaps.show_xlabels = True
    heatmaps.plot_heatmap(tail=200,
                          write_path=("%s/b100.png" % write_dir),
                          aspect_scale=small_heatmap_scale,
                          lines=[20])
    heatmaps.show_xlabels = False

    heatmaps.plot_gene_names = True
    heatmaps.plot_heatmap(head=20,
                          write_path=("%s/t20.png" % write_dir),
                          aspect_scale=small_heatmap_scale)

    heatmaps.show_xlabels = True
    heatmaps.plot_heatmap(tail=20,
                          write_path=("%s/b20.png" % write_dir),
                          aspect_scale=small_heatmap_scale)
    heatmaps.plot_gene_names = False

    from src import plot_utils

    heatmaps.plot_colorbars(write_path='%s/cbar.png' % write_dir)

    from src import met4

    heatmaps.show_saved_plot = True
    heatmaps.plot_gene_names = True
    heatmaps.plot_heatmap(orf_groups=met4.orf_groups(),
                          group_names=met4.groups(),
                          group_colors=met4.group_colors(),
                          write_path=("%s/sulfur.pdf" % write_dir),
                          fig_height=11,
                          aspect_scale=5000.,
                          highlight_max=[],
                          y_padding=1)
    heatmaps.plot_gene_names = False
Example #5
def main():

    print_fl("*******************************")
    print_fl("* 6    Reviewer Materials     *")
    print_fl("*******************************")

    print_preamble()

    mkdirs_safe([save_dir])

    plot_utils.apply_global_settings()

    # plots for shift edge analysis
    shift_edge_analysis.main()

    # additional scatter plots
    scatters()

    xrate_vs_TPM()

    # danpos
    danpos()

    # OD curve
    plot_OD_curve()
Example #6
def tf_plots():

    global plotter

    from src.small_peak_calling import SmallPeakCalling
    from src.utils import mkdir_safe
    from src.small_peak_calling import plot_tf_scatter, plot_tf_heatmap, \
        plot_tf_summary
    from src.small_peak_calling import plot_colorbars

    small_peaks = SmallPeakCalling()
    small_peaks.load_data()

    save_dir = '%s/tf_analysis' % OUTPUT_DIR
    mkdir_safe(save_dir)

    plot_tf_scatter(small_peaks, t1=60)
    plt.savefig('%s/small_peaks_0_60.pdf' % save_dir, transparent=True)

    plot_tf_summary(small_peaks, tail=small_peaks.view_low)
    plt.savefig('%s/tf_means_bottom.pdf' % save_dir, transparent=True)

    plot_tf_summary(small_peaks, head=small_peaks.view_high)
    plt.savefig('%s/tf_means_top.pdf' % save_dir, transparent=True)

    plot_tf_summary(small_peaks)
    plt.savefig('%s/tf_means.pdf' % save_dir, transparent=True)

    # plot Aft1/Aft2 peaks
    labeled_peaks = small_peaks.all_motifs.copy()
    labeled_peaks = labeled_peaks[labeled_peaks.tf.isin(
        ['AFT1', 'AFT2'])][['orf', 'peak']].drop_duplicates()
    labeled_peaks = labeled_peaks.merge(paper_orfs[['name']],
                                        left_on='orf',
                                        right_on='orf_name')
    labeled_peaks = labeled_peaks.set_index('peak')
    selected_labeled_peaks = labeled_peaks[labeled_peaks['name'].isin(
        ['LEE1', 'SER33', 'ENB1'])]

    fig, ax = plot_tf_scatter(small_peaks,
                              tf_names=['AFT1', 'AFT2'],
                              labeled_peaks=selected_labeled_peaks,
                              t1=60.0)
    plt.savefig("%s/aft1_aft2_scatter.pdf" % save_dir, transparent=True)

    # typhoon dir
    save_typhoon_dir = '%s/tf_analysis/typhoon/' % OUTPUT_DIR
    save_dir_all_motifs = '%s/tf_analysis/typhoon_all_motifs/' % OUTPUT_DIR
    mkdirs_safe([save_typhoon_dir, save_dir_all_motifs])

    if plotter is None:
        plotter = get_plotter()

    aft_genes = ['SER33', 'LEE1', 'ENB1']
    plotter.plot_genes(aft_genes,
                       save_typhoon_dir,
                       save_dir_all_motifs,
                       times=[0.0, 30.0, 60.0],
                       titlesize=34)
Example #7
def danpos():

    from src.dpos_bed import create_bed_for_dpos
    import os
    from src.utils import run_cmd

    working_dir = os.getcwd()

    danpos_output = '%s/danpos/' % (OUTPUT_DIR)
    mkdirs_safe([danpos_output])

    danpos_path = "%s/danpos-2.2.2/danpos.py" % working_dir

    # create DANPOS Bed file
    mnase = pd.read_hdf(mnase_seq_path, 'mnase_data')
    mnase = mnase[mnase.time == 0]

    save_file = 'mnase_0.bed'
    save_path = '%s/%s' % (danpos_output, save_file)
    create_bed_for_dpos(mnase, save_path)
    print_fl("Wrote %s" % save_path)

    bash_command = "scripts/6_reviewer_mats/run_danpos.sh %s %s %s" % \
        (save_file, OUTPUT_DIR, danpos_path)
    output, error = run_cmd(bash_command, stdout_file=None)

    danpos_calls_path = '%s/result/pooled/mnase_0.smooth.positions.xls' % \
        (danpos_output)
    danpos_positions = pd.read_csv(danpos_calls_path, sep='\t')

    plt.hist(danpos_positions[danpos_positions.smt_value < 10000].smt_value,
             bins=100)
    plt.savefig("%s/danpos_smt_pos.png" % danpos_output)

    danpos_positions = danpos_positions.sort_values('smt_value',
                                                    ascending=False)

    top_danpos = danpos_positions.head(2500)
    top_danpos = top_danpos.rename(columns={
        'chr': 'chromosome',
        'smt_pos': 'position'
    })

    from src.chromatin import collect_mnase
    from src.kernel_fitter import compute_nuc_kernel

    nuc_kernel = compute_nuc_kernel(mnase, top_danpos)
    nuc_kernel.save_kernel("%s/danpos_kernel.json" % danpos_output)

    from src.kernel_fitter import compute_triple_kernel
    nuc_kernel.plot_kernel(kernel_type='nucleosome')
    plt.savefig('%s/danpos_nuc_kernel.pdf' % (save_dir), transparent=True)

    triple_kernel = compute_triple_kernel(nuc_kernel)
    triple_kernel.plot_kernel(kernel_type='triple')
    plt.savefig('%s/danpos_triple_kernel.pdf' % (save_dir), transparent=True)
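
For reference, create_bed_for_dpos only needs to express each MNase fragment as a BED interval that DANPOS can pool. A hypothetical stand-in, assuming the fragment dataframe uses the 'chr', 'start', and 'stop' columns seen elsewhere on this page:

def fragments_to_bed(fragments, save_path):
    # Write fragments as a minimal 6-column BED file (chrom, start, end,
    # name, score, strand). Hypothetical sketch, not src.dpos_bed itself.
    bed = fragments[['chr', 'start', 'stop']].copy()
    bed['name'] = '.'
    bed['score'] = 0
    bed['strand'] = '+'
    bed.to_csv(save_path, sep='\t', header=False, index=False)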
Example #8
    def save_data(self):
        save_dir = '%s/tf_analysis' % OUTPUT_DIR
        mkdirs_safe([save_dir])

        print_fl("Saving %s" % save_dir)
        self.all_peaks.to_csv('%s/all_peaks.csv' % save_dir)
        self.linked_peaks_normalized.to_csv('%s/linked_peaks_norm.csv' % save_dir)
        self.linked_peaks.to_csv('%s/linked_peaks.csv' % save_dir)
        self.prom_peaks.to_csv('%s/prom_peaks.csv' % save_dir)
        self.all_motifs.to_csv('%s/all_motifs.csv' % save_dir)
Example #9
def locus_plots():
    """Merge typhoon, cc, and line plots into a single pdf"""

    from src.pdf_utils import merge_locus_pdf

    save_dir = '%s/locus_plots' % OUTPUT_DIR
    mkdirs_safe([save_dir])

    for gene_name in genes:
        write_path = '%s/locus_%s.pdf' % (save_dir, gene_name)
        merge_locus_pdf(OUTPUT_DIR, gene_name, write_path)
Example #10
    def __init__(self, name, parent_watch_dir, num_wait, sleep_time=600, 
        timer=None):

        self.name = name
        self.watch_dir = parent_watch_dir + '/' + name
        self.num_wait = num_wait
        self.sleep_time = sleep_time
        self.timer = timer

        # create directory to watch for slurm jobs to finish
        mkdirs_safe([self.watch_dir])
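
The constructor only creates the watch directory; the class presumably also exposes a method that blocks until num_wait SLURM jobs have written their completion markers. A hypothetical sketch of such a polling loop:

import os
import time

def wait_for_jobs(self):
    # Poll the watch directory until num_wait marker files appear,
    # sleeping sleep_time seconds between checks. Hypothetical method,
    # not part of the source shown above.
    while len(os.listdir(self.watch_dir)) < self.num_wait:
        time.sleep(self.sleep_time)
        if self.timer is not None:
            self.timer.print_time()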
Example #11
def main():
    """
    Initial BAM data download and conversion to dataframes

    1. Download RNA-seq and MNase-seq BAM files to disk
    2. Read RNA-seq BAM, convert to dataframe, and save
    3. Compute RNA-seq pileup and save
    4. Read MNase-seq BAM replicates
    5. Merge, sample, and save

    Inputs:
    - output directory path
    - RNA-seq BAM files path
    - MNase-seq BAM files path

    Output:
    - RNA-seq data frame of reads
    - RNA-seq pileup data frame
    - MNase-seq merged data frame

    """

    print_fl("*******************************")
    print_fl("* 1      Initialization       *")
    print_fl("*******************************")

    # ------ Setup -------------

    print_preamble()


    # Make directories
    mkdirs_safe([
        rna_bam_rep1_dir, mnase_bam_rep2_dir, mnase_bam_rep1_dir,
        rna_dir, mnase_dir
        ])

    # ------- Download BAM files to disk ------------

    print_fl("\n------- Downloading dataset ----------\n")
    download_bam()

    print_fl("\n------- RNA-seq ----------\n")
    init_rna_seq()

    print_fl("\n------- MNase-seq ----------\n")
    init_mnase_seq()

    print_fl("Data initialization done. Time elapsed: %s" % timer.get_time())
Example #12
def call_p123_nucleosomes(strand='sense'):

    from src import nucleosome_linkages
    from src.nucleosome_linkages import call_all_nucleosome_p123

    # relevant cross correlation directory

    p123_orfs = paper_orfs
    save_chrom_dir = sense_nuc_chrom_dir
    cc_dir = cc_sense_chrom_dir
    p1_path, p2_path, p3_path = (
        p1_sense_path,
        p2_sense_path,
        p3_sense_path
        )

    if strand == 'antisense': 
        p123_orfs = antisense_orfs
        save_chrom_dir = anti_nuc_chrom_dir
        cc_dir = cc_antisense_chrom_dir
        p1_path, p2_path, p3_path = (
            p1_antisense_path,
            p2_antisense_path,
            p3_antisense_path
            )

    mkdirs_safe([save_chrom_dir])

    print_fl("Calling +1, +2, and +3 nucleosomes...", end='\n')

    linkages, p123_orfs = call_all_nucleosome_p123(p123_orfs, 
        (strand=='antisense'), cc_dir, save_chrom_dir, timer)

    linkages.to_csv('%s/called_orf_nucleosomes_%s.csv' % (mnase_dir, strand))
    p123_orfs.to_csv('%s/called_orf_p123_nucleosomes_%s.csv' % (mnase_dir, strand))

    p1 = nucleosome_linkages.convert_to_pos_time_df(p123_orfs, linkages, '+1')
    p2 = nucleosome_linkages.convert_to_pos_time_df(p123_orfs, linkages, '+2')
    p3 = nucleosome_linkages.convert_to_pos_time_df(p123_orfs, linkages, '+3')

    p1.to_csv(p1_path)
    p2.to_csv(p2_path)
    p3.to_csv(p3_path)

    print_fl('Done.')
    timer.print_time()
    print_fl()
Example #13
def compute_organization_measures(strand='sense'):

    from src.entropy import calculate_cc_summary_measure

    orfs = paper_orfs
    if strand == 'antisense':
        orfs = antisense_orfs

    mkdirs_safe([sense_entropy_dir, anti_entropy_dir]) 

    print_fl("Loading cross correlation")
    cross = pd.read_hdf('%s/cross_correlation_%s.h5.z' % 
        (mnase_dir, strand), 'cross_correlation')
    
    print_fl("Calculating entropy %d ORFS..." % len(orfs))
    entropies = calculate_cc_summary_measure(orfs, cross, strand,
        timer)
    entropies = entropies.round(3)
    entropies.to_csv('%s/orf_%s_entropies.csv' % (mnase_dir, strand))
    timer.print_time()
    print_fl()
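
calculate_cc_summary_measure lives in src.entropy; its exact definition is not shown here. As a rough illustration of an entropy-style organization measure, one can normalize a cross-correlation profile into a distribution and take its Shannon entropy (a generic sketch, not the project's formula):

import numpy as np
from scipy.stats import entropy

def profile_entropy(cc_profile):
    # cc_profile: 1-D array of non-negative cross-correlation scores over
    # an ORF's window. Higher entropy suggests a less organized profile.
    p = np.clip(np.asarray(cc_profile, dtype=float), 0, None)
    if p.sum() == 0:
        return 0.0
    return entropy(p / p.sum(), base=2)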
Example #14
def typhoon_plots():

    global plotter
    plotter = get_plotter()

    save_dir = '%s/typhoon' % OUTPUT_DIR
    save_dir_all_motifs = '%s/typhoon_all_motifs' % OUTPUT_DIR

    mkdirs_safe([save_dir, save_dir_all_motifs, misc_figures_dir])

    figwidths = {'MCD4': 12, 'APJ1': 12}
    paddings = {'MCD4': (1000, 2000), 'APJ1': (1000, 2000)}

    print_fl("Plotting typhoons...", end='')
    plotter.plot_genes(genes,
                       save_dir,
                       save_dir_all_motifs,
                       figwidths=figwidths,
                       paddings=paddings)
    print_fl("Done.")
    timer.print_time()

    # example plots
    print_fl("Plotting examples...", end='')
    draw_example_mnase_seq(plotter, misc_figures_dir)
    draw_example_rna_seq(plotter, misc_figures_dir)
    # plot_example_cross(plotter, misc_figures_dir)
    print_fl("Done.")
    timer.print_time()

    # Hsp26 plot for figure 1
    plotter.plot_genes(['HSP26'],
                       save_dir,
                       None,
                       times=[0.0, 30.0, 60.0, 120.0],
                       prefix='fig_')
Example #15
def shift_plots():

    from src.nucleosome_calling import plot_p123
    from src.reference_data import read_sgd_orf_introns, read_sgd_orfs
    from src.reference_data import read_park_TSS_PAS
    from src.summary_plotter import SummaryPlotter

    global plotter

    orf_cc = pd.read_hdf(cross_corr_sense_path, 'cross_correlation')

    all_orfs = all_orfs_TSS_PAS()

    sum_plotter = SummaryPlotter(datastore, all_orfs, orf_cc)

    if plotter is None:
        plotter = get_plotter()

    save_dir = '%s/shift' % OUTPUT_DIR
    mkdirs_safe([save_dir])

    shift_genes = ['RPS7A']

    for gene_name in shift_genes:
        fig = plot_p123(gene_name, orf_cc, plotter, sum_plotter, save_dir)

    p1 = datastore.p1_shift[[120.0]]
    p2 = datastore.p2_shift[[120.0]]
    p3 = datastore.p3_shift[[120.0]]

    p12 = p1.join(p2, lsuffix='_+1', rsuffix='_+2')
    p23 = p2.join(p3, lsuffix='_+2', rsuffix='_+3')

    from src.chromatin_summary_plots import plot_distribution

    x = datastore.p1_shift[120]
    y = datastore.transcript_rate_logfold.loc[x.index][120.0]

    model = plot_distribution(x,
                              y,
                              '$\\Delta$ +1 nucleosome shift',
                              '$\\log_2$ fold-change transcription rate',
                              title='+1 shift vs transcription, 0-120 min',
                              xlim=(-40, 40),
                              ylim=(-8, 8),
                              xstep=10,
                              ystep=2,
                              pearson=True,
                              s=10)
    plt.savefig('%s/shift_+1_xrate.pdf' % save_dir, transparent=True)

    x = datastore.p1_shift[120]
    y = datastore.p2_shift[120]
    model = plot_distribution(x,
                              y,
                              '$\\Delta$ +1 nucleosome shift',
                              '$\\Delta$ +2 nucleosome shift',
                              title='+1, +2 nucleosome shift\n0-120 min',
                              xlim=(-40, 40),
                              ylim=(-40, 40),
                              xstep=10,
                              ystep=10,
                              pearson=False,
                              s=10)

    plt.savefig('%s/shift_p12.pdf' % save_dir, transparent=True)
Example #16
def determine_transcript_boundaries():

    print_fl("Reading RNA-seq pileup...", end='')
    rna_seq_pileup = pd.read_hdf(pileup_path, 'pileup')
    print_fl("Done.")
    timer.print_time()
    print_fl()

    from src.transcript_boundaries import compute_boundaries, load_park_boundaries

    park_boundaries = load_park_boundaries()

    mkdirs_safe([anti_chrom_dir, sense_chrom_dir])

    # ------------- Antisense transcript boundaries ------------------

    print_fl("Determining antisense boundaries...", end='')
    antisense_boundaries = compute_boundaries(park_boundaries,
                                              rna_seq_pileup,
                                              save_dir=anti_chrom_dir,
                                              pileup_path=pileup_path,
                                              find_antisense=True,
                                              log=True,
                                              timer=timer)

    # all to compare with Park
    path = '%s/antisense_boundaries_computed_all.csv' % rna_dir
    antisense_boundaries.to_csv(path)

    # antisense paper data set
    path = '%s/antisense_boundaries_computed.csv' % rna_dir
    antisense_boundaries = antisense_boundaries[[
        'TSS', 'strand', 'start', 'stop'
    ]].join(paper_orfs[['name', 'chr', 'orf_class']], how='inner')
    antisense_boundaries.to_csv(antisense_orfs_path)

    print_fl("Done.")
    print_fl("Wrote to %s" % path)
    timer.print_time()
    print_fl()

    # ------------- Sense transcript boundaries ------------------

    # compute sense boundaries
    # TODO: currently unused, only for check against Park boundaries
    print_fl("Determining sense boundaries...", end='')
    sense_boundaries = compute_boundaries(park_boundaries,
                                          rna_seq_pileup,
                                          save_dir=sense_chrom_dir,
                                          pileup_path=pileup_path,
                                          find_antisense=False,
                                          log=True,
                                          timer=timer)

    # all to compare with Park
    path = '%s/sense_boundaries_computed_all.csv' % rna_dir
    sense_boundaries.to_csv(path)

    # paper data set
    path = '%s/sense_boundaries_computed.csv' % rna_dir
    sense_boundaries.join(paper_orfs[[]], how='inner').to_csv(path)

    print_fl("Done.")
    print_fl("Wrote to %s" % path)
    timer.print_time()
    print_fl()
Example #17
def main():
    """
    Computation of chromatin metrics against sense and antisense strand
    for analysis.

    1. * For the sense strand, for each ORF:
    2. Compute occupancies
    3. Compute cross correlation and save these per chromosome to disk
    4. Compute cross correlation summaries
    5. Call nucleosomes in cross correlation window
    6. Call +1, +2, and +3 nucleosomes relative to TSS
    7. * Repeat 2-6 for identified antisense TSSs

    Inputs:
    - MNase-seq data
    - RNA-seq data
    - Antisense transcript boundaries

    Output:
    - MNase-seq occupancy summaries for each ORF
    - Per bp cross correlation scores for each ORF
    - Cross correlation summary scores for each ORF
    - Called nucleosomes local to each ORF
    - Called +1, +2, and +3 nucleosomes relative to each ORF

        * for Sense and Antisense strands

    """

    print_fl("***********************")
    print_fl("* 3      Metrics      *")
    print_fl("***********************")

    print_preamble()

    # paths to save cross correlations per chromosome
    mkdirs_safe([cc_sense_chrom_dir, cc_antisense_chrom_dir])
    
    if USE_SLURM: mkdirs_safe([WATCH_TMP_DIR])
    
    print_fl("\n------- Read inputs ----------\n")
    read_input_data()

    print_fl("\n------- Calculate occupancies (Sense) ----------\n")
    compute_occupancies()

    print_fl("\n------- Calculate cross correlation (Sense) ----------\n")
    compute_cross_correlations()  

    print_fl("\n------- Calculate nucleosome shift (Sense) ----------\n")
    call_p123_nucleosomes()

    print_fl("\n------- Calculate entropy (Sense) ----------\n")
    compute_organization_measures(strand='sense')

    print_fl("\n------- Calculate occupancies (Antisense) ----------\n")
    compute_occupancies(strand='antisense')

    print_fl("\n------- Calculate cross correlation (Antisense) ----------\n")
    compute_cross_correlations(strand='antisense')

    print_fl("\n------- Calculate nucleosome shift (Antisense) ----------\n")
    call_p123_nucleosomes(strand='antisense')

    print_fl("\n------- Calculate entropy (Antisense) ----------\n")
    compute_organization_measures(strand='antisense')

    print_fl("\n--------- Generate data for supplemental ------------\n")
    create_suppl_data()
Example #18
def init_rna_seq():

    rna_seq_rep1_filenames = [
        "DM538_RNA_rep1_0_min.bam",
        "DM539_RNA_rep1_7.5_min.bam",
        "DM540_RNA_rep1_15_min.bam",
        "DM541_RNA_rep1_30_min.bam",
        "DM542_RNA_rep1_60_min.bam",
        "DM543_RNA_rep1_120_min.bam"
    ]

    rna_seq_rep2_filenames = [
        "DM1450_RNA_rep2_0_min.bam",
        "DM1451_RNA_rep2_7.5_min.bam",
        "DM1452_RNA_rep2_15_min.bam",
        "DM1453_RNA_rep2_30_min.bam",
        "DM1454_RNA_rep2_60_min.bam",
        "DM1455_RNA_rep2_120_min.bam"
    ]

    from src.read_bam import read_rna_seq_set
    from src.transcription import sample_rna

    # Read replicate 1
    print_fl("Reading Replicate 1 RNA-seq BAM...", end='')
    rna_seq_rep1 = read_rna_seq_set(rna_bam_rep1_dir, rna_seq_rep1_filenames,
        source='dm538_dm543',
        debug=DEBUG)
    print_fl("Done.")
    timer.print_time()
    print_fl()

    # Read replicate 2
    print_fl("Reading Replicate 2 RNA-seq BAM...", end='')
    rna_seq_rep2 = read_rna_seq_set(rna_bam_rep2_dir, rna_seq_rep2_filenames,
        source='dm1450_dm1455',
        debug=DEBUG)
    print_fl("Done.")
    timer.print_time()
    print_fl()

    # depth of each dataset
    rep1_depth = rna_seq_rep1[['chr', 'time']].groupby('time').count()\
        .rename(columns={'chr':'count'})
    rep2_depth = rna_seq_rep2[['chr', 'time']].groupby('time').count()\
        .rename(columns={'chr':'count'})

    print_fl("Rep1 read depth:\n" + str(rep1_depth), end='\n\n')
    print_fl("Rep2 read depth:\n" + str(rep2_depth), end='\n\n')

    sample_rep1_depth = rep1_depth['count'].min()
    sample_rep2_depth = rep2_depth['count'].min()

    print_fl("Sampling Rep1 to %d for each time point..." % 
        sample_rep1_depth, end='')
    rep1_sampled = sample_rna(rna_seq_rep1, sample_rep1_depth)
    print_fl("Done.")
    timer.print_time()
    print_fl()

    print_fl("Sampling Rep2 to %d for each time point..." % 
        sample_rep2_depth, end='')
    rep2_sampled = sample_rna(rna_seq_rep2, sample_rep2_depth)
    print_fl("Done.")
    timer.print_time()
    print_fl()

    # Merge replicates
    print_fl("Merging RNA-seq files...", end='')
    merged_rna = rna_seq_rep1.append(rna_seq_rep2)
    merged_rna = merged_rna[['chr', 'start', 
    'stop', 'length', 'strand', 'time', 'source']].sort_values(
        ['source', 'time', 'chr', 'start'])
    print_fl("Done.")
    timer.print_time()
    print_fl()

    # Merge sampled replicates
    print_fl("Merging RNA-seq files...", end='')
    merged_rna_sampled = rep1_sampled.append(rep2_sampled)
    merged_rna_sampled = merged_rna_sampled[['chr', 'start', 
    'stop', 'length', 'strand', 'time', 'source']].sort_values(
        ['source', 'time', 'chr', 'start'])
    print_fl("Done.")
    timer.print_time()
    print_fl()

    # Save all RNA-seq data to disk
    save_path = '%s/rnase_seq_all.h5.z' % rna_dir
    print_fl("Saving merged RNA-seq to %s..." % save_path, end='')
    merged_rna.to_hdf(save_path, 'rna_seq_data', mode='w', complevel=9,
        complib='zlib')
    print_fl("Done.")

    # Save merged, sampled data to disk
    save_path = '%s/rnase_seq_merged_sampled.h5.z' % rna_dir
    print_fl("Saving merged, sampled RNA-seq to %s..." % save_path, end='')
    merged_rna_sampled.to_hdf(save_path, 'rna_seq_data', mode='w', complevel=9,
        complib='zlib')
    print_fl("Done.")
    timer.print_time()
    print_fl()

    # convert to pileup dataframe
    mkdirs_safe([pileup_chrom_dir])

    print_fl("Calculating RNA-seq pileup...", end='')
    from src.rna_seq_pileup import calculate_rna_seq_pileup
    pileup = calculate_rna_seq_pileup(merged_rna_sampled, timer)
    print_fl("Done.")
    timer.print_time()
    print_fl()

    save_path = pileup_path
    print_fl("Saving RNA-seq pileup to %s..." % save_path, end='')
    pileup.to_hdf(save_path, 'pileup', mode='w', complevel=9, complib='zlib')
    print_fl("Done.")
    timer.print_time()
    print_fl()
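
sample_rna (from src.transcription) is what equalizes read depth across time points before the replicates are merged. A generic pandas sketch of that kind of downsampling, assuming a 'time' column as in the dataframes above:

def sample_reads_per_time(reads, depth, seed=123):
    # Randomly keep `depth` reads per time point so every time point
    # contributes equally. Hypothetical stand-in for sample_rna; `depth`
    # is expected to be no larger than the smallest time point.
    return (reads.groupby('time', group_keys=False)
                 .apply(lambda g: g.sample(n=depth, random_state=seed)))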
Example #19
def misc_plots():

    scatter_dpi = 200

    from src.met4 import plot_timecourse
    from src.chromatin_summary_plots import (
        plot_combined_vs_xrate, plot_sul_prom_disorg, plot_occ_vs_xrate,
        plot_disorg_vs_xrate, plot_diosorg_vs_occ, plot_frag_len_dist)
    from src.cross_correlation_kernel import MNaseSeqDensityKernel

    met4_dir = "%s/met4" % OUTPUT_DIR
    scatters_dir = "%s/scatters" % OUTPUT_DIR
    kernels_dir = "%s/kernels" % OUTPUT_DIR
    mkdirs_safe([met4_dir, scatters_dir, kernels_dir])

    nuc_kernel = MNaseSeqDensityKernel(filepath=nuc_kernel_path)
    nuc_kernel.plot_kernel(kernel_type='nucleosome')
    plt.savefig('%s/nuc_kernel.pdf' % (kernels_dir), transparent=True)

    sm_kernel = MNaseSeqDensityKernel(filepath=sm_kernel_path)
    sm_kernel.plot_kernel(kernel_type='small')
    plt.savefig('%s/sm_kernel.pdf' % (kernels_dir), transparent=True)

    from src.kernel_fitter import compute_triple_kernel
    triple_kernel = compute_triple_kernel(nuc_kernel)
    triple_kernel.plot_kernel(kernel_type='triple')
    plt.savefig('%s/triple_kernel.pdf' % (kernels_dir), transparent=True)

    from src.nucleosome_calling import plot_nuc_calls_cc
    plot_nuc_calls_cc()
    plt.savefig('%s/nuc_cross_cor_0_min.pdf' % (misc_figures_dir),
                transparent=True)

    # met4 plots
    plot_timecourse(datastore)
    plt.savefig('%s/met4_timecourse.pdf' % (met4_dir), transparent=True)

    plot_sul_prom_disorg(datastore)
    plt.savefig('%s/met4_scatter.pdf' % (met4_dir),
                transparent=True,
                dpi=scatter_dpi)

    # scatter plots
    plot_combined_vs_xrate(datastore, selected_genes)
    plt.savefig('%s/combined_vs_xrate.pdf' % (scatters_dir),
                transparent=True,
                dpi=scatter_dpi)

    plot_occ_vs_xrate(datastore, selected_genes)
    plt.savefig('%s/small_vs_xrate.pdf' % (scatters_dir),
                transparent=True,
                dpi=scatter_dpi)

    plot_disorg_vs_xrate(datastore, selected_genes)
    plt.savefig('%s/disorg_vs_xrate.pdf' % (scatters_dir),
                transparent=True,
                dpi=scatter_dpi)

    plot_diosorg_vs_occ(datastore, selected_genes)
    plt.savefig('%s/disorg_vs_small.pdf' % (scatters_dir),
                transparent=True,
                dpi=scatter_dpi)

    plot_ORFs_len(misc_figures_dir)

    plot_coverage(misc_figures_dir)

    global plotter

    if plotter is None:
        plotter = get_plotter()

    # plot sampled mnase data
    plot_frag_len_dist(plotter.all_mnase_data)
    plt.savefig("%s/frag_length_distribution.pdf" % misc_figures_dir,
                transparent=True)

    print_fl("Load all MNase-seq data for fragment length distributions")
    all_mnase_data = pd.read_hdf('%s/mnase_seq_merged_all.h5.z' % mnase_dir,
                                 'mnase_data')
    repl1_mnase = all_mnase_data[all_mnase_data['source'] == 'dm498_503']
    repl2_mnase = all_mnase_data[all_mnase_data['source'] == 'dm504_509']
    print_fl("Done.")

    plot_frag_len_dist(repl1_mnase, "Replicate 1", normalize=True)
    plt.savefig('%s/frag_length_distribution_repl1.pdf' % misc_figures_dir,
                transparent=True)

    plot_frag_len_dist(repl2_mnase, "Replicate 2", normalize=True)
    plt.savefig('%s/frag_length_distribution_repl2.pdf' % misc_figures_dir,
                transparent=True)
Example #20
def antisense_plots():

    from src.antisense_analysis import plot_antisense_vs_sense
    from src.antisense_analysis import plot_bar_counts, plot_antisense_dist

    save_dir = '%s/antisense' % OUTPUT_DIR
    mkdirs_safe([save_dir])

    antisense_TPM = read_orfs_data('%s/antisense_TPM.csv' % rna_dir)
    antisense_TPM_logfold = read_orfs_data('%s/antisense_TPM_log2fold.csv' %
                                           rna_dir)

    plot_antisense_vs_sense(
        antisense_TPM_logfold,
        datastore.transcript_rate_logfold,
        120.0,
        highlight=['MET31', 'CKB1', 'RPS7A', 'YBR241C', 'UTR2'])
    plt.savefig('%s/sense_antisense_distr.pdf' % save_dir,
                transparent=True,
                dpi=100)

    plot_bar_counts(antisense_TPM_logfold, datastore.transcript_rate_logfold)
    plt.savefig('%s/sense_antisense_counts.pdf' % save_dir)

    plot_antisense_dist(antisense_TPM_logfold)
    plt.savefig('%s/antisense_logfc_dist.pdf' % save_dir)

    from src.antisense_analysis import plot_antisense_lengths, plot_antisense_calling

    rna_seq_pileup = pd.read_hdf('%s/rna_seq_pileup.h5.z' % rna_dir, 'pileup')
    antisense_boundaries = read_orfs_data(
        '%s/antisense_boundaries_computed.csv' % rna_dir)

    plot_antisense_lengths()
    plt.savefig('%s/antisense_lengths_dist.pdf' % save_dir)

    plot_antisense_calling('MET31', rna_seq_pileup)
    plt.savefig('%s/antisense_met31_calling.pdf' % save_dir)

    from src.chromatin_summary_plots import plot_distribution

    anti_datastore = ChromatinDataStore(is_antisense=True)
    x = anti_datastore.promoter_sm_occupancy_delta.mean(axis=1)
    y = anti_datastore.antisense_TPM_logfold.mean(axis=1).loc[x.index]
    model = plot_distribution(
        x,
        y,
        '$\\Delta$ Antisense promoter occupancy',
        'Log$_2$ fold-change antisense transcript',
        highlight=[],
        title='Promoter occupancy vs transcription (Antisense)',
        xlim=(-1.5, 1.5),
        ylim=(-4, 4),
        xstep=0.5,
        ystep=1)
    plt.savefig('%s/antisense_chrom_dist_prom_vs_xrate.pdf' % save_dir)

    x = anti_datastore.gene_body_disorganization_delta.mean(axis=1).dropna()
    y = anti_datastore.antisense_TPM_logfold.loc[x.index].mean(
        axis=1).loc[x.index]

    model = plot_distribution(
        x,
        y,
        '$\\Delta$ antisense nucleosome disorganization',
        'Log$_2$ fold-change antisense transcripts',
        highlight=[],
        title='Nuc. disorganization vs transcription (Antisense)',
        xlim=(-1.5, 1.5),
        ylim=(-4, 4),
        xstep=0.5,
        ystep=1)
    plt.savefig('%s/antisense_chrom_dist_disorg_vs_xrate.pdf' % save_dir)