def main():

    _, chrom, antisense = sys.argv
    antisense = antisense.lower() == 'true'

    chrom = int(chrom)
    print_fl("Running nucleosome calling on chromosome %d, antisense: %s" % 
        (chrom, str(antisense)))

    name = task_name(antisense)
    timer = Timer()

    p123_orfs = paper_orfs
    save_chrom_dir = sense_nuc_chrom_dir
    cc_dir = cc_sense_chrom_dir

    if antisense:
        p123_orfs = antisense_orfs
        save_chrom_dir = anti_nuc_chrom_dir
        cc_dir = cc_antisense_chrom_dir

    call_nucleosomes_p123_chrom(p123_orfs, chrom, antisense, 
        cc_dir, save_chrom_dir, timer)

    child_done(name, WATCH_TMP_DIR, chrom)
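For reference, this entry point takes two positional arguments; a hypothetical invocation (the script name is a placeholder):

# Hypothetical invocation; the script name is a placeholder:
#   python call_nucleosomes_chrom.py 3 true
# yields sys.argv = ['call_nucleosomes_chrom.py', '3', 'true'],
# i.e. chromosome 3 with antisense calling enabled.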
Example #2
def child_done(name, parent_watch_dir, child_name):
    # write child done file
    watch_dir = parent_watch_dir + '/' + name
    write_path = ('%s/child_%s.watch' % (watch_dir, str(child_name)))
    print_fl(write_path)
    with open(write_path, 'w') as f:
        f.write('Done')
Example #3
    def wait_for_tasks(self):
        """
        Wait for the number of child processes to finish by monitoring watch
        directory
        """

        while True:

            # count # of child processes done
            files = os.listdir(self.watch_dir)
            num_done = sum([f.endswith('.watch') for f in files])
            print_fl("%d/%d finished. " % (num_done, self.num_wait), end='')

            if self.timer is not None:
                print_fl(" %s " % self.timer.get_time(), end='')

            # number of children done
            if num_done == self.num_wait:
                print_fl("All children done.")
                break

            # sleep until all children are done
            else:
                print_fl("Sleeping %ds" %
                    (self.sleep_time))
                time.sleep(self.sleep_time)

        # clean up watch directory
        print_fl("Cleaning up watch directory.")
        self.cleanup()
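These two pieces form a simple file-based barrier: each worker calls child_done, and the parent's wait_for_tasks polls the watch directory until every child has written its .watch file. A minimal sketch of the round trip, assuming the TaskDriver constructor seen in the later examples (name, watch directory, expected child count); the task name here is hypothetical:

# Minimal sketch of the watch-file barrier (task name is hypothetical).
name = 'nucleosomes'
driver = TaskDriver(name, WATCH_TMP_DIR, 16, timer=Timer())
driver.print_driver()

for chrom in range(1, 17):
    # in SLURM mode this call happens inside each sbatch job instead
    child_done(name, WATCH_TMP_DIR, chrom)

driver.wait_for_tasks()  # returns once 16 .watch files exist, then cleans up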
Example #4
    def select_data(self, head=500):

        act_prm = self.store.promoter_sm_occupancy_delta
        act_dorg = self.store.gene_body_disorganization_delta

        mean_prm = act_prm.mean(axis=1).sort_values(ascending=False)
        mean_dorg = act_dorg.mean(axis=1).sort_values(ascending=False)

        prm_orfs = mean_prm.head(head).index
        dorg_orfs = mean_dorg.head(head).index

        self.data = self.store.chromatin_data.loc[set(prm_orfs).union(
            set(dorg_orfs))]

        def _inverse_quantile(array, val):
            """What is the quantile of the value in the array. (CDF)"""
            return np.mean(array <= val)

        prom_quantile = _inverse_quantile(mean_prm.values,
                                          mean_prm.values[head])
        dorg_quantile = _inverse_quantile(mean_dorg.values,
                                          mean_dorg.values[head])

        print_fl("Promoter ORFs: %d (%.1f%%)\n"
                 "Disorganization ORFs: %d (%.1f%%)" % \
                 (len(prm_orfs),  prom_quantile*100.,
                  len(dorg_orfs), dorg_quantile*100.
                  ))

        print_fl("%d genes total" % len(self.data))
Example #5
def compute_boundaries_chrom(orfs, pileup, chrom, save_dir, find_antisense,
    log=False, timer=None):

    chrom_genes = orfs[orfs.chr == chrom]

    transcript_boundaries = chrom_genes[[]].copy()
    transcript_boundaries['start'] = None
    transcript_boundaries['stop'] = None

    search_window = 2000
    search_2 = search_window // 2

    if log and timer is not None: 
        print_fl("Chromosome %d - %s. %d genes" % 
            (chrom, timer.get_time(), len(chrom_genes)))

    chrom_rna_seq = pileup[pileup.chr == chrom]

    i = 0
    for orf_name, gene in chrom_genes.iterrows():

        if log and timer is not None and i % 100 == 0: 
            print_fl("%d/%d - %s" % (i, len(chrom_genes),
             timer.get_time()))

        i += 1

        span = gene.transcript_start-search_2, gene.transcript_stop+search_2
        gene_pileup = filter_rna_seq_pileup(chrom_rna_seq, 
            span[0], span[1], gene.chr)

        try:
            start, stop = find_transcript_boundaries(gene_pileup, span, gene,
                find_antisense=find_antisense)
        except ValueError:
            # skip if issues finding boundaries
            continue

        transcript_boundaries.loc[orf_name, 'start'] = start
        transcript_boundaries.loc[orf_name, 'stop'] = stop

        TSS, TES = start, stop
        if ((gene.strand == '-' and not find_antisense) or
            (gene.strand == '+' and find_antisense)):
            TSS, TES = stop, start

        strand = gene.strand
        if find_antisense: strand = flip_strand(gene.strand) 

        transcript_boundaries.loc[orf_name, 'TSS'] = TSS
        transcript_boundaries.loc[orf_name, 'TES'] = TES
        transcript_boundaries.loc[orf_name, 'strand'] = strand
        transcript_boundaries.loc[orf_name, 'chr'] = gene.chr

    transcript_boundaries = transcript_boundaries.dropna()
    transcript_boundaries.to_csv(boundary_file_name(save_dir, chrom, 
        find_antisense))

    return transcript_boundaries
Example #6
    def fit(self):

        np.random.seed(123)
        print_fl("Fitting positional distribution...")
        self.fit_position()

        print_fl("\nFitting length distribution...")
        self.fit_length()
Example #7
def plot_example_cross(plotter, save_dir):
    from src.chromatin import filter_mnase
    from src.plot_utils import apply_global_settings
    from config import cross_corr_sense_path
    from src.utils import get_orf

    orf_cc = pd.read_hdf(cross_corr_sense_path, 'cross_correlation')
    orfs = plotter.orfs

    gene = get_orf('APJ1', orfs)
    span = (gene.TSS - 500, gene.TSS + 500)
    plotter.set_span_chrom(span, gene.chr)

    cc_nuc = orf_cc.loc['nucleosomal'].loc[gene.name].loc[0.0]
    cc_small = orf_cc.loc['small'].loc[gene.name].loc[0.0]

    data = filter_mnase(plotter.all_mnase_data,
                        span[0],
                        span[1],
                        chrom=gene.chr,
                        time=0)

    fig, (ax, leg_ax) = plt.subplots(2, 1, figsize=(5, 6))
    fig.tight_layout(rect=[0.1, 0.1, 0.92, 0.945])
    plt.subplots_adjust(hspace=0.0, wspace=0.5)

    plotter.plot_typhoon_time(ax, data, 0, scale_z=True)
    ax.set_xlim(*span)
    ax.set_xticks(np.arange(span[0], span[1], 500))
    ax.set_xticks(np.arange(span[0], span[1], 100), minor=True)

    ax.set_xlabel("Position (bp)", fontsize=16)
    ax.set_ylabel("Fragment length (bp)", fontsize=16, labelpad=10)
    ax.set_ylim(-100, 250)

    draw_legend(leg_ax, span, 500)

    cc_ax = ax.twinx()
    cc_ax.set_ylabel("Cross correlation $\\times$0.1",
                     rotation=270,
                     fontsize=16,
                     labelpad=10,
                     va='bottom')

    scale_cc = 1
    y_origin = 0
    x = cc_nuc.index + gene.TSS
    y = cc_nuc.values * scale_cc + y_origin
    cc_ax.fill_between(x, y, y_origin, color='#28a098')

    y = -cc_small.values * scale_cc + y_origin
    cc_ax.fill_between(x, y_origin, y, color='#f28500')
    cc_ax.set_ylim(-0.1, 0.4)
    cc_ax.set_yticklabels(np.arange(-1, 5))

    write_path = '%s/%s.pdf' % (save_dir, 'example_cross_correlation')
    plt.savefig(write_path, transparent=True)
    print_fl("Wrote %s" % write_path)
Example #8
def danpos():

    from src.dpos_bed import create_bed_for_dpos
    import os
    from src.utils import run_cmd

    working_dir = os.getcwd()

    danpos_output = '%s/danpos/' % (OUTPUT_DIR)
    mkdirs_safe([danpos_output])

    danpos_path = "%s/danpos-2.2.2/danpos.py" % working_dir

    # create DANPOS Bed file
    mnase = pd.read_hdf(mnase_seq_path, 'mnase_data')
    mnase = mnase[mnase.time == 0]

    save_file = 'mnase_0.bed'
    save_path = '%s/%s' % (danpos_output, save_file)
    create_bed_for_dpos(mnase, save_path)
    print_fl("Wrote %s" % save_path)

    bash_command = "scripts/6_reviewer_mats/run_danpos.sh %s %s %s" % \
        (save_file, OUTPUT_DIR, danpos_path)
    output, error = run_cmd(bash_command, stdout_file=None)

    danpos_calls_path = '%s/result/pooled/mnase_0.smooth.positions.xls' % \
        (danpos_output)
    danpos_positions = pd.read_csv(danpos_calls_path, sep='\t')

    plt.hist(danpos_positions[danpos_positions.smt_value < 10000].smt_value,
             bins=100)
    plt.savefig("%s/danpos_smt_pos.png" % danpos_output)

    danpos_positions = danpos_positions.sort_values('smt_value',
                                                    ascending=False)

    top_danpos = danpos_positions.head(2500)
    top_danpos = top_danpos.rename(columns={
        'chr': 'chromosome',
        'smt_pos': 'position'
    })

    from src.chromatin import collect_mnase
    from src.kernel_fitter import compute_nuc_kernel

    nuc_kernel = compute_nuc_kernel(mnase, top_danpos)
    nuc_kernel.save_kernel("%s/danpos_kernel.json" % danpos_output)

    from src.kernel_fitter import compute_triple_kernel
    nuc_kernel.plot_kernel(kernel_type='nucleosome')
    plt.savefig('%s/danpos_nuc_kernel.pdf' % (save_dir), transparent=True)

    triple_kernel = compute_triple_kernel(nuc_kernel)
    triple_kernel.plot_kernel(kernel_type='triple')
    plt.savefig('%s/danpos_triple_kernel.pdf' % (save_dir), transparent=True)
Example #9
    def save_data(self):
        save_dir = '%s/tf_analysis' % OUTPUT_DIR
        mkdirs_safe([save_dir])

        print_fl("Saving %s" % save_dir)
        self.all_peaks.to_csv('%s/all_peaks.csv' % save_dir)
        self.linked_peaks_normalized.to_csv('%s/linked_peaks_norm.csv' % save_dir)
        self.linked_peaks.to_csv('%s/linked_peaks.csv' % save_dir)
        self.prom_peaks.to_csv('%s/prom_peaks.csv' % save_dir)
        self.all_motifs.to_csv('%s/all_motifs.csv' % save_dir)
Example #10
def calculate_promoter_regions():

    from config import OUTPUT_DIR

    all_orfs = all_orfs_TSS_PAS()
    gene_boundaries = load_park_orf_transcript_boundaries()
    gene_boundaries = gene_boundaries.join(all_orfs[['orf_class', 'name']])
    gene_boundaries = gene_boundaries[gene_boundaries.orf_class != 'Dubious']

    # find promoter regions using gene boundaries
    gene_boundaries = gene_boundaries.sort_values(['chr', 'transcript_start'])
    gene_boundaries['promoter_start'] = -1
    gene_boundaries['promoter_stop'] = -1

    default_prom_size = 1000

    for chrom in range(1, 17):
        
        chrom_genes = gene_boundaries[gene_boundaries.chr == chrom]
        
        for idx, gene in chrom_genes.iterrows():
            
            if gene.strand == '+':
                prom_start = gene.transcript_start-default_prom_size # default promoter boundary
                upstream_genes = chrom_genes[(chrom_genes.transcript_stop > prom_start) & 
                                             (chrom_genes.transcript_stop < gene.transcript_start)]

                # how close is the closest upstream gene
                if len(upstream_genes) > 0:
                    upstream_genes = upstream_genes.sort_values('transcript_stop', ascending=False)
                    prom_start = upstream_genes.reset_index().loc[0].transcript_stop

                # promoter defined by closest upstream stop and transcript start
                gene_boundaries.loc[idx, 'promoter_start'] = prom_start
                gene_boundaries.loc[idx, 'promoter_stop'] = gene.transcript_start
                
            elif gene.strand == '-':
                prom_start = gene.transcript_stop+default_prom_size # default promoter boundary
                upstream_genes = chrom_genes[(chrom_genes.transcript_start < prom_start) &
                                             (chrom_genes.transcript_start > gene.transcript_stop)]

                # how close is the closest upstream gene
                if len(upstream_genes) > 0:
                    upstream_genes = upstream_genes.sort_values('transcript_start')
                    prom_start = upstream_genes.reset_index().loc[0].transcript_start

                # promoter defined by closest upstream gene's transcript start and this gene's transcript stop
                gene_boundaries.loc[idx, 'promoter_stop'] = prom_start
                gene_boundaries.loc[idx, 'promoter_start'] = gene.transcript_stop

    save_path = '%s/calculated_promoters.csv' % OUTPUT_DIR
    gene_boundaries.to_csv(save_path)
    print_fl("Saved to %s" % save_path)
    return gene_boundaries
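To make the promoter rule concrete, a toy computation mirroring the '+'-strand branch (all coordinates are hypothetical):

default_prom_size = 1000
transcript_start = 5000          # hypothetical '+'-strand gene
upstream_stops = [3200, 4600]    # transcript_stop of neighboring genes

prom_start = transcript_start - default_prom_size   # default boundary: 4000
overlapping = [s for s in upstream_stops
               if prom_start < s < transcript_start]
if overlapping:
    prom_start = max(overlapping)  # closest upstream gene wins: 4600
promoter = (prom_start, transcript_start)            # -> (4600, 5000)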
Example #11
def calculate_cross_correlation_all_chromosomes(mnase_seq,
                                                TSSs,
                                                nuc_kernel,
                                                sm_kernel,
                                                triple_kernel,
                                                log=True,
                                                save_chrom_dir=None,
                                                timer=None,
                                                find_antisense=False):

    name = task_name(find_antisense)
    driver = TaskDriver(name, WATCH_TMP_DIR, 16, timer=timer)
    driver.print_driver()

    # for all chromosomes calculate occupancies per orf
    for chrom in range(1, 17):

        if not USE_SLURM:
            calculate_cross_correlation_chr(mnase_seq, TSSs, chrom,
                                            find_antisense, nuc_kernel,
                                            sm_kernel, triple_kernel,
                                            save_chrom_dir, log, timer)
            child_done(name, WATCH_TMP_DIR, chrom)
        else:
            exports = (
                "CHROM=%d,ANTISENSE=%s,SLURM_WORKING_DIR=%s,CONDA_PATH=%s,CONDA_ENV=%s"
                % (chrom, str(find_antisense), SLURM_WORKING_DIR, CONDA_PATH,
                   CONDA_ENV))
            script = 'scripts/2_preprocessing/cross_correlation.sh'
            submit_sbatch(exports, script, WATCH_TMP_DIR)

    # wait for all chromosomes to finish
    # superfluous if not in SLURM mode
    driver.wait_for_tasks()
    print_fl()

    # merge
    summary_cross = pd.DataFrame()
    cross = pd.DataFrame()
    for chrom in range(1, 17):
        chrom_cross = pd.read_hdf(cross_filename(save_chrom_dir, chrom),
                                  'cross_correlation')

        if len(chrom_cross) == 0: continue

        chrom_summary = pd.read_csv(summary_filename(save_chrom_dir, chrom))\
            .set_index('orf_name')
        cross = cross.append(chrom_cross)
        summary_cross = summary_cross.append(chrom_summary)

    summary_cross = np.round(summary_cross, 5)
    cross = np.round(cross, 5)

    return cross, summary_cross
Example #12
    def write_gene_plots(self,
                         genes,
                         cc_dir,
                         lines_dir,
                         show_plot=True,
                         custom_lims={},
                         suffix='',
                         large_font=False):

        for gene_name in genes:

            # create heatmaps of the cross correlation for each gene
            write_path = "%s/%s%s.pdf" % (cc_dir, gene_name, suffix)

            try:
                self.set_gene(gene_name)
            except KeyError:
                print_fl("Could not plot %s" % gene_name)
                continue

            fig = self.plot_cross_correlation_heatmap(
                show_colorbar=True,
                title='$\it{' + gene_name + '}$ cross correlation',
                large_font=large_font)
            plt.savefig(write_path, transparent=False)

            # close plots
            if not show_plot:
                plt.close(fig)
                plt.cla()
                plt.clf()

            if gene_name in custom_lims.keys():
                lims = custom_lims[gene_name]
            else:
                lims = (None, None, None)

            # plot lines plots of time course
            write_path = "%s/%s%s.pdf" % (lines_dir, gene_name, suffix)
            fig = self.plot_lines(self.gene.name,
                                  title=r'$\it{' + gene_name +
                                  '}$ time course',
                                  lims=lims,
                                  large_font=large_font)
            plt.savefig(write_path, transparent=False)

            # close
            if not show_plot:
                plt.close(fig)
                plt.cla()
                plt.clf()
Example #13
    def subset_genes(self, head=None, tail=None):

        if head is not None: 
            subset_func = pd.Series.head
            N = head
        elif tail is not None:
            subset_func = pd.Series.tail
            N = tail
        else:
            raise ValueError("subset_genes requires either head or tail")

        self.N = N

        data = self.agg_fun(self.gene_body_disorganization_delta, axis=1)
        self.cur_disorg_orfs = subset_func(data.sort_values(ascending=False), N).index.values

        data = self.agg_fun(self.promoter_sm_occupancy_delta, axis=1)
        self.cur_promoter_orfs = subset_func(data.sort_values(ascending=False), N).index.values

        data = self.agg_fun(self.chromatin_data, axis=1)
        self.cur_chromatin_orfs = subset_func(data.sort_values(ascending=False), N).index.values

        data = self.agg_fun(self.xrate_data, axis=1)
        self.cur_xrate_orfs = subset_func(data.sort_values(ascending=False), N).index.values

        # TODO:
        # Report the quantile of the selection

        print_fl("Disorganization ORFs: %d" % len(self.cur_disorg_orfs))
        print_fl("Promoter ORFs:        %d" % len(self.cur_promoter_orfs))
        print_fl("Chromatin ORFs:       %d" % len(self.cur_chromatin_orfs))
        print_fl("Transcription ORFs:   %d" % len(self.cur_xrate_orfs))
Example #14
def sra_download_convert_bam(write_dir, sra_id, filename):

    prefetch = "%s/prefetch" % SRA_BIN_DIR
    sam_dump = "%s/sam-dump" % SRA_BIN_DIR

    # prefetch SRA ID
    print_fl("Prefetching %s" % sra_id)
    sra_write_path = "%s/%s.sra" % (write_dir, filename)
    run_cmd("%s %s --output-file %s" % (prefetch, sra_id, sra_write_path))

    # dump to sam
    sam_write_path = "%s/%s.sam" % (write_dir, filename)
    print_fl("Dumping SAM %s" % sam_write_path)
    run_cmd("%s %s" % (sam_dump, sra_write_path), stdout_file=sam_write_path)

    # convert to bam
    bam_write_path = "%s/%s.bam" % (write_dir, filename)
    print_fl("Converting to BAM %s" % bam_write_path)
    run_cmd("samtools view -b -S %s" % (sam_write_path),
            stdout_file=bam_write_path)

    # index the BAM (bam_write_path is unchanged from the conversion step)
    bam_index_path = "%s/%s.bam.bai" % (write_dir, filename)
    print_fl("Indexing BAM %s" % bam_index_path)
    run_cmd("samtools index %s %s" % (bam_write_path, bam_index_path))

    # remove large SAM file
    os.remove(sam_write_path)
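A hypothetical invocation (the accession and file names are placeholders), assuming SRA_BIN_DIR points at an SRA Toolkit install and samtools is on the PATH:

# Hypothetical usage; the accession and file names are placeholders.
sra_download_convert_bam('data/bam', 'SRR0000000', 'sample_0min')
# leaves data/bam/sample_0min.sra, .bam and .bam.bai on disk;
# the intermediate .sam is removed at the end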
Example #15
def call_nucleosomes_p123_chrom(orfs, chrom, antisense,
    cross_correlation_dir, save_chrom_dir, timer):

    chrom_orfs = orfs[orfs.chr == chrom]

    if len(chrom_orfs) == 0: return None

    print_fl("Chromosome %d. %d genes" % (chrom, len(chrom_orfs)))
    timer.print_time()

    p123_orfs = chrom_orfs[[]].copy()
    p123_orfs['+1'] = np.nan
    p123_orfs['+2'] = np.nan
    p123_orfs['+3'] = np.nan

    linkages = pd.DataFrame()

    # load relevant cross correlations
    chrom_cross_correlation = pd.read_hdf(
        '%s/cross_correlation_chr%d.h5.z' % 
        (cross_correlation_dir, chrom))

    i = 0
    for idx, orf in chrom_orfs.iterrows():

        if i % 200 == 0: 
            print_fl("  %d/%d - %s" % (i, len(chrom_orfs), timer.get_time()))

        i += 1
        # call nucleosomes and link them in ORF window

        try:
            nucs = call_orf_nucleosomes(chrom_cross_correlation.loc['diff'],
                orf)
            cur_linkages = find_linkages(nucs)
            linkages = linkages.append(cur_linkages)
        except KeyError:
            continue

        # +1, +2, +3    
        p1, p2, p3 = find_p123_gene(orf, linkages)
        p123_orfs.loc[idx] = [p1, p2, p3]

    save_path = nucleosomes_filename(save_chrom_dir, chrom)
    linkages.to_csv(save_path)

    p123_orfs = p123_orfs.dropna()
    save_path = p123_filename(save_chrom_dir, chrom)
    p123_orfs.to_csv(save_path)
Example #16
    def fit_position(self):
        """Fit positional distribution to a normal with uniform background"""
        Y = self.pivoted_data.sum(axis=0)
            
        # fixed parameters, mean of normal and uniform range
        mean_position = 0

        # both kernel types use the empirical std of the positional profile
        self.pos_std = np.std(Y)

        self.pos_mean = mean_position
        print_fl("Kernel positional mean %.2f and std %.2f" % 
              (self.pos_mean, self.pos_std))
Example #17
def compute_nuc_kernel(all_mnase_data, brogaard, top=2500):

    # Get MNase-seq data @ 0 min for top 2500 nucleosomes
    mnase_seq_0 = filter_mnase(all_mnase_data, time=0.0)
    top_brogaard = brogaard.head(top)

    brogaard_mnase = collect_mnase(mnase_seq_0, window=200, pos_chr_df=top_brogaard)

    nuc_length_mode = int(brogaard_mnase.length.mode())
    print_fl("Nucleosome length mode: %d" % nuc_length_mode)

    nuc_fitter = KernelFitter(brogaard_mnase, len_mean=nuc_length_mode, window=200, kernel_type='nucleosome')
    nuc_fitter.fit()
    nuc_fitter.generate_kernel()

    return nuc_fitter.kernel
Example #18
def compute_sm_kernel(all_mnase_data, abf1_sites):

    mnase_seq_0 = filter_mnase(all_mnase_data, time=0.0)
    abf1_mnase = collect_mnase(mnase_seq_0, window=150, 
                                   pos_chr_df=abf1_sites, chrom_key='chr', 
                                   pos_key='mid',
                                   strand='strand')
    
    abf1_length_mode = int(abf1_mnase.length.mode())
    print_fl("Abf1 fragment length mode: %d" % abf1_length_mode)

    sm_fitter = KernelFitter(abf1_mnase, len_mean=abf1_length_mode, window=150, 
        kernel_type='small')
    sm_fitter.fit()
    sm_fitter.generate_kernel()
    return sm_fitter.kernel
Example #19
    def fit_length(self):
        Y = self.pivoted_data.sum(axis=1)

        if self.kernel_type == 'small':
            # mirror the small-fragment distribution about its mode to
            # estimate the standard deviation in length
            data = Y.loc[0:self.len_mean]
            mirrored_data = data.sort_index(ascending=False)
            data = np.concatenate([data, mirrored_data])
            self.len_std = np.std(data)

        elif self.kernel_type == 'nucleosome':
            # mirror the nucleosome-fragment distribution above its mode
            data = Y.loc[self.len_mean:]
            mirrored_data = data.sort_index(ascending=False)
            data = np.concatenate([mirrored_data, data])
            self.len_std = np.std(data)

        print_fl("Kernel %s length mean %.2f and std %.2f" % 
              (self.kernel_type, self.len_mean, self.len_std), log=True)
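A small sketch of the mirroring step: only one half of the length distribution is trusted, so it is reflected about the mode before taking the standard deviation of the bin heights, exactly as in fit_length above (toy counts, hypothetical mode):

import numpy as np
import pandas as pd

# toy length histogram: counts indexed by fragment length, mode at 150
Y = pd.Series([2, 5, 9, 14, 20], index=[110, 120, 130, 140, 150])
len_mean = 150

data = Y.loc[0:len_mean]                     # half distribution up to the mode
mirrored = data.sort_index(ascending=False)  # reflect about the mode
full = np.concatenate([data, mirrored])      # symmetric array of bin heights
np.std(full)                                 # spread estimate used as len_std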
Example #20
    def fit(self, k=10):

        with TimingContext() as timing:
            for model in self.models:
                print_fl("Fitting %s" % model.name)
                model.fit_cv(log=False, k=k)
                print_fl("  " + timing.get_time())

        times = model.times

        df = pd.DataFrame(index=times)
        for model in self.models:
            df[model.name] = model.mse
        self.mse = df

        df = pd.DataFrame(index=times)
        for model in self.models:
            df[model.name] = model.r2
        self.r2 = df
Example #21
def call_all_nucleosome_p123(orfs, antisense,
    cross_correlation_dir, chrom_save_dir, timer):
  
    name = task_name(antisense)
    driver = TaskDriver(name, WATCH_TMP_DIR, 16, timer=timer)
    driver.print_driver()

    for chrom in range(1, 17):

        if not USE_SLURM:
            call_nucleosomes_p123_chrom(orfs, chrom, antisense, 
                cross_correlation_dir, chrom_save_dir, timer)
            child_done(name, WATCH_TMP_DIR, chrom)
        else:
            exports = ("CHROM=%d,ANTISENSE=%s,SLURM_WORKING_DIR=%s,CONDA_PATH=%s,CONDA_ENV=%s"
                       % (chrom, str(antisense), SLURM_WORKING_DIR, CONDA_PATH, CONDA_ENV))
            script = 'scripts/2_preprocessing/call_nucleosomes.sh'
            submit_sbatch(exports, script, WATCH_TMP_DIR)

    # wait for all chromosomes to finish
    # superfluous if not in SLURM mode
    driver.wait_for_tasks()

    print_fl()

    # merge
    nucleosomes = pd.DataFrame()
    p123 = pd.DataFrame()
    for chrom in range(1, 17):

        if not os.path.exists(nucleosomes_filename(chrom_save_dir, chrom)): 
            continue
        nuc_chr = pd.read_csv(nucleosomes_filename(chrom_save_dir, chrom))\
            .set_index('orf')
        p123_chr = pd.read_csv(p123_filename(chrom_save_dir, chrom))\
            .set_index('orf_name')

        nucleosomes = nucleosomes.append(nuc_chr)
        p123 = p123.append(p123_chr)

    return nucleosomes, p123
Example #22
    def __init__(self,
                 name,
                 times=[0, 7.5, 15, 30, 60, 120],
                 sample_N=None,
                 results_path=None):

        print_fl("Loading %s" % name)

        self.name = name
        self.sample_N = sample_N
        self.times = times

        if results_path is None:
            self.design_matrix()
        else:
            self.load_results(results_path)

        self.l_scale = 1
        self.l_bounds = 1, 10
Example #23
def main():

    print_fl("Loading models")
    gp_compare = RegressionCompare(reg_model=GP)
    gp_compare.fit(k=10)

    gp_compare.plot_compare(metric='r2')
    plt.savefig('output/gp/r2.pdf', transparent=True)

    gp_compare.plot_compare(metric='mse')
    plt.savefig('output/gp/mse.pdf', transparent=True)

    gp_compare.full_model.plot_fit()
    plt.savefig('output/gp/full.pdf', transparent=True)

    gp_compare.full_model.plot_fit(120)
    plt.savefig('output/gp/full_120.pdf', transparent=True)

    gp_compare.mse.T.to_csv('output/gp/model_mse.csv', float_format='%.4f')
    gp_compare.r2.T.to_csv('output/gp/model_r2.csv', float_format='%.4f')
Example #24
def main():

    print_fl("*******************************")
    print_fl("* 6    Reviewer Materials     *")
    print_fl("*******************************")

    print_preamble()

    mkdirs_safe([save_dir])

    plot_utils.apply_global_settings()

    # plots for shift edge analysis
    shift_edge_analysis.main()

    # additional scatter plots
    scatters()

    xrate_vs_TPM()

    # danpos
    danpos()

    # OD curve
    plot_OD_curve()
Example #25
def run_models(save_dir, timer):

    task_name = 'gp'

    # launch gp models
    print_fl("Loading models...", end='')
    models = get_model_funs()

    print_fl("Running %d models..." % len(models), end='')
    driver = TaskDriver(task_name,
                        WATCH_TMP_DIR,
                        len(models.keys()),
                        timer=timer)
    driver.print_driver()

    for name, model in models.items():
        if not USE_SLURM:
            run_model(name, save_dir)
            child_done(task_name, WATCH_TMP_DIR, name)
        else:
            exports = ("MODEL=%s,SLURM_WORKING_DIR=%s,CONDA_PATH=%s,CONDA_ENV=%s" % \
                      (name.replace(' ', '_'),
                       SLURM_WORKING_DIR, CONDA_PATH, CONDA_ENV))
            script = 'scripts/4_analysis/gp.sh'
            submit_sbatch(exports, script, WATCH_TMP_DIR)

    driver.wait_for_tasks()
    print_fl()
Example #26
def compute_cross_correlations(strand='sense'):

    from src.cross_correlation_kernel import MNaseSeqDensityKernel
    from src.cross_correlation import calculate_cross_correlation_all_chromosomes

    cc_orfs = paper_orfs
    cc_dir = cc_sense_chrom_dir
    cross_corr_path = cross_corr_sense_path
    if strand == 'antisense': 
        cc_orfs = antisense_orfs
        cc_dir = cc_antisense_chrom_dir
        cross_corr_path = cross_corr_antisense_path

    mkdirs_safe([cc_dir])

    nuc_kernel = MNaseSeqDensityKernel(filepath=nuc_kernel_path)
    sm_kernel = MNaseSeqDensityKernel(filepath=sm_kernel_path)
    triple_kernel = compute_triple_kernel(nuc_kernel)

    print_fl("Cross correlating %d ORFs..." % len(cc_orfs))
    
    cross, summary_cross = calculate_cross_correlation_all_chromosomes(
        all_mnase_data, cc_orfs, nuc_kernel, sm_kernel, triple_kernel,
        save_chrom_dir=cc_dir, timer=timer, log=True,
        find_antisense=(strand == 'antisense'))
    
    cross.to_hdf(cross_corr_path,
        'cross_correlation', mode='w', complevel=9, complib='zlib')
    summary_cross.to_csv('%s/cross_correlation_summary_%s.csv' % 
        (mnase_dir, strand))

    print_fl("Done.")
    timer.print_time()
    print_fl()
Example #27
def create_merged_metrics_df(write_dir, occupancy, cross):

    if cross:
        # merge cross correlation dataframes
        print_fl("Merging cross correlation...")
        cross_correlation = pd.DataFrame()
        for chrom in range(1, 17):
            path = "%s/orf_cross_correlation_chr%d.h5.z" % (write_dir, chrom)
            cur_df = pd.read_hdf(path, 'cross_correlation')
            cross_correlation = cross_correlation.append(cur_df)
        # write to disk
        cross_correlation.to_hdf('%s/orf_cross_correlation.h5.z' % write_dir,
                                 'cross_correlation',
                                 mode='w',
                                 complevel=9,
                                 complib='zlib')

        # merge cross correlation summary
        print_fl("Merging cross correlation summary...")
        orf_cross_correlation_summary = pd.DataFrame()
        for chrom in range(1, 17):
            path = "%s/orf_cross_correlation_summary_chr%d.csv" % (write_dir,
                                                                   chrom)
            cur_df = pd.read_csv(path)
            orf_cross_correlation_summary = orf_cross_correlation_summary.append(
                cur_df)
        # write to disk
        orf_cross_correlation_summary.to_csv(
            '%s/orf_cross_correlation_summary.csv' % write_dir, index=False)

    # merge coverage
    if occupancy:
        print_fl("Merging coverage...")
        coverage = pd.DataFrame()
        for chrom in range(1, 17):
            path = "%s/coverage_chr%d.csv" % (write_dir, chrom)
            cur_df = pd.read_csv(path)
            coverage = coverage.append(cur_df)
        # write to disk
        coverage.to_csv('%s/coverage.csv' % write_dir, index=False)

        # merge occupancy
        print_fl("Merging occupancy...")
        occupancy = pd.DataFrame()
        for chrom in range(1, 17):
            path = "%s/occupancy_chr%d.csv" % (write_dir, chrom)
            cur_df = pd.read_csv(path)
            occupancy = occupancy.append(cur_df)
        # write to disk
        occupancy.to_csv('%s/occupancy.csv' % write_dir, index=False)
Example #28
def get_cross_correlation(wide_counts_df,
                          kernel,
                          times=[0.0, 7.5, 15, 30, 60, 120]):
    """
    Assumes ndarray of (orf, time, length, position)
    """

    # calculate indices used to create the resulting dataframe
    kernel_span = kernel.extent[0], kernel.extent[1]
    positions = wide_counts_df.columns.values
    pos_span = positions.min(), positions.max()
    kernel_width_2 = (kernel_span[1] - kernel_span[0]) // 2
    result_span = pos_span[0] + kernel_width_2, pos_span[1] - kernel_width_2
    result_len = result_span[1] - result_span[0] + 1

    orf_idxs = wide_counts_df.index.levels[0]

    kern_mat = kernel.kernel_mat

    conv_df = create_orfs_time_df(orf_idxs,
                                  columns=np.arange(result_span[0],
                                                    result_span[1] + 1))

    for orf_name in orf_idxs:
        for time in times:

            try:
                cur_arr = wide_counts_df.loc[orf_name].loc[time].values
            except Exception as e:
                print_fl("Exception thrown for ORF %s.\n%s" %
                         (orf_name, str(e)))
                continue

            cur_conv_score = correlate2d(cur_arr, kern_mat, mode='valid')
            conv_df.loc[orf_name].loc[time] = cur_conv_score

    return conv_df.astype(float)
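For intuition on the index bookkeeping above: with mode='valid', correlate2d only keeps positions where the kernel fits entirely inside the input, so half the kernel width is trimmed from each side. A toy shape check (array sizes are illustrative, not the real kernel dimensions):

import numpy as np
from scipy.signal import correlate2d

counts = np.zeros((250, 1001))  # (fragment length, position)
kernel = np.zeros((250, 201))   # kernel spanning 201 positions
out = correlate2d(counts, kernel, mode='valid')
print(out.shape)  # (1, 801): 1001 - 201 + 1 positions, 100 trimmed per side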
Example #29
    def print_go(self):
        go_terms = {}

        for cluster in self.clusters:
            terms = list(self.clustered_go_sig[\
                self.clustered_go_sig['cluster'] == cluster]['name'].values)

            # remove high-level terms
            drop_items = {
                'molecular_function', 'cytoplasm', 'cellular_component',
                'biological_process', 'nucleolus', 'cytoplasmic vesicle'
            }
            terms = set(terms) - drop_items
            terms = [t[0].upper() + t[1:] for t in terms]
            go_value = ('Cluster %d\n' % (cluster)) + '\n'.join(list(terms))
            go_terms[str(cluster)] = go_value

            if len(terms) > 0:
                print_fl(go_value)
                print_fl("-------------------")

        self.go_terms = go_terms
Example #30
    def __init__(self,
                 mnase_path=None,
                 rna_seq_pileup_path=None,
                 orfs=None,
                 times=[0.0, 7.5, 15, 30, 60, 120]):

        self.orfs = orfs

        print_fl("Loading MNase-seq...")
        self.all_mnase_data = pd.read_hdf(mnase_path, 'mnase_data')
        self.CDS_introns = read_sgd_orf_introns()

        print_fl("Loading RNA-seq pileup...")
        pileup = pd.read_hdf(rna_seq_pileup_path, 'pileup')
        self.rna_seq_plotter = RNASeqPlotter(pileup)

        self.orfs_plotter = ORFAnnotationPlotter(orfs, self.CDS_introns)

        self.times = times
        self.span = None
        self.chrom = None

        self.set_config()