def main():
    (_, chrom, antisense) = tuple(sys.argv)
    antisense = antisense.lower() == 'true'
    chrom = int(chrom)

    print_fl("Running nucleosome calling on chromosome %d, antisense: %s" %
             (chrom, str(antisense)))

    name = task_name(antisense)
    timer = Timer()

    p123_orfs = paper_orfs
    save_chrom_dir = sense_nuc_chrom_dir
    cc_dir = cc_sense_chrom_dir

    if antisense:
        p123_orfs = antisense_orfs
        save_chrom_dir = anti_nuc_chrom_dir
        cc_dir = cc_antisense_chrom_dir

    call_nucleosomes_p123_chrom(p123_orfs, chrom, antisense, cc_dir,
                                save_chrom_dir, timer)
    child_done(name, WATCH_TMP_DIR, chrom)
def child_done(name, parent_watch_dir, child_name):
    # write child done file
    watch_dir = parent_watch_dir + '/' + name
    write_path = ('%s/child_%s.watch' % (watch_dir, str(child_name)))
    print_fl(write_path)
    # text mode: the file contains the plain string 'Done'
    with open(write_path, 'w') as f:
        f.write('Done')
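# Hedged usage sketch (not part of the pipeline): the task name 'nucleosomes'
# and chromosome 4 are hypothetical example values; WATCH_TMP_DIR comes from
# config as elsewhere in this module.
def _demo_child_done():
    # a finished worker drops a '.watch' file that TaskDriver.wait_for_tasks()
    # below counts toward its num_wait total
    child_done('nucleosomes', WATCH_TMP_DIR, 4)
    # -> writes '<WATCH_TMP_DIR>/nucleosomes/child_4.watch' containing 'Done'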
def wait_for_tasks(self):
    """
    Wait for the number of child processes to finish by monitoring
    the watch directory
    """
    while True:
        # count # of child processes done
        files = os.listdir(self.watch_dir)
        num_done = sum([f.endswith('.watch') for f in files])
        print_fl("%d/%d finished. " % (num_done, self.num_wait), end='')
        if self.timer is not None:
            print_fl(" %s " % self.timer.get_time(), end='')

        # number of children done
        if num_done == self.num_wait:
            print_fl("All children done.")
            break
        # sleep until all children are done
        else:
            print_fl("Sleeping %ds" % (self.sleep_time))
            time.sleep(self.sleep_time)

    # clean up watch directory
    print_fl("Cleaning up watch directory.")
    self.cleanup()
def select_data(self, head=500):
    act_prm = self.store.promoter_sm_occupancy_delta
    act_dorg = self.store.gene_body_disorganization_delta

    mean_prm = act_prm.mean(axis=1).sort_values(ascending=False)
    mean_dorg = act_dorg.mean(axis=1).sort_values(ascending=False)

    prm_orfs = mean_prm.head(head).index
    dorg_orfs = mean_dorg.head(head).index

    self.data = self.store.chromatin_data.loc[set(prm_orfs).union(
        set(dorg_orfs))]

    def _inverse_quantile(array, val):
        """What is the quantile of the value in the array. (CDF)"""
        return np.mean(array <= val)

    prom_quantile = _inverse_quantile(mean_prm.values, mean_prm.values[head])
    dorg_quantile = _inverse_quantile(mean_dorg.values, mean_dorg.values[head])

    print_fl("Promoter ORFs: %d (%.1f%%)\n"
             "Disorganization ORFs: %d (%.1f%%)" %
             (len(prm_orfs), prom_quantile*100.,
              len(dorg_orfs), dorg_quantile*100.))
    print_fl("%d genes total" % len(self.data))
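# Worked example of the empirical-CDF helper defined above; the values are
# illustrative.
def _demo_inverse_quantile():
    import numpy as np
    # the fraction of entries <= 3 in [1, 2, 3, 4] is 0.75
    assert np.mean(np.array([1, 2, 3, 4]) <= 3) == 0.75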
def compute_boundaries_chrom(orfs, pileup, chrom, save_dir, find_antisense,
                             log=False, timer=None):
    chrom_genes = orfs[orfs.chr == chrom]

    transcript_boundaries = chrom_genes[[]].copy()
    transcript_boundaries['start'] = None
    transcript_boundaries['stop'] = None

    search_window = 2000
    search_2 = search_window // 2

    if log and timer is not None:
        print_fl("Chromosome %d - %s. %d genes" %
                 (chrom, timer.get_time(), len(chrom_genes)))

    # restrict the pileup to this chromosome
    chrom_rna_seq = pileup[pileup.chr == chrom]

    i = 0
    for orf_name, gene in chrom_genes.iterrows():
        if log and timer is not None and i % 100 == 0:
            print_fl("%d/%d - %s" % (i, len(chrom_genes), timer.get_time()))
        i += 1

        span = (gene.transcript_start - search_2,
                gene.transcript_stop + search_2)
        gene_pileup = filter_rna_seq_pileup(chrom_rna_seq, span[0], span[1],
                                            gene.chr)

        try:
            start, stop = find_transcript_boundaries(
                gene_pileup, span, gene, find_antisense=find_antisense)
        except ValueError:
            # skip if issues finding boundaries
            continue

        transcript_boundaries.loc[orf_name, 'start'] = start
        transcript_boundaries.loc[orf_name, 'stop'] = stop

        TSS, TES = start, stop
        if ((gene.strand == '-' and not find_antisense) or
            (gene.strand == '+' and find_antisense)):
            TSS, TES = stop, start

        strand = gene.strand
        if find_antisense:
            strand = flip_strand(gene.strand)

        transcript_boundaries.loc[orf_name, 'TSS'] = TSS
        transcript_boundaries.loc[orf_name, 'TES'] = TES
        transcript_boundaries.loc[orf_name, 'strand'] = strand
        transcript_boundaries.loc[orf_name, 'chr'] = gene.chr

    transcript_boundaries = transcript_boundaries.dropna()
    transcript_boundaries.to_csv(boundary_file_name(save_dir, chrom,
                                                    find_antisense))

    return transcript_boundaries
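# flip_strand is imported from elsewhere in this package; a minimal sketch of
# its assumed behavior (swap the strand annotation when reporting antisense
# boundaries):
#
#   def flip_strand(strand):
#       return '-' if strand == '+' else '+'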
def fit(self):
    np.random.seed(123)
    print_fl("Fitting positional distribution...")
    self.fit_position()
    print_fl("\nFitting length distribution...")
    self.fit_length()
def plot_example_cross(plotter, save_dir):
    from src.chromatin import filter_mnase
    from src.plot_utils import apply_global_settings
    from config import cross_corr_sense_path
    from src.utils import get_orf

    orf_cc = pd.read_hdf(cross_corr_sense_path, 'cross_correlation')
    orfs = plotter.orfs

    gene = get_orf('APJ1', orfs)
    span = (gene.TSS - 500, gene.TSS + 500)
    plotter.set_span_chrom(span, gene.chr)

    cc_nuc = orf_cc.loc['nucleosomal'].loc[gene.name].loc[0.0]
    cc_small = orf_cc.loc['small'].loc[gene.name].loc[0.0]

    data = filter_mnase(plotter.all_mnase_data, span[0], span[1],
                        chrom=gene.chr, time=0)

    fig, (ax, leg_ax) = plt.subplots(2, 1, figsize=(5, 6))
    fig.tight_layout(rect=[0.1, 0.1, 0.92, 0.945])
    plt.subplots_adjust(hspace=0.0, wspace=0.5)

    plotter.plot_typhoon_time(ax, data, 0, scale_z=True)
    ax.set_xlim(*span)
    ax.set_xticks(np.arange(span[0], span[1], 500))
    ax.set_xticks(np.arange(span[0], span[1], 100), minor=True)
    ax.set_xlabel("Position (bp)", fontsize=16)
    ax.set_ylabel("Fragment length (bp)", fontsize=16, labelpad=10)
    ax.set_ylim(-100, 250)

    draw_legend(leg_ax, span, 500)

    cc_ax = ax.twinx()
    cc_ax.set_ylabel("Cross correlation $\\times$0.1", rotation=270,
                     fontsize=16, labelpad=10, va='bottom')

    scale_cc = 1
    y_origin = 0

    # nucleosomal cross correlation above the origin
    x = cc_nuc.index + gene.TSS
    y = cc_nuc.values * scale_cc + y_origin
    cc_ax.fill_between(x, y, y_origin, color='#28a098')

    # small-fragment cross correlation mirrored below the origin
    y = -cc_small.values * scale_cc + y_origin
    cc_ax.fill_between(x, y_origin, y, color='#f28500')

    cc_ax.set_ylim(-0.1, 0.4)
    cc_ax.set_yticklabels(np.arange(-1, 5))

    write_path = '%s/%s.pdf' % (save_dir, 'example_cross_correlation')
    plt.savefig(write_path, transparent=True)
    print_fl("Wrote %s" % write_path)
def danpos():
    from src.dpos_bed import create_bed_for_dpos
    import os
    from src.utils import run_cmd

    working_dir = os.getcwd()
    danpos_output = '%s/danpos/' % (OUTPUT_DIR)
    mkdirs_safe([danpos_output])
    danpos_path = "%s/danpos-2.2.2/danpos.py" % working_dir

    # create DANPOS BED file from the 0 min MNase-seq data
    mnase = pd.read_hdf(mnase_seq_path, 'mnase_data')
    mnase = mnase[mnase.time == 0]

    save_file = 'mnase_0.bed'
    save_path = '%s/%s' % (danpos_output, save_file)
    create_bed_for_dpos(mnase, save_path)
    print_fl("Wrote %s" % save_path)

    bash_command = "scripts/6_reviewer_mats/run_danpos.sh %s %s %s" % \
        (save_file, OUTPUT_DIR, danpos_path)
    output, error = run_cmd(bash_command, stdout_file=None)

    danpos_calls_path = '%s/result/pooled/mnase_0.smooth.positions.xls' % \
        (danpos_output)
    danpos_positions = pd.read_csv(danpos_calls_path, sep='\t')

    plt.hist(danpos_positions[danpos_positions.smt_value < 10000].smt_value,
             bins=100)
    plt.savefig("%s/danpos_smt_pos.png" % danpos_output)

    danpos_positions = danpos_positions.sort_values('smt_value',
                                                    ascending=False)
    top_danpos = danpos_positions.head(2500)
    top_danpos = top_danpos.rename(columns={
        'chr': 'chromosome',
        'smt_pos': 'position'
    })

    from src.chromatin import collect_mnase
    from src.kernel_fitter import compute_nuc_kernel

    nuc_kernel = compute_nuc_kernel(mnase, top_danpos)
    nuc_kernel.save_kernel("%s/danpos_kernel.json" % danpos_output)

    from src.kernel_fitter import compute_triple_kernel

    nuc_kernel.plot_kernel(kernel_type='nucleosome')
    plt.savefig('%s/danpos_nuc_kernel.pdf' % (save_dir), transparent=True)

    triple_kernel = compute_triple_kernel(nuc_kernel)
    triple_kernel.plot_kernel(kernel_type='triple')
    plt.savefig('%s/danpos_triple_kernel.pdf' % (save_dir), transparent=True)
def save_data(self):
    save_dir = '%s/tf_analysis' % OUTPUT_DIR
    mkdirs_safe([save_dir])
    print_fl("Saving %s" % save_dir)
    self.all_peaks.to_csv('%s/all_peaks.csv' % save_dir)
    self.linked_peaks_normalized.to_csv('%s/linked_peaks_norm.csv' % save_dir)
    self.linked_peaks.to_csv('%s/linked_peaks.csv' % save_dir)
    self.prom_peaks.to_csv('%s/prom_peaks.csv' % save_dir)
    self.all_motifs.to_csv('%s/all_motifs.csv' % save_dir)
def calculate_promoter_regions():
    from config import OUTPUT_DIR

    all_orfs = all_orfs_TSS_PAS()
    gene_boundaries = load_park_orf_transcript_boundaries()
    gene_boundaries = gene_boundaries.join(all_orfs[['orf_class', 'name']])
    gene_boundaries = gene_boundaries[gene_boundaries.orf_class != 'Dubious']

    # find promoter regions using gene boundaries
    gene_boundaries = gene_boundaries.sort_values(['chr', 'transcript_start'])
    gene_boundaries['promoter_start'] = -1
    gene_boundaries['promoter_stop'] = -1

    default_prom_size = 1000

    for chrom in range(1, 17):
        chrom_genes = gene_boundaries[gene_boundaries.chr == chrom]

        for idx, gene in chrom_genes.iterrows():
            if gene.strand == '+':
                # default promoter boundary
                prom_start = gene.transcript_start - default_prom_size

                upstream_genes = chrom_genes[
                    (chrom_genes.transcript_stop > prom_start) &
                    (chrom_genes.transcript_stop < gene.transcript_start)]

                # how close is the closest upstream gene
                if len(upstream_genes) > 0:
                    upstream_genes = upstream_genes.sort_values(
                        'transcript_stop', ascending=False)
                    prom_start = upstream_genes.reset_index().loc[0].transcript_stop

                # promoter defined by closest upstream stop and transcript start
                gene_boundaries.loc[idx, 'promoter_start'] = prom_start
                gene_boundaries.loc[idx, 'promoter_stop'] = gene.transcript_start

            elif gene.strand == '-':
                # default promoter boundary
                prom_start = gene.transcript_stop + default_prom_size

                upstream_genes = chrom_genes[
                    (chrom_genes.transcript_start < prom_start) &
                    (chrom_genes.transcript_start > gene.transcript_stop)]

                # how close is the closest upstream gene
                if len(upstream_genes) > 0:
                    upstream_genes = upstream_genes.sort_values('transcript_start')
                    prom_start = upstream_genes.reset_index().loc[0].transcript_start

                # promoter defined by closest upstream start and transcript stop
                gene_boundaries.loc[idx, 'promoter_stop'] = prom_start
                gene_boundaries.loc[idx, 'promoter_start'] = gene.transcript_stop

    save_path = '%s/calculated_promoters.csv' % OUTPUT_DIR
    gene_boundaries.to_csv(save_path)
    print_fl("Saved to %s" % save_path)

    return gene_boundaries
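# Self-contained illustration of the promoter rule above, with hypothetical
# coordinates (not part of the pipeline): a '+' strand gene starting at
# 10,000 defaults to a 1 kb promoter, truncated by the closest upstream
# transcript stop if one falls inside that window.
def _demo_promoter_rule():
    transcript_start = 10000
    default_prom_size = 1000
    upstream_stops = [8200, 9600]  # hypothetical neighboring transcript stops

    prom_start = transcript_start - default_prom_size  # 9000
    inside = [s for s in upstream_stops if prom_start < s < transcript_start]
    if inside:
        prom_start = max(inside)  # closest upstream stop: 9600

    assert (prom_start, transcript_start) == (9600, 10000)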
def calculate_cross_correlation_all_chromosomes(
        mnase_seq, TSSs, nuc_kernel, sm_kernel, triple_kernel,
        log=True, save_chrom_dir=None, timer=None, find_antisense=False):

    name = task_name(find_antisense)
    driver = TaskDriver(name, WATCH_TMP_DIR, 16, timer=timer)
    driver.print_driver()

    # for all chromosomes calculate occupancies per orf
    for chrom in range(1, 17):
        if not USE_SLURM:
            calculate_cross_correlation_chr(mnase_seq, TSSs, chrom,
                                            find_antisense, nuc_kernel,
                                            sm_kernel, triple_kernel,
                                            save_chrom_dir, log, timer)
            child_done(name, WATCH_TMP_DIR, chrom)
        else:
            exports = ("CHROM=%d,ANTISENSE=%s,SLURM_WORKING_DIR=%s,"
                       "CONDA_PATH=%s,CONDA_ENV=%s" %
                       (chrom, str(find_antisense), SLURM_WORKING_DIR,
                        CONDA_PATH, CONDA_ENV))
            script = 'scripts/2_preprocessing/cross_correlation.sh'
            submit_sbatch(exports, script, WATCH_TMP_DIR)

    # wait for all chromosomes to finish
    # superfluous if not in SLURM mode
    driver.wait_for_tasks()
    print_fl()

    # merge per-chromosome results
    summary_cross = pd.DataFrame()
    cross = pd.DataFrame()
    for chrom in range(1, 17):
        chrom_cross = pd.read_hdf(cross_filename(save_chrom_dir, chrom),
                                  'cross_correlation')
        if len(chrom_cross) == 0:
            continue
        chrom_summary = pd.read_csv(summary_filename(save_chrom_dir, chrom))\
            .set_index('orf_name')
        cross = cross.append(chrom_cross)
        summary_cross = summary_cross.append(chrom_summary)

    summary_cross = np.round(summary_cross, 5)
    cross = np.round(cross, 5)

    return cross, summary_cross
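# Illustration (hypothetical paths and values) of the comma-separated exports
# string handed to submit_sbatch() above; the sbatch worker script is expected
# to read these environment variables and run the per-chromosome entry point.
def _demo_sbatch_exports():
    chrom, find_antisense = 4, False
    return ("CHROM=%d,ANTISENSE=%s,SLURM_WORKING_DIR=%s,CONDA_PATH=%s,"
            "CONDA_ENV=%s" % (chrom, str(find_antisense), '/path/to/workdir',
                              '/path/to/conda', 'chromatin'))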
def write_gene_plots(self, genes, cc_dir, lines_dir, show_plot=True,
                     custom_lims={}, suffix='', large_font=False):
    for gene_name in genes:
        # create heatmaps of the cross correlation for each gene
        write_path = "%s/%s%s.pdf" % (cc_dir, gene_name, suffix)

        try:
            self.set_gene(gene_name)
        except KeyError:
            print_fl("Could not plot %s" % gene_name)
            continue

        fig = self.plot_cross_correlation_heatmap(
            show_colorbar=True,
            title=r'$\it{' + gene_name + '}$ cross correlation',
            large_font=large_font)
        plt.savefig(write_path, transparent=False)

        # close plots
        if not show_plot:
            plt.close(fig)
            plt.cla()
            plt.clf()

        if gene_name in custom_lims.keys():
            lims = custom_lims[gene_name]
        else:
            lims = (None, None, None)

        # plot line plots of the time course
        write_path = "%s/%s%s.pdf" % (lines_dir, gene_name, suffix)
        fig = self.plot_lines(self.gene.name,
                              title=r'$\it{' + gene_name + '}$ time course',
                              lims=lims, large_font=large_font)
        plt.savefig(write_path, transparent=False)

        # close
        if not show_plot:
            plt.close(fig)
            plt.cla()
            plt.clf()
def subset_genes(self, head=None, tail=None):
    if head is not None:
        subset_func = pd.Series.head
        N = head
    elif tail is not None:
        subset_func = pd.Series.tail
        N = tail
    else:
        raise ValueError("Either head or tail must be specified")

    self.N = N

    data = self.agg_fun(self.gene_body_disorganization_delta, axis=1)
    self.cur_disorg_orfs = subset_func(data.sort_values(ascending=False),
                                       N).index.values

    data = self.agg_fun(self.promoter_sm_occupancy_delta, axis=1)
    self.cur_promoter_orfs = subset_func(data.sort_values(ascending=False),
                                         N).index.values

    data = self.agg_fun(self.chromatin_data, axis=1)
    self.cur_chromatin_orfs = subset_func(data.sort_values(ascending=False),
                                          N).index.values

    data = self.agg_fun(self.xrate_data, axis=1)
    self.cur_xrate_orfs = subset_func(data.sort_values(ascending=False),
                                      N).index.values

    # TODO: report the quantile of the selection
    print_fl("Disorganization ORFs: %d" % len(self.cur_disorg_orfs))
    print_fl("Promoter ORFs: %d" % len(self.cur_promoter_orfs))
    print_fl("Chromatin ORFs: %d" % len(self.cur_chromatin_orfs))
    print_fl("Transcription ORFs: %d" % len(self.cur_xrate_orfs))
def sra_download_convert_bam(write_dir, sra_id, filename):
    prefetch = "%s/prefetch" % SRA_BIN_DIR
    sam_dump = "%s/sam-dump" % SRA_BIN_DIR

    # prefetch SRA ID
    print_fl("Prefetching %s" % sra_id)
    sra_write_path = "%s/%s.sra" % (write_dir, filename)
    run_cmd("%s %s --output-file %s" % (prefetch, sra_id, sra_write_path))

    # dump to SAM
    sam_write_path = "%s/%s.sam" % (write_dir, filename)
    print_fl("Dumping SAM %s" % sam_write_path)
    run_cmd("%s %s" % (sam_dump, sra_write_path), stdout_file=sam_write_path)

    # convert to BAM
    bam_write_path = "%s/%s.bam" % (write_dir, filename)
    print_fl("Converting to BAM %s" % bam_write_path)
    run_cmd("samtools view -b -S %s" % (sam_write_path),
            stdout_file=bam_write_path)

    # index BAM
    bam_index_path = "%s/%s.bam.bai" % (write_dir, filename)
    print_fl("Indexing BAM %s" % bam_index_path)
    run_cmd("samtools index %s %s" % (bam_write_path, bam_index_path))

    # remove large SAM file
    os.remove(sam_write_path)
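# Usage sketch; the accession, directory, and filename are hypothetical
# placeholders, not data used by this pipeline.
def _demo_sra_download():
    # fetch one SRA run, convert it to an indexed BAM, drop the intermediate SAM
    sra_download_convert_bam('/tmp/bams', 'SRR0000000', 'sample_0min')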
def call_nucleosomes_p123_chrom(orfs, chrom, antisense,
                                cross_correlation_dir, save_chrom_dir, timer):
    chrom_orfs = orfs[orfs.chr == chrom]

    if len(chrom_orfs) == 0:
        return None

    print_fl("Chromosome %d. %d genes" % (chrom, len(chrom_orfs)))
    timer.print_time()

    p123_orfs = chrom_orfs[[]].copy()
    p123_orfs['+1'] = np.nan
    p123_orfs['+2'] = np.nan
    p123_orfs['+3'] = np.nan

    linkages = pd.DataFrame()

    # load relevant cross correlations
    chrom_cross_correlation = pd.read_hdf(
        '%s/cross_correlation_chr%d.h5.z' % (cross_correlation_dir, chrom))

    i = 0
    for idx, orf in chrom_orfs.iterrows():
        if i % 200 == 0:
            print_fl(" %d/%d - %s" % (i, len(chrom_orfs), timer.get_time()))
        i += 1

        # call nucleosomes and link them in ORF window
        try:
            nucs = call_orf_nucleosomes(chrom_cross_correlation.loc['diff'],
                                        orf)
            cur_linkages = find_linkages(nucs)
            linkages = linkages.append(cur_linkages)
        except KeyError:
            continue

        # +1, +2, +3
        p1, p2, p3 = find_p123_gene(orf, linkages)
        p123_orfs.loc[idx] = [p1, p2, p3]

    save_path = nucleosomes_filename(save_chrom_dir, chrom)
    linkages.to_csv(save_path)

    p123_orfs = p123_orfs.dropna()
    save_path = p123_filename(save_chrom_dir, chrom)
    p123_orfs.to_csv(save_path)
def fit_position(self):
    """Fit positional distribution to a normal with uniform background"""
    Y = self.pivoted_data.sum(axis=0)

    # fixed parameters: mean of normal and uniform range
    mean_position = 0

    # same estimate for both kernel types
    self.pos_std = np.std(Y)
    self.pos_mean = mean_position

    print_fl("Kernel positional mean %.2f and std %.2f" %
             (self.pos_mean, self.pos_std))
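# Hedged sketch (not called by the pipeline) of the positional model named in
# fit_position()'s docstring: a normal bump over a uniform background. The
# mixture weight, window half-width, and std are hypothetical illustration
# values, not fitted ones.
def _demo_position_density():
    import numpy as np
    w, half_width, std = 0.8, 100, 20.0
    x = np.arange(-half_width, half_width + 1)
    normal = np.exp(-0.5 * (x / std) ** 2) / (std * np.sqrt(2 * np.pi))
    uniform = np.ones_like(x, dtype=float) / len(x)
    return x, w * normal + (1 - w) * uniform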
def compute_nuc_kernel(all_mnase_data, brogaard, top=2500):
    # get MNase-seq data @ 0 min for the top nucleosomes
    mnase_seq_0 = filter_mnase(all_mnase_data, time=0.0)
    top_brogaard = brogaard.head(top)

    brogaard_mnase = collect_mnase(mnase_seq_0, window=200,
                                   pos_chr_df=top_brogaard)

    nuc_length_mode = int(brogaard_mnase.length.mode())
    print_fl("Nucleosome length mode: %d" % nuc_length_mode)

    nuc_fitter = KernelFitter(brogaard_mnase, len_mean=nuc_length_mode,
                              window=200, kernel_type='nucleosome')
    nuc_fitter.fit()
    nuc_fitter.generate_kernel()

    return nuc_fitter.kernel
def compute_sm_kernel(all_mnase_data, abf1_sites):
    mnase_seq_0 = filter_mnase(all_mnase_data, time=0.0)

    abf1_mnase = collect_mnase(mnase_seq_0, window=150,
                               pos_chr_df=abf1_sites, chrom_key='chr',
                               pos_key='mid', strand='strand')

    abf1_length_mode = int(abf1_mnase.length.mode())
    print_fl("Abf1 fragment length mode: %d" % abf1_length_mode)

    sm_fitter = KernelFitter(abf1_mnase, len_mean=abf1_length_mode,
                             window=150, kernel_type='small')
    sm_fitter.fit()
    sm_fitter.generate_kernel()

    return sm_fitter.kernel
def fit_length(self):
    Y = self.pivoted_data.sum(axis=1)

    if self.kernel_type == 'small':
        # mirror the small-fragment length counts around the mode (using the
        # 0-to-mode half) to estimate a standard deviation in length
        data = self.pivoted_data.sum(axis=1).loc[0:self.len_mean]
        mirrored_data = data.sort_index(ascending=False)
        data = np.concatenate([data, mirrored_data])
        self.len_std = np.std(data)
    elif self.kernel_type == 'nucleosome':
        # mirror the mode-and-above half of the nucleosome length counts
        data = self.pivoted_data.sum(axis=1).loc[self.len_mean:]
        mirrored_data = data.sort_index(ascending=False)
        data = np.concatenate([mirrored_data, data])
        self.len_std = np.std(data)

    print_fl("Kernel %s length mean %.2f and std %.2f" %
             (self.kernel_type, self.len_mean, self.len_std), log=True)
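# Self-contained illustration (hypothetical counts) of the mirroring trick in
# fit_length(): reflect one half of the per-length count series around the
# mode and take np.std of the concatenated counts, exactly as above.
def _demo_mirrored_std():
    import numpy as np
    import pandas as pd
    half = pd.Series([1.0, 3.0, 6.0], index=[144, 145, 146])  # counts up to a mode at 146
    mirrored = half.sort_index(ascending=False)
    full = np.concatenate([half, mirrored])  # [1, 3, 6, 6, 3, 1]
    return np.std(full)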
def fit(self, k=10):
    with TimingContext() as timing:
        for model in self.models:
            print_fl("Fitting %s" % model.name)
            model.fit_cv(log=False, k=k)
            print_fl(" " + timing.get_time())

    # all models share the same time points; use the last fitted model's
    times = model.times

    df = pd.DataFrame(index=times)
    for model in self.models:
        df[model.name] = model.mse
    self.mse = df

    df = pd.DataFrame(index=times)
    for model in self.models:
        df[model.name] = model.r2
    self.r2 = df
def call_all_nucleosome_p123(orfs, antisense, cross_correlation_dir,
                             chrom_save_dir, timer):
    linkages = pd.DataFrame()

    name = task_name(antisense)
    driver = TaskDriver(name, WATCH_TMP_DIR, 16, timer=timer)
    driver.print_driver()

    for chrom in range(1, 17):
        if not USE_SLURM:
            call_nucleosomes_p123_chrom(orfs, chrom, antisense,
                                        cross_correlation_dir,
                                        chrom_save_dir, timer)
            child_done(name, WATCH_TMP_DIR, chrom)
        else:
            exports = ("CHROM=%d,ANTISENSE=%s,SLURM_WORKING_DIR=%s,"
                       "CONDA_PATH=%s,CONDA_ENV=%s" %
                       (chrom, str(antisense), SLURM_WORKING_DIR,
                        CONDA_PATH, CONDA_ENV))
            script = 'scripts/2_preprocessing/call_nucleosomes.sh'
            submit_sbatch(exports, script, WATCH_TMP_DIR)

    # wait for all chromosomes to finish
    # superfluous if not in SLURM mode
    driver.wait_for_tasks()
    print_fl()

    # merge per-chromosome results
    nucleosomes = pd.DataFrame()
    p123 = pd.DataFrame()
    for chrom in range(1, 17):
        if not os.path.exists(nucleosomes_filename(chrom_save_dir, chrom)):
            continue
        nuc_chr = pd.read_csv(nucleosomes_filename(chrom_save_dir, chrom))\
            .set_index('orf')
        p123_chr = pd.read_csv(p123_filename(chrom_save_dir, chrom))\
            .set_index('orf_name')
        nucleosomes = nucleosomes.append(nuc_chr)
        p123 = p123.append(p123_chr)

    return nucleosomes, p123
def __init__(self, name, times=[0, 7.5, 15, 30, 60, 120], sample_N=None,
             results_path=None):
    print_fl("Loading %s" % name)
    self.name = name
    self.sample_N = sample_N
    self.times = times

    if results_path is None:
        self.design_matrix()
    else:
        self.load_results(results_path)

    self.l_scale = 1
    self.l_bounds = 1, 10
def main(): print_fl("Loading models") gp_compare = RegressionCompare(reg_model=GP) gp_compare.fit(k=10) gp_compare.plot_compare(metric='r2') plt.savefig('output/gp/r2.pdf', transparent=True) gp_compare.plot_compare(metric='mse') plt.savefig('output/gp/mse.pdf', transparent=True) gp_compare.full_model.plot_fit() plt.savefig('output/gp/full.pdf', transparent=True) gp_compare.full_model.plot_fit(120) plt.savefig('output/gp/full_120.pdf', transparent=True) gp_compare.mse.T.to_csv('output/gp/model_mse.csv', float_format='%.4f') gp_compare.r2.T.to_csv('output/gp/model_r2.csv', float_format='%.4f')
def main(): print_fl("*******************************") print_fl("* 6 Reviewer Materials *") print_fl("*******************************") print_preamble() mkdirs_safe([save_dir]) plot_utils.apply_global_settings() # plots for shift edge analysis shift_edge_analysis.main() # additional scatter plots scatters() xrate_vs_TPM() # danpos danpos() # OD curve plot_OD_curve()
def run_models(save_dir, timer):
    task_name = 'gp'

    # launch GP models
    print_fl("Loading models...", end='')
    models = get_model_funs()
    print_fl("Running %d models..." % len(models), end='')

    driver = TaskDriver(task_name, WATCH_TMP_DIR, len(models.keys()),
                        timer=timer)
    driver.print_driver()

    for name, model in models.items():
        if not USE_SLURM:
            run_model(name, save_dir)
            child_done(task_name, WATCH_TMP_DIR, name)
        else:
            exports = ("MODEL=%s,SLURM_WORKING_DIR=%s,CONDA_PATH=%s,"
                       "CONDA_ENV=%s" %
                       (name.replace(' ', '_'), SLURM_WORKING_DIR,
                        CONDA_PATH, CONDA_ENV))
            script = 'scripts/4_analysis/gp.sh'
            submit_sbatch(exports, script, WATCH_TMP_DIR)

    driver.wait_for_tasks()
    print_fl()
def compute_cross_correlations(strand='sense'):
    from src.cross_correlation_kernel import MNaseSeqDensityKernel
    from src.cross_correlation import calculate_cross_correlation_all_chromosomes

    cc_orfs = paper_orfs
    cc_dir = cc_sense_chrom_dir
    cross_corr_path = cross_corr_sense_path

    if strand == 'antisense':
        cc_orfs = antisense_orfs
        cc_dir = cc_antisense_chrom_dir
        cross_corr_path = cross_corr_antisense_path

    mkdirs_safe([cc_dir])

    nuc_kernel = MNaseSeqDensityKernel(filepath=nuc_kernel_path)
    sm_kernel = MNaseSeqDensityKernel(filepath=sm_kernel_path)
    triple_kernel = compute_triple_kernel(nuc_kernel)

    print_fl("Cross correlating %d ORFs..." % len(cc_orfs))

    cross, summary_cross = calculate_cross_correlation_all_chromosomes(
        all_mnase_data, cc_orfs, nuc_kernel, sm_kernel, triple_kernel,
        save_chrom_dir=cc_dir, timer=timer, log=True,
        find_antisense=(strand == 'antisense'))

    cross.to_hdf(cross_corr_path, 'cross_correlation', mode='w',
                 complevel=9, complib='zlib')
    summary_cross.to_csv('%s/cross_correlation_summary_%s.csv' %
                         (mnase_dir, strand))

    print_fl("Done.")
    timer.print_time()
    print_fl()
def create_merged_metrics_df(write_dir, occupancy, cross):
    if cross:
        # merge cross correlation dataframes
        print_fl("Merging cross correlation...")
        cross_correlation = pd.DataFrame()
        for chrom in range(1, 17):
            path = "%s/orf_cross_correlation_chr%d.h5.z" % (write_dir, chrom)
            cur_df = pd.read_hdf(path, 'cross_correlation')
            cross_correlation = cross_correlation.append(cur_df)

        # write to disk
        cross_correlation.to_hdf('%s/orf_cross_correlation.h5.z' % write_dir,
                                 'cross_correlation', mode='w', complevel=9,
                                 complib='zlib')

        # merge cross correlation summary
        print_fl("Merging cross correlation summary...")
        orf_cross_correlation_summary = pd.DataFrame()
        for chrom in range(1, 17):
            path = "%s/orf_cross_correlation_summary_chr%d.csv" % (write_dir,
                                                                   chrom)
            cur_df = pd.read_csv(path)
            orf_cross_correlation_summary = \
                orf_cross_correlation_summary.append(cur_df)

        # write to disk
        orf_cross_correlation_summary.to_csv(
            '%s/orf_cross_correlation_summary.csv' % write_dir, index=False)

    if occupancy:
        # merge coverage
        print_fl("Merging coverage...")
        coverage = pd.DataFrame()
        for chrom in range(1, 17):
            path = "%s/coverage_chr%d.csv" % (write_dir, chrom)
            cur_df = pd.read_csv(path)
            coverage = coverage.append(cur_df)

        # write to disk
        coverage.to_csv('%s/coverage.csv' % write_dir, index=False)

        # merge occupancy
        print_fl("Merging occupancy...")
        occupancy = pd.DataFrame()
        for chrom in range(1, 17):
            path = "%s/occupancy_chr%d.csv" % (write_dir, chrom)
            cur_df = pd.read_csv(path)
            occupancy = occupancy.append(cur_df)

        # write to disk
        occupancy.to_csv('%s/occupancy.csv' % write_dir, index=False)
def get_cross_correlation(wide_counts_df, kernel,
                          times=[0.0, 7.5, 15, 30, 60, 120]):
    """
    Assumes ndarray of (orf, time, length, position)
    """

    # calculate indices to create the resulting dataframe
    kernel_span = kernel.extent[0], kernel.extent[1]
    positions = wide_counts_df.columns.values
    pos_span = positions.min(), positions.max()
    kernel_width_2 = (kernel_span[1] - kernel_span[0]) / 2

    # 'valid' convolution trims half the kernel width from each end
    result_span = (pos_span[0] + kernel_width_2,
                   pos_span[1] - kernel_width_2)

    orf_idxs = wide_counts_df.index.levels[0]
    kern_mat = kernel.kernel_mat

    conv_df = create_orfs_time_df(
        orf_idxs, columns=np.arange(result_span[0], result_span[1] + 1))

    for orf_name in orf_idxs:
        for time in times:
            try:
                cur_arr = wide_counts_df.loc[orf_name].loc[time].values
            except Exception as e:
                print_fl("Exception thrown for ORF %s.\n%s" %
                         (orf_name, str(e)))
                continue
            cur_conv_score = correlate2d(cur_arr, kern_mat, mode='valid')
            conv_df.loc[orf_name].loc[time] = cur_conv_score

    return conv_df.astype(float)
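# Numeric illustration (hypothetical sizes) of the 'valid'-mode result span
# computed above: positions spanning -500..500 correlated with a kernel
# spanning -100..100 leave scores defined on -400..400.
def _demo_valid_span():
    pos_span = (-500, 500)
    kernel_span = (-100, 100)
    kernel_width_2 = (kernel_span[1] - kernel_span[0]) // 2
    return (pos_span[0] + kernel_width_2,
            pos_span[1] - kernel_width_2)  # (-400, 400)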
def print_go(self):
    go_terms = {}
    for cluster in self.clusters:
        terms = list(self.clustered_go_sig[
            self.clustered_go_sig['cluster'] == cluster]['name'].values)

        # remove high-level terms
        drop_items = {
            'molecular_function',
            'cytoplasm',
            'cellular_component',
            'biological_process',
            'nucleolus',
            'cytoplasmic vesicle'
        }
        terms = set(terms) - drop_items
        terms = [t[0].upper() + t[1:] for t in terms]

        go_value = ('Cluster %d\n' % (cluster)) + '\n'.join(list(terms))
        go_terms[str(cluster)] = go_value

        if len(terms) > 0:
            print_fl(go_value)
            print_fl("-------------------")

    self.go_terms = go_terms
def __init__(self, mnase_path=None, rna_seq_pileup_path=None, orfs=None,
             times=[0.0, 7.5, 15, 30, 60, 120]):
    self.orfs = orfs

    print_fl("Loading MNase-seq...")
    self.all_mnase_data = pd.read_hdf(mnase_path, 'mnase_data')
    self.CDS_introns = read_sgd_orf_introns()

    print_fl("Loading RNA-seq pileup...")
    pileup = pd.read_hdf(rna_seq_pileup_path, 'pileup')
    self.rna_seq_plotter = RNASeqPlotter(pileup)
    self.orfs_plotter = ORFAnnotationPlotter(orfs, self.CDS_introns)

    self.times = times
    self.span = None
    self.chrom = None

    self.set_config()