def main():
    """Entry point for the naive-overlap task.

    Overlaps two replicate peak sets against the pooled peaks, blacklist-
    filters the result, converts it to browser formats (bigBed, starch,
    hammock) and, when a TAG-ALIGN file is supplied, computes FRiP.
    """
    # Read command-line parameters.
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Do naive overlap...')
    overlapped_peak = naive_overlap(
        args.prefix,
        args.peak1,
        args.peak2,
        args.peak_pooled,
        args.peak_type,
        args.nonamecheck,
        args.mem_gb,
        args.out_dir,
    )

    log.info('Blacklist-filtering peaks...')
    filtered_peak = blacklist_filter(
        overlapped_peak, args.blacklist,
        args.regex_bfilt_peak_chr_name, args.out_dir)

    log.info('Checking if output is empty...')
    assert_file_not_empty(filtered_peak)

    log.info('Converting peak to bigbed...')
    peak_to_bigbed(filtered_peak, args.peak_type, args.chrsz,
                   args.mem_gb, args.out_dir)

    log.info('Converting peak to starch...')
    peak_to_starch(filtered_peak, args.out_dir)

    log.info('Converting peak to hammock...')
    peak_to_hammock(filtered_peak, args.mem_gb, args.out_dir)

    # FRiP only when a TAG-ALIGN is given; a fragment length implies
    # ChIP-seq (reads are shifted), otherwise the ATAC-seq variant is used.
    if args.ta:
        if args.fraglen:
            log.info('Shifted FRiP with fragment length...')
            frip_shifted(args.ta, filtered_peak, args.chrsz,
                         args.fraglen, args.out_dir)
        else:
            log.info('FRiP without fragment length...')
            frip(args.ta, filtered_peak, args.out_dir)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def main():
    """Entry point for the IDR task.

    Runs IDR on two replicate peak sets against the pooled peaks, verifies
    the result is non-empty, blacklist-filters it, converts it to browser
    formats and, when a TAG-ALIGN file is supplied, computes FRiP.
    """
    # Read command-line parameters.
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Do IDR...')
    # Only the peak file is used downstream; plot/log outputs are written
    # to disk by idr() itself.
    idr_peak, _idr_plot, _idr_out_gz, _idr_stdout = idr(
        args.prefix,
        args.peak1,
        args.peak2,
        args.peak_pooled,
        args.peak_type,
        args.chrsz,
        args.idr_thresh,
        args.idr_rank,
        args.mem_gb,
        args.out_dir,
    )

    # Emptiness is checked before blacklist filtering so that the
    # IDR-specific help message fires on a too-stringent threshold.
    log.info('Checking if output is empty...')
    assert_file_not_empty(
        idr_peak,
        help='No IDR peaks found. IDR threshold might be too stringent '
             'or replicates have very poor concordance.')

    log.info('Blacklist-filtering peaks...')
    filtered_peak = blacklist_filter(
        idr_peak, args.blacklist,
        args.regex_bfilt_peak_chr_name, args.out_dir)

    log.info('Converting peak to bigbed...')
    peak_to_bigbed(filtered_peak, args.peak_type, args.chrsz,
                   args.mem_gb, args.out_dir)

    log.info('Converting peak to starch...')
    peak_to_starch(filtered_peak, args.out_dir)

    log.info('Converting peak to hammock...')
    peak_to_hammock(filtered_peak, args.mem_gb, args.out_dir)

    # FRiP only when a TAG-ALIGN is given; a fragment length implies
    # ChIP-seq (reads are shifted), otherwise the ATAC-seq variant is used.
    if args.ta:
        if args.fraglen:
            log.info('Shifted FRiP with fragment length...')
            frip_shifted(args.ta, filtered_peak, args.chrsz,
                         args.fraglen, args.out_dir)
        else:
            log.info('FRiP without fragment length...')
            frip(args.ta, filtered_peak, args.out_dir)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def main():
    """Entry point for the peak blacklist-filter / QC task.

    Blacklist-filters a single peak file, converts it to browser formats,
    then computes shifted FRiP and region-size / peak-count QC outputs
    (all written to args.out_dir).
    """
    # Read command-line parameters.
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Blacklist-filtering peaks...')
    filtered_peak = blacklist_filter(
        args.peak, args.blacklist,
        args.regex_bfilt_peak_chr_name, args.out_dir)

    log.info('Checking if output is empty...')
    assert_file_not_empty(filtered_peak)

    log.info('Converting peak to bigbed...')
    peak_to_bigbed(filtered_peak, args.peak_type, args.chrsz,
                   args.mem_gb, args.out_dir)

    log.info('Converting peak to starch...')
    peak_to_starch(filtered_peak, args.out_dir)

    log.info('Converting peak to hammock...')
    peak_to_hammock(filtered_peak, args.mem_gb, args.out_dir)

    log.info('Shifted FRiP with fragment length...')
    frip_shifted(args.ta, filtered_peak, args.chrsz,
                 args.fraglen, args.out_dir)

    log.info('Calculating (blacklist-filtered) peak region size QC/plot...')
    # Returns a (QC file, plot file) pair; both are written to disk.
    qc_tsv, qc_plot = get_region_size_metrics(filtered_peak)

    log.info('Calculating number of peaks (blacklist-filtered)...')
    get_num_peaks(filtered_peak)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def test_starch_and_unstarch(tmp_path):
    """Round-trip integration test of peak_to_starch + starch_to_bed_gz.

    A starch archive embeds a creation timestamp, so its md5 is not stable
    across runs. Instead the archive is unstarched back into a bed.gz
    (gzipped with -n, i.e. no timestamp) and that file's md5 is compared
    against the known-good hash.

    Functions exercised together:
    - peak_to_starch(): starch's timestamp inclusion cannot be disabled.
    - starch_to_bed_gz(): gzips with -n (timestamp excluded).
    """
    src_peak = tmp_path / 'idr_peak.gz'
    src_peak.write_text(IDR_PEAK_FILE_CONTENTS)

    starch_file = peak_to_starch(src_peak, tmp_path)
    bed_gz_file = starch_to_bed_gz(starch_file, tmp_path)

    with open(bed_gz_file, 'rb') as fh:
        digest = hashlib.md5(fh.read()).hexdigest()
    assert digest == MD5_HASH_IDR_PEAK_UNSTARCHED
def main():
    """Entry point for the reproducibility QC task.

    Computes ENCODE reproducibility metrics over true-replicate and
    pseudo-replicate overlap peak sets, picks the optimal/conservative
    peak sets, writes them out, and emits a tab-separated QC log.

    Variable glossary (pipeline convention):
      N  : peak counts of self-pseudo-replicate sets, one per replicate
      Nt : best (max) peak count among true-replicate pair sets
      Np : peak count of the pooled-pseudo-replicate set
    """
    # Read command-line parameters.
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Reproducibility QC...')
    N = [get_num_lines(p) for p in args.peaks_pr]

    if len(args.peaks):
        # Multiple-replicate case: args.peaks holds one peak file per
        # replicate pair (nC2 of them), from which n is recovered.
        num_rep = infer_n_from_nC2(len(args.peaks))
        num_peaks_tr = [get_num_lines(p) for p in args.peaks]
        Nt = max(num_peaks_tr)
        Np = get_num_lines(args.peak_ppr)

        # ENCODE reproducibility ratios.
        rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
        self_consistency_ratio = float(max(N)) / float(min(N))

        # Conservative set: the true-replicate pair with the most peaks.
        best_idx = num_peaks_tr.index(Nt)
        conservative_set = infer_pair_label_from_idx(num_rep, best_idx)
        conservative_peak = args.peaks[best_idx]
        N_conservative = Nt

        # Optimal set: the conservative pair, unless the pooled
        # pseudo-replicates found at least as many peaks.
        if Nt > Np:
            optimal_set = conservative_set
            optimal_peak = conservative_peak
            N_optimal = N_conservative
        else:
            optimal_set = "pooled-pr1_vs_pooled-pr2"
            optimal_peak = args.peak_ppr
            N_optimal = Np
    else:
        # Single-replicate case: only rep1's self-pseudo-replicates exist,
        # so both ratios degenerate and the same set is both optimal and
        # conservative.
        num_rep = 1
        Nt = 0
        Np = 0
        rescue_ratio = 0.0
        self_consistency_ratio = 1.0
        conservative_set = 'rep1-pr1_vs_rep1-pr2'
        conservative_peak = args.peaks_pr[0]
        N_conservative = N[0]
        optimal_set = conservative_set
        optimal_peak = conservative_peak
        N_optimal = N_conservative

    # pass / borderline / fail, per the two 2.0-threshold criteria:
    # one ratio above 2.0 is borderline; both above 2.0 is a failure.
    reproducibility = 'pass'
    if rescue_ratio > 2.0 or self_consistency_ratio > 2.0:
        reproducibility = 'borderline'
    if rescue_ratio > 2.0 and self_consistency_ratio > 2.0:
        reproducibility = 'fail'

    log.info('Writing optimal/conservative peak files...')
    basename_prefix = (args.prefix + '.') if args.prefix else ''
    optimal_peak_file = os.path.join(
        args.out_dir,
        '{}optimal_peak.{}.gz'.format(basename_prefix, args.peak_type))
    conservative_peak_file = os.path.join(
        args.out_dir,
        '{}conservative_peak.{}.gz'.format(basename_prefix, args.peak_type))
    copy_f_to_f(optimal_peak, optimal_peak_file)
    copy_f_to_f(conservative_peak, conservative_peak_file)

    # Browser-format conversions run only when a chromosome sizes file is
    # given (bigBed requires it).
    # NOTE(review): unlike the sibling tasks in this file, these converters
    # are called here without a mem_gb argument — confirm against their
    # signatures.
    if args.chrsz:
        log.info('Converting peak to bigbed...')
        peak_to_bigbed(optimal_peak_file, args.peak_type, args.chrsz,
                       args.out_dir)
        peak_to_bigbed(conservative_peak_file, args.peak_type, args.chrsz,
                       args.out_dir)

        log.info('Converting peak to starch...')
        peak_to_starch(optimal_peak_file, args.out_dir)
        peak_to_starch(conservative_peak_file, args.out_dir)

        log.info('Converting peak to hammock...')
        peak_to_hammock(optimal_peak_file, args.out_dir)
        peak_to_hammock(conservative_peak_file, args.out_dir)

    log.info('Writing reproducibility QC log...')
    if args.prefix:
        reproducibility_qc = '{}.reproducibility.qc'.format(args.prefix)
    else:
        reproducibility_qc = 'reproducibility.qc'
    reproducibility_qc = os.path.join(args.out_dir, reproducibility_qc)

    with open(reproducibility_qc, 'w') as fp:
        # One header row and one value row; the N1..Nn columns expand with
        # the number of replicates.
        header = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            'Nt',
            '\t'.join(['N{}'.format(i + 1) for i in range(num_rep)]),
            'Np', 'N_opt', 'N_consv', 'opt_set', 'consv_set',
            'rescue_ratio', 'self_consistency_ratio', 'reproducibility',
        )
        line = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            Nt,
            '\t'.join([str(i) for i in N]),
            Np, N_optimal, N_conservative, optimal_set, conservative_set,
            rescue_ratio, self_consistency_ratio, reproducibility)
        fp.write(header)
        fp.write(line)

    log.info('Calculating (optimal) peak region size QC/plot...')
    # Returns a (QC file, plot file) pair; both are written to disk.
    qc_tsv, qc_plot = get_region_size_metrics(optimal_peak_file)

    log.info('Calculating number of peaks (optimal)...')
    get_num_peaks(optimal_peak_file)

    log.info('All done.')