def parse_arguments():
    """Parse and validate command-line arguments for reproducibility QC.

    Positional ``peaks`` are peak files from pairs of true replicates and
    ``--peaks-pr`` are peak files from pseudo replicates. The number of
    ``--peaks-pr`` files must equal the value returned by
    ``infer_n_from_nC2`` (defined elsewhere) for the number of ``peaks``
    files.

    Side effects: sets the level of the module logger ``log`` to
    ``--log-level`` and logs ``sys.argv``.

    Returns:
        argparse.Namespace: the parsed arguments.

    Raises:
        argparse.ArgumentTypeError: if the number of peak files does not
            match the number of ``--peaks-pr`` files, or if true-replicate
            peak files are given without ``--peak-ppr``.
    """
    parser = argparse.ArgumentParser(
        prog='ENCODE DCC reproducibility QC.',
        description='IDR peak or overlap peak only.')
    # Adjacent string literals keep the help text free of stray backslashes
    # and indentation whitespace.
    parser.add_argument(
        'peaks', type=str, nargs='*',
        help=('List of peak files from true replicates in a sorted order. '
              'For example of 4 true replicates, '
              '0,1 0,2 0,3 1,2 1,3 2,3. '
              'x,y means peak file from rep-x vs rep-y.'))
    parser.add_argument(
        '--peaks-pr', type=str, nargs='+', required=True,
        help='List of peak files from pseudo replicates.')
    parser.add_argument(
        '--peak-ppr', type=str,
        help='Peak file from pooled pseudo replicate.')
    parser.add_argument(
        '--peak-type', type=str, default='narrowPeak',
        choices=['narrowPeak', 'regionPeak', 'broadPeak', 'gappedPeak'],
        help='Peak file type.')
    parser.add_argument(
        '--chrsz', type=str,
        help='2-col chromosome sizes file.')
    parser.add_argument(
        '--prefix', type=str,
        help='Basename prefix for reproducibility QC file.')
    parser.add_argument(
        '--out-dir', default='', type=str,
        help='Output directory.')
    parser.add_argument(
        '--log-level', default='INFO',
        choices=['NOTSET', 'DEBUG', 'INFO', 'WARNING', 'CRITICAL', 'ERROR'],
        help='Log level')
    args = parser.parse_args()

    if len(args.peaks_pr) != infer_n_from_nC2(len(args.peaks)):
        raise argparse.ArgumentTypeError(
            'Invalid number of peak files or --peaks-pr.')
    # main() reads --peak-ppr whenever true-replicate peaks are present, so
    # reject the inconsistent combination here with a clear message.
    if args.peaks and not args.peak_ppr:
        raise argparse.ArgumentTypeError(
            '--peak-ppr is required when peak files from true replicates '
            'are given.')

    log.setLevel(args.log_level)
    log.info(sys.argv)
    return args
def _peak_count_ratio(larger, smaller):
    """Return ``larger / smaller`` as a float, tolerating a zero denominator.

    When ``smaller`` is 0 (e.g. a peak file with no lines) a plain division
    would raise ZeroDivisionError. ``float('inf')`` is returned instead so
    the ratio exceeds any finite threshold and the QC is not reported as
    passing on that criterion.
    """
    if smaller:
        return float(larger) / float(smaller)
    return float('inf')


def main():
    """Run reproducibility QC and write optimal/conservative peak outputs.

    Steps, in order:
      1. Parse arguments and create the output directory.
      2. Count peaks in the pseudo-replicate, true-replicate and pooled
         pseudo-replicate peak files and derive the rescue ratio and the
         self-consistency ratio.
      3. Choose the conservative and optimal peak sets and classify
         reproducibility as 'pass', 'borderline' or 'fail'.
      4. Copy the chosen peak files into the output directory and convert
         them (bigbed only when ``--chrsz`` is given, then starch and
         hammock).
      5. Write a two-line tab-separated QC file (header + values).
      6. Call ``get_region_size_metrics`` and ``get_num_peaks`` on the
         optimal peak file.

    A ratio whose denominator is zero is written to the QC file as ``inf``.
    """
    # read params
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Reproducibility QC...')
    # description for variables
    # N: list of number of peaks in peak files from pseudo replicates
    # Nt: top number of peaks in peak files
    #     from true replicates (rep-x_vs_rep-y)
    # Np: number of peaks in peak files from pooled pseudo replicate
    N = [get_num_lines(peak) for peak in args.peaks_pr]
    if args.peaks:
        # multiple replicate case
        num_rep = infer_n_from_nC2(len(args.peaks))
        num_peaks_tr = [get_num_lines(peak) for peak in args.peaks]

        Nt = max(num_peaks_tr)
        Np = get_num_lines(args.peak_ppr)
        rescue_ratio = _peak_count_ratio(max(Np, Nt), min(Np, Nt))
        self_consistency_ratio = _peak_count_ratio(max(N), min(N))

        # The conservative set is the true-replicate pair with most peaks.
        Nt_idx = num_peaks_tr.index(Nt)
        conservative_set = infer_pair_label_from_idx(num_rep, Nt_idx)
        conservative_peak = args.peaks[Nt_idx]
        N_conservative = Nt
        # The optimal set is whichever of conservative / pooled
        # pseudo-replicate has more peaks (pooled wins ties).
        if Nt > Np:
            optimal_set = conservative_set
            optimal_peak = conservative_peak
            N_optimal = N_conservative
        else:
            optimal_set = "pooled-pr1_vs_pooled-pr2"
            optimal_peak = args.peak_ppr
            N_optimal = Np
    else:
        # single replicate case
        num_rep = 1

        Nt = 0
        Np = 0
        rescue_ratio = 0.0
        self_consistency_ratio = 1.0

        conservative_set = 'rep1-pr1_vs_rep1-pr2'
        conservative_peak = args.peaks_pr[0]
        N_conservative = N[0]
        optimal_set = conservative_set
        optimal_peak = conservative_peak
        N_optimal = N_conservative

    # Both ratios above 2 -> fail; exactly one above 2 -> borderline.
    rescue_high = rescue_ratio > 2.0
    self_consistency_high = self_consistency_ratio > 2.0
    if rescue_high and self_consistency_high:
        reproducibility = 'fail'
    elif rescue_high or self_consistency_high:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    log.info('Writing optimal/conservative peak files...')
    prefix = (args.prefix + '.') if args.prefix else ''
    optimal_peak_file = os.path.join(
        args.out_dir,
        '{}optimal_peak.{}.gz'.format(prefix, args.peak_type))
    conservative_peak_file = os.path.join(
        args.out_dir,
        '{}conservative_peak.{}.gz'.format(prefix, args.peak_type))
    copy_f_to_f(optimal_peak, optimal_peak_file)
    copy_f_to_f(conservative_peak, conservative_peak_file)

    if args.chrsz:
        log.info('Converting peak to bigbed...')
        peak_to_bigbed(optimal_peak_file, args.peak_type,
                       args.chrsz, args.out_dir)
        peak_to_bigbed(conservative_peak_file, args.peak_type,
                       args.chrsz, args.out_dir)

    log.info('Converting peak to starch...')
    peak_to_starch(optimal_peak_file, args.out_dir)
    peak_to_starch(conservative_peak_file, args.out_dir)

    log.info('Converting peak to hammock...')
    peak_to_hammock(optimal_peak_file, args.out_dir)
    peak_to_hammock(conservative_peak_file, args.out_dir)

    log.info('Writing reproducibility QC log...')
    reproducibility_qc = os.path.join(
        args.out_dir, prefix + 'reproducibility.qc')

    header_cols = (
        ['Nt']
        + ['N{}'.format(i + 1) for i in range(num_rep)]
        + ['Np', 'N_opt', 'N_consv', 'opt_set', 'consv_set',
           'rescue_ratio', 'self_consistency_ratio', 'reproducibility']
    )
    value_cols = (
        [Nt]
        + list(N)
        + [Np, N_optimal, N_conservative, optimal_set, conservative_set,
           rescue_ratio, self_consistency_ratio, reproducibility]
    )
    with open(reproducibility_qc, 'w') as fp:
        fp.write('\t'.join(header_cols) + '\n')
        fp.write('\t'.join(str(v) for v in value_cols) + '\n')

    log.info('Calculating (optimal) peak region size QC/plot...')
    # Return values are not used here; the call is kept as in the original.
    get_region_size_metrics(optimal_peak_file)

    log.info('Calculating number of peaks (optimal)...')
    get_num_peaks(optimal_peak_file)

    log.info('All done.')
def make_cat_replication(args, cat_root):
    """Build the 'replication' QC category and its sub-categories.

    Creates a 'replication' ``QCCategory`` under ``cat_root`` with three
    children: 'idr' (plots), 'reproducibility' (overlap/IDR QC logs) and
    'num_peaks' (per-replicate raw peak count logs). Only entries that are
    present and non-empty in ``args`` are registered.

    Args:
        args: namespace providing ``idr_plots``, ``idr_plots_pr``,
            ``idr_plot_ppr``, ``overlap_reproducibility_qc``,
            ``idr_reproducibility_qc``, ``peak_caller``, ``cap_num_peak``,
            ``num_peak_qcs`` and, for macs2, ``pval_thresh``.
        cat_root: parent category for the new 'replication' category.

    Returns:
        The newly created 'replication' category.
    """
    replication = QCCategory(
        'replication',
        html_head='<h1>Replication quality metrics</h1><hr>',
        parent=cat_root,
    )

    # ---- IDR plots ----
    idr = QCCategory(
        'idr',
        html_head='<h2>IDR (Irreproducible Discovery Rate) plots</h2>',
        parent=replication,
    )

    true_rep_plots = args.idr_plots
    if true_rep_plots:
        n_rep = infer_n_from_nC2(len(true_rep_plots))
        for idx, plot_file in enumerate(true_rep_plots):
            if not plot_file:
                continue
            idr.add_plot(
                plot_file, key=infer_pair_label_from_idx(n_rep, idx))

    pseudo_rep_plots = args.idr_plots_pr
    if pseudo_rep_plots:
        for rep_no, plot_file in enumerate(pseudo_rep_plots, start=1):
            if not plot_file:
                continue
            idr.add_plot(
                plot_file,
                key='rep{X}-pr1_vs_rep{X}-pr2'.format(X=rep_no))

    if args.idr_plot_ppr:
        idr.add_plot(args.idr_plot_ppr[0], key='pooled-pr1_vs_pooled-pr2')

    # ---- Reproducibility QC logs ----
    # Adjacent literals are joined at compile time into one HTML string.
    reproducibility_help = (
        " <div id='help-reproducibility'><p>Reproducibility QC<br> <ul> "
        "<li>N1: Replicate 1 self-consistent peaks (comparing two "
        "pseudoreplicates generated by subsampling Rep1 reads) </li> "
        "<li>N2: Replicate 2 self-consistent peaks (comparing two "
        "pseudoreplicates generated by subsampling Rep2 reads) </li> "
        "<li>Ni: Replicate i self-consistent peaks (comparing two "
        "pseudoreplicates generated by subsampling RepX reads) </li> "
        "<li>Nt: True Replicate consistent peaks (comparing true "
        "replicates Rep1 vs Rep2) </li> "
        "<li>Np: Pooled-pseudoreplicate consistent peaks (comparing two "
        "pseudoreplicates generated by subsampling pooled reads from "
        "Rep1 and Rep2) </li> "
        "<li>Self-consistency Ratio: max(N1,N2) / min (N1,N2) </li> "
        "<li>Rescue Ratio: max(Np,Nt) / min (Np,Nt) </li> "
        "<li>Reproducibility Test: If Self-consistency Ratio >2 AND "
        "Rescue Ratio > 2, then 'Fail' else 'Pass' </li> "
        "</ul></p></div><br> "
    )
    reproducibility = QCCategory(
        'reproducibility',
        html_head='<h2>Reproducibility QC and peak detection statistics</h2>',
        html_foot=reproducibility_help,
        parser=parse_reproducibility_qc,
        map_key_desc=MAP_KEY_DESC_REPRODUCIBILITY_QC,
        parent=replication,
    )
    if args.overlap_reproducibility_qc:
        reproducibility.add_log(
            args.overlap_reproducibility_qc[0], key='overlap')
    if args.idr_reproducibility_qc:
        reproducibility.add_log(
            args.idr_reproducibility_qc[0], key='idr')

    # ---- Raw peak counts ----
    # The footer mentions the threshold that applies to the peak caller.
    caller = args.peak_caller
    extra_info = ''
    if caller == 'spp':
        extra_info = 'with FDR 0.01'
    elif caller == 'macs2':
        extra_info = 'with p-val threshold {}'.format(args.pval_thresh)

    num_peak_foot = (
        ' Top {num_peak} raw peaks from {peak_caller} {extra_info} '
    ).format(
        num_peak=args.cap_num_peak,
        peak_caller=args.peak_caller,
        extra_info=extra_info,
    )
    num_peak = QCCategory(
        'num_peaks',
        html_head='<h2>Number of raw peaks</h2>',
        html_foot=num_peak_foot,
        parser=parse_num_peak_qc,
        map_key_desc=MAP_KEY_DESC_NUM_PEAK_QC,
        parent=replication,
    )
    if args.num_peak_qcs:
        for idx, qc_file in enumerate(args.num_peak_qcs):
            if not qc_file:
                continue
            num_peak.add_log(qc_file, key=str_rep(idx))

    return replication
def make_cat_peak_enrich(args, cat_root):
    """Build the 'peak_enrich' QC category, including annotation enrichment.

    Creates a 'peak_enrich' ``QCCategory`` under ``cat_root`` containing:
      * 'frac_reads_in_peaks' with three children (raw peaks named after
        ``args.peak_caller``, 'overlap' and 'idr'), each fed with FRiP QC
        logs found in ``args``;
      * 'frac_reads_in_annot', fed with ``args.annot_enrich_qcs``.
    Missing or empty entries in ``args`` are skipped.

    NOTE(review): another function with this same name appears later in
    this view; if both live in one module the later definition wins.

    Args:
        args: namespace providing ``peak_caller`` and the ``frip_*`` /
            ``annot_enrich_qcs`` QC file lists.
        cat_root: parent category for the new 'peak_enrich' category.

    Returns:
        The newly created 'peak_enrich' category.
    """
    pooled_pr_pair = 'pooled-pr1_vs_pooled-pr2'

    def add_each(category, qcs, key_of):
        # Register every non-empty entry of ``qcs`` under ``key_of(index)``.
        if not qcs:
            return
        for idx, qc_file in enumerate(qcs):
            if qc_file:
                category.add_log(qc_file, key=key_of(idx))

    def add_true_rep_pairs(category, qcs):
        # Keys are pair labels derived from the inferred replicate count.
        if not qcs:
            return
        n_rep = infer_n_from_nC2(len(qcs))
        add_each(category, qcs,
                 lambda idx: infer_pair_label_from_idx(n_rep, idx))

    def add_pseudo_rep_pairs(category, qcs):
        # Keys are 1-based: rep1-pr1_vs_rep1-pr2, rep2-pr1_vs_rep2-pr2, ...
        add_each(category, qcs,
                 lambda idx: 'rep{X}-pr1_vs_rep{X}-pr2'.format(X=idx + 1))

    def add_first(category, qcs, key):
        # Only the first element of ``qcs`` is used.
        if qcs:
            category.add_log(qcs[0], key=key)

    peak_enrich = QCCategory(
        'peak_enrich',
        html_head='<h1>Peak enrichment</h1><hr>',
        parent=cat_root,
    )

    # Adjacent literals are joined at compile time into one HTML template.
    frip_help = (
        " <div id='help-FRiP'> For {peak_caller} raw peaks:<br> <p><ul> "
        "<li>repX: Peak from true replicate X </li> "
        "<li>repX-prY: Peak from Yth pseudoreplicates from replicate X </li> "
        "<li>pooled: Peak from pooled true replicates "
        "(pool of rep1, rep2, ...) </li> "
        "<li>pooled-pr1: Peak from 1st pooled pseudo replicate "
        "(pool of rep1-pr1, rep2-pr1, ...)</li> "
        "<li>pooled-pr2: Peak from 2nd pooled pseudo replicate "
        "(pool of rep1-pr2, rep2-pr2, ...)</li> "
        "</ul></p> <br> For overlap/IDR peaks:<br> <p><ul> "
        "<li>repX_vs_repY: Comparing two peaks from true replicates "
        "X and Y </li> "
        "<li>repX-pr1_vs_repX-pr2: Comparing two peaks from both "
        "pseudoreplicates from replicate X </li> "
        "<li>pooled-pr1_vs_pooled-pr2: Comparing two peaks from 1st and "
        "2nd pooled pseudo replicates </li> "
        "</ul></p> </div> "
    ).format(peak_caller=args.peak_caller)
    frip = QCCategory(
        'frac_reads_in_peaks',
        html_head='<h2>Fraction of reads in peaks (FRiP)</h2>',
        html_foot=frip_help,
        parent=peak_enrich,
    )

    # raw peaks
    frip_raw = QCCategory(
        args.peak_caller,
        html_head='<h3>FRiP for {} raw peaks</h3>'.format(args.peak_caller),
        parser=parse_frip_qc,
        map_key_desc=MAP_KEY_DESC_FRIP_QC,
        parent=frip,
    )
    add_each(frip_raw, args.frip_qcs, str_rep)
    add_each(frip_raw, args.frip_qcs_pr1,
             lambda idx: str_rep(idx) + '-pr1')
    add_each(frip_raw, args.frip_qcs_pr2,
             lambda idx: str_rep(idx) + '-pr2')
    add_first(frip_raw, args.frip_qc_pooled, 'pooled')
    add_first(frip_raw, args.frip_qc_ppr1, 'pooled-pr1')
    add_first(frip_raw, args.frip_qc_ppr2, 'pooled-pr2')

    # overlap
    frip_overlap = QCCategory(
        'overlap',
        html_head='<h3>FRiP for overlap peaks</h3>',
        parser=parse_frip_qc,
        map_key_desc=MAP_KEY_DESC_FRIP_QC,
        parent=frip,
    )
    add_true_rep_pairs(frip_overlap, args.frip_overlap_qcs)
    add_pseudo_rep_pairs(frip_overlap, args.frip_overlap_qcs_pr)
    add_first(frip_overlap, args.frip_overlap_qc_ppr, pooled_pr_pair)

    # IDR
    frip_idr = QCCategory(
        'idr',
        html_head='<h3>FRiP for IDR peaks</h3>',
        parser=parse_frip_qc,
        map_key_desc=MAP_KEY_DESC_FRIP_QC,
        parent=frip,
    )
    add_true_rep_pairs(frip_idr, args.frip_idr_qcs)
    add_pseudo_rep_pairs(frip_idr, args.frip_idr_qcs_pr)
    add_first(frip_idr, args.frip_idr_qc_ppr, pooled_pr_pair)

    # annotated region enrichment
    annot_help = (
        " <p>Signal to noise can be assessed by considering whether "
        "reads are falling into known open regions (such as DHS regions) "
        "or not. A high fraction of reads should fall into the universal "
        "(across cell type) DHS set. A small fraction should fall into "
        "the blacklist regions. A high set (though not all) should fall "
        "into the promoter regions. A high set (though not all) should "
        "fall into the enhancer regions. \n"
        "The promoter regions should not take up all reads, as it is "
        "known that there is a bias for promoters in open chromatin "
        "assays.</p><br> "
    )
    annot_enrich = QCCategory(
        'frac_reads_in_annot',
        html_head='<h2>Annotated genomic region enrichment</h2>',
        html_foot=annot_help,
        parser=parse_annot_enrich_qc,
        map_key_desc=MAP_KEY_DESC_ANNOT_ENRICH_QC,
        parent=peak_enrich,
    )
    add_each(annot_enrich, args.annot_enrich_qcs, str_rep)

    return peak_enrich
def make_cat_peak_enrich(args, cat_root):
    """Build the 'peak_enrich' QC category with FRiP sub-categories.

    Creates a 'peak_enrich' ``QCCategory`` under ``cat_root`` containing a
    'frac_reads_in_peaks' category with three children (raw peaks named
    after ``args.peak_caller``, 'overlap' and 'idr'). Each child is fed
    with the FRiP QC logs found in ``args``; missing or empty entries are
    skipped.

    NOTE(review): another function with this same name appears earlier in
    this view; if both live in one module this later definition wins.

    Args:
        args: namespace providing ``peak_caller`` and the ``frip_*`` QC
            file lists.
        cat_root: parent category for the new 'peak_enrich' category.

    Returns:
        The newly created 'peak_enrich' category.
    """
    pooled_pr_pair = 'pooled-pr1_vs_pooled-pr2'

    def add_each(category, qcs, key_of):
        # Register every non-empty entry of ``qcs`` under ``key_of(index)``.
        if not qcs:
            return
        for idx, qc_file in enumerate(qcs):
            if qc_file:
                category.add_log(qc_file, key=key_of(idx))

    def add_true_rep_pairs(category, qcs):
        # Keys are pair labels derived from the inferred replicate count.
        if not qcs:
            return
        n_rep = infer_n_from_nC2(len(qcs))
        add_each(category, qcs,
                 lambda idx: infer_pair_label_from_idx(n_rep, idx))

    def add_pseudo_rep_pairs(category, qcs):
        # Keys are 1-based: rep1-pr1_vs_rep1-pr2, rep2-pr1_vs_rep2-pr2, ...
        add_each(category, qcs,
                 lambda idx: 'rep{X}-pr1_vs_rep{X}-pr2'.format(X=idx + 1))

    def add_first(category, qcs, key):
        # Only the first element of ``qcs`` is used.
        if qcs:
            category.add_log(qcs[0], key=key)

    peak_enrich = QCCategory(
        'peak_enrich',
        html_head='<h1>Peak enrichment</h1><hr>',
        parent=cat_root,
    )

    # Adjacent literals are joined at compile time into one HTML template.
    frip_help = (
        " <div id='help-FRiP'> For {peak_caller} raw peaks:<br> <p><ul> "
        "<li>repX: Peak from true replicate X </li> "
        "<li>repX-prY: Peak from Yth pseudoreplicates from replicate X </li> "
        "<li>pooled: Peak from pooled true replicates "
        "(pool of rep1, rep2, ...) </li> "
        "<li>pooled-pr1: Peak from 1st pooled pseudo replicate "
        "(pool of rep1-pr1, rep2-pr1, ...)</li> "
        "<li>pooled-pr2: Peak from 2nd pooled pseudo replicate "
        "(pool of rep1-pr2, rep2-pr2, ...)</li> "
        "</ul></p> <br> For overlap/IDR peaks:<br> <p><ul> "
        "<li>repX_vs_repY: Comparing two peaks from true replicates "
        "X and Y </li> "
        "<li>repX-pr1_vs_repX-pr2: Comparing two peaks from both "
        "pseudoreplicates from replicate X </li> "
        "<li>pooled-pr1_vs_pooled-pr2: Comparing two peaks from 1st and "
        "2nd pooled pseudo replicates </li> "
        "</ul></p> </div> "
    ).format(peak_caller=args.peak_caller)
    frip = QCCategory(
        'frac_reads_in_peaks',
        html_head='<h2>Fraction of reads in peaks (FRiP)</h2>',
        html_foot=frip_help,
        parent=peak_enrich,
    )

    # raw peaks
    frip_raw = QCCategory(
        args.peak_caller,
        html_head='<h3>FRiP for {} raw peaks</h3>'.format(args.peak_caller),
        parser=parse_frip_qc,
        map_key_desc=MAP_KEY_DESC_FRIP_QC,
        parent=frip,
    )
    add_each(frip_raw, args.frip_qcs, str_rep)
    add_each(frip_raw, args.frip_qcs_pr1,
             lambda idx: str_rep(idx) + '-pr1')
    add_each(frip_raw, args.frip_qcs_pr2,
             lambda idx: str_rep(idx) + '-pr2')
    add_first(frip_raw, args.frip_qc_pooled, 'pooled')
    add_first(frip_raw, args.frip_qc_ppr1, 'pooled-pr1')
    add_first(frip_raw, args.frip_qc_ppr2, 'pooled-pr2')

    # overlap
    frip_overlap = QCCategory(
        'overlap',
        html_head='<h3>FRiP for overlap peaks</h3>',
        parser=parse_frip_qc,
        map_key_desc=MAP_KEY_DESC_FRIP_QC,
        parent=frip,
    )
    add_true_rep_pairs(frip_overlap, args.frip_overlap_qcs)
    add_pseudo_rep_pairs(frip_overlap, args.frip_overlap_qcs_pr)
    add_first(frip_overlap, args.frip_overlap_qc_ppr, pooled_pr_pair)

    # IDR
    frip_idr = QCCategory(
        'idr',
        html_head='<h3>FRiP for IDR peaks</h3>',
        parser=parse_frip_qc,
        map_key_desc=MAP_KEY_DESC_FRIP_QC,
        parent=frip,
    )
    add_true_rep_pairs(frip_idr, args.frip_idr_qcs)
    add_pseudo_rep_pairs(frip_idr, args.frip_idr_qcs_pr)
    add_first(frip_idr, args.frip_idr_qc_ppr, pooled_pr_pair)

    return peak_enrich