def parse_arguments():
    """Parse and validate the command line of the reproducibility QC script.

    Positional ``peaks`` are the peak files from true-replicate pairs,
    ``--peaks-pr`` the peak files from pseudo replicates and ``--peak-ppr``
    the peak file from the pooled pseudo replicate.  After validation the
    module logger is set to ``--log-level`` and ``sys.argv`` is logged.

    Returns:
        argparse.Namespace: the parsed arguments.

    Raises:
        argparse.ArgumentTypeError: if the number of ``--peaks-pr`` files
            does not equal the replicate count inferred from the number of
            ``peaks`` files, or if ``peaks`` are given without
            ``--peak-ppr``.
    """
    parser = argparse.ArgumentParser(
        prog='ENCODE DCC reproducibility QC.',
        description='IDR peak or overlap peak only.')
    parser.add_argument('peaks',
                        type=str,
                        nargs='*',
                        help='List of peak files \
                        from true replicates in a sorted order. \
                        For example of 4 true replicates, \
                         0,1 0,2 0,3 1,2 1,3 2,3. \
                         x,y means peak file from rep-x vs rep-y.')
    parser.add_argument('--peaks-pr',
                        type=str,
                        nargs='+',
                        required=True,
                        help='List of peak files from pseudo replicates.')
    parser.add_argument('--peak-ppr',
                        type=str,
                        help='Peak file from pooled pseudo replicate.')
    parser.add_argument(
        '--peak-type',
        type=str,
        default='narrowPeak',
        choices=['narrowPeak', 'regionPeak', 'broadPeak', 'gappedPeak'],
        help='Peak file type.')
    parser.add_argument('--chrsz',
                        type=str,
                        help='2-col chromosome sizes file.')
    parser.add_argument('--prefix',
                        type=str,
                        help='Basename prefix for reproducibility QC file.')
    parser.add_argument('--out-dir',
                        default='',
                        type=str,
                        help='Output directory.')
    # Each level is listed once ('CRITICAL' used to appear twice).
    parser.add_argument('--log-level',
                        default='INFO',
                        choices=[
                            'NOTSET', 'DEBUG', 'INFO', 'WARNING', 'CRITICAL',
                            'ERROR'
                        ],
                        help='Log level')
    args = parser.parse_args()

    # One pseudo-replicate peak file is expected per replicate; the replicate
    # count is inferred from the number of true-replicate pair files.
    if len(args.peaks_pr) != infer_n_from_nC2(len(args.peaks)):
        raise argparse.ArgumentTypeError(
            'Invalid number of peak files or --peaks-pr.')

    # With true-replicate peaks present, main() always counts the pooled
    # pseudo-replicate file, so report its absence here with a clear message
    # rather than failing later on a missing path.
    if args.peaks and not args.peak_ppr:
        raise argparse.ArgumentTypeError(
            '--peak-ppr is required when peak files from true replicates '
            'are given.')

    log.setLevel(args.log_level)
    log.info(sys.argv)
    return args
def main():
    """Run the reproducibility QC and write the optimal/conservative peaks.

    Steps, in order:
      1. Parse arguments and create the output directory.
      2. Count peaks in the pseudo-replicate, true-replicate and pooled
         pseudo-replicate peak files; derive the rescue ratio and the
         self-consistency ratio.
      3. Choose the conservative and optimal peak files and copy them into
         the output directory.
      4. If a chromosome sizes file was given, convert both copies to
         bigbed, starch and hammock.
      5. Write a two-line, tab-separated reproducibility QC file.
      6. Compute region size metrics and the number of peaks for the optimal
         peak file.

    NOTE(review): in the multi-replicate case both ratios divide by a
    minimum peak count, so a count of zero raises ZeroDivisionError.
    """
    # read params
    args = parse_arguments()
    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Reproducibility QC...')
    # Naming used below:
    #   N  - peak counts of the pseudo-replicate peak files, one per file
    #   Nt - largest peak count among true-replicate files (rep-x_vs_rep-y)
    #   Np - peak count of the pooled pseudo-replicate peak file
    N = [get_num_lines(pr_peak) for pr_peak in args.peaks_pr]
    if len(args.peaks) == 0:
        # Single replicate: there is nothing to compare against, so the
        # first pseudo-replicate file serves as both conservative and
        # optimal set and the ratios take fixed values.
        num_rep = 1

        Nt = 0
        Np = 0
        rescue_ratio = 0.0
        self_consistency_ratio = 1.0

        conservative_set = 'rep1-pr1_vs_rep1-pr2'
        conservative_peak = args.peaks_pr[0]
        N_conservative = N[0]

        optimal_set, optimal_peak, N_optimal = (
            conservative_set, conservative_peak, N_conservative)
    else:
        # Multiple replicates.
        num_rep = infer_n_from_nC2(len(args.peaks))
        true_rep_counts = [get_num_lines(tr_peak) for tr_peak in args.peaks]

        Nt = max(true_rep_counts)
        Np = get_num_lines(args.peak_ppr)
        rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
        self_consistency_ratio = float(max(N)) / float(min(N))

        # Conservative set: the true-replicate pair with the most peaks
        # (first one on ties, because list.index returns the first match).
        best_idx = true_rep_counts.index(Nt)
        conservative_set = infer_pair_label_from_idx(num_rep, best_idx)
        conservative_peak = args.peaks[best_idx]
        N_conservative = Nt

        # Optimal set: the conservative set unless the pooled
        # pseudo-replicate file has at least as many peaks.
        optimal_set, optimal_peak, N_optimal = (
            conservative_set, conservative_peak, N_conservative)
        if not Nt > Np:
            optimal_set = "pooled-pr1_vs_pooled-pr2"
            optimal_peak = args.peak_ppr
            N_optimal = Np

    # Both ratios above 2.0 -> fail, exactly one -> borderline, none -> pass.
    rescue_high = rescue_ratio > 2.0
    self_consistency_high = self_consistency_ratio > 2.0
    if rescue_high and self_consistency_high:
        reproducibility = 'fail'
    elif rescue_high or self_consistency_high:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    log.info('Writing optimal/conservative peak files...')
    # Optional "<prefix>." shared by every output basename.
    basename_prefix = (args.prefix + '.') if args.prefix else ''
    optimal_peak_file = os.path.join(
        args.out_dir,
        '{}optimal_peak.{}.gz'.format(basename_prefix, args.peak_type))
    conservative_peak_file = os.path.join(
        args.out_dir,
        '{}conservative_peak.{}.gz'.format(basename_prefix, args.peak_type))
    copy_f_to_f(optimal_peak, optimal_peak_file)
    copy_f_to_f(conservative_peak, conservative_peak_file)

    if args.chrsz:
        # Optimal first, then conservative, for each output format.
        copied_peaks = (optimal_peak_file, conservative_peak_file)

        log.info('Converting peak to bigbed...')
        for peak_file in copied_peaks:
            peak_to_bigbed(peak_file, args.peak_type, args.chrsz,
                           args.out_dir)

        log.info('Converting peak to starch...')
        for peak_file in copied_peaks:
            peak_to_starch(peak_file, args.out_dir)

        log.info('Converting peak to hammock...')
        for peak_file in copied_peaks:
            peak_to_hammock(peak_file, args.out_dir)

    log.info('Writing reproducibility QC log...')
    reproducibility_qc = os.path.join(
        args.out_dir, '{}reproducibility.qc'.format(basename_prefix))

    with open(reproducibility_qc, 'w') as fp:
        # The per-replicate columns (N1..Nk) are pre-joined with tabs so they
        # occupy a single slot of the column lists.
        header_cols = [
            'Nt',
            '\t'.join(['N{}'.format(rep + 1) for rep in range(num_rep)]),
            'Np',
            'N_opt',
            'N_consv',
            'opt_set',
            'consv_set',
            'rescue_ratio',
            'self_consistency_ratio',
            'reproducibility',
        ]
        value_cols = [
            Nt,
            '\t'.join([str(count) for count in N]),
            Np,
            N_optimal,
            N_conservative,
            optimal_set,
            conservative_set,
            rescue_ratio,
            self_consistency_ratio,
            reproducibility,
        ]
        header = '\t'.join(header_cols) + '\n'
        line = '\t'.join('{}'.format(value) for value in value_cols) + '\n'
        fp.write(header)
        fp.write(line)

    log.info('Calculating (optimal) peak region size QC/plot...')
    # The returned pair is not used further in this function.
    region_size_qc, region_size_plot = get_region_size_metrics(
        optimal_peak_file)

    log.info('Calculating number of peaks (optimal)...')
    get_num_peaks(optimal_peak_file)

    log.info('All done.')
def make_cat_replication(args, cat_root):
    """Build the 'replication' QC category and its three sub-categories.

    Under ``cat_root`` a 'replication' category is created with the
    children 'idr' (IDR plots), 'reproducibility' (overlap/IDR
    reproducibility QC logs) and 'num_peaks' (raw peak count logs).

    Args:
        args: parsed arguments.  Attributes read here: idr_plots,
            idr_plots_pr, idr_plot_ppr, overlap_reproducibility_qc,
            idr_reproducibility_qc, peak_caller, pval_thresh (only when
            peak_caller is 'macs2'), cap_num_peak and num_peak_qcs.
        cat_root: category the new 'replication' category is attached to.

    Returns:
        The 'replication' QCCategory.
    """
    replication = QCCategory(
        'replication',
        html_head='<h1>Replication quality metrics</h1><hr>',
        parent=cat_root
    )

    # IDR plots: true-replicate pairs, per-replicate pseudo replicates, then
    # the pooled pseudo-replicate plot.
    idr = QCCategory(
        'idr',
        html_head='<h2>IDR (Irreproducible Discovery Rate) plots</h2>',
        parent=replication,
    )
    true_rep_plots = args.idr_plots
    if true_rep_plots:
        num_rep = infer_n_from_nC2(len(true_rep_plots))
        for pair_idx, plot in enumerate(true_rep_plots):
            if not plot:
                continue
            idr.add_plot(
                plot, key=infer_pair_label_from_idx(num_rep, pair_idx))
    pseudo_rep_plots = args.idr_plots_pr
    if pseudo_rep_plots:
        # Replicates are numbered from 1 in the keys.
        for rep_no, plot in enumerate(pseudo_rep_plots, 1):
            if not plot:
                continue
            idr.add_plot(
                plot, key='rep{X}-pr1_vs_rep{X}-pr2'.format(X=rep_no))
    pooled_plot = args.idr_plot_ppr
    if pooled_plot:
        idr.add_plot(pooled_plot[0], key='pooled-pr1_vs_pooled-pr2')

    # Reproducibility QC logs.
    reproducibility = QCCategory(
        'reproducibility',
        html_head='<h2>Reproducibility QC and peak detection statistics</h2>',
        html_foot="""
            <div id='help-reproducibility'><p>Reproducibility QC<br>
            <ul>
            <li>N1: Replicate 1 self-consistent peaks (comparing two pseudoreplicates generated by subsampling Rep1 reads) </li>
            <li>N2: Replicate 2 self-consistent peaks (comparing two pseudoreplicates generated by subsampling Rep2 reads) </li>
            <li>Ni: Replicate i self-consistent peaks (comparing two pseudoreplicates generated by subsampling RepX reads) </li>
            <li>Nt: True Replicate consistent peaks (comparing true replicates Rep1 vs Rep2) </li>
            <li>Np: Pooled-pseudoreplicate consistent peaks (comparing two pseudoreplicates generated by subsampling pooled reads from Rep1 and Rep2) </li>
            <li>Self-consistency Ratio: max(N1,N2) / min (N1,N2) </li>
            <li>Rescue Ratio: max(Np,Nt) / min (Np,Nt) </li>
            <li>Reproducibility Test: If Self-consistency Ratio >2 AND Rescue Ratio > 2, then 'Fail' else 'Pass' </li>
            </ul></p></div><br>
        """,
        parser=parse_reproducibility_qc,
        map_key_desc=MAP_KEY_DESC_REPRODUCIBILITY_QC,
        parent=replication,
    )
    # Only the first file of each list is registered, overlap before idr.
    for attr_name, log_key in (('overlap_reproducibility_qc', 'overlap'),
                               ('idr_reproducibility_qc', 'idr')):
        qc_files = getattr(args, attr_name)
        if qc_files:
            reproducibility.add_log(qc_files[0], key=log_key)

    # Threshold description appended to the raw peak footer; empty for peak
    # callers other than spp and macs2.
    extra_info = (
        'with FDR 0.01' if args.peak_caller == 'spp'
        else 'with p-val threshold {}'.format(args.pval_thresh)
        if args.peak_caller == 'macs2'
        else ''
    )

    num_peaks = QCCategory(
        'num_peaks',
        html_head='<h2>Number of raw peaks</h2>',
        html_foot="""
            Top {num_peak} raw peaks from {peak_caller} {extra_info}
        """.format(
            num_peak=args.cap_num_peak,
            peak_caller=args.peak_caller,
            extra_info=extra_info,
        ),
        parser=parse_num_peak_qc,
        map_key_desc=MAP_KEY_DESC_NUM_PEAK_QC,
        parent=replication,
    )
    num_peak_qcs = args.num_peak_qcs
    if num_peak_qcs:
        for rep_idx, qc_file in enumerate(num_peak_qcs):
            if not qc_file:
                continue
            num_peaks.add_log(qc_file, key=str_rep(rep_idx))

    return replication
def make_cat_peak_enrich(args, cat_root):
    """Build the 'peak_enrich' QC category (FRiP and annotation enrichment).

    Under ``cat_root`` a 'peak_enrich' category is created with two
    children: 'frac_reads_in_peaks' (itself split into raw peak caller,
    'overlap' and 'idr' sub-categories) and 'frac_reads_in_annot'.

    NOTE(review): a later definition with the same name, without the
    'frac_reads_in_annot' category, is visible in this source; if both live
    in one module the later definition shadows this one -- confirm.

    Args:
        args: parsed arguments.  Attributes read here: peak_caller,
            frip_qcs, frip_qcs_pr1, frip_qcs_pr2, frip_qc_pooled,
            frip_qc_ppr1, frip_qc_ppr2, frip_overlap_qcs,
            frip_overlap_qcs_pr, frip_overlap_qc_ppr, frip_idr_qcs,
            frip_idr_qcs_pr, frip_idr_qc_ppr and annot_enrich_qcs.
        cat_root: category the new 'peak_enrich' category is attached to.

    Returns:
        The 'peak_enrich' QCCategory.
    """

    def _add_replicate_logs(category, qc_files, suffix=None):
        # Register each non-empty file under str_rep(position), optionally
        # followed by suffix.
        if not qc_files:
            return
        for pos, qc_file in enumerate(qc_files):
            if not qc_file:
                continue
            rep_key = str_rep(pos)
            if suffix is not None:
                rep_key = rep_key + suffix
            category.add_log(qc_file, key=rep_key)

    def _add_true_pair_logs(category, qc_files):
        # Register each non-empty file under its true-replicate pair label.
        if not qc_files:
            return
        num_rep = infer_n_from_nC2(len(qc_files))
        for pos, qc_file in enumerate(qc_files):
            if qc_file:
                category.add_log(
                    qc_file, key=infer_pair_label_from_idx(num_rep, pos))

    def _add_pseudo_pair_logs(category, qc_files):
        # Register each non-empty file under its 1-based replicate number.
        if not qc_files:
            return
        for rep_no, qc_file in enumerate(qc_files, 1):
            if qc_file:
                category.add_log(
                    qc_file, key='rep{X}-pr1_vs_rep{X}-pr2'.format(X=rep_no))

    def _add_first_log(category, qc_files, key):
        # Register only the first file of a non-empty list.
        if qc_files:
            category.add_log(qc_files[0], key=key)

    peak_enrich = QCCategory(
        'peak_enrich',
        html_head='<h1>Peak enrichment</h1><hr>',
        parent=cat_root
    )

    frip = QCCategory(
        'frac_reads_in_peaks',
        html_head='<h2>Fraction of reads in peaks (FRiP)</h2>',
        html_foot="""
            <div id='help-FRiP'>
            For {peak_caller} raw peaks:<br>
            <p><ul>
            <li>repX: Peak from true replicate X </li>
            <li>repX-prY: Peak from Yth pseudoreplicates from replicate X </li>
            <li>pooled: Peak from pooled true replicates (pool of rep1, rep2, ...) </li>
            <li>pooled-pr1: Peak from 1st pooled pseudo replicate (pool of rep1-pr1, rep2-pr1, ...)</li>
            <li>pooled-pr2: Peak from 2nd pooled pseudo replicate (pool of rep1-pr2, rep2-pr2, ...)</li>
            </ul></p>
            <br>
            For overlap/IDR peaks:<br>
            <p><ul>
            <li>repX_vs_repY: Comparing two peaks from true replicates X and Y </li>
            <li>repX-pr1_vs_repX-pr2: Comparing two peaks from both pseudoreplicates from replicate X </li>
            <li>pooled-pr1_vs_pooled-pr2: Comparing two peaks from 1st and 2nd pooled pseudo replicates </li>
            </ul></p>
            </div>
        """.format(
            peak_caller=args.peak_caller),
        parent=peak_enrich,
    )

    # FRiP for raw peaks, in a sub-category named after the peak caller.
    frip_raw = QCCategory(
        args.peak_caller,
        html_head='<h3>FRiP for {} raw peaks</h3>'.format(args.peak_caller),
        parser=parse_frip_qc,
        map_key_desc=MAP_KEY_DESC_FRIP_QC,
        parent=frip
    )
    _add_replicate_logs(frip_raw, args.frip_qcs)
    _add_replicate_logs(frip_raw, args.frip_qcs_pr1, '-pr1')
    _add_replicate_logs(frip_raw, args.frip_qcs_pr2, '-pr2')
    _add_first_log(frip_raw, args.frip_qc_pooled, 'pooled')
    _add_first_log(frip_raw, args.frip_qc_ppr1, 'pooled-pr1')
    _add_first_log(frip_raw, args.frip_qc_ppr2, 'pooled-pr2')

    # FRiP for overlap peaks.
    frip_overlap = QCCategory(
        'overlap',
        html_head='<h3>FRiP for overlap peaks</h3>',
        parser=parse_frip_qc,
        map_key_desc=MAP_KEY_DESC_FRIP_QC,
        parent=frip
    )
    _add_true_pair_logs(frip_overlap, args.frip_overlap_qcs)
    _add_pseudo_pair_logs(frip_overlap, args.frip_overlap_qcs_pr)
    _add_first_log(frip_overlap, args.frip_overlap_qc_ppr,
                   'pooled-pr1_vs_pooled-pr2')

    # FRiP for IDR peaks.
    frip_idr = QCCategory(
        'idr',
        html_head='<h3>FRiP for IDR peaks</h3>',
        parser=parse_frip_qc,
        map_key_desc=MAP_KEY_DESC_FRIP_QC,
        parent=frip
    )
    _add_true_pair_logs(frip_idr, args.frip_idr_qcs)
    _add_pseudo_pair_logs(frip_idr, args.frip_idr_qcs_pr)
    _add_first_log(frip_idr, args.frip_idr_qc_ppr,
                   'pooled-pr1_vs_pooled-pr2')

    # Annotated region enrichment, one log per replicate.
    annot_enrich = QCCategory(
        'frac_reads_in_annot',
        html_head='<h2>Annotated genomic region enrichment</h2>',
        html_foot="""
            <p>Signal to noise can be assessed by considering whether reads are falling into
            known open regions (such as DHS regions) or not. A high fraction of reads
            should fall into the universal (across cell type) DHS set. A small fraction
            should fall into the blacklist regions. A high set (though not all) should
            fall into the promoter regions. A high set (though not all) should fall into
            the enhancer regions. The promoter regions should not take up all reads, as
            it is known that there is a bias for promoters in open chromatin assays.</p><br>
        """,
        parser=parse_annot_enrich_qc,
        map_key_desc=MAP_KEY_DESC_ANNOT_ENRICH_QC,
        parent=peak_enrich,
    )
    _add_replicate_logs(annot_enrich, args.annot_enrich_qcs)

    return peak_enrich
def make_cat_peak_enrich(args, cat_root):
    """Build the 'peak_enrich' QC category holding the FRiP logs.

    Under ``cat_root`` a 'peak_enrich' category is created with one child,
    'frac_reads_in_peaks', which is split into a raw peak caller
    sub-category, an 'overlap' sub-category and an 'idr' sub-category.

    NOTE(review): an earlier definition with the same name, which also
    builds a 'frac_reads_in_annot' category, is visible in this source; if
    both live in one module this later definition shadows it -- confirm.

    Args:
        args: parsed arguments.  Attributes read here: peak_caller,
            frip_qcs, frip_qcs_pr1, frip_qcs_pr2, frip_qc_pooled,
            frip_qc_ppr1, frip_qc_ppr2, frip_overlap_qcs,
            frip_overlap_qcs_pr, frip_overlap_qc_ppr, frip_idr_qcs,
            frip_idr_qcs_pr and frip_idr_qc_ppr.
        cat_root: category the new 'peak_enrich' category is attached to.

    Returns:
        The 'peak_enrich' QCCategory.
    """
    peak_enrich = QCCategory('peak_enrich',
                             html_head='<h1>Peak enrichment</h1><hr>',
                             parent=cat_root)

    frip = QCCategory(
        'frac_reads_in_peaks',
        html_head='<h2>Fraction of reads in peaks (FRiP)</h2>',
        html_foot="""
            <div id='help-FRiP'>
            For {peak_caller} raw peaks:<br>
            <p><ul>
            <li>repX: Peak from true replicate X </li>
            <li>repX-prY: Peak from Yth pseudoreplicates from replicate X </li>
            <li>pooled: Peak from pooled true replicates (pool of rep1, rep2, ...) </li>
            <li>pooled-pr1: Peak from 1st pooled pseudo replicate (pool of rep1-pr1, rep2-pr1, ...)</li>
            <li>pooled-pr2: Peak from 2nd pooled pseudo replicate (pool of rep1-pr2, rep2-pr2, ...)</li>
            </ul></p>
            <br>
            For overlap/IDR peaks:<br>
            <p><ul>
            <li>repX_vs_repY: Comparing two peaks from true replicates X and Y </li>
            <li>repX-pr1_vs_repX-pr2: Comparing two peaks from both pseudoreplicates from replicate X </li>
            <li>pooled-pr1_vs_pooled-pr2: Comparing two peaks from 1st and 2nd pooled pseudo replicates </li>
            </ul></p>
            </div>
        """.format(peak_caller=args.peak_caller),
        parent=peak_enrich,
    )

    # Raw peaks: the sub-category is named after the peak caller.
    frip_raw = QCCategory(
        args.peak_caller,
        html_head='<h3>FRiP for {} raw peaks</h3>'.format(args.peak_caller),
        parser=parse_frip_qc,
        map_key_desc=MAP_KEY_DESC_FRIP_QC,
        parent=frip)

    # (argument attribute, key suffix) for the per-replicate lists; a suffix
    # of None means the key is the bare replicate label.
    per_replicate_sources = (
        ('frip_qcs', None),
        ('frip_qcs_pr1', '-pr1'),
        ('frip_qcs_pr2', '-pr2'),
    )
    for attr_name, suffix in per_replicate_sources:
        qc_files = getattr(args, attr_name)
        if not qc_files:
            continue
        for rep_idx, qc_file in enumerate(qc_files):
            if not qc_file:
                continue
            rep_key = str_rep(rep_idx)
            if suffix is not None:
                rep_key = rep_key + suffix
            frip_raw.add_log(qc_file, key=rep_key)

    # (argument attribute, key) for the pooled logs; only the first file of
    # each list is registered.
    pooled_sources = (
        ('frip_qc_pooled', 'pooled'),
        ('frip_qc_ppr1', 'pooled-pr1'),
        ('frip_qc_ppr2', 'pooled-pr2'),
    )
    for attr_name, log_key in pooled_sources:
        qc_files = getattr(args, attr_name)
        if qc_files:
            frip_raw.add_log(qc_files[0], key=log_key)

    # Overlap and IDR sub-categories share one layout: true-replicate pairs,
    # per-replicate pseudo-replicate pairs, then the pooled pseudo-replicate
    # pair.  Each tuple is (category name, heading, true-replicate attribute,
    # pseudo-replicate attribute, pooled pseudo-replicate attribute).
    pairwise_layouts = (
        ('overlap', '<h3>FRiP for overlap peaks</h3>',
         'frip_overlap_qcs', 'frip_overlap_qcs_pr', 'frip_overlap_qc_ppr'),
        ('idr', '<h3>FRiP for IDR peaks</h3>',
         'frip_idr_qcs', 'frip_idr_qcs_pr', 'frip_idr_qc_ppr'),
    )
    for cat_name, heading, tr_attr, pr_attr, ppr_attr in pairwise_layouts:
        pairwise = QCCategory(cat_name,
                              html_head=heading,
                              parser=parse_frip_qc,
                              map_key_desc=MAP_KEY_DESC_FRIP_QC,
                              parent=frip)

        true_rep_qcs = getattr(args, tr_attr)
        if true_rep_qcs:
            num_rep = infer_n_from_nC2(len(true_rep_qcs))
            for pair_idx, qc_file in enumerate(true_rep_qcs):
                if not qc_file:
                    continue
                pairwise.add_log(
                    qc_file,
                    key=infer_pair_label_from_idx(num_rep, pair_idx))

        pseudo_rep_qcs = getattr(args, pr_attr)
        if pseudo_rep_qcs:
            # Replicates are numbered from 1 in the keys.
            for rep_no, qc_file in enumerate(pseudo_rep_qcs, 1):
                if not qc_file:
                    continue
                pairwise.add_log(
                    qc_file,
                    key='rep{X}-pr1_vs_rep{X}-pr2'.format(X=rep_no))

        pooled_qcs = getattr(args, ppr_attr)
        if pooled_qcs:
            pairwise.add_log(pooled_qcs[0], key='pooled-pr1_vs_pooled-pr2')

    return peak_enrich