def _drop_uncomparable_qc_sections(qc):
    """Strip, in place, the QC sections that are excluded from comparison.

    The same filter is applied to both the freshly generated QC dict and
    the reference one so that they are compared on equal terms. Missing
    sections are tolerated.

    Args:
        qc: QC dict (modified in place).

    Returns:
        The same `qc` object, for convenience.
    """
    # exclude general section from comparing
    # because it includes metadata like date, pipeline_ver, ...
    # we want to compare actual quality metrics only
    qc.pop('general', None)
    # exclude JSD (last 3 columns are random)
    # JSD is tested in task level test.
    if 'align_enrich' in qc and 'jsd' in qc['align_enrich']:
        qc['align_enrich'].pop('jsd')
    return qc


def main():
    """Build the QC report (HTML and JSON) and compare it to a reference.

    Writes the HTML report to `args.out_qc_html`, the QC JSON to
    `args.out_qc_json`, and the result of the comparison against
    `args.qc_json_ref` (`True`/`False`; `False` when no reference is
    given) to `qc_json_ref_match.txt` in the current working directory.
    """
    # read params
    log.info('Parsing QC logs and reading QC plots...')
    args = parse_arguments()

    # make a root QCCategory
    cat_root = make_cat_root(args)

    # make QCCategory for each category
    make_cat_align(args, cat_root)
    make_cat_lib_complexity(args, cat_root)
    make_cat_replication(args, cat_root)
    make_cat_peak_stat(args, cat_root)
    make_cat_align_enrich(args, cat_root)
    make_cat_peak_enrich(args, cat_root)
    make_cat_etc(args, cat_root)

    log.info('Creating HTML report...')
    write_txt(args.out_qc_html, cat_root.to_html())

    log.info('Creating QC JSON file...')
    j = cat_root.to_dict()
    write_txt(args.out_qc_json, json.dumps(j, indent=4))

    match_qc_json_ref = False
    if args.qc_json_ref:
        log.info('Comparing QC JSON file with reference...')
        # the JSON file has already been written above, so filtering `j`
        # in place here does not affect the report on disk
        _drop_uncomparable_qc_sections(j)
        with open(args.qc_json_ref, 'r') as fp:
            j_ref = json.load(fp, object_pairs_hook=OrderedDict)
        _drop_uncomparable_qc_sections(j_ref)
        match_qc_json_ref = j == j_ref

    run_shell_cmd('echo {} > qc_json_ref_match.txt'.format(match_qc_json_ref))

    log.info('All done.')
def frip_shifted(ta, peak, chrsz, fraglen, out_dir):
    """Compute a FRiP QC value from strand-aware slopped reads.

    Each record of `ta` is adjusted with `bedtools slop -s` using
    `-l -half_fraglen -r half_fraglen`, records with invalid coordinates
    are dropped, and the records overlapping at least one peak are
    counted. That count divided by the number of lines in `ta` is
    written to `<out_dir>/<peak basename without ext>.frip.qc`.

    Args:
        ta: Path to the reads file given to `bedtools slop -i`.
        peak: Path to the peak file; a file with zero lines gives 0.0.
        chrsz: Path to the genome file given to `bedtools slop -g`.
        fraglen: Fragment length used to derive the slop size.
        out_dir: Directory for the QC file and the temporary file.

    Returns:
        Path to the written `.frip.qc` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)
    # NOTE(review): under Python 3 this is true division, so the value
    # formatted into the command can be fractional (e.g. 100.5) --
    # confirm that bedtools slop handles this as intended.
    half_fraglen = (fraglen + 1) / 2

    if get_num_lines(peak) == 0:
        val1 = 0.0
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp2 = gunzip(peak, 'tmp2', out_dir)
        try:
            cmd = (
                'bedtools slop -i {} -g {} '
                '-s -l {} -r {} | '
                "awk '{{if ($2>=0 && $3>=0 && $2<=$3) print $0}}' | "
                'bedtools intersect -nonamecheck -a stdin -b {} '
                '-wa -u | wc -l'
            ).format(ta, chrsz, -half_fraglen, half_fraglen, tmp2)  # peak
            val1 = run_shell_cmd(cmd)
        finally:
            # remove the temporary peak file even if the pipeline failed
            rm_f(tmp2)
    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1) / float(val2)))
    return frip_qc
def frip(ta, peak, out_dir):
    """Compute a FRiP QC value: reads overlapping peaks over all reads.

    Counts the records of `ta` that overlap at least one record of
    `peak` (`bedtools intersect -wa -u`), divides by the number of lines
    in `ta` and writes the ratio to
    `<out_dir>/<peak basename without ext>.frip.qc`.

    Args:
        ta: Path to the reads file.
        peak: Path to the peak file; a file with zero lines gives 0.0.
        out_dir: Directory for the QC file and the temporary files.

    Returns:
        Path to the written `.frip.qc` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)

    # temporary files are registered as soon as they exist so that the
    # `finally` clause removes them on every exit path
    tmp_files = []
    try:
        if get_num_lines(peak) == 0:
            val1 = 0.0
        else:
            # due to bedtools bug when .gz is given for -a and -b
            tmp1 = gunzip(ta, 'tmp1', out_dir)
            tmp_files.append(tmp1)
            tmp2 = gunzip(peak, 'tmp2', out_dir)
            tmp_files.append(tmp2)

            cmd = 'bedtools intersect -nonamecheck -a {} -b {} -wa -u | wc -l'
            cmd = cmd.format(
                tmp1,  # ta
                tmp2)  # peak
            val1 = run_shell_cmd(cmd)
        val2 = get_num_lines(ta)
        write_txt(frip_qc, str(float(val1) / float(val2)))
    finally:
        rm_f(tmp_files)
    return frip_qc
def _ctl_depths_differ_too_much(depths_ctl, num_ctl, max_ratio):
    """Tell whether any pair of individual controls is too unbalanced.

    Args:
        depths_ctl: Mapping from control index (0..num_ctl-1) to depth.
        num_ctl: Number of individual controls to check.
        max_ratio: Largest tolerated depth ratio between two controls.

    Returns:
        True as soon as one pair's depth ratio (in either direction)
        exceeds `max_ratio`, False otherwise.
    """
    for i in range(num_ctl):
        for j in range(i + 1, num_ctl):
            depth_i, depth_j = depths_ctl[i], depths_ctl[j]
            if depth_i == 0 or depth_j == 0:
                # avoid ZeroDivisionError: an empty control next to a
                # non-empty one is an unbounded ratio, while two empty
                # controls do not differ
                if depth_i != depth_j:
                    return True
                continue
            if (depth_i / float(depth_j) > max_ratio
                    or depth_j / float(depth_i) > max_ratio):
                return True
    return False


def main():
    """Choose a control for each IP replicate and write the choices.

    For every replicate an index into the controls is chosen, where -1
    means the pooled control. Optionally a subsampling depth for the
    chosen control is computed (0 is written when the control depth does
    not exceed the limit). Three files are written into `args.out_dir`:
    the chosen control indices, the per-replicate subsampling depths and
    the subsampling depth of the pooled control.
    """
    # read params
    args = parse_arguments()
    log.info('Initializing and making output directory...')

    # make out_dir (root of all outputs)
    mkdir_p(args.out_dir)

    # reproducibility QC
    log.info('Choosing appropriate control for each IP replicate...')
    num_rep = len(args.tas)
    num_ctl = len(args.ctl_tas)

    # num lines in tagaligns
    depths = [get_num_lines(ta) for ta in args.tas]
    # num lines in control tagaligns
    depths_ctl = [get_num_lines(ctl_ta) for ctl_ta in args.ctl_tas]
    depth_rep_pooled = sum(depths)
    depth_ctl_pooled = sum(depths_ctl)

    # make them dicts including -1 key (meaning pooled one)
    depths = dict(enumerate(depths))
    depths_ctl = dict(enumerate(depths_ctl))

    depths[-1] = depth_rep_pooled
    depths_ctl[-1] = depth_ctl_pooled

    ctl_ta_idx = [0]*num_rep
    if num_ctl == 1:
        # if only one control, use it for all replicates
        pass
    elif args.always_use_pooled_ctl:
        # if --always-use-pooled-ctl, then always use pooled control
        ctl_ta_idx = [-1]*num_rep
    elif _ctl_depths_differ_too_much(
            depths_ctl, num_ctl, args.ctl_depth_ratio):
        # if ratio of two entries in any pair of control tagaligns
        # > ctl_depth_ratio then use pooled control for all
        log.info(
            'Number of reads in controls differ by a factor of {}. '
            'Using pooled controls.'.format(
                args.ctl_depth_ratio))
        ctl_ta_idx = [-1]*num_rep
    else:
        # multiple, comparably deep controls:
        # pair each replicate with the control of the same index if it
        # exists and is at least as deep as the replicate
        for i in range(num_rep):
            if i > num_ctl-1:
                ctl_ta_idx[i] = -1  # use pooled control
            elif depths_ctl[i] < depths[i]:
                log.info(
                    'Fewer reads in control {} than experiment replicate '
                    '{}. Using pooled control for replicate {}.'.format(
                        i+1, i+1, i+1))
                ctl_ta_idx[i] = -1  # use pooled control
            else:
                ctl_ta_idx[i] = i

    ctl_ta_subsample = [0] * num_rep
    ctl_ta_subsampled_pooled = 0
    if args.exp_ctl_depth_ratio_limit or args.ctl_depth_limit:
        # subsampling chosen control for each replicate
        for rep in range(num_rep):
            chosen_ctl = ctl_ta_idx[rep]
            depth = depths[rep]
            depth_ctl = depths_ctl[chosen_ctl]
            limit = int(max(
                depth * args.exp_ctl_depth_ratio_limit,
                args.ctl_depth_limit))
            if depth_ctl > limit:
                ctl_ta_subsample[rep] = limit

        # subsampling pooled control for pooled replicate
        limit = int(max(
            depth_rep_pooled * args.exp_ctl_depth_ratio_limit,
            args.ctl_depth_limit))
        if depth_ctl_pooled > limit:
            ctl_ta_subsampled_pooled = limit

    log.info('Writing idx.txt...')
    out_txt = os.path.join(args.out_dir, args.out_tsv_basename)
    write_txt(out_txt, ctl_ta_idx)

    log.info('Writing subsample txt...')
    out_subsample_txt = os.path.join(
        args.out_dir, args.out_tsv_subsample_basename)
    write_txt(out_subsample_txt, ctl_ta_subsample)

    log.info('Writing subsample_pooled txt...')
    out_subsample_pooled_txt = os.path.join(
        args.out_dir, args.out_txt_subsample_pooled_basename)
    write_txt(out_subsample_pooled_txt, ctl_ta_subsampled_pooled)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')