예제 #1
0
def main():
    # read params
    args = parse_arguments()
    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    # declare temp arrays
    temp_files = []  # files to deleted later at the end

    log.info('Subsampling TAGALIGN for xcor...')
    if args.paired_end:
        ta_subsampled = subsample_ta_pe(args.ta, args.subsample, True,
                                        args.mito_chr_name, True, args.out_dir)
    else:
        ta_subsampled = subsample_ta_se(args.ta, args.subsample, True,
                                        args.mito_chr_name, args.out_dir)
    temp_files.append(ta_subsampled)

    log.info('Cross-correlation analysis...')
    xcor_plot_pdf, xcor_plot_png, xcor_score, fraglen_txt = xcor(
        ta_subsampled, args.speak, args.mito_chr_name, args.nth, args.out_dir,
        args.chip_seq_type, args.exclusion_range_min, args.exclusion_range_max)

    log.info('Removing temporary files...')
    rm_f(temp_files)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen, cap_num_peak,
          ctl_subsample, ctl_paired_end, out_dir):
    basename_ta = os.path.basename(strip_ext_ta(ta))
    if ctl_ta:
        if ctl_subsample:
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(ctl_ta,
                                         ctl_subsample,
                                         non_mito=False,
                                         mito_chr_name=None,
                                         r1_only=False,
                                         out_dir=out_dir)
            else:
                ctl_ta = subsample_ta_se(ctl_ta,
                                         ctl_subsample,
                                         non_mito=False,
                                         mito_chr_name=None,
                                         out_dir=out_dir)

        basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
        basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
        if len(basename_prefix) > 200:  # UNIX cannot have len(filename) > 255
            basename_prefix = '{}_x_control'.format(basename_ta)
    else:
        basename_prefix = basename_ta
    prefix = os.path.join(out_dir, basename_prefix)
    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix, 'pval{}'.format(pval_thresh),
        human_readable_number(cap_num_peak))
    npeak_tmp = '{}.tmp'.format(npeak)
    npeak_tmp2 = '{}.tmp2'.format(npeak)
    temp_files = []

    cmd0 = ' macs2 callpeak '
    cmd0 += '-t {} {} -f BED -n {} -g {} -p {} '
    cmd0 += '--nomodel --shift {} --extsize {} --keep-dup all -B --SPMR'
    cmd0 = cmd0.format(ta, '-c {}'.format(ctl_ta) if ctl_ta else '', prefix,
                       gensz, pval_thresh, 0, fraglen)
    run_shell_cmd(cmd0)

    cmd1 = 'LC_COLLATE=C sort -k 8gr,8gr "{}"_peaks.narrowPeak | '
    cmd1 += 'awk \'BEGIN{{OFS="\\t"}}'
    cmd1 += '{{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
    cmd1 += '$10=$2+int(($3-$2+1)/2.0); print $0}}\' > {}'
    cmd1 = cmd1.format(prefix, npeak_tmp)
    run_shell_cmd(cmd1)

    cmd2 = 'head -n {} {} > {}'.format(cap_num_peak, npeak_tmp, npeak_tmp2)
    run_shell_cmd(cmd2)

    # clip peaks between 0-chromSize.
    bed_clip(npeak_tmp2, chrsz, npeak)

    rm_f([npeak_tmp, npeak_tmp2])

    # remove temporary files
    temp_files.append("{}_*".format(prefix))
    rm_f(temp_files)

    return npeak
def main():
    # read params
    args = parse_arguments()
    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    if args.paired_end:
        subsampled_ta = subsample_ta_pe(args.ta,
                                        args.subsample,
                                        non_mito=False,
                                        mito_chr_name=None,
                                        r1_only=False,
                                        out_dir=args.out_dir)
    else:
        subsampled_ta = subsample_ta_se(args.ta,
                                        args.subsample,
                                        non_mito=False,
                                        mito_chr_name=None,
                                        out_dir=args.out_dir)
    log.info('Checking if output is empty...')
    assert_file_not_empty(subsampled_ta)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
예제 #4
0
def spp(ta, ctl_ta, chrsz, fraglen, cap_num_peak, fdr_thresh, ctl_subsample,
        ctl_paired_end, nth, out_dir):
    basename_ta = os.path.basename(strip_ext_ta(ta))

    if ctl_subsample:
        if ctl_paired_end:
            ctl_ta = subsample_ta_pe(ctl_ta,
                                     ctl_subsample,
                                     non_mito=False,
                                     mito_chr_name=None,
                                     r1_only=False,
                                     out_dir=out_dir)
        else:
            ctl_ta = subsample_ta_se(ctl_ta,
                                     ctl_subsample,
                                     non_mito=False,
                                     mito_chr_name=None,
                                     out_dir=out_dir)
    basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
    basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
    if len(basename_prefix) > 200:  # UNIX cannot have filename > 255
        basename_prefix = '{}_x_control'.format(basename_ta)
    nth_param = '-p={}'.format(nth) if nth >= 2 else ''
    prefix = os.path.join(out_dir, basename_prefix)
    rpeak = '{}.{}.regionPeak.gz'.format(prefix,
                                         human_readable_number(cap_num_peak))
    rpeak_tmp_prefix = '{}.tmp'.format(rpeak)
    rpeak_tmp_gz = '{}.tmp.gz'.format(rpeak)
    rpeak_tmp2 = '{}.tmp2'.format(rpeak)

    cmd0 = 'Rscript --max-ppsize=500000 $(which run_spp.R) -c={} -i={} '
    cmd0 += '-npeak={} -odir={} -speak={} -savr={} -fdr={} -rf {}'
    cmd0 = cmd0.format(ta, ctl_ta, cap_num_peak, os.path.abspath(out_dir),
                       fraglen, rpeak_tmp_prefix, fdr_thresh, nth_param)
    run_shell_cmd(cmd0)

    # if we have scientific representation of chr coord. then convert it to int
    cmd1 = 'zcat -f {} | awk \'BEGIN{{OFS="\\t"}}'
    cmd1 += '{{if ($2<0) $2=0; '
    cmd1 += 'print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}}\' > {}'
    cmd1 = cmd1.format(rpeak_tmp_gz, rpeak_tmp2)
    run_shell_cmd(cmd1)
    rm_f(rpeak_tmp_gz)

    # clip peaks between 0-chromSize.
    bed_clip(rpeak_tmp2, chrsz, rpeak)
    rm_f(rpeak_tmp2)

    return rpeak
예제 #5
0
def main():
    # read params
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    # declare temp arrays
    temp_files = []  # files to deleted later at the end

    log.info('Converting BAM to TAGALIGN...')
    if args.paired_end:
        ta = bam2ta_pe(args.bam, args.nth, args.out_dir)
    else:
        ta = bam2ta_se(args.bam, args.out_dir)

    if args.subsample:
        log.info('Subsampling TAGALIGN...')
        if args.paired_end:
            subsampled_ta = subsample_ta_pe(ta, args.subsample, False,
                                            args.mito_chr_name, False,
                                            args.out_dir)
        else:
            subsampled_ta = subsample_ta_se(ta, args.subsample, False,
                                            args.mito_chr_name, args.out_dir)
        temp_files.append(ta)
    else:
        subsampled_ta = ta

    if args.disable_tn5_shift:
        shifted_ta = subsampled_ta
    else:
        log.info("TN5-shifting TAGALIGN...")
        shifted_ta = tn5_shift_ta(subsampled_ta, args.out_dir)
        temp_files.append(subsampled_ta)

    log.info('Checking if output is empty...')
    assert_file_not_empty(shifted_ta)

    log.info('Removing temporary files...')
    rm_f(temp_files)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
예제 #6
0
def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen, cap_num_peak,
          ctl_subsample, ctl_paired_end, mem_gb, out_dir):
    basename_ta = os.path.basename(strip_ext_ta(ta))
    if ctl_ta:
        if ctl_subsample:
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(ctl_ta,
                                         ctl_subsample,
                                         non_mito=False,
                                         mito_chr_name=None,
                                         r1_only=False,
                                         out_dir=out_dir)
            else:
                ctl_ta = subsample_ta_se(ctl_ta,
                                         ctl_subsample,
                                         non_mito=False,
                                         mito_chr_name=None,
                                         out_dir=out_dir)

        basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
        basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
        if len(basename_prefix) > 200:  # UNIX cannot have len(filename) > 255
            basename_prefix = '{}_x_control'.format(basename_ta)
    else:
        basename_prefix = basename_ta
    prefix = os.path.join(out_dir, basename_prefix)
    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix, 'pval{}'.format(pval_thresh),
        human_readable_number(cap_num_peak))
    npeak_tmp = '{}.tmp'.format(npeak)
    npeak_tmp2 = '{}.tmp2'.format(npeak)
    temp_files = []

    run_shell_cmd(
        ' macs2 callpeak '
        '-t {ta} {ctl_param} -f BED -n {prefix} -g {gensz} -p {pval_thresh} '
        '--nomodel --shift {shiftsize} --extsize {extsize} --keep-dup all -B --SPMR'
        .format(
            ta=ta,
            ctl_param='-c {ctl_ta}'.format(ctl_ta=ctl_ta) if ctl_ta else '',
            prefix=prefix,
            gensz=gensz,
            pval_thresh=pval_thresh,
            shiftsize=0,
            extsize=fraglen,
        ))

    run_shell_cmd(
        'LC_COLLATE=C sort -k 8gr,8gr {sort_param} "{prefix}_peaks.narrowPeak" | '
        'awk \'BEGIN{{OFS="\\t"}}'
        '{{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}}\' > {npeak_tmp}'.format(
            sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
            prefix=prefix,
            npeak_tmp=npeak_tmp,
        ))

    run_shell_cmd('head -n {cap_num_peak} {npeak_tmp} > {npeak_tmp2}'.format(
        cap_num_peak=cap_num_peak,
        npeak_tmp=npeak_tmp,
        npeak_tmp2=npeak_tmp2,
    ))

    # clip peaks between 0-chromSize.
    bed_clip(npeak_tmp2, chrsz, npeak)

    rm_f([npeak_tmp, npeak_tmp2])

    # remove temporary files
    temp_files.append("{prefix}_*".format(prefix=prefix))
    rm_f(temp_files)

    return npeak
예제 #7
0
def macs2_signal_track(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
                       ctl_subsample, ctl_paired_end, out_dir):
    basename_ta = os.path.basename(strip_ext_ta(ta))
    if ctl_ta:
        if ctl_subsample:
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(ctl_ta,
                                         ctl_subsample,
                                         non_mito=False,
                                         mito_chr_name=None,
                                         r1_only=False,
                                         out_dir=out_dir)
            else:
                ctl_ta = subsample_ta_se(ctl_ta,
                                         ctl_subsample,
                                         non_mito=False,
                                         mito_chr_name=None,
                                         out_dir=out_dir)
        basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
        basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
        if len(basename_prefix) > 200:  # UNIX cannot have len(filename) > 255
            basename_prefix = '{}_x_control'.format(basename_ta)
    else:
        basename_prefix = basename_ta
    prefix = os.path.join(out_dir, basename_prefix)
    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    temp_files = []

    cmd0 = ' macs2 callpeak '
    cmd0 += '-t {} {} -f BED -n {} -g {} -p {} '
    cmd0 += '--nomodel --shift {} --extsize {} --keep-dup all -B --SPMR'
    cmd0 = cmd0.format(ta, '-c {}'.format(ctl_ta) if ctl_ta else '', prefix,
                       gensz, pval_thresh, 0, fraglen)
    run_shell_cmd(cmd0)

    cmd3 = 'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
    cmd3 += '-c "{}"_control_lambda.bdg '
    cmd3 += '--o-prefix "{}" -m FE '
    cmd3 = cmd3.format(prefix, prefix, prefix)
    run_shell_cmd(cmd3)

    cmd4 = 'bedtools slop -i "{}"_FE.bdg -g {} -b 0 | '
    cmd4 += 'awk \'{{if ($3 != -1) print $0}}\' |'
    cmd4 += 'bedClip stdin {} {}'
    cmd4 = cmd4.format(prefix, chrsz, chrsz, fc_bedgraph)
    run_shell_cmd(cmd4)

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    cmd5 = 'LC_COLLATE=C sort -k1,1 -k2,2n {} | ' \
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '\
        'prev_chr==$1 && prev_chr_e<=$2)) ' \
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {}'.format(
            fc_bedgraph,
            fc_bedgraph_srt)
    run_shell_cmd(cmd5)
    rm_f(fc_bedgraph)

    cmd6 = 'bedGraphToBigWig {} {} {}'
    cmd6 = cmd6.format(fc_bedgraph_srt, chrsz, fc_bigwig)
    run_shell_cmd(cmd6)
    rm_f(fc_bedgraph_srt)

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0

    cmd7 = 'macs2 bdgcmp -t "{}"_treat_pileup.bdg '
    cmd7 += '-c "{}"_control_lambda.bdg '
    cmd7 += '--o-prefix {} -m ppois -S {}'
    cmd7 = cmd7.format(prefix, prefix, prefix, sval)
    run_shell_cmd(cmd7)

    cmd8 = 'bedtools slop -i "{}"_ppois.bdg -g {} -b 0 | '
    cmd8 += 'awk \'{{if ($3 != -1) print $0}}\' |'
    cmd8 += 'bedClip stdin {} {}'
    cmd8 = cmd8.format(prefix, chrsz, chrsz, pval_bedgraph)
    run_shell_cmd(cmd8)

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    cmd9 = 'LC_COLLATE=C sort -k1,1 -k2,2n {} | ' \
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '\
        'prev_chr==$1 && prev_chr_e<=$2)) ' \
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {}'.format(
            pval_bedgraph,
            pval_bedgraph_srt)
    run_shell_cmd(cmd9)
    rm_f(pval_bedgraph)

    cmd10 = 'bedGraphToBigWig {} {} {}'
    cmd10 = cmd10.format(pval_bedgraph_srt, chrsz, pval_bigwig)
    run_shell_cmd(cmd10)
    rm_f(pval_bedgraph_srt)

    # remove temporary files
    temp_files.append("{}_*".format(prefix))
    rm_f(temp_files)

    return fc_bigwig, pval_bigwig
def macs2_signal_track(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
                       ctl_subsample, ctl_paired_end, mem_gb, out_dir):
    basename_ta = os.path.basename(strip_ext_ta(ta))
    if ctl_ta:
        if ctl_subsample:
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(ctl_ta,
                                         ctl_subsample,
                                         non_mito=False,
                                         mito_chr_name=None,
                                         r1_only=False,
                                         out_dir=out_dir)
            else:
                ctl_ta = subsample_ta_se(ctl_ta,
                                         ctl_subsample,
                                         non_mito=False,
                                         mito_chr_name=None,
                                         out_dir=out_dir)
        basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
        basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
        if len(basename_prefix) > 200:  # UNIX cannot have len(filename) > 255
            basename_prefix = '{}_x_control'.format(basename_ta)
    else:
        basename_prefix = basename_ta
    prefix = os.path.join(out_dir, basename_prefix)
    fc_bigwig = '{}.fc.signal.bigwig'.format(prefix)
    pval_bigwig = '{}.pval.signal.bigwig'.format(prefix)
    # temporary files
    fc_bedgraph = '{}.fc.signal.bedgraph'.format(prefix)
    fc_bedgraph_srt = '{}.fc.signal.srt.bedgraph'.format(prefix)
    pval_bedgraph = '{}.pval.signal.bedgraph'.format(prefix)
    pval_bedgraph_srt = '{}.pval.signal.srt.bedgraph'.format(prefix)

    temp_files = []

    run_shell_cmd(
        ' macs2 callpeak '
        '-t {ta} {ctl_param} -f BED -n {prefix} -g {gensz} -p {pval_thresh} '
        '--nomodel --shift {shiftsize} --extsize {extsize} --keep-dup all -B --SPMR'
        .format(
            ta=ta,
            ctl_param='-c {ctl_ta}'.format(ctl_ta=ctl_ta) if ctl_ta else '',
            prefix=prefix,
            gensz=gensz,
            pval_thresh=pval_thresh,
            shiftsize=0,
            extsize=fraglen,
        ))

    run_shell_cmd('macs2 bdgcmp -t "{prefix}_treat_pileup.bdg" '
                  '-c "{prefix}_control_lambda.bdg" '
                  '--o-prefix "{prefix}" -m FE '.format(prefix=prefix, ))

    run_shell_cmd('bedtools slop -i "{prefix}_FE.bdg" -g {chrsz} -b 0 | '
                  'awk \'{{if ($3 != -1) print $0}}\' |'
                  'bedClip stdin {chrsz} {fc_bedgraph}'.format(
                      prefix=prefix,
                      chrsz=chrsz,
                      fc_bedgraph=fc_bedgraph,
                  ))

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    run_shell_cmd(
        'LC_COLLATE=C sort -k1,1 -k2,2n {sort_param} {fc_bedgraph} | '
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '
        'prev_chr==$1 && prev_chr_e<=$2)) '
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {fc_bedgraph_srt}'.
        format(sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
               fc_bedgraph=fc_bedgraph,
               fc_bedgraph_srt=fc_bedgraph_srt))
    rm_f(fc_bedgraph)

    run_shell_cmd(
        'bedGraphToBigWig {fc_bedgraph_srt} {chrsz} {fc_bigwig}'.format(
            fc_bedgraph_srt=fc_bedgraph_srt,
            chrsz=chrsz,
            fc_bigwig=fc_bigwig,
        ))
    rm_f(fc_bedgraph_srt)

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0

    run_shell_cmd('macs2 bdgcmp -t "{prefix}_treat_pileup.bdg" '
                  '-c "{prefix}_control_lambda.bdg" '
                  '--o-prefix {prefix} -m ppois -S {sval}'.format(
                      prefix=prefix,
                      sval=sval,
                  ))

    run_shell_cmd('bedtools slop -i "{prefix}_ppois.bdg" -g {chrsz} -b 0 | '
                  'awk \'{{if ($3 != -1) print $0}}\' |'
                  'bedClip stdin {chrsz} {pval_bedgraph}'.format(
                      prefix=prefix,
                      chrsz=chrsz,
                      pval_bedgraph=pval_bedgraph,
                  ))

    # sort and remove any overlapping regions in bedgraph by comparing two lines in a row
    run_shell_cmd(
        'LC_COLLATE=C sort -k1,1 -k2,2n {sort_param} {pval_bedgraph} | '
        'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '
        'prev_chr==$1 && prev_chr_e<=$2)) '
        '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {pval_bedgraph_srt}'.
        format(
            sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
            pval_bedgraph=pval_bedgraph,
            pval_bedgraph_srt=pval_bedgraph_srt,
        ))
    rm_f(pval_bedgraph)

    run_shell_cmd(
        'bedGraphToBigWig {pval_bedgraph_srt} {chrsz} {pval_bigwig}'.format(
            pval_bedgraph_srt=pval_bedgraph_srt,
            chrsz=chrsz,
            pval_bigwig=pval_bigwig))
    rm_f(pval_bedgraph_srt)

    # remove temporary files
    temp_files.append("{prefix}_*".format(prefix=prefix))
    rm_f(temp_files)

    return fc_bigwig, pval_bigwig