def subsample_ta_se(ta, subsample, non_mito, mito_chr_name, out_dir):
    """Filter and/or randomly subsample a single-ended tagAlign file.

    Args:
        ta: Path of the input tagAlign (read through ``zcat -f``).
        subsample: Number of lines to keep; subsampling is skipped unless
            this is greater than 0.
        non_mito: When truthy, lines starting with ``mito_chr_name`` are
            dropped and ``no_chrM.`` is added to the output name.
        mito_chr_name: Chromosome name to filter out; only used when
            ``non_mito`` is truthy.
        out_dir: Directory in which the output file is created.

    Returns:
        Path of the gzipped tagAlign written by the shell pipeline.
    """
    out_prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))

    mito_tag = 'no_chrM.' if non_mito else ''
    if subsample > 0:
        count_tag = '{}.'.format(human_readable_number(subsample))
    else:
        count_tag = ''
    ta_subsampled = '{}.{}{}tagAlign.gz'.format(out_prefix, mito_tag, count_tag)

    # The pipeline relies on process substitution, i.e. it is bash-only.
    stages = ['zcat -f {}']
    if non_mito:
        stages.append('grep -v \'^' + mito_chr_name + '\\b\'')
    if subsample > 0:
        # The random stream is keyed by the byte count of the input, so the
        # same input should always yield the same selection of lines.
        stages.append(
            'shuf -n {} --random-source=<(openssl enc -aes-256-ctr '
            '-pass pass:$(zcat -f {} | wc -c) -nosalt '
            '</dev/zero 2>/dev/null)'
        )
        fmt_args = (ta, subsample, ta, ta_subsampled)
    else:
        fmt_args = (ta, ta_subsampled)
    stages.append('gzip -nc > {}')

    run_shell_cmd(' | '.join(stages).format(*fmt_args))
    return ta_subsampled
def spp(ta, ctl_ta, fraglen, cap_num_peak, nth, out_dir):
    """Call peaks with SPP (``run_spp.R``) and write a gzipped regionPeak.

    Args:
        ta: Path of the ChIP tagAlign (passed as ``-c=``).
        ctl_ta: Path of the control tagAlign (passed as ``-i=``).
        fraglen: Value passed as ``-speak=``.
        cap_num_peak: Value passed as ``-npeak=``; also encoded in the
            output file name.
        nth: Number of threads. ``-p=<nth>`` is passed to ``run_spp.R`` only
            when at least 2 threads are requested.
        out_dir: Output directory.

    Returns:
        Path of the gzipped regionPeak file.
    """
    basename_ta = os.path.basename(strip_ext_ta(ta))
    basename_ctl_ta = os.path.basename(strip_ext_ta(ctl_ta))
    basename_prefix = '{}_x_{}'.format(basename_ta, basename_ctl_ta)
    if len(basename_prefix) > 200:  # UNIX cannot have filename > 255
        basename_prefix = '{}_x_control'.format(basename_ta)

    # Request parallel workers only when more than one thread is available.
    # The previous condition (nth < 2) was inverted: it dropped the option
    # exactly when several threads had been requested.
    nth_param = '-p={}'.format(nth) if nth >= 2 else ''

    prefix = os.path.join(out_dir, basename_prefix)
    rpeak = '{}.{}.regionPeak.gz'.format(
        prefix, human_readable_number(cap_num_peak))
    rpeak_tmp = '{}.tmp'.format(rpeak)
    rpeak_tmp_gz = '{}.tmp.gz'.format(rpeak)

    cmd0 = 'Rscript --max-ppsize=500000 $(which run_spp.R) -c={} -i={} '
    cmd0 += '-npeak={} -odir={} -speak={} -savr={} -rf {}'
    cmd0 = cmd0.format(
        ta,
        ctl_ta,
        cap_num_peak,
        os.path.abspath(out_dir),
        fraglen,
        rpeak_tmp,
        nth_param)
    run_shell_cmd(cmd0)

    # If chromosome coordinates come out in scientific notation, convert
    # them to integers; negative start coordinates are clamped to 0.
    cmd1 = 'zcat -f {} | awk \'BEGIN{{OFS="\\t"}}'
    cmd1 += '{{if ($2<0) $2=0; '
    cmd1 += 'print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}}\' | '
    cmd1 += 'gzip -f -nc > {}'
    cmd1 = cmd1.format(rpeak_tmp, rpeak)
    run_shell_cmd(cmd1)

    # Remove both possible spellings of the intermediate file.
    rm_f([rpeak_tmp, rpeak_tmp_gz])
    return rpeak
def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
          cap_num_peak, ctl_subsample, ctl_paired_end, out_dir):
    """Call narrow peaks with ``macs2 callpeak``, optionally against a control.

    Args:
        ta: Path of the treatment tagAlign.
        ctl_ta: Path of the control tagAlign; falsy to call without control.
        chrsz: Chromosome sizes file handed to ``bed_clip``.
        gensz: Genome size passed as ``-g``.
        pval_thresh: P-value threshold passed as ``-p``.
        shift: Accepted for interface compatibility; ``--shift 0`` is always
            used.
        fraglen: Value passed as ``--extsize``.
        cap_num_peak: Maximum number of peaks kept (``head -n``).
        ctl_subsample: When truthy, the control is subsampled to this many
            reads before peak calling.
        ctl_paired_end: Selects the paired-end subsampling routine.
        out_dir: Output directory.

    Returns:
        Path of the final narrowPeak file handed to ``bed_clip``.
    """
    sample_name = os.path.basename(strip_ext_ta(ta))
    if not ctl_ta:
        out_name = sample_name
    else:
        if ctl_subsample:
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(
                    ctl_ta, ctl_subsample,
                    non_mito=False, mito_chr_name=None,
                    r1_only=False, out_dir=out_dir)
            else:
                ctl_ta = subsample_ta_se(
                    ctl_ta, ctl_subsample,
                    non_mito=False, mito_chr_name=None,
                    out_dir=out_dir)
        control_name = os.path.basename(strip_ext_ta(ctl_ta))
        out_name = '{}_x_{}'.format(sample_name, control_name)
        if len(out_name) > 200:  # UNIX cannot have len(filename) > 255
            out_name = '{}_x_control'.format(sample_name)

    prefix = os.path.join(out_dir, out_name)
    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix,
        'pval{}'.format(pval_thresh),
        human_readable_number(cap_num_peak))
    npeak_tmp = '{}.tmp'.format(npeak)
    npeak_tmp2 = '{}.tmp2'.format(npeak)

    callpeak_cmd = (
        ' macs2 callpeak -t {ta} {ctl} -f BED -n {name} -g {gensz} -p {pval} '
        '--nomodel --shift {shift} --extsize {extsize} --keep-dup all -B --SPMR'
    ).format(
        ta=ta,
        ctl='-c {}'.format(ctl_ta) if ctl_ta else '',
        name=prefix,
        gensz=gensz,
        pval=pval_thresh,
        shift=0,
        extsize=fraglen,
    )
    run_shell_cmd(callpeak_cmd)

    # Rank peaks by column 8 (descending), rename them by rank, clamp
    # negative coordinates and fill in a missing summit offset (column 10).
    awk_prog = (
        'BEGIN{OFS="\\t"}'
        '{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}'
    )
    sort_cmd = (
        'LC_COLLATE=C sort -k 8gr,8gr "{prefix}"_peaks.narrowPeak | '
        'awk \'{awk}\' > {out}'
    ).format(prefix=prefix, awk=awk_prog, out=npeak_tmp)
    run_shell_cmd(sort_cmd)

    run_shell_cmd('head -n {n} {src} > {dst}'.format(
        n=cap_num_peak, src=npeak_tmp, dst=npeak_tmp2))

    # Clip peaks between 0-chromSize.
    bed_clip(npeak_tmp2, chrsz, npeak)
    rm_f([npeak_tmp, npeak_tmp2])

    # Remove the remaining MACS2 outputs, which all share the "<prefix>_" stem.
    rm_f(['{}_*'.format(prefix)])
    return npeak
def macs2(ta, chrsz, gensz, pval_thresh, smooth_win, cap_num_peak, mem_gb,
          out_dir):
    """Call narrow peaks (with summits) using ``macs2 callpeak``, no control.

    Args:
        ta: Path of the input tagAlign.
        chrsz: Chromosome sizes file handed to ``bed_clip``.
        gensz: Genome size passed as ``-g``.
        pval_thresh: P-value threshold passed as ``-p``.
        smooth_win: Passed as ``--extsize``; ``--shift`` is minus half of it,
            rounded.
        cap_num_peak: Maximum number of peaks kept (``head -n``).
        mem_gb: Memory in GB; converted to bytes and handed to
            ``get_gnu_sort_param`` with ``ratio=0.5`` to build sort options.
        out_dir: Output directory.

    Returns:
        Path of the final narrowPeak file handed to ``bed_clip``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    pval_tag = 'pval{}'.format(pval_thresh)
    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix, pval_tag, human_readable_number(cap_num_peak))

    # Temporary files.
    npeak_tmp = '{}.tmp'.format(npeak)
    npeak_tmp2 = '{}.tmp2'.format(npeak)

    shiftsize = -int(round(float(smooth_win) / 2.0))

    callpeak_cmd = (
        'macs2 callpeak -t {} -f BED -n {} -g {} -p {} '
        '--shift {} --extsize {} '
        '--nomodel -B --SPMR --keep-dup all --call-summits'
    ).format(ta, prefix, gensz, pval_thresh, shiftsize, smooth_win)
    run_shell_cmd(callpeak_cmd)

    # Rank peaks by column 8 (descending), rename them by rank, clamp
    # negative coordinates and fill in a missing summit offset (column 10).
    awk_prog = (
        'BEGIN{OFS="\\t"}'
        '{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}'
    )
    sort_opts = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)
    sort_cmd = (
        'LC_COLLATE=C sort -k 8gr,8gr {} "{}"_peaks.narrowPeak | '
        'awk \'{}\' > {}'
    ).format(sort_opts, prefix, awk_prog, npeak_tmp)
    run_shell_cmd(sort_cmd)

    cap_cmd = 'head -n {} {} > {}'.format(cap_num_peak, npeak_tmp, npeak_tmp2)
    run_shell_cmd(cap_cmd)

    # Clip peaks between 0-chromSize.
    bed_clip(npeak_tmp2, chrsz, npeak)
    rm_f([npeak_tmp, npeak_tmp2])

    # Remove the remaining MACS2 outputs, which all share the "<prefix>_" stem.
    rm_f(['{}_*'.format(prefix)])
    return npeak
def spp(ta, ctl_ta, chrsz, fraglen, cap_num_peak, fdr_thresh, ctl_subsample,
        ctl_paired_end, nth, out_dir):
    """Call peaks with SPP (``run_spp.R``) and clip them to chromosome sizes.

    Args:
        ta: Path of the ChIP tagAlign (passed as ``-c=``).
        ctl_ta: Path of the control tagAlign (passed as ``-i=``).
        chrsz: Chromosome sizes file handed to ``bed_clip``.
        fraglen: Value passed as ``-speak=``.
        cap_num_peak: Value passed as ``-npeak=``; also encoded in the
            output file name.
        fdr_thresh: Value passed as ``-fdr=``.
        ctl_subsample: When truthy, the control is subsampled to this many
            reads before peak calling.
        ctl_paired_end: Selects the paired-end subsampling routine.
        nth: Number of threads; ``-p=<nth>`` is passed only when ``nth >= 2``.
        out_dir: Output directory.

    Returns:
        Path of the final regionPeak file handed to ``bed_clip``.
    """
    sample_name = os.path.basename(strip_ext_ta(ta))

    if ctl_subsample:
        if ctl_paired_end:
            ctl_ta = subsample_ta_pe(
                ctl_ta, ctl_subsample,
                non_mito=False, mito_chr_name=None,
                r1_only=False, out_dir=out_dir)
        else:
            ctl_ta = subsample_ta_se(
                ctl_ta, ctl_subsample,
                non_mito=False, mito_chr_name=None,
                out_dir=out_dir)

    control_name = os.path.basename(strip_ext_ta(ctl_ta))
    out_name = '{}_x_{}'.format(sample_name, control_name)
    if len(out_name) > 200:  # UNIX cannot have filename > 255
        out_name = '{}_x_control'.format(sample_name)

    threads_opt = ''
    if nth >= 2:
        threads_opt = '-p={}'.format(nth)

    prefix = os.path.join(out_dir, out_name)
    rpeak = '{}.{}.regionPeak.gz'.format(
        prefix, human_readable_number(cap_num_peak))
    rpeak_tmp_prefix = '{}.tmp'.format(rpeak)
    rpeak_tmp_gz = '{}.tmp.gz'.format(rpeak)
    rpeak_tmp2 = '{}.tmp2'.format(rpeak)

    spp_cmd = (
        'Rscript --max-ppsize=500000 $(which run_spp.R) -c={chip} -i={ctl} '
        '-npeak={npeak} -odir={odir} -speak={speak} -savr={savr} '
        '-fdr={fdr} -rf {threads}'
    ).format(
        chip=ta,
        ctl=ctl_ta,
        npeak=cap_num_peak,
        odir=os.path.abspath(out_dir),
        speak=fraglen,
        savr=rpeak_tmp_prefix,
        fdr=fdr_thresh,
        threads=threads_opt,
    )
    run_shell_cmd(spp_cmd)

    # If chromosome coordinates come out in scientific notation, convert
    # them to integers; negative start coordinates are clamped to 0.
    awk_prog = (
        'BEGIN{OFS="\\t"}'
        '{if ($2<0) $2=0; '
        'print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}'
    )
    fix_cmd = 'zcat -f {src} | awk \'{awk}\' > {dst}'.format(
        src=rpeak_tmp_gz, awk=awk_prog, dst=rpeak_tmp2)
    run_shell_cmd(fix_cmd)
    rm_f(rpeak_tmp_gz)

    # Clip peaks between 0-chromSize.
    bed_clip(rpeak_tmp2, chrsz, rpeak)
    rm_f(rpeak_tmp2)

    return rpeak
def subsample_ta_pe(ta, subsample, non_mito, mito_chr_name, r1_only, out_dir):
    """Filter and/or randomly subsample a paired-end tagAlign file.

    Consecutive line pairs are first joined into one tab-separated line so
    that both lines of a pair are kept or dropped together; the pairs are
    split back into separate lines (or reduced to the first six columns
    when ``r1_only``) when the output is written.

    Args:
        ta: Path of the input tagAlign (read through ``zcat -f``).
        subsample: Number of joined pairs to keep; subsampling is skipped
            unless this is greater than 0.
        non_mito: When truthy, lines starting with ``mito_chr_name`` are
            dropped and ``no_chrM.`` is added to the output name.
        mito_chr_name: Chromosome name to filter out; only used when
            ``non_mito`` is truthy.
        r1_only: When truthy, only the first six columns of each joined
            pair are written and ``R1.`` is added to the output name.
        out_dir: Directory in which the output file is created.

    Returns:
        Path of the gzipped tagAlign written by the shell pipeline.
    """
    out_prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))

    mito_tag = 'no_chrM.' if non_mito else ''
    read_tag = 'R1.' if r1_only else ''
    if subsample > 0:
        count_tag = '{}.'.format(human_readable_number(subsample))
    else:
        count_tag = ''
    ta_subsampled = '{}.{}{}{}tagAlign.gz'.format(
        out_prefix, mito_tag, read_tag, count_tag)
    ta_tmp = '{}.tagAlign.tmp'.format(out_prefix)

    # Step 1: filter, join line pairs and optionally subsample into ta_tmp.
    stages = ['zcat -f {}']
    if non_mito:
        stages.append('grep -v \'^' + mito_chr_name + '\\b\'')
    stages.append('sed \'N;s/\\n/\\t/\'')
    if subsample > 0:
        # The random stream is keyed by the byte count of the input, so the
        # same input should always yield the same selection of pairs.
        stages.append(
            'shuf -n {} --random-source=<(openssl enc -aes-256-ctr '
            '-pass pass:$(zcat -f {} | wc -c) -nosalt '
            '</dev/zero 2>/dev/null)'
        )
        fmt_args = (ta, subsample, ta, ta_tmp)
    else:
        fmt_args = (ta, ta_tmp)
    join_template = ' | '.join(stages) + ' > {}'
    run_shell_cmd(join_template.format(*fmt_args))

    # Step 2: split the joined pairs back into tagAlign lines and compress.
    if r1_only:
        awk_body = (
            '{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
            '",$1,$2,$3,$4,$5,$6}'
        )
    else:
        awk_body = (
            '{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
            '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
            '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}'
        )
    split_cmd = (
        'cat {} | awk \'BEGIN{{OFS="\\t"}} {}\' | gzip -nc > {}'
    ).format(ta_tmp, awk_body, ta_subsampled)
    run_shell_cmd(split_cmd)

    rm_f(ta_tmp)
    return ta_subsampled
def macs2(ta, chrsz, gensz, pval_thresh, smooth_win, cap_num_peak, out_dir):
    """Call narrow peaks (with summits) using ``macs2 callpeak``, no control.

    Args:
        ta: Path of the input tagAlign.
        chrsz: Chromosome sizes file handed to ``bed_clip``.
        gensz: Genome size passed as ``-g``.
        pval_thresh: P-value threshold passed as ``-p``.
        smooth_win: Passed as ``--extsize``; ``--shift`` is minus half of it,
            rounded.
        cap_num_peak: Maximum number of peaks kept (``head -n``).
        out_dir: Output directory.

    Returns:
        Path of the final narrowPeak file handed to ``bed_clip``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix,
        'pval{}'.format(pval_thresh),
        human_readable_number(cap_num_peak))

    # Temporary files.
    npeak_tmp = '{}.tmp'.format(npeak)
    npeak_tmp2 = '{}.tmp2'.format(npeak)

    shiftsize = -int(round(float(smooth_win) / 2.0))

    # NOTE: the trailing space is part of the original command string.
    callpeak_cmd = (
        'macs2 callpeak -t {ta} -f BED -n {name} -g {gensz} -p {pval} '
        '--shift {shift} --extsize {extsize} '
        '--nomodel -B --SPMR --keep-dup all --call-summits '
    ).format(
        ta=ta,
        name=prefix,
        gensz=gensz,
        pval=pval_thresh,
        shift=shiftsize,
        extsize=smooth_win,
    )
    run_shell_cmd(callpeak_cmd)

    # Rank peaks by column 8 (descending), rename them by rank, clamp
    # negative coordinates and fill in a missing summit offset (column 10).
    awk_prog = (
        'BEGIN{OFS="\\t"}'
        '{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}'
    )
    sort_cmd = (
        'LC_COLLATE=C sort -k 8gr,8gr "{prefix}"_peaks.narrowPeak | '
        'awk \'{awk}\' > {out}'
    ).format(prefix=prefix, awk=awk_prog, out=npeak_tmp)
    run_shell_cmd(sort_cmd)

    run_shell_cmd('head -n {n} {src} > {dst}'.format(
        n=cap_num_peak, src=npeak_tmp, dst=npeak_tmp2))

    # Clip peaks between 0-chromSize.
    bed_clip(npeak_tmp2, chrsz, npeak)
    rm_f([npeak_tmp, npeak_tmp2])

    # Remove the remaining MACS2 outputs, which all share the "<prefix>_" stem.
    rm_f(['{}_*'.format(prefix)])
    return npeak
def macs2(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
          cap_num_peak, ctl_subsample, ctl_paired_end, mem_gb, out_dir):
    """Call narrow peaks with ``macs2 callpeak``, optionally against a control.

    Args:
        ta: Path of the treatment tagAlign.
        ctl_ta: Path of the control tagAlign; falsy to call without control.
        chrsz: Chromosome sizes file handed to ``bed_clip``.
        gensz: Genome size passed as ``-g``.
        pval_thresh: P-value threshold passed as ``-p``.
        shift: Accepted for interface compatibility; ``--shift 0`` is always
            used.
        fraglen: Value passed as ``--extsize``.
        cap_num_peak: Maximum number of peaks kept (``head -n``).
        ctl_subsample: When truthy, the control is subsampled to this many
            reads before peak calling.
        ctl_paired_end: Selects the paired-end subsampling routine.
        mem_gb: Memory in GB; converted to bytes and handed to
            ``get_gnu_sort_param`` with ``ratio=0.5`` to build sort options.
        out_dir: Output directory.

    Returns:
        Path of the final narrowPeak file handed to ``bed_clip``.
    """
    sample_name = os.path.basename(strip_ext_ta(ta))
    if not ctl_ta:
        out_name = sample_name
    else:
        if ctl_subsample:
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(
                    ctl_ta, ctl_subsample,
                    non_mito=False, mito_chr_name=None,
                    r1_only=False, out_dir=out_dir)
            else:
                ctl_ta = subsample_ta_se(
                    ctl_ta, ctl_subsample,
                    non_mito=False, mito_chr_name=None,
                    out_dir=out_dir)
        control_name = os.path.basename(strip_ext_ta(ctl_ta))
        out_name = '{}_x_{}'.format(sample_name, control_name)
        if len(out_name) > 200:  # UNIX cannot have len(filename) > 255
            out_name = '{}_x_control'.format(sample_name)

    prefix = os.path.join(out_dir, out_name)
    pval_tag = 'pval{}'.format(pval_thresh)
    npeak = '{}.{}.{}.narrowPeak.gz'.format(
        prefix, pval_tag, human_readable_number(cap_num_peak))
    npeak_tmp = '{}.tmp'.format(npeak)
    npeak_tmp2 = '{}.tmp2'.format(npeak)

    ctl_param = '-c {}'.format(ctl_ta) if ctl_ta else ''
    callpeak_cmd = (
        ' macs2 callpeak -t {} {} -f BED -n {} -g {} -p {} '
        '--nomodel --shift {} --extsize {} --keep-dup all -B --SPMR'
    ).format(ta, ctl_param, prefix, gensz, pval_thresh, 0, fraglen)
    run_shell_cmd(callpeak_cmd)

    # Rank peaks by column 8 (descending), rename them by rank, clamp
    # negative coordinates and fill in a missing summit offset (column 10).
    awk_prog = (
        'BEGIN{OFS="\\t"}'
        '{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; if ($10==-1) '
        '$10=$2+int(($3-$2+1)/2.0); print $0}'
    )
    sort_opts = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)
    sort_cmd = (
        'LC_COLLATE=C sort -k 8gr,8gr {} "{}_peaks.narrowPeak" | '
        'awk \'{}\' > {}'
    ).format(sort_opts, prefix, awk_prog, npeak_tmp)
    run_shell_cmd(sort_cmd)

    cap_cmd = 'head -n {} {} > {}'.format(cap_num_peak, npeak_tmp, npeak_tmp2)
    run_shell_cmd(cap_cmd)

    # Clip peaks between 0-chromSize.
    bed_clip(npeak_tmp2, chrsz, npeak)
    rm_f([npeak_tmp, npeak_tmp2])

    # Remove the remaining MACS2 outputs, which all share the "<prefix>_" stem.
    rm_f(['{}_*'.format(prefix)])
    return npeak