def blacklist_filter(peak, blacklist, keep_irregular_chr, out_dir):
    """Drop peaks that overlap a blacklist and clamp column 5 to 1000.

    If the peak file has no lines, or no usable blacklist is given (empty
    string or a file with no lines), the peak file is only re-compressed
    to the output path.  Otherwise the pipeline is:

    1. ``bedtools intersect -v`` keeps peaks with no blacklist overlap,
    2. awk caps column 5 at 1000 (presumably the BED score column),
    3. unless ``keep_irregular_chr`` is true, grep keeps only lines that
       contain ``chr`` followed by digits/X/Y and a word boundary,
    4. the result is gzip-compressed.

    Args:
        peak: Path of the (optionally gzipped) peak file.
        blacklist: Path of the blacklist file, or '' for none.
        keep_irregular_chr: If false, apply the chromosome-name grep.
        out_dir: Directory for the output and the temporary files.

    Returns:
        Path of the output file, ``<out_dir>/<peak stem>.bfilt.<ext>.gz``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    filtered = '{}.bfilt.{}.gz'.format(prefix, get_ext(peak))

    nothing_to_filter = (
        get_num_lines(peak) == 0
        or blacklist == ''
        or get_num_lines(blacklist) == 0
    )
    if nothing_to_filter:
        run_shell_cmd('zcat -f {} | gzip -nc > {}'.format(peak, filtered))
        return filtered

    # Track temp files as they are created so that a failure in gunzip or
    # in the shell pipeline cannot leave them behind in out_dir.
    tmp_files = []
    try:
        # due to bedtools bug when .gz is given for -a and -b
        tmp_peak = gunzip(peak, 'tmp1', out_dir)
        tmp_files.append(tmp_peak)
        tmp_blacklist = gunzip(blacklist, 'tmp2', out_dir)
        tmp_files.append(tmp_blacklist)

        cmd = 'bedtools intersect -nonamecheck -v -a {} -b {} | '
        cmd += 'awk \'BEGIN{{OFS="\\t"}} '
        cmd += '{{if ($5>1000) $5=1000; print $0}}\' | '
        if not keep_irregular_chr:
            cmd += 'grep -P \'chr[\\dXY]+\\b\' | '
        cmd += 'gzip -nc > {}'
        run_shell_cmd(cmd.format(tmp_peak, tmp_blacklist, filtered))
    finally:
        if tmp_files:
            rm_f(tmp_files)
    return filtered
def naive_overlap(basename_prefix, peak1, peak2, peak_pooled, peak_type,
                  nonamecheck, out_dir):
    """Keep pooled peaks that overlap peaks in both peak1 and peak2.

    A pooled peak counts as overlapping when the overlap length divided
    by the width of either peak of the pair is >= 0.5.  The pooled peaks
    are first intersected with ``peak1``, filtered, de-duplicated, and the
    survivors are then put through the same steps against ``peak2``.

    Args:
        basename_prefix: Basename (without directory) of the output file.
        peak1: Path of the first peak file.
        peak2: Path of the second peak file.
        peak_pooled: Path of the pooled peak file.
        peak_type: One of narrowPeak, regionPeak, broadPeak or gappedPeak
            (case-insensitive); selects the column layout used by awk/cut.
        nonamecheck: If true, pass ``-nonamecheck`` to intersectBed.
        out_dir: Directory for the output and the temporary files.

    Returns:
        Path of the output,
        ``<out_dir>/<basename_prefix>.overlap.<peak_type>.gz``.

    Raises:
        ValueError: If ``peak_type`` is not one of the supported types.
    """
    prefix = os.path.join(out_dir, basename_prefix) + '.overlap'
    overlap_peak = '{}.{}.gz'.format(prefix, peak_type)
    nonamecheck_param = '-nonamecheck' if nonamecheck else ''

    # Per peak type: (awk overlap filter, columns of the pooled peak that
    # are kept by cut).  The awk column numbers index the concatenated
    # "-wo" record: pooled peak columns, other peak columns, overlap size.
    narrow_like = (
        '{s1=$3-$2; s2=$13-$12; '
        'if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}',
        '1-10',
    )
    params_by_type = {
        'narrowpeak': narrow_like,
        'regionpeak': narrow_like,
        'broadpeak': (
            '{s1=$3-$2; s2=$12-$11; '
            'if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}',
            '1-9',
        ),
        'gappedpeak': (
            '{s1=$3-$2; s2=$18-$17; '
            'if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}',
            '1-15',
        ),
    }
    params = params_by_type.get(peak_type.lower())
    if params is None:
        raise ValueError('Unsupported peak_type.')
    awk_param, cut_param = params

    # Find pooled peaks that overlap peak1 and peak2
    # where overlap is defined as the fractional overlap
    # wrt any one of the overlapping peak pairs >= 0.5
    cmd1 = (
        'intersectBed {} -wo '
        '-a {} -b {} | '
        'awk \'BEGIN{{FS="\\t";OFS="\\t"}} {}\' | '
        'cut -f {} | sort | uniq | '
        'intersectBed {} -wo '
        '-a stdin -b {} | '
        'awk \'BEGIN{{FS="\\t";OFS="\\t"}} {}\' | '
        'cut -f {} | sort | uniq | gzip -nc > {}'
    )

    # Track temp files as they are created so that a failure in gunzip or
    # in the shell pipeline cannot leave them behind in out_dir.
    tmp_files = []
    try:
        # due to bedtools bug when .gz is given for -a and -b
        tmp1 = gunzip(peak1, 'tmp1', out_dir)
        tmp_files.append(tmp1)
        tmp2 = gunzip(peak2, 'tmp2', out_dir)
        tmp_files.append(tmp2)
        tmp_pooled = gunzip(peak_pooled, 'tmp_pooled', out_dir)
        tmp_files.append(tmp_pooled)

        run_shell_cmd(cmd1.format(
            nonamecheck_param,
            tmp_pooled,  # peak_pooled
            tmp1,  # peak1
            awk_param,
            cut_param,
            nonamecheck_param,
            tmp2,  # peak2
            awk_param,
            cut_param,
            overlap_peak))
    finally:
        if tmp_files:
            rm_f(tmp_files)
    return overlap_peak
def frip(ta, peak, out_dir):
    """Write the fraction of ``ta`` records that overlap a peak to a QC file.

    The numerator is the number of ``ta`` records reported by
    ``bedtools intersect -wa -u`` (each record at most once) against
    ``peak``; it is 0.0 when the peak file has no lines.  The denominator
    is the number of lines in ``ta``.

    Args:
        ta: Path of the read file (presumably a tagAlign; may be gzipped).
        peak: Path of the peak file (may be gzipped).
        out_dir: Directory for the QC file and the temporary files.

    Returns:
        Path of the QC file, ``<out_dir>/<peak stem>.frip.qc``, whose
        content is the fraction as text.

    Raises:
        ZeroDivisionError: If ``get_num_lines(ta)`` is 0.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)

    if get_num_lines(peak) == 0:
        val1 = 0.0
    else:
        # Track temp files as they are created and remove them as soon as
        # the count is known, even when gunzip or bedtools fails.
        tmp_files = []
        try:
            # due to bedtools bug when .gz is given for -a and -b
            tmp_ta = gunzip(ta, 'tmp1', out_dir)
            tmp_files.append(tmp_ta)
            tmp_peak = gunzip(peak, 'tmp2', out_dir)
            tmp_files.append(tmp_peak)

            cmd = ('bedtools intersect -nonamecheck '
                   '-a {} -b {} -wa -u | wc -l')
            val1 = run_shell_cmd(cmd.format(tmp_ta, tmp_peak))
        finally:
            if tmp_files:
                rm_f(tmp_files)

    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1) / float(val2)))
    return frip_qc
def blacklist_filter_bam(bam, blacklist, out_dir):
    """Remove alignments that overlap blacklisted regions from a BAM file.

    When no usable blacklist is given (empty string or a file with no
    lines) the BAM is copied unchanged to the output path.  Otherwise
    ``bedtools intersect -v -abam`` writes the alignments that have no
    overlap with the blacklist.

    Args:
        bam: Path of the input BAM file.
        blacklist: Path of the blacklist file, or '' for none.
        out_dir: Directory for the output and the temporary file.

    Returns:
        Path of the output file, ``<out_dir>/<bam stem>.bfilt.bam``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filtered = '{}.bfilt.bam'.format(prefix)

    if blacklist == '' or get_num_lines(blacklist) == 0:
        # Copy byte-for-byte.  BAM is BGZF-compressed; piping it through
        # "zcat | gzip" would produce a plain gzip stream that tools can
        # no longer index or access randomly.
        run_shell_cmd('cp -f {} {}'.format(bam, filtered))
        return filtered

    # due to bedtools bug when .gz is given for -a and -b
    tmp_blacklist = gunzip(blacklist, 'tmp2', out_dir)
    try:
        cmd = 'bedtools intersect -nonamecheck -v -abam {} -b {} > {}'
        run_shell_cmd(cmd.format(bam, tmp_blacklist, filtered))
    finally:
        # Remove the temp blacklist even when bedtools fails.
        rm_f([tmp_blacklist])
    return filtered
def frip_shifted(ta, peak, chrsz, fraglen, out_dir):
    """Like a FRiP count, but on reads adjusted by half the fragment length.

    Each ``ta`` record is first passed through ``bedtools slop -s`` with
    ``-l -half_fraglen -r half_fraglen``.  awk then drops records whose
    start or end is negative or whose start exceeds its end.  The
    remaining records that overlap ``peak`` are counted with
    ``bedtools intersect -wa -u`` and divided by the number of lines in
    ``ta``.  The count is 0.0 when the peak file has no lines.

    Args:
        ta: Path of the read file (presumably a tagAlign; may be gzipped).
        peak: Path of the peak file (may be gzipped).
        chrsz: Path of the genome file passed to ``bedtools slop -g``.
        fraglen: Fragment length; ``(fraglen + 1) // 2`` is used as the
            slop amount.
        out_dir: Directory for the QC file and the temporary file.

    Returns:
        Path of the QC file, ``<out_dir>/<peak stem>.frip.qc``, whose
        content is the fraction as text.

    Raises:
        ZeroDivisionError: If ``get_num_lines(ta)`` is 0.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)
    # Floor division keeps the slop amount a whole number of bases; true
    # division would put values such as 100.5 into the shell command on
    # Python 3.
    half_fraglen = (fraglen + 1) // 2

    if get_num_lines(peak) == 0:
        val1 = 0.0
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp_peak = gunzip(peak, 'tmp2', out_dir)
        try:
            cmd = 'bedtools slop -i {} -g {} '
            cmd += '-s -l {} -r {} | '
            cmd += 'awk \'{{if ($2>=0 && $3>=0 && $2<=$3) print $0}}\' | '
            cmd += 'bedtools intersect -nonamecheck -a stdin -b {} '
            cmd += '-wa -u | wc -l'
            val1 = run_shell_cmd(cmd.format(
                ta,
                chrsz,
                -half_fraglen,
                half_fraglen,
                tmp_peak))  # peak
        finally:
            # Remove the temp peak file even when the pipeline fails.
            rm_f([tmp_peak])

    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1) / float(val2)))
    return frip_qc