def blacklist_filter(peak, blacklist, keep_irregular_chr, out_dir):
    """Remove peaks overlapping a blacklist and write a gzipped result.

    Args:
        peak: Path to the peak file (read with ``zcat -f`` / ``gunzip``).
        blacklist: Path to the blacklist file, or ``''`` for none.
        keep_irregular_chr: When false, only lines matching the
            ``chr[\\dXY]+\\b`` grep pattern are kept.
        out_dir: Directory receiving the output and temporary files.

    Returns:
        Path of the filtered file, ``<prefix>.bfilt.<ext>.gz``.
    """
    out_prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    filtered = '{}.bfilt.{}.gz'.format(out_prefix, get_ext(peak))

    nothing_to_filter = (
        get_num_lines(peak) == 0
        or blacklist == ''
        or get_num_lines(blacklist) == 0
    )
    if nothing_to_filter:
        # Nothing to intersect: just re-compress the peaks as they are.
        run_shell_cmd('zcat -f {} | gzip -nc > {}'.format(peak, filtered))
        return filtered

    # bedtools misbehaves when gzipped files are given for -a and -b,
    # so both inputs are decompressed to temporary files first.
    peak_plain = gunzip(peak, 'tmp1', out_dir)
    blacklist_plain = gunzip(blacklist, 'tmp2', out_dir)

    stages = [
        'bedtools intersect -nonamecheck -v -a {} -b {}'.format(
            peak_plain, blacklist_plain),
        # cap column 5 at 1000
        'awk \'BEGIN{OFS="\\t"} {if ($5>1000) $5=1000; print $0}\'',
    ]
    if not keep_irregular_chr:
        stages.append('grep -P \'chr[\\dXY]+\\b\'')
    stages.append('gzip -nc > {}'.format(filtered))

    run_shell_cmd(' | '.join(stages))
    rm_f([peak_plain, blacklist_plain])
    return filtered
def mark_dup_picard(bam, out_dir, java_heap=None):
    """Mark duplicates in a BAM with Picard MarkDuplicates.

    Shared by the single-ended and paired-end code paths.

    Args:
        bam: Input BAM path.
        out_dir: Directory receiving the outputs.
        java_heap: Value appended to ``-Xmx``; ``-Xmx4G`` is used when
            this is None.

    Returns:
        Tuple ``(dupmark_bam, dup_qc)`` of output paths.
    """
    base = os.path.basename(strip_ext_bam(bam))
    # Drop the 'filt' extension appended by the previous step.
    out_prefix = strip_ext(os.path.join(out_dir, base), 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(out_prefix)
    dup_qc = '{}.dup.qc'.format(out_prefix)

    heap_opt = '-Xmx4G' if java_heap is None else '-Xmx{}'.format(java_heap)

    argv = [
        'java', heap_opt, '-XX:ParallelGCThreads=1',
        '-jar', '{}'.format(locate_picard()), 'MarkDuplicates',
        'INPUT={}'.format(bam),
        'OUTPUT={}'.format(dupmark_bam),
        'METRICS_FILE={}'.format(dup_qc),
        'VALIDATION_STRINGENCY=LENIENT',
        'USE_JDK_DEFLATER=TRUE',
        'USE_JDK_INFLATER=TRUE',
        'ASSUME_SORTED=TRUE',
        'REMOVE_DUPLICATES=FALSE',
    ]
    # The command line ends with a single trailing space.
    run_shell_cmd(' '.join(argv) + ' ')
    return dupmark_bam, dup_qc
def _get_mapped_reads(samstat_dict, label):
    """Return the mapped-read count from a parsed samstats QC dict.

    Both the 'mapped' and the 'mapped_reads' key spellings are accepted
    (kept for backward compatibility), checked in that order.

    Raises:
        KeyError: If neither key is present in ``samstat_dict``.
    """
    for key in ('mapped', 'mapped_reads'):
        if key in samstat_dict:
            return samstat_dict[key]
    raise KeyError(
        "Neither 'mapped' nor 'mapped_reads' found in {} samstats QC."
        .format(label))


def frac_mito(non_mito_samstat, mito_samstat, out_dir):
    """Write a QC file with the fraction of mitochondrial reads.

    Args:
        non_mito_samstat: Path to the non-mito samstats QC file
            (``*.non_mito.samstats.qc``), parsed with
            ``parse_flagstat_qc``.
        mito_samstat: Path to the mito samstats QC file.
        out_dir: Directory receiving the output.

    Returns:
        Path of the written ``<prefix>.frac_mito.qc`` file, which holds
        three tab-separated rows: non_mito_reads, mito_reads and
        frac_mito_reads.

    Raises:
        KeyError: If either parsed QC lacks a mapped-read count.
        ZeroDivisionError: If both read counts are zero.
    """
    prefix = os.path.join(
        out_dir,
        os.path.basename(strip_ext(non_mito_samstat,
                                   'non_mito.samstats.qc')))
    frac_mito_qc = '{}.frac_mito.qc'.format(prefix)

    non_mito_samstat_dict = parse_flagstat_qc(non_mito_samstat)
    mito_samstat_dict = parse_flagstat_qc(mito_samstat)

    # Look up each dict independently so that a missing key in one file
    # can never fall back to the key name chosen for the other one.
    Rn = _get_mapped_reads(non_mito_samstat_dict, 'non-mito')
    Rm = _get_mapped_reads(mito_samstat_dict, 'mito')

    frac = float(Rm) / float(Rn + Rm)
    with open(frac_mito_qc, 'w') as fp:
        fp.write('non_mito_reads\t{}\n'.format(Rn))
        fp.write('mito_reads\t{}\n'.format(Rm))
        fp.write('frac_mito_reads\t{}\n'.format(frac))

    return frac_mito_qc
def rm_dup_pe(dupmark_bam, nth, out_dir):
    """Filter a duplicate-marked paired-end BAM with samtools view.

    Reads carrying any flag in 1804 are dropped (``-F 1804``) and only
    reads with flag 2 set are kept (``-f 2``).

    Args:
        dupmark_bam: Duplicate-marked input BAM path.
        nth: Value passed to samtools ``-@``.
        out_dir: Directory receiving the output.

    Returns:
        Path of the written ``<prefix>.nodup.bam`` file.
    """
    base = os.path.basename(strip_ext_bam(dupmark_bam))
    # Drop the 'dupmark' extension appended by the previous step.
    out_prefix = strip_ext(os.path.join(out_dir, base), 'dupmark')
    nodup_bam = '{}.nodup.bam'.format(out_prefix)

    run_shell_cmd(
        'samtools view -@ {threads} -F 1804 -f 2 -b {src} > {dst}'.format(
            threads=nth,
            src=dupmark_bam,
            dst=nodup_bam,
        ))
    return nodup_bam
def starch_to_bed_gz(starch, out_dir):
    """Decompress a starch archive into a gzipped BED file.

    Required softwares:
        BEDOPS (tested with v2.4.39): unstarch

    Args:
        starch: Path to the input starch file.
        out_dir: Directory receiving the output.

    Returns:
        Path of the written ``<prefix>.bed.gz`` file.
    """
    base = os.path.basename(strip_ext(starch))
    bed_gz = '{}.bed.gz'.format(os.path.join(out_dir, base))

    cmd = 'unstarch {} | gzip -nc > {}'.format(starch, bed_gz)
    run_shell_cmd(cmd)
    return bed_gz
def mark_dup_sambamba(bam, nth, out_dir):
    """Mark duplicates in a BAM with ``sambamba markdup``.

    Shared by the single-ended and paired-end code paths. The stderr
    output of sambamba is redirected into the QC file.

    Args:
        bam: Input BAM path.
        nth: Value passed to sambamba ``-t``.
        out_dir: Directory receiving the outputs.

    Returns:
        Tuple ``(dupmark_bam, dup_qc)`` of output paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    prefix = strip_ext(prefix, 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(prefix)
    # The template must be filled in with the prefix; leaving it
    # unformatted sends stderr to a literal '{}.dup.qc' file in the
    # working directory and returns that bogus path to the caller.
    dup_qc = '{}.dup.qc'.format(prefix)

    cmd = 'sambamba markdup -t {} --hash-table-size=17592186044416 '
    cmd += '--overflow-list-size=20000000 '
    cmd += '--io-buffer-size=256 {} {} 2> {}'
    cmd = cmd.format(nth, bam, dupmark_bam, dup_qc)
    run_shell_cmd(cmd)
    return dupmark_bam, dup_qc
def rm_dup_pe(dupmark_bam, nth, out_dir):
    """Filter a duplicate-marked paired-end BAM with samtools view.

    Reads carrying any flag in 1804 are dropped (``-F 1804``) and only
    reads with flag 2 set are kept (``-f 2``). Extra samtools options
    come from ``get_samtools_res_param('view', nth=nth)``.

    Args:
        dupmark_bam: Duplicate-marked input BAM path.
        nth: Thread count forwarded to ``get_samtools_res_param``.
        out_dir: Directory receiving the output.

    Returns:
        Path of the written ``<prefix>.nodup.bam`` file.
    """
    base = os.path.basename(strip_ext_bam(dupmark_bam))
    # Drop the 'dupmark' extension appended by the previous step.
    out_prefix = strip_ext(os.path.join(out_dir, base), 'dupmark')
    nodup_bam = '{}.nodup.bam'.format(out_prefix)

    res_param = get_samtools_res_param('view', nth=nth)
    cmd = 'samtools view -F 1804 -f 2 -b {} {} > {}'.format(
        dupmark_bam, res_param, nodup_bam)
    run_shell_cmd(cmd)
    return nodup_bam
def pbc_qc_se(bam, mito_chr_name, out_dir):
    """Compute library-complexity (PBC) QC for a single-ended BAM.

    The BAM is converted to BED, reduced to (chrom, start, end, strand),
    lines starting with ``mito_chr_name`` followed by whitespace are
    dropped, and identical positions are counted with ``sort | uniq -c``.
    The final awk writes one tab-separated line:
    mt, m0, m1, m2, m0/mt, m1/m0, m1/m2
    where mt is the sum of counts, m0 the number of distinct positions,
    and m1 / m2 the number of positions seen exactly once / twice
    (m1/m2 is -1.0 when m2 is 0).

    Args:
        bam: Input BAM path.
        mito_chr_name: Chromosome name to exclude.
        out_dir: Directory receiving the output.

    Returns:
        Path of the written ``<prefix>.lib_complexity.qc`` file.
    """
    base = os.path.basename(strip_ext_bam(bam))
    # Drop the 'dupmark' extension appended by the previous step.
    out_prefix = strip_ext(os.path.join(out_dir, base), 'dupmark')
    pbc_qc = '{}.lib_complexity.qc'.format(out_prefix)

    # awk programs are kept as plain strings (never passed through
    # str.format), so their braces are written un-doubled.
    to_positions = 'awk \'BEGIN{OFS="\\t"}{print $1,$2,$3,$6}\''
    tally = (
        'awk \'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} '
        '($1==2){m2=m2+1} {m0=m0+1} '
        '{mt=mt+$1} END{m1_m2=-1.0; '
        'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
        'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
        'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",'
        'mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}\''
    )

    stages = [
        'bedtools bamtobed -i {}'.format(bam),
        to_positions,
        'grep -v "^{}\\s"'.format(mito_chr_name),
        'sort',
        'uniq -c',
        '{} > {}'.format(tally, pbc_qc),
    ]
    run_shell_cmd(' | '.join(stages))
    return pbc_qc
def frip_shifted(ta, peak, chrsz, fraglen, out_dir):
    """Write the fraction of shifted reads that overlap peaks.

    Each read in ``ta`` is shifted strand-aware with ``bedtools slop -s``
    (``-l -half_fraglen -r half_fraglen``), records with negative or
    inverted coordinates are discarded, and the remaining reads that
    overlap ``peak`` are counted. The count is divided by the number of
    lines in ``ta``. An empty peak file gives a numerator of 0.0.

    Args:
        ta: Path to the reads file passed to ``bedtools slop -i``.
        peak: Path to the peak file.
        chrsz: Chromosome sizes file passed to ``bedtools slop -g``.
        fraglen: Fragment length; ``(fraglen + 1) // 2`` is the shift.
        out_dir: Directory receiving the output and temporary file.

    Returns:
        Path of the written ``<prefix>.frip.qc`` file.

    Raises:
        ZeroDivisionError: If ``ta`` has no lines.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)
    # Floor division keeps the shift a whole number of base pairs; a
    # plain '/' is true division on Python 3 and would hand a fractional
    # value such as 100.5 to bedtools.
    half_fraglen = (fraglen + 1) // 2

    if get_num_lines(peak) == 0:
        val1 = 0.0
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp2 = gunzip(peak, 'tmp2', out_dir)
        try:
            cmd = 'bedtools slop -i {} -g {} '
            cmd += '-s -l {} -r {} | '
            cmd += 'awk \'{{if ($2>=0 && $3>=0 && $2<=$3) print $0}}\' | '
            cmd += 'bedtools intersect -nonamecheck -a stdin -b {} '
            cmd += '-wa -u | wc -l'
            cmd = cmd.format(
                ta,
                chrsz,
                -half_fraglen,
                half_fraglen,
                tmp2)  # peak
            val1 = run_shell_cmd(cmd)
        finally:
            # Remove the decompressed peak file even when the shell
            # command fails.
            rm_f(tmp2)

    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1) / float(val2)))
    return frip_qc
def frip(ta, peak, out_dir):
    """Write the fraction of reads that overlap peaks.

    Reads in ``ta`` overlapping ``peak`` are counted with
    ``bedtools intersect -wa -u | wc -l`` and divided by the number of
    lines in ``ta``. An empty peak file gives a numerator of 0.0.

    Args:
        ta: Path to the reads file.
        peak: Path to the peak file.
        out_dir: Directory receiving the output and temporary files.

    Returns:
        Path of the written ``<prefix>.frip.qc`` file.
    """
    base = os.path.basename(strip_ext(peak))
    frip_qc = '{}.frip.qc'.format(os.path.join(out_dir, base))

    scratch = []
    reads_in_peaks = 0.0
    if get_num_lines(peak) != 0:
        # bedtools misbehaves when gzipped files are given for -a and
        # -b, so both inputs are decompressed to temporary files first.
        ta_plain = gunzip(ta, 'tmp1', out_dir)
        peak_plain = gunzip(peak, 'tmp2', out_dir)
        reads_in_peaks = run_shell_cmd(
            'bedtools intersect -nonamecheck -a {} -b {} -wa -u | wc -l'
            .format(ta_plain, peak_plain))
        scratch = [ta_plain, peak_plain]

    total_reads = get_num_lines(ta)
    fraction = float(reads_in_peaks) / float(total_reads)
    write_txt(frip_qc, str(fraction))
    rm_f(scratch)
    return frip_qc
def mark_dup_picard(bam, out_dir, java_heap=None):
    """Mark duplicates in a BAM with Picard MarkDuplicates.

    Shared by the single-ended and paired-end code paths.

    Args:
        bam: Input BAM path.
        out_dir: Directory receiving the outputs.
        java_heap: Optional value appended to ``-Xmx`` (for example
            ``'8G'``). When None the previous fixed setting ``-Xmx4G``
            is used, so existing two-argument callers are unaffected.

    Returns:
        Tuple ``(dupmark_bam, dup_qc)`` of output paths.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    prefix = strip_ext(prefix, 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(prefix)
    dup_qc = '{}.dup.qc'.format(prefix)

    if java_heap is None:
        java_heap_param = '-Xmx4G'
    else:
        java_heap_param = '-Xmx{}'.format(java_heap)

    # The Picard path is passed as a format value instead of being
    # concatenated into the template, so braces in the path cannot be
    # misread as replacement fields.
    cmd = (
        'java {java_heap_param} -XX:ParallelGCThreads=1 -jar '
        '{picard} MarkDuplicates '
        'INPUT={bam} OUTPUT={dupmark_bam} '
        'METRICS_FILE={dup_qc} VALIDATION_STRINGENCY=LENIENT '
        'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE '
        'ASSUME_SORTED=true REMOVE_DUPLICATES=false'
    ).format(
        java_heap_param=java_heap_param,
        picard=locate_picard(),
        bam=bam,
        dupmark_bam=dupmark_bam,
        dup_qc=dup_qc,
    )
    run_shell_cmd(cmd)
    return dupmark_bam, dup_qc
def pbc_qc_se(bam, mito_chr_name, mem_gb, out_dir):
    """Compute library-complexity (PBC) QC for a single-ended BAM.

    The BAM is converted to BED, reduced to (chrom, start, end, strand),
    lines starting with ``mito_chr_name`` followed by whitespace are
    dropped, and identical positions are counted with ``sort | uniq -c``.
    The final awk writes one tab-separated line:
    mt, m0, m1, m2, m0/mt, m1/m0, m1/m2
    where mt is the sum of counts, m0 the number of distinct positions,
    and m1 / m2 the number of positions seen exactly once / twice
    (m1/m2 is -1.0 when m2 is 0).

    Args:
        bam: Input BAM path.
        mito_chr_name: Chromosome name to exclude.
        mem_gb: Memory in GB; ``mem_gb * 1024**3`` is handed to
            ``get_gnu_sort_param`` with ``ratio=0.5`` to build the
            options for ``sort``.
        out_dir: Directory receiving the output.

    Returns:
        Path of the written ``<prefix>.lib_complexity.qc`` file.
    """
    base = os.path.basename(strip_ext_bam(bam))
    # Drop the 'dupmark' extension appended by the previous step.
    out_prefix = strip_ext(os.path.join(out_dir, base), 'dupmark')
    pbc_qc = '{}.lib_complexity.qc'.format(out_prefix)

    sort_param = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)

    # awk programs are kept as plain strings (never passed through
    # str.format), so their braces are written un-doubled.
    to_positions = 'awk \'BEGIN{OFS="\\t"}{print $1,$2,$3,$6}\''
    tally = (
        'awk \'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} '
        '($1==2){m2=m2+1} {m0=m0+1} '
        '{mt=mt+$1} END{m1_m2=-1.0; '
        'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
        'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
        'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",'
        'mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}\''
    )

    stages = [
        'bedtools bamtobed -i {}'.format(bam),
        to_positions,
        'grep -v "^{}\\s"'.format(mito_chr_name),
        'sort {}'.format(sort_param),
        'uniq -c',
        '{} > {}'.format(tally, pbc_qc),
    ]
    run_shell_cmd(' | '.join(stages))
    return pbc_qc
# peak_to_bigbed(peak, peak_type, chrsz, out_dir) -> path of the bigBed file
#
# Converts a peak file into '<prefix>.<peak_type>.bb', where <prefix> is
# out_dir joined with the basename of strip_ext(peak).
#
# Steps, in order:
#   1. Pick an autoSql (.as) table definition and the matching bedToBigBed
#      options from peak_type (compared case-insensitively):
#        'narrowpeak' / 'regionpeak' -> '-type=bed6+4'
#        'broadpeak'                 -> '-type=bed6+3'
#        'gappedpeak'                -> '-type=bed12+3'
#      Any other value raises Exception('Unsupported peak file type ...').
#   2. Write the table definition to the temporary file '<prefix>.as'.
#   3. Copy chrsz to '<prefix>.chrsz.tmp' with `cat`.
#   4. Decompress the peaks with `zcat -f`, sort them by column 1 and then
#      numerically by column 2 under LC_COLLATE=C, and clamp column 5 into
#      the 0..1000 range, writing '<prefix>.bb.tmp'.
#   5. Run bedClip against the chromosome sizes -> '<prefix>.bb.tmp2'.
#   6. Run bedToBigBed to produce the final bigBed file.
#   7. Remove the four temporary files with rm_f and return the bigBed path.
#
# The temporary files are only removed when every step succeeds; there is
# no cleanup on failure.
#
# NOTE(review): this definition is collapsed onto a few very long physical
# lines, so the inline '#' comments below swallow the statements that follow
# them and the whitespace inside the triple-quoted .as templates is not the
# original layout. Restore the original line breaks before relying on this
# text; the code itself is left untouched here for that reason.
def peak_to_bigbed(peak, peak_type, chrsz, out_dir): prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak))) bigbed = '{}.{}.bb'.format(prefix, peak_type) as_file = '{}.as'.format(prefix) chrsz_tmp = '{}.chrsz.tmp'.format(prefix) bigbed_tmp = '{}.bb.tmp'.format(prefix) bigbed_tmp2 = '{}.bb.tmp2'.format(prefix) if peak_type.lower() == 'narrowpeak' or peak_type.lower() == 'regionpeak': as_file_contents = '''table narrowPeak "BED6+4 Peaks of signal enrichment based on pooled, normalized (interpreted) data." ( string chrom; "Reference sequence chromosome or scaffold" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Name given to a region (preferably unique). Use . if no name is assigned" uint score; "Indicates how dark the peak will be displayed in the browser (0-1000) " char[1] strand; "+ or - or . for unknown" float signalValue; "Measurement of average enrichment for the region" float pValue; "Statistical significance of signal value (-log10). Set to -1 if not used." float qValue; "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used." int peak; "Point-source called for this peak; 0-based offset from chromStart. Set to -1 if no point-source called." ) ''' bed_param = '-type=bed6+4 -as={}'.format(as_file) elif peak_type.lower() == 'broadpeak': as_file_contents = '''table broadPeak "BED6+3 Peaks of signal enrichment based on pooled, normalized (interpreted) data." ( string chrom; "Reference sequence chromosome or scaffold" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Name given to a region (preferably unique). Use . if no name is assigned." uint score; "Indicates how dark the peak will be displayed in the browser (0-1000)" char[1] strand; "+ or - or . for unknown" float signalValue; "Measurement of average enrichment for the region" float pValue; "Statistical significance of signal value (-log10). 
Set to -1 if not used." float qValue; "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used." ) ''' bed_param = '-type=bed6+3 -as={}'.format(as_file) elif peak_type.lower() == 'gappedpeak': as_file_contents = '''table gappedPeak "This format is used to provide called regions of signal enrichment based on pooled, normalized (interpreted) data where the regions may be spliced or incorporate gaps in the genomic sequence. It is a BED12+3 format." ( string chrom; "Reference sequence chromosome or scaffold" uint chromStart; "Pseudogene alignment start position" uint chromEnd; "Pseudogene alignment end position" string name; "Name of pseudogene" uint score; "Score of pseudogene with gene (0-1000)" char[1] strand; "+ or - or . for unknown" uint thickStart; "Start of where display should be thick (start codon)" uint thickEnd; "End of where display should be thick (stop codon)" uint reserved; "Always zero for now" int blockCount; "Number of blocks" int[blockCount] blockSizes; "Comma separated list of block sizes" int[blockCount] chromStarts; "Start positions relative to chromStart" float signalValue; "Measurement of average enrichment for the region" float pValue; "Statistical significance of signal value (-log10). Set to -1 if not used." float qValue; "Statistical significance with multiple-test correction applied (FDR). Set to -1 if not used." 
) ''' bed_param = '-type=bed12+3 -as={}'.format(as_file) else: raise Exception('Unsupported peak file type {}!'.format(peak_type)) # create temporary .as file with open(as_file, 'w') as fp: fp.write(as_file_contents) cmd1 = "cat {} > {}".format(chrsz, chrsz_tmp) run_shell_cmd(cmd1) cmd2 = "zcat -f {} | LC_COLLATE=C sort -k1,1 -k2,2n | " cmd2 += 'awk \'BEGIN{{OFS="\\t"}} {{if ($5>1000) $5=1000; ' cmd2 += 'if ($5<0) $5=0; print $0}}\' > {}' cmd2 = cmd2.format(peak, bigbed_tmp) run_shell_cmd(cmd2) cmd3 = "bedClip {} {} {}".format(bigbed_tmp, chrsz_tmp, bigbed_tmp2) run_shell_cmd(cmd3) cmd4 = "bedToBigBed {} {} {} {}".format(bed_param, bigbed_tmp2, chrsz_tmp, bigbed) run_shell_cmd(cmd4) # remove temporary files rm_f([as_file, chrsz_tmp, bigbed_tmp, bigbed_tmp2]) return bigbed