def blacklist_filter(peak, blacklist, keep_irregular_chr, out_dir):
    """Remove blacklisted regions from a peak file.

    The output is written gzip-compressed to
    ``<out_dir>/<basename of peak without extension>.bfilt.<ext>.gz``.

    If the peak file has no lines, ``blacklist`` is an empty string, or
    the blacklist has no lines, the peak file is simply re-compressed.
    Otherwise peaks overlapping the blacklist are dropped with
    ``bedtools intersect -v``, column 5 is capped at 1000 and, unless
    ``keep_irregular_chr`` is truthy, lines are additionally filtered
    through a chromosome-name ``grep``.

    Returns:
        Path of the filtered, gzip-compressed peak file.
    """
    out_prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    filtered = '{}.bfilt.{}.gz'.format(out_prefix, get_ext(peak))

    nothing_to_filter = (
        get_num_lines(peak) == 0
        or blacklist == ''
        or get_num_lines(blacklist) == 0
    )
    if nothing_to_filter:
        run_shell_cmd('zcat -f {} | gzip -nc > {}'.format(peak, filtered))
        return filtered

    # work on uncompressed copies: bedtools has a bug when .gz is given
    # for -a and -b
    peak_tmp = gunzip(peak, 'tmp1', out_dir)
    blacklist_tmp = gunzip(blacklist, 'tmp2', out_dir)

    stages = [
        'bedtools intersect -nonamecheck -v -a {} -b {}'.format(
            peak_tmp, blacklist_tmp),
        'awk \'BEGIN{OFS="\\t"} {if ($5>1000) $5=1000; print $0}\'',
    ]
    if not keep_irregular_chr:
        stages.append('grep -P \'chr[\\dXY]+\\b\'')
    stages.append('gzip -nc > {}'.format(filtered))
    run_shell_cmd(' | '.join(stages))

    rm_f([peak_tmp, blacklist_tmp])
    return filtered
def spr_se(ta, out_dir):
    """Split a single-ended tagAlign into two pseudo-replicates.

    The lines of ``ta`` are shuffled and split into two chunks; the first
    chunk receives half of the lines, rounded up.  The shuffle's random
    source is an ``openssl`` keystream whose password is the byte count
    of the decompressed input.

    Args:
        ta: Path of the tagAlign file (read with ``zcat -f``).
        out_dir: Directory that receives the outputs.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` with the paths of the two
        gzip-compressed pseudo-replicate tagAligns.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    tmp_pr1 = '{}.00'.format(prefix)
    tmp_pr2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    # integer arithmetic: half of the lines, rounded up
    nlines = (get_num_lines(ta) + 1) // 2

    # bash-only (process substitution).
    # `zcat -f` is used for the data stream as well as for the seed so
    # that both read the input the same way.
    run_shell_cmd(
        'zcat -f {ta} | shuf --random-source=<(openssl enc '
        '-aes-256-ctr -pass pass:$(zcat -f {ta} | wc -c) '
        '-nosalt </dev/zero 2>/dev/null) | '
        'split -d -l {nlines} - {prefix}.'.format(
            ta=ta, nlines=nlines, prefix=prefix))

    # `split -d` names its chunks <prefix>.00 and <prefix>.01
    for chunk, compressed in ((tmp_pr1, ta_pr1), (tmp_pr2, ta_pr2)):
        run_shell_cmd('gzip -nc {} > {}'.format(chunk, compressed))

    rm_f([tmp_pr1, tmp_pr2])
    return ta_pr1, ta_pr2
def get_num_peaks(peak_file, out_dir='.'):
    """Write the number of lines of a peak file to a QC text file.

    The count is written, followed by a newline, to
    ``<out_dir>/<basename of peak_file without peak extension>.num_peak.qc``.

    Returns:
        Path of the QC file (not the count itself).
    """
    qc_prefix = os.path.join(
        out_dir, os.path.basename(strip_ext_peak(peak_file)))
    qc_path = '{}.num_peak.qc'.format(qc_prefix)
    with open(qc_path, 'w') as qc_file:
        qc_file.write('{}\n'.format(get_num_lines(peak_file)))
    return qc_path
def spr_pe(ta, pseudoreplication_random_seed, out_dir):
    """Split a paired-end tagAlign into two pseudo-replicates.

    Every two consecutive lines of ``ta`` are joined into one line
    (``sed 'N;s/\\n/\\t/'``), the joined lines are shuffled and split
    into two chunks, and each chunk is expanded back to two 6-column
    lines per joined line before being gzip-compressed.

    Args:
        ta: Path of the tagAlign file.
        pseudoreplication_random_seed: Seed for the shuffle.  ``0`` means
            "use the byte count of the decompressed input instead".
        out_dir: Directory that receives the outputs.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` with the paths of the two
        gzip-compressed pseudo-replicate tagAligns.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    # chunk files produced by `split -d`, paired with their final outputs
    chunk_to_output = [
        ('{}.00'.format(prefix), '{}.pr1.tagAlign.gz'.format(prefix)),
        ('{}.01'.format(prefix), '{}.pr2.tagAlign.gz'.format(prefix)),
    ]

    # number of joined (two-line) records per chunk: half, rounded up
    num_pairs = get_num_lines(ta) // 2
    nlines = (num_pairs + 1) // 2

    if pseudoreplication_random_seed == 0:
        random_seed = run_shell_cmd('zcat -f {ta} | wc -c'.format(ta=ta))
        seed_msg = ('Using input file\'s size {random_seed} as random seed '
                    'for pseudoreplication.')
    else:
        random_seed = pseudoreplication_random_seed
        seed_msg = ('Using a fixed integer {random_seed} as random seed '
                    'for pseudoreplication.')
    log.info(seed_msg.format(random_seed=random_seed))

    # bash-only (process substitution)
    shuffle_cmd = (
        'zcat -f {ta} | sed \'N;s/\\n/\\t/\' | '
        'shuf --random-source=<(openssl enc -aes-256-ctr '
        '-pass pass:{random_seed} -nosalt </dev/zero 2>/dev/null) | '
        'split -d -l {nlines} - {prefix}.'
    )
    run_shell_cmd(shuffle_cmd.format(
        ta=ta, random_seed=random_seed, nlines=nlines, prefix=prefix))

    # awk program that prints the 12 joined columns as two 6-column lines
    six_cols = '\\t'.join(['%s'] * 6) + '\\n'
    columns = ','.join('${}'.format(i) for i in range(1, 13))
    unjoin_cmd = (
        'zcat -f {src} | '
        'awk \'BEGIN{{OFS="\\t"}} '
        '{{printf "{fmt}",{columns}}}\' | '
        'gzip -nc > {dst}'
    )
    for chunk, output in chunk_to_output:
        run_shell_cmd(unjoin_cmd.format(
            src=chunk, fmt=six_cols * 2, columns=columns, dst=output))

    rm_f([chunk for chunk, _ in chunk_to_output])
    (_, ta_pr1), (_, ta_pr2) = chunk_to_output
    return ta_pr1, ta_pr2
def frip_shifted(ta, peak, chrsz, fraglen, out_dir):
    """Write the fraction of shifted reads that overlap peaks.

    Each read in ``ta`` is shifted by half the fragment length along its
    strand (``bedtools slop -s -l -<half> -r <half>``), reads with
    negative or inverted coordinates are discarded, and the remaining
    reads that overlap ``peak`` are counted.  That count divided by the
    number of lines in ``ta`` is written to
    ``<out_dir>/<basename of peak without extension>.frip.qc``.
    An empty peak file yields a numerator of 0.

    Args:
        ta: Path of the tagAlign file.
        peak: Path of the peak file.
        chrsz: Chromosome sizes file passed to ``bedtools slop -g``.
        fraglen: Fragment length (numeric).
        out_dir: Directory for the QC file and the temporary file.

    Returns:
        Path of the QC file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)
    # Floor division keeps the shift an integer; true division would
    # format values such as 100.5 into the bedtools command line.
    half_fraglen = int((fraglen + 1) // 2)

    if get_num_lines(peak) == 0:
        val1 = 0.0
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp2 = gunzip(peak, 'tmp2', out_dir)
        cmd = (
            'bedtools slop -i {ta} -g {chrsz} '
            '-s -l {left} -r {right} | '
            'awk \'{{if ($2>=0 && $3>=0 && $2<=$3) print $0}}\' | '
            'bedtools intersect -nonamecheck -a stdin -b {peak} '
            '-wa -u | wc -l'
        ).format(ta=ta, chrsz=chrsz, left=-half_fraglen,
                 right=half_fraglen, peak=tmp2)
        val1 = run_shell_cmd(cmd)
        rm_f(tmp2)

    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1) / float(val2)))
    return frip_qc
def frip(ta, peak, out_dir):
    """Write the fraction of reads that overlap peaks.

    The number of reads in ``ta`` overlapping ``peak`` (0 when the peak
    file is empty) divided by the number of lines in ``ta`` is written to
    ``<out_dir>/<basename of peak without extension>.frip.qc``.

    Returns:
        Path of the QC file.
    """
    qc_prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(qc_prefix)

    reads_in_peaks = 0.0
    tmp_files = []
    if get_num_lines(peak) != 0:
        # due to bedtools bug when .gz is given for -a and -b
        ta_tmp = gunzip(ta, 'tmp1', out_dir)
        peak_tmp = gunzip(peak, 'tmp2', out_dir)
        reads_in_peaks = run_shell_cmd(
            'bedtools intersect -nonamecheck -a {} -b {} -wa -u | wc -l'
            .format(ta_tmp, peak_tmp))
        tmp_files = [ta_tmp, peak_tmp]

    total_reads = get_num_lines(ta)
    write_txt(frip_qc, str(float(reads_in_peaks) / float(total_reads)))
    rm_f(tmp_files)
    return frip_qc
def get_fract_reads_in_regions(reads_bed, regions_bed):
    """Count the reads that fall in a set of regions.

    The regions are sorted and merged, then the reads overlapping the
    merged regions are counted with ``bedtools intersect -u``.

    Args:
        reads_bed: BED file of reads.
        regions_bed: BED file of regions.

    Returns:
        Tuple ``(intersect_read_count, fract_reads)``: the number of
        overlapping reads and that number divided by the number of lines
        in ``reads_bed``.
    """
    # uses new run_shell_cmd
    pipeline = ' | '.join([
        'bedtools sort -i {}'.format(regions_bed),
        'bedtools merge -i stdin',
        'bedtools intersect -u -nonamecheck -a {} -b stdin'.format(
            reads_bed),
        'wc -l',
    ])
    num_reads_in_regions = int(run_shell_cmd(pipeline))
    num_reads = get_num_lines(reads_bed)
    return num_reads_in_regions, float(num_reads_in_regions) / num_reads
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, use_bwa_mem_for_pe,
           out_dir):
    """Align paired-end FASTQs with bwa and return a sorted BAM.

    ``bwa mem`` is used when ``use_bwa_mem_for_pe`` is truthy and the
    read length of ``fastq1`` is at least 70; otherwise each FASTQ is
    aligned with ``bwa_aln`` and the results are paired with
    ``bwa sampe``.  Reads whose CIGAR-derived length (ignoring deletions)
    differs from their sequence length are listed in
    ``<prefix>.badReads`` and excluded from the BAM.

    Returns:
        Path of the sorted BAM, ``<out_dir>/<fastq1 basename>.bam``.
    """
    prefix = os.path.join(
        out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    sam = '{}.sam'.format(prefix)
    badcigar = '{}.badReads'.format(prefix)
    bam = '{}.bam'.format(prefix)

    read_len = get_read_length(fastq1)
    if use_bwa_mem_for_pe and read_len >= 70:
        align_cmd = 'bwa mem -M -t {} {} {} {} | gzip -nc > {}'.format(
            nth, ref_index_prefix, fastq1, fastq2, sam)
        temp_files = [sam]
    else:
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)
        align_cmd = 'bwa sampe {} {} {} {} {} | gzip -nc > {}'.format(
            ref_index_prefix, sai1, sai2, fastq1, fastq2, sam)
        temp_files = [sai1, sai2, sam]
    run_shell_cmd(align_cmd)

    # list the names of reads whose CIGAR length disagrees with the
    # sequence length
    bad_cigar_cmd = (
        'zcat -f {sam} | '
        'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
        '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
        'n = split(cigar,vals,"[A-Z]"); s = 0; '
        'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
        'if (s!=seqlen) print $1"\\t"; }}\' | '
        'sort | uniq > {badcigar}'
    ).format(sam=sam, badcigar=badcigar)
    run_shell_cmd(bad_cigar_cmd)

    # Remove bad CIGAR read pairs
    if get_num_lines(badcigar) > 0:
        sort_cmd = (
            'zcat -f {sam} | grep -v -F -f {badcigar} | '
            'samtools view -Su - | samtools sort - -o {bam} -T {prefix}'
        ).format(sam=sam, badcigar=badcigar, bam=bam, prefix=prefix)
    else:
        sort_cmd = (
            'samtools view -Su {sam} | samtools sort - -o {bam} -T {prefix}'
        ).format(sam=sam, bam=bam, prefix=prefix)
    run_shell_cmd(sort_cmd)

    rm_f(temp_files)
    return bam
def blacklist_filter_bam(bam, blacklist, out_dir):
    """Remove alignments overlapping a blacklist from a BAM file.

    The output is written to
    ``<out_dir>/<basename of bam without extension>.bfilt.bam``.  When
    ``blacklist`` is an empty string or has no lines, the BAM is copied
    unchanged; otherwise ``bedtools intersect -v -abam`` drops the
    alignments that overlap the blacklist.

    Returns:
        Path of the filtered BAM.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    filtered = '{}.bfilt.bam'.format(prefix)

    if blacklist == '' or get_num_lines(blacklist) == 0:
        # Copy the bytes verbatim.  Piping a BAM through
        # `zcat -f | gzip` would re-compress it as a single plain gzip
        # stream and lose the BGZF block structure of the BAM container.
        run_shell_cmd('cat {} > {}'.format(bam, filtered))
        return filtered

    # due to bedtools bug when .gz is given for -a and -b
    blacklist_tmp = gunzip(blacklist, 'tmp2', out_dir)
    run_shell_cmd(
        'bedtools intersect -nonamecheck -v -abam {} -b {} > {}'.format(
            bam, blacklist_tmp, filtered))
    rm_f([blacklist_tmp])
    return filtered
def spr_se(ta, pseudoreplication_random_seed, out_dir):
    """Split a single-ended tagAlign into two pseudo-replicates.

    The lines of ``ta`` are shuffled and split into two chunks; the first
    chunk receives half of the lines, rounded up.  The shuffle's random
    source is an ``openssl`` keystream whose password is the seed.

    Args:
        ta: Path of the tagAlign file (read with ``zcat -f``).
        pseudoreplication_random_seed: Seed for the shuffle.  ``0`` means
            "use the byte count of the decompressed input instead".
        out_dir: Directory that receives the outputs.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` with the paths of the two
        gzip-compressed pseudo-replicate tagAligns.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    tmp_pr1 = '{}.00'.format(prefix)
    tmp_pr2 = '{}.01'.format(prefix)
    ta_pr1 = '{}.pr1.tagAlign.gz'.format(prefix)
    ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
    # integer arithmetic: half of the lines, rounded up
    nlines = (get_num_lines(ta) + 1) // 2

    if pseudoreplication_random_seed == 0:
        random_seed = run_shell_cmd('zcat -f {ta} | wc -c'.format(ta=ta))
        log.info(
            'Using input file\'s size {random_seed} as random seed for pseudoreplication.'
            .format(random_seed=random_seed))
    else:
        random_seed = pseudoreplication_random_seed
        log.info(
            'Using a fixed integer {random_seed} as random seed for pseudoreplication.'
            .format(random_seed=random_seed))

    # bash-only (process substitution).
    # `zcat -f` matches how the seed above reads the same file.
    run_shell_cmd(
        'zcat -f {ta} | shuf --random-source=<(openssl enc '
        '-aes-256-ctr -pass pass:{random_seed} '
        '-nosalt </dev/zero 2>/dev/null) | '
        'split -d -l {nlines} - {prefix}.'.format(
            ta=ta,
            random_seed=random_seed,
            nlines=nlines,
            prefix=prefix,
        ))

    # `split -d` names its chunks <prefix>.00 and <prefix>.01
    for chunk, compressed in ((tmp_pr1, ta_pr1), (tmp_pr2, ta_pr2)):
        run_shell_cmd('gzip -nc {} > {}'.format(chunk, compressed))

    rm_f([tmp_pr1, tmp_pr2])
    return ta_pr1, ta_pr2
def spr_pe(ta, out_dir):
    """Split a paired-end tagAlign into two pseudo-replicates.

    Every two consecutive lines of ``ta`` are joined into one line, the
    joined lines are shuffled (random source keyed by the byte count of
    the decompressed input) and split into two chunks, and each chunk is
    expanded back to two 6-column lines per joined line before being
    gzip-compressed.

    Returns:
        Tuple ``(ta_pr1, ta_pr2)`` with the paths of the two
        gzip-compressed pseudo-replicate tagAligns.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    # chunk files produced by `split -d`, paired with their final outputs
    chunk_to_output = [
        ('{}.00'.format(prefix), '{}.pr1.tagAlign.gz'.format(prefix)),
        ('{}.01'.format(prefix), '{}.pr2.tagAlign.gz'.format(prefix)),
    ]

    # number of joined (two-line) records per chunk: half, rounded up
    num_pairs = get_num_lines(ta) // 2
    nlines = (num_pairs + 1) // 2

    # bash-only (process substitution)
    shuffle_cmd = (
        'zcat -f {ta} | sed \'N;s/\\n/\\t/\' | '
        'shuf --random-source=<(openssl enc -aes-256-ctr '
        '-pass pass:$(zcat -f {ta} | wc -c) '
        '-nosalt </dev/zero 2>/dev/null) | '
        'split -d -l {nlines} - {prefix}.'
    )
    run_shell_cmd(shuffle_cmd.format(ta=ta, nlines=nlines, prefix=prefix))

    # awk program that prints the 12 joined columns as two 6-column lines
    six_cols = '\\t'.join(['%s'] * 6) + '\\n'
    columns = ','.join('${}'.format(i) for i in range(1, 13))
    unjoin_cmd = (
        'zcat -f {src} | '
        'awk \'BEGIN{{OFS="\\t"}} '
        '{{printf "{fmt}",{columns}}}\' | '
        'gzip -nc > {dst}'
    )
    for chunk, output in chunk_to_output:
        run_shell_cmd(unjoin_cmd.format(
            src=chunk, fmt=six_cols * 2, columns=columns, dst=output))

    rm_f([chunk for chunk, _ in chunk_to_output])
    (_, ta_pr1), (_, ta_pr2) = chunk_to_output
    return ta_pr1, ta_pr2
def _hammock_record(fields, peak_type, record_id, peak):
    """Render one parsed peak line as a hammock line (without newline).

    ``fields`` is the tab-split peak line, ``record_id`` the 1-based id
    written into the record and ``peak`` the source file path, used only
    in the error message.

    Raises:
        ValueError: if ``peak_type`` is not narrowPeak, regionPeak,
            gappedPeak or broadPeak.
    """
    def name_and_strand():
        # name is emitted only when longer than one character, strand
        # only when it is not '.'
        extras = ''
        if len(fields[3]) > 1:
            extras += 'name:"' + fields[3] + '",'
        if fields[5] != '.':
            extras += 'strand:"' + fields[5] + '",'
        return extras

    if peak_type in ('narrowPeak', 'regionPeak'):
        record = (
            '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},'
            '{0[8]}],id:{1},'.format(fields, record_id))
        record += name_and_strand()
        if fields[9] != '-1':
            record += 'sbstroke:[' + fields[9] + ']'
        return record

    if peak_type == 'gappedPeak':
        record = (
            '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[12]},{0[13]},'
            '{0[14]}],id:{1},struct:{{thin:[[{0[1]},{0[2]}]],'
            'thick:['.format(fields, record_id))
        region_start = int(fields[1])
        sizes = fields[10].split(',')
        starts = fields[11].split(',')
        for i, size in enumerate(sizes):
            block_start = region_start + int(starts[i])
            record += '[{0},{1}],'.format(
                block_start, block_start + int(size))
        record += ']},'
        record += name_and_strand()
        return record

    if peak_type == 'broadPeak':
        record = (
            '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]}],'
            'id:{1},'.format(fields, record_id))
        record += name_and_strand()
        return record

    raise ValueError('Unsupported peak_type {} (peak file: {})'.format(
        peak_type, peak))


def peak_to_hammock(peak, out_dir):
    """Convert a peak file to a bgzip-compressed, tabix-indexed hammock.

    Outputs are ``<prefix>.<peak_type>.hammock.gz`` and the matching
    ``.gz.tbi``, where ``<prefix>`` is ``out_dir`` joined with the peak
    basename without its peak extension.  For an empty peak file the
    input is only re-compressed and an empty index file is created.

    Returns:
        Tuple ``(hammock_gz, hammock_gz_tbi)``.

    Raises:
        ValueError: if a non-empty peak file has an unsupported
            peak type.
    """
    peak_type = get_peak_type(peak)
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_peak(peak)))
    hammock = '{}.{}.hammock'.format(prefix, peak_type)
    hammock_tmp = '{}.tmp'.format(hammock)
    hammock_tmp2 = '{}.tmp2'.format(hammock)
    hammock_gz = '{}.gz'.format(hammock)
    hammock_gz_tbi = '{}.gz.tbi'.format(hammock)

    if get_num_lines(peak) == 0:
        run_shell_cmd('zcat -f {} | gzip -nc > {}'.format(peak, hammock_gz))
        # create the placeholder index so the returned .tbi path exists
        run_shell_cmd('touch {}'.format(hammock_gz_tbi))
        return (hammock_gz, hammock_gz_tbi)

    run_shell_cmd(
        'zcat -f {} | LC_COLLATE=C sort -k1,1V -k2,2n > {}'.format(
            peak, hammock_tmp))

    with open(hammock_tmp, 'r') as fin, open(hammock_tmp2, 'w') as fout:
        # ids are assigned in sorted order, starting at 1
        for record_id, line in enumerate(fin, start=1):
            fields = line.rstrip().split('\t')
            fout.write(
                _hammock_record(fields, peak_type, record_id, peak) + '\n')

    run_shell_cmd(
        'zcat -f {} | sort -k1,1 -k2,2n | bgzip -cf > {}'.format(
            hammock_tmp2, hammock_gz))
    run_shell_cmd('tabix -f -p bed {}'.format(hammock_gz))

    rm_f([hammock, hammock_tmp, hammock_tmp2])
    return (hammock_gz, hammock_gz_tbi)
def macs2_signal_track(ta, chrsz, gensz, pval_thresh, smooth_win, out_dir):
    """Build fold-enrichment and p-value signal bigWigs with MACS2.

    Runs ``macs2 callpeak`` (with ``--shift`` set to minus half of
    ``smooth_win`` and ``--extsize`` set to ``smooth_win``), then
    ``macs2 bdgcmp`` in ``FE`` and ``ppois`` mode, and converts each
    bedGraph to bigWig.  All ``<prefix>_*`` intermediates are removed.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    shiftsize = -int(round(float(smooth_win) / 2.0))

    def bdg_to_bigwig(bdg_suffix, kind):
        """Clip, sort and convert ``"<prefix>"_<bdg_suffix>.bdg``."""
        bigwig = '{}.{}.signal.bigwig'.format(prefix, kind)
        # temporary files
        bedgraph = '{}.{}.signal.bedgraph'.format(prefix, kind)
        bedgraph_srt = '{}.{}.signal.srt.bedgraph'.format(prefix, kind)

        run_shell_cmd(
            'bedtools slop -i "{prefix}"_{suffix}.bdg -g {chrsz} -b 0 | '
            'bedClip stdin {chrsz} {bedgraph}'.format(
                prefix=prefix, suffix=bdg_suffix, chrsz=chrsz,
                bedgraph=bedgraph))
        # sort and remove any overlapping regions in bedgraph by
        # comparing two lines in a row
        run_shell_cmd(
            'LC_COLLATE=C sort -k1,1 -k2,2n {bedgraph} | '
            'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 '
            '|| prev_chr==$1 && prev_chr_e<=$2)) '
            '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {bedgraph_srt}'
            .format(bedgraph=bedgraph, bedgraph_srt=bedgraph_srt))
        rm_f(bedgraph)
        run_shell_cmd('bedGraphToBigWig {} {} {}'.format(
            bedgraph_srt, chrsz, bigwig))
        rm_f(bedgraph_srt)
        return bigwig

    run_shell_cmd(
        'macs2 callpeak '
        '-t {ta} -f BED -n {prefix} -g {gensz} -p {pval_thresh} '
        '--shift {shiftsize} --extsize {extsize} '
        '--nomodel -B --SPMR '
        '--keep-dup all --call-summits '.format(
            ta=ta, prefix=prefix, gensz=gensz, pval_thresh=pval_thresh,
            shiftsize=shiftsize, extsize=smooth_win))

    run_shell_cmd(
        'macs2 bdgcmp -t "{prefix}"_treat_pileup.bdg '
        '-c "{prefix}"_control_lambda.bdg '
        '--o-prefix "{prefix}" -m FE '.format(prefix=prefix))
    fc_bigwig = bdg_to_bigwig('FE', 'fc')

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0
    run_shell_cmd(
        'macs2 bdgcmp -t "{prefix}"_treat_pileup.bdg '
        '-c "{prefix}"_control_lambda.bdg '
        '--o-prefix {prefix} -m ppois -S {sval}'.format(
            prefix=prefix, sval=sval))
    pval_bigwig = bdg_to_bigwig('ppois', 'pval')

    # remove temporary files
    rm_f(['{}_*'.format(prefix)])
    return fc_bigwig, pval_bigwig
def main():
    """Run reproducibility QC and write optimal/conservative peak sets.

    Counts peaks for pseudo-replicate, true-replicate and pooled
    pseudo-replicate peak files, derives the rescue and self-consistency
    ratios, picks the conservative and optimal peak sets, copies them to
    the output directory (optionally converting them to other formats
    when ``args.chrsz`` is given) and writes the ``reproducibility.qc``
    table.
    """
    # read params
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    mkdir_p(args.out_dir)

    log.info('Reproducibility QC...')
    # description for variables
    # N: list of number of peaks in peak files from pseudo replicates
    # Nt: top number of peaks in peak files
    #     from true replicates (rep-x_vs_rep-y)
    # Np: number of peaks in peak files from pooled pseudo replicate
    N = [get_num_lines(peak) for peak in args.peaks_pr]

    if len(args.peaks) > 0:
        # multiple replicate case
        num_rep = infer_n_from_nC2(len(args.peaks))
        num_peaks_tr = [get_num_lines(peak) for peak in args.peaks]
        Nt = max(num_peaks_tr)
        Np = get_num_lines(args.peak_ppr)
        # NOTE(review): both ratios divide by a peak count and raise
        # ZeroDivisionError when that count is 0 - confirm intended.
        rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
        self_consistency_ratio = float(max(N)) / float(min(N))

        Nt_idx = num_peaks_tr.index(Nt)
        conservative_set = infer_pair_label_from_idx(num_rep, Nt_idx)
        conservative_peak = args.peaks[Nt_idx]
        N_conservative = Nt
        if Nt > Np:
            optimal_set, optimal_peak, N_optimal = (
                conservative_set, conservative_peak, N_conservative)
        else:
            optimal_set, optimal_peak, N_optimal = (
                "pooled-pr1_vs_pooled-pr2", args.peak_ppr, Np)
    else:
        # single replicate case
        num_rep = 1
        Nt = 0
        Np = 0
        rescue_ratio = 0.0
        self_consistency_ratio = 1.0
        conservative_set = 'rep1-pr1_vs_rep1-pr2'
        conservative_peak = args.peaks_pr[0]
        N_conservative = N[0]
        optimal_set, optimal_peak, N_optimal = (
            conservative_set, conservative_peak, N_conservative)

    rescue_high = rescue_ratio > 2.0
    self_consistency_high = self_consistency_ratio > 2.0
    if rescue_high and self_consistency_high:
        reproducibility = 'fail'
    elif rescue_high or self_consistency_high:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    log.info('Writing optimal/conservative peak files...')
    name_prefix = (args.prefix + '.') if args.prefix else ''
    optimal_peak_file = os.path.join(
        args.out_dir,
        '{}optimal_peak.{}.gz'.format(name_prefix, args.peak_type))
    conservative_peak_file = os.path.join(
        args.out_dir,
        '{}conservative_peak.{}.gz'.format(name_prefix, args.peak_type))
    copy_f_to_f(optimal_peak, optimal_peak_file)
    copy_f_to_f(conservative_peak, conservative_peak_file)

    if args.chrsz:
        final_peaks = (optimal_peak_file, conservative_peak_file)

        log.info('Converting peak to bigbed...')
        for peak_file in final_peaks:
            peak_to_bigbed(peak_file, args.peak_type,
                           args.chrsz, args.out_dir)

        log.info('Converting peak to starch...')
        for peak_file in final_peaks:
            peak_to_starch(peak_file, args.out_dir)

        log.info('Converting peak to hammock...')
        for peak_file in final_peaks:
            peak_to_hammock(peak_file, args.out_dir)

    log.info('Writing reproducibility QC log...')
    reproducibility_qc = os.path.join(
        args.out_dir, name_prefix + 'reproducibility.qc')

    header_cells = [
        'Nt',
        '\t'.join(['N{}'.format(i + 1) for i in range(num_rep)]),
        'Np', 'N_opt', 'N_consv', 'opt_set', 'consv_set',
        'rescue_ratio', 'self_consistency_ratio', 'reproducibility',
    ]
    value_cells = [
        Nt,
        '\t'.join([str(count) for count in N]),
        Np, N_optimal, N_conservative, optimal_set, conservative_set,
        rescue_ratio, self_consistency_ratio, reproducibility,
    ]
    with open(reproducibility_qc, 'w') as fp:
        fp.write('\t'.join(header_cells) + '\n')
        fp.write('\t'.join(str(cell) for cell in value_cells) + '\n')

    log.info('Calculating (optimal) peak region size QC/plot...')
    get_region_size_metrics(optimal_peak_file)

    log.info('Calculating number of peaks (optimal)...')
    get_num_peaks(optimal_peak_file)

    log.info('All done.')
def macs2_signal_track(ta, chrsz, gensz, pval_thresh, smooth_win, mem_gb,
                       out_dir):
    """Build fold-enrichment and p-value signal bigWigs with MACS2.

    Runs ``macs2 callpeak`` (with ``--shift`` set to minus half of
    ``smooth_win`` and ``--extsize`` set to ``smooth_win``), then
    ``macs2 bdgcmp`` in ``FE`` and ``ppois`` mode, and converts each
    bedGraph to bigWig.  ``mem_gb`` is converted to bytes and passed to
    ``get_gnu_sort_param`` to build extra ``sort`` options.  All
    ``<prefix>_*`` intermediates are removed.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)``.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    shiftsize = -int(round(float(smooth_win) / 2.0))

    def bdg_to_bigwig(bdg_suffix, kind):
        """Clip, sort and convert ``"<prefix>"_<bdg_suffix>.bdg``."""
        bigwig = '{}.{}.signal.bigwig'.format(prefix, kind)
        # temporary files
        bedgraph = '{}.{}.signal.bedgraph'.format(prefix, kind)
        bedgraph_srt = '{}.{}.signal.srt.bedgraph'.format(prefix, kind)

        clip_cmd = (
            'bedtools slop -i "{prefix}"_{suffix}.bdg -g {chrsz} -b 0 | '
            'bedClip stdin {chrsz} {bedgraph}'
        )
        run_shell_cmd(clip_cmd.format(
            prefix=prefix, suffix=bdg_suffix, chrsz=chrsz,
            bedgraph=bedgraph))

        # sort and remove any overlapping regions in bedgraph by
        # comparing two lines in a row
        sort_cmd = (
            'LC_COLLATE=C sort -k1,1 -k2,2n {sort_param} {bedgraph} | '
            'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 '
            '|| prev_chr==$1 && prev_chr_e<=$2)) '
            '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {bedgraph_srt}'
        )
        run_shell_cmd(sort_cmd.format(
            sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
            bedgraph=bedgraph,
            bedgraph_srt=bedgraph_srt))
        rm_f(bedgraph)

        run_shell_cmd('bedGraphToBigWig {} {} {}'.format(
            bedgraph_srt, chrsz, bigwig))
        rm_f(bedgraph_srt)
        return bigwig

    callpeak_cmd = (
        'macs2 callpeak '
        '-t {ta} -f BED -n {prefix} -g {gensz} -p {pval_thresh} '
        '--shift {shiftsize} --extsize {extsize} '
        '--nomodel -B --SPMR '
        '--keep-dup all --call-summits '
    )
    run_shell_cmd(callpeak_cmd.format(
        ta=ta, prefix=prefix, gensz=gensz, pval_thresh=pval_thresh,
        shiftsize=shiftsize, extsize=smooth_win))

    fe_cmd = (
        'macs2 bdgcmp -t "{prefix}"_treat_pileup.bdg '
        '-c "{prefix}"_control_lambda.bdg '
        '--o-prefix "{prefix}" -m FE '
    )
    run_shell_cmd(fe_cmd.format(prefix=prefix))
    fc_bigwig = bdg_to_bigwig('FE', 'fc')

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0
    ppois_cmd = (
        'macs2 bdgcmp -t "{prefix}"_treat_pileup.bdg '
        '-c "{prefix}"_control_lambda.bdg '
        '--o-prefix {prefix} -m ppois -S {sval}'
    )
    run_shell_cmd(ppois_cmd.format(prefix=prefix, sval=sval))
    pval_bigwig = bdg_to_bigwig('ppois', 'pval')

    # remove temporary files
    rm_f(['{prefix}_*'.format(prefix=prefix)])
    return fc_bigwig, pval_bigwig
def main():
    """Choose a control for each IP replicate and write the decisions.

    For every replicate an index into ``args.ctl_tas`` is chosen
    (``-1`` meaning the pooled control), and optional subsampling depths
    for the chosen controls and for the pooled control are computed from
    ``args.exp_ctl_depth_ratio_limit`` and ``args.ctl_depth_limit``.
    The three results are written as text files into ``args.out_dir``.
    """
    # read params
    args = parse_arguments()
    log.info('Initializing and making output directory...')

    # make out_dir (root of all outputs)
    mkdir_p(args.out_dir)

    # reproducibility QC
    log.info('Choosing appropriate control for each IP replicate...')
    num_rep = len(args.tas)
    num_ctl = len(args.ctl_tas)

    # num lines in tagaligns
    depths = [get_num_lines(ta) for ta in args.tas]
    # num lines in control tagaligns
    depths_ctl = [get_num_lines(ctl_ta) for ctl_ta in args.ctl_tas]
    depth_rep_pooled = sum(depths)
    depth_ctl_pooled = sum(depths_ctl)

    # make them dicts including -1 key (meaning pooled one)
    depths = dict(enumerate(depths))
    depths_ctl = dict(enumerate(depths_ctl))
    depths[-1] = depth_rep_pooled
    depths_ctl[-1] = depth_ctl_pooled

    ctl_ta_idx = [0] * num_rep
    if num_ctl == 1:
        # if only one control, use it for all replicates
        pass
    elif args.always_use_pooled_ctl:
        # if --always-use-pooled-ctl, then always use pooled control
        ctl_ta_idx = [-1] * num_rep
    else:
        # if multiple controls, compare the depths of every pair of
        # control tagaligns; if any pair differs by more than
        # ctl_depth_ratio then use pooled control for all replicates

        def depths_too_different(depth_a, depth_b):
            low, high = sorted((depth_a, depth_b))
            if low == 0:
                # Avoid dividing by zero: an empty control differs from
                # a non-empty one by an unbounded factor; two empty
                # controls do not differ.
                return high > 0
            return high / float(low) > args.ctl_depth_ratio

        # any() stops at the first offending pair, so the message below
        # is logged once
        use_pooled_ctl = any(
            depths_too_different(depths_ctl[i], depths_ctl[j])
            for i in range(num_ctl)
            for j in range(i + 1, num_ctl))

        if use_pooled_ctl:
            log.info(
                'Number of reads in controls differ by a factor of {}.'
                'Using pooled controls.'.format(
                    args.ctl_depth_ratio))
            # use pooled control for all exp replicates
            ctl_ta_idx = [-1] * num_rep
        else:
            for i in range(num_rep):
                if i > num_ctl - 1:
                    # no control with this index: use pooled control
                    ctl_ta_idx[i] = -1
                elif depths_ctl[i] < depths[i]:
                    log.info(
                        'Fewer reads in control {} than experiment replicate '
                        '{}. Using pooled control for replicate {}.'.format(
                            i + 1, i + 1, i + 1))
                    ctl_ta_idx[i] = -1  # use pooled control
                else:
                    ctl_ta_idx[i] = i

    # 0 means "do not subsample"
    ctl_ta_subsample = [0] * num_rep
    ctl_ta_subsampled_pooled = 0
    if args.exp_ctl_depth_ratio_limit or args.ctl_depth_limit:
        # subsampling chosen control for each replicate
        for rep in range(num_rep):
            chosen_ctl = ctl_ta_idx[rep]
            depth = depths[rep]
            depth_ctl = depths_ctl[chosen_ctl]
            limit = int(max(depth * args.exp_ctl_depth_ratio_limit,
                            args.ctl_depth_limit))
            if depth_ctl > limit:
                ctl_ta_subsample[rep] = limit

        # subsampling pooled control for pooled replicate
        limit = int(max(depth_rep_pooled * args.exp_ctl_depth_ratio_limit,
                        args.ctl_depth_limit))
        if depth_ctl_pooled > limit:
            ctl_ta_subsampled_pooled = limit

    log.info('Writing idx.txt...')
    out_txt = os.path.join(args.out_dir, args.out_tsv_basename)
    write_txt(out_txt, ctl_ta_idx)

    log.info('Writing subsample txt...')
    out_subsample_txt = os.path.join(
        args.out_dir, args.out_tsv_subsample_basename)
    write_txt(out_subsample_txt, ctl_ta_subsample)

    log.info('Writing subsample_pooled txt...')
    out_subsample_pooled_txt = os.path.join(
        args.out_dir, args.out_txt_subsample_pooled_basename)
    write_txt(out_subsample_pooled_txt, ctl_ta_subsampled_pooled)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')
def macs2_signal_track(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
                       ctl_subsample, ctl_paired_end, mem_gb, out_dir):
    """Build fold-enrichment and p-value signal bigWigs with MACS2.

    When ``ctl_ta`` is given it is used as the MACS2 control, after being
    subsampled to ``ctl_subsample`` reads if that value is truthy.  Runs
    ``macs2 callpeak`` followed by ``macs2 bdgcmp`` in ``FE`` and
    ``ppois`` mode and converts each bedGraph to bigWig.  ``mem_gb`` is
    converted to bytes and passed to ``get_gnu_sort_param`` to build
    extra ``sort`` options.  All ``<prefix>_*`` intermediates are
    removed.

    Note that ``shift`` is accepted but not used: ``--shift`` is always
    0 and ``--extsize`` is ``fraglen``.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)``.
    """
    ta_name = os.path.basename(strip_ext_ta(ta))
    if ctl_ta:
        if ctl_subsample:
            shared_kwargs = dict(
                non_mito=False, mito_chr_name=None, out_dir=out_dir)
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(
                    ctl_ta, ctl_subsample, r1_only=False, **shared_kwargs)
            else:
                ctl_ta = subsample_ta_se(
                    ctl_ta, ctl_subsample, **shared_kwargs)
        out_name = '{}_x_{}'.format(
            ta_name, os.path.basename(strip_ext_ta(ctl_ta)))
        if len(out_name) > 200:
            # UNIX cannot have len(filename) > 255
            out_name = '{}_x_control'.format(ta_name)
    else:
        out_name = ta_name
    prefix = os.path.join(out_dir, out_name)

    def bdg_to_bigwig(bdg_suffix, kind):
        """Clip, sort and convert ``<prefix>_<bdg_suffix>.bdg``."""
        bigwig = '{}.{}.signal.bigwig'.format(prefix, kind)
        # temporary files
        bedgraph = '{}.{}.signal.bedgraph'.format(prefix, kind)
        bedgraph_srt = '{}.{}.signal.srt.bedgraph'.format(prefix, kind)

        clip_cmd = (
            'bedtools slop -i "{prefix}_{suffix}.bdg" -g {chrsz} -b 0 | '
            'awk \'{{if ($3 != -1) print $0}}\' |'
            'bedClip stdin {chrsz} {bedgraph}'
        )
        run_shell_cmd(clip_cmd.format(
            prefix=prefix, suffix=bdg_suffix, chrsz=chrsz,
            bedgraph=bedgraph))

        # sort and remove any overlapping regions in bedgraph by
        # comparing two lines in a row
        sort_cmd = (
            'LC_COLLATE=C sort -k1,1 -k2,2n {sort_param} {bedgraph} | '
            'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '
            'prev_chr==$1 && prev_chr_e<=$2)) '
            '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {bedgraph_srt}'
        )
        run_shell_cmd(sort_cmd.format(
            sort_param=get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5),
            bedgraph=bedgraph,
            bedgraph_srt=bedgraph_srt))
        rm_f(bedgraph)

        run_shell_cmd('bedGraphToBigWig {} {} {}'.format(
            bedgraph_srt, chrsz, bigwig))
        rm_f(bedgraph_srt)
        return bigwig

    ctl_param = '-c {ctl_ta}'.format(ctl_ta=ctl_ta) if ctl_ta else ''
    callpeak_cmd = (
        ' macs2 callpeak '
        '-t {ta} {ctl_param} -f BED -n {prefix} -g {gensz} -p {pval_thresh} '
        '--nomodel --shift {shiftsize} --extsize {extsize} --keep-dup all -B --SPMR'
    )
    run_shell_cmd(callpeak_cmd.format(
        ta=ta, ctl_param=ctl_param, prefix=prefix, gensz=gensz,
        pval_thresh=pval_thresh, shiftsize=0, extsize=fraglen))

    fe_cmd = (
        'macs2 bdgcmp -t "{prefix}_treat_pileup.bdg" '
        '-c "{prefix}_control_lambda.bdg" '
        '--o-prefix "{prefix}" -m FE '
    )
    run_shell_cmd(fe_cmd.format(prefix=prefix))
    fc_bigwig = bdg_to_bigwig('FE', 'fc')

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0
    ppois_cmd = (
        'macs2 bdgcmp -t "{prefix}_treat_pileup.bdg" '
        '-c "{prefix}_control_lambda.bdg" '
        '--o-prefix {prefix} -m ppois -S {sval}'
    )
    run_shell_cmd(ppois_cmd.format(prefix=prefix, sval=sval))
    pval_bigwig = bdg_to_bigwig('ppois', 'pval')

    # remove temporary files
    rm_f(['{prefix}_*'.format(prefix=prefix)])
    return fc_bigwig, pval_bigwig
def macs2_signal_track(ta, ctl_ta, chrsz, gensz, pval_thresh, shift, fraglen,
                       ctl_subsample, ctl_paired_end, out_dir):
    """Build fold-enrichment and p-value signal bigWigs with MACS2.

    When ``ctl_ta`` is given it is used as the MACS2 control, after being
    subsampled to ``ctl_subsample`` reads if that value is truthy.  Runs
    ``macs2 callpeak`` followed by ``macs2 bdgcmp`` in ``FE`` and
    ``ppois`` mode and converts each bedGraph to bigWig.  All
    ``<prefix>_*`` intermediates are removed.

    Note that ``shift`` is accepted but not used: ``--shift`` is always
    0 and ``--extsize`` is ``fraglen``.

    Returns:
        Tuple ``(fc_bigwig, pval_bigwig)``.
    """
    ta_name = os.path.basename(strip_ext_ta(ta))
    if ctl_ta:
        if ctl_subsample:
            shared_kwargs = dict(
                non_mito=False, mito_chr_name=None, out_dir=out_dir)
            if ctl_paired_end:
                ctl_ta = subsample_ta_pe(
                    ctl_ta, ctl_subsample, r1_only=False, **shared_kwargs)
            else:
                ctl_ta = subsample_ta_se(
                    ctl_ta, ctl_subsample, **shared_kwargs)
        out_name = '{}_x_{}'.format(
            ta_name, os.path.basename(strip_ext_ta(ctl_ta)))
        if len(out_name) > 200:
            # UNIX cannot have len(filename) > 255
            out_name = '{}_x_control'.format(ta_name)
    else:
        out_name = ta_name
    prefix = os.path.join(out_dir, out_name)

    def bdg_to_bigwig(bdg_suffix, kind):
        """Clip, sort and convert ``"<prefix>"_<bdg_suffix>.bdg``."""
        bigwig = '{}.{}.signal.bigwig'.format(prefix, kind)
        # temporary files
        bedgraph = '{}.{}.signal.bedgraph'.format(prefix, kind)
        bedgraph_srt = '{}.{}.signal.srt.bedgraph'.format(prefix, kind)

        run_shell_cmd(
            'bedtools slop -i "{prefix}"_{suffix}.bdg -g {chrsz} -b 0 | '
            'awk \'{{if ($3 != -1) print $0}}\' |'
            'bedClip stdin {chrsz} {bedgraph}'.format(
                prefix=prefix, suffix=bdg_suffix, chrsz=chrsz,
                bedgraph=bedgraph))
        # sort and remove any overlapping regions in bedgraph by
        # comparing two lines in a row
        run_shell_cmd(
            'LC_COLLATE=C sort -k1,1 -k2,2n {bedgraph} | '
            'awk \'BEGIN{{OFS="\\t"}}{{if (NR==1 || NR>1 && (prev_chr!=$1 || '
            'prev_chr==$1 && prev_chr_e<=$2)) '
            '{{print $0}}; prev_chr=$1; prev_chr_e=$3;}}\' > {bedgraph_srt}'
            .format(bedgraph=bedgraph, bedgraph_srt=bedgraph_srt))
        rm_f(bedgraph)
        run_shell_cmd('bedGraphToBigWig {} {} {}'.format(
            bedgraph_srt, chrsz, bigwig))
        rm_f(bedgraph_srt)
        return bigwig

    ctl_param = '-c {}'.format(ctl_ta) if ctl_ta else ''
    run_shell_cmd(
        ' macs2 callpeak '
        '-t {ta} {ctl_param} -f BED -n {prefix} -g {gensz} -p {pval_thresh} '
        '--nomodel --shift {shiftsize} --extsize {extsize} '
        '--keep-dup all -B --SPMR'.format(
            ta=ta, ctl_param=ctl_param, prefix=prefix, gensz=gensz,
            pval_thresh=pval_thresh, shiftsize=0, extsize=fraglen))

    run_shell_cmd(
        'macs2 bdgcmp -t "{prefix}"_treat_pileup.bdg '
        '-c "{prefix}"_control_lambda.bdg '
        '--o-prefix "{prefix}" -m FE '.format(prefix=prefix))
    fc_bigwig = bdg_to_bigwig('FE', 'fc')

    # sval counts the number of tags per million in the (compressed) BED file
    sval = float(get_num_lines(ta)) / 1000000.0
    run_shell_cmd(
        'macs2 bdgcmp -t "{prefix}"_treat_pileup.bdg '
        '-c "{prefix}"_control_lambda.bdg '
        '--o-prefix {prefix} -m ppois -S {sval}'.format(
            prefix=prefix, sval=sval))
    pval_bigwig = bdg_to_bigwig('ppois', 'pval')

    # remove temporary files
    rm_f(['{}_*'.format(prefix)])
    return fc_bigwig, pval_bigwig
def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, mem_gb, use_bwa_mem_for_pe,
           bwa_mem_read_len_limit, rescue_reads_for_bwa_mem, out_dir):
    """Align a paired-end FASTQ pair with BWA and return a sorted BAM.

    Uses `bwa mem` when `use_bwa_mem_for_pe` is set and the read length
    reported by `get_read_length(fastq1)` is at least
    `bwa_mem_read_len_limit`; otherwise runs `bwa_aln` on each mate
    followed by `bwa sampe`. Reads whose CIGAR length disagrees with their
    sequence length are listed in `<prefix>.badReads` and filtered out
    before sorting.

    Args:
        fastq1: R1 FASTQ; its basename also names the outputs.
        fastq2: R2 FASTQ.
        ref_index_prefix: BWA index prefix.
        nth: Thread count for `bwa mem -t`, `bwa_aln` and the samtools
            resource parameters.
        mem_gb: Memory value forwarded to `get_samtools_res_param`.
        use_bwa_mem_for_pe: Allow `bwa mem` instead of `bwa aln`/`sampe`.
        bwa_mem_read_len_limit: Minimum read length for using `bwa mem`.
        rescue_reads_for_bwa_mem: Adds `-P` to the `bwa mem` command.
        out_dir: Directory in which all outputs are written.

    Returns:
        Path of the sorted BAM file (`<prefix>.bam`).
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_fastq(fastq1)))
    # NOTE: the ".sam" file is written through gzip, so it is compressed
    sam = '{}.sam'.format(prefix)
    badcigar = '{}.badReads'.format(prefix)
    bam = '{}.bam'.format(prefix)

    read_len = get_read_length(fastq1)
    log.info('Guessed read length of R1 FASTQ: {read_len}'.format(
        read_len=read_len))

    run_bwa_mem = use_bwa_mem_for_pe and read_len >= bwa_mem_read_len_limit
    if not run_bwa_mem:
        log.info('Use bwa aln for each (R1 and R2) and then bwa sampe.')
        sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
        sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)
        align_cmd = 'bwa sampe {} {} {} {} {} | gzip -nc > {}'.format(
            ref_index_prefix, sai1, sai2, fastq1, fastq2, sam)
        temp_files = [sai1, sai2, sam]
    else:
        log.info('Use bwa mem.')
        mem_template = (
            'bwa mem -M {extra_param} -t {nth} {ref_index_prefix} '
            '{fastq1} {fastq2} | gzip -nc > {sam}'
        )
        align_cmd = mem_template.format(
            extra_param='-P' if rescue_reads_for_bwa_mem else '',
            nth=nth,
            ref_index_prefix=ref_index_prefix,
            fastq1=fastq1,
            fastq2=fastq2,
            sam=sam,
        )
        temp_files = [sam]
    run_shell_cmd(align_cmd)

    # For each alignment line with a CIGAR other than "*": strip deletion
    # operations, sum the remaining operation lengths and compare the sum
    # with the length of the sequence field. Mismatching read names are
    # printed followed by a tab, so the later fixed-string grep matches
    # whole names only.
    find_bad_cmd = (
        'zcat -f {sam} | '
        'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
        '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
        'n = split(cigar,vals,"[A-Z]"); s = 0; '
        'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
        'if (s!=seqlen) print $1"\\t"; }}\' | '
        'sort | uniq > {badcigar}'
    ).format(sam=sam, badcigar=badcigar)
    run_shell_cmd(find_bad_cmd)

    # Remove bad CIGAR read pairs (only when the list is non-empty)
    if get_num_lines(badcigar) > 0:
        view_stage = (
            'zcat -f {sam} | grep -v -F -f {badcigar} | '
            'samtools view -Su /dev/stdin'
        )
    else:
        view_stage = 'samtools view -Su {sam}'
    sort_stage = (
        ' | samtools sort /dev/stdin -o {bam} -T {prefix} {res_param}'
    )
    run_shell_cmd((view_stage + sort_stage).format(
        sam=sam,
        badcigar=badcigar,
        bam=bam,
        prefix=prefix,
        res_param=get_samtools_res_param('sort', nth=nth, mem_gb=mem_gb),
    ))

    # the bad-read list itself is not in temp_files and is left in place
    rm_f(temp_files)
    return bam
def _ctl_depths_differ(nlines_ctl, ctl_depth_ratio):
    """Tell whether any two control depths differ by more than a ratio.

    Comparing the deepest against the shallowest control is equivalent to
    checking every pair in both directions, and it avoids dividing by an
    empty control's zero line count.

    Args:
        nlines_ctl: Line counts of the control tagAligns.
        ctl_depth_ratio: Largest tolerated ratio between two controls.

    Returns:
        True if the deepest/shallowest ratio exceeds `ctl_depth_ratio`.
        An empty control next to a non-empty one counts as exceeding it;
        fewer than two controls, or all-empty controls, never do.
    """
    if len(nlines_ctl) < 2:
        return False
    deepest = max(nlines_ctl)
    shallowest = min(nlines_ctl)
    if shallowest == 0:
        # ratio is unbounded unless every control is empty
        return deepest > 0
    return deepest / float(shallowest) > ctl_depth_ratio


def main():
    """Choose a control tagAlign for every replicate and copy it in place.

    For replicate N the chosen control is copied to
    `<out_dir>/ctl_for_repN.tagAlign.gz`. Selection rules, in order:

    * exactly one control: use it for every replicate;
    * `--always-use-pooled-ctl`: use the pooled control for every
      replicate;
    * control depths differ by more than `ctl_depth_ratio`: use the pooled
      control for every replicate;
    * otherwise use the replicate's own control, falling back to the
      pooled control when it is missing or has fewer lines than the
      replicate.
    """
    # read params
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    # make out_dir (root of all outputs)
    mkdir_p(args.out_dir)

    log.info('Choosing appropriate control for each IP replicate...')
    pooled = -1  # index value meaning "use the pooled control"
    num_rep = len(args.tas)
    ctl_ta_idx = [0] * num_rep

    if len(args.ctl_tas) == 1:
        # if only one control, use it for all replicates
        pass
    elif args.always_use_pooled_ctl:
        ctl_ta_idx = [pooled] * num_rep
    else:
        # num lines in replicate and control tagaligns
        nlines = [get_num_lines(ta) for ta in args.tas]
        nlines_ctl = [get_num_lines(ctl_ta) for ctl_ta in args.ctl_tas]

        if _ctl_depths_differ(nlines_ctl, args.ctl_depth_ratio):
            log.info(
                'Number of reads in controls differ by a factor of {}. '
                'Using pooled controls.'.format(args.ctl_depth_ratio))
            # use pooled control for all exp replicates
            ctl_ta_idx = [pooled] * num_rep
        else:
            for i, nlines_rep in enumerate(nlines):
                if i >= len(nlines_ctl):
                    # no matching control for this replicate
                    ctl_ta_idx[i] = pooled
                elif nlines_ctl[i] < nlines_rep:
                    log.info(
                        'Fewer reads in control {} than experiment replicate '
                        '{}. Using pooled control for replicate {}.'.format(
                            i + 1, i + 1, i + 1))
                    ctl_ta_idx[i] = pooled
                else:
                    ctl_ta_idx[i] = i

    log.info('Writing ctl_for_repN.tagAlign.gz files...')
    for rep_id, ctl_id in enumerate(ctl_ta_idx, start=1):
        dest = os.path.join(
            args.out_dir, 'ctl_for_rep{}.tagAlign.gz'.format(rep_id))
        if ctl_id == pooled:
            src = args.ctl_ta_pooled[0]
        else:
            src = args.ctl_tas[ctl_id]
        copy_f_to_f(src, dest)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')