def merge_bam(bam1, bam2, bam):
    """Merge two BAM files into one with ``samtools merge``.

    Nothing is run when ``bam`` already exists on disk.

    Args:
        bam1: Path of the first input BAM.
        bam2: Path of the second input BAM.
        bam: Path of the merged output BAM.

    Returns:
        ``bam``, the output path, whether or not a merge was performed.
    """
    if not os.path.isfile(bam):
        command = f'samtools merge {bam} {bam1} {bam2}'
        cmder.run(command, msg=f'Merging {bam1} and {bam2} ...')
    else:
        logger.info(f'BAM file {bam} already exist.')
    return bam
def split_bam(bam, bam1, bam2):
    """Randomly split the mapped reads of ``bam`` into two coordinate-sorted BAMs.

    The mapped reads are shuffled, cut into two chunks of SAM text next to
    ``bam`` (``{bam}00`` and ``{bam}01``), re-headered with the header of
    ``bam``, converted to BAM and sorted. Nothing is run when both outputs
    already exist.

    Args:
        bam: Path of the input BAM.
        bam1: Path of the first output BAM.
        bam2: Path of the second output BAM.

    Returns:
        The tuple ``(bam1, bam2)``.
    """

    def count_mapped_reads(bam):
        """Return the number of records in ``bam`` without the unmapped flag."""
        count = int(cmder.run(f'samtools view -c -F 0x4 {bam}', msg='').stdout.read())
        logger.info(f'Found {count:,} mapped reads in {bam}.')
        return count

    if os.path.isfile(bam1) and os.path.isfile(bam2):
        logger.info(f'BAMs {bam1} and {bam2} already exist.')
    else:
        half_lines = count_mapped_reads(bam) // 2 + 1
        # The chunk size is derived from the mapped-read count, so the stream
        # being split must carry the same filter (-F 0x4). Without it, any
        # unmapped records push the line count past 2 * half_lines and split
        # emits a third chunk that is never consumed or cleaned up.
        cmd = f'samtools view -F 0x4 {bam} | shuf | split -d -l {half_lines} - {bam}'
        cmder.run(cmd, msg=f'Shuffling and splitting {bam} ...')

        tmp_bam1 = bam1.replace('.bam', '.tmp.bam')
        tmp_bam2 = bam2.replace('.bam', '.tmp.bam')

        # Chunks are header-less SAM text: prepend the original header before
        # converting back to BAM, then sort into the final output.
        cmd = f'samtools view -H {bam} | cat - {bam}00 | samtools view -bS - > {tmp_bam1}'
        cmder.run(cmd, msg=f'Creating headers for {bam1} ...')
        cmder.run(f'samtools sort -@ {options.cpus} -o {bam1} {tmp_bam1}')

        cmd = f'samtools view -H {bam} | cat - {bam}01 | samtools view -bS - > {tmp_bam2}'
        cmder.run(cmd, msg=f'Creating headers for {bam2} ...')
        cmder.run(f'samtools sort -@ {options.cpus} -o {bam2} {tmp_bam2}')

        cmder.run(f'rm {bam}00 {bam}01 {tmp_bam1} {tmp_bam2}')
    return bam1, bam2
def merge_bam(bams, bam):
    """Merge any number of BAM files into ``bam`` with ``samtools merge``.

    Nothing is run when ``bam`` already exists on disk.

    Args:
        bams: Paths of the input BAMs; they are joined with spaces on the
            command line.
        bam: Path of the merged output BAM.

    Returns:
        ``bam``, the output path.
    """
    if os.path.isfile(bam):
        logger.info(f'BAM file {bam} already exist.')
        return bam

    sources = " ".join(bams)
    cmder.run(f'samtools merge {bam} {sources}',
              msg=f'Merging {sources} to {bam} ...')
    return bam
def make_bigwig_files(bam, bigwig):
    """Create plus- and minus-strand BigWig coverage tracks for ``bam``.

    Coverage is scaled to reads per million mapped reads; the minus strand is
    written with a negated scale. The minus-strand path is derived from
    ``bigwig`` by replacing ``.plus.bw`` with ``.minus.bw``.

    Args:
        bam: Path of the input BAM (its mapped-read count is read via pysam).
        bigwig: Path of the plus-strand BigWig output.

    Returns:
        ``bigwig``. When the BAM reports zero mapped reads, an empty file is
        written to ``bigwig`` and no coverage is computed.
    """

    def bam_to_bigwig(bam, scale, strand, bw):
        """Compute sorted bedGraph coverage for one strand and convert it to ``bw``."""
        bedgraph = bw.replace('.bw', '.bg')
        coverage = (
            f'genomeCoverageBed -ibam {bam} -bg -scale {scale} -strand {strand} -du -split '
            f'| sort -k1,1 -k2,2n > {bedgraph}'
        )
        cmder.run(coverage,
                  msg=f'Calculating genome coverage for {bam} ({strand} strand) ...')
        convert = f'bedGraphToBigWig {bedgraph} {options.genome}/chrNameLength.txt {bw}'
        cmder.run(convert, msg=f'Converting {bedgraph} to {bw} ...')
        # The bedGraph is only an intermediate file.
        cmder.run(f'rm {bedgraph}')

    logger.info(f'Make BigWig files for {bam} ...')
    pos_bw = bigwig
    neg_bw = bigwig.replace('.plus.bw', '.minus.bw')

    with pysam.AlignmentFile(bam, 'rb') as sam:
        total_reads = sam.mapped

    if total_reads == 0:
        # A per-million scale factor cannot be computed without reads.
        logger.warning(
            f'No reads was found in BAM {bam}, empty BigWig file was created.')
        with open(bigwig, 'w') as o:
            o.write('')
        return bigwig

    scale = 1000000.0 / total_reads
    for signed_scale, strand, target in ((scale, '+', pos_bw),
                                         (-1 * scale, '-', neg_bw)):
        bam_to_bigwig(bam, signed_scale, strand, target)

    logger.info(f'Make BigWig files for {bam} complete.')
    return bigwig
def collapse_barcode(bam, out):
    """Deduplicate paired reads in ``bam`` by collapsing identical barcodes.

    Records are consumed two at a time (record 2k together with record 2k+1),
    so mates must be adjacent in the file. A pair is skipped when either mate
    is unmapped or the mates map to different references. Pairs are keyed by
    ``(reference, start, stop, strand, randomer)``, where the randomer is the
    text before the first ``:`` of the read name; only the first pair seen
    for each key is written to ``out``.

    Args:
        bam: Path of the input BAM.
        out: Path of the deduplicated output BAM.

    Raises:
        ValueError: If two records expected to be mates have different names.
    """
    logger.info(f'Deduplicating {bam} {size(bam)} by collapsing barcodes ...')
    verbosity = pysam.set_verbosity(0)
    try:
        # Two independent handles on the same file: one yields the even
        # records, the other the odd ones, so zip() delivers consecutive pairs.
        with pysam.AlignmentFile(bam, 'rb') as b1, pysam.AlignmentFile(bam, 'rb') as b2:
            results = {}
            pairs = zip(itertools.islice(b1, 0, None, 2),
                        itertools.islice(b2, 1, None, 2))
            for read1, read2 in pairs:
                if read1.query_name != read2.query_name:
                    raise ValueError(
                        f'Read names do not match: {read1.query_name} != {read2.query_name}.'
                    )
                if (read1.is_unmapped or read2.is_unmapped
                        or read1.reference_name != read2.reference_name):
                    continue
                # Normalise so that read1 is always the first mate.
                if not read1.is_read1:
                    read1, read2 = read2, read1
                randomer = read1.query_name.split(':')[0]
                start = read1.positions[-1] if read1.is_reverse else read1.pos
                stop = read2.positions[-1] if read2.is_reverse else read2.pos
                strand = '-' if read1.is_reverse else '+'
                location = (read1.reference_name, start, stop, strand, randomer)
                # Keep only the first pair observed for each location/barcode.
                results.setdefault(location, (read1, read2))

            # Written while b1 is still open because it serves as the template.
            with pysam.AlignmentFile(out, 'wb', template=b1) as o:
                for read1, read2 in results.values():
                    o.write(read1)
                    o.write(read2)
        logger.info(
            f'Deduplicating {bam} {size(bam)} by collapsing barcodes complete.'
        )
    finally:
        # Restore the previous verbosity even when an error aborts the run.
        pysam.set_verbosity(verbosity)
def clipper_peaks(bam, bed=''):
    """Call peaks on ``bam`` with clipper unless the BED output already exists.

    Args:
        bam: Path of the input BAM.
        bed: Path of the output BED. When empty, it is derived from ``bam`` by
            replacing ``.ip.bam`` with ``.peak.clusters.bed``.

    Returns:
        The path of the BED file.
    """
    bed = bed or bam.replace('.ip.bam', '.peak.clusters.bed')
    if not os.path.isfile(bed):
        command = (
            f'clipper --species {options.species} --processors {options.cpus} '
            f'--bam {bam} --outfile {bed}'
        )
        cmder.run(command, msg=f'Calling peaks from {bam} using clipper ...', pmt=True)
    else:
        logger.info(f'Clipper bed {bed} already exists.')
    return bed
def motif_analysis(bed, output):
    """Run the external ``motif`` command on ``bed`` and compile its HTML report.

    Args:
        bed: Path of the BED file passed to ``motif``.
        output: Path of the report; the text before ``.motifs.`` is used as
            the basename for both the ``motif`` command and the compile step.
    """
    basename, _, _ = output.partition('.motifs.')
    arguments = [
        'motif',
        bed,
        options.species,
        options.outdir,
        basename,
        options.l10p,
        options.l2fc,
        options.cpus,
    ]
    cmder.run(arguments, msg=f'Finding motifs in {bed} ...')

    logger.info(f'Parsing and compiling motifs for {basename} ...')
    compile_motif_html(basename, output)
    logger.info(f'Parsing and compiling motifs for {basename} complete.')
def split_bam(bam, basename, n):
    """Randomly split the mapped reads of ``bam`` into ``n`` coordinate-sorted BAMs.

    Outputs are named ``{basename}0.bam`` ... ``{basename}{n-1}.bam``. The
    mapped reads are shuffled and cut into chunks of SAM text, then each chunk
    is re-headered with the header of ``bam``, converted to BAM and sorted in
    place. Nothing is run when every output already exists.

    Args:
        bam: Path of the input BAM.
        basename: Prefix of the output files.
        n: Number of parts. ``split`` is invoked with one-digit numeric
            suffixes (``-a 1``), so values above 10 cannot be named.

    Returns:
        The list of output BAM paths.
    """

    def count_mapped_reads(bam):
        """Return the number of records in ``bam`` without the unmapped flag."""
        count = int(cmder.run(f'samtools view -c -F 0x4 {bam}', msg='').stdout.read())
        logger.info(f'Found {count:,} mapped reads in {bam}.')
        return count

    bams = [f'{basename}{i}.bam' for i in range(n)]
    if all(os.path.isfile(b) for b in bams):
        logger.info('Split bams already exist.')
    else:
        lines = count_mapped_reads(bam) // n + 1
        # The chunk size is derived from the mapped-read count, so the stream
        # being split must carry the same filter (-F 0x4). Without it, any
        # unmapped records can overflow into extra chunks beyond the n
        # expected files, and those reads are silently lost.
        cmd = (
            f'samtools view -F 0x4 {bam} | shuf | '
            f'split - -a 1 --additional-suffix=.bam -d -l {lines} {basename}'
        )
        cmder.run(cmd, msg=f'Shuffling and splitting {bam} ...')
        for b in bams:
            # At this point b holds header-less SAM text despite its .bam
            # name; build a real BAM in tmp, then sort it back over b.
            tmp = b.replace('.bam', '.tmp.bam')
            cmder.run(
                f'samtools view -H {bam} | cat - {b} | samtools view -bS - > {tmp}'
            )
            cmder.run(f'samtools sort -@ {options.cpus} -o {b} {tmp}')
            cmder.run(f'rm {tmp}')
    return bams
def make_bigwig_files(bam, bigwig):
    """Create plus- and minus-strand BigWig coverage tracks for ``bam``.

    Coverage is scaled per million reads; for ``TYPE == 'paired'`` the mapped
    count is halved before scaling. For ``TYPE == 'single'`` the ``+`` strand
    goes to the plus file and the ``-`` strand (negated scale) to the minus
    file; for any other ``TYPE`` the strand/file assignment is swapped. The
    minus-strand path is derived from ``bigwig`` by replacing ``.plus.bw``
    with ``.minus.bw``.

    Args:
        bam: Path of the input BAM (its mapped-read count is read via pysam).
        bigwig: Path of the plus-strand BigWig output.

    Returns:
        ``bigwig``. When no reads are found, an empty file is written to
        ``bigwig`` and no coverage is computed.
    """

    def bam_to_bigwig(bam, scale, strand, bw):
        """Compute bedGraph coverage for one strand, sort it and convert it to ``bw``."""
        bg = bw.replace('.bw', '.bg')
        bg_sort = bw.replace('.bw', '.sort.bg')
        steps = (
            f'genomeCoverageBed -ibam {bam} -bg -scale {scale} -strand {strand} -du -split > {bg}',
            f'bedSort {bg} {bg_sort}',
            f'bedGraphToBigWig {bg_sort} {options.genome}/chrNameLength.txt {bw}',
            f'rm {bg} {bg_sort}',
        )
        for step in steps:
            cmder.run(step)

    message = f'Make BigWig files for {bam} ...'
    start_time = time.perf_counter()
    logger.info(message)

    pos_bw = bigwig
    neg_bw = bigwig.replace('.plus.bw', '.minus.bw')

    with pysam.AlignmentFile(bam, 'rb') as sam:
        total_reads = sam.mapped
    if TYPE == 'paired':
        total_reads = total_reads / 2

    if total_reads == 0:
        # A per-million scale factor cannot be computed without reads.
        logger.error(
            f'No reads was found in BAM {bam}, empty BigWig file was created.')
        with open(bigwig, 'w') as o:
            o.write('')
        return bigwig

    scale = 1000000.0 / total_reads
    if TYPE == 'single':
        jobs = [(scale, '+', pos_bw), (-1 * scale, '-', neg_bw)]
    else:
        jobs = [(-1 * scale, '-', pos_bw), (scale, '+', neg_bw)]
    for signed_scale, strand, target in jobs:
        bam_to_bigwig(bam, signed_scale, strand, target)

    run_time = int(time.perf_counter() - start_time)
    elapsed = datetime.timedelta(seconds=run_time)
    logger.info(message.replace(' ...', f' completed in [{elapsed}].'))
    return bigwig
def count_mapped_reads(bam):
    """Count the records in ``bam`` that do not carry the unmapped flag.

    Args:
        bam: Path of the BAM file, passed to ``samtools view -c -F 0x4``.

    Returns:
        The count as an ``int``; it is also logged.
    """
    process = cmder.run(f'samtools view -c -F 0x4 {bam}', msg='')
    mapped = int(process.stdout.read())
    logger.info(f'Found {mapped:,} mapped reads in {bam}.')
    return mapped
def count_lines(file):
    """Count the lines of ``file`` by running ``wc -l``.

    Args:
        file: Path of the file to count.

    Returns:
        The line count as an ``int``; it is also logged.
    """
    process = cmder.run(f'wc -l {file}')
    # wc prints "<count> <path>"; only the first field is needed.
    first_field = process.stdout.read().split()[0]
    total = int(first_field)
    logger.info(f'Found {total:,} lines in {file}.')
    return total
def make_hub_file(inputs, output):
    """Write a single-file UCSC track hub for every ``*.plus.bw`` in the working directory.

    The file starts with hub, genome and super-track stanzas built from
    ``options``. Each plus-strand BigWig then contributes one multiWig
    container with two child tracks pointing at ``{key}.plus.bw`` and
    ``{key}.minus.bw``.

    Args:
        inputs: Not used by this function.
        output: Path of the hub file to write.
    """
    # NOTE(review): the line breaks inside the two templates follow the UCSC
    # stanza layout (one setting per line, blank line between stanzas).
    logger.info('Make hub track file ...')
    track = options.track.replace(' ', '_')
    header = f"""hub {track}
shortLabel {options.track}
longLabel {options.track}
useOneFile on
email {options.email if options.email else '*****@*****.**'}

genome {options.track_genome}

track {track}
shortLabel {options.track}
longLabel {options.track}
type bigWig
superTrack on
"""
    block = """
track {basename}
shortLabel {basename}
longLabel {basename}
type bigWig
visibility full
alwaysZero on
autoScale on
aggregate transparentOverlay
showSubtrackColorOnUi on
parent {track}
container multiWig

    track {name1}
    bigDataUrl {plus}
    shortLabel {basename} Plus strand
    longLabel {basename} Plus strand
    type bigWig
    color 0,100,0
    parent {basename}

    track {name2}
    bigDataUrl {minus}
    shortLabel {basename} Minus strand
    longLabel {basename} Minus strand
    type bigWig
    color 100,0,0
    parent {basename}
"""
    with open(output, 'w') as o:
        o.write(header)
        # Sorted so the hub file is identical from run to run.
        for bw in sorted(glob.glob('*.plus.bw')):
            key = bw.replace('.plus.bw', '')
            # Point at the files that actually sit next to the globbed
            # *.plus.bw (the minus file uses the matching .minus.bw suffix).
            plus, minus = f'{key}.plus.bw', f'{key}.minus.bw'
            # Track identifiers must not contain spaces or dots.
            name1 = plus.replace('.bw', '').replace('.', '_')
            name2 = name1.replace('plus', 'minus')
            o.write(
                block.format(track=track, name1=name1, name2=name2,
                             basename=key, plus=plus, minus=minus))
    logger.info('Make hub track file complete.')
def make_hub_files(inputs, output):
    """Write a single-file UCSC track hub with one multiWig container per key in ``READS``.

    The file starts with hub, genome and super-track stanzas built from
    ``options``. Each key then contributes a container with two child tracks
    pointing at ``{key}.plus.bw`` and ``{key}.minus.bw``. The elapsed time is
    logged on completion.

    Args:
        inputs: Not used by this function.
        output: Path of the hub file to write.
    """
    # NOTE(review): the line breaks inside the two templates follow the UCSC
    # stanza layout (one setting per line, blank line between stanzas).
    message = 'Make hub track file ...'
    start_time = time.perf_counter()
    logger.info(message)

    header = f"""hub {options.track.replace(' ', '_')}
shortLabel {options.track_label}
longLabel {options.track_label}
useOneFile on
email {options.email if options.email else '*****@*****.**'}

genome {options.track_genome}

track {options.track.replace(' ', '_')}
shortLabel {options.track_label}
longLabel {options.track_label}
type bigWig
superTrack on
"""
    block = """
track {basename}
shortLabel {basename}
longLabel {basename}
type bigWig
visibility full
alwaysZero on
autoScale on
aggregate transparentOverlay
showSubtrackColorOnUi on
parent {track}
container multiWig

    track {name1}
    bigDataUrl {plus}
    shortLabel {basename} Plus strand
    longLabel {basename} Plus strand
    type bigWig
    color 0,100,0
    parent {basename}

    track {name2}
    bigDataUrl {minus}
    shortLabel {basename} Minus strand
    longLabel {basename} Minus strand
    type bigWig
    color 100,0,0
    parent {basename}
"""
    track = options.track.replace(' ', '_')
    with open(output, 'w') as o:
        o.write(header)
        for key in READS:
            plus = f'{key}.plus.bw'
            minus = f'{key}.minus.bw'
            # Track identifiers: file name without extension, dots replaced.
            name1 = plus.replace('.bw', '').replace('.', '_')
            name2 = name1.replace('plus', 'minus')
            stanza = block.format(track=track, name1=name1, name2=name2,
                                  basename=key, plus=plus, minus=minus)
            o.write(stanza)

    run_time = int(time.perf_counter() - start_time)
    elapsed = datetime.timedelta(seconds=run_time)
    logger.info(message.replace(' ...', f' completed in [{elapsed}].'))
def demux(fastq1, fastq2, basename, barcodes):
    """Demultiplex paired-end reads.

    The start of each read 1 is compared against a fixed table of inline
    barcodes (``N`` matches any base). The best match within
    ``options.allow_mismatch`` is trimmed from read 1; reads with no match
    are labelled ``NIL`` and left untrimmed. The first
    ``options.randomer_length`` bases of read 2 are removed from read 2 and
    prefixed to both read names. Only pairs whose barcode is listed in
    ``barcodes`` are written, to ``{basename}.{barcode}.r1.fastq.gz`` and
    ``{basename}.{barcode}.r2.fastq.gz``; those paths are also appended to
    ``NEED_TO_REMOVE``.

    Args:
        fastq1: Path of the gzipped read 1 FASTQ.
        fastq2: Path of the gzipped read 2 FASTQ.
        basename: Prefix of the output files.
        barcodes: Barcode names to keep.

    Raises:
        ValueError: If a read 1 / read 2 pair has different names.
    """
    import contextlib

    def hamming(key, barcode, seq, allow_mismatch):
        """Return ``(key, len(barcode), mismatches)`` when within tolerance, else ``None``."""
        mismatch = len(barcode) - sum(
            x == y or x == 'N' or y == 'N' for x, y in zip(barcode, seq))
        return (key, len(barcode), mismatch) if mismatch <= allow_mismatch else None

    logger.info(
        f'Demultiplexing {fastq1} and {fastq2} with barcodes {" and ".join(barcodes)} ...'
    )
    barcodes_dict = {
        'A01': 'AAGCAAT',
        'A03': 'ATGACCNNNNT',
        'A04': 'CAGCTTNNNNT',
        'B06': 'GGCTTGT',
        'C01': 'ACAAGTT',
        'D8f': 'TGGTCCT',
        'F05': 'GGATACNNNNT',
        'G07': 'TCCTGTNNNNT',
        'X1A': 'NNNNNCCTATAT',
        'X1B': 'NNNNNTGCTATT',
        'X2A': 'NNNNNTATACTT',
        'X2B': 'NNNNNATCTTCT',
    }
    allow_mismatch, randomer_length = options.allow_mismatch, options.randomer_length
    max_barcode_length = max(len(barcode) for barcode in barcodes_dict.values())

    # The stack owns every handle so all files are closed even if parsing or
    # validation fails part-way through.
    with contextlib.ExitStack() as stack:
        writers = {}
        for barcode in set(barcodes):
            file1 = f'{basename}.{barcode}.r1.fastq.gz'
            file2 = f'{basename}.{barcode}.r2.fastq.gz'
            writers[barcode] = (stack.enter_context(gzip.open(file1, 'wt')),
                                stack.enter_context(gzip.open(file2, 'wt')))
            NEED_TO_REMOVE.extend([file1, file2])

        f1 = stack.enter_context(gzip.open(fastq1, 'rt'))
        f2 = stack.enter_context(gzip.open(fastq2, 'rt'))
        for read1, read2 in zip(FastqGeneralIterator(f1), FastqGeneralIterator(f2)):
            (name1, seq1, quality1), (name2, seq2, quality2) = read1, read2
            n1, n2 = name1.split()[0], name2.split()[0]
            # Explicit raise: an assert would be stripped under ``python -O``.
            if n1 != n2:
                raise ValueError(
                    f'Paired-End reads have mismatch names: {name1} != {name2}')

            candidates = (hamming(key, barcode, seq1[:max_barcode_length], allow_mismatch)
                          for key, barcode in barcodes_dict.items())
            matches = [match for match in candidates if match]
            if matches:
                # Fewest mismatches wins; ties go to the earliest table entry.
                barcode, barcode_length, _ = min(matches, key=lambda x: x[2])
                r1 = f'@{seq2[:randomer_length]}:{name1}\n{seq1[barcode_length:]}\n+\n{quality1[barcode_length:]}\n'
            else:
                barcode = 'NIL'
                r1 = f'@{seq2[:randomer_length]}:{name1}\n{seq1}\n+\n{quality1}\n'
            r2 = f'@{seq2[:randomer_length]}:{name2}\n{seq2[randomer_length:]}\n+\n{quality2[randomer_length:]}\n'

            if barcode in writers:
                writer1, writer2 = writers[barcode]
                writer1.write(r1)
                writer2.write(r2)

    logger.info(
        f'Demultiplexing {fastq1} and {fastq2} with barcodes {" and ".join(barcodes)} complete.'
    )