def get_adapters():
    """
    Resolve the adapter arguments and overlap lengths for both trimming rounds.

    When the module-level ``adapters`` value is truthy it is parsed with the
    ``-a`` flag for each round and the overlaps ``'1'`` and ``'5'`` are used.
    Otherwise ``parse_barcodes.sh`` is run inside a scratch directory created
    under ``outdir`` and the adapters and overlaps are read from files in
    that directory.

    :return: tuple, (adapters1, adapters2, overlap1, overlap2).
    """
    if adapters:
        return parse_adapters('-a', adapters), parse_adapters('-a', adapters), '1', '5'

    cmd = ['parse_barcodes.sh', randomer_length, barcodes_fasta, barcode1, barcode2]
    # The context manager deletes the scratch directory on every exit path,
    # so a failing script or parser no longer leaves a stray folder behind.
    with tempfile.TemporaryDirectory(dir=outdir) as folder:
        cmder.run(cmd, msg='Parsing barcodes and finding adapters ...', cwd=folder)
        adapters1 = parse_adapters('-g', os.path.join(folder, 'g_adapters.fasta'))
        adapters1 += parse_adapters('-A', os.path.join(folder, 'A_adapters.fasta'))
        adapters1 += parse_adapters('-a', os.path.join(folder, 'a_adapters.fasta'))
        adapters2 = parse_adapters('-A', os.path.join(folder, 'A_adapters.fasta'))
        overlap1 = parse_overlap(os.path.join(folder, 'trim_first_overlap_length.txt'))
        overlap2 = parse_overlap(os.path.join(folder, 'trim_again_overlap_length.txt'))
    return adapters1, adapters2, overlap1, overlap2
def merge_bam(bams, bam):
    """
    Merge BAM files into one with ``samtools merge`` unless the target exists.

    :param bams: iterable of str, paths to the BAM files to merge.
    :param bam: str, path to the merged BAM file.
    :return: str, ``bam`` unchanged.
    """
    if not os.path.isfile(bam):
        sources = " ".join(bams)
        cmder.run(f'samtools merge {bam} {sources}',
                  msg=f'Merging {sources} to {bam} ...')
    else:
        logger.info(f'BAM file {bam} already exist.')
    return bam
def trim_adapters(adapters, overlap, ios, message):
    """
    Run cutadapt with the pipeline's fixed trimming settings.

    :param adapters: list, adapter arguments appended after the fixed options.
    :param overlap: value passed to cutadapt's ``-O`` option.
    :param ios: list, input/output arguments appended last.
    :param message: str, progress message handed to ``cmder.run``.
    """
    fixed = [
        'cutadapt',
        '-O', overlap,
        '-j', options.cores,
        '--match-read-wildcards',
        '--times', 1,
        '-e', 0.1,
        '--quality-cutoff', 6,
        '-m', 18,
    ]
    cmder.run(fixed + adapters + ios, msg=message, pmt=True)
def cut_adapt(fastq, output):
    """
    Trim adapters from a UMI-extracted FASTQ file with ``eclip_cut_adapt``.

    The adapters are looked up in ``fastq_to_adapters`` under the FASTQ path
    with its trailing ``.umi.fastq.gz`` removed.

    :param fastq: str, path to the input FASTQ file.
    :param output: str, path passed to the ``-o`` option.
    """
    lookup = fastq.rsplit(".umi.fastq.gz", maxsplit=1)[0]
    command = [
        'eclip_cut_adapt', fastq,
        '-o', output,
        '-a', fastq_to_adapters[lookup],
        '-c', args.cpus,
    ]
    cmder.run(command, stdout=sys.stdout, stderr=sys.stderr, log_cmd=False)
def map_to_repeat_elements(fastq, mate1):
    """
    Align reads to the repeat elements index (``options.repeat``) with STAR.

    The read 2 path is derived from ``fastq`` by replacing ``.r1.`` with
    ``.r2.`` and is handed to STAR only when that file exists. STAR writes
    into the directory that holds ``mate1``, which is created when missing.

    :param fastq: str, path to the read 1 FASTQ file.
    :param mate1: str, path whose parent directory receives the STAR output.
    :return: str, ``mate1`` unchanged.
    """
    read1 = fastq
    read2 = fastq.replace('.r1.', '.r2.')
    star_dir = os.path.dirname(mate1)
    if not os.path.isdir(star_dir):
        os.mkdir(star_dir)

    cmd = [
        'STAR',
        '--runMode', 'alignReads',
        '--runThreadN', options.cores,
        '--alignEndsType', 'EndToEnd',
        '--genomeDir', options.repeat,
        '--genomeLoad', 'NoSharedMemory',
        '--outBAMcompression', 10,
        '--outFileNamePrefix', f"{star_dir}/",
        '--outFilterMultimapNmax', 30,
        '--outFilterMultimapScoreRange', 1,
        '--outFilterScoreMin', 10,
        '--outFilterType', 'BySJout',
        '--outReadsUnmapped', 'Fastx',
        '--outSAMattrRGline', 'ID:foo',
        '--outSAMattributes', 'All',
        '--outSAMmode', 'Full',
        '--outSAMtype', 'BAM', 'Unsorted',
        '--outSAMunmapped', 'Within',
        '--outStd', 'Log',
        '--readFilesIn', read1,
    ]
    if os.path.exists(read2):
        cmd.append(read2)
        # The second line of the message is indented by a fixed 28 spaces.
        indent = 28 * " "
        message = (f'Map paired reads {read1} {size(read1)} and\n{indent}'
                   f'{read2} {size(read2)} to repeat elements ...')
    else:
        message = f'Map single read {read1} {size(read1)} to repeat elements ...'
    cmder.run(cmd, msg=message, pmt=True)
    return mate1
def map_to_repbase(fastq, mate):
    """
    Map reads against ``args.repeat`` with ``star_repbase_map``.

    The output BAM path is ``mate`` with its trailing ``.mate1.gz`` replaced
    by ``.bam``.

    :param fastq: str, path to the input FASTQ file.
    :param mate: str, path used to derive the output BAM path.
    """
    stem = mate.rsplit(".mate1.gz", maxsplit=1)[0]
    bam = f'{stem}.bam'
    command = [
        'star_repbase_map', fastq,
        '-x', args.repeat,
        '-c', args.cpus,
        '-o', bam,
    ]
    cmder.run(command, stdout=sys.stdout, stderr=sys.stderr, log_cmd=False)
def get_adapters(read):
    """
    Resolve the adapter arguments and overlap lengths for both trimming rounds.

    When ``read.adapters`` is truthy it is parsed once with the ``-a`` flag,
    the same result is used for both rounds, and the overlaps ``'1'`` and
    ``'5'`` are returned. Otherwise ``parse_barcodes.sh`` is run inside a
    scratch directory created under ``ECLIP`` and the adapters and overlaps
    are read from files in that directory.

    :param read: object providing ``adapters`` and ``barcodes`` attributes.
    :return: tuple, (adapters1, adapters2, overlap1, overlap2).
    """
    adapter = read.adapters
    if adapter:
        adapters1 = parse_adapters('-a', adapter)
        return adapters1, adapters1, '1', '5'

    cmd = ['parse_barcodes.sh', options.randomer_length, options.barcodes_fasta] + read.barcodes
    # The context manager deletes the scratch directory on every exit path,
    # so a failing script or parser no longer leaves a stray folder behind.
    with tempfile.TemporaryDirectory(dir=ECLIP) as folder:
        cmder.run(cmd, msg='Parsing barcodes and finding adapters ...', cwd=folder)
        adapters1 = parse_adapters('-g', os.path.join(folder, 'g_adapters.fasta'))
        adapters1 += parse_adapters('-A', os.path.join(folder, 'A_adapters.fasta'))
        adapters1 += parse_adapters('-a', os.path.join(folder, 'a_adapters.fasta'))
        adapters2 = parse_adapters('-A', os.path.join(folder, 'A_adapters.fasta'))
        overlap1 = parse_overlap(os.path.join(folder, 'trim_first_overlap_length.txt'))
        overlap2 = parse_overlap(os.path.join(folder, 'trim_again_overlap_length.txt'))
    return adapters1, adapters2, overlap1, overlap2
def extract_umi(fastq, umi):
    """
    Extract UMIs from a FASTQ file with ``umi_tools extract``.

    :param fastq: str, path to the input FASTQ file.
    :param umi: str, path passed to ``--stdout``.
    """
    # NOTE(review): the redirection tokens are plain list items, so this
    # presumably relies on cmder.run executing the joined command through a
    # shell -- confirm.
    redirect = ['>', '/dev/null']
    command = [
        'umi_tools', 'extract',
        '--random-seed', 1,
        '--stdin', fastq,
        '--bc-pattern', 'NNNNNNNNNN',
        '--stdout', umi,
    ] + redirect
    cmder.run(command, msg=f'Extract UMIs for {fastq} ...', pmt=True)
def merge_bam(bam1, bam2, bam):
    """
    Merge two BAM files with ``samtools merge`` unless the target exists.

    :param bam1: str, path to the first BAM file.
    :param bam2: str, path to the second BAM file.
    :param bam: str, path to the merged BAM file.
    :return: str, ``bam`` unchanged.
    """
    if not os.path.isfile(bam):
        cmder.run(f'samtools merge {bam} {bam1} {bam2}',
                  msg=f'Merging {bam1} and {bam2} ...')
    else:
        logger.info(f'BAM file {bam} already exist.')
    return bam
def map_to_reference_genome(mate1, bam):
    """
    Align reads to the reference genome index (``options.genome``) with STAR.

    The mate 2 path is derived from ``mate1`` by replacing ``.mate1`` with
    ``.mate2`` and is handed to STAR only when that file exists. STAR writes
    into the directory that holds ``bam``, which is created when missing.

    :param mate1: str, path to the mate 1 file.
    :param bam: str, path whose parent directory receives the STAR output.
    :return: str, ``bam`` unchanged.
    """
    star_dir = os.path.dirname(bam)
    if not os.path.isdir(star_dir):
        os.mkdir(star_dir)
    mate2 = mate1.replace('.mate1', '.mate2')

    # '--outSAMunmapped' needs to be set to 'Within', otherwise
    # barcode_collapse.py (duplicate removal) throws a name-not-match error.
    cmd = [
        'STAR',
        '--runMode', 'alignReads',
        '--runThreadN', options.cores,
        '--alignEndsType', 'EndToEnd',
        '--genomeDir', options.genome,
        '--genomeLoad', 'NoSharedMemory',
        '--outBAMcompression', 10,
        '--outFileNamePrefix', f"{star_dir}/",
        '--outFilterMultimapNmax', 1,
        '--outFilterMultimapScoreRange', 1,
        '--outFilterScoreMin', 10,
        '--outFilterType', 'BySJout',
        '--outReadsUnmapped', 'Fastx',
        '--outSAMattrRGline', 'ID:foo',
        '--outSAMattributes', 'All',
        '--outSAMmode', 'Full',
        '--outSAMtype', 'BAM', 'Unsorted',
        '--outSAMunmapped', 'Within',
        '--outStd', 'Log',
        '--readFilesIn', mate1,
    ]
    if os.path.exists(mate2):
        cmd.append(mate2)
        # The second line of the message is indented by a fixed 28 spaces.
        indent = 28 * " "
        message = (f'Map paired mates {mate1} {size(mate1)} and\n{indent}'
                   f'{mate2} {size(mate2)} to reference genome ...')
    else:
        message = f'Map single mate {mate1} {size(mate1)} to reference genome ...'
    cmder.run(cmd, msg=message, pmt=True)
    return bam
def dedup_bam(bam, out):
    """
    Deduplicate SE BAM using umi_tools dedup, then index the result.

    :param bam: str, path to the input BAM file.
    :param out: str, path to the deduplicated BAM file.
    """
    cmd = [
        'umi_tools', 'dedup',
        '--random-seed', 1,
        '--stdin', bam,
        '--method', 'unique',
        '--stdout', out,
    ]
    cmder.run(cmd, msg=f'Deduplicating {bam} by umi_tools dedup ...')
    # The index is built for the deduplicated output, so the message names
    # that file rather than the input.
    cmder.run(f'samtools index {out}', msg=f'Indexing {out} ...')
def pigz(fastq, gz):
    """
    Compress a FASTQ file with pigz using ``PROCESSES`` processes.

    :param fastq: str, path to the file to compress.
    :param gz: str, path of the expected compressed file; only checked for
        existence in the printed line and returned.
    :return: str, ``gz`` unchanged.
    """
    print(f'Compressing {fastq}, which {os.path.exists(gz)}')
    # Debug mode spells the process-count option in its short form.
    flag = '-p' if options.debug else '--processes'
    cmder.run(f'pigz {flag} {PROCESSES} {fastq}',
              msg=f'Compressing {fastq} ...', pmt=True)
    return gz
def index_bam(bam, out):
    """
    Produce the BAM used downstream and index it.

    For paired data only reads with SAM flag 128 set are kept
    (``samtools view -f 128``); otherwise the BAM is copied as is.

    :param bam: str, path to the input BAM file.
    :param out: str, path to the BAM file to create.
    """
    if TYPE != 'paired':
        cmder.run(f'cp {bam} {out}')
    else:
        cmder.run(f'samtools view -f 128 -@ {options.cores} -b -o {out} {bam}',
                  msg=f'Creating bam {bam} {size(bam)} with r2 reads only ...')
    # NOTE(review): the existence check looks at the index of `bam`, while
    # the file that gets indexed is `out` -- confirm this is intended.
    index_missing = not os.path.exists(f'{bam}.bai')
    if index_missing:
        index_sorted_bam(out)
def clipper_peaks(bam, bed=''):
    """
    Call peaks with clipper unless the output BED file already exists.

    :param bam: str, path to the IP BAM file.
    :param bed: str, path to the output BED file; when empty it is derived
        from ``bam`` by replacing ``.ip.bam`` with ``.peak.clusters.bed``.
    :return: str, path to the BED file.
    """
    bed = bed or bam.replace('.ip.bam', '.peak.clusters.bed')
    if os.path.isfile(bed):
        logger.info(f'Clipper bed {bed} already exists.')
        return bed
    command = f'clipper --species {options.species} --processors {options.cpus} --bam {bam} --outfile {bed}'
    cmder.run(command, msg=f'Calling peaks from {bam} using clipper ...', pmt=True)
    return bed
def prepare_bam(bam, out):
    """
    Sort a BAM file according to the library type.

    Single-end data is name sorted into an intermediate file, position
    sorted into ``out``, indexed, and the intermediate file is removed.
    Any other type is only name sorted into ``out``.

    :param bam: str, path to the input BAM file.
    :param out: str, path to the sorted BAM file.
    """
    if TYPE != 'single':
        name_sort_bam(bam, out)
        return
    # Intermediate name-sorted file, derived from the output path.
    by_name = out.replace('.sort.bam', '.name.sort.bam')
    name_sort_bam(bam, by_name)
    position_sort_bam(by_name, out)
    index_sorted_bam(out)
    cmder.run(f'rm {by_name}')
def motif_analysis(bed, output):
    """
    Run the ``motif`` command on a BED file and compile its results to HTML.

    :param bed: str, path to the BED file.
    :param output: str, path to the compiled output; the text before the
        first ``.motifs.`` is used as the basename.
    """
    basename = output.partition('.motifs.')[0]
    command = [
        'motif', bed,
        options.species, options.outdir, basename,
        options.l10p, options.l2fc, options.cpus,
    ]
    cmder.run(command, msg=f'Finding motifs in {bed} ...')
    logger.info(f'Parsing and compiling motifs for {basename} ...')
    compile_motif_html(basename, output)
    logger.info(f'Parsing and compiling motifs for {basename} complete.')
def falco(fastq, txt):
    """
    Run falco on a FASTQ file and keep only its ``fastqc_data.txt`` report.

    falco writes into a scratch directory created under ``QC``; the report
    is moved to ``txt`` and the scratch directory is always removed.

    :param fastq: str, path to the FASTQ file.
    :param txt: str, destination path for ``fastqc_data.txt``.
    """
    scratch = tempfile.mkdtemp(suffix='_qc', prefix='falco_', dir=QC)
    command = f'falco --outdir {scratch} --skip-html {fastq}'
    try:
        progress = f'Checking reads in {fastq} {size(fastq)} using falco ...'
        cmder.run(command, msg=progress)
        cmder.run(f'mv {scratch}/fastqc_data.txt {txt}')
    finally:
        shutil.rmtree(scratch)
def dedup_bam(bam, out):
    """
    Remove duplicates from a BAM file according to the library type.

    Single-end data goes through ``umi_tools dedup``; any other type is
    handed to ``collapse_barcode``.

    :param bam: str, path to the input BAM file.
    :param out: str, path to the deduplicated BAM file.
    """
    if TYPE != 'single':
        collapse_barcode(bam, out)
        return
    command = [
        'umi_tools', 'dedup',
        '--random-seed', 1,
        '--stdin', bam,
        '--method', 'unique',
        '--stdout', out,
    ]
    progress = f'Deduplicating {bam} {size(bam)} by umi_tools dedup ...'
    cmder.run(command, msg=progress, pmt=True)
def peak(ip_bams, input_bams, peak_beds, reproducible_bed, outdir, cwd=''):
    """
    Run the ``peak`` command on IP/input BAM files and their peak BED files.

    :param ip_bams: iterable of str, paths to IP BAM files.
    :param input_bams: iterable of str, paths to input BAM files.
    :param peak_beds: iterable of str, paths to peak BED files.
    :param reproducible_bed: str, path returned to the caller; its directory
        is the working directory when ``cwd`` is empty.
    :param outdir: str, value passed to ``--outdir``.
    :param cwd: str, working directory for the command.
    :return: str, ``reproducible_bed`` unchanged.
    """
    read_type = 'PE' if TYPE == 'paired' else 'SE'
    # The chr19-only build is reported to the command as plain hg19.
    species = 'hg19' if options.species in ('hg19', 'hg19chr19') else options.species
    command = [
        'peak',
        '--ip_bams', ' '.join(ip_bams),
        '--input_bam', ' '.join(input_bams),
        '--peak_beds', ' '.join(peak_beds),
        '--read_type', read_type,
        '--species', species,
        '--outdir', outdir,
        '--cores', options.cores,
    ]
    workdir = cwd or os.path.dirname(reproducible_bed)
    cmder.run(command, cwd=workdir, stdout=sys.stdout, stderr=sys.stderr)
    return reproducible_bed
def peak(ip_bams, input_bams, peak_beds, reproducible_bed, outdir):
    """
    Run the ``peak`` command in single-end mode from ``options.outdir``.

    :param ip_bams: iterable of str, paths to IP BAM files.
    :param input_bams: iterable of str, paths to input BAM files.
    :param peak_beds: iterable of str, paths to peak BED files.
    :param reproducible_bed: str, path returned to the caller.
    :param outdir: str, value passed to ``--outdir``.
    :return: str, ``reproducible_bed`` unchanged.
    """
    # The chr19-only build is reported to the command as plain hg19.
    species = 'hg19' if options.species in ('hg19', 'hg19chr19') else options.species
    command = [
        'peak',
        '--ip_bams', ' '.join(ip_bams),
        '--input_bam', ' '.join(input_bams),
        '--peak_beds', ' '.join(peak_beds),
        '--read_type', 'SE',
        '--species', species,
        '--outdir', outdir,
        '--cores', options.cpus,
        '--l2fc', options.l2fc,
        '--l10p', options.l10p,
    ]
    cmder.run(command, cwd=options.outdir, stdout=sys.stdout, stderr=sys.stderr)
    return reproducible_bed
def trim_adapters(adapters, overlap, ios, message):
    """
    Run cutadapt with the pipeline's fixed trimming settings.

    When the module-level ``debug`` flag is truthy the command's stdout and
    stderr are wired to this process's streams.

    :param adapters: list, adapter arguments appended after the fixed options.
    :param overlap: value passed to cutadapt's ``-O`` option.
    :param ios: list, input/output arguments appended last.
    :param message: str, progress message handed to ``cmder.run``.
    """
    fixed = [
        'cutadapt',
        '-O', overlap,
        '-j', cpus,
        '--match-read-wildcards',
        '--times', 1,
        '-e', 0.1,
        '--quality-cutoff', 6,
        '-m', 18,
    ]
    command = fixed + adapters + ios
    streams = {'stdout': sys.stdout, 'stderr': sys.stderr} if debug else {}
    cmder.run(command, msg=message, pmt=True, **streams)
def peak(ip_bams, input_bams, peak_beds, reproducible_bed, outdir, cwd):
    """
    Run the ``peak`` command in single-end mode, optionally passing ``--ids``.

    :param ip_bams: iterable of str, paths to IP BAM files.
    :param input_bams: iterable of str, paths to input BAM files.
    :param peak_beds: iterable of str, paths to peak BED files.
    :param reproducible_bed: str, path returned to the caller; its directory
        is the working directory when ``cwd`` is falsy.
    :param outdir: str, value passed to ``--outdir``.
    :param cwd: str, working directory for the command.
    :return: str, ``reproducible_bed`` unchanged.
    """
    # The chr19-only build is reported to the command as plain hg19.
    species = 'hg19' if options.species in ('hg19', 'hg19chr19') else options.species
    command = [
        'peak',
        '--ip_bams', ' '.join(ip_bams),
        '--input_bam', ' '.join(input_bams),
        '--peak_beds', ' '.join(peak_beds),
        '--read_type', 'SE',
        '--species', species,
        '--outdir', outdir,
        '--cores', options.cpus,
        '--l2fc', options.l2fc,
        '--l10p', options.l10p,
    ]
    # NOTE(review): `ids` is not a parameter of this function; presumably a
    # module-level name -- verify it is defined where this runs.
    if ids:
        command += ['--ids', ' '.join(ids)]
    workdir = cwd or os.path.dirname(reproducible_bed)
    cmder.run(command, cwd=workdir, stdout=sys.stdout, stderr=sys.stderr)
    return reproducible_bed
def pureclip(bam, bed):
    """
    Call crosslink sites and binding regions with pureCLIP.

    The IP/input BAM pair comes from the first entry of ``SAMPLES`` whose IP
    BAM equals ``bam``; an IndexError is raised when no entry matches.

    :param bam: str, path to the IP BAM file.
    :param bed: str, path to the crosslink sites BED file; the binding
        regions and log paths are derived by replacing its
        ``.crosslink.sites.bed`` part.
    """
    pairs = [(sample.ip_read.bam, sample.input_read.bam)
             for sample in SAMPLES if sample.ip_read.bam == bam]
    ip_bam, input_bam = pairs[0]
    regions = bed.replace('.crosslink.sites.bed', '.binding.regions.bed')
    log = bed.replace('.crosslink.sites.bed', '.pureclip.log')
    # '-iv', "'chr1;chr2;chr3'", Genomic chromosomes to learn HMM parameters
    command = [
        'pureclip',
        '-i', ip_bam,
        '-bai', f'{ip_bam}.bai',
        '-g', f'{options.genome}/genome.fa',
        '-nt', options.cores,
        '-ibam', input_bam,
        '-ibai', f'{input_bam}.bai',
        '-o', bed,
        '-or', regions,
        '>', log,
    ]
    cmder.run(command,
              msg=f'Calling peaks from {bam} {size(bam)} using pureCLIP ...',
              pmt=True)
def map_to_reference_genome(mate, bam):
    """
    Align single-end reads left unmapped by the repeat step to the genome.

    STAR runs in a working directory derived from ``mate``; its final log,
    unmapped reads (compressed with pigz) and aligned BAM are moved next to
    ``bam``, and the working directory is removed afterwards.

    :param mate: str, path to the gzipped FASTQ file of unmapped reads.
    :param bam: str, path to the output BAM file; the log and unmapped reads
        paths are derived by replacing its ``.genome.map.bam`` part.
    :return: str, ``bam`` unchanged.
    """
    workdir = mate.replace('.repeat.unmap.fastq.gz', '.genome.map')
    log = bam.replace(".genome.map.bam", ".genome.map.log")
    unmapped = bam.replace('.genome.map.bam', '.genome.unmap.fastq.gz')
    try:
        if not os.path.isdir(workdir):
            os.mkdir(workdir)
        star = [
            'STAR',
            '--runMode', 'alignReads',
            '--runThreadN', options.cpus,
            '--alignEndsType', 'EndToEnd',
            '--genomeDir', options.genome,
            '--genomeLoad', 'NoSharedMemory',
            '--outBAMcompression', 10,
            '--outFileNamePrefix', f"{workdir}/",
            '--outFilterMultimapNmax', 1,
            '--outFilterMultimapScoreRange', 1,
            '--outFilterScoreMin', 10,
            '--outFilterType', 'BySJout',
            '--outReadsUnmapped', 'Fastx',
            '--outSAMattrRGline', 'ID:foo',
            '--outSAMattributes', 'All',
            '--outSAMmode', 'Full',
            '--outSAMtype', 'BAM', 'Unsorted',
            '--outSAMunmapped', 'None',
            '--outStd', 'Log',
            '--readFilesCommand', 'zcat',
            '--readFilesIn', mate,
        ]
        cmder.run(star, msg=f'Map SE repeat elements unmapped reads in {mate} to reference genome ...')
        cmder.run(f'mv {workdir}/Log.final.out {log}')
        cmder.run(f'pigz -c -p {options.cpus} {workdir}/Unmapped.out.mate1 > {unmapped}')
        cmder.run(f'mv {workdir}/Aligned.out.bam {bam}')
    finally:
        shutil.rmtree(workdir)
    return bam
def repetitive_elements_map(bam, tsv):
    """
    Run ``repeat-maps`` for the dataset that a sorted genome BAM belongs to.

    The dataset name is ``bam`` with ``.genome.map.sort.bam`` removed, and
    the FASTQ path is ``bam`` with that part replaced by ``.trim.fastq.gz``.

    :param bam: str, path to the sorted genome-mapped BAM file.
    :param tsv: str, not used by this function.
    """
    suffix = '.genome.map.sort.bam'
    fastq = bam.replace(suffix, '.trim.fastq.gz')
    dataset = bam.replace(suffix, '')
    command = [
        'repeat-maps',
        '--fastq', fastq,
        '--bam', bam,
        '--dataset', dataset,
        '--scheduler', 'local',
        '--cpus', options.cpus,
        '--species', options.species,
        '--outdir', options.outdir,
    ]
    cmder.run(command, msg=f'Mapping {dataset} repetitive elements ...')
def merge_paired_bam(bam, out):
    """
    Merge the two per-barcode BAM files of a paired-end read into one.

    Nothing happens when ``out`` already exists. When the first barcode is
    ``'NIL'`` the BAM is copied. Otherwise the partner BAM path is obtained
    by swapping one barcode for the other inside ``bam`` and both files are
    merged with ``samtools merge``.

    :param bam: str, path to one of the per-barcode BAM files.
    :param out: str, path to the merged BAM file; with ``.merge.bam``
        removed, its basename is the key into ``READS``.
    """
    if os.path.exists(out):
        return
    key = os.path.basename(out.replace(".merge.bam", ""))
    barcodes = READS[key].barcodes
    if barcodes[0] == 'NIL':
        cmder.run(f'cp {bam} {out}')
        return
    first, second = barcodes
    # Keep the file carrying the first barcode on the left of the merge.
    if first in bam:
        bam1, bam2 = bam, bam.replace(first, second)
    else:
        bam1, bam2 = bam.replace(second, first), bam
    cmder.run(f'samtools merge -@ {options.cores} {out} {bam1} {bam2}',
              msg=f'Merging {bam1} {size(bam1)} and {bam2} {size(bam2)} ...',
              pmt=True)
def starmap(fastq1, fastq2, index, mnm, bam, cpus=1, debug=False):
    """
    Map reads to reference genome or repeat elements using STAR.

    STAR runs inside a scratch directory created next to ``fastq1``. Its
    aligned BAM, unmapped mates and final log are then moved (copied in
    debug mode) to paths built from ``basename``, and the scratch directory
    is removed unless debug mode is on.

    :param fastq1: str, path to a FASTQ file (read 1).
    :param fastq2: str, path to a FASTQ file (read 2). It is used only when
        it is an existing file, so a single-end dataset can pass any string
        that is not a file path (e.g. 'none' or 'None').
    :param index: str, path to a STAR genome or repeat elements index directory.
    :param mnm: int, maximum number of loci the read is allowed to map to.
    :param bam: str, path to the output BAM file, must end with .bam extension.
    :param cpus: int, the number of CPUs that can be used.
    :param debug: bool, set to True to invoke debug mode (only for development purpose).
    """
    # NOTE(review): `label` and `basename` are not defined inside this
    # function and `bam` is never read -- presumably both names are
    # module-level; verify they exist where this runs.
    outdir = os.path.dirname(fastq1) or os.getcwd()
    folder = tempfile.mkdtemp(prefix=os.path.basename(fastq1), suffix='.star.map', dir=outdir)
    reader = 'zcat' if fastq1.endswith('.gz') else '-'
    cmd = [
        'STAR',
        '--runMode', 'alignReads',
        '--runThreadN', cpus,
        '--alignEndsType', 'EndToEnd',
        '--genomeDir', index,
        '--outBAMcompression', 10,
        '--outFileNamePrefix', f'{folder}/',
        '--outFilterMultimapNmax', mnm,
        '--outFilterScoreMin', 10,
        '--outReadsUnmapped', 'Fastx',
        '--outSAMattrRGline', 'ID:foo',
        '--outSAMattributes', 'All',
        '--outSAMtype', 'BAM', 'Unsorted',
        '--readFilesCommand', reader,
        '--readFilesIn', fastq1,
    ]
    if os.path.isfile(fastq2):
        cmd.append(fastq2)
        # The second line of the message is indented by a fixed 28 spaces.
        indent = 28 * " "
        message = f'Map paired reads {fastq1} and\n{indent}{fastq2} to {label} using STAR ...'
    else:
        message = f'Map single read {fastq1} to {label} using STAR ...'
    cmder.run(cmd, msg=message, pmt=True, debug=debug)

    # Debug mode copies the results so the scratch directory stays intact.
    transfer = shutil.copy if debug else shutil.move
    required = (
        ('Aligned.out.bam', 'bam'),
        ('Unmapped.out.mate1', 'unmap.mate1'),
        ('Log.final.out', 'log.final.out'),
    )
    for produced, extension in required:
        transfer(os.path.join(folder, produced), f'{basename}.{extension}')
    unmapped2 = os.path.join(folder, 'Unmapped.out.mate2')
    if os.path.isfile(unmapped2):
        transfer(unmapped2, f'{basename}.unmap.mate2')
    if not debug:
        shutil.rmtree(folder)
def bam_to_bigwig(bam, scale, strand, bw, genome_length):
    """
    Convert a BAM file into a bigWig coverage track.

    Coverage is written as bedGraph with ``genomeCoverageBed``, sorted with
    ``bedSort`` and converted with ``bedGraphToBigWig``; both intermediate
    bedGraph files are removed at the end.

    :param bam: str, path to the BAM file.
    :param scale: value passed to ``-scale``.
    :param strand: value passed to ``-strand``.
    :param bw: str, path to the bigWig file; the intermediate paths are
        derived by replacing ``.bw``.
    :param genome_length: value passed to ``bedGraphToBigWig`` as its second
        argument.
    """
    bedgraph = bw.replace('.bw', '.bg')
    sorted_bedgraph = bw.replace('.bw', '.sort.bg')
    steps = (
        f'genomeCoverageBed -ibam {bam} -bg -scale {scale} -strand {strand} -du -split > {bedgraph}',
        f'bedSort {bedgraph} {sorted_bedgraph}',
        f'bedGraphToBigWig {sorted_bedgraph} {genome_length} {bw}',
        f'rm {bedgraph} {sorted_bedgraph}',
    )
    for step in steps:
        cmder.run(step)
def umi_dedup(bam, output, debug=False):
    """
    Remove duplicates from a single-end BAM with ``umi_tools dedup``.

    :param bam: str, path to the input BAM file.
    :param output: str, path to the deduplicated BAM file.
    :param debug: bool, forwarded to ``cmder.run`` as its ``debug`` argument.
    """
    command = [
        'umi_tools', 'dedup',
        '--random-seed', 1,
        '--stdin', bam,
        '--method', 'unique',
        '--stdout', output,
    ]
    progress = f'Deduplicating {bam} by umi_tools dedup ...'
    cmder.run(command, msg=progress, pmt=True, debug=debug)
def prepare_reads(link, output):
    """
    Extract UMIs for single-end reads or demultiplex paired-end reads.

    The read is looked up in ``READS`` by the basename of ``link`` with
    ``.r1.fastq.gz`` removed. A read with a second FASTQ is handed to
    ``demux``; otherwise ``umi_tools extract`` is run and the resulting file
    is recorded in ``NEED_TO_REMOVE``.

    :param link: str, path to the read 1 FASTQ link.
    :param output: str, not used by this function.
    """
    read = READS[os.path.basename(link.replace('.r1.fastq.gz', ''))]
    fastq1, fastq2 = read.link1, read.link2
    if fastq2:
        demux(fastq1, fastq2, fastq1.replace('.r1.fastq.gz', ''), read.barcodes)
        return
    progress = f'Extract UMIs for single-end read {fastq1} {size(fastq1)} ...'
    metrics = fastq1.replace('.fastq.gz', '.extract.metrics')
    umi_fastq = fastq1.replace('.r1.fastq.gz', '.umi.r1.fastq.gz')
    command = [
        'umi_tools', 'extract',
        '--random-seed', 1,
        '--stdin', fastq1,
        '--bc-pattern', 'NNNNNNNNNN',
        '--log', metrics,
        '--stdout', umi_fastq,
    ]
    cmder.run(command, msg=progress, pmt=True)
    NEED_TO_REMOVE.append(umi_fastq)