def run_bowtie(directory, dependencies):
    """Run bowtie in directory.

    :dependencies: list of filter jobs for this directory.
    """
    olddir = os.path.abspath('.')
    os.chdir(directory)
    bowtie1 = make_job_file('bowtie --best --strata ' +
                            '-p 16 --chunkmbs 2000 --maxins 2000 -m 1 -q ' +
                            '../pat/PatRef ' +
                            '-1 in.1.filtered.fastq -2 in.2.filtered.fastq ' +
                            'pat_alignment.bam 2> pat_alignment.log',
                            'pat_bowtie', '24:00:00', 16,
                            modules=['bowtie1'])
    bowtie2 = make_job_file('bowtie --best --strata ' +
                            '-p 16 --chunkmbs 2000 --maxins 2000 -m 1 -q ' +
                            '../mat/MatRef ' +
                            '-1 in.1.filtered.fastq -2 in.2.filtered.fastq ' +
                            'mat_alignment.bam 2> mat_alignment.log',
                            'mat_bowtie', '24:00:00', 16,
                            modules=['bowtie1'])
    job1 = sl.monitor_submit(bowtie1, dependencies, max_count=MAX_JOBS)
    job2 = sl.monitor_submit(bowtie2, dependencies, max_count=MAX_JOBS)
    os.chdir(olddir)
    return [job1, job2]

def wasp_step_2(name, remapped, pipeline=None, dependency=None):
    """Run filter_remapped_reads.py following second mapping.

    :name:       The name of the original mapped bam or sam, used to make
                 file names.
    :remapped:   The file created by the second mapping.
    :pipeline:   The path to the WASP pipeline.
    :dependency: The job number of the remapping step.
    :returns:    The job number.
    """
    command = os.path.join(os.path.abspath(pipeline),
                           'filter_remapped_reads.py') \
        if pipeline else 'filter_remapped_reads.py'
    # Trim the extension from the name
    shortname = '.'.join(name.split('.')[:-1]) if name.endswith('.bam') \
        or name.endswith('.sam') else name
    logme.log('Submitting wasp step 2 for {}'.format(shortname),
              level='debug')
    return slurmy.monitor_submit(slurmy.make_job_file(
        'python2 {} {} {} {} {}'.format(command,
                                        shortname + '.to.remap.bam',
                                        remapped,
                                        shortname + '.remap.keep.bam',
                                        shortname + '.to.remap.num.gz'),
        shortname + '_step2', '16:00:00', 8, '30000',
        partition=PARTITION, modules=['python/2.7.5']),
        dependency, MAX_JOBS)

def clean_star(directory, dependencies):
    """Convert the sorted STAR BAM outputs to SAM for the merge step."""
    olddir = os.path.abspath('.')
    os.chdir(directory)
    clean1 = make_job_file('samtools view ' +
                           'pat_alignment_Aligned.sortedByCoord.out.bam' +
                           ' > pat_alignment.sam',
                           'pat_clean', '08:00:00', 2, mem=10000,
                           modules=['samtools'])
    clean2 = make_job_file('samtools view ' +
                           'mat_alignment_Aligned.sortedByCoord.out.bam' +
                           ' > mat_alignment.sam',
                           'mat_clean', '08:00:00', 2, mem=10000,
                           modules=['samtools'])
    job1 = sl.monitor_submit(clean1, dependencies, max_count=MAX_JOBS)
    job2 = sl.monitor_submit(clean2, dependencies, max_count=MAX_JOBS)
    os.chdir(olddir)
    return [job1, job2]

def clean_bowtie(directory, dependencies):
    """Sort the bowtie BAM outputs and convert them to SAM for merging."""
    olddir = os.path.abspath('.')
    os.chdir(directory)
    bowtie1 = make_job_file('samtools sort pat_alignment.bam pat_sorted\n' +
                            'samtools view pat_sorted.bam > ' +
                            'pat_alignment.sam\n' +
                            'rm pat_alignment.bam pat_sorted.bam',
                            'pat_clean', '08:00:00', 2,
                            modules=['samtools'])
    bowtie2 = make_job_file('samtools sort mat_alignment.bam mat_sorted\n' +
                            'samtools view mat_sorted.bam > ' +
                            'mat_alignment.sam\n' +
                            'rm mat_alignment.bam mat_sorted.bam',
                            'mat_clean', '08:00:00', 2,
                            modules=['samtools'])
    job1 = sl.monitor_submit(bowtie1, dependencies, max_count=MAX_JOBS)
    job2 = sl.monitor_submit(bowtie2, dependencies, max_count=MAX_JOBS)
    os.chdir(olddir)
    return [job1, job2]

def filter_fastqs(directory):
    """Trim out all reads with 'N's.

    :directory: Directory to run in, absolute path required.
    :returns:   list of job numbers
    """
    olddir = os.path.abspath('.')
    os.chdir(directory)
    filter1 = make_job_file(
        '/home/dacre/mike_tools/bin/number_fastq_records.py ' +
        '-i in.1.fastq -o in.1.filtered.fastq',
        'filter1', '02:00:00', 1, 22000, modules='python/3.3.2')
    filter2 = make_job_file(
        '/home/dacre/mike_tools/bin/number_fastq_records.py ' +
        '-i in.2.fastq -o in.2.filtered.fastq',
        'filter2', '02:00:00', 1, 22000, modules='python/3.3.2')
    job1 = sl.monitor_submit(filter1, max_count=MAX_JOBS)
    job2 = sl.monitor_submit(filter2, max_count=MAX_JOBS)
    os.chdir(olddir)
    return [job1, job2]

def merge(directory, dependencies, type):
    """Run AlleleSeq Merge Step.

    :type: Suffix of the per-chromosome map files in ../genome.
    """
    olddir = os.path.abspath('.')
    os.chdir(directory)
    # The '%s' in the map path is deliberately not formatted here: it is a
    # per-chromosome placeholder that MergeBowtie.py fills in itself.
    merge1 = make_job_file('python ' +
                           '../../AlleleSeq_pipeline_v1.2a/MergeBowtie.py ' +
                           'pat_alignment.sam mat_alignment.sam ' +
                           '../genome/%s_' + type + '.map > ' +
                           'merged_reads.sam 2> merged_reads.log',
                           'merge', '06:00:00', 1, 8000,
                           modules='python/2.7.5')
    job1 = sl.monitor_submit(merge1, dependencies, max_count=MAX_JOBS)
    os.chdir(olddir)
    return job1

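# For reference, the '%s' left in the map path above is expanded per
# chromosome inside the AlleleSeq scripts, roughly like this (sketch only;
# the 'AltRef' suffix and chromosome name are illustrative):
#
#     pattern = '../genome/%s_' + 'AltRef' + '.map'
#     map_file = pattern % 'chr1'   # -> '../genome/chr1_AltRef.map'
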
def merge_bams(name, dependency=None):
    """Use samtools to merge two bam files."""
    shortname = '.'.join(name.split('.')[:-1]) if name.endswith('.bam') \
        or name.endswith('.sam') else name
    orig_reads = shortname + '.keep.bam'
    remapped = shortname + '.remap.keep.bam'
    uname = shortname + '_wasp_final_unsorted.bam'
    final_name = shortname + '_wasp_final.bam'
    return slurmy.monitor_submit(slurmy.make_job_file(
        'samtools merge -f {} {} {}\n'.format(uname, orig_reads, remapped) +
        'samtools sort -o {} {}'.format(final_name, uname),
        shortname + '_merge', '16:00:00', 4, '26000',
        partition=PARTITION, modules='samtools'),
        dependency, MAX_JOBS)

def run_star(directory, dependencies):
    """Run STAR in directory.

    :dependencies: list of filter jobs for this directory.
    """
    olddir = os.path.abspath('.')
    os.chdir(directory)
    # Only used if the optional unzip job below is re-enabled.
    unzip = ('cat in.1.fastq.gz | /home/dacre/usr/bin/unpigz -p 16 '
             '> in.1.fastq\n'
             'cat in.2.fastq.gz | /home/dacre/usr/bin/unpigz -p 16 '
             '> in.2.fastq\n')
    star1 = ('/home/dacre/usr/bin/STAR --runThreadN 16 ' +
             '--genomeDir ../pat/pat_star ' +
             # '--readFilesIn in.1.fastq in.2.fastq ' +
             '--readFilesIn in.1.filtered.fastq in.2.filtered.fastq ' +
             '--outFilterMultimapNmax 1 ' +
             '--outFileNamePrefix pat_alignment_ ' +
             '--outSAMtype BAM SortedByCoordinate ' +
             '--outSAMattributes MD NH ' +
             '--clip5pNbases 6')
    star2 = ('/home/dacre/usr/bin/STAR --runThreadN 16 ' +
             '--genomeDir ../mat/mat_star ' +
             # '--readFilesIn in.1.fastq in.2.fastq ' +
             '--readFilesIn in.1.filtered.fastq in.2.filtered.fastq ' +
             '--outFilterMultimapNmax 1 ' +
             '--outFileNamePrefix mat_alignment_ ' +
             '--outSAMtype BAM SortedByCoordinate ' +
             '--outSAMattributes MD NH ' +
             '--clip5pNbases 6')
    # if not os.path.exists(os.path.join(directory, 'in.1.fastq')):
    #     unzip = make_job_file(unzip, 'unzip', '04:00:00', 16)
    #     unzip_job = sl.monitor_submit(unzip, dependencies,
    #                                   max_count=MAX_JOBS)
    #     dependencies = unzip_job
    star1 = make_job_file(star1, 'pat_star', '12:00:00', 16,
                          modules=['STAR'])
    star2 = make_job_file(star2, 'mat_star', '12:00:00', 16,
                          modules=['STAR'])
    job1 = sl.monitor_submit(star1, dependencies, max_count=MAX_JOBS)
    job2 = sl.monitor_submit(star2, dependencies, max_count=MAX_JOBS)
    os.chdir(olddir)
    return [job1, job2]

def count(directory, dependencies, type):
    """Run AlleleSeq Count Step."""
    olddir = os.path.abspath('.')
    os.chdir(directory)
    name = os.path.basename(directory)
    # As in merge(), the '%s' is left for SnpCounts.py to fill in per
    # chromosome.
    count1 = make_job_file('python ' +
                           '../../AlleleSeq_pipeline_v1.2a/SnpCounts.py ' +
                           '../*snps.txt merged_reads.sam ' +
                           '../genome/%s_' + type + '.map ' +
                           '{}.cnt'.format(name),
                           'count', '06:00:00', 16, 32000,
                           modules='python/2.7.5')
    job1 = sl.monitor_submit(count1, dependencies, max_count=MAX_JOBS)
    os.chdir(olddir)
    return job1

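# Taken together, the AlleleSeq helpers above are meant to be chained per
# sample directory: filter the fastqs, map against both parental genomes,
# convert the alignments to SAM, merge them, and count SNP coverage. A
# minimal sketch of that chain, assuming a STAR-based run; 'sample_dir'
# and the 'AltRef' map suffix are hypothetical stand-ins for whatever the
# surrounding pipeline passes in.
def run_alleleseq_sample(sample_dir):
    """Sketch: chain the AlleleSeq steps for one sample directory."""
    filter_jobs = filter_fastqs(sample_dir)        # strip reads with Ns
    map_jobs = run_star(sample_dir, filter_jobs)   # or run_bowtie(...)
    clean_jobs = clean_star(sample_dir, map_jobs)  # or clean_bowtie(...)
    merge_job = merge(sample_dir, clean_jobs, 'AltRef')
    return count(sample_dir, merge_job, 'AltRef')
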
def wasp_step_1(fl, snp_dir, pipeline=None, dependency=None):
    """Run find_intersecting_snps.py on fl.

    :fl:         The sam or bam file to run on.
    :snp_dir:    The SNP directory required by WASP.
    :pipeline:   The path to the WASP pipeline.
    :dependency: The job number of the initial mapping step.
    :returns:    The job number.
    """
    command = os.path.join(os.path.abspath(pipeline),
                           'find_intersecting_snps.py') \
        if pipeline else 'find_intersecting_snps.py'
    logme.log('Submitting wasp step 1 for {}'.format(fl), level='debug')
    return slurmy.monitor_submit(slurmy.make_job_file(
        'python2 {} -m 1000000 {} {}'.format(command, fl, snp_dir),
        fl + '_step1', '16:00:00', 8, '30000',
        partition=PARTITION, modules=['python/2.7.5']),
        dependency, MAX_JOBS)

def run_mapping(name, infiles, genome, algorithm='STAR', gtf=None,
                dependency=None):
    """Run read mapping using either tophat or STAR.

    :name:       A name prefix to use for the output.
    :infiles:    List of fastqs, space separated for paired end, comma
                 separated for batches. Must be a string.
                 Note: if gzipped and using STAR, they will be unzipped
                 and rezipped during mapping.
    :genome:     The genome or STAR genome index.
    :algorithm:  STAR or tophat. Case ignored.
    :gtf:        A GTF of genes for tophat, not required.
    :dependency: The job number this mapping step depends on.
    :returns:    Job number of mapping step and name of output bam.
    """
    if algorithm.lower() == 'star':
        cmnd = []
        new_list = []
        zipped = False
        # Unzip any gzipped fastqs first and track the unzipped names.
        for fl in infiles.split(' '):
            b = []
            for i in fl.split(','):
                if i.endswith('.gz'):
                    zipped = True
                    cmnd.append('/home/dacre/usr/bin/unpigz -p 16 ' + i)
                    b.append(i[:-3])
                else:
                    b.append(i)
            new_list.append(','.join(b))
        infiles = ' '.join(new_list)
        cmnd.append('/home/dacre/usr/bin/STAR --runThreadN 16 ' +
                    '--genomeDir {} '.format(genome) +
                    '--readFilesIn {} '.format(infiles) +
                    '--outFilterMultimapNmax 1 ' +
                    '--outFileNamePrefix {} '.format(name) +
                    '--outSAMtype BAM SortedByCoordinate ' +
                    '--outSAMattributes MD NH ' +
                    '--clip5pNbases 6 ' +
                    '--limitBAMsortRAM {}'.format(STAR_MEM))
        if zipped:
            # Rezip the fastqs once mapping is done.
            for fl in new_list:
                for i in fl.split(','):
                    cmnd.append(
                        '/home/dacre/usr/bin/pigz -p 16 {}'.format(i))
        command = '\n'.join(cmnd)
        outbam = name + 'Aligned.sortedByCoord.out.bam'
        modules = ['STAR']
    elif algorithm.lower() == 'tophat':
        command = 'tophat --microexon-search -o {}'.format(name + '_tophat')
        if gtf:
            command += ' -G ' + gtf
        command += ' -p 16 {} {}\n'.format(genome, infiles)
        outbam = name + '_accepted_hits.bam'
        command += 'mv {}/accepted_hits.bam {}'.format(
            name + '_tophat', outbam)
        modules = ['python/2.7.5', 'tophat']
    else:
        raise Exception('Invalid algorithm: {}'.format(algorithm))
    return (slurmy.monitor_submit(slurmy.make_job_file(
        command, name, '24:00:00', STAR_CORES, partition=PARTITION,
        modules=modules), dependency, MAX_JOBS), outbam)

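# The WASP helpers follow the standard two-pass remapping scheme: map, flag
# reads that overlap SNPs, remap the flagged reads, filter the remapped
# reads, then merge the kept reads back together. A minimal sketch of that
# chain; 'wasp_dir' is a hypothetical WASP checkout path, and the
# '.remap.fq.gz' name assumes the single-end output convention of
# find_intersecting_snps.py (paired-end runs produce .remap.fq1/.fq2).
def run_wasp_sample(name, fastqs, genome, snp_dir, wasp_dir=None):
    """Sketch: chain the WASP steps for one sample."""
    map_job, bam = run_mapping(name, fastqs, genome)
    step1_job = wasp_step_1(bam, snp_dir, pipeline=wasp_dir,
                            dependency=map_job)
    shortname = '.'.join(bam.split('.')[:-1])
    remap_job, remapped = run_mapping(name + '_remap',
                                      shortname + '.remap.fq.gz',
                                      genome, dependency=step1_job)
    step2_job = wasp_step_2(bam, remapped, pipeline=wasp_dir,
                            dependency=remap_job)
    return merge_bams(bam, dependency=step2_job)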