def start_ab(args, logger):
    """Align the reads of one sample and post-process the resulting BAM files.

    Args:
        args: parsed command-line options. Read here: ``outbam``, ``libfile``,
            ``se``, ``pe1``, ``pe2``, ``sample``, ``mapq``, ``libs``, ``pl``,
            ``tmpdir``, ``queue``, ``realignment``, ``known``, ``fa`` and
            ``partition`` (``start_alignment`` receives the whole object).
        logger: log handle forwarded to the alignment and BAM-processing steps.
    """
    import os
    import subprocess

    rule = "--------------------------------------"
    requested_bam = args.outbam

    # Library description built from the command-line arguments; the alignment
    # step below returns a library object that replaces this one.
    library = genobox_modules.initialize_library(
        args.libfile, args.se, args.pe1, args.pe2,
        args.sample, args.mapq, args.libs, args.pl,
    )

    # The sample banner is only shown when a sample name was supplied.
    if args.sample:
        print(rule)
        print("Processing sample: %s" % args.sample)
        print(rule)

    print("Starting alignment")
    bamfiles, library = start_alignment(args, logger)

    print("Starting bam processing")
    aligned_bams = genobox_modules.unique(bamfiles.values())
    final_bam = start_bamprocess(
        library,
        aligned_bams,
        args.mapq,
        args.libs,
        args.tmpdir,
        args.queue,
        requested_bam,
        args.realignment,
        args.known,
        args.fa,
        args.sample,
        args.partition,
        logger,
    )

    # remove queuing system outfiles
    genobox_modules.rm_files(['run_genobox_*', 'semaphores.*'])

    print("Done")
    print(rule)
def start_ab(args, logger):
    """Align the reads of one sample and post-process the resulting BAM files.

    Args:
        args: parsed command-line options. Read here: ``outbam``, ``libfile``,
            ``se``, ``pe1``, ``pe2``, ``sample``, ``mapq``, ``libs``, ``pl``,
            ``tmpdir``, ``queue``, ``realignment``, ``known``, ``fa`` and
            ``partition`` (``start_alignment`` receives the whole object).
        logger: log handle forwarded to the alignment and BAM-processing steps.
    """
    import os
    import subprocess

    rule = "--------------------------------------"
    requested_bam = args.outbam

    # Library description built from the command-line arguments; the alignment
    # step below returns a library object that replaces this one.
    library = genobox_modules.initialize_library(
        args.libfile, args.se, args.pe1, args.pe2,
        args.sample, args.mapq, args.libs, args.pl,
    )

    # The sample banner is only shown when a sample name was supplied.
    if args.sample:
        print(rule)
        print("Processing sample: %s" % args.sample)
        print(rule)

    print("Starting alignment")
    bamfiles, library = start_alignment(args, logger)

    print("Starting bam processing")
    aligned_bams = genobox_modules.unique(bamfiles.values())
    final_bam = start_bamprocess(
        library,
        aligned_bams,
        args.mapq,
        args.libs,
        args.tmpdir,
        args.queue,
        requested_bam,
        args.realignment,
        args.known,
        args.fa,
        args.sample,
        args.partition,
        logger,
    )

    # remove queuing system outfiles
    genobox_modules.rm_files(['run_genobox_*', 'semaphores.*'])

    print("Done")
    print(rule)
def clean(f):
    """Delete the temporary and raw intermediate VCF files derived from *f*.

    The file name of *f* (directory part dropped, every ``.raw.vcf.gz``
    occurrence removed) is prefixed with ``tmp/tmp.``; each known intermediate
    suffix is appended to that stem and the resulting paths are handed to
    ``genobox_modules.rm_files``.
    """
    import genobox_modules
    import os

    # Suffixes of the intermediate files, in the order they are passed on.
    intermediate_suffixes = (
        '.header.vcf',
        '.indels.pass.vcf',
        '.indels.pass.vcf.idx',
        '.raw.vcf.gz.indels.vcf',
        '.raw.vcf.gz.indels.vcf.idx',
        '.raw.vcf.gz.ref.vcf',
        '.raw.vcf.gz.ref.vcf.idx',
        '.raw.vcf.gz.snps.vcf',
        '.raw.vcf.gz.snps.vcf.idx',
        '.ref.pass.vcf',
        '.ref.pass.vcf.idx',
        '.snps.pass.vcf',
        '.snps.pass.vcf.idx',
    )

    name = os.path.basename(f).replace('.raw.vcf.gz', '')
    stem = 'tmp/tmp.' + name

    genobox_modules.rm_files([stem + suffix for suffix in intermediate_suffixes])
def start_gv(args, logger):
    """Genotype an existing BAM file and filter the resulting calls.

    With ``args.caller == 'samtools'`` the chain is genotyping, vcf filtering,
    dbSNP annotation and bcf2ref; with ``'gatk'`` it is GATK genotyping
    followed by GATK vcf filtering. Any other caller value runs neither chain.
    Queuing-system output files are removed at the end in all cases.

    Args:
        args: parsed command-line options (``genome``, ``sample``, ``caller``,
            ``bam``, ``fa``, ``queue``, ``partition`` and the caller-specific
            filtering options used below).
        logger: log handle forwarded to every pipeline step.
    """
    import os
    import subprocess

    rule = "--------------------------------------"

    genobox_modules.check_genome(args.genome)
    final_bcf = 'genotyping/%s.all.bcf' % args.sample

    # The sample banner is only shown when a sample name was supplied.
    if args.sample:
        print(rule)
        print("Processing sample: %s" % args.sample)
        print(rule)

    caller = args.caller
    if caller == 'samtools':
        print("Starting genotyping (samtools)")
        final_bcf = start_genotyping(
            args.bam, args.genome, args.fa, args.prior, args.pp,
            args.queue, final_bcf, args.sample, args.partition, logger,
        )

        print("Starting vcffiltering")
        final_vcf = start_vcffilter(
            final_bcf, args.genome, args.caller, args.Q, args.ex, args.rmsk,
            args.ab, args.prune, args.ovar, args.queue, args.sample,
            args.partition, logger,
        )

        print("Start dbsnp")
        start_dbsnp(
            final_vcf, args.ex, args.dbsnp, args.ovar,
            args.queue, args.partition, logger,
        )

        print("Start bcf2ref")
        start_bcf2ref(
            final_bcf, args.genome, args.Q, args.ex, args.dbsnp, args.rmsk,
            'genotyping/indels_for_filtering.vcf', args.oref, args.queue,
            args.sample, args.partition, logger,
        )
    elif caller == 'gatk':
        print("Start genotyping (gatk)")
        vcffiles = start_genotyping_gatk(
            args.bam, args.genome, args.fa, args.dbsnp, args.call_conf,
            args.call_emit, args.output_mode, args.queue, args.sample,
            args.partition, logger,
        )

        print("Start vcffiltering (gatk)")
        start_vcffilter_gatk(
            vcffiles, args.genome, args.fa, args.Q, args.rmsk, args.ab,
            args.prune, args.queue, args.sample, args.partition, logger,
        )

    # remove queuing system outfiles
    genobox_modules.rm_files(['run_genobox_*', 'semaphores.*'])

    print("Done")
    print(rule)
def start_genotyping(bam, chr, fa, prior, pp, queue, o, sample, partition, logger):
    """Genotype *bam* with samtools by submitting a chain of Moab jobs.

    Jobs are created in this order, each depending on the ids of the previous
    one: bam index, mpileup, bcf combine (into *o*), bcf index. The function
    then waits on a Semaphore for the bcf-index job ids, removes the
    intermediate bcf files and returns *o*.

    Args:
        bam: input bam file.
        chr: genome/chromosome argument forwarded to ``mpileup``.
        fa: reference fasta forwarded to ``mpileup``.
        prior, pp: forwarded unchanged to ``mpileup``.
        queue: queue name used for every job and for the semaphore.
        o: output bcf path.
        sample: sample name, forwarded to ``consensus``.
        partition: partition passed to every Moab job.
        logger: log handle passed to every Moab job as ``logfile``.

    Returns:
        *o*, the path of the combined bcf file.
    """
    import subprocess
    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import os

    work_dir = 'genotyping'
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    single_core_2gb = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
    dual_core_2gb = 'nodes=1:ppn=2,mem=2gb,walltime=172800'

    # create calls
    bamindex_calls = bam_index(bam)
    mpileup_calls, bcffiles = mpileup(bam, chr, fa, prior, pp)
    bcfcombine_calls = bcf_combine(bcffiles, o)
    bcfindex_calls = bcf_index(o)
    # The consensus calls are still built, but no job is submitted for them.
    consensus_calls = consensus(o, sample)

    # Options shared by every job submission.
    shared = dict(logfile=logger, queue=queue, partition=partition)
    n_mpileup = len(mpileup_calls)

    # submit jobs
    bamindex_moab = Moab(
        bamindex_calls,
        runname='run_genobox_bamindex',
        cpu=single_core_2gb,
        **shared
    )
    mpileup_moab = Moab(
        mpileup_calls,
        runname='run_genobox_mpileup',
        cpu=dual_core_2gb,
        depend=True,
        depend_type='expand',
        depend_val=[n_mpileup],
        depend_ids=bamindex_moab.ids,
        **shared
    )
    bcfcombine_moab = Moab(
        bcfcombine_calls,
        runname='run_genobox_bcfcombine',
        cpu=single_core_2gb,
        depend=True,
        depend_type='conc',
        depend_val=[n_mpileup],
        depend_ids=mpileup_moab.ids,
        **shared
    )
    bcfindex_moab = Moab(
        bcfindex_calls,
        runname='run_genobox_bcfindex',
        cpu=single_core_2gb,
        depend=True,
        depend_type='one2one',
        depend_val=[1],
        depend_ids=bcfcombine_moab.ids,
        **shared
    )

    # semaphore (consensus is currently not waited for)
    print("Waiting for jobs to finish ...")
    # NOTE(review): 20 and 2 * 86400 look like a poll interval and a timeout
    # in seconds -- confirm against the Semaphore class.
    waiter = Semaphore(bcfindex_moab.ids, home, work_dir, queue, 20, 2 * 86400)
    waiter.wait()
    print("--------------------------------------")

    # remove temporary files
    genobox_modules.rm_files(bcffiles)

    # return output bcf
    return o
def start_gv(args, logger):
    """Genotype an existing BAM file and filter the resulting calls.

    With ``args.caller == 'samtools'`` the chain is genotyping, vcf filtering,
    dbSNP annotation and bcf2ref; with ``'gatk'`` it is GATK genotyping
    followed by GATK vcf filtering. Any other caller value runs neither chain.
    Queuing-system output files are removed at the end in all cases.

    Args:
        args: parsed command-line options (``genome``, ``sample``, ``caller``,
            ``bam``, ``fa``, ``queue``, ``partition`` and the caller-specific
            filtering options used below).
        logger: log handle forwarded to every pipeline step.
    """
    import os
    import subprocess

    rule = "--------------------------------------"

    genobox_modules.check_genome(args.genome)
    final_bcf = 'genotyping/%s.all.bcf' % args.sample

    # The sample banner is only shown when a sample name was supplied.
    if args.sample:
        print(rule)
        print("Processing sample: %s" % args.sample)
        print(rule)

    caller = args.caller
    if caller == 'samtools':
        print("Starting genotyping (samtools)")
        final_bcf = start_genotyping(
            args.bam, args.genome, args.fa, args.prior, args.pp,
            args.queue, final_bcf, args.sample, args.partition, logger,
        )

        print("Starting vcffiltering")
        final_vcf = start_vcffilter(
            final_bcf, args.genome, args.caller, args.Q, args.ex, args.rmsk,
            args.ab, args.prune, args.ovar, args.queue, args.sample,
            args.partition, logger,
        )

        print("Start dbsnp")
        start_dbsnp(
            final_vcf, args.ex, args.dbsnp, args.ovar,
            args.queue, args.partition, logger,
        )

        print("Start bcf2ref")
        start_bcf2ref(
            final_bcf, args.genome, args.Q, args.ex, args.dbsnp, args.rmsk,
            'genotyping/indels_for_filtering.vcf', args.oref, args.queue,
            args.sample, args.partition, logger,
        )
    elif caller == 'gatk':
        print("Start genotyping (gatk)")
        vcffiles = start_genotyping_gatk(
            args.bam, args.genome, args.fa, args.dbsnp, args.call_conf,
            args.call_emit, args.output_mode, args.queue, args.sample,
            args.partition, logger,
        )

        print("Start vcffiltering (gatk)")
        start_vcffilter_gatk(
            vcffiles, args.genome, args.fa, args.Q, args.rmsk, args.ab,
            args.prune, args.queue, args.sample, args.partition, logger,
        )

    # remove queuing system outfiles
    genobox_modules.rm_files(['run_genobox_*', 'semaphores.*'])

    print("Done")
    print(rule)
# NOTE(review): fragment -- this is the body of a per-chromosome VCF filtering
# routine whose `def` line (and the creation of `files`) is not visible here,
# and its statements have been collapsed onto one line; restore the line
# breaks before running it.
# Visible steps, in order: name the temporary outputs under genotyping/ keyed
# by args.chr; run vcf_filterAll on args.bcf; call vcf_tabix on the filtered
# file; annotate with dbSNP; filter repeats (manual_rmsk_filter when args.chr
# contains 'MT', vcf_filter_rmsk otherwise); run vcf_filter_indels with
# args.o (presumably the final output -- confirm); remove every path stored
# in `files`.
files['filterAll'] = 'genotyping/tmp.all.bcf.%s.flt.vcf.gz' % args.chr files['filterAll_tbi'] = 'genotyping/tmp.all.bcf.%s.flt.vcf.gz.tbi' % args.chr files['dbsnp_ann'] = 'genotyping/tmp.all.bcf.%s.flt.ann.vcf.gz' % args.chr files['rmsk'] = 'genotyping/tmp.all.bcf.%s.flt.ann.nr.vcf.gz' % args.chr files['indel_filt'] = 'genotyping/tmp.indel_filtered.%s.vcf' % args.chr # vcf_filter_All vcf_filterAll(args.bcf, args.chr_id, args.d, args.D, args.Q, args.ex, files['filterAll']) # tabix vcf_tabix(files['filterAll']) # dbsnp vcf_annotate_dbsnp(files['filterAll'], args.dbsnp, files['dbsnp_ann']) # rmsk filtering if args.chr.find('MT') > -1: # if chromosome short name is chrMT or MT run manual filtering for MT only manual_rmsk_filter(files['dbsnp_ann'], args.chr, args.rmsk, files['rmsk']) else: # filter for rmsk using BEDtools vcf_filter_rmsk(files['dbsnp_ann'], args.rmsk, files['rmsk']) # indel filter vcf_filter_indels(files['rmsk'], args.chr, args.indels, files['indel_filt'], args.o) # remove tmp files genobox_modules.rm_files(files.values())
def start_abgv(args, logger):
    """Run the full pipeline: alignment, bam processing, genotyping, filtering.

    Steps, in order: genome check, library initialisation, alignment, bam
    processing, bam statistics (started with ``wait=False``), then either the
    samtools chain (genotyping, vcf filtering, dbSNP annotation, bcf2ref) or
    the GATK chain (genotyping, vcf filtering) depending on ``args.caller``.
    Any other caller value runs neither chain. Queuing-system output files are
    removed at the end and a short summary is printed.

    Args:
        args: parsed command-line options (``genome``, ``sample``, ``caller``,
            the library options, the bam-processing options and the
            caller-specific filtering options used below).
        logger: log handle forwarded to every pipeline step.
    """
    import os
    import subprocess

    rule = "--------------------------------------"

    # check genome file
    genobox_modules.check_genome(args.genome)

    final_bam = 'alignment/%s.flt.sort.rmdup.bam' % args.sample
    final_bcf = 'genotyping/%s.all.bcf' % args.sample

    # Library description built from the command-line arguments; the alignment
    # step below returns a library object that replaces this one.
    library = genobox_modules.initialize_library(
        args.libfile, args.se, args.pe1, args.pe2,
        args.sample, args.mapq, args.libs, args.pl,
    )

    # The sample banner is only shown when a sample name was supplied.
    if args.sample:
        print(rule)
        print("Processing sample: %s" % args.sample)
        print(rule)

    # The read-trimming step is currently disabled.

    print("Starting alignment")
    (bamfiles, library) = start_alignment(args, logger)

    print("Starting bam processing")
    final_bam = start_bamprocess(
        library, genobox_modules.unique(bamfiles.values()), args.mapq,
        args.libs, args.tmpdir, args.queue, final_bam, args.realignment,
        args.known, args.fa, args.sample, args.partition, logger,
    )

    print("Starting bam stats")
    start_bamstats(args, final_bam, args.partition, logger, wait=False)

    print("Starting genotyping")
    if args.caller == 'samtools':
        final_bcf = start_genotyping(
            final_bam, args.genome, args.fa, args.prior, args.pp,
            args.queue, final_bcf, args.sample, args.partition, logger,
        )

        print("Starting vcffiltering")
        final_vcf = start_vcffilter(
            final_bcf, args.genome, args.caller, args.Q, args.ex, args.rmsk,
            args.ab, args.prune, args.ovar, args.queue, args.sample,
            args.partition, logger,
        )

        print("Start dbsnp")
        final_dbsnp_vcf = start_dbsnp(
            final_vcf, args.ex, args.dbsnp, args.ovar,
            args.queue, args.partition, logger,
        )

        print("Start bcf2ref")
        start_bcf2ref(
            final_bcf, args.genome, args.Q, args.ex, args.dbsnp, args.rmsk,
            'genotyping/indels_for_filtering.vcf', args.oref, args.queue,
            args.sample, args.partition, logger,
        )
    elif args.caller == 'gatk':
        print("Start genotyping (gatk)")
        # Fixed: this read `args.args.call_emit`, which raises AttributeError.
        vcffiles = start_genotyping_gatk(
            final_bam, args.genome, args.fa, args.dbsnp, args.call_conf,
            args.call_emit, args.output_mode, args.queue, args.sample,
            args.partition, logger,
        )

        print("Start vcffiltering (gatk)")
        # Fixed: the log handle is the `logger` parameter, not `args.logger`.
        final_vcfs = start_vcffilter_gatk(
            vcffiles, args.genome, args.fa, args.Q, args.rmsk, args.ab,
            args.prune, args.queue, args.sample, args.partition, logger,
        )

    # remove queuing system outfiles
    genobox_modules.rm_files(['run_genobox_*', 'semaphores.*'])

    print("Done")
    print(rule)
    # NOTE(review): the bcf requested above is 'genotyping/<sample>.all.bcf',
    # not 'genotyping/all.bcf' -- confirm whether this message should change.
    print("Raw genotyping is written in genotyping/all.bcf")
    print("High confidence variants: %s" % args.ovar)
    print("High confidence reference: %s" % args.oref)
    print(rule)
def start_abgv(args, logger):
    """Run the full pipeline: alignment, bam processing, genotyping, filtering.

    Steps, in order: genome check, library initialisation, alignment, bam
    processing, bam statistics (started with ``wait=False``), then either the
    samtools chain (genotyping, vcf filtering, dbSNP annotation, bcf2ref) or
    the GATK chain (genotyping, vcf filtering) depending on ``args.caller``.
    Any other caller value runs neither chain. Queuing-system output files are
    removed at the end and a short summary is printed.

    Args:
        args: parsed command-line options (``genome``, ``sample``, ``caller``,
            the library options, the bam-processing options and the
            caller-specific filtering options used below).
        logger: log handle forwarded to every pipeline step.
    """
    import os
    import subprocess

    rule = "--------------------------------------"

    # check genome file
    genobox_modules.check_genome(args.genome)

    final_bam = "alignment/%s.flt.sort.rmdup.bam" % args.sample
    final_bcf = "genotyping/%s.all.bcf" % args.sample

    # Library description built from the command-line arguments; the alignment
    # step below returns a library object that replaces this one.
    library = genobox_modules.initialize_library(
        args.libfile, args.se, args.pe1, args.pe2,
        args.sample, args.mapq, args.libs, args.pl,
    )

    # The sample banner is only shown when a sample name was supplied.
    if args.sample:
        print(rule)
        print("Processing sample: %s" % args.sample)
        print(rule)

    # The read-trimming step is currently disabled.

    print("Starting alignment")
    (bamfiles, library) = start_alignment(args, logger)

    print("Starting bam processing")
    final_bam = start_bamprocess(
        library,
        genobox_modules.unique(bamfiles.values()),
        args.mapq,
        args.libs,
        args.tmpdir,
        args.queue,
        final_bam,
        args.realignment,
        args.known,
        args.fa,
        args.sample,
        args.partition,
        logger,
    )

    print("Starting bam stats")
    start_bamstats(args, final_bam, args.partition, logger, wait=False)

    print("Starting genotyping")
    if args.caller == "samtools":
        final_bcf = start_genotyping(
            final_bam,
            args.genome,
            args.fa,
            args.prior,
            args.pp,
            args.queue,
            final_bcf,
            args.sample,
            args.partition,
            logger,
        )

        print("Starting vcffiltering")
        final_vcf = start_vcffilter(
            final_bcf,
            args.genome,
            args.caller,
            args.Q,
            args.ex,
            args.rmsk,
            args.ab,
            args.prune,
            args.ovar,
            args.queue,
            args.sample,
            args.partition,
            logger,
        )

        print("Start dbsnp")
        final_dbsnp_vcf = start_dbsnp(
            final_vcf, args.ex, args.dbsnp, args.ovar, args.queue, args.partition, logger
        )

        print("Start bcf2ref")
        start_bcf2ref(
            final_bcf,
            args.genome,
            args.Q,
            args.ex,
            args.dbsnp,
            args.rmsk,
            "genotyping/indels_for_filtering.vcf",
            args.oref,
            args.queue,
            args.sample,
            args.partition,
            logger,
        )
    elif args.caller == "gatk":
        print("Start genotyping (gatk)")
        # Fixed: this read `args.args.call_emit`, which raises AttributeError.
        vcffiles = start_genotyping_gatk(
            final_bam,
            args.genome,
            args.fa,
            args.dbsnp,
            args.call_conf,
            args.call_emit,
            args.output_mode,
            args.queue,
            args.sample,
            args.partition,
            logger,
        )

        print("Start vcffiltering (gatk)")
        # Fixed: the log handle is the `logger` parameter, not `args.logger`.
        final_vcfs = start_vcffilter_gatk(
            vcffiles,
            args.genome,
            args.fa,
            args.Q,
            args.rmsk,
            args.ab,
            args.prune,
            args.queue,
            args.sample,
            args.partition,
            logger,
        )

    # remove queuing system outfiles
    genobox_modules.rm_files(["run_genobox_*", "semaphores.*"])

    print("Done")
    print(rule)
    # NOTE(review): the bcf requested above is 'genotyping/<sample>.all.bcf',
    # not 'genotyping/all.bcf' -- confirm whether this message should change.
    print("Raw genotyping is written in genotyping/all.bcf")
    print("High confidence variants: %s" % args.ovar)
    print("High confidence reference: %s" % args.oref)
    print(rule)
# NOTE(review): fragment -- this is the body of a variant-filtering routine
# whose `def` line is not visible here, and its statements have been collapsed
# onto one line; restore the line breaks before running it.
# Visible steps, in order: set up the system paths and remember the working
# directory; load the genome description; run bcf2varfilter on args.bcf;
# concatenate the per-part vcfs; filter annotated repeats (rmsk); filter
# heterozygote calls on haploid chromosomes; filter on allelic balance; prune
# nearby calls into args.o; write the indels used for filtering reference
# calls; remove the genotyping/tmp.flt* temporaries.
paths = genobox_modules.setSystem() home = os.getcwd() # get genome file genome = get_genome(args.genome) # perform varfilter to get filtered vcf vcf_files = bcf2varfilter(args.bcf, genome, args.Q, 'genotyping/tmp.flt.') # combine to one file cat_vcfs(vcf_files, 'genotyping/tmp.flt.all.vcf') # remove in annotated repeat (rmsk) vcf_filter_rmsk('genotyping/tmp.flt.all.vcf', args.rmsk, 'genotyping/tmp.flt.all.rmsk.vcf') # filter haploid chromosomes for heterozygote calls vcf_filter_haploid('genotyping/tmp.flt.all.rmsk.vcf', genome, 'genotyping/tmp.flt.all.rmsk.hetfilt.vcf') # filter for allelic balance vcf_filter_allelic_balance('genotyping/tmp.flt.all.rmsk.hetfilt.vcf', args.ab, args.caller, 'genotyping/tmp.flt.all.rmsk.hetfilt.abfilt.vcf') # pruning of nearby calls vcf_filter_prune('genotyping/tmp.flt.all.rmsk.hetfilt.abfilt.vcf', args.prune, args.o) # write indels for filtering of reference calls write_indels_for_filtering(args.o, args.ex) # remove temporary files genobox_modules.rm_files(['genotyping/tmp.flt*'])
# NOTE(review): fragment -- this is the body of a per-chromosome VCF filtering
# routine whose `def` line is not visible here, and its statements have been
# collapsed onto one line; restore the line breaks before running it.
# Visible steps, in order: name the temporary outputs under genotyping/ keyed
# by args.chr; run vcf_filterAll on args.bcf; call vcf_tabix on the filtered
# file; annotate with dbSNP; filter repeats (manual_rmsk_filter when args.chr
# contains "MT", vcf_filter_rmsk otherwise); run vcf_filter_indels with
# args.o (presumably the final output -- confirm); remove every path stored
# in `files`.
files = {} files["filterAll"] = "genotyping/tmp.all.bcf.%s.flt.vcf.gz" % args.chr files["filterAll_tbi"] = "genotyping/tmp.all.bcf.%s.flt.vcf.gz.tbi" % args.chr files["dbsnp_ann"] = "genotyping/tmp.all.bcf.%s.flt.ann.vcf.gz" % args.chr files["rmsk"] = "genotyping/tmp.all.bcf.%s.flt.ann.nr.vcf.gz" % args.chr files["indel_filt"] = "genotyping/tmp.indel_filtered.%s.vcf" % args.chr # vcf_filter_All vcf_filterAll(args.bcf, args.chr_id, args.d, args.D, args.Q, args.ex, files["filterAll"]) # tabix vcf_tabix(files["filterAll"]) # dbsnp vcf_annotate_dbsnp(files["filterAll"], args.dbsnp, files["dbsnp_ann"]) # rmsk filtering if args.chr.find("MT") > -1: # if chromosome short name is chrMT or MT run manual filtering for MT only manual_rmsk_filter(files["dbsnp_ann"], args.chr, args.rmsk, files["rmsk"]) else: # filter for rmsk using BEDtools vcf_filter_rmsk(files["dbsnp_ann"], args.rmsk, files["rmsk"]) # indel filter vcf_filter_indels(files["rmsk"], args.chr, args.indels, files["indel_filt"], args.o) # remove tmp files genobox_modules.rm_files(files.values())
# NOTE(review): fragment -- this is the body of a variant-filtering routine
# whose `def` line is not visible here, and its statements have been collapsed
# onto one line; restore the line breaks before running it.
# Visible steps, in order: load the genome description; run bcf2varfilter on
# args.bcf; concatenate the per-part vcfs; filter annotated repeats (rmsk);
# filter heterozygote calls on haploid chromosomes; filter on allelic balance;
# prune nearby calls into args.o; write the indels used for filtering
# reference calls; remove the genotyping/tmp.flt* temporaries.
genome = get_genome(args.genome) # perform varfilter to get filtered vcf vcf_files = bcf2varfilter(args.bcf, genome, args.Q, 'genotyping/tmp.flt.') # combine to one file cat_vcfs(vcf_files, 'genotyping/tmp.flt.all.vcf') # remove in annotated repeat (rmsk) vcf_filter_rmsk('genotyping/tmp.flt.all.vcf', args.rmsk, 'genotyping/tmp.flt.all.rmsk.vcf') # filter haploid chromosomes for heterozygote calls vcf_filter_haploid('genotyping/tmp.flt.all.rmsk.vcf', genome, 'genotyping/tmp.flt.all.rmsk.hetfilt.vcf') # filter for allelic balance vcf_filter_allelic_balance('genotyping/tmp.flt.all.rmsk.hetfilt.vcf', args.ab, args.caller, 'genotyping/tmp.flt.all.rmsk.hetfilt.abfilt.vcf') # pruning of nearby calls vcf_filter_prune('genotyping/tmp.flt.all.rmsk.hetfilt.abfilt.vcf', args.prune, args.o) # write indels for filtering of reference calls write_indels_for_filtering(args.o, args.ex) # remove temporary files genobox_modules.rm_files(['genotyping/tmp.flt*'])
#!/panvol1/simon/bin/python2.7
"""Remove leftover job output files and the ``trimmed`` directory.

Operates on the current working directory.
"""

from genobox_modules import rm_files
import shutil
import subprocess
import os

# File patterns left behind by earlier pipeline steps (presumably expanded as
# globs by rm_files -- confirm against genobox_modules).
LEFTOVER_PATTERNS = [
    'run_genobox_velveth.*',
    'run_genobox_velvetg.*',
    'run_genobox_interleave.*',
    '*.interleaved',
    'pbsjob.tmp*',
    'run_genobox_velvetaccept.*',
    'run_mlst_trim.*',
]

rm_files(LEFTOVER_PATTERNS)

# Delete the directory in-process instead of spawning a shell for `rm -r`.
# ignore_errors keeps this best-effort: a failed removal does not abort.
if os.path.exists('trimmed'):
    shutil.rmtree('trimmed', ignore_errors=True)
#!/panvol1/simon/bin/python2.7
"""Remove leftover job output files and the ``trimmed`` directory.

Operates on the current working directory.
"""

from genobox_modules import rm_files
import shutil
import subprocess
import os

# File patterns left behind by earlier pipeline steps (presumably expanded as
# globs by rm_files -- confirm against genobox_modules).
LEFTOVER_PATTERNS = [
    'run_genobox_velveth.*',
    'run_genobox_velvetg.*',
    'run_genobox_interleave.*',
    '*.interleaved',
    'pbsjob.tmp*',
    'run_genobox_velvetaccept.*',
    'run_mlst_trim.*',
]

rm_files(LEFTOVER_PATTERNS)

# Delete the directory in-process instead of spawning a shell for `rm -r`.
# ignore_errors keeps this best-effort: a failed removal does not abort.
if os.path.exists('trimmed'):
    shutil.rmtree('trimmed', ignore_errors=True)
def start_genotyping(bam, chr, fa, prior, pp, queue, o, sample, partition, logger):
    """Genotype *bam* with samtools by submitting a chain of Moab jobs.

    Jobs are created in this order, each depending on the ids of the previous
    one: bam index, mpileup, bcf combine (into *o*), bcf index. The function
    then waits on a Semaphore for the bcf-index job ids, removes the
    intermediate bcf files and returns *o*.

    Args:
        bam: input bam file.
        chr: genome/chromosome argument forwarded to ``mpileup``.
        fa: reference fasta forwarded to ``mpileup``.
        prior, pp: forwarded unchanged to ``mpileup``.
        queue: queue name used for every job and for the semaphore.
        o: output bcf path.
        sample: sample name, forwarded to ``consensus``.
        partition: partition passed to every Moab job.
        logger: log handle passed to every Moab job as ``logfile``.

    Returns:
        *o*, the path of the combined bcf file.
    """
    import subprocess
    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import os

    work_dir = 'genotyping'
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    single_core_2gb = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
    dual_core_2gb = 'nodes=1:ppn=2,mem=2gb,walltime=172800'

    # create calls
    bamindex_calls = bam_index(bam)
    mpileup_calls, bcffiles = mpileup(bam, chr, fa, prior, pp)
    bcfcombine_calls = bcf_combine(bcffiles, o)
    bcfindex_calls = bcf_index(o)
    # The consensus calls are still built, but no job is submitted for them.
    consensus_calls = consensus(o, sample)

    # Options shared by every job submission.
    shared = dict(logfile=logger, queue=queue, partition=partition)
    n_mpileup = len(mpileup_calls)

    # submit jobs
    bamindex_moab = Moab(
        bamindex_calls,
        runname='run_genobox_bamindex',
        cpu=single_core_2gb,
        **shared
    )
    mpileup_moab = Moab(
        mpileup_calls,
        runname='run_genobox_mpileup',
        cpu=dual_core_2gb,
        depend=True,
        depend_type='expand',
        depend_val=[n_mpileup],
        depend_ids=bamindex_moab.ids,
        **shared
    )
    bcfcombine_moab = Moab(
        bcfcombine_calls,
        runname='run_genobox_bcfcombine',
        cpu=single_core_2gb,
        depend=True,
        depend_type='conc',
        depend_val=[n_mpileup],
        depend_ids=mpileup_moab.ids,
        **shared
    )
    bcfindex_moab = Moab(
        bcfindex_calls,
        runname='run_genobox_bcfindex',
        cpu=single_core_2gb,
        depend=True,
        depend_type='one2one',
        depend_val=[1],
        depend_ids=bcfcombine_moab.ids,
        **shared
    )

    # semaphore (consensus is currently not waited for)
    print("Waiting for jobs to finish ...")
    # NOTE(review): 20 and 2 * 86400 look like a poll interval and a timeout
    # in seconds -- confirm against the Semaphore class.
    waiter = Semaphore(bcfindex_moab.ids, home, work_dir, queue, 20, 2 * 86400)
    waiter.wait()
    print("--------------------------------------")

    # remove temporary files
    genobox_modules.rm_files(bcffiles)

    # return output bcf
    return o