def start_unmapped_assembly():
    '''Start denovo assembly of unmapped reads.

    Reads a hard-coded library file, looks up the BAM file of every ID,
    passes that mapping to extract_unmapped_reads() and stores the returned
    ``unmapped`` value in the library via
    ``update_with_tag('ID', 'UNM', ..., force=True)``.

    NOTE(review): the calls returned by extract_unmapped_reads() are not
    submitted or executed here, and no assembly step follows yet -- this
    function looks like work in progress.
    '''

    # args is different when from genobox.py
    from genobox_classes import Library
    import genobox_modules

    # TODO: the library path is hard-coded; the disabled original used
    # Library(args.libfile), but this function takes no arguments.
    library = Library("libs.NA12891.txt.RISO17V9VS")
    library.read()

    # The ID -> BAM lookup was commented out while `bamfiles` was still used
    # below, which raised NameError on every call. It is restored here
    # because the update further down is keyed on 'ID' as well.
    # NOTE(review): an LB -> BAM lookup had been put in its place but was
    # never used; confirm that keying on ID is the intended behaviour.
    bamfiles = library.getValues('ID', 'BAM')
    (unmapped_calls, unmapped) = extract_unmapped_reads(bamfiles)

    # update library
    library.update_with_tag('ID', 'UNM', unmapped, force=True)
def initialize_library(
    libfile, se=[], pe1=[], pe2=[], sample="sample", mapq=[30], libs=["A"], pl=["ILLUMINA"], bams=None
):
    """Initiate a Library object from arguments.

    If ``libfile`` is given, the file is copied into the current working
    directory (base name plus a random 10-character suffix) so that it can
    be edited, the copy is read, and ``library.keep("Data", ...)`` is called
    with the files from ``se``, ``pe1`` and ``pe2`` (per the original
    comment: to remove all non-input lines).

    Otherwise a new library ``libs.<sample>.txt`` is created with one row
    per input file, in the order se, pe1, pe2, bams. IDs are
    ``<sample>_<n>`` where n counts across all files.

    Args:
        libfile: path of an existing library file, or a false value to
            build a new library from the other arguments.
        se, pe1, pe2: lists of input files; a false value or the string
            "None" means "no files".
        sample: sample name; a false value falls back to "sample".
        mapq, libs, pl: per-file values for the MAPQ, LB and PL columns.
            The value at the file's running index is used; if the list is
            too short its first value is reused. Values are stored as
            strings.
        bams: list of BAM files (or None / "None"); when given, a BAM
            column is added to the library.

    Returns:
        The Library instance.

    Raises:
        subprocess.CalledProcessError: if copying ``libfile`` fails.

    The list defaults in the signature are kept for interface
    compatibility; they are never mutated in this function.
    """
    # os and subprocess are imported locally so the function does not depend
    # on module-level imports being present.
    import os
    import random
    import string
    import subprocess

    from genobox_classes import Library

    def try_append(index, from_list, target_list):
        """Append str(from_list[index]) to target_list.

        If from_list has no such index, its first value is reused.
        """
        try:
            value = from_list[index]
        except IndexError:
            value = from_list[0]
        target_list.append(str(value))

    def given(files):
        """Return True when files holds actual input (not empty/None/"None")."""
        return bool(files) and files != "None"

    if libfile:
        # copy library file so that it can be edited
        rand = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10))
        newlibfile = os.path.join(os.getcwd(), os.path.split(libfile)[1] + "." + rand)
        # argument list instead of a shell string: paths containing spaces
        # or shell metacharacters are copied correctly
        subprocess.check_call(["cp", libfile, newlibfile])

        # create instance and read in library file
        library = Library(newlibfile)
        library.read()

        # remove all non-input lines from library file; inputs that were
        # not given (None / "None") are skipped just as in the branch below
        wanted = []
        for files in (se, pe1, pe2):
            if given(files):
                wanted.extend(files)
        library.keep("Data", wanted)
        return library

    # else create new from input
    # The fallback has to be applied before the file name is built,
    # otherwise a missing sample yields "libs.None.txt" while the IDs use
    # "sample".
    if not sample:
        sample = "sample"
    library = Library("libs.%s.txt" % sample)

    # create the library file
    f_count = 0
    (ID, Data, SM, MAPQ, LB, PL, BAM) = ([], [], [], [], [], [], [])
    for files, is_bam in ((se, False), (pe1, False), (pe2, False), (bams, True)):
        if not given(files):
            continue
        for f in files:
            ID.append("%s_%i" % (sample, f_count))
            Data.append(f)
            if is_bam:
                BAM.append(f)
            SM.append(sample)
            try_append(f_count, mapq, MAPQ)
            try_append(f_count, libs, LB)
            try_append(f_count, pl, PL)
            f_count += 1

    # the BAM column is only written when bam files were given
    if given(bams):
        library.create(ID=ID, Data=Data, SM=SM, MAPQ=MAPQ, LB=LB, PL=PL, BAM=BAM)
    else:
        library.create(ID=ID, Data=Data, SM=SM, MAPQ=MAPQ, LB=LB, PL=PL)

    return library
def start_bamprocess(library_file, bams, mapq, libs, tmpdir, queue, final_bam, realignment, known, fa, sample, partition, logger):
    '''Starts bam processing of input files.

    Builds the command lists for the stages filter/sort -> merge per
    library -> rmdup -> final merge (-> realignment, optional), wraps each
    stage in a Moab object that depends on the ids of the previous stage,
    and then waits on a Semaphore for the last stage.

    Args:
        library_file: a Library instance, the path of a library file, or a
            false value / the string 'None' to build the library from
            ``bams``, ``mapq``, ``libs`` and ``sample``.
        bams, mapq, libs, sample: passed to initialize_library() when no
            library file is given.
        tmpdir: passed to merge_bam() and rmdup().
        queue, partition: passed to every Moab object (queue also to the
            Semaphore).
        final_bam: name of the final merged bam file.
        realignment: if true, a realignment stage is added after the final
            merge.
        known, fa: passed to realign_bam().
        logger: passed to every Moab object as ``logfile``.

    Returns:
        ``final_bam``, unchanged.
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab, Semaphore, Library

    # set queueing (the return value is not needed in this function)
    genobox_modules.setSystem()
    home = os.getcwd()

    # resource requests of the stages below
    cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=345600'
    cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=345600'
    cpuG = 'nodes=1:ppn=1,mem=6gb,walltime=345600'
    cpuH = 'nodes=1:ppn=2,mem=7gb,walltime=345600'

    # create library instance
    if library_file and library_file != 'None':
        if isinstance(library_file, Library):
            library = library_file
        else:
            library = Library(library_file)
            library.read()
    else:
        # library_file is empty or the string 'None' here. Pass None rather
        # than forwarding it: initialize_library() treats any true libfile
        # as a path to copy, so 'None' would be handled as a file name.
        library = genobox_modules.initialize_library(libfile=None, sample=sample, mapq=mapq, libs=libs, bams=bams)

    (bam2lib, lib2bam) = library.getBamLibs()
    # keys and values are taken from the unmodified dict, so they line up
    lib_names = list(lib2bam.keys())
    lib_bams = list(lib2bam.values())

    ## CREATE CALLS ##

    # filter bam and sort
    (filter_sort_calls, filter_sort_files) = bam_filter_sort(lib2bam, bam2lib, 1500000000)

    # merge to libs
    (merge_lib_calls, librarys) = merge_bam(lib_names, lib_bams, add_suffix=True, final_suffix='.flt.sort.bam', tmpdir=tmpdir)

    # rmdup on libs
    (rmdup_calls, rmdup_files) = rmdup(librarys, tmpdir)

    # merge to final file (needed with and without realignment)
    (merge_final_call, _merged_file) = merge_bam([final_bam], [rmdup_files], add_suffix=False)

    # optional: realignment, reading and writing final_bam
    if realignment:
        (realign_calls, _realigned_file) = realign_bam(final_bam, final_bam, fa, known)

    ## SUBMIT JOBS ##

    print("Submitting jobs")
    filtersort_moab = Moab(filter_sort_calls, logfile=logger, runname='run_genobox_filtersort', queue=queue, cpu=cpuH, partition=partition)
    # each library merge depends on as many filter/sort jobs as the library has bams
    mergelib_moab = Moab(merge_lib_calls, logfile=logger, runname='run_genobox_lib_merge', queue=queue, cpu=cpuE, depend=True, depend_type='complex', depend_val=[len(lib) for lib in lib_bams], depend_ids=filtersort_moab.ids, partition=partition)
    # NB: If memory should be changed, also change java memory spec in rmdup function
    rmdup_moab = Moab(rmdup_calls, logfile=logger, runname='run_genobox_rmdup', queue=queue, cpu=cpuG, depend=True, depend_type='one2one', depend_val=[1], depend_ids=mergelib_moab.ids, partition=partition)
    mergefinal_moab = Moab(merge_final_call, logfile=logger, runname='run_genobox_final_merge', queue=queue, cpu=cpuC, depend=True, depend_type='conc', depend_val=[len(rmdup_moab.ids)], depend_ids=rmdup_moab.ids, partition=partition)
    if realignment:
        # realignment calls needs to be written together in a shell-file or dependent on each other
        realign_moab = Moab(realign_calls, logfile=logger, runname='run_genobox_realignment', queue=queue, cpu=cpuE, depend=True, depend_type='one2one', depend_val=[1], depend_ids=mergefinal_moab.ids, partition=partition)

    # NOTE(review): no release() is called on the Moab objects here (the
    # release step was disabled in the original) -- presumably they are
    # submitted on construction; confirm against genobox_classes.

    # semaphore: wait for the last stage of the chain
    print("Waiting for jobs to finish ...")
    last_stage = realign_moab if realignment else mergefinal_moab
    # NOTE(review): 20 and 345600 are presumably the poll interval and the
    # maximum wait in seconds -- confirm against Semaphore.
    s = Semaphore(last_stage.ids, home, 'bam_processing', queue, 20, 345600)
    s.wait()
    print("--------------------------------------")

    # return final bamfile
    return final_bam
def start_bamprocess(library_file, bams, mapq, libs, tmpdir, queue, final_bam, realignment, known, fa, sample, partition, logger):
    '''Starts bam processing of input files.

    The stages are, in order: filter and sort each bam, merge the bams of
    each library, rmdup on each library, merge everything into
    ``final_bam`` and, if requested, realign. For every stage a list of
    calls is created and handed to a Moab object that depends on the ids of
    the stage before it; a Semaphore then waits for the last stage.

    Args:
        library_file: Library instance or path of a library file. With a
            false value or the string 'None' the library is built by
            initialize_library() from ``sample``, ``mapq``, ``libs`` and
            ``bams``.
        bams, mapq, libs, sample: only used when the library has to be
            built (see ``library_file``).
        tmpdir: handed to merge_bam() and rmdup().
        queue: handed to all Moab objects and to the Semaphore.
        final_bam: name of the final bam file.
        realignment: true value switches on the realignment stage.
        known, fa: handed to realign_bam().
        partition: handed to all Moab objects.
        logger: handed to all Moab objects as ``logfile``.

    Returns:
        The unchanged ``final_bam`` argument.
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab, Semaphore, Library

    # set queueing; the value returned by setSystem() is not used here
    genobox_modules.setSystem()
    home = os.getcwd()

    # resources requested per stage
    cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=345600'
    cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=345600'
    cpuH = 'nodes=1:ppn=2,mem=7gb,walltime=345600'

    # create library instance
    if library_file and library_file != 'None':
        if isinstance(library_file, Library):
            library = library_file
        else:
            library = Library(library_file)
            library.read()
    else:
        # Forwarding library_file would hand the string 'None' to
        # initialize_library(), which takes every true libfile for a file
        # to copy. None makes it build the library from the arguments.
        library = genobox_modules.initialize_library(libfile=None, sample=sample, mapq=mapq, libs=libs, bams=bams)

    (bam2lib, lib2bam) = library.getBamLibs()
    # library names and their bam lists, in matching order
    lib_names = list(lib2bam.keys())
    lib_bams = list(lib2bam.values())

    ## CREATE CALLS ##

    # filter bam and sort
    (filter_sort_calls, filter_sort_files) = bam_filter_sort(lib2bam, bam2lib, 1500000000)

    # merge to libs
    (merge_lib_calls, librarys) = merge_bam(lib_names, lib_bams, add_suffix=True, final_suffix='.flt.sort.bam', tmpdir=tmpdir)

    # rmdup on libs
    (rmdup_calls, rmdup_files) = rmdup(librarys, tmpdir)

    # merge to final file; the same call is needed whether or not
    # realignment follows
    (merge_final_call, _merged_file) = merge_bam([final_bam], [rmdup_files], add_suffix=False)

    # optional: realignment (input and output are both final_bam)
    if realignment:
        (realign_calls, _realigned_file) = realign_bam(final_bam, final_bam, fa, known)

    ## SUBMIT JOBS ##

    print("Submitting jobs")
    filtersort_moab = Moab(filter_sort_calls, logfile=logger, runname='run_genobox_filtersort', queue=queue, cpu=cpuH, partition=partition)
    # one dependency count per library: the number of bams it is merged from
    mergelib_moab = Moab(merge_lib_calls, logfile=logger, runname='run_genobox_lib_merge', queue=queue, cpu=cpuE, depend=True, depend_type='complex', depend_val=[len(lib) for lib in lib_bams], depend_ids=filtersort_moab.ids, partition=partition)
    # NB: If memory should be changed, also change java memory spec in rmdup function
    rmdup_moab = Moab(rmdup_calls, logfile=logger, runname='run_genobox_rmdup', queue=queue, cpu=cpuE, depend=True, depend_type='one2one', depend_val=[1], depend_ids=mergelib_moab.ids, partition=partition)
    mergefinal_moab = Moab(merge_final_call, logfile=logger, runname='run_genobox_final_merge', queue=queue, cpu=cpuC, depend=True, depend_type='conc', depend_val=[len(rmdup_moab.ids)], depend_ids=rmdup_moab.ids, partition=partition)
    if realignment:
        # realignment calls needs to be written together in a shell-file or dependent on each other
        realign_moab = Moab(realign_calls, logfile=logger, runname='run_genobox_realignment', queue=queue, cpu=cpuE, depend=True, depend_type='one2one', depend_val=[1], depend_ids=mergefinal_moab.ids, partition=partition)

    # NOTE(review): the Moab objects are not released explicitly (that step
    # was disabled in the original) -- presumably construction submits the
    # jobs; confirm against genobox_classes.

    # semaphore on the last stage
    print("Waiting for jobs to finish ...")
    last_stage = realign_moab if realignment else mergefinal_moab
    # NOTE(review): 20 and 2*86400 (two days in seconds) are presumably the
    # poll interval and the maximum wait -- confirm against Semaphore.
    s = Semaphore(last_stage.ids, home, 'bam_processing', queue, 20, 2*86400)
    s.wait()
    print("--------------------------------------")

    # return final bamfile
    return final_bam
def initialize_library(libfile, se=[], pe1=[], pe2=[], sample='sample', mapq=[30], libs=['A'], pl=['ILLUMINA'], bams=None):
    '''Initiates library file from arguments.

    With ``libfile``: the file is copied to the current working directory
    as ``<basename>.<10 random characters>`` so that it can be edited, the
    copy is read into a Library and ``library.keep('Data', ...)`` is called
    with all files of ``se``, ``pe1`` and ``pe2`` (original comment: remove
    all non-input lines from the library file).

    Without ``libfile``: a new Library ``libs.<sample>.txt`` is created
    holding one row per file of se, pe1, pe2 and bams (in that order) with
    the ID ``<sample>_<running number>``.

    Args:
        libfile: path of a library file, or a false value to create a new
            library from the remaining arguments.
        se, pe1, pe2: input file lists. A false value or the string 'None'
            counts as "not given".
        sample: sample name, replaced by 'sample' if false.
        mapq, libs, pl: values for the MAPQ, LB and PL columns, looked up
            by the running file number. A list that is too short
            contributes its first value instead. All values are converted
            to strings.
        bams: bam file list (or None / 'None'). Only when given does the
            library get a BAM column.

    Returns:
        The Library instance.

    Raises:
        subprocess.CalledProcessError: when the copy of ``libfile`` fails.

    The list defaults are part of the existing interface and are left as
    they are; nothing in here modifies them.
    '''
    # local imports keep the function independent of the module header
    import os
    import random
    import string
    import subprocess

    from genobox_classes import Library

    def try_append(index, from_list, target_list):
        '''Try to append value (indexed) from list to another list.

        If the index does not exist the first value of the list is reused.
        Converts all input values to strings.
        '''
        try:
            picked = from_list[index]
        except IndexError:
            picked = from_list[0]
        target_list.append(str(picked))

    def is_given(files):
        '''True unless files is empty, None or the string 'None'.'''
        return bool(files) and files != 'None'

    if libfile:
        # copy library file so that it can be edited
        rand = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10))
        newlibfile = os.path.join(os.getcwd(), os.path.split(libfile)[1] + '.' + rand)
        # no shell involved: file names with spaces or shell
        # metacharacters are passed to cp untouched
        subprocess.check_call(['cp', libfile, newlibfile])

        # create instance and read in library file
        library = Library(newlibfile)
        library.read()

        # remove all non-input lines from library file, applying the same
        # "not given" rule as the branch that creates a new library
        inputs = []
        for files in (se, pe1, pe2):
            if is_given(files):
                inputs.extend(files)
        library.keep('Data', inputs)
        return library

    # else create new from input
    # Resolve the sample name first: done afterwards, a missing sample gave
    # the file name 'libs.None.txt' while all IDs started with 'sample'.
    if not sample:
        sample = 'sample'
    library = Library('libs.%s.txt' % sample)

    # create the library file
    f_count = 0
    (ID, Data, SM, MAPQ, LB, PL, BAM) = ([], [], [], [], [], [], [])
    for files, is_bam in ((se, False), (pe1, False), (pe2, False), (bams, True)):
        if not is_given(files):
            continue
        for f in files:
            ID.append('%s_%i' % (sample, f_count))
            Data.append(f)
            if is_bam:
                BAM.append(f)
            SM.append(sample)
            try_append(f_count, mapq, MAPQ)
            try_append(f_count, libs, LB)
            try_append(f_count, pl, PL)
            f_count += 1

    # write the BAM column only if bam files were given
    if is_given(bams):
        library.create(ID=ID, Data=Data, SM=SM, MAPQ=MAPQ, LB=LB, PL=PL, BAM=BAM)
    else:
        library.create(ID=ID, Data=Data, SM=SM, MAPQ=MAPQ, LB=LB, PL=PL)

    return library