def sortIndexBAMs(self, path_to_exe=False, force=False, max_cpus=-1):
    if not path_to_exe:
        path_to_exe = _get_exe_path('samtools')

    processes = set()
    max_processes = _decide_max_processes(max_cpus)

    paths_to_BAMs_dd_si = []
    for SAM in self.paths_to_BAMs_dd:
        BAM_out = SAM[:-4] + '_si.bam'
        if not _os.path.exists(BAM_out) or force:
            # coordinate-sort then index each deduplicated BAM; note this is
            # the legacy (pre-1.0) samtools 'sort <in> <out prefix>' syntax
            cmd = '{0} sort {1} {2}_si; {0} index {2}_si.bam'.format(
                path_to_exe, SAM, SAM[:-4])
            print('Called: %s' % cmd)
            processes.add(_subprocess.Popen(cmd, shell=True))
            if len(processes) >= max_processes:
                (pid, exit_status) = _os.wait()
                processes.difference_update(
                    [p for p in processes if p.poll() is not None])
        else:
            print('Found:')
            print(BAM_out)
            print('use "force = True" to overwrite')

        paths_to_BAMs_dd_si += [BAM_out]

    # Check if all the child processes were closed
    for p in processes:
        if p.poll() is None:
            p.wait()

    self.paths_to_BAMs_dd_si = paths_to_BAMs_dd_si
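# The methods in this module all throttle concurrency the same way: keep a
# set of subprocess.Popen handles, block in os.wait() once the set is full,
# then discard handles whose poll() is no longer None. A minimal,
# self-contained sketch of that pattern (assumptions: POSIX only, because
# os.wait() is unavailable on Windows; 'sleep 1' stands in for a real
# samtools call):
import os
import subprocess

def _run_throttled_sketch(cmds, max_processes=2):
    """Run shell commands concurrently, at most max_processes at a time."""
    processes = set()
    for cmd in cmds:
        processes.add(subprocess.Popen(cmd, shell=True))
        if len(processes) >= max_processes:
            # block until any child changes state ...
            os.wait()
            # ... then forget children that have already exited
            processes.difference_update(
                [p for p in processes if p.poll() is not None])
    # wait for any stragglers
    for p in processes:
        if p.poll() is None:
            p.wait()

# e.g. _run_throttled_sketch(['sleep 1'] * 4, max_processes=2)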
def removeDuplicates(self, path_to_jar=False, force=False, mem_num_gigs=2,
                     max_cpus=-1):
    if not path_to_jar:
        path_to_jar = _get_jar_path('picard')

    processes = set()
    max_processes = _decide_max_processes(max_cpus)
    print('Will use %s cpus for picard' % max_processes)

    paths_to_BAMs_dd = []
    for BAM in self.paths_to_BAMs:
        BAM_out = BAM[:-4] + '_dd.bam'
        if not _os.path.exists(BAM_out) or force:
            # an alternative parallel code with polling not waiting
            # while len(processes) >= max_processes:
            #     _sleep(1)
            #     print('processes: %s' % (len(processes)))
            #     processes.difference_update(
            #         [p for p in processes if p.poll() is not None]
            #     )
            picard_command = ['MarkDuplicates', 'I=', BAM, 'O=', BAM_out,
                              'M=', BAM[:-4] + '_dd.log']
                              #, 'VALIDATION_STRINGENCY=', 'LENIENT']
            cmd = ['java', '-Xmx%sg' % mem_num_gigs, '-jar', path_to_jar] + \
                  picard_command
            processes.add(_subprocess.Popen(cmd, shell=False))
            print('Called: %s' % (' '.join(map(str, cmd))))
            #_subprocess.call(cmd, shell=False)
            print('processes: %s, max_processes: %s' % (len(processes),
                                                        max_processes))
            if len(processes) >= max_processes:
                (pid, exit_status) = _os.wait()
                processes.difference_update(
                    [p for p in processes if p.poll() is not None])
        else:
            print('Found:')
            print(BAM_out)
            print('use "force = True" to overwrite')

        paths_to_BAMs_dd += [BAM_out]

    # Check if all the child processes were closed
    for p in processes:
        if p.poll() is None:
            p.wait()

    self.paths_to_BAMs_dd = paths_to_BAMs_dd
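# The commented-out alternative above polls instead of blocking in
# os.wait(); a runnable sketch of that variant (assumption: illustrative
# only, with 'sleep 1' standing in for the java/picard invocation):
import subprocess
import time

def _run_polling_sketch(cmds, max_processes=2, interval=1):
    processes = set()
    for cmd in cmds:
        # spin until a slot frees up; portable to platforms without
        # os.wait(), at the cost of up to `interval` seconds of latency
        while len(processes) >= max_processes:
            time.sleep(interval)
            processes.difference_update(
                [p for p in processes if p.poll() is not None])
        processes.add(subprocess.Popen(cmd, shell=True))
    for p in processes:
        p.wait()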
def trim(self, path_to_exe=False, force=False, max_cpus=-1):
    if not path_to_exe:
        exe_sickle = _get_exe_path('sickle')
    else:
        exe_sickle = _os.path.sep.join(path_to_exe)

    e1 = ('Could not find "adaptorcut_read_files" attribute. '
          'Before quality score trimming, reads must be cleaned of '
          'library preparation sequences. Please run cutAdaptors() '
          'method on this Reads instance.')
    assert hasattr(self, 'adaptorcut_read_files'), e1

    e2 = ('Could not find %s. Either run cutAdaptors() again '
          'or ensure file exists')
    for pairname, files in self.adaptorcut_read_files.items():
        assert _os.path.exists(files[1]), e2 % files[1]
        # second member of the pair (was erroneously files[1] twice)
        assert _os.path.exists(files[2]), e2 % files[2]

    trimmed_read_files = {}
    print(sorted(self.adaptorcut_read_files))
    cmds = []
    processed_paths_to_do = []
    for pairname, files in self.adaptorcut_read_files.items():
        processed_path_1 = insert_suffix(files[1], '_qual')
        processed_path_2 = insert_suffix(files[2], '_qual')
        processed_path_s = insert_suffix(files[2], '_singletons_qual')
        # Illumina quality using CASAVA >= 1.8 is Sanger encoded
        QSscore_scale = 'sanger'
        cmd = [exe_sickle, 'pe',
               '-f', files[1], '-r', files[2],
               '-t', QSscore_scale,
               '-o', processed_path_1, '-p', processed_path_2,
               '-s', processed_path_s,
               # quality 25, length 50 (of 150)
               '-q', '25', '-l', '50']
        if not all([_os.path.exists(processed_path_1),
                    _os.path.exists(processed_path_2),
                    _os.path.exists(processed_path_s)]) or force:
            # collect expected outputs
            processed_paths_to_do += [(processed_path_1, processed_path_2,
                                       processed_path_s)]
            # collect all the commands to be issued
            cmds += [(pairname, cmd)]
        else:
            print('Found:')
            print(processed_path_1)
            print(processed_path_2)
            print(processed_path_s)
            print('use "force = True" to overwrite')
            trimmed_read_files[pairname] = {}
            trimmed_read_files[pairname][1] = processed_path_1
            trimmed_read_files[pairname][2] = processed_path_2

    if len(cmds):
        max_processes = _decide_max_processes(max_cpus)
        processes = {}
        ### how to combine this which hangs on _os.wait()
        for pairname, cmd in cmds:
            print('Called: "%s"' % ' '.join(cmd))
            # process is key, open file being piped to is value
            # baga CollectReads currently includes path in pairname
            this_stdout_file = open(pairname + '_sickle.log', 'w')
            thisprocess = _subprocess.Popen(cmd, shell=False,
                                            stdout=this_stdout_file)
            processes[thisprocess] = this_stdout_file
            if len(processes) >= max_processes:
                _os.wait()
                finished = dict([(p, f) for p, f in processes.items()
                                 if p.poll() is not None])
                # close files for finished processes
                for process, stdout_file in finished.items():
                    stdout_file.close()
                    # update active processes
                    del processes[process]

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        fails = []
        for (pairname, cmd), (processed_path_1, processed_path_2,
                              processed_path_s) in zip(cmds,
                                                       processed_paths_to_do):
            if _os.path.exists(processed_path_1) and \
                    _os.path.exists(processed_path_2):
                print('Found:')
                print(processed_path_1)
                print(processed_path_2)
                trimmed_read_files[pairname] = {}
                trimmed_read_files[pairname][1] = processed_path_1
                trimmed_read_files[pairname][2] = processed_path_2
            else:
                print('Processing of the following pair seems to have failed')
                print(processed_path_1)
                print(processed_path_2)
                fails += [(processed_path_1, processed_path_2)]

        assert len(fails) == 0, ('There was a problem finding all of the '
            'output from sickle. Try repeating this or an earlier step with '
            'the --force option to overwrite previous, possibly incomplete, '
            'files')

    self.trimmed_read_files = trimmed_read_files
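# insert_suffix() is called above (and in cutAdaptors below) but is not
# defined in this section; a plausible minimal implementation is sketched
# here (assumption: it inserts the suffix immediately before the read
# file's extension, so 'sample_R1.fastq.gz' -> 'sample_R1_qual.fastq.gz'):
def _insert_suffix_sketch(path, suffix, ext='.fastq'):
    head, sep, tail = path.partition(ext)
    # fail loudly if the expected extension is absent
    assert sep, 'expected {} in {}'.format(ext, path)
    return head + suffix + sep + tail

# e.g. _insert_suffix_sketch('sample_R1.fastq.gz', '_qual')
#      -> 'sample_R1_qual.fastq.gz'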
def cutAdaptors(self, path_to_exe=False, force=False, max_cpus=-1):
    if not path_to_exe:
        path_to_exe = _get_exe_path('cutadapt')

    adaptorcut_read_files = {}
    adaptor_seqs = [
        'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC',
        'AGATCGGAAGAGCACACGTCT',
        'AGATCGGAAGAGC',
        'GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG',
        'ACACTCTTTCCCTACACGACGCTCTTCCGATCT',
    ]

    cmds = []
    processed_paths_to_do = []
    for cnum, (pairname, files) in enumerate(self.read_files.items()):
        processed_path_1 = insert_suffix(files[1], '_adpt')
        processed_path_2 = insert_suffix(files[2], '_adpt')
        # print(files[1], processed_path_1)
        # print(files[2], processed_path_2)

        # single end (dead code: immediately superseded by the paired-end
        # command below)
        cmd = [path_to_exe] + \
              [a for b in [('-a', a) for a in adaptor_seqs] for a in b] + \
              ['-o', processed_path_1, files[1]]

        # paired end
        cmd = [path_to_exe] + \
              [a for b in [('-a', a) for a in adaptor_seqs] for a in b] + \
              [a for b in [('-A', a) for a in adaptor_seqs] for a in b] + \
              ['-o', processed_path_1, '-p', processed_path_2] + \
              [files[1], files[2]]

        if not all([_os.path.exists(processed_path_1),
                    _os.path.exists(processed_path_2)]) or force:
            # collect expected outputs
            processed_paths_to_do += [(processed_path_1, processed_path_2)]
            # collect all the commands to be issued
            cmds += [(pairname, cmd)]
        else:
            print('Found:')
            print(processed_path_1)
            print(processed_path_2)
            print('use "force = True" to overwrite')
            adaptorcut_read_files[pairname] = {}
            adaptorcut_read_files[pairname][1] = processed_path_1
            adaptorcut_read_files[pairname][2] = processed_path_2

    if len(cmds):
        max_processes = _decide_max_processes(max_cpus)
        processes = {}
        ### how to combine this which hangs on _os.wait()
        for pairname, cmd in cmds:
            print('Called: "%s"' % ' '.join(cmd))
            # process is key, open file being piped to is value
            # baga CollectReads currently includes path in pairname
            this_stdout_file = open(pairname + '_cutadapt.log', 'w')
            thisprocess = _subprocess.Popen(cmd, shell=False,
                                            stdout=this_stdout_file)
            processes[thisprocess] = this_stdout_file
            if len(processes) >= max_processes:
                _os.wait()
                finished = dict([(p, f) for p, f in processes.items()
                                 if p.poll() is not None])
                # close files for finished processes
                for process, stdout_file in finished.items():
                    stdout_file.close()
                    # update active processes
                    del processes[process]

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        fails = []
        for (pairname, cmd), (processed_path_1,
                              processed_path_2) in zip(cmds,
                                                       processed_paths_to_do):
            if _os.path.exists(processed_path_1) and \
                    _os.path.exists(processed_path_2):
                print('Found:')
                print(processed_path_1)
                print(processed_path_2)
                adaptorcut_read_files[pairname] = {}
                adaptorcut_read_files[pairname][1] = processed_path_1
                adaptorcut_read_files[pairname][2] = processed_path_2
            else:
                print('Processing of the following pair seems to have failed')
                print(processed_path_1)
                print(processed_path_2)
                fails += [(processed_path_1, processed_path_2)]

        assert len(fails) == 0, ('There was a problem finding all of the '
            'output from cutadapt. Try repeating this or an earlier step '
            'with the --force option to overwrite previous, possibly '
            'incomplete, files')

    self.adaptorcut_read_files = adaptorcut_read_files
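# The nested comprehension used above turns the adaptor sequences into
# interleaved cutadapt flags: ['-a', seq1, '-a', seq2, ...]. An equivalent,
# arguably clearer spelling with itertools (illustrative only):
from itertools import chain

adaptor_seqs_example = ['AGATCGGAAGAGC',
                        'ACACTCTTTCCCTACACGACGCTCTTCCGATCT']
adaptor_args = list(
    chain.from_iterable(('-a', s) for s in adaptor_seqs_example))
# adaptor_args == ['-a', 'AGATCGGAAGAGC',
#                  '-a', 'ACACTCTTTCCCTACACGACGCTCTTCCGATCT']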
def IndelRealignGATK(self,
                     jar=['external_programs', 'GenomeAnalysisTK',
                          'GenomeAnalysisTK.jar'],
                     picard_jar=False, samtools_exe=False, use_java='java',
                     force=False, mem_num_gigs=2, max_cpus=-1):
    # GATK is manually downloaded by user and placed in folder of their choice
    jar = _os.path.sep.join(jar)
    if not picard_jar:
        picard_jar = _get_jar_path('picard')
    if not samtools_exe:
        samtools_exe = _get_exe_path('samtools')

    genome_fna = 'genome_sequences/{}.fna'.format(self.genome_id)

    e1 = ('Could not find "paths_to_BAMs_dd_si" attribute. Before starting '
          'GATK analysis, read alignments must have duplicates removed. '
          'Please run: .toBAMS(), .removeDuplicates(), .sortIndexBAMs() '
          'methods on this SAMs instance, or --deduplicate if using '
          'baga_cli.py.')
    assert hasattr(self, 'paths_to_BAMs_dd_si'), e1

    e2 = 'Could not find %s. Please ensure file exists'
    for BAM in self.paths_to_BAMs_dd_si:
        assert _os.path.exists(BAM), e2 % BAM

    if not _os.path.exists(genome_fna[:-4] + '.dict'):
        print('Creating sequence dictionary for %s' % genome_fna)
        _subprocess.call([use_java, '-jar', picard_jar,
                          'CreateSequenceDictionary',
                          'R=', genome_fna,
                          'O=', genome_fna[:-4] + '.dict'])

    #have_index_files = [_os.path.exists(genome_fna + '.' + a)
    #                    for a in ('ann', 'pac', 'amb', 'bwt', 'sa', 'fai')]
    have_index_files = [_os.path.exists(genome_fna + '.' + a)
                        for a in ('fai',)]
    if not all(have_index_files):
        print('Writing index files for %s' % genome_fna)
        _subprocess.call([samtools_exe, 'faidx', genome_fna])

    processes = set()
    max_processes = _decide_max_processes(max_cpus)

    for BAM in self.paths_to_BAMs_dd_si:
        intervals = BAM[:-4] + '.intervals'
        if not _os.path.exists(intervals) or force:
            cmd = [use_java, '-Xmx%sg' % mem_num_gigs, '-jar', jar,
                   '-T', 'RealignerTargetCreator',
                   '-R', genome_fna,
                   '-I', BAM,
                   '-o', intervals]
                   #, '--validation_strictness', 'LENIENT']
            print(' '.join(map(str, cmd)))
            processes.add(_subprocess.Popen(cmd, shell=False))
            if len(processes) >= max_processes:
                (pid, exit_status) = _os.wait()
                processes.difference_update(
                    [p for p in processes if p.poll() is not None])
        else:
            print('Found:')
            print(intervals)
            print('use "force = True" to overwrite')

    # Check if all the child processes were closed
    for p in processes:
        if p.poll() is None:
            p.wait()

    paths_to_BAMs_dd_si_ra = []
    for BAM in self.paths_to_BAMs_dd_si:
        intervals = BAM[:-4] + '.intervals'
        bam_out = BAM[:-4] + '_realn.bam'
        if not _os.path.exists(bam_out) or force:
            cmd = [use_java, '-Xmx4g', '-jar', jar,
                   '-T', 'IndelRealigner',
                   '-R', genome_fna,
                   '-I', BAM,
                   '-targetIntervals', intervals,
                   '-o', bam_out,
                   '--filter_bases_not_stored']
            print(' '.join(map(str, cmd)))
            processes.add(_subprocess.Popen(cmd, shell=False))
            if len(processes) >= max_processes:
                _os.wait()
                processes.difference_update(
                    [p for p in processes if p.poll() is not None])
        else:
            print('Found:')
            print(bam_out)
            print('use "force = True" to overwrite')

        paths_to_BAMs_dd_si_ra += [bam_out]

    for p in processes:
        if p.poll() is None:
            p.wait()

    # the last list of BAMs in ready_BAMs is input for CallgVCFsGATK
    # both IndelRealignGATK and recalibBaseScoresGATK put here
    self.ready_BAMs = [paths_to_BAMs_dd_si_ra]
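# A hedged usage sketch of the preparation order implied by the assertion
# message in IndelRealignGATK above (method and attribute names are from
# this module; the instance construction itself is hypothetical):
#
#     sams = SAMs(...)                                  # hypothetical
#     sams.toBAMS()
#     sams.removeDuplicates()
#     sams.sortIndexBAMs()
#     sams.IndelRealignGATK(jar=['path', 'to', 'GenomeAnalysisTK.jar'])
#     realigned = sams.ready_BAMs[-1]   # input for CallgVCFsGATK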
def generateReads(self, path_to_exe=False, paths_to_genomes=False,
                  readcov=60, readlen=100, fraglen=350, sterrfraglen=20,
                  model=4, max_cpus=-1):
    '''
    Call GemSIM to generate reads

    Need to have written genome sequences to generate from, possibly with
    generated SNPs, small indels and large deletions.
    '''
    # max_cpus etc
    if paths_to_genomes:
        use_genomes = sorted(paths_to_genomes)
    elif hasattr(self, 'written_genomes'):
        use_genomes = sorted(self.written_genomes)
    else:
        raise ValueError('provide either paths_to_genomes or generate '
                         'some then .writeSequences()')

    if not path_to_exe:
        path_to_exe = _get_exe_path('gemsim')

    comment2 = '''
    to generate reads put GemSIM v1.6 into subfolder GemSIM_v1.6 and issue
    these commands:
    GemSIM_v1.6/GemReads.py -r LESB58_for_GemSim_01.fasta -n 1980527 -l d
    -u 350 -s 20 -m GemSIM_v1.6/models/ill100v4_p.gzip -c -q 33 -p
    -o GemSimLESB58_01
    '''

    # integer division so GemReads.py receives a whole number of pairs
    num_pairs = len(self.genome.sequence) * readcov // (readlen * 2)

    if model == 4:
        path_to_model = _os.path.sep.join(
            path_to_exe.split(_os.path.sep)[:-1] +
            ['models', 'ill100v4_p.gzip'])
    elif model == 5:
        path_to_model = _os.path.sep.join(
            path_to_exe.split(_os.path.sep)[:-1] +
            ['models', 'ill100v5_p.gzip'])

    print('Using error model: {}'.format(path_to_model))
    print('Generating {:,} {}bp read pairs for {}x coverage depth of a '
          '{}bp genome ({})'.format(num_pairs, readlen, readcov,
                                    len(self.genome.sequence),
                                    self.genome.id))

    processes = set()
    max_processes = _decide_max_processes(max_cpus)

    import time
    start = time.time()

    out_raw = []
    for i, genome_in in enumerate(use_genomes):
        # could use per genome length . . less consistent than using reference
        # genome_len = len(_SeqIO.read(genome_in, 'fasta').seq)
        # num_pairs = genome_len * readcov / (readlen*2)
        outprefix = 'GemSim_{}_{:02d}'.format(self.genome.id, i + 1)
        cmd = [path_to_exe,
               '-r', genome_in,
               '-n', num_pairs,
               '-l', 'd',
               '-u', fraglen,
               '-s', sterrfraglen,
               '-m', path_to_model,
               '-c',
               '-q', 33,
               '-p',
               '-o', outprefix]
        out_raw += [outprefix + '_fir.fastq', outprefix + '_sec.fastq']
        # this would be better to rename and compress all in one
        # maybe as a shell script? Then resuming (--force) would be easier.
        if _os.path.exists(outprefix + '_fir.fastq') and \
                _os.path.exists(outprefix + '_sec.fastq'):
            print('Found output for {}_fir.fastq (and sec), not '
                  'regenerating, delete these to start from '
                  'scratch'.format(outprefix))
        else:
            # list() so the command survives ' '.join() below: under
            # Python 3, map() returns a one-shot iterator
            cmd = list(map(str, cmd))
            print(' '.join(cmd))
            processes.add(_subprocess.Popen(cmd, shell=False))
            if len(processes) >= max_processes:
                (pid, exit_status) = _os.wait()
                processes.difference_update(
                    [p for p in processes if p.poll() is not None])

    # Check if all the child processes were closed
    for p in processes:
        if p.poll() is None:
            p.wait()

    missing = []
    for o in out_raw:
        if not _os.path.exists(o):
            missing += [o]

    assert len(missing) == 0, 'Could not find:\n{}'.format('\n'.join(missing))

    print('all finished after {} minutes'.format(
        int(round((time.time() - start) / 60.0))))

    outdir = _os.path.sep.join(['simulated_reads', self.genome.id])
    try:
        _os.makedirs(outdir)
    except OSError:
        pass

    for o in out_raw:
        new = _os.path.sep.join(
            [outdir, o.replace('fir', 'R1').replace('sec', 'R2')])
        print('{} ==> {}'.format(o, new))
        _os.rename(o, new)
        cmd = ['gzip', new]
        print(' '.join(cmd))
        _subprocess.call(cmd)
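# Worked example of the read-pair arithmetic above: a 6,601,757 bp genome
# (consistent with the LESB58 example command in comment2) at 60x coverage
# with 100 bp paired reads needs 6601757 * 60 // (100 * 2) == 1980527
# pairs, i.e. exactly the '-n 1980527' shown in that example:
genome_len_example, readcov_example, readlen_example = 6601757, 60, 100
num_pairs_example = (genome_len_example * readcov_example
                     // (readlen_example * 2))
assert num_pairs_example == 1980527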