def worker(scorer, fut):
    """Score items delivered via legion futures until a None sentinel arrives.

    Each message received on `fut` is an (item, reply_fut) pair.  The reply
    delivered on reply_fut carries the item's score together with a fresh
    future on which this worker will listen for its next item.
    """
    while True:
        message = legion.coordinator().get_future(fut)
        if message is None:
            break  # shutdown sentinel from the coordinator
        item, reply_fut = message
        score = scorer(item)
        # Hand back the score plus the future for the next round trip.
        fut = legion.coordinator().new_future()
        legion.coordinator().deliver_future(reply_fut, (score, fut))
def execute(args, stdin=None, stdout=None, stderr=None, cores=1, **kwargs):
    """Run a program to completion.

    Raises if the program exits with a non-zero status.  When more than one
    core is requested, the extra cores are borrowed from the legion
    coordinator for the duration of the run and returned afterwards.
    """
    from nesoni import legion
    if cores > 1:
        legion.coordinator().trade_cores(1, cores)
    try:
        process = run(args, stdin=stdin, stdout=stdout, stderr=stderr, **kwargs)
        assert process.wait() == 0, 'Failed to execute "%s"' % _describe_args(args, kwargs)
    finally:
        if cores > 1:
            legion.coordinator().trade_cores(cores, 1)
def execute(args, stdin=None, stdout=None, stderr=None, cores=1, **kwargs):
    """Run a program and wait for it; assert that its exit code is zero.

    Extra cores beyond the first are traded with the legion coordinator
    while the program runs and always traded back, even on failure.
    """
    from nesoni import legion
    borrowing = cores > 1
    if borrowing:
        legion.coordinator().trade_cores(1, cores)
    try:
        child = run(args, stdin=stdin, stdout=stdout, stderr=stderr, **kwargs)
        exit_code = child.wait()
        assert exit_code == 0, 'Failed to execute "%s"' % _describe_args(args, kwargs)
    finally:
        if borrowing:
            legion.coordinator().trade_cores(cores, 1)
def run_job(): import sys, os, imp, base64 # Connect to coordinator current_dir, python_path, main_file, address, authkey, mail_number = eval( base64.b64decode(sys.argv[1])) # Try to recreate execution environment os.chdir(current_dir) sys.path = python_path from nesoni import legion legion.manager(address, authkey, connect=True) if main_file is not None: # so unpickling functions in __main__ works module = imp.new_module('__job__') module.__file__ = main_file sys.modules['__job__'] = module sys.modules['__main__'] = module execfile(main_file, module.__dict__) # Retrieve function and execute func, args, kwargs = legion.coordinator().get_mail(mail_number) func(*args, **kwargs) sys.exit(0)
def run_job(): import sys, os, imp, base64 # Connect to coordinator current_dir, python_path, main_file, address, authkey, mail_number = eval(base64.b64decode(sys.argv[1])) # Try to recreate execution environment os.chdir(current_dir) sys.path = python_path from nesoni import legion legion.manager(address, authkey, connect=True) if main_file is not None: # so unpickling functions in __main__ works module = imp.new_module('__job__') module.__file__ = main_file sys.modules['__job__'] = module sys.modules['__main__'] = module execfile(main_file, module.__dict__) # Retrieve function and execute func, args, kwargs = legion.coordinator().get_mail(mail_number) func(*args,**kwargs) sys.exit(0)
def sort_bam(in_filename, out_prefix, by_name=False, cores=8):
    """Sort a BAM file with samtools sort.

    Thread count is capped at what the legion coordinator will grant, and
    the per-thread memory budget shrinks as thread count grows.
    """
    cores = min(cores, legion.coordinator().get_cores())
    # Split roughly 800M of sort memory across the threads, 10M minimum each.
    megs = max(10, 800 // cores)
    command = ['samtools', 'sort', '-@', '%d' % cores, '-m', '%dM' % megs]
    if by_name:
        command.append('-n')
    command.append(in_filename)
    command.append(out_prefix)
    io.execute(command, cores=cores)
def pipe_from(args, stdin=None, stderr=None, cores=1, **kwargs):
    """Context manager yielding the stdout pipe of a subprocess, eg

        with io.pipe_from(['ls']) as f:
            print f.read()

    Asserts the process exits with status 0.
    """
    if cores > 1:
        legion.coordinator().trade_cores(1, cores)
    proc = run(args, stdin=stdin, stdout=PIPE, stderr=stderr, **kwargs)
    try:
        yield proc.stdout
    finally:
        # Close our end, reap the child, return borrowed cores, and only
        # then complain about a bad exit status.
        proc.stdout.close()
        exit_status = proc.wait()
        if cores > 1:
            legion.coordinator().trade_cores(cores, 1)
        assert exit_status == 0, 'Failed: "%s"' % _describe_args(args, kwargs)
def pipe_to(args, stdout=None, stderr=None, cores=1, **kwargs):
    """Context manager yielding the stdin pipe of a subprocess, eg

        with io.pipe_to(['less']) as f:
            print >> f, 'Hello, world.'

    Asserts the process exits with status 0.
    """
    if cores > 1:
        legion.coordinator().trade_cores(1, cores)
    proc = run(args, stdin=PIPE, stdout=stdout, stderr=stderr, **kwargs)
    try:
        yield proc.stdin
    finally:
        # Closing stdin signals EOF to the child; then reap it, return
        # borrowed cores, and check the exit status last.
        proc.stdin.close()
        exit_status = proc.wait()
        if cores > 1:
            legion.coordinator().trade_cores(cores, 1)
        assert exit_status == 0, 'Failed: "%s"' % _describe_args(args, kwargs)
def pipe_from(args, stdin=None, stderr=None, cores=1, **kwargs):
    """Context to read a subprocess's standard output, eg

        with io.pipe_from(['ls']) as f:
            print f.read()

    On exit the child is reaped and a non-zero status raises.
    """
    trading = cores > 1
    if trading:
        legion.coordinator().trade_cores(1, cores)
    process = run(args, stdin=stdin, stdout=PIPE, stderr=stderr, **kwargs)
    try:
        yield process.stdout
    finally:
        process.stdout.close()
        exit_code = process.wait()
        if trading:
            # Give back the cores before the status check so they are
            # returned even when the child failed.
            legion.coordinator().trade_cores(cores, 1)
        assert exit_code == 0, 'Failed: "%s"' % _describe_args(args, kwargs)
def pipe_to(args, stdout=None, stderr=None, cores=1, **kwargs):
    """Context to write to a subprocess's standard input, eg

        with io.pipe_to(['less']) as f:
            print >> f, 'Hello, world.'

    On exit the child is reaped and a non-zero status raises.
    """
    trading = cores > 1
    if trading:
        legion.coordinator().trade_cores(1, cores)
    process = run(args, stdin=PIPE, stdout=stdout, stderr=stderr, **kwargs)
    try:
        yield process.stdin
    finally:
        process.stdin.close()
        exit_code = process.wait()
        if trading:
            # Cores go back before the assertion so a failed child does
            # not leak the borrowed cores.
            legion.coordinator().trade_cores(cores, 1)
        assert exit_code == 0, 'Failed: "%s"' % _describe_args(args, kwargs)
def status(string):
    """Display a status string for this process via the legion coordinator."""
    from nesoni import legion
    coordinator = legion.coordinator()
    return coordinator.set_status(legion.process_identity(), string)
def run(self):
    """Align reads with bowtie2 and leave name-sorted BAM alignments in the workspace.

    Inputs come from self.reads (unpaired), self.pairs (two files per
    pair) and self.interleaved (alternating left/right records).  Anything
    that is not already plain/gzip/bzip2 FASTQ is converted to a temporary
    FASTQ file first.
    """
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
    working = self.get_workspace()
    working.setup_reference(self.references, bowtie=True)
    working.update_param(snp_cost=2.0)
    reference = working.get_reference()
    log_file = open(self.log_filename(), 'wb')
    with workspace.tempspace(dir=working.working_dir) as temp:
        # Counter kept in a list so the nested closures can mutate it
        # (Python 2 has no nonlocal).
        n = [0]

        def tempname():
            # Fresh temporary FASTQ filename inside the tempspace.
            n[0] += 1
            return temp / ('%d.fq' % n[0])

        def convert(filename):
            # Return a FASTQ filename for `filename`, converting via a
            # temporary file unless it is already (possibly compressed) FASTQ.
            info = io.get_file_info(filename)
            ok = selection.matches(
                'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                info)
            if ok:
                return filename
            result_name = tempname()
            with open(result_name, 'wb') as f:
                for name, seq, qual in io.read_sequences(
                        filename, qualities='required'):
                    io.write_fastq(f, name, seq, qual)
            return result_name

        ones = []
        twos = []
        singles = []
        for pair in self.pairs:
            assert len(
                pair) == 2, 'Need two files in each "pair:" section.'
            ones.append(convert(pair[0]))
            twos.append(convert(pair[1]))
        # De-interleave interleaved inputs into left/right temporary files.
        for item in self.interleaved:
            left_name = tempname()
            right_name = tempname()
            ones.append(left_name)
            twos.append(right_name)
            with open(left_name, 'wb') as left, \
                    open(right_name, 'wb') as right:
                reader = io.read_sequences(item, qualities='required')
                while True:
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        break
                    io.write_fastq(left, name, seq, qual)
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        raise grace.Error(
                            'Interleaved file contains odd number of sequences'
                        )
                    io.write_fastq(right, name, seq, qual)
        for item in self.reads:
            singles.append(convert(item))

        cores = min(self.cores, legion.coordinator().get_cores())
        command = ([
            'bowtie2',
            '--threads', str(cores),
            '--rg-id', '1',
            '--rg', 'SM:' + working.name,
        ] + self.bowtie_options + ['-x', reference.get_bowtie_index_prefix()])
        # Paired and unpaired reads need separate bowtie2 invocations.
        commands = []
        if ones:
            commands.append(command +
                            ['-1', ','.join(ones), '-2', ','.join(twos)])
        if singles:
            commands.append(command + ['-U', ','.join(singles)])

        temp_bam_name = temp / 'temp.bam'
        # Merge the SAM output of every invocation into one BAM stream,
        # letting only the first invocation's @-header through.
        with io.pipe_to(['samtools', 'view', '-S', '-b', '-'],
                        stdout=open(temp_bam_name, 'wb'),
                        stderr=log_file) as f:
            header_sent = False
            for command in commands:
                self.log.log('Running:\n' + ' '.join(command) + '\n')
                with io.pipe_from(command, stderr=log_file,
                                  cores=cores) as f_out:
                    for line in f_out:
                        if not header_sent or not line.startswith('@'):
                            f.write(line)
                header_sent = True
        #io.execute([
        #    'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
        #    ])
        sam.sort_bam(temp_bam_name, working / 'alignments',
                     by_name=True, cores=self.cores)
    log_file.close()
def improve(comment, constrainer, scorer, start_x, ftol=1e-4, xtol=1e-6,
            initial_accuracy=0.001, monitor=lambda x, y: None):
    """Stochastic parallel optimizer over a population of candidates.

    Repeatedly proposes new parameter vectors with make_update(), scores
    them on a pool of legion workers, and keeps a population (`currents`)
    of the best candidates.  Scores are (constraint_violation, score)
    pairs compared lexicographically, so any feasible point beats every
    infeasible one.  Stops when the population's parameter spread drops
    below xtol, or its score spread below ftol after enough acceptances.
    Returns the best parameter vector found.
    """
    pool_size = legion.coordinator().get_cores()
    worker_futs = [legion.coordinator().new_future()
                   for i in xrange(pool_size)]
    reply_futs = []
    # One remote worker loop per initial future; each waits for
    # (item, reply_fut) messages and answers with (score, next_future).
    workers = [legion.future(worker, scorer, fut) for fut in worker_futs]
    last_t = 0.0
    try:
        best = start_x
        c_score = constrainer(best)
        if c_score:
            best_score = (c_score, 0.0)
        else:
            best_score = (0.0, scorer(best))
        n_good = 0  # candidates accepted into the population
        n_real = 0  # candidates that passed the constraint and were scored
        i = 0       # iteration counter
        jobs = []
        # Population size scales with problem dimensionality.
        pool_size = int(len(best)*5)
        print len(best), 'parameters, pool size', pool_size
        currents = [(best, best_score)]
        done = False
        # Keep looping until converged AND every in-flight reply is drained.
        while not done or reply_futs:
            t = time.time()
            if t > last_t + 20.0:
                # Periodic progress report (at most every 20 seconds).
                def rep(x):
                    if x[0]:
                        return 'C%.6f' % x[0]
                    return '%.6f' % x[1]
                grace.status('%s %s %d %d %d %d %s' % (
                    rep(best_score),
                    rep(max(item[1] for item in currents)),
                    len(currents), n_good, n_real, i, comment))
                if best_score[0] == 0:
                    monitor(best, [item[0] for item in currents])
                last_t = time.time()
            have_score = False
            if not done and worker_futs:
                # Propose a candidate; reject locally if constrained,
                # otherwise ship it to an idle worker.
                new = make_update([item[0] for item in currents],
                                  initial_accuracy,
                                  len(currents) < pool_size)
                c_score = constrainer(new)
                if c_score:
                    have_score = True
                    new_score = (c_score, 0.0)
                else:
                    reply_fut = legion.coordinator().new_future()
                    worker_fut = worker_futs.pop(0)
                    legion.coordinator().deliver_future(worker_fut,
                                                        (new, reply_fut))
                    reply_futs.append((new, reply_fut))
            if not have_score:
                if not reply_futs or (not done and worker_futs):
                    # Nothing to collect yet, or workers still idle:
                    # go produce more candidates first.
                    continue
                new, reply_fut = reply_futs.pop(0)
                new_score, worker_fut = legion.coordinator().get_future(reply_fut)
                new_score = (0.0, new_score)
                worker_futs.append(worker_fut)
            if new_score[0] == 0.0:
                n_real += 1
            # Acceptance cutoff: the pool_size-th best feasible score so far.
            l = sorted(item[1][1] for item in currents)
            if pool_size < len(l):
                c = l[pool_size]
            else:
                c = 1e30
            cutoff = (best_score[0], c)
            if new_score <= cutoff:
                currents = [item for item in currents if item[1] <= cutoff]
                currents.append((new, new_score))
                n_good += 1
                if new_score < best_score:
                    best_score = new_score
                    best = new
                if len(currents) >= pool_size and best_score[0] == 0.0:
                    # Convergence test on parameter spread and score spread.
                    xspan = 0.0
                    # NOTE(review): this `i` clobbers the iteration counter
                    # above -- confirm that shadowing is intentional.
                    for i in xrange(len(start_x)):
                        xspan = max(xspan,
                                    max(item[0][i] for item in currents) -
                                    min(item[0][i] for item in currents))
                    fspan = (max(item[1] for item in currents)[1] -
                             best_score[1])
                    if xspan < xtol or (n_good >= 5000 and fspan < ftol):
                        done = True
            i += 1
        grace.status('')
        print '%s %.5f\n' % (comment, best_score[1])
    finally:
        #pool.terminate()
        pass
    # Send each worker its shutdown sentinel, then join them all.
    while worker_futs:
        fut = worker_futs.pop(0)
        legion.coordinator().deliver_future(fut, None)
    for item in workers:
        item()
    return best
def status(string):
    """Publish `string` as this process's status line on the coordinator."""
    from nesoni import legion
    coordinator = legion.coordinator()
    coordinator.set_status(legion.process_identity(), string)
def run(self):
    """Align reads with SHRiMP 2 and produce name-sorted BAM alignments."""
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    # Build (filenames, is_paired) work items.
    read_sets = []
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))

    #Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    reference_filename = reference.reference_fasta_filename()

    cores = min(self.cores, legion.coordinator().get_cores())

    # Options applied unless the user supplied them explicitly;
    # None means a bare flag with no argument.
    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(cores),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None,
    }
    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None
    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None

    cutoff = '55%'  #Default changed in SHRiMP 2.0.2
    if '-h' in self.shrimp_options:
        cutoff = self.shrimp_options[self.shrimp_options.index('-h') + 1]

    #Run shrimp
    bam_filename = io.abspath(self.output_dir, 'alignments.bam')
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')
    temp_filename = io.abspath(self.output_dir, 'temp.bam')
    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
    log_file = open(log_filename, 'wb')
    sam_eater = sam.Bam_writer(temp_filename)

    # Mutable cells so eat() can update state (Python 2 lacks nonlocal).
    sam_header_sent = [False]
    n_seen = [0]

    def eat(f):
        # Stream SHRiMP's SAM output into the BAM writer, keeping only the
        # first invocation's header and reporting progress periodically.
        for line in f:
            if line.startswith('@'):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced' %
                                 grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True

    def remove_pair_options(options):
        # Strip pairing-related flags for unpaired read sets.
        for flag in ['-p', '-I']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ['--half-paired']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options

    for i, (filenames, is_paired) in enumerate(read_sets):
        options = self.shrimp_options[:]

        has_qualities = all(
            len(io.read_sequences(filename, qualities=True).next()) == 3  #A little ugly
            for filename in filenames)
        if has_qualities:
            options.append('--fastq')

        if len(filenames) == 1:
            reads_parameters = [filenames[0]]
        else:
            reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

        if '--qv-offset' not in self.shrimp_options:
            #guesses = [ ]
            #for filename in filenames:
            #    guesses.append(io.guess_quality_offset(filename))
            #assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            #default_options['--qv-offset'] = str(guesses[0])
            default_options['--qv-offset'] = str(
                io.guess_quality_offset(*filenames))

        default_options['--read-group'] = '%s,%s' % (
            workspace.name.replace(',', '_'),
            workspace.name.replace(',', '_'))
        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])

        if not is_paired:
            options = remove_pair_options(options)

        grace.status('')
        full_param = reference.shrimp_command(self.cs,
                                              options + reads_parameters)
        print >> sys.stderr, 'Running', ' '.join(full_param)

        with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
            eat(f)

    log_file.close()
    sam_eater.close()

    grace.status('Sort')
    #io.execute([
    #    'samtools', 'sort', '-n', temp_filename, bam_prefix
    #])
    sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)
    os.unlink(temp_filename)
    grace.status('')
def run(self):
    """Run the bowtie2 alignment pipeline for this task's workspace.

    Converts non-FASTQ inputs to temporary FASTQ, de-interleaves
    interleaved inputs, runs bowtie2 once for paired and once for unpaired
    reads, merges the SAM streams into one BAM, and name-sorts it.
    """
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
    working = self.get_workspace()
    working.setup_reference(self.references, bowtie=True)
    working.update_param(snp_cost=2.0)
    reference = working.get_reference()
    log_file = open(self.log_filename(), 'wb')
    with workspace.tempspace(dir=working.working_dir) as temp:
        # List-wrapped counter: closures below mutate it (no nonlocal in py2).
        n = [0]

        def tempname():
            # Next temporary FASTQ filename.
            n[0] += 1
            return temp / ('%d.fq' % n[0])

        def convert(filename):
            # Pass plain/gzip/bzip2 FASTQ through untouched; otherwise
            # rewrite the sequences into a temporary FASTQ file.
            info = io.get_file_info(filename)
            ok = selection.matches(
                'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                info)
            if ok:
                return filename
            result_name = tempname()
            with open(result_name, 'wb') as f:
                for name, seq, qual in io.read_sequences(
                        filename, qualities='required'):
                    io.write_fastq(f, name, seq, qual)
            return result_name

        ones = []
        twos = []
        singles = []
        for pair in self.pairs:
            assert len(pair) == 2, 'Need two files in each "pair:" section.'
            ones.append(convert(pair[0]))
            twos.append(convert(pair[1]))
        # Split each interleaved file into left/right temporary FASTQs.
        for item in self.interleaved:
            left_name = tempname()
            right_name = tempname()
            ones.append(left_name)
            twos.append(right_name)
            with open(left_name, 'wb') as left, \
                    open(right_name, 'wb') as right:
                reader = io.read_sequences(item, qualities='required')
                while True:
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        break
                    io.write_fastq(left, name, seq, qual)
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        raise grace.Error(
                            'Interleaved file contains odd number of sequences')
                    io.write_fastq(right, name, seq, qual)
        for item in self.reads:
            singles.append(convert(item))

        cores = min(self.cores, legion.coordinator().get_cores())
        command = ([
            'bowtie2',
            '--threads', str(cores),
            '--rg-id', '1',
            '--rg', 'SM:' + working.name,
        ] + self.bowtie_options + ['-x', reference.get_bowtie_index_prefix()])
        # One invocation for paired input, one for unpaired.
        commands = []
        if ones:
            commands.append(command +
                            ['-1', ','.join(ones), '-2', ','.join(twos)])
        if singles:
            commands.append(command + ['-U', ','.join(singles)])

        temp_bam_name = temp / 'temp.bam'
        # Funnel all SAM output through samtools view into a single BAM,
        # emitting the @-header lines only once.
        with io.pipe_to(['samtools', 'view', '-S', '-b', '-'],
                        stdout=open(temp_bam_name, 'wb'),
                        stderr=log_file) as f:
            header_sent = False
            for command in commands:
                self.log.log('Running:\n' + ' '.join(command) + '\n')
                with io.pipe_from(command, stderr=log_file,
                                  cores=cores) as f_out:
                    for line in f_out:
                        if not header_sent or not line.startswith('@'):
                            f.write(line)
                header_sent = True
        #io.execute([
        #    'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
        #    ])
        sam.sort_bam(temp_bam_name, working / 'alignments',
                     by_name=True, cores=self.cores)
    log_file.close()
def improve(comment, constrainer, scorer, start_x, ftol=1e-4, xtol=1e-6,
            initial_accuracy=0.001, monitor=lambda x, y: None):
    """Population-based stochastic optimizer running on legion workers.

    Candidate vectors from make_update() are scored remotely; a population
    of the best (parameter, score) pairs is maintained, where a score is a
    (constraint_violation, value) tuple ordered lexicographically.
    Terminates when the population's parameter spread is below xtol, or
    after at least 5000 acceptances when its value spread is below ftol.
    Returns the best parameter vector.
    """
    pool_size = legion.coordinator().get_cores()
    worker_futs = [legion.coordinator().new_future()
                   for i in xrange(pool_size)]
    reply_futs = []
    # Spawn one persistent worker loop per future.
    workers = [legion.future(worker, scorer, fut) for fut in worker_futs]
    last_t = 0.0
    try:
        best = start_x
        c_score = constrainer(best)
        if c_score:
            best_score = (c_score, 0.0)
        else:
            best_score = (0.0, scorer(best))
        n_good = 0  # accepted into the population
        n_real = 0  # feasible candidates actually scored
        i = 0       # iteration counter
        jobs = []
        # Population size is proportional to the number of parameters.
        pool_size = int(len(best) * 5)
        print len(best), 'parameters, pool size', pool_size
        currents = [(best, best_score)]
        done = False
        # Loop until converged and all outstanding replies are collected.
        while not done or reply_futs:
            t = time.time()
            if t > last_t + 20.0:
                # Status update at most every 20 seconds.
                def rep(x):
                    if x[0]:
                        return 'C%.6f' % x[0]
                    return '%.6f' % x[1]
                grace.status(
                    '%s %s %d %d %d %d %s' %
                    (rep(best_score),
                     rep(max(item[1] for item in currents)),
                     len(currents), n_good, n_real, i, comment))
                if best_score[0] == 0:
                    monitor(best, [item[0] for item in currents])
                last_t = time.time()
            have_score = False
            if not done and worker_futs:
                # Generate a proposal; constrained ones are scored locally,
                # feasible ones are dispatched to an idle worker.
                new = make_update([item[0] for item in currents],
                                  initial_accuracy,
                                  len(currents) < pool_size)
                c_score = constrainer(new)
                if c_score:
                    have_score = True
                    new_score = (c_score, 0.0)
                else:
                    reply_fut = legion.coordinator().new_future()
                    worker_fut = worker_futs.pop(0)
                    legion.coordinator().deliver_future(
                        worker_fut, (new, reply_fut))
                    reply_futs.append((new, reply_fut))
            if not have_score:
                if not reply_futs or (not done and worker_futs):
                    # Prefer dispatching more work while workers are idle.
                    continue
                new, reply_fut = reply_futs.pop(0)
                new_score, worker_fut = legion.coordinator().get_future(
                    reply_fut)
                new_score = (0.0, new_score)
                worker_futs.append(worker_fut)
            if new_score[0] == 0.0:
                n_real += 1
            # Cutoff is the pool_size-th best value in the population.
            l = sorted(item[1][1] for item in currents)
            if pool_size < len(l):
                c = l[pool_size]
            else:
                c = 1e30
            cutoff = (best_score[0], c)
            if new_score <= cutoff:
                currents = [item for item in currents if item[1] <= cutoff]
                currents.append((new, new_score))
                n_good += 1
                if new_score < best_score:
                    best_score = new_score
                    best = new
                if len(currents) >= pool_size and best_score[0] == 0.0:
                    # Convergence check over parameter and value spreads.
                    xspan = 0.0
                    # NOTE(review): this `i` shadows the iteration counter
                    # above -- verify the shadowing is intended.
                    for i in xrange(len(start_x)):
                        xspan = max(
                            xspan,
                            max(item[0][i] for item in currents) -
                            min(item[0][i] for item in currents))
                    fspan = (max(item[1] for item in currents)[1] -
                             best_score[1])
                    if xspan < xtol or (n_good >= 5000 and fspan < ftol):
                        done = True
            i += 1
        grace.status('')
        print '%s %.5f\n' % (comment, best_score[1])
    finally:
        #pool.terminate()
        pass
    # Deliver the None shutdown sentinel to every worker, then join.
    while worker_futs:
        fut = worker_futs.pop(0)
        legion.coordinator().deliver_future(fut, None)
    for item in workers:
        item()
    return best
def run(self):
    """Align reads with SHRiMP 2 and name-sort the resulting BAM.

    Unlike the sibling variant, this one guesses the quality offset per
    input file (asserting all files agree) and sorts with a direct
    samtools invocation.
    """
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, "No reference sequences given"
    assert self.reads or self.pairs or self.interleaved, "No reads given"
    for pair in self.pairs:
        assert len(pair) == 2, "Two files required in each pair: section"
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    # (filenames, is_paired) work items.
    read_sets = []
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))

    # Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    reference_filename = reference.reference_fasta_filename()

    cores = min(self.cores, legion.coordinator().get_cores())

    # Defaults applied unless the user already supplied the flag;
    # None means a bare flag without an argument.
    default_options = {
        "-E": None,
        "-T": None,
        "-N": str(cores),
        "-n": "2",
        "-w": "200%",
        "-p": "opp-in",
        "-I": "0,500",
        "-X": None,
    }
    if self.sam_unaligned:
        default_options["--sam-unaligned"] = None
    if self.half_paired:
        default_options["--half-paired"] = None
    else:
        default_options["--no-half-paired"] = None

    cutoff = "55%"  # Default changed in SHRiMP 2.0.2
    if "-h" in self.shrimp_options:
        cutoff = self.shrimp_options[self.shrimp_options.index("-h") + 1]

    # Run shrimp
    bam_filename = io.abspath(self.output_dir, "alignments.bam")
    bam_prefix = io.abspath(self.output_dir, "alignments")
    bam_sorted_prefix = io.abspath(self.output_dir, "alignments_sorted")
    temp_filename = io.abspath(self.output_dir, "temp.bam")
    log_filename = io.abspath(self.output_dir, "shrimp_log.txt")
    log_file = open(log_filename, "wb")
    sam_eater = sam.Bam_writer(temp_filename)

    # Mutable cells: eat() updates these (Python 2 lacks nonlocal).
    sam_header_sent = [False]
    n_seen = [0]

    def eat(f):
        # Pipe SHRiMP's SAM output into the BAM writer, passing the
        # @-header through only for the first file, with progress updates.
        for line in f:
            if line.startswith("@"):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status("%s alignments produced" %
                                 grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True

    def remove_pair_options(options):
        # Drop pairing flags when aligning unpaired reads.
        for flag in ["-p", "-I"]:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2 :]
        for flag in ["--half-paired"]:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1 :]
        return options

    for i, (filenames, is_paired) in enumerate(read_sets):
        options = self.shrimp_options[:]

        has_qualities = all(
            len(io.read_sequences(filename, qualities=True).next()) == 3
            for filename in filenames  # A little ugly
        )
        if has_qualities:
            options.append("--fastq")

        if len(filenames) == 1:
            reads_parameters = [filenames[0]]
        else:
            reads_parameters = ["-1", filenames[0], "-2", filenames[1]]

        if "--qv-offset" not in self.shrimp_options:
            # Guess the quality offset per file and require agreement.
            guesses = []
            for filename in filenames:
                guesses.append(io.guess_quality_offset(filename))
            assert (
                len(set(guesses)) == 1
            ), "Conflicting quality offset guesses, please specify --qv-offset manually."
            default_options["--qv-offset"] = str(guesses[0])

        default_options["--read-group"] = "%s,%s" % (
            workspace.name.replace(",", "_"),
            workspace.name.replace(",", "_"),
        )
        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])

        if not is_paired:
            options = remove_pair_options(options)

        grace.status("")
        full_param = reference.shrimp_command(self.cs,
                                              options + reads_parameters)
        print >>sys.stderr, "Running", " ".join(full_param)

        with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
            eat(f)

    log_file.close()
    sam_eater.close()

    grace.status("Sort")
    io.execute(["samtools", "sort", "-n", temp_filename, bam_prefix])
    os.unlink(temp_filename)
    grace.status("")
def cores_required(self):
    """Number of cores this task will use: everything the coordinator has."""
    coordinator = legion.coordinator()
    return coordinator.get_cores()
def run(self):
    """SHRiMP 2 alignment pipeline: align each read set, then name-sort the BAM."""
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    # Work items as (filenames, is_paired) tuples.
    read_sets = []
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))

    #Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    reference_filename = reference.reference_fasta_filename()

    cores = min(self.cores, legion.coordinator().get_cores())

    # Defaults merged in later unless the user already set the flag;
    # None marks a bare flag with no value.
    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(cores),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None,
    }
    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None
    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None

    cutoff = '55%'  #Default changed in SHRiMP 2.0.2
    if '-h' in self.shrimp_options:
        cutoff = self.shrimp_options[self.shrimp_options.index('-h') + 1]

    #Run shrimp
    bam_filename = io.abspath(self.output_dir, 'alignments.bam')
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')
    temp_filename = io.abspath(self.output_dir, 'temp.bam')
    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
    log_file = open(log_filename, 'wb')
    sam_eater = sam.Bam_writer(temp_filename)

    # Mutable single-element lists so eat() can update them (py2: no nonlocal).
    sam_header_sent = [False]
    n_seen = [0]

    def eat(f):
        # Consume SHRiMP's SAM stream into the BAM writer; only the first
        # file's header lines are kept, and progress is reported.
        for line in f:
            if line.startswith('@'):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced' %
                                 grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True

    def remove_pair_options(options):
        # Remove pairing-specific flags for unpaired read sets.
        for flag in ['-p', '-I']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ['--half-paired']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options

    for i, (filenames, is_paired) in enumerate(read_sets):
        options = self.shrimp_options[:]

        has_qualities = all(
            len(io.read_sequences(filename, qualities=True).next()) == 3  #A little ugly
            for filename in filenames)
        if has_qualities:
            options.append('--fastq')

        if len(filenames) == 1:
            reads_parameters = [filenames[0]]
        else:
            reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

        if '--qv-offset' not in self.shrimp_options:
            #guesses = [ ]
            #for filename in filenames:
            #    guesses.append(io.guess_quality_offset(filename))
            #assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            #default_options['--qv-offset'] = str(guesses[0])
            default_options['--qv-offset'] = str(
                io.guess_quality_offset(*filenames))

        default_options['--read-group'] = '%s,%s' % (
            workspace.name.replace(',', '_'),
            workspace.name.replace(',', '_'))
        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])

        if not is_paired:
            options = remove_pair_options(options)

        grace.status('')
        full_param = reference.shrimp_command(self.cs,
                                              options + reads_parameters)
        print >> sys.stderr, 'Running', ' '.join(full_param)

        with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
            eat(f)

    log_file.close()
    sam_eater.close()

    grace.status('Sort')
    #io.execute([
    #    'samtools', 'sort', '-n', temp_filename, bam_prefix
    #])
    sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)
    os.unlink(temp_filename)
    grace.status('')