def main(args):
    """Align reads against reference sequences with SHRiMP 1 (rmapper).

    Reads are split into batches of roughly ``--batch-size`` bases, and up to
    ``--cpus`` rmapper processes are kept running at once.  Results are
    collected in the order the batches were started.

    Options taken from ``args``:
        --solid         use 'rmapper-cs' instead of 'rmapper-ls'
        --verbose       do not discard rmapper's stderr
        --threshold     passed to rmapper as '-h' (default '68%')
        --stride        recorded in the configuration (default 1)
        --cpus          maximum concurrent rmapper processes
        --batch-size    bases of read sequence per batch (default 5000000)

    The leading positional arguments are the output directory followed by
    reference files.  'reads:', 'pairs:' and 'shrimp-options:' sections
    supply read files and extra rmapper options.

    Files written to the output directory: reference.fa, config.txt,
    shrimp_hits.txt.gz and unmapped.fa.gz.

    Returns 1 after printing USAGE when no output directory was given,
    otherwise 0.  Raises AssertionError for missing or empty inputs and
    when rmapper exits with a non-zero status.
    """
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()

    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')

    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000)

    input_reference_filenames = []
    reads_filenames = []

    # rmapper receives the threshold exactly as typed; the configuration
    # stores it as a negative fraction (percentages) or an integer.
    shrimp_options = ['-h', threshold]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1]) / 100.0
    else:
        threshold = int(threshold)

    # A list so the nested section handlers can record the directory.
    output_dirs = []

    def front_command(args):
        grace.expect_no_further_options(args)
        if not args:
            return
        output_dirs.append(args[0])
        input_reference_filenames.extend(
            [os.path.abspath(filename) for filename in args[1:]])

    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend(
            [[os.path.abspath(filename)] for filename in args])

    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append(
            [os.path.abspath(filename) for filename in args])

    def shrimp_options_command(args):
        shrimp_options.extend(args)

    grace.execute(args, {
        'reads': reads_command,
        '--reads': reads_command,
        'pairs': pairs_command,
        'shrimp-options': shrimp_options_command,
        '--shrimp-options': shrimp_options_command,
    }, front_command)

    if not output_dirs:
        sys.stderr.write(USAGE % n_cpus + '\n')
        return 1

    output_dir = output_dirs[0]

    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'

    for filename in itertools.chain(input_reference_filenames, *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'

    def shell_quote(text):
        # Single-quote for /bin/sh so that paths containing spaces or
        # metacharacters reach rmapper as one argument.
        return "'" + text.replace("'", "'\\''") + "'"

    # Merge all references into a single FASTA file for rmapper.
    reference_filename = os.path.join(output_dir, 'reference.fa')
    total_reference_sequences = 0
    total_reference_bases = 0
    reference_file = open(reference_filename, 'wb')
    try:
        for input_reference_filename in input_reference_filenames:
            for name, sequence in io.read_sequences(input_reference_filename):
                # Don't retain any comment
                name = name.split()[0]
                io.write_fasta(reference_file, name, sequence)

                total_reference_sequences += 1
                total_reference_bases += len(sequence)
    finally:
        reference_file.close()

    sys.stdout.write('%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases),
        's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences),
        's' if total_reference_sequences != 1 else '') + '\n')

    assert total_reference_bases, 'Reference sequence file is empty'

    config = {
        'references': input_reference_filenames,
        'reads': reads_filenames,
        'stride': stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    try:
        pprint.pprint(config, config_file)
    finally:
        config_file.close()

    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')

    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')

    # Files to delete if we do not finish cleanly.
    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)

    try:
        N = [0]  # Batch counter, in a list so do_shrimp can update it.

        def do_shrimp(read_set):
            """Start rmapper on one batch; return a callable that collects it."""
            my_number = N[0]
            N[0] += 1

            tempname = os.path.join(
                output_dir, 'temp%d-%d.fa' % (os.getpid(), my_number))
            tempname_out = os.path.join(
                output_dir, 'temp%d-%d.txt' % (os.getpid(), my_number))

            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)

            f = open(tempname, 'wb')
            try:
                for read_name, read_seq in read_set:
                    f.write('>' + read_name + '\n')
                    f.write(read_seq + '\n')
            finally:
                f.close()

            # The program name and user-supplied options are passed through
            # unquoted, as before; only the paths we generate are quoted.
            command = (
                shrimp + ' ' + ' '.join(shrimp_options) + ' ' +
                shell_quote(tempname) + ' ' +
                shell_quote(reference_filename) +
                ' >' + shell_quote(tempname_out))
            if not verbose:
                command += ' 2>/dev/null'

            child_pid = os.spawnl(
                os.P_NOWAIT, '/bin/sh', '/bin/sh', '-c', command)

            def finalize():
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'

                hits = {}  # read_name -> [ hit line ]
                f = open(tempname_out, 'rb')
                try:
                    for line in f:
                        if line.startswith('>'):
                            read_name = line.split(None, 1)[0][1:]
                            hits.setdefault(read_name, []).append(line)
                finally:
                    f.close()

                # Emit in the order the reads were given: hits for mapped
                # reads, FASTA records for reads without any hit.
                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        unmapped_file.write('>' + read_name + '\n')
                        unmapped_file.write(read_seq + '\n')

                output_file.flush()
                unmapped_file.flush()

                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)

            return finalize

        shrimps = []  # Pending finalize callables, oldest first.

        reader = iter_reads(config)
        read_count = 0

        while True:
            read_set = []
            read_set_bases = 0

            # Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs
            for read_name, read_seq in reader:
                read_name = read_name.split()[0]
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)

                read_count += 1
                if read_set_bases >= batch_size:
                    break

            if not read_set:
                break

            # Keep at most max_shrimps processes alive: collect the oldest
            # before starting another.
            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append(do_shrimp(read_set))

            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))

        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps))
            shrimps.pop(0)()

        grace.status('')

        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)

        return 0

    finally:
        # On failure the result files are still open; close them before
        # deleting so no handle is leaked.
        for handle, filename in ((output_file, output_filename),
                                 (unmapped_file, unmapped_filename)):
            if filename in dirty_filenames:
                handle.close()

        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
def run(self):
    """Align the configured reads with SHRiMP 2 and produce a sorted BAM.

    Reads ``self.references``, ``self.reads``, ``self.pairs``,
    ``self.interleaved``, ``self.shrimp_options``, ``self.sam_unaligned``,
    ``self.half_paired``, ``self.cs`` and ``self.output_dir``.

    Each read set is aligned in turn.  SAM output from every run is fed into
    a single temporary BAM file (header lines are only kept from the first
    run), which is then sorted with ``samtools sort -n`` into the
    'alignments' prefix inside the output directory.  The aligner's stderr
    goes to 'shrimp_log.txt'.

    Raises AssertionError when no references or reads were given, when a
    pair does not contain two files, when quality offset guesses conflict,
    or when the aligner exits with a non-zero status.
    """
    grace.require_shrimp_2()
    grace.require_samtools()

    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'

    # Each entry is (list of filenames, is_paired).
    read_sets = []
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))

    # Options added unless the user already supplied the same flag.
    # A value of None marks a flag that takes no argument.
    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(grace.how_many_cpus()),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None,
    }

    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None

    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None

    # Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    reference = workspace.get_reference()

    # Run shrimp
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    temp_filename = io.abspath(self.output_dir, 'temp.bam')

    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
    log_file = open(log_filename, 'wb')

    sam_eater = sam.Bam_writer(temp_filename)

    # One-element lists so the nested function can update them.
    sam_header_sent = [False]
    n_seen = [0]

    def eat(process):
        """Copy SAM lines from a finished-or-running aligner into the BAM."""
        for line in process.stdout:
            if line.startswith('@'):
                # Header line: only pass through those of the first run.
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced'
                                 % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)

        # Wait outside the assert so the child is always reaped, even when
        # assertions are disabled with -O.
        exit_status = process.wait()
        assert exit_status == 0, 'shrimp failed'
        sam_header_sent[0] = True

    def remove_pair_options(options):
        """Return options without the flags that only apply to paired reads."""
        # (flag, number of items to drop: the flag plus any argument)
        for flag, width in (('-p', 2), ('-I', 2), ('--half-paired', 1)):
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + width:]
        return options

    try:
        if '--qv-offset' not in self.shrimp_options:
            guesses = []
            for filenames, is_paired in read_sets:
                for filename in filenames:
                    guesses.append(io.guess_quality_offset(filename))
            assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            default_options['--qv-offset'] = str(guesses[0])

        for filenames, is_paired in read_sets:
            options = self.shrimp_options[:]

            # A three-item first record is taken to mean the file carries
            # qualities.
            # NOTE(review): for an empty file .next() raises StopIteration
            # inside this generator expression -- confirm that case is
            # handled as intended.
            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) == 3
                for filename in filenames
            )
            if has_qualities:
                options.append('--fastq')

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(
                self.cs, options + reads_parameters)

            sys.stderr.write('Running ' + ' '.join(full_param) + '\n')

            p = io.run(full_param, stdout=subprocess.PIPE, stderr=log_file)
            eat(p)
    finally:
        # Release the log and the BAM writer even when alignment fails.
        log_file.close()
        sam_eater.close()

    grace.status('Sort')

    io.execute(['samtools', 'sort', '-n', temp_filename, bam_prefix])

    os.unlink(temp_filename)

    grace.status('')
def main(args):
    """Align reads against reference sequences with SHRiMP 1 (rmapper).

    Reads are split into batches of roughly ``--batch-size`` bases, and up to
    ``--cpus`` rmapper processes are kept running at once.  Results are
    collected in the order the batches were started.

    Options taken from ``args``:
        --solid         use 'rmapper-cs' instead of 'rmapper-ls'
        --verbose       do not discard rmapper's stderr
        --threshold     passed to rmapper as '-h' (default '68%')
        --stride        recorded in the configuration (default 1)
        --cpus          maximum concurrent rmapper processes
        --batch-size    bases of read sequence per batch (default 5000000)

    The leading positional arguments are the output directory followed by
    reference files.  'reads:', 'pairs:' and 'shrimp-options:' sections
    supply read files and extra rmapper options.

    Files written to the output directory: reference.fa, config.txt,
    shrimp_hits.txt.gz and unmapped.fa.gz.

    Returns 1 after printing USAGE when no output directory was given,
    otherwise 0.  Raises AssertionError for missing or empty inputs and
    when rmapper exits with a non-zero status.
    """
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()

    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')

    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000)

    input_reference_filenames = []
    reads_filenames = []

    # rmapper receives the threshold exactly as typed; the configuration
    # stores it as a negative fraction (percentages) or an integer.
    shrimp_options = ['-h', threshold]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1]) / 100.0
    else:
        threshold = int(threshold)

    # A list so the nested section handlers can record the directory.
    output_dirs = []

    def front_command(args):
        grace.expect_no_further_options(args)
        if not args:
            return
        output_dirs.append(args[0])
        input_reference_filenames.extend(
            [os.path.abspath(filename) for filename in args[1:]])

    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend(
            [[os.path.abspath(filename)] for filename in args])

    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append(
            [os.path.abspath(filename) for filename in args])

    def shrimp_options_command(args):
        shrimp_options.extend(args)

    grace.execute(args, {
        'reads': reads_command,
        '--reads': reads_command,
        'pairs': pairs_command,
        'shrimp-options': shrimp_options_command,
        '--shrimp-options': shrimp_options_command,
    }, front_command)

    if not output_dirs:
        sys.stderr.write(USAGE % n_cpus + '\n')
        return 1

    output_dir = output_dirs[0]

    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'

    for filename in itertools.chain(input_reference_filenames, *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'

    def shell_quote(text):
        # Single-quote for /bin/sh so that paths containing spaces or
        # metacharacters reach rmapper as one argument.
        return "'" + text.replace("'", "'\\''") + "'"

    # Merge all references into a single FASTA file for rmapper.
    reference_filename = os.path.join(output_dir, 'reference.fa')
    total_reference_sequences = 0
    total_reference_bases = 0
    reference_file = open(reference_filename, 'wb')
    try:
        for input_reference_filename in input_reference_filenames:
            for name, sequence in io.read_sequences(input_reference_filename):
                # Don't retain any comment
                name = name.split()[0]
                io.write_fasta(reference_file, name, sequence)

                total_reference_sequences += 1
                total_reference_bases += len(sequence)
    finally:
        reference_file.close()

    sys.stdout.write('%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases),
        's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences),
        's' if total_reference_sequences != 1 else '') + '\n')

    assert total_reference_bases, 'Reference sequence file is empty'

    config = {
        'references': input_reference_filenames,
        'reads': reads_filenames,
        'stride': stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    try:
        pprint.pprint(config, config_file)
    finally:
        config_file.close()

    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')

    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')

    # Files to delete if we do not finish cleanly.
    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)

    try:
        N = [0]  # Batch counter, in a list so do_shrimp can update it.

        def do_shrimp(read_set):
            """Start rmapper on one batch; return a callable that collects it."""
            my_number = N[0]
            N[0] += 1

            tempname = os.path.join(
                output_dir, 'temp%d-%d.fa' % (os.getpid(), my_number))
            tempname_out = os.path.join(
                output_dir, 'temp%d-%d.txt' % (os.getpid(), my_number))

            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)

            f = open(tempname, 'wb')
            try:
                for read_name, read_seq in read_set:
                    f.write('>' + read_name + '\n')
                    f.write(read_seq + '\n')
            finally:
                f.close()

            # The program name and user-supplied options are passed through
            # unquoted, as before; only the paths we generate are quoted.
            command = (
                shrimp + ' ' + ' '.join(shrimp_options) + ' ' +
                shell_quote(tempname) + ' ' +
                shell_quote(reference_filename) +
                ' >' + shell_quote(tempname_out))
            if not verbose:
                command += ' 2>/dev/null'

            child_pid = os.spawnl(
                os.P_NOWAIT, '/bin/sh', '/bin/sh', '-c', command)

            def finalize():
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'

                hits = {}  # read_name -> [ hit line ]
                f = open(tempname_out, 'rb')
                try:
                    for line in f:
                        if line.startswith('>'):
                            read_name = line.split(None, 1)[0][1:]
                            hits.setdefault(read_name, []).append(line)
                finally:
                    f.close()

                # Emit in the order the reads were given: hits for mapped
                # reads, FASTA records for reads without any hit.
                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        unmapped_file.write('>' + read_name + '\n')
                        unmapped_file.write(read_seq + '\n')

                output_file.flush()
                unmapped_file.flush()

                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)

            return finalize

        shrimps = []  # Pending finalize callables, oldest first.

        reader = iter_reads(config)
        read_count = 0

        while True:
            read_set = []
            read_set_bases = 0

            # Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs
            for read_name, read_seq in reader:
                read_name = read_name.split()[0]
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)

                read_count += 1
                if read_set_bases >= batch_size:
                    break

            if not read_set:
                break

            # Keep at most max_shrimps processes alive: collect the oldest
            # before starting another.
            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append(do_shrimp(read_set))

            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))

        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps))
            shrimps.pop(0)()

        grace.status('')

        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)

        return 0

    finally:
        # On failure the result files are still open; close them before
        # deleting so no handle is leaked.
        for handle, filename in ((output_file, output_filename),
                                 (unmapped_file, unmapped_filename)):
            if filename in dirty_filenames:
                handle.close()

        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
def run(self):
    """Align the configured reads with SHRiMP 2 and produce a sorted BAM.

    Reads ``self.references``, ``self.reads``, ``self.pairs``,
    ``self.interleaved``, ``self.shrimp_options``, ``self.sam_unaligned``,
    ``self.half_paired``, ``self.cs`` and ``self.output_dir``.

    Each read set is aligned in turn.  SAM output from every run is fed into
    a single temporary BAM file (header lines are only kept from the first
    run), which is then sorted with ``samtools sort -n`` into the
    'alignments' prefix inside the output directory.  The aligner's stderr
    goes to 'shrimp_log.txt'.

    Raises AssertionError when no references or reads were given, when a
    pair does not contain two files, when quality offset guesses conflict,
    or when the aligner exits with a non-zero status.
    """
    grace.require_shrimp_2()
    grace.require_samtools()

    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'

    # Each entry is (list of filenames, is_paired).
    read_sets = []
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))

    # Options added unless the user already supplied the same flag.
    # A value of None marks a flag that takes no argument.
    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(grace.how_many_cpus()),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None,
    }

    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None

    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None

    # Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    reference = workspace.get_reference()

    # Run shrimp
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    temp_filename = io.abspath(self.output_dir, 'temp.bam')

    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
    log_file = open(log_filename, 'wb')

    sam_eater = sam.Bam_writer(temp_filename)

    # One-element lists so the nested function can update them.
    sam_header_sent = [False]
    n_seen = [0]

    def eat(process):
        """Copy SAM lines from a finished-or-running aligner into the BAM."""
        for line in process.stdout:
            if line.startswith('@'):
                # Header line: only pass through those of the first run.
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced'
                                 % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)

        # Wait outside the assert so the child is always reaped, even when
        # assertions are disabled with -O.
        exit_status = process.wait()
        assert exit_status == 0, 'shrimp failed'
        sam_header_sent[0] = True

    def remove_pair_options(options):
        """Return options without the flags that only apply to paired reads."""
        # (flag, number of items to drop: the flag plus any argument)
        for flag, width in (('-p', 2), ('-I', 2), ('--half-paired', 1)):
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + width:]
        return options

    try:
        if '--qv-offset' not in self.shrimp_options:
            guesses = []
            for filenames, is_paired in read_sets:
                for filename in filenames:
                    guesses.append(io.guess_quality_offset(filename))
            assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            default_options['--qv-offset'] = str(guesses[0])

        for filenames, is_paired in read_sets:
            options = self.shrimp_options[:]

            # A three-item first record is taken to mean the file carries
            # qualities.
            # NOTE(review): for an empty file .next() raises StopIteration
            # inside this generator expression -- confirm that case is
            # handled as intended.
            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) == 3
                for filename in filenames
            )
            if has_qualities:
                options.append('--fastq')

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(
                self.cs, options + reads_parameters)

            sys.stderr.write('Running ' + ' '.join(full_param) + '\n')

            p = io.run(full_param, stdout=subprocess.PIPE, stderr=log_file)
            eat(p)
    finally:
        # Release the log and the BAM writer even when alignment fails.
        log_file.close()
        sam_eater.close()

    grace.status('Sort')

    io.execute(['samtools', 'sort', '-n', temp_filename, bam_prefix])

    os.unlink(temp_filename)

    grace.status('')