def run(self):
    """Align FASTQ reads against the reference using bowtie2 and produce a
    name-sorted BAM file ("alignments") in the working directory.

    Reads may be given as single files (self.reads), as explicit file pairs
    (self.pairs), or as interleaved-pair files (self.interleaved).  Inputs
    that are not already (possibly gzip/bzip2 compressed) FASTQ are converted
    to temporary FASTQ files before alignment.
    """
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
    working = self.get_workspace()
    working.setup_reference(self.references, bowtie=True)
    # Record the snp_cost parameter in the working directory
    # (presumably consumed by downstream variant calling — TODO confirm).
    working.update_param(snp_cost=2.0)
    reference = working.get_reference()
    # NOTE(review): log_file is only closed on the success path at the end of
    # this method; an exception in between leaks the handle.
    log_file = open(self.log_filename(), 'wb')
    with workspace.tempspace(dir=working.working_dir) as temp:
        # Single-element list so the tempname() closure can mutate the
        # counter — Python 2 has no 'nonlocal'.
        n = [0]
        def tempname():
            # Generate a fresh temporary FASTQ filename: 1.fq, 2.fq, ...
            n[0] += 1
            return temp / ('%d.fq' % n[0])
        def convert(filename):
            # Return filename unchanged if it is already FASTQ (optionally
            # gzip/bzip2 compressed); otherwise rewrite it as a temporary
            # FASTQ file.  Qualities are required, so inputs without quality
            # scores will fail here rather than in bowtie2.
            info = io.get_file_info(filename)
            ok = selection.matches(
                'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                info)
            if ok:
                return filename
            result_name = tempname()
            with open(result_name, 'wb') as f:
                for name, seq, qual in io.read_sequences(
                        filename, qualities='required'):
                    io.write_fastq(f, name, seq, qual)
            return result_name
        ones = []     # left-mate FASTQ filenames
        twos = []     # right-mate FASTQ filenames
        singles = []  # unpaired FASTQ filenames
        for pair in self.pairs:
            assert len(
                pair) == 2, 'Need two files in each "pair:" section.'
            ones.append(convert(pair[0]))
            twos.append(convert(pair[1]))
        for item in self.interleaved:
            # De-interleave: records alternate left/right mate; split the
            # stream into two temporary FASTQ files.
            left_name = tempname()
            right_name = tempname()
            ones.append(left_name)
            twos.append(right_name)
            with open(left_name,'wb') as left, \
                 open(right_name,'wb') as right:
                reader = io.read_sequences(item, qualities='required')
                while True:
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        break
                    io.write_fastq(left, name, seq, qual)
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        # A left mate without a right mate means the
                        # interleaving is broken.
                        raise grace.Error(
                            'Interleaved file contains odd number of sequences'
                        )
                    io.write_fastq(right, name, seq, qual)
        for item in self.reads:
            singles.append(convert(item))
        # Do not use more cores than the coordinator makes available.
        cores = min(self.cores, legion.coordinator().get_cores())
        # Base bowtie2 invocation; read-group id "1" with the sample name
        # taken from the working directory name.
        command = ([
            'bowtie2',
            '--threads', str(cores),
            '--rg-id', '1',
            '--rg', 'SM:' + working.name,
        ] + self.bowtie_options + ['-x', reference.get_bowtie_index_prefix()])
        # Up to two invocations: one for paired reads, one for unpaired.
        commands = []
        if ones:
            commands.append(command + ['-1', ','.join(ones), '-2', ','.join(twos)])
        if singles:
            commands.append(command + ['-U', ','.join(singles)])
        temp_bam_name = temp / 'temp.bam'
        # Stream the concatenated SAM output of all bowtie2 runs through
        # "samtools view" to produce a single unsorted BAM.
        with io.pipe_to(['samtools', 'view', '-S', '-b', '-'],
                        stdout=open(temp_bam_name, 'wb'),
                        stderr=log_file) as f:
            header_sent = False
            for command in commands:
                self.log.log('Running:\n' + ' '.join(command) + '\n')
                with io.pipe_from(command, stderr=log_file, cores=cores) as f_out:
                    for line in f_out:
                        # Forward the SAM header ('@' lines) from the first
                        # invocation only; later invocations contribute just
                        # their alignment records.
                        if not header_sent or not line.startswith('@'):
                            f.write(line)
                header_sent = True
        #io.execute([
        #    'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
        #    ])
        # Name-sort the temporary BAM into its final location.
        sam.sort_bam(temp_bam_name, working / 'alignments', by_name=True,
                     cores=self.cores)
    log_file.close()
def run(self):
    """Run bowtie2 over the configured reads and write a name-sorted BAM
    ("alignments") into the working directory.

    Accepts unpaired reads (self.reads), paired files (self.pairs) and
    interleaved-pair files (self.interleaved).  Anything not already FASTQ
    (plain, gzip or bzip2) is first converted to a temporary FASTQ file.
    """
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
    working = self.get_workspace()
    working.setup_reference(self.references, bowtie=True)
    # Store snp_cost in the working directory's parameters
    # (presumably read by later pipeline stages — TODO confirm).
    working.update_param(snp_cost=2.0)
    reference = working.get_reference()
    # NOTE(review): closed only at the end of the happy path; an exception
    # part-way through leaves this handle open.
    log_file = open(self.log_filename(),'wb')
    with workspace.tempspace(dir=working.working_dir) as temp:
        # Mutable one-element list stands in for 'nonlocal' (Python 2).
        n = [ 0 ]
        def tempname():
            # Next unused temporary FASTQ name: 1.fq, 2.fq, ...
            n[0] += 1
            return temp/('%d.fq'%n[0])
        def convert(filename):
            # Pass FASTQ files (optionally compressed) straight through;
            # rewrite any other supported format as temporary FASTQ.
            # qualities='required' makes quality-less input an error here.
            info = io.get_file_info(filename)
            ok = selection.matches(
                'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                info)
            if ok:
                return filename
            result_name = tempname()
            with open(result_name,'wb') as f:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    io.write_fastq(f, name, seq, qual)
            return result_name
        ones = [ ]     # first-of-pair files
        twos = [ ]     # second-of-pair files
        singles = [ ]  # unpaired read files
        for pair in self.pairs:
            assert len(pair) == 2, 'Need two files in each "pair:" section.'
            ones.append(convert(pair[0]))
            twos.append(convert(pair[1]))
        for item in self.interleaved:
            # Split an interleaved file (alternating left/right records)
            # into separate left and right temporary FASTQ files.
            left_name = tempname()
            right_name = tempname()
            ones.append(left_name)
            twos.append(right_name)
            with open(left_name,'wb') as left, \
                 open(right_name,'wb') as right:
                reader = io.read_sequences(item, qualities='required')
                while True:
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        break
                    io.write_fastq(left, name,seq,qual)
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        # Left record with no matching right record.
                        raise grace.Error('Interleaved file contains odd number of sequences')
                    io.write_fastq(right, name,seq,qual)
        for item in self.reads:
            singles.append(convert(item))
        # Respect the coordinator's core budget.
        cores = min(self.cores, legion.coordinator().get_cores())
        # Common bowtie2 command line; sample name in the read group comes
        # from the working directory's name.
        command = (
            [ 'bowtie2',
              '--threads', str(cores),
              '--rg-id', '1',
              '--rg', 'SM:'+working.name,
              ] +
            self.bowtie_options +
            [ '-x', reference.get_bowtie_index_prefix() ]
            )
        # One invocation for paired input, one for unpaired, as needed.
        commands = [ ]
        if ones:
            commands.append(command + [ '-1', ','.join(ones), '-2', ','.join(twos) ])
        if singles:
            commands.append(command + [ '-U', ','.join(singles) ])
        temp_bam_name = temp/'temp.bam'
        # Pipe the merged SAM stream from all invocations through
        # "samtools view -S -b" to write one unsorted BAM.
        with io.pipe_to(
                ['samtools', 'view', '-S', '-b', '-'],
                stdout=open(temp_bam_name,'wb'),
                stderr=log_file
                ) as f:
            header_sent = False
            for command in commands:
                self.log.log('Running:\n' + ' '.join(command) + '\n')
                with io.pipe_from(
                        command,
                        stderr=log_file, cores=cores
                        ) as f_out:
                    for line in f_out:
                        # Only the first run contributes header ('@') lines;
                        # later runs contribute alignments only.
                        if not header_sent or not line.startswith('@'):
                            f.write(line)
                header_sent = True
        #io.execute([
        #    'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
        #    ])
        # Final step: name-sort into the working directory.
        sam.sort_bam(temp_bam_name, working/'alignments', by_name=True, cores=self.cores)
    log_file.close()
def run(self):
    """Align reads with SHRiMP 2, streaming its SAM output into a BAM and
    name-sorting the result into the output directory ("alignments").

    Each entry in self.reads, self.pairs and self.interleaved becomes a
    separate gmapper invocation; the SAM header is kept from the first
    invocation only.
    """
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
    # Normalize all inputs into (filenames, is_paired) work items.
    read_sets = []
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))
    #Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    # Record snp_cost in the workspace parameters (presumably used by
    # downstream consensus/variant steps — TODO confirm).
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    reference_filename = reference.reference_fasta_filename()
    cores = min(self.cores, legion.coordinator().get_cores())
    # SHRiMP flags applied unless the user already supplied them.
    # None means "flag takes no argument".
    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(cores),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None,
    }
    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None
    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None
    cutoff = '55%' #Default changed in SHRiMP 2.0.2
    if '-h' in self.shrimp_options:
        cutoff = self.shrimp_options[self.shrimp_options.index('-h') + 1]
    # NOTE(review): 'cutoff', 'reference_filename', 'bam_filename' and
    # 'bam_sorted_prefix' are computed but unused in this method.
    #Run shrimp
    bam_filename = io.abspath(self.output_dir, 'alignments.bam')
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')
    temp_filename = io.abspath(self.output_dir, 'temp.bam')
    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
    log_file = open(log_filename, 'wb')
    sam_eater = sam.Bam_writer(temp_filename)
    # One-element lists so eat() can mutate them (Python 2, no 'nonlocal').
    sam_header_sent = [False]
    n_seen = [0]
    def eat(f):
        # Feed SAM lines from one gmapper run into the BAM writer.  Header
        # ('@') lines are forwarded only for the first run; alignment lines
        # are counted for the progress display.
        for line in f:
            if line.startswith('@'):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced' %
                                 grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True
    def remove_pair_options(options):
        # Strip pairing-related flags for unpaired runs: -p and -I take a
        # value (remove two elements), --half-paired does not (remove one).
        for flag in ['-p', '-I']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ['--half-paired']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options
    for i, (filenames, is_paired) in enumerate(read_sets):
        options = self.shrimp_options[:]
        # Peek at the first record of each file: a 3-tuple means quality
        # scores are present, so gmapper should be told the input is FASTQ.
        has_qualities = all(
            len(io.read_sequences(filename, qualities=True).next()) == 3
            #A little ugly
            for filename in filenames)
        if has_qualities:
            options.append('--fastq')
        if len(filenames) == 1:
            reads_parameters = [filenames[0]]
        else:
            reads_parameters = ['-1', filenames[0], '-2', filenames[1]]
        if '--qv-offset' not in self.shrimp_options:
            #guesses = [ ]
            #for filename in filenames:
            #    guesses.append(io.guess_quality_offset(filename))
            #assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            #default_options['--qv-offset'] = str(guesses[0])
            default_options['--qv-offset'] = str(
                io.guess_quality_offset(*filenames))
        # Read group id and sample both taken from the workspace name;
        # commas are replaced as the field itself is comma-separated.
        default_options['--read-group'] = '%s,%s' % (
            workspace.name.replace(',', '_'),
            workspace.name.replace(',', '_'))
        # Apply defaults only where the user did not supply the flag.
        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])
        if not is_paired:
            options = remove_pair_options(options)
        grace.status('')
        full_param = reference.shrimp_command(self.cs, options + reads_parameters)
        print >> sys.stderr, 'Running', ' '.join(full_param)
        with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
            eat(f)
    log_file.close()
    sam_eater.close()
    grace.status('Sort')
    #io.execute([
    #    'samtools', 'sort', '-n', temp_filename, bam_prefix
    #])
    # Name-sort the accumulated BAM, then discard the unsorted temporary.
    sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)
    os.unlink(temp_filename)
    grace.status('')
def run(self):
    """Call variants with FreeBayes, writing and indexing <prefix>.vcf.

    Each entry in self.samples may be a BAM file, a nesoni working
    directory (its filtered sorted BAM is used), or a reference FASTA.
    If no FASTA is given explicitly, the reference of the first working
    directory is used.
    """
    bams = [ ]
    reference = None    # explicitly supplied reference FASTA, if any
    reference2 = None   # fallback: reference from the first working dir
    extra = [ ]         # extra "##sampleTags=..." header lines to inject
    for sample in self.samples:
        if sam.is_bam(sample):
            bams.append(sample)
        elif os.path.isdir(sample):
            working = working_directory.Working(sample,True)
            bams.append( working.get_filtered_sorted_bam() )
            extra.append( '##sampleTags=' + ','.join(working.get_tags()) )
            if reference2 is None:
                reference2 = working.get_reference().reference_fasta_filename()
        elif io.is_sequence_file(sample):
            assert reference is None, 'Only one reference FASTA file allowed.'
            reference = sample
    if reference is None:
        reference = reference2
    if reference is None:
        raise grace.Error('No reference FASTA file given.')
    with nesoni.Stage() as stage:
        # Stage manages cleanup of the temp space and open files/pipes below.
        tempspace = stage.enter( workspace.tempspace() )
        if self.depth_limit:
            # Depth-limit each BAM into the temp space (in parallel via
            # stage2), then substitute the limited BAMs.
            with nesoni.Stage() as stage2:
                for i in xrange(len(bams)):
                    sam.Bam_depth_limit(
                        tempspace/('%d'%i), bams[i],
                        depth=self.depth_limit
                        ).process_make(stage2)
                    bams[i] = tempspace/('%d.bam'%i)
        # FreeBayes claims to handle multiple bams, but it doesn't actually work
        if len(bams) > 1:
            sam.Bam_merge(tempspace/'merged', bams=bams, index=False).run()
            bams = [ tempspace/'merged.bam' ]
        command = [
            'freebayes',
            '-f', reference,
            '--ploidy',str(self.ploidy),
            '--pvar',str(self.pvar),
            ] + self.freebayes_options + bams
        self.log.log('Running: '+' '.join(command)+'\n')
        f_out = stage.enter( open(self.prefix+'.vcf','wb') )
        f_in = stage.enter( io.pipe_from(command) )
        # Copy FreeBayes' VCF output, splicing the ##sampleTags lines in
        # just before the first non-"##" line (the #CHROM header row).
        done_extra = False
        for line in f_in:
            if not done_extra and not line.startswith('##'):
                for extra_line in extra:
                    f_out.write(extra_line+'\n')
                done_extra = True
            f_out.write(line)
    index_vcf(self.prefix+'.vcf')
def run(self):
    """Align reads using SHRiMP 2, piping its SAM output into a BAM file
    and then name-sorting it with samtools into the output directory.

    One gmapper invocation is made per entry of self.reads / self.pairs /
    self.interleaved; only the first invocation's SAM header is kept.
    """
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, "No reference sequences given"
    assert self.reads or self.pairs or self.interleaved, "No reads given"
    for pair in self.pairs:
        assert len(pair) == 2, "Two files required in each pair: section"
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
    # Collapse all input kinds into (filenames, is_paired) work items.
    read_sets = []
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))
    # Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    # Persist snp_cost as a workspace parameter (presumably consumed by
    # later pipeline stages — TODO confirm).
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    reference_filename = reference.reference_fasta_filename()
    cores = min(self.cores, legion.coordinator().get_cores())
    # Flags added unless already present in the user's options;
    # a value of None marks a flag that takes no argument.
    default_options = {
        "-E": None,
        "-T": None,
        "-N": str(cores),
        "-n": "2",
        "-w": "200%",
        "-p": "opp-in",
        "-I": "0,500",
        "-X": None,
    }
    if self.sam_unaligned:
        default_options["--sam-unaligned"] = None
    if self.half_paired:
        default_options["--half-paired"] = None
    else:
        default_options["--no-half-paired"] = None
    cutoff = "55%"  # Default changed in SHRiMP 2.0.2
    if "-h" in self.shrimp_options:
        cutoff = self.shrimp_options[self.shrimp_options.index("-h") + 1]
    # NOTE(review): 'cutoff', 'reference_filename', 'bam_filename' and
    # 'bam_sorted_prefix' are never used later in this method.
    # Run shrimp
    bam_filename = io.abspath(self.output_dir, "alignments.bam")
    bam_prefix = io.abspath(self.output_dir, "alignments")
    bam_sorted_prefix = io.abspath(self.output_dir, "alignments_sorted")
    temp_filename = io.abspath(self.output_dir, "temp.bam")
    log_filename = io.abspath(self.output_dir, "shrimp_log.txt")
    log_file = open(log_filename, "wb")
    sam_eater = sam.Bam_writer(temp_filename)
    # One-element lists let eat() mutate state (Python 2 lacks 'nonlocal').
    sam_header_sent = [False]
    n_seen = [0]
    def eat(f):
        # Stream one gmapper run's SAM output into the BAM writer.
        # Header ('@') lines pass through only once across all runs;
        # alignment lines are counted for the status display.
        for line in f:
            if line.startswith("@"):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status("%s alignments produced" % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True
    def remove_pair_options(options):
        # Remove pairing flags for unpaired runs: -p/-I carry a value
        # (drop two tokens), --half-paired does not (drop one token).
        for flag in ["-p", "-I"]:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2 :]
        for flag in ["--half-paired"]:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1 :]
        return options
    for i, (filenames, is_paired) in enumerate(read_sets):
        options = self.shrimp_options[:]
        # A 3-tuple from the first record means qualities are present,
        # in which case gmapper is told the input is FASTQ.
        has_qualities = all(
            len(io.read_sequences(filename, qualities=True).next()) == 3
            for filename in filenames  # A little ugly
        )
        if has_qualities:
            options.append("--fastq")
        if len(filenames) == 1:
            reads_parameters = [filenames[0]]
        else:
            reads_parameters = ["-1", filenames[0], "-2", filenames[1]]
        if "--qv-offset" not in self.shrimp_options:
            # Guess the quality encoding offset per file; refuse to proceed
            # if the files disagree.
            guesses = []
            for filename in filenames:
                guesses.append(io.guess_quality_offset(filename))
            assert (
                len(set(guesses)) == 1
            ), "Conflicting quality offset guesses, please specify --qv-offset manually."
            default_options["--qv-offset"] = str(guesses[0])
        # Read-group id and sample name both come from the workspace name;
        # commas are replaced since the field is comma-separated.
        default_options["--read-group"] = "%s,%s" % (
            workspace.name.replace(",", "_"),
            workspace.name.replace(",", "_"),
        )
        # Apply a default only if the user didn't supply the flag.
        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])
        if not is_paired:
            options = remove_pair_options(options)
        grace.status("")
        full_param = reference.shrimp_command(self.cs, options + reads_parameters)
        print >>sys.stderr, "Running", " ".join(full_param)
        with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
            eat(f)
    log_file.close()
    sam_eater.close()
    grace.status("Sort")
    # Name-sort the accumulated BAM, then remove the unsorted temporary.
    io.execute(["samtools", "sort", "-n", temp_filename, bam_prefix])
    os.unlink(temp_filename)
    grace.status("")
def run(self):
    """Run FreeBayes variant calling over the given samples and write an
    indexed VCF to <prefix>.vcf.

    A sample may be a BAM file, a nesoni working directory (contributing
    its filtered sorted BAM plus a ##sampleTags header line), or a
    reference FASTA file.  If no FASTA is supplied, the first working
    directory's reference is used.
    """
    bams = []
    reference = None    # explicitly supplied reference FASTA
    reference2 = None   # fallback reference from the first working dir
    extra = []          # "##sampleTags=..." lines to inject into the VCF
    for sample in self.samples:
        if sam.is_bam(sample):
            bams.append(sample)
        elif os.path.isdir(sample):
            working = working_directory.Working(sample, True)
            bams.append(working.get_filtered_sorted_bam())
            extra.append('##sampleTags=' + ','.join(working.get_tags()))
            if reference2 is None:
                reference2 = working.get_reference(
                ).reference_fasta_filename()
        elif io.is_sequence_file(sample):
            assert reference is None, 'Only one reference FASTA file allowed.'
            reference = sample
    if reference is None:
        reference = reference2
    if reference is None:
        raise grace.Error('No reference FASTA file given.')
    with nesoni.Stage() as stage:
        # The stage cleans up the temp space, output file and pipe on exit.
        tempspace = stage.enter(workspace.tempspace())
        if self.depth_limit:
            # Depth-limit each BAM (run in parallel via stage2), then use
            # the limited copies instead of the originals.
            with nesoni.Stage() as stage2:
                for i in xrange(len(bams)):
                    sam.Bam_depth_limit(
                        tempspace / ('%d' % i), bams[i],
                        depth=self.depth_limit).process_make(stage2)
                    bams[i] = tempspace / ('%d.bam' % i)
        # FreeBayes claims to handle multiple bams, but it doesn't actually work
        if len(bams) > 1:
            sam.Bam_merge(tempspace / 'merged', bams=bams, index=False).run()
            bams = [tempspace / 'merged.bam']
        command = [
            'freebayes',
            '-f', reference,
            '--ploidy', str(self.ploidy),
            '--pvar', str(self.pvar),
        ] + self.freebayes_options + bams
        self.log.log('Running: ' + ' '.join(command) + '\n')
        f_out = stage.enter(open(self.prefix + '.vcf', 'wb'))
        f_in = stage.enter(io.pipe_from(command))
        # Copy the VCF stream, inserting the ##sampleTags lines immediately
        # before the first non-"##" line (the #CHROM column header).
        done_extra = False
        for line in f_in:
            if not done_extra and not line.startswith('##'):
                for extra_line in extra:
                    f_out.write(extra_line + '\n')
                done_extra = True
            f_out.write(line)
    index_vcf(self.prefix + '.vcf')
def run(self):
    """Align reads with SHRiMP 2 and produce a name-sorted BAM
    ("alignments") in the output directory.

    Builds one gmapper invocation per entry of self.reads, self.pairs and
    self.interleaved; the SAM stream from every run is funnelled into a
    single BAM, keeping only the first run's header.
    """
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
    # Normalize inputs to (filenames, is_paired) work items.
    read_sets = [ ]
    for item in self.reads:
        read_sets.append( ([item], False) )
    for item in self.pairs:
        read_sets.append( (item, True) )
    for item in self.interleaved:
        read_sets.append( ([item], True) )
    #Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    # Save snp_cost in the workspace parameters (presumably used by later
    # stages — TODO confirm).
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    reference_filename = reference.reference_fasta_filename()
    cores = min(self.cores, legion.coordinator().get_cores())
    # Defaults applied unless the user already gave the flag;
    # None marks a flag with no argument.
    default_options = {
        '-E' : None,
        '-T' : None,
        '-N' : str(cores),
        '-n':'2',
        '-w':'200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X':None,
    }
    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None
    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None
    cutoff = '55%' #Default changed in SHRiMP 2.0.2
    if '-h' in self.shrimp_options:
        cutoff = self.shrimp_options[ self.shrimp_options.index('-h')+1 ]
    # NOTE(review): 'cutoff', 'reference_filename', 'bam_filename' and
    # 'bam_sorted_prefix' are unused below in this method.
    #Run shrimp
    bam_filename = io.abspath(self.output_dir, 'alignments.bam')
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')
    temp_filename = io.abspath(self.output_dir, 'temp.bam')
    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
    log_file = open(log_filename, 'wb')
    sam_eater = sam.Bam_writer(temp_filename)
    # Mutable one-element lists in lieu of 'nonlocal' (Python 2).
    sam_header_sent = [False]
    n_seen = [0]
    def eat(f):
        # Pump one run's SAM lines into the BAM writer; pass header ('@')
        # lines through only once overall, count alignments for status.
        for line in f:
            if line.startswith('@'):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True
    def remove_pair_options(options):
        # Drop pairing flags for unpaired runs; -p and -I take a value
        # (two tokens), --half-paired stands alone (one token).
        for flag in ['-p','-I']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos+2:]
        for flag in ['--half-paired']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos+1:]
        return options
    for i, (filenames, is_paired) in enumerate(read_sets):
        options = self.shrimp_options[:]
        # First record a 3-tuple => qualities present => tell gmapper the
        # input is FASTQ.
        has_qualities = all(
            len( io.read_sequences(filename, qualities=True).next() ) == 3
            #A little ugly
            for filename in filenames
        )
        if has_qualities:
            options.append( '--fastq' )
        if len(filenames) == 1:
            reads_parameters = [ filenames[0] ]
        else:
            reads_parameters = [ '-1', filenames[0], '-2', filenames[1] ]
        if '--qv-offset' not in self.shrimp_options:
            #guesses = [ ]
            #for filename in filenames:
            #    guesses.append(io.guess_quality_offset(filename))
            #assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            #default_options['--qv-offset'] = str(guesses[0])
            default_options['--qv-offset'] = str( io.guess_quality_offset(*filenames) )
        # Read group id and sample from the workspace name; commas replaced
        # because the field is itself comma-separated.
        default_options['--read-group'] = '%s,%s' % (
            workspace.name.replace(',','_'),
            workspace.name.replace(',','_')
        )
        # Only apply defaults for flags the user did not give.
        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])
        if not is_paired:
            options = remove_pair_options(options)
        grace.status('')
        full_param = reference.shrimp_command(self.cs, options + reads_parameters)
        print >> sys.stderr, 'Running', ' '.join(full_param)
        with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
            eat(f)
    log_file.close()
    sam_eater.close()
    grace.status('Sort')
    #io.execute([
    #    'samtools', 'sort', '-n', temp_filename, bam_prefix
    #])
    # Name-sort the accumulated BAM and drop the unsorted temporary.
    sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)
    os.unlink(temp_filename)
    grace.status('')