def run(self):
    log = self.log
    
    #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
    #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
    #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
    #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
    #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
    #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
    #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
    #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
    #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
    #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
    #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
    #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
    #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
    #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
    #grace.expect_no_further_options(args)
    
    prefix = self.prefix
    log_name = os.path.split(prefix)[1]
    
    quality_cutoff = self.quality
    qoffset = self.qoffset
    clip_ambiguous = self.clip_ambiguous
    length_cutoff = self.length
    adaptor_cutoff = self.match
    max_error = self.max_errors
    adaptor_set = self.adaptors
    disallow_homopolymers = self.homopolymers
    reverse_complement = self.revcom
    trim_start = self.trim_start
    trim_end = self.trim_end
    output_fasta = self.fasta
    use_gzip = self.gzip
    output_rejects = self.rejects
    
    iterators = [ ]
    filenames = [ ]
    any_paired = False
    
    for filename in self.reads:
        filenames.append(filename)
        iterators.append(itertools.izip(
            io.read_sequences(filename, qualities=True)
        ))
    
    for pair_filenames in self.pairs:
        assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
        filenames.extend(pair_filenames)
        any_paired = True
        iterators.append(itertools.izip(
            io.read_sequences(pair_filenames[0], qualities=True),
            io.read_sequences(pair_filenames[1], qualities=True)
        ))
    
    for filename in self.interleaved:
        filenames.append(filename)
        any_paired = True
        iterators.append(deinterleave(
            io.read_sequences(filename, qualities=True)
        ))
    
    fragment_reads = (2 if any_paired else 1)
    read_in_fragment_names = [ 'read-1', 'read-2' ] if any_paired else [ 'read' ]
    
    assert iterators, 'Nothing to clip'
    
    if qoffset is None:
        guesses = [ io.guess_quality_offset(filename) for filename in filenames ]
        assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify manually.'
        qoffset = guesses[0]
        log.log('FASTQ offset seems to be %d\n' % qoffset)
    
    quality_cutoff_char = chr(qoffset + quality_cutoff)
    
    #log.log('Minimum quality: %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
    #log.log('Clip ambiguous bases: %s\n' % (grace.describe_bool(clip_ambiguous)))
    #log.log('Minimum adaptor match: %d bases, %d errors\n' % (adaptor_cutoff, max_error))
    #log.log('Minimum length: %d bases\n' % length_cutoff)
    
    adaptor_seqs = [ ]
    adaptor_names = [ ]
    if adaptor_set and adaptor_set.lower() != 'none':
        for item in adaptor_set.split(','):
            item = item.strip().lower() + ' '
            any = False
            for line in ADAPTORS.strip().split('\n'):
                if line.startswith('#'): continue
                if not line.lower().startswith(item): continue
                any = True
                name, seq = line.rsplit(None, 1)
                seq = seq.replace('U', 'T')
                
                #if seq in adaptor_seqs: print 'Dup', name
                adaptor_seqs.append(seq)
                adaptor_names.append(name)
                adaptor_seqs.append(bio.reverse_complement(seq))
                adaptor_names.append(name)
            if not any:
                raise grace.Error('Unknown adaptor set: ' + item)
    
    matcher = Matcher(adaptor_seqs, adaptor_names, max_error)
    
    start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    
    if output_fasta:
        write_sequence = io.write_fasta_single_line
    else:
        write_sequence = io.write_fastq
    
    f_single = io.open_possibly_compressed_writer(self.reads_output_filenames()[0])
    if fragment_reads == 2:
        f_paired = io.open_possibly_compressed_writer(self.interleaved_output_filenames()[0])
    if output_rejects:
        f_reject = io.open_possibly_compressed_writer(self.rejects_output_filenames()[0])
    
    n_single = 0
    n_paired = 0
    
    n_in_single = 0
    n_in_paired = 0
    
    total_in_length = [ 0 ] * fragment_reads
    
    n_out = [ 0 ] * fragment_reads
    n_q_clipped = [ 0 ] * fragment_reads
    n_a_clipped = [ 0 ] * fragment_reads
    n_homopolymers = [ 0 ] * fragment_reads
    total_out_length = [ 0 ] * fragment_reads
    
    #log.attach(open(prefix + '_log.txt', 'wb'))
    
    for iterator in iterators:
        for fragment in iterator:
            if (n_in_single + n_in_paired) % 10000 == 0:
                grace.status('Clipping fragment %s' % grace.pretty_number(n_in_single + n_in_paired))
            
            if len(fragment) == 1:
                n_in_single += 1
            else:
                n_in_paired += 1
            
            graduates = [ ]
            rejects = [ ]
            for i, (name, seq, qual) in enumerate(fragment):
                name = name.split()[0]
                seq = seq.upper()
                total_in_length[i] += len(seq)
                
                # Quality clip: keep the longest run of good, unambiguous bases
                start = trim_start
                best_start = 0
                best_len = 0
                for j in xrange(len(seq) - trim_end):
                    if qual[j] < quality_cutoff_char or \
                       (clip_ambiguous and seq[j] not in 'ACGT'):
                        if best_len < j - start:
                            best_start = start
                            best_len = j - start
                        start = j + 1
                j = len(seq) - trim_end
                if best_len < j - start:
                    best_start = start
                    best_len = j - start
                
                clipped_seq = seq[best_start:best_start + best_len]
                clipped_qual = qual[best_start:best_start + best_len]
                if len(clipped_seq) < length_cutoff:
                    n_q_clipped[i] += 1
                    rejects.append((name, seq, qual, 'quality'))
                    continue
                
                # Adaptor clip at the start of the read
                match = matcher.match(clipped_seq)
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[match[0]:]
                    clipped_qual = clipped_qual[match[0]:]
                    start_clips[i][match[0]].append(match[1][0])
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append((name, seq, qual, 'adaptor'))
                        continue
                
                # Adaptor clip at the end of the read
                match = matcher.match(bio.reverse_complement(clipped_seq))
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[:len(clipped_seq) - match[0]]
                    clipped_qual = clipped_qual[:len(clipped_qual) - match[0]]
                    end_clips[i][match[0]].append(match[1][0])
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append((name, seq, qual, 'adaptor'))
                        continue
                
                if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                    n_homopolymers[i] += 1
                    rejects.append((name, seq, qual, 'homopolymer'))
                    continue
                
                graduates.append((name, clipped_seq, clipped_qual))
                n_out[i] += 1
                total_out_length[i] += len(clipped_seq)
            
            if output_rejects:
                for name, seq, qual, reason in rejects:
                    write_sequence(f_reject, name + ' ' + reason, seq, qual)
            
            if graduates:
                if reverse_complement:
                    graduates = [
                        (name, bio.reverse_complement(seq), qual[::-1])
                        for name, seq, qual in graduates
                    ]
                
                if len(graduates) == 1:
                    this_f = f_single
                    n_single += 1
                else:
                    assert len(graduates) == 2
                    this_f = f_paired
                    n_paired += 1
                
                for name, seq, qual in graduates:
                    write_sequence(this_f, name, seq, qual)
    
    grace.status('')
    
    if output_rejects:
        f_reject.close()
    if fragment_reads == 2:
        f_paired.close()
    f_single.close()
    
    def summarize_clips(name, location, clips):
        total = 0
        for i in clips:
            total += len(clips[i])
        log.datum(log_name, name + ' adaptors clipped at ' + location, total)
        
        if not clips:
            return
        
        for i in xrange(min(clips), max(clips) + 1):
            item = clips[i]
            log.quietly_log('%3d bases: %10d ' % (i, len(item)))
            if item:
                avg_errors = float(sum(item2[0] for item2 in item)) / len(item)
                log.quietly_log(' avg errors: %5.2f ' % avg_errors)
                
                counts = collections.defaultdict(int)
                for item2 in item:
                    counts[item2[1]] += 1
                #print counts
                for no in sorted(counts, key=lambda item2: counts[item2], reverse=True)[:2]:
                    log.quietly_log('%dx%s ' % (counts[no], matcher.names[no]))
                if len(counts) > 2:
                    log.quietly_log('...')
            log.quietly_log('\n')
        log.quietly_log('\n')
    
    if n_in_paired:
        log.datum(log_name, 'read-pairs', n_in_paired)
    if n_in_single:
        log.datum(log_name, 'single reads', n_in_single)
    
    for i in xrange(fragment_reads):
        if start_clips:
            summarize_clips(read_in_fragment_names[i], 'start', start_clips[i])
        if end_clips:
            summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])
        
        prefix = read_in_fragment_names[i]
        
        log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i])
        log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i])
        if disallow_homopolymers:
            log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i])
        if fragment_reads > 1:
            log.datum(log_name, prefix + ' kept', n_out[i])
        log.datum(log_name, prefix + ' average input length', float(total_in_length[i]) / (n_in_single + n_in_paired))
        if n_out[i]:
            log.datum(log_name, prefix + ' average output length', float(total_out_length[i]) / n_out[i])
    
    if fragment_reads == 2:
        log.datum(log_name, 'pairs kept after clipping', n_paired)
    log.datum(log_name, 'reads kept after clipping', n_single)
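# --- Illustrative sketch (not part of nesoni) --------------------------------
# The quality-clipping loop above keeps the longest run of bases whose quality
# character is at or above the cutoff character and which are unambiguous.
# The helper below restates that idea in isolation; the name longest_good_run
# and its signature are made up for illustration only.

def longest_good_run(seq, qual, cutoff_char, clip_ambiguous=True, trim_start=0, trim_end=0):
    start = trim_start
    best_start = 0
    best_len = 0
    for j in xrange(len(seq) - trim_end):
        if qual[j] < cutoff_char or (clip_ambiguous and seq[j] not in 'ACGT'):
            # Bad base: close off the current run and start a new one after it
            if best_len < j - start:
                best_start, best_len = start, j - start
            start = j + 1
    j = len(seq) - trim_end
    if best_len < j - start:
        best_start, best_len = start, j - start
    return (seq[best_start:best_start + best_len],
            qual[best_start:best_start + best_len])

# Example, with a Phred+33 cutoff of Q10 (chr(33 + 10) == '+'):
#   longest_good_run('ACGTNACGTACGT', 'IIII!IIIIIIII', chr(33 + 10))
#   -> ('ACGTACGT', 'IIIIIIII')
# ------------------------------------------------------------------------------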
def run(self):
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'
    
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
    
    read_sets = [ ]
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))
    
    #Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    reference_filename = reference.reference_fasta_filename()
    
    cores = min(self.cores, legion.coordinator().get_cores())
    
    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(cores),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None,
    }
    
    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None
    
    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None
    
    cutoff = '55%' #Default changed in SHRiMP 2.0.2
    if '-h' in self.shrimp_options:
        cutoff = self.shrimp_options[self.shrimp_options.index('-h') + 1]
    
    #Run shrimp
    
    bam_filename = io.abspath(self.output_dir, 'alignments.bam')
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')
    
    temp_filename = io.abspath(self.output_dir, 'temp.bam')
    
    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
    log_file = open(log_filename, 'wb')
    
    sam_eater = sam.Bam_writer(temp_filename)
    
    sam_header_sent = [False]
    n_seen = [0]
    
    def eat(f):
        for line in f:
            if line.startswith('@'):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True
    
    def remove_pair_options(options):
        for flag in ['-p', '-I']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ['--half-paired']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options
    
    for i, (filenames, is_paired) in enumerate(read_sets):
        options = self.shrimp_options[:]
        
        has_qualities = all(
            len(io.read_sequences(filename, qualities=True).next()) == 3 #A little ugly
            for filename in filenames
        )
        if has_qualities:
            options.append('--fastq')
        
        if len(filenames) == 1:
            reads_parameters = [ filenames[0] ]
        else:
            reads_parameters = [ '-1', filenames[0], '-2', filenames[1] ]
        
        if '--qv-offset' not in self.shrimp_options:
            #guesses = [ ]
            #for filename in filenames:
            #    guesses.append(io.guess_quality_offset(filename))
            #assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            #default_options['--qv-offset'] = str(guesses[0])
            default_options['--qv-offset'] = str(io.guess_quality_offset(*filenames))
        
        default_options['--read-group'] = '%s,%s' % (
            workspace.name.replace(',', '_'),
            workspace.name.replace(',', '_')
        )
        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])
        
        if not is_paired:
            options = remove_pair_options(options)
        
        grace.status('')
        
        full_param = reference.shrimp_command(self.cs, options + reads_parameters)
        
        print >> sys.stderr, 'Running', ' '.join(full_param)
        
        with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
            eat(f)
    
    log_file.close()
    
    sam_eater.close()
    
    grace.status('Sort')
    
    #io.execute([
    #    'samtools', 'sort', '-n', temp_filename, bam_prefix
    #])
    sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)
    
    os.unlink(temp_filename)
    
    grace.status('')
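# --- Illustrative sketch (not part of nesoni) --------------------------------
# default_options above maps a gmapper flag to its argument, with None meaning
# "bare flag, no argument"; options the user already supplied always win. A
# standalone restatement of that merge, under hypothetical names:

def merge_defaults(user_options, defaults):
    options = list(user_options)
    for flag, value in defaults.items():
        if flag not in options:
            options.append(flag)
            if value is not None:
                options.append(value)
    return options

# merge_defaults(['-h', '80%'], {'-N': '4', '-E': None})
# -> ['-h', '80%', '-N', '4', '-E']   (defaults are appended in dict order)
# ------------------------------------------------------------------------------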
def run(self):
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, "No reference sequences given"
    assert self.reads or self.pairs or self.interleaved, "No reads given"
    for pair in self.pairs:
        assert len(pair) == 2, "Two files required in each pair: section"

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    read_sets = []
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))

    # Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    reference_filename = reference.reference_fasta_filename()

    cores = min(self.cores, legion.coordinator().get_cores())

    default_options = {
        "-E": None,
        "-T": None,
        "-N": str(cores),
        "-n": "2",
        "-w": "200%",
        "-p": "opp-in",
        "-I": "0,500",
        "-X": None,
    }

    if self.sam_unaligned:
        default_options["--sam-unaligned"] = None

    if self.half_paired:
        default_options["--half-paired"] = None
    else:
        default_options["--no-half-paired"] = None

    cutoff = "55%"  # Default changed in SHRiMP 2.0.2
    if "-h" in self.shrimp_options:
        cutoff = self.shrimp_options[self.shrimp_options.index("-h") + 1]

    # Run shrimp

    bam_filename = io.abspath(self.output_dir, "alignments.bam")
    bam_prefix = io.abspath(self.output_dir, "alignments")
    bam_sorted_prefix = io.abspath(self.output_dir, "alignments_sorted")

    temp_filename = io.abspath(self.output_dir, "temp.bam")

    log_filename = io.abspath(self.output_dir, "shrimp_log.txt")
    log_file = open(log_filename, "wb")

    sam_eater = sam.Bam_writer(temp_filename)

    sam_header_sent = [False]
    n_seen = [0]

    def eat(f):
        for line in f:
            if line.startswith("@"):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status("%s alignments produced" % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True

    def remove_pair_options(options):
        for flag in ["-p", "-I"]:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2 :]
        for flag in ["--half-paired"]:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1 :]
        return options

    for i, (filenames, is_paired) in enumerate(read_sets):
        options = self.shrimp_options[:]

        has_qualities = all(
            len(io.read_sequences(filename, qualities=True).next()) == 3
            for filename in filenames  # A little ugly
        )
        if has_qualities:
            options.append("--fastq")

        if len(filenames) == 1:
            reads_parameters = [filenames[0]]
        else:
            reads_parameters = ["-1", filenames[0], "-2", filenames[1]]

        if "--qv-offset" not in self.shrimp_options:
            guesses = []
            for filename in filenames:
                guesses.append(io.guess_quality_offset(filename))
            assert (
                len(set(guesses)) == 1
            ), "Conflicting quality offset guesses, please specify --qv-offset manually."
            default_options["--qv-offset"] = str(guesses[0])

        default_options["--read-group"] = "%s,%s" % (
            workspace.name.replace(",", "_"),
            workspace.name.replace(",", "_"),
        )
        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])

        if not is_paired:
            options = remove_pair_options(options)

        grace.status("")

        full_param = reference.shrimp_command(self.cs, options + reads_parameters)

        print >>sys.stderr, "Running", " ".join(full_param)

        with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
            eat(f)

    log_file.close()

    sam_eater.close()

    grace.status("Sort")

    io.execute(["samtools", "sort", "-n", temp_filename, bam_prefix])

    os.unlink(temp_filename)

    grace.status("")
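# --- Illustrative sketch (not part of nesoni) --------------------------------
# io.guess_quality_offset is used above to pick --qv-offset automatically. The
# usual heuristic (assumed here; this is not nesoni's implementation) is that
# any quality character below chr(59) can only come from the Sanger/Phred+33
# encoding, while a minimum of '@' (chr(64)) or above suggests Phred+64.

def guess_offset(quality_strings):
    lowest = min(min(q) for q in quality_strings if q)
    if lowest < chr(59):
        return 33
    return 64

# guess_offset(['##AAFFFJJJ'])  ->  33
# guess_offset(['ffffgggghh'])  ->  64
# ------------------------------------------------------------------------------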
def run(self):
    log = self.log
    
    #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
    #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
    #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
    #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
    #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
    #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
    #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
    #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
    #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
    #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
    #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
    #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
    #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
    #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
    #grace.expect_no_further_options(args)
    
    prefix = self.prefix
    log_name = os.path.split(prefix)[1]
    
    quality_cutoff = self.quality
    qoffset = self.qoffset
    clip_ambiguous = self.clip_ambiguous
    length_cutoff = self.length
    adaptor_cutoff = self.match
    max_error = self.max_errors
    disallow_homopolymers = self.homopolymers
    reverse_complement = self.revcom
    trim_start = self.trim_start
    trim_end = self.trim_end
    output_fasta = self.fasta
    use_gzip = self.gzip
    output_rejects = self.rejects
    
    iterators = [ ]
    filenames = [ ]
    any_paired = False
    
    for filename in self.reads:
        filenames.append(filename)
        iterators.append(itertools.izip(
            io.read_sequences(filename, qualities=True)
        ))
    
    for pair_filenames in self.pairs:
        assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
        filenames.extend(pair_filenames)
        any_paired = True
        iterators.append(itertools.izip(
            io.read_sequences(pair_filenames[0], qualities=True),
            io.read_sequences(pair_filenames[1], qualities=True)
        ))
    
    for filename in self.interleaved:
        filenames.append(filename)
        any_paired = True
        iterators.append(deinterleave(
            io.read_sequences(filename, qualities=True)
        ))
    
    fragment_reads = (2 if any_paired else 1)
    read_in_fragment_names = [ 'read-1', 'read-2' ] if any_paired else [ 'read' ]
    
    assert iterators, 'Nothing to clip'
    
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
    
    if qoffset is None:
        guesses = [ io.guess_quality_offset(filename) for filename in filenames ]
        assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify manually.'
        qoffset = guesses[0]
        log.log('FASTQ offset seems to be %d\n' % qoffset)
    
    quality_cutoff_char = chr(qoffset + quality_cutoff)
    
    #log.log('Minimum quality: %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
    #log.log('Clip ambiguous bases: %s\n' % (grace.describe_bool(clip_ambiguous)))
    #log.log('Minimum adaptor match: %d bases, %d errors\n' % (adaptor_cutoff, max_error))
    #log.log('Minimum length: %d bases\n' % length_cutoff)
    
    adaptor_seqs = [ ]
    adaptor_names = [ ]
    if self.adaptor_clip:
        if self.adaptor_file:
            adaptor_iter = io.read_sequences(self.adaptor_file)
        else:
            adaptor_iter = ADAPTORS
        for name, seq in adaptor_iter:
            seq = seq.upper().replace('U', 'T')
            adaptor_seqs.append(seq)
            adaptor_names.append(name)
            adaptor_seqs.append(bio.reverse_complement(seq))
            adaptor_names.append(name)
    
    matcher = Matcher(adaptor_seqs, adaptor_names, max_error)
    
    start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    
    if output_fasta:
        write_sequence = io.write_fasta_single_line
    else:
        write_sequence = io.write_fastq
    
    f_single = io.open_possibly_compressed_writer(self.reads_output_filenames()[0])
    if fragment_reads == 2:
        names = self.pairs_output_filenames()[0] if self.out_separate else self.interleaved_output_filenames()
        f_paired = map(io.open_possibly_compressed_writer, names)
    if output_rejects:
        f_reject = io.open_possibly_compressed_writer(self.rejects_output_filenames()[0])
    
    n_single = 0
    n_paired = 0
    
    n_in_single = 0
    n_in_paired = 0
    
    total_in_length = [ 0 ] * fragment_reads
    
    n_out = [ 0 ] * fragment_reads
    n_q_clipped = [ 0 ] * fragment_reads
    n_a_clipped = [ 0 ] * fragment_reads
    n_homopolymers = [ 0 ] * fragment_reads
    total_out_length = [ 0 ] * fragment_reads
    
    #log.attach(open(prefix + '_log.txt', 'wb'))
    
    for iterator in iterators:
        for fragment in iterator:
            if (n_in_single + n_in_paired) % 10000 == 0:
                grace.status('Clipping fragment %s' % grace.pretty_number(n_in_single + n_in_paired))
            
            if len(fragment) == 1:
                n_in_single += 1
            else:
                n_in_paired += 1
            
            graduates = [ ]
            rejects = [ ]
            for i, (name, seq, qual) in enumerate(fragment):
                seq = seq.upper()
                total_in_length[i] += len(seq)
                
                if self.trim_to:
                    seq = seq[:self.trim_to]
                    qual = qual[:self.trim_to]
                
                # Quality clip: keep the longest run of good, unambiguous bases
                start = trim_start
                best_start = 0
                best_len = 0
                for j in xrange(len(seq) - trim_end):
                    if qual[j] < quality_cutoff_char or \
                       (clip_ambiguous and seq[j] not in 'ACGT'):
                        if best_len < j - start:
                            best_start = start
                            best_len = j - start
                        start = j + 1
                j = len(seq) - trim_end
                if best_len < j - start:
                    best_start = start
                    best_len = j - start
                
                clipped_seq = seq[best_start:best_start + best_len]
                clipped_qual = qual[best_start:best_start + best_len]
                if len(clipped_seq) < length_cutoff:
                    n_q_clipped[i] += 1
                    rejects.append((name, seq, qual, 'quality'))
                    continue
                
                # Adaptor clip at the start of the read
                match = matcher.match(clipped_seq)
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[match[0]:]
                    clipped_qual = clipped_qual[match[0]:]
                    start_clips[i][match[0]].append(match[1][0])
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append((name, seq, qual, 'adaptor'))
                        continue
                
                # Adaptor clip at the end of the read
                match = matcher.match(bio.reverse_complement(clipped_seq))
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[:len(clipped_seq) - match[0]]
                    clipped_qual = clipped_qual[:len(clipped_qual) - match[0]]
                    end_clips[i][match[0]].append(match[1][0])
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append((name, seq, qual, 'adaptor'))
                        continue
                
                if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                    n_homopolymers[i] += 1
                    rejects.append((name, seq, qual, 'homopolymer'))
                    continue
                
                graduates.append((name, clipped_seq, clipped_qual))
                n_out[i] += 1
                total_out_length[i] += len(clipped_seq)
            
            if output_rejects:
                for name, seq, qual, reason in rejects:
                    write_sequence(f_reject, name + ' ' + reason, seq, qual)
            
            if graduates:
                if reverse_complement:
                    graduates = [
                        (name, bio.reverse_complement(seq), qual[::-1])
                        for name, seq, qual in graduates
                    ]
                
                if len(graduates) == 1:
                    n_single += 1
                    
                    (name, seq, qual) = graduates[0]
                    write_sequence(f_single, name, seq, qual)
                else:
                    assert len(graduates) == 2
                    n_paired += 1
                    
                    # Write the pair to an interleaved file or separate l/r files
                    for (lr, (name, seq, qual)) in enumerate(graduates):
                        write_sequence(f_paired[lr % len(f_paired)], name, seq, qual)
    
    grace.status('')
    
    if output_rejects:
        f_reject.close()
    if fragment_reads == 2:
        map(lambda f: f.close(), f_paired)
    f_single.close()
    
    def summarize_clips(name, location, clips):
        total = 0
        for i in clips:
            total += len(clips[i])
        log.datum(log_name, name + ' adaptors clipped at ' + location, total)
        
        if not clips:
            return
        
        for i in xrange(min(clips), max(clips) + 1):
            item = clips[i]
            log.quietly_log('%3d bases: %10d ' % (i, len(item)))
            if item:
                avg_errors = float(sum(item2[0] for item2 in item)) / len(item)
                log.quietly_log(' avg errors: %5.2f ' % avg_errors)
                
                counts = collections.defaultdict(int)
                for item2 in item:
                    counts[item2[1]] += 1
                #print counts
                for no in sorted(counts, key=lambda item2: counts[item2], reverse=True)[:2]:
                    log.quietly_log('%dx%s ' % (counts[no], matcher.names[no]))
                if len(counts) > 2:
                    log.quietly_log('...')
            log.quietly_log('\n')
        log.quietly_log('\n')
    
    if n_in_paired:
        log.datum(log_name, 'read-pairs', n_in_paired)
    if n_in_single:
        log.datum(log_name, 'single reads', n_in_single)
    
    for i in xrange(fragment_reads):
        if start_clips:
            summarize_clips(read_in_fragment_names[i], 'start', start_clips[i])
        if end_clips:
            summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])
        
        prefix = read_in_fragment_names[i]
        
        log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i])
        log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i])
        if disallow_homopolymers:
            log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i])
        if fragment_reads > 1:
            log.datum(log_name, prefix + ' kept', n_out[i])
        log.datum(log_name, prefix + ' average input length', float(total_in_length[i]) / (n_in_single + n_in_paired))
        if n_out[i]:
            log.datum(log_name, prefix + ' average output length', float(total_out_length[i]) / n_out[i])
    
    if fragment_reads == 2:
        log.datum(log_name, 'pairs kept after clipping', n_paired)
    log.datum(log_name, 'reads kept after clipping', n_single)
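# --- Illustrative sketch (not part of nesoni) --------------------------------
# The interleaved inputs above are handled by a deinterleave() helper defined
# elsewhere in this module. A hypothetical version of such a helper, pairing
# consecutive reads from a flat iterator into (read-1, read-2) fragments:

def deinterleave_sketch(reads):
    reads = iter(reads)
    while True:
        try:
            read_1 = next(reads)
        except StopIteration:
            return
        try:
            read_2 = next(reads)
        except StopIteration:
            raise ValueError('Interleaved input contains an odd number of reads')
        yield (read_1, read_2)
# ------------------------------------------------------------------------------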
def run(self):
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'
    
    read_sets = [ ]
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))
    
    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(grace.how_many_cpus()),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None
    }
    
    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None
    
    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None
    
    cutoff = '55%' #Default changed in SHRiMP 2.0.2
    if '-h' in self.shrimp_options:
        cutoff = self.shrimp_options[self.shrimp_options.index('-h') + 1]
    
    #Create working directory
    workspace = self.get_workspace() #working_directory.Working(self.output_dir, must_exist=False)
    workspace.setup_reference(self.references)
    reference = workspace.get_reference()
    reference_filename = reference.reference_fasta_filename()
    
    #workspace = io.Workspace(self.output_dir)
    #
    #workspace.update_param(
    #    shrimp_cutoff = cutoff
    #)
    #
    ##Make copy of reference sequences
    #
    #reference_filename = io.abspath(self.output_dir,'reference.fa')
    #reference_file = open(reference_filename,'wb')
    #
    #reference_genbank_filename = io.abspath(self.output_dir,'reference.gbk')
    #reference_genbank_file = open(reference_genbank_filename,'wb')
    #any_genbank = [ False ]
    #
    #def genbank_callback(name, record):
    #    """ Make a copy of any genbank files passed in. """
    #    from Bio import SeqIO
    #    
    #    SeqIO.write([record], reference_genbank_file, 'genbank')
    #    
    #    f = open(os.path.join(
    #        self.output_dir,
    #        grace.filesystem_friendly_name(name) + '.gbk'
    #    ), 'wb')
    #    SeqIO.write([record], f, 'genbank')
    #    f.close()
    #    
    #    any_genbank[0] = True
    #
    #for filename in self.references:
    #    for name, sequence in io.read_sequences(filename, genbank_callback=genbank_callback):
    #        #Don't retain any comment
    #        name = name.split()[0]
    #        io.write_fasta(reference_file, name, sequence.upper())
    #        
    #        f = open(os.path.join(
    #            self.output_dir,
    #            grace.filesystem_friendly_name(name) + '.fa'
    #        ), 'wb')
    #        io.write_fasta(f, name, sequence.upper())
    #        f.close()
    #
    #reference_file.close()
    #reference_genbank_file.close()
    #if not any_genbank[0]:
    #    os.unlink(reference_genbank_filename)
    #
    ## Create an index of the reference sequences
    #io.execute([
    #    'samtools', 'faidx', reference_filename
    #])
    
    #Run shrimp
    
    bam_filename = io.abspath(self.output_dir, 'alignments.bam')
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')
    
    temp_filename = io.abspath(self.output_dir, 'temp.bam')
    
    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
    log_file = open(log_filename, 'wb')
    
    sam_eater = sam.Bam_writer(temp_filename)
    
    #if self.cs:
    #    program = 'gmapper-cs'
    #else:
    #    program = 'gmapper-ls'
    
    sam_header_sent = [False]
    n_seen = [0]
    
    def eat(process):
        for line in process.stdout:
            if line.startswith('@'):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        assert process.wait() == 0, 'shrimp failed'
        sam_header_sent[0] = True
    
    def remove_pair_options(options):
        for flag in ['-p', '-I']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ['--half-paired']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options
    
    if '--qv-offset' not in self.shrimp_options:
        guesses = [ ]
        for filenames, is_paired in read_sets:
            for filename in filenames:
                guesses.append(io.guess_quality_offset(filename))
        assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
        default_options['--qv-offset'] = str(guesses[0])
    
    for filenames, is_paired in read_sets:
        options = self.shrimp_options[:]
        
        has_qualities = all(
            len(io.read_sequences(filename, qualities=True).next()) == 3 #A little ugly
            for filename in filenames
        )
        if has_qualities:
            options.append('--fastq')
        #    temp_read_filename = io.abspath(working_dir, 'temp.fa')
        #else:
        #    temp_read_filename = io.abspath(working_dir, 'temp.fq')
        
        #try:
        
        #if len(filenames) == 1: # gmapper can cope with gzipped and filenames[0].endswith('.fa') or filenames[0].endswith('.fq'):
        #    actual_read_filename = filenames[0]
        #else:
        #    actual_read_filename = temp_read_filename
        #    grace.status('Copying reads')
        #    f = open(temp_read_filename, 'wb')
        #    if has_qualities:
        #        for reads in itertools.izip(*[ io.read_sequences(filename, qualities=True) for filename in filenames ]):
        #            for name, seq, qual in reads:
        #                io.write_fastq(f, name, seq, qual)
        #    else:
        #        for reads in itertools.izip(*[ io.read_sequences(filename) for filename in filenames ]):
        #            for name, seq in reads:
        #                io.write_fasta(f, name, seq)
        #    f.close()
        #    grace.status('')
        
        if len(filenames) == 1:
            reads_parameters = [ filenames[0] ]
        else:
            reads_parameters = [ '-1', filenames[0], '-2', filenames[1] ]
        
        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])
        
        if not is_paired:
            options = remove_pair_options(options)
        
        grace.status('')
        
        full_param = reference.shrimp_command(self.cs, options + reads_parameters)
        
        print >> sys.stderr, 'Running', ' '.join(full_param)
        
        p = io.run(full_param, stdout=subprocess.PIPE, stderr=log_file)
        eat(p)
    
    #finally:
    #    if os.path.exists(temp_read_filename):
    #        os.unlink(temp_read_filename)
    
    log_file.close()
    
    sam_eater.close()
    
    grace.status('Sort')
    
    io.execute([
        'samtools', 'sort', '-n', temp_filename, bam_prefix
    ])
    
    os.unlink(temp_filename)
    
    grace.status('')
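# --- Illustrative sketch (not part of nesoni) --------------------------------
# eat() above streams several gmapper runs into a single BAM writer while only
# keeping the first SAM header block. The same idea in isolation, with a
# made-up name and a plain write_line callback:

def merge_sam_streams(streams, write_line):
    header_sent = False
    for stream in streams:
        for line in stream:
            if line.startswith('@') and header_sent:
                continue
            write_line(line)
        header_sent = True

# out = []
# merge_sam_streams([['@HD\tVN:1.4\n', 'read1\t...\n'],
#                    ['@HD\tVN:1.4\n', 'read2\t...\n']], out.append)
# -> out keeps a single header line plus both alignment lines
# ------------------------------------------------------------------------------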