def begin_output(self):
    from nesoni import io

    if self.output is not None:
        return io.open_possibly_compressed_writer(self.output)
    else:
        return sys.stdout
def write_gff3(filename, items):
    # IGV likes to index large GFFs, and needs them to be sorted for this
    items = sorted(items, key=lambda item: (item.seqid, item.start))

    with io.open_possibly_compressed_writer(filename) as f:
        write_gff3_header(f)
        for item in items:
            print >> f, item.as_gff()
def write_gff3(filename, items, sort=True):
    # IGV likes to index large GFFs, and needs them to be sorted for this
    if sort:
        items = sorted(items, key=lambda item: (item.seqid, item.start))

    with io.open_possibly_compressed_writer(filename) as f:
        write_gff3_header(f)
        for item in items:
            print >> f, item.as_gff()
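# Usage sketch for the write_gff3 variants above (not from the original source).
# It assumes only what write_gff3 itself requires: items exposing .seqid, .start
# and an .as_gff() method. The Record class and output filename are hypothetical.
def _example_write_gff3():
    class Record(object):
        # Hypothetical stand-in for an annotation object.
        def __init__(self, seqid, start, end, type_):
            self.seqid, self.start, self.end, self.type = seqid, start, end, type_
        def as_gff(self):
            # Nine tab-separated GFF3 columns, 1-based start coordinate.
            return '\t'.join([
                self.seqid, 'example', self.type,
                str(self.start+1), str(self.end), '.', '+', '.',
                'ID=%s_%d' % (self.type, self.start),
                ])

    items = [
        Record('chr1', 500, 900, 'exon'),
        Record('chr1', 100, 400, 'exon'),
        ]
    # Items may be passed unsorted; write_gff3 sorts by (seqid, start) so that
    # IGV can index the (possibly gzipped) output.
    write_gff3('example.gff3.gz', items)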
def run(self):
    min_quality = chr(33+self.quality)

    with io.open_possibly_compressed_writer(self.prefix+'.csfastq.gz') as out_file:
        n = 0
        n_discarded = 0
        n_clipped = 0
        total_before = 0
        total_clipped = 0

        for filename in self.filenames:
            for name, seq, qual in io.read_sequences(filename, qualities='required'):
                score = 0
                start = 0
                for i in xrange(len(seq)-1):
                    if qual[i] >= min_quality:
                        if seq[i+1] == '0':
                            score += 1
                        else:
                            score = max(0, score-4)
                            if not score:
                                start = i+2

                n += 1
                total_before += len(seq)

                if start > self.length+1:
                    if start < len(seq):
                        n_clipped += 1
                        total_clipped += len(seq)-start

                    print >> out_file, '@'+name
                    print >> out_file, seq[:start]
                    print >> out_file, '+'
                    print >> out_file, qual[:start-1]
                else:
                    n_discarded += 1

    self.log.datum(self.sample,'reads',n)
    if n:
        self.log.datum(self.sample,'mean length before poly-A clipping',float(total_before)/n)
    self.log.datum(self.sample,'reads discarded as too short after poly-A clipping',n_discarded)
    self.log.datum(self.sample,'reads poly-A clipped and kept',n_clipped)
    if n_clipped:
        self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
def write_gff3(filename, items, header):
    with io.open_possibly_compressed_writer(filename) as f:
        f.write(header)
        for item in items:
            print >> f, item.as_gff()
def run(self):
    assert self.extension is not None, '--extension must be specified'

    #workspace = self.get_workspace()
    workspace = working_directory.Working(self.working_dir, must_exist=True)
    if self.annotations == None:
        reference = workspace.get_reference()
        annotations_filename = reference.annotations_filename()
    else:
        annotations_filename = self.annotations

    types = [ item.lower() for item in self.types.split(',') ]

    parts = self.parts or self.types
    parts = [ item.lower() for item in parts.split(',') ]

    all_annotations = list(annotation.read_annotations(annotations_filename))
    annotation.link_up_annotations(all_annotations)
    for item in all_annotations:
        item.primary = None

    annotations = [
        item
        for item in all_annotations
        if item.type.lower() in types
        ]

    part_annotations = [ ]
    seen = set()
    queue = [ (item,item) for item in annotations ]
    while queue:
        primary, item = queue.pop()
        if item.type.lower() in parts:
            assert item.primary is None, "Feature with multiple parents"
            item.primary = primary
            key = (id(primary),item.start,item.end,item.seqid,item.strand)
            # Ignore duplicate exons (many isoforms will have the same exons)
            if key not in seen:
                seen.add(key)
                part_annotations.append(item)
        queue.extend( (primary, item2) for item2 in item.children )

    del seen
    del all_annotations

    self.log.log('%d annotations\n' % len(annotations))
    self.log.log('%d part annotations\n' % len(part_annotations))

    #assert annotations, 'No annotations of specified types in file'

    for item in part_annotations:
        this_extension = self.extension
        if "max_extension" in item.attr:
            this_extension = min(this_extension,int(item.attr["max_extension"]))

        if item.strand >= 0:
            item.tail_pos = item.end
            item.end += this_extension
        else:
            item.tail_pos = item.start
            item.start -= this_extension

    for item in annotations:
        item.hits = [] # [ (tail_length, adaptor_bases) ]

    index = span_index.index_annotations(part_annotations)

    for alignment in sam.Bam_reader(workspace/'alignments_filtered_sorted.bam'):
        if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
            continue

        start = alignment.reference_start
        end = alignment.reference_end
        alignment_length = end-start
        strand = -1 if alignment.flag&sam.FLAG_REVERSE else 1

        fragment_feature = annotation.Annotation(
            seqid=alignment.reference_name,
            start=start,
            end=end,
            strand=strand
            )

        if strand >= 0:
            tail_pos = end
        else:
            tail_pos = start

        tail_length = 0
        adaptor_bases = 0
        for item in alignment.extra:
            if item.startswith('AN:i:'):
                tail_length = int(item[5:])
            elif item.startswith('AD:i:'):
                adaptor_bases = int(item[5:])

        hits = index.get(fragment_feature, same_strand=True)
        if hits:
            gene = min(hits, key=lambda gene:
                (abs(tail_pos - gene.tail_pos), gene.primary.get_id()))
                # Nearest by tail_pos
                # failing that, by id to ensure a deterministic choice

            gene.primary.hits.append( (tail_length,adaptor_bases) )

    for item in annotations:
        del item.parents
        del item.children
        del item.primary

    f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
    pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
    f.close()
def run(self):
    """ <sequence> <poly-A> <adaptor> <anything> """
    clip_quality = chr(33+self.clip_quality)
    ignore_quality = chr(33+self.ignore_quality)

    with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
         io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
        print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'

        n = 0
        n_discarded = 0
        n_clipped = 0
        total_before = 0
        total_clipped = 0

        for filename in self.filenames:
            for name, seq, qual in io.read_sequences(filename, qualities='required'):
                # "Good quality" sequence ends at the first low quality base
                #good_quality_end = 0
                #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality:
                #    good_quality_end += 1

                goodness_score = 0
                best_goodness_score = 0
                good_quality_end = 0
                i = 0
                while True:
                    if goodness_score > best_goodness_score:
                        best_goodness_score = goodness_score
                        good_quality_end = i

                    if i >= len(seq):
                        break

                    if qual[i] >= clip_quality:
                        goodness_score += 1
                    else:
                        goodness_score -= 9
                    i += 1

                best_score = 0
                best_a_start = good_quality_end
                best_a_end = good_quality_end
                best_adaptor_bases = 0
                best_aonly_score = 0
                best_aonly_start = good_quality_end
                best_aonly_end = good_quality_end

                # Consider each possible start position for the poly(A)
                for a_start in xrange(good_quality_end):
                    if a_start and seq[a_start-1] == 'A': continue

                    # Consider each possible end position for the poly(A)
                    a_end = a_start
                    aonly_score = 0
                    while True:
                        if aonly_score > best_aonly_score:
                            best_aonly_score = aonly_score
                            best_aonly_start = a_start
                            best_aonly_end = a_end

                        # The poly(A) should be followed by adaptor,
                        # at least until the end of good quality sequence.
                        # However if there is evidence of the adaptor beyond
                        # the end of good quality, we still want to know that,
                        # and count it towards the number of adaptor bases present.
                        score = aonly_score
                        adaptor_bases = 0
                        i = a_end
                        while True:
                            if (score > best_score and
                                (i >= good_quality_end or i >= a_end+len(self.adaptor))):
                                best_score = score
                                best_a_start = a_start
                                best_a_end = a_end
                                best_adaptor_bases = adaptor_bases

                            if i >= a_end+len(self.adaptor) or i >= len(seq):
                                break

                            if qual[i] >= ignore_quality:
                                if seq[i] == self.adaptor[i-a_end]:
                                    score += 1
                                    adaptor_bases += 1
                                else:
                                    score -= 4
                            i += 1

                        #if a_end >= len(seq): break

                        # poly(A) tail only within good quality region.
                        if a_end >= good_quality_end: break

                        if qual[a_end] >= ignore_quality:
                            if seq[a_end] == 'A':
                                aonly_score += 1
                            else:
                                aonly_score -= 4
                                if aonly_score <= 0: break
                        a_end += 1

                a_start = best_a_start
                a_end = best_a_end
                adaptor_bases = best_adaptor_bases
                aonly_start = best_aonly_start
                aonly_end = best_aonly_end

                if self.debug: # and a_end == a_start and a_end < len(seq)-10:
                    print name
                    print ''.join( 'I' if item<ignore_quality else ('C' if item<clip_quality else ' ') for item in qual )
                    print '-' * good_quality_end
                    print seq
                    print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d"%(adaptor_bases,best_score)
                    #print ' '*aonly_start + 'A'*(aonly_end-aonly_start) + "."
                    print
                    sys.stdout.flush()

                n += 1
                total_before += len(seq)

                # 0 - sequence name
                # 1 - sequence length
                # 2 - poly(A) start
                # 3 - poly(A) end
                # (4 - best run of As start, for debugging the need to detect adaptor seq)
                # (5 - best run of As end)
                # 6 - number of adaptor bases matched
                print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end, adaptor_bases)

                if a_start > self.length:
                    if a_start < len(seq):
                        n_clipped += 1
                        total_clipped += a_start

                    print >> out_file, '@'+name
                    print >> out_file, seq[:a_start]
                    print >> out_file, '+'
                    print >> out_file, qual[:a_start]
                else:
                    n_discarded += 1

                if n%10000 == 0:
                    grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n))
                    # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')

    grace.status('')

    self.log.datum(self.sample,'reads',n)
    if n:
        self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
    self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
    self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
    if n_clipped:
        self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
def write_gff3(filename, items):
    with io.open_possibly_compressed_writer(filename) as f:
        write_gff3_header(f)
        for item in items:
            print >> f, item.as_gff()
def run(self):
    log = self.log

    #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
    #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
    #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
    #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
    #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
    #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
    #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
    #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
    #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
    #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
    #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
    #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
    #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
    #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
    #grace.expect_no_further_options(args)

    prefix = self.prefix
    log_name = os.path.split(prefix)[1]
    quality_cutoff = self.quality
    qoffset = self.qoffset
    clip_ambiguous = self.clip_ambiguous
    length_cutoff = self.length
    adaptor_cutoff = self.match
    max_error = self.max_errors
    adaptor_set = self.adaptors
    disallow_homopolymers = self.homopolymers
    reverse_complement = self.revcom
    trim_start = self.trim_start
    trim_end = self.trim_end
    output_fasta = self.fasta
    use_gzip = self.gzip
    output_rejects = self.rejects

    iterators = [ ]
    filenames = [ ]
    any_paired = False

    for filename in self.reads:
        filenames.append(filename)
        iterators.append(itertools.izip(
            io.read_sequences(filename, qualities=True)))

    for pair_filenames in self.pairs:
        assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
        filenames.extend(pair_filenames)
        any_paired = True
        iterators.append(itertools.izip(
            io.read_sequences(pair_filenames[0], qualities=True),
            io.read_sequences(pair_filenames[1], qualities=True)))

    for filename in self.interleaved:
        filenames.append(filename)
        any_paired = True
        iterators.append(deinterleave(
            io.read_sequences(filename, qualities=True)))

    fragment_reads = (2 if any_paired else 1)
    read_in_fragment_names = [ 'read-1', 'read-2' ] if any_paired else [ 'read' ]

    assert iterators, 'Nothing to clip'

    if qoffset is None:
        guesses = [ io.guess_quality_offset(filename) for filename in filenames ]
        assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify manually.'
        qoffset = guesses[0]
        log.log('FASTQ offset seems to be %d\n' % qoffset)

    quality_cutoff_char = chr(qoffset + quality_cutoff)

    #log.log('Minimum quality: %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
    #log.log('Clip ambiguous bases: %s\n' % (grace.describe_bool(clip_ambiguous)))
    #log.log('Minimum adaptor match: %d bases, %d errors\n' % (adaptor_cutoff, max_error))
    #log.log('Minimum length: %d bases\n' % length_cutoff)

    adaptor_seqs = [ ]
    adaptor_names = [ ]
    if adaptor_set and adaptor_set.lower() != 'none':
        for item in adaptor_set.split(','):
            item = item.strip().lower() + ' '
            any = False
            for line in ADAPTORS.strip().split('\n'):
                if line.startswith('#'): continue
                if not line.lower().startswith(item): continue
                any = True
                name, seq = line.rsplit(None, 1)
                seq = seq.replace('U', 'T')

                #if seq in adaptor_seqs: print 'Dup', name
                adaptor_seqs.append(seq)
                adaptor_names.append(name)
                adaptor_seqs.append(bio.reverse_complement(seq))
                adaptor_names.append(name)
            if not any:
                raise grace.Error('Unknown adaptor set: ' + item)

    matcher = Matcher(adaptor_seqs, adaptor_names, max_error)

    start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]

    if output_fasta:
        write_sequence = io.write_fasta_single_line
    else:
        write_sequence = io.write_fastq

    f_single = io.open_possibly_compressed_writer(self.reads_output_filenames()[0])
    if fragment_reads == 2:
        f_paired = io.open_possibly_compressed_writer(self.interleaved_output_filenames()[0])
    if output_rejects:
        f_reject = io.open_possibly_compressed_writer(self.rejects_output_filenames()[0])

    n_single = 0
    n_paired = 0

    n_in_single = 0
    n_in_paired = 0
    total_in_length = [ 0 ] * fragment_reads

    n_out = [ 0 ] * fragment_reads
    n_q_clipped = [ 0 ] * fragment_reads
    n_a_clipped = [ 0 ] * fragment_reads
    n_homopolymers = [ 0 ] * fragment_reads
    total_out_length = [ 0 ] * fragment_reads

    #log.attach(open(prefix + '_log.txt', 'wb'))

    for iterator in iterators:
        for fragment in iterator:
            if (n_in_single + n_in_paired) % 10000 == 0:
                grace.status('Clipping fragment %s' % grace.pretty_number(n_in_single + n_in_paired))

            if len(fragment) == 1:
                n_in_single += 1
            else:
                n_in_paired += 1

            graduates = [ ]
            rejects = [ ]
            for i, (name, seq, qual) in enumerate(fragment):
                name = name.split()[0]
                seq = seq.upper()
                total_in_length[i] += len(seq)

                start = trim_start
                best_start = 0
                best_len = 0
                for j in xrange(len(seq) - trim_end):
                    if qual[j] < quality_cutoff_char or \
                       (clip_ambiguous and seq[j] not in 'ACGT'):
                        if best_len < j - start:
                            best_start = start
                            best_len = j - start
                        start = j + 1
                j = len(seq) - trim_end
                if best_len < j - start:
                    best_start = start
                    best_len = j - start

                clipped_seq = seq[best_start:best_start + best_len]
                clipped_qual = qual[best_start:best_start + best_len]

                if len(clipped_seq) < length_cutoff:
                    n_q_clipped[i] += 1
                    rejects.append((name, seq, qual, 'quality'))
                    continue

                match = matcher.match(clipped_seq)
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[match[0]:]
                    clipped_qual = clipped_qual[match[0]:]
                    start_clips[i][match[0]].append(match[1][0])
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append((name, seq, qual, 'adaptor'))
                        continue

                match = matcher.match(bio.reverse_complement(clipped_seq))
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[:len(clipped_seq) - match[0]]
                    clipped_qual = clipped_qual[:len(clipped_qual) - match[0]]
                    end_clips[i][match[0]].append(match[1][0])
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append((name, seq, qual, 'adaptor'))
                        continue

                if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                    n_homopolymers[i] += 1
                    rejects.append((name, seq, qual, 'homopolymer'))
                    continue

                graduates.append((name, clipped_seq, clipped_qual))
                n_out[i] += 1
                total_out_length[i] += len(clipped_seq)

            if output_rejects:
                for name, seq, qual, reason in rejects:
                    write_sequence(f_reject, name + ' ' + reason, seq, qual)

            if graduates:
                if reverse_complement:
                    graduates = [ (name, bio.reverse_complement(seq), qual[::-1])
                                  for name, seq, qual in graduates ]

                if len(graduates) == 1:
                    this_f = f_single
                    n_single += 1
                else:
                    assert len(graduates) == 2
                    this_f = f_paired
                    n_paired += 1

                for name, seq, qual in graduates:
                    write_sequence(this_f, name, seq, qual)

    grace.status('')

    if output_rejects:
        f_reject.close()
    if fragment_reads == 2:
        f_paired.close()
    f_single.close()

    def summarize_clips(name, location, clips):
        total = 0
        for i in clips:
            total += len(clips[i])
        log.datum(log_name, name + ' adaptors clipped at ' + location, total)

        if not clips:
            return

        for i in xrange(min(clips), max(clips) + 1):
            item = clips[i]
            log.quietly_log('%3d bases: %10d ' % (i, len(item)))
            if item:
                avg_errors = float(sum(item2[0] for item2 in item)) / len(item)
                log.quietly_log(' avg errors: %5.2f ' % avg_errors)

                counts = collections.defaultdict(int)
                for item2 in item:
                    counts[item2[1]] += 1
                #print counts
                for no in sorted(counts, key=lambda item2: counts[item2], reverse=True)[:2]:
                    log.quietly_log('%dx%s ' % (counts[no], matcher.names[no]))
                if len(counts) > 2:
                    log.quietly_log('...')
            log.quietly_log('\n')
        log.quietly_log('\n')

    if n_in_paired:
        log.datum(log_name, 'read-pairs', n_in_paired)
    if n_in_single:
        log.datum(log_name, 'single reads', n_in_single)

    for i in xrange(fragment_reads):
        if start_clips:
            summarize_clips(read_in_fragment_names[i], 'start', start_clips[i])
        if end_clips:
            summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])

        prefix = read_in_fragment_names[i]

        log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i])
        log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i])
        if disallow_homopolymers:
            log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i])
        if fragment_reads > 1:
            log.datum(log_name, prefix + ' kept', n_out[i])
        log.datum(log_name, prefix + ' average input length', float(total_in_length[i]) / (n_in_single + n_in_paired))
        if n_out[i]:
            log.datum(log_name, prefix + ' average output length', float(total_out_length[i]) / n_out[i])

    if fragment_reads == 2:
        log.datum(log_name, 'pairs kept after clipping', n_paired)
    log.datum(log_name, 'reads kept after clipping', n_single)
def run(self):
    """ <sequence> <poly-A> <adaptor> <anything> """
    clip_quality = chr(33+self.clip_quality)
    #ignore_quality = chr(33+self.ignore_quality)

    with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
         io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
        print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'

        n = 0
        n_discarded = 0
        n_clipped = 0
        total_before = 0
        total_clipped = 0

        for filename in self.filenames:
            for name, seq, qual in io.read_sequences(filename, qualities='required'):
                # "Good quality" sequence ends at the first low quality base
                #good_quality_end = 0
                #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality:
                #    good_quality_end += 1

                goodness_score = 0
                best_goodness_score = 0
                good_quality_end = 0
                i = 0
                while True:
                    if goodness_score > best_goodness_score:
                        best_goodness_score = goodness_score
                        good_quality_end = i

                    if i >= len(seq):
                        break

                    if qual[i] >= clip_quality:
                        goodness_score += 1
                    else:
                        goodness_score -= 9
                    i += 1

                best_score = self.min_score-1
                best_a_start = good_quality_end
                best_a_end = good_quality_end
                best_adaptor_bases = 0
                best_aonly_score = 0
                best_aonly_start = good_quality_end
                best_aonly_end = good_quality_end

                # Consider each possible start position for the poly(A)
                for a_start in xrange(len(seq)):
                    if a_start and seq[a_start-1] == 'A': continue

                    # Consider each possible end position for the poly(A)
                    a_end = a_start
                    aonly_score = 0
                    while True:
                        if aonly_score > best_aonly_score:
                            best_aonly_score = aonly_score
                            best_aonly_start = a_start
                            best_aonly_end = a_end

                        # The poly(A) should be followed by adaptor,
                        ## at least until the end of good quality sequence.
                        # However if there is evidence of the adaptor beyond
                        # the end of good quality, we still want to know that,
                        # and count it towards the number of adaptor bases present.
                        score = aonly_score
                        adaptor_bases = 0
                        i = a_end
                        abort_score = best_score-len(self.adaptor)
                        abort_i = min(len(seq), a_end+len(self.adaptor))
                        while score >= abort_score:
                            #if (score > best_score and
                            #    (i >= good_quality_end or i >= a_end+len(self.adaptor))):
                            if score > best_score:
                                best_score = score
                                best_a_start = a_start
                                best_a_end = a_end
                                best_adaptor_bases = adaptor_bases

                            if i >= abort_i:
                                break

                            if seq[i] == self.adaptor[i-a_end]:
                                score += 1
                                adaptor_bases += 1
                            else:
                                score -= 4
                            i += 1

                        #if a_end >= len(seq): break

                        # Modified 2018-03-21
                        # poly(A) tail only within good quality region.
                        #if a_end >= good_quality_end: break
                        #if qual[a_end] >= ignore_quality:
                        #    if seq[a_end] == 'A':
                        #        aonly_score += 1
                        #    else:
                        #        aonly_score -= 4
                        #        if aonly_score <= 0: break

                        if a_end >= len(seq): break

                        if seq[a_end] == 'A':
                            aonly_score += 1
                        else: #if qual[a_end] >= ignore_quality:
                            aonly_score -= 4
                        #else:
                        #    aonly_score -= 1
                        a_end += 1

                # 2018-03-21
                # Look for tail starting after good quality,
                # however don't call a tail if starts after good quality
                if best_a_start > good_quality_end:
                    best_a_start = good_quality_end
                    best_a_end = good_quality_end
                    best_adaptor_bases = 0
                    best_score = 0

                a_start = best_a_start
                a_end = best_a_end
                adaptor_bases = best_adaptor_bases
                aonly_start = best_aonly_start
                aonly_end = best_aonly_end

                if self.debug: # and a_end == a_start and a_end < len(seq)-10:
                    print name
                    print ''.join( ('C' if item<clip_quality else ' ') for item in qual )
                    print '-' * good_quality_end
                    print seq
                    print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d"%(adaptor_bases,best_score)
                    #print ' '*aonly_start + 'A'*(aonly_end-aonly_start) + "."
                    print
                    sys.stdout.flush()

                n += 1
                total_before += len(seq)

                # 0 - sequence name
                # 1 - sequence length
                # 2 - poly(A) start
                # 3 - poly(A) end
                # (4 - best run of As start, for debugging the need to detect adaptor seq)
                # (5 - best run of As end)
                # 6 - number of adaptor bases matched
                print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end, adaptor_bases)

                if a_start >= self.length:
                    if a_start < len(seq):
                        n_clipped += 1
                        total_clipped += a_start

                    print >> out_file, '@'+name
                    print >> out_file, seq[:a_start]
                    print >> out_file, '+'
                    print >> out_file, qual[:a_start]
                else:
                    n_discarded += 1

                if n%10000 == 0:
                    grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n))
                    # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')

                # Option to do a quick subsample
                if self.only and self.only <= n:
                    break

    grace.status('')

    self.log.datum(self.sample,'reads',n)
    if n:
        self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
    self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
    self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
    if n_clipped:
        self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
def run(self):
    log = self.log

    #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
    #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
    #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
    #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
    #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
    #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
    #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
    #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
    #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
    #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
    #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
    #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
    #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
    #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
    #grace.expect_no_further_options(args)

    prefix = self.prefix
    log_name = os.path.split(prefix)[1]
    quality_cutoff = self.quality
    qoffset = self.qoffset
    clip_ambiguous = self.clip_ambiguous
    length_cutoff = self.length
    adaptor_cutoff = self.match
    max_error = self.max_errors
    disallow_homopolymers = self.homopolymers
    reverse_complement = self.revcom
    trim_start = self.trim_start
    trim_end = self.trim_end
    output_fasta = self.fasta
    use_gzip = self.gzip
    output_rejects = self.rejects

    iterators = [ ]
    filenames = [ ]
    any_paired = False

    for filename in self.reads:
        filenames.append(filename)
        iterators.append(itertools.izip(
            io.read_sequences(filename, qualities=True)
            ))

    for pair_filenames in self.pairs:
        assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
        filenames.extend(pair_filenames)
        any_paired = True
        iterators.append(itertools.izip(
            io.read_sequences(pair_filenames[0], qualities=True),
            io.read_sequences(pair_filenames[1], qualities=True)
            ))

    for filename in self.interleaved:
        filenames.append(filename)
        any_paired = True
        iterators.append(deinterleave(
            io.read_sequences(filename, qualities=True)
            ))

    fragment_reads = (2 if any_paired else 1)
    read_in_fragment_names = [ 'read-1', 'read-2' ] if any_paired else [ 'read' ]

    assert iterators, 'Nothing to clip'

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    if qoffset is None:
        guesses = [ io.guess_quality_offset(filename) for filename in filenames ]
        assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify manually.'
        qoffset = guesses[0]
        log.log('FASTQ offset seems to be %d\n' % qoffset)

    quality_cutoff_char = chr(qoffset + quality_cutoff)

    #log.log('Minimum quality: %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
    #log.log('Clip ambiguous bases: %s\n' % (grace.describe_bool(clip_ambiguous)))
    #log.log('Minimum adaptor match: %d bases, %d errors\n' % (adaptor_cutoff, max_error))
    #log.log('Minimum length: %d bases\n' % length_cutoff)

    adaptor_seqs = [ ]
    adaptor_names = [ ]
    if self.adaptor_clip:
        if self.adaptor_file:
            adaptor_iter = io.read_sequences(self.adaptor_file)
        else:
            adaptor_iter = ADAPTORS
        for name, seq in adaptor_iter:
            seq = seq.upper().replace('U','T')
            adaptor_seqs.append(seq)
            adaptor_names.append(name)
            adaptor_seqs.append(bio.reverse_complement(seq))
            adaptor_names.append(name)

    matcher = Matcher(adaptor_seqs, adaptor_names, max_error)

    start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]

    if output_fasta:
        write_sequence = io.write_fasta_single_line
    else:
        write_sequence = io.write_fastq

    f_single = io.open_possibly_compressed_writer(self.reads_output_filenames()[0])
    if fragment_reads == 2:
        names = self.pairs_output_filenames()[0] if self.out_separate else self.interleaved_output_filenames()
        f_paired = map(io.open_possibly_compressed_writer, names)
    if output_rejects:
        f_reject = io.open_possibly_compressed_writer(self.rejects_output_filenames()[0])

    n_single = 0
    n_paired = 0

    n_in_single = 0
    n_in_paired = 0
    total_in_length = [ 0 ] * fragment_reads

    n_out = [ 0 ] * fragment_reads
    n_q_clipped = [ 0 ] * fragment_reads
    n_a_clipped = [ 0 ] * fragment_reads
    n_homopolymers = [ 0 ] * fragment_reads
    total_out_length = [ 0 ] * fragment_reads

    #log.attach(open(prefix + '_log.txt', 'wb'))

    for iterator in iterators:
        for fragment in iterator:
            if (n_in_single+n_in_paired) % 10000 == 0:
                grace.status('Clipping fragment %s' % grace.pretty_number(n_in_single+n_in_paired))

            if len(fragment) == 1:
                n_in_single += 1
            else:
                n_in_paired += 1

            graduates = [ ]
            rejects = [ ]
            for i, (name, seq, qual) in enumerate(fragment):
                seq = seq.upper()
                total_in_length[i] += len(seq)

                if self.trim_to:
                    seq = seq[:self.trim_to]
                    qual = qual[:self.trim_to]

                start = trim_start
                best_start = 0
                best_len = 0
                for j in xrange(len(seq)-trim_end):
                    if qual[j] < quality_cutoff_char or \
                       (clip_ambiguous and seq[j] not in 'ACGT'):
                        if best_len < j-start:
                            best_start = start
                            best_len = j-start
                        start = j + 1
                j = len(seq)-trim_end
                if best_len < j-start:
                    best_start = start
                    best_len = j-start

                clipped_seq = seq[best_start:best_start+best_len]
                clipped_qual = qual[best_start:best_start+best_len]

                if len(clipped_seq) < length_cutoff:
                    n_q_clipped[i] += 1
                    rejects.append( (name,seq,qual,'quality') )
                    continue

                match = matcher.match(clipped_seq)
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[match[0]:]
                    clipped_qual = clipped_qual[match[0]:]
                    start_clips[i][match[0]].append( match[1][0] )
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append( (name,seq,qual,'adaptor') )
                        continue

                match = matcher.match(bio.reverse_complement(clipped_seq))
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[: len(clipped_seq)-match[0] ]
                    clipped_qual = clipped_qual[: len(clipped_qual)-match[0] ]
                    end_clips[i][match[0]].append( match[1][0] )
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append( (name,seq,qual,'adaptor') )
                        continue

                if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                    n_homopolymers[i] += 1
                    rejects.append( (name,seq,qual,'homopolymer') )
                    continue

                graduates.append( (name, clipped_seq, clipped_qual) )
                n_out[i] += 1
                total_out_length[i] += len(clipped_seq)

            if output_rejects:
                for name,seq,qual,reason in rejects:
                    write_sequence(f_reject, name + ' ' + reason, seq, qual)

            if graduates:
                if reverse_complement:
                    graduates = [
                        (name, bio.reverse_complement(seq), qual[::-1])
                        for name, seq, qual in graduates
                        ]

                if len(graduates) == 1:
                    n_single += 1
                    (name, seq, qual) = graduates[0]
                    write_sequence(f_single, name, seq, qual)
                else:
                    assert len(graduates) == 2
                    n_paired += 1

                    # Write the pair to an interleaved file or separate l/r files
                    for (lr,(name, seq, qual)) in enumerate(graduates):
                        write_sequence(f_paired[lr%len(f_paired)], name, seq, qual)

    grace.status('')

    if output_rejects:
        f_reject.close()
    if fragment_reads == 2:
        map(lambda f: f.close(), f_paired)
    f_single.close()

    def summarize_clips(name, location, clips):
        total = 0
        for i in clips:
            total += len(clips[i])
        log.datum(log_name, name + ' adaptors clipped at ' + location, total)

        if not clips:
            return

        for i in xrange(min(clips), max(clips)+1):
            item = clips[i]
            log.quietly_log('%3d bases: %10d ' % (i, len(item)))
            if item:
                avg_errors = float(sum( item2[0] for item2 in item )) / len(item)
                log.quietly_log(' avg errors: %5.2f ' % avg_errors)

                counts = collections.defaultdict(int)
                for item2 in item:
                    counts[item2[1]] += 1
                #print counts
                for no in sorted(counts,key=lambda item2:counts[item2],reverse=True)[:2]:
                    log.quietly_log('%dx%s ' % (counts[no], matcher.names[no]))
                if len(counts) > 2:
                    log.quietly_log('...')
            log.quietly_log('\n')
        log.quietly_log('\n')

    if n_in_paired:
        log.datum(log_name,'read-pairs', n_in_paired)
    if n_in_single:
        log.datum(log_name, 'single reads', n_in_single)

    for i in xrange(fragment_reads):
        if start_clips:
            summarize_clips(read_in_fragment_names[i], 'start', start_clips[i])
        if end_clips:
            summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])

        prefix = read_in_fragment_names[i]

        log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i])
        log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i])
        if disallow_homopolymers:
            log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i])
        if fragment_reads > 1:
            log.datum(log_name, prefix + ' kept', n_out[i])
        log.datum(log_name, prefix + ' average input length', float(total_in_length[i]) / (n_in_single+n_in_paired))
        if n_out[i]:
            log.datum(log_name, prefix + ' average output length', float(total_out_length[i]) / n_out[i])

    if fragment_reads == 2:
        log.datum(log_name,'pairs kept after clipping', n_paired)
    log.datum(log_name, 'reads kept after clipping', n_single)
def run(self):
    """ <sequence> <poly-A> <adaptor> <anything> """
    min_quality = chr(33+self.quality)

    with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
         io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
        print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'

        n = 0
        n_discarded = 0
        n_clipped = 0
        total_before = 0
        total_clipped = 0

        for filename in self.filenames:
            for name, seq, qual in io.read_sequences(filename, qualities='required'):
                best_score = 0
                best_a_start = len(seq)
                best_a_end = len(seq)
                best_adaptor_bases = 0
                best_aonly_score = 0
                best_aonly_start = len(seq)
                best_aonly_end = len(seq)

                for a_start in xrange(len(seq)):
                    if a_start and seq[a_start-1] == 'A': continue

                    a_end = a_start
                    aonly_score = 0
                    while True:
                        if aonly_score > best_aonly_score:
                            best_aonly_score = aonly_score
                            best_aonly_start = a_start
                            best_aonly_end = a_end

                        score = aonly_score
                        adaptor_bases = 0
                        for i in xrange(a_end,min(a_end+len(self.adaptor),len(seq))):
                            if qual[i] >= min_quality:
                                if seq[i] == self.adaptor[i-a_end]:
                                    score += 1
                                    adaptor_bases += 1
                                else:
                                    score -= 4
                        if score > best_score:
                            best_score = score
                            best_a_start = a_start
                            best_a_end = a_end
                            best_adaptor_bases = adaptor_bases

                        if a_end >= len(seq): break

                        if qual[a_end] >= min_quality:
                            if seq[a_end] == 'A':
                                aonly_score += 1
                            else:
                                aonly_score -= 4
                                if aonly_score <= 0: break
                        a_end += 1

                a_start = best_a_start
                a_end = best_a_end
                adaptor_bases = best_adaptor_bases
                aonly_start = best_aonly_start
                aonly_end = best_aonly_end

                if self.debug: # and a_end == a_start and a_end < len(seq)-10:
                    print name
                    print ''.join( 'X' if item<min_quality else ' ' for item in qual )
                    print seq
                    print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor
                    print ' '*aonly_start + 'A'*(aonly_end-aonly_start)
                    print
                    sys.stdout.flush()

                n += 1
                total_before += len(seq)

                # 0 - sequence name
                # 1 - sequence length
                # 2 - poly(A) start
                # 3 - poly(A) end
                # (4 - best run of As start, for debugging the need to detect adaptor seq)
                # (5 - best run of As end)
                # 6 - number of adaptor bases matched
                print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end, adaptor_bases)

                if a_start > self.length:
                    if a_start < len(seq):
                        n_clipped += 1
                        total_clipped += a_start

                    print >> out_file, '@'+name
                    print >> out_file, seq[:a_start]
                    print >> out_file, '+'
                    print >> out_file, qual[:a_start]
                else:
                    n_discarded += 1

                if n%10000 == 0:
                    grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n))
                    # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')

    grace.status('')

    self.log.datum(self.sample,'reads',n)
    if n:
        self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
    self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
    self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
    if n_clipped:
        self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
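# Self-contained sketch (not part of the original tool) of the scoring scan used by
# run() above: for each candidate poly(A) start, grow a run of A's (+1 per A, -4 per
# mismatch) and, at each candidate end, also score how well the following bases match
# the adaptor. The best-scoring start is where the read would be clipped. Quality
# handling, adaptor-base counting and file I/O are omitted; the adaptor sequence in
# the example is an assumption, not necessarily the one the pipeline uses.
def find_poly_a_clip(seq, adaptor):
    best_score = 0
    best_a_start = len(seq)
    for a_start in range(len(seq)):
        if a_start and seq[a_start-1] == 'A':
            continue   # only consider the first A of a run
        a_end = a_start
        aonly_score = 0
        while True:
            # Score the poly(A) run [a_start, a_end) followed by the adaptor.
            score = aonly_score
            for i in range(a_end, min(a_end+len(adaptor), len(seq))):
                score += 1 if seq[i] == adaptor[i-a_end] else -4
            if score > best_score:
                best_score = score
                best_a_start = a_start
            if a_end >= len(seq):
                break
            if seq[a_end] == 'A':
                aonly_score += 1
            else:
                aonly_score -= 4
                if aonly_score <= 0:
                    break
            a_end += 1
    return best_a_start   # the kept read would be seq[:best_a_start]

# For example, with a hypothetical adaptor:
#   find_poly_a_clip('ACGTTGCAAAAAAAAGATCGGAAGAGC', 'AGATCGGAAGAGC')
# returns 7, i.e. the read would be clipped to 'ACGTTGC'.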
def run(self):
    assert self.extension is not None, '--extension must be specified'

    #workspace = self.get_workspace()
    workspace = working_directory.Working(self.working_dir, must_exist=True)
    if self.annotations == None:
        reference = workspace.get_reference()
        annotations_filename = reference.annotations_filename()
    else:
        annotations_filename = self.annotations

    types = [ item.lower() for item in self.types.split(',') ]

    annotations = [
        item
        for item in annotation.read_annotations(annotations_filename)
        if item.type.lower() in types
        ]

    self.log.log('%d annotations\n' % len(annotations))

    assert annotations, 'No annotations of specified types in file'

    index = { }

    for item in annotations:
        if item.strand >= 0:
            item.tail_pos = item.end
            item.end += self.extension
        else:
            item.tail_pos = item.start
            item.start -= self.extension

        if item.seqid not in index:
            index[item.seqid] = span_index.Span_index()
        index[item.seqid].insert(item)

        item.hits = [] # [ (rel_start, rel_end, tail_length) ]

    for item in index.itervalues():
        item.prepare()

    for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(workspace/'alignments_filtered.bam'):
        for fragment in fragment_alignments:
            start = min(item.pos-1 for item in fragment)
            end = max(item.pos+item.length-1 for item in fragment)
            alignment_length = end-start
            strand = -1 if fragment[0].flag&sam.FLAG_REVERSE else 1

            if strand >= 0:
                tail_pos = end
            else:
                tail_pos = start

            tail_length = 0
            adaptor_bases = 0
            for item in fragment[0].extra:
                if item.startswith('AN:i:'):
                    tail_length = int(item[5:])
                elif item.startswith('AD:i:'):
                    adaptor_bases = int(item[5:])

            if fragment[0].rname in index:
                hits = [
                    gene
                    for gene in index[fragment[0].rname].get(start,end)
                    if gene.strand == strand
                    ]
                if hits:
                    gene = min(hits, key=lambda gene:
                        (abs(tail_pos - gene.tail_pos), gene.get_id()))
                        # Nearest by tail_pos
                        # failing that, by id to ensure a deterministic choice

                    if strand > 0:
                        rel_start = start - gene.start
                        rel_end = end - gene.start
                    else:
                        rel_start = gene.end - end
                        rel_end = gene.end - start

                    gene.hits.append( (rel_start,rel_end,tail_length,adaptor_bases) )

    f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
    pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
    f.close()
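# Sketch (not from the original source) of reading back what the two tail-counting
# run() methods above write: a (workspace_name, tags, annotations) tuple pickled into
# '<prefix>.pickle.gz', where each annotation carries a .hits list of per-read tuples.
# gzip and pickle are standard library; the attribute layout is inferred from the
# pickle.dump calls above, and the filename here is hypothetical.
import gzip
import pickle

def load_tail_counts(filename):
    with gzip.open(filename, 'rb') as f:
        sample_name, tags, annotations = pickle.load(f)
    for item in annotations:
        # Each hit is e.g. (tail_length, adaptor_bases) in the newer variant.
        print('%s\t%d hits' % (item.get_id(), len(item.hits)))
    return sample_name, tags, annotations

# e.g. load_tail_counts('sample.pickle.gz')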