def test_len(self):
    # with fasta
    seq = SeqItem(name='s1', lines=['>s1\n', 'ACTGGTAC\n'])
    seq = SeqWrapper(SEQITEM, seq, 'fasta')
    assert get_length(seq) == 8

    # with fastq
    seq = SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '????\n'])
    seq = SeqWrapper(SEQITEM, seq, 'fastq')
    assert get_length(seq) == 4

def _do_trim(self, seq):
    'It trims the edges of the given seqs.'
    annots = get_annotations(seq)
    if TRIMMING_RECOMMENDATIONS not in annots:
        return seq

    trim_rec = annots[TRIMMING_RECOMMENDATIONS]

    # fixing the trimming recommendations
    if TRIMMING_RECOMMENDATIONS in annots:
        del annots[TRIMMING_RECOMMENDATIONS]

    trim_segments = []
    for trim_kind in TRIMMING_KINDS:
        trim_segments.extend(trim_rec.get(trim_kind, []))

    # masking
    if self.mask:
        seq = _mask_sequence(seq, trim_segments)
    else:
        # trimming
        if trim_segments:
            trim_limits = get_longest_complementary_segment(
                trim_segments, get_length(seq))
            if trim_limits is None:
                # there's no sequence left
                return None
        else:
            trim_limits = []

        if trim_limits:
            seq = slice_seq(seq, trim_limits[0], trim_limits[1] + 1)

    return seq

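# A standalone sketch (not the library's implementation) of what
# get_longest_complementary_segment is used for above: given 0-based,
# inclusive trim segments, find the longest stretch left untrimmed, or None
# when nothing remains. The helper name and example values are illustrative.
def _longest_untrimmed_stretch(trim_segments, seq_len):
    covered = set()
    for start, end in trim_segments:
        covered.update(range(start, end + 1))
    stretches = []
    start = None
    for pos in range(seq_len + 1):
        if pos < seq_len and pos not in covered:
            if start is None:
                start = pos
        elif start is not None:
            stretches.append((start, pos - 1))
            start = None
    return max(stretches, key=lambda s: s[1] - s[0]) if stretches else None

assert _longest_untrimmed_stretch([(0, 2), (6, 7)], 8) == (3, 5)
assert _longest_untrimmed_stretch([(0, 7)], 8) is None
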
def _do_trim(self, aligned_reads):
    max_clipping = self.max_clipping
    primary_alignment = _get_primary_alignment(aligned_reads)
    _5end = _get_longest_5end_alinged_read(aligned_reads, max_clipping)
    seq = alignedread_to_seqitem(primary_alignment)
    segments = None
    if _5end is not None:
        if not _read_is_totally_mapped([_5end], max_clipping):
            if not _5end.is_reverse:
                qend = _get_qend(_5end)
            else:
                qend = get_length(seq) - _get_qstart(_5end)
            segments = [(qend, get_length(seq) - 1)]
    if segments is not None:
        _add_trim_segments(segments, seq, kind=OTHER)
    return seq

def _do_trim(self, seq):
    'It trims the masked segments of the SeqWrappers.'
    segments = self._matcher.get_matched_segments_for_read(get_name(seq))
    if segments is not None:
        segments = [(segment[0], get_length(seq) - 1)
                    for segment in segments[0]]
        _add_trim_segments(segments, seq, kind=OTHER)
    return seq

def count_seqs(seqs):
    'It counts the number of sequences and the total length.'
    num_seqs = 0
    total_len = 0
    for seq in seqs:
        total_len += get_length(seq)
        num_seqs += 1
    return {'num_seqs': num_seqs, 'total_length': total_len}

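# A minimal usage sketch for count_seqs, assuming the SeqItem/SeqWrapper
# wrappers shown in the test above are available; the two in-memory fasta
# records are made up for illustration.
seqs = [SeqWrapper(SEQITEM, SeqItem(name='s1', lines=['>s1\n', 'ACTGGTAC\n']),
                   'fasta'),
        SeqWrapper(SEQITEM, SeqItem(name='s2', lines=['>s2\n', 'ACTG\n']),
                   'fasta')]
assert count_seqs(seqs) == {'num_seqs': 2, 'total_length': 12}
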
def _do_trim(self, seq):
    'It trims the edges of the given seqs.'
    left = self.left
    right = self.right
    segments = [(0, left - 1)] if left else []
    if right:
        seq_len = get_length(seq)
        segments.append((seq_len - right, seq_len - 1))
    _add_trim_segments(segments, seq, kind=OTHER)
    return seq

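# Standalone sketch (plain Python, no library helpers) of the segment
# arithmetic used above: `left` and `right` edge lengths become 0-based,
# inclusive (start, end) pairs. The helper name is illustrative.
def _edge_trim_segments(seq_len, left, right):
    segments = [(0, left - 1)] if left else []
    if right:
        segments.append((seq_len - right, seq_len - 1))
    return segments

assert _edge_trim_segments(10, left=2, right=3) == [(0, 1), (7, 9)]
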
def _do_check(self, seq):
    min_ = self.min
    max_ = self.max
    length = (uppercase_length(get_str_seq(seq)) if self.ignore_masked
              else get_length(seq))
    passed = True
    if min_ is not None and length < min_:
        passed = False
    if max_ is not None and length > max_:
        passed = False
    return passed

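# Standalone sketch of the ignore_masked branch above: masked residues are
# lower case (see _mask_sequence below), so only the upper-case ones count
# towards the length when masking is ignored. The read is illustrative.
masked_read = 'acTGGTac'
unmasked_length = sum(1 for res in masked_read if res.isupper())
assert unmasked_length == 4
assert len(masked_read) == 8
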
def _mask_sequence(seq, segments):
    'It masks the given segments of the sequence'
    if not segments:
        return seq
    segments = merge_overlaping_segments(segments)
    segments = get_all_segments(segments, get_length(seq))
    str_seq = get_str_seq(seq)
    new_seq = ''
    for segment in segments:
        start = segment[0][0]
        end = segment[0][1] + 1
        str_seq_ = str_seq[start:end]
        if segment[1]:
            str_seq_ = str_seq_.lower()
        new_seq += str_seq_
    if seq.kind == SEQRECORD:
        new_seq = Seq(new_seq, alphabet=seq.object.seq.alphabet)
    return copy_seq(seq, seq=new_seq)

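# Standalone sketch of the masking idea above: lower-case the residues that
# fall inside 0-based, inclusive segments. It skips the library helpers
# (merge_overlaping_segments, get_all_segments) and works on a plain string,
# so it only illustrates the expected output.
def _mask_str(str_seq, segments):
    masked = list(str_seq)
    for start, end in segments:
        masked[start:end + 1] = [res.lower() for res in masked[start:end + 1]]
    return ''.join(masked)

assert _mask_str('ACTGGTAC', [(0, 1), (6, 7)]) == 'acTGGTac'
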
def _get_chrom_lengths(self):
    chrom_lens = OrderedDict()
    if self._ref_fhand is None:
        vcf_fhand = gzip.open(self._reader.fhand.name)
        for line in vcf_fhand:
            line = line.strip()
            if line.startswith('#'):
                continue
            items = line.split()
            chrom = items[0]
            loc = int(items[1])
            if chrom not in chrom_lens:
                chrom_lens[chrom] = loc
            elif loc > chrom_lens[chrom]:
                chrom_lens[chrom] = loc
    else:
        for read in read_seqs([self._ref_fhand]):
            chrom_lens[get_name(read)] = get_length(read)
    return chrom_lens

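# Standalone sketch of the VCF branch above: when no reference is given, a
# chromosome "length" is approximated by the largest variant position seen
# for that chromosome. The input lines are made up for illustration.
lines = ['#CHROM\tPOS\tID\n', 'chr1\t10\t.\n', 'chr1\t250\t.\n',
         'chr2\t42\t.\n']
chrom_lens = {}
for line in lines:
    if line.startswith('#'):
        continue
    items = line.split()
    chrom, loc = items[0], int(items[1])
    if loc > chrom_lens.get(chrom, 0):
        chrom_lens[chrom] = loc
assert chrom_lens == {'chr1': 250, 'chr2': 42}
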
def _get_seq_lengths(fhand):
    return {get_name(seq): get_length(seq) for seq in read_seqs([fhand])}

def calculate_sequence_stats(seqs, kmer_size=None, do_dust_stats=False,
                             nxs=None):
    'It calculates some stats for the given seqs.'
    # get data
    lengths = IntCounter()
    quals_per_pos = IntBoxplot()
    nucl_freq = NuclFreqsPlot()
    kmer_counter = KmerCounter(kmer_size) if kmer_size else None
    dustscores = IntCounter()
    for seq in seqs:
        lengths[get_length(seq)] += 1

        try:
            quals = get_int_qualities(seq)
        except AttributeError:
            quals = []
        for index, qual in enumerate(quals):
            quals_per_pos.append(index + 1, qual)

        str_seq = get_str_seq(seq)
        for index, nucl in enumerate(str_seq):
            nucl_freq.append(index, nucl)

        if kmer_counter is not None:
            kmer_counter.count_seq(str_seq)

        if do_dust_stats:
            dustscore = calculate_dust_score(seq)
            if dustscore is not None:
                dustscores[int(dustscore)] += 1

    lengths.update_labels({'sum': 'tot. residues', 'items': 'num. seqs.'})

    # length distribution
    lengths_srt = 'Length stats and distribution.\n'
    lengths_srt += '------------------------------\n'
    nxs = sorted(nxs) if nxs else []
    for nx in nxs:
        lengths_srt += 'N{:d}: {:d}\n'.format(nx, calculate_nx(lengths, nx))
    lengths_srt += str(lengths)
    lengths_srt += '\n'

    # aggregate quals
    if quals_per_pos:
        quals = quals_per_pos.aggregated_array
        quals.update_labels({'sum': None, 'items': 'tot. base pairs'})

        q30 = quals.count_relative_to_value(30, operator.ge) / quals.count
        q30 *= 100
        q20 = quals.count_relative_to_value(20, operator.ge) / quals.count
        q20 *= 100

        # qual distribution
        qual_str = 'Quality stats and distribution.\n'
        qual_str += '-------------------------------\n'
        qual_str += 'Q20: {:.2f}\n'.format(q20)
        qual_str += 'Q30: {:.2f}\n'.format(q30)
        qual_str += str(quals)
        qual_str += '\n'

        # qual per position boxplot
        qual_boxplot = 'Boxplot for quality per position.\n'
        qual_boxplot += '---------------------------------\n'
        qual_boxplot += quals_per_pos.ascii_plot
        qual_boxplot += '\n'
    else:
        qual_str = ''
        qual_boxplot = ''

    # nucl freqs
    freq_str = 'Nucleotide frequency per position.\n'
    freq_str += '----------------------------------\n'
    freq_str += nucl_freq.ascii_plot
    freq_str += '\n'

    # kmer distribution
    kmer_str = ''
    if kmer_counter is not None:
        kmers = IntCounter(kmer_counter.values)
        if kmers:
            kmers.update_labels({'sum': None, 'items': 'num. kmers'})
            kmer_str = 'Kmer distribution\n'
            kmer_str += '-----------------\n'
            kmer_str += str(kmers)
            kmer_str += '\n'
            kmer_str += 'Most common kmers:\n'
            for kmer, number in kmer_counter.most_common(20):
                kmer_str += '\t{}: {}\n'.format(kmer, number)

    dust_str = ''
    if dustscores:
        dustscores.update_labels({'sum': None, 'items': 'num. seqs.'})
        dust_str = 'Dustscores stats and distribution.\n'
        dust_str += '----------------------------------\n'
        dust7 = (dustscores.count_relative_to_value(7, operator.gt) /
                 dustscores.count)
        dust_str += '% above 7 (low complexity): {:.2f}\n'.format(dust7)
        dust_str += str(dustscores)
        dust_str += '\n'

    return {'length': lengths_srt, 'quality': qual_str, 'nucl_freq': freq_str,
            'qual_boxplot': qual_boxplot, 'kmer': kmer_str,
            'dustscore': dust_str}

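# Standalone sketch (not the library's calculate_nx) of the Nx statistic
# reported above: N50 is the length L such that reads of length >= L hold at
# least 50% of all residues. The helper name and lengths are illustrative.
def _nx(lengths, x):
    threshold = sum(lengths) * x / 100.0
    acc = 0
    for length in sorted(lengths, reverse=True):
        acc += length
        if acc >= threshold:
            return length

assert _nx([2, 2, 2, 3, 3, 4, 8, 8], 50) == 8
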
class MatePairSplitter(object):
    'It splits the input sequences with the provided linkers.'

    def __init__(self, linkers=None):
        'The initiator'
        if linkers is None:
            linkers = get_setting('LINKERS')
        linkers = [SeqItem(str(i), '>%d\n%s\n' % (i, l))
                   for i, l in enumerate(linkers)]
        linkers = assing_kind_to_seqs(SEQITEM, linkers, 'fasta')
        self.linkers = list(linkers)

    def __call__(self, seqs):
        'It splits a list of sequences with the provided linkers'
        seq_fhand = write_seqs(seqs, file_format='fasta')
        seq_fhand.flush()
        min_identity = 87.0
        min_len = 13
        filters = [{'kind': 'min_length', 'min_num_residues': min_len,
                    'length_in_query': False, 'filter_match_parts': True},
                   {'kind': 'score_threshold', 'score_key': 'identity',
                    'min_score': min_identity}]
        matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                        program='blastn', filters=filters,
                                        params={'task': 'blastn-short'},
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        new_seqs = []
        for seq in seqs:
            segments = matcher.get_matched_segments_for_read(get_name(seq))
            if segments is not None:
                split_seqs = self._split_by_mate_linker(seq, segments)
            else:
                split_seqs = [seq]
            for seq in split_seqs:
                new_seqs.append(seq)
        return new_seqs

    def _split_by_mate_linker(self, seq, (segments, is_partial)):
        'It splits the seqs using segments'
        if not segments:
            return [copy_seq(seq)]

        elongated_match = is_partial
        if len(segments) == 1:
            segment_start = segments[0][0]
            segment_end = segments[0][1]
            seq_end = get_length(seq) - 1
            if segment_start == 0:
                new_seq = slice_seq(seq, segment_end + 1, None)
                return [new_seq]
            elif segment_end == seq_end:
                new_seq = slice_seq(seq, None, segment_start)
                return [new_seq]
            elif segment_end > seq_end:
                msg = 'The segment ends after the sequence has ended'
                raise RuntimeError(msg)
            else:
                new_seq1 = slice_seq(seq, None, segment_start)
                new_seq2 = slice_seq(seq, segment_end + 1, None)
                if elongated_match:
                    name = get_name(seq) + '_pl'
                else:
                    name = get_name(seq)
                new_seq1 = copy_seq(new_seq1, name=name + r'\1')
                new_seq2 = copy_seq(new_seq2, name=name + r'\2')
                return [new_seq1, new_seq2]
        else:
            seqs = []
            counter = 1
            seq_start = 0
            for segment_start, segment_end in segments:
                if segment_start == 0:
                    continue
                new_seq = slice_seq(seq, seq_start, segment_start)
                seq_name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
                new_seq = copy_seq(new_seq, name=seq_name)
                seqs.append(new_seq)
                counter += 1
                seq_start = segment_end + 1
            else:
                if segment_end != get_length(seq) + 1:
                    new_seq = slice_seq(seq, segment_end + 1, None)
                    name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
                    new_seq = copy_seq(new_seq, name=name)
                    seqs.append(new_seq)
            return seqs

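# Standalone sketch of the single-internal-linker case above, on a plain
# string: the read is cut around the 0-based, inclusive linker segment and
# the two halves become the \1 and \2 mates. Values are illustrative.
read = 'AAAACCCCLINKERGGGGTTTT'
segment = (8, 13)                  # positions of 'LINKER'
mate1 = read[:segment[0]]          # slice_seq(seq, None, segment_start)
mate2 = read[segment[1] + 1:]      # slice_seq(seq, segment_end + 1, None)
assert (mate1, mate2) == ('AAAACCCC', 'GGGGTTTT')
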