def call_variants(self, ksize, mindist=6, logstream=sys.stderr):
    """Attempt to call variants from this contig alignment.

    The caller is chosen from `self.vartype`:

    - 'snv': the SNV caller is run over the full match.
    - 'indel': the indel caller is run first, then the SNV caller is run
      over the left and right flanks (without emitting "no call" records
      for flanks that match perfectly).
    - anything else: a single "no call" variant flagged with the
      `InscrutableCigar` filter is yielded.

    If an SNV call is within `mindist` base pairs of the end of the
    alignment it is ignored. Set `mindist` to `None` to disable this
    behavior.

    Calls for which `self.is_passenger` returns true are flagged with the
    `PassengerVariant` filter; they are still yielded to the caller.

    `logstream` is forwarded to every SNV caller invocation, including the
    ones made for the indel flanks.

    This is a generator yielding variant call objects.
    """
    offset = 0 if self.targetshort else self.offset

    if self.vartype == 'snv':
        caller = self.call_snv(
            self.match.query, self.match.target, offset, ksize, mindist,
            logstream=logstream
        )
        for call in caller:
            if self.is_passenger(call):
                call.filter(vf.PassengerVariant)
            yield call

    elif self.vartype == 'indel':
        # Only the first call produced by the indel caller is reported.
        indelcaller = self.call_indel(ksize)
        indel = next(indelcaller)
        if self.is_passenger(indel):
            indel.filter(vf.PassengerVariant)
        yield indel

        leftflankcaller = self.call_snv(
            self.leftflank.query, self.leftflank.target, offset, ksize,
            mindist, donocall=False, logstream=logstream
        )
        # The right flank starts after the left flank and, for a deletion,
        # after the deleted target bases as well.
        offset += self.leftflank.length
        if self.indeltype == 'D':
            offset += self.indel.length
        rightflankcaller = self.call_snv(
            self.rightflank.query, self.rightflank.target, offset, ksize,
            mindist, donocall=False, logstream=logstream
        )
        for call in chain(leftflankcaller, rightflankcaller):
            if self.is_passenger(call):
                call.filter(vf.PassengerVariant)
            yield call

    else:
        nocall = Variant(
            self.seqid, self.pos, '.', '.', CONTIG=self.varseq,
            CIGAR=self.cigar, KSW2=str(self.score)
        )
        nocall.filter(vf.InscrutableCigar)
        yield nocall
def call_snv(self, qseq, tseq, offset, ksize, mindist=6, donocall=True,
             logstream=sys.stderr):
    """Yield an SNV call for each mismatch between two aligned sequences.

    `qseq` and `tseq` are the query and target sequences and must have
    identical length; `offset` is the number of 5' nucleotides in the
    target not aligned to the query; `ksize` determines the window around
    each mismatch (up to `ksize - 1` positions on either side, clipped to
    the sequence bounds) reported as ALTWINDOW (from `qseq`) and
    REFRWINDOW (from `tseq`).

    When `mindist` is truthy, mismatch positions are first passed through
    `trim_terminal_snvs` together with `logstream`. If no mismatches
    remain, a single "no call" flagged `PerfectMatch` is yielded when
    `donocall` is true; otherwise nothing is yielded.
    """
    length = len(qseq)
    assert len(tseq) == length

    mismatches = [
        idx for idx, (tbase, qbase) in enumerate(zip(tseq, qseq))
        if tbase != qbase
    ]
    if mindist:
        mismatches = trim_terminal_snvs(mismatches, length, mindist, logstream)

    if len(mismatches) == 0:
        if not donocall:
            return
        nocall = Variant(
            self.seqid, self.cutout.local_to_global(offset), '.', '.',
            CONTIG=qseq, CIGAR=self.cigar, KSW2=str(self.score),
            IKMERS=str(len(self.contig.annotations))
        )
        nocall.filter(vf.PerfectMatch)
        yield nocall
        return

    for pos in mismatches:
        # Window covering every k-mer that overlaps the mismatch position.
        lo = max(pos - ksize + 1, 0)
        hi = min(pos + ksize, length)
        window = slice(lo, hi)
        altwindow = qseq[window]
        refrwindow = tseq[window]

        refr = tseq[pos].upper()
        alt = qseq[pos].upper()
        globalcoord = self.cutout.local_to_global(pos + offset)
        nikmers = n_ikmers_present(self.contig, altwindow)

        yield Variant(
            self.seqid, globalcoord, refr, alt, CONTIG=qseq,
            CIGAR=self.cigar, KSW2=str(self.score), IKMERS=str(nikmers),
            ALTWINDOW=altwindow, REFRWINDOW=refrwindow
        )
def test_filter_field():
    """Check how `Variant.filterstr` renders under various filter states."""
    # A '.'/'.' variant reports '.' until a filter is applied.
    v = Variant('scaffold1', 12345, '.', '.')
    assert v.filterstr == '.'
    v.filter(vf.InscrutableCigar)
    assert v.filterstr == 'InscrutableCigar'

    v = Variant('chr1', 55555, '.', '.')
    v.filter(vf.PerfectMatch)
    assert v.filterstr == 'PerfectMatch'

    # A variant with real alleles reports 'PASS' until a filter is applied.
    v = Variant('1', 809768, 'C', 'CAT')
    assert v.filterstr == 'PASS'
    v.filter(vf.PassengerVariant)
    assert v.filterstr == 'PassengerVariant'
    # Rendered as 'Homopolymer;PassengerVariant' even though
    # PassengerVariant was applied first.
    v.filter(vf.Homopolymer)
    assert v.filterstr == 'Homopolymer;PassengerVariant'

    # These "filters" shouldn't actually do anything
    v = Variant('one', 112358, 'T', 'A')
    v.filter('SNPyMcSNPface')
    v.filter(6.022e23)
    v.filter(dict(chicken='waffles', biscuits='gravy'))
    # The original line was a bare comparison whose result was discarded;
    # it must be asserted for the check to have any effect.
    assert v.filterstr == 'PASS'