def testSummary(self):
    """
    Check that summary() reports the expected values for a title that has
    two aligned reads carrying three HSPs between them.
    """
    firstAlignment = TitleAlignment(
        Read('id1', 'ACGT'),
        [HSP(30, subjectStart=0, subjectEnd=2)])
    secondAlignment = TitleAlignment(
        Read('id2', 'ACGT'),
        [
            HSP(55, subjectStart=2, subjectEnd=4),
            HSP(40, subjectStart=8, subjectEnd=9),
        ])

    titleAlignments = TitleAlignments('subject title', 10)
    for titleAlignment in (firstAlignment, secondAlignment):
        titleAlignments.addAlignment(titleAlignment)

    expected = {
        'bestScore': 55,
        'coverage': 0.5,
        'hspCount': 3,
        'medianScore': 40,
        'readCount': 2,
        'subjectLength': 10,
        'subjectTitle': 'subject title',
    }
    self.assertEqual(expected, titleAlignments.summary())
def testLength(self):
    """
    Titles sorted by 'length' must come back in the expected order, with
    ties on length resolved by title.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        # The trailing comments give the subject length of each title.
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',  # 38000
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 37000
            'gi|887699|gb|DQ37780 Monkeypox virus 456',  # 35000
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',  # 35000
            'gi|887699|gb|DQ37780 Cowpox virus 15',  # 30000
        ]
        self.assertEqual(expected, titlesAlignments.sortTitles('length'))
def testMaxScore_EValue(self):
    """
    Titles sorted by 'maxScore' must come back in the expected order when
    the scores are e values, with ties resolved by title.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        # The scores are e values (LowerIsBetterScore), so the expected
        # order runs from the smallest e value to the largest. The
        # trailing comments give the e value behind each title.
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 1e-11
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',  # 1e-10
            'gi|887699|gb|DQ37780 Monkeypox virus 456',  # 1e-8
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',  # 1e-7
            'gi|887699|gb|DQ37780 Cowpox virus 15',  # 1e-6
        ]
        self.assertEqual(expected, titlesAlignments.sortTitles('maxScore'))
def __init__(self, id_, sequence, alignment=None):
    """
    Initialize a read whose sequence may be padded with '-' gap characters
    on either side.

    The leading and trailing gaps are stripped, the remaining sequence is
    upper-cased and handed to C{Read.__init__}, and the number of leading
    gaps is remembered in C{self.offset}.

    @param id_: The id passed on to C{Read.__init__}.
    @param sequence: The (possibly gap-padded) sequence.
    @param alignment: An optional value stored as C{self.alignment}.
    @raises ValueError: If C{sequence} contains nothing but gaps (this
        includes an empty sequence).
    """
    fullLength = len(sequence)

    self.significantOffsets = OrderedDict()
    self._originalLength = fullLength
    self.alignment = alignment

    # Number of gap characters before the first non-gap character (or the
    # full length if there is no non-gap character at all).
    leading = next(
        (index for index, base in enumerate(sequence) if base != '-'),
        fullLength)

    if leading == fullLength:
        raise ValueError('Read is all gaps.')

    # Same count, taken from the right-hand end.
    trailing = next(
        (index for index, base in enumerate(sequence[::-1])
         if base != '-'),
        fullLength)

    # Make sure the read is not all gaps.
    assert leading + trailing < fullLength

    self.offset = leading
    Read.__init__(self, id_,
                  sequence[leading:fullLength - trailing].upper())
def testReadSetFilterAllowAnything(self):
    """
    Filtering with minNewReads=0.0 (i.e., every read set counts as
    sufficiently novel) must keep all titles.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        filtered = titlesAlignments.filter(minNewReads=0.0)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(filtered))
def testCoverageIncludesSome(self):
    """
    Filtering on minCoverage must keep only the expected titles when just
    some of the titles have sufficient coverage.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        # The expected titles follow from the HSP coverage in
        # sample_data.py; the coverage has to be worked out by hand to
        # see why this threshold gives this result.
        filtered = titlesAlignments.filter(minCoverage=0.0003)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
        ]
        self.assertEqual(expected, sorted(filtered))
def testGapAmbiguous(self):
    """
    Ambiguous characters that are paired with gaps must be reported in
    the ambiguousOffsets of the read they occur in.
    """
    expected = {
        'match': {
            'identicalMatchCount': 2,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 2,
            'gapGapMismatchCount': 1,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [1],
            'extraCount': 0,
            'gapOffsets': [2, 3],
        },
        'read2': {
            'ambiguousOffsets': [3],
            'extraCount': 0,
            'gapOffsets': [1, 2],
        },
    }
    observed = compareDNAReads(Read('id1', 'AN--T'), Read('id2', 'A--NT'))
    self.assertEqual(expected, observed)
def testMaxMatchingReads(self):
    """
    Filtering with a maxMatchingReads value must keep only the expected
    titles.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        filtered = titlesAlignments.filter(maxMatchingReads=1)
        # Cowpox virus 15 is matched by two reads, so it must be absent.
        expected = sorted([
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.'
        ])
        self.assertEqual(expected, sorted(filtered))
def testMatchWithIdenticalAmbiguityButStrict(self):
    """
    Two identical sequences that share the same ambiguous code at the
    same offset must compare as expected when matchAmbiguous is False.
    """
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    observed = compareDNAReads(
        Read('id1', 'ACGTTN'), Read('id2', 'ACGTTN'),
        matchAmbiguous=False)
    self.assertEqual(expected, observed)
def testNonDefaultGapChars(self):
    """
    It must be possible to pass the set of gap characters explicitly.
    """
    gapChars = '+$'
    expected = {
        'match': {
            'identicalMatchCount': 3,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 2,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [2],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [0],
        },
    }
    # Each of the two gap characters is tried in turn.
    for gap in gapChars:
        read1 = Read('id1', 'AC' + gap + 'TT')
        read2 = Read('id2', gap + 'CGTT')
        self.assertEqual(
            expected, compareDNAReads(read1, read2, gapChars='+$'))
def testMatchWithIncompatibleAmbiguityInBoth(self):
    """
    Two sequences that are identical except for incompatible ambiguous
    codes at the same offset must compare as expected.
    """
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    observed = compareDNAReads(Read('id1', 'ACGTTW'), Read('id2', 'ACGTTS'))
    self.assertEqual(expected, observed)
def testNonMatchingAmbiguityInFirst(self):
    """
    Two sequences that are identical except for one (incompatible)
    ambiguous code in the first sequence must compare as expected.
    """
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    observed = compareDNAReads(Read('id1', 'ACGTTW'), Read('id2', 'ACGTTC'))
    self.assertEqual(expected, observed)
def testMatchWithAmbiguityButStrict(self):
    """
    Two sequences that are identical except for one ambiguous code in the
    first sequence must compare as expected when ambiguous matching is
    switched off with matchAmbiguous=False.
    """
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    observed = compareDNAReads(
        Read('id1', 'ACGTTS'), Read('id2', 'ACGTTC'),
        matchAmbiguous=False)
    self.assertEqual(expected, observed)
def testTitleCollection(self):
    """
    When a title appears in the alignments of more than one read, the
    data from every such read must be collected under that title.
    """
    records = (PARAMS, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        read2 = Read('id2', 'A' * 70)
        read3 = Read('id3', 'A' * 70)
        reads = Reads()
        for read in (read2, read3):
            reads.add(read)
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)

        title = 'gi|887699|gb|DQ37780 Cowpox virus 15'
        titleAlignments = titlesAlignments[title]
        self.assertEqual(title, titleAlignments.subjectTitle)
        self.assertEqual(30000, titleAlignments.subjectLength)
        self.assertEqual(2, len(titleAlignments))

        # One alignment per read, in the order the reads were added.
        first, second = titleAlignments[0], titleAlignments[1]
        self.assertEqual(read2, first.read)
        self.assertEqual(HSP(20), first.hsps[0])
        self.assertEqual(read3, second.read)
        self.assertEqual(HSP(20), second.hsps[0])
def testExtraInSecond(self):
    """
    Extra bases at the end of the second sequence must be counted in its
    extraCount.
    """
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 2,
            'gapOffsets': [],
        },
    }
    observed = compareDNAReads(Read('id1', 'ACGTT'), Read('id2', 'ACGTTCC'))
    self.assertEqual(expected, observed)
def testFilterWithNoArguments(self):
    """
    Calling filter() without arguments must give back a result holding
    every title of the original.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(3):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        filtered = titlesAlignments.filter()
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(filtered))
def testExtraAmbiguous(self):
    """
    Extra trailing characters in the first sequence must be counted in
    its extraCount, and an ambiguous code among them must also appear in
    its ambiguousOffsets.
    """
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [6],
            'extraCount': 2,
            'gapOffsets': [5],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    observed = compareDNAReads(Read('id1', 'ACGTT-N'), Read('id2', 'ACGTT'))
    self.assertEqual(expected, observed)
def testOffsets(self):
    """
    When a set of wanted offsets is given, only those offsets must
    contribute to the result.
    """
    expected = {
        'match': {
            'identicalMatchCount': 1,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    wanted = {0, 4}
    observed = compareDNAReads(
        Read('id1', 'ATT-T'), Read('id2', 'A-GTC'), offsets=wanted)
    self.assertEqual(expected, observed)
def testMismatch(self):
    """
    Mismatching (non-ambiguous) bases must be counted in
    nonGapMismatchCount.
    """
    expected = {
        'match': {
            'identicalMatchCount': 3,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 2,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    observed = compareDNAReads(Read('id1', 'ACGTT'), Read('id2', 'ACGCC'))
    self.assertEqual(expected, observed)
def testCoverageIncludesAll(self):
    """
    Filtering on minCoverage must keep every title when all titles have
    sufficient coverage.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        filtered = titlesAlignments.filter(minCoverage=0.0)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(filtered))
def testMatchWithAmbiguityAndNotStrict(self):
    """
    Two sequences that are identical except for one ambiguous code in the
    first sequence must produce the expected matchToString text when
    ambiguous matching is allowed with matchAmbiguous=True.
    """
    read1 = Read('id1', 'ACGTTS')
    read2 = Read('id2', 'ACGTTC')
    match = compareDNAReads(read1, read2, matchAmbiguous=True)
    # NOTE(review): the expected text below looks like a multi-line
    # literal whose line breaks and indentation have been collapsed into
    # single spaces (the leading backslash was presumably a line
    # continuation). Confirm the exact layout against the output of
    # matchToString before relying on this assertion.
    self.assertEqual(
        '''\ Exact matches: 5/6 (83.33%) Ambiguous matches: 1/6 (16.67%) Exact or ambiguous matches: 6/6 (100.00%) Mismatches: 0 Not involving gaps (i.e., conflicts): 0 Involving a gap in one sequence: 0 Involving a gap in both sequences: 0 Id: id1 Length: 6 Gaps: 0 Ambiguous: 1/6 (16.67%) Id: id2 Length: 6 Gaps: 0 Ambiguous: 0''',
        matchToString(match, read1, read2, matchAmbiguous=True))
def testMedianScore_Bits(self):
    """
    Titles sorted by 'medianScore' must come back in the expected order
    when the scores are bit scores, with ties resolved by title.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3, RECORD4)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(5):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        # The trailing comments give the median score of each title.
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',  # 25
            'gi|887699|gb|DQ37780 Monkeypox virus 456',  # 20
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',  # 20
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 20
            'gi|887699|gb|DQ37780 Cowpox virus 15',  # 20
        ]
        self.assertEqual(
            expected, titlesAlignments.sortTitles('medianScore'))
def testResidueCountsTwoReadsTwoHSPsLeftOverhang(self):
    """
    residueCounts must give the correct result when two reads, each with
    a single HSP, are aligned to a title and both HSPs start before the
    left edge of the subject (so negative subject offsets are involved).

    Subject:    GTT
    HSP1:     ACGT
    HSP2:      CGTT
    """
    firstRead = Read('id', 'ACGT')
    firstHsp = HSP(33, readStart=0, readEnd=4, readStartInSubject=-2,
                   readEndInSubject=2, subjectStart=0, subjectEnd=2,
                   readMatchedSequence='GT', subjectMatchedSequence='GT')
    secondRead = Read('id', 'CGTT')
    secondHsp = HSP(33, readStart=0, readEnd=4, readStartInSubject=-1,
                    readEndInSubject=3, subjectStart=0, subjectEnd=3,
                    readMatchedSequence='GTT',
                    subjectMatchedSequence='GTT')

    titleAlignments = TitleAlignments('subject title', 55)
    for read, hsp in ((firstRead, firstHsp), (secondRead, secondHsp)):
        titleAlignments.addAlignment(TitleAlignment(read, [hsp]))

    expected = {
        -2: {'A': 1},
        -1: {'C': 2},
        0: {'G': 2},
        1: {'T': 2},
        2: {'T': 1},
    }
    self.assertEqual(expected, titleAlignments.residueCounts())
def testTitle(self):
    """
    Titles sorted by 'title' must come back in the expected order.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, titlesAlignments.sortTitles('title'))
def testMinMedianScore_EValue(self):
    """
    Filtering with a minMedianScore value must keep only the expected
    titles when the scores are e values.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        filtered = titlesAlignments.filter(minMedianScore=1e-9)
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(filtered))
def testYP_009259545(self):
    """
    A query built from part of protein YP_009259545 must produce the
    expected single match against that protein.
    """
    proteinAccession = 'YP_009259545.1'
    proteinInfo = SAMPLE_DATA['proteins'][proteinAccession]
    proteinSequence = proteinInfo['protein']
    proteinId = proteinInfo['id']

    # The query is the first listed codon of each amino acid in
    # proteinSequence[10:50].
    qid = 'query'
    codons = [CODONS[aa][0] for aa in proteinSequence[10:50]]
    qseq = ''.join(codons)
    qqual = 'E' * len(qseq)

    with DiamondExecutor() as de:
        de.addSubject(Read(proteinId, proteinSequence))
        queries = Reads([Read(qid, qseq, qqual)])
        # Unpacking fails unless the search gives exactly one result.
        results = list(de.search(queries))
        (result,) = results

    expected = {
        'bitscore': 83.6,
        'btop': '40',
        'qframe': 1,
        'qend': 120,
        'full_qqual': qqual,
        'qlen': len(qseq),
        'full_qseq': qseq,
        'qseqid': 'query',
        'qstart': 1,
        'slen': len(proteinSequence),
        'sstart': 11,
        'stitle': proteinId,
    }
    self.assertEqual(expected, result)
def testNoOffsets(self):
    """
    When an empty set of wanted offsets is given, all counts and offset
    lists in the result must be empty.
    """
    expected = {
        'match': {
            'identicalMatchCount': 0,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    noOffsets = set()
    observed = compareDNAReads(
        Read('id1', 'ATT-T'), Read('id2', 'A-GTC'), offsets=noOffsets)
    self.assertEqual(expected, observed)
def testWikiAnswerAsDict(self):
    """
    Check the example from Wikipedia:
    http://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
    with the result returned as a dict.
    """
    first = Read('seq1', 'ACACACTA')
    second = Read('seq2', 'AGCACACA')
    observed = LocalAlignment(first, second, match=2).createAlignment()
    # NOTE(review): runs of spaces inside the 'text' rows may have been
    # collapsed; confirm the column alignment against real output.
    expected = {
        'cigar': '1=1I5=1D1=',
        'sequence1Start': 1,
        'sequence1End': 8,
        'sequence2Start': 1,
        'sequence2End': 8,
        'text': [
            'seq1 1 A-CACACTA 8',
            ' | ||||| |',
            'seq2 1 AGCACAC-A 8',
        ]
    }
    self.assertEqual(expected, observed)
def testWikiAnswerWithMatchOneAsDict(self):
    """
    Check the example from Wikipedia
    http://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
    using a match score of one (Wikipedia uses two), with the result
    returned as a dict.
    """
    first = Read('seq1', 'ACACACTA')
    second = Read('seq2', 'AGCACACA')
    observed = LocalAlignment(first, second, match=1).createAlignment()
    # NOTE(review): runs of spaces inside the 'text' rows may have been
    # collapsed; confirm the column alignment against real output.
    expected = {
        'cigar': '5=1D1=',
        'sequence1Start': 2,
        'sequence1End': 8,
        'sequence2Start': 3,
        'sequence2End': 8,
        'text': [
            'seq1 2 CACACTA 8',
            ' ||||| |',
            'seq2 3 CACAC-A 8',
        ]
    }
    self.assertEqual(expected, observed)
def testLengthOne(self):
    """
    De-duplicating a FASTA list holding a single read must give back
    just that read.
    """
    reads = Reads()
    reads.add(Read('id', 'GGG'))
    deduped = list(dedupFasta(reads))
    self.assertEqual(deduped, [Read('id', 'GGG')])
def testReverseComplementAmbiguousRNA(self):
    """
    reverseComplement must handle RNA containing ambiguous bases.
    """
    rna = Read('id', 'aucgmrwsykvhxn', type='rna')
    reverseComplemented = rna.reverseComplement()
    self.assertEqual('NXDBMRSWYKCGAU', reverseComplemented.sequence)
def testReverseComplementAA(self):
    """
    The reverseComplement function must raise a C{ValueError} when called
    on an amino acid sequence.
    """
    read = Read('id', 'atcg', type='aa')
    error = 'Cannot reverse complement an amino acid sequence'
    # assertRaisesRegexp is a deprecated alias that was removed from
    # unittest in Python 3.12; assertRaisesRegex is the supported name.
    with self.assertRaisesRegex(ValueError, error):
        read.reverseComplement()
def queries(self, rcSuffix='', rcNeeded=False, padChar='-',
            queryInsertionChar='N', unknownQualityChar='!',
            allowDuplicateIds=False, addAlignment=False):
    """
    Produce padded (with gaps) queries according to the CIGAR string and
    reference sequence length for each matching query sequence.

    As a side effect, insertions to the reference are recorded in
    C{self.referenceInsertions} (keyed by query id) and, when C{rcSuffix}
    is given, the C{query_name} of reversed alignments is modified.

    @param rcSuffix: A C{str} to add to the end of query names that are
        reverse complemented. This is added before the /1, /2, etc., that
        are added for duplicated ids (if there are duplicates and
        C{allowDuplicateIds} is C{False}).
    @param rcNeeded: If C{True}, queries that are flagged as matching when
        reverse complemented should be reverse complemented when
        preparing the output sequences. This must be used if the program
        that created the SAM/BAM input flags reversed matches but does not
        also store the reverse complemented query.
    @param padChar: A C{str} of length one to use to pad queries with to
        make them the same length as the reference sequence.
    @param queryInsertionChar: A C{str} of length one to use to insert
        into queries when the CIGAR string indicates that the alignment
        of a query would cause a deletion in the reference. This character
        is inserted as a 'missing' query character (i.e., a base that can
        be assumed to have been lost due to an error) whose existence is
        necessary for the match to continue.
    @param unknownQualityChar: The character to put into the quality
        string when unknown bases are inserted in the query or the query
        is padded on the left/right with gaps. It is also used for the
        whole quality string of an alignment that has no qualities.
    @param allowDuplicateIds: If C{True}, repeated query ids (due to
        secondary or supplemental matches) will not have /1, /2, etc.
        appended to their ids. So repeated ids may appear in the yielded
        FASTA.
    @param addAlignment: If C{True} the reads yielded by the returned
        generator will also have an C{alignment} attribute, being the
        C{pysam.AlignedSegment} for the query.
    @raises InvalidSAM: If a query has an empty SEQ field and either there
        is no previous alignment or the alignment is not marked as
        secondary or supplementary. NOTE(review): nothing in this method's
        own body raises this; presumably it comes from
        C{self.samFilter.alignments()} - confirm.
    @raises ValueError: If an unknown CIGAR operation is found or if the
        CIGAR operations do not consume the entire query.
    @return: A generator that yields C{Read} instances that are padded
        with gap characters to align them to the length of the reference
        sequence. See C{addAlignment}, above, to yield reads with the
        corresponding C{pysam.AlignedSegment}.
    """
    referenceLength = self.referenceLength

    # Hold the count for each id so we can add /1, /2 etc to duplicate
    # ids (unless --allowDuplicateIds was given).
    idCount = Counter()

    MATCH_OPERATIONS = {CMATCH, CEQUAL, CDIFF}

    for alignment in self.samFilter.alignments():
        query = alignment.query_sequence

        # pysam gives None for query_qualities when the alignment has no
        # base qualities. Fall back to the unknown quality character so
        # the quality string still matches the query length.
        qualities = alignment.query_qualities
        if qualities is None:
            quality = unknownQualityChar * len(query)
        else:
            quality = ''.join(chr(q + 33) for q in qualities)

        if alignment.is_reverse:
            if rcNeeded:
                query = DNARead('id', query).reverseComplement().sequence
                quality = quality[::-1]
            if rcSuffix:
                alignment.query_name += rcSuffix

        # Adjust the query id if it's a duplicate and we're not allowing
        # duplicates.
        if allowDuplicateIds:
            queryId = alignment.query_name
        else:
            count = idCount[alignment.query_name]
            idCount[alignment.query_name] += 1
            queryId = alignment.query_name + (
                '' if count == 0 else '/%d' % count)

        referenceStart = alignment.reference_start
        atStart = True
        queryIndex = 0
        referenceIndex = referenceStart
        alignedSequence = ''
        alignedQuality = ''

        for operation, length in alignment.cigartuples:

            # The operations are tested in the order they appear in
            # https://samtools.github.io/hts-specs/SAMv1.pdf It would be
            # more efficient to test them in order of frequency of
            # occurrence.
            if operation in MATCH_OPERATIONS:
                atStart = False
                alignedSequence += query[queryIndex:queryIndex + length]
                alignedQuality += quality[queryIndex:queryIndex + length]
            elif operation == CINS:
                # Insertion to the reference. This consumes query bases
                # but we don't output them because the reference cannot
                # be changed. I.e., these bases in the query would need
                # to be inserted into the reference. Remove these bases
                # from the query but record what would have been
                # inserted into the reference.
                atStart = False
                self.referenceInsertions[queryId].append(
                    (referenceIndex,
                     query[queryIndex:queryIndex + length]))
            elif operation == CDEL:
                # Delete from the reference. Some bases from the
                # reference would need to be deleted to continue the
                # match. So we put an insertion into the query to
                # compensate.
                atStart = False
                alignedSequence += queryInsertionChar * length
                alignedQuality += unknownQualityChar * length
            elif operation == CREF_SKIP:
                # Skipped reference. Opens a gap in the query. For
                # mRNA-to-genome alignment, an N operation represents an
                # intron. For other types of alignments, the
                # interpretation of N is not defined. So this is unlikely
                # to occur.
                atStart = False
                alignedSequence += queryInsertionChar * length
                alignedQuality += unknownQualityChar * length
            elif operation == CSOFT_CLIP:
                # Bases in the query that are not part of the match. We
                # remove these from the query if they protrude before the
                # start or after the end of the reference. According to
                # the SAM docs, 'S' operations may only have 'H'
                # operations between them and the ends of the CIGAR
                # string.
                if atStart:
                    # Don't set atStart=False, in case there's another
                    # 'S' operation.
                    unwantedLeft = length - referenceStart
                    if unwantedLeft > 0:
                        # The query protrudes left. Copy its right part.
                        alignedSequence += query[
                            queryIndex + unwantedLeft:queryIndex + length]
                        alignedQuality += quality[
                            queryIndex + unwantedLeft:queryIndex + length]
                        referenceStart = 0
                    else:
                        referenceStart -= length
                        alignedSequence += query[
                            queryIndex:queryIndex + length]
                        alignedQuality += quality[
                            queryIndex:queryIndex + length]
                else:
                    unwantedRight = (
                        (referenceStart + len(alignedSequence) + length) -
                        referenceLength)

                    if unwantedRight > 0:
                        # The query protrudes right. Copy its left part.
                        alignedSequence += query[
                            queryIndex:queryIndex + length - unwantedRight]
                        alignedQuality += quality[
                            queryIndex:queryIndex + length - unwantedRight]
                    else:
                        alignedSequence += query[
                            queryIndex:queryIndex + length]
                        alignedQuality += quality[
                            queryIndex:queryIndex + length]
            elif operation == CHARD_CLIP:
                # Some bases have been completely removed from the query.
                # This (H) can only be present as the first and/or last
                # operation. There is nothing to do as the bases are
                # simply not present in the query string in the SAM/BAM
                # file.
                pass
            elif operation == CPAD:
                # This is "silent deletion from the padded reference",
                # which consumes neither query nor reference.
                atStart = False
            else:
                # Format the operation into the message (rather than
                # passing it as a second exception argument) so that the
                # error prints as one readable string.
                raise ValueError(
                    'Unknown CIGAR operation: %r' % (operation,))

            if operation in _CONSUMES_QUERY:
                queryIndex += length

            if operation in _CONSUMES_REFERENCE:
                referenceIndex += length

        if queryIndex != len(query):
            # Oops, we did not consume the entire query.
            raise ValueError(
                'Query %r not fully consumed when parsing CIGAR string. '
                'Query %r (len %d), final query index %d, CIGAR: %r' %
                (alignment.query_name, query, len(query), queryIndex,
                 alignment.cigartuples))

        # We cannot test we consumed the entire reference. The CIGAR
        # string applies to (and exhausts) the query but is silent
        # about the part of the reference that lies to the right of the
        # aligned query.

        # Put gap characters before and after the aligned sequence so that
        # it is offset properly and matches the length of the reference.
        padRightLength = (referenceLength -
                          (referenceStart + len(alignedSequence)))
        paddedSequence = (padChar * referenceStart +
                          alignedSequence +
                          padChar * padRightLength)
        paddedQuality = (unknownQualityChar * referenceStart +
                         alignedQuality +
                         unknownQualityChar * padRightLength)

        read = Read(queryId, paddedSequence, paddedQuality)

        if addAlignment:
            read.alignment = alignment

        yield read
def testReverseComplementRNA(self):
    """
    reverseComplement must give the correct sequence for RNA.
    """
    rna = Read('id', 'aucg', type='rna')
    reverseComplemented = rna.reverseComplement()
    self.assertEqual('CGAU', reverseComplemented.sequence)
def testReverseComplementDNA(self):
    """
    reverseComplement must give the correct sequence for DNA.
    """
    dna = Read('id', 'atcg', quality='!@#$', type='dna')
    reverseComplemented = dna.reverseComplement()
    self.assertEqual('CGAT', reverseComplemented.sequence)
def testReverseComplementReversesQuality(self):
    """
    The read returned by reverseComplement must have its quality string
    reversed.
    """
    original = Read('id', 'atcg', quality='!@#$')
    reverseComplemented = original.reverseComplement()
    self.assertEqual('$#@!', reverseComplemented.quality)