def testNoOffsets(self):
    """
    When the set of wanted offsets is empty, no position is examined, so
    every count must be zero and every offset list must be empty.
    """
    expected = {
        'match': dict(
            identicalMatchCount=0,
            ambiguousMatchCount=0,
            gapMismatchCount=0,
            gapGapMismatchCount=0,
            nonGapMismatchCount=0,
        ),
        'read1': dict(ambiguousOffsets=[], extraCount=0, gapOffsets=[]),
        'read2': dict(ambiguousOffsets=[], extraCount=0, gapOffsets=[]),
    }
    first = Read('id1', 'ATT-T')
    second = Read('id2', 'A-GTC')
    observed = compareDNAReads(first, second, offsets=set())
    self.assertEqual(expected, observed)
def testLengthOne(self):
    """
    De-duplicating a FASTA collection that holds a single read must give
    back exactly that one read.
    """
    singleton = Reads()
    singleton.add(Read('id', 'GGG'))
    deduped = list(dedupFasta(singleton))
    self.assertEqual(deduped, [Read('id', 'GGG')])
def testWikiAnswerWithMatchOneAsDict(self):
    """
    Align the two sequences from the Wikipedia Smith-Waterman example
    http://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
    using a match score of one (Wikipedia uses two) and check the dict
    returned by createAlignment.
    """
    # NOTE(review): the middle 'text' entry has the same number of
    # bar/space characters as there are aligned columns, so its leading
    # padding has presumably been collapsed in this copy of the file.
    # Confirm the exact padding against createAlignment's output.
    expected = {
        'cigar': '5=1D1=',
        'sequence1Start': 2,
        'sequence1End': 8,
        'sequence2Start': 3,
        'sequence2End': 8,
        'text': [
            'seq1 2 CACACTA 8',
            ' ||||| |',
            'seq2 3 CACAC-A 8',
        ]
    }
    aligner = LocalAlignment(Read('seq1', 'ACACACTA'),
                             Read('seq2', 'AGCACACA'), match=1)
    self.assertEqual(expected, aligner.createAlignment())
def testCoverageIncludesSome(self):
    """
    Filtering on minCoverage must return a TitlesAlignments instance
    holding only the expected titles when just some of the titles have
    sufficient coverage.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        # To understand why the following produces the result it does,
        # you need to look at the HSP coverage in sample_data.py and
        # calculate the coverage by hand.
        filtered = titlesAlignments.filter(minCoverage=0.0003)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
        ]
        self.assertEqual(expected, sorted(filtered))
def testMaxScore_EValue(self):
    """
    Sorting titles on 'maxScore' must work when the scores are e-values
    (scoreClass=LowerIsBetterScore), including a secondary sort on
    title.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        ordered = titlesAlignments.sortTitles('maxScore')
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 1e-11
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',  # 1e-10
            'gi|887699|gb|DQ37780 Monkeypox virus 456',  # 1e-8
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',  # 1e-7
            'gi|887699|gb|DQ37780 Cowpox virus 15',  # 1e-6
        ]
        self.assertEqual(expected, ordered)
def testMaxMatchingReads(self):
    """
    Filtering with maxMatchingReads=1 must keep only the titles that are
    matched by at most one read.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        filtered = titlesAlignments.filter(maxMatchingReads=1)
        # Cowpox virus 15 is not in the results as it is matched by two
        # reads.
        expected = sorted([
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.'
        ])
        self.assertEqual(expected, sorted(filtered))
def testReadSetFilterAllowAnything(self):
    """
    Filtering with minNewReads=0.0 (i.e., treating any read set as
    sufficiently novel) must keep all titles.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        filtered = titlesAlignments.filter(minNewReads=0.0)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(filtered))
def testNonDefaultGapChars(self):
    """
    It must be possible to specify the gap characters: each character
    passed in gapChars must be treated as a gap.
    """
    for gapChar in '+$':
        expected = {
            'match': dict(
                identicalMatchCount=3,
                ambiguousMatchCount=0,
                gapMismatchCount=2,
                gapGapMismatchCount=0,
                nonGapMismatchCount=0,
            ),
            'read1': dict(ambiguousOffsets=[], extraCount=0,
                          gapOffsets=[2]),
            'read2': dict(ambiguousOffsets=[], extraCount=0,
                          gapOffsets=[0]),
        }
        first = Read('id1', 'AC' + gapChar + 'TT')
        second = Read('id2', gapChar + 'CGTT')
        self.assertEqual(
            expected, compareDNAReads(first, second, gapChars='+$'))
def testGapAmbiguous(self):
    """
    Ambiguous characters that are paired with gaps must still be
    reported in ambiguousOffsets, and the positions must be counted as
    gap mismatches.
    """
    expected = {
        'match': dict(
            identicalMatchCount=2,
            ambiguousMatchCount=0,
            gapMismatchCount=2,
            gapGapMismatchCount=1,
            nonGapMismatchCount=0,
        ),
        'read1': dict(ambiguousOffsets=[1], extraCount=0,
                      gapOffsets=[2, 3]),
        'read2': dict(ambiguousOffsets=[3], extraCount=0,
                      gapOffsets=[1, 2]),
    }
    observed = compareDNAReads(Read('id1', 'AN--T'), Read('id2', 'A--NT'))
    self.assertEqual(expected, observed)
def testMatchWithIncompatibleAmbiguityInBoth(self):
    """
    Two sequences that are identical except for incompatible ambiguous
    codes at the same (final) offset must compare as expected: one
    non-gap mismatch, with the offset reported as ambiguous in both
    reads.
    """
    expected = {
        'match': dict(
            identicalMatchCount=5,
            ambiguousMatchCount=0,
            gapMismatchCount=0,
            gapGapMismatchCount=0,
            nonGapMismatchCount=1,
        ),
        'read1': dict(ambiguousOffsets=[5], extraCount=0, gapOffsets=[]),
        'read2': dict(ambiguousOffsets=[5], extraCount=0, gapOffsets=[]),
    }
    observed = compareDNAReads(Read('id1', 'ACGTTW'),
                               Read('id2', 'ACGTTS'))
    self.assertEqual(expected, observed)
def testMatchWithIdenticalAmbiguityButStrict(self):
    """
    Two sequences that are identical, including the same ambiguous code
    at the same offset, must compare as expected when
    matchAmbiguous=False: the ambiguous position counts as a non-gap
    mismatch.
    """
    expected = {
        'match': dict(
            identicalMatchCount=5,
            ambiguousMatchCount=0,
            gapMismatchCount=0,
            gapGapMismatchCount=0,
            nonGapMismatchCount=1,
        ),
        'read1': dict(ambiguousOffsets=[5], extraCount=0, gapOffsets=[]),
        'read2': dict(ambiguousOffsets=[5], extraCount=0, gapOffsets=[]),
    }
    observed = compareDNAReads(Read('id1', 'ACGTTN'),
                               Read('id2', 'ACGTTN'),
                               matchAmbiguous=False)
    self.assertEqual(expected, observed)
def testNonMatchingAmbiguityInFirst(self):
    """
    Two sequences that are identical except for one (incompatible)
    ambiguous code in the first sequence must compare as expected: one
    non-gap mismatch, with the offset reported as ambiguous only for
    the first read.
    """
    expected = {
        'match': dict(
            identicalMatchCount=5,
            ambiguousMatchCount=0,
            gapMismatchCount=0,
            gapGapMismatchCount=0,
            nonGapMismatchCount=1,
        ),
        'read1': dict(ambiguousOffsets=[5], extraCount=0, gapOffsets=[]),
        'read2': dict(ambiguousOffsets=[], extraCount=0, gapOffsets=[]),
    }
    observed = compareDNAReads(Read('id1', 'ACGTTW'),
                               Read('id2', 'ACGTTC'))
    self.assertEqual(expected, observed)
def testMatchWithAmbiguityButStrict(self):
    """
    Two sequences that are identical except for one ambiguous code in
    the first sequence must compare as expected when
    matchAmbiguous=False disallows ambiguous matching: the position is
    a non-gap mismatch.
    """
    expected = {
        'match': dict(
            identicalMatchCount=5,
            ambiguousMatchCount=0,
            gapMismatchCount=0,
            gapGapMismatchCount=0,
            nonGapMismatchCount=1,
        ),
        'read1': dict(ambiguousOffsets=[5], extraCount=0, gapOffsets=[]),
        'read2': dict(ambiguousOffsets=[], extraCount=0, gapOffsets=[]),
    }
    observed = compareDNAReads(Read('id1', 'ACGTTS'),
                               Read('id2', 'ACGTTC'),
                               matchAmbiguous=False)
    self.assertEqual(expected, observed)
def testOffsets(self):
    """
    When a set of wanted offsets is passed, only those offsets must
    contribute to the result.
    """
    expected = {
        'match': dict(
            identicalMatchCount=1,
            ambiguousMatchCount=0,
            gapMismatchCount=0,
            gapGapMismatchCount=0,
            nonGapMismatchCount=1,
        ),
        'read1': dict(ambiguousOffsets=[], extraCount=0, gapOffsets=[]),
        'read2': dict(ambiguousOffsets=[], extraCount=0, gapOffsets=[]),
    }
    # Only the first and last positions are wanted; the gaps in between
    # must therefore not be reported.
    observed = compareDNAReads(Read('id1', 'ATT-T'), Read('id2', 'A-GTC'),
                               offsets={0, 4})
    self.assertEqual(expected, observed)
def testTitleCollection(self):
    """
    When a title occurs in the alignments of more than one read, the
    data from all of those reads must be collected under that title.
    """
    records = (PARAMS, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        read2 = Read('id2', 'A' * 70)
        read3 = Read('id3', 'A' * 70)
        for read in (read2, read3):
            reads.add(read)
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)

        title = 'gi|887699|gb|DQ37780 Cowpox virus 15'
        titleAlignments = titlesAlignments[title]
        self.assertEqual(title, titleAlignments.subjectTitle)
        self.assertEqual(30000, titleAlignments.subjectLength)
        self.assertEqual(2, len(titleAlignments))
        # The alignments must appear in read order, each with its HSP.
        for index, read in enumerate((read2, read3)):
            self.assertEqual(read, titleAlignments[index].read)
            self.assertEqual(HSP(20), titleAlignments[index].hsps[0])
def testExtraInSecond(self):
    """
    Extra bases at the end of the second sequence must be reported in
    that read's extraCount.
    """
    expected = {
        'match': dict(
            identicalMatchCount=5,
            ambiguousMatchCount=0,
            gapMismatchCount=0,
            gapGapMismatchCount=0,
            nonGapMismatchCount=0,
        ),
        'read1': dict(ambiguousOffsets=[], extraCount=0, gapOffsets=[]),
        'read2': dict(ambiguousOffsets=[], extraCount=2, gapOffsets=[]),
    }
    observed = compareDNAReads(Read('id1', 'ACGTT'),
                               Read('id2', 'ACGTTCC'))
    self.assertEqual(expected, observed)
def testFilterWithNoArguments(self):
    """
    Calling filter with no arguments must return a TitlesAlignments
    instance that has all the titles of the original.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(3):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        unfiltered = titlesAlignments.filter()
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(unfiltered))
def testExtraAmbiguous(self):
    """
    When the first sequence has extra trailing characters that include
    a gap and an ambiguous code, they must be counted in extraCount and
    also reported in gapOffsets and ambiguousOffsets.
    """
    expected = {
        'match': dict(
            identicalMatchCount=5,
            ambiguousMatchCount=0,
            gapMismatchCount=0,
            gapGapMismatchCount=0,
            nonGapMismatchCount=0,
        ),
        'read1': dict(ambiguousOffsets=[6], extraCount=2,
                      gapOffsets=[5]),
        'read2': dict(ambiguousOffsets=[], extraCount=0, gapOffsets=[]),
    }
    observed = compareDNAReads(Read('id1', 'ACGTT-N'),
                               Read('id2', 'ACGTT'))
    self.assertEqual(expected, observed)
def testMinMedianScore_EValue(self):
    """
    Filtering on minMedianScore must work when the scores are e-values
    (scoreClass=LowerIsBetterScore).
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        filtered = titlesAlignments.filter(minMedianScore=1e-9)
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(filtered))
def testMismatch(self):
    """
    Positions where the sequences hold different (non-ambiguous) bases
    must be counted in nonGapMismatchCount.
    """
    expected = {
        'match': dict(
            identicalMatchCount=3,
            ambiguousMatchCount=0,
            gapMismatchCount=0,
            gapGapMismatchCount=0,
            nonGapMismatchCount=2,
        ),
        'read1': dict(ambiguousOffsets=[], extraCount=0, gapOffsets=[]),
        'read2': dict(ambiguousOffsets=[], extraCount=0, gapOffsets=[]),
    }
    observed = compareDNAReads(Read('id1', 'ACGTT'), Read('id2', 'ACGCC'))
    self.assertEqual(expected, observed)
def testCoverageIncludesAll(self):
    """
    Filtering on minCoverage must return a TitlesAlignments instance
    with all titles when all of the titles have sufficient coverage.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        filtered = titlesAlignments.filter(minCoverage=0.0)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(filtered))
def testMatchWithAmbiguityAndNotStrict(self):
    """
    Two sequences that are identical except for one ambiguous code in
    the first sequence must be summarized as expected by matchToString
    when matchAmbiguous=True allows ambiguous matching.
    """
    read1 = Read('id1', 'ACGTTS')
    read2 = Read('id2', 'ACGTTC')
    match = compareDNAReads(read1, read2, matchAmbiguous=True)
    # NOTE(review): the expected text was a multi-line literal whose line
    # breaks and indentation were lost in this copy of the file. The
    # two- and four-space indents below are a reconstruction; confirm
    # them against the actual matchToString output.
    expected = '\n'.join([
        'Exact matches: 5/6 (83.33%)',
        'Ambiguous matches: 1/6 (16.67%)',
        'Exact or ambiguous matches: 6/6 (100.00%)',
        'Mismatches: 0',
        '  Not involving gaps (i.e., conflicts): 0',
        '  Involving a gap in one sequence: 0',
        '  Involving a gap in both sequences: 0',
        '  Id: id1',
        '    Length: 6',
        '    Gaps: 0',
        '    Ambiguous: 1/6 (16.67%)',
        '  Id: id2',
        '    Length: 6',
        '    Gaps: 0',
        '    Ambiguous: 0',
    ])
    self.assertEqual(
        expected,
        matchToString(match, read1, read2, matchAmbiguous=True))
def testMedianScore_Bits(self):
    """
    Sorting titles on 'medianScore' must work when the scores are bit
    scores, including a secondary sort on title.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3, RECORD4)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(5):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        ordered = titlesAlignments.sortTitles('medianScore')
        # NOTE(review): Cowpox is annotated with 20 yet sorts after the
        # other titles annotated 20, which a title tie-break alone would
        # not produce - its median presumably differs. Confirm the
        # annotation against the sample data.
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',  # 25
            'gi|887699|gb|DQ37780 Monkeypox virus 456',  # 20
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',  # 20
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 20
            'gi|887699|gb|DQ37780 Cowpox virus 15',  # 20
        ]
        self.assertEqual(expected, ordered)
def testResidueCountsTwoReadsTwoHSPsLeftOverhang(self):
    """
    residueCounts must return the correct result when two reads, each
    with one HSP, are aligned to a title and the reads start before the
    left edge of the subject (i.e., the result includes negative
    subject offsets).

    Subject:   GTT
    HSP1:    ACGT
    HSP2:     CGTT
    """
    titleAlignments = TitleAlignments('subject title', 55)

    read1 = Read('id', 'ACGT')
    hsp1 = HSP(33, readStart=0, readEnd=4,
               readStartInSubject=-2, readEndInSubject=2,
               subjectStart=0, subjectEnd=2,
               readMatchedSequence='GT', subjectMatchedSequence='GT')
    read2 = Read('id', 'CGTT')
    hsp2 = HSP(33, readStart=0, readEnd=4,
               readStartInSubject=-1, readEndInSubject=3,
               subjectStart=0, subjectEnd=3,
               readMatchedSequence='GTT', subjectMatchedSequence='GTT')

    for read, hsp in ((read1, hsp1), (read2, hsp2)):
        titleAlignments.addAlignment(TitleAlignment(read, [hsp]))

    # Offsets -2 and -1 lie to the left of the subject's first base.
    expected = {
        -2: {'A': 1},
        -1: {'C': 2},
        0: {'G': 2},
        1: {'T': 2},
        2: {'T': 1},
    }
    self.assertEqual(expected, titleAlignments.residueCounts())
def testTitle(self):
    """
    Sorting titles on 'title' must return them in title order.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        ordered = titlesAlignments.sortTitles('title')
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, ordered)
def testSummary(self):
    """
    The summary method must return the correct result for a title with
    two reads and three HSPs in total.
    """
    titleAlignments = TitleAlignments('subject title', 10)

    firstHsps = [HSP(30, subjectStart=0, subjectEnd=2)]
    titleAlignments.addAlignment(
        TitleAlignment(Read('id1', 'ACGT'), firstHsps))

    secondHsps = [
        HSP(55, subjectStart=2, subjectEnd=4),
        HSP(40, subjectStart=8, subjectEnd=9),
    ]
    titleAlignments.addAlignment(
        TitleAlignment(Read('id2', 'ACGT'), secondHsps))

    expected = {
        'bestScore': 55,
        'coverage': 0.5,
        'hspCount': 3,
        'medianScore': 40,
        'readCount': 2,
        'subjectLength': 10,
        'subjectTitle': 'subject title',
    }
    self.assertEqual(expected, titleAlignments.summary())
def testWikiAnswerAsDict(self):
    """
    Align the two sequences from the Wikipedia Smith-Waterman example
    http://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
    using a match score of two and check the dict returned by
    createAlignment.
    """
    # NOTE(review): the middle 'text' entry has the same number of
    # bar/space characters as there are aligned columns, so its leading
    # padding has presumably been collapsed in this copy of the file.
    # Confirm the exact padding against createAlignment's output.
    expected = {
        'cigar': '1=1I5=1D1=',
        'sequence1Start': 1,
        'sequence1End': 8,
        'sequence2Start': 1,
        'sequence2End': 8,
        'text': [
            'seq1 1 A-CACACTA 8',
            ' | ||||| |',
            'seq2 1 AGCACAC-A 8',
        ]
    }
    aligner = LocalAlignment(Read('seq1', 'ACACACTA'),
                             Read('seq2', 'AGCACACA'), match=2)
    self.assertEqual(expected, aligner.createAlignment())
def testLength(self):
    """
    Sorting titles on 'length' must order them by sequence length,
    including a secondary sort on title.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        ordered = titlesAlignments.sortTitles('length')
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',  # 38000
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 37000
            'gi|887699|gb|DQ37780 Monkeypox virus 456',  # 35000
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',  # 35000
            'gi|887699|gb|DQ37780 Cowpox virus 15',  # 30000
        ]
        self.assertEqual(expected, ordered)
def testYP_009259545(self):
    """
    A nucleotide query built from residues 10-49 of protein
    YP_009259545.1 must be found by a search against that protein.
    """
    accession = 'YP_009259545.1'
    proteinInfo = SAMPLE_DATA['proteins'][accession]
    proteinSequence = proteinInfo['protein']
    proteinId = proteinInfo['id']

    # Back-translate the chosen residues, taking the first codon listed
    # for each amino acid.
    qseq = ''.join(CODONS[aa][0] for aa in proteinSequence[10:50])
    qqual = 'E' * len(qseq)
    qid = 'query'

    with DiamondExecutor() as de:
        de.addSubject(Read(proteinId, proteinSequence))
        queries = Reads([Read(qid, qseq, qqual)])
        # Exactly one match is expected; unpacking fails otherwise.
        [result] = list(de.search(queries))

    expected = {
        'bitscore': 83.6,
        'btop': '40',
        'qframe': 1,
        'qend': 120,
        'full_qqual': qqual,
        'qlen': len(qseq),
        'full_qseq': qseq,
        'qseqid': 'query',
        'qstart': 1,
        'slen': len(proteinSequence),
        'sstart': 11,
        'stitle': proteinId,
    }
    self.assertEqual(expected, result)
def testHardClippingInCIGARButQueryNotHardClipped(self):
    """
    As documented in https://github.com/acorg/dark-matter/issues/630 we
    must correctly handle the case where a CIGAR string says a query is
    hard-clipped but the query sequence in the SAM file is not. This
    can be due to a prior alignment with a soft clip, in which case the
    full query sequence has to be given before the secondary alignment
    with the hard clip.
    """
    samLines = [
        '@SQ SN:Chimp-D00220 LN:8',
        '@SQ SN:D-AM494716 LN:8',
        '@SQ SN:D-XXX LN:8',
        '@SQ SN:Chimp-YYY LN:8',
        'query1 0 Chimp-D00220 1 0 3S5M * 0 0 TTTTGGTT 12345678',
        'query1 256 D-AM494716 1 0 3H5M * 0 0 * *',
        'query1 256 D-XXX 1 0 5H3M * 0 0 * *',
        'query1 0 Chimp-YYY 1 0 8M * 0 0 * *',
    ]
    # The lines above are written with spaces for readability; turn
    # them into tab-separated fields.
    data = '\n'.join(samLines).replace(' ', '\t')

    # For each alignment: the expected padded read and the expected
    # query sequence on its attached alignment.
    expected = (
        (Read('query1', 'TGGTT---', '45678!!!'), 'TTTTGGTT'),
        (Read('query1/1', 'TGGTT---', '45678!!!'), 'TGGTT'),
        (Read('query1/2', 'GTT-----', '678!!!!!'), 'GTT'),
        (Read('query1/3', 'TTTTGGTT', '12345678'), 'TTTTGGTT'),
    )

    with dataFile(data) as filename:
        ps = PaddedSAM(SAMFilter(filename))
        # Exactly four reads are expected; unpacking fails otherwise.
        (read1, read2, read3, read4) = list(ps.queries(addAlignment=True))
        observed = (read1, read2, read3, read4)
        for (wantedRead, wantedSequence), read in zip(expected, observed):
            self.assertEqual(wantedRead, read)
            self.assertEqual(wantedSequence, read.alignment.query_sequence)