def testCoverageIncludesAll(self):
    """
    Filtering with a minCoverage of 0.0 must return a titlesAlignments
    instance holding all titles, as every title then has sufficient
    coverage.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mock_open(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(minCoverage=0.0)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result))
def testTitle(self):
    """
    Asking for titles sorted on 'title' must return them in title order.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('title')
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, result)
def testMaxScore_Bits(self):
    """
    Sorting on max score must work when scores are bit scores, including a
    secondary sort on title.
    """
    mockOpener = mockOpen(read_data=(
        dumps(PARAMS) + '\n' + dumps(RECORD0) + '\n' +
        dumps(RECORD1) + '\n' + dumps(RECORD2) + '\n' +
        dumps(RECORD3) + '\n'))
    # Patch 'open' on the builtins module object instead of via the
    # Python 2-only dotted name '__builtin__.open', which cannot be
    # imported on Python 3.
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        reads.add(Read('id0', 'A' * 70))
        reads.add(Read('id1', 'A' * 70))
        reads.add(Read('id2', 'A' * 70))
        reads.add(Read('id3', 'A' * 70))
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('maxScore')
        # The trailing comments give the max bit score of each title;
        # titles with equal scores appear in title order.
        self.assertEqual([
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 25
            'gi|887699|gb|DQ37780 Cowpox virus 15',            # 20
            'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 20
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 20
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 20
        ], result)
def testFromThreeSequences(self):
    """
    An NJTree built from three sequences with no features must 1) have a
    distance matrix that is zero on the diagonal and one elsewhere, 2) keep
    the labels it was given, and 3) produce a simple tree with three
    children.
    """
    sequences = Reads()
    for readId in ('id1', 'id2', 'id3'):
        sequences.add(AARead(readId, 'A'))
    labels = ['x', 'y', 'z']
    njtree = NJTree.fromSequences(labels, sequences,
                                  landmarks=['AlphaHelix'])

    expectedDistance = [
        [0, 1, 1],
        [1, 0, 1],
        [1, 1, 0],
    ]
    self.assertTrue(np.array_equal(expectedDistance, njtree.distance))

    # The very same list object must be stored, not a copy.
    self.assertIs(labels, njtree.labels)

    expectedChildren = ['x:0.5;\n', 'y:0.5;\n', 'z:0.5;\n']
    self.assertEqual(expectedChildren,
                     sorted(map(str, njtree.tree.children)))
def testMaxTitlesTwoSortOnLength(self):
    """
    With maxTitles set to 2 and sortOn set to 'length', the filter
    function must return the two titles with the longest sequences.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mockOpen(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json',
                                                 'database.fasta')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(maxTitles=2, sortOn='length')
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result.keys()))
def testWithScoreBetterThan_EValue(self):
    """
    The filter function must work correctly when passed a value for
    withScoreBetterThan when using e values.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mockOpen(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', 'database.fasta',
            scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(withScoreBetterThan=1e-10)
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
        ]
        self.assertEqual(expected, list(result.keys()))
def testLength(self):
    """
    Sorting on sequence length must work, including a secondary sort on
    title.
    """
    mockOpener = mockOpen(read_data=(
        dumps(PARAMS) + '\n' + dumps(RECORD0) + '\n' +
        dumps(RECORD1) + '\n' + dumps(RECORD2) + '\n' +
        dumps(RECORD3) + '\n'))
    # Patch 'open' on the builtins module object instead of via the
    # Python 2-only dotted name '__builtin__.open', which cannot be
    # imported on Python 3.
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        reads.add(Read('id0', 'A' * 70))
        reads.add(Read('id1', 'A' * 70))
        reads.add(Read('id2', 'A' * 70))
        reads.add(Read('id3', 'A' * 70))
        readsAlignments = BlastReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('length')
        # The trailing comments give the subject length of each title;
        # titles with equal lengths appear in title order.
        self.assertEqual([
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 38000
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 37000
            'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 35000
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 35000
            'gi|887699|gb|DQ37780 Cowpox virus 15',            # 30000
        ], result)
def testCoverageIncludesSome(self):
    """
    When only some titles have sufficient coverage, filtering on
    minCoverage must return a titlesAlignments instance with just those
    titles.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mock_open(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        # To understand why the following produces the result it does,
        # you need to look at the HSP coverage in sample_data.py and
        # calculate the coverage by hand.
        result = titlesAlignments.filter(minCoverage=0.0003)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
        ]
        self.assertEqual(expected, sorted(result))
def testWithScoreBetterThan_EValue(self):
    """
    When e values are used as scores, passing withScoreBetterThan to the
    filter function must work correctly.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data='\n'.join(map(dumps, records)) + '\n')
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for readId in ('id0', 'id1', 'id2', 'id3'):
            reads.add(Read(readId, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', 'database.fasta',
            scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(withScoreBetterThan=1e-10)
        titles = list(result.keys())
        self.assertEqual(
            ['gi|887699|gb|DQ37780 Squirrelpox virus 1296/99'], titles)
def testExpectedTitleDetails(self):
    """
    Each TitleAlignments instance held in a TitlesAlignments instance must
    have the expected subject title, subject length, read and HSP.
    """
    fileContent = ''.join(
        dumps(record) + '\n' for record in (PARAMS, RECORD0))
    mockOpener = mockOpen(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        read = Read('id0', 'A' * 70)
        reads.add(read)
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)

        # (title, expected subject length, expected HSP score)
        expectations = (
            ('gi|887699|gb|DQ37780 Squirrelpox virus 1296/99', 37000, 20),
            ('gi|887699|gb|DQ37780 Squirrelpox virus 55', 38000, 25),
        )
        for title, subjectLength, score in expectations:
            titleAlignments = titlesAlignments[title]
            self.assertEqual(title, titleAlignments.subjectTitle)
            self.assertEqual(subjectLength, titleAlignments.subjectLength)
            self.assertEqual(1, len(titleAlignments))
            self.assertEqual(read, titleAlignments[0].read)
            self.assertEqual(HSP(score), titleAlignments[0].hsps[0])
def testLengthOne(self):
    """
    De-duping a FASTA list holding a single item must give back that same
    single item.
    """
    reads = Reads()
    reads.add(Read('id', 'GGG'))
    deduped = list(dedupFasta(reads))
    expected = [Read('id', 'GGG')]
    self.assertEqual(deduped, expected)
def testTitleCollection(self):
    """
    When a title occurs in the alignments of several reads, the data from
    each of those reads must be collected under that title.
    """
    fileContent = ''.join(
        dumps(record) + '\n' for record in (PARAMS, RECORD2, RECORD3))
    mockOpener = mock_open(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        read2 = Read('id2', 'A' * 70)
        read3 = Read('id3', 'A' * 70)
        for read in (read2, read3):
            reads.add(read)
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)

        title = 'gi|887699|gb|DQ37780 Cowpox virus 15'
        titleAlignments = titlesAlignments[title]
        self.assertEqual(title, titleAlignments.subjectTitle)
        self.assertEqual(30000, titleAlignments.subjectLength)
        self.assertEqual(2, len(titleAlignments))
        # The alignments must appear in the order the reads were added.
        for index, read in enumerate((read2, read3)):
            self.assertEqual(read, titleAlignments[index].read)
            self.assertEqual(HSP(20), titleAlignments[index].hsps[0])
def testTabSeparatedSummary(self):
    """
    The tab-separated summary, sorted on title, must have the expected
    content.
    """
    fileContent = ''.join(
        dumps(record) + '\n' for record in (PARAMS, RECORD0))
    mockOpener = mockOpen(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        reads.add(Read('id0', 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'f.json', 'db')
        titlesAlignments = TitlesAlignments(readsAlignments)
        summary = titlesAlignments.tabSeparatedSummary(sortOn='title')
        # One tuple of fields per expected line: fields are joined by
        # TABs and lines by newlines (with no trailing newline).
        rows = (
            ('0.000297', '20.000000', '20.000000', '1', '1', '37000',
             'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99'),
            ('0.000289', '25.000000', '25.000000', '1', '1', '38000',
             'gi|887699|gb|DQ37780 Squirrelpox virus 55'),
        )
        expected = '\n'.join('\t'.join(row) for row in rows)
        self.assertEqual(expected, summary)
def testTitle(self):
    """
    Asking for titles sorted on 'title' must return them in title order.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mock_open(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('title')
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, result)
def testMaxMatchingReads(self):
    """
    Passing maxMatchingReads to the filter function must exclude titles
    that are matched by more than that number of reads.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mock_open(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(maxMatchingReads=1)
        # Cowpox virus 15 is not in the results as it is matched by two
        # reads.
        expected = sorted([
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.'
        ])
        self.assertEqual(expected, sorted(result))
def testReadSetFilterAllowAnything(self):
    """
    With minNewReads set to 0.0 (so that any read set counts as
    sufficiently novel) the filter function must keep all titles.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mock_open(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(minNewReads=0.0)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result))
def testTwoByTwoAsDict(self):
    """
    When affinityMatrix is called with two reads and two subjects, and a
    dict result is requested, the returned dict must be as expected.
    """
    sequence = 'FRRRFRRRFAAAFRRRFRRRF'
    reads = Reads()
    for readId in ('id1', 'id2'):
        reads.add(AARead(readId, sequence))
    subjects = Reads()
    for subjectId in ('id3', 'id4'):
        subjects.add(AARead(subjectId, sequence))
    matrix = affinityMatrix(reads, landmarks=['AlphaHelix'],
                            subjects=subjects, computeDiagonal=True,
                            returnDict=True)
    # Every read must have an affinity of 1.0 with every subject.
    expected = {
        readId: {subjectId: 1.0 for subjectId in ('id3', 'id4')}
        for readId in ('id1', 'id2')
    }
    self.assertEqual(expected, matrix)
def testFilterWithNoArguments(self):
    """
    Calling the filter function with no arguments must return a
    TitlesAlignments instance holding all the titles of the original.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mockOpen(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(3):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter()
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result.keys()))
def testReadSetFilterAllowAnything(self):
    """
    The filter function must work correctly when passed a 0.0 value for
    minNewReads, i.e. one that considers any read set sufficiently novel.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(minNewReads=0.0)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result.keys()))
def testMinMedianScore_EValue(self):
    """
    The filter function must work correctly when passed a value for
    minMedianScore when using e values.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(minMedianScore=1e-9)
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result.keys()))
def testCoverageIncludesAll(self):
    """
    Filtering with a minCoverage of 0.0 must return a titlesAlignments
    instance holding all titles, as every title then has sufficient
    coverage.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(minCoverage=0.0)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result.keys()))
def testCoverageIncludesSome(self):
    """
    When only some titles have sufficient coverage, filtering on
    minCoverage must return a titlesAlignments instance with just those
    titles.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        # To understand why the following produces the result it does,
        # you need to look at the HSP coverage in sample_data.py and
        # calculate the coverage by hand.
        result = titlesAlignments.filter(minCoverage=0.0011)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
        ]
        self.assertEqual(expected, sorted(result.keys()))
def testMaxScore_EValue(self):
    """
    Sorting on max score must work when scores are e values, including a
    secondary sort on title.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('maxScore')
        # The trailing comments give the e value listed for each title;
        # the smallest e value comes first.
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 1e-11
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 1e-10
            'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 1e-8
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 1e-7
            'gi|887699|gb|DQ37780 Cowpox virus 15',            # 1e-6
        ]
        self.assertEqual(expected, result)
def testTabSeparatedSummary(self):
    """
    The tab-separated summary, sorted on title, must have the expected
    content.
    """
    fileContent = ''.join(
        dumps(record) + '\n' for record in (PARAMS, RECORD0))
    mockOpener = mock_open(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        reads.add(Read('id0', 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'f.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        summary = titlesAlignments.tabSeparatedSummary(sortOn='title')
        # One tuple of fields per expected line: fields are joined by
        # TABs and lines by newlines (with no trailing newline).
        expectedLines = (
            ('0.000297', '20.000000', '20.000000', '1', '1', '37000',
             'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99'),
            ('0.000289', '25.000000', '25.000000', '1', '1', '38000',
             'gi|887699|gb|DQ37780 Squirrelpox virus 55'),
        )
        expected = '\n'.join(
            '\t'.join(fields) for fields in expectedLines)
        self.assertEqual(expected, summary)
def add(self, virusTitle, sampleName):
    """
    Register a (virus title, sample name) combination and get its FASTA
    file name. The FASTA file is written if no file name has been recorded
    for the combination yet.

    @param virusTitle: A C{str} virus title.
    @param sampleName: A C{str} sample name.
    @return: A C{str} FASTA file name holding all the reads (without
        duplicates) from the sample that matched the proteins in the given
        virus.
    """
    # A title or sample not seen before is given the current size of its
    # mapping as its index.
    virusIndex = self._viruses.setdefault(virusTitle, len(self._viruses))
    sampleIndex = self._samples.setdefault(sampleName, len(self._samples))
    key = (virusIndex, sampleIndex)

    try:
        return self._fastaFilenames[key]
    except KeyError:
        collected = Reads()
        matches = self._proteinGrouper.virusTitles[virusTitle][sampleName]
        for proteinMatch in matches:
            matchReads = FastaReads(proteinMatch['fastaFilename'],
                                    checkAlphabet=0)
            for read in matchReads:
                collected.add(read)
        # The output directory comes from the last protein match
        # iterated over above.
        outDir = proteinMatch['outDir']
        saveFilename = join(outDir, 'virus-%d-sample-%d.fasta' % key)
        deduplicated = collected.filter(removeDuplicates=True)
        deduplicated.save(saveFilename)
        self._fastaFilenames[key] = saveFilename
        return saveFilename
def testFilterWithNoArguments(self):
    """
    Calling the filter function with no arguments must return a
    TitlesAlignments instance holding all the titles of the original.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mock_open(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(3):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter()
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result))
def testMaxScore_Bits(self):
    """
    Sorting on max score must work when scores are bit scores, including a
    secondary sort on title.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mockOpen(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('maxScore')
        # The trailing comments give the max bit score of each title;
        # titles with equal scores appear in title order.
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 25
            'gi|887699|gb|DQ37780 Cowpox virus 15',            # 20
            'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 20
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 20
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 20
        ]
        self.assertEqual(expected, result)
def testMinMedianScore_EValue(self):
    """
    When e values are used as scores, passing minMedianScore to the filter
    function must work correctly.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mock_open(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(minMedianScore=1e-9)
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result))
def testLength(self):
    """
    Sorting on sequence length must work, including a secondary sort on
    title.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mockOpen(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('length')
        # The trailing comments give the subject length of each title;
        # titles with equal lengths appear in title order.
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 38000
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 37000
            'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 35000
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 35000
            'gi|887699|gb|DQ37780 Cowpox virus 15',            # 30000
        ]
        self.assertEqual(expected, result)
def add(self, pathogenName, sampleName):
    """
    Add a (pathogen name, sample name) combination and get its FASTA/FASTQ
    file name. Write the FASTA/FASTQ file if no file name has been recorded
    for the combination yet, and save the unique read count into
    C{self._proteinGrouper}.

    @param pathogenName: A C{str} pathogen name.
    @param sampleName: A C{str} sample name.
    @return: A C{str} giving the FASTA/FASTQ file name holding all the
        reads (without duplicates, by id) from the sample that matched the
        proteins in the given pathogen.
    """
    # A pathogen or sample not seen before is given the current size of
    # its mapping as its index.
    pathogenIndex = self._pathogens.setdefault(pathogenName,
                                               len(self._pathogens))
    sampleIndex = self._samples.setdefault(sampleName, len(self._samples))

    try:
        return self._readsFilenames[(pathogenIndex, sampleIndex)]
    except KeyError:
        reads = Reads()
        sampleInfo = self._proteinGrouper.pathogenNames[
            pathogenName][sampleName]
        for proteinMatch in sampleInfo['proteins'].values():
            for read in self._readsClass(proteinMatch['readsFilename']):
                reads.add(read)
        # The output directory comes from the last protein match
        # iterated over above.
        saveFilename = join(
            proteinMatch['outDir'],
            'pathogen-%d-sample-%d.%s' % (pathogenIndex, sampleIndex,
                                          self._format))
        # Save what filter() returns rather than the unfiltered
        # collection, so that duplicates are dropped whether filter()
        # works in place (returning the same object) or returns a new
        # collection.
        reads = reads.filter(removeDuplicatesById=True)
        nReads = reads.save(saveFilename, format_=self._format)
        # Save the unique read count into self._proteinGrouper
        sampleInfo['uniqueReadCount'] = nReads
        self._readsFilenames[(pathogenIndex, sampleIndex)] = saveFilename
        return saveFilename
def testTitleCollection(self):
    """
    When a title occurs in the alignments of several reads, the data from
    each of those reads must be collected under that title.
    """
    fileContent = ''.join(
        dumps(record) + '\n' for record in (PARAMS, RECORD2, RECORD3))
    mockOpener = mockOpen(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        read2 = Read('id2', 'A' * 70)
        read3 = Read('id3', 'A' * 70)
        for read in (read2, read3):
            reads.add(read)
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)

        title = 'gi|887699|gb|DQ37780 Cowpox virus 15'
        titleAlignments = titlesAlignments[title]
        self.assertEqual(title, titleAlignments.subjectTitle)
        self.assertEqual(30000, titleAlignments.subjectLength)
        self.assertEqual(2, len(titleAlignments))
        # The alignments must appear in the order the reads were added.
        for index, read in enumerate((read2, read3)):
            self.assertEqual(read, titleAlignments[index].read)
            self.assertEqual(HSP(20), titleAlignments[index].hsps[0])
def testMaxScore_EValue(self):
    """
    Sorting on max score must work when scores are e values, including a
    secondary sort on title.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mock_open(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('maxScore')
        # The trailing comments give the e value listed for each title;
        # the smallest e value comes first.
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 1e-11
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 1e-10
            'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 1e-8
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 1e-7
            'gi|887699|gb|DQ37780 Cowpox virus 15',            # 1e-6
        ]
        self.assertEqual(expected, result)
def testExpectedTitleDetails(self):
    """
    Each TitleAlignments instance held in a TitlesAlignments instance must
    have the expected subject title, subject length, read and HSP.
    """
    fileContent = ''.join(
        dumps(record) + '\n' for record in (PARAMS, RECORD0))
    mockOpener = mock_open(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        read = Read('id0', 'A' * 70)
        reads.add(read)
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)

        # (title, expected subject length, expected HSP score)
        expectations = (
            ('gi|887699|gb|DQ37780 Squirrelpox virus 1296/99', 37000, 20),
            ('gi|887699|gb|DQ37780 Squirrelpox virus 55', 38000, 25),
        )
        for title, subjectLength, score in expectations:
            titleAlignments = titlesAlignments[title]
            self.assertEqual(title, titleAlignments.subjectTitle)
            self.assertEqual(subjectLength, titleAlignments.subjectLength)
            self.assertEqual(1, len(titleAlignments))
            self.assertEqual(read, titleAlignments[0].read)
            self.assertEqual(HSP(score), titleAlignments[0].hsps[0])
def testLengthOne(self):
    """
    De-duping a FASTA list holding a single item must give back that same
    single item.
    """
    reads = Reads()
    reads.add(Read('id', 'GGG'))
    deduped = list(dedupFasta(reads))
    expected = [Read('id', 'GGG')]
    self.assertEqual(deduped, expected)
def testRemovalOfIdenticalSequences(self):
    """
    De-duping a list holding two copies of the same sequence must leave
    just one copy.
    """
    reads = Reads()
    for _ in range(2):
        reads.add(Read('id', 'GGG'))
    deduped = list(dedupFasta(reads))
    expected = [Read('id', 'GGG')]
    self.assertEqual(deduped, expected)
def testRemovalOfIdenticalSequences(self):
    """
    De-duping a list holding two copies of the same sequence must leave
    just one copy.
    """
    reads = Reads()
    for _ in range(2):
        reads.add(Read('id', 'GGG'))
    deduped = list(dedupFasta(reads))
    expected = [Read('id', 'GGG')]
    self.assertEqual(deduped, expected)
def testSummary(self):
    """
    The summary function, sorted on title, must produce the expected
    summary dict for each title.
    """
    records = (PARAMS, RECORD0, RECORD1)
    fileContent = ''.join(dumps(record) + '\n' for record in records)
    mockOpener = mockOpen(read_data=fileContent)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(2):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json',
                                                 'database.fasta')
        titlesAlignments = TitlesAlignments(readsAlignments)

        # (score, coverage, subject length, subject title). Each title
        # has a single HSP from a single read, so the best and median
        # scores are the same.
        rows = (
            (20.0, 0.00031428571428571427, 35000,
             'gi|887699|gb|DQ37780 Monkeypox virus 456'),
            (20.0, 0.00031428571428571427, 35000,
             'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.'),
            (20.0, 0.0002972972972972973, 37000,
             'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99'),
            (25.0, 0.00028947368421052634, 38000,
             'gi|887699|gb|DQ37780 Squirrelpox virus 55'),
        )
        expected = [
            {
                'bestScore': score,
                'coverage': coverage,
                'hspCount': 1,
                'medianScore': score,
                'readCount': 1,
                'subjectLength': subjectLength,
                'subjectTitle': subjectTitle,
            }
            for score, coverage, subjectLength, subjectTitle in rows
        ]
        self.assertEqual(
            expected, list(titlesAlignments.summary(sortOn='title')))
def testManuallyAddedReadsLength(self):
    """
    The length of a Reads instance must equal the number of reads that
    were added to it manually.
    """
    reads = Reads()
    for readId, sequence in (('id1', 'AT'), ('id2', 'AC')):
        reads.add(Read(readId, sequence))
    count = len(reads)
    self.assertEqual(2, count)
def testRemovalOfIdenticalSequencesWithDifferingIds(self):
    """
    Two reads with the same sequence must be de-duped to one read even
    when their ids differ. The expected survivor is the read with id
    'id1'.
    """
    sameSequence = Reads()
    for readId in ('id1', 'id2'):
        sameSequence.add(Read(readId, 'GGG'))
    deduped = list(dedupFasta(sameSequence))
    self.assertEqual(deduped, [Read('id1', 'GGG')])
def testSummary(self):
    """
    Summarizing the titles, sorted on title, must produce the expected
    list of per-title dictionaries.
    """
    # (score, coverage, subject length, subject title) for each expected
    # summary entry. The best and median scores are equal for every
    # title, and each title has one read and one HSP.
    rows = (
        (20.0, 0.00031428571428571427, 35000,
         'gi|887699|gb|DQ37780 Monkeypox virus 456'),
        (20.0, 0.00031428571428571427, 35000,
         'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.'),
        (20.0, 0.0002972972972972973, 37000,
         'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99'),
        (25.0, 0.00028947368421052634, 38000,
         'gi|887699|gb|DQ37780 Squirrelpox virus 55'),
    )
    expected = [
        {
            'bestScore': score,
            'coverage': coverage,
            'hspCount': 1,
            'medianScore': score,
            'readCount': 1,
            'subjectLength': length,
            'subjectTitle': title,
        }
        for score, coverage, length, title in rows
    ]
    # One JSON document per line: the parameters, then the records.
    data = ''.join(
        dumps(item) + '\n' for item in (PARAMS, RECORD0, RECORD1))
    mockOpener = mock_open(read_data=data)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for readId in ('id0', 'id1'):
            reads.add(Read(readId, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        summary = list(titlesAlignments.summary(sortOn='title'))
        self.assertEqual(expected, summary)
def testRemovalOfIdenticalSequencesWithDifferingIds(self):
    """
    Two reads with the same sequence must be de-duped to one read even
    when their ids differ. The expected survivor is the read with id
    'id1'.
    """
    sameSequence = Reads()
    for readId in ('id1', 'id2'):
        sameSequence.add(Read(readId, 'GGG'))
    deduped = list(dedupFasta(sameSequence))
    self.assertEqual(deduped, [Read('id1', 'GGG')])
def testAlignmentPanel(self):
    """
    The alignmentPanel function must run without error when a set of
    reads is compared against itself with figure display turned off.

    This is a smoke test: no assertion is made about the result.
    """
    params = FindParameters(binScoreMethod='FeatureAAScore',
                            significanceMethod='HashFraction')
    reads = Reads()
    for read in (GOLV, AKAV):
        reads.add(read)
    alignmentPanel(reads, reads, showFigure=False, findParams=params)
def testCorrectNumberOfSequences(self):
    """
    One hash string must be produced per sequence: two reads in, two
    strings out.
    """
    reads = Reads()
    for read in (MONKEYPOX, MUMMYPOX):
        reads.add(read)
    hashes = HashesString(reads, 0.0, landmarks=['AlphaHelix'])
    self.assertEqual(2, len(hashes.hashString))
def testManuallyAddedReads(self):
    """
    Iterating a Reads instance must yield the manually added reads, in
    the order they were added.
    """
    reads = Reads()
    first = Read('id1', 'AT')
    second = Read('id2', 'AC')
    for read in (first, second):
        reads.add(read)
    listed = list(reads)
    self.assertEqual([first, second], listed)
def testDefaultLabel(self):
    """
    When no label is supplied for a read, the ClusterAnalysis
    initializer must give that read the default label.
    """
    reads = Reads()
    reads.add(AARead('id', 'MMM'))
    noLabels = {}
    analysis = ClusterAnalysis(reads, noLabels,
                               landmarks=['AlphaHelix'], defaultLabel=5)
    self.assertEqual([5], analysis.trueLabels)
def testMissingLabelNoDefault(self):
    """
    The ClusterAnalysis initializer must raise ValueError when a read
    has no corresponding label and no default label is given.
    """
    reads = Reads()
    reads.add(AARead('id', 'MMM'))
    error = "Read 'id' has no corresponding label"
    # Context-manager form of the same assertion: the initializer call
    # inside the block must raise a ValueError matching the message.
    with six.assertRaisesRegex(self, ValueError, error):
        ClusterAnalysis(reads, {}, landmarks=['AlphaHelix'])
def reads(self):
    """
    Collect the read from every alignment in this instance (i.e., the
    reads matching this title).

    @return: An instance of C{dark.reads.Reads}.
    """
    collected = Reads()
    for read in (alignment.read for alignment in self):
        collected.add(read)
    return collected
def testOneReadAffinity(self):
    """
    With a single read, the ClusterAnalysis initializer must produce a
    one-by-one affinity matrix whose only value is 1.0.
    """
    reads = Reads()
    reads.add(AARead('id', 'MMM'))
    noLabels = {}
    analysis = ClusterAnalysis(reads, noLabels,
                               landmarks=['AlphaHelix'], defaultLabel=5)
    self.assertEqual([[1.0]], analysis.affinity)
def reads(self):
    """
    Collect the read from every alignment in this instance (i.e., the
    reads matching this title).

    @return: An instance of C{dark.reads.Reads}.
    """
    collected = Reads()
    for read in (alignment.read for alignment in self):
        collected.add(read)
    return collected
def testFilterOnMaxLength(self):
    """
    Filtering with a maximum length must keep only the reads that are
    no longer than that length.
    """
    reads = Reads()
    fourLong = Read('id1', 'ATCG')
    threeLong = Read('id2', 'ACG')
    for read in (fourLong, threeLong):
        reads.add(read)
    kept = list(reads.filter(maxLength=3))
    self.assertEqual([threeLong], kept)
def testOneReadClusterCount(self):
    """
    Clustering a single read into one cluster with KMeansAnalysis must
    leave nClusters set to 1.
    """
    reads = Reads()
    reads.add(AARead('id1', 'FRRRFRRRF'))
    noLabels = {}
    kMeans = KMeansAnalysis(reads, noLabels,
                            landmarks=['AlphaHelix'], defaultLabel=0)
    kMeans.cluster(1)
    self.assertEqual(1, kMeans.nClusters)
def testOneSequenceSpecificDiagonalValue(self):
    """
    Given a single read and an explicit diagonal value, affinityMatrix
    must return a one-by-one matrix holding that diagonal value.
    """
    reads = Reads()
    reads.add(AARead('id1', 'AAA'))
    result = affinityMatrix(reads, diagonalValue=2.0,
                            landmarks=['AlphaHelix'])
    self.assertEqual([[2.0]], result)
def testPlotHistogramLinesMustRun(self):
    """
    The plotHistogramLines function must run without error on a small
    set of reads.

    This is a smoke test: no assertion is made about the result.
    """
    params = FindParameters(binScoreMethod='FeatureAAScore',
                            significanceMethod='HashFraction')
    reads = Reads()
    for read in (GOLV, AKAV, BUNV):
        reads.add(read)
    plotHistogramLines(reads, findParams=params)
def testFilterOnLengthNothingMatches(self):
    """
    Filtering on a length range that none of the reads fall into must
    return no reads.
    """
    reads = Reads()
    fourLong = Read('id1', 'ATCG')
    threeLong = Read('id2', 'ACG')
    for read in (fourLong, threeLong):
        reads.add(read)
    kept = list(reads.filter(minLength=10, maxLength=15))
    self.assertEqual([], kept)
def testSaveWithUnknownFormat(self):
    """
    A Reads instance must raise ValueError if asked to save in an unknown
    format.
    """
    reads = Reads()
    read1 = Read('id1', 'AT', '!!')
    read2 = Read('id2', 'AC')
    reads.add(read1)
    reads.add(read2)
    error = "Save format must be either 'fasta' or 'fastq'\\."
    # Use six.assertRaisesRegex, as the other tests here do, rather than
    # the assertRaisesRegexp alias, which is deprecated and was removed
    # from unittest in Python 3.12.
    six.assertRaisesRegex(self, ValueError, error,
                          reads.save, 'file', 'xxx')
def testSaveAsFASTQFailsOnReadWithNoQuality(self):
    """
    A Reads instance must raise a ValueError if asked to save in FASTQ
    format and there is a read with no quality present.
    """
    reads = Reads()
    read1 = Read('id1', 'AT', '!!')
    read2 = Read('id2', 'AC')
    reads.add(read1)
    reads.add(read2)
    error = "Read 'id2' has no quality information"
    # Use six.assertRaisesRegex, as the other tests here do, rather than
    # the assertRaisesRegexp alias, which is deprecated and was removed
    # from unittest in Python 3.12.
    six.assertRaisesRegex(self, ValueError, error,
                          reads.save, 'file', 'fastq')
def testFilterWithMinLengthEqualToMaxLength(self):
    """
    When the minimum and maximum lengths passed to filter are equal, a
    read of exactly that length must be returned.
    """
    reads = Reads()
    fourLong = Read('id1', 'ATCG')
    threeLong = Read('id2', 'ACG')
    for read in (fourLong, threeLong):
        reads.add(read)
    kept = list(reads.filter(minLength=4, maxLength=4))
    self.assertEqual([fourLong], kept)
def testFilterOnLengthEverythingMatches(self):
    """
    Filtering on a length range that every read falls into must return
    all the reads, in their original order.
    """
    reads = Reads()
    fourLong = Read('id1', 'ATCG')
    threeLong = Read('id2', 'ACG')
    for read in (fourLong, threeLong):
        reads.add(read)
    kept = list(reads.filter(minLength=2, maxLength=5))
    self.assertEqual([fourLong, threeLong], kept)
def testSaveToFileDescriptor(self):
    """
    When save is passed a file-like object instead of a string
    filename, the reads must be written to that object.
    """
    reads = Reads()
    for readId, sequence in (('id1', 'AT'), ('id2', 'AC')):
        reads.add(Read(readId, sequence))
    output = StringIO()
    reads.save(output)
    written = output.getvalue()
    self.assertEqual('>id1\nAT\n>id2\nAC\n', written)
def testMaxTitlesNegative(self):
    """
    The filter function must raise a ValueError if maxTitles is less than
    zero.
    """
    mockOpener = mockOpen(read_data=(
        dumps(PARAMS) + '\n' + dumps(RECORD0) + '\n'))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        reads.add(Read('id0', 'A' * 70))
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        # Raw string: the pattern contains regex escapes (\( \) \.) that
        # are invalid escape sequences in an ordinary string literal and
        # trigger a warning on newer Pythons. The pattern is unchanged.
        error = r'^maxTitles \(-1\) cannot be negative\.$'
        six.assertRaisesRegex(self, ValueError, error,
                              titlesAlignments.filter, maxTitles=-1)