def testTitleCollection(self):
    """
    When one title appears in the alignments of several reads, the
    TitleAlignments for that title must hold the data of every such read.
    """
    jsonLines = ''.join(
        dumps(record) + '\n' for record in (PARAMS, RECORD2, RECORD3))
    mockOpener = mockOpen(read_data=jsonLines)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        read2 = Read('id2', 'A' * 70)
        read3 = Read('id3', 'A' * 70)
        for read in (read2, read3):
            reads.add(read)
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)

        title = 'gi|887699|gb|DQ37780 Cowpox virus 15'
        titleAlignments = titlesAlignments[title]
        self.assertEqual(title, titleAlignments.subjectTitle)
        self.assertEqual(30000, titleAlignments.subjectLength)
        self.assertEqual(2, len(titleAlignments))
        # Each read must appear, in order, with its first HSP scoring 20.
        for index, read in enumerate((read2, read3)):
            self.assertEqual(read, titleAlignments[index].read)
            self.assertEqual(HSP(20), titleAlignments[index].hsps[0])
def testFilterWithNoArguments(self):
    """
    Calling filter with no arguments must return a TitlesAlignments
    instance that has every title of the original.
    """
    jsonLines = ''.join(
        dumps(record) + '\n'
        for record in (PARAMS, RECORD0, RECORD1, RECORD2))
    mockOpener = mockOpen(read_data=jsonLines)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for readId in ('id0', 'id1', 'id2'):
            reads.add(Read(readId, 'A' * 70))
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter()
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result.keys()))
def testMaxScore_Bits(self):
    """
    Sorting on max score must work when scores are bit scores, including
    a secondary sort on title.
    """
    mockOpener = mockOpen(read_data=(
        dumps(PARAMS) + '\n' + dumps(RECORD0) + '\n' +
        dumps(RECORD1) + '\n' + dumps(RECORD2) + '\n' +
        dumps(RECORD3) + '\n'))
    # Patch open via the builtins module object, as the other tests in
    # this file do. The '__builtin__' module name only exists on Python 2,
    # so patching it by string cannot work on Python 3.
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        reads.add(Read('id0', 'A' * 70))
        reads.add(Read('id1', 'A' * 70))
        reads.add(Read('id2', 'A' * 70))
        reads.add(Read('id3', 'A' * 70))
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('maxScore')
        # The trailing comments give each title's max bit score; titles
        # with equal scores are expected in title order.
        self.assertEqual([
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 25
            'gi|887699|gb|DQ37780 Cowpox virus 15',            # 20
            'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 20
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 20
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 20
        ], result)
def testMinMedianScore_EValue(self):
    """
    The filter function must work correctly when given a minMedianScore
    value and the scores are e values.
    """
    jsonLines = ''.join(
        dumps(record) + '\n'
        for record in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
    mockOpener = mockOpen(read_data=jsonLines)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(minMedianScore=1e-9)
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result.keys()))
def testCoverageIncludesAll(self):
    """
    Filtering on minCoverage must return a TitlesAlignments instance with
    all titles when every title has sufficient coverage.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for readId in ('id0', 'id1', 'id2', 'id3'):
            reads.add(Read(readId, 'A' * 70))
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(minCoverage=0.0)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result.keys()))
def testWithScoreBetterThan_EValue(self):
    """
    The filter function must work correctly when given a
    withScoreBetterThan value and the scores are e values.
    """
    jsonLines = ''.join(
        dumps(record) + '\n'
        for record in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
    mockOpener = mockOpen(read_data=jsonLines)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', 'database.fasta',
            scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(withScoreBetterThan=1e-10)
        expected = ['gi|887699|gb|DQ37780 Squirrelpox virus 1296/99']
        self.assertEqual(expected, list(result.keys()))
def add(self, virusTitle, sampleName):
    """
    Register a virus title / sample name combination and return the name
    of its FASTA file, writing that file first if this combination has not
    been seen before.

    @param virusTitle: A C{str} virus title.
    @param sampleName: A C{str} sample name.
    @return: A C{str} FASTA file name holding all the reads (without
        duplicates) from the sample that matched the proteins in the
        given virus.
    """
    # Titles and sample names get the next free integer index the first
    # time they are seen.
    virusIndex = self._viruses.setdefault(virusTitle, len(self._viruses))
    sampleIndex = self._samples.setdefault(sampleName, len(self._samples))
    key = (virusIndex, sampleIndex)

    try:
        return self._fastaFilenames[key]
    except KeyError:
        matches = self._proteinGrouper.virusTitles[virusTitle][sampleName]
        collected = Reads()
        for proteinMatch in matches:
            fastaReads = FastaReads(proteinMatch['fastaFilename'],
                                    checkAlphabet=0)
            for read in fastaReads:
                collected.add(read)
        # The output directory is taken from the last protein match seen.
        basename = 'virus-%d-sample-%d.fasta' % key
        saveFilename = join(proteinMatch['outDir'], basename)
        deduplicated = collected.filter(removeDuplicates=True)
        deduplicated.save(saveFilename)
        self._fastaFilenames[key] = saveFilename
        return saveFilename
def testTitle(self):
    """
    Sorting titles on 'title' must give them in title order.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for readId in ('id0', 'id1', 'id2', 'id3'):
            reads.add(Read(readId, 'A' * 70))
        readsAlignments = BlastReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('title')
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, result)
def testMaxScore_EValue(self):
    """
    Sorting on max score must work when the scores are e values,
    including a secondary sort on title.
    """
    jsonLines = ''.join(
        dumps(record) + '\n'
        for record in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
    mockOpener = mockOpen(read_data=jsonLines)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('maxScore')
        # Titles are expected in order of increasing e-value (the value
        # for each title is given in its trailing comment).
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 1e-11
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 1e-10
            'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 1e-8
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 1e-7
            'gi|887699|gb|DQ37780 Cowpox virus 15',            # 1e-6
        ]
        self.assertEqual(expected, result)
def testExpectedTitleDetails(self):
    """
    Each TitleAlignments held by a TitlesAlignments instance must have
    the expected title, subject length, read and first HSP.
    """
    jsonLines = ''.join(
        dumps(record) + '\n' for record in (PARAMS, RECORD0))
    mockOpener = mockOpen(read_data=jsonLines)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        read = Read('id0', 'A' * 70)
        reads.add(read)
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)

        # (title, expected subject length, expected score of first HSP).
        expectations = (
            ('gi|887699|gb|DQ37780 Squirrelpox virus 1296/99', 37000, 20),
            ('gi|887699|gb|DQ37780 Squirrelpox virus 55', 38000, 25),
        )
        for title, subjectLength, score in expectations:
            titleAlignments = titlesAlignments[title]
            self.assertEqual(title, titleAlignments.subjectTitle)
            self.assertEqual(subjectLength, titleAlignments.subjectLength)
            self.assertEqual(1, len(titleAlignments))
            self.assertEqual(read, titleAlignments[0].read)
            self.assertEqual(HSP(score), titleAlignments[0].hsps[0])
def testLengthOne(self):
    """
    De-duping a FASTA list holding a single item must give that same
    single item.
    """
    reads = Reads()
    reads.add(Read('id', 'GGG'))
    deduped = list(dedupFasta(reads))
    self.assertEqual(deduped, [Read('id', 'GGG')])
def testLength(self):
    """
    Sorting on sequence length must work, including a secondary sort on
    title.
    """
    mockOpener = mockOpen(read_data=(
        dumps(PARAMS) + '\n' + dumps(RECORD0) + '\n' +
        dumps(RECORD1) + '\n' + dumps(RECORD2) + '\n' +
        dumps(RECORD3) + '\n'))
    # Patch open via the builtins module object, as the other tests in
    # this file do. The '__builtin__' module name only exists on Python 2,
    # so patching it by string cannot work on Python 3.
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        reads.add(Read('id0', 'A' * 70))
        reads.add(Read('id1', 'A' * 70))
        reads.add(Read('id2', 'A' * 70))
        reads.add(Read('id3', 'A' * 70))
        readsAlignments = BlastReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('length')
        # The trailing comments give each subject's length; subjects of
        # equal length are expected in title order.
        self.assertEqual([
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 38000
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 37000
            'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 35000
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 35000
            'gi|887699|gb|DQ37780 Cowpox virus 15',            # 30000
        ], result)
def testMaxTitlesTwoSortOnLength(self):
    """
    With maxTitles set to 2 and sortOn set to 'length', the filter
    function must return the two titles with the longest sequences.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for readId in ('id0', 'id1', 'id2', 'id3'):
            reads.add(Read(readId, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', 'database.fasta')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(maxTitles=2, sortOn='length')
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result.keys()))
def testTabSeparatedSummary(self):
    """
    tabSeparatedSummary must return one TAB-separated line per title,
    with the lines joined by newlines and no trailing newline.
    """
    jsonLines = ''.join(
        dumps(record) + '\n' for record in (PARAMS, RECORD0))
    mockOpener = mockOpen(read_data=jsonLines)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        reads.add(Read('id0', 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'f.json', 'db')
        titlesAlignments = TitlesAlignments(readsAlignments)
        summary = titlesAlignments.tabSeparatedSummary(sortOn='title')
        expectedRows = (
            ('0.000297', '20.000000', '20.000000', '1', '1', '37000',
             'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99'),
            ('0.000289', '25.000000', '25.000000', '1', '1', '38000',
             'gi|887699|gb|DQ37780 Squirrelpox virus 55'),
        )
        expected = '\n'.join('\t'.join(row) for row in expectedRows)
        self.assertEqual(expected, summary)
def testReadSetFilterAllowAnything(self):
    """
    The filter function must work correctly when given a minNewReads
    value of 0.0, i.e. one that considers any read set sufficiently novel.
    """
    jsonLines = ''.join(
        dumps(record) + '\n'
        for record in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
    mockOpener = mockOpen(read_data=jsonLines)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(minNewReads=0.0)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result.keys()))
def testCoverageIncludesSome(self):
    """
    Filtering on minCoverage must return a TitlesAlignments instance with
    only the expected titles when just some titles have sufficient
    coverage.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for readId in ('id0', 'id1', 'id2', 'id3'):
            reads.add(Read(readId, 'A' * 70))
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        # The expected result follows from the HSP coverage given in
        # sample_data.py; check it by computing the coverage by hand.
        result = titlesAlignments.filter(minCoverage=0.0011)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
        ]
        self.assertEqual(expected, sorted(result.keys()))
def testRemovalOfIdenticalSequences(self):
    """
    De-duping a list holding two copies of the same sequence must leave
    just one copy.
    """
    reads = Reads()
    for _ in range(2):
        reads.add(Read('id', 'GGG'))
    deduped = list(dedupFasta(reads))
    self.assertEqual(deduped, [Read('id', 'GGG')])
def __init__(self, _files, readClass=SSAARead, upperCase=False):
    """
    Store the input file(s) and read options, then initialize the parent.

    @param _files: Either a single item or a C{list} / C{tuple} of items.
        A single item is wrapped in a one-element list.
    @param readClass: The value stored as C{self._readClass}.
    @param upperCase: The value stored as C{self._upperCase}.
    """
    if isinstance(_files, (list, tuple)):
        self._files = _files
    else:
        self._files = [_files]
    self._readClass = readClass
    self._upperCase = upperCase
    # Zero-argument super() is only available on Python 3.
    if PY3:
        super().__init__()
    else:
        Reads.__init__(self)
def testSummary(self):
    """
    The summary function must return the expected summary dictionary for
    each title, in title order.
    """
    jsonLines = ''.join(
        dumps(record) + '\n' for record in (PARAMS, RECORD0, RECORD1))
    mockOpener = mockOpen(read_data=jsonLines)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for readId in ('id0', 'id1'):
            reads.add(Read(readId, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', 'database.fasta')
        titlesAlignments = TitlesAlignments(readsAlignments)

        # (best score, median score, coverage, subject length, title).
        # Every title is expected to have one HSP from one read.
        rows = (
            (20.0, 20.0, 0.00031428571428571427, 35000,
             'gi|887699|gb|DQ37780 Monkeypox virus 456'),
            (20.0, 20.0, 0.00031428571428571427, 35000,
             'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.'),
            (20.0, 20.0, 0.0002972972972972973, 37000,
             'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99'),
            (25.0, 25.0, 0.00028947368421052634, 38000,
             'gi|887699|gb|DQ37780 Squirrelpox virus 55'),
        )
        expected = [
            {
                'bestScore': bestScore,
                'coverage': coverage,
                'hspCount': 1,
                'medianScore': medianScore,
                'readCount': 1,
                'subjectLength': subjectLength,
                'subjectTitle': subjectTitle,
            }
            for (bestScore, medianScore, coverage, subjectLength,
                 subjectTitle) in rows
        ]
        self.assertEqual(
            expected, list(titlesAlignments.summary(sortOn='title')))
def testManuallyAddedReadsLength(self):
    """
    The length of a Reads instance must equal the number of reads that
    were added to it manually.
    """
    reads = Reads()
    for readId, sequence in (('id1', 'AT'), ('id2', 'AC')):
        reads.add(Read(readId, sequence))
    self.assertEqual(2, len(reads))
def testRemovalOfIdenticalSequencesWithDifferingIds(self):
    """
    De-duping two copies of the same sequence must leave one copy (the
    first), even when the two reads have different ids.
    """
    reads = Reads()
    for readId in ('id1', 'id2'):
        reads.add(Read(readId, 'GGG'))
    deduped = list(dedupFasta(reads))
    self.assertEqual(deduped, [Read('id1', 'GGG')])
def reads(self):
    """
    Collect the read of every alignment held by this instance.

    @return: An instance of C{dark.reads.Reads}.
    """
    matched = Reads()
    addRead = matched.add
    for alignment in self:
        addRead(alignment.read)
    return matched
def testManuallyAddedReads(self):
    """
    Listing a Reads instance must give the manually added reads, in the
    order they were added.
    """
    read1 = Read('id1', 'AT')
    read2 = Read('id2', 'AC')
    reads = Reads()
    for read in (read1, read2):
        reads.add(read)
    self.assertEqual([read1, read2], list(reads))
def testFilterOnMaxLength(self):
    """
    Filtering with a maximum length must keep only the reads that are no
    longer than that length.
    """
    read1 = Read('id1', 'ATCG')
    read2 = Read('id2', 'ACG')
    reads = Reads()
    for read in (read1, read2):
        reads.add(read)
    result = reads.filter(maxLength=3)
    self.assertEqual([read2], list(result))
def testFilterWithMinLengthEqualToMaxLength(self):
    """
    A read whose length equals both the minimum and the maximum length
    passed to filter must be returned.
    """
    read1 = Read('id1', 'ATCG')
    read2 = Read('id2', 'ACG')
    reads = Reads()
    for read in (read1, read2):
        reads.add(read)
    result = reads.filter(minLength=4, maxLength=4)
    self.assertEqual([read1], list(result))
def testFilterOnLengthEverythingMatches(self):
    """
    Filtering on length must return every read when they all satisfy the
    length requirements.
    """
    read1 = Read('id1', 'ATCG')
    read2 = Read('id2', 'ACG')
    reads = Reads()
    for read in (read1, read2):
        reads.add(read)
    result = reads.filter(minLength=2, maxLength=5)
    self.assertEqual([read1, read2], list(result))
def testFilterOnLengthNothingMatches(self):
    """
    Filtering on length must return no reads when none of them satisfy
    the length requirements.
    """
    read1 = Read('id1', 'ATCG')
    read2 = Read('id2', 'ACG')
    reads = Reads()
    for read in (read1, read2):
        reads.add(read)
    result = reads.filter(minLength=10, maxLength=15)
    self.assertEqual([], list(result))
def testSaveWithUnknownFormat(self):
    """
    A Reads instance must raise ValueError if asked to save in an unknown
    format.
    """
    reads = Reads()
    read1 = Read('id1', 'AT', '!!')
    read2 = Read('id2', 'AC')
    reads.add(read1)
    reads.add(read2)
    error = "Save format must be either 'fasta' or 'fastq'\\."
    # Use six.assertRaisesRegex, as other tests in this file do, instead
    # of the deprecated assertRaisesRegexp alias, which newer Python
    # versions no longer provide.
    six.assertRaisesRegex(self, ValueError, error, reads.save, 'file',
                          'xxx')
def testSaveAsFASTQFailsOnReadWithNoQuality(self):
    """
    A Reads instance must raise a ValueError if asked to save in FASTQ
    format and there is a read with no quality present.
    """
    reads = Reads()
    read1 = Read('id1', 'AT', '!!')
    read2 = Read('id2', 'AC')
    reads.add(read1)
    reads.add(read2)
    error = "Read 'id2' has no quality information"
    # Use six.assertRaisesRegex, as other tests in this file do, instead
    # of the deprecated assertRaisesRegexp alias, which newer Python
    # versions no longer provide.
    six.assertRaisesRegex(self, ValueError, error, reads.save, 'file',
                          'fastq')
def testSaveToFileDescriptor(self):
    """
    When save is given a file-like object rather than a string filename,
    a Reads instance must write its reads to that object.
    """
    reads = Reads()
    for readId, sequence in (('id1', 'AT'), ('id2', 'AC')):
        reads.add(Read(readId, sequence))
    fp = StringIO()
    reads.save(fp)
    written = fp.getvalue()
    self.assertEqual('>id1\nAT\n>id2\nAC\n', written)
def testUnknown(self):
    """
    Asking sortTitles to sort on an unknown attribute must raise
    C{ValueError}.
    """
    paramsOnly = dumps(PARAMS) + '\n'
    mockOpener = mockOpen(read_data=paramsOnly)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', 'database.fasta')
        titlesAlignments = TitlesAlignments(readsAlignments)
        with self.assertRaises(ValueError):
            titlesAlignments.sortTitles('xxx')
def testEmpty(self):
    """
    titleCounts must return an empty dictionary when passed a
    readsAlignments that has no alignments.
    """
    paramsOnly = dumps(PARAMS) + '\n'
    mockOpener = mockOpen(read_data=paramsOnly)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', 'database.fasta')
        counts = titleCounts(readsAlignments)
        self.assertEqual({}, counts)
def testAddTitleRepeat(self):
    """
    The addTitle function must raise a C{KeyError} if an attempt is made
    to add a pre-existing title to a TitlesAlignments instance.
    """
    mockOpener = mockOpen(read_data=(dumps(PARAMS) + '\n' +
                                     dumps(RECORD0) + '\n'))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        reads.add(Read('id0', 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json',
                                                 'database.fasta')
        titlesAlignments = TitlesAlignments(readsAlignments)
        title = 'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99'
        titleAlignments = TitleAlignments(title, 55)
        # Raw strings: the regex escapes (\| and \.) are invalid escape
        # sequences in an ordinary string literal and trigger warnings.
        # The resulting pattern text is unchanged.
        error = (r"Title 'gi\|887699\|gb\|DQ37780 Squirrelpox virus "
                 r"1296/99' already present in TitlesAlignments instance\.")
        six.assertRaisesRegex(self, KeyError, error,
                              titlesAlignments.addTitle, title,
                              titleAlignments)
def testUnknownSortOn(self):
    """
    The filter function must raise a ValueError if the passed sortOn
    value isn't recognized.
    """
    mockOpener = mockOpen(read_data=(dumps(PARAMS) + '\n' +
                                     dumps(RECORD0) + '\n'))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        reads.add(Read('id0', 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json',
                                                 'database.fasta')
        titlesAlignments = TitlesAlignments(readsAlignments)
        # Raw strings: the regex escape \. is an invalid escape sequence
        # in an ordinary string literal and triggers a warning. The
        # resulting pattern text is unchanged.
        error = (r'^Sort attribute must be one of "length", "maxScore", '
                 r'"medianScore", "readCount", "title"\.$')
        six.assertRaisesRegex(self, ValueError, error,
                              titlesAlignments.filter, maxTitles=0,
                              sortOn='unknown')
def testHeterogeneousReadsTwoDifferences(self):
    """
    When the given reads differ at two sites, heterogeneousSites must
    return results with an entry for each of those two sites.
    """
    read = Read('id', 'ACCG')
    reads = Reads([read, Read('id2', 'TCCC')])
    # Expected: per-site base counts, per-site read ids for each base,
    # and the list of differing offsets.
    expectedCounts = {0: {'A': 1, 'T': 1}, 3: {'G': 1, 'C': 1}}
    expectedIds = {0: {'A': ['id'], 'T': ['id2']},
                   3: {'C': ['id2'], 'G': ['id']}}
    expected = (expectedCounts, expectedIds, [0, 3])
    self.assertEqual(expected, heterogeneousSites(reads, len(read), 1))
def testEmpty(self):
    """
    sortTitles must return the empty list when there are no titles.
    """
    paramsOnly = dumps(PARAMS) + '\n'
    mockOpener = mockOpen(read_data=paramsOnly)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', 'database.fasta')
        titlesAlignments = TitlesAlignments(readsAlignments)
        sortedTitles = titlesAlignments.sortTitles('title')
        self.assertEqual([], sortedTitles)
def testRecombinantFile(self):
    """
    After a run, recombinantFile must return the path of the '.3s.rec'
    file in the temporary directory.
    """
    sequences = (
        ('id1', 'A' * 200 + 'G' * 200),
        ('id2', 'A' * 400),
        ('id3', 'G' * 400),
    )
    reads = Reads([Read(readId, sequence)
                   for readId, sequence in sequences])
    self.ra.run(reads)
    expected = join(self.ra.tmpDir, _OUTPUT_PREFIX + '.3s.rec')
    self.assertEqual(expected, self.ra.recombinantFile())
def testEmpty(self):
    """
    A TitlesAlignments instance made from an empty readsAlignments
    instance must have no titles.
    """
    paramsOnly = dumps(PARAMS) + '\n'
    mockOpener = mockOpen(read_data=paramsOnly)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', 'database.fasta')
        titlesAlignments = TitlesAlignments(readsAlignments)
        titles = list(titlesAlignments.keys())
        self.assertEqual([], titles)
def testRunThresholdString(self):
    """
    The run function must return an exit status of 0 when given a
    non-default threshold argument passed as a string.
    """
    sequences = (
        ('id1', 'A' * 200 + 'G' * 200),
        ('id2', 'A' * 400),
        ('id3', 'G' * 400),
    )
    reads = Reads([Read(readId, sequence)
                   for readId, sequence in sequences])
    result = self.ra.run(reads, t='0.0')
    self.assertEqual(0, result.returncode)
def testHeterogeneousReadsFractionHigh(self):
    """
    When the given reads differ at a site and are less homogeneous there
    than the homogeneity cutoff fraction, heterogeneousSites must return
    results with one entry for that site.
    """
    read = Read('id', 'ACCG')
    reads = Reads([read, Read('id2', 'ACCC'), Read('id3', 'ACCC')])
    # Expected: per-site base counts, per-site read ids for each base,
    # and the list of differing offsets.
    expectedCounts = {3: {'C': 2, 'G': 1}}
    expectedIds = {3: {'G': ['id'], 'C': ['id2', 'id3']}}
    expected = (expectedCounts, expectedIds, [3])
    self.assertEqual(expected, heterogeneousSites(reads, len(read), 0.7))
def testTwoJSONInputsWithSubjectInCommon(self):
    """
    When L{BlastReadsAlignments} is given two JSON files that have a
    matched subject in common, the title for that subject in a
    TitlesAlignments made from it must have information from both reads,
    including the correct HSP scores.
    """

    class SideEffect(object):
        """
        Stand in for open: give RECORD2 on the first call and RECORD4 on
        every later call, each preceded by the params line.
        """
        def __init__(self):
            self.first = True

        def sideEffect(self, _ignoredFilename, **kwargs):
            record = RECORD2 if self.first else RECORD4
            self.first = False
            return File([dumps(PARAMS) + '\n', dumps(record) + '\n'])

    title = 'gi|887699|gb|DQ37780 Cowpox virus 15'

    sideEffect = SideEffect()
    with patch.object(builtins, 'open') as mockMethod:
        mockMethod.side_effect = sideEffect.sideEffect
        reads = Reads()
        for readId in ('id2', 'id4'):
            reads.add(Read(readId, 'A' * 70))
        readsAlignments = BlastReadsAlignments(
            reads, ['file1.json', 'file2.json'])
        titlesAlignments = TitlesAlignments(readsAlignments)
        titleAlignments = titlesAlignments[title]
        self.assertEqual(title, titleAlignments.subjectTitle)
        self.assertEqual(4, titleAlignments.hspCount())
        self.assertEqual('id2', titleAlignments[0].read.id)
        self.assertEqual('id4', titleAlignments[1].read.id)
        # The first matching read has a single HSP.
        firstHsps = titleAlignments[0].hsps
        self.assertEqual(HSP(20), firstHsps[0])
        # The second matching read has three HSPs.
        for index, score in enumerate((10, 5, 3)):
            self.assertEqual(HSP(score), titleAlignments[1].hsps[index])
def testMaxScore_EValue(self):
    """
    Sorting on max score must work when the scores are e values,
    including a secondary sort on title.
    """
    jsonLines = ''.join(
        dumps(record) + '\n'
        for record in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
    mockOpener = mock_open(read_data=jsonLines)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('maxScore')
        # Titles are expected in order of increasing e-value (the value
        # for each title is given in its trailing comment).
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 1e-11
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 1e-10
            'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 1e-8
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 1e-7
            'gi|887699|gb|DQ37780 Cowpox virus 15',            # 1e-6
        ]
        self.assertEqual(expected, result)
def testFromThreeSequences(self):
    """
    An NJTree instance made from three sequences that have no features
    must 1) have a distance matrix that is zero on the diagonal and one
    elsewhere, 2) keep the labels it was given, and 3) produce a simple
    tree with three children.
    """
    sequences = Reads()
    for readId in ('id1', 'id2', 'id3'):
        sequences.add(AARead(readId, 'A'))
    labels = ['x', 'y', 'z']
    njtree = NJTree.fromSequences(labels, sequences,
                                  landmarks=['AlphaHelix'])
    expectedDistance = [
        [0, 1, 1],
        [1, 0, 1],
        [1, 1, 0],
    ]
    self.assertTrue(np.array_equal(expectedDistance, njtree.distance))
    self.assertIs(labels, njtree.labels)
    children = sorted(str(child) for child in njtree.tree.children)
    self.assertEqual(['x:0.5;\n', 'y:0.5;\n', 'z:0.5;\n'], children)
def testMaxScore_Bits(self):
    """
    Sorting on max score must work when the scores are bit scores,
    including a secondary sort on title.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for readId in ('id0', 'id1', 'id2', 'id3'):
            reads.add(Read(readId, 'A' * 70))
        readsAlignments = BlastReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('maxScore')
        # The trailing comments give each title's max bit score; titles
        # with equal scores are expected in title order.
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 25
            'gi|887699|gb|DQ37780 Cowpox virus 15',            # 20
            'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 20
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 20
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 20
        ]
        self.assertEqual(expected, result)
def testWithScoreBetterThan_EValue(self):
    """
    The filter function must work correctly when given a
    withScoreBetterThan value and the scores are e values.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mockOpen(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for readId in ('id0', 'id1', 'id2', 'id3'):
            reads.add(Read(readId, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', 'database.fasta',
            scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(withScoreBetterThan=1e-10)
        expected = ['gi|887699|gb|DQ37780 Squirrelpox virus 1296/99']
        self.assertEqual(expected, list(result.keys()))
def testLength(self):
    """
    Sorting on sequence length must work, including a secondary sort on
    title.
    """
    jsonLines = ''.join(
        dumps(record) + '\n'
        for record in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
    mockOpener = mockOpen(read_data=jsonLines)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = BlastReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.sortTitles('length')
        # The trailing comments give each subject's length; subjects of
        # equal length are expected in title order.
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 38000
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 37000
            'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 35000
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 35000
            'gi|887699|gb|DQ37780 Cowpox virus 15',            # 30000
        ]
        self.assertEqual(expected, result)
def testMaxMatchingReads(self):
    """
    The filter function must work correctly when given a value for
    maxMatchingReads.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    mockOpener = mock_open(
        read_data=''.join(dumps(record) + '\n' for record in records))
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for readId in ('id0', 'id1', 'id2', 'id3'):
            reads.add(Read(readId, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(reads, 'file.json')
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(maxMatchingReads=1)
        # Cowpox virus 15 is matched by two reads, so it must be absent
        # from the result.
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.'
        ]
        self.assertEqual(sorted(expected), sorted(result))
def testMinMedianScore_EValue(self):
    """
    The filter function must work correctly when given a minMedianScore
    value and the scores are e values.
    """
    jsonLines = ''.join(
        dumps(record) + '\n'
        for record in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
    mockOpener = mock_open(read_data=jsonLines)
    with patch.object(builtins, 'open', mockOpener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        readsAlignments = DiamondReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        titlesAlignments = TitlesAlignments(readsAlignments)
        result = titlesAlignments.filter(minMedianScore=1e-9)
        expected = [
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(result))
def testReadSetFilterAllowAnything(self):
    """
    Filtering with minNewReads=0.0 (any read set counts as sufficiently
    novel) must keep every title.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    jsonLines = ''.join(dumps(record) + '\n' for record in records)
    opener = mock_open(read_data=jsonLines)
    with patch.object(builtins, 'open', opener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        alignments = DiamondReadsAlignments(reads, 'file.json')
        byTitle = TitlesAlignments(alignments)
        filtered = byTitle.filter(minNewReads=0.0)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(filtered))
def testCoverageIncludesSome(self):
    """
    Filtering on minCoverage must return a TitlesAlignments instance
    holding only the expected titles when just some of the titles have
    sufficient coverage.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    jsonLines = ''.join(dumps(record) + '\n' for record in records)
    opener = mock_open(read_data=jsonLines)
    with patch.object(builtins, 'open', opener):
        reads = Reads()
        for readId in ('id0', 'id1', 'id2', 'id3'):
            reads.add(Read(readId, 'A' * 70))
        alignments = DiamondReadsAlignments(reads, 'file.json')
        byTitle = TitlesAlignments(alignments)
        # To understand why the following produces the result it does,
        # you need to look at the HSP coverage in sample_data.py and
        # calculate the coverage by hand.
        filtered = byTitle.filter(minCoverage=0.0003)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
        ]
        self.assertEqual(expected, sorted(filtered))
def testFilterWithNoArguments(self):
    """
    Calling filter with no arguments must return a TitlesAlignments
    instance that has all the titles of the original.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2)
    jsonLines = ''.join(dumps(record) + '\n' for record in records)
    opener = mock_open(read_data=jsonLines)
    with patch.object(builtins, 'open', opener):
        reads = Reads()
        for index in range(3):
            reads.add(Read('id%d' % index, 'A' * 70))
        alignments = DiamondReadsAlignments(reads, 'file.json')
        byTitle = TitlesAlignments(alignments)
        unfiltered = byTitle.filter()
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(unfiltered))
def testTitle(self):
    """
    sortTitles('title') must return the titles in sorted order.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    jsonLines = ''.join(dumps(record) + '\n' for record in records)
    opener = mock_open(read_data=jsonLines)
    with patch.object(builtins, 'open', opener):
        reads = Reads()
        for readId in ('id0', 'id1', 'id2', 'id3'):
            reads.add(Read(readId, 'A' * 70))
        alignments = DiamondReadsAlignments(
            reads, 'file.json', scoreClass=LowerIsBetterScore)
        byTitle = TitlesAlignments(alignments)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, byTitle.sortTitles('title'))
def testCoverageIncludesAll(self):
    """
    Filtering on minCoverage must return a TitlesAlignments instance with
    all titles if all of its titles have sufficient coverage.
    """
    records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3)
    jsonLines = ''.join(dumps(record) + '\n' for record in records)
    opener = mock_open(read_data=jsonLines)
    with patch.object(builtins, 'open', opener):
        reads = Reads()
        for index in range(4):
            reads.add(Read('id%d' % index, 'A' * 70))
        alignments = DiamondReadsAlignments(reads, 'file.json')
        byTitle = TitlesAlignments(alignments)
        filtered = byTitle.filter(minCoverage=0.0)
        expected = [
            'gi|887699|gb|DQ37780 Cowpox virus 15',
            'gi|887699|gb|DQ37780 Monkeypox virus 456',
            'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
            'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
            'gi|887699|gb|DQ37780 Squirrelpox virus 55',
        ]
        self.assertEqual(expected, sorted(filtered))
def testRemoveOutput(self, rmtreeMock):
    """
    After a run, calling removeOutput must result in exactly one call to
    the mocked rmtree, passing the temporary directory of self.ra.

    NOTE(review): rmtreeMock is presumably injected by a patch decorator
    on this method - confirm against the surrounding file.
    """
    sequences = [
        Read('id1', 'A' * 200 + 'G' * 200),
        Read('id2', 'A' * 400),
        Read('id3', 'G' * 400),
    ]
    self.ra.run(Reads(sequences))
    self.ra.removeOutput()
    rmtreeMock.assert_called_once_with(self.ra.tmpDir)
def testHeterogeneousReadsFractionLowWithOneDifference(self):
    """
    With a homogeneity cutoff fraction of 0.6, heterogeneousSites must
    report offset 0 (and only offset 0) for four reads that split evenly
    between 'A' and 'T' at that offset and agree everywhere else.
    """
    first = Read('id', 'ACCG')
    others = [Read('id2', 'TCCG'), Read('id3', 'TCCG'),
              Read('id4', 'ACCG')]
    reads = Reads([first] + others)
    # The expected 3-tuple: base counts at the site, the ids of the reads
    # carrying each base at the site, and the list of site offsets.
    expectedCounts = {0: {'A': 2, 'T': 2}}
    expectedIds = {0: {'A': ['id', 'id4'], 'T': ['id2', 'id3']}}
    expectedOffsets = [0]
    self.assertEqual(
        (expectedCounts, expectedIds, expectedOffsets),
        heterogeneousSites(reads, len(first), 0.6))
def testExpectedAttrs(self):
    """
    A ReadsAlignments instance must expose the reads and params objects
    it was given and must default to HigherIsBetterScore as its score
    class.
    """
    reads = Reads()
    params = dict(application='app name')
    alignments = ReadsAlignments(reads, params)
    self.assertIs(alignments.reads, reads)
    self.assertEqual('app name', alignments.params['application'])
    self.assertIs(params, alignments.params)
    self.assertIs(HigherIsBetterScore, alignments.scoreClass)
def testTwoByThreeWithRepeatedQueryAndSubjectIds(self):
    """
    Calling affinityMatrix with two reads against three subjects must
    produce a 2x3 matrix, and repeated query and subject ids must not
    cause a problem (as they would if affinityMatrix were called with
    returnDict=True).
    """
    sequence = 'FRRRFRRRFAAAFRRRFRRRF'

    queries = Reads()
    for queryId in ('id1', 'id1'):
        queries.add(AARead(queryId, sequence))

    subjects = Reads()
    for subjectId in ('id2', 'id3', 'id3'):
        subjects.add(AARead(subjectId, sequence))

    matrix = affinityMatrix(queries, landmarks=['AlphaHelix'],
                            subjects=subjects, computeDiagonal=True)
    # Two rows (queries) by three columns (subjects), all 1.0.
    expected = [[1.0, 1.0, 1.0] for _ in range(2)]
    self.assertEqual(expected, matrix)
def testPopulationNotAllowed(self):
    """
    Passing a subjects keyword must result in a ValueError if database
    population has not been enabled.
    """
    subjects = Reads()
    specifier = DatabaseSpecifier(allowPopulation=False)
    # Raw string with an escaped final dot: an unescaped '.' would match
    # any trailing character, making the message check looser than
    # intended.
    error = r'^Database population is not enabled\.$'
    six.assertRaisesRegex(self, ValueError, error,
                          specifier.getDatabaseFromKeywords,
                          subjects=subjects)
def testIdenticalMatrixIsReturnedOnRepeatedRequest(self):
    """
    Looking up the same parameter set name twice on an AffinityMatrices
    instance must yield the very same matrix object both times.
    """
    testParams = dict(dbParams=DatabaseParameters(),
                      findParams=FindParameters())
    matrices = AffinityMatrices(Reads(),
                                parameterSets={'test': testParams},
                                returnDict=True)
    first = matrices['test']
    second = matrices['test']
    self.assertIs(first, second)
def testPopulationFromInMemoryAndFastaFile(self):
    """
    When both the subjects and databaseFasta keywords are given, the
    returned database must contain the in-memory subjects as well as
    those read from the FASTA file.
    """
    subject1 = AARead('id1', 'FFF')
    subject2 = AARead('id2', 'RRR')
    subjects = Reads()
    for subject in (subject1, subject2):
        subjects.add(subject)

    fasta = '>id3\nFFFF\n>id4\nRRRR'
    opener = mockOpen(read_data=fasta)
    with patch.object(builtins, 'open', opener):
        db = DatabaseSpecifier().getDatabaseFromKeywords(
            subjects=subjects, databaseFasta='file.fasta')
        found = {dbSubject.read for dbSubject in db.getSubjects()}
        expected = {subject1, subject2,
                    AARead('id3', 'FFFF'), AARead('id4', 'RRRR')}
        self.assertEqual(expected, found)