def testNoFilesToStr(self):
    """
    A protein grouper that has been given no files must produce the
    expected plain-text summary.
    """
    grouper = ProteinGrouper()
    expected = '0 viruses found in 0 samples\n'
    self.assertEqual(expected, grouper.toStr())
def testOneLineInOneFileWithDifferentAssetDir(self):
    """
    The assetDir passed to a protein grouper must be used as the outDir
    and as the directory of the plot and reads file names it records.
    """
    proteinName = 'gi|327|X|I44.6 ubiquitin'
    expectedProtein = {
        'bestScore': 48.1,
        'bluePlotFilename': 'differentname/0.png',
        'coverage': 0.77,
        'readsFilename': 'differentname/0.fasta',
        'hspCount': 6,
        'index': 0,
        'medianScore': 46.6,
        'outDir': 'differentname',
        'proteinLength': 74,
        'proteinName': proteinName,
        'proteinURL': 'http://www.ncbi.nlm.nih.gov/nuccore/I44',
        'readCount': 5,
    }
    expected = {
        'Lausannevirus': {
            'sample-filename': {
                'proteins': {proteinName: expectedProtein},
                'uniqueReadCount': None,
            },
        },
    }

    sampleData = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
    grouper = ProteinGrouper(assetDir='differentname')
    grouper.addFile('sample-filename', sampleData)

    self.assertEqual(expected, grouper.pathogenNames)
def testOneLineInOneFileFASTQ(self):
    """
    With format_='fastq', a protein grouper given one file holding one
    line must record a reads file name with a .fastq suffix in its
    pathogenNames dict.
    """
    proteinName = 'gi|327|X|I44.6 ubiquitin'
    expectedProtein = {
        'bestScore': 48.1,
        'bluePlotFilename': 'out/0.png',
        'coverage': 0.77,
        'readsFilename': 'out/0.fastq',
        'hspCount': 6,
        'index': 0,
        'medianScore': 46.6,
        'outDir': 'out',
        'proteinLength': 74,
        'proteinName': proteinName,
        'proteinURL': 'http://www.ncbi.nlm.nih.gov/nuccore/I44',
        'readCount': 5,
    }
    expected = {
        'Lausannevirus': {
            'sample-filename': {
                'proteins': {proteinName: expectedProtein},
                'uniqueReadCount': None,
            },
        },
    }

    sampleData = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
    grouper = ProteinGrouper(format_='fastq')
    grouper.addFile('sample-filename', sampleData)

    self.assertEqual(expected, grouper.pathogenNames)
def testOneLineInOneFile(self):
    """
    A protein grouper given one file holding one line must have the
    expected virusTitles dict.
    """
    expectedProtein = {
        'bestScore': 48.1,
        'bluePlotFilename': 'out/0.png',
        'coverage': 0.77,
        'fastaFilename': 'out/0.fasta',
        'hspCount': 6,
        'index': 0,
        'medianScore': 46.6,
        'outDir': 'out',
        'proteinLength': 74,
        'proteinTitle': 'gi|327|X|I44.6 ubiquitin',
        'proteinURL': 'http://www.ncbi.nlm.nih.gov/nuccore/I44',
        'readCount': 5,
    }
    expected = {
        'Lausannevirus': {
            'sample-filename': [expectedProtein],
        },
    }

    sampleData = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
    grouper = ProteinGrouper()
    grouper.addFile('sample-filename', sampleData)

    self.assertEqual(expected, grouper.virusTitles)
def testOneLineInEachOfTwoFilesDifferentPathogens(self):
    """
    Two files in two different directories, each holding one line for a
    different pathogen, must give the expected pathogenNames dict, with
    each outDir located under the directory of its sample file.
    """
    lausanneProtein = 'gi|327410| protein 77'
    hepatitisProtein = 'gi|327409| ubiquitin'
    expected = {
        'Lausannevirus': {
            'dir-1/sample-filename-1': {
                'proteins': {
                    lausanneProtein: {
                        'bestScore': 44.2,
                        'bluePlotFilename': 'dir-1/out/0.png',
                        'coverage': 0.63,
                        'readsFilename': 'dir-1/out/0.fasta',
                        'hspCount': 9,
                        'index': 0,
                        'medianScore': 41.3,
                        'outDir': 'dir-1/out',
                        'proteinLength': 12,
                        'proteinName': lausanneProtein,
                        'proteinURL': None,
                        'readCount': 9,
                    },
                },
                'uniqueReadCount': None,
            },
        },
        'Hepatitis B virus': {
            'dir-2/sample-filename-2': {
                'proteins': {
                    hepatitisProtein: {
                        'bestScore': 48.1,
                        'bluePlotFilename': 'dir-2/out/0.png',
                        'coverage': 0.77,
                        'readsFilename': 'dir-2/out/0.fasta',
                        'hspCount': 6,
                        'index': 0,
                        'medianScore': 46.6,
                        'outDir': 'dir-2/out',
                        'proteinLength': 74,
                        'proteinName': hepatitisProtein,
                        'proteinURL': None,
                        'readCount': 5,
                    },
                },
                'uniqueReadCount': None,
            },
        },
    }

    firstSample = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n')
    secondSample = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Hepatitis B virus]\n')
    grouper = ProteinGrouper()
    grouper.addFile('dir-1/sample-filename-1', firstSample)
    grouper.addFile('dir-2/sample-filename-2', secondSample)

    self.assertEqual(expected, grouper.pathogenNames)
def testOneLineInEachOfTwoFilesSamePathogen(self):
    """
    Two files, each holding one line for the same pathogen, must give a
    pathogenNames dict with both samples listed under that pathogen.
    """
    firstProtein = 'gi|327410| protein 77'
    secondProtein = 'gi|327409| ubiquitin'
    expected = {
        'Lausannevirus': {
            'sample-filename-1': {
                'proteins': {
                    firstProtein: {
                        'bestScore': 44.2,
                        'bluePlotFilename': 'out/0.png',
                        'coverage': 0.63,
                        'readsFilename': 'out/0.fasta',
                        'hspCount': 9,
                        'index': 0,
                        'medianScore': 41.3,
                        'outDir': 'out',
                        'proteinLength': 12,
                        'proteinName': firstProtein,
                        'proteinURL': None,
                        'readCount': 9,
                    },
                },
                'uniqueReadCount': None,
            },
            'sample-filename-2': {
                'proteins': {
                    secondProtein: {
                        'bestScore': 48.1,
                        'bluePlotFilename': 'out/0.png',
                        'coverage': 0.77,
                        'readsFilename': 'out/0.fasta',
                        'hspCount': 6,
                        'index': 0,
                        'medianScore': 46.6,
                        'outDir': 'out',
                        'proteinLength': 74,
                        'proteinName': secondProtein,
                        'proteinURL': None,
                        'readCount': 5,
                    },
                },
                'uniqueReadCount': None,
            },
        },
    }

    firstSample = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n')
    secondSample = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')
    grouper = ProteinGrouper()
    grouper.addFile('sample-filename-1', firstSample)
    grouper.addFile('sample-filename-2', secondSample)

    self.assertEqual(expected, grouper.pathogenNames)
def testNoFilesToStr(self):
    """
    A protein grouper that has been given no files must produce the
    expected plain-text summary.
    """
    grouper = ProteinGrouper()
    expected = (
        'Overall, proteins from 0 pathogens were found in 0 samples.\n')
    self.assertEqual(expected, grouper.toStr())
def testTwoLinesInOneFileDifferentPathogens(self):
    """
    One file holding two lines, each for a different pathogen, must give
    the expected pathogenNames dict, with the proteins indexed in the
    order their lines appear.
    """
    lausanneProtein = 'gi|327410| protein 77'
    hepatitisProtein = 'gi|327409| ubiquitin'
    expected = {
        'Lausannevirus': {
            'sample-filename': {
                'proteins': {
                    lausanneProtein: {
                        'bestScore': 44.2,
                        'bluePlotFilename': 'out/0.png',
                        'coverage': 0.63,
                        'readsFilename': 'out/0.fasta',
                        'hspCount': 9,
                        'index': 0,
                        'medianScore': 41.3,
                        'outDir': 'out',
                        'proteinLength': 12,
                        'proteinName': lausanneProtein,
                        'proteinURL': None,
                        'readCount': 9,
                    },
                },
                'uniqueReadCount': None,
            },
        },
        'Hepatitis B virus': {
            'sample-filename': {
                'proteins': {
                    hepatitisProtein: {
                        'bestScore': 48.1,
                        'bluePlotFilename': 'out/1.png',
                        'coverage': 0.77,
                        'readsFilename': 'out/1.fasta',
                        'hspCount': 6,
                        'index': 1,
                        'medianScore': 46.6,
                        'outDir': 'out',
                        'proteinLength': 74,
                        'proteinName': hepatitisProtein,
                        'proteinURL': None,
                        'readCount': 5,
                    },
                },
                'uniqueReadCount': None,
            },
        },
    }

    sampleData = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Hepatitis B virus]\n')
    grouper = ProteinGrouper()
    grouper.addFile('sample-filename', sampleData)

    self.assertEqual(expected, grouper.pathogenNames)
def testOneLineInOneFileTitle(self):
    """
    After one file holding one line has been added, the _title method of
    a protein grouper must return the expected string.
    """
    sampleData = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
    grouper = ProteinGrouper()
    grouper.addFile('sample-filename', sampleData)

    expected = '1 virus found in 1 sample'
    self.assertEqual(expected, grouper._title())
def testNoFilesToHTML(self):
    """
    A protein grouper that has been given no files must produce the
    expected HTML document.
    """
    expectedLines = [
        '<html>',
        '<head>',
        '<title>',
        '0 viruses found in 0 samples',
        '</title>',
        '</head>',
        '<body>',
        '<style>',
        ' body {',
        ' margin-left: 2%;',
        ' margin-right: 2%;',
        ' }',
        ' .sample {',
        ' margin-bottom: 2px;',
        ' }',
        ' .sample-name {',
        ' color: red;',
        ' }',
        ' .index {',
        ' font-size: small;',
        ' }',
        ' .protein-title {',
        ' font-family: "Courier New", Courier, monospace;',
        ' }',
        ' .stats {',
        ' font-family: "Courier New", Courier, monospace;',
        ' white-space: pre;',
        ' }',
        ' .protein-list {',
        ' margin-top: 2px;',
        ' }',
        '</style>',
        '</head>',
        '<body>',
        '<h1>0 viruses found in 0 samples</h1>',
        '<h2>Virus index</h2>',
        '</p>',
        '<h2>Sample index</h2>',
        '</p>',
        '<h1>Viruses by sample</h1>',
        '<h1>Samples by virus</h1>',
        '</body>',
        '</html>',
    ]

    grouper = ProteinGrouper()
    self.assertEqual('\n'.join(expectedLines), grouper.toHTML())
def testTwoLinesInOneFileDifferentPathogens(self):
    """
    One file holding two lines, each for a different pathogen, must give
    the expected pathogenNames dict, with the proteins indexed in the
    order their lines appear.
    """
    def proteinEntry(name, best, median, coverage, hsps, index, length,
                     reads):
        # Build the per-protein dict expected for the default 'out'
        # asset directory.
        return {
            'bestScore': best,
            'bluePlotFilename': 'out/%d.png' % index,
            'coverage': coverage,
            'readsFilename': 'out/%d.fasta' % index,
            'hspCount': hsps,
            'index': index,
            'medianScore': median,
            'outDir': 'out',
            'proteinLength': length,
            'proteinName': name,
            'proteinURL': None,
            'readCount': reads,
        }

    lausanneProtein = 'gi|327410| protein 77'
    hepatitisProtein = 'gi|327409| ubiquitin'
    expected = {
        'Lausannevirus': {
            'sample-filename': {
                'proteins': {
                    lausanneProtein: proteinEntry(
                        lausanneProtein, 44.2, 41.3, 0.63, 9, 0, 12, 9),
                },
                'uniqueReadCount': None,
            },
        },
        'Hepatitis B virus': {
            'sample-filename': {
                'proteins': {
                    hepatitisProtein: proteinEntry(
                        hepatitisProtein, 48.1, 46.6, 0.77, 6, 1, 74, 5),
                },
                'uniqueReadCount': None,
            },
        },
    }

    sampleData = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Hepatitis B virus]\n')
    grouper = ProteinGrouper()
    grouper.addFile('sample-filename', sampleData)

    self.assertEqual(expected, grouper.pathogenNames)
def testOneLineInEachOfTwoFilesDifferentViruses(self):
    """
    Two files in two different directories, each holding one line for a
    different virus, must give the expected virusTitles dict, with each
    outDir located under the directory of its sample file.
    """
    lausanneProtein = {
        'bestScore': 44.2,
        'bluePlotFilename': 'dir-1/out/0.png',
        'coverage': 0.63,
        'fastaFilename': 'dir-1/out/0.fasta',
        'hspCount': 9,
        'index': 0,
        'medianScore': 41.3,
        'outDir': 'dir-1/out',
        'proteinLength': 12,
        'proteinTitle': 'gi|327410| protein 77',
        'proteinURL': None,
        'readCount': 9,
    }
    hepatitisProtein = {
        'bestScore': 48.1,
        'bluePlotFilename': 'dir-2/out/0.png',
        'coverage': 0.77,
        'fastaFilename': 'dir-2/out/0.fasta',
        'hspCount': 6,
        'index': 0,
        'medianScore': 46.6,
        'outDir': 'dir-2/out',
        'proteinLength': 74,
        'proteinTitle': 'gi|327409| ubiquitin',
        'proteinURL': None,
        'readCount': 5,
    }
    expected = {
        'Lausannevirus': {
            'dir-1/sample-filename-1': [lausanneProtein],
        },
        'Hepatitis B virus': {
            'dir-2/sample-filename-2': [hepatitisProtein],
        },
    }

    firstSample = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n')
    secondSample = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Hepatitis B virus]\n')
    grouper = ProteinGrouper()
    grouper.addFile('dir-1/sample-filename-1', firstSample)
    grouper.addFile('dir-2/sample-filename-2', secondSample)

    self.assertEqual(expected, grouper.virusTitles)
def testTwoLinesInOneFileTitle(self):
    """
    After one file holding two protein lines, each for a different
    virus, has been added, the _title method of a protein grouper must
    return the expected string.
    """
    sampleLines = (
        '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n',
        '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [X Virus]\n',
    )
    grouper = ProteinGrouper()
    grouper.addFile('sample-filename', StringIO(''.join(sampleLines)))

    expected = '2 viruses found in 1 sample'
    self.assertEqual(expected, grouper._title())
def testOneLineInEachOfTwoFilesDifferentVirusesTitle(self):
    """
    After two files, each holding one line for a different virus, have
    been added, the _title method of a protein grouper must return the
    expected string.
    """
    firstSample = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n')
    secondSample = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [HBV]\n')

    grouper = ProteinGrouper()
    grouper.addFile('sample-filename-1', firstSample)
    grouper.addFile('sample-filename-2', secondSample)

    expected = '2 viruses found in 2 samples'
    self.assertEqual(expected, grouper._title())
def testOpenNotCalledOnRepeatedCall(self):
    """
    A second call to pathogenSampleFiles.add with the same arguments
    must not open any file, because the result of the first call is
    cached.
    """
    class Open(object):
        """
        Replacement for the builtin open that checks which file names
        are opened, in which order, and fails on a third call.
        """
        def __init__(self, test, manager):
            self.test = test
            self.manager = manager
            self.count = 0

        def sideEffect(self, filename, *args, **kwargs):
            if self.count > 1:
                self.test.fail(
                    'We are only supposed to be called twice. '
                    'Filename: %r, Args: %r, Keyword args: %r.' %
                    (filename, args, kwargs))

            firstCall = self.count == 0
            if firstCall:
                expectedName = 'out/0.fasta'
            else:
                expectedName = 'out/pathogen-0-sample-0.fasta'
            self.test.assertEqual(expectedName, filename)
            # Only count the call once the file name has been checked.
            self.count += 1
            if firstCall:
                return File(['>id1\n', 'ACTG\n'])
            return self.manager

    outputFasta = StringIO()

    @contextmanager
    def manager():
        yield outputFasta

    sampleData = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n')
    grouper = ProteinGrouper()
    grouper.addFile('filename-1', sampleData)
    sampleFiles = PathogenSampleFiles(grouper)
    opener = Open(self, manager())
    expectedFilename = 'out/pathogen-0-sample-0.fasta'

    with patch.object(builtins, 'open') as mockOpen:
        mockOpen.side_effect = opener.sideEffect
        firstResult = sampleFiles.add('Lausannevirus', 'filename-1')
        self.assertEqual(expectedFilename, firstResult)
        self.assertEqual('>id1\nACTG\n', outputFasta.getvalue())

        # Repeated call. The mocked open fails the test if it is
        # called again at this point.
        secondResult = sampleFiles.add('Lausannevirus', 'filename-1')
        self.assertEqual(expectedFilename, secondResult)
def testIdenticalReadsRemoved(self):
    """
    When two proteins of the same pathogen are matched by the same read,
    the de-duplicated FASTA written for the pathogen must contain that
    read only once.
    """
    class Open(object):
        """
        Replacement for the builtin open that serves canned content and
        tracks which of the expected file names are still unopened.
        """
        def __init__(self, test, manager):
            self.test = test
            self.manager = manager
            self.expectedFilenames = {
                'out/0.fasta',
                'out/1.fasta',
                'out/pathogen-0-sample-0.fasta',
            }

        def sideEffect(self, filename, *args, **kwargs):
            if filename not in self.expectedFilenames:
                self.test.fail(
                    'Open called with unexpected filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))

            # Each expected file may be opened once only.
            self.expectedFilenames.remove(filename)
            if filename == 'out/0.fasta':
                return File(['>id1\n', 'ACTG\n'])
            if filename == 'out/1.fasta':
                return File(['>id1\n', 'ACTG\n', '>id2\n', 'CAGT\n'])
            return self.manager

    outputFasta = StringIO()

    @contextmanager
    def manager():
        yield outputFasta

    sampleData = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')
    grouper = ProteinGrouper()
    grouper.addFile('filename-1', sampleData)
    sampleFiles = PathogenSampleFiles(grouper)
    opener = Open(self, manager())

    with patch.object(builtins, 'open') as mockOpen:
        mockOpen.side_effect = opener.sideEffect
        result = sampleFiles.add('Lausannevirus', 'filename-1')
        self.assertEqual('out/pathogen-0-sample-0.fasta', result)
        self.assertEqual('>id1\nACTG\n>id2\nCAGT\n',
                         outputFasta.getvalue())

    # Every expected file name must have been passed to the mocked open.
    self.assertEqual(set(), opener.expectedFilenames)
def testDuplicatePathogenProteinSample(self):
    """
    Adding the same pathogen/protein/sample combination to a protein
    grouper a second time must raise a ValueError.
    """
    fp = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
    pg = ProteinGrouper()
    pg.addFile('sample', fp)
    # Rewind so the very same line is read again by the second addFile.
    fp.seek(0)
    # Raw strings are used so the regex escapes (\| and \.) are passed
    # through unchanged instead of being invalid string escape sequences.
    error = (r"^Protein 'gi\|327\|X\|I44.6 ubiquitin' already seen for "
             r"pathogen 'Lausannevirus' sample 'sample'\.$")
    assertRaisesRegex(self, ValueError, error, pg.addFile, 'sample', fp)
def testDuplicatePathogenProteinSample(self):
    """
    Adding the same pathogen/protein/sample combination to a protein
    grouper a second time must raise a ValueError.
    """
    sampleData = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
    grouper = ProteinGrouper()
    grouper.addFile('sample', sampleData)

    # Rewind so the very same line is read again by the second addFile.
    sampleData.seek(0)
    expectedError = ''.join((
        r"^Protein 'gi\|327\|X\|I44.6 ubiquitin' already seen for ",
        r"pathogen 'Lausannevirus' sample 'sample'\.$",
    ))
    assertRaisesRegex(self, ValueError, expectedError, grouper.addFile,
                      'sample', sampleData)
def testTwoLinesInOneFileTitle(self):
    """
    After one file holding two protein lines, each for a different
    virus, has been added, the _title method of a protein grouper must
    return the expected string.
    """
    grouper = ProteinGrouper()
    sampleData = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n'
        '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [X Virus]\n')
    grouper.addFile('sample-filename', sampleData)

    title = grouper._title()
    self.assertEqual('2 viruses found in 1 sample', title)
def testOpenNotCalledOnRepeatedCall(self):
    """
    A second call to pathogenSampleFiles.add with the same arguments
    must not open any file, because the result of the first call is
    cached.
    """
    class Open(object):
        """
        Replacement for the builtin open that checks which file names
        are opened, in which order, and fails on a third call.
        """
        def __init__(self, test, manager):
            self.test = test
            self.manager = manager
            self.count = 0

        def sideEffect(self, filename, *args, **kwargs):
            callIndex = self.count
            if callIndex == 0:
                # First call: the reads file of the protein.
                self.test.assertEqual('out/0.fasta', filename)
                self.count = callIndex + 1
                return File(['>id1\n', 'ACTG\n'])

            if callIndex == 1:
                # Second call: the per-pathogen, per-sample output file.
                self.test.assertEqual('out/pathogen-0-sample-0.fasta',
                                      filename)
                self.count = callIndex + 1
                return self.manager

            self.test.fail(
                'We are only supposed to be called twice. '
                'Filename: %r, Args: %r, Keyword args: %r.' %
                (filename, args, kwargs))

    writtenFasta = StringIO()

    @contextmanager
    def manager():
        yield writtenFasta

    grouper = ProteinGrouper()
    grouper.addFile('filename-1', StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'))
    sampleFiles = PathogenSampleFiles(grouper)
    openSideEffect = Open(self, manager()).sideEffect

    with patch.object(builtins, 'open') as mockOpen:
        mockOpen.side_effect = openSideEffect
        result = sampleFiles.add('Lausannevirus', 'filename-1')
        self.assertEqual('out/pathogen-0-sample-0.fasta', result)
        self.assertEqual('>id1\nACTG\n', writtenFasta.getvalue())

        # Repeated call. The mocked open fails the test if it is
        # called again at this point.
        result = sampleFiles.add('Lausannevirus', 'filename-1')
        self.assertEqual('out/pathogen-0-sample-0.fasta', result)
def testIdenticalReadsRemoved(self):
    """
    When two proteins of the same pathogen are matched by the same read,
    the de-duplicated FASTA written for the pathogen must contain that
    read only once.
    """
    class Open(object):
        """
        Replacement for the builtin open that serves canned content and
        tracks which of the expected file names are still unopened.
        """
        def __init__(self, test, manager):
            self.test = test
            self.manager = manager
            self.expectedFilenames = set((
                'out/0.fasta',
                'out/1.fasta',
                'out/pathogen-0-sample-0.fasta',
            ))

        def sideEffect(self, filename, *args, **kwargs):
            remaining = self.expectedFilenames
            if filename not in remaining:
                self.test.fail(
                    'Open called with unexpected filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))

            # Each expected file may be opened once only.
            remaining.remove(filename)
            if filename == 'out/0.fasta':
                result = File(['>id1\n', 'ACTG\n'])
            elif filename == 'out/1.fasta':
                result = File(['>id1\n', 'ACTG\n', '>id2\n', 'CAGT\n'])
            else:
                result = self.manager
            return result

    writtenFasta = StringIO()

    @contextmanager
    def manager():
        yield writtenFasta

    grouper = ProteinGrouper()
    grouper.addFile('filename-1', StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n'))
    sampleFiles = PathogenSampleFiles(grouper)
    opener = Open(self, manager())

    with patch.object(builtins, 'open') as mockOpen:
        mockOpen.side_effect = opener.sideEffect
        outputName = sampleFiles.add('Lausannevirus', 'filename-1')
        self.assertEqual('out/pathogen-0-sample-0.fasta', outputName)
        self.assertEqual('>id1\nACTG\n>id2\nCAGT\n',
                         writtenFasta.getvalue())

    # Every expected file name must have been passed to the mocked open.
    self.assertEqual(set(), opener.expectedFilenames)
def testReadLengthsAdded(self):
    """
    A ProteinGrouper created with saveReadLengths=True must store the
    lengths of the reads of each protein.
    """
    class Open(object):
        """
        Replacement for the builtin open that serves canned content for
        the expected file names and fails the test for any other name.
        """
        def __init__(self, test, manager):
            self.test = test
            self.manager = manager
            self.expectedFilenames = {
                'out/0.fasta',
                'out/1.fasta',
                'out/pathogen-0-sample-0.fasta',
            }

        def sideEffect(self, filename, *args, **kwargs):
            if filename not in self.expectedFilenames:
                self.test.fail(
                    'Open called with unexpected filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            elif filename == 'out/0.fasta':
                return File(['>id1\n', 'ACTG\n'])
            elif filename == 'out/1.fasta':
                return File(['>id2\n', 'AC\n', '>id3\n', 'CAGTTTT\n'])
            else:
                return self.manager

    sampleData = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')
    outputFasta = StringIO()

    @contextmanager
    def manager():
        yield outputFasta

    opener = Open(self, manager())
    with patch.object(builtins, 'open') as mockOpen:
        mockOpen.side_effect = opener.sideEffect
        grouper = ProteinGrouper(saveReadLengths=True)
        grouper.addFile('filename-1', sampleData)
        sampleFiles = PathogenSampleFiles(grouper)
        sampleFiles.add('Lausannevirus', 'filename-1')

    # Read lengths must be saved correctly.
    sampleInfo = grouper.pathogenNames['Lausannevirus']['filename-1']
    proteins = sampleInfo['proteins']
    self.assertEqual((4, ),
                     proteins['gi|327410| protein 77']['readLengths'])
    self.assertEqual((2, 7),
                     proteins['gi|327409| ubiquitin']['readLengths'])
def testReadLengthsAdded(self):
    """
    A ProteinGrouper created with saveReadLengths=True must store the
    lengths of the reads of each protein.
    """
    class Open(object):
        """
        Replacement for the builtin open that serves canned content for
        the expected file names and fails the test for any other name.
        """
        def __init__(self, test, manager):
            self.test = test
            self.manager = manager
            self.expectedFilenames = set((
                'out/0.fasta',
                'out/1.fasta',
                'out/pathogen-0-sample-0.fasta',
            ))

        def sideEffect(self, filename, *args, **kwargs):
            known = filename in self.expectedFilenames
            if known and filename == 'out/0.fasta':
                return File(['>id1\n', 'ACTG\n'])
            if known and filename == 'out/1.fasta':
                return File(['>id2\n', 'AC\n', '>id3\n', 'CAGTTTT\n'])
            if known:
                return self.manager
            self.test.fail(
                'Open called with unexpected filename: %r, Args: %r, '
                'Keyword args: %r.' % (filename, args, kwargs))

    writtenFasta = StringIO()

    @contextmanager
    def manager():
        yield writtenFasta

    sampleData = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')
    opener = Open(self, manager())

    with patch.object(builtins, 'open') as mockOpen:
        mockOpen.side_effect = opener.sideEffect
        grouper = ProteinGrouper(saveReadLengths=True)
        grouper.addFile('filename-1', sampleData)
        sampleFiles = PathogenSampleFiles(grouper)
        sampleFiles.add('Lausannevirus', 'filename-1')

    # Read lengths must be saved correctly.
    expectedLengths = (
        ('gi|327410| protein 77', (4,)),
        ('gi|327409| ubiquitin', (2, 7)),
    )
    proteins = grouper.pathogenNames['Lausannevirus']['filename-1'][
        'proteins']
    for proteinName, lengths in expectedLengths:
        self.assertEqual(lengths, proteins[proteinName]['readLengths'])
def testMaxProteinFraction(self):
    """
    The maxProteinFraction method must return the expected value for
    each pathogen.
    """
    class SideEffect(object):
        """
        Replacement for the builtin open that serves the protein FASTA
        file once and fails the test on any further call.
        """
        def __init__(self, test):
            self.test = test
            self.count = 0

        def sideEffect(self, filename, **kwargs):
            if self.count != 0:
                self.test.fail('We are only supposed to be called once!')

            self.test.assertEqual('proteins.fasta', filename)
            self.count += 1
            return File([
                '>protein 1 [pathogen 1]\n', 'ACTG\n',
                '>protein 2 [pathogen 1]\n', 'AA\n',
                '>protein 3 [pathogen 1]\n', 'AA\n',
                '>protein 4 [pathogen 1]\n', 'AA\n',
                '>no pathogen name here\n', 'AA\n',
                '>protein 5 [pathogen 2]\n', 'AA\n',
            ])

    # (sample name, file content) pairs, added in this order.
    samples = (
        ('sample-1',
         '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein 1 [pathogen 1]\n'
         '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein 5 [pathogen 2]\n'),
        ('sample-1',
         '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein 2 [pathogen 1]\n'
         '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein 3 [pathogen 1]\n'),
        ('sample-2',
         '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein 1 [pathogen 1]\n'
         '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein 2 [pathogen 1]\n'),
    )

    opener = SideEffect(self)
    with patch.object(builtins, 'open') as mockOpen:
        mockOpen.side_effect = opener.sideEffect
        grouper = ProteinGrouper(proteinFastaFilenames=['proteins.fasta'])
        self.assertEqual(1, opener.count)

        for sampleName, content in samples:
            grouper.addFile(sampleName, StringIO(content))

        self.assertEqual(0.75, grouper.maxProteinFraction('pathogen 1'))
        self.assertEqual(1.0, grouper.maxProteinFraction('pathogen 2'))
def testOneLineInEachOfTwoFilesDifferentVirusesTitle(self):
    """
    After two files, each holding one line for a different virus, have
    been added, the _title method of a protein grouper must return the
    expected string.
    """
    samples = (
        ('sample-filename-1',
         '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'),
        ('sample-filename-2',
         '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [HBV]\n'),
    )
    streams = [(name, StringIO(content)) for name, content in samples]

    grouper = ProteinGrouper()
    for name, stream in streams:
        grouper.addFile(name, stream)

    self.assertEqual('2 viruses found in 2 samples', grouper._title())
def testOneLineInEachOfTwoFilesSamePathogenTitle(self):
    """
    After two files, each holding one line for the same pathogen, have
    been added, the _title method of a protein grouper must return the
    expected string.
    """
    firstSample = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n')
    secondSample = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')

    grouper = ProteinGrouper()
    grouper.addFile('sample-filename-1', firstSample)
    grouper.addFile('sample-filename-2', secondSample)

    expected = 'Overall, proteins from 1 pathogen were found in 2 samples.'
    self.assertEqual(expected, grouper._title())
def testOneLineInOneFileToStr(self):
    """
    After one file holding one line has been added, the toStr method of
    a protein grouper must produce the expected text.
    """
    expectedLines = [
        '1 virus found in 1 sample\n',
        '\n',
        'HBV (in 1 sample)\n',
        ' sample-filename (1 protein, 5 reads)\n',
        ' 0.77\t46.60\t48.10\t 5\t 6\t 0\tgi|32|X|I4 protein X\n',
    ]

    sampleData = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein X [HBV]\n')
    grouper = ProteinGrouper()
    grouper.addFile('sample-filename', sampleData)

    self.assertEqual(''.join(expectedLines), grouper.toStr())
def testAssetDir(self):
    """
    When an asset directory is passed to a protein grouper, its
    _assetDir attribute must hold that value.
    """
    assetDir = 'xxx'
    grouper = ProteinGrouper(assetDir=assetDir)
    self.assertEqual('xxx', grouper._assetDir)
def testOneLineInOneFileToStr(self):
    """
    After one file holding one line has been added, the toStr method of
    a protein grouper must produce the expected text.
    """
    grouper = ProteinGrouper()
    grouper.addFile(
        'sample-filename',
        StringIO('0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein X [HBV]\n'))

    expected = (
        '1 virus found in 1 sample\n'
        '\n'
        'HBV (in 1 sample)\n'
        ' sample-filename (1 protein, 5 reads)\n'
        ' 0.77\t46.60\t48.10\t 5\t 6\t 0\tgi|32|X|I4 protein X\n'
    )
    actual = grouper.toStr()
    self.assertEqual(expected, actual)
def testNoAssetDir(self):
    """
    When no asset directory is passed to a protein grouper, its
    _assetDir attribute must be the default ('out').
    """
    grouper = ProteinGrouper()
    default = 'out'
    self.assertEqual(default, grouper._assetDir)
def testNoRegex(self):
    """
    When no regex is passed to a protein grouper, its _sampleNameRegex
    attribute must be None.
    """
    pg = ProteinGrouper()
    # Identity check: unlike assertEqual(None, ...), this cannot be
    # satisfied by an object whose __eq__ happens to match None.
    self.assertIsNone(pg._sampleNameRegex)
def testTwoLinesInOneFileSameVirus(self):
    """
    One file holding two lines for the same virus must give a
    virusTitles dict that lists both proteins, in line order, under the
    one sample.
    """
    firstProtein = {
        'bestScore': 44.2,
        'bluePlotFilename': 'out/0.png',
        'coverage': 0.63,
        'fastaFilename': 'out/0.fasta',
        'hspCount': 9,
        'index': 0,
        'medianScore': 41.3,
        'outDir': 'out',
        'proteinLength': 12,
        'proteinTitle': 'gi|327410| protein 77',
        'proteinURL': None,
        'readCount': 9,
    }
    secondProtein = {
        'bestScore': 48.1,
        'bluePlotFilename': 'out/1.png',
        'coverage': 0.77,
        'fastaFilename': 'out/1.fasta',
        'hspCount': 6,
        'index': 1,
        'medianScore': 46.6,
        'outDir': 'out',
        'proteinLength': 74,
        'proteinTitle': 'gi|327409| ubiquitin',
        'proteinURL': None,
        'readCount': 5,
    }
    expected = {
        'Lausannevirus': {
            'sample-filename': [firstProtein, secondProtein],
        },
    }

    sampleData = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')
    grouper = ProteinGrouper()
    grouper.addFile('sample-filename', sampleData)

    self.assertEqual(expected, grouper.virusTitles)
def testOneLineInEachOfTwoFilesSameVirus(self):
    """
    Two files, each holding one line for the same virus, must give a
    virusTitles dict with both samples listed under that virus.
    """
    firstProtein = {
        'bestScore': 44.2,
        'bluePlotFilename': 'out/0.png',
        'coverage': 0.63,
        'fastaFilename': 'out/0.fasta',
        'hspCount': 9,
        'index': 0,
        'medianScore': 41.3,
        'proteinLength': 12,
        'proteinTitle': 'gi|327410| protein 77',
        'proteinURL': None,
        'readCount': 9,
    }
    secondProtein = {
        'bestScore': 48.1,
        'bluePlotFilename': 'out/0.png',
        'coverage': 0.77,
        'fastaFilename': 'out/0.fasta',
        'hspCount': 6,
        'index': 0,
        'medianScore': 46.6,
        'proteinLength': 74,
        'proteinTitle': 'gi|327409| ubiquitin',
        'proteinURL': None,
        'readCount': 5,
    }
    expected = {
        'Lausannevirus': {
            'sample-filename-1': [firstProtein],
            'sample-filename-2': [secondProtein],
        },
    }

    firstSample = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n')
    secondSample = StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')
    grouper = ProteinGrouper()
    grouper.addFile('sample-filename-1', firstSample)
    grouper.addFile('sample-filename-2', secondSample)

    self.assertEqual(expected, grouper.virusTitles)
def testOneLineInEachOfTwoFilesSamePathogenTitle(self):
    """
    After two files, each holding one line for the same pathogen, have
    been added, the _title method of a protein grouper must return the
    expected string.
    """
    samples = (
        ('sample-filename-1',
         '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'),
        ('sample-filename-2',
         '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n'),
    )
    streams = [(name, StringIO(content)) for name, content in samples]

    grouper = ProteinGrouper()
    for name, stream in streams:
        grouper.addFile(name, stream)

    self.assertEqual(
        'Overall, proteins from 1 pathogen were found in 2 samples.',
        grouper._title())
def testNoFiles(self):
    """
    A protein grouper that has been given no files must have empty
    virusTitles and sampleNames attributes.
    """
    grouper = ProteinGrouper()
    empty = {}
    self.assertEqual(empty, grouper.virusTitles)
    self.assertEqual(empty, grouper.sampleNames)
def testOneLineInOneFileWithDifferentAssetDir(self):
    """
    The assetDir passed to a protein grouper must be used as the outDir
    and as the directory of the plot and reads file names it records.
    """
    assetDir = 'differentname'
    proteinName = 'gi|327|X|I44.6 ubiquitin'

    grouper = ProteinGrouper(assetDir=assetDir)
    grouper.addFile('sample-filename', StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n'))

    expectedProteins = {
        proteinName: {
            'bestScore': 48.1,
            'bluePlotFilename': 'differentname/0.png',
            'coverage': 0.77,
            'readsFilename': 'differentname/0.fasta',
            'hspCount': 6,
            'index': 0,
            'medianScore': 46.6,
            'outDir': 'differentname',
            'proteinLength': 74,
            'proteinName': proteinName,
            'proteinURL': 'http://www.ncbi.nlm.nih.gov/nuccore/I44',
            'readCount': 5,
        },
    }
    expectedSample = {
        'proteins': expectedProteins,
        'uniqueReadCount': None,
    }
    self.assertEqual(
        {'Lausannevirus': {'sample-filename': expectedSample}},
        grouper.pathogenNames)
def testOneLineInOneFileFASTQ(self):
    """
    With format_='fastq', a protein grouper given one file holding one
    line must record a reads file name with a .fastq suffix in its
    pathogenNames dict.
    """
    proteinName = 'gi|327|X|I44.6 ubiquitin'

    grouper = ProteinGrouper(format_='fastq')
    grouper.addFile('sample-filename', StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n'))

    expectedProteins = {
        proteinName: {
            'bestScore': 48.1,
            'bluePlotFilename': 'out/0.png',
            'coverage': 0.77,
            'readsFilename': 'out/0.fastq',
            'hspCount': 6,
            'index': 0,
            'medianScore': 46.6,
            'outDir': 'out',
            'proteinLength': 74,
            'proteinName': proteinName,
            'proteinURL': 'http://www.ncbi.nlm.nih.gov/nuccore/I44',
            'readCount': 5,
        },
    }
    expectedSample = {
        'proteins': expectedProteins,
        'uniqueReadCount': None,
    }
    self.assertEqual(
        {'Lausannevirus': {'sample-filename': expectedSample}},
        grouper.pathogenNames)
def testUnknownPathogenType(self):
    """
    Passing an unknown pathogen type to the toHTML method of a protein
    grouper must raise a ValueError.
    """
    grouper = ProteinGrouper()
    expectedError = ''.join((
        r"^Unrecognized pathogenType argument: 'x'\. Value must be ",
        r"either 'bacterial' or 'viral'\.$",
    ))
    assertRaisesRegex(self, ValueError, expectedError, grouper.toHTML,
                      pathogenType='x')
def testOneLineInOneFile(self):
    """
    A protein grouper given one file holding one line must have the
    expected virusTitles dict.
    """
    grouper = ProteinGrouper()
    grouper.addFile('sample-filename', StringIO(
        '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n'))

    expectedProtein = {
        'bestScore': 48.1,
        'bluePlotFilename': 'out/0.png',
        'coverage': 0.77,
        'fastaFilename': 'out/0.fasta',
        'hspCount': 6,
        'index': 0,
        'medianScore': 46.6,
        'proteinLength': 74,
        'proteinTitle': 'gi|327|X|I44.6 ubiquitin',
        'proteinURL': 'http://www.ncbi.nlm.nih.gov/nuccore/I44',
        'readCount': 5,
    }
    self.assertEqual(
        {'Lausannevirus': {'sample-filename': [expectedProtein]}},
        grouper.virusTitles)
def testUnknownFormat(self):
    """
    Creating a PathogenSampleFiles instance with a format_ argument that
    is not recognized must raise a ValueError whose message matches the
    expected pattern.
    """
    grouper = ProteinGrouper()
    expectedMessage = "^format_ must be either 'fasta' or 'fastq'\\.$"
    assertRaisesRegex(
        self, ValueError, expectedMessage, PathogenSampleFiles, grouper,
        format_='unknown')
# Command-line options: positional sample file names, an optional regex for
# shortening sample names, and a switch to select HTML output.
parser.add_argument(
    'filenames', nargs='*',
    help='Sample file names to read input from.')

parser.add_argument(
    '--sampleNameRegex', default=None,
    help=('An (optional) regular expression that can be used to extract a '
          'short sample name from full sample file name. The regular '
          'expression must have a matching group (delimited by '
          'parentheses) to capture the part of the file name that should '
          'be used as the sample name.'))

parser.add_argument(
    '--html', default=False, action='store_true',
    help='If specified, output HTML instead of plain text.')

args = parser.parse_args()

grouper = ProteinGrouper(sampleNameRegex=args.sampleNameRegex)

if args.filenames:
    filenames = args.filenames
else:
    # No file names on the command line: read them from standard input,
    # one per line. Only the trailing newline is removed (not blindly the
    # last character), so a final line that lacks a newline keeps its
    # complete file name.
    filenames = (line.rstrip('\n') for line in sys.stdin)

# Give every sample file to the grouper.
for filename in filenames:
    with open(filename) as fp:
        grouper.addFile(filename, fp)

print(grouper.toHTML() if args.html else grouper.toStr())
# Command-line options: an optional regex for shortening sample names and a
# switch to select HTML output.
parser.add_argument(
    '--sampleNameRegex', default=None,
    help=('An (optional) regular expression that can be used to extract a '
          'short sample name from full sample file name. The regular '
          'expression must have a matching group (delimited by '
          'parentheses) to capture the part of the file name that should '
          'be used as the sample name.'))

parser.add_argument(
    '--html', default=False, action='store_true',
    help='If specified, output HTML instead of plain text.')

args = parser.parse_args()

grouper = ProteinGrouper(sampleNameRegex=args.sampleNameRegex)

if args.filenames:
    filenames = args.filenames
else:
    # No file names were parsed from the command line: read them from
    # standard input, one per line. Only the trailing newline is removed
    # (not blindly the last character), so a final line that lacks a
    # newline keeps its complete file name.
    filenames = (line.rstrip('\n') for line in sys.stdin)

# Give every sample file to the grouper.
for filename in filenames:
    with open(filename) as fp:
        grouper.addFile(filename, fp)

print(grouper.toHTML() if args.html else grouper.toStr())
# action='append'. We use both because it allows people to use # (e.g.) --pff on the command line either via "--pff file1 --pff # file2" or "--pff file1 file2", or a combination of these. That # way it's not necessary to remember which way you're supposed to # use it and you also can't be hit by the subtle problem # encountered in https://github.com/acorg/dark-matter/issues/453 proteinFastaFilenames = list( chain.from_iterable(args.proteinFastaFilename)) else: proteinFastaFilenames = None grouper = ProteinGrouper(assetDir=args.assetDir, sampleName=args.sampleName, sampleNameRegex=args.sampleNameRegex, format_=args.format, proteinFastaFilenames=proteinFastaFilenames, saveReadLengths=args.showReadLengths, titleRegex=args.titleRegex, negativeTitleRegex=args.negativeTitleRegex, pathogenDataDir=args.pathogenDataDir) if args.filenames: filenames = args.filenames else: filenames = (line[:-1] for line in sys.stdin) for filename in filenames: with open(filename) as fp: grouper.addFile(filename, fp) if args.html:
def testProteinsSavedCorrectly(self):
    """
    The ProteinGrouper must hold the correct protein information for a
    given pathogen/sample combination after a PathogenSampleFiles instance
    has added that combination.
    """
    class FakeOpen(object):
        """
        Replacement for the builtin open. It hands back canned FASTA
        content for the two per-protein reads files, the given context
        manager for the pathogen/sample file, and fails the test for any
        other file name.
        """
        def __init__(self, test, manager):
            self.expectedFilenames = {
                'out/0.fasta',
                'out/1.fasta',
                'out/pathogen-0-sample-0.fasta',
            }
            self.manager = manager
            self.test = test

        def sideEffect(self, filename, *args, **kwargs):
            if filename not in self.expectedFilenames:
                return self.test.fail(
                    'Open called with unexpected filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            if filename == 'out/0.fasta':
                return File(['>id1\n', 'ACTG\n'])
            if filename == 'out/1.fasta':
                return File(['>id2\n', 'AC\n', '>id3\n', 'CAGTTTT\n'])
            return self.manager

    sampleData = StringIO(
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')
    fastaIO = StringIO()

    @contextmanager
    def manager():
        yield fastaIO

    expectedUbiquitin = {
        'bestScore': 48.1,
        'bluePlotFilename': 'out/1.png',
        'coverage': 0.77,
        'hspCount': 6,
        'index': 1,
        'medianScore': 46.6,
        'outDir': 'out',
        'proteinLength': 74,
        'proteinName': 'gi|327409| ubiquitin',
        'proteinURL': None,
        'readCount': 5,
        'readsFilename': 'out/1.fasta',
    }
    expectedProtein77 = {
        'bestScore': 44.2,
        'bluePlotFilename': 'out/0.png',
        'coverage': 0.63,
        'hspCount': 9,
        'index': 0,
        'medianScore': 41.3,
        'outDir': 'out',
        'proteinLength': 12,
        'proteinName': 'gi|327410| protein 77',
        'proteinURL': None,
        'readCount': 9,
        'readsFilename': 'out/0.fasta',
    }
    expected = {
        'proteins': {
            'gi|327409| ubiquitin': expectedUbiquitin,
            'gi|327410| protein 77': expectedProtein77,
        },
        'uniqueReadCount': 3,
    }

    fakeOpen = FakeOpen(self, manager())
    with patch.object(builtins, 'open') as mockMethod:
        mockMethod.side_effect = fakeOpen.sideEffect
        grouper = ProteinGrouper()
        grouper.addFile('filename-1', sampleData)
        sampleFiles = PathogenSampleFiles(grouper)
        sampleFiles.add('Lausannevirus', 'filename-1')
        self.assertEqual(
            expected, grouper.pathogenNames['Lausannevirus']['filename-1'])
if args.proteinFastaFilename: # Flatten lists of lists that we get from using both nargs='+' and # action='append'. We use both because it allows people to use # (e.g.) --pff on the command line either via "--pff file1 --pff # file2" or "--pff file1 file2", or a combination of these. That # way it's not necessary to remember which way you're supposed to # use it and you also can't be hit by the subtle problem # encountered in https://github.com/acorg/dark-matter/issues/453 proteinFastaFilenames = list(chain.from_iterable( args.proteinFastaFilename)) else: proteinFastaFilenames = None grouper = ProteinGrouper(assetDir=args.assetDir, sampleNameRegex=args.sampleNameRegex, format_=args.format, proteinFastaFilenames=proteinFastaFilenames, saveReadLengths=args.showReadLengths) if args.filenames: filenames = args.filenames else: filenames = (line[:-1] for line in sys.stdin) for filename in filenames: with open(filename) as fp: grouper.addFile(filename, fp) if args.html: print(grouper.toHTML(args.pathogenPanelFilename, minProteinFraction=args.minProteinFraction,
if args.proteinFastaFilename: # Flatten lists of lists that we get from using both nargs='+' and # action='append'. We use both because it allows people to use # (e.g.) --pff on the command line either via "--pff file1 --pff # file2" or "--pff file1 file2", or a combination of these. That # way it's not necessary to remember which way you're supposed to # use it and you also can't be hit by the subtle problem # encountered in https://github.com/acorg/dark-matter/issues/453 proteinFastaFilenames = list(chain.from_iterable( args.proteinFastaFilename)) else: proteinFastaFilenames = None grouper = ProteinGrouper(assetDir=args.assetDir, sampleName=args.sampleName, sampleNameRegex=args.sampleNameRegex, format_=args.format, proteinFastaFilenames=proteinFastaFilenames, saveReadLengths=args.showReadLengths) if args.filenames: filenames = args.filenames else: filenames = (line[:-1] for line in sys.stdin) for filename in filenames: with open(filename) as fp: grouper.addFile(filename, fp) if args.html: print(grouper.toHTML(args.pathogenPanelFilename, minProteinFraction=args.minProteinFraction,
def testProteinsSavedCorrectly(self):
    """
    After a pathogen/sample combination is added via PathogenSampleFiles,
    the ProteinGrouper must contain the expected protein details and
    unique read count for that combination.
    """
    class Open(object):
        """
        Provides a side effect for a mocked builtin open: known reads
        files yield canned FASTA lines, the pathogen/sample file yields
        the given context manager, and anything else fails the test.
        """
        def __init__(self, test, manager):
            self.test = test
            self.manager = manager
            self.expectedFilenames = set((
                'out/0.fasta', 'out/1.fasta',
                'out/pathogen-0-sample-0.fasta'))

        def sideEffect(self, filename, *args, **kwargs):
            # Canned lines for the per-protein reads files.
            fastaLines = {
                'out/0.fasta': ['>id1\n', 'ACTG\n'],
                'out/1.fasta': ['>id2\n', 'AC\n', '>id3\n', 'CAGTTTT\n'],
            }
            if filename in self.expectedFilenames:
                if filename in fastaLines:
                    return File(fastaLines[filename])
                return self.manager
            self.test.fail(
                'Open called with unexpected filename: %r, Args: %r, '
                'Keyword args: %r.' % (filename, args, kwargs))

    inputLines = (
        '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n',
        '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n',
    )
    fp = StringIO(''.join(inputLines))
    fastaIO = StringIO()

    @contextmanager
    def manager():
        yield fastaIO

    opener = Open(self, manager())
    with patch.object(builtins, 'open', side_effect=opener.sideEffect):
        pg = ProteinGrouper()
        pg.addFile('filename-1', fp)
        PathogenSampleFiles(pg).add('Lausannevirus', 'filename-1')
        actual = pg.pathogenNames['Lausannevirus']['filename-1']
        self.assertEqual(
            {
                'proteins': {
                    'gi|327409| ubiquitin': {
                        'bestScore': 48.1,
                        'bluePlotFilename': 'out/1.png',
                        'coverage': 0.77,
                        'hspCount': 6,
                        'index': 1,
                        'medianScore': 46.6,
                        'outDir': 'out',
                        'proteinLength': 74,
                        'proteinName': 'gi|327409| ubiquitin',
                        'proteinURL': None,
                        'readCount': 5,
                        'readsFilename': 'out/1.fasta',
                    },
                    'gi|327410| protein 77': {
                        'bestScore': 44.2,
                        'bluePlotFilename': 'out/0.png',
                        'coverage': 0.63,
                        'hspCount': 9,
                        'index': 0,
                        'medianScore': 41.3,
                        'outDir': 'out',
                        'proteinLength': 12,
                        'proteinName': 'gi|327410| protein 77',
                        'proteinURL': None,
                        'readCount': 9,
                        'readsFilename': 'out/0.fasta',
                    },
                },
                'uniqueReadCount': 3,
            },
            actual)