def testDictLookupWithFastaDirectory(self): """" The __getitem__ method (i.e., dictionary-like lookup) must return the expected read, obtained from the expected file name, when a FASTA base directory is specified. """ class Open(object): def __init__(self, test): self.test = test self.count = 0 def sideEffect(self, filename, *args, **kwargs): if self.count == 0: self.test.assertEqual('/tmp/f.fasta', filename) self.count += 1 return StringIO('>id1\nACTG\r\nCCCC\nGGG\n>id2\nAACCTG\n') if self.count == 1: self.test.assertEqual( os.path.join('/usr/local/fasta', 'f.fasta'), filename) self.count += 1 return StringIO('>id1\nACTG\r\nCCCC\nGGG\n>id2\nAACCTG\n') else: self.test.fail( 'Open called too many times. Filename: %r, Args: %r, ' 'Keyword args: %r.' % (filename, args, kwargs)) sideEffect = Open(self).sideEffect with patch.object(builtins, 'open') as mockMethod: mockMethod.side_effect = sideEffect index = SqliteIndex(':memory:', fastaDirectory='/usr/local/fasta') index.addFile('/tmp/f.fasta') self.assertEqual(DNARead('id1', 'ACTGCCCCGGG'), index['id1']) index.close()
def testAddDuplicateFile(self): """" If a filename is passed to addFile more than once, a ValueError must be raised. """ class Open(object): def __init__(self, test): self.test = test self.count = 0 def sideEffect(self, filename, *args, **kwargs): if self.count == 0: self.test.assertEqual('filename.fasta', filename) self.count += 1 return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n') else: self.test.fail( 'Open called too many times. Filename: %r, Args: %r, ' 'Keyword args: %r.' % (filename, args, kwargs)) sideEffect = Open(self).sideEffect with patch.object(builtins, 'open') as mockMethod: mockMethod.side_effect = sideEffect index = SqliteIndex(':memory:') self.assertEqual(2, index.addFile('filename.fasta')) error = "^Duplicate file name: 'filename\\.fasta'$" assertRaisesRegex(self, ValueError, error, index._addFilename, 'filename.fasta') index.close()
def testAddDuplicateFile(self): """" If a filename is passed to addFile more than once, a ValueError must be raised. """ class Open(object): def __init__(self, test): self.test = test self.count = 0 def sideEffect(self, filename, *args, **kwargs): if self.count == 0: self.test.assertEqual('filename.fasta', filename) self.count += 1 return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n') else: self.test.fail( 'Open called too many times. Filename: %r, Args: %r, ' 'Keyword args: %r.' % (filename, args, kwargs)) sideEffect = Open(self).sideEffect with patch.object(builtins, 'open') as mockMethod: mockMethod.side_effect = sideEffect index = SqliteIndex(':memory:') self.assertEqual(2, index.addFile('filename.fasta')) error = "^Duplicate file name: 'filename.fasta'$" self.assertRaisesRegexp(ValueError, error, index._addFilename, 'filename.fasta') index.close()
def testDictLookupWithTwoFiles(self): """" The __getitem__ method (i.e., dictionary-like lookup) must return the expected reads when sequences are added from two files. """ class Open(object): def __init__(self, test): self.test = test self.count = 0 def sideEffect(self, filename, *args, **kwargs): if self.count == 0 or self.count == 2 or self.count == 3: self.test.assertEqual('filename1.fasta', filename) self.count += 1 return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n') elif self.count == 1 or self.count == 4: self.test.assertEqual('filename2.fasta', filename) self.count += 1 return StringIO('>seq3\nAAACCC\n') else: self.test.fail( 'Open called too many times. Filename: %r, Args: %r, ' 'Keyword args: %r.' % (filename, args, kwargs)) sideEffect = Open(self).sideEffect with patch.object(builtins, 'open') as mockMethod: mockMethod.side_effect = sideEffect index = SqliteIndex(':memory:') index.addFile('filename1.fasta') index.addFile('filename2.fasta') self.assertEqual(DNARead('id1', 'ACTG'), index['id1']) self.assertEqual(DNARead('id2', 'AACCTTGG'), index['id2']) self.assertEqual(DNARead('seq3', 'AAACCC'), index['seq3']) index.close()
def testAddOneFile(self): """" Test the creation of an index with sequences added from one file. """ class Open(object): def __init__(self, test): self.test = test self.count = 0 def sideEffect(self, filename, *args, **kwargs): if self.count == 0: self.test.assertEqual('filename.fasta', filename) self.count += 1 return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n') else: self.test.fail( 'Open called too many times. Filename: %r, Args: %r, ' 'Keyword args: %r.' % (filename, args, kwargs)) sideEffect = Open(self).sideEffect with patch.object(builtins, 'open') as mockMethod: mockMethod.side_effect = sideEffect index = SqliteIndex(':memory:') self.assertEqual(2, index.addFile('filename.fasta')) index.close()
def testDictLookupGzipDataWithBGZsuffix(self): """" The __getitem__ method (i.e., dictionary-like lookup) must return the expected read when the index file is in BGZF format and has a .bgz suffix. """ class Open(object): def __init__(self, test): self.test = test self.count = 0 def sideEffect(self, filename, *args, **kwargs): if self.count <= 1: self.test.assertEqual('filename.fasta.bgz', filename) self.count += 1 writerIO = BytesIO() writer = bgzf.BgzfWriter(fileobj=writerIO) writer.write(b'>id0\nAC\n') writer.flush() fileobj = BytesIO(writerIO.getvalue()) fileobj.mode = 'rb' return bgzf.BgzfReader(fileobj=fileobj) else: self.test.fail( 'Open called too many times. Filename: %r, Args: %r, ' 'Keyword args: %r.' % (filename, args, kwargs)) sideEffect = Open(self).sideEffect with patch.object(bgzf, 'open') as mockMethod: mockMethod.side_effect = sideEffect index = SqliteIndex(':memory:') index.addFile('filename.fasta.bgz') self.assertEqual(DNARead('id0', 'AC'), index['id0']) index.close()
def testAddFileWithDuplicateSequence(self): """" If a sequence id is duplicated in a FASTA file, a ValueError must be raised. """ class Open(object): def __init__(self, test): self.test = test self.count = 0 def sideEffect(self, filename, *args, **kwargs): if self.count == 0: self.test.assertEqual('filename.fasta', filename) self.count += 1 return StringIO('>id1\nACTG\n>id1\nAACCTTGG\n') else: self.test.fail( 'Open called too many times. Filename: %r, Args: %r, ' 'Keyword args: %r.' % (filename, args, kwargs)) sideEffect = Open(self).sideEffect with patch.object(builtins, 'open') as mockMethod: mockMethod.side_effect = sideEffect index = SqliteIndex(':memory:') error = ("^FASTA sequence id 'id1' found twice in file " "'filename.fasta'\\.$") assertRaisesRegex(self, ValueError, error, index.addFile, 'filename.fasta') index.close()
def testAddFilesWithDuplicateSequence(self): """" If a sequence id occurs in more than one FASTA file, a ValueError must be raised. """ class Open(object): def __init__(self, test): self.test = test self.count = 0 def sideEffect(self, filename, *args, **kwargs): if self.count == 0: self.test.assertEqual('filename1.fasta', filename) self.count += 1 return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n') elif self.count == 1: self.test.assertEqual('filename2.fasta', filename) self.count += 1 return StringIO('>id2\nAAACCC\n') else: self.test.fail( 'Open called too many times. Filename: %r, Args: %r, ' 'Keyword args: %r.' % (filename, args, kwargs)) sideEffect = Open(self).sideEffect with patch.object(builtins, 'open') as mockMethod: mockMethod.side_effect = sideEffect index = SqliteIndex(':memory:') index.addFile('filename1.fasta') error = ("^FASTA sequence id 'id2', found in file " "'filename2\.fasta', was previously added from file " "'filename1\.fasta'\.$") self.assertRaisesRegexp(ValueError, error, index.addFile, 'filename2.fasta') index.close()
def testDictLookupSequenceCrossesNewlines(self): """" The __getitem__ method (i.e., dictionary-like lookup) must return the expected read when the sequence spans multiple lines of the input file, including lines ending in \n and \r\n. """ class Open(object): def __init__(self, test): self.test = test self.count = 0 def sideEffect(self, filename, *args, **kwargs): if self.count == 0 or self.count == 1: self.test.assertEqual('filename.fasta', filename) self.count += 1 return StringIO('>id1\nACTG\r\nCCCC\nGGG\n>id2\nAACCTG\n') else: self.test.fail( 'Open called too many times. Filename: %r, Args: %r, ' 'Keyword args: %r.' % (filename, args, kwargs)) sideEffect = Open(self).sideEffect with patch.object(builtins, 'open') as mockMethod: mockMethod.side_effect = sideEffect index = SqliteIndex(':memory:') index.addFile('filename.fasta') self.assertEqual(DNARead('id1', 'ACTGCCCCGGG'), index['id1']) index.close()
def testDictLookupSpecificReadClass(self): """" The __getitem__ method (i.e., dictionary-like lookup) must return the expected read type. """ class Open(object): def __init__(self, test): self.test = test self.count = 0 def sideEffect(self, filename, *args, **kwargs): if self.count == 0 or self.count == 1: self.test.assertEqual('filename.fasta', filename) self.count += 1 return StringIO('>id1\nMM\n>id2\n') else: self.test.fail( 'Open called too many times. Filename: %r, Args: %r, ' 'Keyword args: %r.' % (filename, args, kwargs)) sideEffect = Open(self).sideEffect with patch.object(builtins, 'open') as mockMethod: mockMethod.side_effect = sideEffect index = SqliteIndex(':memory:', readClass=AARead) index.addFile('filename.fasta') result = index['id1'] self.assertTrue(isinstance(result, AARead)) self.assertEqual(AARead('id1', 'MM'), result) index.close()
def testAddFilename(self): """" Test the internal _addFilename method. """ index = SqliteIndex(':memory:') self.assertEqual(1, index._addFilename('filename1.fasta')) self.assertEqual(2, index._addFilename('filename2.fasta')) index.close()
def testGetNonexistentFileNumber(self): """" If the internal _getFileNumber method is called with a file whose name has not been added, it must return None. """ index = SqliteIndex(':memory:') self.assertEqual(None, index._getFileNumber('filename.fasta')) index.close()
def testGetFileNumber(self): """" The internal _getFileNumber method must return the expected result. """ index = SqliteIndex(':memory:') self.assertEqual(1, index._addFilename('filename.fasta')) self.assertEqual(1, index._getFileNumber('filename.fasta')) index.close()
def testAddDuplicateFilename(self): """" When _addFilename is called twice with the same name, a ValueError must be raised. """ index = SqliteIndex(':memory:') self.assertEqual(1, index._addFilename('f.fas')) error = "^Duplicate file name: 'f.fas'$" self.assertRaisesRegexp(ValueError, error, index._addFilename, 'f.fas')
def testAddDuplicateFilename(self): """" When _addFilename is called twice with the same name, a ValueError must be raised. """ index = SqliteIndex(':memory:') self.assertEqual(1, index._addFilename('f.fas')) error = "^Duplicate file name: 'f.fas'$" assertRaisesRegex(self, ValueError, error, index._addFilename, 'f.fas')
def testFindWithTwoFiles(self): """" The _find method must return the expected filename and offset when sequences are added from two files. """ class Open(object): def __init__(self, test): self.test = test self.count = 0 def sideEffect(self, filename, *args, **kwargs): if self.count == 0: self.test.assertEqual('filename1.fasta', filename) self.count += 1 return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n') elif self.count == 1: self.test.assertEqual('filename2.fasta', filename) self.count += 1 return StringIO('>sequence3\nAAACCC\n') else: self.test.fail( 'Open called too many times. Filename: %r, Args: %r, ' 'Keyword args: %r.' % (filename, args, kwargs)) sideEffect = Open(self).sideEffect with patch.object(builtins, 'open') as mockMethod: mockMethod.side_effect = sideEffect index = SqliteIndex(':memory:') index.addFile('filename1.fasta') index.addFile('filename2.fasta') self.assertEqual(('filename1.fasta', 5), index._find('id1')) self.assertEqual(('filename1.fasta', 15), index._find('id2')) self.assertEqual(('filename2.fasta', 11), index._find('sequence3')) index.close()
def getSubjectSequence(self, title): """ Obtain information about a subject sequence given its title. @param title: A C{str} sequence title from a DIAMOND hit. @raise KeyError: If the C{title} is not present in the DIAMOND database. @return: An C{AAReadWithX} instance. """ if self._subjectTitleToSubject is None: if self._databaseFilename is None: # An Sqlite3 database is used to look up subjects. self._subjectTitleToSubject = SqliteIndex( self._sqliteDatabaseFilename, fastaDirectory=self._databaseDirectory, readClass=AAReadWithX) else: # Build a dict to look up subjects. titles = {} for read in FastaReads(self._databaseFilename, readClass=AAReadWithX): titles[read.id] = read self._subjectTitleToSubject = titles return self._subjectTitleToSubject[title]
def testBZ2File(self): """" Trying to add a .bz2 file must result in a ValueError. """ index = SqliteIndex(':memory:') error = ('^Compressed FASTA is only supported in BGZF format\\. Use ' 'bgzip to compresss your FASTA\\.$') assertRaisesRegex(self, ValueError, error, index.addFile, 'file.bz2')
def testDictLookupGzipData(self): """" The __getitem__ method (i.e., dictionary-like lookup) must return the expected reads when sequences span multiple lines of the input file, and include lines ending in \n and \r\n and have been compressed with bgzip, including when sequences are more than 64K bytes into the input file. """ class Open(object): def __init__(self, test): self.test = test self.count = 0 def sideEffect(self, filename, *args, **kwargs): if self.count <= 4: self.test.assertEqual('filename.fasta.gz', filename) self.count += 1 writerIO = BytesIO() writer = bgzf.BgzfWriter(fileobj=writerIO) writer.write( b'>id0\nAC\n' + b'>id1\n' + (b'A' * 70000) + b'\n' + b'>id2\r\nACTG\r\nCCCC\r\nGGG\r\n' + b'>id3\nAACCTG\n') writer.flush() fileobj = BytesIO(writerIO.getvalue()) fileobj.mode = 'rb' return bgzf.BgzfReader(fileobj=fileobj) else: self.test.fail( 'Open called too many times. Filename: %r, Args: %r, ' 'Keyword args: %r.' % (filename, args, kwargs)) sideEffect = Open(self).sideEffect with patch.object(bgzf, 'open') as mockMethod: mockMethod.side_effect = sideEffect index = SqliteIndex(':memory:') index.addFile('filename.fasta.gz') self.assertEqual(DNARead('id0', 'AC'), index['id0']) self.assertEqual(DNARead('id1', 'A' * 70000), index['id1']) self.assertEqual(DNARead('id2', 'ACTGCCCCGGG'), index['id2']) self.assertEqual(DNARead('id3', 'AACCTG'), index['id3']) index.close()
def getSubjectSequence(self, title): """ Obtain information about a subject sequence given its title. This information is cached in self._subjectTitleToSubject. It can be obtained from either a) an sqlite database (given via the sqliteDatabaseFilename argument to __init__), b) the FASTA that was originally given to BLAST (via the databaseFilename argument), or c) from the BLAST database using blastdbcmd (which can be unreliable - occasionally failing to find subjects that are in its database). @param title: A C{str} sequence title from a BLAST hit. Of the form 'gi|63148399|gb|DQ011818.1| Description...'. @return: An C{AARead} or C{DNARead} instance, depending on the type of BLAST database in use. """ if self.params.application in {'blastp', 'blastx'}: readClass = AARead else: readClass = DNARead if self._subjectTitleToSubject is None: if self._databaseFilename is None: if self._sqliteDatabaseFilename is None: # Fall back to blastdbcmd. ncbidb has to be imported # as below so ncbidb.getSequence can be patched by our # test suite. from dark import ncbidb seq = ncbidb.getSequence( title, self.params.applicationParams['database']) return readClass(seq.description, str(seq.seq)) else: # An Sqlite3 database is used to look up subjects. self._subjectTitleToSubject = SqliteIndex( self._sqliteDatabaseFilename, fastaDirectory=self._databaseDirectory, readClass=readClass) else: # Build an in-memory dict to look up subjects. This only # works for small databases, obviously. titles = {} for read in FastaReads(self._databaseFilename, readClass=readClass): titles[read.id] = read self._subjectTitleToSubject = titles return self._subjectTitleToSubject[title]
def testDictLookupGzipData(self): """" The __getitem__ method (i.e., dictionary-like lookup) must return the expected reads when sequences span multiple lines of the input file, and include lines ending in \n and \r\n and have been compressed with bgzip, including when sequences are more than 64K bytes into the input file. """ class Open(object): def __init__(self, test): self.test = test self.count = 0 def sideEffect(self, filename, *args, **kwargs): if self.count <= 4: self.test.assertEqual('filename.fasta.gz', filename) self.count += 1 writerIO = BytesIO() writer = bgzf.BgzfWriter(fileobj=writerIO) writer.write(b'>id0\nAC\n' + b'>id1\n' + (b'A' * 70000) + b'\n' + b'>id2\r\nACTG\r\nCCCC\r\nGGG\r\n' + b'>id3\nAACCTG\n') writer.flush() fileobj = BytesIO(writerIO.getvalue()) fileobj.mode = 'rb' return bgzf.BgzfReader(fileobj=fileobj) else: self.test.fail( 'Open called too many times. Filename: %r, Args: %r, ' 'Keyword args: %r.' % (filename, args, kwargs)) sideEffect = Open(self).sideEffect with patch.object(bgzf, 'open') as mockMethod: mockMethod.side_effect = sideEffect index = SqliteIndex(':memory:') index.addFile('filename.fasta.gz') self.assertEqual(DNARead('id0', 'AC'), index['id0']) self.assertEqual(DNARead('id1', 'A' * 70000), index['id1']) self.assertEqual(DNARead('id2', 'ACTGCCCCGGG'), index['id2']) self.assertEqual(DNARead('id3', 'AACCTG'), index['id3']) index.close()
'uncompressed, or compressed with bgzip (from samtools), with ' 'a .gz suffix.')) args = parser.parse_args() if os.path.exists(args.out): if args.force: os.unlink(args.out) else: print( "Output file '%s' already exists. Use --force to overwrite." % args.out, file=sys.stderr) sys.exit(1) index = SqliteIndex(args.out) # Flatten the lists of lists that we get from using both nargs='+' and # action='append'. We use both because it allows people to use (e.g.) # --fasta on the command line either via "--fasta file1 --fasta file2" # or "--fasta file1 file2", or a combination of these. That way it's # not necessary to remember which way you're supposed to use it and you # also can't be hit by the subtle problem encountered in # https://github.com/acorg/dark-matter/issues/453 fastaFiles = list(chain.from_iterable(args.fasta)) verbose = not args.quiet for filename in fastaFiles: if verbose: print("Indexing '%s' ... " % filename, end='', file=sys.stderr)
required=True, help=('the FASTA file(s) to make the database from. These may be ' 'uncompressed, or compressed with bgzip (from samtools), with ' 'a .gz suffix.')) args = parser.parse_args() if os.path.exists(args.out): if args.force: os.unlink(args.out) else: print("Output file '%s' already exists. Use --force to overwrite." % args.out, file=sys.stderr) sys.exit(1) index = SqliteIndex(args.out) # Flatten the lists of lists that we get from using both nargs='+' and # action='append'. We use both because it allows people to use (e.g.) # --fasta on the command line either via "--fasta file1 --fasta file2" # or "--fasta file1 file2", or a combination of these. That way it's # not necessary to remember which way you're supposed to use it and you # also can't be hit by the subtle problem encountered in # https://github.com/acorg/dark-matter/issues/453 fastaFiles = list(chain.from_iterable(args.fasta)) verbose = not args.quiet for filename in fastaFiles: if verbose: print("Indexing '%s' ... " % filename, end='', file=sys.stderr)