예제 #1
0
 def testNoComplement(self):
     """
     When every range is on the forward (+) strand, the orientations
     method must return a set holding only True.
     """
     forwardOnly = GenomeRanges('join{[0:100](+), [400:600](+)}')
     found = forwardOnly.orientations()
     self.assertEqual({True}, found)
예제 #2
0
 def testAllComplement(self):
     """
     When every range is on the complement (-) strand, the orientations
     method must return a set holding only False.
     """
     complementOnly = GenomeRanges('join{[0:100](-), [400:600](-)}')
     found = complementOnly.orientations()
     self.assertEqual({False}, found)
예제 #3
0
 def testMixed(self):
     """
     When some ranges are on the forward (+) strand and some are on the
     complement (-) strand, the orientations method must return the set
     {True, False}.
     """
     bothStrands = GenomeRanges('join{[0:100](+), [400:600](-)}')
     found = bothStrands.orientations()
     self.assertEqual({True, False}, found)
예제 #4
0
 def testMatchAtEndOfFirstRange(self):
     """
     A DIAMOND match that starts at the very end of the first range must
     be given the correct genome offset.
     """
     # sstart is 1-based, so the match starts (6 - 1) * 3 = 15 nucleotides
     # into the protein. The protein begins at genome offset 5, giving a
     # match start of 5 + 15 = 20, the last offset inside [5:21].
     match = {'sstart': 6}
     self.assertEqual(20, GenomeRanges('[5:21](+)').startInGenome(match))
예제 #5
0
 def testInFirstRange(self):
     """
     A DIAMOND match that starts inside the first range must be given the
     correct genome offset.
     """
     # sstart is 1-based, so the match starts (10 - 1) * 3 = 27
     # nucleotides into the protein. The protein begins at genome offset
     # 5, giving a match start of 5 + 27 = 32.
     match = {'sstart': 10}
     self.assertEqual(32, GenomeRanges('[5:35](+)').startInGenome(match))
예제 #6
0
 def testMatchAtStartOfFirstRange(self):
     """
     A DIAMOND match that starts at the very beginning of the first range
     must be given the correct genome offset.
     """
     # sstart is 1-based, so the match starts (1 - 1) * 3 = 0 nucleotides
     # into the protein. The protein begins at genome offset 5, giving a
     # match start of 5 + 0 = 5.
     match = {'sstart': 1}
     self.assertEqual(5, GenomeRanges('[5:35](+)').startInGenome(match))
예제 #7
0
 def testInSecondRange(self):
     """
     A DIAMOND match that starts inside the second range must be given
     the correct genome offset.
     """
     # sstart is 1-based, so the match starts (16 - 1) * 3 = 45
     # nucleotides into the protein. The first range holds 35 - 5 = 30
     # nucleotides, so the match starts 45 - 30 = 15 nucleotides into the
     # second range, which begins at 50. Hence 50 + 15 = 65.
     twoRanges = GenomeRanges('join{[5:35](+), [50:85](+)}')
     match = {'sstart': 16}
     self.assertEqual(65, twoRanges.startInGenome(match))
예제 #8
0
 def testInThirdRange(self):
     """
     A DIAMOND match that starts inside the third range must be given the
     correct genome offset.
     """
     # sstart is 1-based, so the match starts (401 - 1) * 3 = 1200
     # nucleotides into the protein. The first two ranges hold
     # (100 - 0) + (600 - 400) = 300 nucleotides, so the match starts
     # 1200 - 300 = 900 nucleotides into the third range, which begins at
     # 2000. Hence 2000 + 900 = 2900.
     threeRanges = GenomeRanges(
         'join{[0:100](+), [400:600](+), [2000:3000](+)}')
     match = {'sstart': 401}
     self.assertEqual(2900, threeRanges.startInGenome(match))
예제 #9
0
 def testOneNakedRangePositive(self):
     """
     A lone (non-joined) range on the positive strand must be stored as a
     single (start, stop, True) tuple.
     """
     expected = ((3, 5, True),)
     self.assertEqual(expected, GenomeRanges('[3:5](+)').ranges)
예제 #10
0
 def testTwoJoinedContiguousRangesMismatchedStrands(self):
     """
     Contiguous joined ranges that lie on different strands must not be
     merged: both ranges must be present in the result.
     """
     expected = (
         (3, 5, False),
         (5, 9, True),
     )
     parsed = GenomeRanges('join{[3:5](-), [5:9](+)}')
     self.assertEqual(expected, parsed.ranges)
예제 #11
0
 def testTwoRangesThatAreCircular(self):
     """
     Two ranges that together span the end of the genome (one ends at the
     genome length, the other starts at zero) must be reported as
     circular.
     """
     spanning = GenomeRanges('join{[20:40](+), [0:10](+)}')
     self.assertTrue(spanning.circular(40))
예제 #12
0
 def testThreeJoinedRanges(self):
     """
     Three joined, non-contiguous ranges must each be stored, in order,
     with their strand.
     """
     expected = (
         (3, 5, True),
         (7, 9, False),
         (17, 19, False),
     )
     parsed = GenomeRanges('join{[3:5](+), [7:9](-), [17:19](-)}')
     self.assertEqual(expected, parsed.ranges)
예제 #13
0
 def testOneNakedRangeNegative(self):
     """
     A lone (non-joined) range on the negative strand must be stored as a
     single (start, stop, False) tuple.
     """
     expected = ((3, 5, False),)
     self.assertEqual(expected, GenomeRanges('[3:5](-)').ranges)
예제 #14
0
 def testThreeRangesThatAreCircular2(self):
     """
     Three ranges must be reported as circular when the range that ends
     at the end of the genome comes last in the GenBank string and the
     range that starts at zero comes first.
     """
     spanning = GenomeRanges('join{[0:10](+), [40:60](+), [80:90](+)}')
     genomeLength = 90
     self.assertTrue(spanning.circular(genomeLength))
예제 #15
0
 def testContiguousRangesComplementStrand(self):
     """
     Creating a GenomeRanges from two contiguous ranges with the same
     direction must issue exactly one warning that names both ranges.

     NOTE(review): despite the test name, the ranges used here are on the
     (+) strand.
     """
     expected = (
         'Contiguous GenBank ranges detected: [3:5] followed by [5:7].')
     with warnings.catch_warnings(record=True) as caught:
         # Make sure the warning is recorded even if it was already
         # issued elsewhere.
         warnings.simplefilter('always')
         GenomeRanges('join{[3:5](+), [5:7](+)}')
     self.assertEqual(1, len(caught))
     self.assertEqual(expected, str(caught[0].message))
예제 #16
0
 def testStartOutsideGenomeRangeTwoRanges(self):
     """
     With two ranges, a DIAMOND match start that lies beyond the
     nucleotides covered by those ranges must raise a ValueError.
     """
     # DIAMOND's sstart is a 1-based offset into the protein subject, so
     # 10 means 0-based amino acid 9, i.e., nucleotide offset 27. The two
     # ranges together cover only 5 + 6 = 11 nucleotides.
     match = {'sstart': 10}
     twoRanges = GenomeRanges('join{[5:10](+), [12:18](+)}')
     expectedError = (
         r'^Starting nucleotide offset 27 not found in protein '
         r'nucleotide ranges \(5, 10\), \(12, 18\)\.$')
     assertRaisesRegex(self, ValueError, expectedError,
                       twoRanges.startInGenome, match)
예제 #17
0
 def testTwoJoinedContiguousRanges(self):
     """
     Two contiguous joined ranges on the same strand must be merged into
     a single range, and one warning about the merge must be issued.
     """
     expectedWarning = (
         'Contiguous GenBank ranges detected: [3:5] followed by [5:9].')
     expectedRanges = ((3, 9, True),)
     with warnings.catch_warnings(record=True) as caught:
         warnings.simplefilter('always')
         merged = GenomeRanges('join{[3:5](+), [5:9](+)}')
     self.assertEqual(expectedRanges, merged.ranges)
     self.assertEqual(1, len(caught))
     self.assertEqual(expectedWarning, str(caught[0].message))
예제 #18
0
 def testTwoJoinedContiguousRangesInTheMiddleOfFourRanges(self):
     """
     Two contiguous ranges must be merged even when they sit between
     non-contiguous ranges, which must be left untouched. One warning
     about the merge must be issued.
     """
     expectedWarning = (
         'Contiguous GenBank ranges detected: [3:5] followed by [5:9].')
     expectedRanges = (
         (0, 2, False),
         (3, 9, False),
         (12, 15, False),
     )
     with warnings.catch_warnings(record=True) as caught:
         warnings.simplefilter('always')
         merged = GenomeRanges(
             'join{[0:2](-), [3:5](-), [5:9](-), [12:15](-)}')
     self.assertEqual(expectedRanges, merged.ranges)
     self.assertEqual(1, len(caught))
     self.assertEqual(expectedWarning, str(caught[0].message))
예제 #19
0
    def __init__(self,
                 genomeAccession,
                 proteinGenomeDB,
                 checkTranslations=True):
        """
        Look up a genome and its proteins and index the genome offsets
        that the proteins cover.

        @param genomeAccession: The genome accession, as passed to
            C{proteinGenomeDB.findGenome} and
            C{proteinGenomeDB.findProteinsForGenome}.
        @param proteinGenomeDB: A protein/genome database object that
            provides C{findGenome} and C{findProteinsForGenome} methods.
        @param checkTranslations: If true, C{self._checkTranslation} is
            called for each protein found for the genome.
        @raise NoSuchGenomeError: If C{proteinGenomeDB.findGenome} returns
            C{None} for C{genomeAccession}.
        """
        self.genomeAccession = genomeAccession
        self.proteinGenomeDB = proteinGenomeDB
        # Proteins, keyed by protein accession number.
        self.proteins = {}
        self.coveredProteins = set()
        # Keyed by genome offset. Each value is a dict holding the set of
        # accession numbers of the proteins that overlap the offset and a
        # set of read ids (if any) that match at the offset. Only offsets
        # that fall within at least one protein of the genome have a key.
        self.offsets = {}
        # Read counts for every offset covered by reads, whether or not the
        # offset falls within a protein.
        self.coveredOffsetCount = Counter()
        self.samFiles = []
        self.readIdsMatchingGenome = set()

        genome = proteinGenomeDB.findGenome(genomeAccession)
        self.genome = genome
        if genome is None:
            raise NoSuchGenomeError('Reference %r not found in protein/genome '
                                    'database.' % genomeAccession)

        for protein in proteinGenomeDB.findProteinsForGenome(genomeAccession):
            accession = protein['accession']
            self.proteins[accession] = protein
            ranges = GenomeRanges(protein['offsets']).ranges

            # Record this protein against every genome offset it covers.
            # The third element of each range (its strand) is not needed.
            for start, stop, _ in ranges:
                for offset in range(start, stop):
                    try:
                        info = self.offsets[offset]
                    except KeyError:
                        info = self.offsets[offset] = {
                            'proteinAccessions': set(),
                            'readIds': set(),
                        }
                    info['proteinAccessions'].add(accession)

            if checkTranslations:
                self._checkTranslation(self.genome, ranges, protein)
예제 #20
0
 def testThreeJoinedContiguousRanges(self):
     """
     Three contiguous joined ranges must be merged into a single range,
     with one warning issued per merge.
     """
     firstWarning = (
         'Contiguous GenBank ranges detected: [3:5] followed by [5:7].')
     # The second warning mentions [3:7], which does not appear in the
     # string given to GenomeRanges. By the time the second warning is
     # issued, [3:5] and [5:7] have already been merged into [3:7].
     secondWarning = (
         'Contiguous GenBank ranges detected: [3:7] followed by [7:9].')
     expectedRanges = ((3, 9, False),)
     with warnings.catch_warnings(record=True) as caught:
         warnings.simplefilter('always')
         merged = GenomeRanges('join{[3:5](-), [5:7](-), [7:9](-)}')
     self.assertEqual(expectedRanges, merged.ranges)
     self.assertEqual(2, len(caught))
     self.assertEqual(firstWarning, str(caught[0].message))
     self.assertEqual(secondWarning, str(caught[1].message))
예제 #21
0
    def proteinCoverageInfo(self, proteinAccession, minReadOffsetCount=None):
        """
        Summarize how well reads cover a protein.

        @param proteinAccession: A C{str} accession number.
        @param minReadOffsetCount: An C{int} giving the minimum number of
            protein offsets a read must overlap in order to be counted, or
            C{None} for no filtering. Use this to ignore reads that only
            touch the protein at a very small number of offsets. Values
            below two are treated as C{None}.
        @raises KeyError: If C{proteinAccession} is not known.
        @raises ValueError: If the summed length of the protein's ranges
            does not equal the nucleotide length derived from the protein
            length in the database.
        @return: A C{dict} with keys
                * 'coveredOffsets': the number of covered offsets,
                * 'totalBases': the total number of read bases that cover
                  the protein,
                * 'ntLength': the protein length (in nucleotides), and
                * 'readIds': the set of all matching read ids.
        """
        protein = self.proteins[proteinAccession]

        # A minimum of zero or one cannot exclude any read, so it is the
        # same as not giving a value.
        if minReadOffsetCount is not None and minReadOffsetCount < 2:
            minReadOffsetCount = None
        filtering = bool(minReadOffsetCount)
        perReadCounts = Counter() if filtering else None

        proteinRanges = GenomeRanges(protein['offsets']).ranges

        # First pass: measure the protein and, if filtering, count how many
        # protein offsets each read overlaps.
        ntLength = 0
        visited = set()
        for start, stop, _ in proteinRanges:
            ntLength += stop - start
            for offset in range(start, stop):
                assert offset not in visited
                visited.add(offset)
                idsHere = self.offsets[offset]['readIds']
                if filtering and idsHere:
                    perReadCounts.update(idsHere)

        # Sanity check: the range lengths must add up to the length in the
        # database. The database holds the amino acid length without the
        # stop codon, whereas the ranges include it, hence the extra 3.
        expectedLength = protein['length'] * 3 + 3
        if ntLength != expectedLength:
            raise ValueError(
                'Sum of protein database ranges (%d) does not agree with '
                'database protein length (%d) for protein %s!' %
                (ntLength, expectedLength, proteinAccession))

        # Collect the reads that overlap too few offsets to be reported.
        if filtering:
            unwanted = {readId
                        for readId, count in perReadCounts.items()
                        if count < minReadOffsetCount}
        else:
            unwanted = set()

        # Second pass: gather the statistics, ignoring the unwanted reads.
        covered = 0
        bases = 0
        matchingIds = set()
        for start, stop, _ in proteinRanges:
            for offset in range(start, stop):
                kept = set(self.offsets[offset]['readIds']) - unwanted
                if not kept:
                    continue
                matchingIds.update(kept)
                covered += 1
                bases += len(kept)

        return {
            'coveredOffsets': covered,
            'totalBases': bases,
            'ntLength': ntLength,
            'readIds': matchingIds,
        }
예제 #22
0
 def key(proteinAccession):
     # Return the start offset of the first genome range of the protein
     # (presumably so proteins can be ordered by where they begin).
     offsets = gpi.proteins[proteinAccession]['offsets']
     firstRange = GenomeRanges(offsets).ranges[0]
     return firstRange[0]
예제 #23
0
 def testThreeRangesCircular(self):
     """
     Three ranges that span the end of a circular genome must be counted
     as 2 distinct ranges.
     """
     genomeLength = 3000
     threeRanges = GenomeRanges(
         'join{[0:100](+), [400:600](+), [2000:3000](+)}')
     self.assertEqual(2, threeRanges.distinctRangeCount(genomeLength))
예제 #24
0
 def testThreeRange(self):
     """
     Three ranges must be counted as 3 distinct ranges.
     """
     genomeLength = 100
     threeRanges = GenomeRanges(
         'join{[0:100](+), [400:600](+), [2000:3000](+)}')
     self.assertEqual(3, threeRanges.distinctRangeCount(genomeLength))
예제 #25
0
 def testTwoRange(self):
     """
     Two ranges must be counted as 2 distinct ranges.
     """
     genomeLength = 100
     twoRanges = GenomeRanges('join{[0:100](+), [400:600](+)}')
     self.assertEqual(2, twoRanges.distinctRangeCount(genomeLength))
예제 #26
0
 def testOneRange(self):
     """
     A single range must be counted as 1 distinct range.
     """
     genomeLength = 100
     oneRange = GenomeRanges('[0:100](+)')
     self.assertEqual(1, oneRange.distinctRangeCount(genomeLength))
예제 #27
0
 def testOneRange(self):
     """
     A single range that lies entirely inside the genome must not be
     reported as circular.
     """
     oneRange = GenomeRanges('[20:40](+)')
     self.assertFalse(oneRange.circular(100))
예제 #28
0
 def testOneRangeSpanningTheWholeGenome(self):
     """
     A single range that ends exactly at the end of the genome must not
     be reported as circular.
     """
     wholeGenome = GenomeRanges('[0:40](+)')
     self.assertFalse(wholeGenome.circular(40))
예제 #29
0
    def testTibetanFrogHBV(self):
        """
        Test that Tibetan frogs can get HBV.

        This is an end-to-end test: a nucleotide query is built from part
        of a sample protein, DIAMOND is run on it, the DIAMOND result is
        given to a C{SimpleDiamondSAMWriter}, and the SAM output is checked.
        """
        proteinAccession = 'YP_009259545.1'
        proteinSequence = SAMPLE_DATA['proteins'][proteinAccession]['protein']
        proteinId = SAMPLE_DATA['proteins'][proteinAccession]['id']
        proteinRange = SAMPLE_DATA['proteins'][proteinAccession]['range']
        ranges = GenomeRanges(proteinRange)
        queryStartInProtein = 10  # This is a 0-based amino acid offset.
        queryLenInProtein = 40  # This is an amino acid length.

        genomeAccession = 'NC_030446.1'
        genomeSequence = SAMPLE_DATA['genomes'][genomeAccession]['genome']
        genomeLen = len(genomeSequence)

        # The query sequence is nucleotides that match the amino acids in the
        # protein. Here we use the first ([0] in the below) codon for each
        # amino acid to make the nucleotide sequence.
        queryId = 'query'
        querySequence = ''.join(
            CODONS[aa][0]
            for aa in proteinSequence[queryStartInProtein:queryStartInProtein +
                                      queryLenInProtein])
        queryQuality = 'E' * len(querySequence)

        # Use the protein sequence to make a DIAMOND database and run DIAMOND
        # on the query. Yes, this really runs DIAMOND, so you need to have it
        # installed, with its executable somewhere in your shell's PATH.
        with DiamondExecutor() as de:
            de.addSubject(Read(proteinId, proteinSequence))
            queries = Reads([Read(queryId, querySequence, queryQuality)])
            # Exactly one DIAMOND result is expected (the unpacking fails
            # otherwise).
            (diamondResult, ) = list(de.search(queries))

        # Make sure DIAMOND gives us back what we expected.
        self.assertEqual(
            {
                'bitscore': 83.6,
                'btop': str(queryLenInProtein),  # Exact match of all AAs.
                'qframe': 1,
                'qend': 3 * queryLenInProtein,
                'full_qqual': queryQuality,
                'qlen': len(querySequence),
                'full_qseq': querySequence,
                'qseqid': 'query',
                'qstart': 1,
                'slen': len(proteinSequence),
                'sstart': queryStartInProtein + 1,  # DIAMOND is 1-based.
                'stitle': proteinId,
            },
            diamondResult)

        # Make a genomes/proteins sqlite database and add information about
        # the protein and the nucleotide genome it comes from.
        db = SqliteIndexWriter(':memory:')
        db.addProtein(proteinAccession, genomeAccession, proteinSequence,
                      proteinRange, True, ranges.circular(genomeLen),
                      ranges.distinctRangeCount(genomeLen))

        # NOTE(review): _Genome is called with no arguments here; the
        # arguments that look intended are commented out below. Confirm
        # whether this is unfinished work.
        genome = _Genome(
            # genomeAccession, genomeName, genomeSequence
        )
        db.addGenome(genome, 1, 'test-db')

        # Make a DIAMOND-to-SAM writer and give it the DIAMOND output.
        # The fields are joined with TABs in the same order as the keys
        # checked in the DIAMOND result above.
        writer = SimpleDiamondSAMWriter(SqliteIndex(db._connection))
        writer.addMatch('\t'.join(
            map(str, (diamondResult['bitscore'], diamondResult['btop'],
                      diamondResult['qframe'], diamondResult['qend'],
                      diamondResult['full_qqual'], diamondResult['qlen'],
                      diamondResult['full_qseq'], diamondResult['qseqid'],
                      diamondResult['qstart'], diamondResult['slen'],
                      diamondResult['sstart'], diamondResult['stitle']))))

        # Tell the writer to save the matches as SAM and check the result.
        fp = StringIO()
        writer.save(filename=fp)

        # NOTE(review): despite its name, this value is used as the fourth
        # field of the expected SAM line below (the second field is the
        # literal '0'). It equals 3 * queryStartInProtein + 1.
        flags = '31'
        self.assertEqual(
            '\n'.join([
                '@SQ\tSN:%s\tLN:%d' % (genomeAccession, len(genomeSequence)),
                '\t'.join([
                    queryId,
                    '0',
                    genomeAccession,
                    flags,
                    '255',
                    '120M',  # (Exact) match of 40 AAs.
                    '*',
                    '0',
                    '0',
                    querySequence,
                    queryQuality,
                    'AS:i:%d' % int(diamondResult['bitscore']),
                ])
            ]) + '\n',
            fp.getvalue())
예제 #30
0
    def _SAMLine(self, match, protein, genome):
        """
        Convert DIAMOND match information to a line of SAM file output.

        @param match: A C{dict} with information about the DIAMOND match, as
            returned by C{DiamondTabularFormat().diamondFieldsToDict} which
            has been called for us by C{self._preprocessMatch}.
        @param protein: A C{dict} with information about the protein the
            DIAMOND was for. The C{dict} is as returned by
            C{dark.proteins.SqliteIndex.findProtein}.
        @param genome: A C{dict} with information about the (nucleotide) genome
            that the protein in the DIAMOND match comes from. The C{dict} is
            as returned by C{dark.proteins.SqliteIndex.findGenome}.
        @raise AssertionError: If the hard-clip count computed for either
            end of the query is negative.
        @return: A TAB-separated C{str} line of SAM.
        """
        qseqid = (match['qseqid'] if self._keepDescriptions else
                  match['qseqid'].split(None, 1)[0])

        # The parsed ranges are not yet used in the SAM line, but the
        # offsets string is still parsed so that any warning or error
        # GenomeRanges produces for it is not lost.
        #
        # TODO: POS (below) is an offset into the nucleotide equivalent of
        # the protein, not into the genome. The apparent plan is to derive
        # it from ranges.startInGenome(match), adjusted by match['qstart'],
        # and to take ranges.orientations() into account.
        ranges = GenomeRanges(protein['offsets'])  # noqa: F841

        # If the query frame is less than zero, the match was with a
        # translation of the reverse-complemented query. We'll put the
        # reverse complement into the SAM output. This seems to be standard
        # / accepted practice, based on my web searches.  See e.g.,
        # https://www.biostars.org/p/131891/ for what Bowtie2 does and for
        # some comments on this issue for SAM/BAM files in general.
        if match['qframe'] > 0:
            flag = 0
            qseq = match['full_qseq']
            qqual = match['full_qqual'] or '*'
        else:
            flag = 16
            qseq = DNARead('id',
                           match['full_qseq']).reverseComplement().sequence
            qqual = match['full_qqual'][::-1] if match['full_qqual'] else '*'

        # Make a CIGAR string, including hard-clipped bases at the start and
        # end of the query (DIAMOND outputs a hard-clipped query sequence).
        startClipCount = match['qstart'] - 1
        endClipCount = match['qlen'] - match['qend']

        assert startClipCount >= 0
        # Report the same length (qlen) that the clip count was computed
        # from, so that a failure here produces the intended message.
        assert endClipCount >= 0, (
            'Query sequence %s has length %d but the qend value is %d' %
            (qseq, match['qlen'], match['qend']))

        cigar = (('%dH' % startClipCount if startClipCount else '') +
                 btop2cigar(match['btop'], concise=False, aa=True) +
                 ('%dH' % endClipCount if endClipCount else ''))

        return '\t'.join(
            map(
                str,
                (
                    # 1. QNAME
                    qseqid,
                    # 2. FLAG
                    flag,
                    # 3. RNAME
                    genome['accession'],
                    # 4. POS. This needs to be a 1-based offset into the
                    # nucleotide-equivalent of the DIAMOND subject sequence
                    # (which was a protein since that is how DIAMOND
                    # operates). Because DIAMOND gives back a 1-based
                    # protein location, we adjust to 0-based, multiply by 3
                    # to get to nucleotides, then adjust to 1-based.
                    3 * (match['sstart'] - 1) + 1,
                    # 5. MAPQ
                    self._mappingQuality,
                    # 6. CIGAR
                    cigar,
                    # 7. RNEXT
                    '*',
                    # 8. PNEXT
                    0,
                    # 9. TLEN
                    0,
                    # 10. SEQ
                    qseq,
                    # 11. QUAL
                    qqual,
                    # 12. Alignment score
                    'AS:i:%d' % int(match['bitscore']))))