Exemplo n.º 1
0
 def testCollectReadHashes(self):
     """
     getHashes must produce a dict that maps each (landmark, trigPoints)
     hash key to the offsets of the features found in the read.
     """
     dbParams = DatabaseParameters(landmarks=[AlphaHelix],
                                   trigPoints=[Peaks], distanceBase=1.0)
     backend = Backend()
     backend.configure(dbParams)
     scanned = backend.scan(
         AARead('query', 'FRRRFRRRFASAASAFRRRFRRRFASAASA'))
     result = backend.getHashes(scanned)
     # The query contains two alpha helices (at offsets 0 and 15) and
     # four peaks (at offsets 10, 13, 25, 28).
     helix0, helix15 = (
         Landmark(AlphaHelix.NAME, AlphaHelix.SYMBOL, offset, 9, 2)
         for offset in (0, 15))
     peak10, peak13, peak25, peak28 = (
         TrigPoint(Peaks.NAME, Peaks.SYMBOL, offset)
         for offset in (10, 13, 25, 28))
     expected = {
         'A2:P:28': [[helix0, peak28]],
         'A2:P:25': [[helix0, peak25]],
         'A2:P:13': [[helix0, peak13], [helix15, peak28]],
         'A2:P:10': [[helix0, peak10], [helix15, peak25]],
         'A2:P:-5': [[helix15, peak10]],
         'A2:P:-2': [[helix15, peak13]],
         'A2:A2:15': [[helix0, helix15]],
     }
     self.assertEqual(expected, result)
Exemplo n.º 2
0
    def __init__(self, histogram, query, subject, dbParams):
        """
        Store the histogram and sequence lengths, then pre-compute the full
        feature sets of the query and the subject via a configured backend.

        @param histogram: a histogram of matching hash offset deltas.
        @param query: the query sequence (supports len()).
        @param subject: the subject; its C{read} attribute is scanned.
        @param dbParams: database parameters used to configure the backend.
        """
        # Local import, matching the original code (presumably avoids a
        # circular import at module load time — TODO confirm).
        from light.backend import Backend

        self._histogram = histogram
        self._queryLen = len(query)
        self._subjectLen = len(subject)

        be = Backend()
        be.configure(dbParams)

        self._allQueryFeatures = getHashFeatures(
            be.getHashes(be.scan(query)))
        self._allSubjectFeatures = getHashFeatures(
            be.getHashes(be.scan(subject.read)))
Exemplo n.º 3
0
    def __init__(self, histogram, query, subject, dbParams, weights=None):
        """
        Store the histogram, sequence lengths and scoring weights, then
        pre-compute the full feature sets of the query and the subject via a
        configured backend.

        @param histogram: a histogram of matching hash offset deltas.
        @param query: the query sequence (supports len()).
        @param subject: the subject; its C{read} attribute is scanned.
        @param dbParams: database parameters used to configure the backend.
        @param weights: optional scoring weights; falls back to
            C{self.DEFAULT_WEIGHTS} when C{None}.
        """
        # Local import, matching the original code (presumably avoids a
        # circular import at module load time — TODO confirm).
        from light.backend import Backend

        self._histogram = histogram
        self._queryLen = len(query)
        self._subjectLen = len(subject)
        self._weights = (weights if weights is not None
                         else self.DEFAULT_WEIGHTS)

        be = Backend()
        be.configure(dbParams)

        self._allQueryFeatures = getHashFeatures(
            be.getHashes(be.scan(query)))
        self._allSubjectFeatures = getHashFeatures(
            be.getHashes(be.scan(subject.read)))
Exemplo n.º 4
0
 def testCollectReadHashesWithOneLandmark(self):
     """
     When a read contains just a single landmark and no trig points, no
     hash pairs can be formed, so getHashes must return an empty dict.
     """
     dbParams = DatabaseParameters(landmarks=[AlphaHelix], trigPoints=[])
     backend = Backend()
     backend.configure(dbParams)
     scanned = backend.scan(AARead('query', 'FRRRFRRRF'))
     self.assertEqual({}, backend.getHashes(scanned))
Exemplo n.º 5
0
    def __init__(self, sequences, cutoff, **kwargs):
        """
        A class to work with hashes.

        For a set of given sequences, find all hashes and for each sequence
        make a string of 1 or 0 denoting whether a hash is present in that
        sequence or not. Only include hashes that occur in more than a
        specified fraction of all given sequences.

        @param sequences: A C{str} filename with a fasta file of sequences to
            be used or a C{dark.reads.Reads} object.
        @param cutoff: A C{float} between 0.0 and 1.0 of the fraction of
            sequences in which a hash has to be present to be included in the
            final string.
        @param kwargs: See
            C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
            additional keywords, all of which are optional.
        """
        if isinstance(sequences, str):
            reads = FastaReads(sequences,
                               readClass=AAReadWithX,
                               upperCase=True)
        else:
            reads = sequences

        database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
        backend = Backend()
        backend.configure(database.dbParams)

        # Map each read id to the dict of hashes found in that read, as
        # returned by getHashes().
        hashes = {}
        for read in reads:
            hashes[read.id] = backend.getHashes(backend.scan(read))

        # Map each hash to a list of the ids of the reads it occurs in.
        # (Iterating the hashes dict, rather than a set of all hashes, makes
        # the resulting hash order deterministic.)
        byHashes = {}
        for readId, readHashes in hashes.items():
            for hash_ in readHashes:
                byHashes.setdefault(hash_, []).append(readId)

        # For each read id, build a string of '1'/'0' characters indicating
        # which sufficiently common hashes occur in that read. Note that the
        # read ids are taken from the hashes dict (not by re-iterating
        # reads), so this also works when reads is a one-shot iterable.
        co = cutoff * len(reads)
        self.hashString = {readId: '' for readId in hashes}

        for hash_, readIds in byHashes.items():
            if len(readIds) > co:
                occursIn = set(readIds)
                for readId in self.hashString:
                    self.hashString[readId] += (
                        '1' if readId in occursIn else '0')
Exemplo n.º 6
0
    def calculateScore(self):
        """
        Calculates the overall score, as described above.

        @return: a C{float} overall score for all significant bins (or C{None}
            if there are no significant bins) and a C{dict} with information
            about the score.
        """
        # We could do more checking here and use the score of the best bin as
        # the overall score if there is only one significant bin or if the
        # score of the best bin is 1.0.

        # Don't attempt to calculate an overall score if there are no
        # significant bins.
        if not self._significantBins:
            analysis = {
                'score': None,
                'scoreClass': self.__class__,
            }

            return None, analysis

        # Local import, presumably to avoid a circular import at module load
        # time — TODO confirm.
        from light.backend import Backend
        backend = Backend()
        backend.configure(self._dbParams)

        # Pre-compute the full feature sets of the query and the subject.
        allQueryFeatures = getHashFeatures(
            backend.getHashes(backend.scan(self._query)))

        allSubjectFeatures = getHashFeatures(
            backend.getHashes(backend.scan(self._subject.read)))

        # Keep track of variables
        state = {
            # overallMatchedQueryOffsets and overallMatchedSubjectOffsets will
            # contain all int offsets that are in matching features (and thus
            # inside the matched region).
            'overallMatchedQueryOffsets': set(),
            'overallMatchedSubjectOffsets': set(),
            # overallUnmatchedQueryOffsets and overallUnmatchedSubjectOffsets
            # will contain all int offsets that are in features that don't
            # match, but which are inside the matched region.
            'overallUnmatchedQueryOffsets': set(),
            'overallUnmatchedSubjectOffsets': set(),
            # The set of all offsets in all bins (whether or not the offsets
            # are in matched features, unmatched features, or not in any
            # feature).
            'queryOffsetsInBins': set(),
            'subjectOffsetsInBins': set(),
            'score': 0.0,
            'denominatorQuery': 0.0,
            'denominatorSubject': 0.0,
            'matchedOffsetCount': 0,
            'matchedRegionScore': 0.0,
            'numeratorQuery': 0.0,
            'numeratorSubject': 0.0,
            'normalizerQuery': 0.0,
            'normalizerSubject': 0.0,
            'totalOffsetCount': 0,
            'scoreClass': self.__class__,
            'queryOffsetsInBinsCount': 0,
            'subjectOffsetsInBinsCount': 0,
            'numberOfBinsConsidered': 0,
        }

        # Consider the significantBins one by one until the overall score drops
        # below the bestBinScore, or we run out of bins.
        for i, bin_ in enumerate((sb['bin'] for sb in self._significantBins),
                                 start=1):

            result = addBin(bin_, allQueryFeatures, allSubjectFeatures, state)
            # Check if we can add more bins, or if we need to return here.
            if result['score'] >= state['score']:
                # The new overallScore is higher or equal to the current
                # overallScore. Continue adding the next bin using the newly
                # calculated values.
                state.update(result)
            else:
                # The new overallScore is lower than the current overallScore.
                # Note that the result that caused the drop is discarded:
                # state still holds the values from the previous iteration.
                break

        # i is always bound here because the empty-significantBins case
        # returned early above, so the loop body runs at least once.
        state['numberOfBinsConsidered'] = i
        return state['score'], state
Exemplo n.º 7
0
    def calculateScore(self):
        """
        Calculates the overall score for all significant bins, as described
        above.

        @return: a C{float} overall score for all significant bins (or C{None}
            if there are no significant bins) and a C{dict} with information
            about the score.
        """
        # Guard clause: don't attempt a calculation with no significant bins.
        # (The original tested self._significantBins twice; one test is
        # enough.)
        if not self._significantBins:
            analysis = {
                'score': None,
                'scoreClass': self.__class__,
            }

            return None, analysis

        bestBinScore = self._significantBins[0]['score']

        # Local import, presumably to avoid a circular import at module load
        # time — TODO confirm.
        from light.backend import Backend
        backend = Backend()
        backend.configure(self._dbParams)

        # Pre-compute the full feature sets of the query and the subject.
        allQueryFeatures = getHashFeatures(
            backend.getHashes(backend.scan(self._query)))

        allSubjectFeatures = getHashFeatures(
            backend.getHashes(backend.scan(self._subject.read)))

        # overallMatchedQueryOffsets and overallMatchedSubjectOffsets will
        # contain all int offsets that are in matching features (and thus
        # inside the matched region).
        overallMatchedQueryOffsets = set()
        overallMatchedSubjectOffsets = set()

        # overallUnmatchedQueryOffsets and overallUnmatchedSubjectOffsets
        # will contain all int offsets that are in features that don't match,
        # but which are inside the matched region.
        overallUnmatchedQueryOffsets = set()
        overallUnmatchedSubjectOffsets = set()

        # The set of all offsets in all bins (whether or not the offsets are
        # in matched features, unmatched features, or not in any feature).
        queryOffsetsInBins = set()
        subjectOffsetsInBins = set()

        # Get the features and their offsets which are matched and unmatched
        # in subject and query in all bins.
        for bin_ in (sb['bin'] for sb in self._significantBins):
            # Query.
            matchedOffsets, unmatchedOffsets, minOffset, maxOffset = (
                offsetsInBin(bin_, 'query', allQueryFeatures))
            overallMatchedQueryOffsets.update(matchedOffsets)
            overallUnmatchedQueryOffsets.update(unmatchedOffsets)
            queryOffsetsInBins.update(range(minOffset, maxOffset + 1))

            # Subject.
            matchedOffsets, unmatchedOffsets, minOffset, maxOffset = (
                offsetsInBin(bin_, 'subject', allSubjectFeatures))
            overallMatchedSubjectOffsets.update(matchedOffsets)
            overallUnmatchedSubjectOffsets.update(unmatchedOffsets)
            subjectOffsetsInBins.update(range(minOffset, maxOffset + 1))

        # Make sure none of the overall matched offsets are in the overall
        # unmatchedOffsets.
        overallMatchedQueryOffsets -= overallUnmatchedQueryOffsets
        overallMatchedSubjectOffsets -= overallUnmatchedSubjectOffsets

        # Overall score calculation step 1: the matched region score (MRS).
        matchedOffsetCount = (len(overallMatchedQueryOffsets) +
                              len(overallMatchedSubjectOffsets))
        totalOffsetCount = (matchedOffsetCount +
                            len(overallUnmatchedQueryOffsets) +
                            len(overallUnmatchedSubjectOffsets))

        try:
            matchedRegionScore = matchedOffsetCount / totalOffsetCount
        except ZeroDivisionError:
            # A small optimization could be done here. If the MRS is zero,
            # we already know the overall score will be zero, so we could
            # return at this point. To keep things simple, for now, just
            # continue with the overall calculation.
            matchedRegionScore = 0.0

        # Overall score calculation step 2: the length normalizer (LN).

        normalizerQuery, numeratorQuery, denominatorQuery = (
            computeLengthNormalizer(allQueryFeatures,
                                    overallMatchedQueryOffsets,
                                    overallUnmatchedQueryOffsets,
                                    queryOffsetsInBins))

        # There is a small optimization that could be done at this point.
        # If the query normalizer is 1.0, don't bother to compute a
        # normalizer for the subject (due to the use of max() below and
        # because a normalizer is always <= 1.0).  But to keep the code
        # simpler, for now, we still compute both normalizers.

        normalizerSubject, numeratorSubject, denominatorSubject = (
            computeLengthNormalizer(allSubjectFeatures,
                                    overallMatchedSubjectOffsets,
                                    overallUnmatchedSubjectOffsets,
                                    subjectOffsetsInBins))

        # Calculate the final score, as described in the docstring.
        score = matchedRegionScore * max(normalizerQuery, normalizerSubject)

        # The overall score can be lower than the best bin score, for
        # example when a sequence is compared against itself, where the
        # bestBinScore will be 1.0, but the overallScore can be lower,
        # because worse bins are taken into account. We don't allow that.
        if bestBinScore is not None and score < bestBinScore:
            overallScore = bestBinScore
            adjusted = True
        else:
            overallScore = score
            adjusted = False

        analysis = {
            'denominatorQuery': denominatorQuery,
            'denominatorSubject': denominatorSubject,
            'matchedOffsetCount': matchedOffsetCount,
            'matchedSubjectOffsetCount': len(overallMatchedSubjectOffsets),
            'matchedQueryOffsetCount': len(overallMatchedQueryOffsets),
            'matchedRegionScore': matchedRegionScore,
            'numeratorQuery': numeratorQuery,
            'numeratorSubject': numeratorSubject,
            'normalizerQuery': normalizerQuery,
            'normalizerSubject': normalizerSubject,
            'score': overallScore,
            'scoreClass': self.__class__,
            'totalOffsetCount': totalOffsetCount,
            'queryOffsetsInBins': len(queryOffsetsInBins),
            'subjectOffsetsInBins': len(subjectOffsetsInBins),
            'overallScoreAdjustedToBestBinScore': adjusted,
        }

        return overallScore, analysis