Example #1
 def __init__(self, histogram, query, subject, dbParams, findParams=None):
     self._histogram = histogram
     self._queryLen = len(query)
     self._subjectLen = len(subject)
     from light.parameters import FindParameters
     self._findParams = findParams or FindParameters()
     from light.backend import Backend
     backend = Backend()
     backend.configure(dbParams)
     scannedQuery = backend.scan(query)
     self._allQueryFeatures = set(scannedQuery.landmarks +
                                  scannedQuery.trigPoints)
     scannedSubject = backend.scan(subject.read)
     self._allSubjectFeatures = set(scannedSubject.landmarks +
                                    scannedSubject.trigPoints)
Example #2
    def __init__(self, histogram, query, subject, dbParams):
        self._histogram = histogram
        self._queryLen = len(query)
        self._subjectLen = len(subject)

        from light.backend import Backend
        backend = Backend()
        backend.configure(dbParams)

        scannedQuery = backend.scan(query)
        allQueryHashes = backend.getHashes(scannedQuery)
        self._allQueryFeatures = getHashFeatures(allQueryHashes)

        scannedSubject = backend.scan(subject.read)
        allSubjectHashes = backend.getHashes(scannedSubject)
        self._allSubjectFeatures = getHashFeatures(allSubjectHashes)
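
getHashFeatures is not defined in this snippet. A minimal sketch of one
plausible implementation, assuming each hash value is a list of
[landmark, trigPoint] pairings (as in the getHashes test in Example #4
below); the real function may differ:

    def getHashFeatures(hashes):
        """
        Return the set of all features (landmarks and trig points) that
        occur in any pairing of any hash.
        """
        features = set()
        for pairings in hashes.values():
            for landmark, trigPoint in pairings:
                features.update((landmark, trigPoint))
        return features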
Example #3
    def getFractionOfStructuresCovered(self):
        """
        Return the fraction of known structures matched by at least one
        substring in the subset that is being evaluated.
        """
        hit = 0
        total = 0

        db = DatabaseSpecifier().getDatabaseFromKeywords(
            trigPoints=[],
            landmarks=['AC ' + self.structureType],
            acAlphaHelixFilename=self.acAlphaHelixFilename,
            acAlphaHelix310Filename=self.acAlphaHelix310Filename,
            acAlphaHelixCombinedFilename=self.acAlphaHelixCombinedFilename,
            acAlphaHelixPiFilename=self.acAlphaHelixPiFilename,
            acExtendedStrandFilename=self.acExtendedStrandFilename)

        backend = Backend()
        backend.configure(db.dbParams)

        for read in FastaReads(self.structureFile,
                               readClass=AAReadWithX,
                               checkAlphabet=0):
            total += 1
            scannedRead = backend.scan(read)
            if len(scannedRead.landmarks) > 0:
                hit += 1

        return hit / total if total else 0.0
Example #4
 def testCollectReadHashes(self):
     """
     The getHashes method must return a dict keyed by (landmark, trigPoint)
     hash, with values listing the [landmark, trigPoint] pairings found in
     the read.
     """
     dbParams = DatabaseParameters(landmarks=[AlphaHelix],
                                   trigPoints=[Peaks], distanceBase=1.0)
     be = Backend()
     be.configure(dbParams)
     query = AARead('query', 'FRRRFRRRFASAASAFRRRFRRRFASAASA')
     scannedQuery = be.scan(query)
     hashCount = be.getHashes(scannedQuery)
     helixAt0 = Landmark(AlphaHelix.NAME, AlphaHelix.SYMBOL, 0, 9, 2)
     helixAt15 = Landmark(AlphaHelix.NAME, AlphaHelix.SYMBOL, 15, 9, 2)
     peakAt10 = TrigPoint(Peaks.NAME, Peaks.SYMBOL, 10)
     peakAt13 = TrigPoint(Peaks.NAME, Peaks.SYMBOL, 13)
     peakAt25 = TrigPoint(Peaks.NAME, Peaks.SYMBOL, 25)
     peakAt28 = TrigPoint(Peaks.NAME, Peaks.SYMBOL, 28)
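     # Each key joins the landmark hash key ('A2': symbol 'A' with detail
     # 2), the trig point symbol ('P'), and the offset difference between
     # the two features (unscaled here, since distanceBase is 1.0). E.g.,
     # 'A2:P:28' pairs the helix at offset 0 with the peak at offset 28.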
     self.assertEqual(
         {
             'A2:P:28': [[helixAt0, peakAt28]],
             'A2:P:25': [[helixAt0, peakAt25]],
             'A2:P:13': [[helixAt0, peakAt13], [helixAt15, peakAt28]],
             'A2:P:10': [[helixAt0, peakAt10], [helixAt15, peakAt25]],
             'A2:P:-5': [[helixAt15, peakAt10]],
             'A2:P:-2': [[helixAt15, peakAt13]],
             'A2:A2:15': [[helixAt0, helixAt15]],
         }, hashCount)
Example #5
 def testNoOverlapDefaultDistanceBase(self):
     """
     There cannot be any index overlap between landmarks found by the
     GOR4 alpha helix and beta strand finders using the default distance
     base (currently 1.1).
     """
     alphaHelixBe = Backend()
     alphaHelixBe.configure(
         DatabaseParameters(landmarks=[GOR4AlphaHelix], trigPoints=[]))
     betaStrandBe = Backend()
     betaStrandBe.configure(
         DatabaseParameters(landmarks=[GOR4BetaStrand], trigPoints=[]))
     alphaHelixScanned = alphaHelixBe.scan(self.READ)
     betaStrandScanned = betaStrandBe.scan(self.READ)
     alphaHelixIndices = alphaHelixScanned.coveredIndices()
     betaStrandIndices = betaStrandScanned.coveredIndices()
     self.assertEqual(0, len(alphaHelixIndices & betaStrandIndices))
Example #6
    def __init__(self, histogram, query, subject, dbParams, weights=None):
        self._histogram = histogram
        self._queryLen = len(query)
        self._subjectLen = len(subject)

        self._weights = self.DEFAULT_WEIGHTS if weights is None else weights

        from light.backend import Backend
        backend = Backend()
        backend.configure(dbParams)

        scannedQuery = backend.scan(query)
        allQueryHashes = backend.getHashes(scannedQuery)
        self._allQueryFeatures = getHashFeatures(allQueryHashes)

        scannedSubject = backend.scan(subject.read)
        allSubjectHashes = backend.getHashes(scannedSubject)
        self._allSubjectFeatures = getHashFeatures(allSubjectHashes)
Example #7
 def testNoOverlapDistanceBaseOne(self):
     """
     There cannot be any index overlap between landmarks found by the
     GOR4 alpha helix and beta strand finders using a distance base of 1.0
     (which should do no scaling).
     """
     alphaHelixBe = Backend()
     alphaHelixBe.configure(
         DatabaseParameters(landmarks=[GOR4AlphaHelix],
                            trigPoints=[],
                            distanceBase=1.0))
     betaStrandBe = Backend()
     betaStrandBe.configure(
         DatabaseParameters(landmarks=[GOR4BetaStrand],
                            trigPoints=[],
                            distanceBase=1.0))
     alphaHelixScanned = alphaHelixBe.scan(self.READ)
     betaStrandScanned = betaStrandBe.scan(self.READ)
     alphaHelixIndices = alphaHelixScanned.coveredIndices()
     betaStrandIndices = betaStrandScanned.coveredIndices()
     self.assertEqual(0, len(alphaHelixIndices & betaStrandIndices))
Example #8
 def testScan(self):
     """
     The scan method must return a scanned subject.
     """
     subject = AARead('subject', 'FRRRFRRRFASAASA')
     dbParams = DatabaseParameters(landmarks=[AlphaHelix],
                                   trigPoints=[Peaks])
     be = Backend()
     be.configure(dbParams)
     be.addSubject(subject, '0')
     scannedSubject = be.scan(subject)
     self.assertIsInstance(scannedSubject, ScannedRead)
Example #9
 def testCollectReadHashesWithOneLandmark(self):
     """
     The getHashes method must return a dict keyed by (landmark, trigPoint)
     hash. The result must be empty if the read contains only a single
     landmark and no trig points.
     """
     dbParams = DatabaseParameters(landmarks=[AlphaHelix], trigPoints=[])
     be = Backend()
     be.configure(dbParams)
     query = AARead('query', 'FRRRFRRRF')
     scannedQuery = be.scan(query)
     hashCount = be.getHashes(scannedQuery)
     self.assertEqual({}, hashCount)
Example #10
 def testGetScannedPairs(self):
     """
     The getScannedPairs method must return pairs of
     (landmark, trigPoint).
     """
     subject = AARead('subject', 'FRRRFRRRFASAASA')
     dbParams = DatabaseParameters(landmarks=[AlphaHelix],
                                   trigPoints=[Peaks], distanceBase=1.0)
     be = Backend()
     be.configure(dbParams)
     be.addSubject(subject, '0')
     scannedSubject = be.scan(subject)
     pairs = list(be.getScannedPairs(scannedSubject))
     # First pair.
     landmark, trigPoint = pairs[0]
     self.assertEqual(Landmark(AlphaHelix.NAME, AlphaHelix.SYMBOL,
                               0, 9, 2), landmark)
     self.assertEqual(TrigPoint(Peaks.NAME, Peaks.SYMBOL, 10), trigPoint)
     # Second pair.
     landmark, trigPoint = pairs[1]
     self.assertEqual(Landmark(AlphaHelix.NAME, AlphaHelix.SYMBOL,
                               0, 9, 2), landmark)
     self.assertEqual(TrigPoint(Peaks.NAME, Peaks.SYMBOL, 13), trigPoint)
     self.assertEqual(2, len(pairs))
Example #11
    def __init__(self, sequences, labels, defaultLabel=None, **kwargs):
        """
        Base class for using cluster analysis to evaluate how well various
        feature finders and database parameter settings can separate a set of
        sequences. The clustering is based on feature offset deltas.

        @param sequences: Either a C{str} filename of sequences to consider or
            a C{light.reads.Reads} instance.
        @param labels: A C{dict} with a label for each sequence id in
            C{sequences}. These are the known categories of each sequence.
        @param defaultLabel: If not C{None}, a label to use for reads whose ids
            are not present in C{labels}. If C{None} and a read id has no
            label, a C{ValueError} is raised.
        @param kwargs: See
            C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
            additional keywords, all of which are optional.
        @raises ValueError: If the id of a read is not in labels and no default
            label has been set, or if there are no reads in C{sequences}.
        """
        if isinstance(sequences, str):
            reads = FastaReads(sequences,
                               readClass=AAReadWithX,
                               upperCase=True)
        else:
            reads = sequences
        database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
        backend = Backend()
        backend.configure(database.dbParams)
        allOffsetDeltas = []
        trueLabels = []

        for read in reads:
            trueLabel = labels.get(read.id, defaultLabel)
            if trueLabel is None:
                raise ValueError('Read %r has no corresponding label' %
                                 read.id)
            trueLabels.append(trueLabel)
            offsetDeltas = Counter()
            scannedRead = backend.scan(read)
            for landmark, trigPoint in backend.getScannedPairs(scannedRead):
                delta = scaleLog(trigPoint.offset - landmark.offset,
                                 database.dbParams.distanceBase)
                offsetDeltas[delta] += 1
            allOffsetDeltas.append(offsetDeltas)

        nReads = len(reads)

        if nReads == 0:
            raise ValueError('No sequences were found in %r' % sequences)

        # Don't check that len(reads) == len(labels). I.e., ignore extra labels
        # to make using this class interactively more convenient.

        # Create an affinity matrix. Initially set all values to 1.0 so we
        # don't need to later initialize the diagonal.
        affinity = np.ones((nReads, nReads))

        for row in range(nReads):
            for col in range(row + 1, nReads):
                affinity[row, col] = affinity[col, row] = (
                    self.affinityFromOffsetDeltas(
                        allOffsetDeltas[row], allOffsetDeltas[col]))

        self.nReads = nReads
        self.affinity = affinity
        self.trueLabels = trueLabels
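
The affinityFromOffsetDeltas method is not part of this snippet. A minimal
sketch of one plausible implementation, assuming it maps two Counter
instances of offset deltas to a similarity in [0.0, 1.0] (the real method
may differ):

    from collections import Counter

    def affinityFromOffsetDeltas(deltas1, deltas2):
        """
        Return the fraction of offset delta counts two reads share, using
        multiset intersection over union (1.0 when identical).
        """
        shared = sum((deltas1 & deltas2).values())
        total = sum((deltas1 | deltas2).values())
        # Two featureless reads have identical (empty) delta multisets.
        return shared / total if total else 1.0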
Example #12
    def calculateScore(self):
        """
        Calculates the overall score, as described above.

        @return: a C{float} overall score for all significant bins (or C{None}
            if there are no significant bins) and a C{dict} with information
            about the score.
        """
        # We could do more checking here and use the score of the best bin as
        # the overall score if there is only one significant bin or if the
        # score of the best bin is 1.0.

        # Don't attempt to calculate an overall score if there are no
        # significant bins.
        if not self._significantBins:
            analysis = {
                'score': None,
                'scoreClass': self.__class__,
            }

            return None, analysis

        from light.backend import Backend
        backend = Backend()
        backend.configure(self._dbParams)

        allQueryFeatures = getHashFeatures(
            backend.getHashes(backend.scan(self._query)))

        allSubjectFeatures = getHashFeatures(
            backend.getHashes(backend.scan(self._subject.read)))

        # Scoring state, updated as each significant bin is added.
        state = {
            # overallMatchedQueryOffsets and overallMatchedSubjectOffsets will
            # contain all int offsets that are in matching features (and thus
            # inside the matched region).
            'overallMatchedQueryOffsets': set(),
            'overallMatchedSubjectOffsets': set(),
            # overallUnmatchedQueryOffsets and overallUnmatchedSubjectOffsets
            # will contain all int offsets that are in features that don't
            # match, but which are inside the matched region.
            'overallUnmatchedQueryOffsets': set(),
            'overallUnmatchedSubjectOffsets': set(),
            # The set of all offsets in all bins (whether or not the offsets
            # are in matched features, unmatched features, or not in any
            # feature).
            'queryOffsetsInBins': set(),
            'subjectOffsetsInBins': set(),
            'score': 0.0,
            'denominatorQuery': 0.0,
            'denominatorSubject': 0.0,
            'matchedOffsetCount': 0,
            'matchedRegionScore': 0.0,
            'numeratorQuery': 0.0,
            'numeratorSubject': 0.0,
            'normalizerQuery': 0.0,
            'normalizerSubject': 0.0,
            'totalOffsetCount': 0,
            'scoreClass': self.__class__,
            'queryOffsetsInBinsCount': 0,
            'subjectOffsetsInBinsCount': 0,
            'numberOfBinsConsidered': 0,
        }

        # Consider the significant bins one by one until adding another bin
        # would lower the overall score, or we run out of bins.
        for i, bin_ in enumerate((sb['bin'] for sb in self._significantBins),
                                 start=1):

            result = addBin(bin_, allQueryFeatures, allSubjectFeatures, state)
            # Check if we can add more bins, or if we need to stop here.
            if result['score'] >= state['score']:
                # The new overallScore is higher or equal to the current
                # overallScore. Continue adding the next bin using the newly
                # calculated values.
                state.update(result)
            else:
                # The new overallScore is lower than the current overallScore.
                break

        state['numberOfBinsConsidered'] = i
        return state['score'], state
Example #13
    def calculateScore(self):
        """
        Calculates the overall score for all significant bins, as described
        above.

        @return: a C{float} overall score for all significant bins (or C{None}
            if there are no significant bins) and a C{dict} with information
            about the score.
        """
        if not self._significantBins:
            analysis = {
                'score': None,
                'scoreClass': self.__class__,
            }

            return None, analysis

        bestBinScore = self._significantBins[0]['score']

        from light.backend import Backend
        backend = Backend()
        backend.configure(self._dbParams)

        allQueryFeatures = getHashFeatures(
            backend.getHashes(backend.scan(self._query)))

        allSubjectFeatures = getHashFeatures(
            backend.getHashes(backend.scan(self._subject.read)))

        # overallMatchedQueryOffsets and overallMatchedSubjectOffsets will
        # contain all int offsets that are in matching features (and thus
        # inside the matched region).
        overallMatchedQueryOffsets = set()
        overallMatchedSubjectOffsets = set()

        # overallUnmatchedQueryOffsets and overallUnmatchedSubjectOffsets
        # will contain all int offsets that are in features that don't match,
        # but which are inside the matched region.
        overallUnmatchedQueryOffsets = set()
        overallUnmatchedSubjectOffsets = set()

        # The set of all offsets in all bins (whether or not the offsets are in
        # matched features, unmatched features, or not in any feature).
        queryOffsetsInBins = set()
        subjectOffsetsInBins = set()

        # Get the features and their offsets which are matched and unmatched in
        # subject and query in all bins.
        for bin_ in (sb['bin'] for sb in self._significantBins):
            # Query.
            matchedOffsets, unmatchedOffsets, minOffset, maxOffset = (
                offsetsInBin(bin_, 'query', allQueryFeatures))
            overallMatchedQueryOffsets.update(matchedOffsets)
            overallUnmatchedQueryOffsets.update(unmatchedOffsets)
            queryOffsetsInBins.update(range(minOffset, maxOffset + 1))

            # Subject.
            matchedOffsets, unmatchedOffsets, minOffset, maxOffset = (
                offsetsInBin(bin_, 'subject', allSubjectFeatures))
            overallMatchedSubjectOffsets.update(matchedOffsets)
            overallUnmatchedSubjectOffsets.update(unmatchedOffsets)
            subjectOffsetsInBins.update(range(minOffset, maxOffset + 1))

        # Make sure none of the overall matched offsets are in the overall
        # unmatchedOffsets.
        overallMatchedQueryOffsets -= overallUnmatchedQueryOffsets
        overallMatchedSubjectOffsets -= overallUnmatchedSubjectOffsets

        # Overall score calculation step 1: the matched region score (MRS).
        matchedOffsetCount = (len(overallMatchedQueryOffsets) +
                              len(overallMatchedSubjectOffsets))
        totalOffsetCount = (matchedOffsetCount +
                            len(overallUnmatchedQueryOffsets) +
                            len(overallUnmatchedSubjectOffsets))

        try:
            matchedRegionScore = matchedOffsetCount / totalOffsetCount
        except ZeroDivisionError:
            # A small optimization could be done here. If the MRS is zero,
            # we already know the overall score will be zero, so we could
            # return at this point. To keep things simple, for now, just
            # continue with the overall calculation.
            matchedRegionScore = 0.0

        # Overall score calculation step 2: the length normalizer (LN).

        normalizerQuery, numeratorQuery, denominatorQuery = (
            computeLengthNormalizer(allQueryFeatures,
                                    overallMatchedQueryOffsets,
                                    overallUnmatchedQueryOffsets,
                                    queryOffsetsInBins))

        # There is a small optimization that could be done at this point.
        # If the query normalizer is 1.0, don't bother to compute a
        # normalizer for the subject (due to the use of max() below and
        # because a normalizer is always <= 1.0).  But to keep the code
        # simpler, for now, we still compute both normalizers.

        normalizerSubject, numeratorSubject, denominatorSubject = (
            computeLengthNormalizer(allSubjectFeatures,
                                    overallMatchedSubjectOffsets,
                                    overallUnmatchedSubjectOffsets,
                                    subjectOffsetsInBins))

        # Calculate the final score, as described in the docstring.
        score = matchedRegionScore * max(normalizerQuery, normalizerSubject)

        # The overall score can be lower than the best bin score, for
        # example when a sequence is compared against itself, where the
        # bestBinScore will be 1.0, but the overallScore can be lower,
        # because worse bins are taken into account. We don't allow that.
        if bestBinScore is not None and score < bestBinScore:
            overallScore = bestBinScore
            adjusted = True
        else:
            overallScore = score
            adjusted = False

        analysis = {
            'denominatorQuery': denominatorQuery,
            'denominatorSubject': denominatorSubject,
            'matchedOffsetCount': matchedOffsetCount,
            'matchedSubjectOffsetCount': len(overallMatchedSubjectOffsets),
            'matchedQueryOffsetCount': len(overallMatchedQueryOffsets),
            'matchedRegionScore': matchedRegionScore,
            'numeratorQuery': numeratorQuery,
            'numeratorSubject': numeratorSubject,
            'normalizerQuery': normalizerQuery,
            'normalizerSubject': normalizerSubject,
            'score': overallScore,
            'scoreClass': self.__class__,
            'totalOffsetCount': totalOffsetCount,
            'queryOffsetsInBins': len(queryOffsetsInBins),
            'subjectOffsetsInBins': len(subjectOffsetsInBins),
            'overallScoreAdjustedToBestBinScore': adjusted,
        }

        return overallScore, analysis
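
Taken together, the steps above reduce to score = matchedRegionScore *
max(normalizerQuery, normalizerSubject), possibly adjusted up to the best
bin score. A quick worked check with invented counts:

    matchedOffsetCount = 40     # matched offsets, query plus subject
    totalOffsetCount = 50       # matched plus unmatched feature offsets
    matchedRegionScore = matchedOffsetCount / totalOffsetCount  # 0.8
    normalizerQuery, normalizerSubject = 0.9, 0.75
    score = matchedRegionScore * max(normalizerQuery, normalizerSubject)
    # score == 0.72; if the best bin scored, say, 0.8, the overall score
    # would be adjusted up to 0.8.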
Example #14
class CalculateOverlap(object):
    """
    Calculate the overlap between the features found by our finders and the
    secondary structures found by DSSP. The secondary structures found by
    DSSP were downloaded from http://www.rcsb.org/pdb/files/ss.txt on
    11/11/2015.

    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    """
    def __init__(self, **kwargs):
        # Set default landmark and trig point finders.
        if 'landmarks' not in kwargs:
            kwargs['landmarks'] = ALL_LANDMARK_CLASSES + [
                c for c in DEV_LANDMARK_CLASSES if c.NAME.startswith('PDB ')
            ]
        if 'trigPoints' not in kwargs:
            kwargs['trigPoints'] = [
                c for c in ALL_TRIG_CLASSES if c.NAME != 'Volume'
            ]

        db = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
        self._backend = Backend()
        self._backend.configure(db.dbParams)

        self._names = (db.dbParams.landmarkFinderNames() +
                       db.dbParams.trigPointFinderNames())

    def getFeatures(self, ssAARead):
        """
        Extract the features from the sequence. Return information about the
        offsets covered by each feature as well as the intersection and union
        of offsets for each pair of features.

        @param ssAARead: An C{SSAARead} instance.
        @return: A triple of C{defaultdict(set)}s. These contain:
            1) The sequence features, keyed by C{str} feature name, each with
               a C{set} of C{int}s as a value, giving the offsets in
               C{ssAARead} where the feature was found.
            2) The intersection of offsets for each pair of feature finders.
               This is keyed by a C{frozenset} of the two C{str} finder
               names. The values are C{set}s of C{int}s, as in (1).
            3) The union of offsets for each pair of feature finders, keyed
               as in (2). The values are C{set}s of C{int}s, as in (1).
        """

        features = defaultdict(set)
        intersection = defaultdict(set)
        union = defaultdict(set)

        scannedSequence = self._backend.scan(ssAARead)

        # Get all offsets for each landmark and trig point separately.
        for feature in scannedSequence.landmarks + scannedSequence.trigPoints:
            features[feature.name].update(feature.coveredOffsets())

        # Get the offset intersection and union of all pairs of features.
        for i, name1 in enumerate(self._names):
            for name2 in self._names[i + 1:]:
                key = frozenset((name1, name2))
                intersection[key] = features[name1] & features[name2]
                union[key] = features[name1] | features[name2]

        return features, intersection, union
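
A usage sketch of getFeatures. The finder names, sequence, and structure
string below are invented, and it is assumed that C{dark.reads.SSAARead}
takes an id, a sequence, and a structure string of equal length:

    from dark.reads import SSAARead

    overlap = CalculateOverlap(landmarks=['AlphaHelix'],
                               trigPoints=['Peaks'])
    read = SSAARead('2HLA:A', 'FRRRFRRRFASAASA', 'HHHHHHHHH------')
    features, intersection, union = overlap.getFeatures(read)
    for names, offsets in intersection.items():
        print(sorted(names), sorted(offsets))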
Example #15
    def print_(self,
               printQuery=True,
               printSequences=False,
               printFeatures=False,
               printHistograms=False,
               queryDescription='Query title',
               sortHSPsByScore=True,
               margin='',
               result=None):
        """
        Print a result in a human-readable format. If self._storeFullAnalysis
        is True, full information about all matched subjects (i.e., including
        matches that were not significant) will be printed. If not, only basic
        information about significant matches will appear.

        @param printQuery: If C{True}, also print details of the query.
        @param printSequences: If C{True}, also print query and subject
            sequences.
        @param printFeatures: If C{True}, print details of landmark and trig
            point features.
        @param printHistograms: If C{True}, print details of histograms.
        @param queryDescription: A C{str} description to print before the query
            (when C{printQuery} is C{True}).
        @param sortHSPsByScore: If C{True}, HSPs for a subject should be
            printed in order of decreasing score. If C{False}, print sorted by
            histogram bin number.
        @param margin: A C{str} that should be inserted at the start of each
            line of output.
        @param result: A C{MultilineString} instance, or C{None} if a new
            C{MultilineString} should be created.
        @return: If C{result} was C{None}, return a C{str} representation of
            the scanned read, else C{None}.
        """
        if result is None:
            result = MultilineString(margin=margin)
            returnNone = False
        else:
            returnNone = True

        append = result.append
        extend = result.extend
        indent = result.indent
        outdent = result.outdent

        if printQuery:
            backend = Backend()
            backend.configure(self.connector.dbParams)
            scannedQuery = backend.scan(self.query)
            scannedQuery.print_(printSequence=printSequences,
                                printFeatures=printFeatures,
                                description=queryDescription,
                                margin=margin,
                                result=result)

        self._findParams.print_(margin=margin, result=result)

        # Sort matched subjects (if any) in order of decreasing score so we
        # can print them in a useful order.
        #
        # The following sorted() call will fail (with TypeError) under
        # Python 3 because bestBinScore is None when there are no significant
        # matches (which can happen when self._storeFullAnalysis is True).
        subjectIndices = sorted(
            iter(self.analysis.keys()),
            reverse=True,
            key=lambda index: self.analysis[index]['bestBinScore'])

        if not sortHSPsByScore:
            indexGetter = itemgetter('index')

        extend([
            'Overall matches: %d' % len(subjectIndices),
            'Significant matches: %d' % len(list(self.significantSubjects())),
            'Query hash count: %d' % self.queryHashCount,
        ])

        if subjectIndices:
            append('Matched subjects:')

        indent()

        for subjectCount, subjectIndex in enumerate(subjectIndices, start=1):
            analysis = self.analysis[subjectIndex]
            subject = self.connector.getSubjectByIndex(subjectIndex)
            minHashCount = min(self.queryHashCount, subject.hashCount)
            significantBins = analysis['significantBins']

            append('Subject %d:' % subjectCount)
            indent()

            extend([
                'Title: %s' % subject.read.id,
                'Best HSP score: %s' % analysis['bestBinScore'],
            ])

            if printSequences:
                append('Sequence: %s' % subject.read.sequence)

            extend([
                'Index in database: %s' % subjectIndex,
                'Subject hash count: %s' % subject.hashCount,
                'Subject/query min hash count: %s' % minHashCount,
                'Significance cutoff: %f' %
                (self._findParams.significanceFraction * minHashCount),
                'Number of HSPs: %d' % len(significantBins),
            ])

            if not sortHSPsByScore:
                significantBins = deepcopy(significantBins)
                significantBins.sort(key=indexGetter)

            indent()
            for hspCount, bin_ in enumerate(significantBins, start=1):
                binCount = len(bin_['bin'])
                append('HSP %d (bin %d): %d matching hash%s, score %f' %
                       (hspCount, bin_['index'], binCount,
                        '' if binCount == 1 else 'es', bin_['score']))

                if printFeatures:
                    indent()
                    for binItem in bin_['bin']:
                        extend([
                            'Landmark %s' % binItem['subjectLandmark'],
                            'Trig point %s' % binItem['subjectTrigPoint'],
                        ])
                    outdent()
            outdent()

            if printHistograms and self._storeFullAnalysis:
                histogram = analysis['histogram']
                significantBinIndices = set(
                    [bin_['index'] for bin_ in significantBins])
                maxCount = max(len(bin_) for bin_ in histogram.bins)

                append('Histogram:')
                indent()
                extend([
                    'Number of bins: %d' % len(histogram.bins),
                    'Bin width: %.10f' % histogram.binWidth,
                    'Max bin count: %r' % maxCount,
                    'Max (scaled) offset delta: %d' % histogram.max,
                    'Min (scaled) offset delta: %d' % histogram.min,
                ])

                # Calculate column widths for displaying ranges neatly.
                maxAbsoluteValue = max(abs(histogram.min), abs(histogram.max))
                if maxAbsoluteValue == 0:
                    # All printed range values will be '+0.0', of length 4.
                    rangeWidth = 4
                else:
                    # Add 3 because we have the sign, a decimal point, and
                    # one digit of precision.
                    rangeWidth = 3 + int(ceil(log10(maxAbsoluteValue)))
                rangeSeparator = ' to '
                rangeColumnWidth = 2 * rangeWidth + len(rangeSeparator)

                first = True
                for binIndex, bin_ in enumerate(histogram.bins):
                    binCount = len(bin_)
                    if binCount:
                        if first:
                            append('Non-empty bins:')
                            indent()
                            append('%s %s %*s %s' %
                                   ('Index', 'Count', rangeColumnWidth,
                                    'Range', 'Significant'))
                            first = False
                        binLow = histogram.min + binIndex * histogram.binWidth
                        # The 5, 5, and 11 embedded in the format string
                        # below are the lengths of 'Index', 'Count', and
                        # 'Significant'.
                        append('%5d %5d %+*.1f%s%+*.1f %11s' %
                               (binIndex, binCount, rangeWidth, binLow,
                                rangeSeparator, rangeWidth,
                                binLow + histogram.binWidth, 'Yes'
                                if binIndex in significantBinIndices else ''))

                if first:
                    append('All bins were empty.')
                else:
                    outdent()
                outdent()

        if not returnNone:
            return str(result)
Example #16
    def __init__(self, sequences, cutoff, **kwargs):
        """
        A class to work with hashes.

        For a set of given sequences, find all hashes and, for each sequence,
        make a string of 1s and 0s denoting whether each hash is present in
        that sequence. Only include hashes that occur in more than a
        specified fraction of all the given sequences.

        @param sequences: A C{str} filename with a fasta file of sequences to
            be used or a C{dark.reads.Reads} object.
        @param cutoff: A C{float} fraction, between 0.0 and 1.0. A hash must
            be present in more than this fraction of the sequences to be
            included in the final string.
        @param kwargs: See
            C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
            additional keywords, all of which are optional.
        """
        if isinstance(sequences, str):
            reads = FastaReads(sequences,
                               readClass=AAReadWithX,
                               upperCase=True)
        else:
            reads = sequences

        database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
        backend = Backend()
        backend.configure(database.dbParams)

        # Make a dictionary where the keys are the sequence ids and the value
        # is an orderedDict of hashes as returned from getHashes().
        hashes = {}
        for read in reads:
            scannedRead = backend.scan(read)
            readHashes = backend.getHashes(scannedRead)
            hashes[read.id] = readHashes

        # Make a set of all unique hashes that occur.
        totalHashes = set()
        for readId in hashes:
            totalHashes.update(hashes[readId].keys())

        # Make a dictionary where the key is a hash and the value is a list
        # of the ids of the reads in which the hash occurs.
        byHashes = {}
        for hash_ in totalHashes:
            byHashes[hash_] = [readId for readId in hashes
                               if hash_ in hashes[readId]]

        # Make a dictionary where the key is a read id and the value is a
        # string of 1s and 0s denoting which hashes occur in that read.
        cutoffCount = cutoff * len(reads)
        self.hashString = {read.id: '' for read in reads}

        for hash_ in byHashes:
            if len(byHashes[hash_]) > cutoffCount:
                for readId in self.hashString:
                    self.hashString[readId] += (
                        '1' if readId in byHashes[hash_] else '0')
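
A usage sketch. The class name is not visible in this snippet, so HashString
below is a stand-in, and the FASTA filename is invented:

    hashStrings = HashString('sequences.fasta', cutoff=0.5,
                             landmarks=['AlphaHelix'], trigPoints=['Peaks'])
    # One string per read id, with a column per sufficiently common hash.
    for readId, bits in sorted(hashStrings.hashString.items()):
        print(readId, bits)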
Example #17
    def __init__(self,
                 query,
                 connector,
                 matches,
                 queryHashCount,
                 findParams=None,
                 nonMatchingHashes=None,
                 storeFullAnalysis=False):
        self.query = query
        self.connector = connector
        self.matches = matches  # Only saved on self for testing.
        self.queryHashCount = queryHashCount
        findParams = findParams or FindParameters()
        self._findParams = findParams
        self.nonMatchingHashes = nonMatchingHashes
        self._storeFullAnalysis = storeFullAnalysis
        self.analysis = defaultdict(dict)
        deltaScale = findParams.deltaScale
        scoreGetter = itemgetter('score')
        be = Backend()
        be.configure(connector.dbParams)

        if findParams.significanceMethod == 'AAFraction':
            queryAACount = len(be.scan(query).coveredIndices())

        # Go through all the subjects that were matched at all, and put the
        # match offset deltas into bins so we can decide which (if any) of
        # the matches is significant.
        for subjectIndex in matches:
            subject = connector.getSubjectByIndex(subjectIndex)
            # Use a histogram to bin scaled (landmark, trigPoint) offset
            # deltas.
            nBins = max(len(query), len(subject))
            # Make sure the number of bins is odd, else Histogram() will raise.
            nBins |= 0x1
            histogram = Histogram(nBins)
            add = histogram.add

            # To ensure the set of query/subject offset deltas is the same
            # no matter which of the sequences is the query and which is
            # the subject, we negate all deltas if the subject sequence
            # sorts first. This is just a way of canonicalizing the set of
            # deltas. If we don't canonicalize, we get sets of deltas with
            # opposite signs, like {-4, -2, 6} and {-6, 2, 4} depending on
            # which sequence is the subject and which the query. This
            # occasionally leads to hard-to-debug and awkward-to-fix
            # differences in the histogram binning at bin boundaries due to
            # tiny floating point differences. The simple solution is to
            # canonicalize the deltas based on an arbitrary consistent
            # difference between the subject and query.
            negateDeltas = subject.read.sequence < query.sequence

            for match in matches[subjectIndex]:
                # The delta is the difference between the corresponding
                # landmark offsets.
                subjectLandmarkOffset = match['subjectLandmark'].offset
                queryLandmarkOffset = match['queryLandmark'].offset
                delta = subjectLandmarkOffset - queryLandmarkOffset
                if negateDeltas:
                    delta = -delta

                # Add the information about this common landmark /
                # trig point hash to the histogram bucket for the
                # query landmark to subject landmark offset delta.
                add(scaleLinear(delta, deltaScale), match)

            histogram.finalize()

            minHashCount = min(queryHashCount, subject.hashCount)

            significanceMethod = findParams.significanceMethod
            if significanceMethod == 'Always':
                significance = Always()
            elif significanceMethod == 'HashFraction':
                significance = HashFraction(histogram, minHashCount,
                                            findParams.significanceFraction)
            elif significanceMethod == 'MaxBinHeight':
                significance = MaxBinHeight(histogram, query, connector)
            elif significanceMethod == 'MeanBinHeight':
                significance = MeanBinHeight(histogram, query, connector)
            elif significanceMethod == 'AAFraction':
                featureAACount = (queryAACount +
                                  len(be.scan(subject.read).coveredIndices()))
                significance = AAFraction(histogram, featureAACount,
                                          findParams.significanceFraction)
            else:
                raise ValueError('Unknown significance method %r' %
                                 significanceMethod)

            binScoreMethod = findParams.binScoreMethod
            if binScoreMethod == 'NoneScore':
                scorer = NoneScore()
            elif binScoreMethod == 'MinHashesScore':
                scorer = MinHashesScore(histogram, minHashCount)
            elif binScoreMethod == 'FeatureMatchingScore':
                scorer = FeatureMatchingScore(histogram, query, subject,
                                              connector.dbParams, findParams)
            elif binScoreMethod == 'FeatureAAScore':
                scorer = FeatureAAScore(histogram, query, subject,
                                        connector.dbParams)
            elif binScoreMethod == 'WeightedFeatureAAScore':
                scorer = WeightedFeatureAAScore(histogram, query, subject,
                                                connector.dbParams,
                                                findParams.weights)
            elif binScoreMethod == 'FeatureAALengthScore':
                scorer = FeatureAALengthScore(histogram, query, subject,
                                              connector.dbParams)
            else:
                raise ValueError('Unknown bin score method %r' %
                                 binScoreMethod)

            # Find bins with a significant number of elements and score them.
            significantBins = []
            for binIndex, bin_ in enumerate(histogram.bins):
                if significance.isSignificant(binIndex):
                    score, scoreAnalysis = scorer.calculateScore(binIndex)
                    significantBin = {
                        'bin': bin_,
                        'index': binIndex,
                        'score': score
                    }
                    if storeFullAnalysis:
                        significantBin['scoreAnalysis'] = scoreAnalysis
                    significantBins.append(significantBin)

            if significantBins:
                significantBins.sort(key=scoreGetter, reverse=True)
                bestBinScore = significantBins[0]['score']
            else:
                bestBinScore = None

            overallScoreMethod = findParams.overallScoreMethod
            if overallScoreMethod == 'BestBinScore':
                scorer = BestBinScore(histogram, significantBins)
            elif overallScoreMethod == 'SignificantBinScore':
                scorer = SignificantBinScore(significantBins, query, subject,
                                             connector.dbParams)
            elif overallScoreMethod == 'GreedySignificantBinScore':
                scorer = GreedySignificantBinScore(significantBins, query,
                                                   subject, connector.dbParams)
            else:
                raise ValueError('Unknown overall score method %r' %
                                 overallScoreMethod)

            overallScore, overallScoreAnalysis = scorer.calculateScore()

            if storeFullAnalysis:
                self.analysis[subjectIndex] = {
                    'histogram': histogram,
                    'bestBinScore': bestBinScore,
                    'overallScore': overallScore,
                    'overallScoreAnalysis': overallScoreAnalysis,
                    'significantBins': significantBins,
                    'significanceAnalysis': significance.analysis,
                }
            elif significantBins:
                self.analysis[subjectIndex] = {
                    'bestBinScore': bestBinScore,
                    'overallScore': overallScore,
                    'significantBins': significantBins,
                }
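
A usage sketch of how FindParameters selects the scorers above. The attribute
names match those read by the code; the constructor keywords are assumed to
mirror them, and the values are invented:

    findParams = FindParameters(significanceMethod='HashFraction',
                                significanceFraction=0.25,
                                binScoreMethod='FeatureAAScore',
                                overallScoreMethod='BestBinScore',
                                deltaScale=1.0)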
Example #18
        cmd.rebuild()

        # Color the features and chains.
        for i, chain in enumerate(chains):
            if chain.id == chainToCompare.id:
                # Color the whole chain.
                what = '%s & chain %s' % (structureName, chain.id)
                try:
                    color = CHAIN_COLORS[i]
                except IndexError:
                    color = 'white'
                cmd.color(color, what)

                # Color the features.
                scannedQuery = backend.scan(chain)
                for landmark in scannedQuery.landmarks:
                    color = FEATURE_COLORS[landmark.symbol]
                    start = landmark.offset
                    end = landmark.offset + landmark.length
                    what = 'resi %d-%d & %s & chain %s' % (start, end - 1,
                                                           structureName,
                                                           chain.id)
                    cmd.color(color, what)

                for trigPoint in scannedQuery.trigPoints:
                    color = FEATURE_COLORS[trigPoint.symbol]
                    start = trigPoint.offset
                    end = trigPoint.offset + trigPoint.length
                    what = 'resi %d-%d & %s & chain %s' % (start, end - 1,
                                                           structureName,
                                                           chain.id)
                    cmd.color(color, what)