def __init__(self, histogram, query, subject, dbParams, findParams=None):
    self._histogram = histogram
    self._queryLen = len(query)
    self._subjectLen = len(subject)
    from light.parameters import FindParameters
    self._findParams = findParams or FindParameters()
    from light.backend import Backend
    backend = Backend()
    backend.configure(dbParams)
    scannedQuery = backend.scan(query)
    self._allQueryFeatures = set(scannedQuery.landmarks +
                                 scannedQuery.trigPoints)
    scannedSubject = backend.scan(subject.read)
    self._allSubjectFeatures = set(scannedSubject.landmarks +
                                   scannedSubject.trigPoints)

def __init__(self, histogram, query, subject, dbParams):
    self._histogram = histogram
    self._queryLen = len(query)
    self._subjectLen = len(subject)
    from light.backend import Backend
    backend = Backend()
    backend.configure(dbParams)
    scannedQuery = backend.scan(query)
    allQueryHashes = backend.getHashes(scannedQuery)
    self._allQueryFeatures = getHashFeatures(allQueryHashes)
    scannedSubject = backend.scan(subject.read)
    allSubjectHashes = backend.getHashes(scannedSubject)
    self._allSubjectFeatures = getHashFeatures(allSubjectHashes)

def getFractionOfStructuresCovered(self):
    """
    Return the fraction of known structures matched by at least one
    substring in the subset that is being evaluated.
    """
    hit = 0
    total = 0
    db = DatabaseSpecifier().getDatabaseFromKeywords(
        trigPoints=[],
        landmarks=['AC ' + self.structureType],
        acAlphaHelixFilename=self.acAlphaHelixFilename,
        acAlphaHelix310Filename=self.acAlphaHelix310Filename,
        acAlphaHelixCombinedFilename=self.acAlphaHelixCombinedFilename,
        acAlphaHelixPiFilename=self.acAlphaHelixPiFilename,
        acExtendedStrandFilename=self.acExtendedStrandFilename)
    backend = Backend()
    backend.configure(db.dbParams)
    for read in FastaReads(self.structureFile, readClass=AAReadWithX,
                           checkAlphabet=0):
        total += 1
        scannedRead = backend.scan(read)
        if scannedRead.landmarks:
            hit += 1
    return hit / total if total else 0.0

def testCollectReadHashes(self):
    """
    The getHashes method must return a dict keyed by (landmark, trigPoint)
    hash with values containing the read offsets.
    """
    dbParams = DatabaseParameters(landmarks=[AlphaHelix],
                                  trigPoints=[Peaks], distanceBase=1.0)
    be = Backend()
    be.configure(dbParams)
    query = AARead('query', 'FRRRFRRRFASAASAFRRRFRRRFASAASA')
    scannedQuery = be.scan(query)
    hashCount = be.getHashes(scannedQuery)
    helixAt0 = Landmark(AlphaHelix.NAME, AlphaHelix.SYMBOL, 0, 9, 2)
    helixAt15 = Landmark(AlphaHelix.NAME, AlphaHelix.SYMBOL, 15, 9, 2)
    peakAt10 = TrigPoint(Peaks.NAME, Peaks.SYMBOL, 10)
    peakAt13 = TrigPoint(Peaks.NAME, Peaks.SYMBOL, 13)
    peakAt25 = TrigPoint(Peaks.NAME, Peaks.SYMBOL, 25)
    peakAt28 = TrigPoint(Peaks.NAME, Peaks.SYMBOL, 28)
    self.assertEqual(
        {
            'A2:P:28': [[helixAt0, peakAt28]],
            'A2:P:25': [[helixAt0, peakAt25]],
            'A2:P:13': [[helixAt0, peakAt13], [helixAt15, peakAt28]],
            'A2:P:10': [[helixAt0, peakAt10], [helixAt15, peakAt25]],
            'A2:P:-5': [[helixAt15, peakAt10]],
            'A2:P:-2': [[helixAt15, peakAt13]],
            'A2:A2:15': [[helixAt0, helixAt15]],
        },
        hashCount)

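# A note on the hash keys asserted above (a reading of the test data, not
# of the library documentation): each key appears to be
# landmarkSymbol:trigPointSymbol:offsetDelta, where 'A2' is the alpha
# helix symbol plus its repeat count and, with distanceBase=1.0 (i.e., no
# distance scaling), the trailing number is simply
# trigPoint.offset - landmark.offset (e.g., 'A2:P:13' covers both
# 13 - 0 for helixAt0/peakAt13 and 28 - 15 for helixAt15/peakAt28).
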
def testNoOverlapDefaultDistanceBase(self):
    """
    There cannot be any index overlap between landmarks found by the GOR4
    alpha helix and beta strand finders using the default distance base
    (currently 1.1).
    """
    alphaHelixBe = Backend()
    alphaHelixBe.configure(
        DatabaseParameters(landmarks=[GOR4AlphaHelix], trigPoints=[]))
    betaStrandBe = Backend()
    betaStrandBe.configure(
        DatabaseParameters(landmarks=[GOR4BetaStrand], trigPoints=[]))
    alphaHelixScanned = alphaHelixBe.scan(self.READ)
    betaStrandScanned = betaStrandBe.scan(self.READ)
    alphaHelixIndices = alphaHelixScanned.coveredIndices()
    betaStrandIndices = betaStrandScanned.coveredIndices()
    self.assertEqual(0, len(alphaHelixIndices & betaStrandIndices))

def __init__(self, histogram, query, subject, dbParams, weights=None):
    self._histogram = histogram
    self._queryLen = len(query)
    self._subjectLen = len(subject)
    self._weights = self.DEFAULT_WEIGHTS if weights is None else weights
    from light.backend import Backend
    backend = Backend()
    backend.configure(dbParams)
    scannedQuery = backend.scan(query)
    allQueryHashes = backend.getHashes(scannedQuery)
    self._allQueryFeatures = getHashFeatures(allQueryHashes)
    scannedSubject = backend.scan(subject.read)
    allSubjectHashes = backend.getHashes(scannedSubject)
    self._allSubjectFeatures = getHashFeatures(allSubjectHashes)

def testNoOverlapDistanceBaseOne(self):
    """
    There cannot be any index overlap between landmarks found by the GOR4
    alpha helix and beta strand finders using a distance base of 1.0
    (which should do no scaling).
    """
    alphaHelixBe = Backend()
    alphaHelixBe.configure(
        DatabaseParameters(landmarks=[GOR4AlphaHelix], trigPoints=[],
                           distanceBase=1.0))
    betaStrandBe = Backend()
    betaStrandBe.configure(
        DatabaseParameters(landmarks=[GOR4BetaStrand], trigPoints=[],
                           distanceBase=1.0))
    alphaHelixScanned = alphaHelixBe.scan(self.READ)
    betaStrandScanned = betaStrandBe.scan(self.READ)
    alphaHelixIndices = alphaHelixScanned.coveredIndices()
    betaStrandIndices = betaStrandScanned.coveredIndices()
    self.assertEqual(0, len(alphaHelixIndices & betaStrandIndices))

def testScan(self):
    """
    The scan method must return a scanned subject.
    """
    subject = AARead('subject', 'FRRRFRRRFASAASA')
    dbParams = DatabaseParameters(landmarks=[AlphaHelix],
                                  trigPoints=[Peaks])
    be = Backend()
    be.configure(dbParams)
    be.addSubject(subject, '0')
    scannedSubject = be.scan(subject)
    self.assertIsInstance(scannedSubject, ScannedRead)

def testCollectReadHashesWithOneLandmark(self):
    """
    The getHashes method must return a dict keyed by (landmark, trigPoint)
    hash with values containing the read offsets. The result should be
    empty if there is only one landmark in the read.
    """
    dbParams = DatabaseParameters(landmarks=[AlphaHelix], trigPoints=[])
    be = Backend()
    be.configure(dbParams)
    query = AARead('query', 'FRRRFRRRF')
    scannedQuery = be.scan(query)
    hashCount = be.getHashes(scannedQuery)
    self.assertEqual({}, hashCount)

def testGetScannedPairs(self):
    """
    The getScannedPairs method must return pairs of (landmark, trigPoint).
    """
    subject = AARead('subject', 'FRRRFRRRFASAASA')
    dbParams = DatabaseParameters(landmarks=[AlphaHelix],
                                  trigPoints=[Peaks], distanceBase=1.0)
    be = Backend()
    be.configure(dbParams)
    be.addSubject(subject, '0')
    scannedSubject = be.scan(subject)
    pairs = list(be.getScannedPairs(scannedSubject))
    # First pair.
    landmark, trigPoint = pairs[0]
    self.assertEqual(Landmark(AlphaHelix.NAME, AlphaHelix.SYMBOL, 0, 9, 2),
                     landmark)
    self.assertEqual(TrigPoint(Peaks.NAME, Peaks.SYMBOL, 10), trigPoint)
    # Second pair.
    landmark, trigPoint = pairs[1]
    self.assertEqual(Landmark(AlphaHelix.NAME, AlphaHelix.SYMBOL, 0, 9, 2),
                     landmark)
    self.assertEqual(TrigPoint(Peaks.NAME, Peaks.SYMBOL, 13), trigPoint)
    self.assertEqual(2, len(pairs))

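# Why exactly two pairs result above: 'FRRRFRRRFASAASA' contains a single
# alpha helix landmark (offset 0, length 9) and two peak trig points
# (offsets 10 and 13), so pairing the one landmark with each trig point
# yields exactly two (landmark, trigPoint) pairs.
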
def __init__(self, sequences, labels, defaultLabel=None, **kwargs):
    """
    Base class for using cluster analysis to evaluate how well various
    feature finders and database parameter settings can separate a set of
    sequences. The clustering is based on feature offset deltas.

    @param sequences: Either a C{str} filename of sequences to consider
        or a C{light.reads.Reads} instance.
    @param labels: A C{dict} with a label for each sequence id in
        C{sequences}. These are the known categories of each sequence.
    @param defaultLabel: If not C{None}, a label to use for reads whose
        ids are not present in C{labels}. If C{None} and a read id has no
        label, a C{ValueError} is raised.
    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    @raises ValueError: If the id of a read is not in C{labels} and no
        default label has been set, or if there are no reads in
        C{sequences}.
    """
    if isinstance(sequences, str):
        reads = FastaReads(sequences, readClass=AAReadWithX,
                           upperCase=True)
    else:
        reads = sequences
    database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
    backend = Backend()
    backend.configure(database.dbParams)
    allOffsetDeltas = []
    trueLabels = []
    for read in reads:
        trueLabel = labels.get(read.id, defaultLabel)
        if trueLabel is None:
            raise ValueError('Read %r has no corresponding label' %
                             read.id)
        trueLabels.append(trueLabel)
        offsetDeltas = Counter()
        scannedRead = backend.scan(read)
        for landmark, trigPoint in backend.getScannedPairs(scannedRead):
            delta = scaleLog(trigPoint.offset - landmark.offset,
                             database.dbParams.distanceBase)
            offsetDeltas[delta] += 1
        allOffsetDeltas.append(offsetDeltas)
    nReads = len(reads)
    if nReads == 0:
        raise ValueError('No sequences were found in %r' % sequences)
    # Don't check that len(reads) == len(labels). I.e., ignore extra
    # labels to make using this class interactively more convenient.

    # Create an affinity matrix. Initially set all values to 1.0 so we
    # don't need to later initialize the diagonal.
    affinity = np.ones((nReads, nReads))
    for row, offsetDeltas in enumerate(allOffsetDeltas):
        for col in range(row + 1, nReads):
            affinity[row, col] = affinity[col, row] = (
                self.affinityFromOffsetDeltas(
                    allOffsetDeltas[row], allOffsetDeltas[col]))
    self.nReads = nReads
    self.affinity = affinity
    self.trueLabels = trueLabels

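# A minimal usage sketch. The __init__ above relies on a subclass to
# provide affinityFromOffsetDeltas; the subclass and base class names, the
# Jaccard-style similarity, and the label data below are illustrative
# assumptions, not part of the library:
#
#   class JaccardClusterAnalysis(ClusterAnalysis):
#       def affinityFromOffsetDeltas(self, deltas1, deltas2):
#           # deltas1 and deltas2 are Counters of scaled offset deltas.
#           common = sum((deltas1 & deltas2).values())
#           total = sum((deltas1 | deltas2).values())
#           return common / total if total else 0.0
#
#   ca = JaccardClusterAnalysis('sequences.fasta',
#                               labels={'id1': 0, 'id2': 1},
#                               defaultLabel=0)
#   print(ca.affinity)
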
def calculateScore(self):
    """
    Calculates the overall score, as described above.

    @return: A C{float} overall score for all significant bins (or
        C{None} if there are no significant bins) and a C{dict} with
        information about the score.
    """
    # We could do more checking here and use the score of the best bin
    # as the overall score if there is only one significant bin or if
    # the score of the best bin is 1.0.

    # Don't attempt to calculate an overall score if there are no
    # significant bins.
    if not self._significantBins:
        analysis = {
            'score': None,
            'scoreClass': self.__class__,
        }
        return None, analysis

    from light.backend import Backend
    backend = Backend()
    backend.configure(self._dbParams)
    allQueryFeatures = getHashFeatures(
        backend.getHashes(backend.scan(self._query)))
    allSubjectFeatures = getHashFeatures(
        backend.getHashes(backend.scan(self._subject.read)))

    # Keep track of state as bins are added.
    state = {
        # overallMatchedQueryOffsets and overallMatchedSubjectOffsets
        # will contain all int offsets that are in matching features
        # (and thus inside the matched region).
        'overallMatchedQueryOffsets': set(),
        'overallMatchedSubjectOffsets': set(),
        # overallUnmatchedQueryOffsets and
        # overallUnmatchedSubjectOffsets will contain all int offsets
        # that are in features that don't match, but which are inside
        # the matched region.
        'overallUnmatchedQueryOffsets': set(),
        'overallUnmatchedSubjectOffsets': set(),
        # The set of all offsets in all bins (whether or not the
        # offsets are in matched features, unmatched features, or not
        # in any feature).
        'queryOffsetsInBins': set(),
        'subjectOffsetsInBins': set(),
        'score': 0.0,
        'denominatorQuery': 0.0,
        'denominatorSubject': 0.0,
        'matchedOffsetCount': 0,
        'matchedRegionScore': 0.0,
        'numeratorQuery': 0.0,
        'numeratorSubject': 0.0,
        'normalizerQuery': 0.0,
        'normalizerSubject': 0.0,
        'totalOffsetCount': 0,
        'scoreClass': self.__class__,
        'queryOffsetsInBinsCount': 0,
        'subjectOffsetsInBinsCount': 0,
        'numberOfBinsConsidered': 0,
    }

    # Consider the significant bins one by one until the overall score
    # drops below the best bin score, or we run out of bins.
    for i, bin_ in enumerate((sb['bin'] for sb in self._significantBins),
                             start=1):
        result = addBin(bin_, allQueryFeatures, allSubjectFeatures, state)
        # Check whether we can add more bins or need to stop here.
        if result['score'] >= state['score']:
            # The new overall score is higher than or equal to the
            # current overall score. Continue adding the next bin using
            # the newly calculated values.
            state.update(result)
        else:
            # The new overall score is lower than the current overall
            # score.
            break

    state['numberOfBinsConsidered'] = i
    return state['score'], state

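# An illustration of the greedy loop above (scores are made up): if adding
# bins one at a time yields overall scores 0.80, 0.85, 0.72, the first two
# results are kept (each is >= the score before it) and the loop breaks at
# the third, so the returned score is 0.85, with
# state['numberOfBinsConsidered'] == 3 (the rejected bin counts as
# considered).
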
def calculateScore(self):
    """
    Calculates the overall score for all significant bins, as described
    above.

    @return: A C{float} overall score for all significant bins (or
        C{None} if there are no significant bins) and a C{dict} with
        information about the score.
    """
    if not self._significantBins:
        analysis = {
            'score': None,
            'scoreClass': self.__class__,
        }
        return None, analysis

    bestBinScore = self._significantBins[0]['score']

    from light.backend import Backend
    backend = Backend()
    backend.configure(self._dbParams)
    allQueryFeatures = getHashFeatures(
        backend.getHashes(backend.scan(self._query)))
    allSubjectFeatures = getHashFeatures(
        backend.getHashes(backend.scan(self._subject.read)))

    # overallMatchedQueryOffsets and overallMatchedSubjectOffsets will
    # contain all int offsets that are in matching features (and thus
    # inside the matched region).
    overallMatchedQueryOffsets = set()
    overallMatchedSubjectOffsets = set()

    # overallUnmatchedQueryOffsets and overallUnmatchedSubjectOffsets
    # will contain all int offsets that are in features that don't match,
    # but which are inside the matched region.
    overallUnmatchedQueryOffsets = set()
    overallUnmatchedSubjectOffsets = set()

    # The set of all offsets in all bins (whether or not the offsets are
    # in matched features, unmatched features, or not in any feature).
    queryOffsetsInBins = set()
    subjectOffsetsInBins = set()

    # Get the features and their offsets which are matched and unmatched
    # in subject and query in all bins.
    for bin_ in (sb['bin'] for sb in self._significantBins):
        # Query.
        matchedOffsets, unmatchedOffsets, minOffset, maxOffset = (
            offsetsInBin(bin_, 'query', allQueryFeatures))
        overallMatchedQueryOffsets.update(matchedOffsets)
        overallUnmatchedQueryOffsets.update(unmatchedOffsets)
        queryOffsetsInBins.update(range(minOffset, maxOffset + 1))

        # Subject.
        matchedOffsets, unmatchedOffsets, minOffset, maxOffset = (
            offsetsInBin(bin_, 'subject', allSubjectFeatures))
        overallMatchedSubjectOffsets.update(matchedOffsets)
        overallUnmatchedSubjectOffsets.update(unmatchedOffsets)
        subjectOffsetsInBins.update(range(minOffset, maxOffset + 1))

    # Make sure none of the overall matched offsets are in the overall
    # unmatched offsets.
    overallMatchedQueryOffsets -= overallUnmatchedQueryOffsets
    overallMatchedSubjectOffsets -= overallUnmatchedSubjectOffsets

    # Overall score calculation step 1: the matched region score (MRS).
    matchedOffsetCount = (len(overallMatchedQueryOffsets) +
                          len(overallMatchedSubjectOffsets))
    totalOffsetCount = (matchedOffsetCount +
                        len(overallUnmatchedQueryOffsets) +
                        len(overallUnmatchedSubjectOffsets))
    try:
        matchedRegionScore = matchedOffsetCount / totalOffsetCount
    except ZeroDivisionError:
        # A small optimization could be done here. If the MRS is zero,
        # we already know the overall score will be zero, so we could
        # return at this point. To keep things simple, for now, just
        # continue with the overall calculation.
        matchedRegionScore = 0.0

    # Overall score calculation step 2: the length normalizer (LN).
    normalizerQuery, numeratorQuery, denominatorQuery = (
        computeLengthNormalizer(allQueryFeatures,
                                overallMatchedQueryOffsets,
                                overallUnmatchedQueryOffsets,
                                queryOffsetsInBins))

    # There is a small optimization that could be done at this point.
    # If the query normalizer is 1.0, don't bother to compute a
    # normalizer for the subject (due to the use of max() below and
    # because a normalizer is always <= 1.0). But to keep the code
    # simpler, for now, we still compute both normalizers.
    normalizerSubject, numeratorSubject, denominatorSubject = (
        computeLengthNormalizer(allSubjectFeatures,
                                overallMatchedSubjectOffsets,
                                overallUnmatchedSubjectOffsets,
                                subjectOffsetsInBins))

    # Calculate the final score, as described in the docstring.
    score = matchedRegionScore * max(normalizerQuery, normalizerSubject)

    # The overall score can be lower than the best bin score, for
    # example when a sequence is compared against itself, where the
    # bestBinScore will be 1.0 but the overallScore can be lower because
    # worse bins are taken into account. We don't allow that.
    if bestBinScore is not None and score < bestBinScore:
        overallScore = bestBinScore
        adjusted = True
    else:
        overallScore = score
        adjusted = False

    analysis = {
        'denominatorQuery': denominatorQuery,
        'denominatorSubject': denominatorSubject,
        'matchedOffsetCount': matchedOffsetCount,
        'matchedSubjectOffsetCount': len(overallMatchedSubjectOffsets),
        'matchedQueryOffsetCount': len(overallMatchedQueryOffsets),
        'matchedRegionScore': matchedRegionScore,
        'numeratorQuery': numeratorQuery,
        'numeratorSubject': numeratorSubject,
        'normalizerQuery': normalizerQuery,
        'normalizerSubject': normalizerSubject,
        'score': overallScore,
        'scoreClass': self.__class__,
        'totalOffsetCount': totalOffsetCount,
        'queryOffsetsInBins': len(queryOffsetsInBins),
        'subjectOffsetsInBins': len(subjectOffsetsInBins),
        'overallScoreAdjustedToBestBinScore': adjusted,
    }

    return overallScore, analysis

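# A small worked example of the score computed above (numbers are made
# up): with 60 matched and 40 unmatched offsets across query and subject,
# MRS = 60 / 100 = 0.6. If the query length normalizer is 0.9 and the
# subject's is 0.7, then score = 0.6 * max(0.9, 0.7) = 0.54, which is
# raised to the best bin score if it falls below it.
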
class CalculateOverlap(object):
    """
    Calculate the overlap between the features found by our finders and
    the secondary structures found by DSSP. The secondary structures found
    by DSSP were downloaded from http://www.rcsb.org/pdb/files/ss.txt on
    11/11/2015.

    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    """
    def __init__(self, **kwargs):
        # Set default landmark and trig point finders.
        if 'landmarks' not in kwargs:
            kwargs['landmarks'] = ALL_LANDMARK_CLASSES + [
                c for c in DEV_LANDMARK_CLASSES
                if c.NAME.startswith('PDB ')
            ]
        if 'trigPoints' not in kwargs:
            kwargs['trigPoints'] = [
                c for c in ALL_TRIG_CLASSES if c.NAME != 'Volume'
            ]
        db = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
        self._backend = Backend()
        self._backend.configure(db.dbParams)
        self._names = (db.dbParams.landmarkFinderNames() +
                       db.dbParams.trigPointFinderNames())

    def getFeatures(self, ssAARead):
        """
        Extract the features from the sequence. Return information about
        the offsets covered by each feature as well as the intersection
        and union of offsets for each pair of features.

        @param ssAARead: An C{SSAARead} instance.
        @return: A triple of C{defaultdict(set)}s. These contain:
            1) The sequence features, keyed by C{str} feature name, each
               with a C{set} of C{int}s as a value, giving the offsets in
               C{ssAARead} where the feature was found.
            2) The intersection of offsets for each pair of feature
               finders. This is keyed by a C{frozenset} of the two finder
               names. The values are C{set}s of C{int}s, as in (1).
            3) The union of offsets for each pair of feature finders.
               This is keyed by a C{frozenset} of the two finder names.
               The values are C{set}s of C{int}s, as in (1).
        """
        features = defaultdict(set)
        intersection = defaultdict(set)
        union = defaultdict(set)
        scannedSequence = self._backend.scan(ssAARead)
        # Get all offsets for each landmark and trig point separately.
        for feature in (scannedSequence.landmarks +
                        scannedSequence.trigPoints):
            features[feature.name].update(feature.coveredOffsets())
        # Get the offset intersection and union of all pairs of features.
        for i, name1 in enumerate(self._names):
            for name2 in self._names[i + 1:]:
                key = frozenset((name1, name2))
                intersection[key] = features[name1] & features[name2]
                union[key] = features[name1] | features[name2]
        return features, intersection, union

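# A minimal usage sketch (the read id, sequence, and structure strings are
# made up; SSAARead is assumed to come from dark.reads):
#
#   from dark.reads import SSAARead
#
#   overlap = CalculateOverlap()
#   read = SSAARead('2HLA:A', 'GSHSMRYFFTSVSRPG', 'HHHHHH--EEEE----')
#   features, intersection, union = overlap.getFeatures(read)
#   for pair, offsets in intersection.items():
#       print(sorted(pair), len(offsets))
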
def print_(self, printQuery=True, printSequences=False,
           printFeatures=False, printHistograms=False,
           queryDescription='Query title', sortHSPsByScore=True,
           margin='', result=None):
    """
    Print a result in a human-readable format. If self._storeFullAnalysis
    is True, full information about all matched subjects (i.e., including
    matches that were not significant) will be printed. If not, only
    basic information about significant matches will appear.

    @param printQuery: If C{True}, also print details of the query.
    @param printSequences: If C{True}, also print query and subject
        sequences.
    @param printFeatures: If C{True}, print details of landmark and trig
        point features.
    @param printHistograms: If C{True}, print details of histograms.
    @param queryDescription: A C{str} description to print before the
        query (when C{printQuery} is C{True}).
    @param sortHSPsByScore: If C{True}, HSPs for a subject should be
        printed in order of decreasing score. If C{False}, print sorted
        by histogram bin number.
    @param margin: A C{str} that should be inserted at the start of each
        line of output.
    @param result: A C{MultilineString} instance, or C{None} if a new
        C{MultilineString} should be created.
    @return: If C{result} was C{None}, return a C{str} representation of
        the scanned read, else C{None}.
    """
    if result is None:
        result = MultilineString(margin=margin)
        returnNone = False
    else:
        returnNone = True

    append = result.append
    extend = result.extend
    indent = result.indent
    outdent = result.outdent

    if printQuery:
        backend = Backend()
        backend.configure(self.connector.dbParams)
        scannedQuery = backend.scan(self.query)
        scannedQuery.print_(printSequence=printSequences,
                            printFeatures=printFeatures,
                            description=queryDescription,
                            margin=margin, result=result)

    self._findParams.print_(margin=margin, result=result)

    # Sort matched subjects (if any) in order of decreasing score so we
    # can print them in a useful order.
    #
    # The following sorted() call will fail (with TypeError) under
    # Python 3 because bestScore is None when there are no significant
    # matches (which can happen when self._storeFullAnalysis is True).
    subjectIndices = sorted(
        iter(self.analysis.keys()), reverse=True,
        key=lambda index: self.analysis[index]['bestBinScore'])

    if not sortHSPsByScore:
        indexGetter = itemgetter('index')

    extend([
        'Overall matches: %d' % len(subjectIndices),
        'Significant matches: %d' % len(
            list(self.significantSubjects())),
        'Query hash count: %d' % self.queryHashCount,
    ])

    if subjectIndices:
        append('Matched subjects:')

    indent()

    for subjectCount, subjectIndex in enumerate(subjectIndices, start=1):
        analysis = self.analysis[subjectIndex]
        subject = self.connector.getSubjectByIndex(subjectIndex)
        minHashCount = min(self.queryHashCount, subject.hashCount)
        significantBins = analysis['significantBins']

        append('Subject %d:' % subjectCount)
        indent()

        extend([
            'Title: %s' % subject.read.id,
            'Best HSP score: %s' % analysis['bestBinScore'],
        ])

        if printSequences:
            append('Sequence: %s' % subject.read.sequence)

        extend([
            'Index in database: %s' % subjectIndex,
            'Subject hash count: %s' % subject.hashCount,
            'Subject/query min hash count: %s' % minHashCount,
            'Significance cutoff: %f' % (
                self._findParams.significanceFraction * minHashCount),
            'Number of HSPs: %d' % len(significantBins),
        ])

        if not sortHSPsByScore:
            significantBins = deepcopy(significantBins)
            significantBins.sort(key=indexGetter)

        indent()
        for hspCount, bin_ in enumerate(significantBins, start=1):
            binCount = len(bin_['bin'])
            append('HSP %d (bin %d): %d matching hash%s, score %f' %
                   (hspCount, bin_['index'], binCount,
                    '' if binCount == 1 else 'es', bin_['score']))
            if printFeatures:
                indent()
                for binItem in bin_['bin']:
                    extend([
                        'Landmark %s' % binItem['subjectLandmark'],
                        'Trig point %s' % binItem['subjectTrigPoint'],
                    ])
                outdent()
        outdent()

        if printHistograms and self._storeFullAnalysis:
            histogram = analysis['histogram']
            significantBinIndices = set(
                [bin_['index'] for bin_ in significantBins])
            maxCount = max(len(bin_) for bin_ in histogram.bins)

            append('Histogram:')
            indent()
            extend([
                'Number of bins: %d' % len(histogram.bins),
                'Bin width: %.10f' % histogram.binWidth,
                'Max bin count: %r' % maxCount,
                'Max (scaled) offset delta: %d' % histogram.max,
                'Min (scaled) offset delta: %d' % histogram.min,
            ])

            # Calculate column widths for displaying ranges neatly.
            maxAbsoluteValue = max(
                [-histogram.min, histogram.max, -histogram.max])
            if maxAbsoluteValue == 0:
                # All printed range values will be '+0.0', of length 4.
                rangeWidth = 4
            else:
                # Add 3 because we have the sign, a decimal point, and
                # one digit of precision.
                rangeWidth = 3 + int(ceil(log10(maxAbsoluteValue)))
            rangeSeparator = ' to '
            rangeColumnWidth = 2 * rangeWidth + len(rangeSeparator)

            first = True
            for binIndex, bin_ in enumerate(histogram.bins):
                binCount = len(bin_)
                if binCount:
                    if first:
                        append('Non-empty bins:')
                        indent()
                        append('%s %s %*s %s' %
                               ('Index', 'Count', rangeColumnWidth,
                                'Range', 'Significant'))
                        first = False
                    binLow = histogram.min + binIndex * histogram.binWidth
                    # 5, 5, 11 embedded in the format string below are
                    # the lengths of 'Index', 'Range', and 'Significant'.
                    append('%5d %5d %+*.1f%s%+*.1f %11s' %
                           (binIndex, binCount, rangeWidth, binLow,
                            rangeSeparator, rangeWidth,
                            binLow + histogram.binWidth,
                            'Yes' if binIndex in significantBinIndices
                            else ''))
            if first:
                append('All bins were empty.')
            else:
                outdent()
            outdent()

    if not returnNone:
        return str(result)

def __init__(self, sequences, cutoff, **kwargs):
    """
    A class to work with hashes. For a set of given sequences, find all
    hashes and for each sequence make a string of 1 or 0 denoting whether
    a hash is present in that sequence or not. Only include hashes that
    occur in more than a specified fraction of all the given sequences.

    @param sequences: A C{str} filename with a fasta file of sequences to
        be used or a C{dark.reads.Reads} object.
    @param cutoff: A C{float} between 0.0 and 1.0 of the fraction of
        sequences in which a hash has to be present to be included in the
        final string.
    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    """
    if isinstance(sequences, str):
        reads = FastaReads(sequences, readClass=AAReadWithX,
                           upperCase=True)
    else:
        reads = sequences
    database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
    backend = Backend()
    backend.configure(database.dbParams)

    # Make a dictionary where the keys are the sequence ids and the
    # value is an OrderedDict of hashes as returned from getHashes().
    hashes = {}
    for read in reads:
        scannedRead = backend.scan(read)
        readHashes = backend.getHashes(scannedRead)
        hashes[read.id] = readHashes

    # Make a set of all unique hashes that occur.
    totalHashes = set()
    for read in hashes:
        totalHashes.update(hashes[read].keys())

    # Make a dictionary where the key is a hash and the value is a list
    # of the reads in which the hash occurs.
    byHashes = {}
    for hash_ in totalHashes:
        viruses = []
        for readId in hashes:
            if hash_ in hashes[readId]:
                viruses.append(readId)
        byHashes[hash_] = viruses

    # Make a dictionary where the key is a read id and the value is a
    # string of 1 and 0 denoting which hashes occur in that read.
    co = cutoff * len(reads)
    self.hashString = {read.id: '' for read in reads}
    for hash_ in byHashes:
        if len(byHashes[hash_]) > co:
            for virus in self.hashString:
                if virus in byHashes[hash_]:
                    self.hashString[virus] += '1'
                else:
                    self.hashString[virus] += '0'

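# A minimal usage sketch, assuming the enclosing class is called
# HashString (the class name, filename, and cutoff are illustrative):
#
#   hs = HashString('sequences.fasta', cutoff=0.5)
#   for readId, bits in hs.hashString.items():
#       print(readId, bits)
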
def __init__(self, query, connector, matches, queryHashCount,
             findParams=None, nonMatchingHashes=None,
             storeFullAnalysis=False):
    self.query = query
    self.connector = connector
    self.matches = matches  # Only saved on self for testing.
    self.queryHashCount = queryHashCount
    findParams = findParams or FindParameters()
    self._findParams = findParams
    self.nonMatchingHashes = nonMatchingHashes
    self._storeFullAnalysis = storeFullAnalysis
    self.analysis = defaultdict(dict)
    deltaScale = findParams.deltaScale
    scoreGetter = itemgetter('score')
    be = Backend()
    be.configure(connector.dbParams)

    if findParams.significanceMethod == 'AAFraction':
        queryAACount = len(be.scan(query).coveredIndices())

    # Go through all the subjects that were matched at all, and put the
    # match offset deltas into bins so we can decide which (if any) of
    # the matches is significant.
    for subjectIndex in matches:
        subject = connector.getSubjectByIndex(subjectIndex)
        # Use a histogram to bin scaled (landmark, trigPoint) offset
        # deltas.
        nBins = max(len(query), len(subject))
        # Make sure the number of bins is odd, else Histogram() will
        # raise.
        nBins |= 0x1
        histogram = Histogram(nBins)
        add = histogram.add

        # To ensure the set of query/subject offset deltas is the same
        # no matter which of the sequences is the query and which is
        # the subject, we negate all deltas if the subject sequence
        # sorts first. This is just a way of canonicalizing the set of
        # deltas. If we don't canonicalize, we get sets of deltas with
        # opposite signs, like {-4, -2, 6} and {-6, 2, 4}, depending on
        # which sequence is the subject and which the query. This
        # occasionally leads to hard-to-debug and awkward-to-fix
        # differences in the histogram binning at bin boundaries due to
        # tiny floating point differences. The simple solution is to
        # canonicalize the deltas based on an arbitrary consistent
        # difference between the subject and query.
        negateDeltas = subject.read.sequence < query.sequence

        for match in matches[subjectIndex]:
            # The delta is the difference between the corresponding
            # landmark offsets.
            subjectLandmarkOffset = match['subjectLandmark'].offset
            queryLandmarkOffset = match['queryLandmark'].offset
            delta = subjectLandmarkOffset - queryLandmarkOffset
            if negateDeltas:
                delta = -delta
            # Add the information about this common landmark / trig
            # point hash to the histogram bucket for the query landmark
            # to subject landmark offset delta.
            add(scaleLinear(delta, deltaScale), match)

        histogram.finalize()

        minHashCount = min(queryHashCount, subject.hashCount)
        significanceMethod = findParams.significanceMethod
        if significanceMethod == 'Always':
            significance = Always()
        elif significanceMethod == 'HashFraction':
            significance = HashFraction(
                histogram, minHashCount, findParams.significanceFraction)
        elif significanceMethod == 'MaxBinHeight':
            significance = MaxBinHeight(histogram, query, connector)
        elif significanceMethod == 'MeanBinHeight':
            significance = MeanBinHeight(histogram, query, connector)
        elif significanceMethod == 'AAFraction':
            featureAACount = (queryAACount +
                              len(be.scan(subject.read).coveredIndices()))
            significance = AAFraction(histogram, featureAACount,
                                      findParams.significanceFraction)
        else:
            raise ValueError('Unknown significance method %r' %
                             significanceMethod)

        binScoreMethod = findParams.binScoreMethod
        if binScoreMethod == 'NoneScore':
            scorer = NoneScore()
        elif binScoreMethod == 'MinHashesScore':
            scorer = MinHashesScore(histogram, minHashCount)
        elif binScoreMethod == 'FeatureMatchingScore':
            scorer = FeatureMatchingScore(histogram, query, subject,
                                          connector.dbParams, findParams)
        elif binScoreMethod == 'FeatureAAScore':
            scorer = FeatureAAScore(histogram, query, subject,
                                    connector.dbParams)
        elif binScoreMethod == 'WeightedFeatureAAScore':
            scorer = WeightedFeatureAAScore(histogram, query, subject,
                                            connector.dbParams,
                                            findParams.weights)
        elif binScoreMethod == 'FeatureAALengthScore':
            scorer = FeatureAALengthScore(histogram, query, subject,
                                          connector.dbParams)
        else:
            raise ValueError('Unknown bin score method %r' %
                             binScoreMethod)

        # Find bins with a significant number of elements and score them.
        significantBins = []
        for binIndex, bin_ in enumerate(histogram.bins):
            if significance.isSignificant(binIndex):
                score, scoreAnalysis = scorer.calculateScore(binIndex)
                significantBin = {
                    'bin': bin_,
                    'index': binIndex,
                    'score': score,
                }
                if storeFullAnalysis:
                    significantBin['scoreAnalysis'] = scoreAnalysis
                significantBins.append(significantBin)

        if significantBins:
            significantBins.sort(key=scoreGetter, reverse=True)
            bestBinScore = significantBins[0]['score']
        else:
            bestBinScore = None

        overallScoreMethod = findParams.overallScoreMethod
        if overallScoreMethod == 'BestBinScore':
            scorer = BestBinScore(histogram, significantBins)
        elif overallScoreMethod == 'SignificantBinScore':
            scorer = SignificantBinScore(significantBins, query, subject,
                                         connector.dbParams)
        elif overallScoreMethod == 'GreedySignificantBinScore':
            scorer = GreedySignificantBinScore(significantBins, query,
                                               subject,
                                               connector.dbParams)
        else:
            raise ValueError('Unknown overall score method %r' %
                             overallScoreMethod)

        overallScore, overallScoreAnalysis = scorer.calculateScore()

        if storeFullAnalysis:
            self.analysis[subjectIndex] = {
                'histogram': histogram,
                'bestBinScore': bestBinScore,
                'overallScore': overallScore,
                'overallScoreAnalysis': overallScoreAnalysis,
                'significantBins': significantBins,
                'significanceAnalysis': significance.analysis,
            }
        elif significantBins:
            self.analysis[subjectIndex] = {
                'bestBinScore': bestBinScore,
                'overallScore': overallScore,
                'significantBins': significantBins,
            }

cmd.rebuild()

# Color the features and chains.
for i, chain in enumerate(chains):
    if chain.id == chainToCompare.id:
        # Color each chain.
        what = '%s & chain %s' % (structureName, chain.id)
        try:
            color = CHAIN_COLORS[i]
        except IndexError:
            color = 'white'
        cmd.color(color, what)

        # Color the features.
        scannedQuery = backend.scan(chain)
        for landmark in scannedQuery.landmarks:
            color = FEATURE_COLORS[landmark.symbol]
            start = landmark.offset
            end = landmark.offset + landmark.length
            what = 'resi %d-%d & %s & chain %s' % (
                start, end - 1, structureName, chain.id)
            cmd.color(color, what)
        for trigPoint in scannedQuery.trigPoints:
            color = FEATURE_COLORS[trigPoint.symbol]
            start = trigPoint.offset
            end = trigPoint.offset + trigPoint.length
            # The original excerpt was truncated here; the remainder
            # mirrors the landmark loop above.
            what = 'resi %d-%d & %s & chain %s' % (
                start, end - 1, structureName, chain.id)
            cmd.color(color, what)