def testBooleanWhenNotEmpty(self):
    """
    The queue must test True when not empty.
    """
    pq = PriorityQueue()
    pq.add(4)
    self.assertTrue(pq)

def testLengthAfterAdd(self):
    """
    A queue with one thing in it must have a length of one.
    """
    pq = PriorityQueue()
    pq.add(3)
    self.assertEqual(1, len(pq))

def testContains(self):
    """
    The __contains__ function must work as expected.
    """
    pq = PriorityQueue()
    pq.add('hey')
    self.assertTrue('hey' in pq)
    self.assertFalse('hi' in pq)

def testLengthAfterDoubleAdd(self):
    """
    A queue with an item that is added twice must have a length of one.
    """
    pq = PriorityQueue()
    pq.add(3)
    pq.add(3)
    self.assertEqual(1, len(pq))

def testPopRaisesOnEmpty(self):
    """
    The pop method must raise a KeyError if the queue is empty.
    """
    pq = PriorityQueue()
    error = 'pop from an empty priority queue'
    assertRaisesRegex(self, KeyError, error, pq.pop)

def testLowestPriorityRaisesOnEmpty(self):
    """
    The lowestPriority method must raise a KeyError if the queue is empty.
    """
    pq = PriorityQueue()
    error = 'peek on an empty priority queue'
    assertRaisesRegex(self, KeyError, error, pq.lowestPriority)

def testPop(self):
    """
    The queue pop method must return the lowest priority item.
    """
    pq = PriorityQueue()
    pq.add('hey', 9)
    pq.add('you', 7)
    pq.add('two', 8)
    self.assertEqual('you', pq.pop())

def testLowestPriority(self):
    """
    The lowestPriority method must return the lowest priority without
    disturbing the length of the queue.
    """
    pq = PriorityQueue()
    pq.add('hey', 9)
    pq.add('you', 7)
    pq.add('two', 8)
    self.assertEqual(7, pq.lowestPriority())
    self.assertEqual(3, len(pq))

def testRemove(self):
    """
    The remove method must work as expected.
    """
    pq = PriorityQueue()
    pq.add('hey')
    pq.remove('hey')
    self.assertFalse(pq)
    self.assertEqual(0, len(pq))

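
# The tests above (and testBooleanWhenEmpty / testEmptyLength below) exercise a
# PriorityQueue supporting add (with an optional priority), pop, lowestPriority,
# remove, __contains__, __len__, and boolean testing. The project's actual
# implementation is not shown in this excerpt; the following is only a minimal
# sketch of a class that would satisfy these tests, based on the lazy-deletion
# recipe from the Python heapq documentation. Everything beyond the method names
# and error messages taken from the tests is an assumption.

from heapq import heappop, heappush
from itertools import count


class PriorityQueue(object):
    """
    A priority queue with item lookup (illustrative sketch only, not the
    project's implementation).
    """
    _REMOVED = object()  # Sentinel marking an entry as removed.

    def __init__(self):
        self._heap = []  # Heap entries: [priority, insertion order, item].
        self._entries = {}  # Map from item to its live heap entry.
        self._count = count()  # Tie-breaker for equal priorities.

    def __len__(self):
        return len(self._entries)

    def __bool__(self):
        return bool(self._entries)

    __nonzero__ = __bool__  # Python 2 compatibility.

    def __contains__(self, item):
        return item in self._entries

    def add(self, item, priority=0):
        """Add an item, replacing any existing entry for the same item."""
        if item in self._entries:
            self.remove(item)
        entry = [priority, next(self._count), item]
        self._entries[item] = entry
        heappush(self._heap, entry)

    def remove(self, item):
        """Remove an item. Raise KeyError if it is not present."""
        entry = self._entries.pop(item)
        entry[-1] = self._REMOVED

    def pop(self):
        """Remove and return the item with the lowest priority."""
        while self._heap:
            priority, _, item = heappop(self._heap)
            if item is not self._REMOVED:
                del self._entries[item]
                return item
        raise KeyError('pop from an empty priority queue')

    def lowestPriority(self):
        """Return (without removing) the lowest priority in the queue."""
        while self._heap:
            if self._heap[0][-1] is self._REMOVED:
                heappop(self._heap)  # Discard an already-removed entry.
            else:
                return self._heap[0][0]
        raise KeyError('peek on an empty priority queue')
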
def analyzeReferenceId(self, referenceId, alignmentFile, outputDir):
    """
    Analyze the given reference id in the given alignment file (if an
    alignment to the reference id is present).

    @param referenceId: The C{str} id of the reference sequence to analyze.
    @param alignmentFile: The C{str} name of an alignment file.
    @param outputDir: The C{str} name of the output directory.
    @return: C{None} if C{referenceId} is not present in C{alignmentFile}
        or if no significant offsets are found. Else, a C{dict} containing
        the significant offsets and the consensus sequence that best
        matches C{referenceId}.
    """
    analysis = self.initialReferenceIdAnalysis(
        referenceId, alignmentFile, outputDir)

    if analysis:
        (genomeLength, alignedReads, readCountAtOffset, baseCountAtOffset,
         readsAtOffset, significantOffsets, samFilter, paddedSAM) = analysis
    else:
        return

    insignificantOffsets = set(
        range(genomeLength)) - set(significantOffsets)

    reference = self.referenceGenomes[referenceId]
    referenceSequence = reference.sequence

    consensus = []
    for base in referenceSequence:
        ob = OffsetBases()
        ob.incorporateBase(base)
        consensus.append(ob)

    readQueue = PriorityQueue()
    self.updatePriorityQueue(readQueue, alignedReads, consensus,
                             significantOffsets)

    consensusFilename = join(outputDir, 'reference-consensus.sam')
    nonConsensusFilename = join(outputDir, 'reference-non-consensus.sam')
    self.report(' Writing consensus SAM to', consensusFilename)
    self.report(' Writing non-consensus SAM to', nonConsensusFilename)

    with samfile(alignmentFile) as sam:
        consensusAlignment = AlignmentFile(consensusFilename, mode='w',
                                           template=sam)
        nonConsensusAlignment = AlignmentFile(nonConsensusFilename, mode='w',
                                              template=sam)

    # Reads with no significant offsets get written to both output files.
    readsWithNoSignificantOffsetsCount = 0
    for read in alignedReads:
        if not read.significantOffsets:
            readsWithNoSignificantOffsetsCount += 1
            consensusAlignment.write(read.alignment)
            nonConsensusAlignment.write(read.alignment)

            for offset in insignificantOffsets:
                base = read.base(offset)
                if base is not None:
                    consensus[offset].incorporateBase(base)

    self.report(' %d read%s did not overlap any significant offsets' %
                (readsWithNoSignificantOffsetsCount,
                 s(readsWithNoSignificantOffsetsCount)))

    readsMatchingConsensusCount = readsNotMatchingConsensusCount = 0
    cutoff = self.cutoff
    while readQueue:
        mismatchFraction, _ = readQueue.lowestPriority()
        read = readQueue.pop()
        if mismatchFraction <= cutoff:
            # We want this read. Incorporate it into the consensus.
            readsMatchingConsensusCount += 1
            consensusAlignment.write(read.alignment)
            affectedReads = set()
            for offset in read.significantOffsets:
                readBase = read.base(offset)
                consensus[offset].incorporateBase(readBase)
                for readAtOffset in readsAtOffset[offset]:
                    if readAtOffset in readQueue:
                        affectedReads.add(readAtOffset)
            self.updatePriorityQueue(readQueue, affectedReads, consensus,
                                     significantOffsets)
        else:
            readsNotMatchingConsensusCount += 1
            nonConsensusAlignment.write(read.alignment)

    consensusAlignment.close()
    nonConsensusAlignment.close()

    self.report(
        ' %d read%s matched the consensus, %d did not.' %
        (readsMatchingConsensusCount, s(readsMatchingConsensusCount),
         readsNotMatchingConsensusCount))

    # Remove the reference bases from the consensus.
    for offset, base in enumerate(referenceSequence):
        consensus[offset].unincorporateBase(base)

    consensusInfoFilename = join(outputDir, 'reference-consensus.txt')
    self.report(' Writing consensus info to', consensusInfoFilename)
    with open(consensusInfoFilename, 'w') as fp:
        consensusSequence = []
        for offset in range(genomeLength):
            # Take a copy of the commonest set because we may pop from
            # it below.
            commonest = set(consensus[offset].commonest)
            referenceBase = referenceSequence[offset]

            if len(commonest) > 1:
                nucleotides = ' Nucleotides: %s' % (
                    consensus[offset].baseCountsToStr())
            else:
                nucleotides = ''

            if referenceBase in commonest:
                consensusBase = referenceBase
            else:
                if len(commonest) == 1:
                    # Nothing in the included reads covers this offset.
                    consensusBase = '-'
                elif len(commonest) > 1:
                    # A draw, which the reference base (not being in the
                    # commonest set) cannot break. Pick a base arbitrarily.
                    consensusBase = commonest.pop()
                else:
                    consensusBase = commonest.pop()

            consensusSequence.append(consensusBase)

            mismatch = '' if referenceBase == consensusBase else (
                ' Mismatch (reference has %s)' % referenceBase)

            print('%d: %s%s%s' % (offset + 1, consensusBase, mismatch,
                                  nucleotides), file=fp)

    consensusRead = Read('gready-consensus-%s' % referenceId,
                         ''.join(consensusSequence))
    consensusFilename = join(outputDir, 'reference-consensus.fasta')
    self.report(' Writing gready consensus info to', consensusFilename)
    Reads([consensusRead]).save(consensusFilename)

    return {
        'consensusRead': consensusRead,
        'significantOffsets': significantOffsets,
    }

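
# The tie-breaking logic above depends on the set of commonest bases at each
# offset. The following small, self-contained illustration shows the same
# decision rule using a plain collections.Counter in place of OffsetBases
# (whose commonest attribute is assumed to be the set of bases with the
# highest count); the function name and example counts are hypothetical.

from collections import Counter


def consensusBaseFor(baseCounts, referenceBase):
    """
    Pick a consensus base from per-offset base counts, preferring the
    reference base to break ties.

    @param baseCounts: A C{Counter} of nucleotide counts at one offset.
    @param referenceBase: The C{str} reference nucleotide at this offset.
    @return: The C{str} consensus base, or '-' if nothing covers the offset.
    """
    if not baseCounts:
        # Nothing in the included reads covers this offset.
        return '-'
    highest = max(baseCounts.values())
    commonest = {base for base, c in baseCounts.items() if c == highest}
    if referenceBase in commonest:
        # The reference base is among the commonest: use it.
        return referenceBase
    # Otherwise take a commonest base (arbitrarily, if there is a draw).
    return commonest.pop()


# A draw between A and G that the reference base (T) cannot break:
print(consensusBaseFor(Counter({'A': 5, 'G': 5, 'T': 1}), 'T'))  # 'A' or 'G'
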
class DistanceCache(object):
    """
    Maintain a set of distances between objects, with lazy evaluation and
    removal from the set.

    @param distFunc: A function that computes the distance between two
        objects.
    """
    def __init__(self, distFunc):
        self._distFunc = distFunc
        self._distances = defaultdict(dict)
        self._pq = PriorityQueue()

    def distance(self, a, b):
        """
        Find the distance between a pair of objects.

        @param a: An immutable object.
        @param b: An immutable object.
        @return: The distance between C{a} and C{b}, according to the
            distance function passed to __init__.
        """
        return self._distances[a][b]

    def add(self, a):
        """
        Add an object.

        @param a: An immutable object.
        """
        assert a not in self._distances
        if self._distances:
            for b in list(self._distances):
                d = self._distFunc(a, b)
                self._distances[b][a] = self._distances[a][b] = d
                self._pq.add(_key(a, b), d)
        else:
            # This is the first element, so it has no distances to
            # anything. Access it to create its distance dictionary so it
            # will be found when subsequent elements are added.
            self._distances[a]

    def lowestDistance(self):
        """
        Get the lowest distance between any two clusters.

        @return: A C{float} distance, or C{None} if no distances remain.
        """
        try:
            return self._pq.lowestPriority()
        except KeyError:
            return None

    def pop(self):
        """
        Pop the lowest distance cluster pair.

        @raise KeyError: If the distance priority queue is empty.
        @return: A 2-C{tuple} of C{int} cluster numbers.
        """
        return self._pq.pop()

    def __contains__(self, pair):
        """
        Test if a pair has a computed distance (useful for testing).

        @param pair: A 2-tuple of objects.
        @return: A Boolean indicating membership.
        """
        return ((pair[0], pair[1]) in self._pq or
                (pair[1], pair[0]) in self._pq)

    def remove(self, a):
        """
        Remove an object.

        @param a: An object.
        """
        errorCount = 0
        for b in self._distances:
            if b != a:
                try:
                    self._pq.remove(_key(a, b))
                except KeyError:
                    # We allow one KeyError since 'a' has likely just been
                    # popped as part of the lowest scoring pair.
                    errorCount += 1
                    if errorCount > 1:
                        raise
                del self._distances[b][a]
        del self._distances[a]

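
# DistanceCache relies on a module-level _key helper that is not shown in this
# excerpt; presumably it builds an order-independent key for a pair of objects,
# since distances are symmetric. A minimal sketch of such a helper, plus
# illustrative usage, follows. The helper's exact behaviour and the example
# objects and distance function are assumptions, not taken from the project.

def _key(a, b):
    """
    Make an order-independent priority queue key for a pair of objects.

    @param a: An immutable, orderable object.
    @param b: An immutable, orderable object.
    @return: A 2-C{tuple} holding C{a} and C{b} in sorted order.
    """
    return (a, b) if a <= b else (b, a)


# Illustrative usage with cluster numbers and an arbitrary distance function.
cache = DistanceCache(lambda a, b: abs(a - b))
for cluster in (0, 1, 5):
    cache.add(cluster)

print(cache.distance(0, 5))    # 5
print(cache.lowestDistance())  # 1 (the distance between clusters 0 and 1)
print(cache.pop())             # (0, 1)
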
def testBooleanWhenEmpty(self):
    """
    The queue must test False when empty.
    """
    pq = PriorityQueue()
    self.assertFalse(pq)

def testEmptyLength(self):
    """
    An empty queue must have a zero length.
    """
    pq = PriorityQueue()
    self.assertEqual(0, len(pq))