def testOneIntervalExactCovering(self):
    """
    A lone interval whose bounds coincide exactly with the hit must be
    reported by walk as one single, full interval.
    """
    intervals = ReadIntervals(100)
    intervals.add(0, 100)
    expected = [
        (self.FULL, (0, 100)),
    ]
    self.assertEqual(expected, list(intervals.walk()))
def testOneIntervalExactCoveringCoverage(self):
    """
    A lone interval whose bounds coincide exactly with the hit must give
    a coverage of 1.0.
    """
    intervals = ReadIntervals(100)
    intervals.add(0, 100)
    expected = 1.0
    self.assertEqual(expected, intervals.coverage())
def testOneIntervalCoveringAllExtendingLeftCoverage(self):
    """
    A lone interval that covers the entire hit and also starts at a
    negative offset must give a coverage of 1.0.
    """
    intervals = ReadIntervals(100)
    intervals.add(-10, 100)
    expected = 1.0
    self.assertEqual(expected, intervals.coverage())
def testOneIntervalCoveringAllExtendingRightCoverage(self):
    """
    A lone interval that covers the entire hit and also runs past the
    right-hand end of the hit must give a coverage of 1.0.
    """
    intervals = ReadIntervals(100)
    intervals.add(0, 110)
    expected = 1.0
    self.assertEqual(expected, intervals.coverage())
def testOneIntervalEndingAfterHitEndCoverage(self):
    """
    A lone interval that begins part-way through the hit and finishes
    beyond the hit's end must only count the part lying inside the hit.
    """
    intervals = ReadIntervals(100)
    intervals.add(50, 150)
    expected = 0.5
    self.assertEqual(expected, intervals.coverage())
def testOneIntervalStartingBeforeZeroCoverage(self):
    """
    A lone interval that begins before offset zero and stops part-way
    through the hit must only count the part lying inside the hit.
    """
    intervals = ReadIntervals(100)
    intervals.add(-50, 50)
    expected = 0.5
    self.assertEqual(expected, intervals.coverage())
def testOneIntervalStartingAtZeroCoverageCounts(self):
    """
    A lone interval that begins at zero and stops part-way through the
    hit must produce a count of one for each offset it spans.
    """
    intervals = ReadIntervals(10)
    intervals.add(0, 5)
    # Offsets 0 through 4 are each covered exactly once.
    expected = Counter(range(5))
    self.assertEqual(expected, intervals.coverageCounts())
def testTwoOverlappingIntervalsInMiddle(self):
    """
    Two intervals that overlap in the interior of the hit must be merged
    by walk, which should yield empty, full, empty.
    """
    intervals = ReadIntervals(100)
    for start, end in ((50, 60), (55, 70)):
        intervals.add(start, end)
    expected = [
        (self.EMPTY, (0, 50)),
        (self.FULL, (50, 70)),
        (self.EMPTY, (70, 100)),
    ]
    self.assertEqual(expected, list(intervals.walk()))
def testOneIntervalCoveringAllExtendingBothCoverage(self):
    """
    A lone interval that covers the entire hit while overhanging it on
    both the left and the right must give a coverage of 1.0.
    """
    intervals = ReadIntervals(100)
    intervals.add(-10, 110)
    expected = 1.0
    self.assertEqual(expected, intervals.coverage())
def testOneIntervalInMiddleCoverage(self):
    """
    A lone interval lying in the interior of the hit must give a
    coverage equal to the fraction of the hit it spans.
    """
    intervals = ReadIntervals(100)
    intervals.add(50, 60)
    expected = 0.1
    self.assertEqual(expected, intervals.coverage())
def coverageCounts(self):
    """
    Count how many times each location in the title sequence is covered
    by a read.

    @return: Whatever C{ReadIntervals.coverageCounts} returns for the
        subject regions of all HSPs of this title.
    """
    covered = ReadIntervals(self.subjectLength)
    # Register the matched subject region of every HSP.
    for match in self.hsps():
        covered.add(match.subjectStart, match.subjectEnd)
    counts = covered.coverageCounts()
    return counts
def testOneIntervalCoveringAllExtendingLeft(self):
    """
    A lone interval that covers the entire hit and starts at a negative
    offset must be reported by walk as one full interval that keeps its
    negative start.
    """
    intervals = ReadIntervals(100)
    intervals.add(-10, 100)
    expected = [
        (self.FULL, (-10, 100)),
    ]
    self.assertEqual(expected, list(intervals.walk()))
def testOneIntervalInMiddleCoverageCounts(self):
    """
    A lone one-offset interval in the interior of the hit must produce
    a count of one for that single offset only.
    """
    intervals = ReadIntervals(10)
    intervals.add(5, 6)
    expected = Counter({5: 1})
    self.assertEqual(expected, intervals.coverageCounts())
def testOneIntervalExactCoveringCoverageCounts(self):
    """
    A lone interval whose bounds coincide exactly with the hit must
    produce a count of one for every offset in the hit.
    """
    hitLength = 10
    intervals = ReadIntervals(hitLength)
    intervals.add(0, hitLength)
    # Offsets 0 through 9 are each covered exactly once.
    expected = Counter(range(hitLength))
    self.assertEqual(expected, intervals.coverageCounts())
def testOneIntervalEndingAfterHitEnd(self):
    """
    A lone interval that begins part-way through the hit and finishes
    beyond its end must make walk yield an empty interval followed by a
    full one.
    """
    intervals = ReadIntervals(100)
    intervals.add(50, 150)
    expected = [
        (self.EMPTY, (0, 50)),
        (self.FULL, (50, 150)),
    ]
    self.assertEqual(expected, list(intervals.walk()))
def testOneIntervalStartingBeforeZero(self):
    """
    A lone interval that begins before offset zero and stops part-way
    through the hit must make walk yield a full interval followed by an
    empty one.
    """
    intervals = ReadIntervals(100)
    intervals.add(-50, 50)
    expected = [
        (self.FULL, (-50, 50)),
        (self.EMPTY, (50, 100)),
    ]
    self.assertEqual(expected, list(intervals.walk()))
def testOneIntervalCoveringAllExtendingBoth(self):
    """
    A lone interval that covers the entire hit while overhanging it on
    both sides must be reported by walk as one full interval with its
    original bounds.
    """
    intervals = ReadIntervals(100)
    intervals.add(-10, 110)
    expected = [
        (self.FULL, (-10, 110)),
    ]
    self.assertEqual(expected, list(intervals.walk()))
def testTwoOverlappingIntervalsInMiddleCoverage(self):
    """
    Two intervals that overlap in the interior of the hit must have the
    shared region counted only once in the coverage.
    """
    intervals = ReadIntervals(100)
    for start, end in ((50, 60), (55, 70)):
        intervals.add(start, end)
    expected = 0.2
    self.assertEqual(expected, intervals.coverage())
def testOneIntervalInMiddle(self):
    """
    A lone interval lying in the interior of the hit must make walk
    yield three intervals: empty, full, empty.
    """
    intervals = ReadIntervals(100)
    intervals.add(50, 60)
    expected = [
        (self.EMPTY, (0, 50)),
        (self.FULL, (50, 60)),
        (self.EMPTY, (60, 100)),
    ]
    self.assertEqual(expected, list(intervals.walk()))
def testTwoOverlappingIntervalsInMiddleCoverageCounts(self):
    """
    Two intervals that overlap in the interior of the hit must produce
    a count of two for the offset they share and one for the others.
    """
    intervals = ReadIntervals(10)
    for start, end in ((5, 7), (6, 8)):
        intervals.add(start, end)
    # Offset 6 lies in both intervals; 5 and 7 lie in just one each.
    expected = Counter({5: 1, 6: 2, 7: 1})
    self.assertEqual(expected, intervals.coverageCounts())
def testOneIntervalEndingAtHitEndCoverageCounts(self):
    """
    A lone interval that begins part-way through the hit and stops
    exactly at its end must produce a count of one for each offset it
    spans.
    """
    intervals = ReadIntervals(10)
    intervals.add(5, 10)
    # Offsets 5 through 9 are each covered exactly once.
    expected = Counter(range(5, 10))
    self.assertEqual(expected, intervals.coverageCounts())
def testOneReadAtEnd(self):
    """
    A single read occupying the end of the hit must give exactly one
    reduction, for the empty region that precedes the read.
    """
    hitLength = 228
    intervals = ReadIntervals(hitLength)
    intervals.add(128, hitLength)
    offsets = OffsetAdjuster(intervals)
    expectedAdjustments = [(128, 121)]
    self.assertEqual(expectedAdjustments, offsets.adjustments())
    self.assertEqual(107, offsets.adjustOffset(hitLength))
def testOneIntervalCoveringAllExtendingRightCoverageCounts(self):
    """
    A lone interval that covers the entire hit and runs past its
    right-hand end must only produce counts for offsets inside the hit.
    """
    hitLength = 10
    intervals = ReadIntervals(hitLength)
    intervals.add(0, 12)
    # Offsets 10 and 11 lie beyond the hit and must not be counted.
    expected = Counter(range(hitLength))
    self.assertEqual(expected, intervals.coverageCounts())
def testOneReadInMiddle(self):
    """
    A single read in the interior of the hit must give two reductions,
    one for the empty region on each side of the read.
    """
    hitLength = 106
    intervals = ReadIntervals(hitLength)
    intervals.add(32, 42)
    offsets = OffsetAdjuster(intervals)
    expectedAdjustments = [
        (32, 27),
        (106, 58),
    ]
    self.assertEqual(expectedAdjustments, offsets.adjustments())
    # The end of the hit moves left by the sum of both reductions.
    self.assertEqual(hitLength - 27 - 58, offsets.adjustOffset(hitLength))
def testOneReadThatExactlyCoversHit(self):
    """
    A single read spanning the hit exactly leaves no empty regions, so
    no reductions must be made and offsets must be unchanged.
    """
    hitLength = 106
    intervals = ReadIntervals(hitLength)
    intervals.add(0, hitLength)
    offsets = OffsetAdjuster(intervals)
    expectedAdjustments = []
    self.assertEqual(expectedAdjustments, offsets.adjustments())
    self.assertEqual(hitLength, offsets.adjustOffset(hitLength))
def testOneReadThatExceedsHitOnBothEnds(self):
    """
    A single read overhanging the hit on both sides leaves no empty
    regions, so no reductions must be made and offsets must be
    unchanged.
    """
    hitLength = 106
    intervals = ReadIntervals(hitLength)
    intervals.add(-100, 200)
    offsets = OffsetAdjuster(intervals)
    expectedAdjustments = []
    self.assertEqual(expectedAdjustments, offsets.adjustments())
    self.assertEqual(hitLength, offsets.adjustOffset(hitLength))
def testOneReadBeforeStart(self):
    """
    A single read that begins before offset zero and covers the start
    of the hit must give exactly one reduction, for the empty region
    that follows the read.
    """
    hitLength = 228
    intervals = ReadIntervals(hitLength)
    intervals.add(-10, 100)
    offsets = OffsetAdjuster(intervals)
    expectedAdjustments = [(228, 121)]
    self.assertEqual(expectedAdjustments, offsets.adjustments())
    self.assertEqual(107, offsets.adjustOffset(hitLength))
def coverage(self):
    """
    Compute how much of this title sequence is matched by its reads.

    @return: The C{float} fraction of the title sequence matched by its
        reads.
    """
    covered = ReadIntervals(self.subjectLength)
    # Register the matched subject region of every HSP.
    for match in self.hsps():
        covered.add(match.subjectStart, match.subjectEnd)
    fraction = covered.coverage()
    return fraction
def testOneIntervalCoveringAllExtendingLeft(self):
    """
    A lone interval that covers the entire hit and starts at a negative
    offset must be reported by walk as one full interval that keeps its
    negative start.
    """
    intervals = ReadIntervals(100)
    intervals.add(-10, 100)
    onlyInterval = (self.FULL, (-10, 100))
    self.assertEqual([onlyInterval], list(intervals.walk()))
def testOneReadThatExactlyCoversHit(self):
    """
    A single read spanning the hit exactly leaves no empty regions, so
    no reductions must be made and offsets must be unchanged.
    """
    hitLength = 106
    intervals = ReadIntervals(hitLength)
    intervals.add(0, hitLength)
    offsets = OffsetAdjuster(intervals)
    self.assertEqual([], offsets.adjustments())
    self.assertEqual(hitLength, offsets.adjustOffset(hitLength))
def testOneReadThatExceedsHitOnBothEnds(self):
    """
    A single read overhanging the hit on both sides leaves no empty
    regions, so no reductions must be made and offsets must be
    unchanged.
    """
    hitLength = 106
    intervals = ReadIntervals(hitLength)
    intervals.add(-100, 200)
    offsets = OffsetAdjuster(intervals)
    self.assertEqual([], offsets.adjustments())
    self.assertEqual(hitLength, offsets.adjustOffset(hitLength))
def testOneReadInMiddle(self):
    """
    A single read in the interior of the hit must give two reductions,
    one for the empty region on each side of the read.
    """
    hitLength = 106
    intervals = ReadIntervals(hitLength)
    intervals.add(32, 42)
    offsets = OffsetAdjuster(intervals)
    beforeRead = (32, 27)
    afterRead = (106, 58)
    self.assertEqual([beforeRead, afterRead], offsets.adjustments())
    # The end of the hit moves left by the sum of both reductions.
    self.assertEqual(hitLength - 27 - 58, offsets.adjustOffset(hitLength))
def testOneIntervalCoveringAllExtendingBoth(self):
    """
    A lone interval that covers the entire hit while overhanging it on
    both sides must be reported by walk as one full interval with its
    original bounds.
    """
    intervals = ReadIntervals(100)
    intervals.add(-10, 110)
    onlyInterval = (self.FULL, (-10, 110))
    self.assertEqual([onlyInterval], list(intervals.walk()))
def testTwoOverlappingIntervalsInMiddle(self):
    """
    Two intervals that overlap in the interior of the hit must be merged
    by walk, which should yield empty, full, empty.
    """
    intervals = ReadIntervals(100)
    for bounds in ((50, 60), (55, 70)):
        intervals.add(*bounds)
    leading = (self.EMPTY, (0, 50))
    merged = (self.FULL, (50, 70))
    trailing = (self.EMPTY, (70, 100))
    self.assertEqual([leading, merged, trailing], list(intervals.walk()))
def testOneReadAfterEnd(self):
    """
    A single read that covers the end of the hit and runs past it must
    give exactly one reduction, for the empty region that precedes the
    read.
    """
    hitLength = 228
    intervals = ReadIntervals(hitLength)
    intervals.add(128, 250)
    offsets = OffsetAdjuster(intervals)
    expectedAdjustments = [(128, 121)]
    self.assertEqual(expectedAdjustments, offsets.adjustments())
    self.assertEqual(107, offsets.adjustOffset(hitLength))
def testPairOfTwoOverlappingIntervalsCoverage(self):
    """
    Two separate pairs of overlapping intervals in the interior of the
    hit must have each overlap counted only once in the coverage.
    """
    intervals = ReadIntervals(100)
    additions = (
        # First overlapping pair, 50-70.
        (50, 60),
        (55, 70),
        # Second overlapping pair, 80-95.
        (80, 90),
        (85, 95),
    )
    for start, end in additions:
        intervals.add(start, end)
    expected = 0.35
    self.assertEqual(expected, intervals.coverage())
def testOneIntervalInMiddle(self):
    """
    A lone interval lying in the interior of the hit must make walk
    yield three intervals: empty, full, empty.
    """
    intervals = ReadIntervals(100)
    intervals.add(50, 60)
    leading = (self.EMPTY, (0, 50))
    covered = (self.FULL, (50, 60))
    trailing = (self.EMPTY, (60, 100))
    self.assertEqual([leading, covered, trailing], list(intervals.walk()))
def testOneIntervalEndingAfterHitEnd(self):
    """
    A lone interval that begins part-way through the hit and finishes
    beyond its end must make walk yield an empty interval followed by a
    full one.
    """
    intervals = ReadIntervals(100)
    intervals.add(50, 150)
    leading = (self.EMPTY, (0, 50))
    covered = (self.FULL, (50, 150))
    self.assertEqual([leading, covered], list(intervals.walk()))
def testOneReadBeforeStart(self):
    """
    A single read that begins before offset zero and covers the start
    of the hit must give exactly one reduction, for the empty region
    that follows the read.
    """
    hitLength = 228
    intervals = ReadIntervals(hitLength)
    intervals.add(-10, 100)
    offsets = OffsetAdjuster(intervals)
    afterRead = (228, 121)
    self.assertEqual([afterRead], offsets.adjustments())
    self.assertEqual(107, offsets.adjustOffset(hitLength))
def testOneIntervalStartingBeforeZero(self):
    """
    A lone interval that begins before offset zero and stops part-way
    through the hit must make walk yield a full interval followed by an
    empty one.
    """
    intervals = ReadIntervals(100)
    intervals.add(-50, 50)
    covered = (self.FULL, (-50, 50))
    trailing = (self.EMPTY, (50, 100))
    self.assertEqual([covered, trailing], list(intervals.walk()))
def testTwoReadsInMiddle(self):
    """
    Two reads in the interior of the hit must give three reductions
    (after the first empty area, after the second empty area, and after
    the final empty area), and HSPs must be shifted accordingly.
    """
    hitLength = 132
    intervals = ReadIntervals(hitLength)
    for start, end in ((32, 42), (58, 68)):
        intervals.add(start, end)
    offsets = OffsetAdjuster(intervals)
    expectedAdjustments = [
        (32, 27),
        (58, 12),
        (132, 58),
    ]
    self.assertEqual(expectedAdjustments, offsets.adjustments())
    self.assertEqual(hitLength - 27 - 12 - 58,
                     offsets.adjustOffset(hitLength))

    # Each case holds the HSP offsets before and after adjustment, both
    # in the order (readEndInSubject, readStartInSubject, subjectEnd,
    # subjectStart).
    cases = (
        # An HSP at the beginning is unchanged.
        ((10, 0, 10, 0), (10, 0, 10, 0)),
        # An HSP in the first read region.
        ((42, 32, 40, 35), (15, 5, 13, 8)),
        # An HSP in the second read region.
        ((68, 58, 66, 60), (29, 19, 27, 21)),
    )
    for original, adjusted in cases:
        readEnd, readStart, subjectEnd, subjectStart = original
        hsp = HSP(10, readEndInSubject=readEnd,
                  readStartInSubject=readStart,
                  subjectEnd=subjectEnd, subjectStart=subjectStart)
        offsets.adjustHSP(hsp)
        self.assertEqual(adjusted[0], hsp.readEndInSubject)
        self.assertEqual(adjusted[1], hsp.readStartInSubject)
        self.assertEqual(adjusted[2], hsp.subjectEnd)
        self.assertEqual(adjusted[3], hsp.subjectStart)
def testPairOfTwoOverlappingIntervals(self):
    """
    Two separate pairs of overlapping intervals in the interior of the
    hit must make walk yield five intervals, alternating empty and full.
    """
    intervals = ReadIntervals(100)
    additions = (
        # First overlapping pair, 50-70.
        (50, 60),
        (55, 70),
        # Second overlapping pair, 80-95.
        (80, 90),
        (85, 95),
    )
    for start, end in additions:
        intervals.add(start, end)
    expected = [
        (self.EMPTY, (0, 50)),
        (self.FULL, (50, 70)),
        (self.EMPTY, (70, 80)),
        (self.FULL, (80, 95)),
        (self.EMPTY, (95, 100)),
    ]
    self.assertEqual(expected, list(intervals.walk()))
def testOverlappingIntervalsThatCoverEverythingCoverage(self):
    """
    Several overlapping intervals that together span the entire hit
    must give a coverage of 1.0.
    """
    intervals = ReadIntervals(100)
    additions = (
        (-10, 20),
        (15, 40),
        (40, 70),
        (66, 89),
        (77, 93),
        (70, 110),
    )
    for start, end in additions:
        intervals.add(start, end)
    expected = 1.0
    self.assertEqual(expected, intervals.coverage())
def testOverlappingIntervalsThatCoverEverything(self):
    """
    Several overlapping intervals that together span the entire hit
    must be merged by walk into one full interval.
    """
    intervals = ReadIntervals(100)
    additions = (
        (-10, 20),
        (15, 40),
        (40, 70),
        (66, 89),
        (77, 93),
        (70, 110),
    )
    for start, end in additions:
        intervals.add(start, end)
    expected = [
        (self.FULL, (-10, 110)),
    ]
    self.assertEqual(expected, list(intervals.walk()))
def alignmentGraph(titlesAlignments, title, addQueryLines=True,
                   showFeatures=True, logLinearXAxis=False,
                   logBase=DEFAULT_LOG_LINEAR_X_AXIS_BASE, rankScores=False,
                   colorQueryBases=False, createFigure=True, showFigure=True,
                   readsAx=None, imageFile=None, quiet=False, idList=False,
                   xRange='subject', showOrfs=True):
    """
    Align a set of matching reads against a BLAST or DIAMOND hit.

    The alignments for C{title} are deep-copied before any adjustment, so
    the passed C{titlesAlignments} is not modified by the score ranking or
    offset adjustment done here.

    @param titlesAlignments: A L{dark.titles.TitlesAlignments} instance.
    @param title: A C{str} sequence title that was matched. We plot the
        reads that hit this title.
    @param addQueryLines: if C{True}, draw query lines in full (these will
        then be partly overdrawn by the HSP match against the subject).
        These are the 'whiskers' that potentially protrude from each side
        of a query.
    @param showFeatures: if C{True}, look online for features of the
        subject sequence (given by C{title}).
    @param logLinearXAxis: if C{True}, convert read offsets so that empty
        regions in the plot we're preparing will only be as wide as their
        logged actual values.
    @param logBase: The base of the logarithm to use if logLinearXAxis is
        C{True}.
    @param rankScores: If C{True}, change the e-values and bit scores for
        the reads for each title to be their rank (worst to best).
    @param colorQueryBases: if C{True}, color each base of a query string.
        If C{True}, then addQueryLines is meaningless since the whole
        query is shown colored.
    @param createFigure: If C{True}, create a figure and give it a title.
    @param showFigure: If C{True}, show the created figure. Set this to
        C{False} if you're creating a panel of figures or just want to
        save an image (with C{imageFile}).
    @param readsAx: If not None, use this as the subplot for displaying
        reads.
    @param imageFile: If not None, specifies a filename to write the image
        to.
    @param quiet: If C{True}, don't print progress / timing output.
    @param idList: a dictionary. The keys are colors and the values are
        lists of read identifiers that should be colored in the respective
        color. Only consulted when C{colorQueryBases} is false; reads not
        mentioned are drawn in blue.
    @param xRange: set to either 'subject' or 'reads' to indicate the
        range of the X axis.
    @param showOrfs: If C{True}, open reading frames will be displayed.
        This is switched off internally when the subject is not
        nucleotides.
    @raise AssertionError: if C{xRange} is not 'subject' or 'reads'.
    @raise ValueError: if C{colorQueryBases} is false and a read id appears
        more than once in C{idList}.
    @return: A C{dict} with keys 'adjustOffset' (the function used to
        adjust X offsets; the identity unless C{logLinearXAxis} is true),
        'features' (the value returned by the feature adder, or C{None} if
        C{showFeatures} is false), and the plotted extents 'minX', 'maxX',
        'minY' and 'maxY'.
    """

    startTime = time()

    assert xRange in ('subject', 'reads'), (
        'xRange must be either "subject" or "reads".')

    if createFigure:
        width = 20
        figure = plt.figure(figsize=(width, 20))

    # Remember whether the caller supplied the reads axes; we only title
    # and label axes that we created ourselves (see below).
    createdReadsAx = readsAx is None

    # Lay out the sub-plots. The reads axes always come last; feature and
    # ORF axes (if requested) are stacked above them.
    if showFeatures:
        if showOrfs:
            gs = gridspec.GridSpec(4, 1, height_ratios=[3, 1, 1, 12])
            featureAx = plt.subplot(gs[0, 0])
            orfAx = plt.subplot(gs[1, 0])
            orfReversedAx = plt.subplot(gs[2, 0])
            readsAx = readsAx or plt.subplot(gs[3, 0])
        else:
            gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1])
            featureAx = plt.subplot(gs[0, 0])
            readsAx = readsAx or plt.subplot(gs[1, 0])
    else:
        if showOrfs:
            gs = gridspec.GridSpec(3, 1, height_ratios=[1, 1, 12])
            orfAx = plt.subplot(gs[0, 0])
            orfReversedAx = plt.subplot(gs[1, 0])
            readsAx = readsAx or plt.subplot(gs[2, 0])
        else:
            readsAx = readsAx or plt.subplot(111)

    # Make a deep copy of the title alignments. We're potentially going to
    # change the HSP scores, the X axis offsets, etc., and we don't want to
    # interfere with the data we were passed.
    titleAlignments = deepcopy(titlesAlignments[title])

    readsAlignments = titlesAlignments.readsAlignments
    subjectIsNucleotides = readsAlignments.params.subjectIsNucleotides

    if showOrfs and not subjectIsNucleotides:
        # We cannot show ORFs when displaying protein plots.
        # NOTE(review): any ORF axes created in the layout above are left
        # in place (unused) when this happens - confirm that is intended.
        showOrfs = False

    # Allow the class of titlesAlignments to adjust HSPs for plotting,
    # if it has a method for doing so.
    try:
        adjuster = readsAlignments.adjustHspsForPlotting
    except AttributeError:
        pass
    else:
        adjuster(titleAlignments)

    if rankScores:
        # Replace each HSP's score by its 1-based position in sorted order.
        reverse = titlesAlignments.scoreClass is not HigherIsBetterScore
        for rank, hsp in enumerate(sorted(titleAlignments.hsps(),
                                   reverse=reverse), start=1):
            hsp.score.score = rank

    if logLinearXAxis:
        readIntervals = ReadIntervals(titleAlignments.subjectLength)
        # Examine all HSPs so we can build an offset adjuster.
        for hsp in titleAlignments.hsps():
            readIntervals.add(hsp.readStartInSubject, hsp.readEndInSubject)
        # Now adjust offsets in all HSPs.
        offsetAdjuster = OffsetAdjuster(readIntervals, base=logBase)
        for hsp in titleAlignments.hsps():
            offsetAdjuster.adjustHSP(hsp)
        # A function for adjusting other offsets, below.
        adjustOffset = offsetAdjuster.adjustOffset
    else:
        # No adjustment is needed, so offsets are passed through unchanged.
        def adjustOffset(offset):
            return offset

    # It would be more efficient to only walk through all HSPs once and
    # compute these values all at once, but for now this is simple and clear.
    maxY = int(ceil(titleAlignments.bestHsp().score.score))
    minY = int(titleAlignments.worstHsp().score.score)
    maxX = max(hsp.readEndInSubject for hsp in titleAlignments.hsps())
    minX = min(hsp.readStartInSubject for hsp in titleAlignments.hsps())

    if xRange == 'subject':
        # We'll display a graph for the full subject range. Adjust X axis
        # min/max to make sure we cover at least zero to the sequence length.
        maxX = max(titleAlignments.subjectLength, maxX)
        minX = min(0, minX)

    # Swap min & max Y values, if needed, as it's possible we are dealing
    # with LSPs but that the score adjuster made numerically greater values
    # for those that were small.
    if maxY < minY:
        (maxY, minY) = (minY, maxY)

    if logLinearXAxis:
        # Adjust minX and maxX if we have gaps at the subject start or end.
        gaps = list(readIntervals.walk())
        if gaps:
            # Check start of first gap:
            intervalType, (start, stop) = gaps[0]
            if intervalType == ReadIntervals.EMPTY:
                adjustedStart = adjustOffset(start)
                if adjustedStart < minX:
                    minX = adjustedStart
            # Check stop of last gap:
            intervalType, (start, stop) = gaps[-1]
            if intervalType == ReadIntervals.EMPTY:
                adjustedStop = adjustOffset(stop)
                if adjustedStop > maxX:
                    maxX = adjustedStop

    # We're all set up to start plotting the graph.

    # Add light grey vertical rectangles to show the logarithmic gaps. Add
    # these first so that reads will be plotted on top of them. Only draw
    # gaps that are more than SMALLEST_LOGGED_GAP_TO_DISPLAY pixels wide as
    # we could have millions of tiny gaps for a bacteria and drawing them
    # all will be slow and only serves to make the entire background grey.
    # Nothing is drawn at all once there are 100 or more adjustments.
    if logLinearXAxis and len(offsetAdjuster.adjustments()) < 100:
        for (intervalType, interval) in readIntervals.walk():
            if intervalType == ReadIntervals.EMPTY:
                adjustedStart = adjustOffset(interval[0])
                adjustedStop = adjustOffset(interval[1])
                width = adjustedStop - adjustedStart
                if width >= SMALLEST_LOGGED_GAP_TO_DISPLAY:
                    readsAx.axvspan(adjustedStart, adjustedStop,
                                    color='#f4f4f4')

    if colorQueryBases:
        # Color each query by its bases.
        xScale = 3
        yScale = 2
        # One extra row is allowed for when ranks are plotted.
        baseImage = BaseImage(
            maxX - minX, maxY - minY + (1 if rankScores else 0),
            xScale, yScale)
        for alignment in titleAlignments:
            for hsp in alignment.hsps:
                y = hsp.score.score - minY
                # If the product of the subject and read frame values is +ve,
                # then they're either both +ve or both -ve, so we just use the
                # read as is. Otherwise, we need to reverse complement it.
                if hsp.subjectFrame * hsp.readFrame > 0:
                    query = alignment.read.sequence
                else:
                    # One of the subject or query has negative sense.
                    query = alignment.read.reverseComplement().sequence
                readStartInSubject = hsp.readStartInSubject
                # There are 3 parts of the query string we need to
                # display. 1) the left part (if any) before the matched
                # part of the subject. 2) the matched part (which can
                # include gaps in the query and/or subject). 3) the right
                # part (if any) after the matched part. For each part,
                # calculate the ranges in which we have to make the
                # comparison between subject and query.

                # NOTE: never use hsp['origHsp'].gaps to calculate the number
                # of gaps, as this number contains gaps in both subject and
                # query.

                # 1. Left part:
                leftRange = hsp.subjectStart - readStartInSubject

                # 2. Match, middle part:
                middleRange = len(hsp.readMatchedSequence)

                # 3. Right part:
                # Using hsp.readEndInSubject - hsp.subjectEnd to calculate the
                # length of the right part leads to the part being too long.
                # The number of gaps needs to be subtracted to get the right
                # length.
                origQuery = hsp.readMatchedSequence.upper()
                rightRange = (hsp.readEndInSubject - hsp.subjectEnd -
                              origQuery.count('-'))

                # 1. Left part.
                # NOTE(review): unlike the right part below, the query bases
                # here are not upper-cased before the QUERY_COLORS lookup -
                # confirm whether lower-case read sequences can occur.
                xOffset = readStartInSubject - minX
                queryOffset = 0
                for queryIndex in range(leftRange):
                    color = QUERY_COLORS.get(query[queryOffset + queryIndex],
                                             DEFAULT_BASE_COLOR)
                    baseImage.set(xOffset + queryIndex, y, color)

                # 2. Match part.
                xOffset = hsp.subjectStart - minX
                xIndex = 0
                # NOTE(review): queryOffset is assigned here but not read
                # again in the rest of this loop body.
                queryOffset = hsp.subjectStart - hsp.readStartInSubject
                origSubject = hsp.subjectMatchedSequence
                for matchIndex in range(middleRange):
                    if origSubject[matchIndex] == '-':
                        # A gap in the subject was needed to match the query.
                        # In our graph we keep the subject the same even in the
                        # case where BLAST opened gaps in it, so we compensate
                        # for the gap in the subject by not showing this base
                        # of the query.
                        pass
                    else:
                        if origSubject[matchIndex] == origQuery[matchIndex]:
                            # The query matched the subject at this location.
                            # Matching bases are all colored in the same
                            # 'match' color.
                            color = QUERY_COLORS['match']
                        else:
                            if origQuery[matchIndex] == '-':
                                # A gap in the query. All query gaps get the
                                # same 'gap' color.
                                color = QUERY_COLORS['gap']
                            else:
                                # Query doesn't match subject (and is not a
                                # gap).
                                color = QUERY_COLORS.get(origQuery[matchIndex],
                                                         DEFAULT_BASE_COLOR)
                        baseImage.set(xOffset + xIndex, y, color)
                        xIndex += 1

                # 3. Right part.
                xOffset = hsp.subjectEnd - minX
                backQuery = query[-rightRange:].upper()
                for queryIndex in range(rightRange):
                    color = QUERY_COLORS.get(backQuery[queryIndex],
                                             DEFAULT_BASE_COLOR)
                    baseImage.set(xOffset + queryIndex, y, color)

        readsAx.imshow(baseImage.data, aspect='auto', origin='lower',
                       interpolation='nearest',
                       extent=[minX, maxX, minY, maxY])
    else:
        # Add horizontal lines for all the query sequences. These will be the
        # grey 'whiskers' in the plots once we (below) draw the matched part
        # on top of part of them.
        if addQueryLines:
            for hsp in titleAlignments.hsps():
                y = hsp.score.score
                line = Line2D([hsp.readStartInSubject, hsp.readEndInSubject],
                              [y, y], color='#aaaaaa')
                readsAx.add_line(line)

        # Add the horizontal BLAST alignment lines.

        # If an idList is given set things up to look up read colors.
        readColor = {}
        if idList:
            for color, reads in idList.items():
                for read in reads:
                    if read in readColor:
                        raise ValueError('Read %s is specified multiple '
                                         'times in idList' % read)
                    else:
                        readColor[read] = color

        # Draw the matched region.
        for titleAlignment in titleAlignments:
            readId = titleAlignment.read.id
            for hsp in titleAlignment.hsps:
                y = hsp.score.score
                line = Line2D([hsp.subjectStart, hsp.subjectEnd], [y, y],
                              color=readColor.get(readId, 'blue'))
                readsAx.add_line(line)

    if showOrfs:
        subject = readsAlignments.getSubjectSequence(title)
        orfs.addORFs(orfAx, subject.sequence, minX, maxX, adjustOffset)
        orfs.addReversedORFs(orfReversedAx,
                             subject.reverseComplement().sequence,
                             minX, maxX, adjustOffset)

    if showFeatures:
        if subjectIsNucleotides:
            featureAdder = NucleotideFeatureAdder()
        else:
            featureAdder = ProteinFeatureAdder()

        features = featureAdder.add(featureAx, title, minX, maxX,
                                    adjustOffset)

        # If there are features and there weren't too many of them, add
        # vertical feature lines to the reads and ORF axes.
        if features and not featureAdder.tooManyFeaturesToPlot:
            for feature in features:
                start = feature.start
                end = feature.end
                color = feature.color
                readsAx.axvline(x=start, color=color)
                readsAx.axvline(x=end, color='#cccccc')
                if showOrfs:
                    orfAx.axvline(x=start, color=color)
                    orfAx.axvline(x=end, color='#cccccc')
                    orfReversedAx.axvline(x=start, color=color)
                    orfReversedAx.axvline(x=end, color='#cccccc')
    else:
        features = None

    # We'll return some information we've gathered.
    result = {
        'adjustOffset': adjustOffset,
        'features': features,
        'minX': minX,
        'maxX': maxX,
        'minY': minY,
        'maxY': maxY,
    }

    # Allow the class of titlesAlignments to add to the plot, if it has a
    # method for doing so.
    try:
        adjuster = readsAlignments.adjustPlot
    except AttributeError:
        pass
    else:
        adjuster(readsAx)

    # Titles, axis, etc.
    if createFigure:
        readCount = titleAlignments.readCount()
        hspCount = titleAlignments.hspCount()
        figure.suptitle(
            '%s\nLength %d %s, %d read%s, %d HSP%s.' %
            (
                fill(titleAlignments.subjectTitle, 80),
                titleAlignments.subjectLength,
                'nt' if subjectIsNucleotides else 'aa',
                readCount, '' if readCount == 1 else 's',
                hspCount, '' if hspCount == 1 else 's'
            ),
            fontsize=20)

    # Add a title and y-axis label, but only if we made the reads axes.
    if createdReadsAx:
        readsAx.set_title('Read alignments', fontsize=20)
        ylabel = readsAlignments.params.scoreTitle
        if rankScores:
            ylabel += ' rank'
        # NOTE(review): plt.ylabel presumably labels pyplot's current axes
        # rather than readsAx explicitly - confirm these are the same here.
        plt.ylabel(ylabel, fontsize=17)

    # Set the x-axis limits.
    readsAx.set_xlim([minX - 1, maxX + 1])

    readsAx.set_ylim([0, int(maxY * Y_AXIS_UPPER_PADDING)])
    readsAx.grid()
    if createFigure:
        if showFigure:
            plt.show()
        if imageFile:
            figure.savefig(imageFile)
    stop = time()
    if not quiet:
        report('Graph generated in %.3f mins.' % ((stop - startTime) / 60.0))

    return result
def alignmentGraph(titlesAlignments, title, accession, addQueryLines=True,
                   showFeatures=True, logLinearXAxis=False,
                   logBase=DEFAULT_LOG_LINEAR_X_AXIS_BASE, rankScores=False,
                   createFigure=True, showFigure=True, readsAx=None,
                   imageFile=None, quiet=False, idList=False,
                   xRange='subject'):
    """
    Align a set of matching reads against a BLAST or DIAMOND hit.

    @param titlesAlignments: A L{dark.titles.TitlesAlignments} instance.
    @param title: A C{str} sequence title that was matched. We plot the
        reads that hit this title.
    @param accession: The C{str} accession number of the matched title. It
        is shown in parentheses after the subject title in the figure title.
    @param addQueryLines: if C{True}, draw query lines in full (these will
        then be partly overdrawn by the HSP match against the subject). These
        are the 'whiskers' that potentially protrude from each side of a
        query.
    @param showFeatures: if C{True}, add the features of the subject sequence
        (looked up using C{title}) to a separate features subplot, and draw
        vertical feature lines on the reads subplot.
    @param logLinearXAxis: if C{True}, convert read offsets so that empty
        regions in the plot we're preparing will only be as wide as their
        logged actual values.
    @param logBase: The base of the logarithm to use if logLinearXAxis is
        C{True}.
    @param rankScores: If C{True}, change the e-values and bit scores for the
        reads for each title to be their rank (worst to best).
    @param createFigure: If C{True}, create a figure and give it a title.
    @param showFigure: If C{True}, show the created figure. Set this to
        C{False} if you're creating a panel of figures or just want to save
        an image (with C{imageFile}).
    @param readsAx: If not None, use this as the subplot for displaying reads.
    @param imageFile: If not None, specifies a filename to write the image to.
    @param quiet: If C{True}, don't print progress / timing output.
    @param idList: a dictionary (or a false value for no special coloring).
        Each key is a color and each value is a list of read identifiers that
        should be drawn in that color. Reads not mentioned are drawn in blue.
    @param xRange: set to either 'subject' or 'reads' to indicate the range
        of the X axis.
    @raise ValueError: If a read id is given more than once in C{idList}.
    @return: A C{dict} with keys 'adjustOffset' (the function used to adjust
        X axis offsets), 'features' (the features that were added, or C{None}
        if C{showFeatures} is false), 'minX', 'maxX', 'minY', and 'maxY'.
    """

    startTime = time()

    assert xRange in ('subject', 'reads'), (
        'xRange must be either "subject" or "reads".')

    if createFigure:
        figure = plt.figure(figsize=(20, 20))

    # Remember whether we made the reads axes ourselves, because we only
    # give them a title and a y-axis label (below) in that case.
    createdReadsAx = readsAx is None

    if showFeatures:
        gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1])
        featureAx = plt.subplot(gs[0, 0])
        if createdReadsAx:
            readsAx = plt.subplot(gs[1, 0])
    elif createdReadsAx:
        readsAx = plt.subplot(111)

    # Make a deep copy of the title alignments. We're potentially going to
    # change the HSP scores, the X axis offsets, etc., and we don't want to
    # interfere with the data we were passed.
    titleAlignments = deepcopy(titlesAlignments[title])

    readsAlignments = titlesAlignments.readsAlignments
    subjectIsNucleotides = readsAlignments.params.subjectIsNucleotides

    # Allow the class of titlesAlignments to adjust HSPs for plotting,
    # if it has a method for doing so.
    try:
        adjuster = readsAlignments.adjustHspsForPlotting
    except AttributeError:
        pass
    else:
        adjuster(titleAlignments)

    if rankScores:
        # Replace each HSP's score by its rank (starting from 1) in the
        # sorted order of the HSPs.
        reverse = titlesAlignments.scoreClass is not HigherIsBetterScore
        rankedHsps = sorted(titleAlignments.hsps(), reverse=reverse)
        for rank, hsp in enumerate(rankedHsps, start=1):
            hsp.score.score = rank

    if logLinearXAxis:
        readIntervals = ReadIntervals(titleAlignments.subjectLength)
        # Examine all HSPs so we can build an offset adjuster.
        for hsp in titleAlignments.hsps():
            readIntervals.add(hsp.readStartInSubject, hsp.readEndInSubject)
        # Now adjust offsets in all HSPs.
        offsetAdjuster = OffsetAdjuster(readIntervals, base=logBase)
        for hsp in titleAlignments.hsps():
            offsetAdjuster.adjustHSP(hsp)
        # A function for adjusting other offsets, below.
        adjustOffset = offsetAdjuster.adjustOffset
    else:
        def adjustOffset(offset):
            return offset

    # It would be more efficient to only walk through all HSPs once and
    # compute these values all at once, but for now this is simple and clear.
    maxY = int(ceil(titleAlignments.bestHsp().score.score))
    minY = int(titleAlignments.worstHsp().score.score)
    maxX = max(hsp.readEndInSubject for hsp in titleAlignments.hsps())
    minX = min(hsp.readStartInSubject for hsp in titleAlignments.hsps())

    if xRange == 'subject':
        # We'll display a graph for the full subject range. Adjust X axis
        # min/max to make sure we cover at least zero to the sequence length.
        maxX = max(titleAlignments.subjectLength, maxX)
        minX = min(0, minX)

    # Swap min & max Y values, if needed, as it's possible we are dealing
    # with LSPs but that the score adjuster made numerically greater values
    # for those that were small.
    if maxY < minY:
        maxY, minY = minY, maxY

    if logLinearXAxis:
        # Adjust minX and maxX if we have gaps at the subject start or end.
        gaps = list(readIntervals.walk())
        if gaps:
            # Check start of first gap:
            intervalType, (gapStart, _) = gaps[0]
            if intervalType == ReadIntervals.EMPTY:
                minX = min(minX, adjustOffset(gapStart))
            # Check stop of last gap:
            intervalType, (_, gapStop) = gaps[-1]
            if intervalType == ReadIntervals.EMPTY:
                maxX = max(maxX, adjustOffset(gapStop))

    # We're all set up to start plotting the graph.

    # Add light grey vertical rectangles to show the logarithmic gaps. Add
    # these first so that reads will be plotted on top of them. Only draw
    # gaps whose adjusted width is at least SMALLEST_LOGGED_GAP_TO_DISPLAY
    # (and only when there are fewer than 100 adjustments) as we could have
    # millions of tiny gaps for a bacteria and drawing them all will be slow
    # and only serves to make the entire background grey.
    if logLinearXAxis and len(offsetAdjuster.adjustments()) < 100:
        for intervalType, (gapStart, gapStop) in readIntervals.walk():
            if intervalType == ReadIntervals.EMPTY:
                adjustedStart = adjustOffset(gapStart)
                adjustedStop = adjustOffset(gapStop)
                if (adjustedStop - adjustedStart >=
                        SMALLEST_LOGGED_GAP_TO_DISPLAY):
                    readsAx.axvspan(adjustedStart, adjustedStop,
                                    color='#f4f4f4')

    # The reads are always drawn, whether or not gap rectangles were drawn
    # above. (Previously a stray 'else' tied the read drawing to the gap
    # drawing, so reads went missing from log-linear plots.)

    # Add horizontal lines for all the query sequences. These will be the
    # grey 'whiskers' in the plots once we (below) draw the matched part
    # on top of part of them.
    if addQueryLines:
        for hsp in titleAlignments.hsps():
            y = hsp.score.score
            readsAx.add_line(
                Line2D([hsp.readStartInSubject, hsp.readEndInSubject],
                       [y, y], color='#aaaaaa'))

    # Add the horizontal BLAST alignment lines.

    # If an idList is given set things up to look up read colors.
    readColor = {}
    if idList:
        for color, reads in idList.items():
            for read in reads:
                if read in readColor:
                    raise ValueError('Read %s is specified multiple '
                                     'times in idList' % read)
                readColor[read] = color

    # Draw the matched region.
    for titleAlignment in titleAlignments:
        color = readColor.get(titleAlignment.read.id, 'blue')
        for hsp in titleAlignment.hsps:
            y = hsp.score.score
            readsAx.add_line(
                Line2D([hsp.subjectStart, hsp.subjectEnd], [y, y],
                       color=color))

    if showFeatures:
        if subjectIsNucleotides:
            featureAdder = NucleotideFeatureAdder()
        else:
            featureAdder = ProteinFeatureAdder()

        features = featureAdder.add(featureAx, title, minX, maxX,
                                    adjustOffset)

        # If there are features and there weren't too many of them, add
        # vertical feature lines to the reads axes.
        if features and not featureAdder.tooManyFeaturesToPlot:
            for feature in features:
                readsAx.axvline(x=feature.start, color=feature.color)
                readsAx.axvline(x=feature.end, color='#cccccc')
    else:
        features = None

    # We'll return some information we've gathered.
    result = {
        'adjustOffset': adjustOffset,
        'features': features,
        'minX': minX,
        'maxX': maxX,
        'minY': minY,
        'maxY': maxY,
    }

    # Allow the class of titlesAlignments to add to the plot, if it has a
    # method for doing so.
    try:
        adjuster = readsAlignments.adjustPlot
    except AttributeError:
        pass
    else:
        adjuster(readsAx)

    # Titles, axis, etc.
    if createFigure:
        readCount = titleAlignments.readCount()
        hspCount = titleAlignments.hspCount()
        figure.suptitle(
            '%s (%s)\nLength %d %s, %d read%s, %d HSP%s.' %
            (fill(titleAlignments.subjectTitle, 80),
             accession,
             titleAlignments.subjectLength,
             'nt' if subjectIsNucleotides else 'aa',
             readCount, '' if readCount == 1 else 's',
             hspCount, '' if hspCount == 1 else 's'),
            fontsize=20)

    # Add a title and y-axis label, but only if we made the reads axes.
    if createdReadsAx:
        readsAx.set_title('Read alignments', fontsize=20)
        ylabel = readsAlignments.params.scoreTitle
        if rankScores:
            ylabel += ' rank'
        # Label the reads axes explicitly instead of relying on them still
        # being pyplot's current axes (the adjustPlot hook above is free to
        # change that).
        readsAx.set_ylabel(ylabel, fontsize=17)

    # Set the x-axis limits.
    readsAx.set_xlim([minX - 1, maxX + 1])

    readsAx.set_ylim([0, int(maxY * Y_AXIS_UPPER_PADDING)])
    readsAx.grid()

    if createFigure:
        if showFigure:
            plt.show()
        if imageFile:
            figure.savefig(imageFile)

    elapsed = time() - startTime
    if not quiet:
        report('Graph generated in %.3f mins.' % (elapsed / 60.0))

    return result