def testOneFeatureAdjusted(self):
    """
    If the sequence fetcher used by a L{ProteinFeatureAdder} returns a
    record holding one wanted feature and an offset adjuster is given,
    the C{plot}, C{axis} and C{legend} methods of the figure must be
    called with the expected arguments (the feature offsets passed
    through the adjuster) and the C{add} call must return a
    L{_FeatureList} containing that one feature.
    """
    def tripleOffset(offset):
        return 3 * offset

    def fetcher(title, db="database"):
        site = SeqFeature(
            type="Site",
            qualifiers={"a": ["b"]},
            location=FeatureLocation(100, 200),
        )
        return SeqRecord(None, features=[site])

    featureAdder = ProteinFeatureAdder()
    fig = plt.subplot(111)
    # Swap the drawing methods for mocks so their calls can be inspected.
    for methodName in ("plot", "axis", "legend"):
        setattr(fig, methodName, MagicMock())

    result = featureAdder.add(fig, "title", 0, 300, tripleOffset,
                              sequenceFetcher=fetcher)

    # The feature spans 100-200, so the adjuster moves it to 300-600.
    expectedColor = (0.2298057, 0.298717966, 0.75368315299999999, 1.0)
    fig.plot.assert_called_with(
        [300, 600], [-0.0, -0.0], color=expectedColor, linewidth=2
    )
    fig.axis.assert_called_with([0, 300, -0.4, 0.2])
    fig.legend.assert_called_with(
        ["100-200 Site. a: b"],
        loc="lower center",
        shadow=True,
        bbox_to_anchor=(0.5, 1.4),
        ncol=2,
        fancybox=True,
    )
    self.assertIsInstance(result, _FeatureList)
    self.assertEqual(1, len(result))
def testUnwantedFeature(self):
    """
    If the sequence fetcher used by a L{ProteinFeatureAdder} returns a
    feature whose type is not wanted, the figure's C{plot} method must
    not be called and the C{add} method must return an empty feature
    list.
    """
    def fetcher(title, db="database"):
        unwanted = SeqFeature(
            type="unwanted",
            qualifiers={"a": ["b"]},
            location=FeatureLocation(100, 200),
        )
        return SeqRecord(None, features=[unwanted])

    featureAdder = ProteinFeatureAdder()
    fig = plt.subplot(111)
    # Only plot needs to be observed: it must never be invoked.
    plotMock = MagicMock()
    fig.plot = plotMock

    result = featureAdder.add(fig, "title", 0, 300, identity,
                              sequenceFetcher=fetcher)

    self.assertEqual([], plotMock.call_args_list)
    self.assertEqual([], result)
def alignmentGraph(titlesAlignments, title, addQueryLines=True,
                   showFeatures=True, logLinearXAxis=False,
                   logBase=DEFAULT_LOG_LINEAR_X_AXIS_BASE, rankScores=False,
                   colorQueryBases=False, createFigure=True, showFigure=True,
                   readsAx=None, imageFile=None, quiet=False, idList=False,
                   xRange='subject', showOrfs=True):
    """
    Align a set of matching reads against a BLAST or DIAMOND hit.

    @param titlesAlignments: A L{dark.titles.TitlesAlignments} instance.
    @param title: A C{str} sequence title that was matched. We plot the
        reads that hit this title.
    @param addQueryLines: if C{True}, draw query lines in full (these will
        then be partly overdrawn by the HSP match against the subject).
        These are the 'whiskers' that potentially protrude from each side
        of a query.
    @param showFeatures: if C{True}, look up features of the subject
        sequence (given by C{title}) and plot them on their own axes.
    @param logLinearXAxis: if C{True}, convert read offsets so that empty
        regions in the plot we're preparing will only be as wide as their
        logged actual values.
    @param logBase: The base of the logarithm to use if logLinearXAxis is
        C{True}.
    @param rankScores: If C{True}, change the e-values and bit scores for
        the reads for each title to be their rank (worst to best).
    @param colorQueryBases: if C{True}, color each base of a query string.
        If C{True}, then addQueryLines is meaningless since the whole
        query is shown colored.
    @param createFigure: If C{True}, create a figure and give it a title.
    @param showFigure: If C{True}, show the created figure. Set this to
        C{False} if you're creating a panel of figures or just want to
        save an image (with C{imageFile}). Only used when C{createFigure}
        is C{True}.
    @param readsAx: If not None, use this as the subplot for displaying
        reads.
    @param imageFile: If not None, specifies a filename to write the image
        to. Only used when C{createFigure} is C{True}.
    @param quiet: If C{True}, don't print progress / timing output.
    @param idList: a dictionary. Each key is a color and each value is a
        list of read identifiers that should be drawn in that color. Any
        false value means all reads are drawn in the default color. Not
        used when C{colorQueryBases} is C{True}.
    @param xRange: set to either 'subject' or 'reads' to indicate the range
        of the X axis.
    @param showOrfs: If C{True}, open reading frames will be displayed.
        This is ignored if the subject is not a nucleotide sequence.
    @raise AssertionError: If C{xRange} is not 'subject' or 'reads'.
    @raise ValueError: If a read id is given more than once in C{idList}.
    @return: A C{dict} with keys 'adjustOffset' (the function used to
        adjust X axis offsets), 'features' (the value returned by the
        feature adder, or C{None} if C{showFeatures} is false), 'minX',
        'maxX', 'minY', and 'maxY' (the extremes of the plotted data).
    """
    startTime = time()

    assert xRange in ('subject', 'reads'), (
        'xRange must be either "subject" or "reads".')

    if createFigure:
        figure = plt.figure(figsize=(20, 20))

    createdReadsAx = readsAx is None

    readsAlignments = titlesAlignments.readsAlignments
    subjectIsNucleotides = readsAlignments.params.subjectIsNucleotides

    # We cannot show ORFs when displaying protein plots. Decide this before
    # laying out the axes so that no empty ORF axes are created.
    if showOrfs and not subjectIsNucleotides:
        showOrfs = False

    if showFeatures and showOrfs:
        gs = gridspec.GridSpec(4, 1, height_ratios=[3, 1, 1, 12])
        featureAx = plt.subplot(gs[0, 0])
        orfAx = plt.subplot(gs[1, 0])
        orfReversedAx = plt.subplot(gs[2, 0])
        readsAx = readsAx or plt.subplot(gs[3, 0])
    elif showFeatures:
        gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1])
        featureAx = plt.subplot(gs[0, 0])
        readsAx = readsAx or plt.subplot(gs[1, 0])
    elif showOrfs:
        gs = gridspec.GridSpec(3, 1, height_ratios=[1, 1, 12])
        orfAx = plt.subplot(gs[0, 0])
        orfReversedAx = plt.subplot(gs[1, 0])
        readsAx = readsAx or plt.subplot(gs[2, 0])
    else:
        readsAx = readsAx or plt.subplot(111)

    # Make a deep copy of the title alignments. We're potentially going to
    # change the HSP scores, the X axis offsets, etc., and we don't want to
    # interfere with the data we were passed.
    titleAlignments = deepcopy(titlesAlignments[title])

    # Allow the class of titlesAlignments to adjust HSPs for plotting,
    # if it has a method for doing so.
    try:
        adjuster = readsAlignments.adjustHspsForPlotting
    except AttributeError:
        pass
    else:
        adjuster(titleAlignments)

    if rankScores:
        reverse = titlesAlignments.scoreClass is not HigherIsBetterScore
        rankedHsps = sorted(titleAlignments.hsps(), reverse=reverse)
        for rank, hsp in enumerate(rankedHsps, start=1):
            hsp.score.score = rank

    if logLinearXAxis:
        readIntervals = ReadIntervals(titleAlignments.subjectLength)
        # Examine all HSPs so we can build an offset adjuster.
        for hsp in titleAlignments.hsps():
            readIntervals.add(hsp.readStartInSubject, hsp.readEndInSubject)
        # Now adjust offsets in all HSPs.
        offsetAdjuster = OffsetAdjuster(readIntervals, base=logBase)
        for hsp in titleAlignments.hsps():
            offsetAdjuster.adjustHSP(hsp)
        # A function for adjusting other offsets, below.
        adjustOffset = offsetAdjuster.adjustOffset
    else:
        def adjustOffset(offset):
            return offset

    # It would be more efficient to only walk through all HSPs once and
    # compute these values all at once, but for now this is simple and clear.
    maxY = int(ceil(titleAlignments.bestHsp().score.score))
    minY = int(titleAlignments.worstHsp().score.score)
    maxX = max(hsp.readEndInSubject for hsp in titleAlignments.hsps())
    minX = min(hsp.readStartInSubject for hsp in titleAlignments.hsps())

    if xRange == 'subject':
        # We'll display a graph for the full subject range. Adjust X axis
        # min/max to make sure we cover at least zero to the sequence length.
        maxX = max(titleAlignments.subjectLength, maxX)
        minX = min(0, minX)

    # Swap min & max Y values, if needed, as it's possible we are dealing
    # with LSPs but that the score adjuster made numerically greater values
    # for those that were small.
    if maxY < minY:
        maxY, minY = minY, maxY

    if logLinearXAxis:
        # Adjust minX and maxX if we have gaps at the subject start or end.
        gaps = list(readIntervals.walk())
        if gaps:
            # Check start of first gap:
            intervalType, (gapStart, _) = gaps[0]
            if intervalType == ReadIntervals.EMPTY:
                minX = min(minX, adjustOffset(gapStart))
            # Check stop of last gap:
            intervalType, (_, gapStop) = gaps[-1]
            if intervalType == ReadIntervals.EMPTY:
                maxX = max(maxX, adjustOffset(gapStop))

    # We're all set up to start plotting the graph.

    # Add light grey vertical rectangles to show the logarithmic gaps. Add
    # these first so that reads will be plotted on top of them. Only draw
    # gaps that are more than SMALLEST_LOGGED_GAP_TO_DISPLAY pixels wide as
    # we could have millions of tiny gaps for a bacteria and drawing them
    # all will be slow and only serves to make the entire background grey.
    if logLinearXAxis and len(offsetAdjuster.adjustments()) < 100:
        for intervalType, interval in readIntervals.walk():
            if intervalType == ReadIntervals.EMPTY:
                adjustedStart = adjustOffset(interval[0])
                adjustedStop = adjustOffset(interval[1])
                gapWidth = adjustedStop - adjustedStart
                if gapWidth >= SMALLEST_LOGGED_GAP_TO_DISPLAY:
                    readsAx.axvspan(adjustedStart, adjustedStop,
                                    color='#f4f4f4')

    if colorQueryBases:
        # Color each query by its bases.
        xScale = 3
        yScale = 2
        baseImage = BaseImage(
            maxX - minX, maxY - minY + (1 if rankScores else 0),
            xScale, yScale)
        for alignment in titleAlignments:
            for hsp in alignment.hsps:
                y = hsp.score.score - minY
                # If the product of the subject and read frame values is +ve,
                # then they're either both +ve or both -ve, so we just use the
                # read as is. Otherwise, we need to reverse complement it.
                #
                # NOTE(review): the right-hand sides of the two assignments
                # to query were missing from the code as received. They are
                # reconstructed here on the assumption that each alignment
                # has a read attribute with sequence and reverseComplement()
                # - confirm against the alignment class.
                if hsp.subjectFrame * hsp.readFrame > 0:
                    query = alignment.read.sequence
                else:
                    # One of the subject or query has negative sense.
                    query = alignment.read.reverseComplement().sequence
                readStartInSubject = hsp.readStartInSubject
                # There are 3 parts of the query string we need to
                # display. 1) the left part (if any) before the matched
                # part of the subject. 2) the matched part (which can
                # include gaps in the query and/or subject). 3) the right
                # part (if any) after the matched part. For each part,
                # calculate the ranges in which we have to make the
                # comparison between subject and query.

                # NOTE: never use hsp['origHsp'].gaps to calculate the number
                # of gaps, as this number contains gaps in both subject and
                # query.

                # 1. Left part:
                leftRange = hsp.subjectStart - readStartInSubject

                # 2. Match, middle part:
                middleRange = len(hsp.readMatchedSequence)

                # 3. Right part:
                # Using hsp.readEndInSubject - hsp.subjectEnd to calculate the
                # length of the right part leads to the part being too long.
                # The number of gaps needs to be subtracted to get the right
                # length.
                origQuery = hsp.readMatchedSequence.upper()
                rightRange = (hsp.readEndInSubject - hsp.subjectEnd -
                              origQuery.count('-'))

                # 1. Left part.
                xOffset = readStartInSubject - minX
                for queryIndex in range(leftRange):
                    color = QUERY_COLORS.get(query[queryIndex],
                                             DEFAULT_BASE_COLOR)
                    baseImage.set(xOffset + queryIndex, y, color)

                # 2. Match part.
                xOffset = hsp.subjectStart - minX
                xIndex = 0
                origSubject = hsp.subjectMatchedSequence
                for matchIndex in range(middleRange):
                    subjectBase = origSubject[matchIndex]
                    if subjectBase == '-':
                        # A gap in the subject was needed to match the query.
                        # In our graph we keep the subject the same even in
                        # the case where BLAST opened gaps in it, so we
                        # compensate for the gap in the subject by not
                        # showing this base of the query.
                        continue
                    queryBase = origQuery[matchIndex]
                    if subjectBase == queryBase:
                        # The query matched the subject at this location.
                        # Matching bases are all colored in the same
                        # 'match' color.
                        color = QUERY_COLORS['match']
                    elif queryBase == '-':
                        # A gap in the query. All query gaps get the
                        # same 'gap' color.
                        color = QUERY_COLORS['gap']
                    else:
                        # Query doesn't match subject (and is not a gap).
                        color = QUERY_COLORS.get(queryBase,
                                                 DEFAULT_BASE_COLOR)
                    baseImage.set(xOffset + xIndex, y, color)
                    xIndex += 1

                # 3. Right part.
                xOffset = hsp.subjectEnd - minX
                backQuery = query[-rightRange:].upper()
                for queryIndex in range(rightRange):
                    color = QUERY_COLORS.get(backQuery[queryIndex],
                                             DEFAULT_BASE_COLOR)
                    baseImage.set(xOffset + queryIndex, y, color)

        # NOTE(review): the first argument to imshow was missing from the
        # code as received. baseImage.data is assumed to be the pixel array
        # built up by baseImage.set - confirm against the BaseImage class.
        readsAx.imshow(baseImage.data, aspect='auto', origin='lower',
                       interpolation='nearest',
                       extent=[minX, maxX, minY, maxY])
    else:
        # Add horizontal lines for all the query sequences. These will be the
        # grey 'whiskers' in the plots once we (below) draw the matched part
        # on top of part of them.
        if addQueryLines:
            for hsp in titleAlignments.hsps():
                y = hsp.score.score
                line = Line2D([hsp.readStartInSubject, hsp.readEndInSubject],
                              [y, y], color='#aaaaaa')
                readsAx.add_line(line)

        # Add the horizontal BLAST alignment lines.

        # If an idList is given set things up to look up read colors.
        readColor = {}
        if idList:
            for color, reads in idList.items():
                for read in reads:
                    if read in readColor:
                        raise ValueError('Read %s is specified multiple '
                                         'times in idList' % read)
                    readColor[read] = color

        # Draw the matched region.
        for titleAlignment in titleAlignments:
            # NOTE(review): the right-hand side of this assignment was
            # missing from the code as received. The read identifier is
            # assumed to live at titleAlignment.read.id - confirm.
            readId = titleAlignment.read.id
            lineColor = readColor.get(readId, 'blue')
            for hsp in titleAlignment.hsps:
                y = hsp.score.score
                line = Line2D([hsp.subjectStart, hsp.subjectEnd], [y, y],
                              color=lineColor)
                readsAx.add_line(line)

    if showOrfs:
        subject = readsAlignments.getSubjectSequence(title)
        orfs.addORFs(orfAx, subject.sequence, minX, maxX, adjustOffset)
        orfs.addReversedORFs(orfReversedAx,
                             subject.reverseComplement().sequence,
                             minX, maxX, adjustOffset)

    if showFeatures:
        if subjectIsNucleotides:
            featureAdder = NucleotideFeatureAdder()
        else:
            featureAdder = ProteinFeatureAdder()

        features = featureAdder.add(featureAx, title, minX, maxX,
                                    adjustOffset)

        # If there are features and there weren't too many of them, add
        # vertical feature lines to the reads and ORF axes.
        if features and not featureAdder.tooManyFeaturesToPlot:
            featureLineAxes = [readsAx]
            if showOrfs:
                featureLineAxes.extend([orfAx, orfReversedAx])
            for feature in features:
                for ax in featureLineAxes:
                    ax.axvline(x=feature.start, color=feature.color)
                    ax.axvline(x=feature.end, color='#cccccc')
    else:
        features = None

    # We'll return some information we've gathered.
    result = {
        'adjustOffset': adjustOffset,
        'features': features,
        'minX': minX,
        'maxX': maxX,
        'minY': minY,
        'maxY': maxY,
    }

    # Allow the class of titlesAlignments to add to the plot, if it has a
    # method for doing so.
    try:
        adjuster = readsAlignments.adjustPlot
    except AttributeError:
        pass
    else:
        adjuster(readsAx)

    # Titles, axis, etc.
    if createFigure:
        readCount = titleAlignments.readCount()
        hspCount = titleAlignments.hspCount()
        figure.suptitle(
            '%s\nLength %d %s, %d read%s, %d HSP%s.' %
            (
                fill(titleAlignments.subjectTitle, 80),
                titleAlignments.subjectLength,
                'nt' if subjectIsNucleotides else 'aa',
                readCount, '' if readCount == 1 else 's',
                hspCount, '' if hspCount == 1 else 's'
            ),
            fontsize=20)

    # Add a title and y-axis label, but only if we made the reads axes.
    if createdReadsAx:
        readsAx.set_title('Read alignments', fontsize=20)
        ylabel = readsAlignments.params.scoreTitle
        if rankScores:
            ylabel += ' rank'
        # Label the reads axes explicitly rather than whatever pyplot
        # currently considers the active axes.
        readsAx.set_ylabel(ylabel, fontsize=17)

    # Set the x-axis limits.
    readsAx.set_xlim([minX - 1, maxX + 1])

    readsAx.set_ylim([0, int(maxY * Y_AXIS_UPPER_PADDING)])
    readsAx.grid()

    if createFigure:
        # Saving and showing are independent options. Save first, because
        # the figure may no longer be available for saving once an
        # interactive show() has returned.
        if imageFile:
            figure.savefig(imageFile)
        if showFigure:
            plt.show()

    if not quiet:
        elapsed = time() - startTime
        report('Graph generated in %.3f mins.' % (elapsed / 60.0))

    return result
def alignmentGraph(titlesAlignments, title, accession, addQueryLines=True,
                   showFeatures=True, logLinearXAxis=False,
                   logBase=DEFAULT_LOG_LINEAR_X_AXIS_BASE, rankScores=False,
                   createFigure=True, showFigure=True, readsAx=None,
                   imageFile=None, quiet=False, idList=False,
                   xRange='subject'):
    """
    Align a set of matching reads against a BLAST or DIAMOND hit.

    @param titlesAlignments: A L{dark.titles.TitlesAlignments} instance.
    @param title: A C{str} sequence title that was matched. We plot the
        reads that hit this title.
    @param accession: The C{str} accession number of the matched title.
        It is shown in the figure title.
    @param addQueryLines: if C{True}, draw query lines in full (these will
        then be partly overdrawn by the HSP match against the subject).
        These are the 'whiskers' that potentially protrude from each side
        of a query.
    @param showFeatures: if C{True}, look up features of the subject
        sequence (given by C{title}) and plot them on their own axes.
    @param logLinearXAxis: if C{True}, convert read offsets so that empty
        regions in the plot we're preparing will only be as wide as their
        logged actual values.
    @param logBase: The base of the logarithm to use if logLinearXAxis is
        C{True}.
    @param rankScores: If C{True}, change the e-values and bit scores for
        the reads for each title to be their rank (worst to best).
    @param createFigure: If C{True}, create a figure and give it a title.
    @param showFigure: If C{True}, show the created figure. Set this to
        C{False} if you're creating a panel of figures or just want to
        save an image (with C{imageFile}). Only used when C{createFigure}
        is C{True}.
    @param readsAx: If not None, use this as the subplot for displaying
        reads.
    @param imageFile: If not None, specifies a filename to write the image
        to. Only used when C{createFigure} is C{True}.
    @param quiet: If C{True}, don't print progress / timing output.
    @param idList: a dictionary. Each key is a color and each value is a
        list of read identifiers that should be drawn in that color. Any
        false value means all reads are drawn in the default color.
    @param xRange: set to either 'subject' or 'reads' to indicate the range
        of the X axis.
    @raise AssertionError: If C{xRange} is not 'subject' or 'reads'.
    @raise ValueError: If a read id is given more than once in C{idList}.
    @return: A C{dict} with keys 'adjustOffset' (the function used to
        adjust X axis offsets), 'features' (the value returned by the
        feature adder, or C{None} if C{showFeatures} is false), 'minX',
        'maxX', 'minY', and 'maxY' (the extremes of the plotted data).
    """
    startTime = time()

    assert xRange in ('subject', 'reads'), (
        'xRange must be either "subject" or "reads".')

    if createFigure:
        figure = plt.figure(figsize=(20, 20))

    createdReadsAx = readsAx is None

    if showFeatures:
        gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1])
        featureAx = plt.subplot(gs[0, 0])
        readsAx = readsAx or plt.subplot(gs[1, 0])
    else:
        readsAx = readsAx or plt.subplot(111)

    # Make a deep copy of the title alignments. We're potentially going to
    # change the HSP scores, the X axis offsets, etc., and we don't want to
    # interfere with the data we were passed.
    titleAlignments = deepcopy(titlesAlignments[title])

    readsAlignments = titlesAlignments.readsAlignments
    subjectIsNucleotides = readsAlignments.params.subjectIsNucleotides

    # Allow the class of titlesAlignments to adjust HSPs for plotting,
    # if it has a method for doing so.
    try:
        adjuster = readsAlignments.adjustHspsForPlotting
    except AttributeError:
        pass
    else:
        adjuster(titleAlignments)

    if rankScores:
        reverse = titlesAlignments.scoreClass is not HigherIsBetterScore
        rankedHsps = sorted(titleAlignments.hsps(), reverse=reverse)
        for rank, hsp in enumerate(rankedHsps, start=1):
            hsp.score.score = rank

    if logLinearXAxis:
        readIntervals = ReadIntervals(titleAlignments.subjectLength)
        # Examine all HSPs so we can build an offset adjuster.
        for hsp in titleAlignments.hsps():
            readIntervals.add(hsp.readStartInSubject, hsp.readEndInSubject)
        # Now adjust offsets in all HSPs.
        offsetAdjuster = OffsetAdjuster(readIntervals, base=logBase)
        for hsp in titleAlignments.hsps():
            offsetAdjuster.adjustHSP(hsp)
        # A function for adjusting other offsets, below.
        adjustOffset = offsetAdjuster.adjustOffset
    else:
        def adjustOffset(offset):
            return offset

    # It would be more efficient to only walk through all HSPs once and
    # compute these values all at once, but for now this is simple and clear.
    maxY = int(ceil(titleAlignments.bestHsp().score.score))
    minY = int(titleAlignments.worstHsp().score.score)
    maxX = max(hsp.readEndInSubject for hsp in titleAlignments.hsps())
    minX = min(hsp.readStartInSubject for hsp in titleAlignments.hsps())

    if xRange == 'subject':
        # We'll display a graph for the full subject range. Adjust X axis
        # min/max to make sure we cover at least zero to the sequence length.
        maxX = max(titleAlignments.subjectLength, maxX)
        minX = min(0, minX)

    # Swap min & max Y values, if needed, as it's possible we are dealing
    # with LSPs but that the score adjuster made numerically greater values
    # for those that were small.
    if maxY < minY:
        maxY, minY = minY, maxY

    if logLinearXAxis:
        # Adjust minX and maxX if we have gaps at the subject start or end.
        gaps = list(readIntervals.walk())
        if gaps:
            # Check start of first gap:
            intervalType, (gapStart, _) = gaps[0]
            if intervalType == ReadIntervals.EMPTY:
                minX = min(minX, adjustOffset(gapStart))
            # Check stop of last gap:
            intervalType, (_, gapStop) = gaps[-1]
            if intervalType == ReadIntervals.EMPTY:
                maxX = max(maxX, adjustOffset(gapStop))

    # We're all set up to start plotting the graph.

    # Add light grey vertical rectangles to show the logarithmic gaps. Add
    # these first so that reads will be plotted on top of them. Only draw
    # gaps that are more than SMALLEST_LOGGED_GAP_TO_DISPLAY pixels wide as
    # we could have millions of tiny gaps for a bacteria and drawing them
    # all will be slow and only serves to make the entire background grey.
    if logLinearXAxis and len(offsetAdjuster.adjustments()) < 100:
        for intervalType, interval in readIntervals.walk():
            if intervalType == ReadIntervals.EMPTY:
                adjustedStart = adjustOffset(interval[0])
                adjustedStop = adjustOffset(interval[1])
                gapWidth = adjustedStop - adjustedStart
                if gapWidth >= SMALLEST_LOGGED_GAP_TO_DISPLAY:
                    readsAx.axvspan(adjustedStart, adjustedStop,
                                    color='#f4f4f4')

    # The reads are always drawn, whether or not the gap shading above was
    # done. (The code as received had a dangling 'else:' here that tied
    # drawing the reads to the gap-shading condition.)

    # Add horizontal lines for all the query sequences. These will be the
    # grey 'whiskers' in the plots once we (below) draw the matched part
    # on top of part of them.
    if addQueryLines:
        for hsp in titleAlignments.hsps():
            y = hsp.score.score
            line = Line2D([hsp.readStartInSubject, hsp.readEndInSubject],
                          [y, y], color='#aaaaaa')
            readsAx.add_line(line)

    # Add the horizontal BLAST alignment lines.

    # If an idList is given set things up to look up read colors.
    readColor = {}
    if idList:
        for color, reads in idList.items():
            for read in reads:
                if read in readColor:
                    raise ValueError('Read %s is specified multiple '
                                     'times in idList' % read)
                readColor[read] = color

    # Draw the matched region.
    for titleAlignment in titleAlignments:
        # NOTE(review): the right-hand side of this assignment was missing
        # from the code as received. The read identifier is assumed to live
        # at titleAlignment.read.id - confirm.
        readId = titleAlignment.read.id
        lineColor = readColor.get(readId, 'blue')
        for hsp in titleAlignment.hsps:
            y = hsp.score.score
            line = Line2D([hsp.subjectStart, hsp.subjectEnd], [y, y],
                          color=lineColor)
            readsAx.add_line(line)

    if showFeatures:
        if subjectIsNucleotides:
            featureAdder = NucleotideFeatureAdder()
        else:
            featureAdder = ProteinFeatureAdder()

        features = featureAdder.add(featureAx, title, minX, maxX,
                                    adjustOffset)

        # If there are features and there weren't too many of them, add
        # vertical feature lines to the reads axes.
        if features and not featureAdder.tooManyFeaturesToPlot:
            for feature in features:
                readsAx.axvline(x=feature.start, color=feature.color)
                readsAx.axvline(x=feature.end, color='#cccccc')
    else:
        features = None

    # We'll return some information we've gathered.
    result = {
        'adjustOffset': adjustOffset,
        'features': features,
        'minX': minX,
        'maxX': maxX,
        'minY': minY,
        'maxY': maxY,
    }

    # Allow the class of titlesAlignments to add to the plot, if it has a
    # method for doing so.
    try:
        adjuster = readsAlignments.adjustPlot
    except AttributeError:
        pass
    else:
        adjuster(readsAx)

    # Titles, axis, etc.
    if createFigure:
        readCount = titleAlignments.readCount()
        hspCount = titleAlignments.hspCount()
        figure.suptitle(
            '%s (%s)\nLength %d %s, %d read%s, %d HSP%s.' %
            (
                fill(titleAlignments.subjectTitle, 80),
                accession,
                titleAlignments.subjectLength,
                'nt' if subjectIsNucleotides else 'aa',
                readCount, '' if readCount == 1 else 's',
                hspCount, '' if hspCount == 1 else 's'
            ),
            fontsize=20)

    # Add a title and y-axis label, but only if we made the reads axes.
    if createdReadsAx:
        readsAx.set_title('Read alignments', fontsize=20)
        ylabel = readsAlignments.params.scoreTitle
        if rankScores:
            ylabel += ' rank'
        # Label the reads axes explicitly rather than whatever pyplot
        # currently considers the active axes.
        readsAx.set_ylabel(ylabel, fontsize=17)

    # Set the x-axis limits.
    readsAx.set_xlim([minX - 1, maxX + 1])

    readsAx.set_ylim([0, int(maxY * Y_AXIS_UPPER_PADDING)])
    readsAx.grid()

    if createFigure:
        # Saving and showing are independent options. Save first, because
        # the figure may no longer be available for saving once an
        # interactive show() has returned.
        if imageFile:
            figure.savefig(imageFile)
        if showFigure:
            plt.show()

    if not quiet:
        elapsed = time() - startTime
        report('Graph generated in %.3f mins.' % (elapsed / 60.0))

    return result