def lengthHistogram(self, options):
    """Sequence length histogram command.

    Plots one sequence length histogram per bin file found in
    options.bin_dir (selected by options.extension) and saves it under
    options.output_dir as '<bin id>.len_hist.<image type>'.
    """
    self.logger.info('[CheckM - len_hist] Creating sequence length histogram.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)
    totalBins = len(binFiles)

    histogram = LengthHistogram(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        binId = binIdFromFilename(binFile)
        self.logger.info('Plotting sequence length histogram for %s (%d of %d)' % (binId, fileNum, totalBins))

        histogram.plot(binFile)

        imagePath = os.path.join(options.output_dir, binId) + '.len_hist.' + options.image_type
        histogram.savePlot(imagePath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + imagePath)

    self.timeKeeper.printTimeStamp()
def sequenceStats(self, outDir, binFile):
    """Calculate statistics for all sequences within a bin.

    outDir is the results directory that holds 'bins/<bin id>/' and
    binFile is the FASTA file of the bin.

    Returns a dictionary mapping each sequence id in binFile to a
    dictionary of statistics. calculateGC() and calculateSeqStats() are
    given this dictionary to populate; this method then adds '# ORFs' and
    'Coding bases':

    * gene file present: counts are accumulated per called gene, with
      coding bases taken as 3 x the length of each gene record (the records
      are presumably amino acid sequences). Sequences without any gene are
      left without these two keys.
    * gene file missing: both values are set to 0 for every sequence.
    """
    # read scaffolds
    seqs = readFasta(binFile)

    seqStats = {}
    for seqId in seqs:
        seqStats[seqId] = {}

    self.calculateGC(seqs, seqStats)
    self.calculateSeqStats(seqs, seqStats)

    binId = binIdFromFilename(binFile)
    aaFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
    if os.path.exists(aaFile):
        aaGenes = readFasta(aaFile)
        for geneId, gene in aaGenes.items():
            # gene ids are '<sequence id>_<suffix>'; strip the last suffix
            seqId = geneId[0:geneId.rfind('_')]
            stats = seqStats[seqId]
            stats['# ORFs'] = stats.get('# ORFs', 0) + 1
            stats['Coding bases'] = stats.get('Coding bases', 0) + len(gene) * 3
    else:
        # missing amino acid file likely indicates users used a pre-called
        # gene file, so just set some defaults. (Previously this branch
        # referenced the unbound name 'gene' and only a single, stale
        # sequence id, which raised a NameError.)
        for stats in seqStats.values():
            stats.setdefault('# ORFs', 0)
            stats.setdefault('Coding bases', 0)

    return seqStats
def distributionPlots(self, options):
    """Reference distribution plot command.

    For each bin file in options.bin_dir, plots the bin using the
    signatures read from options.tetra_profile and the requested
    options.distributions, then saves the figure under options.output_dir
    as '<bin id>.ref_dist_plots.<image type>'.
    """
    self.logger.info('[CheckM - dist_plot] Creating GC, CD, and TD distribution plots.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)
    totalBins = len(binFiles)

    signatureReader = GenomicSignatures(K=4, threads=1)
    tetraSigs = signatureReader.read(options.tetra_profile)

    plotter = DistributionPlots(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        self.logger.info('Plotting reference distribution plots for %s (%d of %d)' % (binFile, fileNum, totalBins))

        binId = binIdFromFilename(binFile)
        plotter.plot(binFile, tetraSigs, options.distributions)

        imagePath = os.path.join(options.output_dir, binId) + '.ref_dist_plots.' + options.image_type
        plotter.savePlot(imagePath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + imagePath)

    self.timeKeeper.printTimeStamp()
def nxPlot(self, options):
    """Nx-plot command.

    Creates one Nx-plot per bin file in options.bin_dir and saves it under
    options.output_dir as '<bin id>.nx_plot.<image type>'.
    """
    self.logger.info('[CheckM - nx_plot] Creating Nx-plots.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)
    totalBins = len(binFiles)

    plotter = NxPlot(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        binId = binIdFromFilename(binFile)
        self.logger.info('Plotting Nx-plot for %s (%d of %d)' % (binId, fileNum, totalBins))

        plotter.plot(binFile)

        imagePath = os.path.join(options.output_dir, binId) + '.nx_plot.' + options.image_type
        plotter.savePlot(imagePath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + imagePath)

    self.timeKeeper.printTimeStamp()
def codingDensityPlot(self, options):
    """Coding density plot command.

    Creates the coding density plots for each bin file in options.bin_dir,
    passing options.distributions to the plotter, and saves each figure
    under options.output_dir as
    '<bin id>.coding_density_plots.<image type>'.
    """
    self.logger.info('[CheckM - coding_plot] Creating coding density histogram and delta-CD plot.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)
    totalBins = len(binFiles)

    plotter = CodingDensityPlots(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        self.logger.info('Plotting coding density plots for %s (%d of %d)' % (binFile, fileNum, totalBins))

        plotter.plot(binFile, options.distributions)

        binId = binIdFromFilename(binFile)
        imagePath = os.path.join(options.output_dir, binId) + '.coding_density_plots.' + options.image_type
        plotter.savePlot(imagePath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + imagePath)

    self.timeKeeper.printTimeStamp()
def coveragePcaPlot(self, options):
    """PCA plot of coverage profiles.

    Builds one coverage profile per sequence listed in
    options.coverage_file (one value per entry of that sequence's
    per-BAM dictionary), computes a PCA over all profiles and then, for
    each bin file in options.bin_dir, saves a plot under
    options.output_dir as '<bin id>.cov_pca_plots.<image type>'.

    Logs an error and exits with status 1 when the profiles cannot form a
    matrix (no profiles, or profiles of differing length) or when they
    have fewer than 2 dimensions.
    """
    self.logger.info('[CheckM - cov_pca] Creating PCA plot of coverage profiles.')

    checkDirExists(options.bin_dir)
    checkFileExists(options.coverage_file)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    coverageParser = Coverage(threads=1)
    coverageStats = coverageParser.parseCoverage(options.coverage_file)

    seqIds = []
    coverageProfiles = []
    for seqDict in coverageStats.values():
        for seqId, bamDict in seqDict.items():
            seqIds.append(seqId)
            coverageProfiles.append(list(bamDict.values()))

    # An empty or ragged list cannot be turned into the 2D matrix whose
    # second dimension is inspected below; report it instead of failing
    # with an IndexError on shape[1].
    profileLengths = set(len(profile) for profile in coverageProfiles)
    if len(profileLengths) != 1:
        self.logger.error('Coverage profiles are missing or differ in length between sequences. PCA requires a complete coverage matrix.')
        sys.exit(1)

    coverageProfiles = np.array(coverageProfiles)
    if coverageProfiles.shape[1] < 2:
        self.logger.error('Coverage profile is 1 dimensional. PCA requires at least 2 dimensions.')
        sys.exit(1)

    self.logger.info('Computing PCA of coverage profiles.\n')
    pca = PCA()
    pc, variance = pca.pcaMatrix(coverageProfiles, fraction=1.0, bCenter=True, bScale=False)

    plots = PcaPlot(options)
    totalBins = len(binFiles)
    for fileNum, binFile in enumerate(binFiles, 1):
        self.logger.info('Plotting PCA of coverage profiles for %s (%d of %d)' % (binFile, fileNum, totalBins))

        plots.plot(binFile, seqIds, pc, variance)

        binId = binIdFromFilename(binFile)
        outputFile = os.path.join(options.output_dir, binId) + '.cov_pca_plots.' + options.image_type
        plots.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def parallelCoordPlot(self, options):
    """Parallel coordinate plot command.

    Parses options.coverage_file, gathers per-sequence statistics for
    every bin file in options.bin_dir (using options.results_dir), and
    saves one plot per bin under options.output_dir as
    '<bin id>.paralel_coord_plot.<image type>'.
    """
    self.logger.info('[CheckM - par_plot] Creating parallel coordinate plot of GC and coverage.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)
    checkFileExists(options.coverage_file)

    binFiles = self.binFiles(options.bin_dir, options.extension)
    totalBins = len(binFiles)

    # read coverage stats file
    coverageParser = Coverage(threads=1)
    coverageStats = coverageParser.parseCoverage(options.coverage_file)

    # calculate sequence stats for all bins
    self.logger.info('Calculating sequence statistics for each bin.')
    statsCalculator = BinStatistics()
    seqStats = {}
    for binFile in binFiles:
        seqStats[binIdFromFilename(binFile)] = statsCalculator.sequenceStats(options.results_dir, binFile)

    # create plot for each bin
    plotter = ParallelCoordPlot(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        binId = binIdFromFilename(binFile)
        self.logger.info('Plotting parallel coordinates for %s (%d of %d)' % (binId, fileNum, totalBins))

        plotter.plot(binId, seqStats, coverageStats)

        imagePath = os.path.join(options.output_dir, binId) + '.paralel_coord_plot.' + options.image_type
        plotter.savePlot(imagePath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + imagePath)

    self.timeKeeper.printTimeStamp()
def __sortBinsByCompleteness(self, binFiles, binStatsExt):
    """Return the bin ids of binFiles in ascending order of completeness.

    Completeness is read from binStatsExt[binId]['Completeness']; bins
    with equal completeness are ordered by bin id.
    """
    completenessAndId = []
    for binFile in binFiles:
        binId = binIdFromFilename(binFile)
        completenessAndId.append((binStatsExt[binId]['Completeness'], binId))

    # tuples compare on completeness first, then on bin id
    return [binId for _, binId in sorted(completenessAndId)]
def __processBin(self, outDir, queueIn, queueOut):
    """Thread safe bin processing.

    Worker loop: takes bin files from queueIn until a None sentinel is
    received. For each bin it makes sure '<outDir>/bins/<bin id>' exists,
    computes GC, sequence length and coding density statistics, and puts
    the tuple (bin id, bin statistics, per-scaffold statistics) on
    queueOut.
    """
    while True:
        binFile = queueIn.get(block=True, timeout=None)
        if binFile is None:
            # sentinel: no more bins to process
            break

        binStats = {}
        scaffoldStats = {}

        binId = binIdFromFilename(binFile)
        binDir = os.path.join(outDir, 'bins', binId)
        makeSurePathExists(binDir)

        # read scaffolds
        scaffolds = readFasta(binFile)
        for seqId in scaffolds:
            scaffoldStats[seqId] = {}

        # calculate GC statistics
        GC, stdGC = self.calculateGC(scaffolds, scaffoldStats)
        binStats['GC'] = GC
        binStats['GC std'] = stdGC

        # calculate statistics related to scaffold lengths
        (maxScaffoldLen, maxContigLen, genomeSize,
         scaffold_N50, contig_N50,
         numContigs, numAmbiguousBases) = self.calculateSeqStats(scaffolds, scaffoldStats)
        binStats['Genome size'] = genomeSize
        binStats['# ambiguous bases'] = numAmbiguousBases
        binStats['# scaffolds'] = len(scaffolds)
        binStats['# contigs'] = numContigs
        binStats['Longest scaffold'] = maxScaffoldLen
        binStats['Longest contig'] = maxContigLen
        binStats['N50 (scaffolds)'] = scaffold_N50
        binStats['N50 (contigs)'] = contig_N50

        # calculate coding density statistics
        codingDensity, translationTable, numORFs = self.calculateCodingDensity(binDir, genomeSize, scaffoldStats)
        binStats['Coding density'] = codingDensity
        binStats['Translation table'] = translationTable
        binStats['# predicted genes'] = numORFs

        queueOut.put((binId, binStats, scaffoldStats))
def __processBin(self, outDir, tableOut, hmmerOut, markerFile, bKeepAlignment, bNucORFs, bCalledGenes, queueIn, queueOut):
    """Thread safe bin processing.

    Worker loop: takes bin files from queueIn until a None sentinel is
    received. For each bin it obtains a gene file (running Prodigal unless
    bCalledGenes is set, in which case the bin file itself is used and
    copied into the bin directory), creates an HMM model file from
    markerFile, runs the HMMER search writing tableOut and hmmerOut inside
    '<outDir>/bins/<bin id>', and puts (bin id, HMM model file) on
    queueOut.
    """
    markerSetParser = MarkerSetParser(self.threadsPerSearch)

    while True:
        binFile = queueIn.get(block=True, timeout=None)
        if binFile is None:
            # sentinel: no more bins to process
            break

        binId = binIdFromFilename(binFile)
        binDir = os.path.join(outDir, 'bins', binId)
        makeSurePathExists(binDir)

        # run Prodigal
        if not bCalledGenes:
            prodigal = ProdigalRunner(binDir)
            if not prodigal.areORFsCalled(bNucORFs):
                prodigal.run(binFile, bNucORFs)
            aaGeneFile = prodigal.aaGeneFile
        else:
            # genes were called by the user: the bin file is the gene file
            aaGeneFile = binFile
            shutil.copyfile(aaGeneFile, os.path.join(binDir, DefaultValues.PRODIGAL_AA))

        # extract HMMs into temporary file
        hmmModelFile = markerSetParser.createHmmModelFile(binId, markerFile)

        # run HMMER
        hmmer = HMMERRunner()
        tableOutPath = os.path.join(binDir, tableOut)
        hmmerOutPath = os.path.join(binDir, hmmerOut)

        keepAlignStr = '' if bKeepAlignment else '--noali'
        hmmer.search(hmmModelFile, aaGeneFile, tableOutPath, hmmerOutPath,
                     '--cpu ' + str(self.threadsPerSearch) + ' --notextw -E 0.1 --domE 0.1 ' + keepAlignStr,
                     bKeepAlignment)

        queueOut.put((binId, hmmModelFile))
def gcBiasPlot(self, options):
    """GC bias plot command.

    Computes a coverage profile for the bin files in options.bin_dir from
    options.bam_file, then saves one plot per bin under options.output_dir
    as '<bin id>.gc_bias_plot.<image type>'.
    """
    self.logger.info('[CheckM - gc_bias_plot] Plotting bin coverage as a function of GC.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)
    totalBins = len(binFiles)

    windowCoverage = CoverageWindows(options.threads)
    coverageProfile = windowCoverage.run(binFiles,
                                         options.bam_file,
                                         options.all_reads,
                                         options.min_align,
                                         options.max_edit_dist,
                                         options.window_size)

    plotter = GcBiasPlot(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        self.logger.info('Plotting GC plots for %s (%d of %d)' % (binFile, fileNum, totalBins))

        plotter.plot(binFile, coverageProfile)

        binId = binIdFromFilename(binFile)
        imagePath = os.path.join(options.output_dir, binId) + '.gc_bias_plot.' + options.image_type
        plotter.savePlot(imagePath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + imagePath)

    self.timeKeeper.printTimeStamp()
def tetraPcaPlot(self, options):
    """PCA plot of tetranucleotide signatures.

    Computes a PCA from the profile in options.tetra_profile and, for each
    bin file in options.bin_dir, saves a plot under options.output_dir as
    '<bin id>.tetra_pca_plots.<image type>'.
    """
    self.logger.info('[CheckM - tetra_pca] Creating PCA plot of tetranucleotide signatures.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)
    totalBins = len(binFiles)

    self.logger.info('Computing PCA of tetranuclotide signatures.\n')
    pcaCalculator = PCA()
    seqIds, pc, variance = pcaCalculator.pcaFile(options.tetra_profile, fraction=1.0, bCenter=True, bScale=False)

    plotter = PcaPlot(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        self.logger.info('Plotting PCA of tetranuclotide signatures for %s (%d of %d)' % (binFile, fileNum, totalBins))

        plotter.plot(binFile, seqIds, pc, variance)

        binId = binIdFromFilename(binFile)
        imagePath = os.path.join(options.output_dir, binId) + '.tetra_pca_plots.' + options.image_type
        plotter.savePlot(imagePath, dpi=options.dpi)
        self.logger.info('Plot written to: ' + imagePath)

    self.timeKeeper.printTimeStamp()
def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
    """Draw the coding density histogram and the delta-CD plot for a bin.

    fastaFile is the FASTA file of the bin; its gene feature file is
    looked up under '<out_folder>/bins/<bin id>/'. distributionsToPlot
    lists the reference distribution widths (as percentages) to overlay
    on the delta-CD plot. axesHist receives the histogram of windowed
    coding density and axesDeltaCD the delta-CD versus sequence length
    scatter plot.

    Exits with status 1 when the gene feature file is missing. When no
    window could be evaluated, only an error label is set on axesHist and
    the method returns without drawing anything else.
    """
    def prettifyAxes(axes):
        # Enable tick1 / disable tick2 marks on both axes, hide the right
        # and top spines and draw everything else in the axes colour.
        for tick in axes.yaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for tick in axes.xaxis.majorTicks:
            tick.tick1On = True
            tick.tick2On = False

        for line in axes.yaxis.get_ticklines():
            line.set_color(self.axesColour)

        for line in axes.xaxis.get_ticklines():
            line.set_color(self.axesColour)

        for loc, spine in axes.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(self.axesColour)

    # parse Prodigal output
    gffFile = os.path.join(self.options.out_folder, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
    if not os.path.exists(gffFile):
        print('Missing gene feature file (%s). This plot is not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
        # this is an error path, so report failure to the caller/shell
        sys.exit(1)

    prodigalParser = ProdigalGeneFeatureParser(gffFile)

    # Read reference distributions from file
    dist = readDistribution('cd_dist')

    # get coding density for windows
    seqs = readFasta(fastaFile)

    windowSize = self.options.cd_window_size
    data = []
    seqLens = []
    for seqId, seq in seqs.items():
        start = 0
        end = windowSize

        seqLen = len(seq)
        seqLens.append(seqLen)

        while end < seqLen:
            codingBases = prodigalParser.codingBases(seqId, start, end)

            a, c, g, t = baseCount(seq[start:end])
            numBases = a + c + g + t
            if numBases > 0:
                data.append(float(codingBases) / numBases)
            # else: the window holds no A/C/G/T at all (e.g. a long run of
            # ambiguous bases); skip it rather than divide by zero

            start = end
            end += windowSize

    if not data:
        axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size)
        return

    # Histogram plot
    bins = [0.0]
    binWidth = self.options.cd_bin_width
    binEnd = binWidth
    while binEnd <= 1.0:
        bins.append(binEnd)
        binEnd += binWidth

    axesHist.hist(data, bins=bins, normed=True, color=(0.5, 0.5, 0.5))
    axesHist.set_xlabel('% coding density')
    axesHist.set_ylabel('% windows (' + str(self.options.cd_window_size) + ' bp)')

    # Prettify plot
    prettifyAxes(axesHist)

    # get CD bin statistics
    binTools = BinTools()
    meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

    # Delta-CD vs sequence length plot
    axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap=pylab.cm.Greys)
    axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
    axesDeltaCD.set_ylabel('Sequence length (kbp)')

    # remember the limits chosen for the sequences before overlaying the
    # reference distributions
    _, yMaxSeqs = axesDeltaCD.get_ylim()
    xMinSeqs, xMaxSeqs = axesDeltaCD.get_xlim()

    # plot reference distributions
    if distributionsToPlot:
        # the reference distribution closest to the bin's mean coding
        # density is the same for every requested width
        closestCD = findNearest(np.array(list(dist.keys())), meanCD)
        refDist = dist[closestCD]

        # find closest distribution values
        sampleSeqLen = list(refDist.keys())[0]
        percentiles = list(refDist[sampleSeqLen].keys())

        for distToPlot in distributionsToPlot:
            cdLowerBoundKey = findNearest(percentiles, (100 - distToPlot) / 2.0)
            cdUpperBoundKey = findNearest(percentiles, (100 + distToPlot) / 2.0)

            xL = []
            xU = []
            y = []
            for refWindowSize in refDist:
                xL.append(refDist[refWindowSize][cdLowerBoundKey])
                xU.append(refDist[refWindowSize][cdUpperBoundKey])
                y.append(refWindowSize)

            # sort by y-values
            sortIndexY = np.argsort(y)
            xL = np.array(xL)[sortIndexY]
            xU = np.array(xU)[sortIndexY]
            y = np.array(y)[sortIndexY]

            axesDeltaCD.plot(xL, y, 'r--', lw=0.5, zorder=0)
            axesDeltaCD.plot(xU, y, 'r--', lw=0.5, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axesDeltaCD.set_ylim([0, yMaxSeqs])

    # ensure x-axis is set appropriately for sequences
    axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

    # draw vertical line at x=0
    axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)

    # Change sequence lengths from bp to kbp
    kbpLabels = []
    for seqLen in axesDeltaCD.get_yticks():
        label = '%.1f' % (float(seqLen) / 1000)
        label = label.replace('.0', '')  # remove trailing zero
        kbpLabels.append(label)
    axesDeltaCD.set_yticklabels(kbpLabels)

    # Prettify plot
    prettifyAxes(axesDeltaCD)
def run(self, contigFile, binFiles, outputDir, evalueThreshold, concatenateThreshold):
    """Identify putative SSU rRNA genes on sequences and report them per bin.

    Runs the HMM search on contigFile, reads the hits for the 'archaea',
    'bacteria' and 'euk' models, keeps the best domain hit per sequence
    and writes two files into outputDir (created if missing):

    * 'ssu_summary.tsv': one tab-separated row per hit, prefixed with the
      id of the bin containing the sequence (DefaultValues.UNBINNED when
      the sequence is in none of binFiles).
    * 'ssu.fna': the subsequence of each hit in FASTA format.

    evalueThreshold is passed to the search and to the hit reader;
    concatenateThreshold is passed on when hits are added.
    """
    # make sure output directory exists
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    # get bin id of binned contigs
    self.logger.info(' Determining bin assignment of sequences.')
    seqIdToBinId = {}
    for f in binFiles:
        binId = binIdFromFilename(f)
        for seqId in readFastaSeqIds(f):
            seqIdToBinId[seqId] = binId

    # identify 16S reads from contigs/scaffolds
    self.logger.info(' Identifying SSU rRNAs on sequences.')
    self.__hmmSearch(contigFile, evalueThreshold, os.path.join(outputDir, 'ssu'))

    # read HMM hits
    hitsPerDomain = {}
    for domain in ['archaea', 'bacteria', 'euk']:
        hits = {}

        seqInfo = self.__readHits(os.path.join(outputDir, 'ssu' + '.' + domain + '.txt'), domain, evalueThreshold)
        if seqInfo:
            for seqId, seqHits in seqInfo.items():
                for hit in seqHits:
                    self.__addHit(hits, seqId, hit, concatenateThreshold)

        hitsPerDomain[domain] = hits

    # find best domain hit for each sequence
    bestHits = {}
    for hits in hitsPerDomain.values():
        for seqId, info in hits.items():
            if '-#' in seqId:
                seqId = seqId[0:seqId.rfind('-#')]

            self.__addDomainHit(bestHits, seqId, info)

    # group hits by the bin their sequence belongs to
    hitsToBins = {}
    for origSeqId in bestHits:
        seqId = origSeqId
        if '-#' in seqId:
            seqId = seqId[0:seqId.rfind('-#')]

        binId = seqIdToBinId.get(seqId, DefaultValues.UNBINNED)
        hitsToBins.setdefault(binId, []).append([origSeqId] + bestHits[origSeqId])

    # write summary file and putative SSU rRNAs to file
    summaryFile = os.path.join(outputDir, 'ssu_summary.tsv')
    seqFile = os.path.join(outputDir, 'ssu.fna')

    # context managers close both files even if a lookup or write fails
    with open(summaryFile, 'w') as summaryOut, open(seqFile, 'w') as seqOut:
        summaryOut.write('Bin Id\tSeq. Id\tHMM\ti-Evalue\tStart hit\tEnd hit\t16S/18S gene length\tRev. Complement\tSequence length\n')

        seqs = readFasta(contigFile)

        for binId in sorted(hitsToBins.keys()):
            for seqInfo in hitsToBins[binId]:
                seqId = seqInfo[0]
                if '-#' in seqId:
                    seqId = seqId[0:seqId.rfind('-#')]

                seq = seqs[seqId]
                summaryOut.write(binId + '\t' + '\t'.join(seqInfo) + '\t' + str(len(seq)) + '\n')
                seqOut.write('>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqInfo[0] + '\n')
                # seqInfo[3] / seqInfo[4] are the 'Start hit' / 'End hit' columns
                seqOut.write(seq[int(seqInfo[3]):int(seqInfo[4])] + '\n')

    self.logger.info('')
    self.logger.info(' Identified ' + str(len(bestHits)) + ' putative SSU genes:')
    self.logger.info(' Summary of identified hits written to: ' + summaryFile)
    self.logger.info(' SSU sequences written to: ' + seqFile)
def run(self, binFiles, bamFiles, outFile, bAllReads, minAlignPer, maxEditDistPer, minQC):
    """Calculate coverage of sequences for each BAM file.

    Determines the bin and length of every sequence in binFiles, processes
    each BAM file in bamFiles, and writes a tab-separated table to outFile
    with one row per sequence: sequence id, bin id
    (DefaultValues.UNBINNED when the sequence is in no bin), sequence
    length, and for each BAM file its id, the coverage and the number of
    mapped reads (0 and 0 when the BAM file has no entry for the
    sequence).

    Logs an error and exits with status 1 if any BAM file lacks a
    '<bam file>.bai' index next to it. bAllReads, minAlignPer,
    maxEditDistPer and minQC are passed through to the BAM processing.
    """
    # determine bin assignment of each sequence
    self.logger.info(' Determining bin assignment of each sequence.')

    seqIdToBinId = {}
    seqIdToSeqLen = {}
    for binFile in binFiles:
        binId = binIdFromFilename(binFile)

        seqs = readFasta(binFile)
        for seqId, seq in seqs.items():
            seqIdToBinId[seqId] = binId
            seqIdToSeqLen[seqId] = len(seq)

    # process each fasta file
    self.logger.info(" Processing %d file(s) with %d threads.\n" % (len(bamFiles), self.totalThreads))

    # make sure all BAM files are indexed (an index implies a sorted file)
    self.numFiles = len(bamFiles)
    for bamFile in bamFiles:
        if not os.path.exists(bamFile + '.bai'):
            self.logger.error(' [Error] BAM file is either unsorted or not indexed: ' + bamFile + '\n')
            sys.exit(1)

    # calculate coverage of each BAM file
    coverageInfo = {}
    for numFilesStarted, bamFile in enumerate(bamFiles, 1):
        self.logger.info(' Processing %s (%d of %d):' % (ntpath.basename(bamFile), numFilesStarted, len(bamFiles)))

        coverageInfo[bamFile] = mp.Manager().dict()
        coverageInfo[bamFile] = self.__processBam(bamFile, bAllReads, minAlignPer, maxEditDistPer, minQC, coverageInfo[bamFile])

    # redirect output
    self.logger.info(' Writing coverage information to file.')
    oldStdOut = reassignStdOut(outFile)
    try:
        header = 'Sequence Id\tBin Id\tSequence length (bp)'
        header += '\tBam Id\tCoverage\tMapped reads' * len(bamFiles)
        print(header)

        # get length of all seqs
        for seqIds in coverageInfo.values():
            for seqId in seqIds.keys():
                seqIdToSeqLen[seqId] = seqIds[seqId].seqLen

        # write coverage stats for all scaffolds to file
        for seqId, seqLen in seqIdToSeqLen.items():
            rowStr = seqId + '\t' + seqIdToBinId.get(seqId, DefaultValues.UNBINNED) + '\t' + str(seqLen)
            for bamFile in bamFiles:
                bamId = binIdFromFilename(bamFile)

                if seqId in coverageInfo[bamFile]:
                    seqCoverage = coverageInfo[bamFile][seqId]
                    rowStr += '\t%s\t%f\t%d' % (bamId, seqCoverage.coverage, seqCoverage.mappedReads)
                else:
                    rowStr += '\t%s\t%f\t%d' % (bamId, 0, 0)

            print(rowStr)
    finally:
        # restore stdout even if writing the table failed, so later
        # output is not silently sent to outFile
        restoreStdOut(outFile, oldStdOut)
def plot(self, binFile, markerGeneStats, binStats):
    """Plot the position of marker genes along the sequences of a bin.

    Each sequence of binFile that carries at least one marker gene gets a
    row, longest sequence at the bottom, split into position bins that are
    coloured by how many marker genes start in them (the last colour
    covers 4 or more). The title reports the bin size, number of
    sequences, and binStats['Completeness'] / binStats['Contamination'].

    Returns False, without drawing, when no sequence has a marker gene;
    otherwise draws the figure and returns True.
    """
    binId = binIdFromFilename(binFile)

    markerGenesPerSeq, _markerGeneNum = self.getMarkerGenesPerSeq(markerGeneStats)
    if not markerGenesPerSeq:
        return False

    opts = self.options

    # Get length of sequences with one or more marker genes
    seqs = readFasta(binFile)
    binSize = 0
    longestSeq = 0
    seqLens = {}
    for seqId, seq in seqs.items():
        seqLen = len(seq)
        binSize += seqLen

        if seqId in markerGenesPerSeq:
            seqLens[seqId] = seqLen
            longestSeq = max(longestSeq, seqLen)

    sortedSeqLens = sorted(seqLens.items(), key=operator.itemgetter(1), reverse=True)
    numRows = len(sortedSeqLens)

    MAX_BINS = 100
    plotBinSize = self.roundUpToNearest100(float(longestSeq) / MAX_BINS)
    yLabels = [seqId for seqId, _ in sortedSeqLens]

    # get position of genes in bin
    geneFile = os.path.join(opts.out_folder, 'bins', binId, DefaultValues.PRODIGAL_AA)
    genePos = ProdigalFastaParser().genePositions(geneFile)

    # Set size of figure
    self.fig.clear()
    self.fig.set_size_inches(opts.width, opts.height)

    yLabelBounds = self.yLabelExtents(yLabels, opts.font_size)

    heightBottomLabels = 0.4 + opts.fig_padding  # inches
    widthSideLabel = yLabelBounds.width * opts.width + opts.fig_padding  # inches
    widthPerBin = (opts.width - widthSideLabel - opts.fig_padding) / MAX_BINS

    titleHeight = 0.2
    HEIGHT_PER_ROW = 0.2
    height = HEIGHT_PER_ROW * numRows + heightBottomLabels + opts.fig_padding + titleHeight
    rowBinHeight = widthPerBin / HEIGHT_PER_ROW

    self.fig.set_size_inches(opts.width, height)
    axes = self.fig.add_axes([widthSideLabel / opts.width,
                              heightBottomLabels / height,
                              1.0 - (widthSideLabel + opts.fig_padding) / opts.width,
                              1.0 - (heightBottomLabels + opts.fig_padding + titleHeight) / height])

    # set plot axis
    axes.set_xlim([0, MAX_BINS + 0.1])
    axes.set_xlabel('Position (' + str(plotBinSize) + ' bp/bin)')

    axes.set_ylim([0, numRows])
    axes.set_yticks(np.arange(0.5, numRows + 0.5, 1.0))
    axes.set_yticklabels(yLabels)

    # legend
    colours = [(1.0, 1.0, 1.0),
               (127 / 255.0, 201 / 255.0, 127 / 255.0),
               (255 / 255.0, 192 / 255.0, 134 / 255.0),
               (190 / 255.0, 174 / 255.0, 212 / 255.0),
               (0.0, 0.0, 0.0)]
    discreteColourMap = mpl.colors.ListedColormap(colours)
    axisColourMap = self.fig.add_axes([opts.fig_padding / opts.width,
                                       opts.fig_padding / height,
                                       0.15,
                                       0.03 * (opts.width / height)])
    colourBar = mpl.colorbar.ColorbarBase(axisColourMap,
                                          cmap=discreteColourMap,
                                          norm=mpl.colors.Normalize(vmin=0, vmax=1),
                                          orientation='horizontal',
                                          drawedges=True)
    colourBar.set_ticks([0.1, 0.3, 0.5, 0.7, 0.9])
    colourBar.set_ticklabels(['0', '1', '2', '3', '4+'])
    colourBar.outline.set_linewidth(0.5)
    colourBar.dividers.set_linewidth(0.5)

    for tick in axisColourMap.xaxis.majorTicks:
        tick.tick1On = False
        tick.tick2On = False

    # plot each bin: one row per sequence, centred on 0.5, 1.5, 2.5, ...
    lastColour = len(colours) - 1
    for rowIndex, (seqId, seqLen) in enumerate(sortedSeqLens):
        rowCentre = rowIndex + 0.5

        markerCount = [0] * int(math.ceil(float(seqLen) / plotBinSize))
        for geneId, _markerGeneId, geneStartPos, _geneEndPos in markerGenesPerSeq[seqId]:
            binPos = int(float(genePos[geneId][0] + geneStartPos) / plotBinSize)
            markerCount[binPos] += 1

        for i, count in enumerate(markerCount):
            # counts beyond the palette share its final colour
            axes.add_patch(Rectangle((i + 0.1, rowCentre - 0.4 * rowBinHeight),
                                     0.8,
                                     0.8 * rowBinHeight,
                                     facecolor=colours[min(count, lastColour)],
                                     lw=0.2))

    # set plot title
    titleStr = binId + '\n'
    titleStr += '(%.2f Mbp, %d seqs, %.2f%% complete, %.2f%% contamination)' % (float(binSize) / 1e6, len(seqs), binStats['Completeness'], binStats['Contamination'])
    axes.set_title(titleStr)

    # Prettify plot
    for tick in axes.yaxis.majorTicks:
        tick.tick1On = False
        tick.tick2On = False

    for tick in axes.xaxis.majorTicks:
        tick.tick1On = True
        tick.tick2On = False

    for line in axes.yaxis.get_ticklines():
        line.set_color(self.axesColour)

    for line in axes.xaxis.get_ticklines():
        line.set_color(self.axesColour)
        line.set_ms(2)

    for loc, spine in axes.spines.items():
        if loc in ['left', 'right', 'top']:
            spine.set_color('none')
        else:
            spine.set_color(self.axesColour)

    self.draw()

    return True