def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
    """Compare completeness estimates from marker genes and from marker sets.

    Builds the marker genes and co-located marker sets of a lineage, then
    repeatedly samples partial genomes and records how far each of the two
    estimates is from the requested completeness. The errors are drawn as
    a box plot written under ./images/.

    Args:
        taxonomyStr: Semicolon-separated lineage; also used in the plot
            file name and title.
        ubiquityThreshold: Scaled by the number of genomes and passed to
            img.markerGenes().
        singleCopyThreshold: Scaled by the number of genomes and passed to
            img.markerGenes().
        percentCompletion: Target completeness passed to img.sampleGenome()
            and subtracted from every estimate. The estimates are ratios,
            so this is presumably a fraction in [0, 1] -- TODO confirm.
        numReplicates: Number of partial genomes sampled per genome.
        numGenomes: Number of genomes to pick at random, or -1 to use every
            genome of the lineage.
        contigLen: Passed to img.sampleGenome() and
            img.containedMarkerGenes().
    """
    img = IMG()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
    print('\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

    # build marker genes and colocated marker sets
    countTable = img.countTable(genomeIds)
    markerGenes = img.markerGenes(genomeIds, countTable,
                                  ubiquityThreshold * len(genomeIds),
                                  singleCopyThreshold * len(genomeIds))
    print(' Marker genes: ' + str(len(markerGenes)))

    geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
    colocatedGenes = img.colocatedGenes(geneDistTable)
    colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
    print(' Co-located gene sets: ' + str(len(colocatedSets)))

    # random sample genomes
    if numGenomes == -1:
        rndGenomeIds = genomeIds
    else:
        # random.sample() only accepts sequences on Python 3.11+; the ids
        # may be held in a set (TODO confirm), so copy them into a list
        rndGenomeIds = random.sample(list(genomeIds), numGenomes)

    # estimate completion for each genome using both the marker genes and marker sets
    metadata = img.genomeMetadata('Final')
    plotLabels = []
    plotData = []
    for genomeId in rndGenomeIds:
        mgCompletion = []
        msCompletion = []
        for _ in range(numReplicates):
            startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen)

            # calculate completion with marker genes
            containedMarkerGenes = img.containedMarkerGenes(markerGenes, geneDistTable[genomeId], startPartialGenomeContigs, contigLen)
            mgCompletion.append(float(len(containedMarkerGenes)) / len(markerGenes) - percentCompletion)

            # calculate completion with marker set: every co-located set
            # contributes the fraction of its members that were recovered
            comp = 0.0
            for cs in colocatedSets:
                present = sum(1 for geneId in cs if geneId in containedMarkerGenes)
                comp += float(present) / len(cs)
            msCompletion.append(comp / len(colocatedSets) - percentCompletion)

        plotData.append(mgCompletion)
        plotData.append(msCompletion)

        species = ' '.join(metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])

        # two boxes per genome (marker genes, marker sets); only the first
        # one carries the label
        plotLabels.append(species + ' (' + genomeId + ')')
        plotLabels.append('')

    # plot data
    boxPlot = BoxPlot()
    plotFilename = './images/sim.MGvsMS.' + taxonomyStr.replace(';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
    title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
    boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers):
    """Leave-one-out test of how stable the marker set size of a lineage is.

    For 'prokaryotes' and every lineage returned by
    img.lineagesByCriteria(), the marker set is identified once from all
    trusted genomes and once with each genome left out in turn. The
    differences in marker set size are printed and drawn as a box plot
    written under ./images/.

    Args:
        ubiquityThreshold: Scaled by the number of genomes and passed to
            img.markerGenes().
        singleCopyThreshold: Scaled by the number of genomes and passed to
            img.markerGenes().
        minGenomes: Passed to img.lineagesByCriteria().
        mostSpecificRank: Passed to img.lineagesByCriteria().
        minMarkers: Lineages whose full marker set is smaller than this are
            skipped.
    """
    settings = (('Ubiquity threshold: ', ubiquityThreshold),
                ('Single-copy threshold: ', singleCopyThreshold),
                ('Min. genomes: ', minGenomes),
                ('Most specific taxonomic rank: ', mostSpecificRank))
    for caption, value in settings:
        print(caption + str(value))

    img = IMG()

    lineages = ['prokaryotes'] + img.lineagesByCriteria(minGenomes, mostSpecificRank)

    sizeDeltasPerLineage = []
    labels = []
    for lineage in lineages:
        candidateIds = img.genomeIdsByTaxonomy(lineage)
        trusted = img.trustedGenomes()
        genomeIds = list(candidateIds.intersection(trusted))
        genomeCount = len(genomeIds)

        print('')
        print('Lineage ' + lineage + ' contains ' + str(genomeCount) + ' genomes.')

        # get table of PFAMs and do some initial filtering to remove PFAMs that are
        # clearly not going to pass the ubiquity and single-copy thresholds
        pfamTable = img.filterPfamTable(genomeIds, img.pfamTable(genomeIds),
                                        ubiquityThreshold * 0.9,
                                        singleCopyThreshold * 0.9)

        # the full set is judged against the leave-one-out genome count so
        # that it is comparable with the subsets evaluated below
        looGenomes = genomeCount - 1
        fullMarkerSetSize = len(img.markerGenes(genomeIds, pfamTable,
                                                ubiquityThreshold * looGenomes,
                                                singleCopyThreshold * looGenomes))
        if fullMarkerSetSize < minMarkers:
            continue

        labels.append(lineage.split(';')[-1].strip() + ' (' + str(genomeCount) + ', ' + str(fullMarkerSetSize) + ')')

        deltas = []
        for leftOut in range(genomeCount):
            # the trailing slice is simply empty when the last genome is left out
            subset = genomeIds[:leftOut] + genomeIds[leftOut + 1:]
            looSize = len(img.markerGenes(subset, pfamTable,
                                          ubiquityThreshold * len(subset),
                                          singleCopyThreshold * len(subset)))
            deltas.append(fullMarkerSetSize - looSize)

            if looSize > fullMarkerSetSize:
                print('[Warning] Unexpected!')

        sizeDeltasPerLineage.append(deltas)

        avgDelta = mean(deltas)
        sdDelta = std(deltas)

        print(' LOO Ubiquity >= ' + str(int(ubiquityThreshold * looGenomes)) + ', LOO Single-copy >= ' + str(int(singleCopyThreshold * looGenomes)))
        print(' Delta Mean: %.2f +/- %.2f' % (avgDelta, sdDelta))
        print(' Delta Min: %d, Delta Max: %d' % (min(deltas), max(deltas)))

    # plot data
    plotFilename = './images/LOO.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold) + '.boxplot.png'
    title = 'Ubiquity = %.2f' % ubiquityThreshold
    title += ', Single-copy = %.2f' % singleCopyThreshold

    boxPlot = BoxPlot()
    boxPlot.plot(plotFilename, sizeDeltasPerLineage, labels, r'$\Delta$' + ' Marker Set Size', '', False, title)
def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
    """Compare marker-set completeness estimates across nested lineages.

    Every prefix of taxonomyStr (split on ';') is treated as a lineage and
    gets its own co-located marker sets. Genomes of the last, most specific
    lineage are then repeatedly sampled and the error of the estimate
    obtained with each lineage's marker sets is drawn as a box plot written
    under ./images/.

    Args:
        taxonomyStr: Semicolon-separated lineage; also used in the plot
            file name and title.
        mostSpecificRank: Unused here; kept for interface compatibility.
        minGenomes: Unused here; kept for interface compatibility.
        ubiquityThreshold: Scaled by the number of genomes and passed to
            img.markerGenes().
        singleCopyThreshold: Scaled by the number of genomes and passed to
            img.markerGenes().
        percentCompletion: Target completeness passed to img.sampleGenome()
            and subtracted from every estimate. The estimates are ratios,
            so this is presumably a fraction in [0, 1] -- TODO confirm.
        numReplicates: Number of partial genomes sampled per genome.
        numGenomes: Number of genomes to pick at random, or -1 to use every
            genome of the most specific lineage.
        contigLen: Passed to img.sampleGenome() and
            img.containedMarkerGenes().
    """
    img = IMG()

    # every prefix of the taxonomy string is a lineage of its own
    taxon = taxonomyStr.split(';')
    lineages = [';'.join(taxon[0:r + 1]) for r in range(len(taxon))]

    # get all marker sets
    markerGenes = []
    geneDistTable = []
    colocatedSets = []
    for lineage in lineages:
        genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
        print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        mg = img.markerGenes(genomeIds, countTable,
                             ubiquityThreshold * len(genomeIds),
                             singleCopyThreshold * len(genomeIds))
        print(' Marker genes: ' + str(len(mg)))

        mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
        colocatedGenes = img.colocatedGenes(mdt)
        cs = img.colocatedSets(colocatedGenes, mg)
        print(' Co-located gene sets: ' + str(len(cs)))

        markerGenes.append(mg)
        geneDistTable.append(mdt)
        colocatedSets.append(cs)

    # random sample genomes; genomeIds still refers to the last (most
    # specific) lineage processed above
    if numGenomes == -1:
        rndGenomeIds = genomeIds
    else:
        # random.sample() only accepts sequences on Python 3.11+; the ids
        # may be held in a set (TODO confirm), so copy them into a list
        rndGenomeIds = random.sample(list(genomeIds), numGenomes)

    # estimate completion for each genome using both the marker genes and marker sets
    metadata = img.genomeMetadata('Final')
    plotLabels = []
    plotData = []
    for genomeId in rndGenomeIds:
        completion = [[] for _ in lineages]
        for _ in range(numReplicates):
            startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen)

            # calculate completion with marker set
            for i in range(len(lineages)):
                containedMarkerGenes = img.containedMarkerGenes(markerGenes[i], geneDistTable[i][genomeId], startPartialGenomeContigs, contigLen)

                comp = 0.0
                for cs in colocatedSets[i]:
                    present = sum(1 for geneId in cs if geneId in containedMarkerGenes)
                    comp += float(present) / len(cs)

                completion[i].append(comp / len(colocatedSets[i]) - percentCompletion)

        # emit exactly one label per data row so that the labels stay
        # aligned with the boxes regardless of the number of replicates
        for lineage, d in zip(lineages, completion):
            plotData.append(d)
            plotLabels.append(genomeId + ' - ' + lineage)

    # plot data
    boxPlot = BoxPlot()
    plotFilename = './images/sim.lineages.' + taxonomyStr.replace(';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
    title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
    boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)
def taxonomicPlots(self, results):
    """Tabulate and plot completeness/contamination errors per taxonomic group.

    Only simulations with a sequence length of 20000 and one of the
    hard-coded completeness/contamination levels are used. Their values are
    pooled for each taxon of the first three taxonomic ranks, printed,
    written to self.simCompareTaxonomyTableOut and drawn as a box plot
    named self.plotPrefix + '.taxonomy.png'.

    Args:
        results: Mapping keyed by simulation ids of the form
            'genomeId-seqLen-comp-cont'. Each value is indexed by position:
            2/3 hold the 'best', 6/7 the 'domain' and 10/11 the 'selected'
            completeness/contamination values, each an iterable of numbers.
    """
    # summarize results for different taxonomic groups
    print(' Tabulating results for taxonomic groups.')

    metadata = self.img.genomeMetadata()

    compDataDict = defaultdict(lambda: defaultdict(list))
    contDataDict = defaultdict(lambda: defaultdict(list))
    comps = set()
    conts = set()

    ranksToProcess = 3
    taxaByRank = [set() for _ in range(ranksToProcess)]

    overallComp = []
    overallCont = []

    genomeInTaxon = defaultdict(set)
    testCases = 0
    numResults = len(results)
    for itemsProcessed, simId in enumerate(results, 1):
        statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numResults, float(itemsProcessed) * 100 / numResults)
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        genomeId, seqLen, comp, cont = simId.split('-')

        if seqLen != '20000':
            continue

        # NOTE(review): all tabulation below is assumed to apply only to
        # simulations passing this filter -- confirm against the original
        # layout. The values are normalised through float() so that e.g.
        # '0.10' and '0.1' are treated alike.
        if str(float(comp)) not in ['0.5', '0.7', '0.8', '0.9']:
            continue
        if str(float(cont)) not in ['0.05', '0.10', '0.1', '0.15']:
            continue

        print(comp, cont)

        taxonomy = metadata[genomeId]['taxonomy']

        testCases += 1

        comps.add(float(comp))
        conts.add(float(cont))

        simResults = results[simId]
        overallComp += simResults[10]
        overallCont += simResults[11]

        for r in range(ranksToProcess):
            taxon = taxonomy[r]

            if r == 0 and taxon == 'unclassified':
                print('*****************************Unclassified at domain-level*****************')
                continue

            if taxon == 'unclassified':
                continue

            taxon = rankPrefixes[r] + taxon

            taxaByRank[r].add(taxon)

            compDataDict[taxon]['best'] += simResults[2]
            compDataDict[taxon]['domain'] += simResults[6]
            compDataDict[taxon]['selected'] += simResults[10]

            contDataDict[taxon]['best'] += simResults[3]
            contDataDict[taxon]['domain'] += simResults[7]
            contDataDict[taxon]['selected'] += simResults[11]

            genomeInTaxon[taxon].add(genomeId)

    sys.stdout.write('\n')

    print('Test cases', testCases)

    print('')
    print('Creating plots for:')
    print(' comps = ', comps)
    print(' conts = ', conts)

    print('')
    print(' There are %d taxa.' % (len(compDataDict)))

    print('')
    print(' Overall bias:')
    print(' Selected comp: %.2f' % mean(overallComp))
    print(' Selected cont: %.2f' % mean(overallCont))

    # get list of ordered taxa by rank
    orderedTaxa = []
    for taxa in taxaByRank:
        orderedTaxa += sorted(taxa)

    # plot data
    print(' Plotting results.')
    compData = []
    contData = []
    rowLabels = []
    for taxon in orderedTaxa:
        numGenomes = len(genomeInTaxon[taxon])
        if numGenomes < 10:  # skip groups with only a few genomes
            continue

        for msStr in ['best', 'selected', 'domain']:
            rowLabels.append(msStr + ': ' + taxon + ' (' + str(numGenomes) + ')')
            compData.append(compDataDict[taxon][msStr])
            contData.append(contDataDict[taxon][msStr])

    for i, rowLabel in enumerate(rowLabels):
        print(rowLabel + '\t%.2f\t%.2f' % (mean(abs(array(compData[i]))), mean(abs(array(contData[i])))))

    # print taxonomic table of results organized by class; the context
    # manager closes the file even if a write fails
    with open(self.simCompareTaxonomyTableOut, 'w') as taxonomyTableOut:
        for taxon in orderedTaxa:
            numGenomes = len(genomeInTaxon[taxon])
            if numGenomes < 2:  # skip groups with only a few genomes
                continue

            taxonomyTableOut.write(taxon + '\t' + str(numGenomes))
            for msStr in ['domain', 'selected']:
                absComp = abs(array(compDataDict[taxon][msStr]))
                absCont = abs(array(contDataDict[taxon][msStr]))
                taxonomyTableOut.write('\t%.1f +/- %.2f\t%.1f +/- %.2f' % (mean(absComp), std(absComp), mean(absCont), std(absCont)))

            taxonomyTableOut.write('\n')

    # create box plot
    boxPlot = BoxPlot()
    plotFilename = self.plotPrefix + '.taxonomy.png'
    boxPlot.plot(plotFilename, compData, contData, rowLabels,
                 r'$\Delta$' + ' % Completion', None,
                 r'$\Delta$' + ' % Contamination', None,
                 rowsPerCategory=3, dpi=self.dpi)
def refinementPlots(self, results):
    """Tabulate and plot the effect of the different CheckM refinements.

    Simulations with completeness >= 0.7 and contamination <= 0.1 are
    pooled overall and for each taxon of the first three taxonomic ranks.
    The percentage changes between refinements are printed, a table is
    written to self.simCompareRefinementTableOut and a box plot is written
    to self.plotPrefix + '.refinements.png'.

    Args:
        results: Mapping keyed by simulation ids of the form
            'genomeId-seqLen-comp-cont'. Each value is indexed by position:
            8/9 hold the 'IM', 10/11 the 'MS' and 12/13 the 'RMS'
            completeness/contamination values, each an iterable of numbers.
    """
    def absMean(values):
        # mean of the absolute values
        return mean(abs(array(values)))

    # summarize results for different CheckM refinements
    print(' Tabulating results for different refinements.')

    metadata = self.img.genomeMetadata()

    compDataDict = defaultdict(lambda: defaultdict(list))
    contDataDict = defaultdict(lambda: defaultdict(list))
    comps = set()
    conts = set()

    ranksToProcess = 3
    taxaByRank = [set() for _ in range(ranksToProcess)]

    overallCompIM = []
    overallContIM = []

    overallCompMS = []
    overallContMS = []

    overallCompRMS = []
    overallContRMS = []

    genomeInTaxon = defaultdict(set)

    numResults = len(results)
    for itemsProcessed, simId in enumerate(results, 1):
        statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numResults, float(itemsProcessed) * 100 / numResults)
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        genomeId, seqLen, comp, cont = simId.split('-')

        taxonomy = metadata[genomeId]['taxonomy']

        if float(comp) < 0.7 or float(cont) > 0.1:
            continue

        comps.add(float(comp))
        conts.add(float(cont))

        simResults = results[simId]

        # pool the individual values (rather than appending whole lists)
        # so that simulations with different numbers of values cannot
        # produce a ragged array below
        overallCompIM += simResults[8]
        overallContIM += simResults[9]

        overallCompMS += simResults[10]
        overallContMS += simResults[11]

        overallCompRMS += simResults[12]
        overallContRMS += simResults[13]

        for r in range(ranksToProcess):
            taxon = taxonomy[r]

            if taxon == 'unclassified':
                continue

            taxaByRank[r].add(taxon)

            compDataDict[taxon]['IM'] += simResults[8]
            compDataDict[taxon]['MS'] += simResults[10]
            compDataDict[taxon]['RMS'] += simResults[12]

            contDataDict[taxon]['IM'] += simResults[9]
            contDataDict[taxon]['MS'] += simResults[11]
            contDataDict[taxon]['RMS'] += simResults[13]

            genomeInTaxon[taxon].add(genomeId)

    sys.stdout.write('\n')

    print('Creating plots for:')
    print(' comps = ', comps)
    print(' conts = ', conts)

    print('')
    print(' There are %d taxon.' % (len(compDataDict)))
    print('')

    compIM = absMean(overallCompIM)
    contIM = absMean(overallContIM)
    compMS = absMean(overallCompMS)
    contMS = absMean(overallContMS)
    compRMS = absMean(overallCompRMS)
    contRMS = absMean(overallContRMS)

    print('Percentage change MS-IM comp: %.4f' % ((compMS - compIM) * 100 / compIM))
    print('Percentage change MS-IM cont: %.4f' % ((contMS - contIM) * 100 / contIM))
    print('')
    # NOTE(review): the RMS-MS change is expressed relative to the IM
    # baseline, not to MS -- confirm this is intended
    print('Percentage change RMS-MS comp: %.4f' % ((compRMS - compMS) * 100 / compIM))
    print('Percentage change RMS-MS cont: %.4f' % ((contRMS - contMS) * 100 / contIM))

    print('')

    # get list of ordered taxa by rank
    orderedTaxa = []
    for taxa in taxaByRank:
        orderedTaxa += sorted(taxa)

    # print table of results organized by class; the context manager
    # closes the file even if a write fails
    with open(self.simCompareRefinementTableOut, 'w') as refinmentTableOut:
        for taxon in orderedTaxa:
            numGenomes = len(genomeInTaxon[taxon])
            if numGenomes < 2:  # skip groups with only a few genomes
                continue

            refinmentTableOut.write(taxon + '\t' + str(numGenomes))

            meanComp = {}
            meanCont = {}
            for refineStr in ['IM', 'MS']:
                absComp = abs(array(compDataDict[taxon][refineStr]))
                absCont = abs(array(contDataDict[taxon][refineStr]))
                meanComp[refineStr] = mean(absComp)
                meanCont[refineStr] = mean(absCont)

                refinmentTableOut.write('\t%.1f +/- %.2f\t%.1f +/- %.2f' % (meanComp[refineStr], std(absComp), meanCont[refineStr], std(absCont)))

            # improvement of MS over IM, relative to IM
            perCompChange = (meanComp['IM'] - meanComp['MS']) * 100 / meanComp['IM']
            perContChange = (meanCont['IM'] - meanCont['MS']) * 100 / meanCont['IM']
            refinmentTableOut.write('\t%.2f\t%.2f\n' % (perCompChange, perContChange))

    # plot data
    print(' Plotting results.')
    compData = []
    contData = []
    rowLabels = []
    for taxon in orderedTaxa:
        numGenomes = len(genomeInTaxon[taxon])
        if numGenomes < 10:  # skip groups with only a few genomes
            continue

        for refineStr in ['RMS', 'MS', 'IM']:
            rowLabels.append(refineStr + ': ' + taxon + ' (' + str(numGenomes) + ')')
            compData.append(compDataDict[taxon][refineStr])
            contData.append(contDataDict[taxon][refineStr])

    for i, rowLabel in enumerate(rowLabels):
        print(rowLabel + '\t%.2f\t%.2f' % (absMean(compData[i]), absMean(contData[i])))

    boxPlot = BoxPlot()
    plotFilename = self.plotPrefix + '.refinements.png'
    boxPlot.plot(plotFilename, compData, contData, rowLabels,
                 r'$\Delta$' + ' % Completion', None,
                 r'$\Delta$' + ' % Contamination', None,
                 rowsPerCategory=3, dpi=self.dpi)
def markerSets(self, results):
    """Tabulate and plot individual marker genes (IM) against marker sets (MS).

    The values of every simulation are pooled per experimental condition
    (completeness, contamination, sequence length). The conditions listed
    in self.compsToConsider / self.contsToConsider at a sequence length of
    20000 are drawn as a box plot named self.plotPrefix + '.markerSets.png',
    and a table covering a fixed grid of conditions is written to
    self.simCompareMarkerSetOut.

    Args:
        results: Mapping keyed by simulation ids of the form
            'genomeId-seqLen-comp-cont'. Each value is indexed by position:
            4/5 hold the 'IM' and 6/7 the 'MS' completeness/contamination
            values, each an iterable of numbers.
    """
    def absMeanStd(values):
        # (mean, standard deviation) of the absolute values
        magnitudes = abs(array(values))
        return (mean(magnitudes), std(magnitudes))

    def pooledAbsMean(rows):
        # pool the rows first so that conditions with different numbers
        # of values do not produce a ragged array
        return mean(abs(array([value for row in rows for value in row])))

    def asPercent(fraction):
        # round rather than truncate: e.g. 0.57 * 100 is 56.99999999999999
        return int(round(fraction * 100))

    def condKey(comp, cont, seqLen):
        # same normalisation as used when the results are tabulated below
        return str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))

    # summarize results from IM vs MS
    print(' Tabulating results for domain-level marker genes vs marker sets.')

    compDataDict = defaultdict(lambda: defaultdict(list))
    contDataDict = defaultdict(lambda: defaultdict(list))

    genomeIds = set()
    numResults = len(results)
    for itemsProcessed, simId in enumerate(results, 1):
        statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numResults, float(itemsProcessed) * 100 / numResults)
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        genomeId, seqLen, comp, cont = simId.split('-')
        genomeIds.add(genomeId)
        expCondStr = condKey(comp, cont, seqLen)

        simResults = results[simId]
        compDataDict[expCondStr]['IM'] += simResults[4]
        compDataDict[expCondStr]['MS'] += simResults[6]

        contDataDict[expCondStr]['IM'] += simResults[5]
        contDataDict[expCondStr]['MS'] += simResults[7]

    print(' There are %d unique genomes.' % len(genomeIds))

    sys.stdout.write('\n')

    print(' There are %d experimental conditions.' % (len(compDataDict)))

    # plot data
    print(' Plotting results.')
    compData = []
    contData = []
    rowLabels = []
    for comp in self.compsToConsider:
        for cont in self.contsToConsider:
            for seqLen in [20000]:
                expCondStr = condKey(comp, cont, seqLen)
                for msStr in ['MS', 'IM']:
                    rowLabels.append(msStr + ': %d%%, %d%%' % (asPercent(comp), asPercent(cont)))
                    compData.append(compDataDict[expCondStr][msStr])
                    contData.append(contDataDict[expCondStr][msStr])

    # rows alternate MS, IM
    print('MS:\t%.2f\t%.2f' % (pooledAbsMean(compData[0::2]), pooledAbsMean(contData[0::2])))
    print('IM:\t%.2f\t%.2f' % (pooledAbsMean(compData[1::2]), pooledAbsMean(contData[1::2])))

    boxPlot = BoxPlot()
    plotFilename = self.plotPrefix + '.markerSets.png'
    boxPlot.plot(plotFilename, compData, contData, rowLabels,
                 r'$\Delta$' + ' % Completion', 'Simulation Conditions',
                 r'$\Delta$' + ' % Contamination', None,
                 rowsPerCategory=2, dpi=self.dpi)

    # print table of results
    tableSeqLens = [5000, 20000, 50000]
    cellFormat = '\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f'

    def tableCells(compByMethod, contByMethod):
        # NOTE(review): the cells are comp IM, comp MS, cont IM, cont MS
        # while the header groups two columns under IM and two under MS --
        # confirm the intended column order
        return cellFormat % (absMeanStd(compByMethod['IM']) + absMeanStd(compByMethod['MS'])
                             + absMeanStd(contByMethod['IM']) + absMeanStd(contByMethod['MS']))

    # the context manager closes the file even if a write fails
    with open(self.simCompareMarkerSetOut, 'w') as tableOut:
        tableOut.write('Comp. (%)\tCont. (%)\tIM (5kb)\t\tMS (5kb)\t\tIM (20kb)\t\tMS (20kb)\t\tIM (50kb)\t\tMS (50kb)\n')

        avgComp = defaultdict(lambda: defaultdict(list))
        avgCont = defaultdict(lambda: defaultdict(list))
        for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
            for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:
                tableOut.write('%d\t%d' % (asPercent(comp), asPercent(cont)))

                for seqLen in tableSeqLens:
                    expCondStr = condKey(comp, cont, seqLen)

                    # accumulate the values for the per-length averages
                    for msStr in ['IM', 'MS']:
                        avgComp[seqLen][msStr] += compDataDict[expCondStr][msStr]
                        avgCont[seqLen][msStr] += contDataDict[expCondStr][msStr]

                    tableOut.write(tableCells(compDataDict[expCondStr], contDataDict[expCondStr]))
                tableOut.write('\n')

        tableOut.write('\tAverage:')
        for seqLen in tableSeqLens:
            tableOut.write(tableCells(avgComp[seqLen], avgCont[seqLen]))

        tableOut.write('\n')
def conditionsPlot(self, results):
    """Tabulate and plot results for every experimental condition.

    The values of every simulation are pooled per experimental condition
    (completeness, contamination, sequence length) for the 'best',
    'selected' and 'domain' marker sets. For the conditions listed in
    self.compsToConsider / self.contsToConsider at a sequence length of
    20000 a box plot named self.plotPrefix + '.conditions.png' is drawn and
    the genomes falling outside the 1st-99th percentile range are written
    to two files under ./simulations/. A table covering a fixed grid of
    conditions is written to self.simCompareConditionOut.

    Args:
        results: Mapping keyed by simulation ids of the form
            'genomeId-seqLen-comp-cont'. Each value is indexed by position:
            2/3 hold the 'best', 6/7 the 'domain' and 10/11 the 'selected'
            completeness/contamination values, each an iterable of numbers.
    """
    def absMean(values):
        # mean of the absolute values
        return mean(abs(array(values)))

    def pooledAbsMean(rows):
        # pool the rows first so that conditions with different numbers
        # of values do not produce a ragged array
        return absMean([value for row in rows for value in row])

    def asPercent(fraction):
        # round rather than truncate: e.g. 0.57 * 100 is 56.99999999999999
        return int(round(fraction * 100))

    def condKey(comp, cont, seqLen):
        # same normalisation as used when the results are tabulated below
        return str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))

    def reportOutliers(fout, expCondStr, records):
        # write one line naming the genomes outside the 1st-99th percentile
        # range of the [value, genomeId] records; returns both percentiles
        fout.write(expCondStr)

        records.sort()

        values = array([record[0] for record in records])
        perc1 = scoreatpercentile(values, 1)
        perc99 = scoreatpercentile(values, 99)
        fout.write('\t%.2f\t%.2f' % (perc1, perc99))

        outlierCount = Counter(record[1] for record in records if record[0] < perc1 or record[0] > perc99)
        for genomeId, count in outlierCount.most_common():
            fout.write('\t' + genomeId + ': ' + str(count))
        fout.write('\n')

        return perc1, perc99

    # summarize results for each experimental condition
    print(' Tabulating results for each experimental condition using marker sets.')

    compDataDict = defaultdict(lambda: defaultdict(list))
    contDataDict = defaultdict(lambda: defaultdict(list))

    compOutliers = defaultdict(list)
    contOutliers = defaultdict(list)

    genomeIds = set()
    numResults = len(results)
    for itemsProcessed, simId in enumerate(results, 1):
        statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numResults, float(itemsProcessed) * 100 / numResults)
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        genomeId, seqLen, comp, cont = simId.split('-')
        genomeIds.add(genomeId)
        expCondStr = condKey(comp, cont, seqLen)

        simResults = results[simId]

        compDataDict[expCondStr]['best'] += simResults[2]
        compDataDict[expCondStr]['domain'] += simResults[6]
        compDataDict[expCondStr]['selected'] += simResults[10]
        for dComp in simResults[2]:
            compOutliers[expCondStr].append([dComp, genomeId])

        contDataDict[expCondStr]['best'] += simResults[3]
        contDataDict[expCondStr]['domain'] += simResults[7]
        contDataDict[expCondStr]['selected'] += simResults[11]
        for dCont in simResults[3]:
            contOutliers[expCondStr].append([dCont, genomeId])

    print(' There are %d unique genomes.' % len(genomeIds))

    sys.stdout.write('\n')

    print(' There are %d experimental conditions.' % (len(compDataDict)))

    # plot data
    print(' Plotting results.')
    compData = []
    contData = []
    rowLabels = []

    # the context manager closes both files even if a write fails
    with open('./simulations/simulation.scaffolds.draft.comp_outliers.domain.tsv', 'w') as foutComp, \
            open('./simulations/simulation.scaffolds.draft.cont_outliers.domain.tsv', 'w') as foutCont:
        for comp in self.compsToConsider:
            for cont in self.contsToConsider:
                # only a sequence length of 20000 is plotted
                expCondStr = condKey(comp, cont, 20000)

                for msStr in ['best', 'selected', 'domain']:
                    rowLabels.append(msStr + ': %d%%, %d%%' % (asPercent(comp), asPercent(cont)))
                    compData.append(compDataDict[expCondStr][msStr])
                    contData.append(contDataDict[expCondStr][msStr])

                # NOTE(review): outliers are reported once per condition
                # -- confirm against the original layout

                # report completeness outliers
                perc1, perc99 = reportOutliers(foutComp, expCondStr, compOutliers[expCondStr])
                print(expCondStr, perc1, perc99)

                # report contamination outliers
                reportOutliers(foutCont, expCondStr, contOutliers[expCondStr])

    # rows cycle through best, selected, domain
    print('best:\t%.2f\t%.2f' % (pooledAbsMean(compData[0::3]), pooledAbsMean(contData[0::3])))
    print('selected:\t%.2f\t%.2f' % (pooledAbsMean(compData[1::3]), pooledAbsMean(contData[1::3])))
    print('domain:\t%.2f\t%.2f' % (pooledAbsMean(compData[2::3]), pooledAbsMean(contData[2::3])))

    boxPlot = BoxPlot()
    plotFilename = self.plotPrefix + '.conditions.png'
    boxPlot.plot(plotFilename, compData, contData, rowLabels,
                 r'$\Delta$' + ' % Completion', 'Simulation Conditions',
                 r'$\Delta$' + ' % Contamination', None,
                 rowsPerCategory=3, dpi=self.dpi)

    # print table of results
    tableSeqLens = [5000, 20000, 50000]
    tableMethods = ['domain', 'selected', 'best']

    def tableCells(compByMethod, contByMethod):
        # NOTE(review): the cells are the domain, selected and best
        # completeness followed by the same for contamination, while the
        # header lists best, selected, domain with two columns each --
        # confirm the intended column order
        means = [absMean(compByMethod[msStr]) for msStr in tableMethods]
        means += [absMean(contByMethod[msStr]) for msStr in tableMethods]
        return '\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f' % tuple(means)

    with open(self.simCompareConditionOut, 'w') as tableOut:
        tableOut.write('Comp. (%)\tCont. (%)\tbest (5kb)\t\tselected (5kb)\t\tdomain (5kb)\t\tbest (20kb)\t\tselected (20kb)\t\tdomain (20kb)\t\tbest (50kb)\t\tselected (50kb)\t\tdomain (50kb)\n')

        avgComp = defaultdict(lambda: defaultdict(list))
        avgCont = defaultdict(lambda: defaultdict(list))
        for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
            for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:
                tableOut.write('%d\t%d' % (asPercent(comp), asPercent(cont)))

                for seqLen in tableSeqLens:
                    expCondStr = condKey(comp, cont, seqLen)

                    # accumulate the values for the per-length averages
                    for msStr in tableMethods:
                        avgComp[seqLen][msStr] += compDataDict[expCondStr][msStr]
                        avgCont[seqLen][msStr] += contDataDict[expCondStr][msStr]

                    tableOut.write(tableCells(compDataDict[expCondStr], contDataDict[expCondStr]))
                tableOut.write('\n')

        tableOut.write('\tAverage:')
        for seqLen in tableSeqLens:
            tableOut.write(tableCells(avgComp[seqLen], avgCont[seqLen]))

        tableOut.write('\n')
def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, replicates, minGenomes, maxGenomes, stepSize):
    """Measure how the marker set size changes with the number of genomes.

    For every genome count in range(minGenomes, maxGenomes, stepSize),
    random subsets of the lineage's genomes are drawn and the number of
    co-located marker sets identified from each subset is recorded. The
    results are printed and drawn as an error-bar plot and a box plot
    written under ./images/.

    Args:
        taxonomyStr: Semicolon-separated lineage; also used in the plot
            file names and title.
        ubiquityThreshold: Scaled by the subset size and passed to
            markergenes.identify().
        singleCopyThreshold: Scaled by the subset size and passed to
            markergenes.identify().
        replicates: Number of random subsets drawn per genome count.
        minGenomes: Smallest subset size; the program exits with an error
            if the lineage has fewer genomes.
        maxGenomes: Exclusive upper bound of the subset sizes; -1 or any
            value above the number of genomes means the number of genomes.
        stepSize: Increment between subset sizes.
    """
    img = IMG()
    markergenes = MarkerGenes()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')

    print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')
    if len(genomeIds) < minGenomes:
        sys.stderr.write('[Error] Insufficent number of genomes.\n')
        # report the failure through a non-zero exit status
        sys.exit(1)

    print('')
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))

    meanMarkerSetSize = []
    stdMarkerSetSize = []
    markerSetSizes = []
    if maxGenomes == -1 or maxGenomes > len(genomeIds):
        maxGenomes = len(genomeIds)

    countTable = img.countTable(genomeIds)
    countTable = img.filterTable(genomeIds, countTable)

    # random.sample() only accepts sequences on Python 3.11+; the ids may
    # be held in a set (TODO confirm), so sample from a list copy
    genomePool = list(genomeIds)

    for numGenomes in range(minGenomes, maxGenomes, stepSize):
        markerSetSize = []
        for _ in range(replicates):
            genomeIdSubset = random.sample(genomePool, numGenomes)

            markerGenes = markergenes.identify(genomeIdSubset, countTable,
                                               ubiquityThreshold * len(genomeIdSubset),
                                               singleCopyThreshold * len(genomeIdSubset))
            geneDistTable = img.geneDistTable(genomeIdSubset, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(geneDistTable)
            colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

            markerSetSize.append(len(colocatedSets))

        markerSetSizes.append(markerSetSize)

        m = mean(markerSetSize)
        meanMarkerSetSize.append(m)

        s = std(markerSetSize)
        stdMarkerSetSize.append(s)

        # every subset holds numGenomes genomes, so report the thresholds
        # from that count instead of the last subset drawn
        print('')
        print('Genomes: ' + str(numGenomes) + ', Ubiquity > ' + str(int(ubiquityThreshold * numGenomes)) + ', Single-copy > ' + str(int(singleCopyThreshold * numGenomes)))
        print('Mean: %.2f +/- %.2f' % (m, s))
        print('Min: %d, Max: %d' % (min(markerSetSize), max(markerSetSize)))

    # plot data
    plotPrefix = './images/markerset.' + taxonomyStr.replace(';', '_') + '.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold)
    title = taxonomyStr.replace(';', '; ') + '\n' + 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold

    errorBar = ErrorBar()
    plotFilename = plotPrefix + '.errorbar.png'
    errorBar.plot(plotFilename, arange(minGenomes, maxGenomes, stepSize), meanMarkerSetSize, stdMarkerSetSize, 'Number of Genomes', 'Marker Set Size', title)

    boxPlot = BoxPlot()
    plotFilename = plotPrefix + '.boxplot.png'
    boxPlot.plot(plotFilename, markerSetSizes, arange(minGenomes, maxGenomes, stepSize), 'Number of Genomes', 'Marker Set Size', True, title)
def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
    """Compare completeness estimates from marker genes and from marker sets.

    Builds the marker genes and co-located marker sets of a lineage, then
    repeatedly samples partial genomes and records how far each of the two
    estimates is from the requested completeness. The errors are drawn as
    a box plot written under ./images/.

    Args:
        taxonomyStr: Semicolon-separated lineage; also used in the plot
            file name and title.
        ubiquityThreshold: Scaled by the number of genomes and passed to
            img.markerGenes().
        singleCopyThreshold: Scaled by the number of genomes and passed to
            img.markerGenes().
        percentCompletion: Target completeness passed to img.sampleGenome()
            and subtracted from every estimate. The estimates are ratios,
            so this is presumably a fraction in [0, 1] -- TODO confirm.
        numReplicates: Number of partial genomes sampled per genome.
        numGenomes: Number of genomes to pick at random, or -1 to use every
            genome of the lineage.
        contigLen: Passed to img.sampleGenome() and
            img.containedMarkerGenes().
    """
    img = IMG()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
    # single-argument print() calls behave the same on Python 2 and 3
    print('\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')

    # build marker genes and colocated marker sets
    countTable = img.countTable(genomeIds)
    markerGenes = img.markerGenes(genomeIds, countTable,
                                  ubiquityThreshold * len(genomeIds),
                                  singleCopyThreshold * len(genomeIds))
    print(' Marker genes: ' + str(len(markerGenes)))

    geneDistTable = img.geneDistTable(genomeIds, markerGenes)
    colocatedGenes = img.colocatedGenes(geneDistTable)
    colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
    print(' Co-located gene sets: ' + str(len(colocatedSets)))

    # random sample genomes
    if numGenomes == -1:
        rndGenomeIds = genomeIds
    else:
        # random.sample() only accepts sequences on Python 3.11+; the ids
        # may be held in a set (TODO confirm), so copy them into a list
        rndGenomeIds = random.sample(list(genomeIds), numGenomes)

    # estimate completion for each genome using both the marker genes and marker sets
    metadata = img.genomeMetadata('Final')
    plotLabels = []
    plotData = []
    for genomeId in rndGenomeIds:
        mgCompletion = []
        msCompletion = []
        for _ in range(numReplicates):
            startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen)

            # calculate completion with marker genes
            containedMarkerGenes = img.containedMarkerGenes(markerGenes, geneDistTable[genomeId], startPartialGenomeContigs, contigLen)
            mgCompletion.append(float(len(containedMarkerGenes)) / len(markerGenes) - percentCompletion)

            # calculate completion with marker set: every co-located set
            # contributes the fraction of its members that were recovered
            comp = 0.0
            for cs in colocatedSets:
                present = sum(1 for geneId in cs if geneId in containedMarkerGenes)
                comp += float(present) / len(cs)
            msCompletion.append(comp / len(colocatedSets) - percentCompletion)

        plotData.append(mgCompletion)
        plotData.append(msCompletion)

        species = ' '.join(metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])

        # two boxes per genome (marker genes, marker sets); only the first
        # one carries the label
        plotLabels.append(species + ' (' + genomeId + ')')
        plotLabels.append('')

    # plot data
    boxPlot = BoxPlot()
    plotFilename = './images/sim.MGvsMS.' + taxonomyStr.replace(';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
    title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
    boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)
def refinementPlots(self, results):
    """Tabulate and plot estimation errors for the IM, MS and RMS refinements.

    results maps simulation ids of the form 'genomeId-seqLen-comp-cont' to a
    sequence of per-test-case lists. Indices 8/9, 10/11 and 12/13 are read as
    the completeness/contamination errors of the 'IM', 'MS' and 'RMS'
    refinements respectively. Only simulations with comp >= 0.7 and
    cont <= 0.1 are used.

    A per-taxon table is written to self.simCompareRefinementTableOut and a
    box plot to self.plotPrefix + '.refinements.png'.
    """
    def meanAbs(values):
        # mean absolute error of a flat list of deltas
        return mean(abs(array(values)))

    def stdAbs(values):
        # standard deviation of the absolute errors of a flat list of deltas
        return std(abs(array(values)))

    # summarize results for different CheckM refinements
    print(' Tabulating results for different refinements.')

    metadata = self.img.genomeMetadata()

    compDataDict = defaultdict(lambda: defaultdict(list))
    contDataDict = defaultdict(lambda: defaultdict(list))
    comps = set()
    conts = set()

    ranksToProcess = 3
    taxaByRank = [set() for _ in range(ranksToProcess)]

    overallCompIM = []
    overallContIM = []
    overallCompMS = []
    overallContMS = []
    overallCompRMS = []
    overallContRMS = []

    genomeInTaxon = defaultdict(set)

    numResults = len(results)
    for itemsProcessed, simId in enumerate(results, 1):
        statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numResults, float(itemsProcessed) * 100 / numResults)
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        genomeId, seqLen, comp, cont = simId.split('-')
        taxonomy = metadata[genomeId]['taxonomy']

        if float(comp) < 0.7 or float(cont) > 0.1:
            continue

        comps.add(float(comp))
        conts.add(float(cont))

        simResults = results[simId]

        # extend (not append) so the overall lists stay flat; nested lists of
        # unequal length cannot be averaged as a single array
        overallCompIM.extend(simResults[8])
        overallContIM.extend(simResults[9])
        overallCompMS.extend(simResults[10])
        overallContMS.extend(simResults[11])
        overallCompRMS.extend(simResults[12])
        overallContRMS.extend(simResults[13])

        for r in range(ranksToProcess):
            taxon = taxonomy[r]

            if taxon == 'unclassified':
                continue

            taxaByRank[r].add(taxon)

            compDataDict[taxon]['IM'] += simResults[8]
            compDataDict[taxon]['MS'] += simResults[10]
            compDataDict[taxon]['RMS'] += simResults[12]

            contDataDict[taxon]['IM'] += simResults[9]
            contDataDict[taxon]['MS'] += simResults[11]
            contDataDict[taxon]['RMS'] += simResults[13]

            genomeInTaxon[taxon].add(genomeId)

    sys.stdout.write('\n')

    print('Creating plots for:')
    print(' comps =  ' + str(comps))
    print(' conts =  ' + str(conts))

    print('')
    print(' There are %d taxon.' % (len(compDataDict)))
    print('')

    print('Percentage change MS-IM comp: %.4f' % ((meanAbs(overallCompMS) - meanAbs(overallCompIM)) * 100 / meanAbs(overallCompIM)))
    print('Percentage change MS-IM cont: %.4f' % ((meanAbs(overallContMS) - meanAbs(overallContIM)) * 100 / meanAbs(overallContIM)))
    print('')
    # NOTE(review): the RMS-MS change is normalised by the IM mean, exactly as
    # in the original code -- confirm that the MS mean was not intended.
    print('Percentage change RMS-MS comp: %.4f' % ((meanAbs(overallCompRMS) - meanAbs(overallCompMS)) * 100 / meanAbs(overallCompIM)))
    print('Percentage change RMS-MS cont: %.4f' % ((meanAbs(overallContRMS) - meanAbs(overallContMS)) * 100 / meanAbs(overallContIM)))

    print('')

    # get list of ordered taxa by rank
    orderedTaxa = []
    for taxa in taxaByRank:
        orderedTaxa += sorted(taxa)

    # print table of results organized by class
    with open(self.simCompareRefinementTableOut, 'w') as refinmentTableOut:
        for taxon in orderedTaxa:
            numGenomes = len(genomeInTaxon[taxon])
            if numGenomes < 2:  # skip groups with only a few genomes
                continue

            refinmentTableOut.write(taxon + '\t' + str(numGenomes))

            for refineStr in ['IM', 'MS']:
                meanTaxonComp = meanAbs(compDataDict[taxon][refineStr])
                stdTaxonComp = stdAbs(compDataDict[taxon][refineStr])
                meanTaxonCont = meanAbs(contDataDict[taxon][refineStr])
                stdTaxonCont = stdAbs(contDataDict[taxon][refineStr])

                refinmentTableOut.write('\t%.1f +/- %.2f\t%.1f +/- %.2f' % (meanTaxonComp, stdTaxonComp, meanTaxonCont, stdTaxonCont))

            # meanTaxonComp/meanTaxonCont now hold the 'MS' values, so these
            # are the relative improvements of MS over IM
            meanCompIM = meanAbs(compDataDict[taxon]['IM'])
            meanContIM = meanAbs(contDataDict[taxon]['IM'])
            perCompChange = (meanCompIM - meanTaxonComp) * 100 / meanCompIM
            perContChange = (meanContIM - meanTaxonCont) * 100 / meanContIM
            refinmentTableOut.write('\t%.2f\t%.2f\n' % (perCompChange, perContChange))

    # plot data
    print(' Plotting results.')
    compData = []
    contData = []
    rowLabels = []
    for taxon in orderedTaxa:
        numGenomes = len(genomeInTaxon[taxon])
        if numGenomes < 10:  # skip groups with only a few genomes
            continue

        for refineStr in ['RMS', 'MS', 'IM']:
            rowLabels.append(refineStr + ': ' + taxon + ' (' + str(numGenomes) + ')')
            compData.append(compDataDict[taxon][refineStr])
            contData.append(contDataDict[taxon][refineStr])

    for i, rowLabel in enumerate(rowLabels):
        print(rowLabel + '\t%.2f\t%.2f' % (meanAbs(compData[i]), meanAbs(contData[i])))

    boxPlot = BoxPlot()
    plotFilename = self.plotPrefix + '.refinements.png'
    boxPlot.plot(plotFilename, compData, contData, rowLabels,
                 r'$\Delta$' + ' % Completion', None,
                 r'$\Delta$' + ' % Contamination', None,
                 rowsPerCategory=3, dpi=self.dpi)
def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, replicates, minGenomes, maxGenomes, stepSize):
    """Plot the size of the co-located marker set against the number of genomes.

    For each genome count in range(minGenomes, maxGenomes, stepSize),
    `replicates` random subsets of the lineage's 'Final' genomes are drawn
    and the number of co-located marker sets is recorded. The mean and
    standard deviation are printed, and an error bar plot and a box plot are
    written to ./images/.

    taxonomyStr -- semicolon-separated lineage string
    ubiquityThreshold, singleCopyThreshold -- multiplied by the subset size
        before being handed to MarkerGenes.identify
    replicates -- number of random subsets per genome count
    minGenomes -- smallest subset size; the program exits if the lineage has
        fewer genomes
    maxGenomes -- exclusive upper bound on the subset size; -1 or any value
        above the lineage size is clamped to the lineage size
    stepSize -- increment between subset sizes
    """
    img = IMG()
    markergenes = MarkerGenes()

    genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')

    print('Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.')
    if len(genomeIds) < minGenomes:
        sys.stderr.write('[Error] Insufficent number of genomes.\n')
        # non-zero status so that calling scripts can detect the failure
        sys.exit(1)

    print('')
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))

    meanMarkerSetSize = []
    stdMarkerSetSize = []
    markerSetSizes = []
    if maxGenomes == -1 or maxGenomes > len(genomeIds):
        maxGenomes = len(genomeIds)

    countTable = img.countTable(genomeIds)
    countTable = img.filterTable(genomeIds, countTable)

    # random.sample() rejects sets on recent Python versions, so sample from
    # a list built once up front
    genomeIdList = list(genomeIds)

    for numGenomes in range(minGenomes, maxGenomes, stepSize):
        markerSetSize = []
        for _ in range(replicates):
            genomeIdSubset = random.sample(genomeIdList, numGenomes)

            markerGenes = markergenes.identify(genomeIdSubset, countTable,
                                               ubiquityThreshold * len(genomeIdSubset),
                                               singleCopyThreshold * len(genomeIdSubset))
            geneDistTable = img.geneDistTable(genomeIdSubset, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = img.colocatedGenes(geneDistTable)
            colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)

            markerSetSize.append(len(colocatedSets))

        markerSetSizes.append(markerSetSize)

        m = mean(markerSetSize)
        meanMarkerSetSize.append(m)

        s = std(markerSetSize)
        stdMarkerSetSize.append(s)

        # every subset has exactly numGenomes members, so report the
        # thresholds from numGenomes rather than from the last sampled subset
        print('')
        print('Genomes: ' + str(numGenomes) + ', Ubiquity > ' + str(int(ubiquityThreshold * numGenomes)) + ', Single-copy > ' + str(int(singleCopyThreshold * numGenomes)))
        print('Mean: %.2f +/- %.2f' % (m, s))
        print('Min: %d, Max: %d' % (min(markerSetSize), max(markerSetSize)))

    # plot data
    filePrefix = './images/markerset.' + taxonomyStr.replace(';', '_') + '.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold)
    title = taxonomyStr.replace(';', '; ') + '\n' + 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold

    errorBar = ErrorBar()
    plotFilename = filePrefix + '.errorbar.png'
    errorBar.plot(plotFilename, arange(minGenomes, maxGenomes, stepSize), meanMarkerSetSize, stdMarkerSetSize,
                  'Number of Genomes', 'Marker Set Size', title)

    boxPlot = BoxPlot()
    plotFilename = filePrefix + '.boxplot.png'
    boxPlot.plot(plotFilename, markerSetSizes, arange(minGenomes, maxGenomes, stepSize),
                 'Number of Genomes', 'Marker Set Size', True, title)
def taxonomicPlots(self, results):
    """Tabulate and plot estimation errors per taxonomic group.

    results maps simulation ids of the form 'genomeId-seqLen-comp-cont' to a
    sequence of per-test-case lists. Indices 2/3, 6/7 and 10/11 are read as
    the completeness/contamination errors of the 'best', 'domain' and
    'selected' marker sets respectively. Only simulations with a sequence
    length of 20000, comp in {0.5, 0.7, 0.8, 0.9} and cont in
    {0.05, 0.1, 0.15} are used.

    A per-taxon table is written to self.simCompareTaxonomyTableOut and a
    box plot to self.plotPrefix + '.taxonomy.png'.
    """
    def meanAbs(values):
        # mean absolute error of a flat list of deltas
        return mean(abs(array(values)))

    def stdAbs(values):
        # standard deviation of the absolute errors of a flat list of deltas
        return std(abs(array(values)))

    # summarize results for different taxonomic groups
    print(' Tabulating results for taxonomic groups.')

    metadata = self.img.genomeMetadata()

    compDataDict = defaultdict(lambda: defaultdict(list))
    contDataDict = defaultdict(lambda: defaultdict(list))
    comps = set()
    conts = set()

    ranksToProcess = 3
    taxaByRank = [set() for _ in range(ranksToProcess)]

    overallComp = []
    overallCont = []

    genomeInTaxon = defaultdict(set)

    testCases = 0
    numResults = len(results)
    for itemsProcessed, simId in enumerate(results, 1):
        statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numResults, float(itemsProcessed) * 100 / numResults)
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        genomeId, seqLen, comp, cont = simId.split('-')

        if seqLen != '20000':
            continue

        # NOTE(review): the whitespace-collapsed source does not show how much
        # of the loop body this condition guards; it is treated here as a
        # filter on everything that follows -- confirm against the original.
        if str(float(comp)) not in ['0.5', '0.7', '0.8', '0.9'] or str(float(cont)) not in ['0.05', '0.10', '0.1', '0.15']:
            continue

        print('%s %s' % (comp, cont))

        taxonomy = metadata[genomeId]['taxonomy']

        testCases += 1

        comps.add(float(comp))
        conts.add(float(cont))

        simResults = results[simId]
        overallComp += simResults[10]
        overallCont += simResults[11]

        for r in range(ranksToProcess):
            taxon = taxonomy[r]

            if r == 0 and taxon == 'unclassified':
                print('*****************************Unclassified at domain-level*****************')
                continue

            if taxon == 'unclassified':
                continue

            taxon = rankPrefixes[r] + taxon

            taxaByRank[r].add(taxon)

            compDataDict[taxon]['best'] += simResults[2]
            compDataDict[taxon]['domain'] += simResults[6]
            compDataDict[taxon]['selected'] += simResults[10]

            contDataDict[taxon]['best'] += simResults[3]
            contDataDict[taxon]['domain'] += simResults[7]
            contDataDict[taxon]['selected'] += simResults[11]

            genomeInTaxon[taxon].add(genomeId)

    sys.stdout.write('\n')

    print('Test cases ' + str(testCases))

    print('')
    print('Creating plots for:')
    print(' comps =  ' + str(comps))
    print(' conts =  ' + str(conts))

    print('')
    print(' There are %d taxa.' % (len(compDataDict)))

    print('')
    print(' Overall bias:')
    print(' Selected comp: %.2f' % mean(overallComp))
    print(' Selected cont: %.2f' % mean(overallCont))

    # get list of ordered taxa by rank
    orderedTaxa = []
    for taxa in taxaByRank:
        orderedTaxa += sorted(taxa)

    # plot data
    print(' Plotting results.')
    compData = []
    contData = []
    rowLabels = []
    for taxon in orderedTaxa:
        numGenomes = len(genomeInTaxon[taxon])
        if numGenomes < 10:  # skip groups with only a few genomes
            continue

        for msStr in ['best', 'selected', 'domain']:
            rowLabels.append(msStr + ': ' + taxon + ' (' + str(numGenomes) + ')')
            compData.append(compDataDict[taxon][msStr])
            contData.append(contDataDict[taxon][msStr])

    for i, rowLabel in enumerate(rowLabels):
        print(rowLabel + '\t%.2f\t%.2f' % (meanAbs(compData[i]), meanAbs(contData[i])))

    # print taxonomic table of results organized by class
    with open(self.simCompareTaxonomyTableOut, 'w') as taxonomyTableOut:
        for taxon in orderedTaxa:
            numGenomes = len(genomeInTaxon[taxon])
            if numGenomes < 2:  # skip groups with only a few genomes
                continue

            taxonomyTableOut.write(taxon + '\t' + str(numGenomes))
            for msStr in ['domain', 'selected']:
                meanTaxonComp = meanAbs(compDataDict[taxon][msStr])
                stdTaxonComp = stdAbs(compDataDict[taxon][msStr])
                meanTaxonCont = meanAbs(contDataDict[taxon][msStr])
                stdTaxonCont = stdAbs(contDataDict[taxon][msStr])

                taxonomyTableOut.write('\t%.1f +/- %.2f\t%.1f +/- %.2f' % (meanTaxonComp, stdTaxonComp, meanTaxonCont, stdTaxonCont))

            taxonomyTableOut.write('\n')

    # create box plot
    boxPlot = BoxPlot()
    plotFilename = self.plotPrefix + '.taxonomy.png'
    boxPlot.plot(plotFilename, compData, contData, rowLabels,
                 r'$\Delta$' + ' % Completion', None,
                 r'$\Delta$' + ' % Contamination', None,
                 rowsPerCategory=3, dpi=self.dpi)
def conditionsPlot(self, results):
    """Tabulate and plot estimation errors for each experimental condition.

    results maps simulation ids of the form 'genomeId-seqLen-comp-cont' to a
    sequence of per-test-case lists. Indices 2/3, 6/7 and 10/11 are read as
    the completeness/contamination errors of the 'best', 'domain' and
    'selected' marker sets respectively. Conditions are keyed by the string
    '<comp>-<cont>-<seqLen>'.

    Outputs:
      * outlier reports (genomes whose 'best' errors fall outside the 1st to
        99th percentile) under ./simulations/
      * a box plot at self.plotPrefix + '.conditions.png' for the conditions
        in self.compsToConsider x self.contsToConsider at 20000 bp
      * a table of mean absolute errors at self.simCompareConditionOut
    """
    def meanAbs(values):
        # mean absolute error of a flat list of deltas
        return mean(abs(array(values)))

    def pooledMeanAbs(rows):
        # mean absolute error over several per-condition lists; the lists are
        # flattened first because they need not all have the same length
        return meanAbs([value for row in rows for value in row])

    def writeOutlierReport(fout, expCondStr, deltas):
        # write one line: condition, 1st/99th percentile, and how often each
        # genome falls outside that range; returns the two percentiles
        deltas.sort()
        values = array([item[0] for item in deltas])
        perc1 = scoreatpercentile(values, 1)
        perc99 = scoreatpercentile(values, 99)

        fout.write(expCondStr)
        fout.write('\t%.2f\t%.2f' % (perc1, perc99))

        outlierCount = Counter(item[1] for item in deltas if item[0] < perc1 or item[0] > perc99)
        for genomeId, count in outlierCount.most_common():
            fout.write('\t' + genomeId + ': ' + str(count))
        fout.write('\n')

        return perc1, perc99

    # summarize results for each experimental condition
    print(' Tabulating results for each experimental condition using marker sets.')

    compDataDict = defaultdict(lambda: defaultdict(list))
    contDataDict = defaultdict(lambda: defaultdict(list))

    compOutliers = defaultdict(list)
    contOutliers = defaultdict(list)

    genomeIds = set()
    numResults = len(results)
    for itemsProcessed, simId in enumerate(results, 1):
        statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numResults, float(itemsProcessed) * 100 / numResults)
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        genomeId, seqLen, comp, cont = simId.split('-')
        genomeIds.add(genomeId)
        expCondStr = str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))

        simResults = results[simId]

        compDataDict[expCondStr]['best'] += simResults[2]
        compDataDict[expCondStr]['domain'] += simResults[6]
        compDataDict[expCondStr]['selected'] += simResults[10]
        compOutliers[expCondStr].extend([dComp, genomeId] for dComp in simResults[2])

        contDataDict[expCondStr]['best'] += simResults[3]
        contDataDict[expCondStr]['domain'] += simResults[7]
        contDataDict[expCondStr]['selected'] += simResults[11]
        contOutliers[expCondStr].extend([dCont, genomeId] for dCont in simResults[3])

    print(' There are %d unique genomes.' % len(genomeIds))

    sys.stdout.write('\n')

    print(' There are %d experimental conditions.' % (len(compDataDict)))

    # plot data
    print(' Plotting results.')
    compData = []
    contData = []
    rowLabels = []

    plotSeqLen = 20000
    with open('./simulations/simulation.scaffolds.draft.comp_outliers.domain.tsv', 'w') as foutComp:
        with open('./simulations/simulation.scaffolds.draft.cont_outliers.domain.tsv', 'w') as foutCont:
            for comp in self.compsToConsider:
                for cont in self.contsToConsider:
                    expCondStr = str(comp) + '-' + str(cont) + '-' + str(plotSeqLen)

                    for msStr in ['best', 'selected', 'domain']:
                        rowLabels.append(msStr + ': %d%%, %d%%' % (comp * 100, cont * 100))
                        compData.append(compDataDict[expCondStr][msStr])
                        contData.append(contDataDict[expCondStr][msStr])

                    # the outlier reports do not depend on the marker set, so
                    # they are written once per experimental condition

                    # report completenes outliers
                    perc1, perc99 = writeOutlierReport(foutComp, expCondStr, compOutliers[expCondStr])
                    print('%s %s %s' % (expCondStr, perc1, perc99))

                    # report contamination outliers
                    writeOutlierReport(foutCont, expCondStr, contOutliers[expCondStr])

    # rows cycle through 'best', 'selected', 'domain'
    print('best:\t%.2f\t%.2f' % (pooledMeanAbs(compData[0::3]), pooledMeanAbs(contData[0::3])))
    print('selected:\t%.2f\t%.2f' % (pooledMeanAbs(compData[1::3]), pooledMeanAbs(contData[1::3])))
    print('domain:\t%.2f\t%.2f' % (pooledMeanAbs(compData[2::3]), pooledMeanAbs(contData[2::3])))

    boxPlot = BoxPlot()
    plotFilename = self.plotPrefix + '.conditions.png'
    boxPlot.plot(plotFilename, compData, contData, rowLabels,
                 r'$\Delta$' + ' % Completion', 'Simulation Conditions',
                 r'$\Delta$' + ' % Contamination', None,
                 rowsPerCategory=3, dpi=self.dpi)

    # print table of results
    # NOTE(review): the header lists best/selected/domain per sequence length
    # while each row holds domain, selected, best completeness followed by
    # domain, selected, best contamination -- confirm the intended layout.
    tableOrder = ['domain', 'selected', 'best']
    seqLens = [5000, 20000, 50000]
    rowFormat = '\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f'

    with open(self.simCompareConditionOut, 'w') as tableOut:
        tableOut.write('Comp. (%)\tCont. (%)\tbest (5kb)\t\tselected (5kb)\t\tdomain (5kb)\t\tbest (20kb)\t\tselected (20kb)\t\tdomain (20kb)\t\tbest (50kb)\t\tselected (50kb)\t\tdomain (50kb)\n')

        avgComp = defaultdict(lambda: defaultdict(list))
        avgCont = defaultdict(lambda: defaultdict(list))
        for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
            for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:
                tableOut.write('%d\t%d' % (comp * 100, cont * 100))

                for seqLen in seqLens:
                    expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)

                    compMeans = []
                    contMeans = []
                    for msStr in tableOrder:
                        compMeans.append(meanAbs(compDataDict[expCondStr][msStr]))
                        contMeans.append(meanAbs(contDataDict[expCondStr][msStr]))

                        # pool the raw deltas for the 'Average' row
                        avgComp[seqLen][msStr] += compDataDict[expCondStr][msStr]
                        avgCont[seqLen][msStr] += contDataDict[expCondStr][msStr]

                    tableOut.write(rowFormat % tuple(compMeans + contMeans))
                tableOut.write('\n')

        tableOut.write('\tAverage:')
        for seqLen in seqLens:
            compMeans = [meanAbs(avgComp[seqLen][msStr]) for msStr in tableOrder]
            contMeans = [meanAbs(avgCont[seqLen][msStr]) for msStr in tableOrder]
            tableOut.write(rowFormat % tuple(compMeans + contMeans))

        tableOut.write('\n')
def markerSets(self, results):
    """Tabulate and plot estimation errors of marker genes (IM) vs marker sets (MS).

    results maps simulation ids of the form 'genomeId-seqLen-comp-cont' to a
    sequence of per-test-case lists. Indices 4/5 and 6/7 are read as the
    completeness/contamination errors of 'IM' and 'MS' respectively.
    Conditions are keyed by the string '<comp>-<cont>-<seqLen>'.

    A box plot is written to self.plotPrefix + '.markerSets.png' for the
    conditions in self.compsToConsider x self.contsToConsider at 20000 bp,
    and a table of mean +/- std absolute errors is written to
    self.simCompareMarkerSetOut.
    """
    def meanAbs(values):
        # mean absolute error of a flat list of deltas
        return mean(abs(array(values)))

    def stdAbs(values):
        # standard deviation of the absolute errors of a flat list of deltas
        return std(abs(array(values)))

    def pooledMeanAbs(rows):
        # mean absolute error over several per-condition lists; the lists are
        # flattened first because they need not all have the same length
        return meanAbs([value for row in rows for value in row])

    def tableFields(compByMs, contByMs):
        # mean/std pairs in column order: comp IM, comp MS, cont IM, cont MS
        fields = []
        for dataByMs in (compByMs, contByMs):
            for msStr in ['IM', 'MS']:
                fields.append(meanAbs(dataByMs[msStr]))
                fields.append(stdAbs(dataByMs[msStr]))
        return tuple(fields)

    # summarize results from IM vs MS
    print(' Tabulating results for domain-level marker genes vs marker sets.')

    compDataDict = defaultdict(lambda: defaultdict(list))
    contDataDict = defaultdict(lambda: defaultdict(list))

    genomeIds = set()
    numResults = len(results)
    for itemsProcessed, simId in enumerate(results, 1):
        statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numResults, float(itemsProcessed) * 100 / numResults)
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        genomeId, seqLen, comp, cont = simId.split('-')
        genomeIds.add(genomeId)
        expCondStr = str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))

        simResults = results[simId]

        compDataDict[expCondStr]['IM'] += simResults[4]
        compDataDict[expCondStr]['MS'] += simResults[6]

        contDataDict[expCondStr]['IM'] += simResults[5]
        contDataDict[expCondStr]['MS'] += simResults[7]

    print(' There are %d unique genomes.' % len(genomeIds))

    sys.stdout.write('\n')

    print(' There are %d experimental conditions.' % (len(compDataDict)))

    # plot data
    print(' Plotting results.')
    compData = []
    contData = []
    rowLabels = []

    for comp in self.compsToConsider:
        for cont in self.contsToConsider:
            for seqLen in [20000]:
                expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                for msStr in ['MS', 'IM']:
                    rowLabels.append(msStr + ': %d%%, %d%%' % (comp * 100, cont * 100))
                    compData.append(compDataDict[expCondStr][msStr])
                    contData.append(contDataDict[expCondStr][msStr])

    # rows alternate between 'MS' and 'IM'
    print('MS:\t%.2f\t%.2f' % (pooledMeanAbs(compData[0::2]), pooledMeanAbs(contData[0::2])))
    print('IM:\t%.2f\t%.2f' % (pooledMeanAbs(compData[1::2]), pooledMeanAbs(contData[1::2])))

    boxPlot = BoxPlot()
    plotFilename = self.plotPrefix + '.markerSets.png'
    boxPlot.plot(plotFilename, compData, contData, rowLabels,
                 r'$\Delta$' + ' % Completion', 'Simulation Conditions',
                 r'$\Delta$' + ' % Contamination', None,
                 rowsPerCategory=2, dpi=self.dpi)

    # print table of results
    seqLens = [5000, 20000, 50000]
    rowFormat = '\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f'

    with open(self.simCompareMarkerSetOut, 'w') as tableOut:
        tableOut.write('Comp. (%)\tCont. (%)\tIM (5kb)\t\tMS (5kb)\t\tIM (20kb)\t\tMS (20kb)\t\tIM (50kb)\t\tMS (50kb)\n')

        avgComp = defaultdict(lambda: defaultdict(list))
        avgCont = defaultdict(lambda: defaultdict(list))
        for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
            for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:
                tableOut.write('%d\t%d' % (comp * 100, cont * 100))

                for seqLen in seqLens:
                    expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)

                    # pool the raw deltas for the 'Average' row
                    for msStr in ['IM', 'MS']:
                        avgComp[seqLen][msStr] += compDataDict[expCondStr][msStr]
                        avgCont[seqLen][msStr] += contDataDict[expCondStr][msStr]

                    tableOut.write(rowFormat % tableFields(compDataDict[expCondStr], contDataDict[expCondStr]))
                tableOut.write('\n')

        tableOut.write('\tAverage:')
        for seqLen in seqLens:
            tableOut.write(rowFormat % tableFields(avgComp[seqLen], avgCont[seqLen]))

        tableOut.write('\n')
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, minMarkers):
    """Leave-one-out test of how stable each lineage's marker set size is.

    For 'prokaryotes' plus every lineage returned by img.lineagesByCriteria,
    the marker set is computed from all trusted genomes and then again with
    each genome left out in turn. The differences in marker set size (full
    minus leave-one-out) are printed and drawn as a box plot in ./images/.

    ubiquityThreshold, singleCopyThreshold -- scaled by the number of genomes
        before being handed to img.markerGenes
    minGenomes, mostSpecificRank -- passed to img.lineagesByCriteria
    minMarkers -- lineages whose full marker set is smaller are skipped
    """
    print('Ubiquity threshold: ' + str(ubiquityThreshold))
    print('Single-copy threshold: ' + str(singleCopyThreshold))
    print('Min. genomes: ' + str(minGenomes))
    print('Most specific taxonomic rank: ' + str(mostSpecificRank))

    img = IMG()

    deltaMarkerSetSizes = []

    lineages = img.lineagesByCriteria(minGenomes, mostSpecificRank)
    lineages = ['prokaryotes'] + lineages

    # the trusted genome list does not depend on the lineage: fetch it once
    trusted = img.trustedGenomes()

    boxPlotLabels = []
    for lineage in lineages:
        genomeIds = img.genomeIdsByTaxonomy(lineage)
        genomeIds = list(genomeIds.intersection(trusted))

        print('')
        print('Lineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # get table of PFAMs and do some initial filtering to remove PFAMs that are
        # clearly not going to pass the ubiquity and single-copy thresholds
        pfamTable = img.pfamTable(genomeIds)
        pfamTable = img.filterPfamTable(genomeIds, pfamTable, ubiquityThreshold * 0.9, singleCopyThreshold * 0.9)

        # the full set uses the thresholds of an (N-1)-genome subset so that
        # it is comparable with the leave-one-out sets below
        numGenomes = len(genomeIds) - 1
        markerSet = img.markerGenes(genomeIds, pfamTable,
                                    ubiquityThreshold * numGenomes,
                                    singleCopyThreshold * numGenomes)
        fullMarkerSetSize = len(markerSet)

        if fullMarkerSetSize < minMarkers:
            continue

        boxPlotLabels.append(lineage.split(';')[-1].strip() + ' (' + str(len(genomeIds)) + ', ' + str(fullMarkerSetSize) + ')')

        deltaMarkerSetSize = []
        for loo in range(len(genomeIds)):
            # slicing past the end yields an empty list, so the last index
            # needs no special case
            genomeIdSubset = genomeIds[:loo] + genomeIds[loo + 1:]

            looMarkerSet = img.markerGenes(genomeIdSubset, pfamTable,
                                           ubiquityThreshold * len(genomeIdSubset),
                                           singleCopyThreshold * len(genomeIdSubset))
            deltaMarkerSetSize.append(fullMarkerSetSize - len(looMarkerSet))

            if fullMarkerSetSize < len(looMarkerSet):
                print('[Warning] Unexpected!')

        deltaMarkerSetSizes.append(deltaMarkerSetSize)

        m = mean(deltaMarkerSetSize)
        s = std(deltaMarkerSetSize)

        print(' LOO Ubiquity >= ' + str(int(ubiquityThreshold * numGenomes)) + ', LOO Single-copy >= ' + str(int(singleCopyThreshold * numGenomes)))
        print(' Delta Mean: %.2f +/- %.2f' % (m, s))
        print(' Delta Min: %d, Delta Max: %d' % (min(deltaMarkerSetSize), max(deltaMarkerSetSize)))

    # plot data
    boxPlot = BoxPlot()
    plotFilename = './images/LOO.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold) + '.boxplot.png'
    title = 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold
    boxPlot.plot(plotFilename, deltaMarkerSetSizes, boxPlotLabels, r'$\Delta$' + ' Marker Set Size', '', False, title)
def run(self, taxonomyStr, mostSpecificRank, minGenomes, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
    """Compare completion estimates from marker sets of nested lineages.

    A co-located marker set is built for every prefix of taxonomyStr (the
    first rank, the first two ranks, and so on). Genomes of the full lineage
    are then sampled numReplicates times each, and the error of every marker
    set's completion estimate (estimate minus percentCompletion) is drawn as
    a box plot written to ./images/.

    taxonomyStr -- semicolon-separated lineage string
    mostSpecificRank, minGenomes -- accepted but not used by this method
    ubiquityThreshold, singleCopyThreshold -- multiplied by the number of
        genomes before being handed to img.markerGenes
    percentCompletion -- passed to img.sampleGenome and subtracted from each
        estimate (NOTE(review): looks like a fraction in [0, 1] -- confirm)
    numReplicates -- number of partial genomes sampled per genome
    numGenomes -- number of genomes to sample at random; -1 uses all of them
    contigLen -- passed to img.sampleGenome and img.containedMarkerGenes
    """
    img = IMG()

    # one lineage per taxonomic prefix, from least to most specific
    taxon = taxonomyStr.split(';')
    lineages = [';'.join(taxon[0:r + 1]) for r in range(len(taxon))]

    # get all marker sets
    markerGenes = []
    geneDistTable = []
    colocatedSets = []
    for lineage in lineages:
        genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
        print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes.')

        # build marker genes and colocated marker sets
        countTable = img.countTable(genomeIds)
        mg = img.markerGenes(genomeIds, countTable,
                             ubiquityThreshold * len(genomeIds),
                             singleCopyThreshold * len(genomeIds))
        print(' Marker genes: ' + str(len(mg)))

        mdt = img.geneDistTable(genomeIds, mg, spacingBetweenContigs=1e6)
        colocatedGenes = img.colocatedGenes(mdt)
        cs = img.colocatedSets(colocatedGenes, mg)
        print(' Co-located gene sets: ' + str(len(cs)))

        markerGenes.append(mg)
        geneDistTable.append(mdt)
        colocatedSets.append(cs)

    # random sample genomes; genomeIds still refers to the last, most
    # specific lineage processed above
    if numGenomes == -1:
        rndGenomeIds = genomeIds
    else:
        # random.sample() rejects sets on recent Python versions, so hand it
        # a sequence (genomeIds presumably is a set -- verify against IMG)
        rndGenomeIds = random.sample(list(genomeIds), numGenomes)

    # estimate completion for each genome using the marker set of each lineage
    metadata = img.genomeMetadata('Final')
    plotLabels = []
    plotData = []
    for genomeId in rndGenomeIds:
        completion = [[] for _ in range(len(lineages))]
        for _ in range(numReplicates):
            startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen)

            # calculate completion with marker set
            for i in range(len(lineages)):
                containedMarkerGenes = img.containedMarkerGenes(markerGenes[i], geneDistTable[i][genomeId],
                                                                startPartialGenomeContigs, contigLen)

                # each co-located set contributes the fraction of its genes
                # that were recovered
                comp = 0.0
                for geneSet in colocatedSets[i]:
                    present = sum(1 for geneId in geneSet if geneId in containedMarkerGenes)
                    comp += float(present) / len(geneSet)

                completion[i].append(comp / len(colocatedSets[i]) - percentCompletion)

        # exactly one label per plotted row, so labels and data stay aligned
        # no matter how many replicates were run
        for i, d in enumerate(completion):
            plotLabels.append(genomeId + ' - ' + lineages[i])
            plotData.append(d)

    # plot data
    boxPlot = BoxPlot()
    plotFilename = './images/sim.lineages.' + taxonomyStr.replace(';', '_') + '.' + str(percentCompletion) + '.errorbar.png'
    title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
    boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)