def run(self):
    """Evaluate PFAM and TIGRFAM HMM hits against gene calls for all finished IMG genomes.

    Runs the PFAM and TIGRFAM HMMs on each finished genome and writes a
    per-genome comparison of marker-gene results to
    ./data/evaluate_hmms_with_prodigal.txt.
    """
    img = IMG()

    # line buffering (buffering=1) so progress is visible during long HMM runs
    fout = open('./data/evaluate_hmms_with_prodigal.txt', 'w', 1)

    # get list of all marker genes
    markerset = MarkerSet()
    pfamMarkers, tigrMarkers = markerset.getCalculatedMarkerGenes()

    # BUG FIX: the two labels previously had their counts swapped
    # (PFAM line printed len(tigrMarkers) and vice versa)
    print('PFAM marker genes: ' + str(len(pfamMarkers)))
    print('TIGR marker genes: ' + str(len(tigrMarkers)))
    print('')

    # run HMMs on each of the finished genomes
    genomeIds = img.genomeIds('Finished')
    for genomeId in genomeIds:
        print(genomeId + ':')
        fout.write(genomeId + ':\n')

        self.runPFAM(genomeId)
        self.runTIGRFAM(genomeId)

        fout.write(' ORF results:\n')
        self.compareResults(genomeId, pfamMarkers, tigrMarkers, fout)

        # six-frame translation evaluation is currently disabled
        #self.translateSixFrames(genomeId)
        #self.runPFAM_SixFrames(genomeId)
        #self.runTIGRFAM_SixFrames(genomeId)
        #fout.write(' Six-frame translation results:\n')
        #self.compareSixFrameResults(genomeId, pfamMarkers, tigrMarkers, fout)

    fout.close()
def run(self, outputFile):
    """Tabulate gene copy number for all valid IMG prokaryotic genomes.

    Writes the full gene count table (stringified) to outputFile and prints
    summary statistics for pfam00318 to stdout.

    Parameters:
        outputFile: path to write the gene count dictionary to.
    """
    img = IMG()

    print('Identifying all IMG prokaryotic genomes with valid data.')
    metadata = img.genomeMetadata()
    genomeIds = img.genomeIdsByTaxonomy('prokaryotes', metadata)
    genomeMissingData = img.genomesWithMissingData(genomeIds)
    genomeIds -= genomeMissingData
    print(' Identified %d valid genomes.' % (len(genomeIds)))

    print('Calculating gene copy number for each genome.')
    countTable = img.geneCountTable(genomeIds)

    # BUG FIX: dict.iteritems() was removed in Python 3 (which the rest of
    # this file targets via print()); the key was unused, so take values()
    counts = list(countTable['pfam00318'].values())

    print(len(genomeIds))
    print(len(counts))
    print(mean(counts))  # mean() imported at module level — presumably numpy; confirm

    fout = open(outputFile, 'w')
    fout.write(str(countTable))
    fout.close()

    print('Gene count dictionary to: ' + outputFile)
def __init__(self, outputDir):
    """Set up the genome-tree workflow: verify prerequisites and lay out the
    output directory structure and file paths used by subsequent steps.

    Parameters:
        outputDir: directory to create; must not already exist.
    """
    self.img = IMG()
    self.markerSetBuilder = MarkerSetBuilder()

    # refuse to clobber results from a previous run
    if os.path.exists(outputDir):
        # BUG FIX: this was a Python 2 print statement — a syntax error under
        # Python 3, which the rest of this file targets via print() calls
        print('[Error] Output directory already exists: ' + outputDir)
        sys.exit(0)  # NOTE(review): exits 0 despite the error; consider sys.exit(1)
    else:
        os.makedirs(outputDir)

    # required external tools must be available before any work begins
    self.__checkForHMMER()
    self.__checkForFastTree()

    # output directories
    self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
    self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
    self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
    self.conspecificGeneTreeDir = os.path.join(outputDir, 'gene_trees_conspecific')
    self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')

    # output files
    self.consistencyOut = os.path.join(outputDir, 'genome_tree.consistency.tsv')
    self.concatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.faa')
    self.derepConcatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.derep.fasta')
    self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
    self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')
    self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
    self.treeTaxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tre')
    self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
    self.treeDerepRootedOut = os.path.join(outputDir, 'genome_tree.derep.rooted.tre')
    self.treeDerepBootstrapOut = os.path.join(outputDir, 'genome_tree.derep.bs.tre')
    self.treeDerepFinalOut = os.path.join(outputDir, 'genome_tree.final.tre')
    self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
    self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
    self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
    self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

    # marker-gene selection and tree-consistency thresholds
    self.phyloUbiquity = 0.90
    self.phyloSingleCopy = 0.90
    self.paralogAcceptPer = 0.01
    # self.consistencyAcceptPer = 0.95  # for trees at the class-level
    self.consistencyAcceptPer = 0.906  # for trees at the phylum-level
    self.consistencyMinTaxa = 20

    # create output directories
    os.makedirs(self.hmmDir)
    os.makedirs(self.alignmentDir)
    os.makedirs(self.geneTreeDir)
    os.makedirs(self.conspecificGeneTreeDir)
    os.makedirs(self.finalGeneTreeDir)
def __init__(self):
    """Set up helpers and tuning-simulation parameters."""
    # data-access and marker-set construction helpers
    self.markerSetBuilder = MarkerSetBuilder()
    self.img = IMG()

    # genus-level summary produced by the tuning simulations
    self.simFile = './experiments/simulation.tuning.genus.summary.tsv'

    # taxonomic rank for leave-one-out evaluation
    # (presumably 5 == genus, matching simFile — TODO confirm)
    self.looRank = 5
def __init__(self):
    """Set up helpers and the simulated contig length."""
    # data-access and marker-set construction helpers
    self.markerSetBuilder = MarkerSetBuilder()
    self.img = IMG()

    # length (bp) of simulated contigs
    self.simContigLen = 10000
def __init__(self):
    """Hold the IMG data-access and marker-set helpers used by this class."""
    self.img = IMG()
    self.markerset = MarkerSet()
def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination):
    """Partition IMG genomes into trusted and filtered sets using marker-gene checks.

    For each domain, builds a marker set from its finished genomes, then
    estimates completeness/contamination for every genome in that domain.
    Genomes meeting the trusted thresholds have their metadata copied to
    outputMetadataFile; all results, per-lineage marker sets, and lineage
    statistics are written under outputDir.

    Parameters:
        inputMetadataFile: tab-separated IMG metadata (genome id, domain, status, ...).
        outputMetadataFile: metadata rows for trusted genomes (plus header).
        outputDir: directory receiving the TSV reports and marker-set files.
        ubiquityThreshold, singleCopyThreshold: thresholds for marker set construction.
        trustedCompleteness, trustedContamination: cutoffs defining a trusted genome.
    """
    img = IMG()
    markerSetBuilder = MarkerSetBuilder()

    # per-genome reports: all genomes, trusted only, filtered only
    allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
    allOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
    trustedOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w')
    filteredOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    metadataOut = open(outputMetadataFile, 'w')

    # read input metadata file
    metadata = img.genomeMetadataFromFile(inputMetadataFile)
    finishedGenomes = defaultdict(set)  # domain -> ids of 'Finished' genomes
    allGenomes = defaultdict(set)       # domain -> ids of all genomes
    metadataLine = {}                   # genome id -> raw metadata line (for copy-through)

    bHeader = True
    for line in open(inputMetadataFile):
        if bHeader:
            # header row is copied straight to the output metadata file
            metadataOut.write(line)
            bHeader = False
            continue

        lineSplit = line.split('\t')
        genomeId = lineSplit[0]
        domain = lineSplit[1]
        status = lineSplit[2]

        if status == 'Finished':
            finishedGenomes[domain].add(genomeId)

        allGenomes[domain].add(genomeId)
        metadataLine[genomeId] = line

    allTrustedGenomeIds = set()
    for lineage, allLineageGenomeIds in allGenomes.items():
        print('[' + lineage + ']')
        print(' Number of genomes: %d' % len(allLineageGenomeIds))

        # tabulate genomes from each phylum
        allPhylumCounts = {}
        for genomeId in allLineageGenomeIds:
            # taxonomy[1] is used as the phylum-level taxon here
            taxon = metadata[genomeId]['taxonomy'][1]
            allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

        # identify marker genes for finished genomes
        print('\nDetermining initial marker gene sets for genome filtering.')
        markerSet = markerSetBuilder.buildMarkerSet(finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold)
        print(' Marker set consists of %s marker genes organized into %d sets.' % (markerSet.numMarkers(), markerSet.numSets()))

        # record the marker set used for this lineage
        fout = open(os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt'), 'w')
        fout.write(str(markerSet.markerSet))
        fout.close()

        # identifying trusted genomes (highly complete, low contamination genomes)
        print('\nIdentifying highly complete, low contamination genomes.')
        trustedGenomeIds = set()
        filteredGenomes = set()
        retainedStatus = {}  # sequencing status -> count among trusted genomes
        filteredStatus = {}  # sequencing status -> count among filtered genomes

        geneCountTable = img.geneCountTable(allLineageGenomeIds)
        for genomeId in allLineageGenomeIds:
            completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(markerSet.markerSet, genomeId, geneCountTable)

            genomeStr = self.__genomeString(genomeId, metadata, completeness, contamination, missingMarkers, duplicateMarkers)

            if completeness >= trustedCompleteness and contamination <= trustedContamination:
                trustedGenomeIds.add(genomeId)
                allTrustedGenomeIds.add(genomeId)

                retainedStatus[metadata[genomeId]['status']] = retainedStatus.get(metadata[genomeId]['status'], 0) + 1

                trustedOut.write(genomeStr)
                allOut.write(genomeStr)

                # only trusted genomes propagate into the output metadata file
                metadataOut.write(metadataLine[genomeId])
            else:
                filteredGenomes.add(genomeId)

                filteredStatus[metadata[genomeId]['status']] = filteredStatus.get(metadata[genomeId]['status'], 0) + 1

                filteredOut.write(genomeStr)
                allOut.write(genomeStr)

        print(' Filtered genomes: %d (%.2f%%)' % (len(filteredGenomes), len(filteredGenomes) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(filteredStatus))
        print(' \nTrusted genomes: %d (%.2f%%)' % (len(trustedGenomeIds), len(trustedGenomeIds) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(retainedStatus))

        # determine status of retained genomes
        print('\nTrusted genomes by phylum:')
        trustedPhylumCounts = {}
        for genomeId in trustedGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

        for phylum, count in allPhylumCounts.items():
            print(' ' + phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count))
        print('')

    allOut.close()
    trustedOut.close()
    filteredOut.close()
    metadataOut.close()

    # write out lineage statistics for genome distribution
    allStats = {}
    trustedStats = {}
    for r in range(0, 6):  # Domain to Genus
        for genomeId, data in metadata.items():
            taxaStr = ';'.join(data['taxonomy'][0:r + 1])
            allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
            if genomeId in allTrustedGenomeIds:
                trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

    sortedLineages = img.lineagesSorted(metadata)

    fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
    fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
    for lineage in sortedLineages:
        fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' + str(trustedStats.get(lineage, 0)) + '\n')
    fout.close()
def __init__(self):
    """Keep an IMG data-access helper for the class's methods."""
    self.img = IMG()
def __init__(self):
    """Create the marker-set construction and IMG data-access helpers."""
    self.markerSetBuilder = MarkerSetBuilder()
    self.img = IMG()
def run(self, metadataFile, percentThreshold): img = IMG() metadata = img.genomeMetadataFromFile(metadataFile) matches = {} pfamCount = {} tigrCount = {} for genomeCounter, genomeId in enumerate(metadata): statusStr = ' Finished processing %d of %d (%.2f%%) genomes.' % (genomeCounter+1, len(metadata), float(genomeCounter+1)*100/len(metadata)) sys.stdout.write('%s\r' % statusStr) sys.stdout.flush() if metadata[genomeId]['status'] == 'Finished': pfamFile = img.genomeDir + genomeId + '/' + genomeId + img.pfamExtension if not os.path.exists(pfamFile): continue # get PFAM hits geneIdToPfams = {} bHeader = True for line in open(pfamFile): if bHeader: bHeader = False continue lineSplit = line.split('\t') if lineSplit[0] in geneIdToPfams: geneIdToPfams[lineSplit[0]].add(lineSplit[8]) else: geneIdToPfams[lineSplit[0]] = set([lineSplit[8]]) if lineSplit[8] in pfamCount: pfamCount[lineSplit[8]].add(genomeId) else: pfamCount[lineSplit[8]] = set([genomeId]) # get TIGRFAM hits geneIdToTigr = {} bHeader = True for line in open(img.genomeDir + genomeId + '/' + genomeId + img.tigrExtension): if bHeader: bHeader = False continue lineSplit = line.split('\t') if lineSplit[0] in geneIdToTigr: geneIdToTigr[lineSplit[0]].add(lineSplit[6]) else: geneIdToTigr[lineSplit[0]] = set([lineSplit[6]]) if lineSplit[6] in tigrCount: tigrCount[lineSplit[6]].add(genomeId) else: tigrCount[lineSplit[6]] = set([genomeId]) # keep track of TIGRFAMs matching the same gene as a PFAM geneIds = set(geneIdToPfams.keys()).union(set(geneIdToTigr.keys())) for geneId in geneIds: pfams = geneIdToPfams.get(geneId, None) tigrs = geneIdToTigr.get(geneId, None) if pfams == None or tigrs == None: continue for pfamId in pfams: for tigrId in tigrs: key = pfamId + '-' + tigrId if key in matches: matches[key].add(genomeId) else: matches[key] = set([genomeId]) sys.stdout.write('\n') # find TIGRFAMs that generally hit the same gene as a PFAM fout = open('../data/pfam/tigrfam2pfam.tsv', 'w') for key, genomeSet in 
matches.items(): pfam, tigr = key.split('-') # deem a TIGRFAM HMM redundant if it is almost always hits that # same ORF as a PFAM HMM if float(len(genomeSet)) / len(tigrCount[tigr]) >= percentThreshold: fout.write(pfam + '\t' + tigr + '\n') fout.close()