Example #1
    def removeOutliers(self, binFile, outlierFile, outputFile):
        """Remove sequences specified as outliers in the provided file."""

        binSeqs = readFasta(binFile)
        binIdToModify = binIdFromFilename(binFile)

        # get files to remove
        checkFileExists(outlierFile)
        seqsToRemove = []
        bHeader = True
        for line in open(outlierFile):
            if bHeader:
                bHeader = False
                continue

            lineSplit = line.split('\t')
            binId = lineSplit[0]

            if binId == binIdToModify:
                seqId = lineSplit[1]
                seqsToRemove.append(seqId)

        # remove sequences from bin
        if len(seqsToRemove) > 0:
            self.__removeSeqs(binSeqs, seqsToRemove)

        # save modified bin
        writeFasta(binSeqs, outputFile)
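The helpers used throughout these examples (readFasta, writeFasta, checkFileExists, binIdFromFilename) come from CheckM's utility modules and are not shown here. A minimal sketch of how the two FASTA helpers are assumed to behave, a plain dict mapping sequence IDs to sequences, might look like this (an illustration, not CheckM's implementation):

def readFasta(fastaFile):
    """Assumed behaviour: return {seqId: sequence} for a FASTA file."""
    seqs = {}
    seqId = None
    for line in open(fastaFile):
        line = line.rstrip()
        if line.startswith('>'):
            seqId = line[1:].split()[0]
            seqs[seqId] = ''
        elif seqId is not None:
            seqs[seqId] += line
    return seqs

def writeFasta(seqs, outputFile):
    """Assumed behaviour: write a {seqId: sequence} dict back to FASTA."""
    with open(outputFile, 'w') as fout:
        for seqId, seq in seqs.items():
            fout.write('>%s\n%s\n' % (seqId, seq))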
Example #2
    def run(self, outputDir):
        # make sure output directory exists
        if not os.path.exists(outputDir):
            os.mkdir(outputDir)

        # remove similar taxa
        print 'Filtering out highly similar taxa in order to reduce size of tree:'
        seqs = readFasta(self.derepConcatenatedAlignFile)

        nearlyIdentical = self.__nearlyIdenticalGenomes(seqs, outputDir)

        reducedSeqs = {}
        for s in nearlyIdentical:
            rndGenome = random.choice(tuple(s))
            reducedSeqs[rndGenome] = seqs[rndGenome]

        # write out reduced alignment
        reducedAlignmentFile = os.path.join(outputDir, "genome_tree.fasta")
        writeFasta(reducedSeqs, reducedAlignmentFile)

        # prune tree to retained taxa
        print ''
        print 'Pruning tree:'
        tree = dendropy.Tree.get_from_path(self.tree, schema='newick', as_rooted=False, preserve_underscores=True)

        for seqId in reducedSeqs:
            node = tree.find_node_with_taxon_label(seqId)
            if not node:
                print 'Missing taxa: %s' % seqId

        tree.retain_taxa_with_labels(reducedSeqs.keys())

        outputTree = os.path.join(outputDir, 'genome_tree.tre')
        tree.write_to_path(outputTree, schema='newick', suppress_rooting=True, unquoted_underscores=True)

        for t in tree.internal_nodes():
            t.label = None

        for t in tree.leaf_nodes():
            if t.taxon.label not in reducedSeqs:
                print 'missing in sequence file: %s' % t.taxon.label

        outputTreeWithoutLabels = os.path.join(outputDir, 'genome_tree.small.no_internal_labels.tre')
        tree.write_to_path(outputTreeWithoutLabels, schema='newick', suppress_rooting=True, unquoted_underscores=True)
        print '  Pruned tree written to: %s' % outputTree

        # calculate model parameters for pruned tree
        print ''
        print 'Determining model parameters for new tree.'
        outputTreeLog = os.path.join(outputDir, 'genome_tree.log')
        fastTreeOutput = os.path.join(outputDir, 'genome_tree.no_internal_labels.fasttree.tre')
        # os.system('FastTreeMP -nome -mllen -intree %s -log %s < %s > %s' % (outputTreeWithoutLabels, outputTreeLog, reducedAlignmentFile, fastTreeOutput))

        # calculate reference package for pruned tree
        print ''
        print 'Creating reference package.'
        os.system('taxit create -l %s -P %s --aln-fasta %s --tree-stats %s --tree-file %s' % ('genome_tree_reduced', os.path.join(outputDir, 'genome_tree_reduced.refpkg'), reducedAlignmentFile, outputTreeLog, outputTree))
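The representative-selection step above keeps one genome from each cluster of nearly identical genomes. A toy illustration of that step in isolation (hypothetical labels, not CheckM data):

import random

# each set groups genomes considered nearly identical to one another
nearlyIdentical = [{'genomeA', 'genomeB'}, {'genomeC'}, {'genomeD', 'genomeE'}]
seqs = {gid: 'ACGT----ACGT' for gid in
        ['genomeA', 'genomeB', 'genomeC', 'genomeD', 'genomeE']}

reducedSeqs = {}
for cluster in nearlyIdentical:
    representative = random.choice(tuple(cluster))  # keep one genome per cluster
    reducedSeqs[representative] = seqs[representative]

print(len(reducedSeqs))  # 3 representatives retained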
Example #3
    def __createConcatenatedAlignment(self, binFiles, resultsParser,
                                      alignOutputDir):
        """Create a concatenated alignment of marker genes for each bin."""

        # read alignment files
        self.logger.info('  Reading marker alignment files.')
        alignments = defaultdict(dict)
        files = os.listdir(alignOutputDir)
        binIds = set()
        for f in files:
            if f.endswith('.masked.faa'):
                markerId = f[0:f.find('.masked.faa')]
                seqs = readFasta(os.path.join(alignOutputDir, f))

                for seqId, seq in seqs.items():
                    binId = seqId[0:seqId.find(DefaultValues.SEQ_CONCAT_CHAR)]

                    alignments[markerId][binId] = seq
                    binIds.add(binId)

        # get all markers and their lengths
        markerIds = list(resultsParser.models[list(
            resultsParser.models.keys())[0]].keys())
        markerIdLens = {}
        for markerId in markerIds:
            markerIdLens[markerId] = resultsParser.models[list(
                resultsParser.models.keys())[0]][markerId].leng

        # create concatenated alignment
        self.logger.info('  Concatenating alignments.')
        concatenatedSeqs = {}
        for markerId in sorted(markerIds):
            seqs = alignments[markerId]

            for binId in binIds:
                if binId in seqs:
                    # append alignment
                    concatenatedSeqs[binId] = concatenatedSeqs.get(
                        binId, '') + seqs[binId]
                else:
                    # missing gene
                    concatenatedSeqs[binId] = concatenatedSeqs.get(
                        binId, '') + '-' * markerIdLens[markerId]

        # save concatenated alignment
        concatenatedAlignFile = os.path.join(
            alignOutputDir, DefaultValues.PPLACER_CONCAT_SEQ_OUT)
        writeFasta(concatenatedSeqs, concatenatedAlignFile)

        return concatenatedAlignFile
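The core of the concatenation is gap-padding: a bin that lacks a marker contributes a run of '-' characters of that marker's alignment length, so every concatenated sequence ends up the same length. A self-contained sketch of that idea with made-up marker lengths and bins:

# hypothetical inputs: per-marker alignments and marker lengths
alignments = {
    'marker1': {'bin1': 'MKL', 'bin2': 'MRL'},
    'marker2': {'bin1': 'AC'},                # bin2 is missing this marker
}
markerIdLens = {'marker1': 3, 'marker2': 2}
binIds = {'bin1', 'bin2'}

concatenatedSeqs = {}
for markerId in sorted(alignments):
    seqs = alignments[markerId]
    for binId in binIds:
        fragment = seqs.get(binId, '-' * markerIdLens[markerId])
        concatenatedSeqs[binId] = concatenatedSeqs.get(binId, '') + fragment

print(concatenatedSeqs)  # {'bin1': 'MKLAC', 'bin2': 'MRL--'}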
Example #4
    def modify(self, binFile, seqFile, seqsToAdd, seqsToRemove, outputFile):
        """Add and remove sequences from a file."""
        binSeqs = readFasta(binFile)

        # add sequences to bin
        if seqsToAdd is not None:
            refSeqs = readFasta(seqFile)
            self.__addSeqs(binSeqs, refSeqs, seqsToAdd)

        # remove sequences from bin
        if seqsToRemove is not None:
            self.__removeSeqs(binSeqs, seqsToRemove)

        # save modified bin
        writeFasta(binSeqs, outputFile)
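__addSeqs and __removeSeqs are private helpers of the same class and are not part of this example. Under the assumption that bins are plain {seqId: sequence} dicts (as the readFasta/writeFasta usage suggests), minimal hypothetical stand-ins could look like this:

def addSeqs(binSeqs, refSeqs, seqsToAdd):
    # hypothetical stand-in: copy the requested sequences from the reference file
    for seqId in seqsToAdd:
        if seqId in refSeqs:
            binSeqs[seqId] = refSeqs[seqId]

def removeSeqs(binSeqs, seqsToRemove):
    # hypothetical stand-in: drop the requested sequences if present
    for seqId in seqsToRemove:
        binSeqs.pop(seqId, None)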
Example #5
    def __createConcatenatedAlignment(self, binFiles, resultsParser, alignOutputDir):
        """Create a concatenated alignment of marker genes for each bin."""

        # read alignment files
        self.logger.info('  Reading marker alignment files.')
        alignments = defaultdict(dict)
        files = os.listdir(alignOutputDir)
        binIds = set()
        for f in files:
            if f.endswith('.masked.faa'):
                markerId = f[0:f.find('.masked.faa')]
                seqs = readFasta(os.path.join(alignOutputDir, f))

                for seqId, seq in seqs.iteritems():
                    binId = seqId[0:seqId.find(DefaultValues.SEQ_CONCAT_CHAR)]

                    alignments[markerId][binId] = seq
                    binIds.add(binId)

        # get all markers and their lengths
        markerIds = resultsParser.models[resultsParser.models.keys()[0]].keys()
        markerIdLens = {}
        for markerId in markerIds:
            markerIdLens[markerId] = resultsParser.models[resultsParser.models.keys()[0]][markerId].leng

        # create concatenated alignment
        self.logger.info('  Concatenating alignments.')
        concatenatedSeqs = {}
        for markerId in sorted(markerIds):
            seqs = alignments[markerId]

            for binId in binIds:
                if binId in seqs:
                    # append alignment
                    concatenatedSeqs[binId] = concatenatedSeqs.get(binId, '') + seqs[binId]
                else:
                    # missing gene
                    concatenatedSeqs[binId] = concatenatedSeqs.get(binId, '') + '-' * markerIdLens[markerId]

        # save concatenated alignment
        concatenatedAlignFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_CONCAT_SEQ_OUT)
        writeFasta(concatenatedSeqs, concatenatedAlignFile)

        return concatenatedAlignFile
Example #6
    def run(self, query, bNucORFs=True):
    
        prodigal_input = query

        # decompress gzip input files by rewriting them to a temporary FASTA file
        # (readFasta is assumed to transparently handle gzipped input)
        if prodigal_input.endswith('.gz'):
            tmp_dir = tempfile.mkdtemp()
            seqs = readFasta(prodigal_input)
            prodigal_input = os.path.join(tmp_dir, os.path.basename(prodigal_input[0:-3]) + '.fna')
            writeFasta(seqs, prodigal_input)
            
        # gather statistics about query file
        seqs = readFasta(prodigal_input)
        totalBases = 0
        for seqId, seq in seqs.items():
            totalBases += len(seq)

        # call ORFs with different translation tables and select the one with the highest coding density
        tableCodingDensity = {}
        for translationTable in [4, 11]:
            aaGeneFile = self.aaGeneFile + '.' + str(translationTable)
            ntGeneFile = self.ntGeneFile + '.' + str(translationTable)
            gffFile = self.gffFile + '.' + str(translationTable)

            # check if there is sufficient bases to calculate prodigal parameters
            if totalBases < 100000:
                procedureStr = 'meta'  # use best precalculated parameters
            else:
                procedureStr = 'single'  # estimate parameters from data

            if bNucORFs:
                cmd = ('prodigal -p %s -q -m -f gff -g %d -a %s -d %s -i %s > %s 2> /dev/null' % (procedureStr, 
                                                                                                    translationTable, 
                                                                                                    aaGeneFile, 
                                                                                                    ntGeneFile, 
                                                                                                    prodigal_input, 
                                                                                                    gffFile))
            else:
                cmd = ('prodigal -p %s -q -m -f gff -g %d -a %s -i %s > %s 2> /dev/null' % (procedureStr, 
                                                                                            translationTable, 
                                                                                            aaGeneFile, 
                                                                                            prodigal_input, 
                                                                                            gffFile))

            os.system(cmd)

            if not self.__areORFsCalled(aaGeneFile) and procedureStr == 'single':
                # prodigal will fail to learn a model if the input genome has a large number of N's
                # so try gene prediction with 'meta'
                cmd = cmd.replace('-p single', '-p meta')
                os.system(cmd)

            # determine coding density
            prodigalParser = ProdigalGeneFeatureParser(gffFile)

            codingBases = 0
            for seqId, seq in seqs.items():
                codingBases += prodigalParser.codingBases(seqId)

            if totalBases != 0:
                codingDensity = float(codingBases) / totalBases
            else:
                codingDensity = 0
            tableCodingDensity[translationTable] = codingDensity

        # determine best translation table
        bestTranslationTable = 11
        if (tableCodingDensity[4] - tableCodingDensity[11] > 0.05) and tableCodingDensity[4] > 0.7:
            bestTranslationTable = 4

        shutil.copyfile(self.aaGeneFile + '.' + str(bestTranslationTable), self.aaGeneFile)
        shutil.copyfile(self.gffFile + '.' + str(bestTranslationTable), self.gffFile)
        if bNucORFs:
            shutil.copyfile(self.ntGeneFile + '.' + str(bestTranslationTable), self.ntGeneFile)

        # clean up redundant prodigal results
        for translationTable in [4, 11]:
            os.remove(self.aaGeneFile + '.' + str(translationTable))
            os.remove(self.gffFile + '.' + str(translationTable))
            if bNucORFs:
                os.remove(self.ntGeneFile + '.' + str(translationTable))
                
        # prodigal_input now points at the temporary file, so test the original query path
        if query.endswith('.gz'):
            shutil.rmtree(tmp_dir)

        return bestTranslationTable
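The choice between translation tables 4 and 11 is driven purely by coding density. Restated as a small function (same thresholds as the example above; the helper name is ours):

def selectTranslationTable(tableCodingDensity):
    """Prefer table 4 only when it is clearly denser than table 11 and reasonably high."""
    if (tableCodingDensity[4] - tableCodingDensity[11] > 0.05) and tableCodingDensity[4] > 0.7:
        return 4
    return 11

# e.g. selectTranslationTable({4: 0.82, 11: 0.70}) -> 4
#      selectTranslationTable({4: 0.72, 11: 0.70}) -> 11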
Example #7
    def run(self,
            geneTreeDir,
            alignmentDir,
            extension,
            outputAlignFile,
            outputTree,
            outputTaxonomy,
            bSupportValues=False):
        # read gene trees
        print 'Reading gene trees.'
        geneIds = set()
        files = os.listdir(geneTreeDir)
        for f in files:
            if f.endswith('.tre'):
                geneId = f[0:f.find('.')]
                geneIds.add(geneId)

        # write out genome tree taxonomy
        print 'Reading trusted genomes.'
        img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                  '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        genomeIds = img.genomeMetadata().keys()
        self.__taxonomy(img, genomeIds, outputTaxonomy)

        print '  There are %d trusted genomes.' % (len(genomeIds))

        # get genes in genomes
        print 'Reading all PFAM and TIGRFAM hits in trusted genomes.'
        genesInGenomes = self.__genesInGenomes(genomeIds)

        # read alignment files
        print 'Reading alignment files.'
        alignments = {}
        genomeIds = set()
        files = os.listdir(alignmentDir)
        for f in files:
            geneId = f[0:f.find('.')]
            if f.endswith(extension) and geneId in geneIds:
                seqs = readFasta(os.path.join(alignmentDir, f))

                imgGeneId = geneId
                if imgGeneId.startswith('PF'):
                    imgGeneId = imgGeneId.replace('PF', 'pfam')
                seqs = self.__filterParalogs(seqs, imgGeneId, genesInGenomes)

                genomeIds.update(set(seqs.keys()))
                alignments[geneId] = seqs

        # create concatenated alignment
        print 'Concatenating alignments:'
        concatenatedSeqs = {}
        totalAlignLen = 0
        for geneId in sorted(alignments.keys()):
            seqs = alignments[geneId]
            alignLen = len(seqs[seqs.keys()[0]])
            print '  ' + str(geneId) + ',' + str(alignLen)
            totalAlignLen += alignLen
            for genomeId in genomeIds:
                if genomeId in seqs:
                    # append alignment
                    concatenatedSeqs['IMG_' + genomeId] = concatenatedSeqs.get(
                        'IMG_' + genomeId, '') + seqs[genomeId]
                else:
                    # missing gene
                    concatenatedSeqs['IMG_' + genomeId] = concatenatedSeqs.get(
                        'IMG_' + genomeId, '') + '-' * alignLen

        print '  Total alignment length: ' + str(totalAlignLen)

        # save concatenated alignment
        writeFasta(concatenatedSeqs, outputAlignFile)

        # infer genome tree
        print 'Inferring genome tree.'
        outputLog = outputTree[0:outputTree.rfind('.')] + '.log'

        supportStr = ' '
        if not bSupportValues:
            supportStr = ' -nosupport '

        cmd = 'FastTreeMP' + supportStr + '-wag -gamma -log ' + outputLog + ' ' + outputAlignFile + ' > ' + outputTree
        os.system(cmd)
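The final command is assembled by string concatenation and handed to os.system, so paths containing spaces would break it. An equivalent invocation via subprocess is sketched below (assuming the same FastTreeMP options as above; FastTree writes the tree to stdout):

import subprocess

args = ['FastTreeMP', '-wag', '-gamma', '-log', outputLog, outputAlignFile]
if not bSupportValues:
    args.insert(1, '-nosupport')

# FastTreeMP prints the tree to stdout, so redirect it into the output file
with open(outputTree, 'w') as fout:
    subprocess.call(args, stdout=fout)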
Example #8
    def run(
        self, geneTreeDir, alignmentDir, extension, outputAlignFile, outputTree, outputTaxonomy, bSupportValues=False
    ):
        # read gene trees
        print "Reading gene trees."
        geneIds = set()
        files = os.listdir(geneTreeDir)
        for f in files:
            if f.endswith(".tre"):
                geneId = f[0 : f.find(".")]
                geneIds.add(geneId)

        # write out genome tree taxonomy
        print "Reading trusted genomes."
        img = IMG("/srv/whitlam/bio/db/checkm/img/img_metadata.tsv", "/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv")
        genomeIds = img.genomeMetadata().keys()
        self.__taxonomy(img, genomeIds, outputTaxonomy)

        print "  There are %d trusted genomes." % (len(genomeIds))

        # get genes in genomes
        print "Reading all PFAM and TIGRFAM hits in trusted genomes."
        genesInGenomes = self.__genesInGenomes(genomeIds)

        # read alignment files
        print "Reading alignment files."
        alignments = {}
        genomeIds = set()
        files = os.listdir(alignmentDir)
        for f in files:
            geneId = f[0 : f.find(".")]
            if f.endswith(extension) and geneId in geneIds:
                seqs = readFasta(os.path.join(alignmentDir, f))

                imgGeneId = geneId
                if imgGeneId.startswith("PF"):
                    imgGeneId = imgGeneId.replace("PF", "pfam")
                seqs = self.__filterParalogs(seqs, imgGeneId, genesInGenomes)

                genomeIds.update(set(seqs.keys()))
                alignments[geneId] = seqs

        # create concatenated alignment
        print "Concatenating alignments:"
        concatenatedSeqs = {}
        totalAlignLen = 0
        for geneId in sorted(alignments.keys()):
            seqs = alignments[geneId]
            alignLen = len(seqs[seqs.keys()[0]])
            print "  " + str(geneId) + "," + str(alignLen)
            totalAlignLen += alignLen
            for genomeId in genomeIds:
                if genomeId in seqs:
                    # append alignment
                    concatenatedSeqs["IMG_" + genomeId] = concatenatedSeqs.get("IMG_" + genomeId, "") + seqs[genomeId]
                else:
                    # missing gene
                    concatenatedSeqs["IMG_" + genomeId] = concatenatedSeqs.get("IMG_" + genomeId, "") + "-" * alignLen

        print "  Total alignment length: " + str(totalAlignLen)

        # save concatenated alignment
        writeFasta(concatenatedSeqs, outputAlignFile)

        # infer genome tree
        print "Inferring genome tree."
        outputLog = outputTree[0 : outputTree.rfind(".")] + ".log"

        supportStr = " "
        if not bSupportValues:
            supportStr = " -nosupport "

        cmd = "FastTreeMP" + supportStr + "-wag -gamma -log " + outputLog + " " + outputAlignFile + " > " + outputTree
        os.system(cmd)