Exemplo n.º 1
0
    def __runHmmAlign(self, allTrustedGenomeIds, genesInGenomes, outputGeneDir,
                      outputModelDir, queueIn, queueOut):
        """Align the genes of each queued marker to its HMM and mask the result.

        Worker loop meant to be run concurrently with other workers: marker
        ids are consumed from queueIn until a None sentinel is received.

        Parameters
        ----------
        allTrustedGenomeIds : iterable of str
            Genome ids whose gene FASTA files are read for every marker.
        genesInGenomes : dict
            Indexed as genesInGenomes[genomeId]; each value must support
            .get(markerId, []) and yield the gene ids assigned to the marker.
        outputGeneDir : str
            Directory receiving <model>.faa, <model>.aln.faa and
            <model>.aln.masked.faa.
        outputModelDir : str
            Directory containing <model>.hmm.
        queueIn : queue
            Source of marker ids; a None item terminates the loop.
        queueOut : queue
            Receives the model name of each marker after it is processed.
        """

        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId is None:
                break

            # file names use a 'PF' prefix in place of 'pfam'
            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            alignedFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
            maskedFile = os.path.join(outputGeneDir,
                                      modelName + '.aln.masked.faa')

            # the context manager closes the file even if reading a genome
            # or looking up a gene raises
            with open(markerSeqFile, 'w') as fout:
                for genomeId in allTrustedGenomeIds:
                    seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' +
                                     genomeId + '.genes.faa')

                    for geneId in genesInGenomes[genomeId].get(markerId, []):
                        if geneId not in seqs:
                            # gene listed for the marker but absent from the
                            # sequence file; skip it rather than abort the
                            # worker with a KeyError
                            continue

                        fout.write('>' + genomeId + '|' + geneId + '\n')
                        fout.write(seqs[geneId] + '\n')

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'),
                        markerSeqFile,
                        alignedFile,
                        trim=False,
                        outputFormat='Pfam')
            self.__maskAlignment(alignedFile, maskedFile)

            queueOut.put(modelName)
Exemplo n.º 2
0
    def __alignMarker(self, markerId, binSeqs, binStats, bReportHitStats, alignOutputDir, hmmModelFile, bKeepUnmaskedAlign):
        """Write, align, and mask all sequences collected for one marker.

        Parameters
        ----------
        markerId : str
            Marker identifier; used as the base name of all files written.
        binSeqs : dict
            Maps bin id -> {sequence id -> sequence}.
        binStats : dict
            Indexed as binStats[binId][seqId]; element 0 is reported as the
            e-value and element 1 as the score. Only read when
            bReportHitStats is true.
        bReportHitStats : bool
            Append hit statistics to each FASTA header.
        alignOutputDir : str
            Directory in which the intermediate and output files are written.
        hmmModelFile : str
            HMM file passed to the aligner.
        bKeepUnmaskedAlign : bool
            Keep the unmasked alignment file instead of deleting it.
        """
        unalignSeqFile = os.path.join(alignOutputDir, markerId + '.unaligned.faa')
        numSeqs = 0

        # 'with' guarantees the handle is closed if a lookup or write fails;
        # items() works on both Python 2 and 3, unlike iteritems()
        with open(unalignSeqFile, 'w') as fout:
            for binId, seqs in binSeqs.items():
                for seqId, seq in seqs.items():
                    header = '>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqId
                    if bReportHitStats:
                        hitStats = binStats[binId][seqId]
                        header += ' [e-value=%.4g,score=%.1f]' % (hitStats[0], hitStats[1])

                    fout.write(header + '\n')
                    fout.write(seq + '\n')
                    numSeqs += 1

        # nothing to align if no bin contributed a sequence
        if numSeqs > 0:
            alignSeqFile = os.path.join(alignOutputDir, markerId + '.aligned.faa')
            HA = HMMERRunner(mode='align')
            HA.align(hmmModelFile, unalignSeqFile, alignSeqFile, writeMode='>', outputFormat=self.outputFormat, trim=False)

            maskedSeqFile = os.path.join(alignOutputDir, markerId + '.masked.faa')
            self.__maskAlignment(alignSeqFile, maskedSeqFile)

            if not bKeepUnmaskedAlign:
                os.remove(alignSeqFile)

        # the unaligned FASTA file is only an intermediate
        os.remove(unalignSeqFile)
Exemplo n.º 3
0
    def __alignMarker(self, markerId, binSeqs, binStats, bReportHitStats, alignOutputDir, hmmModelFile, bKeepUnmaskedAlign):
        """Write, align, and mask all sequences collected for one marker.

        Parameters
        ----------
        markerId : str
            Marker identifier; used as the base name of all files written.
        binSeqs : dict
            Maps bin id -> {sequence id -> sequence}.
        binStats : dict
            Indexed as binStats[binId][seqId]; element 0 is reported as the
            e-value and element 1 as the score. Only read when
            bReportHitStats is true.
        bReportHitStats : bool
            Append hit statistics to each FASTA header.
        alignOutputDir : str
            Directory in which the intermediate and output files are written.
        hmmModelFile : str
            HMM file passed to the aligner.
        bKeepUnmaskedAlign : bool
            Keep the unmasked alignment file instead of deleting it.
        """
        unalignSeqFile = os.path.join(alignOutputDir, markerId + '.unaligned.faa')
        numSeqs = 0

        # 'with' guarantees the handle is closed if a lookup or write fails
        with open(unalignSeqFile, 'w') as fout:
            for binId, seqs in binSeqs.items():
                for seqId, seq in seqs.items():
                    header = '>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqId
                    if bReportHitStats:
                        hitStats = binStats[binId][seqId]
                        header += ' [e-value=%.4g,score=%.1f]' % (hitStats[0], hitStats[1])

                    fout.write(header + '\n')
                    fout.write(seq + '\n')
                    numSeqs += 1

        # nothing to align if no bin contributed a sequence
        if numSeqs > 0:
            alignSeqFile = os.path.join(alignOutputDir, markerId + '.aligned.faa')
            HA = HMMERRunner(mode='align')
            HA.align(hmmModelFile, unalignSeqFile, alignSeqFile, writeMode='>', outputFormat=self.outputFormat, trim=False)

            maskedSeqFile = os.path.join(alignOutputDir, markerId + '.masked.faa')
            self.__maskAlignment(alignSeqFile, maskedSeqFile)

            if not bKeepUnmaskedAlign:
                os.remove(alignSeqFile)

        # the unaligned FASTA file is only an intermediate
        os.remove(unalignSeqFile)
Exemplo n.º 4
0
    def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
        """Align the genes of each queued marker to its HMM and mask the result.

        Worker loop meant to be run concurrently with other workers: marker
        ids are consumed from queueIn until a None sentinel is received.

        Parameters
        ----------
        genomeIds : iterable of str
            Genome ids whose gene FASTA files are read for every marker.
        genesInGenomes : dict
            Indexed as genesInGenomes[genomeId]; each value must support
            .get(markerId, []) and yield the gene ids assigned to the marker.
        outputGeneDir : str
            Directory receiving <model>.faa, <model>.aln.faa and
            <model>.aln.masked.faa.
        outputModelDir : str
            Directory containing <model>.hmm.
        queueIn : queue
            Source of marker ids; a None item terminates the loop.
        queueOut : queue
            Receives the model name of each marker after it is processed.
        """

        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId is None:
                break

            # file names use a 'PF' prefix in place of 'pfam'
            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            alignedFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
            maskedFile = os.path.join(outputGeneDir, modelName + '.aln.masked.faa')

            # the context manager closes the file even if reading a genome
            # or looking up its genes raises
            with open(markerSeqFile, 'w') as fout:
                for genomeId in genomeIds:
                    seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                    for geneId in genesInGenomes[genomeId].get(markerId, []):
                        if geneId not in seqs:
                            # this shouldn't be necessary, but the IMG metadata isn't always
                            # perfectly in sync with the sequence data
                            continue

                        fout.write('>' + genomeId + '|' + geneId + '\n')
                        fout.write(seqs[geneId] + '\n')

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, alignedFile, trim=False, outputFormat='Pfam')
            self.__maskAlignment(alignedFile, maskedFile)

            queueOut.put(modelName)
    def __runHmmAlign(self, allTrustedGenomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
        """Align the genes of each queued marker to its HMM and mask the result.

        Worker loop meant to be run concurrently with other workers: marker
        ids are consumed from queueIn until a None sentinel is received.

        Parameters
        ----------
        allTrustedGenomeIds : iterable of str
            Genome ids whose gene FASTA files are read for every marker.
        genesInGenomes : dict
            Indexed as genesInGenomes[genomeId]; each value must support
            .get(markerId, []) and yield the gene ids assigned to the marker.
        outputGeneDir : str
            Directory receiving <model>.faa, <model>.aln.faa and
            <model>.aln.masked.faa.
        outputModelDir : str
            Directory containing <model>.hmm.
        queueIn : queue
            Source of marker ids; a None item terminates the loop.
        queueOut : queue
            Receives the model name of each marker after it is processed.
        """

        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId is None:
                break

            # file names use a 'PF' prefix in place of 'pfam'
            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            alignedFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
            maskedFile = os.path.join(outputGeneDir, modelName + '.aln.masked.faa')

            # the context manager closes the file even if reading a genome
            # or looking up a gene raises
            with open(markerSeqFile, 'w') as fout:
                for genomeId in allTrustedGenomeIds:
                    seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                    for geneId in genesInGenomes[genomeId].get(markerId, []):
                        if geneId not in seqs:
                            # gene listed for the marker but absent from the
                            # sequence file; skip it rather than abort the
                            # worker with a KeyError
                            continue

                        fout.write('>' + genomeId + '|' + geneId + '\n')
                        fout.write(seqs[geneId] + '\n')

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, alignedFile, trim=False, outputFormat='Pfam')
            self.__maskAlignment(alignedFile, maskedFile)

            queueOut.put(modelName)
Exemplo n.º 6
0
    def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir,
                      outputModelDir, queueIn, queueOut):
        """Align the genes of each queued marker to its HMM and mask the result.

        Worker loop meant to be run concurrently with other workers: marker
        ids are consumed from queueIn until a None sentinel is received.

        Parameters
        ----------
        genomeIds : iterable of str
            Genome ids whose gene FASTA files are read for every marker.
        genesInGenomes : dict
            Indexed as genesInGenomes[genomeId]; each value must support
            .get(markerId, []) and yield the gene ids assigned to the marker.
        outputGeneDir : str
            Directory receiving <model>.faa, <model>.aln.faa and
            <model>.aln.masked.faa.
        outputModelDir : str
            Directory containing <model>.hmm.
        queueIn : queue
            Source of marker ids; a None item terminates the loop.
        queueOut : queue
            Receives the model name of each marker after it is processed.
        """

        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId is None:
                break

            # file names use a 'PF' prefix in place of 'pfam'
            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            alignedFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
            maskedFile = os.path.join(outputGeneDir,
                                      modelName + '.aln.masked.faa')

            # the context manager closes the file even if reading a genome
            # or looking up its genes raises
            with open(markerSeqFile, 'w') as fout:
                for genomeId in genomeIds:
                    seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' +
                                     genomeId + '.genes.faa')

                    for geneId in genesInGenomes[genomeId].get(markerId, []):
                        if geneId not in seqs:
                            # this shouldn't be necessary, but the IMG metadata isn't always
                            # perfectly in sync with the sequence data
                            continue

                        fout.write('>' + genomeId + '|' + geneId + '\n')
                        fout.write(seqs[geneId] + '\n')

            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'),
                        markerSeqFile,
                        alignedFile,
                        trim=False,
                        outputFormat='Pfam')
            self.__maskAlignment(alignedFile, maskedFile)

            queueOut.put(modelName)