Пример #1
0
def dnaToProt(inFastaDna, outFastaProt, translTable=11):
    """
        Translates DNA gene! sequences to PROT sequences.

        Every sequence of the input file is translated as a complete coding sequence (cds=True) and
        written to the output file as a two-line fasta entry ('>name' line followed by the protein line).
        An exception raised while reading or translating (e.g. by the translation of a sequence that is
        not a valid CDS) is propagated to the caller; the output buffer is closed in any case.

        @param inFastaDna: input fasta file containing DNA sequences
        @param outFastaProt: output fasta file containing sequences translated to protein sequences
        @param translTable: default 11 for bacteria and archaea (http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi)
    """
    out = OutFileBuffer(outFastaProt)
    try:
        for seqName, seqDna in fastaFileToDictWholeNames(inFastaDna).iteritems():
            # stop_symbol='' keeps stop codons out of the written protein sequence
            seqProt = Seq(seqDna, generic_dna).translate(table=translTable, stop_symbol='', cds=True)
            out.writeText('>%s\n%s\n' % (seqName, seqProt))
    finally:
        # close the buffer even if a sequence could not be read or translated
        out.close()
Пример #2
0
def sortSeqDesc(inFasta, outFasta):
    """
        Write the sequences of a fasta file ordered by their length, the longest sequence first.

        Sequences of the same length keep the relative order in which they were read.

        @param inFasta: input fasta file
        @param outFasta: output sorted fasta file
    """
    # collect (name, sequence) pairs
    records = [(name, seq) for name, seq in getSequencesToList(inFasta)]

    # stable sort by the sequence length, descending
    ordered = sorted(records, key=lambda record: len(record[1]), reverse=True)

    writer = OutFileBuffer(outFasta)
    for name, seq in ordered:
        writer.writeText('>%s\n%s\n' % (name, seq))
    writer.close()
Пример #3
0
class SplitFasta():
    """
        Record parser that distributes records alternately into two fasta files.

        The records with an even index (0, 2, 4, ...) go to the "even" file, the remaining ones to the "odd" file.
    """
    def __init__(self, evenFasta, oddFasta):
        """
            @param evenFasta: output file for the records with an even index
            @param oddFasta: output file for the records with an odd index
        """
        self._evenFasta = OutFileBuffer(evenFasta)
        self._oddFasta = OutFileBuffer(oddFasta)
        self._counter = 0  # number of records parsed so far

    def parse(self, record):
        """
            Write one record as a two-line fasta entry to the file that is on turn.

            @param record: object with the attributes "id" and "seq"
        """
        entry = ''.join(['>', str(record.id), '\n', str(record.seq), '\n'])
        target = self._oddFasta if self._counter % 2 else self._evenFasta
        target.writeText(entry)
        self._counter += 1

    def close(self):
        """
            Close both output files (the "odd" one first).
        """
        for buf in (self._oddFasta, self._evenFasta):
            buf.close()
Пример #4
0
class SplitFasta():
    """
        Splits a stream of records into two fasta files by the parity of the record position.
    """
    def __init__(self, evenFasta, oddFasta):
        """
            @param evenFasta: file that receives the 1st, 3rd, 5th, ... parsed record (counter 0, 2, 4, ...)
            @param oddFasta: file that receives the 2nd, 4th, 6th, ... parsed record (counter 1, 3, 5, ...)
        """
        self._evenFasta = OutFileBuffer(evenFasta)
        self._oddFasta = OutFileBuffer(oddFasta)
        self._counter = 0

    def parse(self, record):
        """
            Append the record ('>id' line followed by the sequence line) to one of the two files.

            @param record: object that provides "id" and "seq"
        """
        header = '>' + str(record.id)
        entry = header + '\n' + str(record.seq) + '\n'
        # index 0 ~ even file, index 1 ~ odd file
        outputs = (self._evenFasta, self._oddFasta)
        outputs[self._counter % 2].writeText(entry)
        self._counter += 1

    def close(self):
        """
            Close the odd and then the even output file.
        """
        odd, even = self._oddFasta, self._evenFasta
        odd.close()
        even.close()
Пример #5
0
def filterOutSequences(inFileName,
                       outFileName,
                       allowedNamesSet,
                       formatName="fasta",
                       seqNameModifyFunction=None):
    """
        Filter out the sequences of the input file whose names are not contained in the allowedNamesSet.

        The decision which record is kept is delegated to RecordConditionFilterOutSequences, the
        writing to RecordFilter; the output buffer is not closed by this function itself.

        @param inFileName: input sequence file
        @param outFileName: output file for the records that passed the filter
        @param allowedNamesSet: the set of entries that are allowed as sequence names
        @param formatName: format name handed over to the RecordFilter (default "fasta")
        @param seqNameModifyFunction: a sequence`s name is modified by this function and then compared to the allowedNamesSet
    """
    recordFilter = RecordFilter(
        OutFileBuffer(outFileName),
        formatName,
        RecordConditionFilterOutSequences(allowedNamesSet, seqNameModifyFunction))
    _forEachRecord(inFileName, recordFilter)
Пример #6
0
def cpSeqNoShortSeq(inFile, outFile, minLen):
    """
        Copy sequences longer or equal to a minimum length from the input to the output file.

        The entries are separated by a newline, no newline is written after the last entry.
        The output buffer is closed also when reading the input or writing an entry fails.

        @param inFile: input fasta file
        @param outFile: output fasta file containing only sequences longer or equal to the minimum length
        @param minLen: minimum length of a sequence that will be copied
    """
    out = OutFileBuffer(outFile)
    try:
        first = True
        for name, seq in fastaFileToDictWholeNames(inFile).iteritems():
            if len(seq) < minLen:
                continue
            if first:
                out.writeText('>%s\n%s' % (name, seq))
                first = False
            else:
                # each subsequent entry starts on a new line
                out.writeText('\n>%s\n%s' % (name, seq))
    finally:
        out.close()
Пример #7
0
def cpSeqNoShortSeq(inFile, outFile, minLen):
    """
        Copy sequences longer or equal to a minimum length from the input to the output file.

        Consecutive entries are separated by a newline and the output does not end with a newline.
        The output buffer is always closed, even if the input cannot be read or an entry cannot be written.

        @param inFile: input fasta file
        @param outFile: output fasta file containing only sequences longer or equal to the minimum length
        @param minLen: minimum length of a sequence that will be copied
    """
    out = OutFileBuffer(outFile)
    try:
        entryFormat = '>%s\n%s'  # format of the first written entry
        for name, seq in fastaFileToDictWholeNames(inFile).iteritems():
            if len(seq) >= minLen:
                out.writeText(entryFormat % (name, seq))
                # all following entries are prefixed by the line separator
                entryFormat = '\n>%s\n%s'
    finally:
        out.close()
Пример #8
0
    def runMarkerGeneAnalysis(self, fastaFileDNA, outLog=None):
        """
            Run hmmer HMM and mothur classify (bayesian), same param as for the 16S analysis.

            The DNA sequences are translated to protein sequences (dnaToProt). For each marker gene of the
            marker gene list, 'hmmsearch' is run with the primary protein HMM profile of the gene, the matched
            protein regions are mapped back to DNA coordinates and written (fasta formatted) to
            '<geneName>_dna.gff' in self.markerGeneWorkingDir. These regions are classified by mothur
            'classify.seqs' and the mothur output is parsed by _MothurOutFileParser using the output buffer
            '<fasta basename>_<geneName>.mP'; the buffered text of all genes is collected in
            '<fasta basename>_all.mP'.

            Genes without a primary HMM profile or without any matched region are skipped, a region with an
            unrecognized name is reported to stderr and skipped. The process is terminated via sys.exit(-1)
            if a hmmsearch or mothur command is reported as failed.

            @param fastaFileDNA: fasta file containing the DNA sequences
            @param outLog: file to which the standard output of mothur is written (optional)
        """
        # read list of marker genes
        mgFiles = forEachLine(self.markerGeneListFile, _MgFiles(self.markerGeneListFileDir))

        # translate DNA to protein sequences
        fastaFileProt = os.path.join(self.markerGeneWorkingDir, str(os.path.basename(fastaFileDNA) + '.PROT'))
        dnaToProt(fastaFileDNA, fastaFileProt)

        # read DNA fasta file
        try:
            handle = open(fastaFileDNA, "rU")
            try:
                dnaSeqDict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
            finally:
                # release the handle also if the file cannot be parsed
                handle.close()
        except Exception:
            sys.stderr.write(str('Cannot read file: ' + str(fastaFileDNA)))
            raise

        # to output all predictions in one file
        outPredAllFileName = os.path.join(self.markerGeneWorkingDir,
                                          str(os.path.basename(fastaFileDNA) + '_all.mP'))
        outAllBuffer = OutFileBuffer(outPredAllFileName)

        mgList = mgFiles.getGeneNameList()

        if outLog is not None:
            stdoutLog = open(outLog, 'w')
        else:
            # NOTE(review): subprocess.STDOUT is documented only as a value for "stderr"; confirm that
            # parallel.TaskCmd handles it as a "stdout" value
            stdoutLog = subprocess.STDOUT

        # for each gene perform the analysis separately
        for geneName in mgList:

            # only the primary HMM profile ('hmmPROTPrim') is searched
            domFile = os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.dom'))
            outFile = os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.out'))
            hmmFile = mgFiles.getFilePath(geneName, 'hmmPROTPrim')

            if hmmFile is None:
                # without a profile there is nothing to search for (iterating over the missing regions
                # used to raise a TypeError)
                print('Marker genes analysis, no HMM profile for: %s' % geneName)
                continue

            # define cmd
            cmd = str(os.path.join(self.hmmerBinDir, 'hmmsearch') + ' --domtblout ' + domFile + ' -E 0.01'
                      + self.processorsHmm + ' -o ' + outFile + ' ' + hmmFile + ' ' + fastaFileProt)

            # run cmd
            if os.name == 'posix':
                cwd = self.hmmInstallDir
                if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd)])) is not None:
                    sys.exit(-1)
            else:
                print('Marker genes analysis, doesn`t run (no posix):  %s' % cmd)

            # get regions that match to the HMM profile
            entryDict1 = forEachLine(domFile, _MgRegions()).getEntryDict()

            # extract regions found in the protein sequences that were found by the HMM and generate
            # corresponding DNA sequences
            regionDnaFasta = os.path.join(self.markerGeneWorkingDir, str(geneName + '_dna.gff'))
            outFileBuffer = OutFileBuffer(regionDnaFasta)

            for seqName in entryDict1:
                regions = entryDict1[seqName]
                # the regions are addressed by their position since the container type returned by
                # _MgRegions is not visible here
                for i, _region in enumerate(regions):
                    from1 = regions[i][0]
                    to1 = regions[i][1]
                    assert (from1 is not None) and (to1 is not None)

                    # extract regions from the DNA sequences (consider 3 ORF and reverse complements)

                    # name of the whole sequence
                    dnaSeqName = re.sub(r'([0-9]+_[0-9]+)_[pr]+[012]', r'\1', seqName)
                    # whole DNA sequence
                    dnaSeq = dnaSeqDict[dnaSeqName].seq

                    # reverse complement (contains "pr")
                    tagRev = 'p'
                    if re.match(r'[0-9]+_[0-9]+_pr[012]', seqName):
                        dnaSeq = dnaSeq.reverse_complement()
                        tagRev = 'pr'

                    # reading frame shift (0, 1 or 2) encoded as the last part of the region name
                    if re.match(r'[0-9]+_[0-9]+_[pr]+0', seqName):
                        shift = 0
                    elif re.match(r'[0-9]+_[0-9]+_[pr]+1', seqName):
                        shift = 1
                    elif re.match(r'[0-9]+_[0-9]+_[pr]+2', seqName):
                        shift = 2
                    else:
                        # skip the entry, there is no region that could be written
                        sys.stderr.write('Wrong seq name: ' + seqName + ' \n')
                        continue

                    # protein positions (1-based) -> DNA slice boundaries within the frame
                    tagFrom = ((from1 - 1) * 3) + shift
                    tagTo = (to1 * 3) + shift
                    tagRev += str(shift)
                    dnaSeq = dnaSeq[tagFrom:tagTo]

                    tag = str(str(tagFrom) + '_' + str(tagTo) + '_' + tagRev)
                    outFileBuffer.writeText(str('>' + dnaSeqName + '_' + tag + '\n' + dnaSeq + '\n'))

            outFileBuffer.close()

            # if no marker gene found
            if outFileBuffer.isEmpty():
                continue

            # run mothur classify (bayesian? the same as for the 16S analysis)
            templateFile = mgFiles.getFilePath(geneName, 'templateDNA')
            taxonomyFile = mgFiles.getFilePath(geneName, 'taxonomyDNA')
            assert ((templateFile is not None) and (taxonomyFile is not None))
            cmd = str('' + self.mothur + ' "#classify.seqs(fasta=' + regionDnaFasta + ', template=' + templateFile
                      + ', taxonomy=' + taxonomyFile + ', ' + self.mothurParam + ')"')
            if os.name == 'posix':

                print('Mothur processing: %s' % os.path.basename(templateFile).split('_', 1)[0])

                cwd = self.markerGeneWorkingDir

                if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd, stdout=stdoutLog)])) is not None:
                    sys.exit(-1)

            else:
                print('Cannot run mothur since your system is not "posix" but "%s" \n%s' % (os.name, cmd))

            # transform the mothur output to a simple output (name, ncbid, weight)
            # the default mothur output file name is tried first, then the '.bayesian.taxonomy' variant
            mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile)
            if not os.path.isfile(mothurPredFileName):
                mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile, suffix='.bayesian.taxonomy')
                if not os.path.isfile(mothurPredFileName):
                    print("Can't open file: %s" % mothurPredFileName)

            outPredFileName = os.path.join(self.markerGeneWorkingDir,
                                           str(os.path.basename(fastaFileDNA) + '_' + geneName + '.mP'))
            # NOTE(review): outBuffer is not closed here; presumably _MothurOutFileParser takes care of it - confirm
            outBuffer = OutFileBuffer(outPredFileName, bufferText=True)
            forEachLine(mothurPredFileName, _MothurOutFileParser(outBuffer, geneName))

            # predictions of the individual genes are separated by a newline
            if not outAllBuffer.isEmpty():
                outAllBuffer.writeText('\n')
            outAllBuffer.writeText(outBuffer.getTextBuffer())

        if outLog is not None:
            stdoutLog.close()
        outAllBuffer.close()
Пример #9
0
    def _init(self, align=True, dm=True, cluster=True):
        """
            Init data, compute: alignment, distance matrix, clusters.

            Does nothing if it has already been called (self._initDone). Otherwise:
             - collects the marker gene region files ('*.gff' in self._mgWorkingDir) and the 5S, 16S and 23S
               rRNA files (self._s16Prefix + '.<gene>.fna')
             - writes for each marker gene a file '<mg>.filter.fna' to self._clustDir that contains at most
               one region per sequence
             - optionally runs the aligner, mothur 'dist.seqs' and mothur 'cluster' for each marker gene
             - reads the sequence predictions, the similarity thresholds, the distance matrices and
               the clusters into the attributes of this object

            @param align: whether to run the aligner (config entry 'aligner') for each marker gene
            @param dm: whether to compute the distance matrix (mothur 'dist.seqs') for each marker gene
            @param cluster: whether to compute the clusters (mothur 'cluster') for each marker gene
        """
        if self._initDone:
            return
        self._initDone = True

        self._mgList = []  # list of names of marker genes
        mgToFastaPath = dict([])  # marker gene name -> fasta file path

        # collect regions from Amphora mg (fasta files containing regions of particular marker genes)
        for path in glob.glob(os.path.join(os.path.normpath(self._mgWorkingDir), '*.gff')):
            name = re.sub(r'([^\.]+)\..*$', r'\1', os.path.basename(path))  # strip the file extension
            mg = re.sub(r'([^_]+)_dna', r'\1', name)
            self._mgList.append(mg)
            mgToFastaPath[mg] = path

        # add 16S
        for mg in ['5S_rRNA', '16S_rRNA', '23S_rRNA']:
            mgToFastaPath[mg] = str(self._s16Prefix + '.' + mg + '.fna')
            self._mgList.append(mg)

        # For each marker gene create filtered fasta file that contains for each mg and sequence at most one region.
        mgToFilteredFastaPath = dict([])
        mgToSeqNameToTaxPathDict = dict([])  # mg -> seqName (~region name) -> pred
        for mg in self._mgList:
            mgToSeqNameToTaxPathDict[mg] = dict([])

        for seq in self._sequences.sequences:
            seqPrefix = str(str(seq.scaffold.id) + '_' + str(seq.id))
            for mg, tag, pred in zip(seq.getCandidateTaxPathSourceList(), seq.getCandidateTaxPathTagList(),
                                     seq.getCandidateTaxPathDictList()):
                mgToSeqNameToTaxPathDict[mg][str(seqPrefix + '_' + tag)] = pred

        # for each marker gene: choose only one sequence region for each mg and sequence
        # all sequences are predicted at least at superkingdom
        for mg in self._mgList:
            seqNameToPred = mgToSeqNameToTaxPathDict[mg]  # sequence region predictions for this mg
            seqNameToSeq = fastaFileToDict(mgToFastaPath[mg])  # read the fasta file
            outPath = os.path.normpath(os.path.join(self._clustDir, str(mg + '.filter.fna')))
            mgToFilteredFastaPath[mg] = outPath
            out = OutFileBuffer(outPath)
            try:
                seqBaseToSeqName = dict([])  # sequence base (scaffId_seqId) -> list of region names
                for seqName in seqNameToSeq:
                    seqBase = re.sub(r'^([0-9]+_[0-9]+)[^0-9].*', r'\1', seqName)
                    seqBaseToSeqName.setdefault(seqBase, []).append(seqName)

                for seqBase in seqBaseToSeqName:
                    seqId = int(re.sub(r'^[0-9]+_([0-9]+)', r'\1', seqBase))
                    seqBaseTaxPathDict = self._sequences.getSequence(seqId).getTaxonomyPath()

                    # regions for which a prediction is available
                    candidateSeq = [s for s in seqBaseToSeqName[seqBase] if seqNameToPred.get(s) is not None]
                    if not candidateSeq:
                        continue

                    # regions predicted at least at the same level as the whole sequence
                    if seqBaseTaxPathDict is None:
                        pool = candidateSeq
                    else:
                        pool = [s for s in candidateSeq
                                if len(seqNameToPred[s]) >= len(seqBaseTaxPathDict)]

                    if not pool:
                        # all regions are predicted higher than the sequence:
                        # take the regions with the most specific prediction
                        maxDepth = max(len(seqNameToPred[s]) for s in candidateSeq)
                        pool = [s for s in candidateSeq if len(seqNameToPred[s]) == maxDepth]

                    # take the longest region (max returns the first one among equally long regions)
                    sMax = max(pool, key=lambda s: len(seqNameToSeq[s]))

                    out.writeText(str('>' + str(sMax) + '\n' + str(seqNameToSeq[sMax]) + '\n'))
            finally:
                # close the buffer also if a region or a sequence cannot be processed
                out.close()

        mgToAlignPath = dict([])
        for mg in self._mgList:
            mgToAlignPath[mg] = os.path.normpath(os.path.join(self._clustDir, str(mg + '.align.fna')))

        # build alignment
        if align:
            for mg in self._mgList:
                alignCmd = str(self._config.get('aligner') + ' -in ' + mgToFilteredFastaPath[mg]
                               + ' -out ' + mgToAlignPath[mg] + ' -quiet')
                assert os.name == 'posix'
                predictProc = subprocess.Popen(alignCmd, cwd=self._mgWorkingDir, shell=True, bufsize=-1)
                predictProc.wait()
                print('Muscle return code for %s : %s' % (mg, predictProc.returncode))
                if predictProc.returncode != 0:
                    # a failed alignment is only reported, the processing continues
                    sys.stderr.write(str(alignCmd + ' \n'))

        # compute DM
        if dm:
            mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
            for mg in self._mgList:
                mothurCmd = str('time ' + mothur + ' "#dist.seqs(fasta=' + mgToAlignPath[mg]
                                + ', processors=2, countends=F, calc=nogaps, cutoff=0.3, output=lt)"')
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
                mothurProc.wait()
                print('Mothur return code dist: %s %s' % (mg, mothurProc.returncode))

        # cluster
        if cluster:
            mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
            for mg in self._mgList:
                distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
                mothurCmd = str('time ' + mothur + ' "#cluster(phylip=' + distFilePath
                                + ', method=furthest, hard=t, precision=1000)"')
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
                mothurProc.wait()
                print('Mothur return code cluster: %s %s' % (mg, mothurProc.returncode))

        # read DM and clusters

        # sequence predictions
        self._seqIdToTaxPathDict = dict([])
        self._seqIdToWeight = dict([])
        for seq in self._sequences.sequences:
            seqId = int(seq.id)
            self._seqIdToTaxPathDict[seqId] = seq.getTaxonomyPath()
            self._seqIdToWeight[seqId] = seq.getTaxonomyPathWeight()

        # similarity thresholds (the first value mapped to a key is taken as its threshold)
        self._mgToMaxThreshold = dict([])
        tmpDict = getMapping(self._configMG.get('mgSimilarityThresholds'), 0, 1, sep='\t', comment='#')
        for k in tmpDict:
            self._mgToMaxThreshold[k] = float(tmpDict[k][0])

        self._mgToDM = dict([])
        self._mgToCluster = dict([])
        for mg in self._mgList:
            alignDir = os.path.dirname(mgToAlignPath[mg])
            distPath = os.path.join(alignDir, str(mg + '.align.phylip.dist'))
            self._mgToDM[mg] = forEachLine(distPath, DM())
            clusterPath = os.path.join(alignDir, str(mg + '.align.phylip.fn.list'))
            # a threshold must be defined for each marker gene, otherwise a KeyError is raised
            self._mgToCluster[mg] = forEachLine(clusterPath,
                                                MCluster(self._seqIdToTaxPathDict, self._mgToMaxThreshold[mg]))
Пример #10
0
    def _init(self, align=True, dm=True, cluster=True):
        """
            Init data, compute: alignment, distance matrix, clusters.

            Returns immediately if the initialization has already been started (self._initDone).
            Otherwise the following steps are performed:
             - the marker gene region files ('*.gff' in self._mgWorkingDir) and the 5S, 16S and 23S rRNA
               files (self._s16Prefix + '.<gene>.fna') are collected
             - for each marker gene, a file '<mg>.filter.fna' containing at most one region per sequence
               is written to self._clustDir
             - optionally, the aligner, mothur 'dist.seqs' and mothur 'cluster' are run for each marker gene
             - the sequence predictions, similarity thresholds, distance matrices and clusters are read
               into the attributes of this object

            @param align: whether to run the aligner (config entry 'aligner') for each marker gene
            @param dm: whether to compute the distance matrix (mothur 'dist.seqs') for each marker gene
            @param cluster: whether to compute the clusters (mothur 'cluster') for each marker gene
        """
        if self._initDone:
            return
        self._initDone = True

        self._mgList = []  # list of names of marker genes
        mgToFastaPath = dict([])  # marker gene name -> fasta file path

        # collect regions from Amphora mg (fasta files containing regions of particular marker genes)
        for path in glob.glob(
                os.path.join(os.path.normpath(self._mgWorkingDir), '*.gff')):
            # strip the file extension
            name = re.sub(r'([^\.]+)\..*$', r'\1', os.path.basename(path))
            mg = re.sub(r'([^_]+)_dna', r'\1', name)
            self._mgList.append(mg)
            mgToFastaPath[mg] = path

        # add 16S
        for mg in ['5S_rRNA', '16S_rRNA', '23S_rRNA']:
            mgToFastaPath[mg] = str(self._s16Prefix + '.' + mg + '.fna')
            self._mgList.append(mg)

        # For each marker gene create filtered fasta file that contains for each mg and sequence at most one region.
        mgToFilteredFastaPath = dict([])
        # mg -> seqName (~region name) -> pred
        mgToSeqNameToTaxPathDict = dict([])
        for mg in self._mgList:
            mgToSeqNameToTaxPathDict[mg] = dict([])

        for seq in self._sequences.sequences:
            seqPrefix = str(str(seq.scaffold.id) + '_' + str(seq.id))
            for mg, tag, pred in zip(seq.getCandidateTaxPathSourceList(),
                                     seq.getCandidateTaxPathTagList(),
                                     seq.getCandidateTaxPathDictList()):
                mgToSeqNameToTaxPathDict[mg][str(seqPrefix + '_' + tag)] = pred

        # for each marker gene: choose only one sequence region for each mg and sequence
        # all sequences are predicted at least at superkingdom
        for mg in self._mgList:
            # sequence region predictions for this mg
            seqNameToPred = mgToSeqNameToTaxPathDict[mg]
            # read the fasta file
            seqNameToSeq = fastaFileToDict(mgToFastaPath[mg])
            outPath = os.path.normpath(
                os.path.join(self._clustDir, str(mg + '.filter.fna')))
            mgToFilteredFastaPath[mg] = outPath
            out = OutFileBuffer(outPath)
            try:
                # sequence base (scaffId_seqId) -> list of region names
                seqBaseToSeqName = dict([])
                for seqName in seqNameToSeq:
                    seqBase = re.sub(r'^([0-9]+_[0-9]+)[^0-9].*', r'\1', seqName)
                    seqBaseToSeqName.setdefault(seqBase, []).append(seqName)

                for seqBase in seqBaseToSeqName:
                    seqId = int(re.sub(r'^[0-9]+_([0-9]+)', r'\1', seqBase))
                    seqBaseTaxPathDict = self._sequences.getSequence(
                        seqId).getTaxonomyPath()

                    # regions for which a prediction is available
                    candidateSeq = [
                        s for s in seqBaseToSeqName[seqBase]
                        if seqNameToPred.get(s) is not None
                    ]
                    if not candidateSeq:
                        continue

                    # regions predicted at least at the same level as the whole sequence
                    if seqBaseTaxPathDict is None:
                        pool = candidateSeq
                    else:
                        pool = [
                            s for s in candidateSeq
                            if len(seqNameToPred[s]) >= len(seqBaseTaxPathDict)
                        ]

                    if not pool:
                        # all regions are predicted higher than the sequence:
                        # take the regions with the most specific prediction
                        maxDepth = max(
                            len(seqNameToPred[s]) for s in candidateSeq)
                        pool = [
                            s for s in candidateSeq
                            if len(seqNameToPred[s]) == maxDepth
                        ]

                    # take the longest region (max returns the first one among equally long regions)
                    sMax = max(pool, key=lambda s: len(seqNameToSeq[s]))

                    out.writeText(
                        str('>' + str(sMax) + '\n' + str(seqNameToSeq[sMax]) +
                            '\n'))
            finally:
                # close the buffer also if a region or a sequence cannot be processed
                out.close()

        mgToAlignPath = dict([])
        for mg in self._mgList:
            mgToAlignPath[mg] = os.path.normpath(
                os.path.join(self._clustDir, str(mg + '.align.fna')))

        # build alignment
        if align:
            for mg in self._mgList:
                alignCmd = str(
                    self._config.get('aligner') + ' -in ' +
                    mgToFilteredFastaPath[mg] + ' -out ' + mgToAlignPath[mg] +
                    ' -quiet')
                assert os.name == 'posix'
                predictProc = subprocess.Popen(alignCmd,
                                               cwd=self._mgWorkingDir,
                                               shell=True,
                                               bufsize=-1)
                predictProc.wait()
                print('Muscle return code for %s : %s' %
                      (mg, predictProc.returncode))
                if predictProc.returncode != 0:
                    # a failed alignment is only reported, the processing continues
                    sys.stderr.write(str(alignCmd + ' \n'))

        # compute DM
        if dm:
            mothur = os.path.join(
                os.path.normpath(self._configRRNA16S.get('mothurInstallDir')),
                'mothur')
            for mg in self._mgList:
                mothurCmd = str(
                    'time ' + mothur + ' "#dist.seqs(fasta=' +
                    mgToAlignPath[mg] +
                    ', processors=2, countends=F, calc=nogaps, cutoff=0.3, output=lt)"'
                )
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd,
                                              shell=True,
                                              bufsize=-1,
                                              cwd=self._mgWorkingDir)
                mothurProc.wait()
                print('Mothur return code dist: %s %s' %
                      (mg, mothurProc.returncode))

        # cluster
        if cluster:
            mothur = os.path.join(
                os.path.normpath(self._configRRNA16S.get('mothurInstallDir')),
                'mothur')
            for mg in self._mgList:
                distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]),
                                            str(mg + '.align.phylip.dist'))
                mothurCmd = str('time ' + mothur + ' "#cluster(phylip=' +
                                distFilePath +
                                ', method=furthest, hard=t, precision=1000)"')
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd,
                                              shell=True,
                                              bufsize=-1,
                                              cwd=self._mgWorkingDir)
                mothurProc.wait()
                print('Mothur return code cluster: %s %s' %
                      (mg, mothurProc.returncode))

        # read DM and clusters

        # sequence predictions
        self._seqIdToTaxPathDict = dict([])
        self._seqIdToWeight = dict([])
        for seq in self._sequences.sequences:
            seqId = int(seq.id)
            self._seqIdToTaxPathDict[seqId] = seq.getTaxonomyPath()
            self._seqIdToWeight[seqId] = seq.getTaxonomyPathWeight()

        # similarity thresholds (the first value mapped to a key is taken as its threshold)
        self._mgToMaxThreshold = dict([])
        tmpDict = getMapping(self._configMG.get('mgSimilarityThresholds'),
                             0,
                             1,
                             sep='\t',
                             comment='#')
        for k in tmpDict:
            self._mgToMaxThreshold[k] = float(tmpDict[k][0])

        self._mgToDM = dict([])
        self._mgToCluster = dict([])
        for mg in self._mgList:
            alignDir = os.path.dirname(mgToAlignPath[mg])
            distPath = os.path.join(alignDir, str(mg + '.align.phylip.dist'))
            self._mgToDM[mg] = forEachLine(distPath, DM())
            clusterPath = os.path.join(alignDir,
                                       str(mg + '.align.phylip.fn.list'))
            # a threshold must be defined for each marker gene, otherwise a KeyError is raised
            self._mgToCluster[mg] = forEachLine(
                clusterPath,
                MCluster(self._seqIdToTaxPathDict, self._mgToMaxThreshold[mg]))
def main():
    """
        Wraps pIRS read simulator to simulate Illumina paired end reads.

        For each sequence name found in the frequencies file, the corresponding reference sequence is
        written to a fasta file in the temporary directory (<workingDir>/tmp), pIRS is run on it and the
        generated reads are appended to the merged files 'reads_1.fq' and 'reads_2.fq' in the working
        directory. The '@read_' prefix of the read names is extended by the name of the sequence.
        Sequences for which pIRS returns a non-zero status are reported on stderr and skipped.

        Command line arguments: -c configuration file (required), -p additional pIRS parameters.
        Returns early (after printing a message) on non-posix systems, if the additional pIRS parameters
        clash with those defined by the configuration file or if the working directory does not exist.

        Sample config: /Users/ivan/Documents/work/binning/data/V35/simMetagenome/configMetagenome01.cfg
    """
    if os.name != 'posix':
        print('runs only on posix systems')
        return

    #parse arguments
    parser = argparse.ArgumentParser(
        description='''A simple Metagenome Illumina read simulator that wraps pIRS''',
        epilog='''''')

    #argparse.FileType opens the file for reading and reports an unreadable file as a usage error
    parser.add_argument('-c', '--config', nargs=1, type=argparse.FileType('r'), required=True,
                        help='configuration file of the simulator', metavar='configMetagenome.cfg',
                        dest='config')

    parser.add_argument('-p', '--pIRS-param', action='store', nargs='+',
                        help='parameters of the pIRS simulator, e.g. "-Q 64 -E 1"',
                        dest='p')

    args = parser.parse_args()
    config = Config(args.config[0], 'Sim')

    #nargs='+' may yield several values, all of them are passed on to pIRS (not only the first one)
    pirsParam = ''
    if args.p:
        pirsParam = ' '.join(args.p)

    #reads configuration
    workingDir = config.get('workingDir')
    referenceSeq = config.get('referenceSeq')
    frequenciesInfo = config.get('frequenciesInfo')
    coverageFrequencyMultiplier = float(config.get('coverageFrequencyMultiplier'))
    pirsInstallDir = config.get('pirsInstallDir')
    insertSizeMean = int(config.get('insertSizeMean'))
    insertSizeSd = int(config.get('insertSizeSd'))
    readLength = int(config.get('readLength'))

    #check whether the pIRS optional parameters doesn`t contain those predefined elsewhere (e.g. in the config)
    reservedOptions = ('-m', '-v', '-l', '-x', '-i', '-o')
    if any(option in pirsParam for option in reservedOptions):
        print('pIRS parameters -m -v -l (-x) must be set in the configuration file, parameters -i -o cannot be set ')
        return

    #check working directory, create temporary directory
    tmpDir = os.path.join(workingDir, 'tmp')
    if not os.path.isdir(workingDir):
        print('The working directory does not exists, create it! (' + str(workingDir) + ')')
        return
    if not os.path.isdir(tmpDir):
        os.mkdir(tmpDir)

    seqNameToSeq = fastaFileToDict(referenceSeq)
    seqNameToFreq = getMapping(frequenciesInfo, 0, 1, sep='\t', comment='#')

    outReads1Merged = OutFileBuffer(os.path.join(workingDir, 'reads_1.fq'))
    outReads2Merged = OutFileBuffer(os.path.join(workingDir, 'reads_2.fq'))

    #the merged files are closed even if the simulation of one of the sequences raises an exception
    try:
        for seqName in seqNameToFreq:
            seq = seqNameToSeq[seqName]
            coverage = float(seqNameToFreq[seqName][0]) * coverageFrequencyMultiplier

            #pIRS gets each reference sequence as a separate fasta file
            fastaFile = os.path.join(tmpDir, str(seqName + '.fna'))
            outBuffer = OutFileBuffer(fastaFile)
            outBuffer.writeText(str('>' + seqName + '\n' + seq + '\n'))
            outBuffer.close()

            cmd = str(os.path.join(pirsInstallDir, 'pirs') + ' simulate -i ' + fastaFile + ' -x ' + str(coverage)
                      + ' -m ' + str(insertSizeMean) + ' -v ' + str(insertSizeSd) + ' -l ' + str(readLength)
                      + ' -o ' + seqName + ' ' + pirsParam)
            proc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=tmpDir)
            proc.wait()
            if proc.returncode != 0:
                #the output of a failed run is missing or stale, do not merge it
                sys.stderr.write('command failed: ' + cmd + '\n')
                continue

            #append generated reads to the merged files
            readsFilePrefix = str(seqName + '_' + str(readLength) + '_' + str(insertSizeMean))
            readNamePrefix = str('@read_' + seqName + '_')
            for mateSuffix, outMerged in (('_1.fq.gz', outReads1Merged), ('_2.fq.gz', outReads2Merged)):
                with gzip.open(os.path.join(tmpDir, readsFilePrefix + mateSuffix), 'rb') as reads:
                    content = reads.read()
                outMerged.writeText(str(content.replace('@read_', readNamePrefix) + '\n'))
    finally:
        outReads1Merged.close()
        outReads2Merged.close()
Пример #12
0
def main():
    """
        Wraps pIRS read simulator to simulate Illumina paired end reads.

        For each sequence name found in the frequencies file, the corresponding reference sequence is
        written to a fasta file in the temporary directory (<workingDir>/tmp), pIRS is run on it and the
        generated reads are appended to the merged files 'reads_1.fq' and 'reads_2.fq' in the working
        directory. The '@read_' prefix of the read names is extended by the name of the sequence.
        Sequences for which pIRS returns a non-zero status are reported on stderr and skipped.

        Command line arguments: -c configuration file (required), -p additional pIRS parameters.
        Returns early (after printing a message) on non-posix systems, if the additional pIRS parameters
        clash with those defined by the configuration file or if the working directory does not exist.

        Sample config: /Users/ivan/Documents/work/binning/data/V35/simMetagenome/configMetagenome01.cfg
    """
    if os.name != 'posix':
        print('runs only on posix systems')
        return

    #parse arguments
    parser = argparse.ArgumentParser(description='''A simple Metagenome Illumina read simulator that wraps pIRS''',
                                 epilog='''''')

    #argparse.FileType opens the file for reading and reports an unreadable file as a usage error
    parser.add_argument('-c', '--config', nargs=1, type=argparse.FileType('r'), required=True,
                        help='configuration file of the simulator', metavar='configMetagenome.cfg',
                        dest='config')

    parser.add_argument('-p', '--pIRS-param', action='store', nargs='+',
                        help='parameters of the pIRS simulator, e.g. "-Q 64 -E 1"',
                        dest='p')

    args = parser.parse_args()
    config = Config(args.config[0], 'Sim')

    #nargs='+' may yield several values, all of them are passed on to pIRS (not only the first one)
    pirsParam = ''
    if args.p:
        pirsParam = ' '.join(args.p)

    #reads configuration
    workingDir = config.get('workingDir')
    referenceSeq = config.get('referenceSeq')
    frequenciesInfo = config.get('frequenciesInfo')
    coverageFrequencyMultiplier = float(config.get('coverageFrequencyMultiplier'))
    pirsInstallDir = config.get('pirsInstallDir')
    insertSizeMean = int(config.get('insertSizeMean'))
    insertSizeSd = int(config.get('insertSizeSd'))
    readLength = int(config.get('readLength'))

    #check whether the pIRS optional parameters doesn`t contain those predefined elsewhere (e.g. in the config)
    reservedOptions = ('-m', '-v', '-l', '-x', '-i', '-o')
    if any(option in pirsParam for option in reservedOptions):
        print('pIRS parameters -m -v -l (-x) must be set in the configuration file, parameters -i -o cannot be set ')
        return

    #check working directory, create temporary directory
    tmpDir = os.path.join(workingDir,'tmp')
    if not os.path.isdir(workingDir):
        print('The working directory does not exists, create it! (' + str(workingDir) + ')')
        return
    if not os.path.isdir(tmpDir):
        os.mkdir(tmpDir)

    seqNameToSeq = fastaFileToDict(referenceSeq)
    seqNameToFreq = getMapping(frequenciesInfo, 0, 1, sep='\t', comment = '#')

    outReads1Merged = OutFileBuffer(os.path.join(workingDir,'reads_1.fq'))
    outReads2Merged = OutFileBuffer(os.path.join(workingDir,'reads_2.fq'))

    #the merged files are closed even if the simulation of one of the sequences raises an exception
    try:
        for seqName in seqNameToFreq:
            seq = seqNameToSeq[seqName]
            coverage = float(seqNameToFreq[seqName][0])*coverageFrequencyMultiplier

            #pIRS gets each reference sequence as a separate fasta file
            fastaFile = os.path.join(tmpDir,str(seqName + '.fna'))
            outBuffer = OutFileBuffer(fastaFile)
            outBuffer.writeText(str('>' + seqName + '\n' + seq + '\n'))
            outBuffer.close()

            cmd = str(os.path.join(pirsInstallDir,'pirs') + ' simulate -i ' + fastaFile + ' -x ' + str(coverage) +
                      ' -m ' + str(insertSizeMean) + ' -v ' + str(insertSizeSd) + ' -l ' + str(readLength)
                      + ' -o ' + seqName + ' ' + pirsParam)
            proc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=tmpDir)
            proc.wait()
            if proc.returncode != 0:
                #the output of a failed run is missing or stale, do not merge it
                sys.stderr.write('command failed: ' + cmd + '\n')
                continue

            #append generated reads to the merged files
            readsFilePrefix = str(seqName + '_' + str(readLength) + '_' + str(insertSizeMean))
            readNamePrefix = str('@read_' + seqName + '_')
            for mateSuffix, outMerged in (('_1.fq.gz', outReads1Merged), ('_2.fq.gz', outReads2Merged)):
                with gzip.open(os.path.join(tmpDir, readsFilePrefix + mateSuffix), 'rb') as reads:
                    content = reads.read()
                outMerged.writeText(str(content.replace('@read_', readNamePrefix) + '\n'))
    finally:
        outReads1Merged.close()
        outReads2Merged.close()
Пример #13
0
 def __init__(self, evenFasta, oddFasta):
     """
         Creates one OutFileBuffer per output and starts the counter at zero.

         @param evenFasta: argument handed to the OutFileBuffer stored as _evenFasta
         @param oddFasta: argument handed to the OutFileBuffer stored as _oddFasta
     """
     #the right-hand side is evaluated left to right: the "even" buffer is created first
     self._evenFasta, self._oddFasta = OutFileBuffer(evenFasta), OutFileBuffer(oddFasta)
     self._counter = 0
Пример #14
0
 def __init__(self, evenFasta, oddFasta):
     """
         Sets up the two output buffers and a counter initialised to 0.

         @param evenFasta: passed to OutFileBuffer, the result is kept in _evenFasta
         @param oddFasta: passed to OutFileBuffer, the result is kept in _oddFasta
     """
     evenBuffer = OutFileBuffer(evenFasta)
     oddBuffer = OutFileBuffer(oddFasta)
     self._evenFasta = evenBuffer
     self._oddFasta = oddBuffer
     self._counter = 0
Пример #15
0
def filterOutNonDna(inFileName, outFileName):
    """
        Runs _forEachRecord on the input file with a RemoveNonDnaParser that is given
        an OutFileBuffer created for the output file.

        @param inFileName: input file handed to _forEachRecord
        @param outFileName: file name the OutFileBuffer is created for
    """
    _forEachRecord(inFileName, RemoveNonDnaParser(OutFileBuffer(outFileName)))
Пример #16
0
    def runMarkerGeneAnalysis(self, fastaFileDNA, outLog=None):
        """
            Run hmmer HMM and mothur classify (bayesian), same param as for the 16S analysis.

            The DNA sequences are translated to a '.PROT' fasta file. For each marker gene, hmmsearch is run
            on this file with the primary and (if defined) the secondary HMM profile, the regions reported for
            the primary profile are cut out of the DNA sequences and written to '<gene>_dna.gff', these regions
            are classified by mothur and the mothur output is transformed to a '<fasta>_<gene>.mP' file.
            The predictions of all genes are also collected in one '<fasta>_all.mP' file. All files are
            stored in the marker gene working directory.

            @param fastaFileDNA: fasta file containing the DNA sequences
            @param outLog: file to which the stdout of hmmsearch and mothur is written;
                if None, both tools inherit the stdout of this process
            @raise Exception: if hmmsearch or mothur returns a non-zero status
        """
        #read list of marker genes
        mgFiles = forEachLine(self.markerGeneListFile, _MgFiles(self.markerGeneListFileDir))

        #translate DNA to protein sequences
        fastaFileProt = os.path.join(self.markerGeneWorkingDir, str(os.path.basename(fastaFileDNA) + '.PROT'))
        dnaToProt(fastaFileDNA, fastaFileProt)

        #read DNA fasta file
        try:
            handle = open(fastaFileDNA, "rU")
            try:
                dnaSeqDict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
            finally:
                handle.close()
        except Exception:
            sys.stderr.write(str('Cannot read file: ' + str(fastaFileDNA)) + '\n')
            raise

        #expected sequence names: <dnaSeqName>_<p|pr><frame>, "pr" marks the reverse complement, frame is 0, 1 or 2
        frameNamePattern = re.compile(r'[0-9]+_[0-9]+_([pr]+)([012])')

        #to output all predictions in one file
        outPredAllFileName = os.path.join(self.markerGeneWorkingDir,
                                           str(os.path.basename(fastaFileDNA) + '_all.mP'))
        outAllBuffer = OutFileBuffer(outPredAllFileName)

        #stdout=None lets the child processes inherit the stdout of this process
        #(subprocess.STDOUT is a special value that is valid only for the stderr argument)
        stdoutLog = None

        try:
            mgList = mgFiles.getGeneNameList()

            if outLog is not None:
                stdoutLog = open(outLog, 'w')

            #for each gene perform the analysis separately
            for geneName in mgList:

                domFileArray = [os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.dom')),
                                os.path.join(self.markerGeneWorkingDir, str(geneName + '_2.dom'))]
                outFileArray = [os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.out')),
                                os.path.join(self.markerGeneWorkingDir, str(geneName + '_2.out'))]
                hmmFileArray = [mgFiles.getFilePath(geneName, 'hmmPROTPrim'),
                                mgFiles.getFilePath(geneName, 'hmmPROTSec')]

                #define cmd (None if the respective HMM profile is not defined)
                cmdArray = []
                for i in range(2):
                    if hmmFileArray[i] is not None:
                        cmdArray.append(str(os.path.join(self.hmmerBinDir, 'hmmsearch') + ' --domtblout '
                                            + domFileArray[i] + ' -E 0.01' + ' -o ' + outFileArray[i] + ' '
                                            + hmmFileArray[i] + ' ' + fastaFileProt))
                    else:
                        cmdArray.append(None)

                #run cmd
                for cmd in cmdArray:
                    if cmd is None:
                        #there is no HMM profile to search with
                        continue
                    if os.name == 'posix':
                        hmmProc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=self.hmmInstallDir,
                                                   stdout=stdoutLog)
                        print('run cmd: %s' % cmd)
                        hmmProc.wait()
                        print('HMM  return code: %s' % hmmProc.returncode)
                        if hmmProc.returncode != 0:
                            raise Exception("Command returned with non-zero %s status: %s" % (hmmProc.returncode, cmd))
                    else:
                        print('Marker genes analysis, doesn`t run (no posix):  %s' % cmd)

                #get regions that match to the HMM profile ()
                entryDictList = []
                for i in range(2):
                    if cmdArray[i] is not None:
                        entryDictList.append(forEachLine(domFileArray[i], _MgRegions()).getEntryDict())
                    else:
                        entryDictList.append(None)

                #only the regions of the primary profile are extracted; the output of the secondary profile
                #is parsed as well, but its regions are currently not compared with the primary ones
                entryDict1 = entryDictList[0]
                if entryDict1 is None:
                    #no primary HMM profile, i.e. there are no regions to extract
                    entryDict1 = {}

                #extract regions found in the protein sequences that were found by the HMM and generate corresponding DNA sequences
                regionDnaFasta = os.path.join(self.markerGeneWorkingDir, str(geneName + '_dna.gff'))
                outFileBuffer = OutFileBuffer(regionDnaFasta)

                for seqName in entryDict1:
                    for entry in entryDict1[seqName]:
                        from1 = entry[0]
                        to1 = entry[1]
                        assert (from1 is not None) and (to1 is not None)

                        #a name that does not encode the strand and the frame cannot be mapped to the DNA, skip it
                        nameMatch = frameNamePattern.match(seqName)
                        if nameMatch is None:
                            sys.stderr.write('Wrong seq name: ' + seqName + ' \n')
                            continue
                        frame = int(nameMatch.group(2))

                        #extract regions from the DNA sequences (consider 3 ORF and reverse complements)

                        #name of the whole sequence
                        dnaSeqName = re.sub(r'([0-9]+_[0-9]+)_[pr]+[012]', r'\1', seqName)
                        #whole DNA sequence
                        dnaSeq = dnaSeqDict[dnaSeqName].seq

                        #reverse complement (contains "pr")
                        tagRev = 'p'
                        if nameMatch.group(1) == 'pr':
                            dnaSeq = dnaSeq.reverse_complement()
                            tagRev = 'pr'

                        #protein positions (1-based) to DNA positions, shifted by the frame
                        tagFrom = ((from1 - 1)*3) + frame
                        tagTo = (to1*3) + frame
                        tagRev += str(frame)
                        dnaSeq = dnaSeq[tagFrom:tagTo]

                        tag = str(str(tagFrom) + '_' + str(tagTo) + '_' + tagRev)
                        outFileBuffer.writeText(str('>' + dnaSeqName + '_' + tag + '\n' + str(dnaSeq) + '\n'))

                outFileBuffer.close()

                #if no marker gene found
                if outFileBuffer.isEmpty():
                    continue

                #run mothur classify (bayesian? the same as for the 16S analysis)
                templateFile = mgFiles.getFilePath(geneName, 'templateDNA')
                taxonomyFile = mgFiles.getFilePath(geneName, 'taxonomyDNA')
                assert ((templateFile is not None) and (taxonomyFile is not None))
                cmd = str('time ' + self.mothur + ' "#classify.seqs(fasta=' + regionDnaFasta + ', template=' + templateFile
                    + ', taxonomy=' +  taxonomyFile + ', ' + self.mothurParam + ')"')
                if os.name == 'posix':
                    mothurProc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=self.markerGeneWorkingDir,
                                                  stdout=stdoutLog)
                    print('run cmd: %s' % cmd)
                    mothurProc.wait()
                    print('mothur return code: %s' % mothurProc.returncode)
                    if mothurProc.returncode != 0:
                        raise Exception("Command returned with non-zero %s status: %s" % (mothurProc.returncode, cmd))
                else:
                    print('Cannot run mothur since your system is not "posix" but "%s" \n%s' % (os.name, cmd))

                #transform the mothur output to a simple output (name, ncbid, weight)

                #try the default name of the mothur output file first, then the '.bayesian.taxonomy' variant
                mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile)
                if not os.path.isfile(mothurPredFileName):
                    mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile, suffix='.bayesian.taxonomy')
                    if not os.path.isfile(mothurPredFileName):
                        print("Can't open file: %s" % mothurPredFileName)

                outPredFileName = os.path.join(self.markerGeneWorkingDir,
                                               str(os.path.basename(fastaFileDNA) + '_' + geneName + '.mP'))
                #NOTE(review): outBuffer is not closed here - presumably the parser closes it; confirm
                outBuffer = OutFileBuffer(outPredFileName, bufferText=True)
                forEachLine(mothurPredFileName, _MothurOutFileParser(outBuffer, geneName))

                #the predictions of the individual genes are separated by a newline
                if not outAllBuffer.isEmpty():
                    outAllBuffer.writeText('\n')
                outAllBuffer.writeText(outBuffer.getTextBuffer())
        finally:
            #release the log file and the collective output also if the analysis of a gene fails
            if stdoutLog is not None:
                stdoutLog.close()
            outAllBuffer.close()