Пример #1
0
    def extractClusteredReads(self, clusters, referenceSeqHits, minSeqCutoff):
        """Extract the sequences of all reads assigned to sufficiently large clusters.

        Args:
            clusters: iterable of indexable items; item[0] is the cluster's hit
                count and item[1] is an iterable of reference sequence ids.
            referenceSeqHits: mapping from reference sequence id to an object
                with `pairs` and `singles` attributes, each mapping a sequence
                file name to a collection of read ids.
            minSeqCutoff: a cluster is used only if its hit count is strictly
                greater than this value.

        Returns:
            dict mapping each sequence file name to the value returned by
            extractSeqs(filename, seqIds) for the reads gathered from that file.
        """
        # gather the read ids of every accepted cluster, grouped by source file
        readsInFiles = {}
        for cluster in clusters:
            hits = cluster[0]
            ggIds = cluster[1]

            if hits > minSeqCutoff:
                for ggId in ggIds:
                    refSeqHit = referenceSeqHits[ggId]

                    # paired and single files are merged in exactly the same way
                    for readsByFile in (refSeqHit.pairs, refSeqHit.singles):
                        for seqFile in readsByFile:
                            readsInFiles.setdefault(seqFile, set()).update(readsByFile[seqFile])

        seqsInFiles = {}
        for filename in readsInFiles:
            seqIds = set()
            for readId in readsInFiles[filename]:
                # drop the text from the last '/' onwards; an id without any
                # '/' is kept whole (rfind would return -1 and a plain slice
                # would silently chop off its last character)
                slashPos = readId.rfind('/')
                if slashPos != -1:
                    readId = readId[0:slashPos]
                seqIds.add(readId)
            seqsInFiles[filename] = extractSeqs(filename, seqIds)

        return seqsInFiles
Пример #2
0
    def extractClusteredReads(self, clusters, referenceSeqHits, minSeqCutoff):
        """Extract the sequences of all reads assigned to sufficiently large clusters.

        Args:
            clusters: iterable of indexable items; item[0] is the cluster's hit
                count and item[1] is an iterable of reference sequence ids.
            referenceSeqHits: mapping from reference sequence id to an object
                with `pairs` and `singles` attributes, each mapping a sequence
                file name to a collection of read ids.
            minSeqCutoff: a cluster is used only if its hit count is strictly
                greater than this value.

        Returns:
            dict mapping each sequence file name to the value returned by
            extractSeqs(filename, seqIds) for the reads gathered from that file.
        """
        # gather the read ids of every accepted cluster, grouped by source file
        readsInFiles = {}
        for cluster in clusters:
            hits = cluster[0]
            ggIds = cluster[1]

            if hits > minSeqCutoff:
                for ggId in ggIds:
                    refSeqHit = referenceSeqHits[ggId]

                    # paired and single files are merged in exactly the same way
                    for readsByFile in (refSeqHit.pairs, refSeqHit.singles):
                        for seqFile in readsByFile:
                            readsInFiles.setdefault(seqFile, set()).update(readsByFile[seqFile])

        seqsInFiles = {}
        for filename in readsInFiles:
            seqIds = set()
            for readId in readsInFiles[filename]:
                # drop the text from the last '/' onwards; an id without any
                # '/' is kept whole (rfind would return -1 and a plain slice
                # would silently chop off its last character)
                slashPos = readId.rfind('/')
                if slashPos != -1:
                    readId = readId[0:slashPos]
                seqIds.add(readId)
            seqsInFiles[filename] = extractSeqs(filename, seqIds)

        return seqsInFiles
Пример #3
0
  def processSingles(self, singles, outputDir, prefix):
    """Write the reads of each single-end file that mapped to 16S to a FASTA file.

    For every file in `singles` the read ids are obtained from
    self.readSingleBAM(outputDir + <file basename> + '.bam'), their sequences
    are fetched with extractSeqs, and the result is written to
    '<prefix>.<name>.all.16S.fasta', where <name> is the text between the last
    '/' and the last '.' of the input path.

    Args:
      singles: sequence of single-end read file paths.
      outputDir: string prepended verbatim to the BAM file name (no path
        separator is inserted here).
      prefix: string prepended to every output FASTA file name.
    """
    # single-argument print(...) emits the same text as the print statement
    for seqFile in singles:
      print('Identifying 16S sequences in single-end reads: ' + seqFile)

      outputPrefix = prefix + '.' + seqFile[seqFile.rfind('/')+1:seqFile.rfind('.')]

      readsMappedTo16S = self.readSingleBAM(outputDir + ntpath.basename(seqFile) + '.bam')

      print('  Hits in ' + seqFile + ': ' + str(len(readsMappedTo16S)))

      # extract reads with hits
      seqs = extractSeqs(seqFile, readsMappedTo16S)

      # create file with all 16S sequences; the with-block closes the file
      # even if a lookup or write fails part way through
      allSeqFile = outputPrefix + '.all.16S.fasta'
      with open(allSeqFile, 'w') as fout:
        for seqId in readsMappedTo16S:
          fout.write('>' + seqs[seqId][0] + '\n')
          fout.write(seqs[seqId][1] + '\n')

      print('  Identified 16S reads written to: ' + allSeqFile)
      print('')
Пример #4
0
    def __processSingles(self, singles, evalue, alignLenThreshold, outputDir,
                         sample, threadsPerSample, queueOut):
        """Identify SSU reads in single-end files and report them on `queueOut`.

        For every file in `singles` this calls self.hmmSearch, obtains the
        bacterial, archaeal and eukaryotic hit sets by calling self.__getHits
        on '<outputPrefix>.<domain>.table.txt', writes the sequences of all
        hits to '<outputPrefix>.SSU.fasta' and puts a filled-in HitRecord on
        `queueOut`.

        Args:
            singles: sequence of single-end read file paths.
            evalue: passed through to self.hmmSearch.
            alignLenThreshold: passed through to self.__getHits.
            outputDir: directory under which 'extracted/<sample>.<name>' output
                prefixes are built.
            sample: sample name used in the output prefix.
            threadsPerSample: passed through to self.hmmSearch.
            queueOut: object with a put() method that receives one HitRecord
                per input file.
        """
        for seqFile in singles:
            # <name> is the text between the last '/' and last '.' of the path
            outputPrefix = os.path.join(
                outputDir, 'extracted', sample + '.' +
                seqFile[seqFile.rfind('/') + 1:seqFile.rfind('.')])

            self.hmmSearch(seqFile, evalue, threadsPerSample, outputPrefix)

            # reads hits
            hitsBacteria = self.__getHits(outputPrefix + '.bacteria.table.txt',
                                          alignLenThreshold)
            hitsArchaea = self.__getHits(outputPrefix + '.archaea.table.txt',
                                         alignLenThreshold)
            hitsEuk = self.__getHits(outputPrefix + '.euk.table.txt',
                                     alignLenThreshold)

            hits = hitsBacteria.union(hitsArchaea).union(hitsEuk)

            # extract reads with hits
            seqs = extractSeqs(seqFile, hits)

            # create file with all hit sequences; the with-block closes the
            # file even if a lookup or write fails part way through
            allSeqFile = outputPrefix + '.SSU.fasta'
            with open(allSeqFile, 'w') as fout:
                for seqId in hits:
                    # strip read identifier as these will be mapped as singletons
                    header = seqs[seqId][0]
                    if '/' in header:
                        header = header[0:header.rfind('/')]
                    fout.write('>' + header + '\n')
                    fout.write(seqs[seqId][1] + '\n')

            # gather output data
            hitRecord = HitRecord()
            hitRecord.pair1 = seqFile
            hitRecord.pair2 = None

            hitRecord.hits1 = len(hits)
            hitRecord.hitsBacteria1 = len(hitsBacteria)
            hitRecord.hitsArchaea1 = len(hitsArchaea)
            hitRecord.hitsEuk1 = len(hitsEuk)

            hitRecord.hits2 = None

            # hits reported for exactly one of the three domains
            hitRecord.uniqueBacterialHits = len(
                hitsBacteria.difference(hitsArchaea.union(hitsEuk)))
            hitRecord.uniqueArchaealHits = len(
                hitsArchaea.difference(hitsBacteria.union(hitsEuk)))
            hitRecord.uniqueEukHits = len(
                hitsEuk.difference(hitsBacteria.union(hitsArchaea)))

            hitRecord.allSeqFile = allSeqFile

            queueOut.put(hitRecord)
Пример #5
0
    def processSingles(self, singles, threads, evalue, outputDir, sample):
        """Identify hits in single-end read files and write them to FASTA files.

        For every file in `singles` this calls self.hmmSearch, obtains the
        forward and reverse-complement hit sets for the bacterial, archaeal and
        eukaryotic tables via self.getHits, and writes the sequences of all
        hits to '<outputPrefix>.lsu.SSU.fasta'. Progress is printed unless
        self.bQuiet is set.

        Args:
            singles: sequence of single-end read file paths.
            threads: passed through to self.hmmSearch.
            evalue: passed through to self.hmmSearch.
            outputDir: string concatenated directly with 'extracted_lsu/' (no
                path separator is inserted here).
            sample: sample name used in the output prefix.
        """
        # single-argument print(...) emits the same text as the print statement
        for seqFile in singles:
            if not self.bQuiet:
                print('Identifying 16S/18S sequences in single-end reads: ' + seqFile)

            # <name> is the text between the last '/' and last '.' of the path
            outputPrefix = outputDir + 'extracted_lsu/' + sample + '.' + seqFile[seqFile.rfind('/')+1:seqFile.rfind('.')]

            self.hmmSearch(seqFile, threads, evalue, outputPrefix)

            # reads hits
            hitsBacteria = self.getHits(outputPrefix + '.lsu.bacteria.table.txt')
            hitsRevCompBacteria = self.getHits(outputPrefix + '.lsu.bacteria.table.rev_comp.txt')

            hitsArchaea = self.getHits(outputPrefix + '.lsu.archaea.table.txt')
            hitsRevCompArcheae = self.getHits(outputPrefix + '.lsu.archaea.table.rev_comp.txt')

            hitsEuk = self.getHits(outputPrefix + '.lsu.euk.table.txt')
            hitsRevCompEuk = self.getHits(outputPrefix + '.lsu.euk.table.rev_comp.txt')

            hits = hitsBacteria.union(hitsRevCompBacteria).union(hitsArchaea).union(hitsRevCompArcheae).union(hitsEuk).union(hitsRevCompEuk)

            if not self.bQuiet:
                print('  Hits in ' + seqFile)
                print('    Fwd. bacterial hits: ' + str(len(hitsBacteria)))
                print('    Rev. comp. bacterial hits: ' + str(len(hitsRevCompBacteria)))
                print('    Fwd. archaeal hits: ' + str(len(hitsArchaea)))
                print('    Rev. comp. archaeal hits: ' + str(len(hitsRevCompArcheae)))
                print('    Fwd. eukaryotic hits: ' + str(len(hitsEuk)))
                print('    Rev. comp. eukaryotic hits: ' + str(len(hitsRevCompEuk)))
                print('')
                print('  Identified 16S/18S reads: ' + str(len(hits)) + ' reads')
                print('')

            # extract reads with hits
            seqs = extractSeqs(seqFile, hits)

            # create file with all hit sequences; the with-block closes the
            # file even if a lookup or write fails part way through
            allSeqFile = outputPrefix + '.lsu.SSU.fasta'
            with open(allSeqFile, 'w') as fout:
                for seqId in hits:
                    # strip read identifier as these will be mapped as singletons
                    header = seqs[seqId][0]
                    if '/' in header:
                        header = header[0:header.rfind('/')]
                    fout.write('>' + header + '\n')
                    fout.write(seqs[seqId][1] + '\n')

            if not self.bQuiet:
                print('  Identified 16S/18 reads written to: ' + allSeqFile)
                print('')
Пример #6
0
    def __processSingles(self, singles, evalue, alignLenThreshold, outputDir, sample, threadsPerSample, queueOut):
        """Identify SSU reads in single-end files and report them on `queueOut`.

        For every file in `singles` this calls self.hmmSearch, obtains the
        bacterial, archaeal and eukaryotic hit sets by calling self.__getHits
        on '<outputPrefix>.<domain>.table.txt', writes the sequences of all
        hits to '<outputPrefix>.SSU.fasta' and puts a filled-in HitRecord on
        `queueOut`.

        Args:
            singles: sequence of single-end read file paths.
            evalue: passed through to self.hmmSearch.
            alignLenThreshold: passed through to self.__getHits.
            outputDir: directory under which 'extracted/<sample>.<name>' output
                prefixes are built.
            sample: sample name used in the output prefix.
            threadsPerSample: passed through to self.hmmSearch.
            queueOut: object with a put() method that receives one HitRecord
                per input file.
        """
        for seqFile in singles:
            # <name> is the text between the last '/' and last '.' of the path
            outputPrefix = os.path.join(outputDir, 'extracted', sample + '.' + seqFile[seqFile.rfind('/')+1:seqFile.rfind('.')])

            self.hmmSearch(seqFile, evalue, threadsPerSample, outputPrefix)

            # reads hits
            hitsBacteria = self.__getHits(outputPrefix + '.bacteria.table.txt', alignLenThreshold)
            hitsArchaea = self.__getHits(outputPrefix + '.archaea.table.txt', alignLenThreshold)
            hitsEuk = self.__getHits(outputPrefix + '.euk.table.txt', alignLenThreshold)

            hits = hitsBacteria.union(hitsArchaea).union(hitsEuk)

            # extract reads with hits
            seqs = extractSeqs(seqFile, hits)

            # create file with all hit sequences; the with-block closes the
            # file even if a lookup or write fails part way through
            allSeqFile = outputPrefix + '.SSU.fasta'
            with open(allSeqFile, 'w') as fout:
                for seqId in hits:
                    # strip read identifier as these will be mapped as singletons
                    header = seqs[seqId][0]
                    if '/' in header:
                        header = header[0:header.rfind('/')]
                    fout.write('>' + header + '\n')
                    fout.write(seqs[seqId][1] + '\n')

            # gather output data
            hitRecord = HitRecord()
            hitRecord.pair1 = seqFile
            hitRecord.pair2 = None

            hitRecord.hits1 = len(hits)
            hitRecord.hitsBacteria1 = len(hitsBacteria)
            hitRecord.hitsArchaea1 = len(hitsArchaea)
            hitRecord.hitsEuk1 = len(hitsEuk)

            hitRecord.hits2 = None

            # hits reported for exactly one of the three domains
            hitRecord.uniqueBacterialHits = len(hitsBacteria.difference(hitsArchaea.union(hitsEuk)))
            hitRecord.uniqueArchaealHits = len(hitsArchaea.difference(hitsBacteria.union(hitsEuk)))
            hitRecord.uniqueEukHits = len(hitsEuk.difference(hitsBacteria.union(hitsArchaea)))

            hitRecord.allSeqFile = allSeqFile

            queueOut.put(hitRecord)
Пример #7
0
    def __processPairs(self, pairs, evalue, alignLenThreshold, outputDir, sample, threadsPerSample, queueOut):
        """Identify SSU reads in paired-end files and report them on `queueOut`.

        `pairs` is consumed two entries at a time (pairs[i], pairs[i+1]). For
        each pair this calls self.hmmSearch on both files, obtains the
        bacterial, archaeal and eukaryotic hit sets via self.__getHits, and
        writes these FASTA files:
          * '<prefix1>.all.SSU.fasta'        - hits of file 1, then hits of file 2
          * '<prefixN>.union.SSU.fasta'      - ids hit in either file, one file per mate
          * '<prefixN>.intersect.SSU.fasta'  - ids hit in both files, one file per mate
          * '<prefix1>.difference.SSU.fasta' - ids hit in only one file, with
                                               the text from the last '/' of
                                               the header onwards removed
        A filled-in HitRecord is then put on `queueOut`.

        Args:
            pairs: sequence of read file paths; consecutive entries form a pair.
            evalue: passed through to self.hmmSearch.
            alignLenThreshold: passed through to self.__getHits.
            outputDir: directory under which 'extracted/<sample>.<name>' output
                prefixes are built.
            sample: sample name used in the output prefixes.
            threadsPerSample: passed through to self.hmmSearch.
            queueOut: object with a put() method that receives one HitRecord
                per pair.
        """
        def writeFasta(fastaFile, sources, stripReadId=False):
            # Write, in order, the records of each (seqIds, seqs) entry of
            # `sources` to one FASTA file. The with-block closes the file even
            # if a lookup or write fails part way through.
            with open(fastaFile, 'w') as fout:
                for seqIds, seqs in sources:
                    for seqId in seqIds:
                        header = seqs[seqId][0]
                        if stripReadId and '/' in header:
                            header = header[0:header.rfind('/')]
                        fout.write('>' + header + '\n')
                        fout.write(seqs[seqId][1] + '\n')

        for i in range(0, len(pairs), 2):
            pair1 = pairs[i]
            pair2 = pairs[i+1]

            # <name> is the text between the last '/' and last '.' of the path
            outputPrefix1 = os.path.join(outputDir,'extracted',sample + '.' + pair1[pair1.rfind('/')+1:pair1.rfind('.')])
            outputPrefix2 = os.path.join(outputDir,'extracted',sample + '.' + pair2[pair2.rfind('/')+1:pair2.rfind('.')])

            self.hmmSearch(pair1, evalue, threadsPerSample, outputPrefix1)
            self.hmmSearch(pair2, evalue, threadsPerSample, outputPrefix2)

            # reads hits
            hitsBacteria1 = self.__getHits(outputPrefix1 + '.bacteria.table.txt', alignLenThreshold)
            hitsArchaea1 = self.__getHits(outputPrefix1 + '.archaea.table.txt', alignLenThreshold)
            hitsEuk1 = self.__getHits(outputPrefix1 + '.euk.table.txt', alignLenThreshold)

            hitsBacteria2 = self.__getHits(outputPrefix2 + '.bacteria.table.txt', alignLenThreshold)
            hitsArchaea2 = self.__getHits(outputPrefix2 + '.archaea.table.txt', alignLenThreshold)
            hitsEuk2 = self.__getHits(outputPrefix2 + '.euk.table.txt', alignLenThreshold)

            # combine hits
            hits1 = hitsBacteria1.union(hitsArchaea1).union(hitsEuk1)
            hits2 = hitsBacteria2.union(hitsArchaea2).union(hitsEuk2)

            hitUnion = hits1.union(hits2)
            hitIntersection = hits1.intersection(hits2)
            hitDiff1 = hits1.difference(hits2)
            hitDiff2 = hits2.difference(hits1)

            # extract reads with hits (both mates are fetched for every id in the union)
            seqs1 = extractSeqs(pair1, hitUnion)
            seqs2 = extractSeqs(pair2, hitUnion)

            # create file with all 16S/18S sequences
            allSeqFile = outputPrefix1 + '.all.SSU.fasta'
            writeFasta(allSeqFile, [(hits1, seqs1), (hits2, seqs2)])

            # create paired-end files where at least one read maps to a 16S/18S
            pair1FileUnion = outputPrefix1 + '.union.SSU.fasta'
            writeFasta(pair1FileUnion, [(hitUnion, seqs1)])

            pair2FileUnion = outputPrefix2 + '.union.SSU.fasta'
            writeFasta(pair2FileUnion, [(hitUnion, seqs2)])

            # create paired-end files where both reads map to a 16S/18S
            pair1FileIntersect = outputPrefix1 + '.intersect.SSU.fasta'
            writeFasta(pair1FileIntersect, [(hitIntersection, seqs1)])

            pair2FileIntersect = outputPrefix2 + '.intersect.SSU.fasta'
            writeFasta(pair2FileIntersect, [(hitIntersection, seqs2)])

            # create file where only one read maps to a 16S/18S; read
            # identifiers are stripped as these will be mapped as singletons
            diffFile = outputPrefix1 + '.difference.SSU.fasta'
            writeFasta(diffFile, [(hitDiff1, seqs1), (hitDiff2, seqs2)], stripReadId=True)

            # gather output data
            bacHits = hitsBacteria1.union(hitsBacteria2)
            arHits = hitsArchaea1.union(hitsArchaea2)
            eukHits = hitsEuk1.union(hitsEuk2)

            hitRecord = HitRecord()
            hitRecord.pair1 = pair1
            hitRecord.pair2 = pair2

            hitRecord.hits1 = len(hits1)
            hitRecord.hitsBacteria1 = len(hitsBacteria1)
            hitRecord.hitsArchaea1 = len(hitsArchaea1)
            hitRecord.hitsEuk1 = len(hitsEuk1)

            hitRecord.hits2 = len(hits2)
            hitRecord.hitsBacteria2 = len(hitsBacteria2)
            hitRecord.hitsArchaea2 = len(hitsArchaea2)
            hitRecord.hitsEuk2 = len(hitsEuk2)

            # hits reported for exactly one of the three domains
            hitRecord.uniqueBacterialHits = len(bacHits.difference(arHits.union(eukHits)))
            hitRecord.uniqueArchaealHits = len(arHits.difference(bacHits.union(eukHits)))
            hitRecord.uniqueEukHits = len(eukHits.difference(bacHits.union(arHits)))

            hitRecord.hitUnion = len(hitUnion)
            hitRecord.hitIntersect = len(hitIntersection)
            hitRecord.hitDiff = len(hitDiff1) + len(hitDiff2)

            hitRecord.allSeqFile = allSeqFile
            hitRecord.pair1FileUnion = pair1FileUnion
            hitRecord.pair2FileUnion = pair2FileUnion
            hitRecord.pair1FileIntersect = pair1FileIntersect
            hitRecord.pair2FileIntersect = pair2FileIntersect
            hitRecord.diffFile = diffFile

            queueOut.put(hitRecord)
Пример #8
0
    def processSingles(self, singles, threads, evalue, outputDir, sample):
        """Identify hits in single-end read files and write them to FASTA files.

        For every file in `singles` this calls self.hmmSearch, obtains the
        forward and reverse-complement hit sets for the bacterial, archaeal and
        eukaryotic tables via self.getHits, and writes the sequences of all
        hits to '<outputPrefix>.lsu.SSU.fasta'. Progress is printed unless
        self.bQuiet is set.

        Args:
            singles: sequence of single-end read file paths.
            threads: passed through to self.hmmSearch.
            evalue: passed through to self.hmmSearch.
            outputDir: directory under which 'extracted_lsu/<sample>.<name>'
                output prefixes are built.
            sample: sample name (a string) used in the output prefix.
        """
        # single-argument print(...) emits the same text as the print statement
        for seqFile in singles:
            if not self.bQuiet:
                print('Identifying 16S/18S sequences in single-end reads: ' + seqFile)

            # <name> is the text between the last '/' and last '.' of the path.
            # `sample` must be concatenated as-is: applying a unary '+' to a
            # string raises TypeError.
            outputPrefix = os.path.join(
                outputDir, 'extracted_lsu', sample + '.' +
                seqFile[seqFile.rfind('/') + 1:seqFile.rfind('.')])

            self.hmmSearch(seqFile, threads, evalue, outputPrefix)

            # reads hits
            hitsBacteria = self.getHits(outputPrefix +
                                        '.lsu.bacteria.table.txt')
            hitsRevCompBacteria = self.getHits(
                outputPrefix + '.lsu.bacteria.table.rev_comp.txt')

            hitsArchaea = self.getHits(outputPrefix + '.lsu.archaea.table.txt')
            hitsRevCompArcheae = self.getHits(
                outputPrefix + '.lsu.archaea.table.rev_comp.txt')

            hitsEuk = self.getHits(outputPrefix + '.lsu.euk.table.txt')
            hitsRevCompEuk = self.getHits(outputPrefix +
                                          '.lsu.euk.table.rev_comp.txt')

            hits = hitsBacteria.union(hitsRevCompBacteria).union(
                hitsArchaea).union(hitsRevCompArcheae).union(hitsEuk).union(
                    hitsRevCompEuk)

            if not self.bQuiet:
                print('  Hits in ' + seqFile)
                print('    Fwd. bacterial hits: ' + str(len(hitsBacteria)))
                print('    Rev. comp. bacterial hits: ' + str(len(hitsRevCompBacteria)))
                print('    Fwd. archaeal hits: ' + str(len(hitsArchaea)))
                print('    Rev. comp. archaeal hits: ' + str(len(hitsRevCompArcheae)))
                print('    Fwd. eukaryotic hits: ' + str(len(hitsEuk)))
                print('    Rev. comp. eukaryotic hits: ' + str(len(hitsRevCompEuk)))
                print('')
                print('  Identified 16S/18S reads: ' + str(len(hits)) + ' reads')
                print('')

            # extract reads with hits
            seqs = extractSeqs(seqFile, hits)

            # create file with all hit sequences; the with-block closes the
            # file even if a lookup or write fails part way through
            allSeqFile = outputPrefix + '.lsu.SSU.fasta'
            with open(allSeqFile, 'w') as fout:
                for seqId in hits:
                    # strip read identifier as these will be mapped as singletons
                    header = seqs[seqId][0]
                    if '/' in header:
                        header = header[0:header.rfind('/')]
                    fout.write('>' + header + '\n')
                    fout.write(seqs[seqId][1] + '\n')

            if not self.bQuiet:
                print('  Identified 16S/18 reads written to: ' + allSeqFile)
                print('')
Пример #9
0
    def processPairs(self, pairs, threads, evalue, outputDir, sample):
        """Identify hits in paired-end read files and write them to FASTA files.

        `pairs` is consumed two entries at a time (pairs[i], pairs[i + 1]). For
        each pair this calls self.hmmSearch on both files, obtains the forward
        and reverse-complement hit sets for the bacterial, archaeal and
        eukaryotic tables via self.getHits, and writes these FASTA files:
          * '<prefix1>.lsu.all.SSU.fasta'        - hits of file 1, then hits of file 2
          * '<prefixN>.lsu.union.SSU.fasta'      - ids hit in either file, one file per mate
          * '<prefixN>.lsu.intersect.SSU.fasta'  - ids hit in both files, one file per mate
          * '<prefix1>.lsu.difference.SSU.fasta' - ids hit in only one file,
                                                   with the text from the last
                                                   '/' of the header onwards
                                                   removed
        Progress is printed unless self.bQuiet is set.

        Args:
            pairs: sequence of read file paths; consecutive entries form a pair.
            threads: passed through to self.hmmSearch.
            evalue: passed through to self.hmmSearch.
            outputDir: directory under which 'extracted_lsu/<sample>.<name>'
                output prefixes are built.
            sample: sample name used in the output prefixes.
        """
        def writeFasta(fastaFile, sources, stripReadId=False):
            # Write, in order, the records of each (seqIds, seqs) entry of
            # `sources` to one FASTA file. The with-block closes the file even
            # if a lookup or write fails part way through.
            with open(fastaFile, 'w') as fout:
                for seqIds, seqs in sources:
                    for seqId in seqIds:
                        header = seqs[seqId][0]
                        if stripReadId and '/' in header:
                            header = header[0:header.rfind('/')]
                        fout.write('>' + header + '\n')
                        fout.write(seqs[seqId][1] + '\n')

        # single-argument print(...) emits the same text as the print statement
        for i in range(0, len(pairs), 2):
            pair1 = pairs[i]
            pair2 = pairs[i + 1]

            if not self.bQuiet:
                print('Identifying LSU sequences in paired-end reads: ' + pair1 + ', ' + pair2)

            # <name> is the text between the last '/' and last '.' of the path
            outputPrefix1 = os.path.join(
                outputDir, 'extracted_lsu',
                sample + '.' + pair1[pair1.rfind('/') + 1:pair1.rfind('.')])
            outputPrefix2 = os.path.join(
                outputDir, 'extracted_lsu',
                sample + '.' + pair2[pair2.rfind('/') + 1:pair2.rfind('.')])

            if not self.bQuiet:
                print('  Processing file: ' + pair1)
            self.hmmSearch(pair1, threads, evalue, outputPrefix1)

            if not self.bQuiet:
                print('  Processing file: ' + pair2)
            self.hmmSearch(pair2, threads, evalue, outputPrefix2)

            # reads hits
            hitsBacteria1 = self.getHits(outputPrefix1 + '.lsu.bacteria.table.txt')
            hitsRevCompBacteria1 = self.getHits(outputPrefix1 + '.lsu.bacteria.table.rev_comp.txt')

            hitsArchaea1 = self.getHits(outputPrefix1 + '.lsu.archaea.table.txt')
            hitsRevCompArcheae1 = self.getHits(outputPrefix1 + '.lsu.archaea.table.rev_comp.txt')

            hitsEuk1 = self.getHits(outputPrefix1 + '.lsu.euk.table.txt')
            hitsRevCompEuk1 = self.getHits(outputPrefix1 + '.lsu.euk.table.rev_comp.txt')

            hitsBacteria2 = self.getHits(outputPrefix2 + '.lsu.bacteria.table.txt')
            hitsRevCompBacteria2 = self.getHits(outputPrefix2 + '.lsu.bacteria.table.rev_comp.txt')

            hitsArchaea2 = self.getHits(outputPrefix2 + '.lsu.archaea.table.txt')
            hitsRevCompArcheae2 = self.getHits(outputPrefix2 + '.lsu.archaea.table.rev_comp.txt')

            hitsEuk2 = self.getHits(outputPrefix2 + '.lsu.euk.table.txt')
            hitsRevCompEuk2 = self.getHits(outputPrefix2 + '.lsu.euk.table.rev_comp.txt')

            # combine hits
            hits1 = hitsBacteria1.union(hitsRevCompBacteria1).union(
                hitsArchaea1).union(hitsRevCompArcheae1).union(hitsEuk1).union(
                    hitsRevCompEuk1)
            hits2 = hitsBacteria2.union(hitsRevCompBacteria2).union(
                hitsArchaea2).union(hitsRevCompArcheae2).union(hitsEuk2).union(
                    hitsRevCompEuk2)

            if not self.bQuiet:
                print('  Hits in ' + pair1 + ': ' + str(len(hits1)))
                print('    Fwd. bacterial hits: ' + str(len(hitsBacteria1)))
                print('    Rev. comp. bacterial hits: ' + str(len(hitsRevCompBacteria1)))
                print('    Fwd. archaeal hits: ' + str(len(hitsArchaea1)))
                print('    Rev. comp. archaeal hits: ' + str(len(hitsRevCompArcheae1)))
                print('    Fwd. eukaryotic hits: ' + str(len(hitsEuk1)))
                print('    Rev. comp. eukaryotic hits: ' + str(len(hitsRevCompEuk1)))
                print('')

                print('  Hits in ' + pair2 + ': ' + str(len(hits2)))
                print('    Fwd. bacterial hits: ' + str(len(hitsBacteria2)))
                print('    Rev. comp. bacterial hits: ' + str(len(hitsRevCompBacteria2)))
                print('    Fwd. archaeal hits: ' + str(len(hitsArchaea2)))
                print('    Rev. comp. archaeal hits: ' + str(len(hitsRevCompArcheae2)))
                print('    Fwd. eukaryotic hits: ' + str(len(hitsEuk2)))
                print('    Rev. comp. eukaryotic hits: ' + str(len(hitsRevCompEuk2)))
                print('')

            # extract reads with hits (both mates are fetched for every id in the union)
            if not self.bQuiet:
                print('  Extracting putative 16S/18S reads:')
            hitUnion = hits1.union(hits2)

            seqs1 = extractSeqs(pair1, hitUnion)
            seqs2 = extractSeqs(pair2, hitUnion)

            hitIntersection = hits1.intersection(hits2)
            hitDiff1 = hits1.difference(hits2)
            hitDiff2 = hits2.difference(hits1)

            # create file with all 16S/18S sequences
            allSeqFile = outputPrefix1 + '.lsu.all.SSU.fasta'
            writeFasta(allSeqFile, [(hits1, seqs1), (hits2, seqs2)])

            # create paired-end files where at least one read maps to a 16S/18S
            pair1FileUnion = outputPrefix1 + '.lsu.union.SSU.fasta'
            writeFasta(pair1FileUnion, [(hitUnion, seqs1)])

            pair2FileUnion = outputPrefix2 + '.lsu.union.SSU.fasta'
            writeFasta(pair2FileUnion, [(hitUnion, seqs2)])

            # create paired-end files where both reads map to a 16S/18S
            pair1FileIntersect = outputPrefix1 + '.lsu.intersect.SSU.fasta'
            writeFasta(pair1FileIntersect, [(hitIntersection, seqs1)])

            pair2FileIntersect = outputPrefix2 + '.lsu.intersect.SSU.fasta'
            writeFasta(pair2FileIntersect, [(hitIntersection, seqs2)])

            # create file where only one read maps to a 16S/18S; read
            # identifiers are stripped as these will be mapped as singletons
            diffFile = outputPrefix1 + '.lsu.difference.SSU.fasta'
            writeFasta(diffFile, [(hitDiff1, seqs1), (hitDiff2, seqs2)], stripReadId=True)

            if not self.bQuiet:
                print('    Hits to left reads: ' + str(len(hits1)))
                print('    Hits to right reads: ' + str(len(hits2)))
                print('    Pairs with at least one read identified as 16S/18S: ' + str(len(hitUnion)) + ' pairs')
                print('    Pairs with both read identified as 16S/18S: ' + str(len(hitIntersection)) + ' pairs')
                print('    Pairs with only one read identified as 16S/18S: ' + str(len(hitDiff1) + len(hitDiff2)) + ' reads')
                print('')
                print('    All identified 16S reads: ' + allSeqFile)
                print('    Pairs with at least one read identified as 16S/18S written to: ')
                print('      ' + pair1FileUnion)
                print('      ' + pair2FileUnion)
                print('    Pairs with both read identified as 16S/18S written to: ')
                print('      ' + pair1FileIntersect)
                print('      ' + pair2FileIntersect)
                print('    Pairs with only one read identified as 16S/18S written to: ')
                print('      ' + diffFile)
                print('')
Пример #10
0
    def processPairs(self, pairs, threads, evalue, outputDir, sample):
        for i in xrange(0, len(pairs), 2):
            pair1 = pairs[i]
            pair2 = pairs[i+1]

            if not self.bQuiet:
                print 'Identifying LSU sequences in paired-end reads: ' + pair1 + ', ' + pair2

            outputPrefix1 = outputDir + 'extracted_lsu/' + sample + '.' + pair1[pair1.rfind('/')+1:pair1.rfind('.')]
            outputPrefix2 = outputDir + 'extracted_lsu/' + sample + '.' + pair2[pair2.rfind('/')+1:pair2.rfind('.')]

            if not self.bQuiet:
                print '  Processing file: ' + pair1
            self.hmmSearch(pair1, threads, evalue, outputPrefix1)

            if not self.bQuiet:
                print '  Processing file: ' + pair2
            self.hmmSearch(pair2, threads, evalue, outputPrefix2)

            # reads hits
            hitsBacteria1 = self.getHits(outputPrefix1 + '.lsu.bacteria.table.txt')
            hitsRevCompBacteria1 = self.getHits(outputPrefix1 + '.lsu.bacteria.table.rev_comp.txt')

            hitsArchaea1 = self.getHits(outputPrefix1 + '.lsu.archaea.table.txt')
            hitsRevCompArcheae1 = self.getHits(outputPrefix1 + '.lsu.archaea.table.rev_comp.txt')

            hitsEuk1 = self.getHits(outputPrefix1 + '.lsu.euk.table.txt')
            hitsRevCompEuk1 = self.getHits(outputPrefix1 + '.lsu.euk.table.rev_comp.txt')

            hitsBacteria2 = self.getHits(outputPrefix2 + '.lsu.bacteria.table.txt')
            hitsRevCompBacteria2 = self.getHits(outputPrefix2 + '.lsu.bacteria.table.rev_comp.txt')

            hitsArchaea2 = self.getHits(outputPrefix2 + '.lsu.archaea.table.txt')
            hitsRevCompArcheae2 = self.getHits(outputPrefix2 + '.lsu.archaea.table.rev_comp.txt')

            hitsEuk2 = self.getHits(outputPrefix2 + '.lsu.euk.table.txt')
            hitsRevCompEuk2 = self.getHits(outputPrefix2 + '.lsu.euk.table.rev_comp.txt')

            # combine hits
            hits1 = hitsBacteria1.union(hitsRevCompBacteria1).union(hitsArchaea1).union(hitsRevCompArcheae1).union(hitsEuk1).union(hitsRevCompEuk1)
            hits2 = hitsBacteria2.union(hitsRevCompBacteria2).union(hitsArchaea2).union(hitsRevCompArcheae2).union(hitsEuk2).union(hitsRevCompEuk2)

            if not self.bQuiet:
                print '  Hits in ' + pair1 + ': ' + str(len(hits1))
                print '    Fwd. bacterial hits: ' + str(len(hitsBacteria1))
                print '    Rev. comp. bacterial hits: ' + str(len(hitsRevCompBacteria1))
                print '    Fwd. archaeal hits: ' + str(len(hitsArchaea1))
                print '    Rev. comp. archaeal hits: ' + str(len(hitsRevCompArcheae1))
                print '    Fwd. eukaryotic hits: ' + str(len(hitsEuk1))
                print '    Rev. comp. eukaryotic hits: ' + str(len(hitsRevCompEuk1))
                print ''

                print '  Hits in ' + pair2 + ': ' + str(len(hits2))
                print '    Fwd. bacterial hits: ' + str(len(hitsBacteria2))
                print '    Rev. comp. bacterial hits: ' + str(len(hitsRevCompBacteria2))
                print '    Fwd. archaeal hits: ' + str(len(hitsArchaea2))
                print '    Rev. comp. archaeal hits: ' + str(len(hitsRevCompArcheae2))
                print '    Fwd. eukaryotic hits: ' + str(len(hitsEuk2))
                print '    Rev. comp. eukaryotic hits: ' + str(len(hitsRevCompEuk2))
                print ''

            # extract reads with hits
            if not self.bQuiet:
                print '  Extracting putative 16S/18S reads:'
            hitUnion = hits1.union(hits2)

            seqs1 = extractSeqs(pair1, hitUnion)
            seqs2 = extractSeqs(pair2, hitUnion)

            # create file with all 16S/18S sequences
            allSeqFile = outputPrefix1 + '.lsu.all.SSU.fasta'
            fout = open(allSeqFile, 'w')
            for seqId in hits1:
                fout.write('>' + seqs1[seqId][0] + '\n')
                fout.write(seqs1[seqId][1] + '\n')

            for seqId in hits2:
                fout.write('>' + seqs2[seqId][0] + '\n')
                fout.write(seqs2[seqId][1] + '\n')

            fout.close()

            # create paired-end files where at least one read maps to a 16S/18S
            pair1FileUnion = outputPrefix1 + '.lsu.union.SSU.fasta'
            fout = open(pair1FileUnion, 'w')
            for seqId in hitUnion:
                fout.write('>' + seqs1[seqId][0] + '\n')
                fout.write(seqs1[seqId][1] + '\n')
            fout.close()

            pair2FileUnion = outputPrefix2 + '.lsu.union.SSU.fasta'
            fout = open(pair2FileUnion, 'w')
            for seqId in hitUnion:
                fout.write('>' + seqs2[seqId][0] + '\n')
                fout.write(seqs2[seqId][1] + '\n')
            fout.close()

            # create paired-end files where at least one read maps to a 16S/18S
            hitIntersection = hits1.intersection(hits2)
            pair1FileIntersect = outputPrefix1 + '.lsu.intersect.SSU.fasta'
            fout = open(pair1FileIntersect, 'w')
            for seqId in hitIntersection:
                fout.write('>' + seqs1[seqId][0] + '\n')
                fout.write(seqs1[seqId][1] + '\n')
            fout.close()

            pair2FileIntersect = outputPrefix2 + '.lsu.intersect.SSU.fasta'
            fout = open(pair2FileIntersect, 'w')
            for seqId in hitIntersection:
                fout.write('>' + seqs2[seqId][0] + '\n')
                fout.write(seqs2[seqId][1] + '\n')
            fout.close()

            # create file where only one read maps to a 16S/18S
            hitDiff1 = hits1.difference(hits2)
            diffFile = outputPrefix1 + '.lsu.difference.SSU.fasta'
            fout = open(diffFile, 'w')
            for seqId in hitDiff1:
                r = seqs1[seqId][0]     # strip read identifier as these will be mapped as singletons
                if '/' in r:
                    r = r[0:r.rfind('/')]
                fout.write('>' + r + '\n')
                fout.write(seqs1[seqId][1] + '\n')

            hitDiff2 = hits2.difference(hits1)
            for seqId in hitDiff2:
                r = seqs2[seqId][0]
                if '/' in r:
                    r = r[0:r.rfind('/')]
                fout.write('>' + r + '\n')
                fout.write(seqs2[seqId][1] + '\n')
            fout.close()

            if not self.bQuiet:
                print '    Hits to left reads: ' + str(len(hits1))
                print '    Hits to right reads: ' + str(len(hits2))
                print '    Pairs with at least one read identified as 16S/18S: ' + str(len(hitUnion)) + ' pairs'
                print '    Pairs with both read identified as 16S/18S: ' + str(len(hitIntersection)) + ' pairs'
                print '    Pairs with only one read identified as 16S/18S: ' + str(len(hitDiff1) + len(hitDiff2)) + ' reads'
                print ''
                print '    All identified 16S reads: ' + allSeqFile
                print '    Pairs with at least one read identified as 16S/18S written to: '
                print '      ' + pair1FileUnion
                print '      ' + pair2FileUnion
                print '    Pairs with both read identified as 16S/18S written to: '
                print '      ' + pair1FileIntersect
                print '      ' + pair2FileIntersect
                print '    Pairs with only one read identified as 16S/18S written to: '
                print '      ' + diffFile
                print ''
Пример #11
0
  def processPairs(self, pairs, outputDir, prefix):
    """Write FASTA files of putative 16S reads for each paired-end read set.

    pairs is consumed two entries at a time: (pairs[0], pairs[1]),
    (pairs[2], pairs[3]), ... are the first/second mate files of one pair.

    For every pair, self.readPairedBAM() is called on
    outputDir + <base name of first mate file> + '.bam' and is expected to
    return two set-like collections of read ids (one per mate). Sequences
    for the union of those ids are fetched from both mate files with
    extractSeqs(), whose result is indexed here as seqs[id] -> (header,
    sequence). Three files are then written:

      <prefix>.<name1>.all.16S.fasta - reads that were themselves reported
      <prefix>.<name1>.1.16S.fasta   - first mates of pairs with >= 1 hit
      <prefix>.<name2>.2.16S.fasta   - second mates of the same pairs

    where <nameN> is the mate file's base name without its extension.
    First-mate headers are written ending in '/1' and second-mate headers
    ending in '/2'; the suffix is appended only when not already present.

    Progress and the output file names are printed to stdout.
    """

    def writeRecords(fout, seqIds, seqs, mateSuffix):
      # Emit one FASTA record per id, tagging the header with the mate
      # suffix unless the header already carries it.
      for seqId in seqIds:
        header = seqs[seqId][0]
        if not header.endswith(mateSuffix):
          header += mateSuffix
        fout.write('>' + header + '\n')
        fout.write(seqs[seqId][1] + '\n')

    for i in xrange(0, len(pairs), 2):
      pair1 = pairs[i]
      pair2 = pairs[i+1]

      print('Identifying 16S sequences in paired-end reads: ' + pair1 + ', ' + pair2)

      # output names are derived from each mate file's base name minus extension
      baseName1 = pair1[pair1.rfind('/')+1:pair1.rfind('.')]
      baseName2 = pair2[pair2.rfind('/')+1:pair2.rfind('.')]
      outputPrefix = prefix + '.' + baseName1
      outputPrefix1 = outputPrefix + '.1'
      outputPrefix2 = prefix + '.' + baseName2 + '.2'

      # outputDir is concatenated as-is, so it must already end with a path separator
      bamFile = outputDir + ntpath.basename(pair1) + '.bam'
      readsMappedTo16S_1, readsMappedTo16S_2 = self.readPairedBAM(bamFile)

      print('  Hits in ' + pair1 + ': ' + str(len(readsMappedTo16S_1)))
      print('  Hits in ' + pair2 + ': ' + str(len(readsMappedTo16S_2)))
      print('')

      # extract all pairs where at least one read is on a 16S sequence
      print('Extracting putative 16S reads: ')
      readsMappedTo16S = readsMappedTo16S_1.union(readsMappedTo16S_2)

      seqs1 = extractSeqs(pair1, readsMappedTo16S)
      seqs2 = extractSeqs(pair2, readsMappedTo16S)

      # create file with all 16S sequences (only the mate that was reported)
      allSeqFile = outputPrefix + '.all.16S.fasta'
      with open(allSeqFile, 'w') as fout:
        writeRecords(fout, readsMappedTo16S_1, seqs1, '/1')
        writeRecords(fout, readsMappedTo16S_2, seqs2, '/2')

      print('  All identified 16S reads written to: ' + allSeqFile)

      # create paired-end files where at least one read maps to a 16S;
      # both files iterate the same set so records stay in matching order
      pair1File = outputPrefix1 + '.16S.fasta'
      with open(pair1File, 'w') as fout:
        writeRecords(fout, readsMappedTo16S, seqs1, '/1')

      pair2File = outputPrefix2 + '.16S.fasta'
      with open(pair2File, 'w') as fout:
        writeRecords(fout, readsMappedTo16S, seqs2, '/2')

      print('  Pairs with at least one read identified as 16S written to: ' + pair1File + ', ' + pair2File)
      print('    Pairs with at least one read identified as 16S: ' + str(len(readsMappedTo16S)))
      print('')
Пример #12
0
    def __processPairs(self, pairs, evalue, alignLenThreshold, outputDir,
                       sample, threadsPerSample, queueOut):
        """Identify SSU (16S/18S) reads in paired-end files and report them.

        pairs is consumed two entries at a time: (pairs[0], pairs[1]),
        (pairs[2], pairs[3]), ... are the first/second mate files of a pair.
        Output prefixes are <outputDir>/extracted/<sample>.<name>, where
        <name> is the mate file's base name without its extension.

        For every pair:
          * self.hmmSearch() is run on each mate file with evalue,
            threadsPerSample and that mate's output prefix;
          * hits are loaded with self.__getHits() from
            '<prefix>.bacteria.table.txt', '<prefix>.archaea.table.txt' and
            '<prefix>.euk.table.txt' (presumably written by hmmSearch -
            confirm), filtered by alignLenThreshold;
          * sequences for all hit ids are fetched from both mate files with
            extractSeqs(), indexed here as seqs[id] -> (header, sequence);
          * FASTA files are written:
              <prefix1>.all.SSU.fasta         reads that were themselves hit
              <prefixN>.union.SSU.fasta       pairs with at least one hit
              <prefixN>.intersect.SSU.fasta   pairs with both mates hit
              <prefix1>.difference.SSU.fasta  the hit mate of pairs with
                                              exactly one hit, header cut at
                                              its last '/'
          * a HitRecord holding the hit counts and output file names is put
            on queueOut.
        """

        def writeRecords(fout, seqIds, seqs, stripReadId=False):
            # Emit one FASTA record per id; optionally drop everything from
            # the last '/' of the header onwards.
            for seqId in seqIds:
                header = seqs[seqId][0]
                if stripReadId and '/' in header:
                    header = header[0:header.rfind('/')]
                fout.write('>' + header + '\n')
                fout.write(seqs[seqId][1] + '\n')

        for i in xrange(0, len(pairs), 2):
            pair1 = pairs[i]
            pair2 = pairs[i + 1]

            outputPrefix1 = os.path.join(
                outputDir, 'extracted',
                sample + '.' + pair1[pair1.rfind('/') + 1:pair1.rfind('.')])
            outputPrefix2 = os.path.join(
                outputDir, 'extracted',
                sample + '.' + pair2[pair2.rfind('/') + 1:pair2.rfind('.')])

            self.hmmSearch(pair1, evalue, threadsPerSample, outputPrefix1)
            self.hmmSearch(pair2, evalue, threadsPerSample, outputPrefix2)

            # read hits for each domain model
            hitsBacteria1 = self.__getHits(
                outputPrefix1 + '.bacteria.table.txt', alignLenThreshold)
            hitsArchaea1 = self.__getHits(
                outputPrefix1 + '.archaea.table.txt', alignLenThreshold)
            hitsEuk1 = self.__getHits(
                outputPrefix1 + '.euk.table.txt', alignLenThreshold)

            hitsBacteria2 = self.__getHits(
                outputPrefix2 + '.bacteria.table.txt', alignLenThreshold)
            hitsArchaea2 = self.__getHits(
                outputPrefix2 + '.archaea.table.txt', alignLenThreshold)
            hitsEuk2 = self.__getHits(
                outputPrefix2 + '.euk.table.txt', alignLenThreshold)

            # combine hits per mate, then across mates
            hits1 = hitsBacteria1.union(hitsArchaea1).union(hitsEuk1)
            hits2 = hitsBacteria2.union(hitsArchaea2).union(hitsEuk2)
            hitUnion = hits1.union(hits2)
            hitIntersection = hits1.intersection(hits2)
            hitDiff1 = hits1.difference(hits2)
            hitDiff2 = hits2.difference(hits1)

            # extract reads with hits (both mates of every hit pair)
            seqs1 = extractSeqs(pair1, hitUnion)
            seqs2 = extractSeqs(pair2, hitUnion)

            # create file with all 16S/18S sequences
            allSeqFile = outputPrefix1 + '.all.SSU.fasta'
            with open(allSeqFile, 'w') as fout:
                writeRecords(fout, hits1, seqs1)
                writeRecords(fout, hits2, seqs2)

            # create paired-end files where at least one read maps to a 16S/18S
            pair1FileUnion = outputPrefix1 + '.union.SSU.fasta'
            with open(pair1FileUnion, 'w') as fout:
                writeRecords(fout, hitUnion, seqs1)

            pair2FileUnion = outputPrefix2 + '.union.SSU.fasta'
            with open(pair2FileUnion, 'w') as fout:
                writeRecords(fout, hitUnion, seqs2)

            # create paired-end files where both reads map to a 16S/18S
            pair1FileIntersect = outputPrefix1 + '.intersect.SSU.fasta'
            with open(pair1FileIntersect, 'w') as fout:
                writeRecords(fout, hitIntersection, seqs1)

            pair2FileIntersect = outputPrefix2 + '.intersect.SSU.fasta'
            with open(pair2FileIntersect, 'w') as fout:
                writeRecords(fout, hitIntersection, seqs2)

            # create file where only one read maps to a 16S/18S; the read
            # identifier is stripped as these will be mapped as singletons
            diffFile = outputPrefix1 + '.difference.SSU.fasta'
            with open(diffFile, 'w') as fout:
                writeRecords(fout, hitDiff1, seqs1, stripReadId=True)
                writeRecords(fout, hitDiff2, seqs2, stripReadId=True)

            # gather output data
            bacHits = hitsBacteria1.union(hitsBacteria2)
            arHits = hitsArchaea1.union(hitsArchaea2)
            eukHits = hitsEuk1.union(hitsEuk2)

            hitRecord = HitRecord()
            hitRecord.pair1 = pair1
            hitRecord.pair2 = pair2

            hitRecord.hits1 = len(hits1)
            hitRecord.hitsBacteria1 = len(hitsBacteria1)
            hitRecord.hitsArchaea1 = len(hitsArchaea1)
            hitRecord.hitsEuk1 = len(hitsEuk1)

            hitRecord.hits2 = len(hits2)
            hitRecord.hitsBacteria2 = len(hitsBacteria2)
            hitRecord.hitsArchaea2 = len(hitsArchaea2)
            hitRecord.hitsEuk2 = len(hitsEuk2)

            # "unique" = ids hit by this domain's model and by neither other
            hitRecord.uniqueBacterialHits = len(
                bacHits.difference(arHits.union(eukHits)))
            hitRecord.uniqueArchaealHits = len(
                arHits.difference(bacHits.union(eukHits)))
            hitRecord.uniqueEukHits = len(
                eukHits.difference(bacHits.union(arHits)))

            hitRecord.hitUnion = len(hitUnion)
            hitRecord.hitIntersect = len(hitIntersection)
            hitRecord.hitDiff = len(hitDiff1) + len(hitDiff2)

            hitRecord.allSeqFile = allSeqFile
            hitRecord.pair1FileUnion = pair1FileUnion
            hitRecord.pair2FileUnion = pair2FileUnion
            hitRecord.pair1FileIntersect = pair1FileIntersect
            hitRecord.pair2FileIntersect = pair2FileIntersect
            hitRecord.diffFile = diffFile

            queueOut.put(hitRecord)