def extractClusteredReads(self, clusters, referenceSeqHits, minSeqCutoff):
    """Extract the sequences of every read belonging to a sufficiently large cluster.

    Args:
        clusters: iterable of records where index 0 is the cluster's hit
            count and index 1 is an iterable of reference sequence ids.
        referenceSeqHits: mapping from reference sequence id to an object
            exposing `pairs` and `singles`; each of those maps a read file
            to an iterable of read ids.
        minSeqCutoff: a cluster is used only when its hit count is strictly
            greater than this value.

    Returns:
        dict mapping each read file to whatever `extractSeqs(file, seqIds)`
        returns for the read ids gathered for that file (`extractSeqs` is
        defined outside this block).
    """
    # gather the read ids of interest, grouped by the file they came from
    readsInFiles = {}
    for cluster in clusters:
        hits = cluster[0]
        ggIds = cluster[1]

        if hits > minSeqCutoff:
            for ggId in ggIds:
                refSeqHit = referenceSeqHits[ggId]

                # paired files first, then single-ended files
                for readFiles in (refSeqHit.pairs, refSeqHit.singles):
                    for readFile in readFiles:
                        # update in place rather than copying the accumulated
                        # set on every union
                        readsInFiles.setdefault(readFile, set()).update(readFiles[readFile])

    seqsInFiles = {}
    for filename in readsInFiles:
        seqIds = set()
        for readId in readsInFiles[filename]:
            # Strip the trailing '/<mate>' identifier only when one exists;
            # rfind() returns -1 for ids without a '/', which would otherwise
            # silently drop the last character of the id.
            slashPos = readId.rfind('/')
            if slashPos != -1:
                readId = readId[0:slashPos]
            seqIds.add(readId)

        seqsInFiles[filename] = extractSeqs(filename, seqIds)

    return seqsInFiles
def processSingles(self, singles, outputDir, prefix):
    """Write the reads of each single-ended file that were mapped to a 16S sequence.

    For every file in `singles` the read ids are obtained from
    `self.readSingleBAM` using the BAM file `outputDir + <basename> + '.bam'`,
    the matching sequences are fetched with `extractSeqs` (defined outside
    this block), and a FASTA file `<prefix>.<file stem>.all.16S.fasta` is
    written.

    Args:
        singles: sequence of single-ended read file paths.
        outputDir: directory prefix holding the BAM files; it is concatenated
            directly, so it must already end with a path separator.
        prefix: prefix for the FASTA files that are written.
    """
    for seqFile in singles:
        print('Identifying 16S sequences in single-end reads: ' + seqFile)

        # file name without directory and without its last extension
        fileStem = seqFile[seqFile.rfind('/') + 1:seqFile.rfind('.')]
        outputPrefix = prefix + '.' + fileStem

        readsMappedTo16S = self.readSingleBAM(outputDir + ntpath.basename(seqFile) + '.bam')

        print(' Hits in ' + seqFile + ': ' + str(len(readsMappedTo16S)))

        # extract reads with hits
        seqs = extractSeqs(seqFile, readsMappedTo16S)

        # create file with all 16S sequences; the context manager closes the
        # handle even if a lookup or write fails part-way through
        allSeqFile = outputPrefix + '.all.16S.fasta'
        with open(allSeqFile, 'w') as fout:
            for seqId in readsMappedTo16S:
                fout.write('>' + seqs[seqId][0] + '\n')
                fout.write(seqs[seqId][1] + '\n')

        print(' Identified 16S reads written to: ' + allSeqFile)
        print('')
def __processSingles(self, singles, evalue, alignLenThreshold, outputDir, sample, threadsPerSample, queueOut):
    """Identify SSU reads in single-ended files and report them on a queue.

    For each file the HMM search is run via `self.hmmSearch`, the bacterial,
    archaeal and eukaryotic hit tables are read with `self.__getHits`, the
    hit reads are written to `<outputPrefix>.SSU.fasta`, and a `HitRecord`
    summarising the counts is put on `queueOut`.

    Args:
        singles: sequence of single-ended read file paths.
        evalue: passed through to `self.hmmSearch`.
        alignLenThreshold: passed through to `self.__getHits`.
        outputDir: directory under which the 'extracted' folder is used.
        sample: sample name used as the prefix of all output files.
        threadsPerSample: passed through to `self.hmmSearch`.
        queueOut: queue-like object; one `HitRecord` is `put` per input file.
    """
    for seqFile in singles:
        # file name without directory and without its last extension
        fileStem = seqFile[seqFile.rfind('/') + 1:seqFile.rfind('.')]
        outputPrefix = os.path.join(outputDir, 'extracted', sample + '.' + fileStem)

        self.hmmSearch(seqFile, evalue, threadsPerSample, outputPrefix)

        # reads hits
        hitsBacteria = self.__getHits(outputPrefix + '.bacteria.table.txt', alignLenThreshold)
        hitsArchaea = self.__getHits(outputPrefix + '.archaea.table.txt', alignLenThreshold)
        hitsEuk = self.__getHits(outputPrefix + '.euk.table.txt', alignLenThreshold)

        hits = hitsBacteria.union(hitsArchaea).union(hitsEuk)

        # extract reads with hits
        seqs = extractSeqs(seqFile, hits)

        # create file with all SSU sequences; the context manager closes the
        # handle even if a lookup or write fails part-way through
        allSeqFile = outputPrefix + '.SSU.fasta'
        with open(allSeqFile, 'w') as fout:
            for seqId in hits:
                header = seqs[seqId][0]

                # strip read identifier as these will be mapped as singletons
                if '/' in header:
                    header = header[0:header.rfind('/')]

                fout.write('>' + header + '\n')
                fout.write(seqs[seqId][1] + '\n')

        # gather output data
        hitRecord = HitRecord()
        hitRecord.pair1 = seqFile
        hitRecord.pair2 = None
        hitRecord.hits1 = len(hits)
        hitRecord.hitsBacteria1 = len(hitsBacteria)
        hitRecord.hitsArchaea1 = len(hitsArchaea)
        hitRecord.hitsEuk1 = len(hitsEuk)
        hitRecord.hits2 = None

        # reads hit by exactly one of the three domain-level models
        hitRecord.uniqueBacterialHits = len(hitsBacteria.difference(hitsArchaea.union(hitsEuk)))
        hitRecord.uniqueArchaealHits = len(hitsArchaea.difference(hitsBacteria.union(hitsEuk)))
        hitRecord.uniqueEukHits = len(hitsEuk.difference(hitsBacteria.union(hitsArchaea)))

        hitRecord.allSeqFile = allSeqFile

        queueOut.put(hitRecord)
def processSingles(self, singles, threads, evalue, outputDir, sample):
    """Identify reads with LSU model hits in single-ended files and write them to FASTA.

    For each file the HMM search is run via `self.hmmSearch`, the forward
    and reverse-complement hit tables for the bacterial, archaeal and
    eukaryotic models are read with `self.getHits`, and all hit reads are
    written to `<outputPrefix>.lsu.SSU.fasta`. Progress is printed unless
    `self.bQuiet` is set.

    Args:
        singles: sequence of single-ended read file paths.
        threads: passed through to `self.hmmSearch`.
        evalue: passed through to `self.hmmSearch`.
        outputDir: output directory; it is concatenated directly with
            'extracted_lsu/', so it must already end with a path separator.
        sample: sample name used as the prefix of all output files.
    """
    # NOTE(review): the progress messages say 16S/18S although the files are
    # tagged 'lsu'; the text is left untouched because it is runtime output.
    for seqFile in singles:
        if not self.bQuiet:
            print('Identifying 16S/18S sequences in single-end reads: ' + seqFile)

        # file name without directory and without its last extension
        fileStem = seqFile[seqFile.rfind('/') + 1:seqFile.rfind('.')]
        outputPrefix = outputDir + 'extracted_lsu/' + sample + '.' + fileStem

        self.hmmSearch(seqFile, threads, evalue, outputPrefix)

        # reads hits
        hitsBacteria = self.getHits(outputPrefix + '.lsu.bacteria.table.txt')
        hitsRevCompBacteria = self.getHits(outputPrefix + '.lsu.bacteria.table.rev_comp.txt')

        hitsArchaea = self.getHits(outputPrefix + '.lsu.archaea.table.txt')
        hitsRevCompArcheae = self.getHits(outputPrefix + '.lsu.archaea.table.rev_comp.txt')

        hitsEuk = self.getHits(outputPrefix + '.lsu.euk.table.txt')
        hitsRevCompEuk = self.getHits(outputPrefix + '.lsu.euk.table.rev_comp.txt')

        hits = (hitsBacteria.union(hitsRevCompBacteria)
                .union(hitsArchaea).union(hitsRevCompArcheae)
                .union(hitsEuk).union(hitsRevCompEuk))

        if not self.bQuiet:
            print(' Hits in ' + seqFile)
            print(' Fwd. bacterial hits: ' + str(len(hitsBacteria)))
            print(' Rev. comp. bacterial hits: ' + str(len(hitsRevCompBacteria)))
            print(' Fwd. archaeal hits: ' + str(len(hitsArchaea)))
            print(' Rev. comp. archaeal hits: ' + str(len(hitsRevCompArcheae)))
            print(' Fwd. eukaryotic hits: ' + str(len(hitsEuk)))
            print(' Rev. comp. eukaryotic hits: ' + str(len(hitsRevCompEuk)))
            print('')
            print(' Identified 16S/18S reads: ' + str(len(hits)) + ' reads')
            print('')

        # extract reads with hits
        seqs = extractSeqs(seqFile, hits)

        # create file with all identified sequences; the context manager
        # closes the handle even if a lookup or write fails part-way through
        allSeqFile = outputPrefix + '.lsu.SSU.fasta'
        with open(allSeqFile, 'w') as fout:
            for seqId in hits:
                header = seqs[seqId][0]

                # strip read identifier as these will be mapped as singletons
                if '/' in header:
                    header = header[0:header.rfind('/')]

                fout.write('>' + header + '\n')
                fout.write(seqs[seqId][1] + '\n')

        if not self.bQuiet:
            print(' Identified 16S/18 reads written to: ' + allSeqFile)
            print('')
def __processSingles(self, singles, evalue, alignLenThreshold, outputDir, sample, threadsPerSample, queueOut):
    """Identify SSU reads in single-ended files and report them on a queue.

    For each file the HMM search is run via `self.hmmSearch`, the bacterial,
    archaeal and eukaryotic hit tables are read with `self.__getHits`, the
    hit reads are written to `<outputPrefix>.SSU.fasta`, and a `HitRecord`
    summarising the counts is put on `queueOut`.

    Args:
        singles: sequence of single-ended read file paths.
        evalue: passed through to `self.hmmSearch`.
        alignLenThreshold: passed through to `self.__getHits`.
        outputDir: directory under which the 'extracted' folder is used.
        sample: sample name used as the prefix of all output files.
        threadsPerSample: passed through to `self.hmmSearch`.
        queueOut: queue-like object; one `HitRecord` is `put` per input file.
    """
    for seqFile in singles:
        # file name without directory and without its last extension
        fileStem = seqFile[seqFile.rfind('/') + 1:seqFile.rfind('.')]
        outputPrefix = os.path.join(outputDir, 'extracted', sample + '.' + fileStem)

        self.hmmSearch(seqFile, evalue, threadsPerSample, outputPrefix)

        # reads hits
        hitsBacteria = self.__getHits(outputPrefix + '.bacteria.table.txt', alignLenThreshold)
        hitsArchaea = self.__getHits(outputPrefix + '.archaea.table.txt', alignLenThreshold)
        hitsEuk = self.__getHits(outputPrefix + '.euk.table.txt', alignLenThreshold)

        hits = hitsBacteria.union(hitsArchaea).union(hitsEuk)

        # extract reads with hits
        seqs = extractSeqs(seqFile, hits)

        # create file with all SSU sequences; the context manager closes the
        # handle even if a lookup or write fails part-way through
        allSeqFile = outputPrefix + '.SSU.fasta'
        with open(allSeqFile, 'w') as fout:
            for seqId in hits:
                header = seqs[seqId][0]

                # strip read identifier as these will be mapped as singletons
                if '/' in header:
                    header = header[0:header.rfind('/')]

                fout.write('>' + header + '\n')
                fout.write(seqs[seqId][1] + '\n')

        # gather output data
        hitRecord = HitRecord()
        hitRecord.pair1 = seqFile
        hitRecord.pair2 = None
        hitRecord.hits1 = len(hits)
        hitRecord.hitsBacteria1 = len(hitsBacteria)
        hitRecord.hitsArchaea1 = len(hitsArchaea)
        hitRecord.hitsEuk1 = len(hitsEuk)
        hitRecord.hits2 = None

        # reads hit by exactly one of the three domain-level models
        hitRecord.uniqueBacterialHits = len(hitsBacteria.difference(hitsArchaea.union(hitsEuk)))
        hitRecord.uniqueArchaealHits = len(hitsArchaea.difference(hitsBacteria.union(hitsEuk)))
        hitRecord.uniqueEukHits = len(hitsEuk.difference(hitsBacteria.union(hitsArchaea)))

        hitRecord.allSeqFile = allSeqFile

        queueOut.put(hitRecord)
def __processPairs(self, pairs, evalue, alignLenThreshold, outputDir, sample, threadsPerSample, queueOut):
    """Identify SSU reads in paired-end files and report them on a queue.

    `pairs` is consumed two entries at a time (first mate file, second mate
    file). For each pair the HMM search is run on both files, the hit tables
    are read with `self.__getHits`, and these FASTA files are written:
    all hits, union (one file per mate), intersection (one file per mate)
    and difference (reads whose mate was not hit). A `HitRecord` with the
    counts and file names is put on `queueOut`.

    Args:
        pairs: flat sequence of read files: [mate1, mate2, mate1, mate2, ...].
        evalue: passed through to `self.hmmSearch`.
        alignLenThreshold: passed through to `self.__getHits`.
        outputDir: directory under which the 'extracted' folder is used.
        sample: sample name used as the prefix of all output files.
        threadsPerSample: passed through to `self.hmmSearch`.
        queueOut: queue-like object; one `HitRecord` is `put` per pair.

    Raises:
        IndexError: if `pairs` has an odd number of entries.
    """

    def writeRecords(fout, seqIds, seqs, stripReadId=False):
        # Append one FASTA record per id; optionally drop the '/<mate>' suffix.
        for seqId in seqIds:
            header = seqs[seqId][0]
            if stripReadId and '/' in header:
                header = header[0:header.rfind('/')]
            fout.write('>' + header + '\n')
            fout.write(seqs[seqId][1] + '\n')

    for i in range(0, len(pairs), 2):
        pair1 = pairs[i]
        pair2 = pairs[i + 1]

        # file names without directory and without their last extension
        fileStem1 = pair1[pair1.rfind('/') + 1:pair1.rfind('.')]
        fileStem2 = pair2[pair2.rfind('/') + 1:pair2.rfind('.')]
        outputPrefix1 = os.path.join(outputDir, 'extracted', sample + '.' + fileStem1)
        outputPrefix2 = os.path.join(outputDir, 'extracted', sample + '.' + fileStem2)

        self.hmmSearch(pair1, evalue, threadsPerSample, outputPrefix1)
        self.hmmSearch(pair2, evalue, threadsPerSample, outputPrefix2)

        # reads hits
        hitsBacteria1 = self.__getHits(outputPrefix1 + '.bacteria.table.txt', alignLenThreshold)
        hitsArchaea1 = self.__getHits(outputPrefix1 + '.archaea.table.txt', alignLenThreshold)
        hitsEuk1 = self.__getHits(outputPrefix1 + '.euk.table.txt', alignLenThreshold)

        hitsBacteria2 = self.__getHits(outputPrefix2 + '.bacteria.table.txt', alignLenThreshold)
        hitsArchaea2 = self.__getHits(outputPrefix2 + '.archaea.table.txt', alignLenThreshold)
        hitsEuk2 = self.__getHits(outputPrefix2 + '.euk.table.txt', alignLenThreshold)

        # combine hits
        hits1 = hitsBacteria1.union(hitsArchaea1).union(hitsEuk1)
        hits2 = hitsBacteria2.union(hitsArchaea2).union(hitsEuk2)

        # extract reads with hits
        hitUnion = hits1.union(hits2)
        hitIntersection = hits1.intersection(hits2)
        hitDiff1 = hits1.difference(hits2)
        hitDiff2 = hits2.difference(hits1)

        seqs1 = extractSeqs(pair1, hitUnion)
        seqs2 = extractSeqs(pair2, hitUnion)

        # Every file below is written inside a context manager so the handle
        # is closed even if a lookup or write fails part-way through.

        # create file with all 16S/18S sequences
        allSeqFile = outputPrefix1 + '.all.SSU.fasta'
        with open(allSeqFile, 'w') as fout:
            writeRecords(fout, hits1, seqs1)
            writeRecords(fout, hits2, seqs2)

        # create paired-end files where at least one read maps to a 16S/18S;
        # both files iterate the same set object, keeping the mates in step
        pair1FileUnion = outputPrefix1 + '.union.SSU.fasta'
        with open(pair1FileUnion, 'w') as fout:
            writeRecords(fout, hitUnion, seqs1)

        pair2FileUnion = outputPrefix2 + '.union.SSU.fasta'
        with open(pair2FileUnion, 'w') as fout:
            writeRecords(fout, hitUnion, seqs2)

        # create paired-end files where both reads map to a 16S/18S
        pair1FileIntersect = outputPrefix1 + '.intersect.SSU.fasta'
        with open(pair1FileIntersect, 'w') as fout:
            writeRecords(fout, hitIntersection, seqs1)

        pair2FileIntersect = outputPrefix2 + '.intersect.SSU.fasta'
        with open(pair2FileIntersect, 'w') as fout:
            writeRecords(fout, hitIntersection, seqs2)

        # create file where only one read maps to a 16S/18S; read identifiers
        # are stripped as these will be mapped as singletons
        diffFile = outputPrefix1 + '.difference.SSU.fasta'
        with open(diffFile, 'w') as fout:
            writeRecords(fout, hitDiff1, seqs1, stripReadId=True)
            writeRecords(fout, hitDiff2, seqs2, stripReadId=True)

        # gather output data
        bacHits = hitsBacteria1.union(hitsBacteria2)
        arHits = hitsArchaea1.union(hitsArchaea2)
        eukHits = hitsEuk1.union(hitsEuk2)

        hitRecord = HitRecord()
        hitRecord.pair1 = pair1
        hitRecord.pair2 = pair2

        hitRecord.hits1 = len(hits1)
        hitRecord.hitsBacteria1 = len(hitsBacteria1)
        hitRecord.hitsArchaea1 = len(hitsArchaea1)
        hitRecord.hitsEuk1 = len(hitsEuk1)

        hitRecord.hits2 = len(hits2)
        hitRecord.hitsBacteria2 = len(hitsBacteria2)
        hitRecord.hitsArchaea2 = len(hitsArchaea2)
        hitRecord.hitsEuk2 = len(hitsEuk2)

        # reads hit by exactly one of the three domain-level models
        hitRecord.uniqueBacterialHits = len(bacHits.difference(arHits.union(eukHits)))
        hitRecord.uniqueArchaealHits = len(arHits.difference(bacHits.union(eukHits)))
        hitRecord.uniqueEukHits = len(eukHits.difference(bacHits.union(arHits)))

        hitRecord.hitUnion = len(hitUnion)
        hitRecord.hitIntersect = len(hitIntersection)
        hitRecord.hitDiff = len(hitDiff1) + len(hitDiff2)

        hitRecord.allSeqFile = allSeqFile
        hitRecord.pair1FileUnion = pair1FileUnion
        hitRecord.pair2FileUnion = pair2FileUnion
        hitRecord.pair1FileIntersect = pair1FileIntersect
        hitRecord.pair2FileIntersect = pair2FileIntersect
        hitRecord.diffFile = diffFile

        queueOut.put(hitRecord)
def processSingles(self, singles, threads, evalue, outputDir, sample):
    """Identify reads with LSU model hits in single-ended files and write them to FASTA.

    For each file the HMM search is run via `self.hmmSearch`, the forward
    and reverse-complement hit tables for the bacterial, archaeal and
    eukaryotic models are read with `self.getHits`, and all hit reads are
    written to `<outputPrefix>.lsu.SSU.fasta`. Progress is printed unless
    `self.bQuiet` is set.

    Args:
        singles: sequence of single-ended read file paths.
        threads: passed through to `self.hmmSearch`.
        evalue: passed through to `self.hmmSearch`.
        outputDir: directory under which the 'extracted_lsu' folder is used.
        sample: sample name used as the prefix of all output files.
    """
    # NOTE(review): the progress messages say 16S/18S although the files are
    # tagged 'lsu'; the text is left untouched because it is runtime output.
    for seqFile in singles:
        if not self.bQuiet:
            print('Identifying 16S/18S sequences in single-end reads: ' + seqFile)

        # file name without directory and without its last extension
        fileStem = seqFile[seqFile.rfind('/') + 1:seqFile.rfind('.')]

        # The previous code passed `+sample + ...` to os.path.join; a unary
        # plus on a string raises TypeError, so no file could be processed.
        outputPrefix = os.path.join(outputDir, 'extracted_lsu', sample + '.' + fileStem)

        self.hmmSearch(seqFile, threads, evalue, outputPrefix)

        # reads hits
        hitsBacteria = self.getHits(outputPrefix + '.lsu.bacteria.table.txt')
        hitsRevCompBacteria = self.getHits(outputPrefix + '.lsu.bacteria.table.rev_comp.txt')

        hitsArchaea = self.getHits(outputPrefix + '.lsu.archaea.table.txt')
        hitsRevCompArcheae = self.getHits(outputPrefix + '.lsu.archaea.table.rev_comp.txt')

        hitsEuk = self.getHits(outputPrefix + '.lsu.euk.table.txt')
        hitsRevCompEuk = self.getHits(outputPrefix + '.lsu.euk.table.rev_comp.txt')

        hits = (hitsBacteria.union(hitsRevCompBacteria)
                .union(hitsArchaea).union(hitsRevCompArcheae)
                .union(hitsEuk).union(hitsRevCompEuk))

        if not self.bQuiet:
            print(' Hits in ' + seqFile)
            print(' Fwd. bacterial hits: ' + str(len(hitsBacteria)))
            print(' Rev. comp. bacterial hits: ' + str(len(hitsRevCompBacteria)))
            print(' Fwd. archaeal hits: ' + str(len(hitsArchaea)))
            print(' Rev. comp. archaeal hits: ' + str(len(hitsRevCompArcheae)))
            print(' Fwd. eukaryotic hits: ' + str(len(hitsEuk)))
            print(' Rev. comp. eukaryotic hits: ' + str(len(hitsRevCompEuk)))
            print('')
            print(' Identified 16S/18S reads: ' + str(len(hits)) + ' reads')
            print('')

        # extract reads with hits
        seqs = extractSeqs(seqFile, hits)

        # create file with all identified sequences; the context manager
        # closes the handle even if a lookup or write fails part-way through
        allSeqFile = outputPrefix + '.lsu.SSU.fasta'
        with open(allSeqFile, 'w') as fout:
            for seqId in hits:
                header = seqs[seqId][0]

                # strip read identifier as these will be mapped as singletons
                if '/' in header:
                    header = header[0:header.rfind('/')]

                fout.write('>' + header + '\n')
                fout.write(seqs[seqId][1] + '\n')

        if not self.bQuiet:
            print(' Identified 16S/18 reads written to: ' + allSeqFile)
            print('')
def processPairs(self, pairs, threads, evalue, outputDir, sample):
    """Identify reads with LSU model hits in paired-end files and write them to FASTA.

    `pairs` is consumed two entries at a time (first mate file, second mate
    file). For each pair the HMM search is run on both files, the forward
    and reverse-complement hit tables are read with `self.getHits`, and
    these FASTA files are written: all hits, union (one file per mate),
    intersection (one file per mate) and difference (reads whose mate was
    not hit). Progress is printed unless `self.bQuiet` is set.

    Args:
        pairs: flat sequence of read files: [mate1, mate2, mate1, mate2, ...].
        threads: passed through to `self.hmmSearch`.
        evalue: passed through to `self.hmmSearch`.
        outputDir: directory under which the 'extracted_lsu' folder is used.
        sample: sample name used as the prefix of all output files.

    Raises:
        IndexError: if `pairs` has an odd number of entries.
    """
    # NOTE(review): several progress messages say 16S/18S although the files
    # are tagged 'lsu'; the text is left untouched because it is runtime output.

    def writeRecords(fout, seqIds, seqs, stripReadId=False):
        # Append one FASTA record per id; optionally drop the '/<mate>' suffix.
        for seqId in seqIds:
            header = seqs[seqId][0]
            if stripReadId and '/' in header:
                header = header[0:header.rfind('/')]
            fout.write('>' + header + '\n')
            fout.write(seqs[seqId][1] + '\n')

    for i in range(0, len(pairs), 2):
        pair1 = pairs[i]
        pair2 = pairs[i + 1]

        if not self.bQuiet:
            print('Identifying LSU sequences in paired-end reads: ' + pair1 + ', ' + pair2)

        # file names without directory and without their last extension
        fileStem1 = pair1[pair1.rfind('/') + 1:pair1.rfind('.')]
        fileStem2 = pair2[pair2.rfind('/') + 1:pair2.rfind('.')]
        outputPrefix1 = os.path.join(outputDir, 'extracted_lsu', sample + '.' + fileStem1)
        outputPrefix2 = os.path.join(outputDir, 'extracted_lsu', sample + '.' + fileStem2)

        if not self.bQuiet:
            print(' Processing file: ' + pair1)
        self.hmmSearch(pair1, threads, evalue, outputPrefix1)

        if not self.bQuiet:
            print(' Processing file: ' + pair2)
        self.hmmSearch(pair2, threads, evalue, outputPrefix2)

        # reads hits
        hitsBacteria1 = self.getHits(outputPrefix1 + '.lsu.bacteria.table.txt')
        hitsRevCompBacteria1 = self.getHits(outputPrefix1 + '.lsu.bacteria.table.rev_comp.txt')

        hitsArchaea1 = self.getHits(outputPrefix1 + '.lsu.archaea.table.txt')
        hitsRevCompArcheae1 = self.getHits(outputPrefix1 + '.lsu.archaea.table.rev_comp.txt')

        hitsEuk1 = self.getHits(outputPrefix1 + '.lsu.euk.table.txt')
        hitsRevCompEuk1 = self.getHits(outputPrefix1 + '.lsu.euk.table.rev_comp.txt')

        hitsBacteria2 = self.getHits(outputPrefix2 + '.lsu.bacteria.table.txt')
        hitsRevCompBacteria2 = self.getHits(outputPrefix2 + '.lsu.bacteria.table.rev_comp.txt')

        hitsArchaea2 = self.getHits(outputPrefix2 + '.lsu.archaea.table.txt')
        hitsRevCompArcheae2 = self.getHits(outputPrefix2 + '.lsu.archaea.table.rev_comp.txt')

        hitsEuk2 = self.getHits(outputPrefix2 + '.lsu.euk.table.txt')
        hitsRevCompEuk2 = self.getHits(outputPrefix2 + '.lsu.euk.table.rev_comp.txt')

        # combine hits
        hits1 = (hitsBacteria1.union(hitsRevCompBacteria1)
                 .union(hitsArchaea1).union(hitsRevCompArcheae1)
                 .union(hitsEuk1).union(hitsRevCompEuk1))
        hits2 = (hitsBacteria2.union(hitsRevCompBacteria2)
                 .union(hitsArchaea2).union(hitsRevCompArcheae2)
                 .union(hitsEuk2).union(hitsRevCompEuk2))

        if not self.bQuiet:
            print(' Hits in ' + pair1 + ': ' + str(len(hits1)))
            print(' Fwd. bacterial hits: ' + str(len(hitsBacteria1)))
            print(' Rev. comp. bacterial hits: ' + str(len(hitsRevCompBacteria1)))
            print(' Fwd. archaeal hits: ' + str(len(hitsArchaea1)))
            print(' Rev. comp. archaeal hits: ' + str(len(hitsRevCompArcheae1)))
            print(' Fwd. eukaryotic hits: ' + str(len(hitsEuk1)))
            print(' Rev. comp. eukaryotic hits: ' + str(len(hitsRevCompEuk1)))
            print('')
            print(' Hits in ' + pair2 + ': ' + str(len(hits2)))
            print(' Fwd. bacterial hits: ' + str(len(hitsBacteria2)))
            print(' Rev. comp. bacterial hits: ' + str(len(hitsRevCompBacteria2)))
            print(' Fwd. archaeal hits: ' + str(len(hitsArchaea2)))
            print(' Rev. comp. archaeal hits: ' + str(len(hitsRevCompArcheae2)))
            print(' Fwd. eukaryotic hits: ' + str(len(hitsEuk2)))
            print(' Rev. comp. eukaryotic hits: ' + str(len(hitsRevCompEuk2)))
            print('')

        # extract reads with hits
        if not self.bQuiet:
            print(' Extracting putative 16S/18S reads:')

        hitUnion = hits1.union(hits2)
        hitIntersection = hits1.intersection(hits2)
        hitDiff1 = hits1.difference(hits2)
        hitDiff2 = hits2.difference(hits1)

        seqs1 = extractSeqs(pair1, hitUnion)
        seqs2 = extractSeqs(pair2, hitUnion)

        # Every file below is written inside a context manager so the handle
        # is closed even if a lookup or write fails part-way through.

        # create file with all identified sequences
        allSeqFile = outputPrefix1 + '.lsu.all.SSU.fasta'
        with open(allSeqFile, 'w') as fout:
            writeRecords(fout, hits1, seqs1)
            writeRecords(fout, hits2, seqs2)

        # create paired-end files where at least one read was identified;
        # both files iterate the same set object, keeping the mates in step
        pair1FileUnion = outputPrefix1 + '.lsu.union.SSU.fasta'
        with open(pair1FileUnion, 'w') as fout:
            writeRecords(fout, hitUnion, seqs1)

        pair2FileUnion = outputPrefix2 + '.lsu.union.SSU.fasta'
        with open(pair2FileUnion, 'w') as fout:
            writeRecords(fout, hitUnion, seqs2)

        # create paired-end files where both reads were identified
        pair1FileIntersect = outputPrefix1 + '.lsu.intersect.SSU.fasta'
        with open(pair1FileIntersect, 'w') as fout:
            writeRecords(fout, hitIntersection, seqs1)

        pair2FileIntersect = outputPrefix2 + '.lsu.intersect.SSU.fasta'
        with open(pair2FileIntersect, 'w') as fout:
            writeRecords(fout, hitIntersection, seqs2)

        # create file where only one read of the pair was identified; read
        # identifiers are stripped as these will be mapped as singletons
        diffFile = outputPrefix1 + '.lsu.difference.SSU.fasta'
        with open(diffFile, 'w') as fout:
            writeRecords(fout, hitDiff1, seqs1, stripReadId=True)
            writeRecords(fout, hitDiff2, seqs2, stripReadId=True)

        if not self.bQuiet:
            print(' Hits to left reads: ' + str(len(hits1)))
            print(' Hits to right reads: ' + str(len(hits2)))
            print(' Pairs with at least one read identified as 16S/18S: ' + str(len(hitUnion)) + ' pairs')
            print(' Pairs with both read identified as 16S/18S: ' + str(len(hitIntersection)) + ' pairs')
            print(' Pairs with only one read identified as 16S/18S: ' + str(len(hitDiff1) + len(hitDiff2)) + ' reads')
            print('')
            print(' All identified 16S reads: ' + allSeqFile)
            print(' Pairs with at least one read identified as 16S/18S written to: ')
            print(' ' + pair1FileUnion)
            print(' ' + pair2FileUnion)
            print(' Pairs with both read identified as 16S/18S written to: ')
            print(' ' + pair1FileIntersect)
            print(' ' + pair2FileIntersect)
            print(' Pairs with only one read identified as 16S/18S written to: ')
            print(' ' + diffFile)
            print('')
def processPairs(self, pairs, threads, evalue, outputDir, sample):
    """Identify reads with LSU model hits in paired-end files and write them to FASTA.

    `pairs` is consumed two entries at a time (first mate file, second mate
    file). For each pair the HMM search is run on both files, the forward
    and reverse-complement hit tables are read with `self.getHits`, and
    these FASTA files are written: all hits, union (one file per mate),
    intersection (one file per mate) and difference (reads whose mate was
    not hit). Progress is printed unless `self.bQuiet` is set.

    Args:
        pairs: flat sequence of read files: [mate1, mate2, mate1, mate2, ...].
        threads: passed through to `self.hmmSearch`.
        evalue: passed through to `self.hmmSearch`.
        outputDir: output directory; it is concatenated directly with
            'extracted_lsu/', so it must already end with a path separator.
        sample: sample name used as the prefix of all output files.

    Raises:
        IndexError: if `pairs` has an odd number of entries.
    """
    # NOTE(review): several progress messages say 16S/18S although the files
    # are tagged 'lsu'; the text is left untouched because it is runtime output.

    def writeRecords(fout, seqIds, seqs, stripReadId=False):
        # Append one FASTA record per id; optionally drop the '/<mate>' suffix.
        for seqId in seqIds:
            header = seqs[seqId][0]
            if stripReadId and '/' in header:
                header = header[0:header.rfind('/')]
            fout.write('>' + header + '\n')
            fout.write(seqs[seqId][1] + '\n')

    for i in range(0, len(pairs), 2):
        pair1 = pairs[i]
        pair2 = pairs[i + 1]

        if not self.bQuiet:
            print('Identifying LSU sequences in paired-end reads: ' + pair1 + ', ' + pair2)

        # file names without directory and without their last extension
        fileStem1 = pair1[pair1.rfind('/') + 1:pair1.rfind('.')]
        fileStem2 = pair2[pair2.rfind('/') + 1:pair2.rfind('.')]
        outputPrefix1 = outputDir + 'extracted_lsu/' + sample + '.' + fileStem1
        outputPrefix2 = outputDir + 'extracted_lsu/' + sample + '.' + fileStem2

        if not self.bQuiet:
            print(' Processing file: ' + pair1)
        self.hmmSearch(pair1, threads, evalue, outputPrefix1)

        if not self.bQuiet:
            print(' Processing file: ' + pair2)
        self.hmmSearch(pair2, threads, evalue, outputPrefix2)

        # reads hits
        hitsBacteria1 = self.getHits(outputPrefix1 + '.lsu.bacteria.table.txt')
        hitsRevCompBacteria1 = self.getHits(outputPrefix1 + '.lsu.bacteria.table.rev_comp.txt')

        hitsArchaea1 = self.getHits(outputPrefix1 + '.lsu.archaea.table.txt')
        hitsRevCompArcheae1 = self.getHits(outputPrefix1 + '.lsu.archaea.table.rev_comp.txt')

        hitsEuk1 = self.getHits(outputPrefix1 + '.lsu.euk.table.txt')
        hitsRevCompEuk1 = self.getHits(outputPrefix1 + '.lsu.euk.table.rev_comp.txt')

        hitsBacteria2 = self.getHits(outputPrefix2 + '.lsu.bacteria.table.txt')
        hitsRevCompBacteria2 = self.getHits(outputPrefix2 + '.lsu.bacteria.table.rev_comp.txt')

        hitsArchaea2 = self.getHits(outputPrefix2 + '.lsu.archaea.table.txt')
        hitsRevCompArcheae2 = self.getHits(outputPrefix2 + '.lsu.archaea.table.rev_comp.txt')

        hitsEuk2 = self.getHits(outputPrefix2 + '.lsu.euk.table.txt')
        hitsRevCompEuk2 = self.getHits(outputPrefix2 + '.lsu.euk.table.rev_comp.txt')

        # combine hits
        hits1 = (hitsBacteria1.union(hitsRevCompBacteria1)
                 .union(hitsArchaea1).union(hitsRevCompArcheae1)
                 .union(hitsEuk1).union(hitsRevCompEuk1))
        hits2 = (hitsBacteria2.union(hitsRevCompBacteria2)
                 .union(hitsArchaea2).union(hitsRevCompArcheae2)
                 .union(hitsEuk2).union(hitsRevCompEuk2))

        if not self.bQuiet:
            print(' Hits in ' + pair1 + ': ' + str(len(hits1)))
            print(' Fwd. bacterial hits: ' + str(len(hitsBacteria1)))
            print(' Rev. comp. bacterial hits: ' + str(len(hitsRevCompBacteria1)))
            print(' Fwd. archaeal hits: ' + str(len(hitsArchaea1)))
            print(' Rev. comp. archaeal hits: ' + str(len(hitsRevCompArcheae1)))
            print(' Fwd. eukaryotic hits: ' + str(len(hitsEuk1)))
            print(' Rev. comp. eukaryotic hits: ' + str(len(hitsRevCompEuk1)))
            print('')
            print(' Hits in ' + pair2 + ': ' + str(len(hits2)))
            print(' Fwd. bacterial hits: ' + str(len(hitsBacteria2)))
            print(' Rev. comp. bacterial hits: ' + str(len(hitsRevCompBacteria2)))
            print(' Fwd. archaeal hits: ' + str(len(hitsArchaea2)))
            print(' Rev. comp. archaeal hits: ' + str(len(hitsRevCompArcheae2)))
            print(' Fwd. eukaryotic hits: ' + str(len(hitsEuk2)))
            print(' Rev. comp. eukaryotic hits: ' + str(len(hitsRevCompEuk2)))
            print('')

        # extract reads with hits
        if not self.bQuiet:
            print(' Extracting putative 16S/18S reads:')

        hitUnion = hits1.union(hits2)
        hitIntersection = hits1.intersection(hits2)
        hitDiff1 = hits1.difference(hits2)
        hitDiff2 = hits2.difference(hits1)

        seqs1 = extractSeqs(pair1, hitUnion)
        seqs2 = extractSeqs(pair2, hitUnion)

        # Every file below is written inside a context manager so the handle
        # is closed even if a lookup or write fails part-way through.

        # create file with all identified sequences
        allSeqFile = outputPrefix1 + '.lsu.all.SSU.fasta'
        with open(allSeqFile, 'w') as fout:
            writeRecords(fout, hits1, seqs1)
            writeRecords(fout, hits2, seqs2)

        # create paired-end files where at least one read was identified;
        # both files iterate the same set object, keeping the mates in step
        pair1FileUnion = outputPrefix1 + '.lsu.union.SSU.fasta'
        with open(pair1FileUnion, 'w') as fout:
            writeRecords(fout, hitUnion, seqs1)

        pair2FileUnion = outputPrefix2 + '.lsu.union.SSU.fasta'
        with open(pair2FileUnion, 'w') as fout:
            writeRecords(fout, hitUnion, seqs2)

        # create paired-end files where both reads were identified
        pair1FileIntersect = outputPrefix1 + '.lsu.intersect.SSU.fasta'
        with open(pair1FileIntersect, 'w') as fout:
            writeRecords(fout, hitIntersection, seqs1)

        pair2FileIntersect = outputPrefix2 + '.lsu.intersect.SSU.fasta'
        with open(pair2FileIntersect, 'w') as fout:
            writeRecords(fout, hitIntersection, seqs2)

        # create file where only one read of the pair was identified; read
        # identifiers are stripped as these will be mapped as singletons
        diffFile = outputPrefix1 + '.lsu.difference.SSU.fasta'
        with open(diffFile, 'w') as fout:
            writeRecords(fout, hitDiff1, seqs1, stripReadId=True)
            writeRecords(fout, hitDiff2, seqs2, stripReadId=True)

        if not self.bQuiet:
            print(' Hits to left reads: ' + str(len(hits1)))
            print(' Hits to right reads: ' + str(len(hits2)))
            print(' Pairs with at least one read identified as 16S/18S: ' + str(len(hitUnion)) + ' pairs')
            print(' Pairs with both read identified as 16S/18S: ' + str(len(hitIntersection)) + ' pairs')
            print(' Pairs with only one read identified as 16S/18S: ' + str(len(hitDiff1) + len(hitDiff2)) + ' reads')
            print('')
            print(' All identified 16S reads: ' + allSeqFile)
            print(' Pairs with at least one read identified as 16S/18S written to: ')
            print(' ' + pair1FileUnion)
            print(' ' + pair2FileUnion)
            print(' Pairs with both read identified as 16S/18S written to: ')
            print(' ' + pair1FileIntersect)
            print(' ' + pair2FileIntersect)
            print(' Pairs with only one read identified as 16S/18S written to: ')
            print(' ' + diffFile)
            print('')
def processPairs(self, pairs, outputDir, prefix):
    """Write the paired-end reads for which at least one mate mapped to a 16S sequence.

    `pairs` is consumed two entries at a time (first mate file, second mate
    file). The mapped read ids for both mates come from
    `self.readPairedBAM`, called with the BAM file
    `outputDir + <basename of mate 1> + '.bam'`. Three FASTA files are
    written per pair: every mapped read, and one file per mate holding all
    pairs with at least one mapped read. Headers are written with a
    trailing '/1' (first mate) or '/2' (second mate).

    Args:
        pairs: flat sequence of read files: [mate1, mate2, mate1, mate2, ...].
        outputDir: directory prefix holding the BAM files; it is concatenated
            directly, so it must already end with a path separator.
        prefix: prefix for the FASTA files that are written.

    Raises:
        IndexError: if `pairs` has an odd number of entries.
    """

    def writeRecords(fout, seqIds, seqs, mateSuffix):
        # Append one FASTA record per id, making sure the header ends with
        # the mate suffix without duplicating one that is already present.
        for seqId in seqIds:
            header = seqs[seqId][0]
            if not header.endswith(mateSuffix):
                header = header + mateSuffix
            fout.write('>' + header + '\n')
            fout.write(seqs[seqId][1] + '\n')

    for i in range(0, len(pairs), 2):
        pair1 = pairs[i]
        pair2 = pairs[i + 1]

        print('Identifying 16S sequences in paired-end reads: ' + pair1 + ', ' + pair2)

        # file names without directory and without their last extension
        fileStem1 = pair1[pair1.rfind('/') + 1:pair1.rfind('.')]
        fileStem2 = pair2[pair2.rfind('/') + 1:pair2.rfind('.')]
        outputPrefix = prefix + '.' + fileStem1
        outputPrefix1 = prefix + '.' + fileStem1 + '.1'
        outputPrefix2 = prefix + '.' + fileStem2 + '.2'

        readsMappedTo16S_1, readsMappedTo16S_2 = self.readPairedBAM(outputDir + ntpath.basename(pair1) + '.bam')

        print(' Hits in ' + pair1 + ': ' + str(len(readsMappedTo16S_1)))
        print(' Hits in ' + pair2 + ': ' + str(len(readsMappedTo16S_2)))
        print('')

        # extract all pairs where at least one read is on a 16S sequence
        print('Extracting putative 16S reads: ')
        readsMappedTo16S = readsMappedTo16S_1.union(readsMappedTo16S_2)

        seqs1 = extractSeqs(pair1, readsMappedTo16S)
        seqs2 = extractSeqs(pair2, readsMappedTo16S)

        # Every file below is written inside a context manager so the handle
        # is closed even if a lookup or write fails part-way through.
        #
        # Second-mate headers were previously tested with endswith('/1'), so
        # a header already ending in '/2' was written as '.../2/2'; each mate
        # is now tested against its own suffix.

        # create file with all 16S sequences
        allSeqFile = outputPrefix + '.all.16S.fasta'
        with open(allSeqFile, 'w') as fout:
            writeRecords(fout, readsMappedTo16S_1, seqs1, '/1')
            writeRecords(fout, readsMappedTo16S_2, seqs2, '/2')

        print(' All identified 16S reads written to: ' + allSeqFile)

        # create paired-end files where at least one read maps to a 16S;
        # both files iterate the same set object, keeping the mates in step
        pair1File = outputPrefix1 + '.16S.fasta'
        with open(pair1File, 'w') as fout:
            writeRecords(fout, readsMappedTo16S, seqs1, '/1')

        pair2File = outputPrefix2 + '.16S.fasta'
        with open(pair2File, 'w') as fout:
            writeRecords(fout, readsMappedTo16S, seqs2, '/2')

        print(' Pairs with at least one read identified as 16S written to: ' + pair1File + ', ' + pair2File)
        print(' Pairs with at least one read identified as 16S: ' + str(len(readsMappedTo16S)))
        print('')
def __processPairs(self, pairs, evalue, alignLenThreshold, outputDir, sample, threadsPerSample, queueOut):
    """Identify SSU reads in paired-end files and report them on a queue.

    `pairs` is consumed two entries at a time (first mate file, second mate
    file). For each pair the HMM search is run on both files, the hit tables
    are read with `self.__getHits`, and these FASTA files are written:
    all hits, union (one file per mate), intersection (one file per mate)
    and difference (reads whose mate was not hit). A `HitRecord` with the
    counts and file names is put on `queueOut`.

    Args:
        pairs: flat sequence of read files: [mate1, mate2, mate1, mate2, ...].
        evalue: passed through to `self.hmmSearch`.
        alignLenThreshold: passed through to `self.__getHits`.
        outputDir: directory under which the 'extracted' folder is used.
        sample: sample name used as the prefix of all output files.
        threadsPerSample: passed through to `self.hmmSearch`.
        queueOut: queue-like object; one `HitRecord` is `put` per pair.

    Raises:
        IndexError: if `pairs` has an odd number of entries.
    """

    def writeRecords(fout, seqIds, seqs, stripReadId=False):
        # Append one FASTA record per id; optionally drop the '/<mate>' suffix.
        for seqId in seqIds:
            header = seqs[seqId][0]
            if stripReadId and '/' in header:
                header = header[0:header.rfind('/')]
            fout.write('>' + header + '\n')
            fout.write(seqs[seqId][1] + '\n')

    for i in range(0, len(pairs), 2):
        pair1 = pairs[i]
        pair2 = pairs[i + 1]

        # file names without directory and without their last extension
        fileStem1 = pair1[pair1.rfind('/') + 1:pair1.rfind('.')]
        fileStem2 = pair2[pair2.rfind('/') + 1:pair2.rfind('.')]
        outputPrefix1 = os.path.join(outputDir, 'extracted', sample + '.' + fileStem1)
        outputPrefix2 = os.path.join(outputDir, 'extracted', sample + '.' + fileStem2)

        self.hmmSearch(pair1, evalue, threadsPerSample, outputPrefix1)
        self.hmmSearch(pair2, evalue, threadsPerSample, outputPrefix2)

        # reads hits
        hitsBacteria1 = self.__getHits(outputPrefix1 + '.bacteria.table.txt', alignLenThreshold)
        hitsArchaea1 = self.__getHits(outputPrefix1 + '.archaea.table.txt', alignLenThreshold)
        hitsEuk1 = self.__getHits(outputPrefix1 + '.euk.table.txt', alignLenThreshold)

        hitsBacteria2 = self.__getHits(outputPrefix2 + '.bacteria.table.txt', alignLenThreshold)
        hitsArchaea2 = self.__getHits(outputPrefix2 + '.archaea.table.txt', alignLenThreshold)
        hitsEuk2 = self.__getHits(outputPrefix2 + '.euk.table.txt', alignLenThreshold)

        # combine hits
        hits1 = hitsBacteria1.union(hitsArchaea1).union(hitsEuk1)
        hits2 = hitsBacteria2.union(hitsArchaea2).union(hitsEuk2)

        # extract reads with hits
        hitUnion = hits1.union(hits2)
        hitIntersection = hits1.intersection(hits2)
        hitDiff1 = hits1.difference(hits2)
        hitDiff2 = hits2.difference(hits1)

        seqs1 = extractSeqs(pair1, hitUnion)
        seqs2 = extractSeqs(pair2, hitUnion)

        # Every file below is written inside a context manager so the handle
        # is closed even if a lookup or write fails part-way through.

        # create file with all 16S/18S sequences
        allSeqFile = outputPrefix1 + '.all.SSU.fasta'
        with open(allSeqFile, 'w') as fout:
            writeRecords(fout, hits1, seqs1)
            writeRecords(fout, hits2, seqs2)

        # create paired-end files where at least one read maps to a 16S/18S;
        # both files iterate the same set object, keeping the mates in step
        pair1FileUnion = outputPrefix1 + '.union.SSU.fasta'
        with open(pair1FileUnion, 'w') as fout:
            writeRecords(fout, hitUnion, seqs1)

        pair2FileUnion = outputPrefix2 + '.union.SSU.fasta'
        with open(pair2FileUnion, 'w') as fout:
            writeRecords(fout, hitUnion, seqs2)

        # create paired-end files where both reads map to a 16S/18S
        pair1FileIntersect = outputPrefix1 + '.intersect.SSU.fasta'
        with open(pair1FileIntersect, 'w') as fout:
            writeRecords(fout, hitIntersection, seqs1)

        pair2FileIntersect = outputPrefix2 + '.intersect.SSU.fasta'
        with open(pair2FileIntersect, 'w') as fout:
            writeRecords(fout, hitIntersection, seqs2)

        # create file where only one read maps to a 16S/18S; read identifiers
        # are stripped as these will be mapped as singletons
        diffFile = outputPrefix1 + '.difference.SSU.fasta'
        with open(diffFile, 'w') as fout:
            writeRecords(fout, hitDiff1, seqs1, stripReadId=True)
            writeRecords(fout, hitDiff2, seqs2, stripReadId=True)

        # gather output data
        bacHits = hitsBacteria1.union(hitsBacteria2)
        arHits = hitsArchaea1.union(hitsArchaea2)
        eukHits = hitsEuk1.union(hitsEuk2)

        hitRecord = HitRecord()
        hitRecord.pair1 = pair1
        hitRecord.pair2 = pair2

        hitRecord.hits1 = len(hits1)
        hitRecord.hitsBacteria1 = len(hitsBacteria1)
        hitRecord.hitsArchaea1 = len(hitsArchaea1)
        hitRecord.hitsEuk1 = len(hitsEuk1)

        hitRecord.hits2 = len(hits2)
        hitRecord.hitsBacteria2 = len(hitsBacteria2)
        hitRecord.hitsArchaea2 = len(hitsArchaea2)
        hitRecord.hitsEuk2 = len(hitsEuk2)

        # reads hit by exactly one of the three domain-level models
        hitRecord.uniqueBacterialHits = len(bacHits.difference(arHits.union(eukHits)))
        hitRecord.uniqueArchaealHits = len(arHits.difference(bacHits.union(eukHits)))
        hitRecord.uniqueEukHits = len(eukHits.difference(bacHits.union(arHits)))

        hitRecord.hitUnion = len(hitUnion)
        hitRecord.hitIntersect = len(hitIntersection)
        hitRecord.hitDiff = len(hitDiff1) + len(hitDiff2)

        hitRecord.allSeqFile = allSeqFile
        hitRecord.pair1FileUnion = pair1FileUnion
        hitRecord.pair2FileUnion = pair2FileUnion
        hitRecord.pair1FileIntersect = pair1FileIntersect
        hitRecord.pair2FileIntersect = pair2FileIntersect
        hitRecord.diffFile = diffFile

        queueOut.put(hitRecord)