def removeOutliers(self, binFile, outlierFile, outputFile):
    """Remove sequences specified as outliers in the provided file.

    The bin is read from `binFile`, every sequence that `outlierFile` lists
    for this bin is removed, and the resulting bin is written to
    `outputFile`. The output file is written even when nothing is removed.

    Parameters
    ----------
    binFile : str
        FASTA file of the bin to modify. Its bin id is obtained with
        binIdFromFilename().
    outlierFile : str
        Tab-separated file with a single header line. The first column is
        a bin id and the second column a sequence id.
    outputFile : str
        Destination handed to writeFasta().
    """
    binSeqs = readFasta(binFile)
    binIdToModify = binIdFromFilename(binFile)

    # collect the ids of the sequences to remove from this bin
    checkFileExists(outlierFile)
    seqsToRemove = []
    with open(outlierFile) as fin:
        next(fin, None)  # skip the header line
        for line in fin:
            # strip the line terminator so that a sequence id in the last
            # column does not carry a trailing newline
            lineSplit = line.rstrip('\r\n').split('\t')
            if len(lineSplit) < 2:
                # blank or truncated row: nothing to remove
                continue

            if lineSplit[0] == binIdToModify:
                seqsToRemove.append(lineSplit[1])

    # remove sequences from bin
    if seqsToRemove:
        self.__removeSeqs(binSeqs, seqsToRemove)

    # save modified bin
    writeFasta(binSeqs, outputFile)
def __init__(self, binningIndex, completeness, contamination, binFile):
    """Record the bin's origin and quality values and load its sequences.

    The bin id is derived from `binFile` with binIdFromFilename() and the
    sequences are read from the same file with readFasta().
    """
    # values supplied directly by the caller
    self.binFile = binFile
    self.binningIndex = binningIndex
    self.contamination = contamination
    self.completeness = completeness

    # values derived from the bin file
    self.binId = binIdFromFilename(binFile)
    self.seqs = readFasta(binFile)
def __readBins(self, binFiles):
    """Return a dict mapping each bin id to the set of sequence ids in its file."""
    # built from (key, value) pairs so the bin id is still computed before
    # the file is read, as in a plain loop
    return dict(
        (binIdFromFilename(path), set(readFastaSeqIds(path)))
        for path in binFiles
    )
def unique(self, binFiles):
    """Check if sequences are assigned to multiple bins.

    Reads the sequence ids of every file in `binFiles` and, for each pair
    of bins sharing at least one sequence id, prints the two bin ids
    followed by the shared ids. A single summary line is printed when no
    pair of bins shares a sequence. Nothing is returned.
    """
    # read seq ids from all bins; each bin is converted to a set once here
    # instead of once per pairwise comparison
    binSeqs = {}
    for f in binFiles:
        binSeqs[binIdFromFilename(f)] = set(readFastaSeqIds(f))

    # check for sequences assigned to multiple bins
    bDuplicates = False
    binIds = list(binSeqs)
    for i, binIdA in enumerate(binIds):
        for binIdB in binIds[i + 1:]:
            seqInter = binSeqs[binIdA] & binSeqs[binIdB]
            if seqInter:
                bDuplicates = True
                print(' Sequences shared between %s and %s: ' % (binIdA, binIdB))
                for seqId in seqInter:
                    print(' ' + seqId)
                print('')

    if not bDuplicates:
        print(' No sequences assigned to multiple bins.')
def unique(self, binFiles):
    """Check if sequences are assigned to multiple bins.

    Sequence ids are taken from the FASTA header lines of every file in
    `binFiles` (files ending in '.gz' are opened with gzip). A warning is
    printed for an id that occurs more than once within one bin. For each
    pair of bins sharing at least one id, the two bin ids and the shared
    ids are printed; a single summary line is printed when no pair of bins
    shares a sequence. Nothing is returned.
    """
    # read sequence IDs from all bins,
    # while checking for duplicate sequences within a bin
    binSeqs = {}
    for f in binFiles:
        binId = binIdFromFilename(f)

        openFile = gzip.open if f.endswith('.gz') else open

        seqIds = set()
        # the context manager closes the handle as soon as the bin is read
        with openFile(f) as fin:
            for line in fin:
                if line[0] == '>':
                    # the id is the header text up to the first whitespace
                    seqId = line[1:].split(None, 1)[0]
                    if seqId in seqIds:
                        print(' [Warning] Sequence %s found multiple times in bin %s.' % (seqId, binId))
                    seqIds.add(seqId)

        binSeqs[binId] = seqIds

    # check for sequences assigned to multiple bins
    bDuplicates = False
    binIds = list(binSeqs)
    for i, binIdA in enumerate(binIds):
        for binIdB in binIds[i + 1:]:
            # the stored values are already sets, so no copies are needed
            seqInter = binSeqs[binIdA] & binSeqs[binIdB]
            if seqInter:
                bDuplicates = True
                print(' Sequences shared between %s and %s: ' % (binIdA, binIdB))
                for seqId in seqInter:
                    print(' ' + seqId)
                print('')

    if not bDuplicates:
        print(' No sequences assigned to multiple bins.')
def unique(self, binFiles):
    """Check if sequences are assigned to multiple bins.

    Sequence ids are taken from the FASTA header lines of every file in
    `binFiles` (files ending in '.gz' are opened with gzip). A warning is
    printed for an id that occurs more than once within one bin. For each
    pair of bins sharing at least one id, the two bin ids and the shared
    ids are printed; a single summary line is printed when no pair of bins
    shares a sequence. Nothing is returned.
    """
    # read sequence IDs from all bins,
    # while checking for duplicate sequences within a bin
    binSeqs = {}
    for f in binFiles:
        binId = binIdFromFilename(f)

        openFile = gzip.open if f.endswith('.gz') else open

        seqIds = set()
        # the context manager closes the handle as soon as the bin is read
        with openFile(f) as fin:
            for line in fin:
                if line[0] == '>':
                    # the id is the header text up to the first whitespace
                    seqId = line[1:].split(None, 1)[0]
                    if seqId in seqIds:
                        print(' [Warning] Sequence %s found multiple times in bin %s.' % (seqId, binId))
                    seqIds.add(seqId)

        binSeqs[binId] = seqIds

    # check for sequences assigned to multiple bins
    bDuplicates = False
    binIds = list(binSeqs)
    for i, binIdA in enumerate(binIds):
        for binIdB in binIds[i + 1:]:
            # the stored values are already sets, so no copies are needed
            seqInter = binSeqs[binIdA] & binSeqs[binIdB]
            if seqInter:
                bDuplicates = True
                print(' Sequences shared between %s and %s: ' % (binIdA, binIdB))
                for seqId in seqInter:
                    print(' ' + seqId)
                print('')

    if not bDuplicates:
        print(' No sequences assigned to multiple bins.')
def getBestCandidates(self, binFileSets, qas, minCompleteness, maxContamination):
    """Pick the best bin among overlapping bins from several binning sets.

    The qualifying bins of the first set seed the candidate list. Every
    qualifying bin of each later set is compared with the current
    candidates: when the overlap exceeds half the bases of both bins, the
    bin with the higher compContSquaredScored() keeps the slot; a bin that
    matches no candidate this way is appended once its whole set has been
    processed.

    Parameters
    ----------
    binFileSets : sequence of sequences of bin files, one per binning set
    qas : sequence indexed like `binFileSets`; each entry provides
        completeness(binId) and contamination(binId)
    minCompleteness : bins with a lower completeness are ignored
    maxContamination : bins with a higher contamination are ignored

    Returns
    -------
    list of UnionBin; empty when `binFileSets` is empty
    """

    def _candidate(binningIndex, binFile):
        """Return a UnionBin for binFile if it meets both thresholds, else None."""
        binId = binIdFromFilename(binFile)
        quality = qas[binningIndex]
        # each quality value is looked up once and reused for the UnionBin
        completeness = quality.completeness(binId)
        if completeness >= minCompleteness:
            contamination = quality.contamination(binId)
            if contamination <= maxContamination:
                return UnionBin(binningIndex, completeness, contamination, binFile)
        return None

    bestCandidates = []
    if not binFileSets:
        # nothing to merge
        return bestCandidates

    # Take the first set of bins as the best set yet
    for f in binFileSets[0]:
        candidate = _candidate(0, f)
        if candidate is not None:
            bestCandidates.append(candidate)

    # For each bin in the second or after set,
    for binningIndex, binFileSet in enumerate(binFileSets):
        if binningIndex == 0:
            continue

        currentRoundCandidatesToAdd = []
        for binFile in binFileSet:
            current = _candidate(binningIndex, binFile)
            if current is None:
                continue

            # Is it >50% (by sequence) aligned with any of the bins in the best set?
            fiftyPercent = 0.5 * current.numBases()
            accountedFor = False
            for i, bestBin in enumerate(bestCandidates):
                overlap = current.numBasesOverlapping(bestBin)
                fiftyPercentBest = 0.5 * bestBin.numBases()

                overCurrent = overlap > fiftyPercent
                overBest = overlap > fiftyPercentBest
                if not (overCurrent or overBest):
                    # Bins don't overlap, continue to go through the loop again
                    continue

                self.logger.debug("Comparing best bin %s and current bin %s, overlap is %i" % (bestBin.binId, current.binId, overlap))

                if overCurrent and overBest:
                    accountedFor = True
                    # Choose the best one
                    if current.compContSquaredScored() > bestBin.compContSquaredScored():
                        self.logger.debug("The newly found bin is better, going with that")
                        # Found a better one, replace the best bin with that
                        bestCandidates[i] = current
                    # Known limitation: if multiple bins contain the same
                    # contig, a bin can overlap > 50% with more than one
                    # candidate. Breaking out here means the bin replaced is
                    # not necessarily the 'optimal' one.
                    break

                # only one of the two ratios exceeds 50%
                self.logger.warning("Bins %s and %s with sizes %i and %i overlap by %i bases and so have unusual overlap ratios, proceeding as if they are distinct bins" % (bestBin.binId, current.binId, bestBin.numBases(), current.numBases(), overlap))

            if not accountedFor:
                currentRoundCandidatesToAdd.append(current)

        # Add all the bins that hit no other bins to the bestCandidates list
        # Do this after so that bins are not compared to themselves
        for b in currentRoundCandidatesToAdd:
            self.logger.debug("Adding unmatched bin %s from %s" % (b.binId, b.binningIndex))
            bestCandidates.append(b)

    return bestCandidates
def getBestCandidates(self, binFileSets, qas, minCompleteness, maxContamination):
    """Pick the best bin among overlapping bins from several binning sets.

    The qualifying bins of the first set seed the candidate list. Every
    qualifying bin of each later set is compared with the current
    candidates: when the overlap exceeds half the bases of both bins, the
    bin with the higher compContSquaredScored() keeps the slot; a bin that
    matches no candidate this way is appended once its whole set has been
    processed.

    Parameters
    ----------
    binFileSets : sequence of sequences of bin files, one per binning set
    qas : sequence indexed like `binFileSets`; each entry provides
        completeness(binId) and contamination(binId)
    minCompleteness : bins with a lower completeness are ignored
    maxContamination : bins with a higher contamination are ignored

    Returns
    -------
    list of UnionBin; empty when `binFileSets` is empty
    """

    def _candidate(binningIndex, binFile):
        """Return a UnionBin for binFile if it meets both thresholds, else None."""
        binId = binIdFromFilename(binFile)
        quality = qas[binningIndex]
        # each quality value is looked up once and reused for the UnionBin
        completeness = quality.completeness(binId)
        if completeness >= minCompleteness:
            contamination = quality.contamination(binId)
            if contamination <= maxContamination:
                return UnionBin(binningIndex, completeness, contamination, binFile)
        return None

    bestCandidates = []
    if not binFileSets:
        # nothing to merge
        return bestCandidates

    # Take the first set of bins as the best set yet
    for f in binFileSets[0]:
        candidate = _candidate(0, f)
        if candidate is not None:
            bestCandidates.append(candidate)

    # For each bin in the second or after set,
    for binningIndex, binFileSet in enumerate(binFileSets):
        if binningIndex == 0:
            continue

        currentRoundCandidatesToAdd = []
        for binFile in binFileSet:
            current = _candidate(binningIndex, binFile)
            if current is None:
                continue

            # Is it >50% (by sequence) aligned with any of the bins in the best set?
            fiftyPercent = 0.5 * current.numBases()
            accountedFor = False
            for i, bestBin in enumerate(bestCandidates):
                overlap = current.numBasesOverlapping(bestBin)
                fiftyPercentBest = 0.5 * bestBin.numBases()

                overCurrent = overlap > fiftyPercent
                overBest = overlap > fiftyPercentBest
                if not (overCurrent or overBest):
                    # Bins don't overlap, continue to go through the loop again
                    continue

                self.logger.debug("Comparing best bin %s and current bin %s, overlap is %i" % (bestBin.binId, current.binId, overlap))

                if overCurrent and overBest:
                    accountedFor = True
                    # Choose the best one
                    if current.compContSquaredScored() > bestBin.compContSquaredScored():
                        self.logger.debug("The newly found bin is better, going with that")
                        # Found a better one, replace the best bin with that
                        bestCandidates[i] = current
                    # Known limitation: if multiple bins contain the same
                    # contig, a bin can overlap > 50% with more than one
                    # candidate. Breaking out here means the bin replaced is
                    # not necessarily the 'optimal' one.
                    break

                # only one of the two ratios exceeds 50%
                self.logger.warning("Bins %s and %s with sizes %i and %i overlap by %i bases and so have unusual overlap ratios, proceeding as if they are distinct bins" % (bestBin.binId, current.binId, bestBin.numBases(), current.numBases(), overlap))

            if not accountedFor:
                currentRoundCandidatesToAdd.append(current)

        # Add all the bins that hit no other bins to the bestCandidates list
        # Do this after so that bins are not compared to themselves
        for b in currentRoundCandidatesToAdd:
            self.logger.debug("Adding unmatched bin %s from %s" % (b.binId, b.binningIndex))
            bestCandidates.append(b)

    return bestCandidates
def identifyOutliers(self, outDir, binFiles, tetraProfileFile, distribution, reportType, outputFile):
    """Identify sequences that are outliers.

    For every bin, each sequence's GC, coding density (CD) and
    tetranucleotide (TD) values are compared with bounds looked up in the
    'gc_dist', 'cd_dist' and 'td_dist' reference distributions. Sequences
    reported as outliers are written as tab-separated rows to `outputFile`,
    after a single header row.

    Parameters
    ----------
    outDir : directory expected to contain
        bins/<binId>/<DefaultValues.PRODIGAL_GFF> for every bin
    binFiles : FASTA files of the bins to examine
    tetraProfileFile : file passed to GenomicSignatures.read()
    distribution : number used to select the bound keys; the GC and CD
        keys are those nearest to (100 -/+ distribution) / 2
    reportType : 'any' reports a sequence outlying in at least one
        distribution, 'all' only a sequence outlying in all three; any
        other value reports nothing
    outputFile : path of the report to write

    Exits the program with status 1 when a bin has no gene feature file.
    """
    self.logger.info(' Reading reference distributions.')
    gcBounds = readDistribution('gc_dist')
    cdBounds = readDistribution('cd_dist')
    tdBounds = readDistribution('td_dist')

    # the context manager closes the report even on the sys.exit() path
    with open(outputFile, 'w') as fout:
        fout.write('Bin Id\tSequence Id\tSequence length\tOutlying distributions')
        fout.write('\tSequence GC\tMean bin GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (distribution, distribution))
        fout.write('\tSequence CD\tMean bin CD\tLower CD bound (%s%%)' % distribution)
        fout.write('\tSequence TD\tMean bin TD\tUpper TD bound (%s%%)\n' % distribution)

        self.logger.info('')

        for processedBins, binFile in enumerate(binFiles, 1):
            binId = binIdFromFilename(binFile)

            self.logger.info(' Finding outliers in %s (%d of %d).' % (binId, processedBins, len(binFiles)))

            seqs = readFasta(binFile)

            meanGC, deltaGCs, seqGC = self.gcDist(seqs)

            # NOTE(review): the signatures are re-read for every bin; this
            # looks loop-invariant but is left in place until it is confirmed
            # that the helpers below do not modify tetraSigs.
            genomicSig = GenomicSignatures(K=4, threads=1)
            tetraSigs = genomicSig.read(tetraProfileFile)
            binSig = self.binTetraSig(seqs, tetraSigs)
            meanTD, deltaTDs = self.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)

            gffFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_GFF)
            if not os.path.exists(gffFile):
                self.logger.error(' [Error] Missing gene feature file (%s). This plot if not compatible with the --genes option.\n' % DefaultValues.PRODIGAL_GFF)
                sys.exit(1)

            prodigalParser = ProdigalGeneFeatureParser(gffFile)
            meanCD, deltaCDs, CDs = self.codingDensityDist(seqs, prodigalParser)

            # find keys into GC and CD distributions; any sequence length
            # entry is used to obtain the available bound keys
            closestGC = findNearest(np.array(list(gcBounds.keys())), meanGC)
            gcLenBounds = gcBounds[closestGC]
            sampleSeqLen = list(gcLenBounds.keys())[0]
            d = gcLenBounds[sampleSeqLen]
            gcLowerBoundKey = findNearest(list(d.keys()), (100 - distribution) / 2.0)
            gcUpperBoundKey = findNearest(list(d.keys()), (100 + distribution) / 2.0)

            closestCD = findNearest(np.array(list(cdBounds.keys())), meanCD)
            cdLenBounds = cdBounds[closestCD]
            sampleSeqLen = list(cdLenBounds.keys())[0]
            d = cdLenBounds[sampleSeqLen]
            cdLowerBoundKey = findNearest(list(d.keys()), (100 - distribution) / 2.0)

            tdBoundKey = findNearest(list(tdBounds[list(tdBounds.keys())[0]].keys()), distribution)

            # index addresses the per-sequence lists returned above, which
            # are assumed to follow the iteration order of seqs
            for index, (seqId, seq) in enumerate(seqs.items()):
                seqLen = len(seq)

                # find GC, CD, and TD bounds
                closestSeqLen = findNearest(list(gcLenBounds.keys()), seqLen)
                gcLowerBound = gcLenBounds[closestSeqLen][gcLowerBoundKey]
                gcUpperBound = gcLenBounds[closestSeqLen][gcUpperBoundKey]

                closestSeqLen = findNearest(list(cdLenBounds.keys()), seqLen)
                cdLowerBound = cdLenBounds[closestSeqLen][cdLowerBoundKey]

                closestSeqLen = findNearest(list(tdBounds.keys()), seqLen)
                tdBound = tdBounds[closestSeqLen][tdBoundKey]

                outlyingDists = []
                if deltaGCs[index] < gcLowerBound or deltaGCs[index] > gcUpperBound:
                    outlyingDists.append('GC')

                if deltaCDs[index] < cdLowerBound:
                    outlyingDists.append('CD')

                if deltaTDs[index] > tdBound:
                    outlyingDists.append('TD')

                if (reportType == 'any' and len(outlyingDists) >= 1) or (reportType == 'all' and len(outlyingDists) == 3):
                    fout.write(binId + '\t' + seqId + '\t%d' % seqLen + '\t' + ','.join(outlyingDists))
                    fout.write('\t%.1f\t%.1f\t%.1f\t%.1f' % (seqGC[index] * 100, meanGC * 100, (meanGC + gcLowerBound) * 100, (meanGC + gcUpperBound) * 100))
                    fout.write('\t%.1f\t%.1f\t%.1f' % (CDs[index] * 100, meanCD * 100, (meanCD + cdLowerBound) * 100))
                    fout.write('\t%.3f\t%.3f\t%.3f' % (deltaTDs[index], meanTD, tdBound) + '\n')