def testNoRestriction(self):
    """
    Testing for acceptance against a title filter that has no restrictions
    should return C{TitleFilter.DEFAULT_ACCEPT}.
    """
    tf = TitleFilter()
    self.assertEqual(TitleFilter.DEFAULT_ACCEPT, tf.accept('hey'))
def __init__(self, limit=None, maxAlignmentsPerRead=None,
             minSequenceLen=None, maxSequenceLen=None, minStart=None,
             maxStop=None, oneAlignmentPerRead=False, maxHspsPerHit=None,
             scoreCutoff=None, percentageIdenticalCutoff=None,
             percentagePositiveCutoff=None, whitelist=None, blacklist=None,
             whitelistFile=None, blacklistFile=None, titleRegex=None,
             negativeTitleRegex=None, truncateTitlesAfter=None,
             taxonomy=None, readIdRegex=None):
    self.limit = limit
    self.maxAlignmentsPerRead = maxAlignmentsPerRead
    self.minSequenceLen = minSequenceLen
    self.maxSequenceLen = maxSequenceLen
    self.minStart = minStart
    self.maxStop = maxStop
    self.oneAlignmentPerRead = oneAlignmentPerRead
    self.maxHspsPerHit = maxHspsPerHit
    self.scoreCutoff = scoreCutoff
    self.percentageIdenticalCutoff = percentageIdenticalCutoff
    self.percentagePositiveCutoff = percentagePositiveCutoff

    # If we've been asked to filter on matched sequence titles in any way,
    # build a title filter.
    if (whitelist or blacklist or whitelistFile or blacklistFile or
            titleRegex or negativeTitleRegex or truncateTitlesAfter):
        self.titleFilter = TitleFilter(
            whitelist=whitelist, blacklist=blacklist,
            whitelistFile=whitelistFile, blacklistFile=blacklistFile,
            positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
            truncateAfter=truncateTitlesAfter)
    else:
        self.titleFilter = None

    if taxonomy is not None:
        self.lineageFetcher = LineageFetcher()
    else:
        self.lineageFetcher = None
    self.taxonomy = taxonomy

    if readIdRegex is None:
        self.readIdRegex = None
    else:
        self.readIdRegex = re.compile(readIdRegex)

    self.count = 0
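# --- Illustrative sketch (not part of the original source) ---
# A minimal standalone example of the TitleFilter behaviour this
# constructor relies on: a whitelist match wins outright, a blacklist or
# negative-regex match rejects, and everything else gets the default
# acceptance. The keyword arguments and accept() return values come from
# the constructor call above and the tests in this module; the import
# path and example titles are assumptions.
from dark.filter import TitleFilter  # import path assumed

tf = TitleFilter(whitelist=['always ok'], blacklist=['never ok'],
                 negativeRegex=r'unwanted')

assert tf.accept('always ok') == TitleFilter.WHITELIST_ACCEPT
assert tf.accept('never ok') == TitleFilter.REJECT
assert tf.accept('something unwanted here') == TitleFilter.REJECT
assert tf.accept('anything else') == TitleFilter.DEFAULT_ACCEPT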
def __init__(self, assetDir='out', sampleName=None, sampleNameRegex=None,
             format_='fasta', proteinFastaFilenames=None,
             saveReadLengths=False, titleRegex=None,
             negativeTitleRegex=None, pathogenDataDir='pathogen-data'):
    self._assetDir = assetDir
    self._sampleName = sampleName
    self._sampleNameRegex = (re.compile(sampleNameRegex) if sampleNameRegex
                             else None)
    if format_ in ('fasta', 'fastq'):
        self._format = format_
    else:
        raise ValueError("format_ must be either 'fasta' or 'fastq'.")
    self._saveReadLengths = saveReadLengths

    if titleRegex or negativeTitleRegex:
        self.titleFilter = TitleFilter(
            positiveRegex=titleRegex, negativeRegex=negativeTitleRegex)
    else:
        self.titleFilter = None

    self._pathogenDataDir = pathogenDataDir

    self._pathogenProteinCount = getPathogenProteinCounts(
        proteinFastaFilenames)

    # pathogenNames will be a dict of dicts of dicts. The first two keys
    # will be a pathogen name and a sample name. The final dict will
    # contain 'proteins' (a list of dicts) and 'uniqueReadCount' (an int).
    self.pathogenNames = {}

    # sampleNames is keyed by sample name and will have values that hold
    # the sample's alignment panel index.html file.
    self.sampleNames = {}

    self.pathogenSampleFiles = PathogenSampleFiles(self, format_=format_)
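# --- Illustrative sketch (not part of the original source) ---
# The nested shape of self.pathogenNames described in the comment above,
# written out with made-up pathogen and sample names. The per-protein
# dicts are left empty here; their keys are whatever the grouper records
# for each matched protein.
pathogenNamesExample = {
    'Example virus': {              # first key: pathogen name
        'sample-1': {               # second key: sample name
            'proteins': [],         # a list of per-protein dicts
            'uniqueReadCount': 17,  # an int
        },
    },
}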
def testBlacklist(self):
    """
    Testing for acceptance against a title filter with a blacklist
    must work.
    """
    tf = TitleFilter(blacklist=['never ok'], positiveRegex='ok')
    self.assertEqual(TitleFilter.REJECT, tf.accept('never ok'))
def testNegativeRegex(self):
    """
    Testing for acceptance against a title filter with a negative regex
    must work.
    """
    tf = TitleFilter(negativeRegex=r'x+\s')
    self.assertEqual(TitleFilter.REJECT, tf.accept('hey xxx you'))
    self.assertEqual(TitleFilter.DEFAULT_ACCEPT, tf.accept('hey xxyou'))
def testWhitelistTakesPrecedenceOverBlacklist(self):
    """
    Testing for acceptance against a title filter with a whitelist and a
    blacklist that contain the same title must work (the whitelist takes
    precedence).
    """
    tf = TitleFilter(whitelist=['always ok'], blacklist=['always ok'])
    self.assertEqual(TitleFilter.WHITELIST_ACCEPT, tf.accept('always ok'))
def testWhitelist(self):
    """
    Testing for acceptance against a title filter with a whitelist must
    work even when a title is ruled out for other violations.
    """
    tf = TitleFilter(whitelist=['always ok'], negativeRegex='ok')
    self.assertEqual(TitleFilter.WHITELIST_ACCEPT, tf.accept('always ok'))
    self.assertEqual(TitleFilter.REJECT, tf.accept('always ok not'))
def testNegativeRegexHasPrecedenceOverRepeatedTruncatedTitle(self):
    """
    Testing for acceptance against a title filter with a negative regex
    must have precedence over checking for truncated titles when the same
    matching title (that will be truncated) is passed twice.
    """
    tf = TitleFilter(negativeRegex=r'spotty', truncateAfter='virus')
    self.assertEqual(TitleFilter.REJECT, tf.accept('spotty virus 1'))
    self.assertEqual(TitleFilter.REJECT, tf.accept('spotty virus 1'))
def testWhitelistOnly(self):
    """
    Testing for acceptance against a title filter with a whitelist and a
    negative regex that matches everything.
    """
    tf = TitleFilter(whitelist=['always ok'], negativeRegex='.')
    self.assertEqual(TitleFilter.WHITELIST_ACCEPT, tf.accept('always ok'))
    self.assertEqual(TitleFilter.REJECT, tf.accept('always not ok'))
    self.assertEqual(TitleFilter.REJECT, tf.accept('rubbish'))
def testBlacklistFile(self):
    """
    Testing for acceptance against a title filter with a blacklist file.
    """
    data = '\n'.join(['id1', 'id2']) + '\n'
    with patch.object(builtins, 'open', mock_open(read_data=data)):
        tf = TitleFilter(blacklistFile='black.txt')
        self.assertEqual(TitleFilter.REJECT, tf.accept('id1'))
        self.assertEqual(TitleFilter.REJECT, tf.accept('id2'))
        self.assertEqual(TitleFilter.DEFAULT_ACCEPT, tf.accept('id3'))
def testWhitelistFileOnly(self):
    """
    Testing for acceptance against a title filter with a whitelist file
    and a negative regex that matches everything.
    """
    data = '\n'.join(['id1', 'id2']) + '\n'
    with patch.object(builtins, 'open', mock_open(read_data=data)):
        tf = TitleFilter(whitelistFile='white.txt', negativeRegex='.')
        self.assertEqual(TitleFilter.WHITELIST_ACCEPT, tf.accept('id1'))
        self.assertEqual(TitleFilter.WHITELIST_ACCEPT, tf.accept('id2'))
        self.assertEqual(TitleFilter.REJECT, tf.accept('id3'))
def testPartialWordTruncation(self):
    """
    Testing for acceptance against a title filter with title truncation
    in effect must work if the title contains the C{truncateAfter} string
    as a partial word.
    """
    tf = TitleFilter(truncateAfter=r'virus')
    # Note that the truncation code will chop off the first part of the
    # title (the title ID).
    self.assertEqual(TitleFilter.DEFAULT_ACCEPT,
                     tf.accept('gi|400684|gb|AY421767.1| rotavirus 1'))
    self.assertEqual(TitleFilter.REJECT,
                     tf.accept('gi|400684|gb|AY421767.1| rotavirus 2'))
def testBlacklistFileAndBlacklist(self):
    """
    Testing for acceptance against a title filter with a blacklist file
    and some specific other blacklist titles.
    """
    data = '\n'.join(['id1', 'id2']) + '\n'
    mockOpener = mockOpen(read_data=data)
    with patch.object(builtins, 'open', mockOpener):
        tf = TitleFilter(blacklistFile='black.txt', blacklist=set(['id3']))
        self.assertEqual(TitleFilter.REJECT, tf.accept('id1'))
        self.assertEqual(TitleFilter.REJECT, tf.accept('id2'))
        self.assertEqual(TitleFilter.REJECT, tf.accept('id3'))
        self.assertEqual(TitleFilter.DEFAULT_ACCEPT, tf.accept('id4'))
def testWordTruncationRepeat(self):
    """
    Testing for acceptance against a title filter with title truncation
    in effect must allow the exact same title twice, even if the title is
    being truncated.
    """
    tf = TitleFilter(truncateAfter=r'virus')
    # Note that the truncation code will chop off the first part of the
    # title (the title ID).
    self.assertEqual(TitleFilter.DEFAULT_ACCEPT,
                     tf.accept('gi|400684|gb|AY421767.1| herpes virus 1'))
    self.assertEqual(TitleFilter.DEFAULT_ACCEPT,
                     tf.accept('gi|400684|gb|AY421767.1| herpes virus 1'))
def testWhitelistFileAndWhitelistOnly(self):
    """
    Testing for acceptance against a title filter with a whitelist file
    and some specific whitelist titles, with a negative regex that
    matches everything.
    """
    data = '\n'.join(['id1', 'id2']) + '\n'
    mockOpener = mockOpen(read_data=data)
    with patch.object(builtins, 'open', mockOpener):
        tf = TitleFilter(whitelistFile='white.txt', whitelist=set(['id3']),
                         negativeRegex='.')
        self.assertEqual(TitleFilter.WHITELIST_ACCEPT, tf.accept('id1'))
        self.assertEqual(TitleFilter.WHITELIST_ACCEPT, tf.accept('id2'))
        self.assertEqual(TitleFilter.WHITELIST_ACCEPT, tf.accept('id3'))
        self.assertEqual(TitleFilter.REJECT, tf.accept('id4'))
def _filter(self, limit, minSequenceLen, maxSequenceLen, minStart, maxStop,
            oneAlignmentPerRead, maxHspsPerHit, scoreCutoff, whitelist,
            blacklist, titleRegex, negativeTitleRegex, truncateTitlesAfter,
            taxonomy, iteratorIndex, readIdRegex):
    """
    Filter the read alignments in self.

    Do not call this function directly, instead use self.filter (above).
    Argument defaults and descriptions (other than for iteratorIndex) are
    as in self.filter.

    @param iteratorIndex: An index into self._iterators. Calling the
        iterator function will return a generator that yields
        C{ReadAlignments} instances.
    @return: A generator that yields C{ReadAlignments} instances.
    """
    # Implementation notes:
    #
    # 1. The order in which we carry out the filtering actions can make
    #    a big difference in the result of this function. The current
    #    ordering is based on what seems reasonable - it may not be the
    #    best way to do things. E.g., if maxHspsPerHit is 1 and there
    #    is a title regex, which should we perform first?
    #
    #    We perform filtering based on alignment before that based on
    #    HSPs. That's because there's no point filtering all HSPs for
    #    an alignment that we end up throwing away anyhow.
    #
    # 2. This function could be made faster if it first looked at its
    #    arguments and dynamically created an acceptance function
    #    (taking a readAlignments as an argument). The acceptance
    #    function would run without examining the above arguments for
    #    each match the way the current code does.
    #
    # 3. A better approach with readIdRegex might be to allow the
    #    passing of a regex object. Then the caller would make the
    #    regex with whatever flags they liked (e.g., case insensitive).

    #
    # Alignment-only (i.e., non-HSP based) filtering.
    #

    # If we've been asked to filter on matched sequence titles in any way,
    # build a title filter.
    if (whitelist or blacklist or titleRegex or negativeTitleRegex or
            truncateTitlesAfter):
        titleFilter = TitleFilter(
            whitelist=whitelist, blacklist=blacklist,
            positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
            truncateAfter=truncateTitlesAfter)
    else:
        titleFilter = None

    if taxonomy is not None:
        lineageFetcher = LineageFetcher()

    if readIdRegex is not None:
        readIdRegex = re.compile(readIdRegex)

    count = 0
    for readAlignments in self._iterators[iteratorIndex]():
        if limit is not None and count == limit:
            return

        # Filter on the read id.
        if (readIdRegex and
                readIdRegex.search(readAlignments.read.id) is None):
            continue

        if titleFilter:
            # Remove alignments against sequences whose titles are
            # unacceptable.
            wantedAlignments = []
            for alignment in readAlignments:
                if (titleFilter.accept(alignment.subjectTitle) !=
                        TitleFilter.REJECT):
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        # Only return alignments that are against sequences of the
        # desired length.
        if minSequenceLen is not None or maxSequenceLen is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                length = alignment.subjectLength
                if not ((minSequenceLen is not None and
                         length < minSequenceLen) or
                        (maxSequenceLen is not None and
                         length > maxSequenceLen)):
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        if taxonomy is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                lineage = lineageFetcher.lineage(alignment.subjectTitle)
                if lineage:
                    for taxonomyIdAndScientificName in lineage:
                        if taxonomy in taxonomyIdAndScientificName:
                            wantedAlignments.append(alignment)
                else:
                    # No lineage info was found. Keep the alignment
                    # since we can't rule it out. We could add another
                    # option to control this.
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        if oneAlignmentPerRead and readAlignments:
            readAlignments[:] = [bestAlignment(readAlignments)]

        #
        # From here on we do only HSP-based filtering.
        #

        # Throw out any unwanted HSPs due to maxHspsPerHit.
        if maxHspsPerHit is not None:
            for alignment in readAlignments:
                hsps = alignment.hsps
                if len(hsps) > maxHspsPerHit:
                    alignment.hsps = hsps[:maxHspsPerHit]

        # Throw out HSPs whose scores are not good enough.
        if scoreCutoff is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                hsps = alignment.hsps
                wantedHsps = []
                for hsp in hsps:
                    if hsp.betterThan(scoreCutoff):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        # Throw out HSPs that don't match in the desired place on the
        # matched sequence.
        if minStart is not None or maxStop is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                hsps = alignment.hsps
                wantedHsps = []
                for hsp in hsps:
                    if not ((minStart is not None and
                             hsp.readStartInSubject < minStart) or
                            (maxStop is not None and
                             hsp.readEndInSubject > maxStop)):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        yield readAlignments
        count += 1

    if taxonomy:
        lineageFetcher.close()
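# --- Illustrative sketch (not part of the original source) ---
# Implementation note 2 above suggests building a single acceptance
# function up front instead of re-inspecting every filter argument for
# every read. A minimal version of that idea, covering just two of the
# checks. The attribute names readAlignments.read.id and
# alignment.subjectLength come from the code above; everything else here
# (the function name, its arguments) is hypothetical, and unlike the real
# _filter this sketch only accepts or rejects whole readAlignments rather
# than trimming alignment lists in place.
import re


def makeAcceptanceFunction(readIdRegex=None, minSequenceLen=None):
    """
    Return a function that takes a readAlignments instance and returns
    True if it passes all the checks configured here.
    """
    tests = []

    if readIdRegex is not None:
        regex = re.compile(readIdRegex)
        tests.append(lambda readAlignments: regex.search(
            readAlignments.read.id) is not None)

    if minSequenceLen is not None:
        tests.append(lambda readAlignments: any(
            alignment.subjectLength >= minSequenceLen
            for alignment in readAlignments))

    def accept(readAlignments):
        # All configured tests must pass; with no tests, everything is
        # accepted.
        return all(test(readAlignments) for test in tests)

    return accept


# Usage (hypothetical): accept = makeAcceptanceFunction(readIdRegex='^s1-')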
def filter(self, minMatchingReads=None, minMedianScore=None,
           withScoreBetterThan=None, minNewReads=None, minCoverage=None,
           maxTitles=None, sortOn='maxScore', titleRegex=None,
           negativeTitleRegex=None):
    """
    Filter the titles in self to create another TitlesAlignments.

    @param minMatchingReads: titles that are matched by fewer reads are
        unacceptable.
    @param minMedianScore: titles that are matched with a median score
        less than this are unacceptable.
    @param withScoreBetterThan: if the best score for a title is not as
        good as this value, the title is not acceptable.
    @param minNewReads: The C{float} fraction of its reads by which a new
        title's read set must differ from the read sets of all previously
        seen titles in order for this title to be considered acceptably
        different (and therefore interesting).
    @param minCoverage: The C{float} minimum fraction of the title
        sequence that must be matched by at least one read.
    @param maxTitles: A non-negative C{int} maximum number of titles to
        keep. If more titles than this are present, titles will be sorted
        (according to C{sortOn}) and only the best will be retained.
    @param sortOn: A C{str} attribute to sort on, used only if C{maxTitles}
        is not C{None}. See the C{sortTitles} method below for the legal
        values.
    @param titleRegex: A regex that sequence titles must match.
    @param negativeTitleRegex: A regex that sequence titles must not
        match.
    @raise ValueError: If C{maxTitles} is less than zero or the value of
        C{sortOn} is unknown.
    @return: A new L{TitlesAlignments} instance containing only the
        matching titles.
    """
    # Use a ReadSetFilter only if we're checking that read sets are
    # sufficiently new.
    if minNewReads is None:
        readSetFilter = None
    else:
        if self.readSetFilter is None:
            self.readSetFilter = ReadSetFilter(minNewReads)
        readSetFilter = self.readSetFilter

    result = TitlesAlignments(
        self.readsAlignments, self.scoreClass, self.readSetFilter,
        importReadsAlignmentsTitles=False)

    if maxTitles is not None and len(self) > maxTitles:
        if maxTitles < 0:
            raise ValueError('maxTitles (%r) cannot be negative.' %
                             maxTitles)
        else:
            # There are too many titles. Make a sorted list of them so
            # we loop through them (below) in the desired order and can
            # break when/if we've reached the maximum. We can't just
            # take the first maxTitles titles from the sorted list now,
            # as some of those titles might later be discarded by the
            # filter and then we'd return a result with fewer titles
            # than we should.
            titles = self.sortTitles(sortOn)
    else:
        titles = self.keys()

    if titleRegex or negativeTitleRegex:
        titleFilter = TitleFilter(positiveRegex=titleRegex,
                                  negativeRegex=negativeTitleRegex)
    else:
        titleFilter = None

    for title in titles:
        # Test max titles up front, as it may be zero.
        if maxTitles is not None and len(result) == maxTitles:
            break

        # Test positive and negative regexps.
        if (titleFilter and
                titleFilter.accept(title) == TitleFilter.REJECT):
            continue

        titleAlignments = self[title]
        if (minMatchingReads is not None and
                titleAlignments.readCount() < minMatchingReads):
            continue

        # To compare the median score with another score, we must
        # convert both values to instances of the score class used in
        # this data set so they can be compared without us needing to
        # know if numerically greater scores are considered better or
        # not.
        if (minMedianScore is not None and
                self.scoreClass(titleAlignments.medianScore()) <
                self.scoreClass(minMedianScore)):
            continue

        if (withScoreBetterThan is not None and not
                titleAlignments.hasScoreBetterThan(withScoreBetterThan)):
            continue

        if (minCoverage is not None and
                titleAlignments.coverage() < minCoverage):
            continue

        if (readSetFilter and not
                readSetFilter.accept(title, titleAlignments)):
            continue

        result.addTitle(title, titleAlignments)

    return result
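# --- Illustrative sketch (not part of the original source) ---
# The median-score comparison above relies on the score class making '<'
# mean "worse than", whatever the raw numbers are. Two made-up score
# classes (not the library's own) showing the idea: for bit scores a
# bigger number is better, for e-values a smaller number is better, yet
# wrapped values can be compared the same way in both cases.


class HigherIsBetter:
    """A score where numerically greater values are better (e.g. bit
    scores)."""
    def __init__(self, score):
        self.score = score

    def __lt__(self, other):
        # "Less than" means "worse than".
        return self.score < other.score


class LowerIsBetter:
    """A score where numerically smaller values are better (e.g.
    e-values)."""
    def __init__(self, score):
        self.score = score

    def __lt__(self, other):
        # "Less than" means "worse than", so a bigger e-value is worse.
        return self.score > other.score


assert HigherIsBetter(40.0) < HigherIsBetter(55.0)  # 40 bits is worse.
assert LowerIsBetter(1e-3) < LowerIsBetter(1e-10)   # 1e-3 is worse.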
def _filter(self, minLength=None, maxLength=None, removeGaps=False,
            whitelist=None, blacklist=None, titleRegex=None,
            negativeTitleRegex=None, truncateTitlesAfter=None, indices=None,
            head=None, removeDuplicates=False, modifier=None,
            randomSubset=None, trueLength=None, sampleFraction=None,
            sequenceNumbersFile=None):
    """
    Filter a set of reads to produce a matching subset.

    See docstring for self.filter (above) for parameter docs.

    @return: A generator that yields C{Read} instances.
    """
    def _wantedSequences(filename):
        """
        Read and yield integer sequence numbers from a file.

        @raise ValueError: If the sequence numbers are not all positive or
            are not ascending.
        @return: A generator that yields C{int} sequence numbers.
        """
        with open(filename) as fp:
            lastNumber = None
            for line in fp:
                n = int(line)
                if lastNumber is None:
                    if n < 1:
                        raise ValueError(
                            'First line of sequence number file %r must '
                            'be at least 1.' % filename)
                    lastNumber = n
                    yield n
                else:
                    if n > lastNumber:
                        lastNumber = n
                        yield n
                    else:
                        raise ValueError(
                            'Sequence number file %r contains '
                            'non-ascending numbers %d and %d.' %
                            (filename, lastNumber, n))

    if randomSubset is not None and sampleFraction is not None:
        raise ValueError('randomSubset and sampleFraction cannot be '
                         'used simultaneously in a filter. Call filter '
                         'twice instead.')

    if sequenceNumbersFile is None:
        nextWantedSequenceNumber = None
        wantedSequenceNumberGeneratorExhausted = False
    else:
        wantedSequenceNumberGenerator = _wantedSequences(
            sequenceNumbersFile)
        try:
            nextWantedSequenceNumber = next(wantedSequenceNumberGenerator)
        except StopIteration:
            # There was a sequence number file, but it was empty.
            return
        else:
            wantedSequenceNumberGeneratorExhausted = False

    if (whitelist or blacklist or titleRegex or negativeTitleRegex or
            truncateTitlesAfter):
        titleFilter = TitleFilter(
            whitelist=whitelist, blacklist=blacklist,
            positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
            truncateAfter=truncateTitlesAfter)
    else:
        titleFilter = None

    if removeDuplicates:
        sequencesSeen = set()

    if sampleFraction is not None:
        if sampleFraction == 0.0:
            # The filter returns nothing.
            return
        elif sampleFraction == 1.0:
            # Passing 1.0 can be treated the same as passing no value.
            # This makes the loop code simpler.
            sampleFraction = None

    if randomSubset is not None and trueLength is None:
        trueLength = self._length

    yieldCount = 0

    for readIndex, read in enumerate(self):
        if wantedSequenceNumberGeneratorExhausted:
            return

        if nextWantedSequenceNumber is not None:
            if readIndex + 1 == nextWantedSequenceNumber:
                # We want this sequence.
                try:
                    nextWantedSequenceNumber = next(
                        wantedSequenceNumberGenerator)
                except StopIteration:
                    # The sequence number iterator ran out of sequence
                    # numbers. We must let the rest of the filtering
                    # continue for the current sequence in case we
                    # throw it out for other reasons (as we might have
                    # done for any of the earlier wanted sequence
                    # numbers).
                    wantedSequenceNumberGeneratorExhausted = True
            else:
                # This sequence isn't wanted.
                continue

        if (sampleFraction is not None and
                uniform(0.0, 1.0) > sampleFraction):
            # Note that we don't have to worry about the 0.0 or 1.0
            # cases in the above if, as they have been dealt with
            # before the loop.
            continue

        if randomSubset is not None:
            if yieldCount == randomSubset:
                # The random subset has already been fully returned.
                # There's no point in going any further through the
                # input.
                return
            elif uniform(0.0, 1.0) > ((randomSubset - yieldCount) /
                                      (trueLength - readIndex)):
                continue

        if head is not None and readIndex == head:
            # We're completely done.
            return

        readLen = len(read)
        if ((minLength is not None and readLen < minLength) or
                (maxLength is not None and readLen > maxLength)):
            continue

        if removeGaps:
            sequence = read.sequence.replace('-', '')
            read = read.__class__(read.id, sequence, read.quality)

        if (titleFilter and
                titleFilter.accept(read.id) == TitleFilter.REJECT):
            continue

        if indices is not None and readIndex not in indices:
            continue

        if removeDuplicates:
            if read.sequence in sequencesSeen:
                continue
            sequencesSeen.add(read.sequence)

        if modifier:
            modified = modifier(read)
            if modified is None:
                continue
            else:
                read = modified

        yield read
        yieldCount += 1
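# --- Illustrative sketch (not part of the original source) ---
# The sequenceNumbersFile argument expects 1-based, strictly ascending
# integers, one per line, as enforced by _wantedSequences above. This
# standalone helper mirrors that behaviour (it is not the library
# function itself) so the accepted file format can be seen in isolation.
from io import StringIO


def readWantedSequenceNumbers(fp):
    """Yield ascending, 1-based sequence numbers from an open file."""
    lastNumber = None
    for line in fp:
        n = int(line)
        if lastNumber is None and n < 1:
            raise ValueError('The first number must be at least 1.')
        if lastNumber is not None and n <= lastNumber:
            raise ValueError('Numbers must be strictly ascending.')
        lastNumber = n
        yield n


assert list(readWantedSequenceNumbers(StringIO('1\n3\n4\n'))) == [1, 3, 4]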