def testWhitelist(self):
    """
    A whitelisted title must be accepted even when the title would
    otherwise be rejected (here, via a negative regex), while a
    non-whitelisted title violating the regex must be rejected.
    """
    titleFilter = TitleFilter(whitelist=['always ok'], negativeRegex='ok')
    self.assertEqual(TitleFilter.WHITELIST_ACCEPT,
                     titleFilter.accept('always ok'))
    self.assertEqual(TitleFilter.REJECT,
                     titleFilter.accept('always ok not'))
def testNegativeRegex(self):
    """
    A filter built with a negative regex must reject matching titles and
    give default acceptance to non-matching ones.
    """
    titleFilter = TitleFilter(negativeRegex=r'x+\s')
    self.assertEqual(TitleFilter.REJECT, titleFilter.accept('hey xxx you'))
    self.assertEqual(TitleFilter.DEFAULT_ACCEPT,
                     titleFilter.accept('hey xxyou'))
def testPositiveRegex(self):
    """
    A filter built with a positive regex must give default acceptance to
    matching titles and reject non-matching ones.
    """
    filt = TitleFilter(positiveRegex=r'x+\s')
    self.assertEqual(TitleFilter.DEFAULT_ACCEPT, filt.accept('hey xxx you'))
    self.assertEqual(TitleFilter.REJECT, filt.accept('hey xxyou'))
def testWhitelist(self):
    """
    Whitelist membership must win over a negative regex match, but a
    title not on the whitelist is still subject to the regex.
    """
    filt = TitleFilter(whitelist=['always ok'], negativeRegex='ok')
    for expected, title in ((TitleFilter.WHITELIST_ACCEPT, 'always ok'),
                            (TitleFilter.REJECT, 'always ok not')):
        self.assertEqual(expected, filt.accept(title))
def testNegativeRegex(self):
    """
    Titles matching a negative regex must be rejected; all other titles
    must receive the default acceptance value.
    """
    filt = TitleFilter(negativeRegex=r'x+\s')
    for expected, title in ((TitleFilter.REJECT, 'hey xxx you'),
                            (TitleFilter.DEFAULT_ACCEPT, 'hey xxyou')):
        self.assertEqual(expected, filt.accept(title))
def testWhitelistOnly(self):
    """
    When a negative regex matches everything, only whitelisted titles
    can be accepted.
    """
    filt = TitleFilter(whitelist=['always ok'], negativeRegex='.')
    self.assertEqual(TitleFilter.WHITELIST_ACCEPT, filt.accept('always ok'))
    for title in ('always not ok', 'rubbish'):
        self.assertEqual(TitleFilter.REJECT, filt.accept(title))
def testWhitelistOnly(self):
    """
    A negative regex that matches every title must cause everything
    except whitelisted titles to be rejected.
    """
    titleFilter = TitleFilter(whitelist=['always ok'], negativeRegex='.')
    self.assertEqual(TitleFilter.WHITELIST_ACCEPT,
                     titleFilter.accept('always ok'))
    self.assertEqual(TitleFilter.REJECT, titleFilter.accept('always not ok'))
    self.assertEqual(TitleFilter.REJECT, titleFilter.accept('rubbish'))
def testBlacklistFile(self):
    """
    Ids read from a blacklist file must be rejected; ids not in the file
    must get the default acceptance.
    """
    data = 'id1\nid2\n'
    with patch.object(builtins, 'open', mock_open(read_data=data)):
        filt = TitleFilter(blacklistFile='black.txt')
        self.assertEqual(TitleFilter.REJECT, filt.accept('id1'))
        self.assertEqual(TitleFilter.REJECT, filt.accept('id2'))
        self.assertEqual(TitleFilter.DEFAULT_ACCEPT, filt.accept('id3'))
def testPartialWordTruncation(self):
    """
    Title truncation must also apply when the C{truncateAfter} string
    occurs as part of a word: the first title containing it is accepted
    and a second title, differing only after the truncation point, is
    rejected as a repeat.
    """
    filt = TitleFilter(truncateAfter=r'virus')
    # The truncation code chops off the first part of the title (the
    # title ID).
    self.assertEqual(TitleFilter.DEFAULT_ACCEPT,
                     filt.accept('gi|400684|gb|AY421767.1| rotavirus 1'))
    self.assertEqual(TitleFilter.REJECT,
                     filt.accept('gi|400684|gb|AY421767.1| rotavirus 2'))
def testBlacklistFile(self):
    """
    A title filter given a blacklist file must reject the ids listed in
    that file and give default acceptance to any other id.
    """
    data = 'id1\nid2\n'
    with patch.object(builtins, 'open', mockOpen(read_data=data)):
        filt = TitleFilter(blacklistFile='black.txt')
        for identifier in ('id1', 'id2'):
            self.assertEqual(TitleFilter.REJECT, filt.accept(identifier))
        self.assertEqual(TitleFilter.DEFAULT_ACCEPT, filt.accept('id3'))
def testWhitelistFileOnly(self):
    """
    With a whitelist file and a reject-everything negative regex, only
    ids from the file may be accepted.
    """
    data = 'id1\nid2\n'
    with patch.object(builtins, 'open', mock_open(read_data=data)):
        filt = TitleFilter(whitelistFile='white.txt', negativeRegex='.')
        for identifier in ('id1', 'id2'):
            self.assertEqual(TitleFilter.WHITELIST_ACCEPT,
                             filt.accept(identifier))
        self.assertEqual(TitleFilter.REJECT, filt.accept('id3'))
def testWhitelistFileOnly(self):
    """
    Ids read from a whitelist file must be accepted even when a negative
    regex matches every title; other ids must be rejected.
    """
    data = 'id1\nid2\n'
    opener = mockOpen(read_data=data)
    with patch.object(builtins, 'open', opener):
        titleFilter = TitleFilter(whitelistFile='white.txt',
                                  negativeRegex='.')
        self.assertEqual(TitleFilter.WHITELIST_ACCEPT,
                         titleFilter.accept('id1'))
        self.assertEqual(TitleFilter.WHITELIST_ACCEPT,
                         titleFilter.accept('id2'))
        self.assertEqual(TitleFilter.REJECT, titleFilter.accept('id3'))
def testWordTruncationRepeat(self):
    """
    With title truncation in effect, presenting the exact same title
    twice must be accepted both times (truncation must not make the
    repeat look like a new, conflicting title).
    """
    filt = TitleFilter(truncateAfter=r'virus')
    # The truncation code chops off the first part of the title (the
    # title ID).
    for _ in range(2):
        self.assertEqual(
            TitleFilter.DEFAULT_ACCEPT,
            filt.accept('gi|400684|gb|AY421767.1| herpes virus 1'))
def testPartialWordTruncation(self):
    """
    Truncation must trigger when the C{truncateAfter} string appears as
    a partial word, so two titles identical up to the truncation point
    are treated as duplicates.
    """
    titleFilter = TitleFilter(truncateAfter=r'virus')
    # Note that the truncation code will chop off the first part of the
    # title (the title ID).
    self.assertEqual(
        TitleFilter.DEFAULT_ACCEPT,
        titleFilter.accept('gi|400684|gb|AY421767.1| rotavirus 1'))
    self.assertEqual(
        TitleFilter.REJECT,
        titleFilter.accept('gi|400684|gb|AY421767.1| rotavirus 2'))
def testBlacklistFileAndBlacklist(self):
    """
    Ids from a blacklist file and ids passed directly in a blacklist
    must all be rejected; any other id gets the default acceptance.
    """
    data = 'id1\nid2\n'
    with patch.object(builtins, 'open', mockOpen(read_data=data)):
        filt = TitleFilter(blacklistFile='black.txt', blacklist={'id3'})
        for identifier in ('id1', 'id2', 'id3'):
            self.assertEqual(TitleFilter.REJECT, filt.accept(identifier))
        self.assertEqual(TitleFilter.DEFAULT_ACCEPT, filt.accept('id4'))
def testNoRestriction(self):
    """
    A filter created with no restrictions must return
    C{TitleFilter.DEFAULT_ACCEPT} for any title.
    """
    self.assertEqual(TitleFilter.DEFAULT_ACCEPT,
                     TitleFilter().accept('hey'))
def testBlacklist(self):
    """
    A blacklisted title must be rejected even when it also matches the
    positive regex.
    """
    filt = TitleFilter(blacklist=['never ok'], positiveRegex='ok')
    self.assertEqual(TitleFilter.REJECT, filt.accept('never ok'))
def testNoRestriction(self):
    """
    When a title filter is created with no arguments, every title must
    receive the default acceptance value.
    """
    titleFilter = TitleFilter()
    self.assertEqual(TitleFilter.DEFAULT_ACCEPT, titleFilter.accept('hey'))
def testWhitelistFileAndWhitelistOnly(self):
    """
    Ids from a whitelist file and ids passed directly in a whitelist
    must all be accepted despite a negative regex that matches every
    title; any other id must be rejected.
    """
    data = 'id1\nid2\n'
    with patch.object(builtins, 'open', mockOpen(read_data=data)):
        filt = TitleFilter(whitelistFile='white.txt', whitelist={'id3'},
                           negativeRegex='.')
        for identifier in ('id1', 'id2', 'id3'):
            self.assertEqual(TitleFilter.WHITELIST_ACCEPT,
                             filt.accept(identifier))
        self.assertEqual(TitleFilter.REJECT, filt.accept('id4'))
def testBlacklist(self):
    """
    Titles on the blacklist must be rejected, regardless of the positive
    regex also matching them.
    """
    titleFilter = TitleFilter(blacklist=['never ok'], positiveRegex='ok')
    self.assertEqual(TitleFilter.REJECT, titleFilter.accept('never ok'))
def testWhitelistTakesPrecedenceOverBlacklist(self):
    """
    If the same title appears on both the whitelist and the blacklist,
    the whitelist must win.
    """
    filt = TitleFilter(whitelist=['always ok'], blacklist=['always ok'])
    self.assertEqual(TitleFilter.WHITELIST_ACCEPT, filt.accept('always ok'))
def testWhitelistTakesPrecedenceOverBlacklist(self):
    """
    A title present in both the whitelist and the blacklist must be
    accepted: the whitelist takes precedence.
    """
    titleFilter = TitleFilter(whitelist=['always ok'],
                              blacklist=['always ok'])
    self.assertEqual(TitleFilter.WHITELIST_ACCEPT,
                     titleFilter.accept('always ok'))
def testPositiveRegexHasPrecedenceOverRepeatedTruncatedTitle(self):
    """
    A positive regex must be checked before the repeated-truncated-title
    logic: a non-matching title must be rejected both times it is seen,
    even though it would be truncated.
    """
    filt = TitleFilter(positiveRegex=r'xxxxx', truncateAfter='virus')
    for _ in range(2):
        self.assertEqual(TitleFilter.REJECT, filt.accept('spotty virus 1'))
def testNegativeRegexHasPrecedenceOverRepeatedTruncatedTitle(self):
    """
    A negative regex must be checked before the repeated-truncated-title
    logic: a matching title must be rejected both times it is seen, even
    though it would be truncated.
    """
    filt = TitleFilter(negativeRegex=r'spotty', truncateAfter='virus')
    for _ in range(2):
        self.assertEqual(TitleFilter.REJECT, filt.accept('spotty virus 1'))
def testNegativeRegexHasPrecedenceOverRepeatedTruncatedTitle(self):
    """
    Rejection via a negative regex must not be bypassed when the same
    (truncatable) title is presented a second time.
    """
    titleFilter = TitleFilter(negativeRegex=r'spotty',
                              truncateAfter='virus')
    self.assertEqual(TitleFilter.REJECT,
                     titleFilter.accept('spotty virus 1'))
    self.assertEqual(TitleFilter.REJECT,
                     titleFilter.accept('spotty virus 1'))
def _filter(self, limit, minSequenceLen, maxSequenceLen, minStart, maxStop,
            oneAlignmentPerRead, maxHspsPerHit, scoreCutoff, whitelist,
            blacklist, titleRegex, negativeTitleRegex, truncateTitlesAfter,
            taxonomy, iteratorIndex, readIdRegex):
    """
    Filter the read alignments in self.

    Do not call this function directly, instead use self.filter (above).
    Argument defaults and descriptions (other than for iteratorIndex) are
    as in self.filter.

    @param iteratorIndex: An index into self._iterators. Calling the
        iterator function will return a generator that yields
        C{ReadAlignments} instances.
    @return: A generator that yields C{ReadAlignments} instances.
    """
    # Implementation notes:
    #
    # 1. The order in which we carry out the filtering actions can make
    #    a big difference in the result of this function. The current
    #    ordering is based on what seems reasonable - it may not be the
    #    best way to do things. E.g., if maxHspsPerHit is 1 and there
    #    is a title regex, which should we perform first?
    #
    #    We perform filtering based on alignment before that based on
    #    HSPs. That's because there's no point filtering all HSPs for
    #    an alignment that we end up throwing away anyhow.
    #
    # 2. This function could be made faster if it first looked at its
    #    arguments and dynamically created an acceptance function
    #    (taking a readAlignments as an argument). The acceptance
    #    function would run without examining the above arguments for
    #    each match the way the current code does.
    #
    # 3. A better approach with readIdRegex might be to allow the
    #    passing of a regex object. Then the caller would make the
    #    regex with whatever flags they liked (e.g., case insensitive).

    #
    # Alignment-only (i.e., non-HSP based) filtering.
    #

    # If we've been asked to filter on matched sequence titles in any
    # way, build a title filter.
    if (whitelist or blacklist or titleRegex or negativeTitleRegex or
            truncateTitlesAfter):
        titleFilter = TitleFilter(
            whitelist=whitelist, blacklist=blacklist,
            positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
            truncateAfter=truncateTitlesAfter)
    else:
        titleFilter = None

    if taxonomy is not None:
        lineageFetcher = LineageFetcher()

    if readIdRegex is not None:
        readIdRegex = re.compile(readIdRegex)

    count = 0
    for readAlignments in self._iterators[iteratorIndex]():
        if limit is not None and count == limit:
            return

        # Filter on the read id.
        if (readIdRegex and
                readIdRegex.search(readAlignments.read.id) is None):
            continue

        if titleFilter:
            # Remove alignments against sequences whose titles are
            # unacceptable.
            wantedAlignments = []
            for alignment in readAlignments:
                if (titleFilter.accept(alignment.subjectTitle) !=
                        TitleFilter.REJECT):
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        # Only return alignments that are against sequences of the
        # desired length.
        if minSequenceLen is not None or maxSequenceLen is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                length = alignment.subjectLength
                if not ((minSequenceLen is not None and
                         length < minSequenceLen) or
                        (maxSequenceLen is not None and
                         length > maxSequenceLen)):
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        if taxonomy is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                lineage = lineageFetcher.lineage(alignment.subjectTitle)
                if lineage:
                    for taxonomyIdAndScientificName in lineage:
                        if taxonomy in taxonomyIdAndScientificName:
                            wantedAlignments.append(alignment)
                            # Stop after the first matching lineage
                            # level so the same alignment cannot be
                            # appended more than once.
                            break
                else:
                    # No lineage info was found. Keep the alignment
                    # since we can't rule it out. We could add another
                    # option to control this.
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        if oneAlignmentPerRead and readAlignments:
            readAlignments[:] = [bestAlignment(readAlignments)]

        #
        # From here on we do only HSP-based filtering.
        #

        # Throw out any unwanted HSPs due to maxHspsPerHit.
        if maxHspsPerHit is not None:
            for alignment in readAlignments:
                hsps = alignment.hsps
                if len(hsps) > maxHspsPerHit:
                    alignment.hsps = hsps[:maxHspsPerHit]

        # Throw out HSPs whose scores are not good enough.
        if scoreCutoff is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                hsps = alignment.hsps
                wantedHsps = []
                for hsp in hsps:
                    if hsp.betterThan(scoreCutoff):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        # Throw out HSPs that don't match in the desired place on the
        # matched sequence.
        if minStart is not None or maxStop is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                hsps = alignment.hsps
                wantedHsps = []
                for hsp in hsps:
                    if not ((minStart is not None and
                             hsp.readStartInSubject < minStart) or
                            (maxStop is not None and
                             hsp.readEndInSubject > maxStop)):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        yield readAlignments
        count += 1

    # Use the same 'is not None' test as when the fetcher was created,
    # so a falsy (but non-None) taxonomy value still closes the fetcher.
    if taxonomy is not None:
        lineageFetcher.close()
class ReadsAlignmentsFilter(object):
    """
    Provide a filter for C{ReadsAlignments} instances.

    @param limit: An C{int} limit on the number of records to read.
    @param maxAlignmentsPerRead: An C{int} limit on the number of
        alignments a read may have in order not to be filtered out. Reads
        with a greater number of alignments will be elided. Pass 0 to
        filter out reads that did not match (i.e., align to) any subjects.
        Use C{None} for no max alignments filtering.
    @param minSequenceLen: Sequences of lesser length will be elided.
    @param maxSequenceLen: Sequences of greater length will be elided.
    @param minStart: HSPs that start before this offset in the matched
        sequence should not be returned.
    @param maxStop: HSPs that end after this offset in the matched
        sequence should not be returned.
    @param oneAlignmentPerRead: If C{True}, only keep the best alignment
        for each read.
    @param maxHspsPerHit: The maximum number of HSPs to keep for each
        alignment for each read.
    @param scoreCutoff: A C{float} score. Matches with scores that are not
        better than this score will be ignored.
    @param whitelist: If not C{None}, a set of exact titles that are
        always acceptable (though the match info for a whitelist title
        may rule it out for other reasons).
    @param blacklist: If not C{None}, a set of exact titles that are never
        acceptable.
    @param whitelistFile: If not C{None}, a C{str} filename containing
        lines that give exact ids that are always acceptable.
    @param blacklistFile: If not C{None}, a C{str} filename containing
        lines that give exact ids that are never acceptable.
    @param titleRegex: A regex that sequence titles must match.
    @param negativeTitleRegex: A regex that sequence titles must not
        match.
    @param truncateTitlesAfter: A string that titles will be truncated
        beyond. If a truncated title has already been seen, that title
        will be elided.
    @param taxonomy: Either a C{str} name or an C{int} id of the taxonomic
        group on which should be filtered. E.g., 'Vira' will filter on
        viruses, while 11118 will filter on Coronaviridae.
    @param readIdRegex: A case-sensitive regex C{str} that read ids must
        match.
    @return: C{self}.
    """

    def __init__(self, limit=None, maxAlignmentsPerRead=None,
                 minSequenceLen=None, maxSequenceLen=None,
                 minStart=None, maxStop=None, oneAlignmentPerRead=False,
                 maxHspsPerHit=None, scoreCutoff=None, whitelist=None,
                 blacklist=None, whitelistFile=None, blacklistFile=None,
                 titleRegex=None, negativeTitleRegex=None,
                 truncateTitlesAfter=None, taxonomy=None, readIdRegex=None):
        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.minSequenceLen = minSequenceLen
        self.maxSequenceLen = maxSequenceLen
        self.minStart = minStart
        self.maxStop = maxStop
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.scoreCutoff = scoreCutoff

        # If we've been asked to filter on matched sequence titles in any
        # way, build a title filter.
        if (whitelist or blacklist or whitelistFile or blacklistFile or
                titleRegex or negativeTitleRegex or truncateTitlesAfter):
            self.titleFilter = TitleFilter(
                whitelist=whitelist, blacklist=blacklist,
                whitelistFile=whitelistFile, blacklistFile=blacklistFile,
                positiveRegex=titleRegex,
                negativeRegex=negativeTitleRegex,
                truncateAfter=truncateTitlesAfter)
        else:
            self.titleFilter = None

        if taxonomy is not None:
            self.lineageFetcher = LineageFetcher()
        else:
            self.lineageFetcher = None
        self.taxonomy = taxonomy

        if readIdRegex is None:
            self.readIdRegex = None
        else:
            self.readIdRegex = re.compile(readIdRegex)

        self.count = 0

    def filter(self, readAlignments):
        """
        Filter a read's alignments.

        @param readAlignments: A C{ReadAlignments} instance.
        @return: A C{ReadAlignments} instance if the passed
            C{readAlignments} is not filtered out, else C{False}.
        """
        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can
        #    make a big difference in the result of this function. The
        #    current ordering is based on what seems reasonable - it may
        #    not be the best way to do things. E.g., if maxHspsPerHit is 1
        #    and there is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the desired filtering
        #    settings on each call the way the current code does.
        #
        # 3. A better approach with readIdRegex would be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case
        #    insensitive).

        #
        # Alignment-only (i.e., non-HSP based) filtering.
        #
        if self.limit is not None and self.count == self.limit:
            return False

        # Does the read have too many alignments?
        if (self.maxAlignmentsPerRead is not None and
                len(readAlignments) > self.maxAlignmentsPerRead):
            return False

        # Filter on the read id.
        if (self.readIdRegex and
                self.readIdRegex.search(readAlignments.read.id) is None):
            return False

        if self.titleFilter:
            # Remove alignments against sequences whose titles are
            # unacceptable.
            wantedAlignments = []
            for alignment in readAlignments:
                if (self.titleFilter.accept(alignment.subjectTitle) !=
                        TitleFilter.REJECT):
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        # Only return alignments that are against sequences of the
        # desired length.
        minSequenceLen = self.minSequenceLen
        maxSequenceLen = self.maxSequenceLen
        if minSequenceLen is not None or maxSequenceLen is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                length = alignment.subjectLength
                # Use the local bindings consistently (the original
                # compared against self.maxSequenceLen here).
                if not ((minSequenceLen is not None and
                         length < minSequenceLen) or
                        (maxSequenceLen is not None and
                         length > maxSequenceLen)):
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        if self.taxonomy is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                lineage = self.lineageFetcher.lineage(
                    alignment.subjectTitle)
                if lineage:
                    for taxonomyIdAndScientificName in lineage:
                        if self.taxonomy in taxonomyIdAndScientificName:
                            wantedAlignments.append(alignment)
                            # Stop after the first matching lineage level
                            # so the same alignment cannot be appended
                            # more than once.
                            break
                else:
                    # No lineage info was found. Keep the alignment since
                    # we can't rule it out. We could add another option
                    # to control this.
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        if self.oneAlignmentPerRead and readAlignments:
            readAlignments[:] = [bestAlignment(readAlignments)]

        #
        # From here on we do only HSP-based filtering.
        #

        # Throw out any unwanted HSPs due to maxHspsPerHit.
        if self.maxHspsPerHit is not None:
            for alignment in readAlignments:
                hsps = alignment.hsps
                if len(hsps) > self.maxHspsPerHit:
                    alignment.hsps = hsps[:self.maxHspsPerHit]

        # Throw out HSPs whose scores are not good enough.
        if self.scoreCutoff is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                hsps = alignment.hsps
                wantedHsps = []
                for hsp in hsps:
                    if hsp.betterThan(self.scoreCutoff):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        # Throw out HSPs that don't match in the desired place on the
        # matched sequence.
        minStart = self.minStart
        maxStop = self.maxStop
        if minStart is not None or maxStop is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                hsps = alignment.hsps
                wantedHsps = []
                for hsp in hsps:
                    if not ((minStart is not None and
                             hsp.readStartInSubject < minStart) or
                            (maxStop is not None and
                             hsp.readEndInSubject > maxStop)):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        self.count += 1
        return readAlignments

    def close(self):
        """
        Close our lineage fetcher, if any.
        """
        # Guard on the fetcher itself rather than on the truthiness of
        # self.taxonomy, so a falsy (but non-None) taxonomy value still
        # results in the fetcher being closed.
        if self.lineageFetcher is not None:
            self.lineageFetcher.close()
def filter(self, minMatchingReads=None, minMedianScore=None,
           withScoreBetterThan=None, minNewReads=None, minCoverage=None,
           maxTitles=None, sortOn='maxScore', titleRegex=None,
           negativeTitleRegex=None):
    """
    Filter the titles in self to create another TitlesAlignments.

    @param minMatchingReads: titles that are matched by fewer reads
        are unacceptable.
    @param minMedianScore: sequences that are matched with a median
        bit score that is less are unacceptable.
    @param withScoreBetterThan: if the best score for a title is not
        as good as this value, the title is not acceptable.
    @param minNewReads: The C{float} fraction of its reads by which a
        new title's read set must differ from the read sets of all
        previously seen titles in order for this title to be considered
        acceptably different (and therefore interesting).
    @param minCoverage: The C{float} minimum fraction of the title
        sequence that must be matched by at least one read.
    @param maxTitles: A non-negative C{int} maximum number of titles to
        keep. If more titles than this are present, titles will be
        sorted (according to C{sortOn}) and only the best will be
        retained.
    @param sortOn: A C{str} attribute to sort on, used only if
        C{maxTitles} is not C{None}. See the C{sortTitles} method below
        for the legal values.
    @param titleRegex: A regex that read ids must match.
    @param negativeTitleRegex: A regex that read ids must not match.
    @raise: C{ValueError} if C{maxTitles} is less than zero or the
        value of C{sortOn} is unknown.
    @return: A new L{TitlesAlignments} instance containing only the
        matching titles.
    """
    # Use a ReadSetFilter only if we're checking that read sets are
    # sufficiently new.
    if minNewReads is None:
        readSetFilter = None
    else:
        # The read-set filter is cached on self so repeated filter calls
        # share the accumulated set of previously seen read sets.
        if self.readSetFilter is None:
            self.readSetFilter = ReadSetFilter(minNewReads)
        readSetFilter = self.readSetFilter

    # The new (initially empty) instance that accepted titles are
    # copied into.
    result = TitlesAlignments(
        self.readsAlignments, self.scoreClass, self.readSetFilter,
        importReadsAlignmentsTitles=False)

    if maxTitles is not None and len(self) > maxTitles:
        if maxTitles < 0:
            raise ValueError('maxTitles (%r) cannot be negative.' %
                             maxTitles)
        else:
            # There are too many titles. Make a sorted list of them so
            # we loop through them (below) in the desired order and can
            # break when/if we've reached the maximum. We can't just
            # take the first maxTitles titles from the sorted list now,
            # as some of those titles might later be discarded by the
            # filter and then we'd return a result with fewer titles
            # than we should.
            titles = self.sortTitles(sortOn)
    else:
        titles = self.keys()

    # Build a title filter only if a positive or negative title regex
    # was given.
    if (titleRegex or negativeTitleRegex):
        titleFilter = TitleFilter(positiveRegex=titleRegex,
                                  negativeRegex=negativeTitleRegex)
    else:
        titleFilter = None

    for title in titles:
        # Test max titles up front, as it may be zero.
        if maxTitles is not None and len(result) == maxTitles:
            break

        # Test positive and negative regexps.
        if (titleFilter and
                titleFilter.accept(title) == TitleFilter.REJECT):
            continue

        titleAlignments = self[title]
        if (minMatchingReads is not None and
                titleAlignments.readCount() < minMatchingReads):
            continue

        # To compare the median score with another score, we must
        # convert both values to instances of the score class used in
        # this data set so they can be compared without us needing to
        # know if numerically greater scores are considered better or
        # not.
        if (minMedianScore is not None and
                self.scoreClass(titleAlignments.medianScore()) <
                self.scoreClass(minMedianScore)):
            continue

        if (withScoreBetterThan is not None and not
                titleAlignments.hasScoreBetterThan(withScoreBetterThan)):
            continue

        if (minCoverage is not None and
                titleAlignments.coverage() < minCoverage):
            continue

        if (readSetFilter and not
                readSetFilter.accept(title, titleAlignments)):
            continue

        result.addTitle(title, titleAlignments)

    return result
class ReadsAlignmentsFilter(object):
    """
    Provide a filter for C{ReadsAlignments} instances.

    @param limit: An C{int} limit on the number of records to read.
    @param maxAlignmentsPerRead: An C{int} limit on the number of
        alignments a read may have in order not to be filtered out. Reads
        with a greater number of alignments will be elided. Pass 0 to
        filter out reads that did not match (i.e., align to) any subjects.
        Use C{None} for no max alignments filtering.
    @param minSequenceLen: Sequences of lesser length will be elided.
    @param maxSequenceLen: Sequences of greater length will be elided.
    @param minStart: HSPs that start before this offset in the matched
        sequence should not be returned.
    @param maxStop: HSPs that end after this offset in the matched
        sequence should not be returned.
    @param oneAlignmentPerRead: If C{True}, only keep the best alignment
        for each read.
    @param maxHspsPerHit: The maximum number of HSPs to keep for each
        alignment for each read.
    @param scoreCutoff: A C{float} score. Matches with scores that are not
        better than this score will be ignored.
    @param whitelist: If not C{None}, a set of exact titles that are
        always acceptable (though the match info for a whitelist title
        may rule it out for other reasons).
    @param blacklist: If not C{None}, a set of exact titles that are never
        acceptable.
    @param titleRegex: A regex that sequence titles must match.
    @param negativeTitleRegex: A regex that sequence titles must not
        match.
    @param truncateTitlesAfter: A string that titles will be truncated
        beyond. If a truncated title has already been seen, that title
        will be elided.
    @param taxonomy: Either a C{str} name or an C{int} id of the taxonomic
        group on which should be filtered. E.g., 'Vira' will filter on
        viruses, while 11118 will filter on Coronaviridae.
    @param readIdRegex: A case-sensitive regex C{str} that read ids must
        match.
    @return: C{self}.
    """

    def __init__(self, limit=None, maxAlignmentsPerRead=None,
                 minSequenceLen=None, maxSequenceLen=None, minStart=None,
                 maxStop=None, oneAlignmentPerRead=False,
                 maxHspsPerHit=None, scoreCutoff=None, whitelist=None,
                 blacklist=None, titleRegex=None, negativeTitleRegex=None,
                 truncateTitlesAfter=None, taxonomy=None,
                 readIdRegex=None):
        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.minSequenceLen = minSequenceLen
        self.maxSequenceLen = maxSequenceLen
        self.minStart = minStart
        self.maxStop = maxStop
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.scoreCutoff = scoreCutoff

        # If we've been asked to filter on matched sequence titles in any
        # way, build a title filter.
        if (whitelist or blacklist or titleRegex or negativeTitleRegex or
                truncateTitlesAfter):
            self.titleFilter = TitleFilter(
                whitelist=whitelist, blacklist=blacklist,
                positiveRegex=titleRegex,
                negativeRegex=negativeTitleRegex,
                truncateAfter=truncateTitlesAfter)
        else:
            self.titleFilter = None

        if taxonomy is not None:
            self.lineageFetcher = LineageFetcher()
        else:
            self.lineageFetcher = None
        self.taxonomy = taxonomy

        if readIdRegex is None:
            self.readIdRegex = None
        else:
            self.readIdRegex = re.compile(readIdRegex)

        self.count = 0

    def filter(self, readAlignments):
        """
        Filter a read's alignments.

        @param readAlignments: A C{ReadAlignments} instance.
        @return: A C{ReadAlignments} instance if the passed
            C{readAlignments} is not filtered out, else C{False}.
        """
        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can
        #    make a big difference in the result of this function. The
        #    current ordering is based on what seems reasonable - it may
        #    not be the best way to do things. E.g., if maxHspsPerHit is 1
        #    and there is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the desired filtering
        #    settings on each call the way the current code does.
        #
        # 3. A better approach with readIdRegex would be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case
        #    insensitive).

        #
        # Alignment-only (i.e., non-HSP based) filtering.
        #
        if self.limit is not None and self.count == self.limit:
            return False

        # Does the read have too many alignments?
        if (self.maxAlignmentsPerRead is not None and
                len(readAlignments) > self.maxAlignmentsPerRead):
            return False

        # Filter on the read id.
        if (self.readIdRegex and
                self.readIdRegex.search(readAlignments.read.id) is None):
            return False

        if self.titleFilter:
            # Remove alignments against sequences whose titles are
            # unacceptable.
            wantedAlignments = []
            for alignment in readAlignments:
                if (self.titleFilter.accept(alignment.subjectTitle) !=
                        TitleFilter.REJECT):
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        # Only return alignments that are against sequences of the
        # desired length.
        minSequenceLen = self.minSequenceLen
        maxSequenceLen = self.maxSequenceLen
        if minSequenceLen is not None or maxSequenceLen is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                length = alignment.subjectLength
                # Use the local bindings consistently (the original
                # compared against self.maxSequenceLen here).
                if not ((minSequenceLen is not None and
                         length < minSequenceLen) or
                        (maxSequenceLen is not None and
                         length > maxSequenceLen)):
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        if self.taxonomy is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                lineage = self.lineageFetcher.lineage(
                    alignment.subjectTitle)
                if lineage:
                    for taxonomyIdAndScientificName in lineage:
                        if self.taxonomy in taxonomyIdAndScientificName:
                            wantedAlignments.append(alignment)
                            # Stop after the first matching lineage level
                            # so the same alignment cannot be appended
                            # more than once.
                            break
                else:
                    # No lineage info was found. Keep the alignment since
                    # we can't rule it out. We could add another option
                    # to control this.
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        if self.oneAlignmentPerRead and readAlignments:
            readAlignments[:] = [bestAlignment(readAlignments)]

        #
        # From here on we do only HSP-based filtering.
        #

        # Throw out any unwanted HSPs due to maxHspsPerHit.
        if self.maxHspsPerHit is not None:
            for alignment in readAlignments:
                hsps = alignment.hsps
                if len(hsps) > self.maxHspsPerHit:
                    alignment.hsps = hsps[:self.maxHspsPerHit]

        # Throw out HSPs whose scores are not good enough.
        if self.scoreCutoff is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                hsps = alignment.hsps
                wantedHsps = []
                for hsp in hsps:
                    if hsp.betterThan(self.scoreCutoff):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        # Throw out HSPs that don't match in the desired place on the
        # matched sequence.
        minStart = self.minStart
        maxStop = self.maxStop
        if minStart is not None or maxStop is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                hsps = alignment.hsps
                wantedHsps = []
                for hsp in hsps:
                    if not ((minStart is not None and
                             hsp.readStartInSubject < minStart) or
                            (maxStop is not None and
                             hsp.readEndInSubject > maxStop)):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        self.count += 1
        return readAlignments

    def close(self):
        """
        Close our lineage fetcher, if any.
        """
        # Guard on the fetcher itself rather than on the truthiness of
        # self.taxonomy, so a falsy (but non-None) taxonomy value still
        # results in the fetcher being closed.
        if self.lineageFetcher is not None:
            self.lineageFetcher.close()
class ProteinGrouper(object):
    """
    Group matched proteins by the pathogen they come from.

    @param assetDir: The C{str} directory name where
        C{noninteractive-alignment-panel.py} put its HTML, blue plot and
        alignment panel images, and FASTA or FASTQ files. This must be
        relative to the filenames that will later be passed to C{addFile}.
    @param sampleName: A C{str} sample name. This takes precedence over
        C{sampleNameRegex} (the two cannot be used together, obviously).
    @param sampleNameRegex: A C{str} regular expression that can be used to
        extract a short sample name from full file names subsequently passed
        to C{self.addFile}. The regular expression must have a matching group
        (delimited by parentheses) to capture the part of the file name that
        should be used as the sample name.
    @param format_: A C{str}, either 'fasta' or 'fastq' indicating the format
        of the files containing the reads matching proteins.
    @param proteinFastaFilenames: If not C{None}, a C{list} of C{str} filenames
        giving the name of the FASTA file with the protein AA sequences with
        their associated pathogens in square brackets. This is the format used
        by NCBI for the bacterial and viral reference sequence protein files.
        If given, the contents of this file will be used to determine how many
        proteins each matched pathogen has.
    @param saveReadLengths: If C{True}, save the lengths of all reads matching
        proteins.
    @param titleRegex: A regex that pathogen names must match.
        Note that this matching is done on the final part of the protein title
        in square brackets, according to the convention used by the NCBI viral
        refseq database and RVDB.
    @param negativeTitleRegex: A regex that pathogen names must not match.
        Note that this matching is done on the final part of the protein title
        in square brackets, according to the convention used by the NCBI viral
        refseq database and RVDB.
    @param pathogenDataDir: The C{str} directory where per-pathogen information
        (e.g., collected reads across all samples) should be written. Will be
        created (in C{self.toHTML}) if it doesn't exist.
    @raise ValueError: If C{format_} is unknown.
    """

    # Search-URL prefixes used when linking pathogen names to external sites.
    VIRALZONE = 'https://viralzone.expasy.org/search?query='
    ICTV = 'https://talk.ictvonline.org/search-124283882/?q='
    # Placeholder written into the HTML and later replaced with the actual
    # per-pathogen read count (see toHTML).
    READCOUNT_MARKER = '*READ-COUNT*'
    # Separator between read and HSP counts when the two differ.
    READ_AND_HSP_COUNT_STR_SEP = '/'

    def __init__(self, assetDir='out', sampleName=None, sampleNameRegex=None,
                 format_='fasta', proteinFastaFilenames=None,
                 saveReadLengths=False, titleRegex=None,
                 negativeTitleRegex=None, pathogenDataDir='pathogen-data'):
        self._assetDir = assetDir
        self._sampleName = sampleName
        self._sampleNameRegex = (re.compile(sampleNameRegex) if
                                 sampleNameRegex else None)
        if format_ in ('fasta', 'fastq'):
            self._format = format_
        else:
            raise ValueError("format_ must be either 'fasta' or 'fastq'.")
        self._saveReadLengths = saveReadLengths

        if titleRegex or negativeTitleRegex:
            self.titleFilter = TitleFilter(
                positiveRegex=titleRegex, negativeRegex=negativeTitleRegex)
        else:
            self.titleFilter = None

        self._pathogenDataDir = pathogenDataDir

        self._pathogenProteinCount = getPathogenProteinCounts(
            proteinFastaFilenames)

        # pathogenNames will be a dict of dicts of dicts. The first two keys
        # will be a pathogen name and a sample name. The final dict will
        # contain 'proteins' (a list of dicts) and 'uniqueReadCount' (an int).
        self.pathogenNames = {}
        # sampleNames is keyed by sample name and will have values that hold
        # the sample's alignment panel index.html file.
        self.sampleNames = {}
        self.pathogenSampleFiles = PathogenSampleFiles(self, format_=format_)

    def _title(self):
        """
        Create a title summarizing the pathogens and samples.

        @return: A C{str} title.
        """
        return (
            'Overall, proteins from %d pathogen%s were found in %d sample%s.'
            % (len(self.pathogenNames),
               '' if len(self.pathogenNames) == 1 else 's',
               len(self.sampleNames),
               '' if len(self.sampleNames) == 1 else 's'))

    def addFile(self, filename, fp):
        """
        Read and record protein information for a sample.

        @param filename: A C{str} file name.
        @param fp: An open file pointer to read the file's data from.
        @raise ValueError: If information for a pathogen/protein/sample
            combination is given more than once.
        """
        # Work out the sample name: explicit name, regex capture, or else
        # fall back to the full file name.
        if self._sampleName:
            sampleName = self._sampleName
        elif self._sampleNameRegex:
            match = self._sampleNameRegex.search(filename)
            if match:
                sampleName = match.group(1)
            else:
                sampleName = filename
        else:
            sampleName = filename

        outDir = join(dirname(filename), self._assetDir)

        self.sampleNames[sampleName] = join(outDir, 'index.html')

        for index, proteinLine in enumerate(fp):
            # Drop the trailing newline (assumes each line is
            # newline-terminated).
            proteinLine = proteinLine[:-1]
            (coverage, medianScore, bestScore, readCount, hspCount,
             proteinLength, names) = proteinLine.split(None, 6)

            proteinName, pathogenName = splitNames(names)

            # Ignore pathogens with names we don't want.
            if (self.titleFilter and self.titleFilter.accept(
                    pathogenName) == TitleFilter.REJECT):
                continue

            if pathogenName not in self.pathogenNames:
                self.pathogenNames[pathogenName] = {}

            if sampleName not in self.pathogenNames[pathogenName]:
                self.pathogenNames[pathogenName][sampleName] = {
                    'proteins': {},
                    'uniqueReadCount': None,
                }

            proteins = self.pathogenNames[pathogenName][sampleName]['proteins']

            # We should only receive one line of information for a given
            # pathogen/sample/protein combination.
            if proteinName in proteins:
                raise ValueError(
                    'Protein %r already seen for pathogen %r sample %r.' %
                    (proteinName, pathogenName, sampleName))

            readsFilename = join(outDir, '%d.%s' % (index, self._format))

            if proteinName.count('|') < 5:
                # Assume this is an NCBI refseq id, like
                # YP_009137153.1 uracil glycosylase [Human alphaherpesvirus 2]
                # with a protein but not a genome accession.
                proteinURL = NCBISequenceLinkURL(proteinName, field=0,
                                                 delim=' ')
                genomeURL = None
            else:
                # Assume this is an RVDB id, like
                # acc|GENBANK|ABJ91970.1|GENBANK|DQ876317|pol protein [HIV]
                # with both protein and genome accession numbers.
                proteinURL = NCBISequenceLinkURL(proteinName, field=2)
                genomeURL = NCBISequenceLinkURL(proteinName, field=4)

            proteinInfo = proteins[proteinName] = {
                'bestScore': float(bestScore),
                'bluePlotFilename': join(outDir, '%d.png' % index),
                'coverage': float(coverage),
                'readsFilename': readsFilename,
                'hspCount': int(hspCount),
                'index': index,
                'medianScore': float(medianScore),
                'outDir': outDir,
                'proteinLength': int(proteinLength),
                'proteinName': proteinName,
                'proteinURL': proteinURL,
                'genomeURL': genomeURL,
                'readCount': int(readCount),
            }

            # Only show both counts (separated by READ_AND_HSP_COUNT_STR_SEP)
            # when they differ.
            if proteinInfo['readCount'] == proteinInfo['hspCount']:
                proteinInfo['readAndHspCountStr'] = readCount
            else:
                proteinInfo['readAndHspCountStr'] = '%s%s%s' % (
                    readCount, self.READ_AND_HSP_COUNT_STR_SEP, hspCount)

            if self._saveReadLengths:
                readsClass = (FastaReads if self._format == 'fasta'
                              else FastqReads)
                proteins[proteinName]['readLengths'] = tuple(
                    len(read) for read in readsClass(readsFilename))

    def _computeUniqueReadCounts(self):
        """
        Add all pathogen / sample combinations to self.pathogenSampleFiles.

        This will make all de-duplicated (by id) FASTA/FASTQ files and store
        the number of de-duplicated reads into C{self.pathogenNames}.
        """
        for pathogenName, samples in self.pathogenNames.items():
            for sampleName in samples:
                self.pathogenSampleFiles.add(pathogenName, sampleName)

    def toStr(self, title='Summary of pathogens', preamble=None):
        """
        Produce a string representation of the pathogen summary.

        @param title: The C{str} title for the output.
        @param preamble: The C{str} descriptive preamble for the HTML page, or
            C{None} if no preamble is needed.
        @return: A C{str} suitable for printing.
        """
        # Note that the string representation contains much less
        # information than the HTML summary. E.g., it does not contain the
        # unique (de-duplicated, by id) read count, since that is only computed
        # when we are making combined FASTA files of reads matching a
        # pathogen.
        readCountGetter = itemgetter('readCount')
        result = []
        append = result.append

        result.extend((title, ''))
        if preamble:
            result.extend((preamble, ''))
        result.extend((self._title(), ''))

        for pathogenName in sorted(self.pathogenNames):
            samples = self.pathogenNames[pathogenName]
            sampleCount = len(samples)
            append('%s (in %d sample%s)' %
                   (pathogenName, sampleCount,
                    '' if sampleCount == 1 else 's'))
            for sampleName in sorted(samples):
                proteins = samples[sampleName]['proteins']
                proteinCount = len(proteins)
                totalReads = sum(readCountGetter(p) for p in
                                 proteins.values())
                append('  %s (%d protein%s, %d read%s)' %
                       (sampleName,
                        proteinCount, '' if proteinCount == 1 else 's',
                        totalReads, '' if totalReads == 1 else 's'))
                for proteinName in sorted(proteins):
                    append(
                        '    %(coverage).2f\t%(medianScore).2f\t'
                        '%(bestScore).2f\t%(readAndHspCountStr)11s\t'
                        '%(proteinName)s' % proteins[proteinName])
            append('')

        return '\n'.join(result)

    def toHTML(self, pathogenPanelFilename=None, minProteinFraction=0.0,
               pathogenType='viral', title='Summary of pathogens',
               preamble=None, sampleIndexFilename=None,
               pathogenIndexFilename=None, omitVirusLinks=False,
               omitSampleProteinCount=False):
        """
        Produce an HTML string representation of the pathogen summary.

        @param pathogenPanelFilename: If not C{None}, a C{str} filename to
            write a pathogen panel PNG image to.
        @param minProteinFraction: The C{float} minimum fraction of proteins
            in a pathogen that must be matched by a sample in order for that
            pathogen to be displayed for that sample.
        @param pathogenType: A C{str} giving the type of the pathogen involved,
            either 'bacterial' or 'viral'.
        @param title: The C{str} title for the HTML page.
        @param preamble: The C{str} descriptive preamble for the HTML page, or
            C{None} if no preamble is needed.
        @param sampleIndexFilename: A C{str} filename to write a sample index
            file to. Lines in the file will have an integer index, a space,
            and then the sample name.
        @param pathogenIndexFilename: A C{str} filename to write a pathogen
            index file to. Lines in the file will have an integer index, a
            space, and then the pathogen name.
        @param omitVirusLinks: If C{True}, links to ICTV and ViralZone will be
            omitted in output.
        @param omitSampleProteinCount: If C{True}, do not display a number of
            matched pathogen proteins for a sample. This should be used when
            those numbers are inaccurate (e.g., when using the unclustered
            RVDB protein database and there are many sequences for the same
            protein).
        @return: An HTML C{str} suitable for printing.
        """
        if pathogenType not in ('bacterial', 'viral'):
            raise ValueError(
                "Unrecognized pathogenType argument: %r. Value must be either "
                "'bacterial' or 'viral'." % pathogenType)

        if not exists(self._pathogenDataDir):
            os.mkdir(self._pathogenDataDir)

        self._computeUniqueReadCounts()

        if pathogenPanelFilename:
            self.pathogenPanel(pathogenPanelFilename)

        if sampleIndexFilename:
            with open(sampleIndexFilename, 'w') as fp:
                self.pathogenSampleFiles.writeSampleIndex(fp)

        if pathogenIndexFilename:
            with open(pathogenIndexFilename, 'w') as fp:
                self.pathogenSampleFiles.writePathogenIndex(fp)

        # Figure out if we have to delete some pathogens because the
        # fraction of their proteins that we have matches for is too low.
        if minProteinFraction > 0.0:
            toDelete = defaultdict(list)
            for pathogenName in self.pathogenNames:
                proteinCount = self._pathogenProteinCount[pathogenName]
                for s in self.pathogenNames[pathogenName]:
                    if proteinCount:
                        sampleProteinFraction = (
                            len(self.pathogenNames[
                                pathogenName][s]['proteins']) /
                            proteinCount)
                    else:
                        # No known protein count for this pathogen, so no
                        # sample can be excluded on this basis.
                        sampleProteinFraction = 1.0
                    if sampleProteinFraction < minProteinFraction:
                        toDelete[pathogenName].append(s)

            for pathogenName in toDelete:
                for sample in toDelete[pathogenName]:
                    del self.pathogenNames[pathogenName][sample]

        pathogenNames = sorted(
            pathogenName for pathogenName in self.pathogenNames
            if len(self.pathogenNames[pathogenName]) > 0)
        nPathogenNames = len(pathogenNames)
        sampleNames = sorted(self.sampleNames)

        # NOTE(review): '</head>' and '<body>' appear twice below (once
        # before the <style> block and once after it), producing technically
        # malformed HTML — confirm whether this is intended.
        result = [
            '<html>',
            '<head>',
            '<title>',
            title,
            '</title>',
            '<meta charset="UTF-8">',
            '</head>',
            '<body>',
            '<style>',
            '''\
            body {
                margin-left: 2%;
                margin-right: 2%;
            }
            hr {
                display: block;
                margin-top: 0.5em;
                margin-bottom: 0.5em;
                margin-left: auto;
                margin-right: auto;
                border-style: inset;
                border-width: 1px;
            }
            p.pathogen {
                margin-top: 10px;
                margin-bottom: 3px;
            }
            p.sample {
                margin-top: 10px;
                margin-bottom: 3px;
            }
            .sample {
                margin-top: 5px;
                margin-bottom: 2px;
            }
            ul {
                margin-bottom: 2px;
            }
            .indented {
                margin-left: 2em;
            }
            .sample-name {
                font-size: 125%;
                font-weight: bold;
            }
            .pathogen-name {
                font-size: 125%;
                font-weight: bold;
            }
            .index-name {
                font-weight: bold;
            }
            .index {
                font-size: small;
            }
            .protein-name {
                font-family: "Courier New", Courier, monospace;
            }
            .stats {
                font-family: "Courier New", Courier, monospace;
                white-space: pre;
            }
            .protein-list {
                margin-top: 2px;
            }''',
            '</style>',
            '</head>',
            '<body>',
        ]

        proteinFieldsDescription = [
            '<p>',
            'In all bullet point protein lists below, there are the following '
            'fields:',
            '<ol>',
            '<li>Coverage fraction.</li>',
            '<li>Median bit score.</li>',
            '<li>Best bit score.</li>',
            '<li>Read count (if the HSP count differs, read and HSP ',
            ('counts are both given, separated by "%s").</li>' %
             self.READ_AND_HSP_COUNT_STR_SEP),
            '<li>Protein length (in amino acids).</li>',
        ]

        if self._saveReadLengths:
            proteinFieldsDescription.append(
                '<li>All read lengths (in parentheses).</li>')

        proteinFieldsDescription.extend([
            '<li>Protein name.</li>',
            '</ol>',
            '</p>',
        ])

        append = result.append

        append('<h1>%s</h1>' % title)
        if preamble:
            append('<p>%s</p>' % preamble)
        append('<p>')
        append(self._title())

        if self._pathogenProteinCount and minProteinFraction:
            percent = minProteinFraction * 100.0
            if nPathogenNames < len(self.pathogenNames):
                if nPathogenNames == 1:
                    append('Pathogen protein fraction filtering has been '
                           'applied, so information on only 1 pathogen is '
                           'displayed. This is the only pathogen for which at '
                           'least one sample matches at least %.2f%% of the '
                           'pathogen proteins.' % percent)
                else:
                    append('Pathogen protein fraction filtering has been '
                           'applied, so information on only %d pathogens is '
                           'displayed. These are the only pathogens for which '
                           'at least one sample matches at least %.2f%% of '
                           'the pathogen proteins.' %
                           (nPathogenNames, percent))
            else:
                append('Pathogen protein fraction filtering was applied, '
                       'but all pathogens have at least %.2f%% of their '
                       'proteins matched by at least one sample.' % percent)

        append('</p>')

        if pathogenPanelFilename:
            append('<p>')
            append('<a href="%s">Panel showing read count per pathogen, per '
                   'sample.</a>' % pathogenPanelFilename)
            append('Red vertical bars indicate samples with an unusually high '
                   'read count.')
            append('</p>')

        result.extend(proteinFieldsDescription)

        # Write a linked table of contents by pathogen.
        append('<p><span class="index-name">Pathogen index:</span>')
        append('<span class="index">')
        for pathogenName in pathogenNames:
            append('<a href="#pathogen-%s">%s</a>' % (pathogenName,
                                                      pathogenName))
            append('·')
        # Get rid of final middle dot and add a period.
        result.pop()
        result[-1] += '.'
        append('</span></p>')

        # Write a linked table of contents by sample.
        append('<p><span class="index-name">Sample index:</span>')
        append('<span class="index">')
        for sampleName in sampleNames:
            append('<a href="#sample-%s">%s</a>' % (sampleName,
                                                    sampleName))
            append('·')
        # Get rid of final middle dot and add a period.
        result.pop()
        result[-1] += '.'
        append('</span></p>')

        # Write all pathogens (with samples (with proteins)).
        append('<hr>')
        append('<h1>Pathogens by sample</h1>')

        for pathogenName in pathogenNames:
            samples = self.pathogenNames[pathogenName]
            sampleCount = len(samples)
            pathogenProteinCount = self._pathogenProteinCount[pathogenName]
            if pathogenType == 'viral' and not omitVirusLinks:
                quoted = quote(pathogenName)
                pathogenLinksHTML = (
                    ' (<a href="%s%s">ICTV</a>, <a href="%s%s">ViralZone</a>)'
                ) % (self.ICTV, quoted, self.VIRALZONE, quoted)
            else:
                pathogenLinksHTML = ''

            if pathogenProteinCount:
                withStr = (' with %d protein%s' %
                           (pathogenProteinCount,
                            '' if pathogenProteinCount == 1 else 's'))
            else:
                withStr = ''

            pathogenIndex = self.pathogenSampleFiles.pathogenIndex(
                pathogenName)
            pathogenReadsFilename = join(
                self._pathogenDataDir,
                'pathogen-%d.%s' % (pathogenIndex, self._format))

            pathogenReadsFp = open(pathogenReadsFilename, 'w')
            pathogenReadCount = 0

            append(
                '<a id="pathogen-%s"></a>'
                '<p class="pathogen">'
                '<span class="pathogen-name">%s</span>'
                '%s %s, '
                'was matched by %d sample%s '
                '(<a href="%s">%s</a> in total):'
                '</p>' %
                (pathogenName, pathogenName, pathogenLinksHTML, withStr,
                 sampleCount, '' if sampleCount == 1 else 's',
                 pathogenReadsFilename, self.READCOUNT_MARKER))

            # Remember where we are in the output result so we can fill in
            # the total read count once we have processed all samples for
            # this pathogen. Not nice, I know.
            pathogenReadCountLineIndex = len(result) - 1

            for sampleName in sorted(samples):
                readsFileName = self.pathogenSampleFiles.lookup(
                    pathogenName, sampleName)

                # Copy the read data from the per-sample reads for this
                # pathogen into the per-pathogen file of reads.
                with open(readsFileName) as readsFp:
                    while True:
                        data = readsFp.read(4096)
                        if data:
                            pathogenReadsFp.write(data)
                        else:
                            break

                proteins = samples[sampleName]['proteins']
                proteinCount = len(proteins)
                uniqueReadCount = samples[sampleName]['uniqueReadCount']
                pathogenReadCount += uniqueReadCount

                if omitSampleProteinCount:
                    proteinCountHTML = ''
                else:
                    proteinCountHTML = '%d protein%s, ' % (
                        proteinCount, '' if proteinCount == 1 else 's')

                append(
                    '<p class="sample indented">'
                    'Sample <a href="#sample-%s">%s</a> '
                    '(%s<a href="%s">%d de-duplicated (by id) '
                    'read%s</a>, <a href="%s">panel</a>):</p>' %
                    (sampleName, sampleName, proteinCountHTML,
                     readsFileName, uniqueReadCount,
                     '' if uniqueReadCount == 1 else 's',
                     self.sampleNames[sampleName]))
                append('<ul class="protein-list indented">')
                for proteinName in sorted(proteins):
                    proteinMatch = proteins[proteinName]
                    append(
                        '<li>'
                        '<span class="stats">'
                        '%(coverage).2f %(medianScore)6.2f %(bestScore)6.2f '
                        '%(readAndHspCountStr)11s %(proteinLength)4d '
                        % proteinMatch
                    )

                    if self._saveReadLengths:
                        append('(%s) ' % ', '.join(
                            map(str, sorted(proteinMatch['readLengths']))))

                    append(
                        '</span> '
                        '<span class="protein-name">'
                        '%(proteinName)s'
                        '</span> '
                        '(<a href="%(bluePlotFilename)s">blue plot</a>, '
                        '<a href="%(readsFilename)s">reads</a>'
                        % proteinMatch)

                    if proteinMatch['proteinURL']:
                        # Append this directly to the last string in result, to
                        # avoid introducing whitespace when we join result
                        # using '\n'.
                        result[-1] += (', <a href="%s">NCBI protein</a>' %
                                       proteinMatch['proteinURL'])
                    if proteinMatch['genomeURL']:
                        # Append this directly to the last string in result, to
                        # avoid introducing whitespace when we join result
                        # using '\n'.
                        result[-1] += (', <a href="%s">NCBI genome</a>' %
                                       proteinMatch['genomeURL'])

                    result[-1] += ')'

                    append('</li>')
                append('</ul>')

            pathogenReadsFp.close()

            # Sanity check there's a read count marker text in our output
            # where we expect it.
            readCountLine = result[pathogenReadCountLineIndex]
            if readCountLine.find(self.READCOUNT_MARKER) == -1:
                raise ValueError(
                    'Could not find pathogen read count marker (%s) in result '
                    'index %d text (%s).' %
                    (self.READCOUNT_MARKER,
                     pathogenReadCountLineIndex, readCountLine))

            # Put the read count into the pathogen summary line we wrote
            # earlier, replacing the read count marker with the correct
            # text.
            result[pathogenReadCountLineIndex] = readCountLine.replace(
                self.READCOUNT_MARKER,
                '%d read%s' % (pathogenReadCount,
                               '' if pathogenReadCount == 1 else 's'))

        # Write all samples (with pathogens (with proteins)).
        append('<hr>')
        append('<h1>Samples by pathogen</h1>')

        for sampleName in sampleNames:
            samplePathogenNames = [
                pathName for pathName in self.pathogenNames
                if sampleName in self.pathogenNames[pathName]]
            if len(samplePathogenNames):
                append(
                    '<a id="sample-%s"></a>'
                    '<p class="sample">Sample '
                    '<span class="sample-name">%s</span> '
                    'matched proteins from %d pathogen%s, '
                    '<a href="%s">panel</a>:</p>' %
                    (sampleName, sampleName, len(samplePathogenNames),
                     '' if len(samplePathogenNames) == 1 else 's',
                     self.sampleNames[sampleName]))
            else:
                append(
                    '<a id="sample-%s"></a>'
                    '<p class="sample">Sample '
                    '<span class="sample-name">%s</span> '
                    'did not match anything.</p>' %
                    (sampleName, sampleName))
                continue

            for pathogenName in sorted(samplePathogenNames):
                readsFileName = self.pathogenSampleFiles.lookup(pathogenName,
                                                                sampleName)
                proteins = self.pathogenNames[pathogenName][sampleName][
                    'proteins']
                uniqueReadCount = self.pathogenNames[
                    pathogenName][sampleName]['uniqueReadCount']
                proteinCount = len(proteins)
                pathogenProteinCount = self._pathogenProteinCount[pathogenName]

                if pathogenProteinCount:
                    proteinCountStr = '%d/%d protein%s' % (
                        proteinCount, pathogenProteinCount,
                        '' if pathogenProteinCount == 1 else 's')
                else:
                    proteinCountStr = '%d protein%s' % (
                        proteinCount, '' if proteinCount == 1 else 's')

                append(
                    '<p class="sample indented">'
                    '<a href="#pathogen-%s">%s</a> %s, '
                    '<a href="%s">%d de-duplicated (by id) read%s</a>:</p>' %
                    (pathogenName, pathogenName, proteinCountStr,
                     readsFileName, uniqueReadCount,
                     '' if uniqueReadCount == 1 else 's'))
                append('<ul class="protein-list indented">')
                for proteinName in sorted(proteins):
                    proteinMatch = proteins[proteinName]
                    append(
                        '<li>'
                        '<span class="stats">'
                        '%(coverage).2f %(medianScore)6.2f %(bestScore)6.2f '
                        '%(readAndHspCountStr)11s %(proteinLength)4d '
                        '</span> '
                        '<span class="protein-name">'
                        '%(proteinName)s'
                        '</span> '
                        '(<a href="%(bluePlotFilename)s">blue plot</a>, '
                        '<a href="%(readsFilename)s">reads</a>'
                        % proteinMatch)

                    if proteinMatch['proteinURL']:
                        # Append this directly to the last string in result, to
                        # avoid introducing whitespace when we join result
                        # using '\n'.
                        result[-1] += (', <a href="%s">NCBI protein</a>' %
                                       proteinMatch['proteinURL'])
                    if proteinMatch['genomeURL']:
                        # Append this directly to the last string in result, to
                        # avoid introducing whitespace when we join result
                        # using '\n'.
                        result[-1] += (', <a href="%s">NCBI genome</a>' %
                                       proteinMatch['genomeURL'])

                    result[-1] += ')'

                    append('</li>')
                append('</ul>')

        append('</body>')
        append('</html>')

        return '\n'.join(result)

    def _pathogenSamplePlot(self, pathogenName, sampleNames, ax):
        """
        Make an image of a graph giving pathogen read count (Y axis) versus
        sample id (X axis).

        @param pathogenName: A C{str} pathogen name.
        @param sampleNames: A sorted C{list} of sample names.
        @param ax: A matplotlib C{axes} instance.
        """
        readCounts = []
        for i, sampleName in enumerate(sampleNames):
            try:
                readCount = self.pathogenNames[pathogenName][sampleName][
                    'uniqueReadCount']
            except KeyError:
                # This sample did not match this pathogen.
                readCount = 0
            readCounts.append(readCount)

        highlight = 'r'
        normal = 'gray'
        sdMultiple = 2.5
        minReadsForHighlighting = 10
        highlighted = []

        if len(readCounts) == 1:
            if readCounts[0] > minReadsForHighlighting:
                color = [highlight]
                highlighted.append(sampleNames[0])
            else:
                color = [normal]
        else:
            # Highlight samples whose read count is well above the mean
            # (and at least minReadsForHighlighting).
            mean = np.mean(readCounts)
            sd = np.std(readCounts)
            color = []
            for readCount, sampleName in zip(readCounts, sampleNames):
                if (readCount > (sdMultiple * sd) + mean and
                        readCount >= minReadsForHighlighting):
                    color.append(highlight)
                    highlighted.append(sampleName)
                else:
                    color.append(normal)

        nSamples = len(sampleNames)
        x = np.arange(nSamples)
        yMin = np.zeros(nSamples)
        ax.set_xticks([])
        ax.set_xlim((-0.5, nSamples - 0.5))
        ax.vlines(x, yMin, readCounts, color=color)

        if highlighted:
            title = '%s\nIn red: %s' % (
                pathogenName, fill(', '.join(highlighted), 50))
        else:
            # Add a newline to keep the first line of each title at the
            # same place as those titles that have an "In red:" second
            # line.
            title = pathogenName + '\n'

        ax.set_title(title, fontsize=10)
        ax.tick_params(axis='both', which='major', labelsize=8)
        ax.tick_params(axis='both', which='minor', labelsize=6)

    def pathogenPanel(self, filename):
        """
        Make a panel of images, with each image being a graph giving pathogen
        de-duplicated (by id) read count (Y axis) versus sample id (X axis).

        @param filename: A C{str} file name to write the image to.
        """
        # Import matplotlib lazily and select the non-interactive PDF
        # backend before pyplot is imported, so no display is needed.
        import matplotlib
        matplotlib.use('PDF')
        import matplotlib.pyplot as plt

        self._computeUniqueReadCounts()
        pathogenNames = sorted(self.pathogenNames)
        sampleNames = sorted(self.sampleNames)

        cols = 5
        rows = int(len(pathogenNames) / cols) + (
            0 if len(pathogenNames) % cols == 0 else 1)
        figure, ax = plt.subplots(rows, cols, squeeze=False)

        coords = dimensionalIterator((rows, cols))

        for i, pathogenName in enumerate(pathogenNames):
            row, col = next(coords)
            self._pathogenSamplePlot(pathogenName, sampleNames, ax[row][col])

        # Hide the final panel graphs (if any) that have no content. We do
        # this because the panel is a rectangular grid and some of the
        # plots at the end of the last row may be unused.
        for row, col in coords:
            ax[row][col].axis('off')

        figure.suptitle(
            ('Per-sample read count for %d pathogen%s and %d sample%s.\n\n'
             'Sample name%s: %s') % (
                 len(pathogenNames),
                 '' if len(pathogenNames) == 1 else 's',
                 len(sampleNames),
                 '' if len(sampleNames) == 1 else 's',
                 '' if len(sampleNames) == 1 else 's',
                 fill(', '.join(sampleNames), 50)), fontsize=20)
        figure.set_size_inches(5.0 * cols, 2.0 * rows, forward=True)
        plt.subplots_adjust(hspace=0.4)
        figure.savefig(filename)
def _filter(self, limit, minSequenceLen, maxSequenceLen, minStart,
            maxStop, oneAlignmentPerRead, maxHspsPerHit, scoreCutoff,
            whitelist, blacklist, titleRegex, negativeTitleRegex,
            truncateTitlesAfter, taxonomy, iteratorIndex, readIdRegex):
    """
    Filter the read alignments in self.

    Do not call this function directly, instead use self.filter (above).
    Argument defaults and descriptions (other than for iteratorIndex) are
    as in self.filter.

    @param iteratorIndex: An index into self._iterators. Calling the
        iterator function will return a generator that yields
        C{ReadAlignments} instances.
    @return: A generator that yields C{ReadAlignments} instances.
    """
    # Implementation notes:
    #
    # 1. The order in which we carry out the filtering actions can make
    #    a big difference in the result of this function. The current
    #    ordering is based on what seems reasonable - it may not be the
    #    best way to do things. E.g., if maxHspsPerHit is 1 and there
    #    is a title regex, which should we perform first?
    #
    #    We perform filtering based on alignment before that based on
    #    HSPs. That's because there's no point filtering all HSPs for
    #    an alignment that we end up throwing away anyhow.
    #
    # 2. This function could be made faster if it first looked at its
    #    arguments and dynamically created an acceptance function
    #    (taking a readAlignments as an argument). The acceptance
    #    function would run without examining the above arguments for
    #    each match the way the current code does.
    #
    # 3. A better approach with readIdRegex might be to allow the
    #    passing of a regex object. Then the caller would make the
    #    regex with whatever flags they liked (e.g., case insensitive).

    #
    # Alignment-only (i.e., non-HSP based) filtering.
    #

    # If we've been asked to filter on matched sequence titles in any way,
    # build a title filter.
    if (whitelist or blacklist or titleRegex or negativeTitleRegex or
            truncateTitlesAfter):
        titleFilter = TitleFilter(
            whitelist=whitelist, blacklist=blacklist,
            positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
            truncateAfter=truncateTitlesAfter)
    else:
        titleFilter = None

    if taxonomy is not None:
        lineageFetcher = LineageFetcher()

    # Compile the read id regex once, up front, rather than on each use.
    if readIdRegex is not None:
        readIdRegex = re.compile(readIdRegex)

    count = 0
    for readAlignments in self._iterators[iteratorIndex]():
        # Stop once the requested number of results has been yielded.
        if limit is not None and count == limit:
            return

        # Filter on the read id.
        if (readIdRegex and
                readIdRegex.search(readAlignments.read.id) is None):
            continue

        if titleFilter:
            # Remove alignments against sequences whose titles are
            # unacceptable.
            wantedAlignments = []
            for alignment in readAlignments:
                if (titleFilter.accept(alignment.subjectTitle) !=
                        TitleFilter.REJECT):
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        # Only return alignments that are against sequences of the
        # desired length.
        if minSequenceLen is not None or maxSequenceLen is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                length = alignment.subjectLength
                if not ((minSequenceLen is not None and
                         length < minSequenceLen) or
                        (maxSequenceLen is not None and
                         length > maxSequenceLen)):
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        if taxonomy is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                lineage = lineageFetcher.lineage(alignment.subjectTitle)
                if lineage:
                    for taxonomyIdAndScientificName in lineage:
                        if taxonomy in taxonomyIdAndScientificName:
                            wantedAlignments.append(alignment)
                else:
                    # No lineage info was found. Keep the alignment
                    # since we can't rule it out. We could add another
                    # option to control this.
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        if oneAlignmentPerRead and readAlignments:
            readAlignments[:] = [bestAlignment(readAlignments)]

        #
        # From here on we do only HSP-based filtering.
        #

        # Throw out any unwanted HSPs due to maxHspsPerHit.
        if maxHspsPerHit is not None:
            for alignment in readAlignments:
                hsps = alignment.hsps
                if len(hsps) > maxHspsPerHit:
                    alignment.hsps = hsps[:maxHspsPerHit]

        # Throw out HSPs whose scores are not good enough.
        if scoreCutoff is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                hsps = alignment.hsps
                wantedHsps = []
                for hsp in hsps:
                    if hsp.betterThan(scoreCutoff):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        # Throw out HSPs that don't match in the desired place on the
        # matched sequence.
        if minStart is not None or maxStop is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                hsps = alignment.hsps
                wantedHsps = []
                for hsp in hsps:
                    if not ((minStart is not None and
                             hsp.readStartInSubject < minStart) or
                            (maxStop is not None and
                             hsp.readEndInSubject > maxStop)):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                continue

        yield readAlignments
        count += 1

    # NOTE(review): the fetcher is created when 'taxonomy is not None' but
    # closed only when taxonomy is truthy; a falsy non-None taxonomy (e.g.,
    # the empty string) would leave the fetcher unclosed — confirm intended.
    if taxonomy:
        lineageFetcher.close()
def _filter(self, minLength=None, maxLength=None, removeGaps=False,
            whitelist=None, blacklist=None, titleRegex=None,
            negativeTitleRegex=None, truncateTitlesAfter=None, indices=None,
            head=None, removeDuplicates=False, modifier=None,
            randomSubset=None, trueLength=None, sampleFraction=None,
            sequenceNumbersFile=None):
    """
    Filter a set of reads to produce a matching subset.

    See docstring for self.filter (above) for parameter docs.

    @return: A generator that yields C{Read} instances.
    """
    def _wantedSequences(filename):
        """
        Read and yield integer sequence numbers from a file.

        @raise ValueError: If the sequence numbers are not all positive or
            are not ascending.
        @return: A generator that yields C{int} sequence numbers.
        """
        with open(filename) as fp:
            lastNumber = None
            for line in fp:
                n = int(line)
                if lastNumber is None:
                    if n < 1:
                        raise ValueError(
                            'First line of sequence number file %r must '
                            'be at least 1.' % filename)
                    lastNumber = n
                    yield n
                else:
                    if n > lastNumber:
                        lastNumber = n
                        yield n
                    else:
                        raise ValueError(
                            'Line number file %r contains non-ascending '
                            'numbers %d and %d.' % (filename, lastNumber, n))

    # These two options are mutually exclusive because both draw a random
    # selection from the input.
    if randomSubset is not None and sampleFraction is not None:
        raise ValueError('randomSubset and sampleFraction cannot be '
                         'used simultaneously in a filter. Call filter '
                         'twice instead.')

    if sequenceNumbersFile is None:
        nextWantedSequenceNumber = None
        wantedSequenceNumberGeneratorExhausted = False
    else:
        wantedSequenceNumerGenerator = _wantedSequences(
            sequenceNumbersFile)
        try:
            nextWantedSequenceNumber = next(wantedSequenceNumerGenerator)
        except StopIteration:
            # There was a sequence number file, but it was empty.
            return
        else:
            wantedSequenceNumberGeneratorExhausted = False

    if (whitelist or blacklist or titleRegex or negativeTitleRegex or
            truncateTitlesAfter):
        titleFilter = TitleFilter(
            whitelist=whitelist, blacklist=blacklist,
            positiveRegex=titleRegex,
            negativeRegex=negativeTitleRegex,
            truncateAfter=truncateTitlesAfter)
    else:
        titleFilter = None

    if removeDuplicates:
        sequencesSeen = set()

    if sampleFraction is not None:
        if sampleFraction == 0.0:
            # The filter returns nothing.
            return
        elif sampleFraction == 1.0:
            # Passing 1.0 can be treated the same as passing no value.
            # This makes the loop code simpler.
            sampleFraction = None

    if randomSubset is not None and trueLength is None:
        trueLength = self._length

    yieldCount = 0

    for readIndex, read in enumerate(self):
        if wantedSequenceNumberGeneratorExhausted:
            return

        if nextWantedSequenceNumber is not None:
            if readIndex + 1 == nextWantedSequenceNumber:
                # We want this sequence.
                try:
                    nextWantedSequenceNumber = next(
                        wantedSequenceNumerGenerator)
                except StopIteration:
                    # The sequence number iterator ran out of sequence
                    # numbers. We must let the rest of the filtering
                    # continue for the current sequence in case we
                    # throw it out for other reasons (as we might have
                    # done for any of the earlier wanted sequence
                    # numbers).
                    wantedSequenceNumberGeneratorExhausted = True
            else:
                # This sequence isn't wanted.
                continue

        if (sampleFraction is not None and
                uniform(0.0, 1.0) > sampleFraction):
            # Note that we don't have to worry about the 0.0 or 1.0
            # cases in the above if, as they have been dealt with
            # before the loop.
            continue

        if randomSubset is not None:
            if yieldCount == randomSubset:
                # The random subset has already been fully returned.
                # There's no point in going any further through the input.
                return
            # NOTE(review): if a caller passes a trueLength equal to a
            # reached readIndex this division raises ZeroDivisionError —
            # presumably trueLength always exceeds all read indices; confirm.
            elif uniform(0.0, 1.0) > ((randomSubset - yieldCount) /
                                      (trueLength - readIndex)):
                continue

        if head is not None and readIndex == head:
            # We're completely done.
            return

        # Length screening uses the read's length before any gap removal.
        readLen = len(read)
        if ((minLength is not None and readLen < minLength) or
                (maxLength is not None and readLen > maxLength)):
            continue

        if removeGaps:
            sequence = read.sequence.replace('-', '')
            read = read.__class__(read.id, sequence, read.quality)

        if (titleFilter and
                titleFilter.accept(read.id) == TitleFilter.REJECT):
            continue

        if indices is not None and readIndex not in indices:
            continue

        if removeDuplicates:
            if read.sequence in sequencesSeen:
                continue
            sequencesSeen.add(read.sequence)

        if modifier:
            modified = modifier(read)
            if modified is None:
                continue
            else:
                read = modified

        yield read
        yieldCount += 1
def filter(self, minLength=None, maxLength=None, removeGaps=False,
           whitelist=None, blacklist=None, titleRegex=None,
           negativeTitleRegex=None, truncateTitlesAfter=None, indices=None,
           head=None, removeDuplicates=False, modifier=None):
    """
    Filter a set of reads to produce a matching subset.

    Note: there are many additional filtering options that could be
    added, e.g., filtering on GC %, and quality.

    @param minLength: The minimum acceptable length.
    @param maxLength: The maximum acceptable length.
    @param removeGaps: If C{True} remove all gaps ('-' characters) from the
        read sequences.
    @param whitelist: If not C{None}, a set of exact read ids that are
        always acceptable (though other characteristics, such as length,
        of a whitelisted id may rule it out).
    @param blacklist: If not C{None}, a set of exact read ids that are
        never acceptable.
    @param titleRegex: A regex that read ids must match.
    @param negativeTitleRegex: A regex that read ids must not match.
    @param truncateTitlesAfter: A string that read ids will be truncated
        beyond. If the truncated version of an id has already been seen,
        that sequence will be skipped.
    @param indices: Either C{None} or a set of C{int} indices corresponding
        to reads that are wanted. Indexing starts at zero.
    @param head: If not C{None}, the C{int} number of sequences at the
        start of the reads to return. Later sequences are skipped.
    @param removeDuplicates: If C{True} remove duplicated sequences.
    @param modifier: If not C{None} a function that is passed a read
        and which either returns a read or C{None}. If it returns a read,
        that read is passed through the filter. If it returns C{None},
        the read is omitted. Such a function can be used to do customized
        filtering, to change sequence ids, etc.
    @return: A generator that yields C{Read} instances.
    """
    # All the work happens in _filter, which implements a superset of
    # this functionality (it additionally supports random subsetting,
    # fractional sampling, and sequence-number files). Previously the
    # whole filtering loop was duplicated here, which invited the two
    # implementations drifting apart; with the extra _filter parameters
    # left at their defaults the behavior is identical.
    return self._filter(
        minLength=minLength, maxLength=maxLength, removeGaps=removeGaps,
        whitelist=whitelist, blacklist=blacklist, titleRegex=titleRegex,
        negativeTitleRegex=negativeTitleRegex,
        truncateTitlesAfter=truncateTitlesAfter, indices=indices,
        head=head, removeDuplicates=removeDuplicates, modifier=modifier)
def _filter(self, minLength=None, maxLength=None, removeGaps=False,
            whitelist=None, blacklist=None, titleRegex=None,
            negativeTitleRegex=None, truncateTitlesAfter=None, indices=None,
            head=None, removeDuplicates=False, modifier=None,
            randomSubset=None, trueLength=None, sampleFraction=None,
            sequenceNumbersFile=None):
    """
    Filter a set of reads to produce a matching subset.

    See docstring for self.filter (above) for docs on the parameters it
    shares with this method. Additional parameters:

    @param randomSubset: If not C{None}, an C{int} number of reads to
        select at random (each read is kept with probability proportional
        to the number of still-needed reads over the number of reads
        remaining). Cannot be combined with C{sampleFraction}.
    @param trueLength: The C{int} total number of reads that will be
        iterated, used to compute the random-subset keep probability.
        If C{None}, C{self._length} is used.
        NOTE(review): if trueLength is less than the actual number of
        reads iterated, the keep-probability denominator can reach zero
        (ZeroDivisionError) — confirm callers always pass an accurate
        value.
    @param sampleFraction: If not C{None}, a C{float} probability with
        which each read is independently kept. 0.0 yields nothing and
        1.0 is equivalent to no sampling. Cannot be combined with
        C{randomSubset}.
    @param sequenceNumbersFile: If not C{None}, the name of a file
        containing one ascending, 1-based C{int} sequence number per
        line; only the reads at those positions are kept.
    @raise ValueError: If both C{randomSubset} and C{sampleFraction} are
        given, or if the sequence number file contains non-positive or
        non-ascending numbers.
    @return: A generator that yields C{Read} instances.
    """
    def _wantedSequences(filename):
        """
        Read and yield integer sequence numbers from a file.

        @param filename: The C{str} name of the file to read.
        @raise ValueError: If the sequence numbers are not all positive
            or are not ascending.
        @return: A generator that yields C{int} sequence numbers.
        """
        with open(filename) as fp:
            lastNumber = None
            for line in fp:
                n = int(line)
                if lastNumber is None:
                    if n < 1:
                        raise ValueError(
                            'First line of sequence number file %r must '
                            'be at least 1.' % filename)
                    lastNumber = n
                    yield n
                else:
                    if n > lastNumber:
                        lastNumber = n
                        yield n
                    else:
                        # NOTE(review): message says 'Line number file'
                        # although the argument is named
                        # sequenceNumbersFile — left unchanged in case
                        # tests assert on the exact text; confirm and
                        # harmonize.
                        raise ValueError(
                            'Line number file %r contains non-ascending '
                            'numbers %d and %d.' % (filename, lastNumber, n))

    if randomSubset is not None and sampleFraction is not None:
        raise ValueError('randomSubset and sampleFraction cannot be '
                         'used simultaneously in a filter. Call filter '
                         'twice instead.')

    if sequenceNumbersFile is None:
        nextWantedSequenceNumber = None
        wantedSequenceNumberGeneratorExhausted = False
    else:
        # Fixed misspelling: was 'wantedSequenceNumerGenerator'
        # (missing 'b') here and at both next() call sites below.
        wantedSequenceNumberGenerator = _wantedSequences(
            sequenceNumbersFile)
        try:
            nextWantedSequenceNumber = next(wantedSequenceNumberGenerator)
        except StopIteration:
            # There was a sequence number file, but it was empty.
            return
        else:
            wantedSequenceNumberGeneratorExhausted = False

    if (whitelist or blacklist or titleRegex or negativeTitleRegex or
            truncateTitlesAfter):
        titleFilter = TitleFilter(
            whitelist=whitelist, blacklist=blacklist,
            positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
            truncateAfter=truncateTitlesAfter)
    else:
        titleFilter = None

    if removeDuplicates:
        sequencesSeen = set()

    if sampleFraction is not None:
        if sampleFraction == 0.0:
            # The filter returns nothing.
            return
        elif sampleFraction == 1.0:
            # Passing 1.0 can be treated the same as passing no value.
            # This makes the loop code simpler.
            sampleFraction = None

    if randomSubset is not None and trueLength is None:
        trueLength = self._length

    yieldCount = 0

    for readIndex, read in enumerate(self):
        if wantedSequenceNumberGeneratorExhausted:
            # All wanted sequence numbers have been seen; nothing later
            # in the input can be wanted.
            return

        if nextWantedSequenceNumber is not None:
            if readIndex + 1 == nextWantedSequenceNumber:
                # We want this sequence.
                try:
                    nextWantedSequenceNumber = next(
                        wantedSequenceNumberGenerator)
                except StopIteration:
                    # The sequence number iterator ran out of sequence
                    # numbers. We must let the rest of the filtering
                    # continue for the current sequence in case we
                    # throw it out for other reasons (as we might have
                    # done for any of the earlier wanted sequence
                    # numbers).
                    wantedSequenceNumberGeneratorExhausted = True
            else:
                # This sequence isn't wanted.
                continue

        if (sampleFraction is not None and
                uniform(0.0, 1.0) > sampleFraction):
            # Note that we don't have to worry about the 0.0 or 1.0
            # cases in the above if, as they have been dealt with
            # before the loop.
            continue

        if randomSubset is not None:
            if yieldCount == randomSubset:
                # The random subset has already been fully returned.
                # There's no point in going any further through the input.
                return
            elif uniform(0.0, 1.0) > ((randomSubset - yieldCount) /
                                      (trueLength - readIndex)):
                # NOTE(review): this relies on true division; under
                # Python 2 it would need 'from __future__ import
                # division' at the top of the file — confirm.
                continue

        if head is not None and readIndex == head:
            # We're completely done.
            return

        readLen = len(read)
        if ((minLength is not None and readLen < minLength) or
                (maxLength is not None and readLen > maxLength)):
            continue

        if removeGaps:
            # Rebuild the read (of whatever Read subclass it is) with
            # '-' characters removed from its sequence.
            sequence = read.sequence.replace('-', '')
            read = read.__class__(read.id, sequence, read.quality)

        if (titleFilter and
                titleFilter.accept(read.id) == TitleFilter.REJECT):
            continue

        if indices is not None and readIndex not in indices:
            continue

        if removeDuplicates:
            if read.sequence in sequencesSeen:
                continue
            sequencesSeen.add(read.sequence)

        if modifier:
            modified = modifier(read)
            if modified is None:
                continue
            else:
                read = modified

        yield read
        yieldCount += 1