Пример #1
0
 def testNoRestriction(self):
     """
     A C{TitleFilter} created with no arguments has no restrictions, so
     asking it about a title must give C{TitleFilter.DEFAULT_ACCEPT}.
     """
     unrestricted = TitleFilter()
     verdict = unrestricted.accept('hey')
     self.assertEqual(TitleFilter.DEFAULT_ACCEPT, verdict)
Пример #2
0
    def __init__(self,
                 limit=None,
                 maxAlignmentsPerRead=None,
                 minSequenceLen=None,
                 maxSequenceLen=None,
                 minStart=None,
                 maxStop=None,
                 oneAlignmentPerRead=False,
                 maxHspsPerHit=None,
                 scoreCutoff=None,
                 percentageIdenticalCutoff=None,
                 percentagePositiveCutoff=None,
                 whitelist=None,
                 blacklist=None,
                 whitelistFile=None,
                 blacklistFile=None,
                 titleRegex=None,
                 negativeTitleRegex=None,
                 truncateTitlesAfter=None,
                 taxonomy=None,
                 readIdRegex=None):
        """
        Record the filtering criteria and build the helpers they require.

        The numeric / boolean criteria (C{limit} through
        C{percentagePositiveCutoff}) and C{taxonomy} are stored unchanged
        on the instance under the name of their argument.

        @param whitelist, blacklist, whitelistFile, blacklistFile,
            titleRegex, negativeTitleRegex, truncateTitlesAfter: If any of
            these is true, they are all passed to a C{TitleFilter} that is
            stored as C{self.titleFilter}. Otherwise C{self.titleFilter}
            is C{None}.
        @param taxonomy: If not C{None}, a C{LineageFetcher} is created
            and stored as C{self.lineageFetcher} (which is otherwise
            C{None}).
        @param readIdRegex: If not C{None}, a pattern that is compiled
            with C{re.compile} and stored as C{self.readIdRegex}.
        """
        # Criteria that are simply remembered for later use.
        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.minSequenceLen = minSequenceLen
        self.maxSequenceLen = maxSequenceLen
        self.minStart = minStart
        self.maxStop = maxStop
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.scoreCutoff = scoreCutoff
        self.percentageIdenticalCutoff = percentageIdenticalCutoff
        self.percentagePositiveCutoff = percentagePositiveCutoff

        # A title filter is only needed when at least one title-based
        # criterion was actually given.
        titleCriteria = (whitelist, blacklist, whitelistFile, blacklistFile,
                         titleRegex, negativeTitleRegex, truncateTitlesAfter)
        self.titleFilter = TitleFilter(
            whitelist=whitelist,
            blacklist=blacklist,
            whitelistFile=whitelistFile,
            blacklistFile=blacklistFile,
            positiveRegex=titleRegex,
            negativeRegex=negativeTitleRegex,
            truncateAfter=truncateTitlesAfter) if any(titleCriteria) else None

        # Lineage look-ups are only required for taxonomy filtering.
        self.lineageFetcher = None if taxonomy is None else LineageFetcher()
        self.taxonomy = taxonomy

        self.readIdRegex = (None if readIdRegex is None
                            else re.compile(readIdRegex))

        self.count = 0
Пример #3
0
    def __init__(self, assetDir='out', sampleName=None, sampleNameRegex=None,
                 format_='fasta', proteinFastaFilenames=None,
                 saveReadLengths=False, titleRegex=None,
                 negativeTitleRegex=None, pathogenDataDir='pathogen-data'):
        """
        Store the configuration and create the helper objects.

        @param assetDir: Stored as C{self._assetDir}.
        @param sampleName: Stored as C{self._sampleName}.
        @param sampleNameRegex: If true, a pattern that is compiled with
            C{re.compile} and stored as C{self._sampleNameRegex} (which is
            otherwise C{None}).
        @param format_: Either C{'fasta'} or C{'fastq'}. Also passed on to
            C{PathogenSampleFiles}.
        @param proteinFastaFilenames: Passed to
            C{getPathogenProteinCounts}.
        @param saveReadLengths: Stored as C{self._saveReadLengths}.
        @param titleRegex: Passed to a C{TitleFilter} as its positive
            regex.
        @param negativeTitleRegex: Passed to a C{TitleFilter} as its
            negative regex. If neither regex is true, no title filter is
            made and C{self.titleFilter} is C{None}.
        @param pathogenDataDir: Stored as C{self._pathogenDataDir}.
        @raise ValueError: If C{format_} is not C{'fasta'} or C{'fastq'}.
        """
        self._assetDir = assetDir
        self._sampleName = sampleName
        if sampleNameRegex:
            self._sampleNameRegex = re.compile(sampleNameRegex)
        else:
            self._sampleNameRegex = None

        # Refuse unknown formats before building anything else.
        if format_ not in ('fasta', 'fastq'):
            raise ValueError("format_ must be either 'fasta' or 'fastq'.")
        self._format = format_
        self._saveReadLengths = saveReadLengths

        wantTitleFilter = titleRegex or negativeTitleRegex
        self.titleFilter = TitleFilter(
            positiveRegex=titleRegex,
            negativeRegex=negativeTitleRegex) if wantTitleFilter else None

        self._pathogenDataDir = pathogenDataDir
        self._pathogenProteinCount = getPathogenProteinCounts(
            proteinFastaFilenames)

        # A dict of dicts of dicts. The outer key is a pathogen name and
        # the next key is a sample name. The innermost dict holds
        # 'proteins' (a list of dicts) and 'uniqueReadCount' (an int).
        self.pathogenNames = {}
        # Keyed by sample name, with values that hold the sample's
        # alignment panel index.html file.
        self.sampleNames = {}
        self.pathogenSampleFiles = PathogenSampleFiles(self, format_=format_)
Пример #4
0
 def testBlacklist(self):
     """
     A blacklisted title must be rejected by a title filter that also
     has a positive regex whose text ('ok') appears in that title.
     """
     blacklisted = 'never ok'
     titleFilter = TitleFilter(blacklist=[blacklisted], positiveRegex='ok')
     self.assertEqual(TitleFilter.REJECT, titleFilter.accept(blacklisted))
Пример #5
0
 def testNegativeRegex(self):
     """
     A title filter with a negative regex must reject a title the regex
     is expected to match and give the default acceptance to one it is
     not.
     """
     titleFilter = TitleFilter(negativeRegex=r'x+\s')
     expectations = (
         ('hey xxx you', TitleFilter.REJECT),
         ('hey xxyou', TitleFilter.DEFAULT_ACCEPT),
     )
     for title, expected in expectations:
         self.assertEqual(expected, titleFilter.accept(title))
Пример #6
0
 def testWhitelistTakesPrecedenceOverBlacklist(self):
     """
     When the same title is on both the whitelist and the blacklist, the
     whitelist wins and the result is C{TitleFilter.WHITELIST_ACCEPT}.
     """
     title = 'always ok'
     titleFilter = TitleFilter(whitelist=[title], blacklist=[title])
     verdict = titleFilter.accept(title)
     self.assertEqual(TitleFilter.WHITELIST_ACCEPT, verdict)
Пример #7
0
 def testWhitelist(self):
     """
     A whitelisted title must be accepted even though it would otherwise
     be ruled out (here by the negative regex), while a title that is
     not whitelisted must still be rejected.
     """
     titleFilter = TitleFilter(whitelist=['always ok'], negativeRegex='ok')
     expectations = (
         ('always ok', TitleFilter.WHITELIST_ACCEPT),
         ('always ok not', TitleFilter.REJECT),
     )
     for title, expected in expectations:
         self.assertEqual(expected, titleFilter.accept(title))
Пример #8
0
 def testNegativeRegexHasPrecedenceOverRepeatedTruncatedTitle(self):
     """
     The negative regex must take precedence over the check for truncated
     titles: a title that is subject to truncation must be rejected both
     times when it is passed twice.
     """
     title = 'spotty virus 1'
     titleFilter = TitleFilter(negativeRegex=r'spotty',
                               truncateAfter='virus')
     for _ in range(2):
         self.assertEqual(TitleFilter.REJECT, titleFilter.accept(title))
Пример #9
0
 def testWhitelistOnly(self):
     """
     With a negative regex that matches everything, only a whitelisted
     title must be accepted. All other titles must be rejected.
     """
     titleFilter = TitleFilter(whitelist=['always ok'], negativeRegex='.')
     expectations = (
         ('always ok', TitleFilter.WHITELIST_ACCEPT),
         ('always not ok', TitleFilter.REJECT),
         ('rubbish', TitleFilter.REJECT),
     )
     for title, expected in expectations:
         self.assertEqual(expected, titleFilter.accept(title))
Пример #10
0
 def testBlacklistFile(self):
     """
     Titles found (one per line) in a blacklist file must be rejected
     and a title that is not in the file must get the default
     acceptance.
     """
     fileContents = 'id1\nid2\n'
     with patch.object(builtins, 'open', mock_open(read_data=fileContents)):
         titleFilter = TitleFilter(blacklistFile='black.txt')
         expectations = (
             ('id1', TitleFilter.REJECT),
             ('id2', TitleFilter.REJECT),
             ('id3', TitleFilter.DEFAULT_ACCEPT),
         )
         for title, expected in expectations:
             self.assertEqual(expected, titleFilter.accept(title))
Пример #11
0
 def testWhitelistFileOnly(self):
     """
     With a negative regex that matches everything, only the titles
     found (one per line) in the whitelist file must be accepted.
     """
     fileContents = 'id1\nid2\n'
     with patch.object(builtins, 'open', mock_open(read_data=fileContents)):
         titleFilter = TitleFilter(whitelistFile='white.txt',
                                   negativeRegex='.')
         expectations = (
             ('id1', TitleFilter.WHITELIST_ACCEPT),
             ('id2', TitleFilter.WHITELIST_ACCEPT),
             ('id3', TitleFilter.REJECT),
         )
         for title, expected in expectations:
             self.assertEqual(expected, titleFilter.accept(title))
Пример #12
0
 def testPartialWordTruncation(self):
     """
     Title truncation must work when the C{truncateAfter} string only
     occurs as part of a longer word in the title: the first such title
     gets the default acceptance and a second title that differs only
     after the truncation point is rejected.
     """
     titleFilter = TitleFilter(truncateAfter=r'virus')
     # Note that the truncation code will chop off the first part of the
     # title (the title ID).
     expectations = (
         ('gi|400684|gb|AY421767.1| rotavirus 1',
          TitleFilter.DEFAULT_ACCEPT),
         ('gi|400684|gb|AY421767.1| rotavirus 2',
          TitleFilter.REJECT),
     )
     for title, expected in expectations:
         self.assertEqual(expected, titleFilter.accept(title))
Пример #13
0
 def testBlacklistFileAndBlacklist(self):
     """
     Titles from a blacklist file and titles from an explicitly passed
     blacklist must all be rejected. A title in neither must get the
     default acceptance.
     """
     fileContents = 'id1\nid2\n'
     with patch.object(builtins, 'open', mockOpen(read_data=fileContents)):
         titleFilter = TitleFilter(blacklistFile='black.txt',
                                   blacklist={'id3'})
         expectations = (
             ('id1', TitleFilter.REJECT),
             ('id2', TitleFilter.REJECT),
             ('id3', TitleFilter.REJECT),
             ('id4', TitleFilter.DEFAULT_ACCEPT),
         )
         for title, expected in expectations:
             self.assertEqual(expected, titleFilter.accept(title))
Пример #14
0
 def testWordTruncationRepeat(self):
     """
     With title truncation in effect, passing exactly the same title a
     second time must still give the default acceptance, even though the
     title is being truncated.
     """
     titleFilter = TitleFilter(truncateAfter=r'virus')
     # Note that the truncation code will chop off the first part of the
     # title (the title ID).
     title = 'gi|400684|gb|AY421767.1| herpes virus 1'
     for _ in range(2):
         self.assertEqual(TitleFilter.DEFAULT_ACCEPT,
                          titleFilter.accept(title))
Пример #15
0
 def testWhitelistFileAndWhitelistOnly(self):
     """
     With a negative regex that matches everything, titles from the
     whitelist file and titles from an explicitly passed whitelist must
     all be accepted. Any other title must be rejected.
     """
     fileContents = 'id1\nid2\n'
     with patch.object(builtins, 'open', mockOpen(read_data=fileContents)):
         titleFilter = TitleFilter(whitelistFile='white.txt',
                                   whitelist={'id3'},
                                   negativeRegex='.')
         expectations = (
             ('id1', TitleFilter.WHITELIST_ACCEPT),
             ('id2', TitleFilter.WHITELIST_ACCEPT),
             ('id3', TitleFilter.WHITELIST_ACCEPT),
             ('id4', TitleFilter.REJECT),
         )
         for title, expected in expectations:
             self.assertEqual(expected, titleFilter.accept(title))
Пример #16
0
    def _filter(self, limit, minSequenceLen, maxSequenceLen, minStart, maxStop,
                oneAlignmentPerRead, maxHspsPerHit, scoreCutoff, whitelist,
                blacklist, titleRegex, negativeTitleRegex, truncateTitlesAfter,
                taxonomy, iteratorIndex, readIdRegex):
        """
        Filter the read alignments in self.

        Do not call this function directly, instead use self.filter.
        Argument defaults and descriptions (other than for iteratorIndex) are
        as in self.filter.

        The read alignments that are yielded are modified in place: their
        alignments (and the HSPs of those alignments) are reduced to the
        ones that pass the filtering.

        If a C{LineageFetcher} is created (i.e., C{taxonomy} is not
        C{None}), it is always closed when the generator finishes, whether
        that is because the input ran out, C{limit} was reached, an
        exception occurred, or the generator was closed by its consumer.

        @param iteratorIndex: An index into self._iterators. Calling the
            iterator function will return a generator that yields
            C{ReadAlignments} instances.
        @return: A generator that yields C{ReadAlignments} instances.
        """

        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can make
        #    a big difference in the result of this function. The current
        #    ordering is based on what seems reasonable - it may not be the
        #    best way to do things. E.g., if maxHspsPerHit is 1 and there
        #    is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the above arguments for
        #    each match the way the current code does.
        #
        # 3. A better approach with readIdRegex might be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case insensitive).

        # If we've been asked to filter on matched sequence titles in any way,
        # build a title filter.
        if (whitelist or blacklist or titleRegex or negativeTitleRegex
                or truncateTitlesAfter):
            titleFilter = TitleFilter(whitelist=whitelist,
                                      blacklist=blacklist,
                                      positiveRegex=titleRegex,
                                      negativeRegex=negativeTitleRegex,
                                      truncateAfter=truncateTitlesAfter)
        else:
            titleFilter = None

        if readIdRegex is not None:
            readIdRegex = re.compile(readIdRegex)

        # Make the lineage fetcher last, immediately before the try block
        # that guarantees it will be closed, so nothing can fail between
        # its creation and the start of that protection.
        lineageFetcher = None if taxonomy is None else LineageFetcher()

        try:
            count = 0
            for readAlignments in self._iterators[iteratorIndex]():
                if limit is not None and count == limit:
                    return

                # Filter on the read id.
                if (readIdRegex and
                        readIdRegex.search(readAlignments.read.id) is None):
                    continue

                #
                # Alignment-only (i.e., non-HSP based) filtering.
                #

                if titleFilter:
                    # Remove alignments against sequences whose titles are
                    # unacceptable.
                    wantedAlignments = [
                        alignment for alignment in readAlignments
                        if (titleFilter.accept(alignment.subjectTitle) !=
                            TitleFilter.REJECT)]
                    if not wantedAlignments:
                        continue
                    readAlignments[:] = wantedAlignments

                # Only return alignments that are against sequences of the
                # desired length.
                if minSequenceLen is not None or maxSequenceLen is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        length = alignment.subjectLength
                        if not ((minSequenceLen is not None
                                 and length < minSequenceLen) or
                                (maxSequenceLen is not None
                                 and length > maxSequenceLen)):
                            wantedAlignments.append(alignment)
                    if not wantedAlignments:
                        continue
                    readAlignments[:] = wantedAlignments

                if lineageFetcher is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        lineage = lineageFetcher.lineage(
                            alignment.subjectTitle)
                        # If no lineage info was found, keep the alignment
                        # since we can't rule it out.  We could add another
                        # option to control this.  Otherwise keep it (just
                        # once, no matter how many lineage entries match)
                        # if the wanted taxonomy is in its lineage.
                        if not lineage or any(taxonomy in entry
                                              for entry in lineage):
                            wantedAlignments.append(alignment)
                    if not wantedAlignments:
                        continue
                    readAlignments[:] = wantedAlignments

                if oneAlignmentPerRead and readAlignments:
                    readAlignments[:] = [bestAlignment(readAlignments)]

                #
                # From here on we do only HSP-based filtering.
                #

                # Throw out any unwanted HSPs due to maxHspsPerHit.
                if maxHspsPerHit is not None:
                    for alignment in readAlignments:
                        hsps = alignment.hsps
                        if len(hsps) > maxHspsPerHit:
                            alignment.hsps = hsps[:maxHspsPerHit]

                # Throw out HSPs whose scores are not good enough.
                if scoreCutoff is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        wantedHsps = [hsp for hsp in alignment.hsps
                                      if hsp.betterThan(scoreCutoff)]
                        if wantedHsps:
                            alignment.hsps = wantedHsps
                            wantedAlignments.append(alignment)
                    if not wantedAlignments:
                        continue
                    readAlignments[:] = wantedAlignments

                # Throw out HSPs that don't match in the desired place on
                # the matched sequence.
                if minStart is not None or maxStop is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        wantedHsps = []
                        for hsp in alignment.hsps:
                            if not ((minStart is not None
                                     and hsp.readStartInSubject < minStart) or
                                    (maxStop is not None
                                     and hsp.readEndInSubject > maxStop)):
                                wantedHsps.append(hsp)
                        if wantedHsps:
                            alignment.hsps = wantedHsps
                            wantedAlignments.append(alignment)
                    if not wantedAlignments:
                        continue
                    readAlignments[:] = wantedAlignments

                yield readAlignments
                count += 1
        finally:
            # Runs on normal exhaustion, on the early return for limit, on
            # an exception, and when the generator is closed.
            if lineageFetcher is not None:
                lineageFetcher.close()
Пример #17
0
    def filter(self,
               minMatchingReads=None,
               minMedianScore=None,
               withScoreBetterThan=None,
               minNewReads=None,
               minCoverage=None,
               maxTitles=None,
               sortOn='maxScore',
               titleRegex=None,
               negativeTitleRegex=None):
        """
        Make a new TitlesAlignments holding only the acceptable titles of
        self.

        @param minMatchingReads: Titles whose read count is lower than
            this are not acceptable.
        @param minMedianScore: Titles whose median score is lower than
            this (when both are compared as instances of the score class
            of self) are not acceptable.
        @param withScoreBetterThan: Titles that do not have a score better
            than this value are not acceptable.
        @param minNewReads: If not C{None}, titles are also checked with a
            C{ReadSetFilter}. One is created from this value and kept on
            self if self does not already have one.
        @param minCoverage: Titles whose coverage is lower than this are
            not acceptable.
        @param maxTitles: The maximum number of titles to keep. If self
            has more titles than this, they are examined in the order
            given by C{sortTitles(sortOn)} and examination stops once this
            many have been accepted.
        @param sortOn: The attribute passed to C{sortTitles}. Used only if
            self has more than C{maxTitles} titles.
        @param titleRegex: A regex given to a C{TitleFilter} as the
            positive regex that titles are tested against.
        @param negativeTitleRegex: A regex given to a C{TitleFilter} as
            the negative regex that titles are tested against.
        @raise ValueError: If C{maxTitles} is negative.
        @return: A new L{TitlesAlignments} instance containing only the
            accepted titles.
        """
        # A read set filter is only consulted when the novelty of read
        # sets is being checked.
        if minNewReads is not None:
            if self.readSetFilter is None:
                self.readSetFilter = ReadSetFilter(minNewReads)
            readSetFilter = self.readSetFilter
        else:
            readSetFilter = None

        result = TitlesAlignments(self.readsAlignments,
                                  self.scoreClass,
                                  self.readSetFilter,
                                  importReadsAlignmentsTitles=False)

        tooManyTitles = maxTitles is not None and len(self) > maxTitles
        if tooManyTitles and maxTitles < 0:
            raise ValueError('maxTitles (%r) cannot be negative.' %
                             maxTitles)

        if tooManyTitles:
            # Go through the titles in sorted order so the loop below can
            # stop as soon as enough have been accepted. Simply keeping
            # the first maxTitles sorted titles would not do, because the
            # filtering below may discard some of those and the result
            # would then hold fewer titles than it should.
            titles = self.sortTitles(sortOn)
        else:
            titles = self.keys()

        titleFilter = TitleFilter(
            positiveRegex=titleRegex, negativeRegex=negativeTitleRegex
        ) if (titleRegex or negativeTitleRegex) else None

        for title in titles:
            # This is tested before anything else because maxTitles may
            # be zero.
            if maxTitles is not None and len(result) == maxTitles:
                break

            # Positive and negative title regexps.
            if titleFilter:
                if titleFilter.accept(title) == TitleFilter.REJECT:
                    continue

            candidate = self[title]

            if minMatchingReads is not None:
                if candidate.readCount() < minMatchingReads:
                    continue

            # Both values are converted to the score class of this data
            # set so that they can be compared without knowing whether
            # numerically greater scores are the better ones.
            if minMedianScore is not None:
                median = self.scoreClass(candidate.medianScore())
                if median < self.scoreClass(minMedianScore):
                    continue

            if withScoreBetterThan is not None:
                if not candidate.hasScoreBetterThan(withScoreBetterThan):
                    continue

            if minCoverage is not None:
                if candidate.coverage() < minCoverage:
                    continue

            # The read set filter is consulted last, only for titles that
            # passed every other test.
            if readSetFilter and not readSetFilter.accept(title, candidate):
                continue

            result.addTitle(title, candidate)

        return result
Пример #18
0
    def _filter(self,
                minLength=None,
                maxLength=None,
                removeGaps=False,
                whitelist=None,
                blacklist=None,
                titleRegex=None,
                negativeTitleRegex=None,
                truncateTitlesAfter=None,
                indices=None,
                head=None,
                removeDuplicates=False,
                modifier=None,
                randomSubset=None,
                trueLength=None,
                sampleFraction=None,
                sequenceNumbersFile=None):
        """
        Filter a set of reads to produce a matching subset.

        See docstring for self.filter for parameter docs.

        Note that C{head} limits the reads that are considered to those
        whose (zero-based) index in self is less than C{head}, no matter
        which other filtering options are in use.

        @raise ValueError: If both C{randomSubset} and C{sampleFraction}
            are given, or if the file named by C{sequenceNumbersFile}
            has a first number less than one or has numbers that are not
            ascending.
        @return: A generator that yields C{Read} instances.
        """
        def _wantedSequences(filename):
            """
            Read and yield integer sequence numbers from a file.

            @raise ValueError: If the sequence numbers are not all positive or
                are not ascending.
            @return: A generator that yields C{int} sequence numbers.
            """
            with open(filename) as fp:
                lastNumber = None
                for line in fp:
                    n = int(line)
                    if lastNumber is None:
                        if n < 1:
                            raise ValueError(
                                'First line of sequence number file %r must '
                                'be at least 1.' % filename)
                        lastNumber = n
                        yield n
                    else:
                        if n > lastNumber:
                            lastNumber = n
                            yield n
                        else:
                            raise ValueError(
                                'Line number file %r contains non-ascending '
                                'numbers %d and %d.' %
                                (filename, lastNumber, n))

        if randomSubset is not None and sampleFraction is not None:
            raise ValueError('randomSubset and sampleFraction cannot be '
                             'used simultaneously in a filter. Call filter '
                             'twice instead.')

        wantedSequenceNumberGeneratorExhausted = False
        if sequenceNumbersFile is None:
            nextWantedSequenceNumber = None
        else:
            wantedSequenceNumberGenerator = _wantedSequences(
                sequenceNumbersFile)
            try:
                nextWantedSequenceNumber = next(wantedSequenceNumberGenerator)
            except StopIteration:
                # There was a sequence number file, but it was empty.
                return

        if (whitelist or blacklist or titleRegex or negativeTitleRegex
                or truncateTitlesAfter):
            titleFilter = TitleFilter(whitelist=whitelist,
                                      blacklist=blacklist,
                                      positiveRegex=titleRegex,
                                      negativeRegex=negativeTitleRegex,
                                      truncateAfter=truncateTitlesAfter)
        else:
            titleFilter = None

        if removeDuplicates:
            sequencesSeen = set()

        if sampleFraction is not None:
            if sampleFraction == 0.0:
                # The filter returns nothing.
                return
            elif sampleFraction == 1.0:
                # Passing 1.0 can be treated the same as passing no value.
                # This makes the loop code simpler.
                sampleFraction = None

        if randomSubset is not None and trueLength is None:
            trueLength = self._length

        yieldCount = 0

        for readIndex, read in enumerate(self):

            if wantedSequenceNumberGeneratorExhausted:
                return

            # The head test must come before every test that can skip a
            # read. If the read at index head were skipped (because its
            # sequence number is not wanted, or by random sampling) before
            # this test was reached, the equality would never be true and
            # all later reads would wrongly be considered.
            if head is not None and readIndex == head:
                # We're completely done.
                return

            if nextWantedSequenceNumber is not None:
                if readIndex + 1 == nextWantedSequenceNumber:
                    # We want this sequence.
                    try:
                        nextWantedSequenceNumber = next(
                            wantedSequenceNumberGenerator)
                    except StopIteration:
                        # The sequence number iterator ran out of sequence
                        # numbers.  We must let the rest of the filtering
                        # continue for the current sequence in case we
                        # throw it out for other reasons (as we might have
                        # done for any of the earlier wanted sequence
                        # numbers).
                        wantedSequenceNumberGeneratorExhausted = True
                else:
                    # This sequence isn't wanted.
                    continue

            if (sampleFraction is not None
                    and uniform(0.0, 1.0) > sampleFraction):
                # Note that we don't have to worry about the 0.0 or 1.0
                # cases in the above if, as they have been dealt with
                # before the loop.
                continue

            if randomSubset is not None:
                if yieldCount == randomSubset:
                    # The random subset has already been fully returned.
                    # There's no point in going any further through the input.
                    return
                elif uniform(0.0, 1.0) > ((randomSubset - yieldCount) /
                                          (trueLength - readIndex)):
                    continue

            readLen = len(read)
            if ((minLength is not None and readLen < minLength)
                    or (maxLength is not None and readLen > maxLength)):
                continue

            if removeGaps:
                # Make a new read (of the same class) without the gaps.
                sequence = read.sequence.replace('-', '')
                read = read.__class__(read.id, sequence, read.quality)

            if (titleFilter
                    and titleFilter.accept(read.id) == TitleFilter.REJECT):
                continue

            if indices is not None and readIndex not in indices:
                continue

            if removeDuplicates:
                if read.sequence in sequencesSeen:
                    continue
                sequencesSeen.add(read.sequence)

            if modifier:
                # The modifier can drop the read by returning None.
                modified = modifier(read)
                if modified is None:
                    continue
                else:
                    read = modified

            yield read
            yieldCount += 1