示例#1
0
    def testPopulationFromCommandLineSequencesAndFastaFile(self):
        """
        Using both command line sequences and --databaseFasta must result in
        all the command line subjects and those in the file being added to the
        returned database.
        """
        specifier = DatabaseSpecifier()
        parser = argparse.ArgumentParser()
        specifier.addArgsToParser(parser)
        commandLine = [
            '--databaseFasta', 'file.fasta',
            '--databaseSequence', 'id1 FFF',
            '--databaseSequence', 'id2 RRR',
        ]
        args = parser.parse_args(commandLine)
        # Mock out open() so reading 'file.fasta' returns our FASTA data.
        fastaData = '>id3\nFFFF\n>id4\nRRRR'
        with patch.object(builtins, 'open', mockOpen(read_data=fastaData)):
            db = specifier.getDatabaseFromArgs(args)

        expected = {
            AARead('id1', 'FFF'),
            AARead('id2', 'RRR'),
            AARead('id3', 'FFFF'),
            AARead('id4', 'RRRR'),
        }
        self.assertEqual(expected,
                         {subject.read for subject in db.getSubjects()})
示例#2
0
 def testPopulationNotAllowed(self):
     """
     Using --databaseFasta must result in a ValueError if database
     population has not been enabled.
     """
     parser = argparse.ArgumentParser()
     specifier = DatabaseSpecifier(allowPopulation=False)
     specifier.addArgsToParser(parser)
     # Actually pass --databaseFasta and assert that population is
     # refused. The original test stopped after addArgsToParser and so
     # asserted nothing at all.
     args = parser.parse_args(['--databaseFasta', 'file.fasta'])
     error = r'^Database population is not enabled\.$'
     six.assertRaisesRegex(self, ValueError, error,
                           specifier.getDatabaseFromArgs, args)
示例#3
0
 def testNoArgs(self):
     """
     If no arguments are given, getDatabaseFromArgs must return a database.
     """
     specifier = DatabaseSpecifier()
     parser = argparse.ArgumentParser()
     specifier.addArgsToParser(parser)
     database = specifier.getDatabaseFromArgs(parser.parse_args([]))
     self.assertIsInstance(database, Database)
示例#4
0
 def testPassedParamsAreUsed(self):
     """
     If specific parameters are given, they must be used.
     """
     specifier = DatabaseSpecifier()
     parser = argparse.ArgumentParser()
     specifier.addArgsToParser(parser)
     dbParams = DatabaseParameters()
     database = specifier.getDatabaseFromArgs(parser.parse_args([]),
                                              dbParams)
     # The returned database must hold the exact parameters instance.
     self.assertIs(database.dbParams, dbParams)
示例#5
0
 def testNoArgsDefaultParameters(self):
     """
     The database returned from getDatabaseFromArgs when it is
     passed no command-line arguments must have the default database
     parameters.
     """
     parser = argparse.ArgumentParser()
     specifier = DatabaseSpecifier()
     specifier.addArgsToParser(parser)
     args = parser.parse_args([])
     db = specifier.getDatabaseFromArgs(args)
     # compare is expected to return None when the parameters do not
     # differ from the defaults.
     self.assertIs(None, db.dbParams.compare(DatabaseParameters()))
示例#6
0
 def testPopulationByInMemorySubjects(self):
     """
     Passing a subjects keyword must result in the subjects being added
     to the returned database.
     """
     read1 = AARead('id1', 'FFF')
     read2 = AARead('id2', 'RRR')
     subjects = Reads()
     for read in (read1, read2):
         subjects.add(read)
     db = DatabaseSpecifier().getDatabaseFromKeywords(subjects=subjects)
     found = {subject.read for subject in db.getSubjects()}
     self.assertEqual({read1, read2}, found)
示例#7
0
 def testCreationNotAllowed(self):
     """
     Not passing any arguments when creation (or a WAMP connection)
     is not allowed must result in a RuntimeError.
     """
     parser = argparse.ArgumentParser()
     specifier = DatabaseSpecifier(allowCreation=False, allowWamp=False)
     specifier.addArgsToParser(parser)
     args = parser.parse_args([])
     # Use raw strings: '\.' in a normal string literal is an invalid
     # escape sequence (a DeprecationWarning since Python 3.6).
     error = (r'^Not enough information given to specify a database, '
              r'database creation is not enabled, and '
              r'no remote WAMP database could be found\.$')
     six.assertRaisesRegex(self, RuntimeError, error,
                           specifier.getDatabaseFromArgs, args)
示例#8
0
 def testPopulationFromCommandLineSequences(self):
     """
     Passing --databaseSequence arguments must result in the subjects in the
     sequences being added to the returned database.
     """
     specifier = DatabaseSpecifier()
     parser = argparse.ArgumentParser()
     specifier.addArgsToParser(parser)
     args = parser.parse_args([
         '--databaseSequence', 'id1 FFF',
         '--databaseSequence', 'id2 RRR',
     ])
     database = specifier.getDatabaseFromArgs(args)
     found = {subject.read for subject in database.getSubjects()}
     self.assertEqual({AARead('id1', 'FFF'), AARead('id2', 'RRR')}, found)
示例#9
0
    def testPopulationFromFastaFile(self):
        """
        Passing a databaseFasta keyword must result in the subjects in the
        file being added to the returned database.
        """
        # Mock out open() so reading 'file.fasta' returns our FASTA data.
        fasta = '>id1\nFFF\n>id2\nRRR'
        with patch.object(builtins, 'open', mockOpen(read_data=fasta)):
            db = DatabaseSpecifier().getDatabaseFromKeywords(
                databaseFasta='file.fasta')

        found = {subject.read for subject in db.getSubjects()}
        self.assertEqual({AARead('id1', 'FFF'), AARead('id2', 'RRR')}, found)
示例#10
0
 def testInMemoryDatabaseIsPopulated(self):
     """
     Passing a database keyword with an in-memory database results in that
     database being populated.
     """
     read1 = AARead('id1', 'FFF')
     read2 = AARead('id2', 'RRR')
     subjects = Reads()
     for read in (read1, read2):
         subjects.add(read)
     original = Database()
     db = DatabaseSpecifier().getDatabaseFromKeywords(
         database=original, subjects=subjects)
     found = {subject.read for subject in db.getSubjects()}
     self.assertEqual({read1, read2}, found)
示例#11
0
    def getFractionOfStructuresCovered(self):
        """
        Return the fraction of known structures matched by at least one
        substring in the subset that is being evaluated.

        @return: A C{float} fraction, 0.0 if there are no structures.
        """
        db = DatabaseSpecifier().getDatabaseFromKeywords(
            trigPoints=[],
            landmarks=['AC ' + self.structureType],
            acAlphaHelixFilename=self.acAlphaHelixFilename,
            acAlphaHelix310Filename=self.acAlphaHelix310Filename,
            acAlphaHelixCombinedFilename=self.acAlphaHelixCombinedFilename,
            acAlphaHelixPiFilename=self.acAlphaHelixPiFilename,
            acExtendedStrandFilename=self.acExtendedStrandFilename)

        backend = Backend()
        backend.configure(db.dbParams)

        matched = total = 0
        reads = FastaReads(self.structureFile,
                           readClass=AAReadWithX,
                           checkAlphabet=0)
        for read in reads:
            total += 1
            # A structure counts as covered when scanning finds at least
            # one landmark.
            if backend.scan(read).landmarks:
                matched += 1

        return matched / total if total else 0.0
示例#12
0
 def testNoKeywordsDefaultParameters(self):
     """
     The database returned from getDatabaseFromKeywords when it is
     passed no keywords must have the default database parameters.
     """
     database = DatabaseSpecifier().getDatabaseFromKeywords()
     # compare is expected to return None when the parameters do not
     # differ from the defaults.
     self.assertIs(None, database.dbParams.compare(DatabaseParameters()))
示例#13
0
 def testNoKeywords(self):
     """
     The getDatabaseFromKeywords method must return a database when
     it is passed no keywords.
     """
     self.assertIsInstance(DatabaseSpecifier().getDatabaseFromKeywords(),
                           Database)
示例#14
0
    def testPopulationFromFastaFile(self):
        """
        Passing a --databaseFasta argument must result in the subjects in the
        file being added to the returned database.
        """
        specifier = DatabaseSpecifier()
        parser = argparse.ArgumentParser()
        specifier.addArgsToParser(parser)
        args = parser.parse_args(['--databaseFasta', 'file.fasta'])
        # Mock out open() so reading 'file.fasta' returns our FASTA data.
        fasta = '>id1\nFFF\n>id2\nRRR'
        with patch.object(builtins, 'open', mockOpen(read_data=fasta)):
            db = specifier.getDatabaseFromArgs(args)

        found = {subject.read for subject in db.getSubjects()}
        self.assertEqual({AARead('id1', 'FFF'), AARead('id2', 'RRR')}, found)
示例#15
0
 def testInMemoryDatabaseIsReturned(self):
     """
     Passing a database keyword with an in-memory database results in that
     database being returned.
     """
     database = Database()
     returned = DatabaseSpecifier().getDatabaseFromKeywords(
         database=database)
     self.assertIs(database, returned)
示例#16
0
 def testCreationNotAllowed(self):
     """
     Not passing a database keyword when creation (or a WAMP connection)
     is not allowed must result in a RuntimeError.
     """
     specifier = DatabaseSpecifier(allowCreation=False, allowWamp=False)
     # Use raw strings: '\.' in a normal string literal is an invalid
     # escape sequence (a DeprecationWarning since Python 3.6).
     error = (r'^Not enough information given to specify a database, '
              r'database creation is not enabled, and '
              r'no remote WAMP database could be found\.$')
     six.assertRaisesRegex(self, RuntimeError, error,
                           specifier.getDatabaseFromKeywords)
示例#17
0
 def testPopulationNotAllowed(self):
     """
     Passing a subjects keyword must result in a ValueError if database
     population has not been enabled.
     """
     subjects = Reads()
     specifier = DatabaseSpecifier(allowPopulation=False)
     # Escape the final period so the regex matches only a literal '.'
     # (an unescaped '.' matches any character).
     error = r'^Database population is not enabled\.$'
     six.assertRaisesRegex(self,
                           ValueError,
                           error,
                           specifier.getDatabaseFromKeywords,
                           subjects=subjects)
示例#18
0
 def testInMemoryDatabaseNotAllowed(self):
     """
     Passing a database keyword results in a ValueError if an in-memory
     database is not allowed.
     """
     original = Database()
     specifier = DatabaseSpecifier(allowInMemory=False)
     # Escape the final period so the regex matches only a literal '.'
     # (an unescaped '.' matches any character).
     error = r'^In-memory database specification not enabled\.$'
     six.assertRaisesRegex(self,
                           ValueError,
                           error,
                           specifier.getDatabaseFromKeywords,
                           database=original)
示例#19
0
    def testPopulationFromInMemoryAndFastaFile(self):
        """
        Passing both subjects and databaseFasta keywords must result in
        all the subjects in memory and in the file being added to the returned
        database.
        """
        read1 = AARead('id1', 'FFF')
        read2 = AARead('id2', 'RRR')
        subjects = Reads()
        for read in (read1, read2):
            subjects.add(read)

        # Mock out open() so reading 'file.fasta' returns our FASTA data.
        fasta = '>id3\nFFFF\n>id4\nRRRR'
        with patch.object(builtins, 'open', mockOpen(read_data=fasta)):
            db = DatabaseSpecifier().getDatabaseFromKeywords(
                subjects=subjects, databaseFasta='file.fasta')

        expected = {read1, read2,
                    AARead('id3', 'FFFF'),
                    AARead('id4', 'RRRR')}
        self.assertEqual(expected,
                         {subject.read for subject in db.getSubjects()})
示例#20
0
    def __init__(self, **kwargs):
        """
        Configure a backend from database keywords, filling in default
        landmark and trig point finders when the caller gives none.

        @param kwargs: See
            C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
            keywords, all of which are optional.
        """
        # Default landmarks: all standard finders plus the 'PDB ' dev ones.
        kwargs.setdefault(
            'landmarks',
            ALL_LANDMARK_CLASSES +
            [cls for cls in DEV_LANDMARK_CLASSES
             if cls.NAME.startswith('PDB ')])
        # Default trig points: all finders except Volume.
        kwargs.setdefault(
            'trigPoints',
            [cls for cls in ALL_TRIG_CLASSES if cls.NAME != 'Volume'])

        db = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
        self._backend = Backend()
        self._backend.configure(db.dbParams)

        self._names = (db.dbParams.landmarkFinderNames() +
                       db.dbParams.trigPointFinderNames())
示例#21
0
def affinityMatrix(queries,
                   findParams=None,
                   subjects=None,
                   symmetric=True,
                   computeDiagonal=False,
                   diagonalValue=1.0,
                   progressFunc=None,
                   returnDict=False,
                   returnAnalysis=False,
                   **kwargs):
    """
    Produce an affinity matrix containing scores for a set of reads matched
    against a set of subjects.

    @param queries: Either A C{str} filename of sequences to consider or
        a C{light.reads.Reads} instance.
    @param findParams: A C{light.parameters.FindParameters} instance.
    @param subjects: Either 1) a C{str} filename of sequences to consider, or
        2) a C{light.reads.Reads} instance, or 3) C{None}, in which case
        the C{queries} will also be used as the subjects.
    @param symmetric: If C{True}, corresponding off-diagonal scores will be
        assumed to be equal and only computed once. I.e., this is a speedup
        when scores (affinities) are symmetric. This option gives the biggest
        speed up on a square matrix, but can also be used when the matrix is
        not square (e.g., when making a 2x4 matrix comparing {A, B} against
        {A, B, C, D}, the A->B distance can be used to set the B->A distance).
    @param computeDiagonal: If C{True}, values on the diagonal will be computed
        (i.e., obtained from find). Otherwise, all diagonal values will be set
        to C{diagonalValue}.
    @param diagonalValue: The result that diagonal values will all be set to if
        C{computeDiagonal} is C{False}.
    @param progressFunc: If not C{None}, a function that takes two arguments.
        The function will be called before each query sequence is processed.
        The arguments will be the C{int} (zero-based) number of the query and
        the query (an AAReadWithX instance) itself.
    @param returnDict: If C{True}, return a C{dict} keyed by query id, whose
        values are C{dict}s keyed by subject id, whose values are C{float}
        scores. In other words, a 2-level deep C{dict} that allows the caller
        to look up a score via something like C{result[query.id][subject.id]}.
    @param returnAnalysis: This determines what information is returned in each
        affinity matrix location. If C{False}, the default, each location will
        contain the overall score for the corresponding query/subject pair, or
        0.0 if the query did not match the subject. If C{True}, the location
        will contain the analysis C{dict} computed by the
        C{light.result.Result} instance, or C{None} if the query did not match
        the subject.
    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    @raise ValueError: If C{returnDict} is C{True} and there is a duplicated
        query or subject id.
    @return: If C{returnDict} is C{True}, a C{dict} as described above, else a
        two-dimensional array whose dimensions are the query index (in
        C{queries}, and then the subject index (in C{subjects}). The values in
        the returned structure are as described in C{returnAnalysis}, above.
    """
    # Materialize FASTA filenames into lists so they can be iterated more
    # than once and measured with len() below.
    if isinstance(queries, str):
        queries = list(
            FastaReads(queries, readClass=AAReadWithX, upperCase=True))

    if subjects is None:
        subjects = queries
    else:
        if isinstance(subjects, str):
            subjects = list(
                FastaReads(subjects, readClass=AAReadWithX, upperCase=True))

    findParams = findParams or FindParameters()

    db = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)

    # Add every subject to the database, remembering the index each was
    # assigned so scores can be looked up in find results.
    subjectIndices = []
    for subject in subjects:
        _, subjectIndex, _ = db.addSubject(subject)
        subjectIndices.append(subjectIndex)

    # NOTE(review): len() here assumes queries/subjects support len() when
    # they are passed as Reads instances rather than filenames — confirm.
    nQueries = len(queries)
    nSubjects = len(subjectIndices)

    # Prepare a result array (we'll walk through this later to make a dict
    # if returnDict is True).
    affinity = []
    noMatchValue = None if returnAnalysis else 0.0
    for _ in range(nQueries):
        affinity.append([noMatchValue] * nSubjects)

    for i, query in enumerate(queries):
        if progressFunc:
            progressFunc(i, query)
        if symmetric:
            # We don't have to consider all subjects in the find, so pass a
            # restricted set of subject indices to restrict the search to.
            # The ones we omit have already been looked up.
            #
            # For clarity, there's a little code repetition here.
            if computeDiagonal:
                wantedIndices = set(subjectIndices[i:])
            else:
                wantedIndices = set(subjectIndices[i + 1:])
            result = db.find(query,
                             findParams,
                             storeFullAnalysis=returnAnalysis,
                             subjectIndices=wantedIndices)
        else:
            result = db.find(query,
                             findParams,
                             storeFullAnalysis=returnAnalysis)

        analysis = result.analysis
        for j in range(nSubjects):
            if j < i and symmetric:
                # Mirror the value already computed for the transposed
                # cell. NOTE(review): affinity[j][i] assumes column i
                # exists, i.e. that (as in the docstring example) queries
                # correspond to a prefix of subjects when symmetric is
                # used on a non-square matrix — confirm for the case of
                # more queries than subjects.
                score = affinity[j][i]
            elif j == i and not computeDiagonal:
                score = diagonalValue
            else:
                # Be careful how we access the analysis. It is a defaultdict,
                # so its keys are created on access. I.e., we must use 'in'
                # to test for membership not try/except, because
                # analysis[subjectIndex] will never raise a KeyError.
                if subjectIndices[j] in analysis:
                    if returnAnalysis:
                        score = analysis[subjectIndices[j]]
                    else:
                        score = analysis[subjectIndices[j]]['overallScore']
                else:
                    # The query didn't match the subject. We don't actually
                    # need to set this value as it is already present due
                    # to the initialization of affinity above, but it
                    # simplifies the code to do so.
                    score = noMatchValue

            affinity[i][j] = score

    if returnDict:
        # Convert the 2-D list into a dict of dicts keyed by query id then
        # subject id, refusing duplicated ids (they would silently
        # overwrite entries).
        result = {}
        for i, query in enumerate(queries):
            if query.id in result:
                raise ValueError('Query id %r appears more than once.' %
                                 query.id)
            result[query.id] = values = {}
            for j, subject in enumerate(subjects):
                if subject.id in values:
                    raise ValueError('Subject id %r appears more than once.' %
                                     subject.id)
                values[subject.id] = affinity[i][j]
        return result
    else:
        return affinity
#!/usr/bin/env python

import argparse

from autobahn.asyncio.wamp import ApplicationRunner

from light.autobahn.database import DatabaseComponent
from light.database import DatabaseSpecifier

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Start a WAMP-based distributed light-matter database.')

    # An in-memory database makes no sense for a distributed service.
    specifier = DatabaseSpecifier(allowInMemory=False)
    specifier.addArgsToParser(parser)
    args = parser.parse_args()
    # We're always using WAMP for distributed databases.
    args.wampServer = True

    database = specifier.getDatabaseFromArgs(args)
    runner = ApplicationRunner(args.wampUrl, args.realm,
                               extra=dict(database=database))
    runner.run(DatabaseComponent)
示例#23
0
    def __init__(self, sequences, cutoff, **kwargs):
        """
        A class to work with hashes.

        For a set of given sequences, find all hashes and for each sequence
        make a string of 1 or 0 denoting whether a hash is present in that
        sequence or not. Only include hashes that occur in more than a
        specified fraction of all given sequences.

        @param sequences: A C{str} filename with a fasta file of sequences to
            be used or a C{dark.reads.Reads} object.
        @param cutoff: A C{float} between 0.0 and 1.0 of the fraction of
            sequences in which a hash has to be present to be included in the
            final string.
        @param kwargs: See
            C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
            additional keywords, all of which are optional.
        """
        if isinstance(sequences, str):
            reads = FastaReads(sequences,
                               readClass=AAReadWithX,
                               upperCase=True)
        else:
            reads = sequences

        database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
        backend = Backend()
        backend.configure(database.dbParams)

        # Map each sequence id to the hashes found in it (as returned by
        # backend.getHashes).
        hashes = {}
        for read in reads:
            hashes[read.id] = backend.getHashes(backend.scan(read))

        # Collect all unique hashes that occur in any sequence.
        allHashes = set()
        for readHashes in hashes.values():
            allHashes.update(readHashes)

        # Map each hash to the list of read ids in which it occurs. Use a
        # direct membership test instead of the original try/except
        # KeyError, which discarded the looked-up value anyway.
        byHashes = {}
        for hash_ in allHashes:
            byHashes[hash_] = [
                readId for readId, readHashes in hashes.items()
                if hash_ in readHashes]

        # For each read id, build a string of '1'/'0' characters denoting
        # which sufficiently common hashes occur in that read. A hash
        # qualifies when it occurs in strictly more than cutoff * nReads
        # sequences.
        threshold = cutoff * len(reads)
        self.hashString = {read.id: '' for read in reads}

        for hash_ in byHashes:
            if len(byHashes[hash_]) > threshold:
                for readId in self.hashString:
                    self.hashString[readId] += (
                        '1' if readId in byHashes[hash_] else '0')
#!/usr/bin/env python

import sys
import argparse

from autobahn.asyncio.wamp import ApplicationRunner

from light.autobahn.backend import BackendComponent
from light.database import DatabaseSpecifier

# The autobahn asyncio machinery requires a modern Python.
if sys.version_info < (3, 3):
    raise Exception('The light matter autobahn code needs Python 3.3 or '
                    'later.')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=('Start a WAMP-based distributed light-matter database '
                     'backend.'))

    # An in-memory database makes no sense for a distributed backend.
    specifier = DatabaseSpecifier(allowInMemory=False)
    specifier.addArgsToParser(parser)
    args = parser.parse_args()

    runner = ApplicationRunner(args.wampUrl, args.realm,
                               extra=dict(args=args))
    runner.run(BackendComponent)
示例#25
0
    def __init__(self, sequences, labels, defaultLabel=None, **kwargs):
        """
        Base class for using cluster analysis to evaluate how well various
        feature finders and database parameter settings can separate a set of
        sequences. The clustering is based on feature offset deltas.

        @param sequences: Either A C{str} filename of sequences to consider or
            a C{light.reads.Reads} instance.
        @param labels: A C{dict} with a label for each sequence id in
            C{sequences}. These are the known categories of each sequence.
        @param defaultLabel: If not C{None}, a label to use for reads whose ids
            are not present in C{labels}. If C{None} and a read id has no label
            a ValueError is raised.
        @param kwargs: See
            C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
            additional keywords, all of which are optional.
        @raises ValueError: If the id of a read is not in labels and no default
            label has been set, or if there are no reads in C{sequences}.
        """
        if isinstance(sequences, str):
            reads = FastaReads(sequences,
                               readClass=AAReadWithX,
                               upperCase=True)
        else:
            reads = sequences
        database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
        backend = Backend()
        backend.configure(database.dbParams)
        allOffsetDeltas = []
        trueLabels = []

        for read in reads:
            trueLabel = labels.get(read.id, defaultLabel)
            if trueLabel is None:
                raise ValueError('Read %r has no corresponding label' %
                                 read.id)
            trueLabels.append(trueLabel)
            # Count the (log-scaled) offset differences between all
            # landmark / trig point pairs in this read.
            offsetDeltas = Counter()
            scannedRead = backend.scan(read)
            for landmark, trigPoint in backend.getScannedPairs(scannedRead):
                delta = scaleLog(trigPoint.offset - landmark.offset,
                                 database.dbParams.distanceBase)
                offsetDeltas[delta] += 1
            allOffsetDeltas.append(offsetDeltas)

        # Count reads via the labels list (one label was appended per read)
        # rather than calling len() on reads, which has just been consumed
        # and might be a one-pass iterable.
        nReads = len(trueLabels)

        if nReads == 0:
            raise ValueError('No sequences were found in %r' % sequences)

        # Don't check that len(reads) == len(labels). I.e., ignore extra labels
        # to make using this class interactively more convenient.

        # Create an affinity matrix. Initially set all values to 1.0 so we
        # don't need to later initialize the diagonal.
        affinity = np.ones((nReads, nReads))

        # Fill the strict upper triangle and mirror it into the lower one,
        # since the affinity measure is symmetric.
        for row in range(nReads):
            for col in range(row + 1, nReads):
                value = self.affinityFromOffsetDeltas(allOffsetDeltas[row],
                                                      allOffsetDeltas[col])
                affinity[row, col] = affinity[col, row] = value

        self.nReads = nReads
        self.affinity = affinity
        self.trueLabels = trueLabels