def testOneByTwoReturnAnalysis(self):
    """
    When affinityMatrix is given one read and two subjects with
    returnAnalysis=True, the result must be 1x2. The entry for the subject
    the query matches must be an analysis dict holding the keys of a full
    analysis, and the entry for the subject it does not match must be
    C{None}.
    """
    expectedKeys = {
        'bestBinScore',
        'histogram',
        'overallScore',
        'overallScoreAnalysis',
        'significanceAnalysis',
        'significantBins',
    }
    reads = Reads([AARead('id1', 'FRRRFRRRFAAAFRRRFRRRF')])
    matchingSubject = AARead('id2', 'FRRRFRRRFAAAFRRRFRRRF')
    otherSubject = AARead('id3', 'FFF')
    subjects = Reads([matchingSubject, otherSubject])

    matrix = affinityMatrix(
        reads, landmarks=['AlphaHelix'], subjects=subjects,
        computeDiagonal=True, returnAnalysis=True)

    analysis = matrix[0][0]
    self.assertEqual(expectedKeys, set(analysis))
    self.assertEqual(1.0, analysis['overallScore'])

    # The query doesn't match the second subject.
    self.assertIs(None, matrix[0][1])
def fromSequences(cls, labels, sequences, findParams=None, **kwargs):
    """
    Construct an NJTree instance from some sequences.

    @param cls: Our class.
    @param labels: An iterable producing C{str} labels for the sequences.
        It is copied into a C{list} so that a one-shot iterable (such as
        a generator) can be both stored on the instance and passed on to
        the distance matrix.
    @param sequences: Either A C{str} filename of sequences to consider or
        a C{light.reads.Reads} instance.
    @param findParams: An instance of C{FindParameters}. If not given (or
        otherwise false), a default C{FindParameters()} is used.
    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    @return: An C{NJTree} instance.
    """
    if isinstance(sequences, str):
        sequences = FastaReads(sequences, readClass=AAReadWithX,
                               upperCase=True)

    new = cls()
    new.sequences = list(sequences)
    # Materialize the labels once: they are needed both as an attribute and
    # as the ids of the distance matrix, and an iterator would be exhausted
    # after its first use.
    new.labels = list(labels)
    findParams = findParams or FindParameters()
    affinity = np.array(
        affinityMatrix(new.sequences, findParams=findParams, **kwargs))
    # Convert affinity scores into distances: distance = 1 - affinity.
    new.distance = np.ones(affinity.shape) - affinity
    new.tree = nj(DistanceMatrix(new.distance, new.labels))
    return new
def _checkSymmetry(self, sequences, findParams, symmetric=False, **kwargs):
    """
    Build the affinity matrix for a set of sequences, then verify that
    every diagonal entry is 1.0 and that the matrix is symmetric.

    @param sequences: A C{list} of C{AARead} instances.
    @param findParams: A C{light.parameters.FindParameters} instance.
    @param symmetric: If C{True}, pass symmetric=True to the
        affinityMatrix function, allowing it to speed up the calculation
        by assuming scores are symmetric. We still check that the result
        actually is symmetric.
    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    """
    matrix = affinityMatrix(sequences, findParams, symmetric=symmetric,
                            **kwargs)
    count = len(sequences)

    for i, sequence in enumerate(sequences):
        # Each sequence scored against itself must give exactly 1.0.
        selfScore = matrix[i][i]
        self.assertEqual(
            1.0, selfScore,
            'Diagonal entry (%d, %d) for %s against itself has non-1.0 '
            'score of %f.' % (i, i, sequence.id, selfScore))

        # Entries mirrored across the diagonal must be identical.
        for j in range(i + 1, count):
            above = matrix[i][j]
            below = matrix[j][i]
            self.assertEqual(
                above, below,
                'Off-diagonal entries (%d, %d) and (%d, %d) for %s '
                'against %s have unequal scores %f and %f.' %
                (i, j, j, i, sequence.id, sequences[j].id, above, below))
def testTwoByTwoAsDict(self):
    """
    When affinityMatrix is given two reads and two subjects and a dict
    result is requested via returnDict=True, the result must be a dict of
    dicts keyed by query id and then by subject id.
    """
    sequence = 'FRRRFRRRFAAAFRRRFRRRF'

    reads = Reads()
    for queryId in ('id1', 'id2'):
        reads.add(AARead(queryId, sequence))

    subjects = Reads()
    for subjectId in ('id3', 'id4'):
        subjects.add(AARead(subjectId, sequence))

    matrix = affinityMatrix(reads, landmarks=['AlphaHelix'],
                            subjects=subjects, computeDiagonal=True,
                            returnDict=True)

    expected = {
        'id1': {'id3': 1.0, 'id4': 1.0},
        'id2': {'id3': 1.0, 'id4': 1.0},
    }
    self.assertEqual(expected, matrix)
def ultrametric(sequenceFileOrMatrix, findParams=None, **kwargs):
    """
    Test whether ultrametricity is satisfied for a distance matrix.
    Ultrametricity is satisfied when: d(A, C) <= max(d(A, B), d(B, C)) for any
    three scores (from sequence comparisons) A, B, C.

    @param sequenceFileOrMatrix: Either a C{str} file name of a file
        containing sequences or a distance matrix as returned from
        C{light.performance.affinity}. Only a C{numpy.ndarray} is treated
        as a ready-made matrix; anything else is passed to
        C{affinity.affinityMatrix} to compute one.
    @param findParams: A C{light.parameters.FindParameters} instance.
    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    @return: A generator which returns non-ultrametric triplets, each a
        3-C{tuple} C{(a, b, c)} of matrix indices for which
        d(a, c) > max(d(a, b), d(b, c)).
    """
    if isinstance(sequenceFileOrMatrix, np.ndarray):
        matrix = sequenceFileOrMatrix
    else:
        matrix = affinity.affinityMatrix(sequenceFileOrMatrix, findParams,
                                         **kwargs)

    for a, b, c in permutations(range(len(matrix)), 3):
        # A triplet violates ultrametricity when the direct a-c value is
        # strictly greater than the larger of the two legs through b.
        if matrix[a][c] > max(matrix[a][b], matrix[b][c]):
            yield a, b, c
def testNoReads(self):
    """
    Calling affinityMatrix with an empty set of reads (and no subjects)
    must produce an empty score matrix.
    """
    noReads = Reads()
    self.assertEqual([],
                     affinityMatrix(noReads, landmarks=['AlphaHelix']))
def testOneSequenceSpecificDiagonalValue(self):
    """
    When affinityMatrix is given a single read together with an explicit
    diagonalValue, that value must appear as the only entry of the result.
    """
    reads = Reads()
    reads.add(AARead('id1', 'AAA'))
    result = affinityMatrix(reads, landmarks=['AlphaHelix'],
                            diagonalValue=2.0)
    expected = [[2.0]]
    self.assertEqual(expected, result)
def testOneByZero(self):
    """
    If affinityMatrix is called with one read and an empty set of
    subjects, the resulting matrix must be 1x0 (a single empty row).
    """
    reads = Reads()
    reads.add(AARead('id1', 'FRRRFRRRFAAAFRRRFRRRF'))
    noSubjects = Reads()
    result = affinityMatrix(reads, landmarks=['AlphaHelix'],
                            subjects=noSubjects, computeDiagonal=True)
    self.assertEqual([[]], result)
def testSequenceWithFeaturesAgainstItself(self):
    """
    When the only read passed to affinityMatrix also serves as the only
    subject (no separate subjects are given) and the read has features,
    the result must be a 1x1 matrix holding 1.0.
    """
    reads = Reads()
    reads.add(AARead('id1', 'FRRRFRRRFAAAFRRRFRRRF'))
    result = affinityMatrix(reads, landmarks=['AlphaHelix'],
                            computeDiagonal=True)
    expected = [[1.0]]
    self.assertEqual(expected, result)
def testZeroByThree(self):
    """
    If affinityMatrix is called with no reads and three subjects, the
    resulting matrix must be empty.
    """
    sequence = 'FRRRFRRRFAAAFRRRFRRRF'
    noReads = Reads()
    subjects = Reads()
    for subjectId in ('id2', 'id3', 'id4'):
        subjects.add(AARead(subjectId, sequence))
    result = affinityMatrix(noReads, landmarks=['AlphaHelix'],
                            subjects=subjects, computeDiagonal=True)
    self.assertEqual([], result)
def testOneByTwo(self):
    """
    With one query and two subjects, where the query matches only the
    first subject, getScore must retrieve 1.0 for the matching pair and
    0.0 for the non-matching pair from a matrix computed with
    returnAnalysis=True.
    """
    query = AARead('id1', 'FRRRFRRRFAAAFRRRFRRRF')
    reads = Reads([query])
    matchingSubject = AARead('id2', 'FRRRFRRRFAAAFRRRFRRRF')
    otherSubject = AARead('id3', 'FFF')
    subjects = Reads([matchingSubject, otherSubject])

    matrix = affinityMatrix(
        reads, landmarks=['AlphaHelix'], subjects=subjects,
        computeDiagonal=True, returnAnalysis=True)

    matchScore = getScore(matrix, 0, 0)
    self.assertEqual(1.0, matchScore)
    missScore = getScore(matrix, 0, 1)
    self.assertEqual(0.0, missScore)
def testTwoByTwoWithProgressFunction(self):
    """
    When affinityMatrix is given two reads, two subjects and a progress
    function, the progress function must be called once per query with
    the query's index and the query itself, in order.
    """
    sequence = 'FRRRFRRRFAAAFRRRFRRRF'

    reads = Reads()
    for queryId in ('id1', 'id2'):
        reads.add(AARead(queryId, sequence))

    subjects = Reads()
    for subjectId in ('id3', 'id4'):
        subjects.add(AARead(subjectId, sequence))

    seen = []

    def progress(i, query):
        # Record what we were called with so it can be checked afterwards.
        seen.append((i, query.id))

    affinityMatrix(reads, landmarks=['AlphaHelix'], subjects=subjects,
                   computeDiagonal=True, progressFunc=progress)

    expected = [(0, 'id1'), (1, 'id2')]
    self.assertEqual(expected, seen)
def testOneByThree(self):
    """
    If affinityMatrix is called with one read and three subjects, the
    resulting matrix must be 1x3.
    """
    sequence = 'FRRRFRRRFAAAFRRRFRRRF'

    reads = Reads()
    reads.add(AARead('id1', sequence))

    subjects = Reads()
    for subjectId in ('id2', 'id3', 'id4'):
        subjects.add(AARead(subjectId, sequence))

    result = affinityMatrix(reads, landmarks=['AlphaHelix'],
                            subjects=subjects, computeDiagonal=True)
    expected = [[1.0, 1.0, 1.0]]
    self.assertEqual(expected, result)
def testTwoByThreeWithRepeatedQueryAndSubjectIds(self):
    """
    If affinityMatrix is called with two reads and three subjects, the
    resulting matrix must be 2x3. Repeated ids among the queries and among
    the subjects must not cause a problem (as they would if we called
    affinityMatrix with returnDict=True).
    """
    sequence = 'FRRRFRRRFAAAFRRRFRRRF'

    # Both queries deliberately share an id.
    reads = Reads()
    for queryId in ('id1', 'id1'):
        reads.add(AARead(queryId, sequence))

    # The last two subjects deliberately share an id.
    subjects = Reads()
    for subjectId in ('id2', 'id3', 'id3'):
        subjects.add(AARead(subjectId, sequence))

    result = affinityMatrix(reads, landmarks=['AlphaHelix'],
                            subjects=subjects, computeDiagonal=True)
    expected = [
        [1.0, 1.0, 1.0],
        [1.0, 1.0, 1.0],
    ]
    self.assertEqual(expected, result)
def getCorrelation(self):
    """
    Compute the correlation between light matter scores for the perfect
    PDB finders and for the finders in the subset that are being
    evaluated.

    @return: A C{dict} keyed by dataset name ('2HLA', '4MTP', 'Polymerase',
        'HA') whose values are the r value returned by
        C{stats.linregress} when the scores from the evaluated ('AC ')
        finder are regressed against those from the 'PDB ' finder.
    """
    datasets = {
        '2HLA': {
            'queries': HLA_Q,
            'subjects': HLA_S,
        },
        '4MTP': {
            'queries': MTP_Q,
            'subjects': MTP_S,
        },
        'Polymerase': {
            'queries': POLY_Q,
            'subjects': POLY_S,
        },
        'HA': {
            'queries': HA_Q,
            'subjects': HA_S,
        },
    }

    def pairScores(landmarkPrefix, queries, subjects):
        # Score every query against every subject using the finder whose
        # name is landmarkPrefix + self.structureType, and collect the
        # scores of all pairs whose ids differ.
        matrix = affinityMatrix(
            queries, subjects=subjects, symmetric=False,
            computeDiagonal=True, returnDict=True,
            findParams=self.findParams,
            landmarks=[landmarkPrefix + self.structureType],
            trigPoints=[],
            acAlphaHelixFilename=self.acAlphaHelixFilename,
            acAlphaHelix310Filename=self.acAlphaHelix310Filename,
            acAlphaHelixCombinedFilename=self.acAlphaHelixCombinedFilename,
            acAlphaHelixPiFilename=self.acAlphaHelixPiFilename,
            acExtendedStrandFilename=self.acExtendedStrandFilename)
        return [matrix[query.id][subject.id]
                for query in queries
                for subject in subjects
                if query.id != subject.id]

    result = {}
    for name, dataset in datasets.items():
        queries = dataset['queries']
        subjects = dataset['subjects']
        pdbScores = pairScores('PDB ', queries, subjects)
        evaluateScores = pairScores('AC ', queries, subjects)
        # Only the r value of the regression is of interest here.
        _, _, rValue, _, _ = stats.linregress(pdbScores, evaluateScores)
        result[name] = rValue

    return result