def testSymmetricFindScoresSameSubjectAndQuery(self):
    """
    Matching A against B must score the same as matching B against A,
    and when A and B are the same sequence that score must be 1.0.
    """
    residues = 'AFRRRFRRRFASAASAFRRRFRRRF'
    subject = AARead('subject', residues)
    query = AARead('query', residues)
    findParams = FindParameters(significanceFraction=0.0)

    # Forward direction: the subject is stored and the query is looked up.
    forwardDb = Database(DatabaseParameters(
        landmarks=[AlphaHelix, BetaStrand], trigPoints=[Peaks]))
    forwardDb.addSubject(subject)
    forwardResult = forwardDb.find(query, findParams)
    forwardScore = forwardResult.analysis['0']['bestBinScore']

    # Reverse direction: a fresh database with the two roles swapped.
    reverseDb = Database(DatabaseParameters(
        landmarks=[AlphaHelix, BetaStrand], trigPoints=[Peaks]))
    reverseDb.addSubject(query)
    reverseResult = reverseDb.find(subject, findParams)
    reverseScore = reverseResult.analysis['0']['bestBinScore']

    self.assertEqual(forwardScore, reverseScore)
    self.assertEqual(1.0, forwardScore)
def testSymmetricFindScoresDifferingSubjectAndQuery(self):
    """
    Matching A against B must score the same as matching B against A,
    even when the two sequences have different hash counts and the
    score is something other than 1.0.
    """
    subject = AARead('subject', 'AFRRRFRRRFASAASAFRRRFRRRF')
    query = AARead('query', 'FRRRFRRRFASAVVVVVV')
    findParams = FindParameters(significanceFraction=0.0)

    # Forward direction: store the subject, look up the query.
    forwardDb = Database(DatabaseParameters(
        landmarks=[AlphaHelix, BetaStrand], trigPoints=[Peaks]))
    _, storedIndex, _ = forwardDb.addSubject(subject)
    subjectHashCount = forwardDb.getSubjectByIndex(storedIndex).hashCount
    forwardResult = forwardDb.find(query, findParams)
    forwardScore = forwardResult.analysis['0']['bestBinScore']

    # Reverse direction: store the query, look up the subject.
    reverseDb = Database(DatabaseParameters(
        landmarks=[AlphaHelix, BetaStrand], trigPoints=[Peaks]))
    _, storedIndex, _ = reverseDb.addSubject(query)
    queryHashCount = reverseDb.getSubjectByIndex(storedIndex).hashCount
    reverseResult = reverseDb.find(subject, findParams)
    reverseScore = reverseResult.analysis['0']['bestBinScore']

    # The test is only meaningful if the hash counts really do differ.
    self.assertNotEqual(subjectHashCount, queryHashCount)
    self.assertEqual(forwardScore, reverseScore)
    self.assertNotEqual(1.0, forwardScore)
def testFindOneMatchingSignificant(self):
    """
    With a significanceFraction of 0.0, a subject identical to the query
    must be reported with its single matching landmark / trig point pair.
    """
    residues = 'AFRRRFRRRFASAASA'
    subject = AARead('subject', residues)
    query = AARead('query', residues)

    db = Database(DatabaseParameters(
        landmarks=[AlphaHelix], trigPoints=[Peaks], maxDistance=11))
    db.addSubject(subject)
    found = db.find(query, FindParameters(significanceFraction=0.0))

    # Query and subject are the same sequence, so both sides of the
    # match carry the same feature offsets.
    expected = {
        '0': [
            dict(
                queryLandmark=Landmark('AlphaHelix', 'A', 1, 9, 2),
                queryTrigPoint=TrigPoint('Peaks', 'P', 11),
                subjectLandmark=Landmark('AlphaHelix', 'A', 1, 9, 2),
                subjectTrigPoint=TrigPoint('Peaks', 'P', 11),
            ),
        ],
    }
    self.assertEqual(expected, found.matches)
def testFindBug493Minimal(self):
    """
    Minimal reproduction of
    https://github.com/acorg/light-matter/issues/493

    There are no assertions: the test passes if normalizeBin can be
    called on every significant bin without raising.
    """
    query = SSAARead(
        '2HLA:A',
        'ALKEDLRSWTAADMAAQTTKHKWEAAHVAEQWRAYLEGTCVEWLRRYLENGKETLQRTDAPK'
        'THMTHHAVSDHEATLRCWALSFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWVAVV',
        'EE-TTSS-EEESSHHHHHHHHHHHHTTTHHHHHHHHHTHHHHHHHHHHHHHHHHHT--B--E'
        'EEEEEEE-SSSEEEEEEEEEEEBSS-EEEEEEETTEEE-TTEEE---EE-SSS-EEEEEEEE')

    subject = SSAARead(
        '3D2U:A',
        'HVLRYGYTGIFDDTSHMTLTVVGIFDGQHFFTYHVQSSDKASSRANGTISWMANVSAAYPTY'
        'PVTHPVVKGGVRNQNDNRAEAFCTSYGFFPGEIQITFIHYGDKVPEDSEPQCNPLLPTLDGT',
        '-EEEEEEEEEESSSS-EEEEEEEEETTEEEEEEEEESS-SSS-EEEE-STHHHHHHHHSTTH'
        '--B--EEEEEEEEEETTEEEEEEEEEEEBSS--EEEEEEESS---TT---EE---EE-TTS-')

    db = Database(DatabaseParameters(
        landmarks=['PDB ExtendedStrand'], trigPoints=[],
        limitPerLandmark=50, distanceBase=1.1))
    _, subjectIndex, _ = db.addSubject(subject)

    found = db.find(query, FindParameters(significanceFraction=0.01),
                    storeFullAnalysis=True)

    for significantBin in found.analysis[subjectIndex]['significantBins']:
        normalizeBin(significantBin['bin'], len(query))
def testFindTwoMatchingInSameSubject(self):
    """
    When the query shares two hashes with a subject, both matching
    pairs must be reported for that subject.
    """
    residues = 'FRRRFRRRFASAASA'
    subject = AARead('subject', residues)
    query = AARead('query', residues)

    db = Database(DatabaseParameters(
        landmarks=[AlphaHelix], trigPoints=[Peaks]))
    db.addSubject(subject)
    found = db.find(query)

    # The two expected pairs share the same helix and differ only in
    # the offset of the peak. Query and subject sides are identical.
    expected = {
        '0': [
            {
                'queryLandmark': Landmark('AlphaHelix', 'A', 0, 9, 2),
                'queryTrigPoint': TrigPoint('Peaks', 'P', peakOffset),
                'subjectLandmark': Landmark('AlphaHelix', 'A', 0, 9, 2),
                'subjectTrigPoint': TrigPoint('Peaks', 'P', peakOffset),
            }
            for peakOffset in (10, 13)
        ],
    }
    self.assertEqual(expected, found.matches)
def testFindOneMatchingSignificantWithSubjectIndicesIncludingIt(self):
    """
    A matching, significant subject must still be found when find is
    given a non-empty subjectIndices set that contains the subject's
    index alongside indices that match nothing.
    """
    residues = 'AFRRRFRRRFASAASA'
    subject = AARead('subject', residues)
    query = AARead('query', residues)

    db = Database(DatabaseParameters(
        landmarks=[AlphaHelix], trigPoints=[Peaks], maxDistance=11))
    db.addSubject(subject)

    # '0' is the index the expected match is reported under; 'x' and
    # 'y' are extra indices that do not appear in the expected matches.
    allowedIndices = {'0', 'x', 'y'}
    found = db.find(query, FindParameters(significanceFraction=0.0),
                    subjectIndices=allowedIndices)

    expected = {
        '0': [
            dict(
                queryLandmark=Landmark('AlphaHelix', 'A', 1, 9, 2),
                queryTrigPoint=TrigPoint('Peaks', 'P', 11),
                subjectLandmark=Landmark('AlphaHelix', 'A', 1, 9, 2),
                subjectTrigPoint=TrigPoint('Peaks', 'P', 11),
            ),
        ],
    }
    self.assertEqual(expected, found.matches)
def testFindOneMatchingInsignificant(self):
    """
    A subject that shares hashes with the query must appear in the
    matches, yet must not be significant under the default
    significanceFraction.
    """
    subject = AARead('subject', 'AFRRRFRRRFASAASAVVVVVVASAVVVASA')
    query = AARead('query', 'FRRRFRRRFASAASAFRRRFRRRFFRRRFRRRFFRRRFRRRF')

    db = Database(DatabaseParameters(
        landmarks=[AlphaHelix, BetaStrand], trigPoints=[Peaks]))
    db.addSubject(subject)
    found = db.find(query)

    # Each expected pair is (peak offset in query, peak offset in
    # subject). The helix starts at 0 in the query and 1 in the subject.
    expected = {
        '0': [
            {
                'queryLandmark': Landmark('AlphaHelix', 'A', 0, 9, 2),
                'queryTrigPoint': TrigPoint('Peaks', 'P', queryPeak),
                'subjectLandmark': Landmark('AlphaHelix', 'A', 1, 9, 2),
                'subjectTrigPoint': TrigPoint('Peaks', 'P', subjectPeak),
            }
            for queryPeak, subjectPeak in ((10, 11), (13, 14))
        ],
    }
    self.assertEqual(expected, found.matches)

    significant = list(found.significantSubjects())
    self.assertEqual(0, len(significant))
def testFindMatchAfterSaveRestore(self):
    """
    The matches found by a database must be found again by a database
    restored from a saved copy of it.
    """
    subject = AARead('subject', 'AFRRRFRRRFASAASAVVVVVVASAVVVASA')
    query = AARead('query', 'FRRRFRRRFASAASAFRRRFRRRFFRRRFRRRFFRRRFRRRF')

    # Each expected pair is (peak offset in query, peak offset in
    # subject). The helix starts at 0 in the query and 1 in the subject.
    expected = {
        '0': [
            {
                'queryLandmark': Landmark('AlphaHelix', 'A', 0, 9, 2),
                'queryTrigPoint': TrigPoint('Peaks', 'P', queryPeak),
                'subjectLandmark': Landmark('AlphaHelix', 'A', 1, 9, 2),
                'subjectTrigPoint': TrigPoint('Peaks', 'P', subjectPeak),
            }
            for queryPeak, subjectPeak in ((10, 11), (13, 14))
        ],
    }

    original = Database(DatabaseParameters(
        landmarks=[AlphaHelix, BetaStrand], trigPoints=[Peaks]))
    original.addSubject(subject)
    self.assertEqual(expected, original.find(query).matches)

    # Save to an in-memory buffer and rewind it so that restore reads
    # from the beginning.
    buf = StringIO()
    original.save(buf)
    buf.seek(0)
    restored = Database.restore(buf)

    self.assertEqual(expected, restored.find(query).matches)
def testFindNoMatching(self):
    """
    A query sharing no hashes with the subject must yield no matches.
    """
    db = Database(DatabaseParameters(
        landmarks=[AlphaHelix], trigPoints=[Peaks]))
    db.addSubject(AARead('subject', 'FRRRFRRRFASAASA'))

    found = db.find(AARead('query', 'FRRR'))

    self.assertEqual({}, found.matches)
def testFindNoneMatchingNoTrigPoint(self):
    """
    With a single landmark and no trig point finders, even a query
    identical to the subject must yield no matches.
    """
    residues = 'AFRRRFRRRFASAASA'
    subject = AARead('subject', residues)
    query = AARead('query', residues)

    db = Database(DatabaseParameters(
        landmarks=[AlphaHelix], trigPoints=[]))
    db.addSubject(subject)
    found = db.find(query)

    self.assertEqual({}, found.matches)
def testFindNoneMatchingTooSmallDistance(self):
    """
    When maxDistance is too small, even a query identical to the
    subject must yield no matches.
    """
    residues = 'AFRRRFRRRFASAASA'
    subject = AARead('subject', residues)
    query = AARead('query', residues)

    db = Database(DatabaseParameters(
        landmarks=[AlphaHelix], trigPoints=[Peaks], maxDistance=1))
    db.addSubject(subject)
    found = db.find(query)

    self.assertEqual({}, found.matches)
def testFindBug493(self):
    """
    Full-size reproduction of
    https://github.com/acorg/light-matter/issues/493

    There are no assertions: the test passes if normalizeBin can be
    called on every significant bin without raising.
    """
    query = SSAARead(
        '2HLA:A',
        'GSHSMRYFYTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDR'
        'NTRNVKAQSQTDRVDLGTLRGYYNQSEAGSHTIQMMYGCDVGSDGRFLRGYRQDAYDGKDYI'
        'ALKEDLRSWTAADMAAQTTKHKWEAAHVAEQWRAYLEGTCVEWLRRYLENGKETLQRTDAPK'
        'THMTHHAVSDHEATLRCWALSFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWVAVV'
        'VPSGQEQRYTCHVQHEGLPKPL',
        '--EEEEEEEEEE--TTSS--EEEEEEEETTEEEEEEETTSTT-S-EE-SHHHHTS-HHHHHH'
        'HHHHHHHHHHHHHHHHHHHHHHTT--TTS--EEEEEEEEEE-TTS-EEEEEEEEEETTEEEE'
        'EE-TTSS-EEESSHHHHHHHHHHHHTTTHHHHHHHHHTHHHHHHHHHHHHHHHHHT--B--E'
        'EEEEEEE-SSSEEEEEEEEEEEBSS-EEEEEEETTEEE-TTEEE---EE-SSS-EEEEEEEE'
        'EETT-GGGEEEEEEETTB-S--')

    subject = SSAARead(
        '3D2U:A',
        'HVLRYGYTGIFDDTSHMTLTVVGIFDGQHFFTYHVQSSDKASSRANGTISWMANVSAAYPTY'
        'LDGERAKGDLIFNQTEQNLLELEIALGYRSQSVLTWTHECNTTENGSFVAGYEGFGWDGETL'
        'MELKDNLTLWTGPNYEISWLKQQKTYIDGKIKNISEGDTTIQRNYLKGNCTQWSVIYSGFQP'
        'PVTHPVVKGGVRNQNDNRAEAFCTSYGFFPGEIQITFIHYGDKVPEDSEPQCNPLLPTLDGT'
        'FHQGCYVAIFSNQNYTCRVTHGNWTVEIPISVT',
        '-EEEEEEEEEESSSS-EEEEEEEEETTEEEEEEEEESS-SSS-EEEE-STHHHHHHHHSTTH'
        'HHHHHHHHHHHHHHHHHHHHHHHHHH--SS--EEEEEEEEEE-TT--EEEEEEEEEETTEEE'
        'EEE-TTS---B---TTT-GGGGGHHHHHHHHHT--SHHHHHHHHHHHTHHHHHHHHHHHHS-'
        '--B--EEEEEEEEEETTEEEEEEEEEEEBSS--EEEEEEESS---TT---EE---EE-TTS-'
        'EEEEEEEEEETTSEEEEEEE-SS-EEEEEEE--')

    landmarkNames = [
        'PDB AlphaHelix',
        'PDB AlphaHelix_3_10',
        'PDB AlphaHelix_pi',
        'PDB ExtendedStrand',
        'AminoAcidsLm',
    ]
    trigPointNames = [
        'AminoAcids',
        'Peaks',
        'Troughs',
        'IndividualPeaks',
        'IndividualTroughs',
    ]
    db = Database(DatabaseParameters(
        landmarks=landmarkNames, trigPoints=trigPointNames,
        featureLengthBase=1.01, maxDistance=10000,
        limitPerLandmark=50, distanceBase=1.1))
    _, subjectIndex, _ = db.addSubject(subject)

    found = db.find(query, FindParameters(significanceFraction=0.01),
                    storeFullAnalysis=True)

    for significantBin in found.analysis[subjectIndex]['significantBins']:
        normalizeBin(significantBin['bin'], len(query))
def testFindOneMatchingButSubjectExcluded(self):
    """
    A subject that would otherwise match significantly must not be
    returned when find is given a subjectIndices set that leaves it out.
    """
    residues = 'AFRRRFRRRFASAASA'
    subject = AARead('subject', residues)
    query = AARead('query', residues)

    db = Database(DatabaseParameters(
        landmarks=[AlphaHelix], trigPoints=[Peaks], maxDistance=11))
    db.addSubject(subject)

    # An empty set excludes every subject, including the one just added.
    found = db.find(query, FindParameters(significanceFraction=0.0),
                    subjectIndices=set())

    self.assertEqual({}, found.matches)
def __init__(self, histogram, query, database):
    """
    Work out a significance cutoff by matching the query against itself.

    The query is added as the only subject of a new database built from
    C{database.dbParams} and then looked up in it. The cutoff is the
    height of the second-tallest bin in the resulting histogram.

    @param histogram: Stored on the instance as C{_histogram}.
    @param query: The read to match against itself.
    @param database: An object whose C{dbParams} attribute is used to
        construct the temporary database.
    """
    self._histogram = histogram

    # A top-level import of Database would be circular. FindParameters
    # is imported locally as well.
    from light.database import Database
    from light.parameters import FindParameters

    selfDb = Database(database.dbParams)
    _, queryIndex, _ = selfDb.addSubject(query)
    selfMatch = selfDb.find(
        query, FindParameters(significanceMethod='Always'),
        storeFullAnalysis=True)

    heights = sorted(
        map(len, selfMatch.analysis[queryIndex]['histogram'].bins),
        reverse=True)
    # Index 1, not 0: the tallest bin is deliberately ignored.
    self.significanceCutoff = heights[1]

    self.analysis = dict(
        significanceMethod=self.__class__.__name__,
        significanceCutoff=self.significanceCutoff,
    )
def testFindSelf(self):
    """
    Check whether a sequence matches itself under the landmark and trig
    point finders configured on this instance.

    The outcome is recorded in C{self.details}: a 'result' flag and,
    when the read was matched, the best bin score under 'score'.
    """
    database = Database(dbParams=DatabaseParameters(
        landmarks=self.LANDMARKS,
        trigPoints=self.TRIG_POINTS,
        limitPerLandmark=self.LIMIT_PER_LANDMARK,
        maxDistance=self.MAX_DISTANCE,
        minDistance=self.MIN_DISTANCE))

    # The same read is used both as the subject and as the query.
    read = AARead(self.ID, self.SEQUENCE)
    _, subjectIndex, _ = database.addSubject(read)
    analysis = database.find(read).analysis

    if subjectIndex not in analysis:
        self.details = {'result': False}
        return

    self.details = {
        'result': True,
        'score': analysis[subjectIndex]['bestBinScore'],
    }
MUMMYPOX = AARead('Mummypox',
                  _BETA + _ALPHA + _ALPHA + _ALPHA + _ALPHA + _TRYPTOPHAN)

SQUIRRELPOX = AARead('Squirrelpox', _ALPHA + _BETA + _BETA + _ALPHA + _BETA)

# Add every subject to DB, keeping the index returned for each one so
# that its entry in a find result's analysis can be looked up below.
_, _CATPOX_INDEX, _ = DB.addSubject(CATPOX)
_, _COWPOX_INDEX, _ = DB.addSubject(COWPOX)
_, _MONKEYPOX_INDEX, _ = DB.addSubject(MONKEYPOX)
_, _MUMMYPOX_INDEX, _ = DB.addSubject(MUMMYPOX)
_, _SQUIRRELPOX_INDEX, _ = DB.addSubject(SQUIRRELPOX)

# Run find on a read that matches squirrelpox and catpox.
READ0 = AARead('read0', _ALPHA + _BETA + _BETA + _ALPHA + _BETA)
_findParams = FindParameters(significanceFraction=0.2)
_result = DB.find(READ0, _findParams, storeFullAnalysis=True)
READ0_SQUIRRELPOX_SCORE = _result.analysis[_SQUIRRELPOX_INDEX]['bestBinScore']
READ0_CATPOX_SCORE = _result.analysis[_CATPOX_INDEX]['bestBinScore']
# The result as saved to an in-memory file, kept as a string.
RECORD0 = _result.save(StringIO()).getvalue()

# Run find on a read that matches both monkeypox and mummypox.
READ1 = AARead('read1',
               _BETA + _ALPHA + _ALPHA + _ALPHA + _BETA + _TRYPTOPHAN)
# NOTE: _findParams and _result are rebound here; the READ0 values above
# were extracted before this point.
_findParams = FindParameters(significanceFraction=0.25)
_result = DB.find(READ1, _findParams, storeFullAnalysis=True)
READ1_MONKEYPOX_SCORE = _result.analysis[_MONKEYPOX_INDEX]['bestBinScore']
# Score of the second entry in monkeypox's significant bins.
READ1_MONKEYPOX_HSP2_SCORE = _result.analysis[_MONKEYPOX_INDEX][
    'significantBins'][1]['score']
READ1_MUMMYPOX_SCORE = _result.analysis[_MUMMYPOX_INDEX]['bestBinScore']
# The result as saved to an in-memory file, kept as a string.
RECORD1 = _result.save(StringIO()).getvalue()

# Run find on a read that matches only cowpox.