def testGetScannedPairs(self):
    """
    The getScannedPairs method must return pairs of (landmark, trigPoint)
    for a scanned subject.
    """
    read = AARead('subject', 'FRRRFRRRFASAASA')
    params = DatabaseParameters(landmarks=[AlphaHelix], trigPoints=[Peaks],
                                distanceBase=1.0)
    backend = Backend()
    backend.configure(params)
    backend.addSubject(read, '0')
    scanned = backend.scan(read)
    pairs = list(backend.getScannedPairs(scanned))

    # Both expected pairs carry the same landmark; only the last argument
    # of the expected trig point differs (10 for the first pair, 13 for the
    # second).
    for index, trigPointArg in enumerate((10, 13)):
        landmark, trigPoint = pairs[index]
        self.assertEqual(
            Landmark(AlphaHelix.NAME, AlphaHelix.SYMBOL, 0, 9, 2),
            landmark)
        self.assertEqual(
            TrigPoint(Peaks.NAME, Peaks.SYMBOL, trigPointArg),
            trigPoint)

    # There must be no pairs beyond the two checked above.
    self.assertEqual(2, len(pairs))
def __init__(self, sequences, labels, defaultLabel=None, **kwargs):
    """
    Prepare a cluster analysis of a set of sequences: scan every read for
    (landmark, trig point) pairs, count the scaled offset deltas found in
    each read, and build a symmetric read-by-read affinity matrix from
    those counts.

    This is a base class for evaluating how well various feature finders
    and database parameter settings can separate a set of sequences.

    @param sequences: Either a C{str} filename of sequences to consider or
        a C{light.reads.Reads} instance.
    @param labels: A C{dict} with a label for each sequence id in
        C{sequences}. These are the known categories of each sequence.
    @param defaultLabel: If not C{None}, a label to use for reads whose ids
        are not present in C{labels}. If C{None} and a read id has no label
        a C{ValueError} is raised.
    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    @raises ValueError: If the id of a read is not in C{labels} and no
        default label has been set, or if there are no reads in
        C{sequences}.
    """
    # A string is taken to be a FASTA filename; anything else is used
    # as the collection of reads directly.
    reads = (FastaReads(sequences, readClass=AAReadWithX, upperCase=True)
             if isinstance(sequences, str) else sequences)

    db = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
    scanner = Backend()
    scanner.configure(db.dbParams)

    knownLabels = []
    deltaCounts = []

    for read in reads:
        label = labels.get(read.id, defaultLabel)
        if label is None:
            raise ValueError(
                'Read %r has no corresponding label' % read.id)
        knownLabels.append(label)

        # Count how often each scaled (trig point - landmark) offset
        # delta occurs among the pairs found in this read.
        scanned = scanner.scan(read)
        deltaCounts.append(Counter(
            scaleLog(trigPoint.offset - landmark.offset,
                     db.dbParams.distanceBase)
            for landmark, trigPoint in scanner.getScannedPairs(scanned)))

    readCount = len(reads)
    if readCount == 0:
        raise ValueError('No sequences were found in %r' % sequences)

    # There is deliberately no check that len(reads) == len(labels):
    # extra labels are ignored, which makes interactive use of this class
    # more convenient.

    # Create the affinity matrix. Every cell starts at 1.0 so the diagonal
    # never needs to be initialized separately.
    matrix = np.ones((readCount, readCount))
    for row, rowDeltas in enumerate(deltaCounts):
        for col in range(row + 1, readCount):
            # Only the upper triangle is computed; the value is mirrored
            # into the lower triangle to keep the matrix symmetric.
            score = self.affinityFromOffsetDeltas(
                rowDeltas, deltaCounts[col])
            matrix[row, col] = score
            matrix[col, row] = score

    self.nReads = readCount
    self.affinity = matrix
    self.trueLabels = knownLabels