def test_alignment(self):
    # Each row is (needle, haystack, expected result of align_glocal).
    # The expected tuple is unpacked by callers as (edit distance, start
    # offset of the match inside the haystack).
    cases = (
        ("test", 'test', (0, 0)),    # identical strings
        ("test", ' test', (0, 1)),   # haystack has one leading extra char
        (" test", 'test', (1, 0)),   # needle has one leading extra char
        ("tast", 'test', (1, 0)),    # single substitution
        ("teste", 'tast', (2, 0)),   # substitution plus trailing extra char
        ("test", ' tast', (1, 1)),   # substitution with shifted start
    )
    for needle, haystack, expected in cases:
        self.assertEqual(align_glocal(needle, haystack), expected)
def find_ranges(needle, genome, max_edit):
    """
    Find all matches of needle in the FM-index genome within max_edit edit
    distance.

    The search is seed-and-verify: every length-k substring (k-mer) of the
    needle is looked up exactly, hits are grouped into candidate windows of
    the genome, and windows supported by more than max_edit k-mers are
    verified with align_glocal.

    Args:
        needle: the query sequence.
        genome: object providing search(substring), whose results are used
            as exact-match positions, and seq, the sequence that is sliced
            to extract candidate windows.
        max_edit: maximum number of edits allowed in a match.

    Returns:
        A tuple (alignments, best_edit). alignments is a list of
        (position, edit_distance) tuples, all sharing the smallest edit
        distance found; best_edit is that distance. When no window aligns
        within max_edit the result is ([], max_edit).
    """
    # Exact matching needs no seeding at all, so answer it before doing any
    # of the k-mer bookkeeping.
    if not max_edit:
        return [(h, 0) for h in genome.search(needle)], 0

    # Longest substring that must survive intact if the needle matches
    # within max_edit edits, assuming the worst-case spread of the edits.
    # Floor division keeps k an integer on both Python 2 and Python 3.
    # NOTE(review): when max_edit >= len(needle), k is <= 0 and the seeds
    # are empty strings; the outcome then depends on how genome.search
    # treats an empty query - confirm whether callers can hit this.
    k = (len(needle) - max_edit) // (max_edit + 1)

    # Sorted set of candidate windows, each stored as [start, end, seed_count].
    ranges = SortedSet(updator=OverlappingIntervalsUpdator)

    # offset is the position of the k-mer inside the needle.
    for offset in range(len(needle) - k + 1):
        for hit in genome.search(needle[offset:offset + k]):
            # Windows already recorded that contain this hit.
            overlaps = ranges.overlap_point(hit)
            if not len(overlaps):
                # New window: the worst-case extent of the whole needle
                # given where this k-mer sits in it. The start is clamped
                # to 0 because a negative slice index would wrap around to
                # the end of the sequence and lose hits near the beginning.
                window_start = max(0, hit - offset - max_edit)
                window_end = hit - offset + len(needle) + max_edit
                ranges.add([window_start, window_end, 1])
            else:
                # One more supporting k-mer for every window covering the hit.
                for overlap in overlaps:
                    overlap[2] += 1

    # Verify candidates with dynamic programming. Only windows supported by
    # more than max_edit k-mers can hold a match within max_edit edits.
    best_edit = max_edit
    alignments = []
    for candidate in ranges:
        if candidate[2] <= max_edit:
            continue
        window_start, window_end = candidate[0], candidate[1]
        ed, start = align_glocal(needle, genome.seq[window_start:window_end])
        if ed > best_edit:
            # Worse than the best so far (or beyond max_edit): not a result.
            continue
        if ed < best_edit:
            # Strictly better: everything collected so far is obsolete.
            best_edit = ed
            alignments = []
        alignments.append((window_start + start, ed))
    return alignments, best_edit