def test_distance(self):
    """Check flamingo.distance against known Levenshtein distances."""
    # (first string, second string, expected edit distance)
    cases = (
        ("apple", "aaple", 1),
        ("A", "AAA", 2),
        ("apple", "applesauce", 5),
    )
    for left, right, expected in cases:
        self.assertEqual(flamingo.distance(left, right), expected)
def search(self, barcode, distance, verbose=False, details=False, unique=False):
    """Search for the object a barcode maps to, allowing inexact matches.

    The cache selected by ``unique`` is consulted first. On a miss, the
    index is queried for candidates within ``distance`` and the closest
    one (as chosen by ``closest_match``) is looked up and cached.

    Args:
        barcode: the sequence to look up.
        distance: largest Levenshtein distance accepted between
            ``barcode`` and the matched catalog barcode.
        verbose: accepted for interface compatibility; not read here.
        details: when true, return ``(object, matched_barcode)`` instead
            of just the object.
        unique: selects which cache is used and is forwarded to
            ``closest_match`` (ties for closest then give ``None``).

    Returns:
        The mapped object, or the ``(object, matched_barcode)`` tuple when
        ``details`` is true; ``None`` when nothing acceptable is found.
    """
    cache = self.cache_dicts[unique]

    cached = cache.get(barcode)
    # Identity check: mapped objects may define their own __eq__/__ne__.
    if cached is not None:
        # this is the closest match, so if it doesn't fit within this
        # distance, nothing else will
        original = self.original[cached]
        if flamingo.distance(barcode, original) > distance:
            return None
        return (cached, original) if details else cached

    if distance == 0:
        # won't be able to find any inexact matches anyway
        return None

    # inexact match
    matches = self.index.search(barcode, distance)
    if len(matches) == 0:
        # A stored None is indistinguishable from a missing key for the
        # .get() lookup above, so this entry never short-circuits a later
        # search; the write is kept to leave the cache contents unchanged.
        cache[barcode] = None
        return None

    best = closest_match(barcode, matches, unique=unique)
    if best is None:
        return None

    ret = self.barcode_dict[best]
    cache[barcode] = ret
    return (ret, best) if details else ret
def test_barcode_cache_multiple_len(self):
    """Exercise BarcodeCacheMultipleLen on fixed, random and mixed-length catalogs."""
    letters = "ABCDEFG"

    # catalog holding a 5-long and a 6-long repeat of every letter
    words = list(itertools.chain.from_iterable(
        (letter * 5, letter * 6) for letter in letters))
    cache = matching.BarcodeCacheMultipleLen({w: w for w in words})

    # catalog entries are found at every distance, including 0
    for d, letter in itertools.product(range(0, 4), letters):
        self.assertEqual(cache.search(letter * 5, d), letter * 5)
        self.assertEqual(cache.search(letter * 6, d), letter * 6)

    # longer repeats are expected to resolve to the 6-long entry
    for d, letter in itertools.product(range(1, 4), letters):
        self.assertEqual(cache.search(letter * 7, d), letter * 6)
        self.assertEqual(cache.search(letter * 8, d), letter * 6)

    # check negative results as well
    for count in range(10):
        self.assertEqual(cache.search("T" * count, 1), None)

    # create and test a random, unique catalog
    barcode_len = 9
    catalog = list(set(random_barcode(barcode_len) for _ in range(2000)))
    cache = matching.BarcodeCacheMultipleLen({w: w for w in catalog})
    for entry, d in itertools.product(catalog, range(4)):
        self.assertEqual(cache.search(entry, d), entry)

    # pick random barcodes and see that they match correctly
    for _ in range(250):
        query = random_barcode(barcode_len)
        nearest = brute_force_levenshtein(query, catalog)
        dists = [flamingo.distance(query, hit) for hit in nearest]

        # sanity check on brute_force_levenshtein: all hits are equidistant
        self.assertEqual(len(set(dists)), 1)
        best = dists[0]

        if len(nearest) > 1:
            # a tie must not yield a unique match
            self.assertEqual(cache.search(query, best, unique=True), None)
        else:
            # make sure the cache finds it
            self.assertEqual(
                cache.search(query, best, unique=True), nearest[0])
            self.assertEqual(
                cache.search(query, best + 1, unique=True), nearest[0])
        self.assertTrue(cache.search(query, best, unique=False) in nearest)

        # make sure it can't find anything closer
        for d in range(best):
            self.assertEqual(cache.search(query, d), None)

    # test an unusual case- close to ones of multiple length
    words = ["BAAAA", "BAAAAA"]
    cache = matching.BarcodeCacheMultipleLen({w: w for w in words})
    self.assertEqual(cache.search("AAAAA", 1), "BAAAA")
    self.assertEqual(cache.search("AAAAA", 1, details=True),
                     ("BAAAA", "BAAAA", 5))
def test_barcode_cache_multiple_len(self):
    """Basic checks of BarcodeCacheMultipleLen lookups at several distances."""
    alphabet = "ABCDEFG"

    # each letter contributes a 5-long and a 6-long barcode, in that order
    words = [ch * size for ch in alphabet for size in (5, 6)]
    cache = matching.BarcodeCacheMultipleLen(dict(zip(words, words)))

    for dist in range(4):
        for ch in alphabet:
            short, long_ = ch * 5, ch * 6
            self.assertEqual(cache.search(short, dist), short)
            self.assertEqual(cache.search(long_, dist), long_)

    for dist in range(1, 4):
        for ch in alphabet:
            expected = ch * 6
            self.assertEqual(cache.search(ch * 7, dist), expected)
            self.assertEqual(cache.search(ch * 8, dist), expected)

    # check negative results as well
    for repeat in range(10):
        miss = cache.search("T" * repeat, 1)
        self.assertEqual(miss, None)

    # create and test a random, unique catalog
    size = 9
    catalog = list(set([random_barcode(size) for _ in range(2000)]))
    cache = matching.BarcodeCacheMultipleLen(dict(zip(catalog, catalog)))
    for word in catalog:
        for dist in range(4):
            self.assertEqual(cache.search(word, dist), word)

    # pick random barcodes and see that they match correctly
    for _ in range(250):
        probe = random_barcode(size)
        closest = brute_force_levenshtein(probe, catalog)
        dists = [flamingo.distance(probe, match) for match in closest]

        # sanity check on brute_force_levenshtein:
        self.assertEqual(len(set(dists)), 1)
        nearest_dist = dists[0]

        tied = len(closest) > 1
        if tied:
            self.assertEqual(
                cache.search(probe, nearest_dist, unique=True), None)
        else:
            # make sure the cache finds it
            only = closest[0]
            self.assertEqual(
                cache.search(probe, nearest_dist, unique=True), only)
            self.assertEqual(
                cache.search(probe, nearest_dist + 1, unique=True), only)
        self.assertTrue(
            cache.search(probe, nearest_dist, unique=False) in closest)

        # make sure it can't find anything closer
        for dist in range(nearest_dist):
            self.assertEqual(cache.search(probe, dist), None)

    # test an unusual case- close to ones of multiple length
    words = ["BAAAA", "BAAAAA"]
    cache = matching.BarcodeCacheMultipleLen(dict(zip(words, words)))
    self.assertEqual(cache.search("AAAAA", 1), "BAAAA")
    detailed = cache.search("AAAAA", 1, details=True)
    self.assertEqual(detailed, ("BAAAA", "BAAAA", 5))
def closest_match(original, matches, unique=False):
    """Return the closest Levenshtein match to ``original``, if there is one.

    Args:
        original: the query the candidates are compared against.
        matches: sized, indexable collection of candidates.
        unique: when true, a tie for the smallest distance gives ``None``.

    Returns:
        The candidate with the smallest ``flamingo.distance`` to
        ``original``. ``None`` when ``matches`` is empty, or when
        ``unique`` is set and several candidates share the smallest
        distance.
    """
    if len(matches) == 0:
        # nothing to choose from; min() below would raise ValueError
        return None
    if len(matches) == 1:
        # a lone candidate is trivially the closest; skip the distance call
        return matches[0]

    distances = [flamingo.distance(m, original) for m in matches]
    best_distance = min(distances)
    if unique and distances.count(best_distance) > 1:
        return None
    # Pairs compare by distance first and then by the candidate itself, so
    # a non-unique tie resolves to the smallest candidate.
    return min(zip(distances, matches))[1]
def check_consistent(self, query, results, dist, msg=None):
    """Assert that every result lies within ``dist`` of ``query``.

    Args:
        query: the string the results were found for.
        results: iterable of matches to verify.
        dist: largest ``flamingo.distance`` a result may have from ``query``.
        msg: optional message forwarded to ``assertTrue``.
    """
    for candidate in results:
        within_range = dist >= flamingo.distance(query, candidate)
        self.assertTrue(within_range, msg=msg)
def brute_force_levenshtein(w, lst):
    """Find the closest Levenshtein matches to ``w`` by brute force.

    Every entry of ``lst`` is scored with ``flamingo.distance`` and all
    entries sharing the smallest score are returned, in their original
    order. An empty ``lst`` makes ``min`` raise ``ValueError``.
    """
    candidates = list(lst)
    scores = [flamingo.distance(w, entry) for entry in candidates]
    best_distance = min(scores)
    return [
        entry
        for entry, score in zip(candidates, scores)
        if score == best_distance
    ]