Exemplo n.º 1
0
 def test_distance(self):
     """
     Check flamingo.distance against a handful of string pairs whose
     Levenshtein distance is known in advance
     """
     # each entry: (first string, second string, expected distance)
     known_distances = [
         ("apple", "aaple", 1),
         ("A", "AAA", 2),
         ("apple", "applesauce", 5),
     ]
     for left, right, expected in known_distances:
         self.assertEqual(flamingo.distance(left, right), expected)
Exemplo n.º 2
0
 def test_distance(self):
     """
     Check flamingo.distance against a handful of string pairs whose
     Levenshtein distance is known in advance
     """
     # each entry: (first string, second string, expected distance)
     known_distances = [
         ("apple", "aaple", 1),
         ("A", "AAA", 2),
         ("apple", "applesauce", 5),
     ]
     for left, right, expected in known_distances:
         self.assertEqual(flamingo.distance(left, right), expected)
Exemplo n.º 3
0
    def search(self, barcode, distance, verbose=False, details=False, unique=False):
        """Search for the object mapping from a barcode.

        Args:
            barcode: the query barcode.
            distance: largest distance allowed between the query and the
                barcode it is matched to.
            verbose: accepted for interface compatibility; not used here.
            details: if true, return a ``(object, matched barcode)`` tuple
                instead of only the object.
            unique: selects which of the two caches is consulted and is
                forwarded to ``closest_match``.

        Returns:
            The mapped object (or the tuple described under ``details``),
            or None when nothing acceptable is found.
        """
        cache = self.cache_dicts[unique]
        cache_match = cache.get(barcode)
        # identity test: None is the only "nothing cached" marker, and a
        # cached object with a custom __eq__/__ne__ must not confuse it
        if cache_match is not None:
            # this is the closest match, so if it doesn't fit with this
            # distance, it won't work
            original = self.original[cache_match]
            if flamingo.distance(barcode, original) > distance:
                return None

            return (cache_match, original) if details else cache_match

        if distance == 0:
            # won't be able to find any inexact matches anyway
            return None

        # inexact match
        matches = self.index.search(barcode, distance)

        if len(matches) == 0:
            # NOTE(review): the lookup above uses .get(), so a stored None
            # is indistinguishable from a missing key and later calls still
            # reach the index; kept to preserve the cache contents.
            cache[barcode] = None
            return None

        best = closest_match(barcode, matches, unique=unique)

        if best is None:
            return None

        ret = self.barcode_dict[best]
        cache[barcode] = ret

        return (ret, best) if details else ret
Exemplo n.º 4
0
    def test_barcode_cache_multiple_len(self):
        """Exercise BarcodeCacheMultipleLen on fixed and random catalogs"""
        letters = "ABCDEFG"

        # every letter contributes a 5-long and a 6-long repeated barcode
        words = []
        for letter in letters:
            words.extend([letter * 5, letter * 6])

        cache = matching.BarcodeCacheMultipleLen({w: w for w in words})

        # catalog members map to themselves at every distance, 0 included
        for d in range(0, 4):
            for letter in letters:
                for length in (5, 6):
                    member = letter * length
                    self.assertEqual(cache.search(member, d), member)

        # longer repeats of a letter resolve to that letter's 6-long entry
        for d in range(1, 4):
            for letter in letters:
                for length in (7, 8):
                    self.assertEqual(cache.search(letter * length, d),
                                     letter * 6)

        # check negative results as well
        for repeat in range(10):
            self.assertEqual(cache.search("T" * repeat, 1), None)

        # create and test a random, unique catalog
        barcode_len = 9
        catalog = list({random_barcode(barcode_len) for _ in range(2000)})

        cache = matching.BarcodeCacheMultipleLen({w: w for w in catalog})
        for member in catalog:
            for d in range(4):
                self.assertEqual(cache.search(member, d), member)

        # pick random barcodes and see that they match correctly
        for _ in range(250):
            query = random_barcode(barcode_len)
            closest = brute_force_levenshtein(query, catalog)
            dists = [flamingo.distance(query, m) for m in closest]
            # sanity check on brute_force_levenshtein:
            self.assertEqual(len(set(dists)), 1)
            best = dists[0]

            if len(closest) == 1:
                # a single nearest neighbour: the cache must find it, both
                # at exactly that distance and with one extra unit of slack
                only = closest[0]
                self.assertEqual(cache.search(query, best, unique=True), only)
                self.assertEqual(cache.search(query, best + 1, unique=True),
                                 only)
            else:
                # tied nearest neighbours: a unique search must give up
                self.assertEqual(cache.search(query, best, unique=True), None)

            self.assertTrue(cache.search(query, best, unique=False) in closest)

            # make sure it can't find anything closer
            for d in range(best):
                self.assertEqual(cache.search(query, d), None)

        # test an unusual case- close to ones of multiple length
        words = ["BAAAA", "BAAAAA"]
        cache = matching.BarcodeCacheMultipleLen({w: w for w in words})
        self.assertEqual(cache.search("AAAAA", 1), "BAAAA")
        expected_details = ("BAAAA", "BAAAA", 5)
        self.assertEqual(cache.search("AAAAA", 1, details=True),
                         expected_details)
Exemplo n.º 5
0
    def test_barcode_cache_multiple_len(self):
        """Exercise BarcodeCacheMultipleLen on fixed and random catalogs"""
        letters = "ABCDEFG"

        # every letter contributes a 5-long and a 6-long repeated barcode
        words = []
        for letter in letters:
            words.extend([letter * 5, letter * 6])

        cache = matching.BarcodeCacheMultipleLen({w: w for w in words})

        # catalog members map to themselves at every distance, 0 included
        for d in range(0, 4):
            for letter in letters:
                for length in (5, 6):
                    member = letter * length
                    self.assertEqual(cache.search(member, d), member)

        # longer repeats of a letter resolve to that letter's 6-long entry
        for d in range(1, 4):
            for letter in letters:
                for length in (7, 8):
                    self.assertEqual(cache.search(letter * length, d), letter * 6)

        # check negative results as well
        for repeat in range(10):
            self.assertEqual(cache.search("T" * repeat, 1), None)

        # create and test a random, unique catalog
        barcode_len = 9
        catalog = list({random_barcode(barcode_len) for _ in range(2000)})

        cache = matching.BarcodeCacheMultipleLen({w: w for w in catalog})
        for member in catalog:
            for d in range(4):
                self.assertEqual(cache.search(member, d), member)

        # pick random barcodes and see that they match correctly
        for _ in range(250):
            query = random_barcode(barcode_len)
            closest = brute_force_levenshtein(query, catalog)
            dists = [flamingo.distance(query, m) for m in closest]
            # sanity check on brute_force_levenshtein:
            self.assertEqual(len(set(dists)), 1)
            best = dists[0]

            if len(closest) == 1:
                # a single nearest neighbour: the cache must find it, both
                # at exactly that distance and with one extra unit of slack
                only = closest[0]
                self.assertEqual(cache.search(query, best, unique=True), only)
                self.assertEqual(cache.search(query, best + 1, unique=True), only)
            else:
                # tied nearest neighbours: a unique search must give up
                self.assertEqual(cache.search(query, best, unique=True), None)

            self.assertTrue(cache.search(query, best, unique=False) in closest)

            # make sure it can't find anything closer
            for d in range(best):
                self.assertEqual(cache.search(query, d), None)

        # test an unusual case- close to ones of multiple length
        words = ["BAAAA", "BAAAAA"]
        cache = matching.BarcodeCacheMultipleLen({w: w for w in words})
        self.assertEqual(cache.search("AAAAA", 1), "BAAAA")
        expected_details = ("BAAAA", "BAAAA", 5)
        self.assertEqual(cache.search("AAAAA", 1, details=True), expected_details)
Exemplo n.º 6
0
def closest_match(original, matches, unique=False):
    """
    Return the closest Levenshtein match if there is one.

    Args:
        original: the query string the candidates are compared against.
        matches: sized, indexable collection of candidate strings.
        unique: if true, return None when there is a tie for closest.

    Returns:
        The candidate with the smallest flamingo.distance to `original`,
        or None when `matches` is empty or when `unique` is set and the
        smallest distance is shared by more than one candidate. Without
        `unique`, a tie is broken by the ordering of the candidates
        themselves (the smaller one wins).
    """
    # no candidates means no match; previously min() raised ValueError here
    if len(matches) == 0:
        return None

    # a lone candidate wins without computing any distance
    if len(matches) == 1:
        return matches[0]

    distances = [flamingo.distance(m, original) for m in matches]

    if unique and distances.count(min(distances)) > 1:
        return None

    # tuples compare by distance first, then by the candidate itself
    return min(zip(distances, matches))[1]
Exemplo n.º 7
0
def closest_match(original, matches, unique=False):
    """
    Return the closest Levenshtein match if there is one.

    Args:
        original: the query string the candidates are compared against.
        matches: sized, indexable collection of candidate strings.
        unique: if true, return None when there is a tie for closest.

    Returns:
        The candidate with the smallest flamingo.distance to `original`,
        or None when `matches` is empty or when `unique` is set and the
        smallest distance is shared by more than one candidate. Without
        `unique`, a tie is broken by the ordering of the candidates
        themselves (the smaller one wins).
    """
    # no candidates means no match; previously min() raised ValueError here
    if len(matches) == 0:
        return None

    # a lone candidate wins without computing any distance
    if len(matches) == 1:
        return matches[0]

    distances = [flamingo.distance(m, original) for m in matches]

    if unique and distances.count(min(distances)) > 1:
        return None

    # tuples compare by distance first, then by the candidate itself
    return min(zip(distances, matches))[1]
Exemplo n.º 8
0
    def search(self,
               barcode,
               distance,
               verbose=False,
               details=False,
               unique=False):
        """Search for the object mapping from a barcode.

        Args:
            barcode: the query barcode.
            distance: largest distance allowed between the query and the
                barcode it is matched to.
            verbose: accepted for interface compatibility; not used here.
            details: if true, return a ``(object, matched barcode)`` tuple
                instead of only the object.
            unique: selects which of the two caches is consulted and is
                forwarded to ``closest_match``.

        Returns:
            The mapped object (or the tuple described under ``details``),
            or None when nothing acceptable is found.
        """
        cache = self.cache_dicts[unique]
        cache_match = cache.get(barcode)
        # identity test: None is the only "nothing cached" marker, and a
        # cached object with a custom __eq__/__ne__ must not confuse it
        if cache_match is not None:
            # this is the closest match, so if it doesn't fit with this
            # distance, it won't work
            original = self.original[cache_match]
            if flamingo.distance(barcode, original) > distance:
                return None

            return (cache_match, original) if details else cache_match

        if distance == 0:
            # won't be able to find any inexact matches anyway
            return None

        # inexact match
        matches = self.index.search(barcode, distance)

        if len(matches) == 0:
            # NOTE(review): the lookup above uses .get(), so a stored None
            # is indistinguishable from a missing key and later calls still
            # reach the index; kept to preserve the cache contents.
            cache[barcode] = None
            return None

        best = closest_match(barcode, matches, unique=unique)

        if best is None:
            return None

        ret = self.barcode_dict[best]
        cache[barcode] = ret

        return (ret, best) if details else ret
Exemplo n.º 9
0
 def check_consistent(self, query, results, dist, msg=None):
     """assert that no result is farther than dist from the query"""
     for candidate in results:
         observed = flamingo.distance(query, candidate)
         within_limit = dist >= observed
         self.assertTrue(within_limit, msg=msg)
Exemplo n.º 10
0
def brute_force_levenshtein(w, lst):
    """Find the closest Levenshtein matches to a word by brute force.

    Args:
        w: the query string.
        lst: iterable of candidate strings.

    Returns:
        A list of every candidate whose flamingo.distance to `w` equals
        the smallest distance seen, in their order of appearance in `lst`.
        An empty `lst` yields an empty list.
    """
    scored = [(candidate, flamingo.distance(w, candidate)) for candidate in lst]

    # nothing to compare against; min() below would raise ValueError
    if not scored:
        return []

    best_distance = min(d for _, d in scored)
    return [candidate for candidate, d in scored if d == best_distance]
Exemplo n.º 11
0
 def check_consistent(self, query, results, dist, msg=None):
     """assert that no result is farther than dist from the query"""
     for candidate in results:
         observed = flamingo.distance(query, candidate)
         within_limit = dist >= observed
         self.assertTrue(within_limit, msg=msg)
Exemplo n.º 12
0
def brute_force_levenshtein(w, lst):
    """Find the closest Levenshtein matches to a word by brute force.

    Args:
        w: the query string.
        lst: iterable of candidate strings.

    Returns:
        A list of every candidate whose flamingo.distance to `w` equals
        the smallest distance seen, in their order of appearance in `lst`.
        An empty `lst` yields an empty list.
    """
    scored = [(candidate, flamingo.distance(w, candidate)) for candidate in lst]

    # nothing to compare against; min() below would raise ValueError
    if not scored:
        return []

    best_distance = min(d for _, d in scored)
    return [candidate for candidate, d in scored if d == best_distance]