示例#1
0
    def test_update_index_with_hash_index(self) -> None:
        # Similar test to `test_update_index_add_new_descriptors` but with a
        # linear hash index.
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        linear_hi = LinearHashIndex()  # simplest hash index, heap-sorts.
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs, linear_hi)

        descriptors1 = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5),
            DescriptorMemoryElement('t', 6),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors1 + descriptors2:
            d.set_vector(np.ones(1, float) * d.uuid())

        # Build initial index.
        index.build_index(descriptors1)
        # Initial hash index should only encode hashes for first batch of
        # descriptors.
        self.assertSetEqual(linear_hi.index, {0, 1, 2, 3, 4})

        # Update index and check that components have new data.
        index.update_index(descriptors2)
        # Now the hash index should include all descriptor hashes.
        self.assertSetEqual(linear_hi.index, {0, 1, 2, 3, 4, 5, 6})
示例#2
0
    def test_build_index_fresh_build(self) -> None:
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)

        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for i, d in enumerate(descriptors):
            d.set_vector(np.ones(1, float) * i)
        index.build_index(descriptors)

        # Make sure descriptors are now in attached index and in
        # key-value-store.
        self.assertEqual(descr_set.count(), 5)
        for d in descriptors:
            self.assertIn(d, descr_set)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 5)
        for i in range(5):
            self.assertSetEqual(hash_kvs.get(i), {i})
示例#3
0
    def test_remove_from_index(self) -> None:
        # Test that removing by UIDs does the correct thing.

        # Descriptors are 1 dim, value == index.
        descriptors = [
            DescriptorMemoryElement(0),
            DescriptorMemoryElement(1),
            DescriptorMemoryElement(2),
            DescriptorMemoryElement(3),
            DescriptorMemoryElement(4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors:
            d.set_vector(np.ones(1, float) * d.uuid())
        d_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
        idx.build_index(descriptors)

        # Attempt removing 1 uid.
        idx.remove_from_index([3])
        assert isinstance(idx.descriptor_set, MemoryDescriptorSet)
        self.assertEqual(idx.descriptor_set._table, {
            0: descriptors[0],
            1: descriptors[1],
            2: descriptors[2],
            4: descriptors[4],
        })
        assert isinstance(idx.hash2uuids_kvstore, MemoryKeyValueStore)
        self.assertEqual(idx.hash2uuids_kvstore._table, {
            0: {0},
            1: {1},
            2: {2},
            4: {4},
        })
示例#4
0
    def test_remove_from_index_invalid_uid(self) -> None:
        # Test that attempting to remove a single invalid UID causes a key
        # error and does not affect index.

        # Descriptors are 1 dim, value == index.
        descriptors = [
            DescriptorMemoryElement(0),
            DescriptorMemoryElement(1),
            DescriptorMemoryElement(2),
            DescriptorMemoryElement(3),
            DescriptorMemoryElement(4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors:
            d.set_vector(np.ones(1, float) * d.uuid())
        # uid -> descriptor
        expected_dset_table = {
            0: descriptors[0],
            1: descriptors[1],
            2: descriptors[2],
            3: descriptors[3],
            4: descriptors[4],
        }
        # hash int -> set[uid]
        expected_kvs_table = {
            0: {0},
            1: {1},
            2: {2},
            3: {3},
            4: {4},
        }

        d_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
        idx.build_index(descriptors)
        # Assert we have the correct expected values
        assert isinstance(idx.descriptor_set, MemoryDescriptorSet)
        self.assertEqual(idx.descriptor_set._table, expected_dset_table)
        assert isinstance(idx.hash2uuids_kvstore, MemoryKeyValueStore)
        self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)

        # Attempt to remove descriptor with a UID we did not build with.
        self.assertRaisesRegex(
            KeyError, '5',
            idx.remove_from_index, [5]
        )
        # Index should not have been modified.
        self.assertEqual(idx.descriptor_set._table, expected_dset_table)
        self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)

        # Attempt to remove multiple UIDs, one valid and one invalid
        self.assertRaisesRegex(
            KeyError, '5',
            idx.remove_from_index, [2, 5]
        )
        # Index should not have been modified.
        self.assertEqual(idx.descriptor_set._table, expected_dset_table)
        self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)
示例#5
0
    def test_remove_from_index_shared_hashes(self) -> None:
        """
        Test that removing a descriptor (by UID) that shares a hash with other
        descriptors does not trigger removal of its hash.
        """
        # Simulate descriptors all hashing to the same hash value: 0
        hash_func = DummyHashFunctor()
        hash_func.get_hash = mock.Mock(return_value=np.asarray(
            [0], bool))  # type: ignore

        d_set = MemoryDescriptorSet()
        hash2uids_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uids_kvs)

        # Descriptors are 1 dim, value == index.
        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors:
            d.set_vector(np.ones(1, float) * d.uuid())
        idx.build_index(descriptors)
        # We expect the descriptor-set and kvs to look like the following now:
        self.assertDictEqual(
            d_set._table, {
                0: descriptors[0],
                1: descriptors[1],
                2: descriptors[2],
                3: descriptors[3],
                4: descriptors[4],
            })
        self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 2, 3, 4}})

        # Mock out hash index as if we had an implementation so we can check
        # call to its remove_from_index method.
        idx.hash_index = mock.Mock(spec=HashIndex)

        idx.remove_from_index([2, 4])

        # Only uid 2 and 4 descriptors should be gone from d-set, kvs should
        # still have the 0 key and its set value should only contain uids 0, 1
        # and 3.  `hash_index.remove_from_index` should not be called because
        # no hashes should be marked for removal.
        self.assertDictEqual(d_set._table, {
            0: descriptors[0],
            1: descriptors[1],
            3: descriptors[3],
        })
        self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 3}})
        idx.hash_index.remove_from_index.assert_not_called()
示例#6
0
    def _known_unit(
        self,
        hash_ftor: LshFunctor,
        hash_idx: Optional[HashIndex],
        dist_method: str,
        ftor_train_hook: Callable[[Iterable[DescriptorElement]],
                                  None] = lambda d: None
    ) -> None:
        ###
        # Unit vectors - Equal distance
        #
        dim = 5
        test_descriptors = []
        for i in range(dim):
            v = np.zeros(dim, float)
            v[i] = 1.
            test_descriptors.append(
                DescriptorMemoryElement('unit', i).set_vector(v))

        ftor_train_hook(test_descriptors)

        di = MemoryDescriptorSet()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor,
                                        di,
                                        kvstore,
                                        hash_index=hash_idx,
                                        distance_method=dist_method)
        index.build_index(test_descriptors)

        # query with zero vector
        # -> all modeled descriptors have no intersection, dists should be 1.0,
        #    or maximum distance by histogram intersection
        q = DescriptorMemoryElement('query', 0)
        q.set_vector(np.zeros(dim, float))
        r, dists = index.nn(q, dim)
        # All dists should be 1.0, r order doesn't matter
        for d in dists:
            self.assertEqual(d, 1.)

        # query with index element
        q = test_descriptors[3]
        r, dists = index.nn(q, 1)
        self.assertEqual(r[0], q)
        self.assertEqual(dists[0], 0.)

        r, dists = index.nn(q, dim)
        self.assertEqual(r[0], q)
        self.assertEqual(dists[0], 0.)
示例#7
0
    def test_update_index_similar_descriptors(self) -> None:
        """
        Test that updating a built index with similar descriptors (same
        vectors, different UUIDs) results in contained structures having an
        expected state.
        """
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)

        # Similar Descriptors to build and update on (different instances)
        descriptors1 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5).set_vector([0]),
            DescriptorMemoryElement('t', 6).set_vector([1]),
            DescriptorMemoryElement('t', 7).set_vector([2]),
            DescriptorMemoryElement('t', 8).set_vector([3]),
            DescriptorMemoryElement('t', 9).set_vector([4]),
        ]

        index.build_index(descriptors1)
        index.update_index(descriptors2)

        assert descr_set.count() == 10
        # Above descriptors should be considered "in" the descriptor set now.
        for d in descriptors1:
            assert d in descr_set
        for d in descriptors2:
            assert d in descr_set
        # Known hashes of the above descriptors should be in the KVS
        assert set(hash_kvs.keys()) == {0, 1, 2, 3, 4}
        assert hash_kvs.get(0) == {0, 5}
        assert hash_kvs.get(1) == {1, 6}
        assert hash_kvs.get(2) == {2, 7}
        assert hash_kvs.get(3) == {3, 8}
        assert hash_kvs.get(4) == {4, 9}
示例#8
0
    def test_update_index_add_new_descriptors(self) -> None:
        # Test that calling update index after a build index causes index
        # components to be properly updated.
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)
        descriptors1 = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5),
            DescriptorMemoryElement('t', 6),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors1 + descriptors2:
            d.set_vector(np.ones(1, float) * d.uuid())

        # Build initial index.
        index.build_index(descriptors1)
        self.assertEqual(descr_set.count(), 5)
        for d in descriptors1:
            self.assertIn(d, descr_set)
        for d in descriptors2:
            self.assertNotIn(d, descr_set)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 5)
        for i in range(5):
            self.assertSetEqual(hash_kvs.get(i), {i})

        # Update index and check that components have new data.
        index.update_index(descriptors2)
        self.assertEqual(descr_set.count(), 7)
        for d in descriptors1 + descriptors2:
            self.assertIn(d, descr_set)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 7)
        for i in range(7):
            self.assertSetEqual(hash_kvs.get(i), {i})
示例#9
0
    def _known_ordered_euclidean(
        self,
        hash_ftor: LshFunctor,
        hash_idx: Optional[HashIndex],
        ftor_train_hook: Callable[[Iterable[DescriptorElement]],
                                  None] = lambda d: None
    ) -> None:
        # make vectors to return in a known euclidean distance order
        i = 1000
        test_descriptors = [
            DescriptorMemoryElement('ordered',
                                    j).set_vector(np.array([j, j * 2], float))
            for j in range(i)
        ]
        random.shuffle(test_descriptors)

        ftor_train_hook(test_descriptors)

        di = MemoryDescriptorSet()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor,
                                        di,
                                        kvstore,
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(test_descriptors)

        # Since descriptors were built in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in index order.
        q = DescriptorMemoryElement('query', i)
        q.set_vector(np.array([0, 0], float))
        # top result should have UUID == 0 (nearest to query)
        r, dists = index.nn(q, 5)
        self.assertEqual(r[0].uuid(), 0)
        self.assertEqual(r[1].uuid(), 1)
        self.assertEqual(r[2].uuid(), 2)
        self.assertEqual(r[3].uuid(), 3)
        self.assertEqual(r[4].uuid(), 4)
        # global search should be in complete order
        r, dists = index.nn(q, i)
        for j, d, dist in zip(range(i), r, dists):
            self.assertEqual(d.uuid(), j)
示例#10
0
    def test_update_index_duplicate_descriptors(self) -> None:
        """
        Test that updating a built index with the same descriptors results in
        idempotent behavior.
        """
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs)

        # Identical Descriptors to build and update on (different instances)
        descriptors1 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]

        index.build_index(descriptors1)
        index.update_index(descriptors2)

        assert descr_set.count() == 5
        # Above descriptors should be considered "in" the descriptor set now.
        for d in descriptors1:
            assert d in descr_set
        for d in descriptors2:
            assert d in descr_set
        # Known hashes of the above descriptors should be in the KVS
        assert set(hash_kvs.keys()) == {0, 1, 2, 3, 4}
        assert hash_kvs.get(0) == {0}
        assert hash_kvs.get(1) == {1}
        assert hash_kvs.get(2) == {2}
        assert hash_kvs.get(3) == {3}
        assert hash_kvs.get(4) == {4}
示例#11
0
    def test_build_index_fresh_build_with_hash_index(self) -> None:
        descr_set = MemoryDescriptorSet()
        hash_kvs = MemoryKeyValueStore()
        linear_hi = LinearHashIndex()  # simplest hash index, heap-sorts.
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set,
                                        hash_kvs, linear_hi)

        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for i, d in enumerate(descriptors):
            d.set_vector(np.ones(1, float) * i)
        index.build_index(descriptors)
        # Hash index should have been built with hash vectors, and linearHI
        # converts those to integers for storage.
        self.assertEqual(linear_hi.index, {0, 1, 2, 3, 4})
示例#12
0
    def _random_euclidean(
        self,
        hash_ftor: LshFunctor,
        hash_idx: Optional[HashIndex],
        ftor_train_hook: Callable[[Iterable[DescriptorElement]],
                                  None] = lambda d: None
    ) -> None:
        # :param hash_ftor: Hash function class for generating hash codes for
        #   descriptors.
        # :param hash_idx: Hash index instance to use in local LSH algo
        #   instance.
        # :param ftor_train_hook: Function for training functor if necessary.

        # make random descriptors
        i = 1000
        dim = 256
        td = []
        np.random.seed(self.RANDOM_SEED)
        for j in range(i):
            d = DescriptorMemoryElement('random', j)
            d.set_vector(np.random.rand(dim))
            td.append(d)

        ftor_train_hook(td)

        di = MemoryDescriptorSet()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor,
                                        di,
                                        kvstore,
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(td)

        # test query from build set -- should return same descriptor when k=1
        q = td[255]
        r, dists = index.nn(q, 1)
        self.assertEqual(r[0], q)

        # test query very near a build vector
        td_q = td[0]
        q = DescriptorMemoryElement('query', i)
        td_q_v = td_q.vector()
        assert td_q_v is not None
        v = td_q_v.copy()
        v_min = max(v.min(), 0.1)
        v[0] += v_min
        v[dim - 1] -= v_min
        q.set_vector(v)
        r, dists = index.nn(q, 1)
        self.assertFalse(np.array_equal(q.vector(), td_q.vector()))
        self.assertEqual(r[0], td_q)

        # random query
        q = DescriptorMemoryElement('query', i + 1)
        q.set_vector(np.random.rand(dim))

        # for any query of size k, results should at least be in distance order
        r, dists = index.nn(q, 10)
        for j in range(1, len(dists)):
            self.assertGreater(dists[j], dists[j - 1])
        r, dists = index.nn(q, i)
        for j in range(1, len(dists)):
            self.assertGreater(dists[j], dists[j - 1])