Пример #1
0
 def test_remove_from_index(self) -> None:
     """Removing hashes that are present drops exactly those entries."""
     hash_index = LinearHashIndex()
     # Seed the integer hash set directly instead of building it.
     hash_index.index = {0, 1, 2}
     # Bit vectors [0, 0] and [1, 0] are the integer hashes 0 and 2.
     to_remove = [[0, 0], [1, 0]]
     # noinspection PyTypeChecker
     hash_index.remove_from_index(to_remove)
     self.assertSetEqual(hash_index.index, {1})
Пример #2
0
 def test_from_config_with_cache(self) -> None:
     """A config naming a cache element type yields that cache type."""
     config = LinearHashIndex.get_default_config()
     # Point the cache slot of the default config at the in-memory element.
     config['cache_element']['type'] = (
         'smqtk_dataprovider.impls.data_element.memory.DataMemoryElement'
     )
     hash_index = LinearHashIndex.from_config(config)
     self.assertIsInstance(hash_index.cache_element, DataMemoryElement)
     self.assertEqual(hash_index.index, set())
Пример #3
0
 def test_build_index_with_cache(self) -> None:
     """Building with a cache element fills both the index and the cache."""
     cache = DataMemoryElement()
     hash_index = LinearHashIndex(cache)
     bit_vectors = [[0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 0, 1]]
     # noinspection PyTypeChecker
     hash_index.build_index(bit_vectors)
     # The four bit vectors map to the integer hashes 2, 4, 3 and 1.
     self.assertEqual(hash_index.index, {1, 2, 3, 4})
     self.assertFalse(cache.is_empty())
Пример #4
0
 def test_update_index_no_index(self) -> None:
     """Updating an index that holds nothing yet acts like a fresh build."""
     # Calling update_index with no existing index should give the same
     # result as calling build_index with no existing index.
     hash_index = LinearHashIndex()
     bit_vectors = [[0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 0, 1]]
     # noinspection PyTypeChecker
     hash_index.update_index(bit_vectors)
     self.assertEqual(hash_index.index, {1, 2, 3, 4})
     self.assertIsNone(hash_index.cache_element)
Пример #5
0
 def test_remove_from_index_single_not_in_index(self) -> None:
     """Removing a single absent hash raises KeyError and changes nothing."""
     hash_index = LinearHashIndex()
     hash_index.index = {0, 1, 2}
     missing = [[1, 0, 0]]  # 4
     with self.assertRaises(KeyError):
         hash_index.remove_from_index(missing)
     # The stored hashes must be exactly what was seeded above.
     self.assertSetEqual(hash_index.index, {0, 1, 2})
Пример #6
0
    def test_load_cache(self) -> None:
        """A second index created on a populated cache matches the first."""
        shared_cache = DataMemoryElement()
        builder = LinearHashIndex(shared_cache)
        bit_vectors = [[0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 0, 1]]
        # noinspection PyTypeChecker
        builder.build_index(bit_vectors)

        # No explicit load call here: loading happens on initialization.
        loader = LinearHashIndex(shared_cache)

        self.assertEqual(builder.cache_element, loader.cache_element)
        self.assertEqual(builder.index, loader.index)
Пример #7
0
    def test_save_cache_build_index(self) -> None:
        """Building an index writes the integer hashes to the cache element."""
        cache = DataMemoryElement()
        self.assertTrue(cache.is_empty())

        hash_index = LinearHashIndex(cache)
        # noinspection PyTypeChecker
        hash_index.build_index([[0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 0, 1]])
        self.assertFalse(cache.is_empty())

        # Decode the cached bytes with numpy and compare to the built hashes.
        cached_stream = BytesIO(cache.get_bytes())
        cached_hashes = set(numpy.load(cached_stream))
        built_hashes = {1, 2, 3, 4}
        self.assertSetEqual(built_hashes, cached_hashes)
Пример #8
0
 def test_nn(self) -> None:
     """Neighbor query returns codes grouped by distance from the query."""
     hash_index = LinearHashIndex()
     # noinspection PyTypeChecker
     hash_index.build_index([[0, 1, 0], [1, 1, 0], [0, 1, 1], [0, 0, 1]])
     # noinspection PyTypeChecker
     codes, dists = hash_index.nn([0, 0, 0], 4)

     # Codes at equal distance may come back in either order, so each
     # distance tier is compared as a set of tuples.
     one_bit_tier = {tuple(code) for code in codes[:2]}
     self.assertEqual(one_bit_tier, {(0, 1, 0), (0, 0, 1)})
     two_bit_tier = {tuple(code) for code in codes[2:]}
     self.assertEqual(two_bit_tier, {(1, 1, 0), (0, 1, 1)})

     # Expected distances for these 3-bit codes are 1/3 and 2/3.
     expected_dists = (1 / 3., 1 / 3., 2 / 3., 2 / 3.)
     numpy.testing.assert_array_almost_equal(dists, expected_dists)
Пример #9
0
 def test_remove_from_index_one_of_many_not_in_index(self) -> None:
     """A batch removal with one absent hash raises and removes nothing."""
     hash_index = LinearHashIndex()
     hash_index.index = {0, 1, 2}
     batch = [
         [0, 0],  # 0
         [0, 1],  # 1
         [1, 1],  # 3
     ]
     with self.assertRaises(KeyError):
         hash_index.remove_from_index(batch)
     # Check that the index has not been modified.
     self.assertSetEqual(hash_index.index, {0, 1, 2})
Пример #10
0
    def test_update_index_with_hash_index(self) -> None:
        """Updating the LSH index also extends its linear hash index."""
        # Similar test to `test_update_index_add_new_descriptors` but with a
        # linear hash index.
        descriptor_store = MemoryDescriptorSet()
        hash_to_uuids = MemoryKeyValueStore()
        hash_index = LinearHashIndex()  # simplest hash index, heap-sorts.
        lsh_index = LSHNearestNeighborIndex(
            DummyHashFunctor(), descriptor_store, hash_to_uuids, hash_index
        )

        first_batch = [DescriptorMemoryElement('t', uid) for uid in range(5)]
        second_batch = [DescriptorMemoryElement('t', uid) for uid in (5, 6)]
        # Vectors of length 1 for easy dummy hashing prediction.
        for descr in first_batch + second_batch:
            descr.set_vector(np.ones(1, float) * descr.uuid())

        # Build initial index.
        lsh_index.build_index(first_batch)
        # Only hashes of the first batch may be present at this point.
        self.assertSetEqual(hash_index.index, {0, 1, 2, 3, 4})

        # Update index and check that components have new data.
        lsh_index.update_index(second_batch)
        # Now the hash index should include all descriptor hashes.
        self.assertSetEqual(hash_index.index, {0, 1, 2, 3, 4, 5, 6})
Пример #11
0
    def test_save_cache_remove_from_index(self) -> None:
        """Removing hashes updates the cache to mirror the reduced index."""
        cache = DataMemoryElement()
        self.assertTrue(cache.is_empty())

        def cached_hashes() -> set:
            # Decode whatever the cache element currently stores.
            return set(numpy.load(BytesIO(cache.get_bytes())))

        hash_index = LinearHashIndex(cache)
        initial_vectors = [
            [0, 1, 0],  # 2
            [0, 1, 1],  # 3
            [1, 0, 0],  # 4
            [1, 1, 0],  # 6
        ]
        # noinspection PyTypeChecker
        hash_index.build_index(initial_vectors)
        self.assertFalse(cache.is_empty())
        self.assertSetEqual(cached_hashes(), {2, 3, 4, 6})

        removed_vectors = [
            [0, 1, 1],  # 3
            [1, 0, 0],  # 4
        ]
        # noinspection PyTypeChecker
        hash_index.remove_from_index(removed_vectors)
        self.assertFalse(cache.is_empty())
        self.assertSetEqual(cached_hashes(), {2, 6})
Пример #12
0
 def test_configuration(self) -> None:
     """Instances from ``configuration_test_helper`` keep their settings."""
     original = LSHNearestNeighborIndex(
         lsh_functor=ItqFunctor(),
         descriptor_set=MemoryDescriptorSet(),
         hash2uuids_kvstore=MemoryKeyValueStore(),
         hash_index=LinearHashIndex(),
         distance_method='euclidean',
         read_only=True,
     )
     candidate: LSHNearestNeighborIndex
     for candidate in configuration_test_helper(original):
         # Component types must match what the original was built with.
         assert isinstance(candidate.lsh_functor, LshFunctor)
         assert isinstance(candidate.descriptor_set, MemoryDescriptorSet)
         assert isinstance(candidate.hash_index, LinearHashIndex)
         assert isinstance(candidate.hash2uuids_kvstore, MemoryKeyValueStore)
         # Scalar settings must carry over unchanged.
         assert candidate.distance_method == 'euclidean'
         assert candidate.read_only is True
Пример #13
0
 def test_update_index_add_hashes(self) -> None:
     """Updating an existing index adds new hashes to those already there."""
     hash_index = LinearHashIndex()
     # Build index with some initial hashes
     initial_vectors = [[0, 0], [0, 1]]
     # noinspection PyTypeChecker
     hash_index.build_index(initial_vectors)
     self.assertSetEqual(hash_index.index, {0, 1})
     # Update index with new stuff
     additional_vectors = [[1, 0], [1, 1]]
     # noinspection PyTypeChecker
     hash_index.update_index(additional_vectors)
     self.assertSetEqual(hash_index.index, {0, 1, 2, 3})
Пример #14
0
    def test_get_config(self) -> None:
        """``get_config`` reflects whether a cache element is attached."""
        hash_index = LinearHashIndex()

        # Without cache element
        expected = LinearHashIndex.get_default_config()
        self.assertEqual(hash_index.get_config(), expected)

        # With cache element
        hash_index.cache_element = DataMemoryElement()
        expected['cache_element']['type'] = (
            'smqtk_dataprovider.impls.data_element.memory.DataMemoryElement'
        )
        self.assertEqual(hash_index.get_config(), expected)
Пример #15
0
    def test_build_index_fresh_build_with_hash_index(self) -> None:
        """A fresh LSH build fills the supplied linear hash index."""
        descriptor_store = MemoryDescriptorSet()
        hash_to_uuids = MemoryKeyValueStore()
        hash_index = LinearHashIndex()  # simplest hash index, heap-sorts.
        lsh_index = LSHNearestNeighborIndex(
            DummyHashFunctor(), descriptor_store, hash_to_uuids, hash_index
        )

        elements = [DescriptorMemoryElement('t', uid) for uid in range(5)]
        # Vectors of length 1 for easy dummy hashing prediction.
        for position, element in enumerate(elements):
            element.set_vector(np.ones(1, float) * position)
        lsh_index.build_index(elements)
        # Hash index should have been built with hash vectors, and linearHI
        # converts those to integers for storage.
        self.assertEqual(hash_index.index, {0, 1, 2, 3, 4})
Пример #16
0
    def _nn(
        self,
        d: DescriptorElement,
        n: int = 1
    ) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
        """
        Internal method to be implemented by sub-classes to return the nearest
        `N` neighbors to the given descriptor element.

        When this internal method is called, we have already checked that there
        is a vector in ``d`` and our index is not empty.

        The near hash codes and their descriptors are collected while holding
        ``self._model_lock``; the distance ranking happens after the lock has
        been released.

        :param d: Descriptor element to compute the neighbors of.
        :param n: Number of nearest neighbors to find.

        :return: Tuple of nearest N DescriptorElement instances, and a tuple of
            the distance values to those neighbors. Both tuples are empty when
            no descriptor is mapped to any of the near hashes.

        """
        LOG.debug("generating hash for descriptor")
        d_v = d.vector()
        d_h = self.lsh_functor.get_hash(d_v)

        def comp_descr_dist(d2_v: numpy.ndarray) -> float:
            # Distance from the query vector to one candidate vector.
            return self._distance_function(d_v, d2_v)

        with self._model_lock:
            LOG.debug("getting near hashes")
            hi = self.hash_index
            if hi is None:
                # Make on-the-fly linear index
                hi = LinearHashIndex()
                # not calling ``build_index`` because we already have the int
                # hashes.
                hi.index = set(cast(Iterator[int], self.hash2uuids_kvstore.keys()))
            near_hashes, _ = hi.nn(d_h, n)

            LOG.debug("getting UUIDs of descriptors for nearby hashes")
            neighbor_uuids: List[Hashable] = []
            for h_int in map(bit_vector_to_int_large, near_hashes):
                # If descriptor hash not in our map, we effectively skip it.
                # Get set of descriptor UUIDs for a hash code.
                near_uuids: Set[Hashable] = self.hash2uuids_kvstore.get(h_int, set())
                # Accumulate matching descriptor UUIDs to a list.
                neighbor_uuids.extend(near_uuids)
            LOG.debug("-- matched %d UUIDs", len(neighbor_uuids))

            LOG.debug("getting descriptors for neighbor_uuids")
            neighbors = \
                list(self.descriptor_set.get_many_descriptors(neighbor_uuids))

        # Done with model parts at this point, so releasing lock.

        LOG.debug(f"ordering descriptors via distance method {self.distance_method}")
        LOG.debug('-- getting element vectors')
        neighbor_vectors = numpy.asarray(list(
            parallel_map(lambda d_: d_.vector(), neighbors)
        ))
        LOG.debug('-- calculating distances')
        distances = list(map(comp_descr_dist, neighbor_vectors))
        LOG.debug('-- ordering')
        ordered = sorted(zip(neighbors, distances),
                         key=lambda p: p[1])
        LOG.debug(f'-- slicing top n={n}')
        top_n = ordered[:n]
        if not top_n:
            # Hashes missing from the key-value store are skipped above, so
            # there may be no neighbors at all.  ``zip(*[])`` yields nothing
            # and the two-target unpacking below would raise ValueError, so
            # return the empty result explicitly.
            return (), ()
        r_descrs: Tuple[DescriptorElement, ...]
        r_dists: Tuple[float, ...]
        r_descrs, r_dists = zip(*top_n)
        return r_descrs, r_dists
Пример #17
0
 def test_update_index_no_input(self) -> None:
     """Updating with an empty list of hashes raises ValueError."""
     hash_index = LinearHashIndex()
     with self.assertRaises(ValueError):
         hash_index.update_index([])
Пример #18
0
 def _make_hi_linear(self) -> LinearHashIndex:
     """Create a new ``LinearHashIndex`` using its default arguments."""
     linear_index = LinearHashIndex()
     return linear_index
Пример #19
0
 def test_build_index_no_cache(self) -> None:
     """Building without a cache fills the index and leaves the cache unset."""
     hash_index = LinearHashIndex()
     bit_vectors = [[0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 0, 1]]
     # noinspection PyTypeChecker
     hash_index.build_index(bit_vectors)
     self.assertEqual(hash_index.index, {1, 2, 3, 4})
     self.assertIsNone(hash_index.cache_element)
Пример #20
0
 def test_is_usable(self) -> None:
     """``LinearHashIndex`` reports itself as usable."""
     # Should always be true since this impl does not have special deps.
     usable = LinearHashIndex.is_usable()
     self.assertTrue(usable)
Пример #21
0
 def test_default_config(self) -> None:
     """The default config has one entry whose cache element type is None."""
     default_config = LinearHashIndex.get_default_config()
     self.assertEqual(len(default_config), 1)
     cache_type = default_config['cache_element']['type']
     self.assertIsNone(cache_type)
Пример #22
0
 def test_save_cache_readonly_update_index(self) -> None:
     """Updating an index backed by a read-only cache raises ValueError."""
     read_only_cache = DataMemoryElement(readonly=True)
     hash_index = LinearHashIndex(read_only_cache)
     bit_vectors = [[0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 0, 1]]
     with self.assertRaisesRegex(ValueError, "is read-only"):
         hash_index.update_index(bit_vectors)
Пример #23
0
 def test_from_config_no_cache(self) -> None:
     """The default config builds an empty index with no cache element."""
     # Default config is valid and specifies no cache.
     default_config = LinearHashIndex.get_default_config()
     hash_index = LinearHashIndex.from_config(default_config)
     self.assertIsNone(hash_index.cache_element)
     empty_index: set = set()
     self.assertEqual(hash_index.index, empty_index)