示例#1
0
    def test_adjudication_switch(self):
        """
        Flip adjudications on top of an existing state.

        Descriptors that start out positive are re-adjudicated as negative
        and vice versa, first one at a time and then together, until the two
        sets have completely traded members.
        """
        # Five descriptors: the first three start positive, the last two
        # start negative.
        pos_a, pos_b, pos_c, neg_a, neg_b = [
            DescriptorMemoryElement('', uid).set_vector([uid])
            for uid in range(5)
        ]

        session = self.iqrs
        session.positive_descriptors = {pos_a, pos_b, pos_c}
        session.negative_descriptors = {neg_a, neg_b}

        # Move a single negative over to the positive side.
        session.adjudicate(new_positives=[neg_a])
        assert session.positive_descriptors == {pos_a, pos_b, pos_c, neg_a}
        assert session.negative_descriptors == {neg_b}

        # Move a single positive over to the negative side.
        session.adjudicate(new_negatives=[pos_b])
        assert session.positive_descriptors == {pos_a, pos_c, neg_a}
        assert session.negative_descriptors == {neg_b, pos_b}

        # Swap everything that is left in one call.
        session.adjudicate(new_positives=[neg_b],
                           new_negatives=[pos_a, pos_c])
        assert session.positive_descriptors == {neg_a, neg_b}
        assert session.negative_descriptors == {pos_a, pos_b, pos_c}
示例#2
0
    def test_known_descriptors_euclidean_ordered(self):
        """
        Index vectors whose distance from the origin grows with their UID and
        check that a query at the origin returns them in UID order.
        """
        index = self._make_inst()

        # Vector ``[uid, 2 * uid]`` moves away from (0, 0) as ``uid`` grows.
        count = 100
        test_descriptors = []
        for uid in range(count):
            elem = DescriptorMemoryElement('ordered', uid)
            elem.set_vector(np.array([uid, uid * 2], float))
            test_descriptors.append(elem)
        # Build order should not matter, so shuffle before indexing.
        random.shuffle(test_descriptors)
        index.build_index(test_descriptors)

        # Since descriptors were built in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in UID order.
        query = DescriptorMemoryElement('query', 99)
        query.set_vector(np.array([0, 0], float))
        neighbors, dists = index.nn(query, n=count)
        # Because the data lies on a single line, all of the cells will have
        # the same points (any division will just correspond to a point on
        # the line), and a cell can't have more than half of the points.
        ntools.assert_equal(len(dists), count // 2)
        for expected_uid, neighbor, _ in zip(range(count), neighbors, dists):
            ntools.assert_equal(neighbor.uuid(), expected_uid)
            np.testing.assert_equal(neighbor.vector(),
                                    [expected_uid, expected_uid * 2])
示例#3
0
        def test_build_index_with_cache(self):
            """
            Build an index backed by initially empty data elements and check
            that the backing elements hold bytes once the build is done.
            """
            # Empty memory data elements for storage
            empty_data = 'base64://'
            f = FlannNearestNeighborsIndex(empty_data, empty_data, empty_data)
            # Internal elements should start out with zero-length byte values.
            for elem in (f._index_elem, f._index_param_elem,
                         f._descr_cache_elem):
                self.assertEqual(len(elem.get_bytes()), 0)

            # Make unit vectors, one for each feature dimension.
            dim = 8
            test_descriptors = []
            for axis in range(dim):
                unit_vec = numpy.zeros(dim, float)
                unit_vec[axis] = 1.
                descr = DescriptorMemoryElement('unit', axis)
                descr.set_vector(unit_vec)
                test_descriptors.append(descr)

            f.build_index(test_descriptors)

            # After the build, internal elements should hold a non-zero
            # number of bytes.
            for elem in (f._index_elem, f._index_param_elem,
                         f._descr_cache_elem):
                self.assertGreater(len(elem.get_bytes()), 0)
示例#4
0
    def test_normal_conditions(self, mock_dsi_count):
        """
        Smoke test: ``nn`` should run without raising when the (mocked)
        count reports one element.
        """
        index = DummySI()
        # The mocked count reports a single element.
        mock_dsi_count.return_value = 1

        query = DescriptorMemoryElement('q', 0)
        query.set_vector(numpy.random.rand(4))
        index.nn(query)
示例#5
0
    def test_fit_with_cache(self):
        """
        Fit an ITQ functor constructed with two data elements and check both
        the fitted model and the arrays stored in its cache elements.
        """
        fit_descriptors = []
        for offset in range(5):
            elem = DescriptorMemoryElement(six.b('test'), offset)
            elem.set_vector([-2. + offset, -2. + offset])
            fit_descriptors.append(elem)

        itq = ItqFunctor(DataMemoryElement(), DataMemoryElement(),
                         bit_length=1, random_seed=0)
        itq.fit(fit_descriptors)

        # TODO: Explanation as to why this is the expected result.
        expected_mean = [0, 0]
        expected_rotation = [[1 / sqrt(2)], [1 / sqrt(2)]]
        check_close = numpy.testing.assert_array_almost_equal

        # In-memory model values.
        check_close(itq.mean_vec, expected_mean)
        check_close(itq.rotation, expected_rotation)

        # The same arrays should be loadable from the cache elements.
        self.assertIsNotNone(itq.mean_vec_cache_elem)
        cached_mean = numpy.load(BytesIO(itq.mean_vec_cache_elem.get_bytes()))
        check_close(cached_mean, expected_mean)

        self.assertIsNotNone(itq.rotation_cache_elem)
        cached_rotation = numpy.load(
            BytesIO(itq.rotation_cache_elem.get_bytes()))
        check_close(cached_rotation, expected_rotation)
示例#6
0
    def test_feedback_results_has_results_post_reset(self):
        """
        After ``reset`` is called, ``feedback_results`` should return an
        empty list even though a value was available before the reset.
        """

        # Populate the feedback list with four descriptors.
        self.iqrs.feedback_list = {
            DescriptorMemoryElement('', uid).set_vector([uid])
            for uid in range(4)
        }

        # Initial call to ``feedback_results`` should have a non-None return.
        before_reset = self.iqrs.feedback_results()
        assert before_reset is not None

        self.iqrs.reset()

        # Post-reset, there should be no results nor cache.
        after_reset = self.iqrs.feedback_results()
        assert after_reset == []
示例#7
0
    def test_update_index_no_existing_index(self):
        """
        Calling ``update_index`` on an index that was never built should
        behave like a fresh build (same checks as
        ``test_build_index_fresh_build``, but through ``update_index``).
        """
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_index,
                                        hash_kvs)

        n_descr = 5
        descriptors = [DescriptorMemoryElement('t', uid)
                       for uid in range(n_descr)]
        # Vectors of length 1 for easy dummy hashing prediction.
        for descr in descriptors:
            descr.set_vector(np.ones(1, float) * descr.uuid())
        index.update_index(descriptors)

        # Every descriptor should now be in the attached descriptor index...
        self.assertEqual(descr_index.count(), n_descr)
        for descr in descriptors:
            self.assertIn(descr, descr_index)
        # ...and in the key-value store. The dummy hash function bins on the
        # sum of the descriptor vector, so hash ``i`` maps to UID ``i``.
        self.assertEqual(hash_kvs.count(), n_descr)
        for key in range(n_descr):
            self.assertSetEqual(hash_kvs.get(key), {key})
示例#8
0
    def test_adjudicate_remove_pos_neg(self):
        """
        Existing positive and negative adjudications can be withdrawn through
        the ``un_positives`` / ``un_negatives`` parameters.
        """

        # Three descriptors start positive, two start negative.
        pos_a, pos_b, pos_c, neg_a, neg_b = [
            DescriptorMemoryElement('', uid).set_vector([uid])
            for uid in range(5)
        ]
        session = self.iqrs
        session.positive_descriptors = {pos_a, pos_b, pos_c}
        session.negative_descriptors = {neg_a, neg_b}

        # Withdraw one positive; negatives are untouched.
        session.adjudicate(un_positives=[pos_b])
        assert session.positive_descriptors == {pos_a, pos_c}
        assert session.negative_descriptors == {neg_a, neg_b}
        # Withdraw one negative; positives are untouched.
        session.adjudicate(un_negatives=[neg_a])
        assert session.positive_descriptors == {pos_a, pos_c}
        assert session.negative_descriptors == {neg_b}

        # Withdraw everything that remains in a single call.
        session.adjudicate(un_positives=[pos_a, pos_c], un_negatives=[neg_b])
        assert session.positive_descriptors == set()
        assert session.negative_descriptors == set()
示例#9
0
    def test_adjudicate_add_duplicates(self):
        """
        Re-submitting descriptors that are already adjudicated must not
        change anything: the positive and negative collections behave as
        sets.
        """

        def make(uid):
            # Descriptor whose one-element vector equals its UID.
            return DescriptorMemoryElement('', uid).set_vector([uid])

        pos_0 = make(0)
        pos_2 = make(2)
        neg_1 = make(1)
        pos_3 = make(3)
        neg_4 = make(4)

        session = self.iqrs

        # Start with just one positive and one negative.
        session.adjudicate(new_positives=[pos_0], new_negatives=[neg_1])
        assert session.positive_descriptors == {pos_0}
        assert session.negative_descriptors == {neg_1}

        # First pass adds everything, with the already-present descriptors
        # being ignored. The second pass repeats the exact same call, so no
        # change or issue should be observed.
        for _ in range(2):
            session.adjudicate(new_positives=[pos_0, pos_2, pos_3],
                               new_negatives=[neg_1, neg_4])
            assert session.positive_descriptors == {pos_0, pos_2, pos_3}
            assert session.negative_descriptors == {neg_1, neg_4}
示例#10
0
文件: test_itq.py 项目: Kitware/SMQTK
    def test_fit_with_cache(self):
        """
        Fit an ITQ functor that was handed two data elements, then verify the
        fitted mean vector and rotation both in memory and as stored in the
        functor's cache elements.
        """
        fit_descriptors = []
        for step in range(5):
            descr = DescriptorMemoryElement(six.b('test'), step)
            descr.set_vector([-2. + step, -2. + step])
            fit_descriptors.append(descr)

        itq = ItqFunctor(
            DataMemoryElement(),
            DataMemoryElement(),
            bit_length=1,
            random_seed=0,
        )
        itq.fit(fit_descriptors)

        # TODO: Explanation as to why this is the expected result.
        mean_expected = [0, 0]
        rotation_expected = [[1 / sqrt(2)],
                             [1 / sqrt(2)]]

        numpy.testing.assert_array_almost_equal(itq.mean_vec, mean_expected)
        numpy.testing.assert_array_almost_equal(itq.rotation,
                                                rotation_expected)

        # Mean vector as written to its cache element.
        self.assertIsNotNone(itq.mean_vec_cache_elem)
        mean_bytes = itq.mean_vec_cache_elem.get_bytes()
        numpy.testing.assert_array_almost_equal(
            numpy.load(BytesIO(mean_bytes)), mean_expected)

        # Rotation matrix as written to its cache element.
        self.assertIsNotNone(itq.rotation_cache_elem)
        rotation_bytes = itq.rotation_cache_elem.get_bytes()
        numpy.testing.assert_array_almost_equal(
            numpy.load(BytesIO(rotation_bytes)), rotation_expected)
示例#11
0
文件: test_itq.py 项目: Kitware/SMQTK
    def test_fit_short_descriptors_for_bit_length(self):
        """
        Fitting should raise ``ValueError`` when the input descriptors have
        fewer dimensions than the configured bit length for output hash codes
        (limitation of the PCA method currently used), and should leave the
        functor's model unset.
        """
        # Three 2-dimensional descriptors, fewer features than the 8 bits
        # requested below.
        fit_descriptors = []
        for i in range(3):
            d = DescriptorMemoryElement(six.b('test'), i)
            d.set_vector([-1+i, -1+i])
            fit_descriptors.append(d)

        itq = ItqFunctor(bit_length=8)
        expected_msg = \
            "Input descriptors have fewer features than requested bit encoding"

        # ``assertRaisesRegexp`` is a deprecated alias that was removed in
        # Python 3.12; ``six.assertRaisesRegex`` picks the method that exists
        # on the running interpreter.
        six.assertRaisesRegex(
            self, ValueError, expected_msg,
            itq.fit, fit_descriptors
        )
        self.assertIsNone(itq.mean_vec)
        self.assertIsNone(itq.rotation)

        # Should behave the same when input is a one-shot iterator.
        six.assertRaisesRegex(
            self, ValueError, expected_msg,
            itq.fit, iter(fit_descriptors)
        )
        self.assertIsNone(itq.mean_vec)
        self.assertIsNone(itq.rotation)
示例#12
0
    def test_fit_short_descriptors_for_bit_length(self):
        """
        Fitting should raise ``ValueError`` when the input descriptors have
        fewer dimensions than the configured bit length for output hash codes
        (limitation of the PCA method currently used), and should leave the
        functor's model unset.
        """
        # Three 2-dimensional descriptors, fewer features than the 8 bits
        # requested below.
        fit_descriptors = []
        for i in range(3):
            d = DescriptorMemoryElement(six.b('test'), i)
            d.set_vector([-1 + i, -1 + i])
            fit_descriptors.append(d)

        itq = ItqFunctor(bit_length=8)
        expected_msg = \
            "Input descriptors have fewer features than requested bit encoding"

        # ``assertRaisesRegexp`` is a deprecated alias that was removed in
        # Python 3.12; ``six.assertRaisesRegex`` picks the method that exists
        # on the running interpreter.
        six.assertRaisesRegex(
            self, ValueError, expected_msg,
            itq.fit, fit_descriptors)
        self.assertIsNone(itq.mean_vec)
        self.assertIsNone(itq.rotation)

        # Should behave the same when input is a one-shot iterator.
        six.assertRaisesRegex(
            self, ValueError, expected_msg,
            itq.fit, iter(fit_descriptors))
        self.assertIsNone(itq.mean_vec)
        self.assertIsNone(itq.rotation)
示例#13
0
文件: test_itq.py 项目: Kitware/SMQTK
    def test_get_hash(self):
        """
        Check the one-bit hash code produced by a manually configured ITQ
        functor for points on and on either side of the line ``y = -x``.
        """
        # The following "rotation" matrix should cause any 2-feature descriptor
        # to the right of the line ``y = -x`` to be True, and to the left as
        # False. If on the line, should be True.
        itq = ItqFunctor(bit_length=1, random_seed=0)
        itq.mean_vec = numpy.array([0., 0.])
        itq.rotation = numpy.array([[1. / sqrt(2)],
                                    [1. / sqrt(2)]])

        # (input vector, expected single hash bit)
        cases = [
            # Clearly on either side of the line.
            ([1, 1], True),
            ([-1, -1], False),
            # On the line, then nudged to each side of it.
            ([-1, 1], True),
            ([-1.001, 1], False),
            ([-1, 1.001], True),
            ([1, -1], True),
            ([1, -1.001], False),
            ([1.001, -1], True),
        ]
        for vec, expected_bit in cases:
            numpy.testing.assert_array_equal(
                itq.get_hash(numpy.array(vec)), [expected_bit],
                err_msg="input vector: %r" % (vec,))
示例#14
0
        def test_build_index(self):
            """
            Build an index on top of empty data elements and confirm the
            elements go from zero bytes to a non-zero number of bytes.
            """
            # Empty memory data elements for storage
            empty_data = 'base64://'
            f = FlannNearestNeighborsIndex(empty_data, empty_data, empty_data)

            def backing_sizes():
                # Byte lengths of the three internal data elements.
                return [
                    len(f._index_elem.get_bytes()),
                    len(f._index_param_elem.get_bytes()),
                    len(f._descr_cache_elem.get_bytes()),
                ]

            # Internal elements should start out with zero-length byte values.
            for size in backing_sizes():
                self.assertEqual(size, 0)

            # Make unit vectors, one for each feature
            dim = 8
            test_descriptors = []
            for axis in range(dim):
                unit_vec = numpy.zeros(dim, float)
                unit_vec[axis] = 1.
                descr = DescriptorMemoryElement('unit', axis)
                descr.set_vector(unit_vec)
                test_descriptors.append(descr)

            f.build_index(test_descriptors)

            # After the build, internal elements should hold a non-zero
            # number of bytes.
            for size in backing_sizes():
                self.assertGreater(size, 0)
示例#15
0
    def test_remove_from_index(self):
        """
        Removing by UID should drop the descriptor from both the descriptor
        index and the hash key-value store.
        """

        # Descriptors are 1 dim, value == index.
        descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
        # Vectors of length 1 for easy dummy hashing prediction.
        for descr in descriptors:
            descr.set_vector(np.ones(1, float) * descr.uuid())
        d_set = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
        idx.build_index(descriptors)

        # Attempt removing 1 uid.
        idx.remove_from_index([3])

        # Everything except UID 3 should remain in both tables.
        remaining = (0, 1, 2, 4)
        expected_descr_table = {uid: descriptors[uid] for uid in remaining}
        expected_hash_table = {uid: {uid} for uid in remaining}
        self.assertEqual(idx.descriptor_index._table, expected_descr_table)
        self.assertEqual(idx.hash2uuids_kvstore._table, expected_hash_table)
示例#16
0
    def test_pathological_example(self):
        """
        Index descriptors that all lie on one line and check that a query
        returns far fewer neighbors than the number requested.
        """
        n = 10 ** 4
        dim = 256
        depth = 10
        # L ~ n/2**depth = 10^4 / 2^10 ~ 10
        k = 200
        # 3k/L = 60
        num_trees = 60

        d_index = [DescriptorMemoryElement('test', i) for i in range(n)]
        # Put all descriptors on a line so that different trees get same
        # divisions. A plain loop is used since ``set_vector`` is called for
        # its side effect only.
        for d in d_index:
            d.set_vector(np.full(dim, d.uuid(), dtype=np.float64))
        q = DescriptorMemoryElement('q', -1)
        q.set_vector(np.zeros((dim,)))

        di = MemoryDescriptorIndex()
        mrpt = MRPTNearestNeighborsIndex(
            di, num_trees=num_trees, depth=depth, random_seed=0)
        mrpt.build_index(d_index)

        nbrs, dists = mrpt.nn(q, k)
        ntools.assert_equal(len(nbrs), len(dists))
        # We should get about 10 descriptors back instead of the requested
        # 200
        ntools.assert_less(len(nbrs), 20)
示例#17
0
    def test_build_index_fresh_build(self):
        """
        Building on empty storage should place every descriptor in the
        descriptor index and every hash in the key-value store.
        """
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(), descr_index,
                                        hash_kvs)

        n_descr = 5
        descriptors = [DescriptorMemoryElement('t', uid)
                       for uid in range(n_descr)]
        # Vectors of length 1 for easy dummy hashing prediction.
        for position, descr in enumerate(descriptors):
            descr.set_vector(np.ones(1, float) * position)
        index.build_index(descriptors)

        # Every descriptor should now be in the attached descriptor index...
        self.assertEqual(descr_index.count(), n_descr)
        for descr in descriptors:
            self.assertIn(descr, descr_index)
        # ...and in the key-value store. The dummy hash function bins on the
        # sum of the descriptor vector, so hash ``i`` maps to UID ``i``.
        self.assertEqual(hash_kvs.count(), n_descr)
        for key in range(n_descr):
            self.assertSetEqual(hash_kvs.get(key), {key})
示例#18
0
    def test_count_empty_hash2uid(self):
        """
        The LSH index count should be 0 while the hash-to-uid mapping is
        empty, whatever the descriptor set holds, and should then follow the
        UIDs stored in that mapping.
        """
        descr_set = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        self.assertEqual(descr_set.count(), 0)
        self.assertEqual(hash_kvs.count(), 0)

        lsh = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set, hash_kvs)
        self.assertEqual(lsh.count(), 0)

        # Additions to the descriptor-set should not impact LSH index "size"
        for uid in (0, 1):
            lsh.descriptor_index.add_descriptor(
                DescriptorMemoryElement('t', uid))
            self.assertEqual(lsh.descriptor_index.count(), uid + 1)
            self.assertEqual(lsh.hash2uuids_kvstore.count(), 0)
            self.assertEqual(lsh.count(), 0)

        # Growing the UID set stored under hash 0 should grow the LSH count,
        # while the descriptor-set size stays at 2.
        for uid_set in ({0}, {0, 1}, {0, 1, 2}):
            lsh.hash2uuids_kvstore.add(0, uid_set)
            self.assertEqual(lsh.descriptor_index.count(), 2)
            self.assertEqual(lsh.count(), len(uid_set))
示例#19
0
    def test_remove_then_add(self):
        """
        Test that we can remove from the index and then add to it again.

        Builds with ``set1``, removes two UIDs, updates with ``set2`` and
        then checks the index size, that removed descriptors are no longer
        returned, and that every other descriptor is its own nearest
        neighbor.
        """
        n1 = 100
        n2 = 10
        dim = 8
        set1 = [DescriptorMemoryElement('test', i) for i in range(n1)]
        set2 = [DescriptorMemoryElement('test', i) for i in range(n1, n1 + n2)]
        # Plain loop: ``set_vector`` is called for its side effect only.
        for d in set1 + set2:
            d.set_vector(np.random.rand(dim))
        uids_to_remove = [10, 98]

        index = self._make_inst()
        index.build_index(set1)
        index.remove_from_index(uids_to_remove)
        index.update_index(set2)

        # 100 built + 10 added - 2 removed == 108
        self.assertEqual(len(index), n1 + n2 - len(uids_to_remove))
        # Removed descriptors should not be in return queries.
        for uid in uids_to_remove:
            self.assertNotEqual(index.nn(set1[uid], 1)[0][0], set1[uid])
        # Every other descriptor should be queryable
        for d in set1 + set2:
            if d.uuid() not in uids_to_remove:
                self.assertEqual(index.nn(d, 1)[0][0], d)
        # The internal counter is expected to equal the total number of
        # descriptors ever added (110), removals included.
        self.assertEqual(index._next_index, n1 + n2)
示例#20
0
    def test_update_index_additive(self):
        """
        Updating an already-built index should add the new descriptors while
        keeping the old ones, and the new descriptors should be queryable.
        """
        n1 = 100
        n2 = 10
        dim = 8
        set1 = {DescriptorMemoryElement('test', i) for i in range(n1)}
        set2 = {DescriptorMemoryElement('test', i) for i in range(n1, n1 + n2)}
        set_all = set1 | set2
        # Plain loop: ``set_vector`` is called for its side effect only.
        for d in set_all:
            d.set_vector(np.random.rand(dim))

        # Create and build initial index.
        index = self._make_inst()
        index.build_index(set1)
        self.assertEqual(index.count(), len(set1))
        for d in set1:
            self.assertIn(d, index._descriptor_set)

        # Update and check that all intended descriptors are present in index.
        index.update_index(set2)
        self.assertEqual(index.count(), len(set_all))
        for d in set_all:
            self.assertIn(d, index._descriptor_set)

        # Check that NN can return something from the updated set.
        # - nearest element to the query element when the query is in the index
        #   should be the query element.
        for q in set2:
            n_elems, _ = index.nn(q)
            self.assertEqual(n_elems[0], q)
示例#21
0
    def test_nn_small_leaves(self):
        """
        With random descriptors and the tree settings below, a query for
        ``k`` neighbors should return exactly ``k`` neighbors and distances.
        """
        # Fixed seed so the random descriptors are reproducible.
        np.random.seed(0)

        n = 10**4
        dim = 256
        depth = 10
        # L ~ n/2**depth = 10^4 / 2^10 ~ 10
        k = 200
        # 3k/L = 60
        num_trees = 60

        d_set = [DescriptorMemoryElement('test', i) for i in range(n)]
        # Plain loop: ``set_vector`` is called for its side effect only.
        for d in d_set:
            d.set_vector(np.random.rand(dim))
        q = DescriptorMemoryElement('q', -1)
        q.set_vector(np.zeros((dim, )))

        di = MemoryDescriptorSet()
        mrpt = MRPTNearestNeighborsIndex(di,
                                         num_trees=num_trees,
                                         depth=depth,
                                         random_seed=0)
        mrpt.build_index(d_set)

        nbrs, dists = mrpt.nn(q, k)
        self.assertEqual(len(nbrs), len(dists))
        self.assertEqual(len(nbrs), k)
示例#22
0
    def test_normal_conditions(self, mock_dsi_count):
        """
        Smoke test: with the mocked count returning 1, calling ``nn`` with a
        random 4-element query vector should not raise.
        """
        index = DummySI()
        mock_dsi_count.return_value = 1

        query_elem = DescriptorMemoryElement('q', 0)
        random_vec = numpy.random.rand(4)
        query_elem.set_vector(random_vec)
        index.nn(query_elem)
示例#23
0
    def test_get_hash(self):
        """
        Check the one-bit hash code produced by a manually configured ITQ
        functor for points on and on either side of the line ``y = -x``.
        """
        # The following "rotation" matrix should cause any 2-feature descriptor
        # to the right of the line ``y = -x`` to be True, and to the left as
        # False. If on the line, should be True.
        itq = ItqFunctor(bit_length=1, random_seed=0)
        itq.mean_vec = numpy.array([0., 0.])
        itq.rotation = numpy.array([[1. / sqrt(2)], [1. / sqrt(2)]])

        # (input vector, expected single hash bit)
        cases = [
            # Clearly on either side of the line.
            ([1, 1], True),
            ([-1, -1], False),
            # On the line, then nudged to each side of it.
            ([-1, 1], True),
            ([-1.001, 1], False),
            ([-1, 1.001], True),
            ([1, -1], True),
            ([1, -1.001], False),
            ([1.001, -1], True),
        ]
        for vec, expected_bit in cases:
            numpy.testing.assert_array_equal(
                itq.get_hash(numpy.array(vec)), [expected_bit],
                err_msg="input vector: %r" % (vec,))
示例#24
0
    def test_classify(self):
        """
        Classify one descriptor with the dummy classifier and check the
        returned element's classification map and UUID.
        """
        descr = DescriptorMemoryElement('test', 0)
        descr.set_vector([1, 2, 3])

        result = DummyClassifier().classify(descr)

        # Expected map is label 0 -> the input vector values.
        nose.tools.assert_equal(result.get_classification(), {0: [1, 2, 3]})
        # The result element's ``uuid`` should equal the descriptor's UUID.
        nose.tools.assert_equal(result.uuid, descr.uuid())
示例#25
0
 def test_build_index_one(self):
     """
     Building with a single descriptor should populate the descriptor
     cache, the flann handle and the build-parameter dict.
     """
     only_descr = DescriptorMemoryElement('test', 0)
     only_descr.set_vector(numpy.zeros(8, float))

     index = self._make_inst('euclidean')
     index.build_index([only_descr])

     # The cache holds exactly the one descriptor that was given.
     self.assertListEqual(index._descr_cache, [only_descr])
     self.assertIsNotNone(index._flann)
     self.assertIsInstance(index._flann_build_params, dict)
示例#26
0
    def test_remove_from_index_invalid_uid(self):
        """
        Removing a UID that is not in the index should raise ``KeyError`` and
        leave the index untouched, including when a valid UID is passed in
        the same call.
        """

        # Descriptors are 1 dim, value == index.
        descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
        # Vectors of length 1 for easy dummy hashing prediction.
        for descr in descriptors:
            descr.set_vector(np.ones(1, float) * descr.uuid())
        # uid -> descriptor
        expected_dset_table = dict(enumerate(descriptors))
        # hash int -> set[uid]
        expected_kvs_table = {uid: {uid} for uid in range(5)}

        d_set = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
        idx.build_index(descriptors)

        def assert_index_unchanged():
            # Both backing tables should match the expected post-build state.
            self.assertEqual(idx.descriptor_index._table,
                             expected_dset_table)
            self.assertEqual(idx.hash2uuids_kvstore._table,
                             expected_kvs_table)

        # Assert we have the correct expected values
        assert_index_unchanged()

        # First a lone UID we did not build with, then a valid UID together
        # with an invalid one. Neither call should modify the index.
        # NOTE(review): ``assertRaisesRegexp`` is a deprecated unittest alias
        # that no longer exists in Python 3.12+ -- confirm target versions.
        for bad_uids in ([5], [2, 5]):
            self.assertRaisesRegexp(
                KeyError, '5',
                idx.remove_from_index, bad_uids
            )
            assert_index_unchanged()
示例#27
0
 def test_classify_elements_missing_vector(self):
     """
     Test that we get a ValueError when one of the input elements has no
     vector set.
     """
     first_with_vec = DescriptorMemoryElement('', 0).set_vector([1, 2, 3])
     without_vec = DescriptorMemoryElement('', 0)  # no set vector
     second_with_vec = DescriptorMemoryElement('', 0).set_vector([4, 5, 6])
     elems = [first_with_vec, without_vec, second_with_vec]

     with pytest.raises(ValueError, match=r"no vector stored"):
         # ``list`` consumes the whole result of ``classify_elements``.
         list(self.inst.classify_elements(elems))
示例#28
0
    def test_nn_empty_index(self):
        """ ``nn`` should raise ``ValueError`` when the index size is 0. """
        index = DummySI()
        # Report an empty index and stub out the underlying ``_nn``.
        index.count = mock.MagicMock(return_value=0)
        index._nn = mock.MagicMock()

        query = DescriptorMemoryElement('q', 0)
        query.set_vector(numpy.random.rand(4))
        with self.assertRaises(ValueError):
            index.nn(query)
示例#29
0
    def test_read_only(self):
        """
        Building an index created with ``read_only=True`` should raise
        ``ReadOnlyError``.
        """
        # Single 5-element unit vector along the first axis.
        unit_vec = np.zeros(5, float)
        unit_vec[0] = 1.
        descr = DescriptorMemoryElement('unit', 0)
        descr.set_vector(unit_vec)

        index = self._make_inst(read_only=True)
        with self.assertRaises(ReadOnlyError):
            index.build_index([descr])
    def test_simple_multiclass_classification(self):
        """
        Train on three classes drawn from separate value ranges and check
        that each test vector scores highest for its own class.
        """
        # Fix random seed for deterministic testing.
        numpy.random.seed(0)

        N = 1000
        LABEL_1 = 'p1'
        LABEL_2 = 'p2'
        LABEL_3 = 'p3'

        def sample(low, high):
            # N x 1 array of random values scaled from [0, 1) into the given
            # range.
            scaled = numpy.interp(numpy.random.rand(N), [0, 1], [low, high])
            return scaled[:, numpy.newaxis]

        # Setup training dataset
        # - 1 dimensional for obvious separation, this is not a performance
        #   test.
        train1 = sample(0.0, .30)
        train2 = sample(.40, .60)
        train3 = sample(.70, 1.0)

        # Wrap training vectors in descriptor elements, numbering the UIDs
        # consecutively across the three classes.
        train1_e, train2_e, train3_e = [], [], []
        next_uid = 0
        for vectors, bucket in ((train1, train1_e),
                                (train2, train2_e),
                                (train3, train3_e)):
            for v in vectors:
                bucket.append(
                    DescriptorMemoryElement('train', next_uid).set_vector(v))
                next_uid += 1

        # Setup testing dataset
        test1 = sample(0.0, .30)
        test2 = sample(.40, .60)
        test3 = sample(.70, 1.0)

        # Train and test classifier instance
        classifier = SkLearnLogisticRegression(random_state=0)
        classifier.train({
            LABEL_1: train1_e,
            LABEL_2: train2_e,
            LABEL_3: train3_e,
        })
        c_maps_l1 = list(classifier._classify_arrays(test1))
        c_maps_l2 = list(classifier._classify_arrays(test2))
        c_maps_l3 = list(classifier._classify_arrays(test3))

        # The true label must score strictly higher than both other labels.
        all_labels = (LABEL_1, LABEL_2, LABEL_3)
        for true_label, vectors, c_maps in ((LABEL_1, test1, c_maps_l1),
                                            (LABEL_2, test2, c_maps_l2),
                                            (LABEL_3, test3, c_maps_l3)):
            rivals = [lbl for lbl in all_labels if lbl != true_label]
            for v, m in zip(vectors, c_maps):
                best_rival = max(m[lbl] for lbl in rivals)
                assert m[true_label] > best_rival, \
                    "Incorrect {} label: c_map={} :: test_vector={}".format(
                        true_label, m, v
                    )
示例#31
0
    def test_nn_normal_conditions(self):
        # Smoke test: ``nn`` on the dummy index must accept a query
        # descriptor that carries a vector without raising.
        dummy_index = DummySI()
        # kNN is only performed for a non-zero index size, so report a count
        # of one.
        dummy_index.count = mock.MagicMock(return_value=1)

        query = DescriptorMemoryElement('q', 0)
        query.set_vector(numpy.random.rand(4))
        # Nothing is asserted on the result; not crashing is the success
        # condition.
        dummy_index.nn(query)
示例#32
0
    def _random_euclidean(self, hash_ftor, hash_idx,
                          ftor_train_hook=lambda d: None):
        # Shared body for euclidean nearest-neighbor checks over randomly
        # generated descriptors in an LSH index.
        #
        # :param hash_ftor: Hash function class for generating hash codes for
        #   descriptors.
        # :param hash_idx: Hash index instance to use in local LSH algo
        #   instance.
        # :param ftor_train_hook: Function for training functor if necessary.
        #   It is called once with the list of generated descriptors.

        # Seeded random descriptors so the run is reproducible.
        n_descr = 1000
        dim = 256
        np.random.seed(self.RANDOM_SEED)
        td = []
        for uid in range(n_descr):
            elem = DescriptorMemoryElement('random', uid)
            elem.set_vector(np.random.rand(dim))
            td.append(elem)

        ftor_train_hook(td)

        index = LSHNearestNeighborIndex(hash_ftor, MemoryDescriptorIndex(),
                                        MemoryKeyValueStore(),
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(td)

        # An indexed descriptor queried with k=1 must come back as itself.
        member = td[255]
        neighbors, dists = index.nn(member, 1)
        self.assertEqual(neighbors[0], member)

        # A slightly perturbed copy of an indexed vector must resolve to the
        # descriptor it was derived from.
        base = td[0]
        near_vec = base.vector().copy()
        shift = max(near_vec.min(), 0.1)
        near_vec[0] += shift
        near_vec[dim-1] -= shift
        near_q = DescriptorMemoryElement('query', n_descr)
        near_q.set_vector(near_vec)
        neighbors, dists = index.nn(near_q, 1)
        self.assertFalse(np.array_equal(near_q.vector(), base.vector()))
        self.assertEqual(neighbors[0], base)

        # For an arbitrary query, distances must be strictly increasing for
        # both a small k and a k covering the whole index.
        rand_q = DescriptorMemoryElement('query', n_descr + 1)
        rand_q.set_vector(np.random.rand(dim))
        for k in (10, n_descr):
            neighbors, dists = index.nn(rand_q, k)
            for pos in range(1, len(dists)):
                self.assertGreater(dists[pos], dists[pos - 1])
示例#33
0
    def test_known_descriptors_euclidean_ordered(self):
        # Index descriptors whose euclidean distance from the origin grows
        # with their UUID, then check that queries from the origin return
        # them in UUID order.
        index = self._make_inst('euclidean')

        # make vectors to return in a known euclidean distance order
        i = 1000
        test_descriptors = []
        # ``range`` instead of the Python-2-only ``xrange`` so the test also
        # runs under Python 3.
        for j in range(i):
            d = DescriptorMemoryElement('ordered', j)
            d.set_vector(numpy.array([j, j * 2], float))
            test_descriptors.append(d)
        # Shuffle so that build order cannot explain the result order.
        random.shuffle(test_descriptors)
        index.build_index(test_descriptors)

        # Since descriptors were built in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in index order.
        q = DescriptorMemoryElement('query', i)
        q.set_vector(numpy.array([0, 0], float))
        # top result should have UUID == 0 (nearest to query)
        r, dists = index.nn(q, 5)
        for rank in range(5):
            ntools.assert_equal(r[rank].uuid(), rank)
        # global search should be in complete order
        r, dists = index.nn(q, i)
        for j, d, dist in zip(range(i), r, dists):
            ntools.assert_equal(d.uuid(), j)
示例#34
0
    def _known_ordered_euclidean(self, hash_ftor, hash_idx,
                                 ftor_train_hook=lambda d: None):
        # Shared body: index 2D descriptors whose distance from the origin
        # grows with their UUID, then check that euclidean queries from the
        # origin return them in UUID order.
        #
        # ``ftor_train_hook`` is called once with the descriptor list before
        # the index is built.
        n_descr = 1000
        test_descriptors = []
        for uid in range(n_descr):
            elem = DescriptorMemoryElement('ordered', uid)
            elem.set_vector(np.array([uid, uid*2], float))
            test_descriptors.append(elem)
        # Shuffle so that build order cannot explain the result order.
        random.shuffle(test_descriptors)

        ftor_train_hook(test_descriptors)

        index = LSHNearestNeighborIndex(hash_ftor, MemoryDescriptorIndex(),
                                        MemoryKeyValueStore(),
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(test_descriptors)

        # Query from the origin: nearest neighbors are the lowest UUIDs.
        origin = DescriptorMemoryElement('query', n_descr)
        origin.set_vector(np.array([0, 0], float))
        # The five nearest must be UUIDs 0 through 4, in that order.
        neighbors, dists = index.nn(origin, 5)
        for rank in range(5):
            self.assertEqual(neighbors[rank].uuid(), rank)
        # Asking for everything must return the complete UUID order.
        neighbors, dists = index.nn(origin, n_descr)
        for rank, elem, _dist in zip(range(n_descr), neighbors, dists):
            self.assertEqual(elem.uuid(), rank)
示例#35
0
    def test_remove_from_index_shared_hashes_partial(self):
        """
        Test that a hash is removed from the hash index only when no
        remaining descriptor maps to it; hashes still referred to by other
        descriptors must stay.
        """
        # Five descriptors with length-1 vectors equal to their UID, which
        # makes the dummy hashing below easy to predict.
        descriptors = [
            DescriptorMemoryElement('t', uid).set_vector([uid])
            for uid in range(5)
        ]

        # Dummy hash function: vectors of even sum hash to 0, odd to 1.
        hash_func = DummyHashFunctor()
        hash_func.get_hash = mock.Mock(
            side_effect=lambda vec: [vec.sum() % 2]
        )

        # Simulate the already-built state directly: UID -> descriptor and
        # hash -> set of UIDs.
        d_set = MemoryDescriptorIndex()
        d_set._table = dict(enumerate(descriptors))

        hash2uid_kvs = MemoryKeyValueStore()
        hash2uid_kvs._table = {
            0: {0, 2, 4},
            1: {1, 3},
        }

        idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uid_kvs)
        idx.hash_index = mock.Mock(spec=HashIndex)

        idx.remove_from_index([1, 2, 3])

        # Hash 1 lost both of its UIDs, so exactly that one hash vector is
        # passed (as a deque of hash-code vectors) to the hash index.
        expected_removed = collections.deque([
            [1],
        ])
        idx.hash_index.remove_from_index.assert_called_once_with(
            expected_removed
        )
        # Hash 0 keeps UIDs 0 and 4; only those descriptors remain.
        self.assertDictEqual(d_set._table, {
            0: descriptors[0],
            4: descriptors[4],
        })
        self.assertDictEqual(hash2uid_kvs._table, {0: {0, 4}})
    def test_none_set(self):
        # A fresh element has no vector; setting one makes it available and
        # setting ``None`` clears it again.
        elem = DescriptorMemoryElement('test', 0)
        self.assertFalse(elem.has_vector())

        ones = numpy.ones(16)
        elem.set_vector(ones)
        self.assertTrue(elem.has_vector())
        numpy.testing.assert_equal(elem.vector(), numpy.ones(16))

        elem.set_vector(None)
        self.assertFalse(elem.has_vector())
        self.assertIs(elem.vector(), None)
    def test_none_set(self):
        # A fresh element has no vector; setting one makes it available and
        # setting ``None`` clears it again.
        elem = DescriptorMemoryElement('test', 0)
        ntools.assert_false(elem.has_vector())

        ones = numpy.ones(16)
        elem.set_vector(ones)
        ntools.assert_true(elem.has_vector())
        numpy.testing.assert_equal(elem.vector(), numpy.ones(16))

        elem.set_vector(None)
        ntools.assert_false(elem.has_vector())
        ntools.assert_is(elem.vector(), None)
示例#38
0
 def test_build_index_one(self):
     # Building an index from a single descriptor must cache exactly that
     # descriptor and leave the FLANN handle and build parameters set.
     elem = DescriptorMemoryElement('test', 0)
     elem.set_vector(numpy.zeros(8, float))

     index = self._make_inst('euclidean')
     index.build_index([elem])

     self.assertListEqual(index._descr_cache, [elem])
     self.assertIsNotNone(index._flann)
     self.assertIsInstance(index._flann_build_params, dict)
示例#39
0
        def test_build_index_read_only(self):
            # Building on an instance created with ``read_only=True`` must
            # raise ``ReadOnlyError``.
            unit_vec = np.zeros(5, float)
            unit_vec[0] = 1.
            elem = DescriptorMemoryElement('unit', 0)
            elem.set_vector(unit_vec)
            test_descriptors = [elem]

            index = self._make_inst(read_only=True)
            with self.assertRaises(ReadOnlyError):
                index.build_index(test_descriptors)
示例#40
0
    def test_remove_from_index_shared_hashes(self):
        """
        Test that removing descriptors (by UID) whose hash is still shared
        with other descriptors does not trigger removal of that hash.
        """
        # Every descriptor hashes to the same value: 0
        hash_func = DummyHashFunctor()
        hash_func.get_hash = mock.Mock(return_value=np.asarray([0], bool))

        d_set = MemoryDescriptorSet()
        hash2uids_kvs = MemoryKeyValueStore()
        idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uids_kvs)

        # Five descriptors with length-1 vectors whose value equals the UID.
        descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
        for descr in descriptors:
            descr.set_vector(np.ones(1, float) * descr.uuid())
        idx.build_index(descriptors)

        # After the build every UID is stored and all of them share hash 0.
        self.assertDictEqual(d_set._table, dict(enumerate(descriptors)))
        self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 2, 3, 4}})

        # Swap in a mocked hash index, as if an implementation were present,
        # so calls to its remove_from_index method can be inspected.
        idx.hash_index = mock.Mock(spec=HashIndex)

        idx.remove_from_index([2, 4])

        # Only UIDs 2 and 4 leave the descriptor set. Hash 0 is still
        # referenced by UIDs 0, 1 and 3, so no hash is marked for removal and
        # the hash index's removal method must not be called.
        remaining = {uid: descriptors[uid] for uid in (0, 1, 3)}
        self.assertDictEqual(d_set._table, remaining)
        self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 3}})
        idx.hash_index.remove_from_index.assert_not_called()
    def test_configuration(self):
        # The default configuration is empty and round-trips through
        # ``from_config`` / ``get_config``.
        default_config = DescriptorMemoryElement.get_default_config()
        ntools.assert_equal(default_config, {})

        first = DescriptorMemoryElement.from_config(default_config,
                                                    'test', 'a')
        ntools.assert_equal(default_config, first.get_config())
        ntools.assert_equal(first.type(), 'test')
        ntools.assert_equal(first.uuid(), 'a')

        # An instance rebuilt from the first one's config, with the same type
        # and UUID, must compare equal to it.
        first_config = first.get_config()
        second = DescriptorMemoryElement.from_config(first_config,
                                                     'test', 'a')
        ntools.assert_equal(first, second)
示例#42
0
    def test_classify(self):
        # Each label from the test labels file must be paired with the
        # vector value at its position.
        c = IndexLabelClassifier(self.FILEPATH_TEST_LABELS)
        labels = ('label_1', 'label_2', 'negative',
                  'label_3', 'Kitware', 'label_4')
        # Labels are expected as byte strings, valued 1 through 6 in order.
        m_expected = {
            six.b(label): value
            for value, label in enumerate(labels, start=1)
        }

        elem = DescriptorMemoryElement('test', 0)
        elem.set_vector([1, 2, 3, 4, 5, 6])

        self.assertEqual(c._classify(elem), m_expected)
示例#43
0
    def _random_euclidean(self, hash_ftor, hash_idx, ftor_train_hook=lambda d: None):
        # Shared body for euclidean nearest-neighbor checks over randomly
        # generated descriptors in an LSH index.
        #
        # :param hash_ftor: Hash functor handed to the LSH index.
        # :param hash_idx: Hash index handed to the LSH index.
        # :param ftor_train_hook: Called once with the generated descriptors
        #   before the index is built.

        # make random descriptors
        i = 1000
        dim = 256
        td = []
        numpy.random.seed(self.RANDOM_SEED)
        # ``range`` instead of the Python-2-only ``xrange`` so this also runs
        # under Python 3.
        for j in range(i):
            d = DescriptorMemoryElement("random", j)
            d.set_vector(numpy.random.rand(dim))
            td.append(d)

        ftor_train_hook(td)

        di = MemoryDescriptorIndex()
        index = LSHNearestNeighborIndex(hash_ftor, di, hash_idx, distance_method="euclidean")
        index.build_index(td)

        # test query from build set -- should return same descriptor when k=1
        q = td[255]
        r, dists = index.nn(q, 1)
        ntools.assert_equal(r[0], q)

        # test query very near a build vector
        td_q = td[0]
        q = DescriptorMemoryElement("query", i)
        v = td_q.vector().copy()
        v_min = max(v.min(), 0.1)
        v[0] += v_min
        v[dim - 1] -= v_min
        q.set_vector(v)
        r, dists = index.nn(q, 1)
        ntools.assert_false(numpy.array_equal(q.vector(), td_q.vector()))
        ntools.assert_equal(r[0], td_q)

        # random query
        q = DescriptorMemoryElement("query", i + 1)
        q.set_vector(numpy.random.rand(dim))

        # for any query of size k, results should at least be in distance
        # order; checked for a small k and for k covering the whole index
        for k in (10, i):
            r, dists = index.nn(q, k)
            for j in range(1, len(dists)):
                ntools.assert_greater(dists[j], dists[j - 1])

        # Reset the class-level element cache.
        DescriptorMemoryElement.MEMORY_CACHE = {}
    def test_clustering_equal_descriptors(self):
        # Groups of identical descriptors (one group per unit axis) must each
        # end up together in a single cluster.
        print("Creating dummy descriptors")
        n_features = 8
        n_descriptors = 20

        index = MemoryDescriptorIndex()
        for axis in range(n_features):
            # Unit vector along ``axis``, shared by the whole group.
            unit = numpy.zeros((n_features,))
            unit[axis] = 1
            for member in range(n_descriptors):
                # UIDs run consecutively across groups.
                uid = axis * n_descriptors + member
                elem = DescriptorMemoryElement('test', uid)
                elem.set_vector(unit)
                index.add_descriptor(elem)

        print("Creating test MBKM")
        mbkm = MiniBatchKMeans(n_features, batch_size=12, verbose=True,
                               compute_labels=False, random_state=0)

        # Fit and apply; ``n_descriptors`` is passed as the third argument.
        d_classes = mb_kmeans_build_apply(index, mbkm, n_descriptors)

        for c in d_classes:
            # There should be 20 descriptors per class
            members = d_classes[c]
            self.assertEqual(
                len(members),
                n_descriptors,
                "Cluster %s did not have expected number of descriptors "
                "(%d != %d)"
                % (c, n_descriptors, len(members))
            )

            # Every vector in a cluster must equal the cluster's first one.
            uuids = list(members)
            v = index[uuids[0]].vector()
            for other_uuid in uuids[1:]:
                v2 = index[other_uuid].vector()
                message = ("vector in cluster %d did not "
                           "match other vectors "
                           "(%s != %s)"
                           % (c, v, v2))
                numpy.testing.assert_array_equal(v, v2, message)
示例#45
0
        def test_nn_many_descriptors(self):
            # A 10-nearest-neighbor query over a large random index must
            # return ten neighbors with one distance each.
            np.random.seed(0)

            n = 10 ** 4
            dim = 256

            d_index = [DescriptorMemoryElement('test', i) for i in range(n)]
            # Plain loop: the calls are made for their side effect only.
            for elem in d_index:
                elem.set_vector(np.random.rand(dim))
            query = DescriptorMemoryElement('q', -1)
            query.set_vector(np.zeros((dim,)))

            faiss_index = self._make_inst()
            faiss_index.build_index(d_index)

            nbrs, dists = faiss_index.nn(query, 10)
            self.assertEqual(len(nbrs), len(dists))
            self.assertEqual(len(nbrs), 10)
示例#46
0
    def test_classify_invalid_descriptor_dimensions(self):
        # Classifying a descriptor whose vector length is off by one in
        # either direction must raise RuntimeError.
        c = IndexLabelClassifier(self.FILEPATH_TEST_LABELS)
        elem = DescriptorMemoryElement('test', 0)

        wrong_size_vectors = (
            [1, 2, 3, 4, 5],            # One less
            [1, 2, 3, 4, 5, 6, 7],      # One more
        )
        for vec in wrong_size_vectors:
            elem.set_vector(vec)
            with self.assertRaises(RuntimeError):
                c._classify(elem)
示例#47
0
    def _known_ordered_euclidean(self, hash_ftor, hash_idx,
                                 ftor_train_hook=lambda d: None):
        # Shared body: descriptor ``j`` has the vector [j, 2j], so distance
        # from the origin grows with the UUID. Euclidean queries from the
        # origin must therefore return descriptors in UUID order.
        #
        # ``ftor_train_hook`` is called once with the descriptor list before
        # the index is built.
        total = 1000
        test_descriptors = []
        for uid in range(total):
            descr = DescriptorMemoryElement('ordered', uid)
            descr.set_vector(np.array([uid, uid*2], float))
            test_descriptors.append(descr)
        # Shuffle so that build order cannot explain the result order.
        random.shuffle(test_descriptors)

        ftor_train_hook(test_descriptors)

        descr_index = MemoryDescriptorIndex()
        kv_store = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor, descr_index, kv_store,
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(test_descriptors)

        query = DescriptorMemoryElement('query', total)
        query.set_vector(np.array([0, 0], float))

        # Top five results: UUIDs 0..4 (UUID 0 is nearest to the query).
        top, dists = index.nn(query, 5)
        for expected_uid in (0, 1, 2, 3, 4):
            self.assertEqual(top[expected_uid].uuid(), expected_uid)

        # Global search should be in complete order.
        everything, dists = index.nn(query, total)
        for expected_uid, descr, _dist in zip(range(total), everything,
                                              dists):
            self.assertEqual(descr.uuid(), expected_uid)
示例#48
0
    def test_known_descriptors_euclidean_ordered(self):
        # Descriptor ``j`` has the vector [j, 2j], so distance from the
        # origin grows with the UUID; queries from the origin must return
        # descriptors in UUID order.
        index = self._make_inst('euclidean')

        # make vectors to return in a known euclidean distance order
        i = 1000
        test_descriptors = []
        # ``range`` instead of the Python-2-only ``xrange`` so the test also
        # runs under Python 3.
        for j in range(i):
            d = DescriptorMemoryElement('ordered', j)
            d.set_vector(numpy.array([j, j*2], float))
            test_descriptors.append(d)
        # Shuffle so that build order cannot explain the result order.
        random.shuffle(test_descriptors)
        index.build_index(test_descriptors)

        # Since descriptors were built in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in index order.
        q = DescriptorMemoryElement('query', i)
        q.set_vector(numpy.array([0, 0], float))
        # top result should have UUID == 0 (nearest to query)
        r, dists = index.nn(q, 5)
        for rank in range(5):
            ntools.assert_equal(r[rank].uuid(), rank)
        # global search should be in complete order
        r, dists = index.nn(q, i)
        for j, d, dist in zip(range(i), r, dists):
            ntools.assert_equal(d.uuid(), j)
示例#49
0
        def test_nn_preprocess_index(self):
            # An index built from a factory string that includes a
            # preprocessing stage must keep that string and still answer a
            # 10-nearest-neighbor query.
            factory = 'PCAR64,IVF1,Flat'
            faiss_index = self._make_inst(factory_string=factory)
            self.assertEqual(faiss_index.factory_string, 'PCAR64,IVF1,Flat')

            np.random.seed(self.RAND_SEED)
            n = 10 ** 4
            dim = 256

            d_index = [DescriptorMemoryElement('test', i) for i in range(n)]
            # Plain loop: the calls are made for their side effect only.
            for elem in d_index:
                elem.set_vector(np.random.rand(dim))
            query = DescriptorMemoryElement('q', -1)
            query.set_vector(np.zeros((dim,)))

            faiss_index.build_index(d_index)

            nbrs, dists = faiss_index.nn(query, 10)
            self.assertEqual(len(nbrs), len(dists))
            self.assertEqual(len(nbrs), 10)
示例#50
0
    def test_random_descriptors_euclidean(self):
        # Euclidean nearest-neighbor checks over randomly generated
        # descriptors: self-query, near-duplicate query and distance
        # ordering of the results.

        # make random descriptors
        i = 1000
        dim = 256
        bits = 32
        td = []
        # ``range`` instead of the Python-2-only ``xrange`` so the test also
        # runs under Python 3.
        for j in range(i):
            d = DescriptorMemoryElement('random', j)
            d.set_vector(numpy.random.rand(dim))
            td.append(d)

        index = self._make_inst('euclidean', bits)
        index.build_index(td)

        # test query from build set -- should return same descriptor when k=1
        q = td[255]
        r, dists = index.nn(q, 1)
        ntools.assert_equal(r[0], q)

        # test query very near a build vector
        td_q = td[0]
        q = DescriptorMemoryElement('query', i)
        v = numpy.array(td_q.vector())  # copy
        v_min = max(v.min(), 0.1)
        v[0] += v_min
        v[dim-1] -= v_min
        q.set_vector(v)
        r, dists = index.nn(q, 1)
        ntools.assert_false(numpy.array_equal(q.vector(), td_q.vector()))
        ntools.assert_equal(r[0], td_q)

        # random query
        q = DescriptorMemoryElement('query', i+1)
        q.set_vector(numpy.random.rand(dim))

        # for any query of size k, results should at least be in distance
        # order; checked for a small k and for k covering the whole index
        for k in (10, i):
            r, dists = index.nn(q, k)
            for j in range(1, len(dists)):
                ntools.assert_greater(dists[j], dists[j-1])
示例#51
0
        def test_nn_known_descriptors_euclidean_unit(self):
            # Index the unit vector of every axis and query with the zero
            # vector: each indexed descriptor is then exactly 1.0 away.
            dim = 5

            index = self._make_inst()
            test_descriptors = []
            for axis in range(dim):
                unit = np.zeros(dim, float)
                unit[axis] = 1.
                elem = DescriptorMemoryElement('unit', axis)
                elem.set_vector(unit)
                test_descriptors.append(elem)
            index.build_index(test_descriptors)

            # Zero-vector query: all unit corners are equally distant.
            query = DescriptorMemoryElement('query', 0)
            query.set_vector(np.zeros(dim, float))
            neighbors, dists = index.nn(query, n=dim)
            self.assertEqual(len(dists), dim)
            # Every distance is 1.0; the order of the neighbors is
            # irrelevant.
            for dist in dists:
                self.assertEqual(dist, 1.)
    def test_pickle_dump_load(self):
        # Pickled elements must restore both their vectors and their entries
        # in the class-level memory cache.
        cache_keys = (('test', 0), ('test', 1))

        # Start from an empty cache.
        DescriptorMemoryElement.MEMORY_CACHE = {}

        # Two descriptors with distinct vectors.
        v1 = numpy.array([1, 2, 3])
        d1 = DescriptorMemoryElement('test', 0)
        d1.set_vector(v1)

        v2 = numpy.array([4, 5, 6])
        d2 = DescriptorMemoryElement('test', 1)
        d2.set_vector(v2)

        for key in cache_keys:
            ntools.assert_in(key, DescriptorMemoryElement.MEMORY_CACHE)

        d1_s = cPickle.dumps(d1)
        d2_s = cPickle.dumps(d2)

        # Empty the cache again so that loading has to repopulate it.
        DescriptorMemoryElement.MEMORY_CACHE = {}
        for key in cache_keys:
            ntools.assert_not_in(key, DescriptorMemoryElement.MEMORY_CACHE)

        # Attempt reconstitution
        d1_r = cPickle.loads(d1_s)
        d2_r = cPickle.loads(d2_s)

        numpy.testing.assert_array_equal(v1, d1_r.vector())
        numpy.testing.assert_array_equal(v2, d2_r.vector())

        # Loading must have put both entries back into the cache.
        for key in cache_keys:
            ntools.assert_in(key, DescriptorMemoryElement.MEMORY_CACHE)
示例#53
0
        def test_known_descriptors_hik_unit(self):
            # Histogram-intersection queries against an index holding the
            # unit vector of every axis.
            dim = 5

            ###
            # Unit vectors - Equal distance
            #
            index = self._make_inst('hik')
            test_descriptors = []
            # ``range`` instead of the Python-2-only ``xrange`` so the test
            # also runs under Python 3.
            for i in range(dim):
                v = numpy.zeros(dim, float)
                v[i] = 1.
                d = DescriptorMemoryElement('unit', i)
                d.set_vector(v)
                test_descriptors.append(d)
            index.build_index(test_descriptors)
            # query with zero vector
            # -> all modeled descriptors have no intersection, dists should be 1.0,
            #    or maximum distance by histogram intersection
            q = DescriptorMemoryElement('query', 0)
            q.set_vector(numpy.zeros(dim, float))
            r, dists = index.nn(q, dim)
            # All dists should be 1.0, r order doesn't matter
            for d in dists:
                ntools.assert_equal(d, 1.)

            # query with index element: it must be its own nearest neighbor
            # at distance zero, both for k=1 and for k covering the index
            q = test_descriptors[3]
            for k in (1, dim):
                r, dists = index.nn(q, k)
                ntools.assert_equal(r[0], q)
                ntools.assert_equal(dists[0], 0.)
 def test_output_immutability(self):
     # Mutating an array returned by ``vector()`` must not change what the
     # element has stored.
     elem = DescriptorMemoryElement('test', 0)
     source = numpy.ones(16)
     ntools.assert_false(elem.has_vector())
     elem.set_vector(source)

     extracted = elem.vector()
     extracted[:] = 0
     ntools.assert_equal(extracted.sum(), 0)
     # A fresh read still sums to 16.
     ntools.assert_equal(elem.vector().sum(), 16)
示例#55
0
        def test_known_descriptors_euclidean_ordered(self):
            # Descriptor ``j`` has the vector [j, 2j], so distance from the
            # origin grows with the UUID; a query from the origin must return
            # descriptors, with their vectors intact, in UUID order.
            index = self._make_inst('euclidean')

            # make vectors to return in a known euclidean distance order
            i = 10
            test_descriptors = []
            # ``range`` instead of the Python-2-only ``xrange`` so the test
            # also runs under Python 3.
            for j in range(i):
                d = DescriptorMemoryElement('ordered', j)
                d.set_vector(numpy.array([j, j*2], float))
                test_descriptors.append(d)
            # Shuffle so that build order cannot explain the result order.
            random.shuffle(test_descriptors)
            index.build_index(test_descriptors)

            # Since descriptors were built in increasing distance from (0,0),
            # returned descriptors for a query of [0,0] should be in index order.
            q = DescriptorMemoryElement('query', 99)
            q.set_vector(numpy.array([0, 0], float))
            r, dists = index.nn(q, i)
            for j, d, dist in zip(range(i), r, dists):
                ntools.assert_equal(d.uuid(), j)
                numpy.testing.assert_equal(d.vector(), [j, j*2])
    def test_set_state_version_1(self):
        # The older state layout is a 3-tuple of (type, uid, vector bytes
        # written by ``numpy.save``); ``__setstate__`` must still accept it.
        expected_type = 'test-type'
        expected_uid = 'test-uid'
        expected_v = numpy.array([1, 2, 3])

        # Serialize the vector into an in-memory buffer.
        buf = BytesIO()
        # noinspection PyTypeChecker
        numpy.save(buf, expected_v)
        old_state = (expected_type, expected_uid, buf.getvalue())

        elem = DescriptorMemoryElement(None, None)
        elem.__setstate__(old_state)
        self.assertEqual(elem.type(), expected_type)
        self.assertEqual(elem.uuid(), expected_uid)
        numpy.testing.assert_array_equal(elem.vector(), expected_v)
示例#57
0
        def test_update_index(self):
            # Build with one descriptor, then update with a different one:
            # the count and the descriptor cache must cover both afterwards.
            first = DescriptorMemoryElement('test', 0)
            first.set_vector(numpy.zeros(8))
            second = DescriptorMemoryElement('test', 1)
            second.set_vector(numpy.ones(8))

            index = self._make_inst('euclidean')

            index.build_index([first])
            self.assertEqual(index.count(), 1)
            self.assertSetEqual(set(index._descr_cache), {first})

            index.update_index([second])
            self.assertEqual(index.count(), 2)
            self.assertSetEqual(set(index._descr_cache), {first, second})
    def test_pickle_dump_load(self):
        # A pickle round trip must preserve each element's vector.
        v1 = numpy.array([1, 2, 3])
        v2 = numpy.array([4, 5, 6])

        d1 = DescriptorMemoryElement('test', 0)
        d1.set_vector(v1)
        d2 = DescriptorMemoryElement('test', 1)
        d2.set_vector(v2)

        # Serialize both before loading either.
        d1_s = cPickle.dumps(d1)
        d2_s = cPickle.dumps(d2)

        # Attempt reconstitution
        d1_r = cPickle.loads(d1_s)
        d2_r = cPickle.loads(d2_s)

        numpy.testing.assert_array_equal(v1, d1_r.vector())
        numpy.testing.assert_array_equal(v2, d2_r.vector())
示例#59
0
    def _known_unit(self, hash_ftor, hash_idx, dist_method,
                    ftor_train_hook=lambda d: None):
        # Shared body: index the unit vector of every axis in an LSH index
        # using ``dist_method``, then check the distances for a zero-vector
        # query and for a query with an indexed element.
        #
        # ``ftor_train_hook`` is called once with the descriptor list before
        # the index is built.
        dim = 5
        test_descriptors = []
        for axis in range(dim):
            unit = np.zeros(dim, float)
            unit[axis] = 1.
            elem = DescriptorMemoryElement('unit', axis)
            elem.set_vector(unit)
            test_descriptors.append(elem)

        ftor_train_hook(test_descriptors)

        index = LSHNearestNeighborIndex(hash_ftor, MemoryDescriptorIndex(),
                                        MemoryKeyValueStore(),
                                        hash_index=hash_idx,
                                        distance_method=dist_method)
        index.build_index(test_descriptors)

        # Zero-vector query: no indexed descriptor intersects it, so every
        # distance should be 1.0 (the maximum distance by histogram
        # intersection). The order of the neighbors is irrelevant.
        zero_q = DescriptorMemoryElement('query', 0)
        zero_q.set_vector(np.zeros(dim, float))
        neighbors, dists = index.nn(zero_q, dim)
        for dist in dists:
            self.assertEqual(dist, 1.)

        # An indexed element must be its own nearest neighbor at distance
        # zero, both for k=1 and for k covering the whole index.
        member = test_descriptors[3]
        for k in (1, dim):
            neighbors, dists = index.nn(member, k)
            self.assertEqual(neighbors[0], member)
            self.assertEqual(dists[0], 0.)
示例#60
0
def train_classifier_iqr(config, iqr_state_fp):
    """
    Train a supervised classifier from the adjudications stored in an IQR
    state file.

    :param config: Configuration dictionary. Its ``'classifier'`` entry is
        the plugin configuration handed to ``from_plugin_config``.
    :param iqr_state_fp: Path to the IQR state file: a zip archive with
        exactly one member, a JSON object with exactly the two keys ``'pos'``
        and ``'neg'``, each holding a list of vectors.

    :raises RuntimeError: The configured classifier is not a
        ``SupervisedClassifier``, or the state file does not have the layout
        described above.
    """
    log = logging.getLogger(__name__)

    #: :type: smqtk.algorithms.SupervisedClassifier
    classifier = from_plugin_config(config['classifier'], get_classifier_impls)

    if not isinstance(classifier, SupervisedClassifier):
        raise RuntimeError("Configured classifier must be of the "
                           "SupervisedClassifier type in order to train.")

    # Get pos/neg descriptors out of iqr state zip.
    # A zip archive is binary data, so it is opened in 'rb' mode; the
    # context managers close both handles, also when validation raises.
    with open(iqr_state_fp, 'rb') as z_file:
        with zipfile.ZipFile(z_file) as z:
            members = z.namelist()
            if len(members) != 1:
                raise RuntimeError("Invalid IqrState file!")
            iqrs = json.loads(z.read(members[0]))
    if len(iqrs) != 2:
        raise RuntimeError("Invalid IqrState file!")
    if 'pos' not in iqrs or 'neg' not in iqrs:
        raise RuntimeError("Invalid IqrState file!")

    log.info("Loading pos/neg descriptors")
    #: :type: list[smqtk.representation.DescriptorElement]
    pos = []
    #: :type: list[smqtk.representation.DescriptorElement]
    neg = []
    # One running UID across both lists, so every element is distinct.
    i = 0
    for key, target in (('pos', pos), ('neg', neg)):
        # Duplicate vectors within a list collapse into one descriptor.
        for v in set(map(tuple, iqrs[key])):
            d = DescriptorMemoryElement('train', i)
            d.set_vector(numpy.array(v))
            target.append(d)
            i += 1
    log.info('    positive -> %d', len(pos))
    log.info('    negative -> %d', len(neg))

    classifier.train({'positive': pos}, negatives=neg)