Пример #1
0
    def test_classify_subset(self):
        ccol = ClassifierCollection({
            'subjectA': DummyClassifier(),
            'subjectB': DummyClassifier(),
        })

        classifierB = ccol._label_to_classifier['subjectB']
        classifierB.classify_one_element = mock.Mock()

        d_v = [0, 1, 2, 3, 4]
        d = DescriptorMemoryElement('memory', '0')
        d.set_vector(d_v)
        result = ccol.classify(d, labels=['subjectA'])

        # Should contain one entry for each requested classifier.
        self.assertEqual(len(result), 1)
        self.assertIn('subjectA', result)
        self.assertNotIn('subjectB', result)
        classifierB.classify_one_element.assert_not_called()
        # Each key should map to a classification element (memory in this case
        # because we're using the default factory)
        self.assertIsInstance(result['subjectA'], MemoryClassificationElement)
        # We know the dummy classifier outputs "classifications" in a
        # deterministic way: class label is descriptor UUID and classification
        # value is its vector as a list.
        self.assertDictEqual(result['subjectA'].get_classification(),
                             {'test': 0})
Пример #2
0
    def test_fit_short_descriptors_for_bit_length(self):
        # Should error when input descriptors have fewer dimensions than set bit
        # length for output hash codes (limitation of PCA method currently
        # used).
        fit_descriptors = []
        for i in range(3):
            d = DescriptorMemoryElement(six.b('test'), i)
            d.set_vector([-1 + i, -1 + i])
            fit_descriptors.append(d)

        itq = ItqFunctor(bit_length=8)
        self.assertRaisesRegex(
            ValueError,
            "Input descriptors have fewer features than requested bit encoding",
            itq.fit, fit_descriptors)
        self.assertIsNone(itq.mean_vec)
        self.assertIsNone(itq.rotation)

        # Should behave the same when input is an iterable
        self.assertRaisesRegex(
            ValueError,
            "Input descriptors have fewer features than requested bit encoding",
            itq.fit, iter(fit_descriptors))
        self.assertIsNone(itq.mean_vec)
        self.assertIsNone(itq.rotation)
Пример #3
0
    def test_classify_missing_label(self):
        ccol = ClassifierCollection({
            'subjectA': DummyClassifier(),
            'subjectB': DummyClassifier(),
        })

        d_v = [0, 1, 2, 3, 4]
        d = DescriptorMemoryElement('memory', '0')
        d.set_vector(d_v)

        # Should throw a MissingLabelError
        with self.assertRaises(MissingLabelError) as cm:
            ccol.classify(d, labels=['subjectC'])
        self.assertSetEqual(cm.exception.labels, {'subjectC'})

        # Should throw a MissingLabelError
        with self.assertRaises(MissingLabelError) as cm:
            ccol.classify(d, labels=['subjectA', 'subjectC'])
        self.assertSetEqual(cm.exception.labels, {'subjectC'})

        # Should throw a MissingLabelError
        with self.assertRaises(MissingLabelError) as cm:
            ccol.classify(d, labels=['subjectC', 'subjectD'])
        self.assertSetEqual(cm.exception.labels, {'subjectC', 'subjectD'})

        # Should throw a MissingLabelError
        with self.assertRaises(MissingLabelError) as cm:
            ccol.classify(d, labels=['subjectA', 'subjectC', 'subjectD'])
        self.assertSetEqual(cm.exception.labels, {'subjectC', 'subjectD'})
Пример #4
0
    def test_nn_known_descriptors_hik_unit(self):
        dim = 5

        ###
        # Unit vectors - Equal distance
        #
        index = self._make_inst('hik')
        test_descriptors = []
        for i in range(dim):
            v = numpy.zeros(dim, float)
            v[i] = 1.
            d = DescriptorMemoryElement('unit', i)
            d.set_vector(v)
            test_descriptors.append(d)
        index.build_index(test_descriptors)
        # query with zero vector
        # -> all modeled descriptors have no intersection, dists should be
        #    1.0, or maximum distance by histogram intersection.
        q = DescriptorMemoryElement('query', 0)
        q.set_vector(numpy.zeros(dim, float))
        r, dists = index.nn(q, dim)
        # All dists should be 1.0, r order doesn't matter
        for d in dists:
            self.assertEqual(d, 1.)

        # query with index element
        q = test_descriptors[3]
        r, dists = index.nn(q, 1)
        self.assertEqual(r[0], q)
        self.assertEqual(dists[0], 0.)

        r, dists = index.nn(q, dim)
        self.assertEqual(r[0], q)
        self.assertEqual(dists[0], 0.)
Пример #5
0
    def test_fit_with_cache(self):
        fit_descriptors = []
        for i in range(5):
            d = DescriptorMemoryElement(six.b('test'), i)
            d.set_vector([-2. + i, -2. + i])
            fit_descriptors.append(d)

        itq = ItqFunctor(DataMemoryElement(),
                         DataMemoryElement(),
                         bit_length=1,
                         random_seed=0)
        itq.fit(fit_descriptors)

        # TODO: Explanation as to why this is the expected result.
        numpy.testing.assert_array_almost_equal(itq.mean_vec, [0, 0])
        numpy.testing.assert_array_almost_equal(itq.rotation,
                                                [[1 / sqrt(2)], [1 / sqrt(2)]])
        self.assertIsNotNone(itq.mean_vec_cache_elem)
        numpy.testing.assert_array_almost_equal(
            numpy.load(six.BytesIO(itq.mean_vec_cache_elem.get_bytes())),
            [0, 0])

        self.assertIsNotNone(itq.rotation_cache_elem)
        numpy.testing.assert_array_almost_equal(
            numpy.load(six.BytesIO(itq.rotation_cache_elem.get_bytes())),
            [[1 / sqrt(2)], [1 / sqrt(2)]])
Пример #6
0
    def test_pickle_dump_load(self):
        # Wipe current cache
        DescriptorMemoryElement.MEMORY_CACHE = {}

        # Make a couple descriptors
        v1 = numpy.array([1, 2, 3])
        d1 = DescriptorMemoryElement('test', 0)
        d1.set_vector(v1)

        v2 = numpy.array([4, 5, 6])
        d2 = DescriptorMemoryElement('test', 1)
        d2.set_vector(v2)

        ntools.assert_in(('test', 0), DescriptorMemoryElement.MEMORY_CACHE)
        ntools.assert_in(('test', 1), DescriptorMemoryElement.MEMORY_CACHE)

        d1_s = cPickle.dumps(d1)
        d2_s = cPickle.dumps(d2)

        # Wipe cache again
        DescriptorMemoryElement.MEMORY_CACHE = {}
        ntools.assert_not_in(('test', 0), DescriptorMemoryElement.MEMORY_CACHE)
        ntools.assert_not_in(('test', 1), DescriptorMemoryElement.MEMORY_CACHE)

        # Attempt reconstitution
        d1_r = cPickle.loads(d1_s)
        d2_r = cPickle.loads(d2_s)

        numpy.testing.assert_array_equal(v1, d1_r.vector())
        numpy.testing.assert_array_equal(v2, d2_r.vector())

        # Cache should now have those entries back in it
        ntools.assert_in(('test', 0), DescriptorMemoryElement.MEMORY_CACHE)
        ntools.assert_in(('test', 1), DescriptorMemoryElement.MEMORY_CACHE)
Пример #7
0
    def test_known_descriptors_euclidean_ordered(self):
        index = self._make_inst('euclidean')

        # make vectors to return in a known euclidean distance order
        i = 1000
        test_descriptors = []
        for j in xrange(i):
            d = DescriptorMemoryElement('ordered', j)
            d.set_vector(numpy.array([j, j*2], float))
            test_descriptors.append(d)
        random.shuffle(test_descriptors)
        index.build_index(test_descriptors)

        # Since descriptors were build in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in index order.
        q = DescriptorMemoryElement('query', i)
        q.set_vector(numpy.array([0, 0], float))
        # top result should have UUID == 0 (nearest to query)
        r, dists = index.nn(q, 5)
        ntools.assert_equal(r[0].uuid(), 0)
        ntools.assert_equal(r[1].uuid(), 1)
        ntools.assert_equal(r[2].uuid(), 2)
        ntools.assert_equal(r[3].uuid(), 3)
        ntools.assert_equal(r[4].uuid(), 4)
        # global search should be in complete order
        r, dists = index.nn(q, i)
        for j, d, dist in zip(range(i), r, dists):
            ntools.assert_equal(d.uuid(), j)
Пример #8
0
    def _known_ordered_euclidean(self, hash_ftor, hash_idx,
                                 ftor_train_hook=lambda d: None):
        # make vectors to return in a known euclidean distance order
        i = 1000
        test_descriptors = []
        for j in range(i):
            d = DescriptorMemoryElement('ordered', j)
            d.set_vector(np.array([j, j*2], float))
            test_descriptors.append(d)
        random.shuffle(test_descriptors)

        ftor_train_hook(test_descriptors)

        di = MemoryDescriptorIndex()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor, di, kvstore,
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(test_descriptors)

        # Since descriptors were built in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in index order.
        q = DescriptorMemoryElement('query', i)
        q.set_vector(np.array([0, 0], float))
        # top result should have UUID == 0 (nearest to query)
        r, dists = index.nn(q, 5)
        self.assertEqual(r[0].uuid(), 0)
        self.assertEqual(r[1].uuid(), 1)
        self.assertEqual(r[2].uuid(), 2)
        self.assertEqual(r[3].uuid(), 3)
        self.assertEqual(r[4].uuid(), 4)
        # global search should be in complete order
        r, dists = index.nn(q, i)
        for j, d, dist in zip(range(i), r, dists):
            self.assertEqual(d.uuid(), j)
Пример #9
0
    def test_normal_conditions(self, mock_dsi_count):
        index = DummySI()
        mock_dsi_count.return_value = 1

        q = DescriptorMemoryElement('q', 0)
        q.set_vector(numpy.random.rand(4))
        index.nn(q)
Пример #10
0
    def test_build_index_with_cache(self):
        # Empty memory data elements for storage
        empty_data = 'base64://'
        f = FlannNearestNeighborsIndex(empty_data, empty_data, empty_data)
        # Internal elements should initialize have zero-length byte values
        self.assertEqual(len(f._index_elem.get_bytes()), 0)
        self.assertEqual(len(f._index_param_elem.get_bytes()), 0)
        self.assertEqual(len(f._descr_cache_elem.get_bytes()), 0)

        # Make unit vectors, one for each feature dimension.
        dim = 8
        test_descriptors = []
        for i in range(dim):
            v = numpy.zeros(dim, float)
            v[i] = 1.
            d = DescriptorMemoryElement('unit', i)
            d.set_vector(v)
            test_descriptors.append(d)

        f.build_index(test_descriptors)

        # Internal elements should not have non-zero byte values.
        self.assertGreater(len(f._index_elem.get_bytes()), 0)
        self.assertGreater(len(f._index_param_elem.get_bytes()), 0)
        self.assertGreater(len(f._descr_cache_elem.get_bytes()), 0)
Пример #11
0
    def test_normal_conditions(self, mock_dsi_count):
        index = DummySI()
        mock_dsi_count.return_value = 1

        q = DescriptorMemoryElement('q', 0)
        q.set_vector(numpy.random.rand(4))
        index.nn(q)
Пример #12
0
    def test_nn_known_descriptors_euclidean_unit(self):
        dim = 5

        ###
        # Unit vectors -- Equal distance
        #
        index = self._make_inst()
        test_descriptors = []
        for i in range(dim):
            v = np.zeros(dim, float)
            v[i] = 1.
            d = DescriptorMemoryElement('unit', i)
            d.set_vector(v)
            test_descriptors.append(d)
        index.build_index(test_descriptors)
        # query descriptor -- zero vector
        # -> all modeled descriptors should be equally distant (unit
        # corners)
        q = DescriptorMemoryElement('query', 0)
        q.set_vector(np.zeros(dim, float))
        r, dists = index.nn(q, n=dim)
        self.assertEqual(len(dists), dim)
        # All dists should be 1.0, r order doesn't matter
        for d in dists:
            self.assertEqual(d, 1.)
Пример #13
0
    def test_nn_known_descriptors_euclidean_ordered(self):
        index = self._make_inst()

        # make vectors to return in a known euclidean distance order
        i = 100
        test_descriptors = []
        for j in range(i):
            d = DescriptorMemoryElement('ordered', j)
            d.set_vector(np.array([j, j * 2], float))
            test_descriptors.append(d)
        random.shuffle(test_descriptors)
        index.build_index(test_descriptors)

        # Since descriptors were build in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in index
        # order.
        q = DescriptorMemoryElement('query', 99)
        q.set_vector(np.array([0, 0], float))
        r, dists = index.nn(q, n=i)
        # Because the data is one-dimensional, all of the cells will have
        # the same points (any division will just correspond to a point on
        # the line), and a cell can't have more than half of the points
        self.assertEqual(len(dists), i // 2)
        for j, d, dist in zip(range(i), r, dists):
            self.assertEqual(d.uuid(), j)
            np.testing.assert_equal(d.vector(), [j, j * 2])
Пример #14
0
    def test_fit_with_cache(self):
        fit_descriptors = []
        for i in range(5):
            d = DescriptorMemoryElement(six.b('test'), i)
            d.set_vector([-2. + i, -2. + i])
            fit_descriptors.append(d)

        itq = ItqFunctor(DataMemoryElement(), DataMemoryElement(),
                         bit_length=1, random_seed=0)
        itq.fit(fit_descriptors)

        # TODO: Explanation as to why this is the expected result.
        numpy.testing.assert_array_almost_equal(itq.mean_vec, [0, 0])
        numpy.testing.assert_array_almost_equal(itq.rotation, [[1 / sqrt(2)],
                                                               [1 / sqrt(2)]])
        self.assertIsNotNone(itq.mean_vec_cache_elem)
        numpy.testing.assert_array_almost_equal(
            numpy.load(BytesIO(itq.mean_vec_cache_elem.get_bytes())),
            [0, 0]
        )

        self.assertIsNotNone(itq.rotation_cache_elem)
        numpy.testing.assert_array_almost_equal(
            numpy.load(BytesIO(itq.rotation_cache_elem.get_bytes())),
            [[1 / sqrt(2)],
             [1 / sqrt(2)]]
        )
Пример #15
0
        def test_build_index(self):
            # Empty memory data elements for storage
            empty_data = 'base64://'
            f = FlannNearestNeighborsIndex(empty_data, empty_data, empty_data)
            # Internal elements should initialize have zero-length byte values
            self.assertEqual(len(f._index_elem.get_bytes()), 0)
            self.assertEqual(len(f._index_param_elem.get_bytes()), 0)
            self.assertEqual(len(f._descr_cache_elem.get_bytes()), 0)

            # Make unit vectors, one for each feature
            dim = 8
            test_descriptors = []
            for i in range(dim):
                v = numpy.zeros(dim, float)
                v[i] = 1.
                d = DescriptorMemoryElement('unit', i)
                d.set_vector(v)
                test_descriptors.append(d)

            f.build_index(test_descriptors)

            # Internal elements should not have non-zero byte values.
            self.assertGreater(len(f._index_elem.get_bytes()), 0)
            self.assertGreater(len(f._index_param_elem.get_bytes()), 0)
            self.assertGreater(len(f._descr_cache_elem.get_bytes()), 0)
Пример #16
0
    def test_get_hash(self):
        fit_descriptors = []
        for i in range(5):
            d = DescriptorMemoryElement(six.b('test'), i)
            d.set_vector([-2. + i, -2. + i])
            fit_descriptors.append(d)

        # The following "rotation" matrix should cause any 2-feature descriptor
        # to the right of the line ``y = -x`` to be True, and to the left as
        # False. If on the line, should be True.
        itq = ItqFunctor(bit_length=1, random_seed=0)
        itq.mean_vec = numpy.array([0., 0.])
        itq.rotation = numpy.array([[1. / sqrt(2)],
                                    [1. / sqrt(2)]])

        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array([1, 1])), [True])
        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array([-1, -1])), [False])

        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array([-1, 1])), [True])
        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array([-1.001, 1])), [False])
        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array([-1, 1.001])), [True])

        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array([1, -1])), [True])
        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array([1, -1.001])), [False])
        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array([1.001, -1])), [True])
Пример #17
0
        def test_known_descriptors_hik_unit(self):
            dim = 5

            ###
            # Unit vectors - Equal distance
            #
            index = self._make_inst('hik')
            test_descriptors = []
            for i in xrange(dim):
                v = numpy.zeros(dim, float)
                v[i] = 1.
                d = DescriptorMemoryElement('unit', i)
                d.set_vector(v)
                test_descriptors.append(d)
            index.build_index(test_descriptors)
            # query with zero vector
            # -> all modeled descriptors have no intersection, dists should be 1.0,
            #    or maximum distance by histogram intersection
            q = DescriptorMemoryElement('query', 0)
            q.set_vector(numpy.zeros(dim, float))
            r, dists = index.nn(q, dim)
            # All dists should be 1.0, r order doesn't matter
            for d in dists:
                ntools.assert_equal(d, 1.)

            # query with index element
            q = test_descriptors[3]
            r, dists = index.nn(q, 1)
            ntools.assert_equal(r[0], q)
            ntools.assert_equal(dists[0], 0.)

            r, dists = index.nn(q, dim)
            ntools.assert_equal(r[0], q)
            ntools.assert_equal(dists[0], 0.)
Пример #18
0
    def test_fit_short_descriptors_for_bit_length(self):
        # Should error when input descriptors have fewer dimensions than set bit
        # length for output hash codes (limitation of PCA method currently
        # used).
        fit_descriptors = []
        for i in range(3):
            d = DescriptorMemoryElement(six.b('test'), i)
            d.set_vector([-1+i, -1+i])
            fit_descriptors.append(d)

        itq = ItqFunctor(bit_length=8)
        self.assertRaisesRegexp(
            ValueError,
            "Input descriptors have fewer features than requested bit encoding",
            itq.fit, fit_descriptors
        )
        self.assertIsNone(itq.mean_vec)
        self.assertIsNone(itq.rotation)

        # Should behave the same when input is an iterable
        self.assertRaisesRegexp(
            ValueError,
            "Input descriptors have fewer features than requested bit encoding",
            itq.fit, iter(fit_descriptors)
        )
        self.assertIsNone(itq.mean_vec)
        self.assertIsNone(itq.rotation)
Пример #19
0
    def test_nn_pathological_example(self):
        n = 10**4
        dim = 256
        depth = 10
        # L ~ n/2**depth = 10^4 / 2^10 ~ 10
        k = 200
        # 3k/L = 60
        num_trees = 60

        d_set = [DescriptorMemoryElement('test', i) for i in range(n)]
        # Put all descriptors on a line so that different trees get same
        # divisions.
        # noinspection PyTypeChecker
        [d.set_vector(np.full(dim, d.uuid(), dtype=np.float64)) for d in d_set]
        q = DescriptorMemoryElement('q', -1)
        q.set_vector(np.zeros((dim, )))

        di = MemoryDescriptorSet()
        mrpt = MRPTNearestNeighborsIndex(di,
                                         num_trees=num_trees,
                                         depth=depth,
                                         random_seed=0)
        mrpt.build_index(d_set)

        nbrs, dists = mrpt.nn(q, k)
        self.assertEqual(len(nbrs), len(dists))
        # We should get about 10 descriptors back instead of the requested
        # 200
        self.assertLess(len(nbrs), 20)
Пример #20
0
    def test_pickle_dump_load(self):
        # Wipe current cache
        DescriptorMemoryElement.MEMORY_CACHE = {}

        # Make a couple descriptors
        v1 = numpy.array([1, 2, 3])
        d1 = DescriptorMemoryElement('test', 0)
        d1.set_vector(v1)

        v2 = numpy.array([4, 5, 6])
        d2 = DescriptorMemoryElement('test', 1)
        d2.set_vector(v2)

        ntools.assert_in(('test', 0), DescriptorMemoryElement.MEMORY_CACHE)
        ntools.assert_in(('test', 1), DescriptorMemoryElement.MEMORY_CACHE)

        d1_s = cPickle.dumps(d1)
        d2_s = cPickle.dumps(d2)

        # Wipe cache again
        DescriptorMemoryElement.MEMORY_CACHE = {}
        ntools.assert_not_in(('test', 0), DescriptorMemoryElement.MEMORY_CACHE)
        ntools.assert_not_in(('test', 1), DescriptorMemoryElement.MEMORY_CACHE)

        # Attempt reconstitution
        d1_r = cPickle.loads(d1_s)
        d2_r = cPickle.loads(d2_s)

        numpy.testing.assert_array_equal(v1, d1_r.vector())
        numpy.testing.assert_array_equal(v2, d2_r.vector())

        # Cache should now have those entries back in it
        ntools.assert_in(('test', 0), DescriptorMemoryElement.MEMORY_CACHE)
        ntools.assert_in(('test', 1), DescriptorMemoryElement.MEMORY_CACHE)
Пример #21
0
    def test_known_descriptors_euclidean_ordered(self):
        index = self._make_inst('euclidean')

        # make vectors to return in a known euclidean distance order
        i = 1000
        test_descriptors = []
        for j in xrange(i):
            d = DescriptorMemoryElement('ordered', j)
            d.set_vector(numpy.array([j, j * 2], float))
            test_descriptors.append(d)
        random.shuffle(test_descriptors)
        index.build_index(test_descriptors)

        # Since descriptors were build in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in index order.
        q = DescriptorMemoryElement('query', i)
        q.set_vector(numpy.array([0, 0], float))
        # top result should have UUID == 0 (nearest to query)
        r, dists = index.nn(q, 5)
        ntools.assert_equal(r[0].uuid(), 0)
        ntools.assert_equal(r[1].uuid(), 1)
        ntools.assert_equal(r[2].uuid(), 2)
        ntools.assert_equal(r[3].uuid(), 3)
        ntools.assert_equal(r[4].uuid(), 4)
        # global search should be in complete order
        r, dists = index.nn(q, i)
        for j, d, dist in zip(range(i), r, dists):
            ntools.assert_equal(d.uuid(), j)
Пример #22
0
        def test_nn_known_descriptors_euclidean_unit(self):
            dim = 5

            ###
            # Unit vectors -- Equal distance
            #
            index = self._make_inst()
            test_descriptors = []
            for i in range(dim):
                v = np.zeros(dim, float)
                v[i] = 1.
                d = DescriptorMemoryElement('unit', i)
                d.set_vector(v)
                test_descriptors.append(d)
            index.build_index(test_descriptors)
            # query descriptor -- zero vector
            # -> all modeled descriptors should be equally distant (unit
            # corners)
            q = DescriptorMemoryElement('query', 0)
            q.set_vector(np.zeros(dim, float))
            r, dists = index.nn(q, n=dim)
            self.assertEqual(len(dists), dim)
            # All dists should be 1.0, r order doesn't matter
            for d in dists:
                self.assertEqual(d, 1.)
Пример #23
0
    def test_nn_small_leaves(self):
        np.random.seed(0)

        n = 10**4
        dim = 256
        depth = 10
        # L ~ n/2**depth = 10^4 / 2^10 ~ 10
        k = 200
        # 3k/L = 60
        num_trees = 60

        d_set = [DescriptorMemoryElement('test', i) for i in range(n)]
        [d.set_vector(np.random.rand(dim)) for d in d_set]
        q = DescriptorMemoryElement('q', -1)
        q.set_vector(np.zeros((dim, )))

        di = MemoryDescriptorSet()
        mrpt = MRPTNearestNeighborsIndex(di,
                                         num_trees=num_trees,
                                         depth=depth,
                                         random_seed=0)
        mrpt.build_index(d_set)

        nbrs, dists = mrpt.nn(q, k)
        self.assertEqual(len(nbrs), len(dists))
        self.assertEqual(len(nbrs), k)
Пример #24
0
    def test_classify(self):
        ccol = ClassifierCollection({
            'subjectA': DummyClassifier(),
            'subjectB': DummyClassifier(),
        })

        d_v = [0, 1, 2, 3, 4]
        d = DescriptorMemoryElement('memory', '0')
        d.set_vector(d_v)
        result = ccol.classify(d)

        # Should contain one entry for each configured classifier.
        self.assertEqual(len(result), 2)
        self.assertIn('subjectA', result)
        self.assertIn('subjectB', result)
        # Each key should map to a classification element (memory in this case
        # because we're using the default factory)
        self.assertIsInstance(result['subjectA'], MemoryClassificationElement)
        self.assertIsInstance(result['subjectB'], MemoryClassificationElement)
        # We know the dummy classifier outputs "classifications" in a
        # deterministic way: class label is "test" and classification
        # value is the index of the descriptor .
        self.assertDictEqual(result['subjectA'].get_classification(),
                             {'test': 0})
        self.assertDictEqual(result['subjectB'].get_classification(),
                             {'test': 0})
Пример #25
0
    def test_get_hash(self):
        fit_descriptors = []
        for i in range(5):
            d = DescriptorMemoryElement(six.b('test'), i)
            d.set_vector([-2. + i, -2. + i])
            fit_descriptors.append(d)

        # The following "rotation" matrix should cause any 2-feature descriptor
        # to the right of the line ``y = -x`` to be True, and to the left as
        # False. If on the line, should be True.
        itq = ItqFunctor(bit_length=1, random_seed=0)
        itq.mean_vec = numpy.array([0., 0.])
        itq.rotation = numpy.array([[1. / sqrt(2)], [1. / sqrt(2)]])

        numpy.testing.assert_array_equal(itq.get_hash(numpy.array([1, 1])),
                                         [True])
        numpy.testing.assert_array_equal(itq.get_hash(numpy.array([-1, -1])),
                                         [False])

        numpy.testing.assert_array_equal(itq.get_hash(numpy.array([-1, 1])),
                                         [True])
        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array([-1.001, 1])), [False])
        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array([-1, 1.001])), [True])

        numpy.testing.assert_array_equal(itq.get_hash(numpy.array([1, -1])),
                                         [True])
        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array([1, -1.001])), [False])
        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array([1.001, -1])), [True])
Пример #26
0
    def _known_ordered_euclidean(self, hash_ftor, hash_idx,
                                 ftor_train_hook=lambda d: None):
        # make vectors to return in a known euclidean distance order
        i = 1000
        test_descriptors = []
        for j in range(i):
            d = DescriptorMemoryElement('ordered', j)
            d.set_vector(np.array([j, j*2], float))
            test_descriptors.append(d)
        random.shuffle(test_descriptors)

        ftor_train_hook(test_descriptors)

        di = MemoryDescriptorIndex()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor, di, kvstore,
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(test_descriptors)

        # Since descriptors were built in increasing distance from (0,0),
        # returned descriptors for a query of [0,0] should be in index order.
        q = DescriptorMemoryElement('query', i)
        q.set_vector(np.array([0, 0], float))
        # top result should have UUID == 0 (nearest to query)
        r, dists = index.nn(q, 5)
        self.assertEqual(r[0].uuid(), 0)
        self.assertEqual(r[1].uuid(), 1)
        self.assertEqual(r[2].uuid(), 2)
        self.assertEqual(r[3].uuid(), 3)
        self.assertEqual(r[4].uuid(), 4)
        # global search should be in complete order
        r, dists = index.nn(q, i)
        for j, d, dist in zip(range(i), r, dists):
            self.assertEqual(d.uuid(), j)
Пример #27
0
 def create_de(v: np.ndarray) -> DescriptorElement:
     nonlocal i
     # Hopefully type_str doesn't matter
     de = DescriptorMemoryElement('', i)
     de.set_vector(v)
     i += 1
     return de
Пример #28
0
 def test_build_index_one(self):
     d = DescriptorMemoryElement('test', 0)
     d.set_vector(numpy.zeros(8, float))
     index = self._make_inst('euclidean')
     index.build_index([d])
     self.assertListEqual(index._descr_cache, [d])
     self.assertIsNotNone(index._flann)
     self.assertIsInstance(index._flann_build_params, dict)
Пример #29
0
    def test_classify(self):
        d = DescriptorMemoryElement('test', 0)
        d.set_vector([1, 2, 3])

        c = DummyClassifier()
        e = c.classify(d)
        nose.tools.assert_equal(e.get_classification(), {0: [1, 2, 3]})
        nose.tools.assert_equal(e.uuid, d.uuid())
Пример #30
0
    def test_nn_empty_index(self):
        # nn should fail if index size is 0
        index = DummySI()
        index.count = mock.MagicMock(return_value=0)
        index._nn = mock.MagicMock()

        q = DescriptorMemoryElement('q', 0)
        q.set_vector(numpy.random.rand(4))
        self.assertRaises(ValueError, index.nn, q)
Пример #31
0
    def test_read_only(self):
        v = np.zeros(5, float)
        v[0] = 1.
        d = DescriptorMemoryElement('unit', 0)
        d.set_vector(v)
        test_descriptors = [d]

        index = self._make_inst(read_only=True)
        self.assertRaises(ReadOnlyError, index.build_index, test_descriptors)
Пример #32
0
    def test_nn_normal_conditions(self):
        index = DummySI()
        # Need to force a non-zero index size for knn to be performed.
        index.count = mock.MagicMock()
        index.count.return_value = 1

        q = DescriptorMemoryElement('q', 0)
        q.set_vector(numpy.random.rand(4))
        # Basically this shouldn't crash
        index.nn(q)
Пример #33
0
 def test_build_index_one(self):
     d = DescriptorMemoryElement('test', 0)
     d.set_vector(numpy.zeros(8, float))
     index = self._make_inst('euclidean')
     index.build_index([d])
     self.assertListEqual(
         index._descr_cache,
         [d]
     )
     self.assertIsNotNone(index._flann)
     self.assertIsInstance(index._flann_build_params, dict)
Пример #34
0
    def test_classify_invalid_descriptor_dimensions(self):
        c = IndexLabelClassifier(self.FILEPATH_TEST_LABELS)
        d = DescriptorMemoryElement('test', 0)

        # One less
        d.set_vector([1, 2, 3, 4, 5])
        self.assertRaises(RuntimeError, c._classify, d)

        # One more
        d.set_vector([1, 2, 3, 4, 5, 6, 7])
        self.assertRaises(RuntimeError, c._classify, d)
Пример #35
0
    def test_none_set(self):
        d = DescriptorMemoryElement('test', 0)
        self.assertFalse(d.has_vector())

        d.set_vector(numpy.ones(16))
        self.assertTrue(d.has_vector())
        numpy.testing.assert_equal(d.vector(), numpy.ones(16))

        d.set_vector(None)
        self.assertFalse(d.has_vector())
        self.assertIs(d.vector(), None)
Пример #36
0
    def test_none_set(self):
        d = DescriptorMemoryElement('test', 0)
        self.assertFalse(d.has_vector())

        d.set_vector(numpy.ones(16))
        self.assertTrue(d.has_vector())
        numpy.testing.assert_equal(d.vector(), numpy.ones(16))

        d.set_vector(None)
        self.assertFalse(d.has_vector())
        self.assertIs(d.vector(), None)
Пример #37
0
 def test_output_immutability(self):
     # make sure that data stored is not susceptible to modifications after
     # extraction
     v = numpy.ones(16)
     d = DescriptorMemoryElement('test', 0)
     ntools.assert_false(d.has_vector())
     d.set_vector(v)
     r = d.vector()
     r[:] = 0
     ntools.assert_equal(r.sum(), 0)
     ntools.assert_equal(d.vector().sum(), 16)
Пример #38
0
    def test_none_set(self):
        d = DescriptorMemoryElement('test', 0)
        ntools.assert_false(d.has_vector())

        d.set_vector(numpy.ones(16))
        ntools.assert_true(d.has_vector())
        numpy.testing.assert_equal(d.vector(), numpy.ones(16))

        d.set_vector(None)
        ntools.assert_false(d.has_vector())
        ntools.assert_is(d.vector(), None)
Пример #39
0
 def test_output_immutability(self):
     # make sure that data stored is not susceptible to modifications after
     # extraction
     v = numpy.ones(16)
     d = DescriptorMemoryElement('test', 0)
     ntools.assert_false(d.has_vector())
     d.set_vector(v)
     r = d.vector()
     r[:] = 0
     ntools.assert_equal(r.sum(), 0)
     ntools.assert_equal(d.vector().sum(), 16)
Пример #40
0
    def test_none_set(self):
        d = DescriptorMemoryElement('test', 0)
        ntools.assert_false(d.has_vector())

        d.set_vector(numpy.ones(16))
        ntools.assert_true(d.has_vector())
        numpy.testing.assert_equal(d.vector(), numpy.ones(16))

        d.set_vector(None)
        ntools.assert_false(d.has_vector())
        ntools.assert_is(d.vector(), None)
Пример #41
0
        def test_build_index_read_only(self):
            v = np.zeros(5, float)
            v[0] = 1.
            d = DescriptorMemoryElement('unit', 0)
            d.set_vector(v)
            test_descriptors = [d]

            index = self._make_inst(read_only=True)
            self.assertRaises(
                ReadOnlyError,
                index.build_index, test_descriptors
            )
Пример #42
0
    def test_classify_no_overwrite(self, m_ce_hc):
        # Testing logic when classifier element for descriptor already has
        # stored results and we are NOT overwriting.
        m_ce_hc.return_value = True

        c = DummyClassifier()
        # Record if underlying classify call invoked
        c._classify = mock.MagicMock(side_effect=c._classify)

        d = DescriptorMemoryElement('test', 0)
        d.set_vector([1, 2, 3])

        c.classify(d, overwrite=False)
        c._classify.assert_not_called()
Пример #43
0
    def test_classify_with_overwrite(self, m_ce_hc):
        # Testing logic when classification element for descriptor already has
        # stored results but we call WITH overwrite on.
        m_ce_hc.return_value = True

        c = DummyClassifier()
        # Record if underlying classify call invoked
        c._classify = mock.MagicMock(side_effect=c._classify)

        d = DescriptorMemoryElement('test', 0)
        d.set_vector([1, 2, 3])

        c.classify(d, overwrite=True)
        c._classify.assert_called_once_with(d)
Пример #44
0
        def test_update_index(self):
            # Build index with one descriptor, then "update" with a second
            # different descriptor checking that the new cache contains both.
            d1 = DescriptorMemoryElement('test', 0)
            d1.set_vector(numpy.zeros(8))
            d2 = DescriptorMemoryElement('test', 1)
            d2.set_vector(numpy.ones(8))

            index = self._make_inst('euclidean')
            index.build_index([d1])
            self.assertEqual(index.count(), 1)
            self.assertSetEqual(set(index._descr_cache), {d1})

            index.update_index([d2])
            self.assertEqual(index.count(), 2)
            self.assertSetEqual(set(index._descr_cache), {d1, d2})
Пример #45
0
    def test_classify(self):
        c = IndexLabelClassifier(self.FILEPATH_TEST_LABELS)
        m_expected = {
            six.b('label_1'): 1,
            six.b('label_2'): 2,
            six.b('negative'): 3,
            six.b('label_3'): 4,
            six.b('Kitware'): 5,
            six.b('label_4'): 6,
        }

        d = DescriptorMemoryElement('test', 0)
        d.set_vector([1, 2, 3, 4, 5, 6])

        m = c._classify(d)
        self.assertEqual(m, m_expected)
Пример #46
0
    def test_update_index(self):
        # Build index with one descriptor, then "update" with a second
        # different descriptor checking that the new cache contains both.
        d1 = DescriptorMemoryElement('test', 0)
        d1.set_vector(numpy.zeros(8))
        d2 = DescriptorMemoryElement('test', 1)
        d2.set_vector(numpy.ones(8))

        index = self._make_inst('euclidean')
        index.build_index([d1])
        self.assertEqual(index.count(), 1)
        self.assertSetEqual(set(index._descr_cache), {d1})

        index.update_index([d2])
        self.assertEqual(index.count(), 2)
        self.assertSetEqual(set(index._descr_cache), {d1, d2})
Пример #47
0
    def test_classify(self):
        c = IndexLabelClassifier(self.FILEPATH_TEST_LABELS)
        m_expected = {
            six.b('label_1'): 1,
            six.b('label_2'): 2,
            six.b('negative'): 3,
            six.b('label_3'): 4,
            six.b('Kitware'): 5,
            six.b('label_4'): 6,
        }

        d = DescriptorMemoryElement('test', 0)
        d.set_vector([1, 2, 3, 4, 5, 6])

        m = c._classify(d)
        self.assertEqual(m, m_expected)
Пример #48
0
    def test_clustering_equal_descriptors(self):
        # Test that clusters of descriptor of size  n-features are correctly
        # clustered together.
        print("Creating dummy descriptors")
        n_features = 8
        n_descriptors = 20

        index = MemoryDescriptorIndex()
        c = 0
        for i in range(n_features):
            v = numpy.ndarray((8,))
            v[...] = 0
            v[i] = 1
            for j in range(n_descriptors):
                d = DescriptorMemoryElement('test', c)
                d.set_vector(v)
                index.add_descriptor(d)
                c += 1

        print("Creating test MBKM")
        mbkm = MiniBatchKMeans(n_features, batch_size=12, verbose=True,
                               compute_labels=False, random_state=0)

        # Initial fit with half of index
        d_classes = mb_kmeans_build_apply(index, mbkm, n_descriptors)

        # There should be 20 descriptors per class
        for c in d_classes:
            self.assertEqual(
                len(d_classes[c]),
                n_descriptors,
                "Cluster %s did not have expected number of descriptors "
                "(%d != %d)"
                % (c, n_descriptors, len(d_classes[c]))
            )

            # Each descriptor in each cluster should be equal to the other
            # descriptors in that cluster
            uuids = list(d_classes[c])
            v = index[uuids[0]].vector()
            for uuid in uuids[1:]:
                v2 = index[uuid].vector()
                numpy.testing.assert_array_equal(v, v2,
                                                 "vector in cluster %d did not "
                                                 "match other vectors "
                                                 "(%s != %s)"
                                                 % (c, v, v2))
Пример #49
0
    def test_classify_invalid_descriptor_dimensions(self):
        c = IndexLabelClassifier(self.FILEPATH_TEST_LABELS)
        d = DescriptorMemoryElement('test', 0)

        # One less
        d.set_vector([1, 2, 3, 4, 5])
        self.assertRaises(
            RuntimeError,
            c._classify, d
        )

        # One more
        d.set_vector([1, 2, 3, 4, 5, 6, 7])
        self.assertRaises(
            RuntimeError,
            c._classify, d
        )
Пример #50
0
        def test_nn_many_descriptors(self):
            np.random.seed(0)

            n = 10 ** 4
            dim = 256

            d_index = [DescriptorMemoryElement('test', i) for i in range(n)]
            [d.set_vector(np.random.rand(dim)) for d in d_index]
            q = DescriptorMemoryElement('q', -1)
            q.set_vector(np.zeros((dim,)))

            faiss_index = self._make_inst()
            faiss_index.build_index(d_index)

            nbrs, dists = faiss_index.nn(q, 10)
            self.assertEqual(len(nbrs), len(dists))
            self.assertEqual(len(nbrs), 10)
Пример #51
0
        def test_nn_preprocess_index(self):
            faiss_index = self._make_inst(factory_string='PCAR64,IVF1,Flat')
            self.assertEqual(faiss_index.factory_string, 'PCAR64,IVF1,Flat')

            np.random.seed(self.RAND_SEED)
            n = 10 ** 4
            dim = 256

            d_index = [DescriptorMemoryElement('test', i) for i in range(n)]
            [d.set_vector(np.random.rand(dim)) for d in d_index]
            q = DescriptorMemoryElement('q', -1)
            q.set_vector(np.zeros((dim,)))

            faiss_index.build_index(d_index)

            nbrs, dists = faiss_index.nn(q, 10)
            self.assertEqual(len(nbrs), len(dists))
            self.assertEqual(len(nbrs), 10)
Пример #52
0
    def test_pickle_dump_load(self):
        # Make a couple descriptors
        v1 = numpy.array([1, 2, 3])
        d1 = DescriptorMemoryElement('test', 0)
        d1.set_vector(v1)

        v2 = numpy.array([4, 5, 6])
        d2 = DescriptorMemoryElement('test', 1)
        d2.set_vector(v2)

        d1_s = cPickle.dumps(d1)
        d2_s = cPickle.dumps(d2)

        # Attempt reconstitution
        d1_r = cPickle.loads(d1_s)
        d2_r = cPickle.loads(d2_s)

        numpy.testing.assert_array_equal(v1, d1_r.vector())
        numpy.testing.assert_array_equal(v2, d2_r.vector())
Пример #53
0
    def _known_unit(self, hash_ftor, hash_idx, dist_method,
                    ftor_train_hook=lambda d: None):
        ###
        # Unit vectors - Equal distance
        #
        dim = 5
        test_descriptors = []
        for i in range(dim):
            v = np.zeros(dim, float)
            v[i] = 1.
            d = DescriptorMemoryElement('unit', i)
            d.set_vector(v)
            test_descriptors.append(d)

        ftor_train_hook(test_descriptors)

        di = MemoryDescriptorIndex()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor, di, kvstore,
                                        hash_index=hash_idx,
                                        distance_method=dist_method)
        index.build_index(test_descriptors)

        # query with zero vector
        # -> all modeled descriptors have no intersection, dists should be 1.0,
        #    or maximum distance by histogram intersection
        q = DescriptorMemoryElement('query', 0)
        q.set_vector(np.zeros(dim, float))
        r, dists = index.nn(q, dim)
        # All dists should be 1.0, r order doesn't matter
        for d in dists:
            self.assertEqual(d, 1.)

        # query with index element
        q = test_descriptors[3]
        r, dists = index.nn(q, 1)
        self.assertEqual(r[0], q)
        self.assertEqual(dists[0], 0.)

        r, dists = index.nn(q, dim)
        self.assertEqual(r[0], q)
        self.assertEqual(dists[0], 0.)
Пример #54
0
    def test_input_immutability(self):
        # make sure that data stored is not susceptible to shifts in the
        # originating data matrix they were pulled from.

        #
        # Testing this with a single vector
        #
        v = numpy.random.rand(16)
        t = tuple(v.copy())
        d = DescriptorMemoryElement('test', 0)
        d.set_vector(v)
        v[:] = 0
        ntools.assert_true((v == 0).all())
        ntools.assert_false(sum(t) == 0.)
        numpy.testing.assert_equal(d.vector(), t)

        #
        # Testing with matrix
        #
        m = numpy.random.rand(20, 16)

        v1 = m[3]
        v2 = m[15]
        v3 = m[19]

        # Save truth values of arrays as immutable tuples (copies)
        t1 = tuple(v1.copy())
        t2 = tuple(v2.copy())
        t3 = tuple(v3.copy())

        d1 = DescriptorMemoryElement('test', 1)
        d1.set_vector(v1)
        d2 = DescriptorMemoryElement('test', 2)
        d2.set_vector(v2)
        d3 = DescriptorMemoryElement('test', 3)
        d3.set_vector(v3)

        numpy.testing.assert_equal(v1, d1.vector())
        numpy.testing.assert_equal(v2, d2.vector())
        numpy.testing.assert_equal(v3, d3.vector())

        # Changing the source should not change stored vectors
        m[:, :] = 0.
        ntools.assert_true((v1 == 0).all())
        ntools.assert_true((v2 == 0).all())
        ntools.assert_true((v3 == 0).all())
        ntools.assert_false(sum(t1) == 0.)
        ntools.assert_false(sum(t2) == 0.)
        ntools.assert_false(sum(t3) == 0.)
        numpy.testing.assert_equal(d1.vector(), t1)
        numpy.testing.assert_equal(d2.vector(), t2)
        numpy.testing.assert_equal(d3.vector(), t3)
Пример #55
0
        def test_known_descriptors_euclidean_ordered(self):
            index = self._make_inst('euclidean')

            # make vectors to return in a known euclidean distance order
            i = 10
            test_descriptors = []
            for j in xrange(i):
                d = DescriptorMemoryElement('ordered', j)
                d.set_vector(numpy.array([j, j*2], float))
                test_descriptors.append(d)
            random.shuffle(test_descriptors)
            index.build_index(test_descriptors)

            # Since descriptors were build in increasing distance from (0,0),
            # returned descriptors for a query of [0,0] should be in index order.
            q = DescriptorMemoryElement('query', 99)
            q.set_vector(numpy.array([0, 0], float))
            r, dists = index.nn(q, i)
            for j, d, dist in zip(range(i), r, dists):
                ntools.assert_equal(d.uuid(), j)
Пример #56
0
def train_classifier_iqr(config, iqr_state_fp):
    log = logging.getLogger(__name__)

    #: :type: smqtk.algorithms.SupervisedClassifier
    classifier = from_plugin_config(config['classifier'], get_classifier_impls)

    if not isinstance(classifier, SupervisedClassifier):
        raise RuntimeError("Configured classifier must be of the "
                           "SupervisedClassifier type in order to train.")

    # Get pos/neg descriptors out of iqr state zip
    z_file = open(iqr_state_fp, 'r')
    z = zipfile.ZipFile(z_file)
    if len(z.namelist()) != 1:
        raise RuntimeError("Invalid IqrState file!")
    iqrs = json.loads(z.read(z.namelist()[0]))
    if len(iqrs) != 2:
        raise RuntimeError("Invalid IqrState file!")
    if 'pos' not in iqrs or 'neg' not in iqrs:
        raise RuntimeError("Invalid IqrState file!")

    log.info("Loading pos/neg descriptors")
    #: :type: list[smqtk.representation.DescriptorElement]
    pos = []
    #: :type: list[smqtk.representation.DescriptorElement]
    neg = []
    i = 0
    for v in set(map(tuple, iqrs['pos'])):
        d = DescriptorMemoryElement('train', i)
        d.set_vector(numpy.array(v))
        pos.append(d)
        i += 1
    for v in set(map(tuple, iqrs['neg'])):
        d = DescriptorMemoryElement('train', i)
        d.set_vector(numpy.array(v))
        neg.append(d)
        i += 1
    log.info('    positive -> %d', len(pos))
    log.info('    negative -> %d', len(neg))

    classifier.train({'positive': pos}, negatives=neg)
Пример #57
0
    def _known_unit(self, hash_ftor, hash_idx, dist_method, ftor_train_hook=lambda d: None):
        ###
        # Unit vectors - Equal distance
        #
        dim = 5
        test_descriptors = []
        for i in xrange(dim):
            v = numpy.zeros(dim, float)
            v[i] = 1.0
            d = DescriptorMemoryElement("unit", i)
            d.set_vector(v)
            test_descriptors.append(d)

        ftor_train_hook(test_descriptors)

        di = MemoryDescriptorIndex()
        index = LSHNearestNeighborIndex(hash_ftor, di, hash_idx, distance_method=dist_method)
        index.build_index(test_descriptors)

        # query with zero vector
        # -> all modeled descriptors have no intersection, dists should be 1.0,
        #    or maximum distance by histogram intersection
        q = DescriptorMemoryElement("query", 0)
        q.set_vector(numpy.zeros(dim, float))
        r, dists = index.nn(q, dim)
        # All dists should be 1.0, r order doesn't matter
        for d in dists:
            ntools.assert_equal(d, 1.0)

        # query with index element
        q = test_descriptors[3]
        r, dists = index.nn(q, 1)
        ntools.assert_equal(r[0], q)
        ntools.assert_equal(dists[0], 0.0)

        r, dists = index.nn(q, dim)
        ntools.assert_equal(r[0], q)
        ntools.assert_equal(dists[0], 0.0)

        DescriptorMemoryElement.MEMORY_CACHE = {}
Пример #58
0
        def test_nn_known_descriptors_nearest(self):
            dim = 5

            ###
            # Unit vectors -- Equal distance
            #
            index = self._make_inst()
            test_descriptors = []
            vectors = np.eye(dim, dtype=np.float32)
            for i in range(dim):
                d = DescriptorMemoryElement('unit', i)
                d.set_vector(vectors[i])
                test_descriptors.append(d)
            index.build_index(test_descriptors)
            # query descriptor -- first point
            q = DescriptorMemoryElement('query', 0)
            q.set_vector(vectors[0])
            r, dists = index.nn(q)
            self.assertEqual(len(dists), 1)
            # Distance should be zero
            self.assertEqual(dists[0], 0.)
            self.assertItemsEqual(r[0].vector(), vectors[0])
Пример #59
0
    def _random_euclidean(self, hash_ftor, hash_idx,
                          ftor_train_hook=lambda d: None):
        # :param hash_ftor: Hash function class for generating hash codes for
        #   descriptors.
        # :param hash_idx: Hash index instance to use in local LSH algo
        #   instance.
        # :param ftor_train_hook: Function for training functor if necessary.

        # make random descriptors
        i = 1000
        dim = 256
        td = []
        np.random.seed(self.RANDOM_SEED)
        for j in range(i):
            d = DescriptorMemoryElement('random', j)
            d.set_vector(np.random.rand(dim))
            td.append(d)

        ftor_train_hook(td)

        di = MemoryDescriptorIndex()
        kvstore = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(hash_ftor, di, kvstore,
                                        hash_index=hash_idx,
                                        distance_method='euclidean')
        index.build_index(td)

        # test query from build set -- should return same descriptor when k=1
        q = td[255]
        r, dists = index.nn(q, 1)
        self.assertEqual(r[0], q)

        # test query very near a build vector
        td_q = td[0]
        q = DescriptorMemoryElement('query', i)
        v = td_q.vector().copy()
        v_min = max(v.min(), 0.1)
        v[0] += v_min
        v[dim-1] -= v_min
        q.set_vector(v)
        r, dists = index.nn(q, 1)
        self.assertFalse(np.array_equal(q.vector(), td_q.vector()))
        self.assertEqual(r[0], td_q)

        # random query
        q = DescriptorMemoryElement('query', i+1)
        q.set_vector(np.random.rand(dim))

        # for any query of size k, results should at least be in distance order
        r, dists = index.nn(q, 10)
        for j in range(1, len(dists)):
            self.assertGreater(dists[j], dists[j-1])
        r, dists = index.nn(q, i)
        for j in range(1, len(dists)):
            self.assertGreater(dists[j], dists[j-1])
Пример #60
0
    def _random_euclidean(self, hash_ftor, hash_idx, ftor_train_hook=lambda d: None):
        # make random descriptors
        i = 1000
        dim = 256
        td = []
        numpy.random.seed(self.RANDOM_SEED)
        for j in xrange(i):
            d = DescriptorMemoryElement("random", j)
            d.set_vector(numpy.random.rand(dim))
            td.append(d)

        ftor_train_hook(td)

        di = MemoryDescriptorIndex()
        index = LSHNearestNeighborIndex(hash_ftor, di, hash_idx, distance_method="euclidean")
        index.build_index(td)

        # test query from build set -- should return same descriptor when k=1
        q = td[255]
        r, dists = index.nn(q, 1)
        ntools.assert_equal(r[0], q)

        # test query very near a build vector
        td_q = td[0]
        q = DescriptorMemoryElement("query", i)
        v = td_q.vector().copy()
        v_min = max(v.min(), 0.1)
        v[0] += v_min
        v[dim - 1] -= v_min
        q.set_vector(v)
        r, dists = index.nn(q, 1)
        ntools.assert_false(numpy.array_equal(q.vector(), td_q.vector()))
        ntools.assert_equal(r[0], td_q)

        # random query
        q = DescriptorMemoryElement("query", i + 1)
        q.set_vector(numpy.random.rand(dim))

        # for any query of size k, results should at least be in distance order
        r, dists = index.nn(q, 10)
        for j in xrange(1, len(dists)):
            ntools.assert_greater(dists[j], dists[j - 1])
        r, dists = index.nn(q, i)
        for j in xrange(1, len(dists)):
            ntools.assert_greater(dists[j], dists[j - 1])

        DescriptorMemoryElement.MEMORY_CACHE = {}