def test_adjudication_switch(self):
    """
    Test that adjudicating descriptors with the opposite label, on top of
    an existing state, flips them between the positive and negative sets
    (what was positive becomes negative and vice versa).
    """
    pos_a, pos_b, pos_c, neg_a, neg_b = [
        DescriptorMemoryElement('', uid).set_vector([uid])
        for uid in range(5)
    ]
    session = self.iqrs

    # Seed the session with three positives and two negatives.
    session.positive_descriptors = {pos_a, pos_b, pos_c}
    session.negative_descriptors = {neg_a, neg_b}

    # Flip single descriptors, one adjudication call at a time.
    session.adjudicate(new_positives=[neg_a])
    assert session.positive_descriptors == {pos_a, pos_b, pos_c, neg_a}
    assert session.negative_descriptors == {neg_b}

    session.adjudicate(new_negatives=[pos_b])
    assert session.positive_descriptors == {pos_a, pos_c, neg_a}
    assert session.negative_descriptors == {neg_b, pos_b}

    # Flip everything that is left in a single call.
    session.adjudicate(new_positives=[neg_b], new_negatives=[pos_a, pos_c])
    assert session.positive_descriptors == {neg_a, neg_b}
    assert session.negative_descriptors == {pos_a, pos_b, pos_c}
def test_known_descriptors_euclidean_ordered(self):
    # Index points of the form (j, 2j), which sit at increasing euclidean
    # distance from the origin, and check that a query at the origin gets
    # them back in UID order.
    index = self._make_inst()

    n_descr = 100
    test_descriptors = []
    for uid in range(n_descr):
        elem = DescriptorMemoryElement('ordered', uid)
        elem.set_vector(np.array([uid, uid * 2], float))
        test_descriptors.append(elem)
    # Shuffle so that build order does not already match distance order.
    random.shuffle(test_descriptors)
    index.build_index(test_descriptors)

    query = DescriptorMemoryElement('query', 99)
    query.set_vector(np.array([0, 0], float))
    neighbors, dists = index.nn(query, n=n_descr)

    # All points lie on one line, so every cell ends up holding the same
    # points (any division is just a point on that line), and a cell cannot
    # hold more than half of the points -- hence only half come back.
    ntools.assert_equal(len(dists), n_descr // 2)
    for expected_uid, elem, _dist in zip(range(n_descr), neighbors, dists):
        ntools.assert_equal(elem.uuid(), expected_uid)
        np.testing.assert_equal(elem.vector(),
                                [expected_uid, expected_uid * 2])
def test_build_index_with_cache(self):
    # Build an index whose three storage slots start out as empty data
    # elements and check that building writes bytes into each of them.
    empty_uri = 'base64://'
    index = FlannNearestNeighborsIndex(empty_uri, empty_uri, empty_uri)
    storage_attrs = ('_index_elem', '_index_param_elem', '_descr_cache_elem')

    # Freshly constructed: every backing element holds zero bytes.
    for attr in storage_attrs:
        self.assertEqual(len(getattr(index, attr).get_bytes()), 0)

    # One unit vector per feature dimension.
    dim = 8
    test_descriptors = []
    for axis in range(dim):
        vec = numpy.zeros(dim, float)
        vec[axis] = 1.
        elem = DescriptorMemoryElement('unit', axis)
        elem.set_vector(vec)
        test_descriptors.append(elem)
    index.build_index(test_descriptors)

    # After the build every backing element should hold some bytes.
    for attr in storage_attrs:
        self.assertGreater(len(getattr(index, attr).get_bytes()), 0)
def test_normal_conditions(self, mock_dsi_count):
    # Smoke test: with the injected count mock reporting a single element,
    # a nearest-neighbor query is expected to run without raising.
    dummy_index = DummySI()
    mock_dsi_count.return_value = 1

    query = DescriptorMemoryElement('q', 0)
    query.set_vector(numpy.random.rand(4))
    dummy_index.nn(query)
def test_fit_with_cache(self):
    # Fit an ITQ functor that was given cache elements and check both the
    # in-memory model and the copies serialized into those elements.
    # The five training points are (-2,-2) ... (2,2), centered on the origin.
    fit_descriptors = []
    for offset in range(5):
        elem = DescriptorMemoryElement(six.b('test'), offset)
        elem.set_vector([-2. + offset, -2. + offset])
        fit_descriptors.append(elem)

    itq = ItqFunctor(DataMemoryElement(), DataMemoryElement(),
                     bit_length=1, random_seed=0)
    itq.fit(fit_descriptors)

    expected_mean = [0, 0]
    expected_rotation = [[1 / sqrt(2)], [1 / sqrt(2)]]

    # TODO: Explanation as to why this is the expected result.
    numpy.testing.assert_array_almost_equal(itq.mean_vec, expected_mean)
    numpy.testing.assert_array_almost_equal(itq.rotation, expected_rotation)

    # The cached mean vector must load back to the same values.
    self.assertIsNotNone(itq.mean_vec_cache_elem)
    cached_mean = numpy.load(BytesIO(itq.mean_vec_cache_elem.get_bytes()))
    numpy.testing.assert_array_almost_equal(cached_mean, expected_mean)

    # Likewise for the cached rotation matrix.
    self.assertIsNotNone(itq.rotation_cache_elem)
    cached_rotation = numpy.load(
        BytesIO(itq.rotation_cache_elem.get_bytes()))
    numpy.testing.assert_array_almost_equal(cached_rotation,
                                            expected_rotation)
def test_feedback_results_has_results_post_reset(self):
    """
    Test that an empty list is returned after a reset when a value was
    available before the reset.
    """
    # Populate the session's feedback list directly with four descriptors.
    self.iqrs.feedback_list = {
        DescriptorMemoryElement('', uid).set_vector([uid])
        for uid in range(4)
    }

    # The first call to ``feedback_results`` should give a non-None return.
    before_reset = self.iqrs.feedback_results()
    assert before_reset is not None

    self.iqrs.reset()

    # After the reset there should be nothing left to return.
    after_reset = self.iqrs.feedback_results()
    assert after_reset == []
def test_update_index_no_existing_index(self):
    # Calling update_index when nothing has been indexed yet should behave
    # like a fresh build. Mirrors test_build_index_fresh_build, but goes
    # through update_index instead of build_index.
    descr_index = MemoryDescriptorIndex()
    hash_kvs = MemoryKeyValueStore()
    index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                    descr_index, hash_kvs)

    n_descr = 5
    descriptors = [DescriptorMemoryElement('t', uid)
                   for uid in range(n_descr)]
    # Length-1 vectors holding the UID keep the dummy hash predictable.
    for elem in descriptors:
        elem.set_vector(np.ones(1, float) * elem.uuid())

    index.update_index(descriptors)

    # Every descriptor must now be in the attached descriptor index ...
    self.assertEqual(descr_index.count(), 5)
    for elem in descriptors:
        self.assertIn(elem, descr_index)
    # ... and in the key-value store (the dummy hash function bins by the
    # sum of the descriptor vector).
    self.assertEqual(hash_kvs.count(), 5)
    for key in range(n_descr):
        self.assertSetEqual(hash_kvs.get(key), {key})
def test_adjudicate_remove_pos_neg(self):
    """
    Test that positive and negative adjudications can be withdrawn through
    the "un_*" parameters.
    """
    pos_a, pos_b, pos_c, neg_a, neg_b = [
        DescriptorMemoryElement('', uid).set_vector([uid])
        for uid in range(5)
    ]
    session = self.iqrs

    # Starting state: three positives, two negatives.
    session.positive_descriptors = {pos_a, pos_b, pos_c}
    session.negative_descriptors = {neg_a, neg_b}

    # Withdraw one adjudication per call.
    session.adjudicate(un_positives=[pos_b])
    assert session.positive_descriptors == {pos_a, pos_c}
    assert session.negative_descriptors == {neg_a, neg_b}

    session.adjudicate(un_negatives=[neg_a])
    assert session.positive_descriptors == {pos_a, pos_c}
    assert session.negative_descriptors == {neg_b}

    # Withdraw everything that remains in one call.
    session.adjudicate(un_positives=[pos_a, pos_c], un_negatives=[neg_b])
    assert session.positive_descriptors == set()
    assert session.negative_descriptors == set()
def test_adjudicate_add_duplicates(self):
    """
    Test that re-adding descriptors that are already adjudicated positive
    or negative changes nothing, i.e. that set semantics are observed.
    """
    pos_0 = DescriptorMemoryElement('', 0).set_vector([0])
    pos_2 = DescriptorMemoryElement('', 2).set_vector([2])
    neg_1 = DescriptorMemoryElement('', 1).set_vector([1])
    pos_3 = DescriptorMemoryElement('', 3).set_vector([3])
    neg_4 = DescriptorMemoryElement('', 4).set_vector([4])

    all_positives = {pos_0, pos_2, pos_3}
    all_negatives = {neg_1, neg_4}
    session = self.iqrs

    # Start with only a subset of the descriptors.
    session.adjudicate(new_positives=[pos_0], new_negatives=[neg_1])
    assert session.positive_descriptors == {pos_0}
    assert session.negative_descriptors == {neg_1}

    # Submit the full set; the ones added above must not be duplicated.
    session.adjudicate(new_positives=[pos_0, pos_2, pos_3],
                       new_negatives=[neg_1, neg_4])
    assert session.positive_descriptors == all_positives
    assert session.negative_descriptors == all_negatives

    # Submitting exactly the same thing again adds nothing new; no change
    # or error is expected.
    session.adjudicate(new_positives=[pos_0, pos_2, pos_3],
                       new_negatives=[neg_1, neg_4])
    assert session.positive_descriptors == all_positives
    assert session.negative_descriptors == all_negatives
def test_fit_with_cache(self):
    # Fit with cache elements supplied, then verify the fitted mean vector
    # and rotation both on the functor and in the cache elements' bytes.
    fit_descriptors = []
    for step in range(5):
        descr = DescriptorMemoryElement(six.b('test'), step)
        descr.set_vector([-2. + step, -2. + step])
        fit_descriptors.append(descr)

    mean_cache = DataMemoryElement()
    rotation_cache = DataMemoryElement()
    itq = ItqFunctor(mean_cache, rotation_cache, bit_length=1, random_seed=0)
    itq.fit(fit_descriptors)

    inv_sqrt2 = 1 / sqrt(2)

    # TODO: Explanation as to why this is the expected result.
    numpy.testing.assert_array_almost_equal(itq.mean_vec, [0, 0])
    numpy.testing.assert_array_almost_equal(itq.rotation,
                                            [[inv_sqrt2], [inv_sqrt2]])

    # Mean vector as read back from its cache element.
    self.assertIsNotNone(itq.mean_vec_cache_elem)
    mean_bytes = itq.mean_vec_cache_elem.get_bytes()
    numpy.testing.assert_array_almost_equal(
        numpy.load(BytesIO(mean_bytes)), [0, 0]
    )

    # Rotation matrix as read back from its cache element.
    self.assertIsNotNone(itq.rotation_cache_elem)
    rotation_bytes = itq.rotation_cache_elem.get_bytes()
    numpy.testing.assert_array_almost_equal(
        numpy.load(BytesIO(rotation_bytes)), [[inv_sqrt2], [inv_sqrt2]]
    )
def test_fit_short_descriptors_for_bit_length(self):
    # Fitting should error when the input descriptors have fewer dimensions
    # than the bit length requested for the output hash codes (limitation
    # of the PCA method currently used).
    fit_descriptors = []
    for i in range(3):
        d = DescriptorMemoryElement(six.b('test'), i)
        d.set_vector([-1 + i, -1 + i])
        fit_descriptors.append(d)

    itq = ItqFunctor(bit_length=8)
    expected_msg = ("Input descriptors have fewer features than requested "
                    "bit encoding")

    # ``TestCase.assertRaisesRegexp`` is a deprecated alias that was removed
    # in Python 3.12. ``six.assertRaisesRegex`` dispatches to whichever
    # spelling the running interpreter provides.
    six.assertRaisesRegex(
        self, ValueError, expected_msg,
        itq.fit, fit_descriptors
    )
    # A failed fit must leave the model unset.
    self.assertIsNone(itq.mean_vec)
    self.assertIsNone(itq.rotation)

    # Should behave the same when the input is a one-shot iterator.
    six.assertRaisesRegex(
        self, ValueError, expected_msg,
        itq.fit, iter(fit_descriptors)
    )
    self.assertIsNone(itq.mean_vec)
    self.assertIsNone(itq.rotation)
def test_fit_short_descriptors_for_bit_length(self):
    # Should error when input descriptors have fewer dimensions than the
    # bit length set for output hash codes (limitation of the PCA method
    # currently used).
    fit_descriptors = []
    for i in range(3):
        d = DescriptorMemoryElement(six.b('test'), i)
        d.set_vector([-1 + i, -1 + i])
        fit_descriptors.append(d)

    itq = ItqFunctor(bit_length=8)
    expected_msg = ("Input descriptors have fewer features than requested "
                    "bit encoding")

    # Exercise both a re-iterable list and a one-shot iterator. The
    # deprecated ``assertRaisesRegexp`` alias is gone as of Python 3.12, so
    # go through ``six.assertRaisesRegex``, which works on both Python 2
    # and 3.
    for make_input in (lambda: fit_descriptors,
                       lambda: iter(fit_descriptors)):
        six.assertRaisesRegex(self, ValueError, expected_msg,
                              itq.fit, make_input())
        # Nothing may have been fitted when the call errors out.
        self.assertIsNone(itq.mean_vec)
        self.assertIsNone(itq.rotation)
def test_get_hash(self):
    # Check the single hash bit produced for 2D vectors on either side of,
    # and exactly on, the line ``y = -x``.
    # NOTE(review): ``fit_descriptors`` is built here but not referenced
    # again below.
    fit_descriptors = []
    for step in range(5):
        elem = DescriptorMemoryElement(six.b('test'), step)
        elem.set_vector([-2. + step, -2. + step])
        fit_descriptors.append(elem)

    # With this "rotation" matrix any 2-feature descriptor to the right of
    # the line ``y = -x`` should hash to True and any to the left to False.
    # Points on the line itself should hash to True.
    itq = ItqFunctor(bit_length=1, random_seed=0)
    itq.mean_vec = numpy.array([0., 0.])
    itq.rotation = numpy.array([[1. / sqrt(2)], [1. / sqrt(2)]])

    cases = [
        ([1, 1], True),
        ([-1, -1], False),
        ([-1, 1], True),
        ([-1.001, 1], False),
        ([-1, 1.001], True),
        ([1, -1], True),
        ([1, -1.001], False),
        ([1.001, -1], True),
    ]
    for vector, expected_bit in cases:
        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array(vector)), [expected_bit])
def test_build_index(self):
    # Build an index backed by initially empty data elements and confirm
    # that the build populates all three of them.
    empty_data = 'base64://'
    flann = FlannNearestNeighborsIndex(empty_data, empty_data, empty_data)

    def stored_sizes():
        # Current byte lengths of the three backing elements.
        return [
            len(flann._index_elem.get_bytes()),
            len(flann._index_param_elem.get_bytes()),
            len(flann._descr_cache_elem.get_bytes()),
        ]

    # Right after construction nothing has been written yet.
    for size in stored_sizes():
        self.assertEqual(size, 0)

    # Make unit vectors, one for each feature.
    dim = 8
    test_descriptors = []
    for axis in range(dim):
        unit = numpy.zeros(dim, float)
        unit[axis] = 1.
        descr = DescriptorMemoryElement('unit', axis)
        descr.set_vector(unit)
        test_descriptors.append(descr)
    flann.build_index(test_descriptors)

    # Building should have written a non-zero number of bytes to each.
    for size in stored_sizes():
        self.assertGreater(size, 0)
def test_remove_from_index(self):
    # Removing by UID should drop the descriptor from the descriptor index
    # and its hash entry from the key-value store.
    # Descriptors are 1-dimensional with value == UID, which keeps the
    # dummy hash predictable.
    descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
    for elem in descriptors:
        elem.set_vector(np.ones(1, float) * elem.uuid())

    d_set = MemoryDescriptorIndex()
    hash_kvs = MemoryKeyValueStore()
    idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
    idx.build_index(descriptors)

    # Remove a single UID.
    idx.remove_from_index([3])

    remaining = (0, 1, 2, 4)
    self.assertEqual(
        idx.descriptor_index._table,
        {uid: descriptors[uid] for uid in remaining},
    )
    self.assertEqual(
        idx.hash2uuids_kvstore._table,
        {uid: {uid} for uid in remaining},
    )
def test_pathological_example(self):
    # When every descriptor lies on one line, a query should come back
    # with far fewer neighbors than requested.
    n = 10 ** 4
    dim = 256
    depth = 10
    # L ~ n/2**depth = 10^4 / 2^10 ~ 10
    k = 200
    # 3k/L = 60
    num_trees = 60

    # Put all descriptors on a line so that different trees get the same
    # divisions.
    d_index = []
    for uid in range(n):
        elem = DescriptorMemoryElement('test', uid)
        d_index.append(elem)
    for elem in d_index:
        elem.set_vector(np.full(dim, elem.uuid(), dtype=np.float64))

    query = DescriptorMemoryElement('q', -1)
    query.set_vector(np.zeros((dim,)))

    descr_store = MemoryDescriptorIndex()
    mrpt = MRPTNearestNeighborsIndex(
        descr_store, num_trees=num_trees, depth=depth, random_seed=0)
    mrpt.build_index(d_index)

    nbrs, dists = mrpt.nn(query, k)
    ntools.assert_equal(len(nbrs), len(dists))
    # We should get about 10 descriptors back instead of the requested 200.
    ntools.assert_less(len(nbrs), 20)
def test_build_index_fresh_build(self):
    # Building from scratch should register every descriptor in both the
    # descriptor index and the hash key-value store.
    descr_index = MemoryDescriptorIndex()
    hash_kvs = MemoryKeyValueStore()
    index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                    descr_index, hash_kvs)

    n_descr = 5
    descriptors = [DescriptorMemoryElement('t', uid)
                   for uid in range(n_descr)]
    # Length-1 vectors holding the list position keep the dummy hash
    # predictable.
    for position, elem in enumerate(descriptors):
        elem.set_vector(np.ones(1, float) * position)

    index.build_index(descriptors)

    # Descriptors should now be in the attached descriptor index ...
    self.assertEqual(descr_index.count(), 5)
    for elem in descriptors:
        self.assertIn(elem, descr_index)
    # ... and in the key-value store (the dummy hash function bins by the
    # sum of the descriptor vector).
    self.assertEqual(hash_kvs.count(), 5)
    for key in range(n_descr):
        self.assertSetEqual(hash_kvs.get(key), {key})
def test_count_empty_hash2uid(self):
    """
    Test that an empty hash-to-uid mapping results in a 0 count regardless
    of the descriptor-set state.
    """
    descr_set = MemoryDescriptorIndex()
    hash_kvs = MemoryKeyValueStore()
    self.assertEqual(descr_set.count(), 0)
    self.assertEqual(hash_kvs.count(), 0)

    lsh = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set, hash_kvs)
    self.assertEqual(lsh.count(), 0)

    # Adding to the descriptor set alone must not change the LSH index
    # "size".
    for n_added, uid in enumerate((0, 1), start=1):
        lsh.descriptor_index.add_descriptor(
            DescriptorMemoryElement('t', uid))
        self.assertEqual(lsh.descriptor_index.count(), n_added)
        self.assertEqual(lsh.hash2uuids_kvstore.count(), 0)
        self.assertEqual(lsh.count(), 0)

    # Growing the UID set stored under hash key 0 is what drives the LSH
    # count; the descriptor set stays at two elements throughout.
    for uid_set in ({0}, {0, 1}, {0, 1, 2}):
        lsh.hash2uuids_kvstore.add(0, uid_set)
        self.assertEqual(lsh.descriptor_index.count(), 2)
        self.assertEqual(lsh.count(), len(uid_set))
def test_remove_then_add(self):
    """
    Test that the index can be removed from and then added to again.
    """
    n1 = 100
    n2 = 10
    dim = 8
    set1 = [DescriptorMemoryElement('test', uid) for uid in range(n1)]
    set2 = [DescriptorMemoryElement('test', uid)
            for uid in range(n1, n1 + n2)]
    for elem in set1 + set2:
        elem.set_vector(np.random.rand(dim))
    uids_to_remove = [10, 98]

    index = self._make_inst()
    index.build_index(set1)
    index.remove_from_index(uids_to_remove)
    index.update_index(set2)

    # 100 built - 2 removed + 10 added.
    self.assertEqual(len(index), 108)

    # Removed descriptors should no longer come back from a query.
    for removed_uid in (10, 98):
        removed = set1[removed_uid]
        self.assertNotEqual(index.nn(removed, 1)[0][0], removed)

    # Every other descriptor should still be its own nearest neighbor.
    for elem in set1 + set2:
        if elem.uuid() in uids_to_remove:
            continue
        self.assertEqual(index.nn(elem, 1)[0][0], elem)

    self.assertEqual(index._next_index, 110)
def test_update_index_additive(self):
    # Updating an already-built index should add the new descriptors while
    # keeping the ones that were there before.
    n1 = 100
    n2 = 10
    dim = 8
    set1 = {DescriptorMemoryElement('test', i) for i in range(n1)}
    set2 = {DescriptorMemoryElement('test', i) for i in range(n1, n1 + n2)}
    set_all = set1 | set2
    # Plain loop rather than a throwaway list comprehension, and a single
    # union rather than the redundant ``set1.union(set1 | set2)``.
    for d in set_all:
        d.set_vector(np.random.rand(dim))

    # Create and build the initial index.
    index = self._make_inst()
    index.build_index(set1)
    self.assertEqual(index.count(), len(set1))
    for d in set1:
        self.assertIn(d, index._descriptor_set)

    # Update and check that all intended descriptors are present.
    index.update_index(set2)
    self.assertEqual(index.count(), len(set_all))
    for d in set_all:
        self.assertIn(d, index._descriptor_set)

    # Check that NN can return something from the updated set: when the
    # query element is itself in the index, its nearest element should be
    # the query element.
    for q in set2:
        n_elems, n_dists = index.nn(q)
        self.assertEqual(n_elems[0], q)
def test_nn_small_leaves(self):
    # With randomly placed descriptors, a query for k neighbors should
    # return exactly k of them.
    np.random.seed(0)

    n = 10**4
    dim = 256
    depth = 10
    # L ~ n/2**depth = 10^4 / 2^10 ~ 10
    k = 200
    # 3k/L = 60
    num_trees = 60

    d_set = []
    for uid in range(n):
        d_set.append(DescriptorMemoryElement('test', uid))
    for elem in d_set:
        elem.set_vector(np.random.rand(dim))

    query = DescriptorMemoryElement('q', -1)
    query.set_vector(np.zeros((dim, )))

    descr_store = MemoryDescriptorSet()
    mrpt = MRPTNearestNeighborsIndex(descr_store,
                                     num_trees=num_trees,
                                     depth=depth,
                                     random_seed=0)
    mrpt.build_index(d_set)

    nbrs, dists = mrpt.nn(query, k)
    self.assertEqual(len(nbrs), len(dists))
    self.assertEqual(len(nbrs), k)
def test_get_hash(self):
    # Check the hash bit for 2D vectors around the line ``y = -x``.
    # NOTE(review): ``fit_descriptors`` is populated but never used below.
    fit_descriptors = []
    for offset in range(5):
        descr = DescriptorMemoryElement('test', offset)
        descr.set_vector([-2. + offset, -2. + offset])
        fit_descriptors.append(descr)

    # The following "rotation" matrix should cause any 2-feature descriptor
    # to the right of the line ``y = -x`` to be True, and to the left to be
    # False. If on the line, it should be True.
    itq = ItqFunctor(bit_length=1, random_seed=0)
    itq.mean_vec = numpy.array([0., 0.])
    inv_sqrt2 = 1. / sqrt(2)
    itq.rotation = numpy.array([[inv_sqrt2], [inv_sqrt2]])

    def check(vector, expected_bit):
        # Hash ``vector`` and compare against the single expected bit.
        numpy.testing.assert_array_equal(
            itq.get_hash(numpy.array(vector)), [expected_bit])

    check([1, 1], True)
    check([-1, -1], False)
    check([-1, 1], True)
    check([-1.001, 1], False)
    check([-1, 1.001], True)
    check([1, -1], True)
    check([1, -1.001], False)
    check([1.001, -1], True)
def test_classify(self):
    # Classify one descriptor with the dummy classifier and check the
    # classification map and UUID on the returned element.
    descriptor = DescriptorMemoryElement('test', 0)
    descriptor.set_vector([1, 2, 3])

    classifier = DummyClassifier()
    result = classifier.classify(descriptor)

    nose.tools.assert_equal(result.get_classification(), {0: [1, 2, 3]})
    nose.tools.assert_equal(result.uuid, descriptor.uuid())
def test_build_index_one(self):
    # Building from a single descriptor should populate the descriptor
    # cache, the flann handle and the build-parameter dict.
    lone_descr = DescriptorMemoryElement('test', 0)
    lone_descr.set_vector(numpy.zeros(8, float))

    index = self._make_inst('euclidean')
    index.build_index([lone_descr])

    self.assertListEqual(index._descr_cache, [lone_descr])
    self.assertIsNotNone(index._flann)
    self.assertIsInstance(index._flann_build_params, dict)
def test_remove_from_index_invalid_uid(self):
    # Attempting to remove an invalid UID should raise a KeyError and leave
    # the index untouched, even when valid UIDs are passed alongside it.
    # Descriptors are 1-dimensional with value == UID, which keeps the
    # dummy hash predictable.
    descriptors = [
        DescriptorMemoryElement('t', 0),
        DescriptorMemoryElement('t', 1),
        DescriptorMemoryElement('t', 2),
        DescriptorMemoryElement('t', 3),
        DescriptorMemoryElement('t', 4),
    ]
    for d in descriptors:
        d.set_vector(np.ones(1, float) * d.uuid())

    # uid -> descriptor
    expected_dset_table = {
        0: descriptors[0],
        1: descriptors[1],
        2: descriptors[2],
        3: descriptors[3],
        4: descriptors[4],
    }
    # hash int -> set[uid]
    expected_kvs_table = {
        0: {0},
        1: {1},
        2: {2},
        3: {3},
        4: {4},
    }

    d_set = MemoryDescriptorIndex()
    hash_kvs = MemoryKeyValueStore()
    idx = LSHNearestNeighborIndex(DummyHashFunctor(), d_set, hash_kvs)
    idx.build_index(descriptors)

    def assert_index_unchanged():
        # Both backing tables must match the freshly built state.
        self.assertEqual(idx.descriptor_index._table, expected_dset_table)
        self.assertEqual(idx.hash2uuids_kvstore._table, expected_kvs_table)

    # Sanity check that the build produced the expected state.
    assert_index_unchanged()

    # First a UID we did not build with, then a valid UID mixed with the
    # invalid one. ``assertRaisesRegexp`` is a deprecated alias removed in
    # Python 3.12, so use the context-manager form of ``assertRaises`` and
    # check the message explicitly ('5' has no regex metacharacters, so a
    # substring test is equivalent).
    for bad_uids in ([5], [2, 5]):
        with self.assertRaises(KeyError) as ctx:
            idx.remove_from_index(bad_uids)
        self.assertIn('5', str(ctx.exception))
        # The index should not have been modified.
        assert_index_unchanged()
def test_classify_elements_missing_vector(self):
    """
    Test that we get a ValueError when one of the input elements has no
    vector set.
    """
    with_vec_a = DescriptorMemoryElement('', 0).set_vector([1, 2, 3])
    without_vec = DescriptorMemoryElement('', 0)  # vector never set
    with_vec_b = DescriptorMemoryElement('', 0).set_vector([4, 5, 6])
    elems = [with_vec_a, without_vec, with_vec_b]

    # ``list`` forces the returned iterable to be consumed inside the
    # ``raises`` context.
    with pytest.raises(ValueError, match=r"no vector stored"):
        list(self.inst.classify_elements(elems))
def test_nn_empty_index(self):
    # ``nn`` should fail with a ValueError when the index size is 0.
    empty_index = DummySI()
    empty_index.count = mock.MagicMock(return_value=0)
    empty_index._nn = mock.MagicMock()

    query = DescriptorMemoryElement('q', 0)
    query.set_vector(numpy.random.rand(4))

    self.assertRaises(ValueError, empty_index.nn, query)
def test_read_only(self):
    # Building on an index created with ``read_only=True`` should raise
    # ReadOnlyError.
    unit_vec = np.zeros(5, float)
    unit_vec[0] = 1.
    descr = DescriptorMemoryElement('unit', 0)
    descr.set_vector(unit_vec)
    test_descriptors = [descr]

    ro_index = self._make_inst(read_only=True)
    self.assertRaises(ReadOnlyError,
                      ro_index.build_index, test_descriptors)
def test_simple_multiclass_classification(self):
    """
    Test a simple train-and-classify setup with 3 classes.
    """
    # Fix random seed for deterministic testing.
    numpy.random.seed(0)

    N = 1000
    LABEL_1 = 'p1'
    LABEL_2 = 'p2'
    LABEL_3 = 'p3'

    def sample(low, high):
        # N uniform random draws mapped onto [low, high], returned as an
        # (N, 1) column so that each row is a 1-dimensional vector.
        return numpy.interp(numpy.random.rand(N),
                            [0, 1], [low, high])[:, numpy.newaxis]

    # Setup training dataset
    # - 1 dimensional for obvious separation, this is not a performance
    #   test.
    train1 = sample(0.0, .30)
    train2 = sample(.40, .60)
    train3 = sample(.70, 1.0)

    # UIDs continue from one class to the next so that they never repeat.
    train1_e = [DescriptorMemoryElement('train', i).set_vector(v)
                for i, v in enumerate(train1)]
    train2_e = [DescriptorMemoryElement('train', i).set_vector(v)
                for i, v in enumerate(train2, start=len(train1_e))]
    train3_e = [DescriptorMemoryElement('train', i).set_vector(v)
                for i, v in enumerate(train3,
                                      start=len(train1_e) + len(train2_e))]

    # Setup testing dataset, drawn from the same three ranges.
    test1 = sample(0.0, .30)
    test2 = sample(.40, .60)
    test3 = sample(.70, 1.0)

    # Train and test classifier instance
    classifier = SkLearnLogisticRegression(random_state=0)
    classifier.train({
        LABEL_1: train1_e,
        LABEL_2: train2_e,
        LABEL_3: train3_e,
    })
    c_maps_l1 = list(classifier._classify_arrays(test1))
    c_maps_l2 = list(classifier._classify_arrays(test2))
    c_maps_l3 = list(classifier._classify_arrays(test3))

    # Each test vector must score strictly highest for its own label.
    checks = (
        (LABEL_1, (LABEL_2, LABEL_3), test1, c_maps_l1),
        (LABEL_2, (LABEL_1, LABEL_3), test2, c_maps_l2),
        (LABEL_3, (LABEL_2, LABEL_1), test3, c_maps_l3),
    )
    for label, (other_a, other_b), vectors, c_maps in checks:
        for v, m in zip(vectors, c_maps):
            assert m[label] > max(m[other_a], m[other_b]), \
                "Incorrect {} label: c_map={} :: test_vector={}".format(
                    label, m, v
                )
def test_nn_normal_conditions(self):
    # Smoke test: with a non-zero reported size, ``nn`` should simply run.
    dummy_index = DummySI()
    # Force a non-zero index size so that the kNN query is performed.
    dummy_index.count = mock.MagicMock()
    dummy_index.count.return_value = 1

    query = DescriptorMemoryElement('q', 0)
    query.set_vector(numpy.random.rand(4))

    # No assertion needed; this just should not crash.
    dummy_index.nn(query)
def _random_euclidean(self, hash_ftor, hash_idx,
                      ftor_train_hook=lambda d: None):
    # Shared check: build an LSH index over random descriptors and verify
    # basic euclidean nearest-neighbor behavior.
    #
    # :param hash_ftor: Hash function class for generating hash codes for
    #   descriptors.
    # :param hash_idx: Hash index instance to use in local LSH algo
    #   instance.
    # :param ftor_train_hook: Function for training functor if necessary.

    # Seeded random descriptors.
    n_descr = 1000
    dim = 256
    td = []
    np.random.seed(self.RANDOM_SEED)
    for uid in range(n_descr):
        elem = DescriptorMemoryElement('random', uid)
        elem.set_vector(np.random.rand(dim))
        td.append(elem)

    ftor_train_hook(td)

    descr_store = MemoryDescriptorIndex()
    kvstore = MemoryKeyValueStore()
    index = LSHNearestNeighborIndex(hash_ftor, descr_store, kvstore,
                                    hash_index=hash_idx,
                                    distance_method='euclidean')
    index.build_index(td)

    # A query taken from the build set should return itself when k=1.
    query = td[255]
    neighbors, dists = index.nn(query, 1)
    self.assertEqual(neighbors[0], query)

    # A query very near a build vector should return that build vector.
    td_q = td[0]
    query = DescriptorMemoryElement('query', n_descr)
    near_vec = td_q.vector().copy()
    nudge = max(near_vec.min(), 0.1)
    near_vec[0] += nudge
    near_vec[dim - 1] -= nudge
    query.set_vector(near_vec)
    neighbors, dists = index.nn(query, 1)
    self.assertFalse(np.array_equal(query.vector(), td_q.vector()))
    self.assertEqual(neighbors[0], td_q)

    # Random query: for any k, results should at least be strictly ordered
    # by increasing distance.
    query = DescriptorMemoryElement('query', n_descr + 1)
    query.set_vector(np.random.rand(dim))
    for k in (10, n_descr):
        neighbors, dists = index.nn(query, k)
        for prev_dist, cur_dist in zip(dists[:-1], dists[1:]):
            self.assertGreater(cur_dist, prev_dist)
def test_known_descriptors_euclidean_ordered(self):
    # Index points of the form (j, 2j), which sit at increasing euclidean
    # distance from the origin, and check that queries at the origin
    # return them in UID order.
    index = self._make_inst('euclidean')

    # Make vectors to return in a known euclidean distance order.
    # ``range`` is used instead of ``xrange``: it works on both Python 2
    # and 3, and this block already used ``range`` further down.
    i = 1000
    test_descriptors = []
    for j in range(i):
        d = DescriptorMemoryElement('ordered', j)
        d.set_vector(numpy.array([j, j * 2], float))
        test_descriptors.append(d)
    # Shuffle so that build order does not already match distance order.
    random.shuffle(test_descriptors)
    index.build_index(test_descriptors)

    # Since descriptors were built in increasing distance from (0,0),
    # returned descriptors for a query of [0,0] should be in index order.
    q = DescriptorMemoryElement('query', i)
    q.set_vector(numpy.array([0, 0], float))

    # The top five results should have UUIDs 0..4, nearest first.
    r, dists = index.nn(q, 5)
    for rank in range(5):
        ntools.assert_equal(r[rank].uuid(), rank)

    # A global search should be in complete order.
    r, dists = index.nn(q, i)
    for j, d, dist in zip(range(i), r, dists):
        ntools.assert_equal(d.uuid(), j)
def _known_ordered_euclidean(self, hash_ftor, hash_idx,
                             ftor_train_hook=lambda d: None):
    # Shared check: build an LSH index over points of the form (j, 2j) and
    # verify that a query at the origin returns them in UID order.
    n_descr = 1000
    test_descriptors = []
    for uid in range(n_descr):
        elem = DescriptorMemoryElement('ordered', uid)
        elem.set_vector(np.array([uid, uid * 2], float))
        test_descriptors.append(elem)
    # Shuffle so that build order does not already match distance order.
    random.shuffle(test_descriptors)

    ftor_train_hook(test_descriptors)

    descr_store = MemoryDescriptorIndex()
    kvstore = MemoryKeyValueStore()
    index = LSHNearestNeighborIndex(hash_ftor, descr_store, kvstore,
                                    hash_index=hash_idx,
                                    distance_method='euclidean')
    index.build_index(test_descriptors)

    # Descriptors were built at increasing distance from (0,0), so a query
    # at [0,0] should get them back in index order.
    query = DescriptorMemoryElement('query', n_descr)
    query.set_vector(np.array([0, 0], float))

    # Top result should have UUID == 0 (nearest to query), then 1, 2, ...
    neighbors, dists = index.nn(query, 5)
    for rank in range(5):
        self.assertEqual(neighbors[rank].uuid(), rank)

    # A global search should be in complete order.
    neighbors, dists = index.nn(query, n_descr)
    for expected_uid, elem, _dist in zip(range(n_descr), neighbors, dists):
        self.assertEqual(elem.uuid(), expected_uid)
def test_remove_from_index_shared_hashes_partial(self):
    """
    Test that a hash is removed from the hash index only when no remaining
    descriptor still maps to it, while hashes that are still referenced
    are kept.
    """
    # Vectors of length 1 for easy dummy hashing prediction.
    descriptors = [
        DescriptorMemoryElement('t', uid).set_vector([uid])
        for uid in range(5)
    ]

    # Dummy hash function: vectors of even sum hash to 0, odd sum to 1.
    hash_func = DummyHashFunctor()
    hash_func.get_hash = mock.Mock(
        side_effect=lambda vec: [vec.sum() % 2]
    )

    # Simulate an already-built state by filling the backing tables
    # directly: even UIDs under hash 0, odd UIDs under hash 1.
    d_set = MemoryDescriptorIndex()
    d_set._table = dict(enumerate(descriptors))

    hash2uid_kvs = MemoryKeyValueStore()
    hash2uid_kvs._table = {
        0: {0, 2, 4},
        1: {1, 3},
    }

    idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uid_kvs)
    idx.hash_index = mock.Mock(spec=HashIndex)

    idx.remove_from_index([1, 2, 3])

    # Only one hash vector should have been passed to the hash index's
    # removal method (as a deque of hash-code vectors).
    expected_removed_hashes = collections.deque([
        [1],
    ])
    idx.hash_index.remove_from_index.assert_called_once_with(
        expected_removed_hashes
    )
    self.assertDictEqual(
        d_set._table,
        {uid: descriptors[uid] for uid in (0, 4)},
    )
    self.assertDictEqual(hash2uid_kvs._table, {0: {0, 4}})
def test_none_set(self):
    # An element starts without a vector, reports one after ``set_vector``
    # and reports none again once the vector is set to None.
    elem = DescriptorMemoryElement('test', 0)
    self.assertFalse(elem.has_vector())

    ones = numpy.ones(16)
    elem.set_vector(ones)
    self.assertTrue(elem.has_vector())
    numpy.testing.assert_equal(elem.vector(), numpy.ones(16))

    elem.set_vector(None)
    self.assertFalse(elem.has_vector())
    self.assertIs(elem.vector(), None)
def test_none_set(self):
    # Walk an element through: no vector -> vector set -> vector cleared.
    descr = DescriptorMemoryElement('test', 0)

    # Nothing stored yet.
    ntools.assert_false(descr.has_vector())

    # Store a vector and read it back.
    descr.set_vector(numpy.ones(16))
    ntools.assert_true(descr.has_vector())
    numpy.testing.assert_equal(descr.vector(), numpy.ones(16))

    # Setting None clears the stored vector again.
    descr.set_vector(None)
    ntools.assert_false(descr.has_vector())
    ntools.assert_is(descr.vector(), None)
def test_build_index_one(self):
    # A single-descriptor build should leave that descriptor in the cache
    # and create both the flann handle and its build-parameter dict.
    zero_vec = numpy.zeros(8, float)
    only_descr = DescriptorMemoryElement('test', 0)
    only_descr.set_vector(zero_vec)

    index = self._make_inst('euclidean')
    index.build_index([only_descr])

    expected_cache = [only_descr]
    self.assertListEqual(index._descr_cache, expected_cache)
    self.assertIsNotNone(index._flann)
    self.assertIsInstance(index._flann_build_params, dict)
def test_build_index_read_only(self):
    # ``build_index`` on a read-only instance should raise ReadOnlyError.
    vec = np.zeros(5, float)
    vec[0] = 1.
    unit_descr = DescriptorMemoryElement('unit', 0)
    unit_descr.set_vector(vec)
    test_descriptors = [unit_descr]

    index = self._make_inst(read_only=True)
    self.assertRaises(ReadOnlyError, index.build_index, test_descriptors)
def test_remove_from_index_shared_hashes(self):
    """
    Test that removing a descriptor (by UID) that shares its hash with
    other descriptors does not trigger removal of that hash.
    """
    # Make every descriptor hash to the same value: 0.
    hash_func = DummyHashFunctor()
    hash_func.get_hash = mock.Mock(return_value=np.asarray([0], bool))

    d_set = MemoryDescriptorSet()
    hash2uids_kvs = MemoryKeyValueStore()
    idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uids_kvs)

    # Descriptors are 1-dimensional with value == UID.
    descriptors = [DescriptorMemoryElement('t', uid) for uid in range(5)]
    for elem in descriptors:
        elem.set_vector(np.ones(1, float) * elem.uuid())
    idx.build_index(descriptors)

    # After the build: all five descriptors are stored and a single hash
    # key maps to all five UIDs.
    self.assertDictEqual(d_set._table, dict(enumerate(descriptors)))
    self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 2, 3, 4}})

    # Mock out the hash index as if we had an implementation, so the call
    # to its ``remove_from_index`` method can be checked.
    idx.hash_index = mock.Mock(spec=HashIndex)

    idx.remove_from_index([2, 4])

    # Only UIDs 2 and 4 should be gone from the descriptor set. The kvs
    # should still hold key 0, now mapping to UIDs 0, 1 and 3 only.
    # ``hash_index.remove_from_index`` should not be called because no
    # hash should have been marked for removal.
    self.assertDictEqual(
        d_set._table,
        {uid: descriptors[uid] for uid in (0, 1, 3)},
    )
    self.assertDictEqual(hash2uids_kvs._table, {0: {0, 1, 3}})
    idx.hash_index.remove_from_index.assert_not_called()
def test_configuration(self):
    # The default configuration is expected to be empty and to round-trip
    # through ``from_config`` / ``get_config``.
    base_cfg = DescriptorMemoryElement.get_default_config()
    ntools.assert_equal(base_cfg, {})

    first = DescriptorMemoryElement.from_config(base_cfg, 'test', 'a')
    ntools.assert_equal(base_cfg, first.get_config())
    ntools.assert_equal(first.type(), 'test')
    ntools.assert_equal(first.uuid(), 'a')

    # vector-based equality: an element rebuilt from the first one's
    # configuration should compare equal to it.
    round_trip_cfg = first.get_config()
    second = DescriptorMemoryElement.from_config(round_trip_cfg, 'test', 'a')
    ntools.assert_equal(first, second)
def test_classify(self):
    # Each label is expected to map to the descriptor value at the same
    # position (labels listed in the order the expected values pair with).
    labels_in_order = (
        'label_1',
        'label_2',
        'negative',
        'label_3',
        'Kitware',
        'label_4',
    )
    expected_map = {
        six.b(label): value
        for value, label in enumerate(labels_in_order, 1)
    }

    classifier = IndexLabelClassifier(self.FILEPATH_TEST_LABELS)
    descriptor = DescriptorMemoryElement('test', 0)
    descriptor.set_vector([1, 2, 3, 4, 5, 6])

    actual_map = classifier._classify(descriptor)
    self.assertEqual(actual_map, expected_map)
def _random_euclidean(self, hash_ftor, hash_idx,
                      ftor_train_hook=lambda d: None):
    """
    Shared check of euclidean nearest-neighbor queries over seeded random
    descriptors using an ``LSHNearestNeighborIndex``.

    :param hash_ftor: Hash functor handed to the index constructor.
    :param hash_idx: Object passed as the index constructor's third
        positional argument.
    :param ftor_train_hook: Callable invoked with the list of generated
        descriptors before the index is built.
    """
    n_descr = 1000
    n_dims = 256

    def assert_strictly_increasing(values):
        # Every value must be strictly greater than its predecessor.
        for pos in xrange(1, len(values)):
            ntools.assert_greater(values[pos], values[pos - 1])

    # Generate seeded random descriptors.
    built = []
    numpy.random.seed(self.RANDOM_SEED)
    for uid in xrange(n_descr):
        elem = DescriptorMemoryElement("random", uid)
        elem.set_vector(numpy.random.rand(n_dims))
        built.append(elem)

    ftor_train_hook(built)

    descr_index = MemoryDescriptorIndex()
    index = LSHNearestNeighborIndex(hash_ftor, descr_index, hash_idx,
                                    distance_method="euclidean")
    index.build_index(built)

    # A query taken from the build set should return itself when k=1.
    member = built[255]
    neighbors, distances = index.nn(member, 1)
    ntools.assert_equal(neighbors[0], member)

    # A query that is a slightly perturbed copy of a build vector should
    # return that build vector.
    anchor = built[0]
    near_query = DescriptorMemoryElement("query", n_descr)
    near_vec = anchor.vector().copy()
    shift = max(near_vec.min(), 0.1)
    near_vec[0] += shift
    near_vec[n_dims - 1] -= shift
    near_query.set_vector(near_vec)
    neighbors, distances = index.nn(near_query, 1)
    ntools.assert_false(
        numpy.array_equal(near_query.vector(), anchor.vector())
    )
    ntools.assert_equal(neighbors[0], anchor)

    # For a random query of any size k, returned distances should at
    # least be in increasing order.
    rand_query = DescriptorMemoryElement("query", n_descr + 1)
    rand_query.set_vector(numpy.random.rand(n_dims))

    for k in (10, n_descr):
        neighbors, distances = index.nn(rand_query, k)
        assert_strictly_increasing(distances)

    # Reset the class-level descriptor cache.
    DescriptorMemoryElement.MEMORY_CACHE = {}
def test_clustering_equal_descriptors(self):
    # Test that groups of identical one-hot descriptors (one group per
    # feature axis) are clustered together.
    print("Creating dummy descriptors")
    n_features = 8
    n_descriptors = 20

    index = MemoryDescriptorIndex()
    next_uid = 0
    for axis in range(n_features):
        # One-hot vector sized by ``n_features`` so the vector length
        # cannot drift from the number of clusters requested below.
        v = numpy.zeros(n_features)
        v[axis] = 1
        for _ in range(n_descriptors):
            d = DescriptorMemoryElement('test', next_uid)
            d.set_vector(v)
            index.add_descriptor(d)
            next_uid += 1

    print("Creating test MBKM")
    mbkm = MiniBatchKMeans(n_features, batch_size=12, verbose=True,
                           compute_labels=False, random_state=0)

    # NOTE(review): the third argument is presumably the initial fit size;
    # confirm against ``mb_kmeans_build_apply``.
    d_classes = mb_kmeans_build_apply(index, mbkm, n_descriptors)

    for label in d_classes:
        members = d_classes[label]

        # There should be ``n_descriptors`` descriptors per class.
        self.assertEqual(
            len(members),
            n_descriptors,
            "Cluster %s did not have expected number of descriptors "
            "(%d != %d)"
            % (label, n_descriptors, len(members))
        )

        # Each descriptor in each cluster should be equal to the other
        # descriptors in that cluster.
        uuids = list(members)
        v = index[uuids[0]].vector()
        for uuid in uuids[1:]:
            v2 = index[uuid].vector()
            numpy.testing.assert_array_equal(v, v2,
                                             "vector in cluster %d did not "
                                             "match other vectors "
                                             "(%s != %s)"
                                             % (label, v, v2))
def test_nn_many_descriptors(self):
    # Build an index over many seeded random descriptors and check that a
    # 10-neighbor query returns 10 neighbors with matching distances.
    np.random.seed(0)

    n = 10 ** 4
    dim = 256

    d_index = [DescriptorMemoryElement('test', i) for i in range(n)]
    # Plain loop: the calls are made for their side effect only, so no
    # throwaway list of return values is built.
    for d in d_index:
        d.set_vector(np.random.rand(dim))
    q = DescriptorMemoryElement('q', -1)
    q.set_vector(np.zeros((dim,)))

    faiss_index = self._make_inst()
    faiss_index.build_index(d_index)

    nbrs, dists = faiss_index.nn(q, 10)
    self.assertEqual(len(nbrs), len(dists))
    self.assertEqual(len(nbrs), 10)
def test_classify_invalid_descriptor_dimensions(self):
    # ``_classify`` is expected to raise ``RuntimeError`` for vectors that
    # are one element shorter or one element longer than the 6-element
    # vector used elsewhere for this labels file.
    classifier = IndexLabelClassifier(self.FILEPATH_TEST_LABELS)
    descriptor = DescriptorMemoryElement('test', 0)

    wrong_size_vectors = (
        [1, 2, 3, 4, 5],          # One less
        [1, 2, 3, 4, 5, 6, 7],    # One more
    )
    for vec in wrong_size_vectors:
        descriptor.set_vector(vec)
        with self.assertRaises(RuntimeError):
            classifier._classify(descriptor)
def test_known_descriptors_euclidean_ordered(self):
    # Descriptors (j, 2*j) lie at increasing euclidean distance from the
    # origin, so a query at the origin should return them in UID order.
    index = self._make_inst('euclidean')

    total = 1000
    ordered = []
    for uid in xrange(total):
        elem = DescriptorMemoryElement('ordered', uid)
        elem.set_vector(numpy.array([uid, uid*2], float))
        ordered.append(elem)
    # Shuffle so the build order does not already match the expected
    # result order.
    random.shuffle(ordered)
    index.build_index(ordered)

    origin = DescriptorMemoryElement('query', total)
    origin.set_vector(numpy.array([0, 0], float))

    # Top-5 results should be UIDs 0 through 4, nearest first.
    neighbors, distances = index.nn(origin, 5)
    for rank in range(5):
        ntools.assert_equal(neighbors[rank].uuid(), rank)

    # A global search should return everything in complete order.
    neighbors, distances = index.nn(origin, total)
    for expected_uid, neighbor, _ in zip(range(total), neighbors, distances):
        ntools.assert_equal(neighbor.uuid(), expected_uid)
def test_nn_preprocess_index(self):
    # Build an index from a factory string with a preprocessing stage and
    # check that a 10-neighbor query returns 10 neighbors with matching
    # distances.
    factory_string = 'PCAR64,IVF1,Flat'
    faiss_index = self._make_inst(factory_string=factory_string)
    self.assertEqual(faiss_index.factory_string, factory_string)

    np.random.seed(self.RAND_SEED)

    n = 10 ** 4
    dim = 256

    d_index = [DescriptorMemoryElement('test', i) for i in range(n)]
    # Plain loop: the calls are made for their side effect only, so no
    # throwaway list of return values is built.
    for d in d_index:
        d.set_vector(np.random.rand(dim))
    q = DescriptorMemoryElement('q', -1)
    q.set_vector(np.zeros((dim,)))

    faiss_index.build_index(d_index)

    nbrs, dists = faiss_index.nn(q, 10)
    self.assertEqual(len(nbrs), len(dists))
    self.assertEqual(len(nbrs), 10)
def test_random_descriptors_euclidean(self):
    # Exercise euclidean nearest-neighbor queries over random descriptors:
    # exact match, near match, and ordering of returned distances.
    n_descr = 1000
    n_dims = 256
    n_bits = 32

    def assert_strictly_increasing(values):
        # Every value must be strictly greater than its predecessor.
        for pos in xrange(1, len(values)):
            ntools.assert_greater(values[pos], values[pos-1])

    built = []
    for uid in xrange(n_descr):
        elem = DescriptorMemoryElement('random', uid)
        elem.set_vector(numpy.random.rand(n_dims))
        built.append(elem)

    index = self._make_inst('euclidean', n_bits)
    index.build_index(built)

    # A query taken from the build set should return itself when k=1.
    member = built[255]
    neighbors, distances = index.nn(member, 1)
    ntools.assert_equal(neighbors[0], member)

    # A query that is a slightly perturbed copy of a build vector should
    # return that build vector.
    anchor = built[0]
    near_query = DescriptorMemoryElement('query', n_descr)
    near_vec = numpy.array(anchor.vector())  # copy
    shift = max(near_vec.min(), 0.1)
    near_vec[0] += shift
    near_vec[n_dims-1] -= shift
    near_query.set_vector(near_vec)
    neighbors, distances = index.nn(near_query, 1)
    ntools.assert_false(
        numpy.array_equal(near_query.vector(), anchor.vector())
    )
    ntools.assert_equal(neighbors[0], anchor)

    # For a random query of any size k, returned distances should at
    # least be in increasing order.
    rand_query = DescriptorMemoryElement('query', n_descr+1)
    rand_query.set_vector(numpy.random.rand(n_dims))

    for k in (10, n_descr):
        neighbors, distances = index.nn(rand_query, k)
        assert_strictly_increasing(distances)
def test_nn_known_descriptors_euclidean_unit(self):
    # Index the unit vectors of a 5-dimensional space and query with the
    # zero vector: every indexed descriptor should be at distance 1.0.
    n_dims = 5

    index = self._make_inst()

    unit_descriptors = []
    for axis in range(n_dims):
        unit_vec = np.zeros(n_dims, float)
        unit_vec[axis] = 1.
        elem = DescriptorMemoryElement('unit', axis)
        elem.set_vector(unit_vec)
        unit_descriptors.append(elem)
    index.build_index(unit_descriptors)

    # Zero-vector query -- equally distant from every unit corner.
    origin = DescriptorMemoryElement('query', 0)
    origin.set_vector(np.zeros(n_dims, float))
    neighbors, distances = index.nn(origin, n=n_dims)

    self.assertEqual(len(distances), n_dims)

    # All distances should be 1.0; the neighbor order does not matter.
    for distance in distances:
        self.assertEqual(distance, 1.)
def test_pickle_dump_load(self):
    # Pickle round-trip of descriptor elements, checking that the
    # class-level MEMORY_CACHE is repopulated when they are loaded.
    cache_keys = (('test', 0), ('test', 1))

    # Start from an empty cache.
    DescriptorMemoryElement.MEMORY_CACHE = {}

    # Create two descriptors with known vectors.
    vec_a = numpy.array([1, 2, 3])
    elem_a = DescriptorMemoryElement('test', 0)
    elem_a.set_vector(vec_a)

    vec_b = numpy.array([4, 5, 6])
    elem_b = DescriptorMemoryElement('test', 1)
    elem_b.set_vector(vec_b)

    for key in cache_keys:
        ntools.assert_in(key, DescriptorMemoryElement.MEMORY_CACHE)

    dumped_a = cPickle.dumps(elem_a)
    dumped_b = cPickle.dumps(elem_b)

    # Empty the cache again before loading.
    DescriptorMemoryElement.MEMORY_CACHE = {}
    for key in cache_keys:
        ntools.assert_not_in(key, DescriptorMemoryElement.MEMORY_CACHE)

    # Reconstitute both elements from their pickled form.
    loaded_a = cPickle.loads(dumped_a)
    loaded_b = cPickle.loads(dumped_b)

    numpy.testing.assert_array_equal(vec_a, loaded_a.vector())
    numpy.testing.assert_array_equal(vec_b, loaded_b.vector())

    # Loading should have put the entries back in the cache.
    for key in cache_keys:
        ntools.assert_in(key, DescriptorMemoryElement.MEMORY_CACHE)
def test_known_descriptors_hik_unit(self):
    # Index the unit vectors of a 5-dimensional space under the 'hik'
    # distance and check distances for a zero-vector query and for a
    # query that is itself an indexed element.
    n_dims = 5

    index = self._make_inst('hik')

    unit_descriptors = []
    for axis in xrange(n_dims):
        unit_vec = numpy.zeros(n_dims, float)
        unit_vec[axis] = 1.
        elem = DescriptorMemoryElement('unit', axis)
        elem.set_vector(unit_vec)
        unit_descriptors.append(elem)
    index.build_index(unit_descriptors)

    # Zero-vector query: no intersection with any indexed descriptor, so
    # every distance should be 1.0 (the maximum by histogram
    # intersection); neighbor order does not matter.
    origin = DescriptorMemoryElement('query', 0)
    origin.set_vector(numpy.zeros(n_dims, float))
    neighbors, distances = index.nn(origin, n_dims)
    for distance in distances:
        ntools.assert_equal(distance, 1.)

    # Querying with an indexed element should return that element first
    # at distance 0, for both a single-result and a full-size query.
    member = unit_descriptors[3]
    for k in (1, n_dims):
        neighbors, distances = index.nn(member, k)
        ntools.assert_equal(neighbors[0], member)
        ntools.assert_equal(distances[0], 0.)
def test_output_immutability(self):
    # Modifying the array returned by ``vector()`` must not change the
    # data held by the element.
    source_vec = numpy.ones(16)
    element = DescriptorMemoryElement('test', 0)
    ntools.assert_false(element.has_vector())
    element.set_vector(source_vec)

    # Zero out the extracted array in place ...
    extracted = element.vector()
    extracted[:] = 0
    ntools.assert_equal(extracted.sum(), 0)

    # ... and check that a fresh extraction still sums to the original 16.
    ntools.assert_equal(element.vector().sum(), 16)
def test_known_descriptors_euclidean_ordered(self):
    # Descriptors (j, 2*j) lie at increasing euclidean distance from the
    # origin, so a query at the origin should return them in UID order.
    index = self._make_inst('euclidean')

    total = 10
    ordered = []
    for uid in xrange(total):
        elem = DescriptorMemoryElement('ordered', uid)
        elem.set_vector(numpy.array([uid, uid*2], float))
        ordered.append(elem)
    # Shuffle so the build order does not already match the expected
    # result order.
    random.shuffle(ordered)
    index.build_index(ordered)

    origin = DescriptorMemoryElement('query', 99)
    origin.set_vector(numpy.array([0, 0], float))
    neighbors, distances = index.nn(origin, total)

    # Both the UID and the stored vector of each result are checked.
    for expected_uid, neighbor, _ in zip(range(total), neighbors, distances):
        ntools.assert_equal(neighbor.uuid(), expected_uid)
        numpy.testing.assert_equal(neighbor.vector(),
                                   [expected_uid, expected_uid*2])
def test_set_state_version_1(self):
    # Older state version: a 3-tuple of (type, uid, vector bytes as
    # written by ``numpy.save``) should still be accepted by
    # ``__setstate__``.
    want_type = 'test-type'
    want_uid = 'test-uid'
    want_vec = numpy.array([1, 2, 3])

    # Serialize the vector into an in-memory buffer.
    buf = BytesIO()
    # noinspection PyTypeChecker
    numpy.save(buf, want_vec)
    legacy_state = (want_type, want_uid, buf.getvalue())

    element = DescriptorMemoryElement(None, None)
    element.__setstate__(legacy_state)

    self.assertEqual(element.type(), want_type)
    self.assertEqual(element.uuid(), want_uid)
    numpy.testing.assert_array_equal(element.vector(), want_vec)
def test_update_index(self):
    # Build the index from one descriptor, then update it with a second,
    # different descriptor and check the descriptor cache holds both.
    zeros_elem = DescriptorMemoryElement('test', 0)
    zeros_elem.set_vector(numpy.zeros(8))
    ones_elem = DescriptorMemoryElement('test', 1)
    ones_elem.set_vector(numpy.ones(8))

    index = self._make_inst('euclidean')

    # (operation, descriptors to pass, expected cache contents afterwards)
    steps = (
        (index.build_index, [zeros_elem], {zeros_elem}),
        (index.update_index, [ones_elem], {zeros_elem, ones_elem}),
    )
    for operation, new_descriptors, expected_cache in steps:
        operation(new_descriptors)
        self.assertEqual(index.count(), len(expected_cache))
        self.assertSetEqual(set(index._descr_cache), expected_cache)
def test_pickle_dump_load(self):
    # Pickle round-trip: vectors of reloaded elements must equal the
    # vectors that were originally set.
    vec_a = numpy.array([1, 2, 3])
    elem_a = DescriptorMemoryElement('test', 0)
    elem_a.set_vector(vec_a)

    vec_b = numpy.array([4, 5, 6])
    elem_b = DescriptorMemoryElement('test', 1)
    elem_b.set_vector(vec_b)

    # Serialize both elements before loading either one back.
    dumped_a = cPickle.dumps(elem_a)
    dumped_b = cPickle.dumps(elem_b)

    # Reconstitute and compare.
    loaded_a = cPickle.loads(dumped_a)
    loaded_b = cPickle.loads(dumped_b)

    numpy.testing.assert_array_equal(vec_a, loaded_a.vector())
    numpy.testing.assert_array_equal(vec_b, loaded_b.vector())
def _known_unit(self, hash_ftor, hash_idx, dist_method,
                ftor_train_hook=lambda d: None):
    """
    Shared check of an ``LSHNearestNeighborIndex`` built over the unit
    vectors of a 5-dimensional space.

    :param hash_ftor: Hash functor handed to the index constructor.
    :param hash_idx: Value passed as the index's ``hash_index`` argument.
    :param dist_method: Value passed as the index's ``distance_method``
        argument.
    :param ftor_train_hook: Callable invoked with the list of generated
        descriptors before the index is built.
    """
    n_dims = 5

    # Unit vectors - one per axis.
    unit_descriptors = []
    for axis in range(n_dims):
        unit_vec = np.zeros(n_dims, float)
        unit_vec[axis] = 1.
        elem = DescriptorMemoryElement('unit', axis)
        elem.set_vector(unit_vec)
        unit_descriptors.append(elem)

    ftor_train_hook(unit_descriptors)

    descr_index = MemoryDescriptorIndex()
    kv_store = MemoryKeyValueStore()
    index = LSHNearestNeighborIndex(hash_ftor, descr_index, kv_store,
                                    hash_index=hash_idx,
                                    distance_method=dist_method)
    index.build_index(unit_descriptors)

    # Zero-vector query: every distance is expected to be 1.0; neighbor
    # order does not matter.
    origin = DescriptorMemoryElement('query', 0)
    origin.set_vector(np.zeros(n_dims, float))
    neighbors, distances = index.nn(origin, n_dims)
    for distance in distances:
        self.assertEqual(distance, 1.)

    # Querying with an indexed element should return that element first
    # at distance 0, for both a single-result and a full-size query.
    member = unit_descriptors[3]
    for k in (1, n_dims):
        neighbors, distances = index.nn(member, k)
        self.assertEqual(neighbors[0], member)
        self.assertEqual(distances[0], 0.)
def train_classifier_iqr(config, iqr_state_fp):
    """
    Train the configured supervised classifier from the positive and
    negative vectors stored in an IQR state zip file.

    :param config: Configuration dictionary; its ``'classifier'`` entry is
        passed to ``from_plugin_config`` to build the classifier.
    :param iqr_state_fp: Path to the IQR state zip file.  The archive must
        contain exactly one member holding a JSON object with exactly the
        keys ``'pos'`` and ``'neg'``.

    :raises RuntimeError: If the configured classifier is not a
        ``SupervisedClassifier``, or if the state file does not have the
        layout described above.
    """
    log = logging.getLogger(__name__)

    #: :type: smqtk.algorithms.SupervisedClassifier
    classifier = from_plugin_config(config['classifier'], get_classifier_impls)

    if not isinstance(classifier, SupervisedClassifier):
        raise RuntimeError("Configured classifier must be of the "
                           "SupervisedClassifier type in order to train.")

    # Get pos/neg descriptors out of iqr state zip.  ZipFile opens the path
    # itself in binary mode, and the ``with`` block closes the archive even
    # when validation fails.
    with zipfile.ZipFile(iqr_state_fp, 'r') as z:
        members = z.namelist()
        if len(members) != 1:
            raise RuntimeError("Invalid IqrState file!")
        iqrs = json.loads(z.read(members[0]))
    if len(iqrs) != 2:
        raise RuntimeError("Invalid IqrState file!")
    if 'pos' not in iqrs or 'neg' not in iqrs:
        raise RuntimeError("Invalid IqrState file!")

    def _to_elements(vectors, first_uid):
        # Wrap each distinct vector in a descriptor element, numbering the
        # elements consecutively from ``first_uid``.
        elements = []
        for offset, v in enumerate(set(map(tuple, vectors))):
            d = DescriptorMemoryElement('train', first_uid + offset)
            d.set_vector(numpy.array(v))
            elements.append(d)
        return elements

    log.info("Loading pos/neg descriptors")
    #: :type: list[smqtk.representation.DescriptorElement]
    pos = _to_elements(iqrs['pos'], 0)
    # Negative UIDs continue where the positive ones stopped so that every
    # element gets a distinct UID.
    #: :type: list[smqtk.representation.DescriptorElement]
    neg = _to_elements(iqrs['neg'], len(pos))
    log.info('    positive -> %d', len(pos))
    log.info('    negative -> %d', len(neg))

    classifier.train({'positive': pos}, negatives=neg)