Example #1
    def test_build_index_fresh_build(self):
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs)

        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for i, d in enumerate(descriptors):
            d.set_vector(np.ones(1, float) * i)
        index.build_index(descriptors)

        # Make sure descriptors are now in attached index and in key-value-store
        self.assertEqual(descr_index.count(), 5)
        for d in descriptors:
            self.assertIn(d, descr_index)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 5)
        for i in range(5):
            self.assertSetEqual(hash_kvs.get(i), {i})
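
The assertions above only hold under a specific hashing behavior: per the inline comment, the dummy functor bins each descriptor by the sum of its vector, so a length-1 vector [i] lands in bin i. A minimal sketch of such a functor under that assumption (hypothetical; the actual test helper may differ):

import numpy as np

class DummyHashFunctor(object):
    """
    Hypothetical stand-in: hash a descriptor vector to the integer sum
    of its components, so a length-1 vector [i] lands in bin i.
    """

    def get_hash(self, vec):
        # Return a one-element hash-code vector holding the element sum.
        return np.array([int(np.sum(vec))])
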
Example #2
    def test_count_empty_hash2uid(self):
        """
        Test that an empty hash-to-uid mapping results in a 0 return regardless
        of descriptor-set state.
        """
        descr_set = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        self.assertEqual(descr_set.count(), 0)
        self.assertEqual(hash_kvs.count(), 0)

        lsh = LSHNearestNeighborIndex(DummyHashFunctor(), descr_set, hash_kvs)
        self.assertEqual(lsh.count(), 0)

        # Additions to the descriptor-set should not impact LSH index "size"
        lsh.descriptor_index.add_descriptor(DescriptorMemoryElement('t', 0))
        self.assertEqual(lsh.descriptor_index.count(), 1)
        self.assertEqual(lsh.hash2uuids_kvstore.count(), 0)
        self.assertEqual(lsh.count(), 0)

        lsh.descriptor_index.add_descriptor(DescriptorMemoryElement('t', 1))
        self.assertEqual(lsh.descriptor_index.count(), 2)
        self.assertEqual(lsh.hash2uuids_kvstore.count(), 0)
        self.assertEqual(lsh.count(), 0)

        lsh.hash2uuids_kvstore.add(0, {0})
        self.assertEqual(lsh.descriptor_index.count(), 2)
        self.assertEqual(lsh.count(), 1)

        lsh.hash2uuids_kvstore.add(0, {0, 1})
        self.assertEqual(lsh.descriptor_index.count(), 2)
        self.assertEqual(lsh.count(), 2)

        lsh.hash2uuids_kvstore.add(0, {0, 1, 2})
        self.assertEqual(lsh.descriptor_index.count(), 2)
        self.assertEqual(lsh.count(), 3)
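
The progression above implies that the LSH index's count() derives entirely from the hash-to-UUIDs key-value store, summing the sizes of its mapped sets while ignoring the descriptor set. A hedged sketch of that relationship (hypothetical helper, not the SMQTK implementation):

def lsh_count(hash2uuids_kvstore):
    # Total number of UUIDs referenced across all hash buckets; the
    # descriptor set's contents do not factor in.
    return sum(len(hash2uuids_kvstore.get(h))
               for h in hash2uuids_kvstore.keys())
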
Example #3
    def test_has(self):
        i = MemoryDescriptorIndex()
        descrs = [random_descriptor() for _ in xrange(10)]
        i.add_many_descriptors(descrs)

        ntools.assert_true(i.has_descriptor(descrs[4].uuid()))
        ntools.assert_false(i.has_descriptor('not_an_int'))
Example #4
    def test_get_config(self):
        self.assertEqual(
            MemoryDescriptorIndex().get_config(),
            MemoryDescriptorIndex.get_default_config()
        )

        self.assertEqual(
            MemoryDescriptorIndex(None).get_config(),
            MemoryDescriptorIndex.get_default_config()
        )

        empty_elem = DataMemoryElement()
        self.assertEqual(
            MemoryDescriptorIndex(empty_elem).get_config(),
            merge_dict(MemoryDescriptorIndex.get_default_config(), {
                'cache_element': {'type': 'DataMemoryElement'}
            })
        )

        dict_pickle_bytes = pickle.dumps({1: 1, 2: 2, 3: 3}, -1)
        cache_elem = DataMemoryElement(bytes=dict_pickle_bytes)
        self.assertEqual(
            MemoryDescriptorIndex(cache_elem).get_config(),
            merge_dict(MemoryDescriptorIndex.get_default_config(), {
                'cache_element': {
                    'DataMemoryElement': {
                        'bytes': dict_pickle_bytes
                    },
                    'type': 'DataMemoryElement'
                }
            })
        )
Example #5
    def test_update_index_no_existing_index(self):
        # Test that calling update_index with no existing index acts like
        # building the index fresh.  This test is basically the same as
        # test_build_index_fresh_build but using update_index instead.
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs)

        descriptors = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors:
            d.set_vector(np.ones(1, float) * d.uuid())
        index.update_index(descriptors)

        # Make sure descriptors are now in attached index and in key-value-store
        self.assertEqual(descr_index.count(), 5)
        for d in descriptors:
            self.assertIn(d, descr_index)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 5)
        for i in range(5):
            self.assertSetEqual(hash_kvs.get(i), {i})
Example #6
    def test_from_config(self):
        inst = MemoryDescriptorIndex.from_config({'file_cache': None})
        ntools.assert_is_none(inst.file_cache)

        fp = '/doesnt/exist/yet'
        inst = MemoryDescriptorIndex.from_config({'file_cache': fp})
        ntools.assert_equal(inst.file_cache, fp)
Example #7
    def test_cache_table_empty_table(self):
        inst = MemoryDescriptorIndex(DataMemoryElement(), -1)
        inst._table = {}
        expected_table_pickle_bytes = pickle.dumps(inst._table, -1)

        inst.cache_table()
        self.assertIsNotNone(inst.cache_element)
        self.assertEqual(inst.cache_element.get_bytes(),
                         expected_table_pickle_bytes)
Example #8
    def test_add_descriptor(self):
        index = MemoryDescriptorIndex()

        d1 = random_descriptor()
        index.add_descriptor(d1)
        ntools.assert_equal(index._table[d1.uuid()], d1)

        d2 = random_descriptor()
        index.add_descriptor(d2)
        ntools.assert_equal(index._table[d2.uuid()], d2)
Example #9
    def test_clear(self):
        i = MemoryDescriptorIndex()
        n = 10

        descrs = [random_descriptor() for _ in xrange(n)]
        i.add_many_descriptors(descrs)
        ntools.assert_equal(len(i), n)
        i.clear()
        ntools.assert_equal(len(i), 0)
        ntools.assert_equal(i._table, {})
Example #10
    def test_remove_from_index_shared_hashes_partial(self):
        """
        Test that only some hashes are removed from the hash index, but not
        others when those hashes still refer to other descriptors.
        """
        # Simulate an initial state with some descriptors hashed to one value
        # and other descriptors hashed to another.

        # Vectors of length 1 for easy dummy hashing prediction.
        descriptors = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]

        # Dummy hash functor with ``get_hash`` mocked to produce the
        # simulated binning.
        hash_func = DummyHashFunctor()
        hash_func.get_hash = mock.Mock(
            # Vectors of even sum hash to 0, odd to 1.
            side_effect=lambda vec: [vec.sum() % 2]
        )

        d_set = MemoryDescriptorIndex()
        d_set._table = {
            0: descriptors[0],
            1: descriptors[1],
            2: descriptors[2],
            3: descriptors[3],
            4: descriptors[4],
        }

        hash2uid_kvs = MemoryKeyValueStore()
        hash2uid_kvs._table = {
            0: {0, 2, 4},
            1: {1, 3},
        }

        idx = LSHNearestNeighborIndex(hash_func, d_set, hash2uid_kvs)
        idx.hash_index = mock.Mock(spec=HashIndex)

        idx.remove_from_index([1, 2, 3])
        # Check that only one hash vector was passed to hash_index's removal
        # method (deque of hash-code vectors).
        idx.hash_index.remove_from_index.assert_called_once_with(
            collections.deque([
                [1],
            ])
        )
        self.assertDictEqual(d_set._table, {
            0: descriptors[0],
            4: descriptors[4],
        })
        self.assertDictEqual(hash2uid_kvs._table, {0: {0, 4}})
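
The expected call with deque([[1]]) captures the bookkeeping under test: a hash code is only retired from the hash index once its UUID bucket empties. A sketch of that logic under the same even/odd hashing assumption (hypothetical helper, not the SMQTK implementation):

import collections

def remove_uuids(d_table, hash2uid_table, hash_of, uids):
    # Drop each UUID from its hash bucket; only when a bucket empties is
    # its hash-code vector queued for removal from the hash index.
    retired = collections.deque()
    for uid in uids:
        h = hash_of(d_table[uid].vector())  # e.g. [vec.sum() % 2]
        bucket = set(hash2uid_table[h[0]])
        bucket.discard(uid)
        if bucket:
            hash2uid_table[h[0]] = bucket
        else:
            del hash2uid_table[h[0]]
            retired.append(h)
        del d_table[uid]
    return retired
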
Example #11
    def test_from_config_null_cache_elem(self):
        inst = MemoryDescriptorIndex.from_config({'cache_element': None})
        self.assertIsNone(inst.cache_element)
        self.assertEqual(inst._table, {})

        inst = MemoryDescriptorIndex.from_config({
            'cache_element': {
                'type': None
            }
        })
        self.assertIsNone(inst.cache_element)
        self.assertEqual(inst._table, {})
Example #12
    def test_update_index_existing_descriptors_frozenset(self):
        """
        Same as ``test_update_index_similar_descriptors``, but tests that the
        index can be updated when its key-value store is seeded with existing
        frozenset values.
        """
        # Similar Descriptors to build and update on (different instances)
        descriptors1 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5).set_vector([0]),
            DescriptorMemoryElement('t', 6).set_vector([1]),
            DescriptorMemoryElement('t', 7).set_vector([2]),
            DescriptorMemoryElement('t', 8).set_vector([3]),
            DescriptorMemoryElement('t', 9).set_vector([4]),
        ]

        descr_index = MemoryDescriptorIndex()
        descr_index.add_many_descriptors(descriptors1)

        hash_kvs = MemoryKeyValueStore()
        hash_kvs.add(0, frozenset({0}))
        hash_kvs.add(1, frozenset({1}))
        hash_kvs.add(2, frozenset({2}))
        hash_kvs.add(3, frozenset({3}))
        hash_kvs.add(4, frozenset({4}))

        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs)
        index.update_index(descriptors2)

        assert descr_index.count() == 10
        # Above descriptors should be considered "in" the descriptor set now.
        for d in descriptors1:
            assert d in descr_index
        for d in descriptors2:
            assert d in descr_index
        # Known hashes of the above descriptors should be in the KVS
        assert set(hash_kvs.keys()) == {0, 1, 2, 3, 4}
        assert hash_kvs.get(0) == {0, 5}
        assert hash_kvs.get(1) == {1, 6}
        assert hash_kvs.get(2) == {2, 7}
        assert hash_kvs.get(3) == {3, 8}
        assert hash_kvs.get(4) == {4, 9}
Example #13
    def test_clustering_equal_descriptors(self):
        # Test that groups of equal descriptors (one group per feature
        # dimension) are correctly clustered together.
        print("Creating dummy descriptors")
        n_features = 8
        n_descriptors = 20

        index = MemoryDescriptorIndex()
        c = 0
        for i in range(n_features):
            v = numpy.ndarray((n_features,))
            v[...] = 0
            v[i] = 1
            for j in range(n_descriptors):
                d = DescriptorMemoryElement('test', c)
                d.set_vector(v)
                index.add_descriptor(d)
                c += 1

        print("Creating test MBKM")
        mbkm = MiniBatchKMeans(n_features, batch_size=12, verbose=True,
                               compute_labels=False, random_state=0)

        # Initial fit with half of index
        d_classes = mb_kmeans_build_apply(index, mbkm, n_descriptors)

        # There should be 20 descriptors per class
        for c in d_classes:
            self.assertEqual(
                len(d_classes[c]),
                n_descriptors,
                "Cluster %s did not have expected number of descriptors "
                "(%d != %d)"
                % (c, n_descriptors, len(d_classes[c]))
            )

            # Each descriptor in each cluster should be equal to the other
            # descriptors in that cluster
            uuids = list(d_classes[c])
            v = index[uuids[0]].vector()
            for uuid in uuids[1:]:
                v2 = index[uuid].vector()
                numpy.testing.assert_array_equal(v, v2,
                                                 "vector in cluster %d did not "
                                                 "match other vectors "
                                                 "(%s != %s)"
                                                 % (c, v, v2))
Example #14
    def test_added_descriptor_table_caching(self):
        cache_elem = DataMemoryElement(readonly=False)
        descrs = [random_descriptor() for _ in range(3)]
        expected_table = dict((r.uuid(), r) for r in descrs)

        i = MemoryDescriptorIndex(cache_elem)
        self.assertTrue(cache_elem.is_empty())

        # Should add descriptors to table, caching to writable element.
        i.add_many_descriptors(descrs)
        self.assertFalse(cache_elem.is_empty())
        self.assertEqual(pickle.loads(i.cache_element.get_bytes()),
                         expected_table)

        # Changes to the internal table (add, remove) should be reflected in
        # the cache.
        new_d = random_descriptor()
        expected_table[new_d.uuid()] = new_d
        i.add_descriptor(new_d)
        self.assertEqual(pickle.loads(i.cache_element.get_bytes()),
                         expected_table)

        rm_d = list(expected_table.values())[0]
        del expected_table[rm_d.uuid()]
        i.remove_descriptor(rm_d.uuid())
        self.assertEqual(pickle.loads(i.cache_element.get_bytes()),
                         expected_table)
Example #15
    def test_count(self):
        index = MemoryDescriptorIndex()
        ntools.assert_equal(index.count(), 0)

        d1 = random_descriptor()
        index.add_descriptor(d1)
        ntools.assert_equal(index.count(), 1)

        d2 = random_descriptor()
        index.add_descriptor(d2)
        ntools.assert_equal(index.count(), 2)
Example #16
    def test_update_index_similar_descriptors(self):
        """
        Test that updating a built index with similar descriptors (same
        vectors, different UUIDs) results in contained structures having an
        expected state.
        """
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs)

        # Similar Descriptors to build and update on (different instances)
        descriptors1 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5).set_vector([0]),
            DescriptorMemoryElement('t', 6).set_vector([1]),
            DescriptorMemoryElement('t', 7).set_vector([2]),
            DescriptorMemoryElement('t', 8).set_vector([3]),
            DescriptorMemoryElement('t', 9).set_vector([4]),
        ]

        index.build_index(descriptors1)
        index.update_index(descriptors2)

        assert descr_index.count() == 10
        # Above descriptors should be considered "in" the descriptor set now.
        for d in descriptors1:
            assert d in descr_index
        for d in descriptors2:
            assert d in descr_index
        # Known hashes of the above descriptors should be in the KVS
        assert set(hash_kvs.keys()) == {0, 1, 2, 3, 4}
        assert hash_kvs.get(0) == {0, 5}
        assert hash_kvs.get(1) == {1, 6}
        assert hash_kvs.get(2) == {2, 7}
        assert hash_kvs.get(3) == {3, 8}
        assert hash_kvs.get(4) == {4, 9}
Example #17
    def test_update_index_add_new_descriptors(self):
        # Test that calling update index after a build index causes index
        # components to be properly updated.
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs)
        descriptors1 = [
            DescriptorMemoryElement('t', 0),
            DescriptorMemoryElement('t', 1),
            DescriptorMemoryElement('t', 2),
            DescriptorMemoryElement('t', 3),
            DescriptorMemoryElement('t', 4),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 5),
            DescriptorMemoryElement('t', 6),
        ]
        # Vectors of length 1 for easy dummy hashing prediction.
        for d in descriptors1 + descriptors2:
            d.set_vector(np.ones(1, float) * d.uuid())

        # Build initial index.
        index.build_index(descriptors1)
        self.assertEqual(descr_index.count(), 5)
        for d in descriptors1:
            self.assertIn(d, descr_index)
        for d in descriptors2:
            self.assertNotIn(d, descr_index)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 5)
        for i in range(5):
            self.assertSetEqual(hash_kvs.get(i), {i})

        # Update index and check that components have new data.
        index.update_index(descriptors2)
        self.assertEqual(descr_index.count(), 7)
        for d in descriptors1 + descriptors2:
            self.assertIn(d, descr_index)
        # Dummy hash function bins sum of descriptor vectors.
        self.assertEqual(hash_kvs.count(), 7)
        for i in range(7):
            self.assertSetEqual(hash_kvs.get(i), {i})
Example #18
    def test_remove(self):
        i = MemoryDescriptorIndex()
        descrs = [random_descriptor() for _ in xrange(100)]
        i.add_many_descriptors(descrs)
        ntools.assert_equal(len(i), 100)
        ntools.assert_equal(list(i.iterdescriptors()), descrs)

        # remove singles
        i.remove_descriptor(descrs[0].uuid())
        ntools.assert_equal(len(i), 99)
        ntools.assert_equal(set(i.iterdescriptors()),
                            set(descrs[1:]))

        # remove many
        rm_d = descrs[slice(45, 80, 3)]
        i.remove_many_descriptors((d.uuid() for d in rm_d))
        ntools.assert_equal(len(i), 99 - len(rm_d))
        ntools.assert_equal(set(i.iterdescriptors()),
                            set(descrs[1:]).difference(rm_d))
Example #19
    def test_update_index_duplicate_descriptors(self):
        """
        Test that updating a built index with the same descriptors results in
        idempotent behavior.
        """
        descr_index = MemoryDescriptorIndex()
        hash_kvs = MemoryKeyValueStore()
        index = LSHNearestNeighborIndex(DummyHashFunctor(),
                                        descr_index, hash_kvs)

        # Identical Descriptors to build and update on (different instances)
        descriptors1 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]
        descriptors2 = [
            DescriptorMemoryElement('t', 0).set_vector([0]),
            DescriptorMemoryElement('t', 1).set_vector([1]),
            DescriptorMemoryElement('t', 2).set_vector([2]),
            DescriptorMemoryElement('t', 3).set_vector([3]),
            DescriptorMemoryElement('t', 4).set_vector([4]),
        ]

        index.build_index(descriptors1)
        index.update_index(descriptors2)

        assert descr_index.count() == 5
        # Above descriptors should be considered "in" the descriptor set now.
        for d in descriptors1:
            assert d in descr_index
        for d in descriptors2:
            assert d in descr_index
        # Known hashes of the above descriptors should be in the KVS
        assert set(hash_kvs.keys()) == {0, 1, 2, 3, 4}
        assert hash_kvs.get(0) == {0}
        assert hash_kvs.get(1) == {1}
        assert hash_kvs.get(2) == {2}
        assert hash_kvs.get(3) == {3}
        assert hash_kvs.get(4) == {4}
Example #20
    def test_from_config_null_cache_elem_type(self):
        # An empty cache should not trigger loading on construction.
        expected_empty_cache = DataMemoryElement()
        inst = MemoryDescriptorIndex.from_config({
            'cache_element': {
                'type': 'DataMemoryElement',
                'DataMemoryElement': {'bytes': ''}
            }
        })
        self.assertEqual(inst.cache_element, expected_empty_cache)
        self.assertEqual(inst._table, {})
Example #21
    def test_from_config(self):
        # Configured cache with some pickled bytes
        expected_table = dict(a=1, b=2, c=3)
        expected_cache = DataMemoryElement(bytes=pickle.dumps(expected_table))
        inst = MemoryDescriptorIndex.from_config({
            'cache_element': {
                'type': 'DataMemoryElement',
                'DataMemoryElement': {'bytes': expected_cache.get_bytes()}
            }
        })
        self.assertEqual(inst.cache_element, expected_cache)
        self.assertEqual(inst._table, expected_table)
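
This test implies that construction from a populated cache element unpickles its bytes straight into the internal table. A minimal sketch of that loading step, assuming the cache/table attributes these tests reference (not the actual constructor code):

import pickle

def load_table(cache_element):
    # An absent or empty cache element yields an empty table; otherwise
    # the table is the unpickled cache payload.
    if cache_element is None or cache_element.is_empty():
        return {}
    return pickle.loads(cache_element.get_bytes())
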
Example #22
    def test_get_descriptors(self):
        descrs = [
            random_descriptor(),   # [0]
            random_descriptor(),   # [1]
            random_descriptor(),   # [2]
            random_descriptor(),   # [3]
            random_descriptor(),   # [4]
        ]
        index = MemoryDescriptorIndex()
        index.add_many_descriptors(descrs)

        # single descriptor reference
        r = index.get_descriptor(descrs[1].uuid())
        ntools.assert_equal(r, descrs[1])

        # multiple descriptor reference
        r = list(index.get_many_descriptors([descrs[0].uuid(),
                                             descrs[3].uuid()]))
        ntools.assert_equal(len(r), 2)
        ntools.assert_equal(set(r),
                            {descrs[0], descrs[3]})
Example #23
    def test_add_many(self):
        descrs = [
            random_descriptor(),
            random_descriptor(),
            random_descriptor(),
            random_descriptor(),
            random_descriptor(),
        ]
        index = MemoryDescriptorIndex()
        index.add_many_descriptors(descrs)

        # Compare UUID keys of the input to the keys in the internal table
        ntools.assert_equal(set(index._table.keys()),
                            set([e.uuid() for e in descrs]))

        # Get the set of descriptors in the internal table and compare it with
        # the set of generated random descriptors.
        r_set = set(index._table.values())
        ntools.assert_equal(set(descrs), r_set)
Example #24
    def test_table_caching(self):
        fd, tmp_cache = tempfile.mkstemp()
        os.close(fd)
        os.remove(tmp_cache)

        try:
            i = MemoryDescriptorIndex(tmp_cache)
            descrs = [random_descriptor() for _ in xrange(3)]
            expected_cache = dict((r.uuid(), r) for r in descrs)

            # cache should not exist yet
            ntools.assert_false(os.path.isfile(tmp_cache))

            # Should write file and should be a dictionary of 3
            # elements
            i.add_many_descriptors(descrs)
            ntools.assert_true(os.path.isfile(tmp_cache))
            with open(tmp_cache, 'rb') as f:
                ntools.assert_equal(cPickle.load(f),
                                    expected_cache)

            # Changes to the internal table (add, remove) should be reflected
            # in the cache.
            new_d = random_descriptor()
            i.add_descriptor(new_d)
            expected_cache[new_d.uuid()] = new_d
            with open(tmp_cache, 'rb') as f:
                ntools.assert_equal(cPickle.load(f),
                                    expected_cache)

            rm_d = expected_cache.values()[0]
            i.remove_descriptor(rm_d.uuid())
            del expected_cache[rm_d.uuid()]
            with open(tmp_cache, 'rb') as f:
                ntools.assert_equal(cPickle.load(f),
                                    expected_cache)
        finally:
            os.remove(tmp_cache)
Example #25
class IqrSession(SmqtkObject):
    """
    Encapsulation of IQR Session related data structures with a centralized lock
    for multi-thread access.

    This object is compatible with the python with-statement, so when elements
    are to be used or modified, it should be within a with-block so race
    conditions do not occur across threads/sub-processes.

    """
    @property
    def _log(self):
        return logging.getLogger('.'.join((self.__module__,
                                           self.__class__.__name__)) +
                                 "[%s]" % self.uuid)

    def __init__(self,
                 pos_seed_neighbors=500,
                 rel_index_config=DFLT_REL_INDEX_CONFIG,
                 session_uid=None):
        """
        Initialize the IQR session

        This does not initialize the working index for ranking as there are no
        known positive descriptor examples at this time.

        Adjudications
        -------------
        Adjudications are carried through between initializations. This allows
        indexed material adjudicated throughout the lifetime of the session to
        stay relevant.

        :param pos_seed_neighbors: Number of neighbors to pull from the given
            ``nn_index`` for each positive exemplar when populating the working
            index, i.e. this value determines the size of the working index for
            IQR refinement. By default, we try to get 500 neighbors.

            Since there may be partial to significant overlap of near neighbors
            as a result of nn_index queries for positive exemplars, the working
            index may contain anywhere from this value's number of entries, to
            ``N*P``, where ``N`` is this value and ``P`` is the number of
            positive examples at the time of working index initialization.
        :type pos_seed_neighbors: int

        :param rel_index_config: Plugin configuration dictionary for the
            RelevancyIndex to use for ranking user adjudications. By default
            we use an in-memory libSVM-based index using the histogram
            intersection metric.
        :type rel_index_config: dict

        :param session_uid: Optional manual specification of session UUID.
        :type session_uid: str or uuid.UUID

        """
        self.uuid = session_uid or str(uuid.uuid1()).replace('-', '')
        self.lock = threading.RLock()

        self.pos_seed_neighbors = int(pos_seed_neighbors)

        # Local descriptor index for ranking, populated by a query to the
        #   nn_index instance.
        # Externally added data/descriptors are not added to this index.
        self.working_index = MemoryDescriptorIndex()

        # Book-keeping set so we know which positive descriptor UUIDs we have
        # already used to query the neighbor index.
        #: :type: set[collections.Hashable]
        self._wi_seeds_used = set()

        # Descriptor references from our index (above) that have been
        #   adjudicated.
        #: :type: set[smqtk.representation.DescriptorElement]
        self.positive_descriptors = set()
        #: :type: set[smqtk.representation.DescriptorElement]
        self.negative_descriptors = set()

        # Mapping of a DescriptorElement in our relevancy search index (not the
        #   index that the nn_index uses) to the relevancy score given the
        #   recorded positive and negative adjudications.
        # This is None before any initialization or refinement occurs.
        #: :type: None | dict[smqtk.representation.DescriptorElement, float]
        self.results = None

        #
        # Algorithm Instances [+Config]
        #
        # RelevancyIndex configuration and instance that is used for producing
        #   results.
        # This is only [re]constructed when initializing the session.
        self.rel_index_config = rel_index_config
        # This is None until session initialization happens after pos/neg
        # exemplar data has been added.
        #: :type: None | smqtk.algorithms.relevancy_index.RelevancyIndex
        self.rel_index = None

    def __enter__(self):
        """
        :rtype: IqrSession
        """
        self.lock.acquire()
        return self

    # noinspection PyUnusedLocal
    def __exit__(self, exc_type, exc_val, exc_tb):
        self.lock.release()

    def ordered_results(self):
        """
        Return a tuple of the current (id, probability) result pairs in
        order of descending probability score. If there are no results yet,
        None is returned.

        :rtype: None | tuple[(smqtk.representation.DescriptorElement, float)]

        """
        with self.lock:
            if self.results:
                return tuple(
                    sorted(self.results.items(),
                           key=lambda p: p[1],
                           reverse=True))
            return None

    def adjudicate(self,
                   new_positives=(),
                   new_negatives=(),
                   un_positives=(),
                   un_negatives=()):
        """
        Update the current state of working-index positive and negative
        adjudications with the given descriptor elements.

        If the same descriptor element is listed in both new positives and
        negatives, they cancel each other out, causing that descriptor to not
        be included in the adjudication.

        The given iterables must be re-traversable. Otherwise the given
        descriptors will not be properly registered.

        :param new_positives: Descriptors of elements in our working index to
            now be considered to be positively relevant.
        :type new_positives: collections.Iterable[smqtk.representation.DescriptorElement]

        :param new_negatives: Descriptors of elements in our working index to
            now be considered to be negatively relevant.
        :type new_negatives: collections.Iterable[smqtk.representation.DescriptorElement]

        :param un_positives: Descriptors of elements in our working index to now
            be considered not positive any more.
        :type un_positives: collections.Iterable[smqtk.representation.DescriptorElement]

        :param un_negatives: Descriptors of elements in our working index to now
            be considered not negative any more.
        :type un_negatives: collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self.lock:
            self.positive_descriptors.update(new_positives)
            self.positive_descriptors.difference_update(un_positives)
            self.positive_descriptors.difference_update(new_negatives)

            self.negative_descriptors.update(new_negatives)
            self.negative_descriptors.difference_update(un_negatives)
            self.negative_descriptors.difference_update(new_positives)

    def update_working_index(self, nn_index):
        """
        Initialize or update our current working index using the given
        :class:`.NearestNeighborsIndex` instance given our current positively
        labeled descriptor elements.

        We only query from the index for new positive elements since the last
        update or reset.

        :param nn_index: :class:`.NearestNeighborsIndex` to query from.
        :type nn_index: smqtk.algorithms.NearestNeighborsIndex

        :raises RuntimeError: There are no positive example descriptors in this
            session to use as a basis for querying.

        """
        if len(self.positive_descriptors) <= 0:
            raise RuntimeError("No positive descriptors to query the neighbor "
                               "index with.")

        # Not clearing working index because this step is intended to be
        # additive.
        updated = False

        # adding to working index
        for p in self.positive_descriptors:
            if p.uuid() not in self._wi_seeds_used:
                self._log.info("Querying neighbors to: %s", p)
                self.working_index.add_many_descriptors(
                    nn_index.nn(p, n=self.pos_seed_neighbors)[0])
                self._wi_seeds_used.add(p.uuid())
                updated = True

        # Make new relevancy index
        if updated:
            self._log.info("Creating new relevancy index over working index.")
            #: :type: smqtk.algorithms.relevancy_index.RelevancyIndex
            self.rel_index = plugin.from_plugin_config(
                self.rel_index_config, get_relevancy_index_impls())
            self.rel_index.build_index(self.working_index.iterdescriptors())

    def refine(self):
        """ Refine current model results based on current adjudication state

        :raises RuntimeError: No working index has been initialized.
            :meth:`update_working_index` should have been called after
            adjudicating some positive examples.
        :raises RuntimeError: There are no adjudications to run on. We must
            have at least one positive adjudication.

        """
        with self.lock:
            if not self.rel_index:
                raise RuntimeError("No relevancy index yet. Must not have "
                                   "initialized session (no working index).")

            # fuse pos/neg adjudications + added positive data descriptors
            pos = self.positive_descriptors
            neg = self.negative_descriptors

            if not pos:
                raise RuntimeError("Did not find at least one positive "
                                   "adjudication.")

            element_probability_map = self.rel_index.rank(pos, neg)

            if self.results is None:
                self.results = IqrResultsDict()
            self.results.update(element_probability_map)

            # Force adjudicated positives and negatives to be probability 1 and
            # 0, respectively, since we want to control where they show up in
            # our results view.
            # - Not all pos/neg descriptors may be in our working index.
            for d in pos:
                if d in self.results:
                    self.results[d] = 1.0
            for d in neg:
                if d in self.results:
                    self.results[d] = 0.0

    def reset(self):
        """ Reset the IQR Search state

        No positive adjudications, reload original feature data

        """
        with self.lock:
            self.working_index.clear()
            self._wi_seeds_used.clear()
            self.positive_descriptors.clear()
            self.negative_descriptors.clear()

            self.rel_index = None
            self.results = None
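
Given the with-statement support and the update/refine flow documented above, a typical session loop might look like the following sketch. The nn_index instance and the descriptor elements d_pos and d_neg are assumed to exist; this is illustrative usage, not part of the class:

session = IqrSession(pos_seed_neighbors=500)

# Record user feedback under the session lock; a descriptor listed as
# both positive and negative would cancel out.
with session:
    session.adjudicate(new_positives=[d_pos], new_negatives=[d_neg])

# Populate the working index from the positive exemplars, then rank.
session.update_working_index(nn_index)
with session:
    session.refine()
    for descriptor, probability in session.ordered_results():
        print(descriptor.uuid(), probability)
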
Example #26
    def test_init_empty_cache(self):
        cache_elem = DataMemoryElement()
        inst = MemoryDescriptorIndex(cache_element=cache_elem)
        self.assertEqual(inst.cache_element, cache_elem)
        self.assertEqual(inst._table, {})
Example #27
    def test_default_config(self):
        ntools.assert_equal(MemoryDescriptorIndex.get_default_config(), {
            "file_cache": None,
            "pickle_protocol": -1
        })
Example #28
    def __init__(self,
                 pos_seed_neighbors=500,
                 rel_index_config=DFLT_REL_INDEX_CONFIG,
                 session_uid=None):
        """
        Initialize the IQR session

        This does not initialize the working index for ranking as there are no
        known positive descriptor examples at this time.

        Adjudications
        -------------
        Adjudications are carried through between initializations. This allows
        indexed material adjudicated throughout the lifetime of the session to
        stay relevant.

        :param pos_seed_neighbors: Number of neighbors to pull from the given
            ``nn_index`` for each positive exemplar when populating the working
            index, i.e. this value determines the size of the working index for
            IQR refinement. By default, we try to get 500 neighbors.

            Since there may be partial to significant overlap of near neighbors
            as a result of nn_index queries for positive exemplars, the working
            index may contain anywhere from this value's number of entries, to
            ``N*P``, where ``N`` is this value and ``P`` is the number of
            positive examples at the time of working index initialization.
        :type pos_seed_neighbors: int

        :param rel_index_config: Plugin configuration dictionary for the
            RelevancyIndex to use for ranking user adjudications. By default
            we use an in-memory libSVM-based index using the histogram
            intersection metric.
        :type rel_index_config: dict

        :param session_uid: Optional manual specification of session UUID.
        :type session_uid: str or uuid.UUID

        """
        self.uuid = session_uid or str(uuid.uuid1()).replace('-', '')
        self.lock = threading.RLock()

        self.pos_seed_neighbors = int(pos_seed_neighbors)

        # Local descriptor index for ranking, populated by a query to the
        #   nn_index instance.
        # Externally added data/descriptors are not added to this index.
        self.working_index = MemoryDescriptorIndex()

        # Book-keeping set so we know which positive descriptor UUIDs we have
        # already used to query the neighbor index.
        #: :type: set[collections.Hashable]
        self._wi_seeds_used = set()

        # Descriptor references from our index (above) that have been
        #   adjudicated.
        #: :type: set[smqtk.representation.DescriptorElement]
        self.positive_descriptors = set()
        #: :type: set[smqtk.representation.DescriptorElement]
        self.negative_descriptors = set()

        # Mapping of a DescriptorElement in our relevancy search index (not the
        #   index that the nn_index uses) to the relevancy score given the
        #   recorded positive and negative adjudications.
        # This is None before any initialization or refinement occurs.
        #: :type: None | dict[smqtk.representation.DescriptorElement, float]
        self.results = None

        #
        # Algorithm Instances [+Config]
        #
        # RelevancyIndex configuration and instance that is used for producing
        #   results.
        # This is only [re]constructed when initializing the session.
        self.rel_index_config = rel_index_config
        # This is None until session initialization happens after pos/neg
        # exemplar data has been added.
        #: :type: None | smqtk.algorithms.relevancy_index.RelevancyIndex
        self.rel_index = None
Example #29
    def test_is_usable(self):
        ntools.assert_equal(MemoryDescriptorIndex.is_usable(), True)
Example #30
    def test_remove(self):
        i = MemoryDescriptorIndex()
        descrs = [random_descriptor() for _ in range(100)]
        i.add_many_descriptors(descrs)
        ntools.assert_equal(len(i), 100)
        ntools.assert_equal(list(i.iterdescriptors()), descrs)

        # remove singles
        i.remove_descriptor(descrs[0].uuid())
        ntools.assert_equal(len(i), 99)
        ntools.assert_equal(set(i.iterdescriptors()), set(descrs[1:]))

        # remove many
        rm_d = descrs[slice(45, 80, 3)]
        i.remove_many_descriptors((d.uuid() for d in rm_d))
        ntools.assert_equal(len(i), 99 - len(rm_d))
        ntools.assert_equal(set(i.iterdescriptors()),
                            set(descrs[1:]).difference(rm_d))
Example #31
    def __init__(self, pos_seed_neighbors=500,
                 rel_index_config=DFLT_REL_INDEX_CONFIG,
                 session_uid=None):
        """
        Initialize the IQR session

        This does not initialize the working index for ranking as there are no
        known positive descriptor examples at this time.

        Adjudications
        -------------
        Adjudications are carried through between initializations. This allows
        indexed material adjudicated throughout the lifetime of the session to
        stay relevant.

        :param pos_seed_neighbors: Number of neighbors to pull from the given
            ``nn_index`` for each positive exemplar when populating the working
            index, i.e. this value determines the size of the working index for
            IQR refinement. By default, we try to get 500 neighbors.

            Since there may be partial to significant overlap of near neighbors
            as a result of nn_index queries for positive exemplars, the working
            index may contain anywhere from this value's number of entries, to
            ``N*P``, where ``N`` is this value and ``P`` is the number of
            positive examples at the time of working index initialization.
        :type pos_seed_neighbors: int

        :param rel_index_config: Plugin configuration dictionary for the
            RelevancyIndex to use for ranking user adjudications. By default
            we use an in-memory libSVM-based index using the histogram
            intersection metric.
        :type rel_index_config: dict

        :param session_uid: Optional manual specification of session UUID.
        :type session_uid: str | uuid.UUID

        """
        self.uuid = session_uid or str(uuid.uuid1()).replace('-', '')
        self.lock = threading.RLock()

        self.pos_seed_neighbors = int(pos_seed_neighbors)

        # Local descriptor index for ranking, populated by a query to the
        #   nn_index instance.
        # Externally added data/descriptors are not added to this index.
        self.working_index = MemoryDescriptorIndex()

        # Book-keeping set so we know which positive descriptor UUIDs we have
        # already used to query the neighbor index.
        #: :type: set[collections.Hashable]
        self._wi_seeds_used = set()

        # Descriptor elements representing data from external sources.
        #: :type: set[smqtk.representation.DescriptorElement]
        self.external_positive_descriptors = set()
        #: :type: set[smqtk.representation.DescriptorElement]
        self.external_negative_descriptors = set()

        # Descriptor references from our index (above) that have been
        #   adjudicated.
        #: :type: set[smqtk.representation.DescriptorElement]
        self.positive_descriptors = set()
        #: :type: set[smqtk.representation.DescriptorElement]
        self.negative_descriptors = set()

        # Mapping of a DescriptorElement in our relevancy search index (not the
        #   index that the nn_index uses) to the relevancy score given the
        #   recorded positive and negative adjudications.
        # This is None before any initialization or refinement occurs.
        #: :type: None | dict[smqtk.representation.DescriptorElement, float]
        self.results = None

        #
        # Algorithm Instances [+Config]
        #
        # RelevancyIndex configuration and instance that is used for producing
        #   results.
        # This is only [re]constructed when initializing the session.
        self.rel_index_config = rel_index_config
        # This is None until session initialization happens after pos/neg
        # exemplar data has been added.
        #: :type: None | smqtk.algorithms.relevancy_index.RelevancyIndex
        self.rel_index = None
Example #32
    def test_count(self):
        index = MemoryDescriptorIndex()
        self.assertEqual(index.count(), 0)

        d1 = random_descriptor()
        index.add_descriptor(d1)
        self.assertEqual(index.count(), 1)

        d2, d3, d4 = (random_descriptor(), random_descriptor(),
                      random_descriptor())
        index.add_many_descriptors([d2, d3, d4])
        self.assertEqual(index.count(), 4)

        d5 = random_descriptor()
        index.add_descriptor(d5)
        self.assertEqual(index.count(), 5)
Example #33
    def test_count(self):
        index = MemoryDescriptorIndex()
        ntools.assert_equal(index.count(), 0)

        d1 = random_descriptor()
        index.add_descriptor(d1)
        ntools.assert_equal(index.count(), 1)

        d2, d3, d4 = (random_descriptor(), random_descriptor(),
                      random_descriptor())
        index.add_many_descriptors([d2, d3, d4])
        ntools.assert_equal(index.count(), 4)

        d5 = random_descriptor()
        index.add_descriptor(d5)
        ntools.assert_equal(index.count(), 5)
Example #34
class IqrSession (SmqtkObject):
    """
    Encapsulation of IQR Session related data structures with a centralized lock
    for multi-thread access.

    This object is compatible with the python with-statement, so when elements
    are to be used or modified, it should be within a with-block so race
    conditions do not occur across threads/sub-processes.

    """

    @property
    def _log(self):
        return logging.getLogger(
            '.'.join((self.__module__, self.__class__.__name__)) +
            "[%s]" % self.uuid
        )

    def __init__(self, pos_seed_neighbors=500,
                 rel_index_config=DFLT_REL_INDEX_CONFIG,
                 session_uid=None):
        """
        Initialize the IQR session

        This does not initialize the working index for ranking as there are no
        known positive descriptor examples at this time.

        Adjudications
        -------------
        Adjudications are carried through between initializations. This allows
        indexed material adjudicated throughout the lifetime of the session to
        stay relevant.

        :param pos_seed_neighbors: Number of neighbors to pull from the given
            ``nn_index`` for each positive exemplar when populating the working
            index, i.e. this value determines the size of the working index for
            IQR refinement. By default, we try to get 500 neighbors.

            Since there may be partial to significant overlap of near neighbors
            as a result of nn_index queries for positive exemplars, the working
            index may contain anywhere from this value's number of entries, to
            ``N*P``, where ``N`` is this value and ``P`` is the number of
            positive examples at the time of working index initialization.
        :type pos_seed_neighbors: int

        :param rel_index_config: Plugin configuration dictionary for the
            RelevancyIndex to use for ranking user adjudications. By default
            we use an in-memory libSVM-based index using the histogram
            intersection metric.
        :type rel_index_config: dict

        :param session_uid: Optional manual specification of session UUID.
        :type session_uid: str | uuid.UUID

        """
        self.uuid = session_uid or str(uuid.uuid1()).replace('-', '')
        self.lock = threading.RLock()

        self.pos_seed_neighbors = int(pos_seed_neighbors)

        # Local descriptor index for ranking, populated by a query to the
        #   nn_index instance.
        # Externally added data/descriptors are not added to this index.
        self.working_index = MemoryDescriptorIndex()

        # Book-keeping set so we know which positive descriptor UUIDs we have
        # already used to query the neighbor index.
        #: :type: set[collections.Hashable]
        self._wi_seeds_used = set()

        # Descriptor elements representing data from external sources.
        #: :type: set[smqtk.representation.DescriptorElement]
        self.external_positive_descriptors = set()
        #: :type: set[smqtk.representation.DescriptorElement]
        self.external_negative_descriptors = set()

        # Descriptor references from our index (above) that have been
        #   adjudicated.
        #: :type: set[smqtk.representation.DescriptorElement]
        self.positive_descriptors = set()
        #: :type: set[smqtk.representation.DescriptorElement]
        self.negative_descriptors = set()

        # Mapping of a DescriptorElement in our relevancy search index (not the
        #   index that the nn_index uses) to the relevancy score given the
        #   recorded positive and negative adjudications.
        # This is None before any initialization or refinement occurs.
        #: :type: None | dict[smqtk.representation.DescriptorElement, float]
        self.results = None

        #
        # Algorithm Instances [+Config]
        #
        # RelevancyIndex configuration and instance that is used for producing
        #   results.
        # This is only [re]constructed when initializing the session.
        self.rel_index_config = rel_index_config
        # This is None until session initialization happens after pos/neg
        # exemplar data has been added.
        #: :type: None | smqtk.algorithms.relevancy_index.RelevancyIndex
        self.rel_index = None

    def __enter__(self):
        """
        :rtype: IqrSession
        """
        self.lock.acquire()
        return self

    # noinspection PyUnusedLocal
    def __exit__(self, exc_type, exc_val, exc_tb):
        self.lock.release()

    def ordered_results(self):
        """
        Return a tuple of the current (id, probability) result pairs in
        order of descending probability score. If there are no results yet, None
        is returned.

        :rtype: None | tuple[(smqtk.representation.DescriptorElement, float)]

        """
        with self.lock:
            if self.results:
                return tuple(sorted(six.iteritems(self.results),
                                    key=lambda p: p[1],
                                    reverse=True))
            return None

    def external_descriptors(self, positive=(), negative=()):
        """
        Add positive/negative descriptors from external data.

        These descriptors may not be a part of our working index.

        :param positive: Iterable of descriptors from external sources to
            consider positive examples.
        :type positive:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param negative: Iterable of descriptors from external sources to
            consider negative examples.
        :type negative:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        positive = set(positive)
        negative = set(negative)
        with self.lock:
            self.external_positive_descriptors.update(positive)
            self.external_positive_descriptors.difference_update(negative)

            self.external_negative_descriptors.update(negative)
            self.external_negative_descriptors.difference_update(positive)

    def adjudicate(self, new_positives=(), new_negatives=(),
                   un_positives=(), un_negatives=()):
        """
        Update the current state of working-index positive and negative
        adjudications with the given descriptor elements.

        If the same descriptor element is listed in both new positives and
        negatives, they cancel each other out, causing that descriptor to not
        be included in the adjudication.

        The given iterables must be re-traversable. Otherwise the given
        descriptors will not be properly registered.

        :param new_positives: Descriptors of elements in our working index to
            now be considered to be positively relevant.
        :type new_positives:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param new_negatives: Descriptors of elements in our working index to
            now be considered to be negatively relevant.
        :type new_negatives:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param un_positives: Descriptors of elements in our working index to now
            be considered not positive any more.
        :type un_positives:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param un_negatives: Descriptors of elements in our working index to now
            be considered not negative any more.
        :type un_negatives:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        new_positives = set(new_positives)
        new_negatives = set(new_negatives)
        un_positives = set(un_positives)
        un_negatives = set(un_negatives)

        with self.lock:
            self.positive_descriptors.update(new_positives)
            self.positive_descriptors.difference_update(un_positives)
            self.positive_descriptors.difference_update(new_negatives)

            self.negative_descriptors.update(new_negatives)
            self.negative_descriptors.difference_update(un_negatives)
            self.negative_descriptors.difference_update(new_positives)

    def update_working_index(self, nn_index):
        """
        Initialize or update our current working index using the given
        :class:`.NearestNeighborsIndex` instance given our current positively
        labeled descriptor elements.

        We only query from the index for new positive elements since the last
        update or reset.

        :param nn_index: :class:`.NearestNeighborsIndex` to query from.
        :type nn_index: smqtk.algorithms.NearestNeighborsIndex

        :raises RuntimeError: There are no positive example descriptors in this
            session to use as a basis for querying.

        """
        pos_examples = (self.external_positive_descriptors |
                        self.positive_descriptors)
        if len(pos_examples) == 0:
            raise RuntimeError("No positive descriptors to query the neighbor "
                               "index with.")

        # Not clearing working index because this step is intended to be
        # additive.
        updated = False

        # adding to working index
        self._log.info("Building working index using %d positive examples "
                       "(%d external, %d adjudicated)",
                       len(pos_examples),
                       len(self.external_positive_descriptors),
                       len(self.positive_descriptors))
        # TODO: parallel_map and reduce with merge-dict
        for p in pos_examples:
            if p.uuid() not in self._wi_seeds_used:
                self._log.debug("Querying neighbors to: %s", p)
                self.working_index.add_many_descriptors(
                    nn_index.nn(p, n=self.pos_seed_neighbors)[0]
                )
                self._wi_seeds_used.add(p.uuid())
                updated = True

        # Make new relevancy index
        if updated:
            self._log.info("Creating new relevancy index over working index.")
            #: :type: smqtk.algorithms.relevancy_index.RelevancyIndex
            self.rel_index = plugin.from_plugin_config(
                self.rel_index_config, get_relevancy_index_impls()
            )
            self.rel_index.build_index(self.working_index.iterdescriptors())

    def refine(self):
        """ Refine current model results based on current adjudication state

        :raises RuntimeError: No working index has been initialized.
            :meth:`update_working_index` should have been called after
            adjudicating some positive examples.
        :raises RuntimeError: There are no adjudications to run on. We must
            have at least one positive adjudication.

        """
        with self.lock:
            if not self.rel_index:
                raise RuntimeError("No relevancy index yet. Must not have "
                                   "initialized session (no working index).")

            # combine pos/neg adjudications + added external data descriptors
            pos = self.positive_descriptors | self.external_positive_descriptors
            neg = self.negative_descriptors | self.external_negative_descriptors

            if not pos:
                raise RuntimeError("Did not find at least one positive "
                                   "adjudication.")

            self._log.debug("Ranking working set with %d pos and %d neg total "
                            "examples.", len(pos), len(neg))
            element_probability_map = self.rel_index.rank(pos, neg)

            if self.results is None:
                self.results = IqrResultsDict()
            self.results.update(element_probability_map)

            # Force adjudicated positives and negatives to be probability 1 and
            # 0, respectively, since we want to control where they show up in
            # our results view.
            # - Not all pos/neg descriptors may be in our working index.
            for d in pos:
                if d in self.results:
                    self.results[d] = 1.0
            for d in neg:
                if d in self.results:
                    self.results[d] = 0.0

    def reset(self):
        """ Reset the IQR Search state

        No positive adjudications; reload original feature data.

        """
        with self.lock:
            self.working_index.clear()
            self._wi_seeds_used.clear()
            self.positive_descriptors.clear()
            self.negative_descriptors.clear()
            self.external_positive_descriptors.clear()
            self.external_negative_descriptors.clear()

            self.rel_index = None
            self.results = None

    ###########################################################################
    # I/O Methods

    # I/O Constants. These should not be changed.
    STATE_ZIP_COMPRESSION = zipfile.ZIP_DEFLATED
    STATE_ZIP_FILENAME = "iqr_state.json"

    def get_state_bytes(self):
        """
        Get a byte representation of the current descriptor and adjudication
        state of this session.

        This does not encode current results or the relevancy index's state, but
        these can be reproduced with this state.

        :return: State representation bytes
        :rtype: bytes

        """
        def d_set_to_list(d_set):
            # Convert set of descriptors to list of tuples:
            #   [..., (uuid, type, vector), ...]
            return [(d.uuid(), d.type(), d.vector().tolist()) for d in d_set]

        with self:
            # Convert session descriptors into basic values.
            pos_d = d_set_to_list(self.positive_descriptors)
            neg_d = d_set_to_list(self.negative_descriptors)
            ext_pos_d = d_set_to_list(self.external_positive_descriptors)
            ext_neg_d = d_set_to_list(self.external_negative_descriptors)

        z_buffer = io.BytesIO()
        z = zipfile.ZipFile(z_buffer, 'w', self.STATE_ZIP_COMPRESSION)
        z.writestr(self.STATE_ZIP_FILENAME, json.dumps({
            'pos': pos_d,
            'neg': neg_d,
            'external_pos': ext_pos_d,
            'external_neg': ext_neg_d,
        }))
        z.close()
        return z_buffer.getvalue()

    def set_state_bytes(self, b, descriptor_factory):
        """
        Set this session's state to the given byte representation, resetting
        this session in the process.

        Bytes given must have been retrieved via a previous call to
        ``get_state_bytes``; otherwise this method will fail.

        Since this state may be completely different from the current state,
        this session is reset before applying the new state. Thus, any current
        ranking results are thrown away.

        :param b: Bytes to set this session's state to.
        :type b: bytes

        :param descriptor_factory: Descriptor element factory to use when
            generating descriptor elements from extracted data.
        :type descriptor_factory: smqtk.representation.DescriptorElementFactory

        :raises ValueError: The input bytes could not be loaded due to
            incompatibility.

        """
        z_buffer = io.BytesIO(b)
        z = zipfile.ZipFile(z_buffer, 'r', self.STATE_ZIP_COMPRESSION)
        if self.STATE_ZIP_FILENAME not in z.namelist():
            raise ValueError("Invalid bytes given, did not contain expected "
                             "zipped file name.")

        # Extract expected json file object
        state = json.loads(z.read(self.STATE_ZIP_FILENAME).decode())
        del z, z_buffer

        with self:
            self.reset()

            def load_descriptor(_uid, _type_str, vec_list):
                _e = descriptor_factory.new_descriptor(_type_str, _uid)
                if _e.has_vector():
                    assert _e.vector().tolist() == vec_list, \
                        "Found existing vector for UUID '%s' but vectors " \
                        "did not match." % _uid
                else:
                    _e.set_vector(vec_list)
                return _e

            # Read in raw descriptor data from the state, convert to descriptor
            # element, then store in our descriptor sets.
            for source, target in [(state['external_pos'],
                                    self.external_positive_descriptors),
                                   (state['external_neg'],
                                    self.external_negative_descriptors),
                                   (state['pos'], self.positive_descriptors),
                                   (state['neg'], self.negative_descriptors)]:
                for uid, type_str, vector_list in source:
                    e = load_descriptor(uid, type_str, vector_list)
                    target.add(e)
Example #35
0
 def test_init_no_cache(self):
     inst = MemoryDescriptorIndex()
     ntools.assert_is_none(inst.cache_element)
     ntools.assert_equal(inst._table, {})
Example #36
0
 def test_iteritems(self):
     i = MemoryDescriptorIndex()
     descrs = [random_descriptor() for _ in range(100)]
     i.add_many_descriptors(descrs)
     ntools.assert_equal(set(i.items()), set((d.uuid(), d) for d in descrs))
Example #37
0
 def test_default_config(self):
     # Default should be valid for constructing a new instance.
     c = MemoryDescriptorIndex.get_default_config()
     ntools.assert_equal(
         MemoryDescriptorIndex.from_config(c).get_config(), c)
Example #38
0
 def test_init_no_cache(self):
     inst = MemoryDescriptorIndex()
     self.assertIsNone(inst.cache_element)
     self.assertEqual(inst._table, {})
Example #39
0
    plt.plot(r, p, label="auc=%f" % pr_curve_area)
    plt.xlim([0., 1.])
    plt.ylim([0., 1.05])
    plt.title("PR - HT Positive")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend(loc='best', fancybox=True, framealpha=0.5)
    plt.savefig(PLOT_PR_OUTPUT)

else:
    # Using the final trained classifier
    with open(CLASSIFIER_TRAINING_CONFIG_JSON) as f:
        classifier_config = json.load(f)

    log.info("Loading plugins")
    descriptor_index = MemoryDescriptorIndex(file_cache=DESCRIPTOR_INDEX_FILE_CACHE)
    #: :type: smqtk.algorithms.Classifier
    classifier = from_plugin_config(classifier_config['plugins']['classifier'],
                                    get_classifier_impls())
    c_factory = ClassificationElementFactory(MemoryClassificationElement, {})

    #: :type: dict[str, list[str]]
    phone2shas = json.load(open(PHONE_SHA1_JSON))
    #: :type: dict[str, float]
    phone2score = {}

    log.info("Classifying phone imagery descriptors")
    i = 0
    descriptor_index_shas = set(descriptor_index.iterkeys())
    for p in phone2shas:
        log.info('%s (%d / %d)', p, i + 1, len(phone2shas))
Example #40
0
 def test_iterkeys(self):
     i = MemoryDescriptorIndex()
     descrs = [random_descriptor() for _ in range(100)]
     i.add_many_descriptors(descrs)
     self.assertEqual(set(i.iterkeys()), set(d.uuid() for d in descrs))
Example #41
0
 def test_default_config(self):
     ntools.assert_equal(
         MemoryDescriptorIndex.get_default_config(),
         {"file_cache": None, "pickle_protocol": -1}
     )
Example #42
0
 def test_iteritems(self):
     i = MemoryDescriptorIndex()
     descrs = [random_descriptor() for _ in xrange(100)]
     i.add_many_descriptors(descrs)
     ntools.assert_equal(set(i.iteritems()),
                         set((d.uuid(), d) for d in descrs))
Example #43
0
 def test_is_usable(self):
     ntools.assert_equal(MemoryDescriptorIndex.is_usable(), True)
Example #44
0
 def test_cache_table_no_cache(self):
     inst = MemoryDescriptorIndex()
     inst._table = {}
     inst.cache_table()  # should basically do nothing
     ntools.assert_is_none(inst.cache_element)
Example #45
0
 def test_is_usable(self):
     # Always usable because no dependencies.
     ntools.assert_equal(MemoryDescriptorIndex.is_usable(), True)