def _bulk_rank_error(self, function):
     """Assert that ``function`` raises ValueError when a vector has the wrong rank.

     Eleven vectors are requested from HyperplaneHasher._random_vectors: one
     built with ``self.hhenl.rank - 1`` and ten built with ``self.hhenl.rank``;
     the short one is placed last. The first ten vectors are bulk-labelled
     up front, then ``function(vecs, vec_ids)`` is expected to raise.
     """
     rank = self.hhenl.rank
     short_vec = HyperplaneHasher._random_vectors(1, rank - 1)
     all_vecs = HyperplaneHasher._random_vectors(10, rank) + short_vec
     all_ids = self.letters[:11]
     # Label the ten well-formed vectors before triggering the failure.
     self.hhenl._bulk_label_chamber_ensemble(all_vecs[:10], all_ids[:10])
     with self.assertRaises(ValueError):
         function(all_vecs, all_ids)
Пример #2
0
 def _bulk_rank_error(self, function):
     """Assert that ``function`` raises ValueError when a vector has the wrong rank.

     Eleven vectors are requested from HyperplaneHasher._random_vectors: one
     built with ``self.hhenl.rank - 1`` and ten built with ``self.hhenl.rank``;
     the short one is placed last. The first ten vectors are bulk-labelled
     up front, then ``function(vecs, vec_ids)`` is expected to raise.
     """
     rank = self.hhenl.rank
     short_vec = HyperplaneHasher._random_vectors(1, rank - 1)
     all_vecs = HyperplaneHasher._random_vectors(10, rank) + short_vec
     all_ids = self.letters[:11]
     # Label the ten well-formed vectors before triggering the failure.
     self.hhenl._bulk_label_chamber_ensemble(all_vecs[:10], all_ids[:10])
     with self.assertRaises(ValueError):
         function(all_vecs, all_ids)
 def test_label_chamber_ensemble_1(self):
     """For each underlying HyperplaneHasher object, a new label is
     added to precisely one chamber. The set of chamber ids reached by the
     known vectors is either unchanged, or enlarged by one element.

     Labels are snapshotted (copied) per chamber id before and after the
     call to _label_chamber_ensemble, so the before/after comparison does
     not depend on set iteration order, nor on whether chamber_labels()
     hands back a live object that is mutated in place.
     """
     hhs = self.hhenl.hhs
     # Work on a copy so the fixture list is not mutated by this test.
     feature_vecs = list(self.feature_vecs)
     old_chamber_ids = {
         hh: set(hh.get_chamber_id(vec) for vec in feature_vecs)
         for hh in hhs
     }
     old_chamber_labels = {
         hh: dict((ch_id, set(hh.chamber_labels(ch_id)))
                  for ch_id in old_chamber_ids[hh])
         for hh in hhs
     }
     new_vec = HyperplaneHasher._random_vectors(1, self.hhenl.rank)[0]
     self.hhenl._label_chamber_ensemble(new_vec, 'new_vec_id')
     feature_vecs.append(new_vec)
     new_chamber_ids = {
         hh: set(hh.get_chamber_id(vec) for vec in feature_vecs)
         for hh in hhs
     }
     new_chamber_labels = {
         hh: dict((ch_id, set(hh.chamber_labels(ch_id)))
                  for ch_id in new_chamber_ids[hh])
         for hh in hhs
     }
     for hh in hhs:
         old_ids, new_ids = old_chamber_ids[hh], new_chamber_ids[hh]
         old_labels, new_labels = old_chamber_labels[hh], new_chamber_labels[hh]
         len_diff = len(new_ids) - len(old_ids)
         self.assertIn(len_diff, [0, 1])
         if len_diff == 0:
             #vector 'new_vec' has landed in an existing chamber.
             #the set of chamber ids thus remains unchanged, but
             #exactly one chamber has exactly one new label,
             #namely 'new_vec_id'
             self.assertEqual(old_ids, new_ids)
             # Compare chamber by chamber (keyed by id) and require that
             # exactly one chamber differs. The previous check sized its
             # expectation by the number of hashers and collapsed the
             # result into a set, so it could not enforce "exactly one".
             changed = [ch_id for ch_id in old_ids
                        if old_labels[ch_id] != new_labels[ch_id]]
             self.assertEqual(len(changed), 1)
             label_diff = new_labels[changed[0]].difference(
                 old_labels[changed[0]])
             self.assertEqual(label_diff, set(['new_vec_id']))
         if len_diff == 1:
             #vector 'new_vec' has landed in a new chamber.
             #The id of the new chamber is that of the chamber to
             #which 'new_vec' belongs, and the new chamber
             #is exactly set(['new_vec_id']).
             new_id = hh.get_chamber_id(new_vec)
             id_diff = new_ids.difference(old_ids)
             self.assertEqual(id_diff, set([new_id]))
             self.assertEqual(new_labels[new_id], set(['new_vec_id']))
Пример #4
0
 def test_bulk_label_chamber_ensemble_5(self):
     """Labelling the same vectors twice puts paired labels in the same chambers.

     The same 20 vectors are bulk-labelled first with 'first_<i>' ids and
     then with 'second_<i>' ids. For every chamber of every hasher in
     self.hhenl.hhs, the set of suffixes <i> found on 'first_' labels must
     equal the set of suffixes found on 'second_' labels, i.e. first_i is
     in a chamber iff second_i is.
     """
     vecs = HyperplaneHasher._random_vectors(20, self.hhenl.rank)
     first_ex = re.compile(r'first_([\S]*)')
     second_ex = re.compile(r'second_([\S]*)')
     first = ['first_%i' % idx for idx in range(20)]
     second = ['second_%i' % idx for idx in range(20)]
     self.hhenl._bulk_label_chamber_ensemble(vecs, first)
     self.hhenl._bulk_label_chamber_ensemble(vecs, second)

     def suffixes(pattern, labels):
         # Captured suffix of each matching label; non-matching labels
         # yield an empty string and are dropped.
         extracted = [''.join(pattern.findall(label)) for label in labels]
         return set(suffix for suffix in extracted if len(suffix) > 0)

     for hh in self.hhenl.hhs:
         for ch_id in hh.get_chamber_ids():
             labels = hh.chamber_labels(ch_id)
             first_labels = suffixes(first_ex, labels)
             second_labels = suffixes(second_ex, labels)
             self.assertEqual(first_labels, second_labels)
 def test_get_nn_candidates_1(self):
     """_get_nn_candidates returns a set of str with at least
     num_neighbours members."""
     num_neighbours = 10
     query = HyperplaneHasher._random_vectors(1, self.hhenl.rank)[0]
     candidates = self.hhenl._get_nn_candidates(query, num_neighbours)
     self.assertIsInstance(candidates, set)
     for candidate in candidates:
         self.assertIsInstance(candidate, str)
     self.assertGreaterEqual(len(candidates), num_neighbours)
Пример #6
0
 def test_get_nn_candidates_1(self):
     """_get_nn_candidates returns a set of str with at least
     num_neighbours members."""
     num_neighbours = 10
     query = HyperplaneHasher._random_vectors(1, self.hhenl.rank)[0]
     candidates = self.hhenl._get_nn_candidates(query, num_neighbours)
     self.assertIsInstance(candidates, set)
     for candidate in candidates:
         self.assertIsInstance(candidate, str)
     self.assertGreaterEqual(len(candidates), num_neighbours)
 def setUp(self):
     """Build the fixture: a lookup object populated with NUM_VECS vectors.

     Stores the lowercase alphabet in self.letters, generates NUM_VECS
     vectors via HyperplaneHasher._random_vectors(NUM_VECS, RANK) with ids
     '0' .. str(NUM_VECS - 1), creates self.hhenl through
     self._create_hhenl() (presumably an HHEnsembleLookup over a
     DictionaryStore -- confirm against _create_hhenl), and adds every
     (vector, id) pair to it.
     """
     self.letters = list(string.ascii_lowercase)
     self.feature_vecs = HyperplaneHasher._random_vectors(NUM_VECS, RANK)
     self.feature_vecs_ids = ['%i' % idx for idx in range(NUM_VECS)]
     self.hhenl = self._create_hhenl()
     for vec, vec_id in zip(self.feature_vecs, self.feature_vecs_ids):
         self.hhenl.add_vector(vec, vec_id)
Пример #8
0
 def setUp(self):
     """Build the fixture: a lookup object populated with NUM_VECS vectors.

     Stores the lowercase alphabet in self.letters, generates NUM_VECS
     vectors via HyperplaneHasher._random_vectors(NUM_VECS, RANK) with ids
     '0' .. str(NUM_VECS - 1), creates self.hhenl through
     self._create_hhenl() (presumably an HHEnsembleLookup over a
     DictionaryStore -- confirm against _create_hhenl), and adds every
     (vector, id) pair to it.
     """
     self.letters = list(string.ascii_lowercase)
     self.feature_vecs = HyperplaneHasher._random_vectors(NUM_VECS, RANK)
     self.feature_vecs_ids = ['%i' % idx for idx in range(NUM_VECS)]
     self.hhenl = self._create_hhenl()
     for vec, vec_id in zip(self.feature_vecs, self.feature_vecs_ids):
         self.hhenl.add_vector(vec, vec_id)
Пример #9
0
 def test_label_chamber_ensemble_1(self):
     """For each underlying HyperplaneHasher object, a new label is
     added to precisely one chamber. The set of chamber ids reached by the
     known vectors is either unchanged, or enlarged by one element.

     Labels are snapshotted (copied) per chamber id before and after the
     call to _label_chamber_ensemble, so the before/after comparison does
     not depend on set iteration order, nor on whether chamber_labels()
     hands back a live object that is mutated in place.
     """
     hhs = self.hhenl.hhs
     # Work on a copy so the fixture list is not mutated by this test.
     feature_vecs = list(self.feature_vecs)
     old_chamber_ids = {
         hh: set(hh.get_chamber_id(vec) for vec in feature_vecs)
         for hh in hhs
     }
     old_chamber_labels = {
         hh: dict((ch_id, set(hh.chamber_labels(ch_id)))
                  for ch_id in old_chamber_ids[hh])
         for hh in hhs
     }
     new_vec = HyperplaneHasher._random_vectors(1, self.hhenl.rank)[0]
     self.hhenl._label_chamber_ensemble(new_vec, 'new_vec_id')
     feature_vecs.append(new_vec)
     new_chamber_ids = {
         hh: set(hh.get_chamber_id(vec) for vec in feature_vecs)
         for hh in hhs
     }
     new_chamber_labels = {
         hh: dict((ch_id, set(hh.chamber_labels(ch_id)))
                  for ch_id in new_chamber_ids[hh])
         for hh in hhs
     }
     for hh in hhs:
         old_ids, new_ids = old_chamber_ids[hh], new_chamber_ids[hh]
         old_labels, new_labels = old_chamber_labels[hh], new_chamber_labels[hh]
         len_diff = len(new_ids) - len(old_ids)
         self.assertIn(len_diff, [0, 1])
         if len_diff == 0:
             #vector 'new_vec' has landed in an existing chamber.
             #the set of chamber ids thus remains unchanged, but
             #exactly one chamber has exactly one new label,
             #namely 'new_vec_id'
             self.assertEqual(old_ids, new_ids)
             # Compare chamber by chamber (keyed by id) and require that
             # exactly one chamber differs. The previous check sized its
             # expectation by the number of hashers and collapsed the
             # result into a set, so it could not enforce "exactly one".
             changed = [ch_id for ch_id in old_ids
                        if old_labels[ch_id] != new_labels[ch_id]]
             self.assertEqual(len(changed), 1)
             label_diff = new_labels[changed[0]].difference(
                 old_labels[changed[0]])
             self.assertEqual(label_diff, set(['new_vec_id']))
         if len_diff == 1:
             #vector 'new_vec' has landed in a new chamber.
             #The id of the new chamber is that of the chamber to
             #which 'new_vec' belongs, and the new chamber
             #is exactly set(['new_vec_id']).
             new_id = hh.get_chamber_id(new_vec)
             id_diff = new_ids.difference(old_ids)
             self.assertEqual(id_diff, set([new_id]))
             self.assertEqual(new_labels[new_id], set(['new_vec_id']))
Пример #10
0
 def test_bulk_label_chamber_ensemble_3(self):
     """Bulk-labelling with entirely new ids adds exactly those ids.

     For each hh in self.hhenl.hhs, self._get_all_hh_labels(hh) is taken
     before and after the bulk label; the labels gained (after minus
     before) must equal the full set of vec_ids passed in.
     """
     vecs = HyperplaneHasher._random_vectors(10, self.hhenl.rank)
     vec_ids = self.letters[:10]
     expected_gain = set(vec_ids)
     before = [self._get_all_hh_labels(hh) for hh in self.hhenl.hhs]
     self.hhenl._bulk_label_chamber_ensemble(vecs, vec_ids)
     after = [self._get_all_hh_labels(hh) for hh in self.hhenl.hhs]
     for labels_then, labels_now in zip(before, after):
         gained = labels_now.difference(labels_then)
         self.assertEqual(gained, expected_gain)
 def test_bulk_label_chamber_ensemble_3(self):
     """Bulk-labelling with entirely new ids adds exactly those ids.

     For each hh in self.hhenl.hhs, self._get_all_hh_labels(hh) is taken
     before and after the bulk label; the labels gained (after minus
     before) must equal the full set of vec_ids passed in.
     """
     vecs = HyperplaneHasher._random_vectors(10, self.hhenl.rank)
     vec_ids = self.letters[:10]
     expected_gain = set(vec_ids)
     before = [self._get_all_hh_labels(hh) for hh in self.hhenl.hhs]
     self.hhenl._bulk_label_chamber_ensemble(vecs, vec_ids)
     after = [self._get_all_hh_labels(hh) for hh in self.hhenl.hhs]
     for labels_then, labels_now in zip(before, after):
         gained = labels_now.difference(labels_then)
         self.assertEqual(gained, expected_gain)
 def test_bulk_label_chamber_ensemble_4(self):
     """Bulk-labelling with a mix of known and new ids adds only the new ones.

     24 vectors are labelled with 11 ids taken from self.feature_vecs_ids
     followed by 13 letters. For each hh in self.hhenl.hhs, the labels
     gained according to self._get_all_hh_labels(hh) (after minus before)
     must equal the set of the 13 letter ids.
     """
     vecs = HyperplaneHasher._random_vectors(24, self.hhenl.rank)
     known_ids = self.feature_vecs_ids[:11]
     fresh_ids = self.letters[:13]
     expected_gain = set(fresh_ids)
     before = [self._get_all_hh_labels(hh) for hh in self.hhenl.hhs]
     self.hhenl._bulk_label_chamber_ensemble(vecs, known_ids + fresh_ids)
     after = [self._get_all_hh_labels(hh) for hh in self.hhenl.hhs]
     for labels_then, labels_now in zip(before, after):
         gained = labels_now.difference(labels_then)
         self.assertEqual(gained, expected_gain)
Пример #13
0
 def test_bulk_label_chamber_ensemble_4(self):
     """Bulk-labelling with a mix of known and new ids adds only the new ones.

     24 vectors are labelled with 11 ids taken from self.feature_vecs_ids
     followed by 13 letters. For each hh in self.hhenl.hhs, the labels
     gained according to self._get_all_hh_labels(hh) (after minus before)
     must equal the set of the 13 letter ids.
     """
     vecs = HyperplaneHasher._random_vectors(24, self.hhenl.rank)
     known_ids = self.feature_vecs_ids[:11]
     fresh_ids = self.letters[:13]
     expected_gain = set(fresh_ids)
     before = [self._get_all_hh_labels(hh) for hh in self.hhenl.hhs]
     self.hhenl._bulk_label_chamber_ensemble(vecs, known_ids + fresh_ids)
     after = [self._get_all_hh_labels(hh) for hh in self.hhenl.hhs]
     for labels_then, labels_now in zip(before, after):
         gained = labels_now.difference(labels_then)
         self.assertEqual(gained, expected_gain)
Пример #14
0
 def test_add_vector_1(self):
     """add_vector stores the vector and labels its chamber in every hasher.

     After self.hhenl.add_vector(vec, 'new'): get_vector('new') equals vec
     element-wise, 'new' is listed by get_vector_ids(), and for each hh in
     self.hhenl.hhs the label 'new' is among the labels of the chamber
     that vec hashes to.
     """
     vec_id = 'new'
     vec = HyperplaneHasher._random_vectors(1, self.hhenl.rank)[0]
     self.hhenl.add_vector(vec, vec_id)
     stored = self.hhenl.get_vector(vec_id)
     self.assertTrue((stored == vec).all())
     self.assertIn(vec_id, self.hhenl.get_vector_ids())
     for hh in self.hhenl.hhs:
         labels = hh.chamber_labels(hh.get_chamber_id(vec))
         self.assertIn(vec_id, labels)
 def test_add_vector_1(self):
     """add_vector stores the vector and labels its chamber in every hasher.

     After self.hhenl.add_vector(vec, 'new'): get_vector('new') equals vec
     element-wise, 'new' is listed by get_vector_ids(), and for each hh in
     self.hhenl.hhs the label 'new' is among the labels of the chamber
     that vec hashes to.
     """
     vec_id = 'new'
     vec = HyperplaneHasher._random_vectors(1, self.hhenl.rank)[0]
     self.hhenl.add_vector(vec, vec_id)
     stored = self.hhenl.get_vector(vec_id)
     self.assertTrue((stored == vec).all())
     self.assertIn(vec_id, self.hhenl.get_vector_ids())
     for hh in self.hhenl.hhs:
         labels = hh.chamber_labels(hh.get_chamber_id(vec))
         self.assertIn(vec_id, labels)
Пример #16
0
 def test_find_neighbours_1(self):
     """find_neighbours returns a pandas Series of length 'num_neighbours'
     whose entries are non-negative numbers in ascending order.

     Only the type, length, ordering and sign of the values are checked
     here; the index keys and the "known vector has distance 0.0" case
     are not exercised by this test.
     """
     vec = HyperplaneHasher._random_vectors(1, self.hhenl.rank)[0]
     nn = 10
     neighbours = self.hhenl.find_neighbours(vec, nn)
     self.assertIsInstance(neighbours, pd.Series)
     self.assertEqual(len(neighbours), nn)
     # Series.order() no longer exists in pandas; comparing the raw values
     # with their sorted copy checks ascending order on any pandas version.
     distances = neighbours.values.tolist()
     self.assertEqual(distances, sorted(distances))
     # Walk the values directly: neighbours[i] with an integer is a
     # label lookup on a Series that is not integer-indexed.
     for distance in distances:
         self.assertGreaterEqual(distance, 0.0)
 def test_find_neighbours_1(self):
     """find_neighbours returns a pandas Series of length 'num_neighbours'
     whose entries are non-negative numbers in ascending order.

     Only the type, length, ordering and sign of the values are checked
     here; the index keys and the "known vector has distance 0.0" case
     are not exercised by this test.
     """
     vec = HyperplaneHasher._random_vectors(1, self.hhenl.rank)[0]
     nn = 10
     neighbours = self.hhenl.find_neighbours(vec, nn)
     self.assertIsInstance(neighbours, pd.Series)
     self.assertEqual(len(neighbours), nn)
     # Series.order() no longer exists in pandas; comparing the raw values
     # with their sorted copy checks ascending order on any pandas version.
     distances = neighbours.values.tolist()
     self.assertEqual(distances, sorted(distances))
     # Walk the values directly: neighbours[i] with an integer is a
     # label lookup on a Series that is not integer-indexed.
     for distance in distances:
         self.assertGreaterEqual(distance, 0.0)
 def test_bulk_label_chamber_ensemble_5(self):
     """Labelling the same vectors twice puts paired labels in the same chambers.

     The same 20 vectors are bulk-labelled first with 'first_<i>' ids and
     then with 'second_<i>' ids. For every chamber of every hasher in
     self.hhenl.hhs, the set of suffixes <i> found on 'first_' labels must
     equal the set of suffixes found on 'second_' labels, i.e. first_i is
     in a chamber iff second_i is.
     """
     vecs = HyperplaneHasher._random_vectors(20, self.hhenl.rank)
     first_ex = re.compile(r'first_([\S]*)')
     second_ex = re.compile(r'second_([\S]*)')
     first = ['first_%i' % idx for idx in range(20)]
     second = ['second_%i' % idx for idx in range(20)]
     self.hhenl._bulk_label_chamber_ensemble(vecs, first)
     self.hhenl._bulk_label_chamber_ensemble(vecs, second)

     def suffixes(pattern, labels):
         # Captured suffix of each matching label; non-matching labels
         # yield an empty string and are dropped.
         extracted = [''.join(pattern.findall(label)) for label in labels]
         return set(suffix for suffix in extracted if len(suffix) > 0)

     for hh in self.hhenl.hhs:
         for ch_id in hh.get_chamber_ids():
             labels = hh.chamber_labels(ch_id)
             first_labels = suffixes(first_ex, labels)
             second_labels = suffixes(second_ex, labels)
             self.assertEqual(first_labels, second_labels)
 def test_label_chamber_ensemble_2(self):
     """_label_chamber_ensemble raises ValueError for a vector generated
     with rank self.hhenl.rank - 1 instead of self.hhenl.rank."""
     short_rank = self.hhenl.rank - 1
     too_short = HyperplaneHasher._random_vectors(1, short_rank)[0]
     with self.assertRaises(ValueError):
         self.hhenl._label_chamber_ensemble(too_short, 'new_vec_short_id')
Пример #20
0
 def _bulk_list_length_error(self, function):
     """Assert that ``function`` raises ValueError if len(vecs) != len(vec_ids).

     ``function`` is called with ten vectors and eleven ids.
     """
     ten_vecs = HyperplaneHasher._random_vectors(10, self.hhenl.rank)
     eleven_ids = self.letters[:11]
     with self.assertRaises(ValueError):
         function(ten_vecs, eleven_ids)
Пример #21
0
 def test_label_chamber_ensemble_2(self):
     """_label_chamber_ensemble raises ValueError for a vector generated
     with rank self.hhenl.rank - 1 instead of self.hhenl.rank."""
     short_rank = self.hhenl.rank - 1
     too_short = HyperplaneHasher._random_vectors(1, short_rank)[0]
     with self.assertRaises(ValueError):
         self.hhenl._label_chamber_ensemble(too_short, 'new_vec_short_id')
 def _bulk_list_length_error(self, function):
     """Assert that ``function`` raises ValueError if len(vecs) != len(vec_ids).

     ``function`` is called with ten vectors and eleven ids.
     """
     ten_vecs = HyperplaneHasher._random_vectors(10, self.hhenl.rank)
     eleven_ids = self.letters[:11]
     with self.assertRaises(ValueError):
         function(ten_vecs, eleven_ids)