def setUp(self):
    """Create an indexer (not yet built) from the plain configuration."""
    config = CrossmapSettings(config_plain, create_dir=True)
    config.tokens.k = 10
    # seed the db with the custom test features before creating the indexer
    CrossmapFeatures(config, features=test_features)
    self.indexer = CrossmapIndexer(config)
    # path of the index file for the "targets" dataset
    self.index_file = config.index_file("targets")
def test_skip_certain_docs(self):
    """docs that do not have any of the limited features should be omitted"""
    settings = CrossmapSettings(config_plain, create_dir=True)
    settings.tokens.k = 10
    # restrict the feature set so that some documents match no feature
    CrossmapFeatures(settings, features=self.limited_features)
    indexer = CrossmapIndexer(settings)
    with self.assertLogs(level="WARNING") as cm:
        indexer.build()
    # assertIn reports the captured log output when the check fails
    self.assertIn("Skipping item", str(cm.output))
def setUpClass(cls):
    """Build a shared indexer from the fixed-featuremap configuration."""
    config = CrossmapSettings(config_featuremap, create_dir=True)
    config.tokens.k = 20
    indexer = cls.indexer = CrossmapIndexer(config)
    indexer.build()
    # keep the encoder's feature map at hand for the tests
    cls.feature_map = indexer.encoder.feature_map
def setUpClass(cls):
    """Build a shared indexer from a configuration with target documents only."""
    config = CrossmapSettings(config_single, create_dir=True)
    config.tokens.k = 10
    # register the custom test features before creating the indexer
    CrossmapFeatures(config, features=test_features)
    indexer = cls.indexer = CrossmapIndexer(config)
    indexer.build()
def setUpClass(cls):
    """Build a shared indexer and diffuser from the plain configuration."""
    config = CrossmapSettings(config_plain, create_dir=True)
    indexer = cls.indexer = CrossmapIndexer(config)
    indexer.build()
    diffuser = cls.diffuser = CrossmapDiffuser(config)
    diffuser.build()
    # shortcuts to components used by the tests
    cls.feature_map = diffuser.feature_map
    cls.db = diffuser.db
    cls.encoder = indexer.encoder
def test_indexer_build_rebuild(self):
    """run a build when indexes already exist"""
    self.assertFalse(exists(self.index_file))
    self.indexer.build()
    # the second indexer is created from scratch, with the same settings
    # build should detect presence of indexes and load instead
    with self.assertLogs(level="WARNING") as cm:
        newindexer = CrossmapIndexer(self.indexer.settings)
        newindexer.build()
    # assertIn reports the captured log output when the check fails
    self.assertIn("Skip", str(cm.output))
    # after build, the indexer should be ready to use
    ids_targets = newindexer.db.all_ids("targets")
    ids_docs = newindexer.db.all_ids("documents")
    self.assertEqual(len(ids_targets), 6, "dataset still has six items")
    # message refers to the documents dataset, which is what is checked here
    self.assertGreater(len(ids_docs), 6, "documents have many items")
    self.assertTrue(exists(newindexer.index_files["targets"]))
    self.assertTrue(exists(newindexer.index_files["documents"]))
def setUpClass(cls):
    """Build indexer, diffuser and tokenizers from the long-word configuration.

    Also caches the data vectors of targets L0-L4 in cls.data, keyed by
    item id and converted with sparse_to_dense.
    """
    settings = CrossmapSettings(config_longword, create_dir=True)
    cls.indexer = CrossmapIndexer(settings)
    cls.indexer.build()
    cls.diffuser = CrossmapDiffuser(settings)
    cls.diffuser.build()
    cls.feature_map = cls.diffuser.feature_map
    cls.db = cls.diffuser.db
    cls.encoder = cls.indexer.encoder
    cls.plain_tokenizer = CrossmapTokenizer(settings)
    cls.diff_tokenizer = CrossmapDiffusionTokenizer(settings)
    # extract data vectors
    target_ids = ["L0", "L1", "L2", "L3", "L4"]
    rows = cls.db.get_data(dataset="targets", ids=target_ids)
    cls.data = {row["id"]: sparse_to_dense(row["data"]) for row in rows}
def setUpClass(cls):
    """Build a shared indexer and diffuser from the plain configuration."""
    config = CrossmapSettings(config_plain, create_dir=True)
    indexer = cls.indexer = CrossmapIndexer(config)
    indexer.build()
    # the diffuser is created and built after the indexer, from the same settings
    diffuser = cls.diffuser = CrossmapDiffuser(config)
    diffuser.build()
class CrossmapIndexerBuildTests(unittest.TestCase):
    """Creating nearest neighbor indexes from documents and text tokens"""

    def setUp(self):
        """Create an indexer (not yet built) from the plain configuration."""
        settings = CrossmapSettings(config_plain, create_dir=True)
        settings.tokens.k = 10
        # initialize the db with custom features
        CrossmapFeatures(settings, features=test_features)
        self.indexer = CrossmapIndexer(settings)
        self.index_file = settings.index_file("targets")

    def tearDown(self):
        """Remove the cache so every test starts without existing indexes."""
        remove_crossmap_cache(data_dir, "crossmap_simple")

    def test_indexer_build(self):
        """build indexes from a simple configuration"""
        self.assertFalse(exists(self.index_file))
        self.indexer.build()
        ids_targets = self.indexer.db.all_ids("targets")
        ids_docs = self.indexer.db.all_ids("documents")
        self.assertEqual(len(ids_targets), 6, "dataset has six items")
        self.assertGreater(len(ids_docs), 6, "documents have several items")
        self.assertEqual(len(self.indexer.index_files), 2,
                         "one index for targets, one for documents")
        self.assertTrue(exists(self.indexer.index_files["targets"]))
        self.assertTrue(exists(self.indexer.index_files["documents"]))

    def test_indexer_load(self):
        """load prepared indexes from disk"""
        indexer = self.indexer
        self.assertFalse(exists(self.index_file))
        indexer.build()
        # drop the in-memory indexes so that load() must restore them
        indexer.indexes = []
        indexer.load()
        self.assertEqual(len(indexer.indexes), 2)
        # both index and data db should record items
        self.assertEqual(len(indexer.db.all_ids("targets")), 6)
        self.assertGreater(len(indexer.db.all_ids("documents")), 6)

    def test_indexer_build_rebuild(self):
        """run a build when indexes already exist"""
        self.assertFalse(exists(self.index_file))
        self.indexer.build()
        # the second indexer is created from scratch, with the same settings
        # build should detect presence of indexes and load instead
        with self.assertLogs(level="WARNING") as cm:
            newindexer = CrossmapIndexer(self.indexer.settings)
            newindexer.build()
        # assertIn reports the captured log output when the check fails
        self.assertIn("Skip", str(cm.output))
        # after build, the indexer should be ready to use
        ids_targets = newindexer.db.all_ids("targets")
        ids_docs = newindexer.db.all_ids("documents")
        self.assertEqual(len(ids_targets), 6, "dataset still has six items")
        # message refers to the documents dataset, which is what is checked here
        self.assertGreater(len(ids_docs), 6, "documents have many items")
        self.assertTrue(exists(newindexer.index_files["targets"]))
        self.assertTrue(exists(newindexer.index_files["documents"]))

    def test_indexer_str(self):
        """str summarizes main properties"""
        self.assertIn("Indexes:\t0", str(self.indexer))
        self.indexer.build()
        self.assertIn("Indexes:\t2", str(self.indexer))
def setUpClass(cls):
    """Build a shared indexer from the plain configuration with custom features."""
    config = CrossmapSettings(config_plain, create_dir=True)
    config.tokens.k = 10
    # register the custom test features before creating the indexer
    CrossmapFeatures(config, features=test_features)
    indexer = cls.indexer = CrossmapIndexer(config)
    indexer.build()