def setUp(self):
    """Create an indexer (not yet built) on top of custom features."""
    cfg = CrossmapSettings(config_plain, create_dir=True)
    cfg.tokens.k = 10
    # initialize the db with the custom features before creating the indexer
    CrossmapFeatures(cfg, features=test_features)
    self.indexer = CrossmapIndexer(cfg)
    self.index_file = cfg.index_file("targets")
def test_saves_disk_file(self):
    """scan for features and record in cache"""
    settings = CrossmapSettings(config_file, create_dir=True)
    cache_file = settings.tsv_file("feature-map")
    # the cache file must not exist before the extraction runs
    self.assertFalse(exists(cache_file))
    # run the feature extraction, should create a cache file
    with self.assertLogs(level="INFO") as cm1:
        CrossmapFeatures(settings)
    self.assertTrue(exists(cache_file))
    # assertIn reports the captured log output when the check fails
    self.assertIn("Saving", str(cm1.output))
def test_warnings_nonexistent_files(self):
    """Configuring from config_nonexistent_file warns but stays valid."""
    with self.assertLogs(level='WARNING'):
        settings = CrossmapSettings(config_nonexistent_file)
    # one target file is valid, so the overall config is valid
    self.assertTrue(settings.valid)
def test_init_dir(self):
    """Settings can be created from a directory path alone."""
    settings = CrossmapSettings(data_dir)
    # the directory is recorded as given
    self.assertEqual(settings.dir, data_dir)
    # the configuration file name is expected to be crossmap.yaml
    self.assertEqual(settings.file, "crossmap.yaml")
    self.assertTrue(settings.valid)
def setUpClass(cls):
    """Build a shared indexer from the fixed-featuremap configuration."""
    cfg = CrossmapSettings(config_featuremap, create_dir=True)
    cfg.tokens.k = 20
    indexer = cls.indexer = CrossmapIndexer(cfg)
    indexer.build()
    # keep a shortcut to the feature map held by the indexer's encoder
    cls.feature_map = indexer.encoder.feature_map
def setUpClass(cls):
    """Build a shared indexer from the single-dataset configuration."""
    cfg = CrossmapSettings(config_single, create_dir=True)
    cfg.tokens.k = 10
    # register the custom test features before the indexer is created
    CrossmapFeatures(cfg, features=test_features)
    indexer = cls.indexer = CrossmapIndexer(cfg)
    indexer.build()
def setUpClass(cls):
    """Build an indexer and a diffuser and expose their components."""
    cfg = CrossmapSettings(config_plain, create_dir=True)
    # the indexer is built first, the diffuser second
    indexer = cls.indexer = CrossmapIndexer(cfg)
    indexer.build()
    diffuser = cls.diffuser = CrossmapDiffuser(cfg)
    diffuser.build()
    # shortcuts to components used by the tests
    cls.feature_map, cls.db = diffuser.feature_map, diffuser.db
    cls.encoder = indexer.encoder
def test_skip_certain_docs(self):
    """docs that do not have any of the limited features should be omitted"""
    settings = CrossmapSettings(config_plain, create_dir=True)
    settings.tokens.k = 10
    # restrict the feature set so that some documents have no features
    CrossmapFeatures(settings, features=self.limited_features)
    indexer = CrossmapIndexer(settings)
    with self.assertLogs(level="WARNING") as cm:
        indexer.build()
    # assertIn reports the captured log output when the check fails
    self.assertIn("Skipping item", str(cm.output))
def test_connect_with_partial_config(self):
    """A Crossmap can be loaded from settings that list no data files."""
    # very minimal settings (no data fields), so data files
    # must not be required
    partial = CrossmapSettings(config_simple_nodata,
                               require_data_files=False)
    instance = Crossmap(partial)
    instance.load()
    self.assertEqual(instance.valid, True)
    self.assertEqual(instance.default_label, "targets")
def test_nonzero_weight(self):
    """all features must have nonzero weight"""
    settings = CrossmapSettings(config_constant_file, create_dir=True)
    with self.assertLogs(level="INFO") as cm:
        # override the weighting scheme before computing the map
        settings.features.weighting = [0, 1]
        weights_map = feature_map(settings)
    # assertIn reports the captured log output when the check fails
    self.assertIn("Extracting", str(cm.output))
    # all features should have >0 weights
    # (the weight is the second element of each value)
    for k, v in weights_map.items():
        kmsg = "feature "+str(k)+" should have weight >0"
        self.assertGreater(v[1], 0, kmsg)
def test_init_from_settings(self):
    """Creating a Crossmap from settings creates the data subdirectory."""
    # settings from a directory; this triggers a search for crossmap.yaml
    settings = CrossmapSettings(data_dir)
    subdir = settings.prefix
    expected_subdir = join(data_dir, "crossmap_default")
    self.assertEqual(subdir, expected_subdir)
    # the data directory does not exist before init ...
    self.assertFalse(exists(subdir))
    # ... and exists after initializing with a settings object
    crossmap = Crossmap(settings)
    self.assertTrue(exists(subdir))
    # the crossmap is not valid because it has not been built yet
    self.assertFalse(crossmap.valid)
def setUpClass(cls):
    """Build indexer, diffuser and tokenizers; cache dense target vectors."""
    cfg = CrossmapSettings(config_longword, create_dir=True)
    indexer = cls.indexer = CrossmapIndexer(cfg)
    indexer.build()
    diffuser = cls.diffuser = CrossmapDiffuser(cfg)
    diffuser.build()
    cls.feature_map = diffuser.feature_map
    cls.db = diffuser.db
    cls.encoder = indexer.encoder
    cls.plain_tokenizer = CrossmapTokenizer(cfg)
    cls.diff_tokenizer = CrossmapDiffusionTokenizer(cfg)
    # extract data vectors, keyed by item id and converted to dense form
    target_ids = ["L0", "L1", "L2", "L3", "L4"]
    rows = cls.db.get_data(dataset="targets", ids=target_ids)
    cls.data = {row["id"]: sparse_to_dense(row["data"]) for row in rows}
class CrossmapFeatureMapIOTests(unittest.TestCase):
    """reading and writing feature-map tables"""

    def setUp(self):
        """Create fresh settings for each test."""
        self.settings = CrossmapSettings(config_file, create_dir=True)

    def tearDown(self):
        """Remove the cache created under the data directory."""
        remove_crossmap_cache(data_dir, "crossmap_simple")

    def test_write_read(self):
        """read and write a custom-made object"""
        # named "expected" rather than "map" to avoid shadowing the builtin
        expected = dict(a=(0, 1), b=(1, 0.5))
        map_file = self.settings.tsv_file("feature-map")
        write_feature_map(expected, self.settings)
        loaded = read_feature_map(map_file)
        # the round trip must preserve both the size and the entries
        self.assertEqual(len(loaded), len(expected))
        self.assertEqual(loaded["a"], expected["a"])
        self.assertEqual(loaded["b"], expected["b"])
def test_init_no_data(self):
    """A configuration file without any data logs an error and is invalid."""
    with self.assertLogs(level='ERROR'):
        settings = CrossmapSettings(config_no_data_file)
    self.assertFalse(settings.valid)
def test_init_missing_file(self):
    """A non-existent configuration file logs an error and is invalid."""
    missing_path = join(data_dir, "missing.yaml")
    with self.assertLogs(level='ERROR'):
        settings = CrossmapSettings(missing_path)
    self.assertFalse(settings.valid)
def test_validate_weights(self):
    """A misspecified feature weighting scheme triggers a warning."""
    # both construction and explicit validation run under the log check
    with self.assertLogs(level="WARNING"):
        weighting_settings = CrossmapSettings(config_weighting_file)
        weighting_settings._validate()
def setUp(self):
    """Create settings and record the path of the feature-map file."""
    cfg = CrossmapSettings(config_file, create_dir=True)
    self.settings = cfg
    self.cache_file = cfg.tsv_file("feature-map")
def setUp(self):
    """Build a crossmap instance from the single-dataset configuration."""
    cfg = CrossmapSettings(config_single, create_dir=True)
    instance = self.crossmap = Crossmap(cfg)
    instance.build()
    # sanity check that the built instance is truthy
    self.assertTrue(self.crossmap)
def setUp(self):
    """Create fresh settings (with create_dir=True) for each test."""
    cfg = CrossmapSettings(config_file, create_dir=True)
    self.settings = cfg
def setUpClass(cls):
    """Build a crossmap instance, then open it through CrossmapInfo."""
    cfg = CrossmapSettings(config_simple)
    # build first; the info object is created afterwards from the same settings
    builder = Crossmap(cfg)
    builder.build()
    cls.crossmap = CrossmapInfo(cfg)
def setUpClass(cls):
    """Build a shared indexer on top of the custom test features."""
    cfg = CrossmapSettings(config_plain, create_dir=True)
    cfg.tokens.k = 10
    # register the custom test features before the indexer is created
    CrossmapFeatures(cfg, features=test_features)
    indexer = cls.indexer = CrossmapIndexer(cfg)
    indexer.build()
"""
from functools import wraps
from json import dumps, loads
from crossmap.crossmap import Crossmap
from crossmap.settings import CrossmapSettings
from crossmap.vectors import sparse_to_dense
from os import environ
from urllib.parse import unquote
from django.http import HttpResponse
from logging import info
import yaml

# load the crossmap object based on configuration saved in an OS variable
# NOTE(review): environ.get returns None when the variable is unset;
# confirm that CrossmapSettings handles a None path.
config_path = environ.get('DJANGO_CROSSMAP_CONFIG_PATH')
# data files are not required here (require_data_files=False)
settings = CrossmapSettings(config_path, require_data_files=False)
crossmap = Crossmap(settings)
crossmap.load()
# log the collection names reported by the underlying database handle
# NOTE(review): this reaches into the private attribute crossmap.db._db
info("database collections: " + str(crossmap.db._db.list_collection_names()))


def get_vector(dataset, item_id):
    """Return the "data" field of a single item from the indexer's db.

    Queries the db for `item_id` within `dataset` and returns the "data"
    entry of the first result. An empty result raises IndexError because
    the first element is indexed unconditionally.
    """
    db = crossmap.indexer.db
    result = db.get_data(dataset, ids=[item_id])
    return result[0]["data"]


def decr_by_query(a):
    """Return the negated "query" value of `a`.

    Presumably a sort key that orders items by decreasing "query" value --
    verify against the callers.
    """
    return -a["query"]
# decode the optional diffusion settings (parsed with loads)
if config.diffusion is not None:
    try:
        config.diffusion = loads(config.diffusion)
    except Exception as e:
        # chain the original error so the parser traceback is preserved
        raise Exception("Error parsing diffusion json: " + str(e)) from e

action = config.action
# apply the requested logging level to the root logger, if one was given
if config.logging is not None:
    logging.getLogger().setLevel(config.logging)
# output as json or tsv
output = tsv_print if config.tsv else json_print

# for build, settings check all data files are available
# for other actions, the settings can be lenient
settings = CrossmapSettings(config.config,
                            require_data_files=(action == "build"))
# stop when the settings are not valid
if not settings.valid:
    sys.exit()

crossmap = None
# search and decompose raise the root logger threshold to ERROR
if action in {"search", "decompose"}:
    logging.getLogger().setLevel(level=logging.ERROR)
# these actions use a full Crossmap object
if action in {"build", "search", "decompose", "add", "remove"}:
    crossmap = Crossmap(settings)
# these actions use a CrossmapInfo object instead
if action in {"features", "diffuse", "distances",
              "matrix", "counts", "summary"}:
    crossmap = CrossmapInfo(settings)

# ############################################################################
# actions associated with primary functionality and batch processing
def setUpClass(cls):
    """Create one settings object shared by all tests in the class."""
    shared = CrossmapSettings(config_file)
    cls.settings = shared
def setUp(self):
    """Create fresh settings from config_complete_file for each test."""
    cfg = CrossmapSettings(config_complete_file)
    self.settings = cfg
def setUpClass(cls):
    """Build a shared indexer and diffuser from the plain configuration."""
    cfg = CrossmapSettings(config_plain, create_dir=True)
    # the indexer is built first, the diffuser second
    indexer = cls.indexer = CrossmapIndexer(cfg)
    indexer.build()
    diffuser = cls.diffuser = CrossmapDiffuser(cfg)
    diffuser.build()
def test_init_single(self):
    """Settings from config_single_file (a single dataset) are valid."""
    settings = CrossmapSettings(config_single_file)
    self.assertTrue(settings.valid)
def setUpClass(cls):
    """Build a crossmap instance and open an info object on the same settings."""
    cfg = cls.settings = CrossmapSettings(config_plain)
    instance = cls.crossmap = Crossmap(cfg)
    instance.build()
    # the info object is created only after the build has run
    cls.crossinfo = CrossmapInfo(cfg)
class CrossmapFeatureMapTests(unittest.TestCase):
    """Turning text data into tokens"""

    def setUp(self):
        """Create fresh settings and record the feature-map file path."""
        self.settings = CrossmapSettings(config_file, create_dir=True)
        self.cache_file = self.settings.tsv_file("feature-map")

    def tearDown(self):
        """Remove the cache created under the data directory."""
        remove_crossmap_cache(data_dir, "crossmap_simple")

    def test_feature_map_unique_ids(self):
        """features in targets and documents must have unique ids"""
        fmap = feature_map(self.settings)
        # map should be a dict from strings into (index, weight)
        all_indexes = {v[0] for v in fmap.values()}
        self.assertEqual(len(all_indexes), len(fmap))

    def test_skipping_partial_documents(self):
        """scan for features in targets, but not documents"""
        # with an intermediate number of required features,
        # some features will come from targets, some from documents.
        self.settings.features.max_number = 50
        with self.assertLogs(level="INFO"):
            fmap = feature_map(self.settings)
        self.assertIn("bob", fmap)
        # map will have exactly the max features
        self.assertEqual(len(fmap), 50)
        self.assertIn("g", fmap)
        self.assertNotIn("M", fmap)

    def test_weights_ic(self):
        """weights using constant and ic scaling"""
        # compute maps using different weightings
        with self.assertLogs(level="INFO"):
            self.settings.features.weighting = [1, 0]
            map_const = feature_map(self.settings)
            self.settings.features.weighting = [0, 1]
            map_ic = feature_map(self.settings)
            self.settings.features.weighting = [1, 1]
            map_mid = feature_map(self.settings)
        # maps should all contain same features, i.e. equal length
        self.assertEqual(len(map_const), len(map_ic))
        self.assertEqual(len(map_const), len(map_mid))
        # constant map has all the weights equal to 1
        for v in map_const.values():
            self.assertEqual(v[1], 1)
        # information content map has certain features with less weight
        with_ic = map_ic["with"][1]
        with_const = map_const["with"][1]
        self.assertLess(with_ic, map_ic["alice"][1])
        self.assertLess(with_ic, map_ic["uniqu"][1])
        self.assertEqual(map_ic["uniqu"][1], map_ic["token"][1])
        # Because mixed map has high coefficient for both constant and ic,
        # all values should be higher than in either constant or ic maps
        with_mid = map_mid["with"][1]
        self.assertGreater(with_mid, with_ic)
        self.assertGreater(with_mid, with_const)

    def test_feature_map_min_count(self):
        """only use features present in a minimal num of docs"""
        with self.assertLogs(level="INFO"):
            self.settings.features.max_number = 1000
            self.settings.features.min_count = 1
            map1 = feature_map(self.settings)
            self.settings.features.min_count = 2
            map2 = feature_map(self.settings)
        # "abcde" is kept with min_count=1 but dropped with min_count=2
        self.assertIn("abcde", map1)
        self.assertNotIn("abcde", map2)