Exemplo n.º 1
0
 def setUp(self):
     """Prepare an indexer and the path of its "targets" index file.

     Settings are read from config_plain with tokens.k overridden to 10,
     and custom features are registered before the indexer is created.
     """
     plain_settings = CrossmapSettings(config_plain, create_dir=True)
     plain_settings.tokens.k = 10
     # seed the db with the custom test features before indexing
     CrossmapFeatures(plain_settings, features=test_features)
     self.indexer = CrossmapIndexer(plain_settings)
     self.index_file = plain_settings.index_file("targets")
Exemplo n.º 2
0
    def test_saves_disk_file(self):
        """Feature extraction writes the feature-map cache file to disk"""

        simple_settings = CrossmapSettings(config_file, create_dir=True)
        feature_map_path = simple_settings.tsv_file("feature-map")
        # the cache must not exist before the extraction runs
        self.assertFalse(exists(feature_map_path))
        # extraction should create the cache and log at INFO level
        with self.assertLogs(level="INFO") as captured:
            CrossmapFeatures(simple_settings)
        self.assertTrue(exists(feature_map_path))
        log_text = str(captured.output)
        self.assertTrue("Saving" in log_text)
Exemplo n.º 3
0
    def test_warnings_nonexistent_files(self):
        """Configuring with config_nonexistent_file logs at WARNING or above"""

        with self.assertLogs(level="WARNING"):
            settings = CrossmapSettings(config_nonexistent_file)
        # one target file is still valid, so the config as a whole is valid
        is_valid = settings.valid
        self.assertTrue(is_valid)
Exemplo n.º 4
0
    def test_init_dir(self):
        """Settings can be created from a directory path alone"""

        settings = CrossmapSettings(data_dir)
        # the directory is kept as given; the file name is crossmap.yaml
        self.assertEqual(settings.dir, data_dir)
        self.assertEqual(settings.file, "crossmap.yaml")
        is_valid = settings.valid
        self.assertTrue(is_valid)
Exemplo n.º 5
0
    def setUpClass(cls):
        """Build a class-level indexer from the featuremap configuration.

        tokens.k is overridden to 20 before the build; the feature map of
        the indexer's encoder is kept on the class as cls.feature_map.
        """

        featuremap_settings = CrossmapSettings(config_featuremap,
                                               create_dir=True)
        featuremap_settings.tokens.k = 20
        cls.indexer = CrossmapIndexer(featuremap_settings)
        cls.indexer.build()
        encoder = cls.indexer.encoder
        cls.feature_map = encoder.feature_map
Exemplo n.º 6
0
    def setUpClass(cls):
        """Build a class-level indexer from the single-dataset configuration.

        tokens.k is overridden to 10 and the features are taken from
        test_features before the indexer is created and built.
        """

        single_settings = CrossmapSettings(config_single, create_dir=True)
        single_settings.tokens.k = 10
        # register the custom features before the indexer is constructed
        CrossmapFeatures(single_settings, features=test_features)
        cls.indexer = CrossmapIndexer(single_settings)
        cls.indexer.build()
Exemplo n.º 7
0
 def setUpClass(cls):
     """Build an indexer and a diffuser from the plain configuration.

     Also stores shortcuts on the class: the diffuser's feature map and
     db, and the indexer's encoder.
     """
     plain_settings = CrossmapSettings(config_plain, create_dir=True)
     cls.indexer = CrossmapIndexer(plain_settings)
     cls.indexer.build()
     cls.diffuser = CrossmapDiffuser(plain_settings)
     cls.diffuser.build()
     # convenience handles used by the tests
     diffuser = cls.diffuser
     cls.feature_map = diffuser.feature_map
     cls.db = diffuser.db
     cls.encoder = cls.indexer.encoder
Exemplo n.º 8
0
 def test_skip_certain_docs(self):
     """Docs without any of the limited features are skipped with a warning"""
     plain_settings = CrossmapSettings(config_plain, create_dir=True)
     plain_settings.tokens.k = 10
     # restrict the feature set to self.limited_features
     CrossmapFeatures(plain_settings, features=self.limited_features)
     limited_indexer = CrossmapIndexer(plain_settings)
     with self.assertLogs(level="WARNING") as captured:
         limited_indexer.build()
     log_text = str(captured.output)
     self.assertTrue("Skipping item" in log_text)
Exemplo n.º 9
0
    def test_connect_with_partial_config(self):
        """A minimal configuration is enough to load an existing instance"""

        # the configuration has no data fields, so data files
        # are explicitly not required
        minimal_settings = CrossmapSettings(config_simple_nodata,
                                            require_data_files=False)
        loaded = Crossmap(minimal_settings)
        loaded.load()
        # after loading, the instance is valid and defaults to "targets"
        self.assertEqual(loaded.valid, True)
        self.assertEqual(loaded.default_label, "targets")
Exemplo n.º 10
0
    def test_nonzero_weight(self):
        """all features must have nonzero weight"""

        # build the map from config_constant_file with weighting [0, 1]
        settings = CrossmapSettings(config_constant_file, create_dir=True)
        with self.assertLogs(level="INFO") as captured:
            settings.features.weighting = [0, 1]
            weighted_map = feature_map(settings)
        log_text = str(captured.output)
        self.assertTrue("Extracting" in log_text)
        # position 1 of every map entry must be strictly positive
        for feature, entry in weighted_map.items():
            failure_msg = "feature "+str(feature)+" should have weight >0"
            weight = entry[1]
            self.assertGreater(weight, 0, failure_msg)
Exemplo n.º 11
0
    def test_init_from_settings(self):
        """Creating a Crossmap from settings creates its data directory"""

        # settings built from a directory, which
        # triggers a search for crossmap.yaml
        dir_settings = CrossmapSettings(data_dir)
        expected_prefix = join(data_dir, "crossmap_default")
        prefix_dir = dir_settings.prefix
        self.assertEqual(prefix_dir, expected_prefix)
        # the directory is absent before the Crossmap is created...
        self.assertFalse(exists(prefix_dir))
        # ...and present right after construction from the settings object
        instance = Crossmap(dir_settings)
        self.assertTrue(exists(prefix_dir))
        # nothing has been built yet, so the instance is not valid
        self.assertFalse(instance.valid)
Exemplo n.º 12
0
 def setUpClass(cls):
     """Build indexer, diffuser and tokenizers from the longword config.

     Also caches dense data vectors for target items L0 to L4 in cls.data,
     keyed by item id.
     """
     longword_settings = CrossmapSettings(config_longword, create_dir=True)
     cls.indexer = CrossmapIndexer(longword_settings)
     cls.indexer.build()
     cls.diffuser = CrossmapDiffuser(longword_settings)
     cls.diffuser.build()
     cls.feature_map = cls.diffuser.feature_map
     cls.db = cls.diffuser.db
     cls.encoder = cls.indexer.encoder
     cls.plain_tokenizer = CrossmapTokenizer(longword_settings)
     cls.diff_tokenizer = CrossmapDiffusionTokenizer(longword_settings)
     # fetch the stored vectors and convert each one to dense form
     cls.data = {}
     target_ids = ["L0", "L1", "L2", "L3", "L4"]
     rows = cls.db.get_data(dataset="targets", ids=target_ids)
     for row in rows:
         cls.data[row["id"]] = sparse_to_dense(row["data"])
Exemplo n.º 13
0
class CrossmapFeatureMapIOTests(unittest.TestCase):
    """reading and writing feature-map tables"""

    def setUp(self):
        """Create settings from config_file (with create_dir=True)"""
        self.settings = CrossmapSettings(config_file, create_dir=True)

    def tearDown(self):
        """Remove the "crossmap_simple" cache under data_dir"""
        remove_crossmap_cache(data_dir, "crossmap_simple")

    def test_write_read(self):
        """A feature map written to disk reads back with the same content"""
        # named `written` rather than `map` to avoid shadowing the builtin
        written = dict(a=(0, 1), b=(1, 0.5))
        map_file = self.settings.tsv_file("feature-map")
        write_feature_map(written, self.settings)
        loaded = read_feature_map(map_file)
        self.assertEqual(len(loaded), len(written))
        self.assertEqual(loaded["a"], written["a"])
        self.assertEqual(loaded["b"], written["b"])
Exemplo n.º 14
0
    def test_init_no_data(self):
        """A configuration file without any data logs an error and is invalid"""

        with self.assertLogs(level="ERROR"):
            settings = CrossmapSettings(config_no_data_file)
            is_valid = settings.valid
            self.assertFalse(is_valid)
Exemplo n.º 15
0
    def test_init_missing_file(self):
        """A non-existent configuration file logs an error and is invalid"""

        with self.assertLogs(level="ERROR"):
            missing_path = join(data_dir, "missing.yaml")
            settings = CrossmapSettings(missing_path)
            self.assertFalse(settings.valid)
Exemplo n.º 16
0
    def test_validate_weights(self):
        """A misspecified feature weighting scheme triggers a warning"""

        # loading and validating together must log at WARNING or above
        with self.assertLogs(level="WARNING"):
            weighting_settings = CrossmapSettings(config_weighting_file)
            weighting_settings._validate()
Exemplo n.º 17
0
 def setUp(self):
     """Create settings from config_file and note the feature-map path"""
     fresh_settings = CrossmapSettings(config_file, create_dir=True)
     self.settings = fresh_settings
     self.cache_file = fresh_settings.tsv_file("feature-map")
Exemplo n.º 18
0
 def setUp(self):
     """Create and build a Crossmap from the single-dataset configuration"""
     single_settings = CrossmapSettings(config_single, create_dir=True)
     self.crossmap = Crossmap(single_settings)
     self.crossmap.build()
     # sanity check: the built object must be truthy
     built = self.crossmap
     self.assertTrue(built)
Exemplo n.º 19
0
 def setUp(self):
     """Create fresh settings from config_file before each test"""
     fresh_settings = CrossmapSettings(config_file, create_dir=True)
     self.settings = fresh_settings
Exemplo n.º 20
0
 def setUpClass(cls):
     """Build a Crossmap from config_simple, then expose a CrossmapInfo"""
     simple_settings = CrossmapSettings(config_simple)
     # build the instance first; the info object uses the same settings
     builder = Crossmap(simple_settings)
     builder.build()
     cls.crossmap = CrossmapInfo(simple_settings)
Exemplo n.º 21
0
 def setUpClass(cls):
     """Build a class-level indexer from the plain configuration.

     tokens.k is overridden to 10 and the features are taken from
     test_features before the indexer is created and built.
     """
     plain_settings = CrossmapSettings(config_plain, create_dir=True)
     plain_settings.tokens.k = 10
     CrossmapFeatures(plain_settings, features=test_features)
     cls.indexer = CrossmapIndexer(plain_settings)
     cls.indexer.build()
Exemplo n.º 22
0
"""

from functools import wraps
from json import dumps, loads
from crossmap.crossmap import Crossmap
from crossmap.settings import CrossmapSettings
from crossmap.vectors import sparse_to_dense
from os import environ
from urllib.parse import unquote
from django.http import HttpResponse
from logging import info
import yaml

# Load the crossmap object based on the configuration path stored in the
# DJANGO_CROSSMAP_CONFIG_PATH environment variable. environ.get() returns
# None when the variable is unset; no fallback is applied here.
config_path = environ.get('DJANGO_CROSSMAP_CONFIG_PATH')
# require_data_files=False is passed explicitly
# (presumably so raw data files need not be present - TODO confirm)
settings = CrossmapSettings(config_path, require_data_files=False)
crossmap = Crossmap(settings)
crossmap.load()
# Report the database collection names through logging.info.
# NOTE(review): this reaches into the private attribute crossmap.db._db
info("database collections: " + str(crossmap.db._db.list_collection_names()))


def get_vector(dataset, item_id):
    """Return the "data" field of the first record fetched for item_id.

    The lookup goes through the db of the module-level crossmap indexer
    and asks for exactly one id within the given dataset.
    """
    records = crossmap.indexer.db.get_data(dataset, ids=[item_id])
    first_record = records[0]
    return first_record["data"]


def decr_by_query(a):
    """Return the negated "query" value of a, e.g. for use as a sort key."""
    query_value = a["query"]
    return -query_value

Exemplo n.º 23
0
# the diffusion argument, when provided, is decoded in place with loads()
if config.diffusion is not None:
    try:
        config.diffusion = loads(config.diffusion)
    except Exception as e:
        # chain explicitly so the original parsing error stays attached
        # as the cause of the re-raised exception
        raise Exception("Error parsing diffusion json: " + str(e)) from e
action = config.action

# an explicit logging level overrides the root logger's level
if config.logging is not None:
    logging.getLogger().setLevel(config.logging)

# output as json or tsv
output = tsv_print if config.tsv else json_print

# for build, settings check all data files are available
# for other actions, the settings can be lenient
settings = CrossmapSettings(config.config,
                            require_data_files=(action == "build"))
if not settings.valid:
    # NOTE(review): sys.exit() without an argument exits with status 0,
    # so invalid settings are not signalled to the calling shell - confirm
    # whether a non-zero status is wanted here
    sys.exit()

# pick the object that will serve the requested action; for actions
# outside both sets below, crossmap stays None
crossmap = None
if action in {"search", "decompose"}:
    # these actions force the root logger to ERROR level
    logging.getLogger().setLevel(level=logging.ERROR)
if action in {"build", "search", "decompose", "add", "remove"}:
    crossmap = Crossmap(settings)
if action in {
        "features", "diffuse", "distances", "matrix", "counts", "summary"
}:
    crossmap = CrossmapInfo(settings)

# ############################################################################
# actions associated with primary functionality and batch processing
Exemplo n.º 24
0
 def setUpClass(cls):
     """Create class-level settings from config_file"""
     shared_settings = CrossmapSettings(config_file)
     cls.settings = shared_settings
Exemplo n.º 25
0
 def setUp(self):
     """Create fresh settings from config_complete_file before each test"""
     complete_settings = CrossmapSettings(config_complete_file)
     self.settings = complete_settings
Exemplo n.º 26
0
 def setUpClass(cls):
     """Build an indexer, then a diffuser, from the plain configuration"""
     plain_settings = CrossmapSettings(config_plain, create_dir=True)
     # the indexer is built before the diffuser is created
     cls.indexer = CrossmapIndexer(plain_settings)
     cls.indexer.build()
     cls.diffuser = CrossmapDiffuser(plain_settings)
     cls.diffuser.build()
Exemplo n.º 27
0
    def test_init_single(self):
        """A configuration file that names a single dataset is valid"""

        single_settings = CrossmapSettings(config_single_file)
        is_valid = single_settings.valid
        self.assertTrue(is_valid)
Exemplo n.º 28
0
 def setUpClass(cls):
     """Build a Crossmap from config_plain and create a matching CrossmapInfo"""
     plain_settings = CrossmapSettings(config_plain)
     cls.settings = plain_settings
     cls.crossmap = Crossmap(plain_settings)
     cls.crossmap.build()
     # the info object is created from the same settings, after the build
     cls.crossinfo = CrossmapInfo(plain_settings)
Exemplo n.º 29
0
class CrossmapFeatureMapTests(unittest.TestCase):
    """Tests for feature_map(): unique ids, size limits, weights, min counts"""

    def setUp(self):
        """Create settings from config_file and note the feature-map path"""
        self.settings = CrossmapSettings(config_file, create_dir=True)
        self.cache_file = self.settings.tsv_file("feature-map")

    def tearDown(self):
        """Remove the "crossmap_simple" cache under data_dir"""
        remove_crossmap_cache(data_dir, "crossmap_simple")

    def test_feature_map_unique_ids(self):
        """features in targets and documents must have unique ids"""

        fmap = feature_map(self.settings)
        # the map should be a dict from strings into (index, weight);
        # collecting the indexes into a set reveals any duplicates
        all_indexes = {entry[0] for entry in fmap.values()}
        self.assertEqual(len(all_indexes), len(fmap))

    def test_skipping_partial_documents(self):
        """scan for features in targets, but not documents"""

        # with an intermediate number of required features,
        # some features will come from targets, some from documents.
        self.settings.features.max_number = 50
        with self.assertLogs(level="INFO"):
            fmap = feature_map(self.settings)
        self.assertIn("bob", fmap)
        # map will have exactly the max features
        self.assertEqual(len(fmap), 50)
        self.assertIn("g", fmap)
        self.assertNotIn("M", fmap)

    def test_weights_ic(self):
        """weights using constant and ic scaling"""

        # compute maps using different weightings
        with self.assertLogs(level="INFO"):
            self.settings.features.weighting = [1, 0]
            map_const = feature_map(self.settings)
            self.settings.features.weighting = [0, 1]
            map_ic = feature_map(self.settings)
            self.settings.features.weighting = [1, 1]
            map_mid = feature_map(self.settings)
        # maps should all contain same features, i.e. equal length
        self.assertEqual(len(map_const), len(map_ic))
        self.assertEqual(len(map_const), len(map_mid))
        # constant map has all the weights equal to 1
        for entry in map_const.values():
            self.assertEqual(entry[1], 1)
        # information content map has certain features with less weight
        with_ic = map_ic["with"][1]
        with_const = map_const["with"][1]
        self.assertLess(with_ic, map_ic["alice"][1])
        self.assertLess(with_ic, map_ic["uniqu"][1])
        self.assertEqual(map_ic["uniqu"][1], map_ic["token"][1])
        # Because mixed map has high coefficient for both constant and ic,
        # all values should be higher than in either constant or ic maps
        with_mid = map_mid["with"][1]
        self.assertGreater(with_mid, with_ic)
        self.assertGreater(with_mid, with_const)

    def test_feature_map_min_count(self):
        """only use features present in a minimal num of docs"""

        with self.assertLogs(level="INFO"):
            self.settings.features.max_number = 1000
            self.settings.features.min_count = 1
            map1 = feature_map(self.settings)
            self.settings.features.min_count = 2
            map2 = feature_map(self.settings)
        # "abcde" is kept with min_count=1 but dropped with min_count=2
        self.assertIn("abcde", map1)
        self.assertNotIn("abcde", map2)