Example #1
    @classmethod
    def setUpClass(cls):
        print("Load test database csvs for load tests")
        config = Config()
        config.general['log_level'] = logging.DEBUG
        maker = CDBMaker(config)
        csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv']
        cls.cdb = maker.prepare_csvs(csvs, full_build=True)
Example #2
    def setUp(self):
        self.config = Config()
        self.config.general['log_level'] = logging.DEBUG
        self.maker = CDBMaker(self.config)

        # Building a new CDB from two files (full_build)
        csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv']
        self.cdb = self.maker.prepare_csvs(csvs, full_build=True)
Example #3
    @classmethod
    def setUpClass(cls):
        print("Load test database csvs for load tests")
        config = Config()
        config.general['log_level'] = logging.DEBUG
        cls.maker = CDBMaker(config)
        csvs = [
            os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
                         'examples', 'cdb.csv'),
            os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
                         'examples', 'cdb_2.csv')
        ]
        cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True)
Example #4
    @classmethod
    def setUpClass(cls) -> None:
        config = Config()
        config.general["spacy_model"] = "en_core_sci_md"
        cls.cdb_maker = CDBMaker(config)
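The "en_core_sci_md" model set above is a scispaCy model that is installed separately from MedCAT, so a quick pre-flight check can avoid a confusing failure inside CDBMaker. A minimal sketch, assuming the scispaCy en_core_sci_md package is expected to be available:

import spacy

# Hypothetical sanity check: spaCy raises OSError when the model named in
# config.general["spacy_model"] is not installed.
try:
    spacy.load("en_core_sci_md")
except OSError as err:
    raise RuntimeError("Install en_core_sci_md before building the CDB") from err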
Example #5
import logging
import unittest

import numpy as np

from medcat.cdb import CDB
from medcat.cdb_maker import CDBMaker
from medcat.config import Config
from medcat.preprocessing.cleaners import prepare_name


class CdbMakerArchiveTests(unittest.TestCase):

    def setUp(self):
        self.config = Config()
        self.config.general['log_level'] = logging.DEBUG
        self.maker = CDBMaker(self.config)

        # Building a new CDB from two files (full_build)
        csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv']
        self.cdb = self.maker.prepare_csvs(csvs, full_build=True)

    def test_prepare_csvs(self):
        assert len(self.cdb.cui2names) == 3
        assert len(self.cdb.cui2snames) == 3
        assert len(self.cdb.name2cuis) == 5
        assert len(self.cdb.cui2tags) == 3
        assert len(self.cdb.cui2preferred_name) == 2
        assert len(self.cdb.cui2context_vectors) == 3
        assert len(self.cdb.cui2count_train) == 3
        assert self.cdb.name2cuis2status['virus']['C0000039'] == 'P'
        assert self.cdb.cui2type_ids['C0000039'] == {'T234', 'T109', 'T123'}
        assert self.cdb.addl_info['cui2original_names']['C0000039'] == {'Virus', 'Virus K', 'Virus M', 'Virus Z'}
        assert self.cdb.addl_info['cui2description']['C0000039'].startswith("Synthetic")

    def test_name_addition(self):
        self.cdb.add_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config), name_status='P', full_build=True)
        assert self.cdb.addl_info['cui2original_names']['C0000239'] == {'MY: new,-_! Name.', 'Second csv'}
        assert 'my:newname.' in self.cdb.name2cuis
        assert 'my:new' in self.cdb.snames
        assert 'my:newname.' in self.cdb.name2cuis2status
        assert self.cdb.name2cuis2status['my:newname.'] == {'C0000239': 'P'}

    def test_name_removal(self):
        self.cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config))
        # Run again to make sure it does not break anything
        self.cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config))
        assert len(self.cdb.name2cuis) == 5
        assert 'my:newname.' not in self.cdb.name2cuis2status

    def test_filtering(self):
        cuis_to_keep = {'C0000039'}  # Because of the transition, 2 CUIs will be kept
        self.cdb.filter_by_cui(cuis_to_keep=cuis_to_keep)
        assert len(self.cdb.cui2names) == 2
        assert len(self.cdb.name2cuis) == 4
        assert len(self.cdb.snames) == 4

    def test_vector_addition(self):
        self.cdb.reset_training()
        np.random.seed(11)
        cuis = list(self.cdb.cui2names.keys())
        for i in range(2):
            for cui in cuis:
                vectors = {}
                for cntx_type in self.config.linking['context_vector_sizes']:
                    vectors[cntx_type] = np.random.rand(300)
                self.cdb.update_context_vector(cui, vectors, negative=False)

        assert self.cdb.cui2count_train['C0000139'] == 2
        assert self.cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300

    def test_negative(self):
        cuis = list(self.cdb.cui2names.keys())
        for cui in cuis:
            vectors = {}
            for cntx_type in self.config.linking['context_vector_sizes']:
                vectors[cntx_type] = np.random.rand(300)
            self.cdb.update_context_vector(cui, vectors, negative=True)

        assert self.cdb.cui2count_train['C0000139'] == 2
        assert self.cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300

    def test_save_and_load(self):
        self.cdb.save("./tmp_cdb.dat")
        cdb2 = CDB.load('./tmp_cdb.dat')
        # Check a random thing
        assert cdb2.cui2context_vectors['C0000139']['long'][7] == self.cdb.cui2context_vectors['C0000139']['long'][7]

    def test_training_import(self):
        cdb2 = CDB.load('./tmp_cdb.dat')
        self.cdb.reset_training()
        cdb2.reset_training()
        np.random.seed(11)
        cuis = list(self.cdb.cui2names.keys())
        for i in range(2):
            for cui in cuis:
                vectors = {}
                for cntx_type in self.config.linking['context_vector_sizes']:
                    vectors[cntx_type] = np.random.rand(300)
                self.cdb.update_context_vector(cui, vectors, negative=False)

        cdb2.import_training(cdb=self.cdb, overwrite=True)
        assert cdb2.cui2context_vectors['C0000139']['long'][7] == self.cdb.cui2context_vectors['C0000139']['long'][7]
        assert cdb2.cui2count_train['C0000139'] == self.cdb.cui2count_train['C0000139']

    def test_concept_similarity(self):
        cdb = CDB(config=self.config)
        np.random.seed(11)
        for i in range(500):
            cui = "C" + str(i)
            type_ids = {'T-' + str(i%10)}
            cdb.add_concept(cui=cui, names=prepare_name('Name: ' + str(i), self.maker.nlp, {}, self.config), ontologies=set(),
                    name_status='P', type_ids=type_ids, description='', full_build=True)

            vectors = {}
            for cntx_type in self.config.linking['context_vector_sizes']:
                vectors[cntx_type] = np.random.rand(300)
            cdb.update_context_vector(cui, vectors, negative=False)
        res = cdb.most_similar('C200', 'long', type_id_filter=['T-0'], min_cnt=1, topn=10, force_build=True)
        assert len(res) == 10

    def test_training_reset(self):
        self.cdb.reset_training()
        assert len(self.cdb.cui2context_vectors['C0']) == 0
        assert self.cdb.cui2count_train['C0'] == 0
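
To run this archive test class directly rather than through a test runner, the standard unittest entry point can be appended at module level; a minimal sketch:

if __name__ == '__main__':
    # Discovers and runs the test methods defined in this module.
    unittest.main()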
Example #6
r''' The tests here are a bit messy but they work; they should be converted to Python unittests.
'''
from medcat.cdb_maker import CDBMaker
from medcat.config import Config
import numpy as np
import logging

config = Config()
config.general['log_level'] = logging.DEBUG
maker = CDBMaker(config)

# Building a new CDB from two files (full_build)
csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv']
cdb = maker.prepare_csvs(csvs, full_build=True)

assert len(cdb.cui2names) == 3
assert len(cdb.cui2snames) == 3
assert len(cdb.name2cuis) == 5
assert len(cdb.cui2tags) == 3
assert len(cdb.cui2preferred_name) == 2
assert len(cdb.cui2context_vectors) == 3
assert len(cdb.cui2count_train) == 3
assert cdb.name2cuis2status['virus']['C0000039'] == 'P'
assert cdb.cui2type_ids['C0000039'] == {'T234', 'T109', 'T123'}
assert cdb.addl_info['cui2original_names']['C0000039'] == {
    'Virus', 'Virus K', 'Virus M', 'Virus Z'
}
assert cdb.addl_info['cui2description']['C0000039'].startswith("Synthetic")

# Test name addition
from medcat.preprocessing.cleaners import prepare_name
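
The script is cut off at this point; a hedged sketch of how the name-addition check would continue, reusing the cdb, maker and config objects created above and mirroring test_name_addition from Example #5:

# Add a messy new name to an existing concept and check that it was
# normalised and registered (same assertions as in Example #5).
cdb.add_names(cui='C0000239',
              names=prepare_name('MY: new,-_! Name.', maker.nlp, {}, config),
              name_status='P', full_build=True)
assert 'my:newname.' in cdb.name2cuis
assert cdb.name2cuis2status['my:newname.'] == {'C0000239': 'P'}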
Example #7
from medcat.cdb import CDB
from medcat.cdb_maker import CDBMaker
from medcat.config import Config

# Specify the CDB name and the paths to the input CSVs
cdb_name = "cdb_name.dat"
csv_path_list = ["<path to csv 1>", "<path to csv 2>"]

# Create CDB
config = Config()
maker = CDBMaker(config)
cdb = maker.prepare_csvs(csv_path_list, full_build=True)
cdb.save(cdb_name)

# Load the newly created cdb:
cdb2 = CDB.load(cdb_name)
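
The snippet leaves the CSV contents abstract. As a minimal sketch of the kind of file csv_path_list could point at, assuming the cui/name column layout used by the bundled examples/cdb.csv (additional columns such as ontologies, name_status, type_ids and description are understood to be optional):

import csv

# Hypothetical two-concept input file; the file name and concepts are
# made up for illustration only.
with open("my_concepts.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["cui", "name"])
    writer.writerow(["C0", "First concept"])
    writer.writerow(["C1", "Second concept"])

csv_path_list = ["my_concepts.csv"]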