def setUpClass(cls):
    """Build a shared CDB from the bundled example CSVs for this test class."""
    print("Load test database csvs for load tests")
    config = Config()
    config.general['log_level'] = logging.DEBUG
    maker = CDBMaker(config)
    # Resolve the example CSVs relative to this file instead of the process
    # working directory — '../examples/...' only worked when the tests were
    # launched from the tests directory itself.
    import os
    base_dir = os.path.dirname(os.path.realpath(__file__))
    csvs = [
        os.path.join(base_dir, '..', 'examples', 'cdb.csv'),
        os.path.join(base_dir, '..', 'examples', 'cdb_2.csv'),
    ]
    cls.cdb = maker.prepare_csvs(csvs, full_build=True)
def setUpClass(cls):
    """Prepare a class-wide CDB built from the bundled example CSVs."""
    print("Load test database csvs for load tests")
    config = Config()
    config.general['log_level'] = logging.DEBUG
    cls.maker = CDBMaker(config)
    # Locate the example CSVs relative to this test file so the tests are
    # independent of the current working directory.
    examples_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), '..', 'examples')
    csvs = [os.path.join(examples_dir, fname)
            for fname in ('cdb.csv', 'cdb_2.csv')]
    cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True)
def setUp(self):
    """Build a fresh CDB from the example CSVs before each test."""
    self.config = Config()
    self.config.general['log_level'] = logging.DEBUG
    self.maker = CDBMaker(self.config)
    # Building a new CDB from two files (full_build). Paths are anchored to
    # this file — the previous '../examples/...' form depended on the
    # process working directory and broke when run from the repo root.
    import os
    base_dir = os.path.dirname(os.path.realpath(__file__))
    csvs = [
        os.path.join(base_dir, '..', 'examples', 'cdb.csv'),
        os.path.join(base_dir, '..', 'examples', 'cdb_2.csv'),
    ]
    self.cdb = self.maker.prepare_csvs(csvs, full_build=True)
def setUpClass(cls):
    """Assemble a complete MedCAT pipeline (tokenizer, tagger, spell-check
    normalizer, NER, linker) plus a small in-memory CDB, shared by every
    test in this class. The steps below are order-dependent: the pipeline
    components must exist before `prepare_name` is called.
    """
    print("Set up CDB")
    cls.config = Config()
    cls.config.general['log_level'] = logging.INFO
    cls.cdb = CDB(config=cls.config)

    print("Set up Vocab")
    # Download the prebuilt vocab once and cache it next to the tests so
    # repeated runs do not re-fetch it.
    vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(vocab_path):
        tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
        with open(vocab_path, 'wb') as f:
            f.write(tmp.content)
    cls.vocab = Vocab.load(vocab_path)

    print("Set up NLP pipeline")
    cls.nlp = Pipe(tokenizer=spacy_split_all, config=cls.config)
    cls.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=cls.config),
                       name='skip_and_punct',
                       additional_fields=['is_punct'])
    cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab,
                                          config=cls.config,
                                          data_vocab=cls.vocab)
    cls.nlp.add_token_normalizer(spell_checker=cls.spell_checker,
                                 config=cls.config)
    cls.ner = NER(cls.cdb, cls.config)
    cls.nlp.add_ner(cls.ner)

    print("Set up Linker")
    cls.link = Linker(cls.cdb, cls.vocab, cls.config)
    cls.nlp.add_linker(cls.link)

    print("Set limits for tokens and uppercase")
    # Tightened NER/linking limits so the short test text below exercises
    # the skip/upper-case/disambiguation code paths.
    cls.config.ner['max_skip_tokens'] = 1
    cls.config.ner['upper_case_limit_len'] = 4
    cls.config.linking['disamb_length_limit'] = 2

    print("Add concepts")
    # Two names for the same CUI plus a distinct concept, so both exact and
    # multi-token matches are represented.
    cls.cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar', cls.nlp, {}, cls.config))
    cls.cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar viruses', cls.nlp, {}, cls.config))
    cls.cdb.add_names(cui='S-229005',
                      names=prepare_name('CDB', cls.nlp, {}, cls.config))

    print("Add test text")
    cls.text = "CDB - I was running and then Movar Virus attacked and CDb"
    # Run the text through the full pipeline once; tests inspect the result.
    cls.text_post_pipe = cls.nlp(cls.text)
def setUpClass(cls) -> None:
    """Create the individual pipeline components (CDB, vocab, spell checker,
    NER, linker, MetaCAT) used by the pipe tests, plus the `Pipe` under test.
    """
    cls.config = Config()
    cls.config.general['log_level'] = logging.INFO
    cls.cdb = CDB(config=cls.config)
    # Download the prebuilt vocab once and cache it locally.
    vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(vocab_path):
        tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
        with open(vocab_path, 'wb') as f:
            f.write(tmp.content)
    cls.vocab = Vocab.load(vocab_path)
    cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab,
                                          config=cls.config,
                                          data_vocab=cls.vocab)
    cls.ner = NER(cls.cdb, cls.config)
    cls.linker = Linker(cls.cdb, cls.vocab, cls.config)
    cls.config.ner['max_skip_tokens'] = 1
    cls.config.ner['upper_case_limit_len'] = 4
    cls.config.linking['disamb_length_limit'] = 2
    cls.meta_cat = MetaCAT()
    cls.text = "CDB - I was running and then Movar Virus attacked and CDb"
    # NOTE(review): cls.config is rebuilt here, so the Pipe under test gets a
    # *fresh* Config without the ner/linking limits set above — confirm this
    # is intentional (it looks deliberate: the limits only apply to the
    # component objects created earlier, not to the pipe itself).
    cls.config = Config()
    cls.config.general['log_level'] = logging.INFO
    cls.undertest = Pipe(tokenizer=spacy_split_all, config=cls.config)
def setUp(self) -> None:
    """Build a minimal CDB and a full NLP pipeline before each test.

    A first, tagger-only pipeline is created solely so `prepare_name` can
    tokenize the concept names; it is then replaced by the full pipeline
    (tagger + normalizer + NER + linker) used by the tests.
    """
    self.config = Config()
    self.config.general['log_level'] = logging.INFO
    cdb = CDB(config=self.config)
    # Temporary pipeline: only the tokenizer/tagger is needed for
    # prepare_name below.
    self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])
    # Add a couple of names
    cdb.add_names(cui='S-229004',
                  names=prepare_name('Movar', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229004',
                  names=prepare_name('Movar viruses', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229005',
                  names=prepare_name('CDB', self.nlp, {}, self.config))
    # Check
    #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

    # Download the prebuilt vocab once and cache it locally.
    self.vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(self.vocab_path):
        import requests
        tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
        with open(self.vocab_path, 'wb') as f:
            f.write(tmp.content)
    vocab = Vocab.load(self.vocab_path)

    # Make the pipeline — this replaces the temporary tagger-only pipe above.
    self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])
    spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab,
                                      config=self.config,
                                      data_vocab=vocab)
    self.nlp.add_token_normalizer(spell_checker=spell_checker,
                                  config=self.config)
    ner = NER(cdb, self.config)
    self.nlp.add_ner(ner)
    # Add Linker
    link = Linker(cdb, vocab, self.config)
    self.nlp.add_linker(link)
    self.text = "CDB - I was running and then Movar Virus attacked and CDb"
def load(cls, path, config=None):
    r''' Load and return a CDB. This allows partial loads in probably
    not the right way at all.

    Args:
        path (`str`):
            Path to a `cdb.dat` from which to load data.
        config (`medcat.config.Config`, optional):
            Config to attach to the loaded CDB; when `None` it is rebuilt
            from the config dict stored in the file.
    '''
    # NOTE(review): dill deserialization executes arbitrary code — only
    # load CDB files from trusted sources.
    with open(path, 'rb') as f:
        data = dill.load(f)

    if config is None:
        config = Config.from_dict(data['config'])

    # Start from an empty CDB, then copy over every saved attribute the
    # current CDB class knows about (ignoring anything it no longer has).
    cdb = cls(config=config)
    saved_attrs = data['cdb']
    for attr in cdb.__dict__:
        if attr in saved_attrs:
            cdb.__dict__[attr] = saved_attrs[attr]

    return cdb
def test_for_linker(self):
    """Exercise the ContextModel end-to-end: training (negative sampling and
    supervised), similarity, and disambiguation between two CUIs sharing the
    name 'movar'. Relies on `self.nlp`, `self.text` and `self.vocab_path`
    prepared in setUp.
    """
    self.config = Config()
    self.config.general['log_level'] = logging.DEBUG
    cdb = CDB(config=self.config)
    # Add a couple of names; 'Movar' is attached to two CUIs on purpose so
    # disambiguation below has an ambiguous name to resolve.
    cdb.add_names(cui='S-229004',
                  names=prepare_name('Movar', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229004',
                  names=prepare_name('Movar viruses', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229005',
                  names=prepare_name('CDB', self.nlp, {}, self.config))
    cdb.add_names(cui='S-2290045',
                  names=prepare_name('Movar', self.nlp, {}, self.config))
    # Check
    #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}}

    # Seed every concept with random context vectors so similarity and
    # disambiguation have something to compare against.
    cuis = list(cdb.cui2names.keys())
    for cui in cuis[0:50]:
        vectors = {
            'short': np.random.rand(300),
            'long': np.random.rand(300),
            'medium': np.random.rand(300)
        }
        cdb.update_context_vector(cui, vectors, negative=False)

    d = self.nlp(self.text)
    vocab = Vocab.load(self.vocab_path)
    cm = ContextModel(cdb, vocab, self.config)
    cm.train_using_negative_sampling('S-229004')
    # Allow training regardless of how often the concept has been seen.
    self.config.linking['train_count_threshold'] = 0
    cm.train('S-229004', d._.ents[1], d)

    cm.similarity('S-229004', d._.ents[1], d)
    cm.disambiguate(['S-2290045', 'S-229004'], d._.ents[1], 'movar', d)
def setUpClass(cls) -> None:
    """Create a shared CDBMaker configured with the scientific spaCy model."""
    cfg = Config()
    cfg.general["spacy_model"] = "en_core_sci_md"
    cls.cdb_maker = CDBMaker(cfg)
r'''
The tests here are a bit messy but they work, should be converted to python unittests.
'''
import logging
import os

import numpy as np

from medcat.cdb_maker import CDBMaker
from medcat.config import Config

config = Config()
config.general['log_level'] = logging.DEBUG
maker = CDBMaker(config)

# Building a new CDB from two files (full_build). Resolve the CSVs relative
# to this file so the script works from any working directory (the previous
# '../examples/...' paths only worked when run from the tests directory).
_base_dir = os.path.dirname(os.path.realpath(__file__))
csvs = [os.path.join(_base_dir, '..', 'examples', 'cdb.csv'),
        os.path.join(_base_dir, '..', 'examples', 'cdb_2.csv')]
cdb = maker.prepare_csvs(csvs, full_build=True)

# Sanity-check the freshly built CDB against the known example data.
assert len(cdb.cui2names) == 3
assert len(cdb.cui2snames) == 3
assert len(cdb.name2cuis) == 5
assert len(cdb.cui2tags) == 3
assert len(cdb.cui2preferred_name) == 2
assert len(cdb.cui2context_vectors) == 3
assert len(cdb.cui2count_train) == 3
assert cdb.name2cuis2status['virus']['C0000039'] == 'P'
assert cdb.cui2type_ids['C0000039'] == {'T234', 'T109', 'T123'}
assert cdb.addl_info['cui2original_names']['C0000039'] == {
    'Virus', 'Virus K', 'Virus M', 'Virus Z'
}
assert cdb.addl_info['cui2description']['C0000039'].startswith("Synthetic")

# Test name addition
from medcat.preprocessing.cleaners import prepare_name
import logging
import os
from functools import partial

import requests

from medcat.cdb import CDB
from medcat.config import Config
from medcat.linking.context_based_linker import Linker
from medcat.linking.vector_context_model import ContextModel
from medcat.ner.vocab_based_ner import NER
from medcat.pipe import Pipe
from medcat.preprocessing.cleaners import prepare_name
from medcat.preprocessing.taggers import tag_skip_and_punct
# Fix: spacy_split_all was used below but never imported (NameError at run time).
from medcat.preprocessing.tokenizers import spacy_split_all
from medcat.utils.normalizers import BasicSpellChecker
from medcat.vocab import Vocab

config = Config()
config.general['log_level'] = logging.INFO
cdb = CDB(config=config)

nlp = Pipe(tokenizer=spacy_split_all, config=config)
nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config),
               name='skip_and_punct',
               additional_fields=['is_punct'])

# Add a couple of names
cdb.add_names(cui='S-229004', names=prepare_name('Movar', nlp, {}, config))
cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', nlp, {}, config))
cdb.add_names(cui='S-229005', names=prepare_name('CDB', nlp, {}, config))
# Check
#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}
from medcat.cdb_maker import CDBMaker
from medcat.config import Config
import numpy as np
import logging
import os

# Build a CDB from a local MedMentions export and save it to disk.
# NOTE(review): './tmp_medmentions.csv' and the paths below are relative to
# the working directory — this script assumes it is run from its own folder.
config = Config()
config.general['log_level'] = logging.INFO
config.general['spacy_model'] = 'en_core_sci_lg'
maker = CDBMaker(config)

# Building a new CDB from two files (full_build)
csvs = ['./tmp_medmentions.csv']
cdb = maker.prepare_csvs(csvs, full_build=True)
cdb.save("./tmp_cdb.dat")

from medcat.vocab import Vocab
from medcat.cdb import CDB
from medcat.cat import CAT

# Download the prebuilt vocab once and cache it locally.
vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
    import requests
    tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
    with open(vocab_path, 'wb') as f:
        f.write(tmp.content)

# Reload the CDB just saved, attaching a fresh Config.
config = Config()
cdb = CDB.load("./tmp_cdb.dat", config=config)
from medcat.cdb import CDB
from medcat.cdb_maker import CDBMaker
from medcat.config import Config

# Example: build a CDB from CSVs, save it, and load it back.
# Specify cdb name and path to csvs — both values below are placeholders
# to be replaced by the user.
cdb_name = "cdb_name.dat"
csv_path_list = [" path to list of csvs here"]

# Create CDB
config = Config()
maker = CDBMaker(config)
cdb = maker.prepare_csvs(csv_path_list, full_build=True)
cdb.save(cdb_name)

# Load the newly created cdb:
cdb2 = CDB.load(cdb_name)