def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    if cat_id in CAT_MAP:
        cat = CAT_MAP[cat_id]
    else:
        if cdb_id in CDB_MAP:
            cdb = CDB_MAP[cdb_id]
        else:
            cdb_path = project.concept_db.cdb_file.path
            cdb = CDB.load(cdb_path)
            cdb.config.parse_config_file(path=os.getenv("MEDCAT_CONFIG_FILE"))
            CDB_MAP[cdb_id] = cdb

        if vocab_id in VOCAB_MAP:
            vocab = VOCAB_MAP[vocab_id]
        else:
            vocab_path = project.vocab.vocab_file.path
            vocab = Vocab.load(vocab_path)
            VOCAB_MAP[vocab_id] = vocab

        cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
        CAT_MAP[cat_id] = cat
    return cat
def test_save_and_load(self):
    self.undertest.add_words(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab_data.txt"))
    self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55])
    vocab_path = f"{self.tmp_dir}/vocab.dat"
    self.undertest.save(vocab_path)
    vocab = Vocab.load(vocab_path)
    self.assertEqual(["house", "dog", "test"], list(vocab.vocab.keys()))
def _create_cat(self):
    """Loads MedCAT resources and creates a CAT instance."""
    if os.getenv("APP_MODEL_VOCAB_PATH") is None:
        raise ValueError("Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")

    if os.getenv("APP_MODEL_CDB_PATH") is None:
        raise ValueError("Concept database (env: APP_MODEL_CDB_PATH) not specified")

    # Vocabulary and Concept Database are mandatory
    self.log.debug("Loading VOCAB ...")
    vocab = Vocab.load(os.getenv("APP_MODEL_VOCAB_PATH"))

    self.log.debug("Loading CDB ...")
    cdb = CDB.load(os.getenv("APP_MODEL_CDB_PATH"))

    spacy_model = os.getenv("SPACY_MODEL", "")
    if spacy_model:
        # Fixed: this was a no-op comparison (==) instead of an assignment
        cdb.config.general["spacy_model"] = spacy_model
    else:
        logging.warning("SPACY_MODEL environment var not set, attempting to load the spacy model "
                        "found within the CDB: " + cdb.config.general["spacy_model"])
        if cdb.config.general["spacy_model"] == "":
            raise ValueError("No SPACY_MODEL env var declared and the loaded CDB does not have a "
                             "spacy_model set in its config! To solve this declare the SPACY_MODEL "
                             "in the env_medcat file.")

    # The config is already attached to the CDB, kept here only for readability
    conf = cdb.config

    # Apply CUI filter if provided
    if os.getenv("APP_MODEL_CUI_FILTER_PATH") is not None:
        self.log.debug("Applying CDB CUI filter ...")
        with open(os.getenv("APP_MODEL_CUI_FILTER_PATH")) as cui_file:
            all_lines = (line.rstrip() for line in cui_file)
            selected_cuis = [line for line in all_lines if line]  # filter blank lines
            cdb.filter_by_cui(selected_cuis)

    # Meta-annotation models are optional
    meta_models = []
    if os.getenv("APP_MODEL_META_PATH_LIST") is not None:
        self.log.debug("Loading META annotations ...")
        for model_path in os.getenv("APP_MODEL_META_PATH_LIST").split(":"):
            m = MetaCAT.load(model_path)
            meta_models.append(m)

    cat = CAT(cdb=cdb, config=conf, vocab=vocab, meta_cats=meta_models)
    return cat
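# Sketch of the environment the function above expects before it is called. The
# variable names are taken from the code; the values are hypothetical placeholders.
import os

os.environ["APP_MODEL_VOCAB_PATH"] = "/models/vocab.dat"            # mandatory
os.environ["APP_MODEL_CDB_PATH"] = "/models/cdb.dat"                # mandatory
os.environ["SPACY_MODEL"] = "en_core_web_md"                        # optional, overrides the CDB config
os.environ["APP_MODEL_CUI_FILTER_PATH"] = "/models/cui_filter.txt"  # optional, one CUI per line
os.environ["APP_MODEL_META_PATH_LIST"] = "/models/meta_status:/models/meta_negation"  # optional, colon-separated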
def setUpClass(cls):
    print("Set up CDB")
    cls.config = Config()
    cls.config.general['log_level'] = logging.INFO
    cls.cdb = CDB(config=cls.config)

    print("Set up Vocab")
    vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(vocab_path):
        tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
        with open(vocab_path, 'wb') as f:
            f.write(tmp.content)
    cls.vocab = Vocab.load(vocab_path)

    print("Set up NLP pipeline")
    cls.nlp = Pipe(tokenizer=spacy_split_all, config=cls.config)
    cls.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=cls.config),
                       name='skip_and_punct',
                       additional_fields=['is_punct'])
    cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab, config=cls.config, data_vocab=cls.vocab)
    cls.nlp.add_token_normalizer(spell_checker=cls.spell_checker, config=cls.config)
    cls.ner = NER(cls.cdb, cls.config)
    cls.nlp.add_ner(cls.ner)

    print("Set up Linker")
    cls.link = Linker(cls.cdb, cls.vocab, cls.config)
    cls.nlp.add_linker(cls.link)

    print("Set limits for tokens and uppercase")
    cls.config.ner['max_skip_tokens'] = 1
    cls.config.ner['upper_case_limit_len'] = 4
    cls.config.linking['disamb_length_limit'] = 2

    print("Add concepts")
    cls.cdb.add_names(cui='S-229004', names=prepare_name('Movar', cls.nlp, {}, cls.config))
    cls.cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', cls.nlp, {}, cls.config))
    cls.cdb.add_names(cui='S-229005', names=prepare_name('CDB', cls.nlp, {}, cls.config))

    print("Add test text")
    cls.text = "CDB - I was running and then Movar Virus attacked and CDb"
    cls.text_post_pipe = cls.nlp(cls.text)
def __init__(self, config, cdb=None, vocab=None, word_tokenizer=None):
    self.cdb = cdb
    self.config = config
    self.w2v = None
    if vocab is not None:
        self.vocab = vocab
    else:
        self.vocab = Vocab()

    # Build the required spacy pipeline
    self.nlp = Pipe(tokenizer=spacy_split_all, config=config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])

    # Get the tokenizer
    if word_tokenizer is not None:
        self.tokenizer = word_tokenizer
    else:
        self.tokenizer = self._tok

    # Used for saving if the real path is not set
    self.vocab_path = "./tmp_vocab.dat"
def setUp(self) -> None:
    self.config = Config()
    self.config.general['log_level'] = logging.INFO
    cdb = CDB(config=self.config)

    self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])

    # Add a couple of names
    cdb.add_names(cui='S-229004', names=prepare_name('Movar', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229005', names=prepare_name('CDB', self.nlp, {}, self.config))
    # Check
    #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

    self.vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(self.vocab_path):
        import requests
        tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
        with open(self.vocab_path, 'wb') as f:
            f.write(tmp.content)
    vocab = Vocab.load(self.vocab_path)

    # Make the pipeline
    self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])
    spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=self.config, data_vocab=vocab)
    self.nlp.add_token_normalizer(spell_checker=spell_checker, config=self.config)
    ner = NER(cdb, self.config)
    self.nlp.add_ner(ner)

    # Add Linker
    link = Linker(cdb, vocab, self.config)
    self.nlp.add_linker(link)

    self.text = "CDB - I was running and then Movar Virus attacked and CDb"
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    if cat_id in CAT_MAP:
        cat = CAT_MAP[cat_id]
    else:
        if cdb_id in CDB_MAP:
            cdb = CDB_MAP[cdb_id]
        else:
            cdb_path = project.concept_db.cdb_file.path
            try:
                cdb = CDB.load(cdb_path)
            except KeyError as ke:
                mc_v = pkg_resources.get_distribution('medcat').version
                if int(mc_v.split('.')[0]) > 0:
                    log.error('Attempted to load MedCAT v0.x model with MCTrainer v1.x')
                    raise Exception(
                        'Attempted to load MedCAT v0.x model with MCTrainer v1.x',
                        'Please re-configure this project to use a MedCAT v1.x CDB or consult the '
                        'MedCATTrainer Dev team if you believe this should work'
                    ) from ke
                raise

            custom_config = os.getenv("MEDCAT_CONFIG_FILE")
            if custom_config is not None and os.path.exists(custom_config):
                cdb.config.parse_config_file(path=custom_config)
            else:
                log.info("No MEDCAT_CONFIG_FILE env var set to valid path, using default config available on CDB")
            CDB_MAP[cdb_id] = cdb

        if vocab_id in VOCAB_MAP:
            vocab = VOCAB_MAP[vocab_id]
        else:
            vocab_path = project.vocab.vocab_file.path
            vocab = Vocab.load(vocab_path)
            VOCAB_MAP[vocab_id] = vocab

        cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
        CAT_MAP[cat_id] = cat
    return cat
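# Usage sketch (assumption, not from the source): module-level caches let repeated
# calls for the same project reuse the already-loaded CDB, Vocab and CAT instances.
# `project` is assumed to be a Django-style model instance exposing the `concept_db`
# and `vocab` relations used by get_medcat above.
CDB_MAP = {}
VOCAB_MAP = {}
CAT_MAP = {}

def annotate_project_document(project, text):
    cat = get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project)
    # CAT.get_entities returns the annotations found in the text
    return cat.get_entities(text)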
def setUpClass(cls) -> None:
    cls.cdb = CDB.load(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat"))
    cls.vocab = Vocab.load(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat"))
    cls.cdb.config.ner['min_name_len'] = 2
    cls.cdb.config.ner['upper_case_limit_len'] = 3
    cls.cdb.config.general['spell_check'] = True
    cls.cdb.config.linking['train_count_threshold'] = 10
    cls.cdb.config.linking['similarity_threshold'] = 0.3
    cls.cdb.config.linking['train'] = True
    cls.cdb.config.linking['disamb_length_limit'] = 5
    cls.cdb.config.general['full_unlink'] = True
    cls.undertest = CAT(cdb=cls.cdb, config=cls.cdb.config, vocab=cls.vocab)
def test_for_linker(self):
    self.config = Config()
    self.config.general['log_level'] = logging.DEBUG
    cdb = CDB(config=self.config)

    # Add a couple of names
    cdb.add_names(cui='S-229004', names=prepare_name('Movar', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229005', names=prepare_name('CDB', self.nlp, {}, self.config))
    cdb.add_names(cui='S-2290045', names=prepare_name('Movar', self.nlp, {}, self.config))
    # Check
    #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}}

    cuis = list(cdb.cui2names.keys())
    for cui in cuis[0:50]:
        vectors = {
            'short': np.random.rand(300),
            'long': np.random.rand(300),
            'medium': np.random.rand(300)
        }
        cdb.update_context_vector(cui, vectors, negative=False)

    d = self.nlp(self.text)
    vocab = Vocab.load(self.vocab_path)
    cm = ContextModel(cdb, vocab, self.config)
    cm.train_using_negative_sampling('S-229004')
    self.config.linking['train_count_threshold'] = 0

    cm.train('S-229004', d._.ents[1], d)
    cm.similarity('S-229004', d._.ents[1], d)
    cm.disambiguate(['S-2290045', 'S-229004'], d._.ents[1], 'movar', d)
def setUpClass(cls) -> None:
    cls.config = Config()
    cls.config.general['log_level'] = logging.INFO
    cls.cdb = CDB(config=cls.config)

    vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(vocab_path):
        tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
        with open(vocab_path, 'wb') as f:
            f.write(tmp.content)
    cls.vocab = Vocab.load(vocab_path)

    cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab, config=cls.config, data_vocab=cls.vocab)
    cls.ner = NER(cls.cdb, cls.config)
    cls.linker = Linker(cls.cdb, cls.vocab, cls.config)
    cls.config.ner['max_skip_tokens'] = 1
    cls.config.ner['upper_case_limit_len'] = 4
    cls.config.linking['disamb_length_limit'] = 2
    cls.meta_cat = MetaCAT()
    cls.text = "CDB - I was running and then Movar Virus attacked and CDb"

    cls.config = Config()
    cls.config.general['log_level'] = logging.INFO
    cls.undertest = Pipe(tokenizer=spacy_split_all, config=cls.config)
class MakeVocab(object):
    r''' Create a new vocab from a text file.

    Args:
        cdb (medcat.cdb.CDB):
            The concept database that will be added on top of the Vocab built from the text file.
        vocab (medcat.utils.vocab.Vocab, optional):
            Vocabulary to be extended, leave as None if you want to make a new Vocab. Default: None
        word_tokenizer (<function>):
            A custom tokenizer for word splitting - used if embeddings are BERT or similar.
            Default: None

    To make a vocab and train word embeddings do:

    >>> cdb = <your existing cdb>
    >>> maker = MakeVocab(cdb=cdb, config=config)
    >>> maker.make(data_iterator, out_folder="./output/")
    >>> maker.add_vectors(in_path="./output/data.txt")
    '''
    log = logging.getLogger(__name__)

    def __init__(self, config, cdb=None, vocab=None, word_tokenizer=None):
        self.cdb = cdb
        self.config = config
        self.w2v = None
        if vocab is not None:
            self.vocab = vocab
        else:
            self.vocab = Vocab()

        # Build the required spacy pipeline
        self.nlp = Pipe(tokenizer=spacy_split_all, config=config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])

        # Get the tokenizer
        if word_tokenizer is not None:
            self.tokenizer = word_tokenizer
        else:
            self.tokenizer = self._tok

        # Used for saving if the real path is not set
        self.vocab_path = "./tmp_vocab.dat"

    def _tok(self, text):
        return [text]

    def make(self, iter_data, out_folder, join_cdb=True, normalize_tokens=False):
        r''' Make a vocab - without vectors initially. This will create two files in the out_folder:
        - vocab.dat -> The vocabulary without vectors
        - data.txt -> The tokenized dataset prepared for training of word2vec or similar embeddings.

        Args:
            iter_data (Iterator):
                An iterator over sentences or documents. Can also be a simple array of text documents/sentences.
            out_folder (string):
                A path to a folder where all the results will be saved.
            join_cdb (bool):
                Should the words from the CDB be added to the Vocab. Default: True
            normalize_tokens (bool, defaults to False):
                If set, tokens will be lemmatized - tends to work better in some cases where the difference
                between e.g. plural/singular should be ignored. In general not so important if the dataset
                is big enough.
        '''
        # Save the preprocessed data, used for emb training
        out_path = Path(out_folder) / "data.txt"
        vocab_path = Path(out_folder) / "vocab.dat"
        self.vocab_path = vocab_path
        out = open(out_path, 'w', encoding='utf-8')

        for ind, doc in enumerate(iter_data):
            if ind % 10000 == 0:
                self.log.info("Vocab builder at: " + str(ind))
                print(ind)
            doc = self.nlp.nlp.tokenizer(doc)
            line = ""
            for token in doc:
                if token.is_space or token.is_punct:
                    continue
                if len(token.lower_) > 0:
                    if normalize_tokens:
                        self.vocab.inc_or_add(token._.norm)
                    else:
                        self.vocab.inc_or_add(token.lower_)

                if normalize_tokens:
                    line = line + " " + "_".join(token._.norm.split(" "))
                else:
                    line = line + " " + "_".join(token.lower_.split(" "))

            out.write(line.strip())
            out.write("\n")
        out.close()

        if join_cdb and self.cdb:
            for word in self.cdb.vocab.keys():
                if word not in self.vocab:
                    self.vocab.add_word(word)
                else:
                    # Update the count with the counts from the new dataset
                    self.cdb.vocab[word] += self.vocab[word]

        # Save the vocab also
        self.vocab.save(path=self.vocab_path)

    def add_vectors(self, in_path=None, w2v=None, overwrite=False, data_iter=None, workers=14, niter=2,
                    min_count=10, window=10, vsize=300, unigram_table_size=100000000):
        r''' Add vectors to an existing vocabulary and save changes to the vocab_path.

        Args:
            in_path (String):
                Path to the data.txt that was created by the MakeVocab.make() function.
            w2v (Word2Vec, optional):
                An existing word2vec instance. Default: None
            overwrite (bool):
                If True it will overwrite existing vectors in the vocabulary. Default: False
            data_iter (iterator):
                If you want to provide a custom iterator over the data use this. If given, in_path is not needed.
            **: Word2Vec arguments

        Returns:
            A trained word2vec model.
        '''
        if w2v is None:
            if data_iter is None:
                data = SimpleIter(in_path)
            else:
                data = data_iter
            w2v = Word2Vec(data, window=window, min_count=min_count, workers=workers, size=vsize, iter=niter)

        for word in w2v.wv.vocab.keys():
            if word in self.vocab:
                if overwrite:
                    self.vocab.add_vec(word, w2v.wv.get_vector(word))
                else:
                    if self.vocab.vec(word) is None:
                        self.vocab.add_vec(word, w2v.wv.get_vector(word))

        # Save the vocab again, now with vectors
        self.vocab.make_unigram_table(table_size=unigram_table_size)
        self.vocab.save(path=self.vocab_path)

        return w2v

    def destroy_pipe(self):
        self.nlp.destroy()
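# Usage sketch (the paths and the iterator contents are illustrative, mirroring the
# class docstring above): build a vocab from raw text, train word2vec vectors over
# the tokenized dump written by make(), then tear down the spacy pipeline.
config = Config()
maker = MakeVocab(cdb=cdb, config=config)
data_iterator = iter(["First document text ...", "Second document text ..."])
maker.make(data_iterator, out_folder="./output/")      # writes vocab.dat and data.txt
w2v = maker.add_vectors(in_path="./output/data.txt")   # trains word2vec and saves the vectors
maker.destroy_pipe()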
# Add a couple of names
cdb.add_names(cui='S-229004', names=prepare_name('Movar', nlp, {}, config))
cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', nlp, {}, config))
cdb.add_names(cui='S-229005', names=prepare_name('CDB', nlp, {}, config))
# Check
#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
    import requests
    tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
    with open(vocab_path, 'wb') as f:
        f.write(tmp.content)
vocab = Vocab.load(vocab_path)

# Make the pipeline
nlp = Pipe(tokenizer=spacy_split_all, config=config)
nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config),
               name='skip_and_punct',
               additional_fields=['is_punct'])
spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=config, data_vocab=vocab)
nlp.add_token_normalizer(spell_checker=spell_checker, config=config)
ner = NER(cdb, config)
nlp.add_ner(ner)

# Add Linker
link = Linker(cdb, vocab, config)
nlp.add_linker(link)
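# Usage sketch (assumption): once the pipeline above is assembled, calling it on text
# returns a spaCy Doc whose recognised and linked entities are exposed under doc._.ents,
# each carrying the CUI assigned by the linker.
doc = nlp("CDB - I was running and then Movar Virus attacked")
for ent in doc._.ents:
    print(ent.text, ent._.cui)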
def setUp(self) -> None:
    self.undertest = Vocab()
    self.tmp_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tmp")
    os.makedirs(self.tmp_dir, exist_ok=True)
class CATTests(unittest.TestCase):

    def setUp(self) -> None:
        self.undertest = Vocab()
        self.tmp_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tmp")
        os.makedirs(self.tmp_dir, exist_ok=True)

    def tearDown(self) -> None:
        shutil.rmtree(self.tmp_dir)

    def test_add_words(self):
        self.undertest.add_words(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab_data.txt"))
        self.assertEqual(["house", "dog"], list(self.undertest.vocab.keys()))

    def test_add_word(self):
        self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55])
        self.assertEqual(["test"], list(self.undertest.vocab.keys()))
        self.assertTrue("test" in self.undertest)

    def test_count(self):
        self.undertest.add_words(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab_data.txt"))
        self.assertEqual(34444, self.undertest.count("house"))

    def test_save_and_load(self):
        self.undertest.add_words(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab_data.txt"))
        self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55])
        vocab_path = f"{self.tmp_dir}/vocab.dat"
        self.undertest.save(vocab_path)
        vocab = Vocab.load(vocab_path)
        self.assertEqual(["house", "dog", "test"], list(vocab.vocab.keys()))
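# Note on the vocab_data.txt file used by add_words in the tests above (assumed format;
# counts and vectors here are illustrative, not copied from the example file): each line
# is "<word>\t<count>\t<space-separated vector>", the vector being optional, e.g.
#
#   house	34444	0.3232 0.123213 1.231231
#   dog	14444	0.76762 0.76767 1.45454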