def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return a CAT for the project, memoising CDB/Vocab/CAT in the given maps."""
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    cached = CAT_MAP.get(cat_id)
    if cached is not None:
        return cached

    # Concept database: reuse a cached copy or load it from disk
    cdb = CDB_MAP.get(cdb_id)
    if cdb is None:
        cdb = CDB()
        cdb.load_dict(project.concept_db.cdb_file.path)
        CDB_MAP[cdb_id] = cdb

    # Vocabulary: same reuse-or-load pattern
    vocab = VOCAB_MAP.get(vocab_id)
    if vocab is None:
        vocab = Vocab()
        vocab.load_dict(project.vocab.vocab_file.path)
        VOCAB_MAP[vocab_id] = vocab

    cat = CAT(cdb=cdb, vocab=vocab)
    cat.train = False
    CAT_MAP[cat_id] = cat
    return cat
def _import_concepts(id):
    """Create a Concept row for each CUI in the project's CDB not already stored."""
    from medcat.cdb import CDB
    concept_db = ConceptDB.objects.get(id=id)
    cdb = CDB()
    cdb.load_dict(concept_db.cdb_file.path)
    tuis = None  # optional TUI whitelist; None imports every type

    # Get all existing cuis for this CDB so they are skipped below
    existing_cuis = set(
        Concept.objects.filter(cdb=id).values_list('cui', flat=True))

    for cui in cdb.cui2names.keys():
        if cui in existing_cuis:
            continue

        # Prefer the curated pretty name; fall back to any original name
        pretty_name = None
        if cui in cdb.cui2pretty_name:
            pretty_name = cdb.cui2pretty_name[cui]
        elif cui in cdb.cui2original_names and len(cdb.cui2original_names[cui]) > 0:
            pretty_name = next(iter(cdb.cui2original_names[cui]))

        tui = cdb.cui2tui.get(cui, 'unk')
        if pretty_name is None or not (tuis is None or tui in tuis):
            continue

        concept = Concept()
        concept.pretty_name = pretty_name
        concept.cui = cui
        concept.tui = tui
        concept.semantic_type = cdb.tui2name.get(tui, '')
        concept.desc = cdb.cui2desc.get(cui, '')
        concept.synonyms = ", ".join(cdb.cui2original_names.get(cui, []))
        concept.cdb = concept_db
        concept.save()
        set_icd_info_objects(cdb, concept, cui)
        set_opcs_info_objects(cdb, concept, cui)
def _import_concepts(id):
    """Import every concept from the project's CDB into the Concept table.

    Best-effort: concepts that fail to save (e.g. duplicates) are skipped.
    """
    from medcat.cdb import CDB
    concept_db = ConceptDB.objects.get(id=id)
    cdb = CDB()
    cdb.load_dict(concept_db.cdb_file.path)
    tuis = None  # optional TUI whitelist; None imports every type
    for cui in cdb.cui2pretty_name:
        tui = cdb.cui2tui.get(cui, 'unk')
        if tuis is None or tui in tuis:
            concept = Concept()
            concept.pretty_name = cdb.cui2pretty_name.get(cui, '')
            concept.cui = cui
            concept.tui = tui
            concept.semantic_type = cdb.tui2name.get(tui, '')
            concept.desc = cdb.cui2desc.get(cui, '')
            concept.synonyms = ",".join(cdb.cui2original_names.get(cui, []))
            concept.cdb = concept_db
            # Build a "chapter | name" line per ICD-10 entry, when present
            icd10 = ''
            try:
                for pair in cdb.cui2info[cui]['icd10']:
                    icd10 += pair['chapter'] + " | " + pair['name']
                    icd10 += '\n'
                # BUG FIX: str.strip() returns a new string -- the original
                # discarded the result, leaving a trailing newline.
                icd10 = icd10.strip()
            except Exception:
                # No (or malformed) ICD-10 info for this CUI -- keep whatever
                # was built so far, matching the original best-effort behavior.
                pass
            concept.icd10 = icd10
            #concept.vocab = cdb.cui2ontos.get(cui, '')
            try:
                concept.save()
            except Exception:
                # Deliberate best-effort: skip concepts that cannot be saved
                pass
def filter_cdb_by_icd10(cdb: CDB) -> CDB:
    """
    Filters an existing CDB to only contain concepts that have an
    associated ICD-10 code. Can be used for SNOMED or UMLS CDBs.

    :param cdb: the concept database to filter (mutated in place)
    :return: the filtered CDB
    """
    # Use .get so CUIs absent from cui2info do not raise KeyError
    cuis_to_keep = [cui for cui in cdb.cui2names.keys()
                    if 'icd10' in cdb.cui2info.get(cui, {})]
    cdb.filter_by_cui(cuis_to_keep)
    return cdb
def __init__(self, cdb=None):
    """Initialise with the given CDB, or a fresh empty one when none is supplied."""
    self.cdb = cdb if cdb is not None else CDB()

    # Build the required spacy pipeline
    self.nlp = SpacyPipe(spacy_split_all)
    self.nlp.add_punct_tagger(
        tagger=partial(spacy_tag_punct, skip_stopwords=False))
def save(self, *args, **kwargs):
    """Save the project; when no concept DB is attached, create and attach an
    empty one first so the project is always usable for training."""
    if self.concept_db is None:
        # Persist an empty CDB to disk so it can be stored in the file field
        cdb = CDB()
        cdb.save_dict('empty_cdb.dat')
        # BUG FIX: the file handle was never closed; use a context manager
        with open('empty_cdb.dat', 'rb') as f:
            cdb_obj = ConceptDB()
            cdb_obj.name = f'{self.name}_empty_cdb'
            cdb_obj.cdb_file.save(f'{self.name}_empty_cdb.dat', File(f))
            cdb_obj.use_for_training = True
            cdb_obj.save()
        self.concept_db = cdb_obj
    super(ProjectAnnotateEntities, self).save(*args, **kwargs)
def train_supervised(self, data_path, reset_cdb=False, reset_cui_count=False,
                     epochs=2, lr=None, anneal=None):
    """ Given data learns vector embeddings for concepts in a supervised way.

    data_path:        path to data in json format
    reset_cdb:        start from a brand-new concept database
    reset_cui_count:  reset the count of every CUI that appears in the data
    epochs:           number of passes over the documents
    lr, anneal:       learning-rate parameters forwarded to add_name
    """
    self.train = False
    # BUG FIX: close the data file instead of leaking the handle
    with open(data_path) as data_file:
        data = json.load(data_file)

    if reset_cdb:
        self.cdb = CDB()

    if reset_cui_count:
        # Get all CUIs that appear in the training data
        cuis = []
        for doc in data['documents']:
            for ann in doc['annotations']:
                cuis.append(ann['cui'])
        for cui in set(cuis):
            if cui in self.cdb.cui_count:
                self.cdb.cui_count[cui] = 1

    # BUG FIX: 'epochs' is an int; the original iterated it directly,
    # which raises TypeError. Iterate range(epochs) instead.
    for epoch in range(epochs):
        log.info("Starting epoch: {}".format(epoch))
        for doc in data['documents']:
            spacy_doc = self(doc['text'])
            for ann in doc['annotations']:
                cui = ann['cui']
                start = ann['start']
                end = ann['end']
                deleted = ann['deleted']
                if deleted:
                    # Add negatives only if they exist in the CDB
                    if cui in self.cdb.cui2names:
                        self.add_name(cui=cui,
                                      source_val=ann['value'],
                                      spacy_doc=spacy_doc,
                                      text_inds=[start, end],
                                      negative=deleted,
                                      lr=lr,
                                      anneal=anneal)
                else:
                    self.add_name(cui=cui,
                                  source_val=ann['value'],
                                  spacy_doc=spacy_doc,
                                  text_inds=[start, end],
                                  lr=lr,
                                  anneal=anneal)
def __init__(self, vocab=None, pretrained_cdb=None, word_tokenizer=None):
    """Initialise with an optional vocab, pretrained CDB and word tokenizer."""
    self.vocab = vocab
    self.cdb = pretrained_cdb if pretrained_cdb is not None else CDB()

    # Build the required spacy pipeline
    self.nlp = SpacyPipe(spacy_split_all, disable=['ner', 'parser'])
    self.nlp.add_punct_tagger(
        tagger=partial(spacy_tag_punct, skip_stopwords=self.SKIP_STOPWORDS))

    # Get the tokenizer; default to the internal one
    self.tokenizer = word_tokenizer if word_tokenizer is not None else self._tok
def __init__(self, vocab=None, pretrained_cdb=None, tokenizer=None):
    """Initialise with an optional vocab, pretrained CDB and tokenizer."""
    self.vocab = vocab
    self.cdb = pretrained_cdb if pretrained_cdb is not None else CDB()

    # Build the required spacy pipeline
    self.nlp = SpacyPipe(spacy_split_all, disable=['ner', 'parser'])
    self.nlp.add_punct_tagger(
        tagger=partial(spacy_tag_punct, skip_stopwords=False))

    # Get the tokenizer; default to the internal one
    # (alternative: BertTokenizer.from_pretrained('bert-base-uncased'))
    self.tokenizer = tokenizer if tokenizer is not None else self._tok
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return a CAT for the project, memoising CDB/Vocab/CAT in the given maps
    so repeated calls do not reload model files from disk."""
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)
    if cat_id in CAT_MAP:
        cat = CAT_MAP[cat_id]
    else:
        if cdb_id in CDB_MAP:
            cdb = CDB_MAP[cdb_id]
        else:
            cdb_path = project.concept_db.cdb_file.path
            cdb = CDB.load(cdb_path)
            # BUG FIX: only apply a custom config when the env var points at an
            # existing file; parse_config_file(path=None) fails when it is unset.
            custom_config = os.getenv("MEDCAT_CONFIG_FILE")
            if custom_config is not None and os.path.exists(custom_config):
                cdb.config.parse_config_file(path=custom_config)
            CDB_MAP[cdb_id] = cdb
        if vocab_id in VOCAB_MAP:
            vocab = VOCAB_MAP[vocab_id]
        else:
            vocab_path = project.vocab.vocab_file.path
            vocab = Vocab.load(vocab_path)
            VOCAB_MAP[vocab_id] = vocab
        cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
        CAT_MAP[cat_id] = cat
    return cat
def test_bg_save_and_load_model_context_vectors(self):
    """Round-trip the CDB through disk and check training counts/vectors."""
    self.cdb.save("./tmp_cdb.dat")
    self.cdb2 = CDB.load('./tmp_cdb.dat')
    # NOTE(review): the assertions below inspect self.cdb, not the reloaded
    # self.cdb2 -- confirm whether the loaded copy should be checked instead.
    self.assertEqual(self.cdb.cui2count_train['C0000139'], 2,
                     "Count should equal 2")
    self.assertEqual(
        self.cdb.cui2context_vectors['C0000139']['long'].shape[0], 300,
        "Dimensions should equal 300")
def setUpClass(cls):
    """Build a CDB from the example csvs (plus an empty second CDB) for the
    edit tests."""
    print("Load test database csvs for edit tests")
    cls.config = Config()
    cls.config.general['log_level'] = logging.DEBUG
    cls.maker = CDBMaker(cls.config)
    # BUG FIX: resolve the example csvs relative to this file instead of the
    # current working directory ('../examples/...' only worked when the tests
    # were launched from the tests folder).
    import os
    here = os.path.dirname(os.path.realpath(__file__))
    csvs = [os.path.join(here, '..', 'examples', 'cdb.csv'),
            os.path.join(here, '..', 'examples', 'cdb_2.csv')]
    cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True)
    cls.cdb2 = CDB(cls.config)
def __init__(self, config, cdb=None, name_max_words=20):
    """Create a CDBMaker around the given config and (optional) existing CDB."""
    self.config = config
    # Set log level
    self.log.setLevel(self.config.general['log_level'])
    # To make life a bit easier
    self.cnf_cm = config.cdb_maker

    self.cdb = CDB(config=self.config) if cdb is None else cdb

    # Build the required spacy pipeline
    self.nlp = Pipe(tokenizer=spacy_split_all, config=config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])
def _create_cat(self):
    """
    Loads MedCAT resources and creates CAT instance.

    Raises when the mandatory vocab/CDB env vars are missing or when no
    spacy model can be determined.
    """
    if os.getenv("APP_MODEL_VOCAB_PATH") is None:
        raise ValueError(
            "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")
    if os.getenv("APP_MODEL_CDB_PATH") is None:
        raise Exception(
            "Concept database (env: APP_MODEL_CDB_PATH) not specified")

    # Vocabulary and Concept Database are mandatory
    self.log.debug("Loading VOCAB ...")
    vocab = Vocab.load(os.getenv("APP_MODEL_VOCAB_PATH"))
    self.log.debug("Loading CDB ...")
    cdb = CDB.load(os.getenv("APP_MODEL_CDB_PATH"))

    spacy_model = os.getenv("SPACY_MODEL", "")
    if spacy_model:
        # BUG FIX: was '==' (a no-op comparison); the override was never applied
        cdb.config.general["spacy_model"] = spacy_model
    else:
        logging.warning("SPACY_MODEL environment var not set, \
attempting to load the spacy model found within the CDB : " +
                        cdb.config.general["spacy_model"])
        if cdb.config.general["spacy_model"] == "":
            raise ValueError(
                "No SPACY_MODEL env var declared, the CDB loaded does not have a spacy_model set in the config variable! \
To solve this declare the SPACY_MODEL in the env_medcat file."
            )

    # this is redundant as the config is already in the CDB
    conf = cdb.config

    # Apply CUI filter if provided
    if os.getenv("APP_MODEL_CUI_FILTER_PATH") is not None:
        self.log.debug("Applying CDB CUI filter ...")
        with open(os.getenv("APP_MODEL_CUI_FILTER_PATH")) as cui_file:
            all_lines = (line.rstrip() for line in cui_file)
            selected_cuis = [line for line in all_lines if line]  # filter blank lines
            cdb.filter_by_cui(selected_cuis)

    # Meta-annotation models are optional
    meta_models = []
    if os.getenv("APP_MODEL_META_PATH_LIST") is not None:
        self.log.debug("Loading META annotations ...")
        for model_path in os.getenv("APP_MODEL_META_PATH_LIST").split(":"):
            m = MetaCAT.load(model_path)
            meta_models.append(m)

    cat = CAT(cdb=cdb, config=conf, vocab=vocab, meta_cats=meta_models)
    return cat
def setUpClass(cls):
    """Build a complete MedCAT pipeline (tagger, spell checker, NER, linker)
    around a tiny hand-made CDB and run it over a fixed test sentence."""
    print("Set up CDB")
    cls.config = Config()
    cls.config.general['log_level'] = logging.INFO
    cls.cdb = CDB(config=cls.config)

    print("Set up Vocab")
    vocab_path = "./tmp_vocab.dat"
    # Download the vocabulary only once; reuse the local copy afterwards
    if not os.path.exists(vocab_path):
        tmp = requests.get(
            "https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
        with open(vocab_path, 'wb') as f:
            f.write(tmp.content)
    cls.vocab = Vocab.load(vocab_path)

    print("Set up NLP pipeline")
    cls.nlp = Pipe(tokenizer=spacy_split_all, config=cls.config)
    cls.nlp.add_tagger(tagger=partial(tag_skip_and_punct,
                                      config=cls.config),
                       name='skip_and_punct',
                       additional_fields=['is_punct'])
    cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab,
                                          config=cls.config,
                                          data_vocab=cls.vocab)
    cls.nlp.add_token_normalizer(spell_checker=cls.spell_checker,
                                 config=cls.config)
    cls.ner = NER(cls.cdb, cls.config)
    cls.nlp.add_ner(cls.ner)

    print("Set up Linker")
    cls.link = Linker(cls.cdb, cls.vocab, cls.config)
    cls.nlp.add_linker(cls.link)

    print("Set limits for tokens and uppercase")
    cls.config.ner['max_skip_tokens'] = 1
    cls.config.ner['upper_case_limit_len'] = 4
    cls.config.linking['disamb_length_limit'] = 2

    print("Add concepts")
    # Two names for the same CUI exercise multi-name linking
    cls.cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar', cls.nlp, {}, cls.config))
    cls.cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar viruses', cls.nlp, {},
                                         cls.config))
    cls.cdb.add_names(cui='S-229005',
                      names=prepare_name('CDB', cls.nlp, {}, cls.config))

    print("Add test text")
    cls.text = "CDB - I was running and then Movar Virus attacked and CDb"
    cls.text_post_pipe = cls.nlp(cls.text)
def update_concept_model(concept: Concept, cdb_model: ConceptDB, cdb: CDB):
    """Refresh a Concept row from the data held in the given CDB and save it."""
    cui = concept.cui
    type_ids = list(cdb.cui2type_ids.get(cui, ''))

    concept.pretty_name = cdb.get_name(cui)
    concept.type_ids = ','.join(type_ids)
    concept.semantic_type = ','.join(
        cdb.addl_info['type_id2name'].get(type_id, '') for type_id in type_ids)
    concept.desc = cdb.addl_info['cui2description'].get(cui, '')
    concept.synonyms = ", ".join(
        cdb.addl_info['cui2original_names'].get(cui, []))
    concept.cdb = cdb_model
    concept.save()
def setUpClass(cls):
    """Prepare a CDB built from the bundled example csvs, plus an empty
    second CDB, for the edit tests."""
    print("Load test database csvs for edit tests")
    cls.config = Config()
    cls.config.general['log_level'] = logging.DEBUG
    cls.maker = CDBMaker(cls.config)
    # Resolve example files relative to this test file
    examples_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), '..', 'examples')
    csvs = [os.path.join(examples_dir, name)
            for name in ('cdb.csv', 'cdb_2.csv')]
    cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True)
    cls.cdb2 = CDB(cls.config)
def setUp(self) -> None:
    """Build a small CDB, vocab and full NLP pipeline used by the tests."""
    self.config = Config()
    self.config.general['log_level'] = logging.INFO
    cdb = CDB(config=self.config)

    # BUG FIX: the original constructed an identical Pipe (with the same
    # tagger) twice and discarded the first; build it once and extend it below.
    self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])

    # Add a couple of names
    cdb.add_names(cui='S-229004',
                  names=prepare_name('Movar', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229004',
                  names=prepare_name('Movar viruses', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229005',
                  names=prepare_name('CDB', self.nlp, {}, self.config))
    # Check
    #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

    # Download the vocabulary on first run only
    self.vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(self.vocab_path):
        import requests
        tmp = requests.get(
            "https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
        with open(self.vocab_path, 'wb') as f:
            f.write(tmp.content)
    vocab = Vocab.load(self.vocab_path)

    # Complete the pipeline: spell checking, NER and linking
    spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab,
                                      config=self.config,
                                      data_vocab=vocab)
    self.nlp.add_token_normalizer(spell_checker=spell_checker,
                                  config=self.config)
    ner = NER(cdb, self.config)
    self.nlp.add_ner(ner)

    # Add Linker
    link = Linker(cdb, vocab, self.config)
    self.nlp.add_linker(link)

    self.text = "CDB - I was running and then Movar Virus attacked and CDb"
def __init__(self):
    """Create the MedCAT processor: load CDB and Vocab, then build a CAT."""
    super().__init__()
    self.log.info('Initializing MedCAT processor ...')

    self.app_name = 'MedCAT'
    self.app_lang = 'en'
    self.app_version = MedCatProcessor._get_medcat_version()
    self.app_model = os.getenv("APP_MODEL_NAME", 'unknown')

    self.vocab = Vocab()
    self.cdb = CDB()
    self.cdb.load_dict(
        os.getenv("APP_MODEL_CDB_PATH", '/cat/models/cdb.dat'))
    self.vocab.load_dict(
        path=os.getenv("APP_MODEL_VOCAB_PATH", '/cat/models/vocab.dat'))
    self.cat = CAT(self.cdb, vocab=self.vocab)

    # BUG FIX: os.getenv returns a *string*, so any non-empty value
    # (including "false") was truthy. Parse the flag explicitly.
    self.cat.spacy_cat.train = (
        os.getenv("APP_TRAINING_MODE", "false").lower() in ("true", "1", "yes"))

    self.bulk_nproc = int(os.getenv('APP_BULK_NPROC', 8))
    self.log.info('MedCAT processor is ready')
def test_training_import(self):
    """Training state exported from one CDB must be importable into another."""
    cdb2 = CDB.load('./tmp_cdb.dat')
    self.cdb.reset_training()
    cdb2.reset_training()
    np.random.seed(11)

    # Push random context vectors into the source CDB, two passes per CUI
    cuis = list(self.cdb.cui2names.keys())
    for _ in range(2):
        for cui in cuis:
            vectors = {cntx_type: np.random.rand(300)
                       for cntx_type in self.config.linking['context_vector_sizes']}
            self.cdb.update_context_vector(cui, vectors, negative=False)

    cdb2.import_training(cdb=self.cdb, overwrite=True)
    assert cdb2.cui2context_vectors['C0000139']['long'][7] == \
        self.cdb.cui2context_vectors['C0000139']['long'][7]
    assert cdb2.cui2count_train['C0000139'] == \
        self.cdb.cui2count_train['C0000139']
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return a CAT for the project, memoising CDB/Vocab/CAT in the given maps.

    Raises a descriptive error when a MedCAT v0.x model file is loaded with a
    v1.x MedCAT installation.
    """
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)
    if cat_id in CAT_MAP:
        cat = CAT_MAP[cat_id]
    else:
        if cdb_id in CDB_MAP:
            cdb = CDB_MAP[cdb_id]
        else:
            cdb_path = project.concept_db.cdb_file.path
            try:
                cdb = CDB.load(cdb_path)
            except KeyError as ke:
                # A KeyError during load usually means a pre-1.0 model file;
                # turn it into an actionable message when MedCAT is >= 1.x
                mc_v = pkg_resources.get_distribution('medcat').version
                if int(mc_v.split('.')[0]) > 0:
                    log.error(
                        'Attempted to load MedCAT v0.x model with MCTrainer v1.x'
                    )
                    raise Exception(
                        'Attempted to load MedCAT v0.x model with MCTrainer v1.x',
                        'Please re-configure this project to use a MedCAT v1.x CDB or consult the '
                        'MedCATTrainer Dev team if you believe this should work'
                    ) from ke
                raise
            # Optional config override via env var; only used when it points
            # at an existing file
            custom_config = os.getenv("MEDCAT_CONFIG_FILE")
            if custom_config is not None and os.path.exists(custom_config):
                cdb.config.parse_config_file(path=custom_config)
            else:
                log.info(
                    "No MEDCAT_CONFIG_FILE env var set to valid path, using default config available on CDB"
                )
            CDB_MAP[cdb_id] = cdb
        if vocab_id in VOCAB_MAP:
            vocab = VOCAB_MAP[vocab_id]
        else:
            vocab_path = project.vocab.vocab_file.path
            vocab = Vocab.load(vocab_path)
            VOCAB_MAP[vocab_id] = vocab
        cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
        CAT_MAP[cat_id] = cat
    return cat
def test_concept_similarity(self):
    """most_similar should return exactly the requested number of neighbours."""
    cdb = CDB(config=self.config)
    np.random.seed(11)
    # Populate 500 synthetic concepts spread over 10 type ids, each with a
    # random context vector per configured vector size.
    for idx in range(500):
        cui = "C" + str(idx)
        type_ids = {'T-' + str(idx % 10)}
        cdb.add_concept(cui=cui,
                        names=prepare_name('Name: ' + str(idx),
                                           self.maker.nlp, {}, self.config),
                        ontologies=set(),
                        name_status='P',
                        type_ids=type_ids,
                        description='',
                        full_build=True)
        vectors = {cntx_type: np.random.rand(300)
                   for cntx_type in self.config.linking['context_vector_sizes']}
        cdb.update_context_vector(cui, vectors, negative=False)

    res = cdb.most_similar('C200', 'long', type_id_filter=['T-0'],
                           min_cnt=1, topn=10, force_build=True)
    assert len(res) == 10
def setUpClass(cls) -> None:
    """Load the example CDB/vocab, tune its config and build the CAT under test."""
    examples = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", "examples")
    cls.cdb = CDB.load(os.path.join(examples, "cdb.dat"))
    cls.vocab = Vocab.load(os.path.join(examples, "vocab.dat"))

    # Tune NER / linking behavior for the tests
    cfg = cls.cdb.config
    cfg.ner['min_name_len'] = 2
    cfg.ner['upper_case_limit_len'] = 3
    cfg.general['spell_check'] = True
    cfg.linking['train_count_threshold'] = 10
    cfg.linking['similarity_threshold'] = 0.3
    cfg.linking['train'] = True
    cfg.linking['disamb_length_limit'] = 5
    cfg.general['full_unlink'] = True

    cls.undertest = CAT(cdb=cls.cdb, config=cls.cdb.config, vocab=cls.vocab)
def load_model(self, model_full_tag_name, vocab_input_file_name="vocab.dat",
               cdb_input_file_name="cdb.dat"):
    """ Loads variables of this object
        This is used to search the site-packages models folder for installed models..
    """
    vocab = Vocab.load_model(model_full_tag_name=model_full_tag_name,
                             input_file_name=vocab_input_file_name)
    cdb = CDB.load_model(model_full_tag_name=model_full_tag_name,
                         input_file_name=cdb_input_file_name)

    # Both loaders signal failure by returning False
    if cdb is False or vocab is False:
        log.error("Exiting...")
        sys.exit()

    return CAT(cdb, vocab=vocab)
def import_concepts_from_cdb(cdb_model_id: int):
    """Import concepts from a stored CDB file: create rows for brand-new CUIs
    and refresh rows whose CUI was first imported from a different CDB."""
    from medcat.cdb import CDB

    cdb_model = ConceptDB.objects.get(id=cdb_model_id)
    cdb = CDB.load(cdb_model.cdb_file.path)

    # Get all existing cuis for this CDB
    existing_cuis = set(
        Concept.objects.filter(cdb=cdb_model_id).values_list('cui', flat=True))
    # All cuis across every CDB, used to detect cross-CDB duplicates
    all_cuis = set(Concept.objects.all().values_list('cui', flat=True))

    for cui in cdb.cui2names.keys():
        if cui not in all_cuis:
            concept = Concept()
            concept.cui = cui
            update_concept_model(concept, cdb_model, cdb)
        if cui in all_cuis and cui not in existing_cuis:
            # cui has been added from another CDB. Overwrite here.
            concept = Concept.objects.get(cui=cui)
            update_concept_model(concept, cdb_model, cdb)
def setUpClass(cls) -> None:
    """Set up the CDB, vocab and pipeline components shared by the Pipe tests."""
    cls.config = Config()
    cls.config.general['log_level'] = logging.INFO
    cls.cdb = CDB(config=cls.config)

    # Download the vocabulary only once; reuse the local copy afterwards
    vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(vocab_path):
        tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
        with open(vocab_path, 'wb') as f:
            f.write(tmp.content)
    cls.vocab = Vocab.load(vocab_path)

    cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab,
                                          config=cls.config,
                                          data_vocab=cls.vocab)
    cls.ner = NER(cls.cdb, cls.config)
    cls.linker = Linker(cls.cdb, cls.vocab, cls.config)
    cls.config.ner['max_skip_tokens'] = 1
    cls.config.ner['upper_case_limit_len'] = 4
    cls.config.linking['disamb_length_limit'] = 2
    cls.meta_cat = MetaCAT()
    cls.text = "CDB - I was running and then Movar Virus attacked and CDb"

    # NOTE(review): cls.config is re-created here, so the ner/linking limits
    # set above apply only to the first Config (held by cdb/ner/linker), not
    # to the Pipe under test -- confirm this is intentional.
    cls.config = Config()
    cls.config.general['log_level'] = logging.INFO
    cls.undertest = Pipe(tokenizer=spacy_split_all, config=cls.config)
def _import_concepts(id):
    """Import all CUIs from the stored CDB file that are not yet in the
    Concept table for this concept database."""
    from medcat.cdb import CDB
    concept_db = ConceptDB.objects.get(id=id)
    cdb = CDB.load(concept_db.cdb_file.path)

    # Get all existing cuis for this CDB so they are skipped below
    existing_cuis = set(
        Concept.objects.filter(cdb=id).values_list('cui', flat=True))

    for cui in cdb.cui2names.keys():
        if cui in existing_cuis:
            continue
        type_ids = list(cdb.cui2type_ids.get(cui, ''))
        concept = Concept()
        concept.pretty_name = cdb.cui2preferred_name.get(cui, cui)
        concept.cui = cui
        concept.tui = ','.join(type_ids)
        concept.semantic_type = ','.join(
            cdb.addl_info['type_id2name'].get(tui, '') for tui in type_ids)
        concept.desc = cdb.addl_info['cui2description'].get(cui, '')
        concept.synonyms = ", ".join(
            cdb.addl_info['cui2original_names'].get(cui, []))
        concept.cdb = concept_db
        concept.save()
def _create_cat(self):
    """ Loads MedCAT resources and creates CAT instance """
    if os.getenv("APP_MODEL_VOCAB_PATH") is None:
        raise ValueError(
            "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")
    if os.getenv("APP_MODEL_CDB_PATH") is None:
        raise Exception(
            "Concept database (env: APP_MODEL_CDB_PATH) not specified")

    # Vocabulary and Concept Database are mandatory
    self.log.debug('Loading VOCAB ...')
    vocab = Vocab()
    vocab.load_dict(path=os.getenv("APP_MODEL_VOCAB_PATH"))
    self.log.debug('Loading CDB ...')
    cdb = CDB()
    cdb.load_dict(path=os.getenv("APP_MODEL_CDB_PATH"))

    # Apply CUI filter if provided
    cui_filter_path = os.getenv("APP_MODEL_CUI_FILTER_PATH")
    if cui_filter_path is not None:
        self.log.debug('Applying CDB CUI filter ...')
        with open(cui_filter_path) as cui_file:
            stripped = (line.rstrip() for line in cui_file)
            selected_cuis = [line for line in stripped if line]  # drop blanks
        cdb.filter_by_cui(selected_cuis)

    # Meta-annotation models are optional
    meta_models = []
    meta_path_list = os.getenv("APP_MODEL_META_PATH_LIST")
    if meta_path_list is not None:
        self.log.debug('Loading META annotations ...')
        for model_path in meta_path_list.split(':'):
            m = MetaCAT(save_dir=model_path)
            m.load()
            meta_models.append(m)

    return CAT(cdb=cdb, vocab=vocab, meta_cats=meta_models)
# NOTE(review): 'os' is used below (os.environ) but is not imported in this
# visible import block -- confirm it is imported elsewhere or add 'import os'.
from argparse import ArgumentParser
import pandas as pd
from tqdm import tqdm
import numpy as np
from medcat.cat import CAT
from medcat.utils.vocab import Vocab
from medcat.cdb import CDB

# Load the vocabulary from the path given in the environment
vocab = Vocab()
vocab.load_dict(os.environ["MEDCAT_VOCAB_FILE"])
print("Loaded Vocab")

# Load the cdb model you downloaded
cdb = CDB()
cdb.load_dict(os.environ["MEDCAT_CDB_FILE"])
print("Loaded CDB")

# create cat
cat = CAT(cdb=cdb, vocab=vocab)
# Restrict annotation to these semantic types only
cat.spacy_cat.TUI_FILTER = ['T047', 'T048', 'T184']

tqdm.pandas()

def get_entities(text):
    # NOTE(review): this function appears truncated in this chunk -- it builds
    # relevant_entities but the visible body never appends to or returns it.
    doc = cat.get_entities(text)
    relevant_entities = []
    for ent in doc:
        if "icd10" in ent["info"]:
            ent_string = text[ent["start"]:ent['end']]
def run_cv(cdb_path, data_path, vocab_path, cv=100, nepochs=16,
           test_size=0.1, lr=1, groups=None, **kwargs):
    """Run `cv` supervised-training folds and collect per-CUI metrics.

    Returns (fps, fns, tps, ps, rs, f1s, cui_counts, examples); the first six
    map cui -> list of per-fold values. NOTE: cui_counts and examples keep
    only the values from the last fold (unchanged from the original).
    """
    from medcat.cat import CAT
    from medcat.utils.vocab import Vocab
    from medcat.cdb import CDB
    import json

    use_groups = groups is not None

    f1s, ps, rs = {}, {}, {}
    tps, fns, fps = {}, {}, {}
    cui_counts = {}
    examples = {}
    for i in range(cv):
        # Reload models each fold so training always starts from a clean state
        cdb = CDB()
        cdb.load_dict(cdb_path)
        vocab = Vocab()
        vocab.load_dict(path=vocab_path)
        cat = CAT(cdb, vocab=vocab)
        cat.train = False
        cat.spacy_cat.MIN_ACC = 0.30
        cat.spacy_cat.MIN_ACC_TH = 0.30

        # Add groups if they exist (note: re-reads ./groups.json every fold,
        # replacing the 'groups' argument -- kept as-is)
        if groups is not None:
            for cui in cdb.cui2info.keys():
                if "group" in cdb.cui2info[cui]:
                    del cdb.cui2info[cui]['group']
            groups = json.load(open("./groups.json"))
            for k, v in groups.items():
                for val in v:
                    cat.add_cui_to_group(val, k)

        # BUG FIX: the original hard-coded lr=1 here, silently ignoring the
        # 'lr' parameter; forward the argument instead.
        fp, fn, tp, p, r, f1, cui_counts, examples = cat.train_supervised(
            data_path=data_path, lr=lr, test_size=test_size,
            use_groups=use_groups, nepochs=nepochs, **kwargs)

        # Accumulate per-CUI metrics across folds
        for key in f1.keys():
            f1s.setdefault(key, []).append(f1[key])
            ps.setdefault(key, []).append(p[key])
            rs.setdefault(key, []).append(r[key])
            tps.setdefault(key, []).append(tp.get(key, 0))
            fps.setdefault(key, []).append(fp.get(key, 0))
            fns.setdefault(key, []).append(fn.get(key, 0))

    return fps, fns, tps, ps, rs, f1s, cui_counts, examples