def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return the CAT instance for ``project``, building and caching it if needed.

    The three mappings act as caches and are mutated in place:

    :param CDB_MAP: cache of loaded CDB objects keyed by ``project.concept_db.id``
    :param VOCAB_MAP: cache of loaded Vocab objects keyed by ``project.vocab.id``
    :param CAT_MAP: cache of CAT objects keyed by ``"<cdb id>-<vocab id>"``
    :param project: object exposing ``concept_db`` and ``vocab`` attributes
    :return: the cached or newly created CAT instance
    """
    concept_db_key = project.concept_db.id
    vocab_key = project.vocab.id
    cache_key = "-".join((str(concept_db_key), str(vocab_key)))

    # Fast path: a CAT for this CDB/vocab pair was already assembled.
    if cache_key in CAT_MAP:
        return CAT_MAP[cache_key]

    if concept_db_key in CDB_MAP:
        concept_db = CDB_MAP[concept_db_key]
    else:
        concept_db_file = project.concept_db.cdb_file.path
        concept_db = CDB()
        concept_db.load_dict(concept_db_file)
        CDB_MAP[concept_db_key] = concept_db

    if vocab_key in VOCAB_MAP:
        vocabulary = VOCAB_MAP[vocab_key]
    else:
        vocabulary_file = project.vocab.vocab_file.path
        vocabulary = Vocab()
        vocabulary.load_dict(vocabulary_file)
        VOCAB_MAP[vocab_key] = vocabulary

    annotator = CAT(cdb=concept_db, vocab=vocabulary)
    # Newly built instances have their ``train`` attribute switched off.
    annotator.train = False
    CAT_MAP[cache_key] = annotator
    return annotator
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return the CAT instance for ``project``, building and caching it if needed.

    The three mappings act as caches and are mutated in place:

    :param CDB_MAP: cache of loaded CDB objects keyed by ``project.concept_db.id``
    :param VOCAB_MAP: cache of loaded Vocab objects keyed by ``project.vocab.id``
    :param CAT_MAP: cache of CAT objects keyed by ``"<cdb id>-<vocab id>"``
    :param project: object exposing ``concept_db`` and ``vocab`` attributes
    :return: the cached or newly created CAT instance
    """
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    if cat_id in CAT_MAP:
        return CAT_MAP[cat_id]

    if cdb_id in CDB_MAP:
        cdb = CDB_MAP[cdb_id]
    else:
        cdb_path = project.concept_db.cdb_file.path
        cdb = CDB.load(cdb_path)
        # Only apply a custom config when MEDCAT_CONFIG_FILE is actually set;
        # previously an unset variable passed path=None to parse_config_file.
        # A path that is set but invalid is still handed over so that the
        # misconfiguration surfaces as an error.
        custom_config = os.getenv("MEDCAT_CONFIG_FILE")
        if custom_config:
            cdb.config.parse_config_file(path=custom_config)
        CDB_MAP[cdb_id] = cdb

    if vocab_id in VOCAB_MAP:
        vocab = VOCAB_MAP[vocab_id]
    else:
        vocab_path = project.vocab.vocab_file.path
        vocab = Vocab.load(vocab_path)
        VOCAB_MAP[vocab_id] = vocab

    cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
    CAT_MAP[cat_id] = cat
    return cat
def _create_cat(self):
    """
    Loads MedCAT resources and creates CAT instance

    Configuration is read from environment variables:

    - APP_MODEL_VOCAB_PATH (required): vocabulary file
    - APP_MODEL_CDB_PATH (required): concept database file
    - SPACY_MODEL (optional): overrides the spacy model named in the CDB config
    - APP_MODEL_CUI_FILTER_PATH (optional): text file with one CUI per line
    - APP_MODEL_META_PATH_LIST (optional): ':'-separated MetaCAT model paths

    :return: the assembled CAT instance
    :raises ValueError: when the vocabulary path is not set, or when neither
        SPACY_MODEL nor the CDB config names a spacy model
    :raises Exception: when the concept database path is not set
    """
    vocab_path = os.getenv("APP_MODEL_VOCAB_PATH")
    if vocab_path is None:
        raise ValueError(
            "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")

    cdb_path = os.getenv("APP_MODEL_CDB_PATH")
    if cdb_path is None:
        raise Exception(
            "Concept database (env: APP_MODEL_CDB_PATH) not specified")

    # Vocabulary and Concept Database are mandatory
    self.log.debug("Loading VOCAB ...")
    vocab = Vocab.load(vocab_path)

    self.log.debug("Loading CDB ...")
    cdb = CDB.load(cdb_path)

    spacy_model = os.getenv("SPACY_MODEL", "")
    if spacy_model:
        # Assign the override (the previous code used `==`, a no-op comparison,
        # so the environment variable never took effect).
        cdb.config.general["spacy_model"] = spacy_model
    else:
        cdb_spacy_model = cdb.config.general["spacy_model"]
        # Lazy %-formatting also copes with a non-string config value.
        logging.warning(
            "SPACY_MODEL environment var not set, "
            "attempting to load the spacy model found within the CDB : %s",
            cdb_spacy_model)
        if not cdb_spacy_model:
            raise ValueError(
                "No SPACY_MODEL env var declared, the CDB loaded does not have a spacy_model set in the config variable! "
                "To solve this declare the SPACY_MODEL in the env_medcat file."
            )

    # this is redundant as the config is already in the CDB
    conf = cdb.config

    # Apply CUI filter if provided
    cui_filter_path = os.getenv("APP_MODEL_CUI_FILTER_PATH")
    if cui_filter_path is not None:
        self.log.debug("Applying CDB CUI filter ...")
        with open(cui_filter_path) as cui_file:
            all_lines = (line.rstrip() for line in cui_file)
            selected_cuis = [line for line in all_lines if line]  # filter blank lines
            cdb.filter_by_cui(selected_cuis)

    # Meta-annotation models are optional
    meta_models = []
    meta_path_list = os.getenv("APP_MODEL_META_PATH_LIST")
    if meta_path_list is not None:
        self.log.debug("Loading META annotations ...")
        for model_path in meta_path_list.split(":"):
            meta_models.append(MetaCAT.load(model_path))

    cat = CAT(cdb=cdb, config=conf, vocab=vocab, meta_cats=meta_models)
    return cat
def __init__(self):
    """
    Initializes the processor: loads the CDB and vocabulary and creates CAT.

    Configuration is read from environment variables:

    - APP_MODEL_NAME: reported model name (default 'unknown')
    - APP_MODEL_CDB_PATH: CDB file (default '/cat/models/cdb.dat')
    - APP_MODEL_VOCAB_PATH: vocabulary file (default '/cat/models/vocab.dat')
    - APP_TRAINING_MODE: training flag; enabled only for the values
      '1', 'true', 'yes', 'y' or 'on' (case-insensitive)
    - APP_BULK_NPROC: number of processes for bulk processing (default 8)
    """
    super().__init__()

    self.log.info('Initializing MedCAT processor ...')

    self.app_name = 'MedCAT'
    self.app_lang = 'en'
    self.app_version = MedCatProcessor._get_medcat_version()
    self.app_model = os.getenv("APP_MODEL_NAME", 'unknown')

    self.vocab = Vocab()
    self.cdb = CDB()

    self.cdb.load_dict(
        os.getenv("APP_MODEL_CDB_PATH", '/cat/models/cdb.dat'))
    self.vocab.load_dict(
        path=os.getenv("APP_MODEL_VOCAB_PATH", '/cat/models/vocab.dat'))
    self.cat = CAT(self.cdb, vocab=self.vocab)

    # Environment values are strings, so "False" or "0" used to be truthy and
    # switched training on; interpret the flag explicitly instead.
    training_flag = os.getenv("APP_TRAINING_MODE", "")
    self.cat.spacy_cat.train = training_flag.strip().lower() in (
        "1", "true", "yes", "y", "on")

    self.bulk_nproc = int(os.getenv('APP_BULK_NPROC', 8))

    self.log.info('MedCAT processor is ready')
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return the CAT instance for ``project``, building and caching it if needed.

    The three mappings act as caches and are mutated in place. When the CDB
    has to be loaded, the file named by the MEDCAT_CONFIG_FILE environment
    variable is parsed into its config if that path exists.

    :param CDB_MAP: cache of loaded CDB objects keyed by ``project.concept_db.id``
    :param VOCAB_MAP: cache of loaded Vocab objects keyed by ``project.vocab.id``
    :param CAT_MAP: cache of CAT objects keyed by ``"<cdb id>-<vocab id>"``
    :param project: object exposing ``concept_db`` and ``vocab`` attributes
    :return: the cached or newly created CAT instance
    :raises Exception: when loading the CDB fails with a KeyError and the
        installed medcat major version is greater than 0
    :raises KeyError: re-raised unchanged for any other KeyError during loading
    """
    concept_db_key = project.concept_db.id
    vocab_key = project.vocab.id
    cache_key = "-".join((str(concept_db_key), str(vocab_key)))

    # Fast path: a CAT for this CDB/vocab pair was already assembled.
    if cache_key in CAT_MAP:
        return CAT_MAP[cache_key]

    if concept_db_key in CDB_MAP:
        concept_db = CDB_MAP[concept_db_key]
    else:
        concept_db_file = project.concept_db.cdb_file.path
        try:
            concept_db = CDB.load(concept_db_file)
        except KeyError as load_error:
            installed_version = pkg_resources.get_distribution('medcat').version
            major_version = int(installed_version.split('.')[0])
            if major_version <= 0:
                raise
            log.error(
                'Attempted to load MedCAT v0.x model with MCTrainer v1.x'
            )
            raise Exception(
                'Attempted to load MedCAT v0.x model with MCTrainer v1.x',
                'Please re-configure this project to use a MedCAT v1.x CDB or consult the '
                'MedCATTrainer Dev team if you believe this should work'
            ) from load_error

        config_override = os.getenv("MEDCAT_CONFIG_FILE")
        if config_override is None or not os.path.exists(config_override):
            log.info(
                "No MEDCAT_CONFIG_FILE env var set to valid path, using default config available on CDB"
            )
        else:
            concept_db.config.parse_config_file(path=config_override)
        CDB_MAP[concept_db_key] = concept_db

    if vocab_key in VOCAB_MAP:
        vocabulary = VOCAB_MAP[vocab_key]
    else:
        vocabulary_file = project.vocab.vocab_file.path
        vocabulary = Vocab.load(vocabulary_file)
        VOCAB_MAP[vocab_key] = vocabulary

    annotator = CAT(cdb=concept_db, config=concept_db.config, vocab=vocabulary)
    CAT_MAP[cache_key] = annotator
    return annotator
def setUpClass(cls) -> None:
    """Load the example CDB and vocabulary, tune the config and build the CAT under test.

    The model files are read from the ``examples`` directory that sits next
    to the parent of this file's directory.
    """
    examples_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", "examples")
    cls.cdb = CDB.load(os.path.join(examples_dir, "cdb.dat"))
    cls.vocab = Vocab.load(os.path.join(examples_dir, "vocab.dat"))

    # (config section, key, value) overrides applied on top of the CDB config.
    config_overrides = (
        ("ner", "min_name_len", 2),
        ("ner", "upper_case_limit_len", 3),
        ("general", "spell_check", True),
        ("linking", "train_count_threshold", 10),
        ("linking", "similarity_threshold", 0.3),
        ("linking", "train", True),
        ("linking", "disamb_length_limit", 5),
        ("general", "full_unlink", True),
    )
    for section_name, option, value in config_overrides:
        getattr(cls.cdb.config, section_name)[option] = value

    cls.undertest = CAT(cdb=cls.cdb, config=cls.cdb.config, vocab=cls.vocab)
def _create_cat(self):
    """
    Loads MedCAT resources and creates CAT instance

    Reads APP_MODEL_VOCAB_PATH and APP_MODEL_CDB_PATH (both required) and the
    optional APP_MODEL_CUI_FILTER_PATH and APP_MODEL_META_PATH_LIST
    environment variables.

    :return: the assembled CAT instance
    :raises ValueError: when the vocabulary path is not set
    :raises Exception: when the concept database path is not set
    """
    vocab_location = os.getenv("APP_MODEL_VOCAB_PATH")
    if vocab_location is None:
        raise ValueError(
            "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")

    cdb_location = os.getenv("APP_MODEL_CDB_PATH")
    if cdb_location is None:
        raise Exception(
            "Concept database (env: APP_MODEL_CDB_PATH) not specified")

    # Both mandatory resources are known to be configured from here on.
    self.log.debug('Loading VOCAB ...')
    vocab = Vocab()
    vocab.load_dict(path=vocab_location)

    self.log.debug('Loading CDB ...')
    cdb = CDB()
    cdb.load_dict(path=cdb_location)

    # Optional: restrict the CDB to the CUIs listed one per line in a file.
    cui_filter_location = os.getenv("APP_MODEL_CUI_FILTER_PATH")
    if cui_filter_location is not None:
        self.log.debug('Applying CDB CUI filter ...')
        with open(cui_filter_location) as cui_file:
            # Trailing whitespace is stripped and blank lines are dropped.
            selected_cuis = [
                stripped for stripped in map(str.rstrip, cui_file) if stripped
            ]
            cdb.filter_by_cui(selected_cuis)

    # Optional: meta-annotation models, one per ':'-separated path.
    meta_models = []
    meta_path_list = os.getenv("APP_MODEL_META_PATH_LIST")
    if meta_path_list is not None:
        self.log.debug('Loading META annotations ...')
        for model_path in meta_path_list.split(':'):
            meta_model = MetaCAT(save_dir=model_path)
            meta_model.load()
            meta_models.append(meta_model)

    return CAT(cdb=cdb, vocab=vocab, meta_cats=meta_models)
from medcat.cat import CAT from medcat.utils.vocab import Vocab from medcat.cdb import CDB vocab = Vocab() vocab.load_dict(os.environ["MEDCAT_VOCAB_FILE"]) print("Loaded Vocab") # Load the cdb model you downloaded cdb = CDB() cdb.load_dict(os.environ["MEDCAT_CDB_FILE"]) print("Loaded CDB") # create cat cat = CAT(cdb=cdb, vocab=vocab) cat.spacy_cat.TUI_FILTER = ['T047', 'T048', 'T184'] tqdm.pandas() def get_entities(text) : doc = cat.get_entities(text) relevant_entities = [] for ent in doc : if "icd10" in ent["info"] : ent_string = text[ent["start"]:ent['end']] if ent_string.lower() in ["ms", "mr", "mrs"] : continue cui = ent["cui"] icd_codes = tuple(sorted([x["chapter"] for x in ent["info"]["icd10"]])) if "R69" in icd_codes:
def run_cv(cdb_path, data_path, vocab_path, cv=100, nepochs=16, test_size=0.1, lr=1, groups=None, **kwargs):
    """Run ``cv`` independent rounds of supervised training and collect metrics.

    Every round loads a fresh CDB and vocabulary, builds a CAT and calls
    ``cat.train_supervised``; the per-key metrics of each round are appended
    to lists.

    :param cdb_path: path passed to ``CDB.load_dict``
    :param data_path: path passed to ``train_supervised`` as ``data_path``
    :param vocab_path: path passed to ``Vocab.load_dict``
    :param cv: number of rounds
    :param nepochs: forwarded to ``train_supervised``
    :param test_size: forwarded to ``train_supervised``
    :param lr: forwarded to ``train_supervised``
    :param groups: when not None, grouping is enabled; the group definitions
        themselves are read from ``./groups.json`` (mapping of group name to a
        list of CUIs), not from this argument
    :param kwargs: forwarded to ``train_supervised``
    :return: tuple ``(fps, fns, tps, ps, rs, f1s, cui_counts, examples)``; the
        first six map each key of the per-round ``f1`` dict to a list with one
        entry per round, ``cui_counts`` and ``examples`` are those returned by
        the last round (empty dicts when ``cv`` is 0)
    """
    from medcat.cat import CAT
    from medcat.utils.vocab import Vocab
    from medcat.cdb import CDB
    import json

    use_groups = groups is not None

    f1s = {}
    ps = {}
    rs = {}
    tps = {}
    fns = {}
    fps = {}
    cui_counts = {}
    examples = {}

    for _ in range(cv):
        cdb = CDB()
        cdb.load_dict(cdb_path)
        vocab = Vocab()
        vocab.load_dict(path=vocab_path)
        cat = CAT(cdb, vocab=vocab)
        cat.train = False
        cat.spacy_cat.MIN_ACC = 0.30
        cat.spacy_cat.MIN_ACC_TH = 0.30

        # Add groups if they exist
        if use_groups:
            # Drop any group stored in the CDB before applying the new ones.
            for cui in cdb.cui2info.keys():
                if "group" in cdb.cui2info[cui]:
                    del cdb.cui2info[cui]['group']
            # Context manager closes the file (it used to be leaked each round).
            with open("./groups.json") as groups_file:
                group_map = json.load(groups_file)
            for group_name, cuis in group_map.items():
                for cui in cuis:
                    cat.add_cui_to_group(cui, group_name)

        # `lr` is now forwarded; it used to be hard-coded to 1 here, which
        # silently ignored the caller's value.
        fp, fn, tp, p, r, f1, cui_counts, examples = cat.train_supervised(
            data_path=data_path, lr=lr, test_size=test_size,
            use_groups=use_groups, nepochs=nepochs, **kwargs)

        for key in f1.keys():
            f1s.setdefault(key, []).append(f1[key])
            ps.setdefault(key, []).append(p[key])
            rs.setdefault(key, []).append(r[key])
            tps.setdefault(key, []).append(tp.get(key, 0))
            fps.setdefault(key, []).append(fp.get(key, 0))
            fns.setdefault(key, []).append(fn.get(key, 0))

    return fps, fns, tps, ps, rs, f1s, cui_counts, examples
def run_cv(cdb_path, data_path, vocab_path, cv=100, nepochs=16, reset_cui_count=True, test_size=0.1):
    """Run ``cv`` independent rounds of supervised training and collect metrics.

    Every round loads a fresh CDB and vocabulary, builds a CAT and calls
    ``cat.train_supervised``; the per-key metrics of each round are appended
    to lists.

    :param cdb_path: path passed to ``CDB.load_dict``
    :param data_path: path passed to ``train_supervised`` as ``data_path``
    :param vocab_path: path passed to ``Vocab.load_dict``
    :param cv: number of rounds
    :param nepochs: forwarded to ``train_supervised``
    :param reset_cui_count: forwarded to ``train_supervised``
    :param test_size: forwarded to ``train_supervised``
    :return: tuple ``(fps, fns, tps, ps, rs, f1s, cui_counts)``; the first six
        map each key of the per-round ``f1`` dict to a list with one entry per
        round, ``cui_counts`` is the one returned by the last round (an empty
        dict when ``cv`` is 0)
    """
    from medcat.cat import CAT
    from medcat.utils.vocab import Vocab
    from medcat.cdb import CDB
    import json

    f1s, ps, rs = {}, {}, {}
    tps, fns, fps = {}, {}, {}
    cui_counts = {}

    for _round in range(cv):
        # Fresh resources for every round.
        concept_db = CDB()
        concept_db.load_dict(cdb_path)
        vocabulary = Vocab()
        vocabulary.load_dict(path=vocab_path)

        cat = CAT(concept_db, vocab=vocabulary)
        cat.train = False
        cat.spacy_cat.MIN_ACC = 0.30
        cat.spacy_cat.MIN_ACC_TH = 0.30

        fp, fn, tp, p, r, f1, cui_counts = cat.train_supervised(
            data_path=data_path,
            lr=1,
            nepochs=nepochs,
            anneal=True,
            print_stats=True,
            use_filters=True,
            reset_cui_count=reset_cui_count,
            terminate_last=True,
            test_size=test_size)

        # Append this round's value for every key reported in f1; counts that
        # are missing for a key are recorded as 0.
        for key in f1.keys():
            f1s.setdefault(key, []).append(f1[key])
            ps.setdefault(key, []).append(p[key])
            rs.setdefault(key, []).append(r[key])
            tps.setdefault(key, []).append(tp.get(key, 0))
            fps.setdefault(key, []).append(fp.get(key, 0))
            fns.setdefault(key, []).append(fn.get(key, 0))

    return fps, fns, tps, ps, rs, f1s, cui_counts
class MedCatProcessor(NlpProcessor):
    """
    MedCAT Processor class is wrapper over MedCAT that implements annotations extractions functionality
    (both single and bulk processing) that can be easily exposed for an API.
    """

    def __init__(self):
        """
        Initializes the processor: loads the CDB and vocabulary and creates CAT.

        Configuration is read from environment variables:

        - APP_MODEL_NAME: reported model name (default 'unknown')
        - APP_MODEL_CDB_PATH: CDB file (default '/cat/models/cdb.dat')
        - APP_MODEL_VOCAB_PATH: vocabulary file (default '/cat/models/vocab.dat')
        - APP_TRAINING_MODE: training flag; enabled only for the values
          '1', 'true', 'yes', 'y' or 'on' (case-insensitive)
        - APP_BULK_NPROC: number of processes for bulk processing (default 8)
        """
        super().__init__()

        self.log.info('Initializing MedCAT processor ...')

        self.app_name = 'MedCAT'
        self.app_lang = 'en'
        self.app_version = MedCatProcessor._get_medcat_version()
        self.app_model = os.getenv("APP_MODEL_NAME", 'unknown')

        self.vocab = Vocab()
        self.cdb = CDB()

        self.cdb.load_dict(
            os.getenv("APP_MODEL_CDB_PATH", '/cat/models/cdb.dat'))
        self.vocab.load_dict(
            path=os.getenv("APP_MODEL_VOCAB_PATH", '/cat/models/vocab.dat'))
        self.cat = CAT(self.cdb, vocab=self.vocab)

        # Environment values are strings, so "False" or "0" used to be truthy
        # and switched training on; interpret the flag explicitly instead.
        training_flag = os.getenv("APP_TRAINING_MODE", "")
        self.cat.spacy_cat.train = training_flag.strip().lower() in (
            "1", "true", "yes", "y", "on")

        self.bulk_nproc = int(os.getenv('APP_BULK_NPROC', 8))

        self.log.info('MedCAT processor is ready')

    def get_app_info(self):
        """
        Returns general information about the application
        :return: application information stored as KVPs
        """
        return {
            'name': self.app_name,
            'language': self.app_lang,
            'version': self.app_version,
            'model': self.app_model
        }

    def process_content(self, content):
        """
        Processes a single document extracting the annotations.
        :param content: document to be processed, containing 'text' field.
        :return: processing result containing document with extracted annotations stored as KVPs.
                 NOTE: when the 'text' field is missing a tuple (result, False) is returned
                 instead of the bare result dict.
        """
        if 'text' not in content:
            error_msg = "'text' field missing in the payload content."
            nlp_result = {
                'success': False,
                'errors': [error_msg],
                'timestamp': NlpProcessor._get_timestamp()
            }
            return nlp_result, False

        text = content['text']

        # assume that a blank document is a valid document and process it only
        # when it contains any non-blank characters
        if text is not None and len(text.strip()) > 0:
            entities = self.cat.get_entities(text)
        else:
            entities = []

        nlp_result = {
            'text': text,
            'annotations': entities,
            'success': True,
            'timestamp': NlpProcessor._get_timestamp()
        }

        # append the footer
        if 'footer' in content:
            nlp_result['footer'] = content['footer']

        return nlp_result

    def process_content_bulk(self, content):
        """
        Processes an array of documents extracting the annotations.
        :param content: array of documents to be processed, each containing 'text' field.
        :return: generator of processing results containing documents with extracted
                 annotations, stored as KVPs.
        """
        # process at least 10 docs per thread and don't bother with starting
        # additional threads when less documents were provided
        min_doc_per_thread = 10
        num_slices = max(1, int(len(content) / min_doc_per_thread))
        batch_size = min(300, num_slices)

        if batch_size >= self.bulk_nproc:
            nproc = self.bulk_nproc
        else:
            batch_size = min_doc_per_thread
            nproc = max(1, num_slices)
            # one extra process for the documents left over after slicing
            if len(content) > batch_size * nproc:
                nproc += 1

        # use generators both to provide input documents and to provide resulting annotations
        # to avoid too many mem-copies
        invalid_doc_ids = []
        ann_res = self.cat.multi_processing(
            MedCatProcessor._generate_input_doc(content, invalid_doc_ids),
            nproc=nproc,
            batch_size=batch_size)

        return MedCatProcessor._generate_result(content, ann_res, invalid_doc_ids)

    # helper generator functions to avoid multiple copies of data
    #
    @staticmethod
    def _generate_input_doc(documents, invalid_doc_idx):
        """
        Generator function returning documents to be processed as a list of tuples:
          (idx, text), (idx, text), ...

        Skips empty documents and reports their ids to the invalid_doc_idx array
        :param documents: array of input documents that contain 'text' field
        :param invalid_doc_idx: array that will contain invalid document idx
        :return: consecutive tuples of (idx, document)
        """
        for i, document in enumerate(documents):
            text = document.get('text') if 'text' in document else None
            # assume the document to be processed only when it is not blank
            if text is not None and len(text.strip()) > 0:
                yield i, text
            else:
                invalid_doc_idx.append(i)

    @staticmethod
    def _generate_result(in_documents, annotations, invalid_doc_idx):
        """
        Generator function merging the resulting annotations with the input documents.
        The result for documents that were invalid will not contain any annotations.
        :param in_documents: array of input documents that contain 'text' field
        :param annotations: iterable of (idx, result) pairs extracted from documents, where
                            result holds the "text" and "entities" keys
        :param invalid_doc_idx: array of invalid document idx
        :return: consecutive result dicts, valid documents first, then the invalid ones
        """
        # generate output for valid annotations
        for res in annotations:
            res_idx = res[0]
            in_ct = in_documents[res_idx]

            # parse the result
            out_res = {
                'text': res[1]["text"],
                'annotations': res[1]["entities"],
                'success': True,
                'timestamp': NlpProcessor._get_timestamp()
            }
            # append the footer
            if 'footer' in in_ct:
                out_res['footer'] = in_ct['footer']
            yield out_res

        # generate output for invalid documents
        for i in invalid_doc_idx:
            in_ct = in_documents[i]

            out_res = {
                # documents without a 'text' key are reported as invalid by
                # _generate_input_doc, so the key may be absent here
                'text': in_ct.get("text"),
                'annotations': [],
                'success': True,
                'timestamp': NlpProcessor._get_timestamp()
            }
            # append the footer
            if 'footer' in in_ct:
                out_res['footer'] = in_ct['footer']
            yield out_res

    @staticmethod
    def _get_medcat_version():
        """
        Returns the version string of the MedCAT module as reported by pip
        :return: the value of the 'Version' line printed by `pip show medcat`
        :raises Exception: when the version cannot be determined
        """
        try:
            import subprocess
            result = subprocess.check_output(['pip', 'show', 'medcat'],
                                             universal_newlines=True)
            version_line = list(
                filter(lambda v: 'Version' in v, result.split('\n')))
            return version_line[0].split(' ')[1]
        except Exception as e:
            # keep the original failure attached as the cause for debugging
            raise Exception("Cannot read the MedCAT library version") from e
vocab_url = os.getenv('VOCAB_URL') urlretrieve(vocab_url, vocab_path) if not os.path.exists(cdb_path): cdb_url = os.getenv('CDB_URL') print("*" * 399) print(cdb_url) urlretrieve(cdb_url, cdb_path) vocab = Vocab() vocab.load_dict(vocab_path) cdb = CDB() cdb.load_dict(cdb_path) mc_negated = MetaCAT(save_dir=neg_path) mc_negated.load() cat = CAT(cdb=cdb, vocab=vocab, meta_cats=[mc_negated]) cat.spacy_cat.MIN_ACC = 0.30 cat.spacy_cat.MIN_ACC_TH = 0.30 cat.spacy_cat.ACC_ALWAYS = True except Exception as e: print(str(e)) def get_html_and_json(text): doc = cat(text) a = json.loads(cat.get_json(text)) for i in range(len(a['entities'])): ent = a['entities'][i] new_ent = {}
from medcat.cat import CAT

# Download the vocabulary once and cache it next to the script.
vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
    import requests
    tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat",
                       timeout=60)
    # Fail on an HTTP error instead of saving the error body as the
    # vocabulary file, which would then be reused on every later run.
    tmp.raise_for_status()
    with open(vocab_path, 'wb') as f:
        f.write(tmp.content)

config = Config()
cdb = CDB.load("./tmp_cdb.dat", config=config)
vocab = Vocab.load(vocab_path)
# Start from a clean state before fine-tuning.
cdb.reset_training()

cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)

# NER settings
cat.config.ner['min_name_len'] = 3
cat.config.ner['upper_case_limit_len'] = 3

# Linking settings
cat.config.linking['disamb_length_limit'] = 3
cat.config.linking['filters'] = {'cuis': set()}
cat.config.linking['train_count_threshold'] = -1
cat.config.linking['context_vector_sizes'] = {'xlong': 27, 'long': 18, 'medium': 9, 'short': 3}
cat.config.linking['context_vector_weights'] = {'xlong': 0, 'long': 0.4, 'medium': 0.4, 'short': 0.2}
# Decays quadratically with `step` and is floored at 0.1.
cat.config.linking['weighted_average_function'] = lambda step: max(0.1, 1-(step**2*0.0004))
cat.config.linking['similarity_threshold_type'] = 'dynamic'
cat.config.linking['similarity_threshold'] = 0.35
cat.config.linking['calculate_dynamic_threshold'] = True

cat.train(df.text.values, fine_tune=True)
cdb = CDB()
# An existing CDB could be loaded here instead:
# cdb.load_dict(os.path.join(medcat_path, 'simple_cdb.csv'))

# Build a custom CDB from one or more .csv files.
preparator = PrepareCDB(vocab=vocab)
# Other inputs, e.g. 'simple_cdb.csv', can be added to this list.
csv_paths = [os.path.join(medcat_path, 'attention_cdb.csv')]
cdb = preparator.prepare_csvs(csv_paths)

# Persist the freshly built CDB so it can be reused later.
cdb_output_path = os.path.join(medcat_path, 'simple_cdb.cdb')
cdb.save_dict(cdb_output_path)

# Annotate a single document.
doc = "My simple document with kidney failure"
cat = CAT(cdb=cdb, vocab=vocab)
cat.train = False
doc_spacy = cat(doc)
# The entities are available in doc_spacy._.ents

# The same annotations as JSON.
doc_json = cat.get_json(doc)

# Visual inspection of the results.
from spacy import displacy
# Only the longest entities are displayed, not every entity found.
displacy.serve(doc_spacy, style='ent')

# Annotating many documents at once.
data = []  # [(<doc_id>, <text>), (<doc_id>, <text>), ...]
docs = cat.multi_processing(data)