def setUpClass(cls):
    """Build the shared config, CDB, vocab and full NLP pipeline once for the class."""
    print("Set up CDB")
    cls.config = Config()
    cls.config.general['log_level'] = logging.INFO
    cls.cdb = CDB(config=cls.config)

    print("Set up Vocab")
    vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(vocab_path):
        # Download the vocab only once; later runs reuse the cached file.
        tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
        with open(vocab_path, 'wb') as f:
            f.write(tmp.content)
    cls.vocab = Vocab.load(vocab_path)

    print("Set up NLP pipeline")
    cls.nlp = Pipe(tokenizer=spacy_split_all, config=cls.config)
    cls.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=cls.config),
                       name='skip_and_punct',
                       additional_fields=['is_punct'])
    cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab,
                                          config=cls.config,
                                          data_vocab=cls.vocab)
    cls.nlp.add_token_normalizer(spell_checker=cls.spell_checker, config=cls.config)
    cls.ner = NER(cls.cdb, cls.config)
    cls.nlp.add_ner(cls.ner)

    print("Set up Linker")
    cls.link = Linker(cls.cdb, cls.vocab, cls.config)
    cls.nlp.add_linker(cls.link)

    print("Set limits for tokens and uppercase")
    cls.config.ner['max_skip_tokens'] = 1
    cls.config.ner['upper_case_limit_len'] = 4
    cls.config.linking['disamb_length_limit'] = 2

    print("Add concepts")
    cls.cdb.add_names(cui='S-229004', names=prepare_name('Movar', cls.nlp, {}, cls.config))
    cls.cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', cls.nlp, {}, cls.config))
    cls.cdb.add_names(cui='S-229005', names=prepare_name('CDB', cls.nlp, {}, cls.config))

    print("Add test text")
    cls.text = "CDB - I was running and then Movar Virus attacked and CDb"
    cls.text_post_pipe = cls.nlp(cls.text)
def setUp(self) -> None:
    """Create a fresh config, CDB and a fully assembled NLP pipeline for each test."""
    self.config = Config()
    self.config.general['log_level'] = logging.INFO
    cdb = CDB(config=self.config)

    self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])

    # Add a couple of names
    cdb.add_names(cui='S-229004', names=prepare_name('Movar', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229005', names=prepare_name('CDB', self.nlp, {}, self.config))
    # Check
    #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

    self.vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(self.vocab_path):
        # Fetch the vocab lazily; subsequent runs reuse the cached file.
        import requests
        tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
        with open(self.vocab_path, 'wb') as f:
            f.write(tmp.content)
    vocab = Vocab.load(self.vocab_path)

    # Make the pipeline
    self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])
    spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=self.config, data_vocab=vocab)
    self.nlp.add_token_normalizer(spell_checker=spell_checker, config=self.config)
    ner = NER(cdb, self.config)
    self.nlp.add_ner(ner)

    # Add Linker
    link = Linker(cdb, vocab, self.config)
    self.nlp.add_linker(link)

    self.text = "CDB - I was running and then Movar Virus attacked and CDb"
def unlink_concept_name(self, cui, name, preprocessed_name=False):
    r''' Unlink a concept name from the CUI (or all CUIs if full_unlink), removes the link from
    the Concept Database (CDB). As a consequence medcat will never again link the `name`
    to this CUI - meaning the name will not be detected as a concept in the future.

    Args:
        cui (str):
            The CUI from which the `name` will be removed
        name (str):
            The span of text to be removed from the linking dictionary
    Examples:
        >>> # To never again link C0020538 to HTN
        >>> cat.unlink_concept_name('C0020538', 'htn', False)
    '''
    cuis = [cui]
    if preprocessed_name:
        names = {name: 'nothing'}
    else:
        names = prepare_name(name, self, {}, self.config)

    # If full unlink find all CUIs
    if self.config.general.get('full_unlink', False):
        # Loop variable renamed so it does not shadow the `name` parameter.
        for prepared_name in names:
            cuis.extend(self.cdb.name2cuis.get(prepared_name, []))

    # Remove name from all CUIs
    for target_cui in cuis:
        self.cdb.remove_names(cui=target_cui, names=names)
def test_name_addition(self):
    """Adding a primary name must register it in every CDB lookup table."""
    new_names = prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config)
    self.cdb.add_names(cui='C0000239',
                       names=new_names,
                       name_status='P',
                       full_build=True)
    assert self.cdb.addl_info['cui2original_names']['C0000239'] == {'MY: new,-_! Name.', 'Second csv'}
    assert 'my:newname.' in self.cdb.name2cuis
    assert 'my:new' in self.cdb.snames
    assert 'my:newname.' in self.cdb.name2cuis2status
    assert self.cdb.name2cuis2status['my:newname.'] == {'C0000239': 'P'}
def test_bb_removal_of_name(self):
    """Removing a name must shrink name2cuis and drop its status entry."""
    removed_names = prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config)
    self.cdb.remove_names(cui='C0000239', names=removed_names)
    self.assertEqual(len(self.cdb.name2cuis), 5, "Should equal 5")
    self.assertNotIn('my:newname.', self.cdb.name2cuis2status)
def test_ba_addition_of_new_name(self):
    """Adding a new primary name must extend all lookup tables consistently."""
    self.cdb.add_names(cui='C0000239',
                       names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config),
                       name_status='P',
                       full_build=True)
    self.assertEqual(len(self.cdb.name2cuis), 6, "Should equal 6")
    target_result = {'MY: new,-_! Name.', 'Second csv'}
    self.assertEqual(self.cdb.addl_info['cui2original_names']['C0000239'], target_result)
    self.assertIn('my~:~new~name~.', self.cdb.name2cuis)
    self.assertIn('my~:~new', self.cdb.snames)
    self.assertIn('my~:~new~name~.', self.cdb.name2cuis2status)
def test_for_linker(self):
    """End-to-end smoke test of the ContextModel train/similarity/disambiguate cycle."""
    self.config = Config()
    self.config.general['log_level'] = logging.DEBUG
    cdb = CDB(config=self.config)

    # Add a couple of names
    cdb.add_names(cui='S-229004', names=prepare_name('Movar', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229005', names=prepare_name('CDB', self.nlp, {}, self.config))
    cdb.add_names(cui='S-2290045', names=prepare_name('Movar', self.nlp, {}, self.config))
    # Check
    #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}}

    # Give every concept random context vectors so training has something to update.
    cuis = list(cdb.cui2names.keys())
    for cui in cuis[0:50]:
        vectors = {
            'short': np.random.rand(300),
            'long': np.random.rand(300),
            'medium': np.random.rand(300),
        }
        cdb.update_context_vector(cui, vectors, negative=False)

    d = self.nlp(self.text)
    vocab = Vocab.load(self.vocab_path)
    cm = ContextModel(cdb, vocab, self.config)
    cm.train_using_negative_sampling('S-229004')
    self.config.linking['train_count_threshold'] = 0

    cm.train('S-229004', d._.ents[1], d)
    cm.similarity('S-229004', d._.ents[1], d)
    cm.disambiguate(['S-2290045', 'S-229004'], d._.ents[1], 'movar', d)
def test_concept_similarity(self):
    """most_similar must return `topn` hits once enough concepts with vectors exist."""
    cdb = CDB(config=self.config)
    np.random.seed(11)  # deterministic vectors so the similarity ranking is stable
    for i in range(500):
        cui = "C" + str(i)
        type_ids = {'T-' + str(i % 10)}
        cdb.add_concept(cui=cui,
                        names=prepare_name('Name: ' + str(i), self.maker.nlp, {}, self.config),
                        ontologies=set(),
                        name_status='P',
                        type_ids=type_ids,
                        description='',
                        full_build=True)
        vectors = {}
        for cntx_type in self.config.linking['context_vector_sizes']:
            vectors[cntx_type] = np.random.rand(300)
        cdb.update_context_vector(cui, vectors, negative=False)

    res = cdb.most_similar('C200', 'long', type_id_filter=['T-0'],
                           min_cnt=1, topn=10, force_build=True)
    assert len(res) == 10
def add_and_train_concept(self, cui, name, spacy_doc=None, spacy_entity=None, ontologies=None,
                          name_status='A', type_ids=None, description='', full_build=True,
                          negative=False, devalue_others=False, do_add_concept=True):
    r''' Add a name to an existing concept, or add a new concept, or do not do anything
    if the name and concept already exist. Perform training if spacy_entity and spacy_doc are set.

    Args:
        cui (str):
            CUI of the concept
        name (str):
            Name to be linked to the concept (in the case of MedCATtrainer this is simply the
            selected value in text, no preprocessing or anything needed).
        spacy_doc (spacy.tokens.Doc):
            Spacy represenation of the document that was manually annotated.
        spacy_entity (List[spacy.tokens.Token]):
            Given the spacy document, this is the annotated span of text -
            list of annotated tokens that are marked with this CUI.
        ontologies (Set[str], optional):
            Ontologies for the concept; a fresh empty set when omitted.
        type_ids (Set[str], optional):
            Semantic type ids for the concept; a fresh empty set when omitted.
        negative (bool):
            Is this a negative or positive example.
        devalue_others:
            If set, cuis to which this name is assigned and are not `cui` will receive negative training
            given that negative=False.

        **other:
            Refer to CDB.add_concept
    '''
    # Use None sentinels and create fresh sets per call: a `set()` default argument
    # is evaluated once at definition time and shared (and mutable) across all calls.
    ontologies = set() if ontologies is None else ontologies
    type_ids = set() if type_ids is None else type_ids

    names = prepare_name(name, self, {}, self.config)

    if do_add_concept:
        self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status,
                             type_ids=type_ids, description=description, full_build=full_build)

    if spacy_entity is not None and spacy_doc is not None:
        # Train Linking
        self.linker.context_model.train(cui=cui, entity=spacy_entity, doc=spacy_doc,
                                        negative=negative, names=names)

        if not negative and devalue_others:
            # Find all cuis sharing any of these names
            cuis = set()
            for prepared_name in names:
                cuis.update(self.cdb.name2cuis.get(prepared_name, []))
            # Remove the cui for which we just added positive training
            if cui in cuis:
                cuis.remove(cui)
            # Add negative training for all other CUIs that link to these names
            for _cui in cuis:
                self.linker.context_model.train(cui=_cui, entity=spacy_entity,
                                                doc=spacy_doc, negative=True)
def test_name_removal(self):
    """Removing a name twice must be idempotent and leave the CDB consistent."""
    self.cdb.remove_names(cui='C0000239',
                          names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config))
    # Run again to make sure it does not break anything
    self.cdb.remove_names(cui='C0000239',
                          names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config))
    assert len(self.cdb.name2cuis) == 5
    assert 'my:newname.' not in self.cdb.name2cuis2status
# Sanity checks on overall CDB table sizes
assert len(cdb.name2cuis) == 5
assert len(cdb.cui2tags) == 3
assert len(cdb.cui2preferred_name) == 2
assert len(cdb.cui2context_vectors) == 3
assert len(cdb.cui2count_train) == 3

# Per-concept metadata for C0000039
assert cdb.name2cuis2status['virus']['C0000039'] == 'P'
assert cdb.cui2type_ids['C0000039'] == {'T234', 'T109', 'T123'}
expected_originals = {'Virus', 'Virus K', 'Virus M', 'Virus Z'}
assert cdb.addl_info['cui2original_names']['C0000039'] == expected_originals
assert cdb.addl_info['cui2description']['C0000039'].startswith("Synthetic")

# Test name addition
from medcat.preprocessing.cleaners import prepare_name
cdb.add_names(cui='C0000239',
              names=prepare_name('MY: new,-_! Name.', maker.nlp, {}, config),
              name_status='P',
              full_build=True)
assert cdb.addl_info['cui2original_names']['C0000239'] == {'MY: new,-_! Name.', 'Second csv'}
assert 'my:newname.' in cdb.name2cuis
assert 'my:new' in cdb.snames
assert 'my:newname.' in cdb.name2cuis2status
assert cdb.name2cuis2status['my:newname.'] == {'C0000239': 'P'}

# Test name removal
cdb.remove_names(cui='C0000239',
                 names=prepare_name('MY: new,-_! Name.', maker.nlp, {}, config))
# Run again to make sure it does not break anything
def prepare_csvs(self, csv_paths, sep=',', encoding=None, escapechar=None, index_col=False,
                 full_build=False, only_existing_cuis=False, **kwargs):
    r''' Compile one or multiple CSVs into a CDB.

    Args:
        csv_paths (`List[str]`):
            An array of paths to the csv files that should be processed
        full_build (`bool`, defaults to `True`):
            If False only the core portions of the CDB will be built (the ones required for
            the functioning of MedCAT). If True, everything will be added to the CDB - this
            usually includes concept descriptions, various forms of names etc (take care that
            this option produces a much larger CDB).
        sep (`str`, defaults to `,`):
            If necessary a custom separator for the csv files
        encoding (`str`, optional):
            Encoding to be used for reading the CSV file
        escapechar (`str`, optional):
            Escape char for the CSV
        index_col (`bool`, defaults_to `False`):
            Index column for pandas read_csv
        only_existing_cuis (`bool`, defaults to False):
            If True no new CUIs will be added, but only linked names will be extended. Mainly used
            when enriching names of a CDB (e.g. SNOMED with UMLS terms).

    Return:
        `medcat.cdb.CDB` with the new concepts added.

    Note:
        **kwargs:
            Will be passed to pandas for CSV reading
        csv:
            Examples of the CSV used to make the CDB can be found on [GitHub](link)
    '''
    useful_columns = ['cui', 'name', 'ontologies', 'name_status', 'type_ids', 'description']
    name_status_options = {'A', 'P', 'N'}

    for csv_path in csv_paths:
        # Read CSV, everything is converted to strings
        df = pandas.read_csv(csv_path, sep=sep, encoding=encoding, escapechar=escapechar,
                             index_col=index_col, dtype=str, **kwargs)
        df = df.fillna('')

        # Find which columns to use from the CSV (case/whitespace-insensitive match)
        cols = []
        col2ind = {}
        for col in list(df.columns):
            normalized = str(col).lower().strip()
            if normalized in useful_columns:
                col2ind[normalized] = len(cols)
                cols.append(col)

        self.log.info("Started importing concepts from: {}".format(csv_path))
        _time = None  # Used to check speed
        _logging_freq = np.ceil(len(df[cols]) / 100)
        for row_id, row in enumerate(df[cols].values):
            if row_id % _logging_freq == 0:
                # Print some stats
                if _time is None:
                    # Add last time if it does not exist
                    _time = datetime.datetime.now()
                # Get current time
                ctime = datetime.datetime.now()
                # Get time difference
                timediff = ctime - _time
                self.log.info("Current progress: {:.0f}% at {:.3f}s per {} rows".format(
                    (row_id / len(df)) * 100,
                    timediff.microseconds / 10**6 + timediff.seconds,
                    (len(df[cols]) // 100)))
                # Set previous time to current time
                _time = ctime

            # This must exist
            cui = row[col2ind['cui']].strip().upper()

            # `not A or (A and B)` simplified to the equivalent `not A or B`
            if not only_existing_cuis or cui in self.cdb.cui2names:
                if 'ontologies' in col2ind:
                    ontologies = set([ontology.strip() for ontology in
                                      row[col2ind['ontologies']].upper().split(self.cnf_cm['multi_separator'])
                                      if len(ontology.strip()) > 0])
                else:
                    ontologies = set()

                if 'name_status' in col2ind:
                    name_status = row[col2ind['name_status']].strip().upper()
                    # Must be allowed
                    if name_status not in name_status_options:
                        name_status = 'A'
                else:
                    # Defaults to A - meaning automatic
                    name_status = 'A'

                if 'type_ids' in col2ind:
                    type_ids = set([type_id.strip() for type_id in
                                    row[col2ind['type_ids']].upper().split(self.cnf_cm['multi_separator'])
                                    if len(type_id.strip()) > 0])
                else:
                    type_ids = set()

                # Get the ones that do not need any changing
                if 'description' in col2ind:
                    description = row[col2ind['description']].strip()
                else:
                    description = ""

                # We can have multiple versions of a name
                names = {}  # {'name': {'tokens': [<str>], 'snames': [<str>]}}
                raw_names = [raw_name.strip() for raw_name in
                             row[col2ind['name']].split(self.cnf_cm['multi_separator'])
                             if len(raw_name.strip()) > 0]
                for raw_name in raw_names:
                    raw_name = raw_name.strip()
                    prepare_name(raw_name, self.nlp, names, self.config)

                    if self.config.cdb_maker.get('remove_parenthesis', 0) > 0 and name_status == 'P':
                        # Should we remove the content in parenthesis from primary names and add them also
                        raw_name = PH_REMOVE.sub(" ", raw_name).strip()
                        if len(raw_name) >= self.config.cdb_maker['remove_parenthesis']:
                            prepare_name(raw_name, self.nlp, names, self.config)

                self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status,
                                     type_ids=type_ids, description=description, full_build=full_build)
                # DEBUG
                self.log.debug("\n\n**** Added\n CUI: {}\n Names: {}\n Ontologies: {}\n Name status: {}\n".format(cui, names, ontologies, name_status) +
                               " Type IDs: {}\n Description: {}\n Is full build: {}".format(
                                   type_ids, description, full_build))

    return self.cdb
import logging
import os

import requests

from medcat.cdb import CDB

config = Config()
config.general['log_level'] = logging.INFO
cdb = CDB(config=config)

nlp = Pipe(tokenizer=spacy_split_all, config=config)
nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config),
               name='skip_and_punct',
               additional_fields=['is_punct'])

# Add a couple of names
cdb.add_names(cui='S-229004', names=prepare_name('Movar', nlp, {}, config))
cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', nlp, {}, config))
cdb.add_names(cui='S-229005', names=prepare_name('CDB', nlp, {}, config))
# Check
#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
    # Download the vocab once; `requests` is already imported at the top.
    tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
    with open(vocab_path, 'wb') as f:
        f.write(tmp.content)
vocab = Vocab.load(vocab_path)
# Make the pipeline