    def test_concept_similarity(self):
        cdb = CDB(config=self.config)
        np.random.seed(11)
        for i in range(500):
            cui = "C" + str(i)
            type_ids = {'T-' + str(i % 10)}
            cdb.add_concept(cui=cui, names=prepare_name('Name: ' + str(i), self.maker.nlp, {}, self.config),
                            ontologies=set(), name_status='P', type_ids=type_ids, description='', full_build=True)

            vectors = {}
            for cntx_type in self.config.linking['context_vector_sizes']:
                vectors[cntx_type] = np.random.rand(300)
            cdb.update_context_vector(cui, vectors, negative=False)

        res = cdb.most_similar('C200', 'long', type_id_filter=['T-0'], min_cnt=1, topn=10, force_build=True)
        assert len(res) == 10

class PrepareCDB(object):
    """ Prepares CDB data in csv format for annotations; after
    everything is done the result is in the cdb field.
    """
    SEPARATOR = ""
    NAME_SEPARATOR = "|"
    CONCEPT_LENGTH_LIMIT = 20
    SKIP_STOPWORDS = False
    VERSIONS = ['CLEAN', 'RAW']

    def __init__(self, vocab=None, pretrained_cdb=None, word_tokenizer=None):
        self.vocab = vocab
        if pretrained_cdb is None:
            self.cdb = CDB()
        else:
            self.cdb = pretrained_cdb

        # Build the required spacy pipeline
        self.nlp = SpacyPipe(spacy_split_all, disable=['ner', 'parser'])
        self.nlp.add_punct_tagger(tagger=partial(spacy_tag_punct,
                                                 skip_stopwords=self.SKIP_STOPWORDS))

        # Get the tokenizer
        if word_tokenizer is not None:
            self.tokenizer = word_tokenizer
        else:
            self.tokenizer = self._tok

    def _tok(self, text):
        return [text]

    def prepare_csvs(self, csv_paths, sep=',', encoding=None, escapechar=None,
                     only_existing=False, add_cleaner=None, only_new=False):
        """ Compile one or multiple CSVs into an internal CDB class

        csv_paths:  an array of paths to the csv files that should be processed
        sep:  if necessary, a custom separator for the csv files

        return:  Compiled CDB class
        """
        _new_cuis = set()
        for csv_path in csv_paths:
            df = pandas.read_csv(csv_path, sep=sep, encoding=encoding, escapechar=escapechar)
            cols = list(df.columns)
            str_ind = cols.index('str')
            cui_ind = cols.index('cui')
            tui_ind = -1
            if 'tui' in cols:
                tui_ind = cols.index('tui')
            tty_ind = -1
            if 'tty' in cols:
                tty_ind = cols.index('tty')
            desc_ind = -1
            if 'desc' in cols:
                desc_ind = cols.index('desc')
            onto_ind = -1
            if 'onto' in cols:
                onto_ind = cols.index('onto')
            is_unique_ind = -1
            if 'is_unique' in cols:
                is_unique_ind = cols.index('is_unique')
            examples_ind = -1
            if 'examples' in cols:
                examples_ind = cols.index('examples')

            for ind in range(len(df)):
                names = str(df.iat[ind, str_ind]).split(self.NAME_SEPARATOR)
                if ind % 10000 == 0:
                    print("Done: {}".format(ind))
                for _name in names:
                    skip_raw = False
                    for version in self.VERSIONS:
                        # Get the cui
                        cui = str(df.iat[ind, cui_ind])

                        if only_new:
                            # Add only new concepts, skip existing ones
                            #_tmp_name = clean_name(_name).lower().replace(" ", "")
                            if (cui in self.cdb.cui2names and cui not in _new_cuis): #and _tmp_name in self.cdb.name2cui:
                                continue
                            else:
                                if cui not in self.cdb.cui2names:
                                    _new_cuis.add(cui)

                        if (version == "RAW" and skip_raw) or \
                           (only_existing and cui not in self.cdb.cui2names):
                            continue

                        # Save originals
                        pretty_name = _name
                        original_name = _name
                        name = _name

                        if version == "CLEAN" and add_cleaner is not None:
                            name = add_cleaner(name)

                        name = clean_name(name)

                        # Clean and preprocess the name
                        sc_name = self.nlp(name)
                        if version == 'CLEAN':
                            tokens = [str(t.lemma_).lower() for t in sc_name
                                      if not t._.is_punct and not t._.to_skip]
                        elif version == 'RAW':
                            tokens = [str(t.lower_) for t in sc_name
                                      if not t._.is_punct and not t._.to_skip]
                        tokens_vocab = [t.lower_ for t in sc_name if not t._.is_punct]

                        # Don't allow concept names to be above concept_length_limit
                        if len(tokens) > self.CONCEPT_LENGTH_LIMIT:
                            continue

                        name = self.SEPARATOR.join(tokens)
                        tmp_name = "".join(tokens)
                        if add_cleaner is None and name == self.SEPARATOR.join(tokens_vocab):
                            # Both names are the same, skip the raw version
                            skip_raw = True

                        is_pref_name = False
                        if 'tty' in df.columns:
                            _tmp = str(df.iat[ind, tty_ind])
                            if _tmp.lower().strip() == 'pn':
                                is_pref_name = True

                        # Skip concepts that are digits or where every token is a single letter
                        length_one = [True if len(x) < 2 else False for x in tokens]
                        if tmp_name.isdigit() or all(length_one):
                            continue

                        # Create snames of the name
                        snames = []
                        sname = ""
                        for token in tokens:
                            sname = sname + token + self.SEPARATOR
                            snames.append(sname.strip())

                        # Check is_unique
                        is_unique = None
                        if 'is_unique' in df.columns:
                            _tmp = str(df.iat[ind, is_unique_ind]).strip()
                            if _tmp.lower().strip() == '0':
                                is_unique = False
                            elif _tmp.lower().strip() == '1':
                                is_unique = True

                        # Get the ontology: 'sab' in UMLS
                        onto = 'default'
                        if 'onto' in df.columns:
                            # Get the ontology
                            onto = str(df.iat[ind, onto_ind])

                        # Get the tui
                        tui = None
                        if 'tui' in df.columns:
                            _tui = str(df.iat[ind, tui_ind]).strip()
                            if len(_tui) > 0 and _tui != "nan":
                                tui = _tui
                                # TODO: If there are multiple tuis just take the first one
                                if len(tui.split(',')) > 1:
                                    tui = tui.split(',')[0]

                        # Get the concept description
                        desc = None
                        if 'desc' in df.columns:
                            _desc = str(df.iat[ind, desc_ind]).strip()
                            if len(_desc) > 0:
                                desc = _desc

                        # Add the concept
                        self.cdb.add_concept(cui, name, onto, tokens, snames,
                                             tui=tui, pretty_name=pretty_name,
                                             tokens_vocab=tokens_vocab, is_unique=is_unique,
                                             desc=desc, original_name=original_name,
                                             is_pref_name=is_pref_name)

                        # Process examples if we have them
                        examples = []
                        if 'examples' in df.columns:
                            tmp = str(df.iat[ind, examples_ind]).strip().split(self.NAME_SEPARATOR)
                            for example in tmp:
                                example = example.strip()
                                if len(example) > 0:
                                    examples.append(example)

                        # If we have examples, build an averaged context vector from each one
                        for example in examples:
                            doc = self.nlp(example)
                            cntx = []
                            for word in doc:
                                if not word._.to_skip:
                                    for w in self.tokenizer(word.lower_):
                                        if w in self.vocab and self.vocab.vec(w) is not None:
                                            cntx.append(self.vocab.vec(w))
                            if len(cntx) > 1:
                                cntx = np.average(cntx, axis=0)
                                self.cdb.add_context_vec(cui, cntx, cntx_type='MED')

        return self.cdb

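# Illustration only (not part of the original module): a minimal sketch of the CSV layout this
# class reads, inferred from the column lookups above. Only 'cui' and 'str' are required;
# 'tui', 'tty', 'desc', 'onto', 'is_unique' and 'examples' are optional, and multiple names or
# examples in one cell are joined with NAME_SEPARATOR ('|'). The file name, CSV row and vocab
# object below are hypothetical.
#
#   cui,str,tui,onto,desc,examples
#   C0004096,asthma|bronchial asthma,T047,SNOMED,A chronic airway disease,patient has asthma|history of asthma
#
# example_prep = PrepareCDB(vocab=my_vocab)
# example_cdb = example_prep.prepare_csvs(['concepts.csv'], only_new=True)
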
class CDBMaker(object):
    r''' Given a CSV as shown in https://github.com/CogStack/MedCAT/tree/master/examples/<example>
    it creates a CDB.

    Args:
        cdb (`medcat.cdb.CDB`, optional):
            If set, the `CDBMaker` will update the existing `CDB` with new concepts from the CSV.
    '''
    def __init__(self, cdb=None, vocab=None, word_tokenizer=None):
        if cdb is None:
            self.cdb = CDB()
        else:
            self.cdb = cdb

        # Vocab and word tokenizer are needed when building context vectors from descriptions
        self.vocab = vocab
        if word_tokenizer is not None:
            self.tokenizer = word_tokenizer
        else:
            self.tokenizer = lambda text: [text]

        # Build the required spacy pipeline
        self.nlp = SpacyPipe(spacy_split_all)
        self.nlp.add_punct_tagger(tagger=partial(spacy_tag_punct, skip_stopwords=False))

    def prepare_csvs(self, csv_paths, sep=','):
        r''' Compile one or multiple CSVs into a CDB.

        Args:
            csv_paths (`List[str]`):
                An array of paths to the csv files that should be processed
            sep (`str`, optional, defaults to `,`):
                If necessary, a custom separator for the csv files

        Return:
            The compiled `CDB`.
        '''
        for csv_path in csv_paths:
            df = pandas.read_csv(csv_path, sep=sep)

            for ind in range(len(df)):
                names = str(df.iloc[ind]['str']).split("||")
                for _name in names:
                    if ind % 10000 == 0:
                        print("Done: {}".format(ind))

                    # Save originals
                    pretty_name = _name
                    original_name = _name
                    name = clean_umls(_name)

                    # Clean and preprocess the name
                    doc = self.nlp(name)
                    tokens = [str(t.lemma_).lower() for t in doc
                              if not t._.is_punct and not t._.to_skip]

                    # Don't allow concept names to be above concept_length_limit
                    if len(tokens) > CONCEPT_LENGTH_LIMIT:
                        continue

                    isupper = False
                    if len(doc) == 1:
                        if doc[0].is_upper and len(doc[0]) > 1:
                            isupper = True

                    name = SEPARATOR.join(tokens)
                    _name = "".join(tokens)
                    length_one = [True if len(x) < 2 else False for x in tokens]

                    # Skip concepts that are digits or where every token is a single letter
                    if _name.isdigit() or all(length_one):
                        continue

                    # Create snames of the name
                    snames = []
                    sname = ""
                    for token in tokens:
                        sname = sname + token + SEPARATOR
                        snames.append(sname.strip())

                    # Check if this is the preferred name; it is if the column "TTY" equals "PN"
                    is_pref_name = False
                    if 'tty' in df.columns:
                        _tmp = str(df.iloc[ind]['tty'])
                        if _tmp.lower().strip() == 'pn':
                            is_pref_name = True

                    onto = 'default'
                    if 'sab' in df.columns:
                        # Get the ontology
                        onto = df.iloc[ind]['sab']

                    # Get the cui
                    cui = df.iloc[ind]['cui']

                    # Get the tui
                    tui = None
                    if 'tui' in df.columns:
                        tui = str(df.iloc[ind]['tui'])
                        # TODO: If there are multiple tuis just take the first one
                        if len(tui.split(',')) > 1:
                            tui = tui.split(',')[0]

                    desc = None
                    if 'def' in df.columns:
                        tmp = str(df.iloc[ind]['def']).strip()
                        if len(tmp) > 0:
                            desc = tmp

                    self.cdb.add_concept(cui, name, onto, tokens, snames,
                                         isupper=isupper, is_pref_name=is_pref_name, tui=tui,
                                         pretty_name=pretty_name, desc=desc)

                    # If we had a desc we can also add context vectors
                    if desc is not None:
                        doc = self.nlp(clean_def(desc))
                        cntx = []
                        for word in doc:
                            if not word._.to_skip:
                                for w in self.tokenizer(word.lower_):
                                    if w in self.vocab and self.vocab.vec(w) is not None:
                                        cntx.append(self.vocab.vec(w))
                        if len(cntx) > 1:
                            cntx = np.average(cntx, axis=0)
                            self.cdb.add_context_vec(cui, cntx, cntx_type='LONG')

                            # Increase the cui count because we added the context
                            if cui in self.cdb.cui_count:
                                self.cdb.cui_count[cui] += 1
                            else:
                                self.cdb.cui_count[cui] = 1

        return self.cdb

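# Illustration only (not part of the original module): how the `snames` (sub-name) loop above
# behaves. Assuming SEPARATOR is a single space (its actual value is defined elsewhere in the
# module), the name "chronic kidney disease" yields every cumulative prefix of the tokenized
# name; these prefixes let the detector grow a candidate span token by token.
#
# tokens = ['chronic', 'kidney', 'disease']
# snames = []
# sname = ""
# for token in tokens:
#     sname = sname + token + " "
#     snames.append(sname.strip())
# # snames == ['chronic', 'chronic kidney', 'chronic kidney disease']
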
cdb2.import_training(cdb=cdb, overwrite=True)
assert cdb2.cui2context_vectors['C0000139']['long'][7] == cdb.cui2context_vectors['C0000139']['long'][7]
assert cdb2.cui2count_train['C0000139'] == cdb.cui2count_train['C0000139']

# Test concept similarity
cdb = CDB(config=config)
np.random.seed(11)
for i in range(500):
    cui = "C" + str(i)
    type_ids = {'T-' + str(i % 10)}
    cdb.add_concept(cui=cui, names=prepare_name('Name: ' + str(i), maker.nlp, {}, config),
                    ontologies=set(), name_status='P', type_ids=type_ids, description='', full_build=True)

    vectors = {}
    for cntx_type in config.linking['context_vector_sizes']:
        vectors[cntx_type] = np.random.rand(300)
    cdb.update_context_vector(cui, vectors, negative=False)

res = cdb.most_similar('C200', 'long', type_id_filter=['T-0'], min_cnt=1, topn=10, force_build=True)
assert len(res) == 10

class CDBMaker(object):
    r''' Given a CSV as shown in https://github.com/CogStack/MedCAT/tree/master/examples/<example>
    it creates a CDB or updates an existing one.

    Args:
        config (`medcat.config.Config`):
            Global config for MedCAT.
        cdb (`medcat.cdb.CDB`, optional):
            If set, the `CDBMaker` will update the existing `CDB` with new concepts from the CSV.
        name_max_words (`int`, defaults to `20`):
            Names with more words will be skipped during the build of a CDB.
    '''
    log = logging.getLogger(__package__)
    log = add_handlers(log)

    def __init__(self, config, cdb=None, name_max_words=20):
        self.config = config

        # Set log level
        self.log.setLevel(self.config.general['log_level'])

        # To make life a bit easier
        self.cnf_cm = config.cdb_maker

        if cdb is None:
            self.cdb = CDB(config=self.config)
        else:
            self.cdb = cdb

        # Build the required spacy pipeline
        self.nlp = Pipe(tokenizer=spacy_split_all, config=config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])

    def prepare_csvs(self, csv_paths, sep=',', encoding=None, escapechar=None, index_col=False,
                     full_build=False, only_existing_cuis=False, **kwargs):
        r''' Compile one or multiple CSVs into a CDB.

        Args:
            csv_paths (`List[str]`):
                An array of paths to the csv files that should be processed.
            full_build (`bool`, defaults to `False`):
                If False only the core portions of the CDB will be built (the ones required for
                the functioning of MedCAT). If True, everything will be added to the CDB - this
                usually includes concept descriptions, various forms of names etc (take care that
                this option produces a much larger CDB).
            sep (`str`, defaults to `,`):
                If necessary, a custom separator for the csv files.
            encoding (`str`, optional):
                Encoding to be used for reading the CSV file.
            escapechar (`str`, optional):
                Escape char for the CSV.
            index_col (`bool`, defaults to `False`):
                Index column for pandas read_csv.
            only_existing_cuis (`bool`, defaults to `False`):
                If True no new CUIs will be added, but only linked names will be extended. Mainly
                used when enriching names of a CDB (e.g. SNOMED with UMLS terms).

        Return:
            `medcat.cdb.CDB` with the new concepts added.

        Note:
            **kwargs:
                Will be passed to pandas for CSV reading.
            csv:
                Examples of the CSV used to make the CDB can be found on [GitHub](link)
        '''
        useful_columns = ['cui', 'name', 'ontologies', 'name_status', 'type_ids', 'description']
        name_status_options = {'A', 'P', 'N'}

        for csv_path in csv_paths:
            # Read CSV, everything is converted to strings
            df = pandas.read_csv(csv_path, sep=sep, encoding=encoding, escapechar=escapechar,
                                 index_col=index_col, dtype=str, **kwargs)
            df = df.fillna('')

            # Find which columns to use from the CSV
            cols = []
            col2ind = {}
            for col in list(df.columns):
                if str(col).lower().strip() in useful_columns:
                    col2ind[str(col).lower().strip()] = len(cols)
                    cols.append(col)

            self.log.info("Started importing concepts from: {}".format(csv_path))
            _time = None  # Used to check speed
            _logging_freq = np.ceil(len(df[cols]) / 100)
            for row_id, row in enumerate(df[cols].values):
                if row_id % _logging_freq == 0:
                    # Print some stats
                    if _time is None:
                        # Add last time if it does not exist
                        _time = datetime.datetime.now()
                    # Get current time
                    ctime = datetime.datetime.now()
                    # Get time difference
                    timediff = ctime - _time
                    self.log.info("Current progress: {:.0f}% at {:.3f}s per {} rows".format(
                        (row_id / len(df)) * 100,
                        timediff.microseconds / 10**6 + timediff.seconds,
                        (len(df[cols]) // 100)))
                    # Set previous time to current time
                    _time = ctime

                # This must exist
                cui = row[col2ind['cui']].strip().upper()

                if not only_existing_cuis or (only_existing_cuis and cui in self.cdb.cui2names):
                    if 'ontologies' in col2ind:
                        ontologies = set([ontology.strip() for ontology in
                                          row[col2ind['ontologies']].upper().split(self.cnf_cm['multi_separator'])
                                          if len(ontology.strip()) > 0])
                    else:
                        ontologies = set()

                    if 'name_status' in col2ind:
                        name_status = row[col2ind['name_status']].strip().upper()

                        # Must be allowed
                        if name_status not in name_status_options:
                            name_status = 'A'
                    else:
                        # Defaults to A - meaning automatic
                        name_status = 'A'

                    if 'type_ids' in col2ind:
                        type_ids = set([type_id.strip() for type_id in
                                        row[col2ind['type_ids']].upper().split(self.cnf_cm['multi_separator'])
                                        if len(type_id.strip()) > 0])
                    else:
                        type_ids = set()

                    # Get the ones that do not need any changing
                    if 'description' in col2ind:
                        description = row[col2ind['description']].strip()
                    else:
                        description = ""

                    # We can have multiple versions of a name
                    names = {}  # {'name': {'tokens': [<str>], 'snames': [<str>]}}

                    raw_names = [raw_name.strip() for raw_name in
                                 row[col2ind['name']].split(self.cnf_cm['multi_separator'])
                                 if len(raw_name.strip()) > 0]
                    for raw_name in raw_names:
                        raw_name = raw_name.strip()
                        prepare_name(raw_name, self.nlp, names, self.config)

                        if self.config.cdb_maker.get('remove_parenthesis', 0) > 0 and name_status == 'P':
                            # Should we remove the content in parenthesis from primary names and also add them
                            raw_name = PH_REMOVE.sub(" ", raw_name).strip()
                            if len(raw_name) >= self.config.cdb_maker['remove_parenthesis']:
                                prepare_name(raw_name, self.nlp, names, self.config)

                    self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status,
                                         type_ids=type_ids, description=description, full_build=full_build)
                    # DEBUG
                    self.log.debug("\n\n**** Added\n CUI: {}\n Names: {}\n Ontologies: {}\n Name status: {}\n".format(
                        cui, names, ontologies, name_status) + \
                        " Type IDs: {}\n Description: {}\n Is full build: {}".format(
                            type_ids, description, full_build))

        return self.cdb

    def destroy_pipe(self):
        self.nlp.destroy()

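# Illustration only (not part of the original module): a minimal sketch of how this config-based
# CDBMaker is typically driven, assuming the medcat.config.Config / medcat.cdb_maker module layout
# shown here; the CSV path is hypothetical. The CSV needs at least 'cui' and 'name' columns, with
# 'ontologies', 'name_status', 'type_ids' and 'description' optional; multiple values in one cell
# are joined by config.cdb_maker['multi_separator'] (usually '|').
#
# from medcat.config import Config
# from medcat.cdb_maker import CDBMaker
#
# config = Config()
# maker = CDBMaker(config)
# cdb = maker.prepare_csvs(['concepts.csv'], full_build=True)
# maker.destroy_pipe()
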
class PrepareCDB(object):
    """ Prepares CDB data in csv format for annotations; after
    everything is done the result is in the cdb field.
    """
    SEPARATOR = ""
    NAME_SEPARATOR = "|"
    CONCEPT_LENGTH_LIMIT = 8
    SKIP_STOPWORDS = True

    def __init__(self, vocab=None, pretrained_cdb=None, word_tokenizer=None):
        self.vocab = vocab
        if pretrained_cdb is None:
            self.cdb = CDB()
        else:
            self.cdb = pretrained_cdb

        # Build the required spacy pipeline
        self.nlp = SpacyPipe(spacy_split_all, disable=['ner', 'parser'])
        self.nlp.add_punct_tagger(tagger=partial(spacy_tag_punct,
                                                 skip_stopwords=self.SKIP_STOPWORDS))

        # Get the tokenizer
        if word_tokenizer is not None:
            self.tokenizer = word_tokenizer
        else:
            self.tokenizer = self._tok

    def _tok(self, text):
        return [text]

    def prepare_csvs(self, csv_paths, sep=','):
        """ Compile one or multiple CSVs into an internal CDB class

        csv_paths:  an array of paths to the csv files that should be processed
        sep:  if necessary, a custom separator for the csv files

        return:  Compiled CDB class
        """
        for csv_path in csv_paths:
            df = pandas.read_csv(csv_path, sep=sep)

            for ind in range(len(df)):
                names = str(df.iloc[ind]['str']).split(self.NAME_SEPARATOR)
                for _name in names:
                    if ind % 10000 == 0:
                        print("Done: {}".format(ind))
                    pretty_name = _name
                    name = clean_name(_name)

                    # Clean and preprocess the name
                    sc_name = self.nlp(name)
                    tokens = [str(t.lemma_).lower() for t in sc_name
                              if not t._.is_punct and not t._.to_skip]
                    tokens_vocab = [t.lower_ for t in sc_name if not t._.is_punct]

                    # Don't allow concept names to be above concept_length_limit
                    if len(tokens) > self.CONCEPT_LENGTH_LIMIT:
                        continue

                    name = self.SEPARATOR.join(tokens)
                    _name = "".join(tokens)
                    length_one = [True if len(x) < 2 else False for x in tokens]

                    # Skip concepts that are digits or where every token is a single letter
                    if _name.isdigit() or all(length_one):
                        continue

                    # Create snames of the name
                    snames = []
                    sname = ""
                    for token in tokens:
                        sname = sname + token + self.SEPARATOR
                        snames.append(sname.strip())

                    # Check is unique
                    unique = True
                    if 'unique' in df.columns:
                        _tmp = str(df.iloc[ind]['unique']).strip()
                        if _tmp.lower().strip() == '0':
                            unique = False

                    onto = 'default'
                    if 'sab' in df.columns:
                        # Get the ontology
                        onto = df.iloc[ind]['sab']

                    # Get the cui
                    cui = df.iloc[ind]['cui']

                    # Get the tui
                    tui = None
                    if 'tui' in df.columns:
                        tui = str(df.iloc[ind]['tui'])
                        # TODO: If there are multiple tuis just take the first one
                        if len(tui.split(',')) > 1:
                            tui = tui.split(',')[0]

                    examples = []
                    if 'examples' in df.columns:
                        tmp = str(df.iloc[ind]['examples']).strip().split(self.NAME_SEPARATOR)
                        for example in tmp:
                            example = example.strip()
                            if len(example) > 0:
                                examples.append(example)

                    self.cdb.add_concept(cui, name, onto, tokens, snames,
                                         tui=tui, pretty_name=pretty_name,
                                         tokens_vocab=tokens_vocab, unique=unique)

                    # If we have examples, build an averaged context vector from each one
                    for example in examples:
                        doc = self.nlp(example)
                        cntx = []
                        for word in doc:
                            if not word._.to_skip:
                                for w in self.tokenizer(word._.norm):
                                    if w in self.vocab and self.vocab.vec(w) is not None:
                                        cntx.append(self.vocab.vec(w))
                        if len(cntx) > 1:
                            cntx = np.average(cntx, axis=0)
                            self.cdb.add_context_vec(cui, cntx, cntx_type='MED')

        return self.cdb

class PrepareUMLS(object):
    """ Prepares UMLS data in csv format for annotations; after
    everything is done the result is in the umls field.
    """
    def __init__(self, vocab=None, pretrained_cdb=None, tokenizer=None):
        self.vocab = vocab
        if pretrained_cdb is None:
            self.cdb = CDB()
        else:
            self.cdb = pretrained_cdb

        # Build the required spacy pipeline
        self.nlp = SpacyPipe(spacy_split_all, disable=['ner', 'parser'])
        self.nlp.add_punct_tagger(tagger=partial(spacy_tag_punct, skip_stopwords=False))

        # Get the tokenizer
        if tokenizer is not None:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = self._tok  #BertTokenizer.from_pretrained('bert-base-uncased')

    def _tok(self, text):
        return [text]

    def prepare_csvs(self, csv_paths, sep=','):
        """ Compile one or multiple CSVs into an internal UMLS class

        csv_paths:  an array of paths to the csv files that should be processed
        sep:  if necessary, a custom separator for the csv files

        return:  Compiled UMLS class
        """
        for csv_path in csv_paths:
            df = pandas.read_csv(csv_path, sep=sep)

            for ind in range(len(df)):
                names = str(df.iloc[ind]['str']).split("||")
                for _name in names:
                    if ind % 10000 == 0:
                        print("Done: {}".format(ind))
                    pretty_name = _name
                    name = clean_umls(_name)

                    # Clean and preprocess the name
                    doc = self.nlp(name)
                    tokens = [str(t.lemma_).lower() for t in doc
                              if not t._.is_punct and not t._.to_skip]

                    # Don't allow concept names to be above concept_length_limit
                    if len(tokens) > CONCEPT_LENGTH_LIMIT:
                        continue

                    isupper = False
                    if len(doc) == 1:
                        if doc[0].is_upper and len(doc[0]) > 1:
                            isupper = True

                    name = SEPARATOR.join(tokens)
                    _name = "".join(tokens)
                    length_one = [True if len(x) < 2 else False for x in tokens]

                    # Skip concepts that are digits or where every token is a single letter
                    if _name.isdigit() or all(length_one):
                        continue

                    # Create snames of the name
                    snames = []
                    sname = ""
                    for token in tokens:
                        sname = sname + token + SEPARATOR
                        snames.append(sname.strip())

                    # Check if this is the preferred name; it is if the column "TTY" equals "PN"
                    is_pref_name = False
                    if 'tty' in df.columns:
                        _tmp = str(df.iloc[ind]['tty'])
                        if _tmp.lower().strip() == 'pn':
                            is_pref_name = True

                    onto = 'default'
                    if 'sab' in df.columns:
                        # Get the ontology
                        onto = df.iloc[ind]['sab']

                    # Get the cui
                    cui = df.iloc[ind]['cui']

                    # Get the tui
                    tui = None
                    if 'tui' in df.columns:
                        tui = str(df.iloc[ind]['tui'])
                        # TODO: If there are multiple tuis just take the first one
                        if len(tui.split(',')) > 1:
                            tui = tui.split(',')[0]

                    desc = None
                    if 'def' in df.columns:
                        tmp = str(df.iloc[ind]['def']).strip()
                        if len(tmp) > 0:
                            desc = tmp

                    self.cdb.add_concept(cui, name, onto, tokens, snames,
                                         isupper=isupper, is_pref_name=is_pref_name, tui=tui,
                                         pretty_name=pretty_name, desc=desc)

                    # If we had a desc we can also add context vectors
                    if desc is not None:
                        doc = self.nlp(clean_def(desc))
                        cntx = []
                        for word in doc:
                            if not word._.to_skip:
                                for w in self.tokenizer(word.lower_):
                                    if w in self.vocab and self.vocab.vec(w) is not None:
                                        cntx.append(self.vocab.vec(w))
                        if len(cntx) > 1:
                            cntx = np.average(cntx, axis=0)
                            self.cdb.add_context_vec(cui, cntx, cntx_type='LONG')

                            # Increase the cui count because we added the context
                            if cui in self.cdb.cui_count:
                                self.cdb.cui_count[cui] += 1
                            else:
                                self.cdb.cui_count[cui] = 1

        return self.cdb

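# Illustration only (not part of the original module): a sketch of the UMLS-style CSV this class
# expects, inferred from the column accesses above. 'cui' and 'str' are required; 'tty', 'sab',
# 'tui' and 'def' are optional, and multiple names in one cell are separated by '||'. The file
# name, the CSV row and the vocab object are hypothetical; a vocab is only needed when 'def' is
# present, so that description context vectors can be added.
#
#   cui,str,tty,sab,tui,def
#   C0011849,diabetes mellitus||DM,PN,MSH,T047,A metabolic disorder characterized by high blood sugar.
#
# example_prep = PrepareUMLS(vocab=my_vocab)
# example_cdb = example_prep.prepare_csvs(['umls_concepts.csv'])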