Example #1
    def setUpClass(cls):
        print("Set up CDB")
        cls.config = Config()
        cls.config.general['log_level'] = logging.INFO
        cls.cdb = CDB(config=cls.config)

        print("Set up Vocab")
        vocab_path = "./tmp_vocab.dat"
        if not os.path.exists(vocab_path):
            tmp = requests.get(
                "https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
            with open(vocab_path, 'wb') as f:
                f.write(tmp.content)

        cls.vocab = Vocab.load(vocab_path)

        print("Set up NLP pipeline")
        cls.nlp = Pipe(tokenizer=spacy_split_all, config=cls.config)
        cls.nlp.add_tagger(tagger=partial(tag_skip_and_punct,
                                          config=cls.config),
                           name='skip_and_punct',
                           additional_fields=['is_punct'])

        cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab,
                                              config=cls.config,
                                              data_vocab=cls.vocab)
        cls.nlp.add_token_normalizer(spell_checker=cls.spell_checker,
                                     config=cls.config)
        cls.ner = NER(cls.cdb, cls.config)
        cls.nlp.add_ner(cls.ner)

        print("Set up Linker")
        cls.link = Linker(cls.cdb, cls.vocab, cls.config)
        cls.nlp.add_linker(cls.link)

        print("Set limits for tokens and uppercase")
        cls.config.ner['max_skip_tokens'] = 1
        cls.config.ner['upper_case_limit_len'] = 4
        cls.config.linking['disamb_length_limit'] = 2

        print("Add concepts")
        cls.cdb.add_names(cui='S-229004',
                          names=prepare_name('Movar', cls.nlp, {}, cls.config))
        cls.cdb.add_names(cui='S-229004',
                          names=prepare_name('Movar viruses', cls.nlp, {},
                                             cls.config))
        cls.cdb.add_names(cui='S-229005',
                          names=prepare_name('CDB', cls.nlp, {}, cls.config))

        print("Add test text")
        cls.text = "CDB - I was running and then Movar    Virus attacked and CDb"
        cls.text_post_pipe = cls.nlp(cls.text)
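
A test built on this fixture can inspect the entities linked in `cls.text_post_pipe`. A minimal sketch (the method name is illustrative; it assumes the `_.ents` document extension used in Example #7 below and the `_.cui` span extension set by the linker):

    def test_pipe_linking(self):
        for ent in self.text_post_pipe._.ents:
            # Every detected entity should link to a concept added above
            assert ent._.cui in self.cdb.cui2names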
Example #2
    def setUp(self) -> None:
        self.config = Config()
        self.config.general['log_level'] = logging.INFO
        cdb = CDB(config=self.config)

        self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])

        # Add a couple of names
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar', self.nlp, {}, self.config))
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar viruses', self.nlp, {},
                                         self.config))
        cdb.add_names(cui='S-229005',
                      names=prepare_name('CDB', self.nlp, {}, self.config))
        # Check
        #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

        self.vocab_path = "./tmp_vocab.dat"
        if not os.path.exists(self.vocab_path):
            import requests
            tmp = requests.get(
                "https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
            with open(self.vocab_path, 'wb') as f:
                f.write(tmp.content)

        vocab = Vocab.load(self.vocab_path)
        # Make the pipeline
        self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])
        spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab,
                                          config=self.config,
                                          data_vocab=vocab)
        self.nlp.add_token_normalizer(spell_checker=spell_checker,
                                      config=self.config)
        ner = NER(cdb, self.config)
        self.nlp.add_ner(ner)

        # Add Linker
        link = Linker(cdb, vocab, self.config)
        self.nlp.add_linker(link)

        self.text = "CDB - I was running and then Movar    Virus attacked and CDb"
Example #3
    def unlink_concept_name(self, cui, name, preprocessed_name=False):
        r'''
        Unlink a concept name from the given CUI (or from all CUIs if `full_unlink` is set),
        removing the link from the Concept Database (CDB). As a consequence, MedCAT will never
        again link the `name` to this CUI, meaning the name will not be detected as a concept
        in the future.

        Args:
            cui (str):
                The CUI from which the `name` will be removed.
            name (str):
                The span of text to be removed from the linking dictionary.
            preprocessed_name (bool, defaults to False):
                If True, the `name` is used as-is; if False, it is first normalized
                via `prepare_name`.
        Examples:
            >>> # To never again link C0020538 to HTN
            >>> cat.unlink_concept_name('C0020538', 'htn', False)
        '''

        cuis = [cui]
        if preprocessed_name:
            names = {name: 'nothing'}
        else:
            names = prepare_name(name, self, {}, self.config)

        # If full unlink find all CUIs
        if self.config.general.get('full_unlink', False):
            for name in names:
                cuis.extend(self.cdb.name2cuis.get(name, []))

        # Remove name from all CUIs
        for cui in cuis:
            self.cdb.remove_names(cui=cui, names=names)
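
To remove a name from every CUI it is linked to, not just the given one, the `full_unlink` flag checked above can be enabled in the config first. A short sketch, assuming a `cat` instance as in the docstring example:

    cat.config.general['full_unlink'] = True
    cat.unlink_concept_name('C0020538', 'htn')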
Example #4
    def test_name_addition(self):
        self.cdb.add_names(cui='C0000239',
                           names=prepare_name('MY: new,-_! Name.',
                                              self.maker.nlp, {}, self.config),
                           name_status='P',
                           full_build=True)
        assert self.cdb.addl_info['cui2original_names']['C0000239'] == {'MY: new,-_! Name.', 'Second csv'}
        assert 'my:newname.' in self.cdb.name2cuis
        assert 'my:new' in self.cdb.snames
        assert 'my:newname.' in self.cdb.name2cuis2status
        assert self.cdb.name2cuis2status['my:newname.'] == {'C0000239': 'P'}
Example #5
    def test_bb_removal_of_name(self):
        self.cdb.remove_names(cui='C0000239',
                              names=prepare_name('MY: new,-_! Name.',
                                                 self.maker.nlp, {},
                                                 self.config))
        self.assertEqual(len(self.cdb.name2cuis), 5, "Should equal 5")
        self.assertNotIn('my:newname.', self.cdb.name2cuis2status)
Example #6
    def test_ba_addition_of_new_name(self):
        self.cdb.add_names(cui='C0000239',
                           names=prepare_name('MY: new,-_! Name.',
                                              self.maker.nlp, {}, self.config),
                           name_status='P',
                           full_build=True)
        self.assertEqual(len(self.cdb.name2cuis), 6, "Should equal 6")
        target_result = {'MY: new,-_! Name.', 'Second csv'}
        self.assertEqual(self.cdb.addl_info['cui2original_names']['C0000239'],
                         target_result)
        self.assertIn('my~:~new~name~.', self.cdb.name2cuis)
        self.assertIn('my~:~new', self.cdb.snames)
        self.assertIn('my~:~new~name~.', self.cdb.name2cuis2status)
Example #7
    def test_for_linker(self):
        self.config = Config()
        self.config.general['log_level'] = logging.DEBUG
        cdb = CDB(config=self.config)

        # Add a couple of names
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar', self.nlp, {}, self.config))
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar viruses', self.nlp, {},
                                         self.config))
        cdb.add_names(cui='S-229005',
                      names=prepare_name('CDB', self.nlp, {}, self.config))
        cdb.add_names(cui='S-2290045',
                      names=prepare_name('Movar', self.nlp, {}, self.config))
        # Check
        #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}}

        cuis = list(cdb.cui2names.keys())
        for cui in cuis[0:50]:
            vectors = {
                'short': np.random.rand(300),
                'long': np.random.rand(300),
                'medium': np.random.rand(300)
            }
            cdb.update_context_vector(cui, vectors, negative=False)

        d = self.nlp(self.text)
        vocab = Vocab.load(self.vocab_path)
        cm = ContextModel(cdb, vocab, self.config)
        cm.train_using_negative_sampling('S-229004')
        self.config.linking['train_count_threshold'] = 0

        cm.train('S-229004', d._.ents[1], d)

        cm.similarity('S-229004', d._.ents[1], d)

        cm.disambiguate(['S-2290045', 'S-229004'], d._.ents[1], 'movar', d)
Example #8
    def test_concept_similarity(self):
        cdb = CDB(config=self.config)
        np.random.seed(11)
        for i in range(500):
            cui = "C" + str(i)
            type_ids = {'T-' + str(i % 10)}
            cdb.add_concept(cui=cui,
                            names=prepare_name('Name: ' + str(i),
                                               self.maker.nlp, {}, self.config),
                            ontologies=set(), name_status='P', type_ids=type_ids,
                            description='', full_build=True)

            vectors = {}
            for cntx_type in self.config.linking['context_vector_sizes']:
                vectors[cntx_type] = np.random.rand(300)
            cdb.update_context_vector(cui, vectors, negative=False)
        res = cdb.most_similar('C200', 'long', type_id_filter=['T-0'], min_cnt=1, topn=10, force_build=True)
        assert len(res) == 10
Example #9
    def add_and_train_concept(self, cui, name, spacy_doc=None, spacy_entity=None, ontologies=set(), name_status='A', type_ids=set(),
                              description='', full_build=True, negative=False, devalue_others=False, do_add_concept=True):
        r''' Add a name to an existing concept, add a new concept, or do nothing if the name and concept already exist. Performs
        training if spacy_entity and spacy_doc are set.

        Args:
            cui (str):
                CUI of the concept.
            name (str):
                Name to be linked to the concept (in the case of MedCATtrainer this is simply the
                selected value in text, no preprocessing or anything needed).
            spacy_doc (spacy.tokens.Doc):
                Spacy representation of the document that was manually annotated.
            spacy_entity (List[spacy.tokens.Token]):
                Given the spacy document, this is the annotated span of text, i.e. the list of
                annotated tokens that are marked with this CUI.
            negative (bool):
                Whether this is a negative (rather than positive) example.
            devalue_others (bool):
                If set, CUIs other than `cui` to which this name is assigned will receive
                negative training, provided that negative=False.

            **other:
                Refer to CDB.add_concept
        '''

        names = prepare_name(name, self, {}, self.config)
        if do_add_concept:
            self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description,
                                 full_build=full_build)

        if spacy_entity is not None and spacy_doc is not None:
            # Train Linking
            self.linker.context_model.train(cui=cui, entity=spacy_entity, doc=spacy_doc, negative=negative, names=names)

            if not negative and devalue_others:
                # Find all cuis
                cuis = set()
                for name in names:
                    cuis.update(self.cdb.name2cuis.get(name, []))
                # Remove the cui for which we just added positive training
                if cui in cuis:
                    cuis.remove(cui)
                # Add negative training for all other CUIs that link to these names
                for _cui in cuis:
                    self.linker.context_model.train(cui=_cui, entity=spacy_entity, doc=spacy_doc, negative=True)
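
A minimal usage sketch (the CUI, text and variable names are illustrative; it assumes `cat` is the MedCAT object exposing this method and that calling it on text yields a spacy Doc with the annotated span at `doc._.ents[0]`):

    doc = cat("Patient diagnosed with htn")
    cat.add_and_train_concept(cui='C0020538',
                              name='htn',
                              spacy_doc=doc,
                              spacy_entity=doc._.ents[0],
                              negative=False,
                              devalue_others=True)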
Example #10
    def test_name_removal(self):
        self.cdb.remove_names(cui='C0000239',
                              names=prepare_name('MY: new,-_! Name.',
                                                 self.maker.nlp, {},
                                                 self.config))
        # Run again to make sure it does not break anything
        self.cdb.remove_names(cui='C0000239',
                              names=prepare_name('MY: new,-_! Name.',
                                                 self.maker.nlp, {},
                                                 self.config))
        assert len(self.cdb.name2cuis) == 5
        assert 'my:newname.' not in self.cdb.name2cuis2status
Example #11
assert len(cdb.name2cuis) == 5
assert len(cdb.cui2tags) == 3
assert len(cdb.cui2preferred_name) == 2
assert len(cdb.cui2context_vectors) == 3
assert len(cdb.cui2count_train) == 3
assert cdb.name2cuis2status['virus']['C0000039'] == 'P'
assert cdb.cui2type_ids['C0000039'] == {'T234', 'T109', 'T123'}
assert cdb.addl_info['cui2original_names']['C0000039'] == {
    'Virus', 'Virus K', 'Virus M', 'Virus Z'
}
assert cdb.addl_info['cui2description']['C0000039'].startswith("Synthetic")

# Test name addition
from medcat.preprocessing.cleaners import prepare_name
cdb.add_names(cui='C0000239',
              names=prepare_name('MY: new,-_! Name.', maker.nlp, {}, config),
              name_status='P',
              full_build=True)
assert cdb.addl_info['cui2original_names']['C0000239'] == {
    'MY: new,-_! Name.', 'Second csv'
}
assert 'my:newname.' in cdb.name2cuis
assert 'my:new' in cdb.snames
assert 'my:newname.' in cdb.name2cuis2status
assert cdb.name2cuis2status['my:newname.'] == {'C0000239': 'P'}

# Test name removal
cdb.remove_names(cui='C0000239',
                 names=prepare_name('MY: new,-_! Name.', maker.nlp, {},
                                    config))
# Run again to make sure it does not break anything
cdb.remove_names(cui='C0000239',
                 names=prepare_name('MY: new,-_! Name.', maker.nlp, {},
                                    config))
Example #12
    def prepare_csvs(self, csv_paths, sep=',', encoding=None, escapechar=None, index_col=False, full_build=False, only_existing_cuis=False, **kwargs):
        r''' Compile one or more CSVs into a CDB.

        Args:
            csv_paths (`List[str]`):
                An array of paths to the csv files that should be processed
            full_build (`bool`, defaults to `False`):
                If False only the core portions of the CDB will be built (the ones required for
                the functioning of MedCAT). If True, everything will be added to the CDB - this
                usually includes concept descriptions, various forms of names etc (take care that
                this option produces a much larger CDB).
            sep (`str`, defaults to `,`):
                If necessary a custom separator for the csv files
            encoding (`str`, optional):
                Encoding to be used for reading the CSV file
            escapechar (`str`, optional):
                Escape char for the CSV
            index_col (`bool`, defaults to `False`):
                Index column for pandas read_csv
            only_existing_cuis (`bool`, defaults to `False`):
                If True, no new CUIs will be added; only the names linked to existing CUIs
                will be extended. Mainly used when enriching names of a CDB (e.g. SNOMED
                with UMLS terms).
        Returns:
            `medcat.cdb.CDB` with the new concepts added.

        Note:
            **kwargs:
                Will be passed to pandas for CSV reading
            csv:
                Examples of the CSV used to make the CDB can be found on [GitHub](link)
        '''

        useful_columns = ['cui', 'name', 'ontologies', 'name_status', 'type_ids', 'description']
        name_status_options = {'A', 'P', 'N'}

        for csv_path in csv_paths:
            # Read CSV, everything is converted to strings
            df = pandas.read_csv(csv_path, sep=sep, encoding=encoding, escapechar=escapechar, index_col=index_col, dtype=str, **kwargs)
            df = df.fillna('')

            # Find which columns to use from the CSV
            cols = []
            col2ind = {}
            for col in list(df.columns):
                if str(col).lower().strip() in useful_columns:
                    col2ind[str(col).lower().strip()] = len(cols)
                    cols.append(col)

            self.log.info("Started importing concepts from: {}".format(csv_path))
            _time = None # Used to check speed
            _logging_freq = np.ceil(len(df[cols]) / 100)
            for row_id, row in enumerate(df[cols].values):
                if row_id % _logging_freq == 0:
                    # Print some stats
                    if _time is None:
                        # Add last time if it does not exist
                        _time = datetime.datetime.now()
                    # Get current time
                    ctime = datetime.datetime.now()
                    # Get time difference
                    timediff = ctime - _time
                    self.log.info("Current progress: {:.0f}% at {:.3f}s per {} rows".format(
                        (row_id / len(df)) * 100, timediff.microseconds/10**6 + timediff.seconds, (len(df[cols]) // 100)))
                    # Set previous time to current time
                    _time = ctime

                # This must exist
                cui = row[col2ind['cui']].strip().upper()

                if not only_existing_cuis or (only_existing_cuis and cui in self.cdb.cui2names):
                    if 'ontologies' in col2ind:
                        ontologies = set([ontology.strip() for ontology in row[col2ind['ontologies']].upper().split(self.cnf_cm['multi_separator']) if
                                         len(ontology.strip()) > 0])
                    else:
                        ontologies = set()

                    if 'name_status' in col2ind:
                        name_status = row[col2ind['name_status']].strip().upper()

                        # Must be allowed
                        if name_status not in name_status_options:
                            name_status = 'A'
                    else:
                        # Defaults to A - meaning automatic
                        name_status = 'A'

                    if 'type_ids' in col2ind:
                        type_ids = set([type_id.strip() for type_id in row[col2ind['type_ids']].upper().split(self.cnf_cm['multi_separator']) if
                                        len(type_id.strip()) > 0])
                    else:
                        type_ids = set()

                    # Get the ones that do not need any changing
                    if 'description' in col2ind:
                        description = row[col2ind['description']].strip()
                    else:
                        description = ""

                    # We can have multiple versions of a name
                    names = {} # {'name': {'tokens': [<str>], 'snames': [<str>]}}

                    raw_names = [raw_name.strip() for raw_name in row[col2ind['name']].split(self.cnf_cm['multi_separator']) if 
                                 len(raw_name.strip()) > 0]
                    for raw_name in raw_names:
                        raw_name = raw_name.strip()
                        prepare_name(raw_name, self.nlp, names, self.config)

                        if self.config.cdb_maker.get('remove_parenthesis', 0) > 0 and name_status == 'P':
                            # Should we remove the content in parenthesis from primary names and add them also
                            raw_name = PH_REMOVE.sub(" ", raw_name).strip()
                            if len(raw_name) >= self.config.cdb_maker['remove_parenthesis']:
                                prepare_name(raw_name, self.nlp, names, self.config)

                    self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids,
                                         description=description, full_build=full_build)
                    # DEBUG
                    self.log.debug("\n\n**** Added\n CUI: {}\n Names: {}\n Ontologies: {}\n Name status: {}\n".format(cui, names, ontologies, name_status) + \
                                   " Type IDs: {}\n Description: {}\n Is full build: {}".format(
                                   type_ids, description, full_build))

        return self.cdb
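
A sketch of a matching input file and call, derived from the `useful_columns` list above (the file name and rows are illustrative; multiple names or type IDs in one cell are split on the configured `multi_separator`, and `maker` is assumed to be an instance of this class):

    # concepts.csv
    # cui,name,ontologies,name_status,type_ids,description
    # C0000039,Virus|Virus K,SNOMED,P,T234,Synthetic description
    cdb = maker.prepare_csvs(['concepts.csv'], full_build=True)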
Example #13
import logging
import os
from functools import partial

import requests

from medcat.cdb import CDB
from medcat.config import Config
from medcat.vocab import Vocab
from medcat.pipe import Pipe
from medcat.preprocessing.tokenizers import spacy_split_all
from medcat.preprocessing.taggers import tag_skip_and_punct
from medcat.preprocessing.cleaners import prepare_name

config = Config()
config.general['log_level'] = logging.INFO
cdb = CDB(config=config)

nlp = Pipe(tokenizer=spacy_split_all, config=config)
nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config),
               name='skip_and_punct',
               additional_fields=['is_punct'])

# Add a couple of names
cdb.add_names(cui='S-229004', names=prepare_name('Movar', nlp, {}, config))
cdb.add_names(cui='S-229004',
              names=prepare_name('Movar viruses', nlp, {}, config))
cdb.add_names(cui='S-229005', names=prepare_name('CDB', nlp, {}, config))
# Check
#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
    tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
    with open(vocab_path, 'wb') as f:
        f.write(tmp.content)

vocab = Vocab.load(vocab_path)
# Make the pipeline
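# The snippet is truncated here; a plausible continuation, mirroring the
# pipeline assembly in Examples #1 and #2 above (and assuming BasicSpellChecker,
# NER and Linker are imported as in those test modules), would be:
spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=config,
                                  data_vocab=vocab)
nlp.add_token_normalizer(spell_checker=spell_checker, config=config)
ner = NER(cdb, config)
nlp.add_ner(ner)
link = Linker(cdb, vocab, config)
nlp.add_linker(link)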