Example #1
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    if cat_id in CAT_MAP:
        cat = CAT_MAP[cat_id]
    else:
        if cdb_id in CDB_MAP:
            cdb = CDB_MAP[cdb_id]
        else:
            cdb_path = project.concept_db.cdb_file.path
            cdb = CDB.load(cdb_path)
            cdb.config.parse_config_file(path=os.getenv("MEDCAT_CONFIG_FILE"))
            CDB_MAP[cdb_id] = cdb

        if vocab_id in VOCAB_MAP:
            vocab = VOCAB_MAP[vocab_id]
        else:
            vocab_path = project.vocab.vocab_file.path
            vocab = Vocab.load(vocab_path)
            VOCAB_MAP[vocab_id] = vocab

        cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
        CAT_MAP[cat_id] = cat
    return cat
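
A minimal usage sketch for the cache above. Everything here is a hypothetical stand-in: the three module-level dicts and the `project` object (a record exposing `concept_db` and `vocab`, as the function implies) come from the surrounding app, and only `CAT.get_entities` is a documented MedCAT call.

# Hedged usage sketch - the caches and `project` are assumed, not defined here.
CDB_MAP = {}
VOCAB_MAP = {}
CAT_MAP = {}

cat = get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project)
annotations = cat.get_entities("Patient presents with fever and a dry cough.")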
Example #2
    def test_save_and_load(self):
        self.undertest.add_words(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "examples", "vocab_data.txt"))
        self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55])
        vocab_path = f"{self.tmp_dir}/vocab.dat"
        self.undertest.save(vocab_path)
        vocab = Vocab.load(vocab_path)
        self.assertEqual(["house", "dog", "test"], list(vocab.vocab.keys()))
Example #3
    def _create_cat(self):
        """
        Loads MedCAT resources and creates CAT instance
        """
        if os.getenv("APP_MODEL_VOCAB_PATH") is None:
            raise ValueError(
                "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")

        if os.getenv("APP_MODEL_CDB_PATH") is None:
            raise Exception(
                "Concept database (env: APP_MODEL_CDB_PATH) not specified")

        # Vocabulary and Concept Database are mandatory
        self.log.debug("Loading VOCAB ...")
        vocab = Vocab.load(os.getenv("APP_MODEL_VOCAB_PATH"))

        self.log.debug("Loading CDB ...")

        cdb = CDB.load(os.getenv("APP_MODEL_CDB_PATH"))

        spacy_model = os.getenv("SPACY_MODEL", "")

        if spacy_model:
            cdb.config.general["spacy_model"] == spacy_model
        else:
            logging.warning("SPACY_MODEL environment var not set, \
                attempting to load the spacy model found within the CDB : " +
                            cdb.config.general["spacy_model"])

            if cdb.config.general["spacy_model"] == "":
                raise ValueError(
                    "No SPACY_MODEL env var declared and the loaded CDB does "
                    "not have a spacy_model set in its config. To fix this, "
                    "declare SPACY_MODEL in the env_medcat file.")

        # this is redundant as the config is already in the CDB
        conf = cdb.config

        # Apply CUI filter if provided
        if os.getenv("APP_MODEL_CUI_FILTER_PATH") is not None:
            self.log.debug("Applying CDB CUI filter ...")
            with open(os.getenv("APP_MODEL_CUI_FILTER_PATH")) as cui_file:
                all_lines = (line.rstrip() for line in cui_file)
                selected_cuis = [line for line in all_lines
                                 if line]  # filter blank lines
                cdb.filter_by_cui(selected_cuis)

        # Meta-annotation models are optional
        meta_models = []
        if os.getenv("APP_MODEL_META_PATH_LIST") is not None:
            self.log.debug("Loading META annotations ...")
            for model_path in os.getenv("APP_MODEL_META_PATH_LIST").split(":"):
                m = MetaCAT.load(model_path)
                meta_models.append(m)

        cat = CAT(cdb=cdb, config=conf, vocab=vocab, meta_cats=meta_models)
        return cat
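
For reference, a sketch of the environment variables the method above reads; the paths and model name below are placeholders, not shipped defaults:

import os

os.environ["APP_MODEL_VOCAB_PATH"] = "/models/vocab.dat"  # placeholder path
os.environ["APP_MODEL_CDB_PATH"] = "/models/cdb.dat"      # placeholder path
os.environ["SPACY_MODEL"] = "en_core_web_md"              # optional override
# Optional extras read by _create_cat:
# os.environ["APP_MODEL_CUI_FILTER_PATH"] = "/models/cui_filter.txt"
# os.environ["APP_MODEL_META_PATH_LIST"] = "/models/meta_status:/models/meta_temporality"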
Example #4
    def setUpClass(cls):
        print("Set up CDB")
        cls.config = Config()
        cls.config.general['log_level'] = logging.INFO
        cls.cdb = CDB(config=cls.config)

        print("Set up Vocab")
        vocab_path = "./tmp_vocab.dat"
        if not os.path.exists(vocab_path):
            tmp = requests.get(
                "https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
            with open(vocab_path, 'wb') as f:
                f.write(tmp.content)

        cls.vocab = Vocab.load(vocab_path)

        print("Set up NLP pipeline")
        cls.nlp = Pipe(tokenizer=spacy_split_all, config=cls.config)
        cls.nlp.add_tagger(tagger=partial(tag_skip_and_punct,
                                          config=cls.config),
                           name='skip_and_punct',
                           additional_fields=['is_punct'])

        cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab,
                                              config=cls.config,
                                              data_vocab=cls.vocab)
        cls.nlp.add_token_normalizer(spell_checker=cls.spell_checker,
                                     config=cls.config)
        cls.ner = NER(cls.cdb, cls.config)
        cls.nlp.add_ner(cls.ner)

        print("Set up Linker")
        cls.link = Linker(cls.cdb, cls.vocab, cls.config)
        cls.nlp.add_linker(cls.link)

        print("Set limits for tokens and uppercase")
        cls.config.ner['max_skip_tokens'] = 1
        cls.config.ner['upper_case_limit_len'] = 4
        cls.config.linking['disamb_length_limit'] = 2

        print("Add concepts")
        cls.cdb.add_names(cui='S-229004',
                          names=prepare_name('Movar', cls.nlp, {}, cls.config))
        cls.cdb.add_names(cui='S-229004',
                          names=prepare_name('Movar viruses', cls.nlp, {},
                                             cls.config))
        cls.cdb.add_names(cui='S-229005',
                          names=prepare_name('CDB', cls.nlp, {}, cls.config))

        print("Add test text")
        cls.text = "CDB - I was running and then Movar    Virus attacked and CDb"
        cls.text_post_pipe = cls.nlp(cls.text)
Example #5
    def __init__(self, config, cdb=None, vocab=None, word_tokenizer=None):
        self.cdb = cdb
        self.config = config
        self.w2v = None
        if vocab is not None:
            self.vocab = vocab
        else:
            self.vocab = Vocab()

        # Build the required spacy pipeline
        self.nlp = Pipe(tokenizer=spacy_split_all, config=config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])

        # Get the tokenizer
        if word_tokenizer is not None:
            self.tokenizer = word_tokenizer
        else:
            self.tokenizer = self._tok

        # Used for saving if the real path is not set
        self.vocab_path = "./tmp_vocab.dat"
Example #6
    def setUp(self) -> None:
        self.config = Config()
        self.config.general['log_level'] = logging.INFO
        cdb = CDB(config=self.config)

        self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])

        # Add a couple of names
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar', self.nlp, {}, self.config))
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar viruses', self.nlp, {},
                                         self.config))
        cdb.add_names(cui='S-229005',
                      names=prepare_name('CDB', self.nlp, {}, self.config))
        # Check
        #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

        self.vocab_path = "./tmp_vocab.dat"
        if not os.path.exists(self.vocab_path):
            import requests
            tmp = requests.get(
                "https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
            with open(self.vocab_path, 'wb') as f:
                f.write(tmp.content)

        vocab = Vocab.load(self.vocab_path)
        # Make the pipeline
        self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])
        spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab,
                                          config=self.config,
                                          data_vocab=vocab)
        self.nlp.add_token_normalizer(spell_checker=spell_checker,
                                      config=self.config)
        ner = NER(cdb, self.config)
        self.nlp.add_ner(ner)

        # Add Linker
        link = Linker(cdb, vocab, self.config)
        self.nlp.add_linker(link)

        self.text = "CDB - I was running and then Movar    Virus attacked and CDb"
Example #7
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    if cat_id in CAT_MAP:
        cat = CAT_MAP[cat_id]
    else:
        if cdb_id in CDB_MAP:
            cdb = CDB_MAP[cdb_id]
        else:
            cdb_path = project.concept_db.cdb_file.path
            try:
                cdb = CDB.load(cdb_path)
            except KeyError as ke:
                mc_v = pkg_resources.get_distribution('medcat').version
                if int(mc_v.split('.')[0]) > 0:
                    log.error(
                        'Attempted to load MedCAT v0.x model with MCTrainer v1.x'
                    )
                    raise Exception(
                        'Attempted to load MedCAT v0.x model with MCTrainer v1.x',
                        'Please re-configure this project to use a MedCAT v1.x CDB or consult the '
                        'MedCATTrainer Dev team if you believe this should work'
                    ) from ke
                raise

            custom_config = os.getenv("MEDCAT_CONFIG_FILE")
            if custom_config is not None and os.path.exists(custom_config):
                cdb.config.parse_config_file(path=custom_config)
            else:
                log.info(
                    "No MEDCAT_CONFIG_FILE env var set to valid path, using default config available on CDB"
                )
            CDB_MAP[cdb_id] = cdb

        if vocab_id in VOCAB_MAP:
            vocab = VOCAB_MAP[vocab_id]
        else:
            vocab_path = project.vocab.vocab_file.path
            vocab = Vocab.load(vocab_path)
            VOCAB_MAP[vocab_id] = vocab

        cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
        CAT_MAP[cat_id] = cat
    return cat
Example #8
 def setUpClass(cls) -> None:
     cls.cdb = CDB.load(
         os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                      "examples", "cdb.dat"))
     cls.vocab = Vocab.load(
         os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                      "examples", "vocab.dat"))
     cls.cdb.config.ner['min_name_len'] = 2
     cls.cdb.config.ner['upper_case_limit_len'] = 3
     cls.cdb.config.general['spell_check'] = True
     cls.cdb.config.linking['train_count_threshold'] = 10
     cls.cdb.config.linking['similarity_threshold'] = 0.3
     cls.cdb.config.linking['train'] = True
     cls.cdb.config.linking['disamb_length_limit'] = 5
     cls.cdb.config.general['full_unlink'] = True
     cls.undertest = CAT(cdb=cls.cdb,
                         config=cls.cdb.config,
                         vocab=cls.vocab)
Example #9
    def test_for_linker(self):
        self.config = Config()
        self.config.general['log_level'] = logging.DEBUG
        cdb = CDB(config=self.config)

        # Add a couple of names
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar', self.nlp, {}, self.config))
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar viruses', self.nlp, {},
                                         self.config))
        cdb.add_names(cui='S-229005',
                      names=prepare_name('CDB', self.nlp, {}, self.config))
        cdb.add_names(cui='S-2290045',
                      names=prepare_name('Movar', self.nlp, {}, self.config))
        # Check
        #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}}

        cuis = list(cdb.cui2names.keys())
        for cui in cuis[0:50]:
            vectors = {
                'short': np.random.rand(300),
                'long': np.random.rand(300),
                'medium': np.random.rand(300)
            }
            cdb.update_context_vector(cui, vectors, negative=False)

        d = self.nlp(self.text)
        vocab = Vocab.load(self.vocab_path)
        cm = ContextModel(cdb, vocab, self.config)
        cm.train_using_negative_sampling('S-229004')
        self.config.linking['train_count_threshold'] = 0

        cm.train('S-229004', d._.ents[1], d)

        cm.similarity('S-229004', d._.ents[1], d)

        cm.disambiguate(['S-2290045', 'S-229004'], d._.ents[1], 'movar', d)
Example #10
    def setUpClass(cls) -> None:
        cls.config = Config()
        cls.config.general['log_level'] = logging.INFO
        cls.cdb = CDB(config=cls.config)

        vocab_path = "./tmp_vocab.dat"
        if not os.path.exists(vocab_path):
            tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
            with open(vocab_path, 'wb') as f:
                f.write(tmp.content)

        cls.vocab = Vocab.load(vocab_path)
        cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab, config=cls.config, data_vocab=cls.vocab)
        cls.ner = NER(cls.cdb, cls.config)
        cls.linker = Linker(cls.cdb, cls.vocab, cls.config)
        cls.config.ner['max_skip_tokens'] = 1
        cls.config.ner['upper_case_limit_len'] = 4
        cls.config.linking['disamb_length_limit'] = 2
        cls.meta_cat = MetaCAT()
        cls.text = "CDB - I was running and then Movar Virus attacked and CDb"
        cls.config = Config()
        cls.config.general['log_level'] = logging.INFO
        cls.undertest = Pipe(tokenizer=spacy_split_all, config=cls.config)
Example #11
# Imports assumed by this class (module paths per MedCAT v1; treat as a sketch):
import logging
from pathlib import Path
from gensim.models import Word2Vec
from medcat.vocab import Vocab
from medcat.pipe import Pipe
from medcat.preprocessing.tokenizers import spacy_split_all
from medcat.preprocessing.taggers import tag_skip_and_punct
from medcat.preprocessing.iterators import SimpleIter


class MakeVocab(object):
    r'''
    Create a new vocab from a text file. To make a vocab and train word embeddings do:

    >>> cdb = <your existing cdb>
    >>> maker = MakeVocab(cdb=cdb, config=config)
    >>> maker.make(data_iterator, out_folder="./output/")
    >>> maker.add_vectors(in_path="./output/data.txt")

    Args:
        cdb (medcat.cdb.CDB):
            The concept database whose words will be added on top of the Vocab built from the text file.
        vocab (medcat.utils.vocab.Vocab, optional):
            Vocabulary to be extended; leave as None to make a new Vocab. Default: None
        word_tokenizer (function, optional):
            A custom tokenizer for word splitting - used if embeddings are BERT or similar.
            Default: None
    '''
    log = logging.getLogger(__name__)

    def __init__(self, config, cdb=None, vocab=None, word_tokenizer=None):
        self.cdb = cdb
        self.config = config
        self.w2v = None
        if vocab is not None:
            self.vocab = vocab
        else:
            self.vocab = Vocab()

        # Build the required spacy pipeline
        self.nlp = Pipe(tokenizer=spacy_split_all, config=config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])

        # Get the tokenizer
        if word_tokenizer is not None:
            self.tokenizer = word_tokenizer
        else:
            self.tokenizer = self._tok

        # Used for saving if the real path is not set
        self.vocab_path = "./tmp_vocab.dat"

    def _tok(self, text):
        return [text]

    def make(self,
             iter_data,
             out_folder,
             join_cdb=True,
             normalize_tokens=False):
        r'''
        Make a vocab - without vectors initially. This will create two files in the out_folder:
        - vocab.dat -> The vocabulary without vectors
        - data.txt -> The tokenized dataset prepared for training of word2vec or similar embeddings.

        Args:
            iter_data (Iterator):
                An iterator over sentences or documents. Can also be a simple array of text documents/sentences.
            out_folder (string):
                A path to a folder where all the results will be saved
            join_cdb (bool):
                Should the words from the CDB be added to the Vocab. Default: True
            normalize_tokens (bool, defaults to False):
                If set, tokens will be lemmatized - this tends to work better in cases where differences
                such as plural/singular should be ignored, though it matters little if the dataset is big enough.
        '''
        # Save the preprocessed data, used for emb training
        out_path = Path(out_folder) / "data.txt"
        vocab_path = Path(out_folder) / "vocab.dat"
        self.vocab_path = vocab_path
        out = open(out_path, 'w', encoding='utf-8')

        for ind, doc in enumerate(iter_data):
            if ind % 10000 == 0:
                self.log.info("Vocab builder at: %s", ind)

            doc = self.nlp.nlp.tokenizer(doc)
            line = ""

            for token in doc:
                if token.is_space or token.is_punct:
                    continue

                if len(token.lower_) > 0:
                    if normalize_tokens:
                        self.vocab.inc_or_add(token._.norm)
                    else:
                        self.vocab.inc_or_add(token.lower_)

                if normalize_tokens:
                    line = line + " " + "_".join(token._.norm.split(" "))
                else:
                    line = line + " " + "_".join(token.lower_.split(" "))

            out.write(line.strip())
            out.write("\n")
        out.close()

        if join_cdb and self.cdb:
            for word in self.cdb.vocab.keys():
                if word not in self.vocab:
                    self.vocab.add_word(word)
                else:
                    # Update the count with the counts from the new dataset
                    self.cdb.vocab[word] += self.vocab[word]

        # Save the vocab also
        self.vocab.save(path=self.vocab_path)

    def add_vectors(self,
                    in_path=None,
                    w2v=None,
                    overwrite=False,
                    data_iter=None,
                    workers=14,
                    niter=2,
                    min_count=10,
                    window=10,
                    vsize=300,
                    unigram_table_size=100000000):
        r'''
        Add vectors to an existing vocabulary and save changes to the vocab_path.

        Args:
            in_path (String):
                Path to the data.txt that was created by the MakeVocab.make() function.
            w2v (Word2Vec, optional):
                An existing word2vec instance. Default: None
            overwrite (bool):
                If True it will overwrite existing vectors in the vocabulary. Default: False
            data_iter (iterator):
                A custom iterator over the data; if provided, in_path is not needed.
            **: The remaining arguments (workers, niter, min_count, window, vsize) are passed to Word2Vec.

        Returns:
            A trained word2vec model.
        '''
        if w2v is None:
            if data_iter is None:
                data = SimpleIter(in_path)
            else:
                data = data_iter
            w2v = Word2Vec(data,
                           window=window,
                           min_count=min_count,
                           workers=workers,
                           size=vsize,
                           iter=niter)

        for word in w2v.wv.vocab.keys():
            if word in self.vocab:
                if overwrite:
                    self.vocab.add_vec(word, w2v.wv.get_vector(word))
                else:
                    if self.vocab.vec(word) is None:
                        self.vocab.add_vec(word, w2v.wv.get_vector(word))

        # Save the vocab again, now with vectors
        self.vocab.make_unigram_table(table_size=unigram_table_size)
        self.vocab.save(path=self.vocab_path)
        return w2v

    def destroy_pipe(self):
        self.nlp.destroy()
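
A short end-to-end sketch of the class above, assuming an existing `cdb` and `config` as in the docstring; the input texts and output folder are made-up placeholders:

import os

os.makedirs("./output", exist_ok=True)
texts = ["He was diagnosed with a movar virus infection.",
         "A second placeholder document."]
maker = MakeVocab(cdb=cdb, config=config)
maker.make(texts, out_folder="./output/")             # writes vocab.dat and data.txt
w2v = maker.add_vectors(in_path="./output/data.txt")  # trains word2vec, adds vectors
maker.destroy_pipe()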
Example #12
# Add a couple of names
cdb.add_names(cui='S-229004', names=prepare_name('Movar', nlp, {}, config))
cdb.add_names(cui='S-229004',
              names=prepare_name('Movar viruses', nlp, {}, config))
cdb.add_names(cui='S-229005', names=prepare_name('CDB', nlp, {}, config))
# Check
#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
    import requests
    tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
    with open(vocab_path, 'wb') as f:
        f.write(tmp.content)

vocab = Vocab.load(vocab_path)
# Make the pipeline
nlp = Pipe(tokenizer=spacy_split_all, config=config)
nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config),
               name='skip_and_punct',
               additional_fields=['is_punct'])
spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab,
                                  config=config,
                                  data_vocab=vocab)
nlp.add_token_normalizer(spell_checker=spell_checker, config=config)
ner = NER(cdb, config)
nlp.add_ner(ner)

# Add Linker
link = Linker(cdb, vocab, config)
nlp.add_linker(link)
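
With the pipeline assembled, running it is a single call; the `doc._.ents` access mirrors Example #9, and `ent._.cui` is assumed to be set by the linker:

# Hedged sketch: run the assembled pipeline over the test sentence.
doc = nlp("CDB - I was running and then Movar    Virus attacked and CDb")
for ent in doc._.ents:
    print(ent.text, ent._.cui)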
Example #13
 def setUp(self) -> None:
     self.undertest = Vocab()
     self.tmp_dir = os.path.join(
         os.path.dirname(os.path.realpath(__file__)), "tmp")
     os.makedirs(self.tmp_dir, exist_ok=True)
Example #14
class CATTests(unittest.TestCase):
    def setUp(self) -> None:
        self.undertest = Vocab()
        self.tmp_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "tmp")
        os.makedirs(self.tmp_dir, exist_ok=True)

    def tearDown(self) -> None:
        shutil.rmtree(self.tmp_dir)

    def test_add_words(self):
        self.undertest.add_words(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "examples", "vocab_data.txt"))
        self.assertEqual(["house", "dog"], list(self.undertest.vocab.keys()))

    def test_add_word(self):
        self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55])
        self.assertEqual(["test"], list(self.undertest.vocab.keys()))
        self.assertTrue("test" in self.undertest)

    def test_count(self):
        self.undertest.add_words(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "examples", "vocab_data.txt"))
        self.assertEqual(34444, self.undertest.count("house"))

    def test_save_and_load(self):
        self.undertest.add_words(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "examples", "vocab_data.txt"))
        self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55])
        vocab_path = f"{self.tmp_dir}/vocab.dat"
        self.undertest.save(vocab_path)
        vocab = Vocab.load(vocab_path)
        self.assertEqual(["house", "dog", "test"], list(vocab.vocab.keys()))