def __init__(self, cdb, config, vocab, meta_cats=None):
    self.cdb = cdb
    self.vocab = vocab
    # Take the provided config (typically the one attached to the cdb)
    self.config = config
    # Set log level
    self.log.setLevel(self.config.general['log_level'])

    # Build the pipeline
    self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])
    spell_checker = BasicSpellChecker(cdb_vocab=self.cdb.vocab, config=self.config, data_vocab=vocab)
    self.nlp.add_token_normalizer(spell_checker=spell_checker, config=self.config)

    # Add NER
    self.ner = NER(self.cdb, self.config)
    self.nlp.add_ner(self.ner)

    # Add LINKER
    self.linker = Linker(self.cdb, vocab, self.config)
    self.nlp.add_linker(self.linker)

    # Add meta-annotation classes if they exist
    # (default of None instead of [] avoids the mutable-default-argument pitfall)
    self._meta_annotations = False
    for meta_cat in (meta_cats or []):
        self.nlp.add_meta_cat(meta_cat, meta_cat.category_name)
        self._meta_annotations = True

    # Set max document length
    self.nlp.nlp.max_length = self.config.preprocessing.get('max_document_length', 1000000)
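# --- Hedged usage sketch (not from the original source) ---
# A minimal sketch of how this constructor might be driven, assuming it belongs
# to MedCAT's CAT class and that serialized CDB/Vocab files exist at the
# hypothetical paths below; `cdb.config` follows the comment above that the
# config typically comes from the cdb.
from medcat.cdb import CDB
from medcat.vocab import Vocab
from medcat.cat import CAT

cdb = CDB.load("cdb.dat")        # hypothetical path
vocab = Vocab.load("vocab.dat")  # hypothetical path
cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
doc = cat("Movar viruses were detected in the sample.")  # processed spaCy Doc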
def setUpClass(cls):
    print("Set up CDB")
    cls.config = Config()
    cls.config.general['log_level'] = logging.INFO
    cls.cdb = CDB(config=cls.config)

    print("Set up Vocab")
    vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(vocab_path):
        tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
        with open(vocab_path, 'wb') as f:
            f.write(tmp.content)
    cls.vocab = Vocab.load(vocab_path)

    print("Set up NLP pipeline")
    cls.nlp = Pipe(tokenizer=spacy_split_all, config=cls.config)
    cls.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=cls.config),
                       name='skip_and_punct',
                       additional_fields=['is_punct'])
    cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab, config=cls.config, data_vocab=cls.vocab)
    cls.nlp.add_token_normalizer(spell_checker=cls.spell_checker, config=cls.config)
    cls.ner = NER(cls.cdb, cls.config)
    cls.nlp.add_ner(cls.ner)

    print("Set up Linker")
    cls.link = Linker(cls.cdb, cls.vocab, cls.config)
    cls.nlp.add_linker(cls.link)

    print("Set limits for tokens and uppercase")
    cls.config.ner['max_skip_tokens'] = 1
    cls.config.ner['upper_case_limit_len'] = 4
    cls.config.linking['disamb_length_limit'] = 2

    print("Add concepts")
    cls.cdb.add_names(cui='S-229004', names=prepare_name('Movar', cls.nlp, {}, cls.config))
    cls.cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', cls.nlp, {}, cls.config))
    cls.cdb.add_names(cui='S-229005', names=prepare_name('CDB', cls.nlp, {}, cls.config))

    print("Add test text")
    cls.text = "CDB - I was running and then Movar Virus attacked and CDb"
    cls.text_post_pipe = cls.nlp(cls.text)
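# --- Hedged sketch of a test using the fixture above (not from the source) ---
# Mirrors the assertions in the standalone snippet at the end of this section;
# the method name is an assumption, and the expected values hold only under
# the same limits (upper_case_limit_len=4 etc.) set in setUpClass.
def test_link_candidates(self):
    ents = self.text_post_pipe._.ents
    self.assertEqual(len(ents), 2)  # same count the snippet below asserts
    self.assertEqual(ents[0]._.link_candidates[0], 'S-229004')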
def setUp(self) -> None:
    self.config = Config()
    self.config.general['log_level'] = logging.INFO
    cdb = CDB(config=self.config)

    # A tokenizer-only pipeline, needed by the prepare_name calls below
    self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])

    # Add a couple of names
    cdb.add_names(cui='S-229004', names=prepare_name('Movar', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229005', names=prepare_name('CDB', self.nlp, {}, self.config))
    # Check
    #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

    self.vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(self.vocab_path):
        import requests
        tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
        with open(self.vocab_path, 'wb') as f:
            f.write(tmp.content)
    vocab = Vocab.load(self.vocab_path)

    # Make the full pipeline (replaces the tokenizer-only one above)
    self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])
    spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=self.config, data_vocab=vocab)
    self.nlp.add_token_normalizer(spell_checker=spell_checker, config=self.config)
    ner = NER(cdb, self.config)
    self.nlp.add_ner(ner)

    # Add Linker
    link = Linker(cdb, vocab, self.config)
    self.nlp.add_linker(link)

    self.text = "CDB - I was running and then Movar Virus attacked and CDb"
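# --- Hedged sketch of a matching tearDown (not from the source) ---
# Removing the downloaded vocab is an assumption; the real suite may keep
# ./tmp_vocab.dat cached so other test classes can reuse it between runs.
def tearDown(self) -> None:
    if os.path.exists(self.vocab_path):
        os.remove(self.vocab_path)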
def setUpClass(cls) -> None:
    cls.config = Config()
    cls.config.general['log_level'] = logging.INFO
    cls.cdb = CDB(config=cls.config)

    vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(vocab_path):
        tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
        with open(vocab_path, 'wb') as f:
            f.write(tmp.content)
    cls.vocab = Vocab.load(vocab_path)

    cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab, config=cls.config, data_vocab=cls.vocab)
    cls.ner = NER(cls.cdb, cls.config)
    cls.linker = Linker(cls.cdb, cls.vocab, cls.config)

    cls.config.ner['max_skip_tokens'] = 1
    cls.config.ner['upper_case_limit_len'] = 4
    cls.config.linking['disamb_length_limit'] = 2
    cls.meta_cat = MetaCAT()
    cls.text = "CDB - I was running and then Movar Virus attacked and CDb"

    # A fresh config is created for the Pipe under test; the components
    # above keep the original config instance with its adjusted limits.
    cls.config = Config()
    cls.config.general['log_level'] = logging.INFO
    cls.undertest = Pipe(tokenizer=spacy_split_all, config=cls.config)
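# --- Hedged sketch (not from the source): wiring the components above into
# the Pipe under test. The method name and final assertion are illustrative;
# each add_* call appears verbatim elsewhere in this section.
def test_assemble_pipeline(self):
    self.undertest.add_tagger(tagger=partial(tag_skip_and_punct, config=self.config),
                              name='skip_and_punct',
                              additional_fields=['is_punct'])
    self.undertest.add_token_normalizer(spell_checker=self.spell_checker, config=self.config)
    self.undertest.add_ner(self.ner)
    self.undertest.add_linker(self.linker)
    doc = self.undertest(self.text)
    self.assertIsNotNone(doc)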
vocab = Vocab.load(vocab_path)

# Make the pipeline
nlp = Pipe(tokenizer=spacy_split_all, config=config)
nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config),
               name='skip_and_punct',
               additional_fields=['is_punct'])
spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=config, data_vocab=vocab)
nlp.add_token_normalizer(spell_checker=spell_checker, config=config)
ner = NER(cdb, config)
nlp.add_ner(ner)

# Add Linker
link = Linker(cdb, vocab, config)
nlp.add_linker(link)

# Test limits for tokens and uppercase
config.ner['max_skip_tokens'] = 1
config.ner['upper_case_limit_len'] = 4
config.linking['disamb_length_limit'] = 2

text = "CDB - I was running and then Movar Virus attacked and CDb"
d = nlp(text)
assert len(d._.ents) == 2
assert d._.ents[0]._.link_candidates[0] == 'S-229004'

# Change limit for skip
config.ner['max_skip_tokens'] = 3
d = nlp(text)
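# --- Hedged follow-up (not from the source) ---
# A quick way to inspect what the relaxed skip limit changed; only attributes
# already used above (._.ents, ._.link_candidates) are assumed, and no fixed
# assertion is made since the output depends on the CDB contents.
for ent in d._.ents:
    print(ent.text, ent._.link_candidates)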