def test_sanity_no_misaligned_entities(CorpusType: Type[HunerDataset]):
    """Check that at most 10% of entity offsets disagree with token boundaries.

    For every document in the corpus, tokenize the text with SciSpaCy and
    verify that each entity's start/end character offset coincides with some
    token start/end offset.

    :param CorpusType: HunerDataset subclass (the class itself, not an
        instance) to instantiate and check.
    """
    # Bug fix: use the class's own name. ``CorpusType.__class__.__name__``
    # on a class object names the metaclass ("type"/"ABCMeta"), not the corpus.
    dataset_name = CorpusType.__name__.lower()
    base_path = flair.cache_root / "datasets"
    data_folder = base_path / dataset_name

    from flair.tokenization import SciSpacyTokenizer

    tokenizer = SciSpacyTokenizer()

    corpus = CorpusType()
    internal = corpus.to_internal(data_folder)
    for doc_id, doc_text in internal.documents.items():
        misaligned_starts = []
        misaligned_ends = []

        # Collect every character offset at which a token starts or ends.
        token_starts = set()
        token_ends = set()
        for token, token_start in zip(*tokenizer.tokenize(doc_text)):
            token_starts.add(token_start)
            token_ends.add(token_start + len(token))

        entities = internal.entities_per_document[doc_id]
        entity_starts = [i.char_span.start for i in entities]
        entity_ends = [i.char_span.stop for i in entities]

        # Bug fix: compare entity offsets against TOKEN boundaries. The
        # original tested membership in the entity lists themselves, which
        # is always true, so no misalignment was ever recorded.
        for start in entity_starts:
            if start not in token_starts:
                misaligned_starts.append(start)

        for end in entity_ends:
            if end not in token_ends:
                # Bug fix: record end misalignments in ``misaligned_ends``
                # (the original appended to ``misaligned_starts``, leaving
                # ``misaligned_ends`` permanently empty).
                misaligned_ends.append(end)

        # Tolerate up to 10% tokenizer/annotation disagreement per document.
        assert len(misaligned_starts) <= len(entities) // 10
        assert len(misaligned_ends) <= len(entities) // 10
def test_scispacy_tokenization():
    """SciSpaCy splits biomedical strings into the expected tokens/offsets."""
    from flair.tokenization import SciSpacyTokenizer

    tokenizer = SciSpacyTokenizer()

    # Expected (text, start_pos) pairs per input string.
    cases = {
        "HBeAg(+) patients": [
            ("HBeAg", 0),
            ("(", 5),
            ("+", 6),
            (")", 7),
            ("patients", 9),
        ],
        "HBeAg(+)/HBsAg(+)": [
            ("HBeAg", 0),
            ("(", 5),
            ("+", 6),
            (")", 7),
            ("/", 8),
            ("HBsAg", 9),
            ("(", 14),
            ("+", 15),
            (")", 16),
        ],
    }
    for text, expected in cases.items():
        tokens = tokenizer.tokenize(text)
        assert len(tokens) == len(expected)
        for token, (exp_text, exp_start) in zip(tokens, expected):
            assert token.text == exp_text
            assert token.start_pos == exp_start

    # The hyphen stays attached to the suffix it introduces.
    tokens = tokenizer.tokenize("doxorubicin (DOX)-induced")
    assert len(tokens) == 5
    assert [t.text for t in tokens] == ["doxorubicin", "(", "DOX", ")", "-induced"]
def test_create_sentence_using_scispacy_tokenizer():
    """Sentence built with SciSpacyTokenizer has the expected tokens,
    character offsets, and whitespace flags."""
    sentence: Sentence = Sentence(
        "Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron",
        use_tokenizer=SciSpacyTokenizer(),
    )

    # Expected (text, start_pos) for every token, in order.
    expected = [
        ("Spinal", 0),
        ("and", 7),
        ("bulbar", 11),
        ("muscular", 18),
        ("atrophy", 27),
        ("(", 35),
        ("SBMA", 36),
        (")", 40),
        ("is", 42),
        ("an", 45),
        ("inherited", 48),
        ("motor", 58),
        ("neuron", 64),
    ]
    assert len(sentence.tokens) == len(expected)
    for token, (text, start) in zip(sentence.tokens, expected):
        assert token.text == text
        assert token.start_pos == start

    # "atrophy" and ")" are followed by whitespace; "(" and "SBMA" are not.
    assert sentence.tokens[4].whitespace_after == True
    assert sentence.tokens[5].whitespace_after == False
    assert sentence.tokens[6].whitespace_after == False
    assert sentence.tokens[7].whitespace_after == True
from flair.data import Sentence
from flair.models import MultiTagger
from flair.tokenization import SciSpacyTokenizer

# Biomedical tokenizer instance; presumably reused below to build Sentence
# objects (its use is outside this visible span — confirm against the rest
# of the file).
scp = SciSpacyTokenizer()

# Load the pre-trained "hunflair" multi-task tagger. NOTE(review): this is a
# heavy, side-effecting module-level statement (reads/downloads model weights).
tagger = MultiTagger.load("hunflair")
# -*- coding: utf-8 -*- """ Created on Sat Aug 22 11:12:40 2020 @author: MAGESHWARAN """ # -----------------Template for using hunflair for BioMedical NER-------------- from flair.data import Sentence from flair.models import MultiTagger from flair.tokenization import SciSpacyTokenizer # make a sentence and tokenize with SciSpaCy sentence = Sentence( "Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome", use_tokenizer=SciSpacyTokenizer()) # load biomedical tagger tagger = MultiTagger.load("hunflair") # inference tagger.predict(sentence) # print sentence with predicted tags print(sentence.to_tagged_string()) # Entities may have multiple words, here's an easy way to get each annotated span for disease in sentence.get_spans("hunflair-disease"): print(disease) # Can be converted to dictionary, to get additional information