Exemplo n.º 1
0
    def __init__(
        self,
        nlp: Language = None,
        name: str = "scispacy_linker",
        candidate_generator: CandidateGenerator = None,
        resolve_abbreviations: bool = True,
        k: int = 30,
        threshold: float = 0.7,
        no_definition_threshold: float = 0.95,
        filter_for_definitions: bool = True,
        max_entities_per_mention: int = 5,
        linker_name: str = None,
    ):
        """Register the span extensions and store the linker configuration.

        Args:
            nlp: Accepted but not used by this constructor.
            name: Accepted but not used by this constructor.
            candidate_generator: Generator to use. When it is falsy (e.g. the
                default ``None``), a ``CandidateGenerator(name=linker_name)``
                is created instead.
            resolve_abbreviations: Stored unchanged on the instance.
            k: Stored unchanged on the instance.
            threshold: Stored unchanged on the instance.
            no_definition_threshold: Stored unchanged on the instance.
            filter_for_definitions: Stored unchanged on the instance.
            max_entities_per_mention: Stored unchanged on the instance.
            linker_name: Only consulted when a default candidate generator
                has to be constructed.
        """
        # TODO(Mark): Remove in scispacy v1.0.
        # force=True lets the extensions be re-registered without an error
        # when several linkers are constructed in the same process.
        for extension_name in ("umls_ents", "kb_ents"):
            Span.set_extension(extension_name, default=[], force=True)

        # Fall back to a generator built from `linker_name` when none is given.
        if not candidate_generator:
            candidate_generator = CandidateGenerator(name=linker_name)
        self.candidate_generator = candidate_generator
        self.kb = candidate_generator.kb

        # Plain configuration values, kept exactly as passed in.
        self.k = k
        self.threshold = threshold
        self.no_definition_threshold = no_definition_threshold
        self.resolve_abbreviations = resolve_abbreviations
        self.filter_for_definitions = filter_for_definitions
        self.max_entities_per_mention = max_entities_per_mention

        # TODO(Mark): Remove in scispacy v1.0. This is for backward compatibility only.
        self.umls = self.kb
    def test_candidate_generation(self):
        """Check candidates for an exact-match mention and for an unknown one."""
        knowledge_base = UmlsKnowledgeBase(
            "tests/fixtures/umls_test_fixture.json")

        # The index is created inside a throwaway directory; the returned
        # objects are used after that directory has been removed.
        with tempfile.TemporaryDirectory() as index_dir:
            concept_aliases, vectorizer, index = create_tfidf_ann_index(
                index_dir, knowledge_base)

        generator = CandidateGenerator(index, vectorizer, concept_aliases,
                                       knowledge_base)

        exact_mention = '(131)I-Macroaggregated Albumin'
        batches = generator([exact_mention], 10)
        first_batch = batches[0]

        found_ids = [candidate.concept_id for candidate in first_batch]
        assert found_ids == ['C0000005', 'C0000102', 'C0000084']

        # The mention matches an alias exactly, so the top candidate is
        # expected to carry that alias with a similarity of 1.0.
        expected_top = MentionCandidate(
            concept_id='C0000005',
            aliases=[exact_mention],
            similarities=[1.0])
        assert first_batch[0] == expected_top

        # A mention that vectorizes to all zeros must not crash and must
        # yield an empty candidate list.
        batches = generator(['ZZZZ'], 10)
        assert batches == [[]]
Exemplo n.º 3
0
    def test_empty_list(self):
        """An empty batch of mentions must produce an empty result list."""
        knowledge_base = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")

        # Build the index artifacts in a temporary directory that is cleaned
        # up as soon as they have been created.
        with tempfile.TemporaryDirectory() as index_dir:
            concept_aliases, vectorizer, index = create_tfidf_ann_index(
                index_dir, knowledge_base
            )

        generator = CandidateGenerator(
            index, vectorizer, concept_aliases, knowledge_base
        )

        no_mentions = []
        assert generator(no_mentions, 10) == []
Exemplo n.º 4
0
    def setUp(self):
        """Create the spaCy pipeline and a linker backed by the test fixtures."""
        super().setUp()
        self.nlp = spacy.load("en_core_web_sm")

        knowledge_base = UmlsKnowledgeBase(
            "tests/fixtures/umls_test_fixture.json",
            "tests/fixtures/test_umls_tree.tsv",
        )

        # The index artifacts are built in a temporary directory that is
        # discarded once they have been returned.
        with tempfile.TemporaryDirectory() as index_dir:
            concept_aliases, vectorizer, index = create_tfidf_ann_index(
                index_dir, knowledge_base
            )

        generator = CandidateGenerator(
            index, vectorizer, concept_aliases, knowledge_base
        )

        # Definition filtering is switched off for the linker under test.
        self.linker = UmlsEntityLinker(generator, filter_for_definitions=False)
Exemplo n.º 5
0
    def __init__(self,
                 candidate_generator: CandidateGenerator = None,
                 resolve_abbreviations: bool = True,
                 k: int = 30,
                 threshold: float = 0.7,
                 filter_for_definitions: bool = True,
                 max_entities_per_mention: int = 5):
        """Register the ``umls_ents`` span extension and store the settings.

        Args:
            candidate_generator: Generator to use. When it is falsy (e.g. the
                default ``None``), a ``CandidateGenerator()`` with default
                arguments is created instead.
            resolve_abbreviations: Stored unchanged on the instance.
            k: Stored unchanged on the instance.
            threshold: Stored unchanged on the instance.
            filter_for_definitions: Stored unchanged on the instance.
            max_entities_per_mention: Stored unchanged on the instance.
        """
        # force=True allows re-registration if the extension already exists.
        Span.set_extension("umls_ents", default=[], force=True)

        # Fall back to a default-constructed generator when none is supplied.
        if not candidate_generator:
            candidate_generator = CandidateGenerator()
        self.candidate_generator = candidate_generator
        self.umls = candidate_generator.umls

        # Plain configuration values, kept exactly as passed in.
        self.k = k
        self.threshold = threshold
        self.resolve_abbreviations = resolve_abbreviations
        self.filter_for_definitions = filter_for_definitions
        self.max_entities_per_mention = max_entities_per_mention
Exemplo n.º 6
0
def init_umls_nlp_linker():
    """Build a ``UmlsEntityLinker`` from saved artifacts and add it to ``nlp``.

    Every artifact file name is prefixed with ``base_dir``, which is the empty
    string here. The linker is appended to the pipeline of the module-level
    ``nlp`` object, which must already exist when this function is called.

    Returns:
        The ``UmlsEntityLinker`` instance that was added to the pipeline.
    """
    base_dir = ''
    tfidf_path = base_dir + 'tfidf_vectors_sparse.npz'
    ann_path = base_dir + 'nmslib_index.bin'
    ann_index = load_approximate_nearest_neighbours_index(
        tfidf_vectors_path=tfidf_path, ann_index_path=ann_path)
    vec = joblib.load(cached_path(base_dir + 'tfidf_vectorizer.joblib'))

    # Use a context manager so the alias file is closed deterministically
    # instead of leaking the handle until garbage collection.
    with open(cached_path(base_dir + 'concept_aliases.json')) as aliases_file:
        ann_concept = json.load(aliases_file)

    umlsknowlegebase = UmlsKnowledgeBase(
        file_path=base_dir + 'umls_2017_aa_cat0129.json',
        types_file_path=base_dir + 'umls_semantic_type_tree.tsv')
    cg = CandidateGenerator(ann_index=ann_index,
                            tfidf_vectorizer=vec,
                            ann_concept_aliases_list=ann_concept,
                            umls=umlsknowlegebase)
    # The linker is configured with max_entities_per_mention=1.
    linker = UmlsEntityLinker(candidate_generator=cg,
                              max_entities_per_mention=1)
    nlp.add_pipe(linker)
    return linker
Exemplo n.º 7
0
import os
from multiprocessing import Pool
import multiprocessing as multi
import pickle
import scispacy
from scispacy.linking import EntityLinker
from spacy.symbols import ORTH
import time
import re
from spacy.language import Language
import pdb
import copy
from tqdm import tqdm
from scispacy.candidate_generation import CandidateGenerator

# Shared candidate generator, constructed with name='mesh'.
# NOTE: this runs at import time, so importing the module builds the generator.
MeshCandidateGenrator = CandidateGenerator(name='mesh')
# Knowledge base object exposed by the generator's `kb` attribute.
KB=MeshCandidateGenrator.kb
# Passed as the second argument to MeshCandidateGenrator in
# candidate_dui_generator.
K=100
# NOTE(review): the settings below appear to mirror linker constructor options
# of the same names; where they are consumed is not visible in this part of
# the file -- confirm against the code that reads them.
Resolve_abbreviations = True
Threshold = 0.3
No_definition_threshold = 0.95
Filter_for_definitions = True
Max_entities_per_mention  = 30

def candidate_dui_generator(mention_strings):
    batch_candidates = MeshCandidateGenrator(mention_strings, K)
    batched_sorted_candidates = list()
    for candidates in batch_candidates:
        predicted = []
        for cand in candidates:
            score = max(cand.similarities)