def test_candidate_generation(self):
        """Check the candidates generated for mentions against the UMLS test fixture.

        Builds the tf-idf / ANN index from the fixture knowledge base inside a
        temporary directory, then verifies the candidates returned for a
        mention that is spelled exactly like a fixture alias and for a mention
        that is expected to yield no candidates.
        """
        knowledge_base = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")
        with tempfile.TemporaryDirectory() as index_dir:
            aliases, vectorizer, index = create_tfidf_ann_index(index_dir, knowledge_base)

        generator = CandidateGenerator(index, vectorizer, aliases, knowledge_base)

        mention = '(131)I-Macroaggregated Albumin'
        candidates = generator([mention], 10)

        # One result list per input mention; compare concept ids in returned order.
        returned_ids = [candidate.concept_id for candidate in candidates[0]]
        assert returned_ids == ['C0000005', 'C0000102', 'C0000084']

        # The mention is an exact match for an alias, so the first candidate is
        # expected to carry that alias with a similarity of 1.0.
        expected_top = MentionCandidate(
            concept_id='C0000005',
            aliases=[mention],
            similarities=[1.0])
        assert candidates[0][0] == expected_top

        # A mention producing a zero vector must not crash; it should simply
        # come back with no candidates.
        assert generator(['ZZZZ'], 10) == [[]]
Exemplo n.º 2
0
    def setUp(self):
        """Prepare the spaCy pipeline and a UMLS entity linker for each test.

        The linker is backed by a candidate generator whose index is built
        from the fixture knowledge base (concepts JSON plus the type tree TSV).
        """
        super().setUp()
        self.nlp = spacy.load("en_core_web_sm")

        knowledge_base = UmlsKnowledgeBase(
            "tests/fixtures/umls_test_fixture.json",
            "tests/fixtures/test_umls_tree.tsv",
        )
        with tempfile.TemporaryDirectory() as index_dir:
            aliases, vectorizer, index = create_tfidf_ann_index(index_dir, knowledge_base)

        generator = CandidateGenerator(index, vectorizer, aliases, knowledge_base)
        self.linker = UmlsEntityLinker(generator, filter_for_definitions=False)
Exemplo n.º 3
0
    def test_empty_list(self):
        """Calling the candidate generator with no mentions returns an empty list."""
        knowledge_base = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")
        with tempfile.TemporaryDirectory() as index_dir:
            aliases, vectorizer, index = create_tfidf_ann_index(index_dir, knowledge_base)

        generator = CandidateGenerator(index, vectorizer, aliases, knowledge_base)

        # No mentions in, no result lists out.
        assert generator([], 10) == []
Exemplo n.º 4
0
    def test_create_index(self):
        """Check the sizes and vectorizer settings produced by index creation."""
        knowledge_base = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")
        with tempfile.TemporaryDirectory() as index_dir:
            aliases, vectorizer, index = create_tfidf_ann_index(index_dir, knowledge_base)

        # Number of deduplicated aliases + canonical ids; the alias list and
        # the ANN index are expected to have the same size.
        expected_size = 93
        assert len(aliases) == expected_size
        assert len(index) == expected_size

        params = vectorizer.get_params()
        expected_params = (
            ("analyzer", "char_wb"),
            ("min_df", 10),
            ("ngram_range", (3, 3)),
        )
        for key, expected_value in expected_params:
            assert params[key] == expected_value
Exemplo n.º 5
0
def main(kb_path: str, output_path: str):
    """Build the tf-idf / ANN index for a knowledge base.

    Args:
        kb_path: Path passed to ``KnowledgeBase`` to load the knowledge base.
        output_path: Directory handed to ``create_tfidf_ann_index``; it is
            created first if it does not already exist.
    """
    # Ensure the output directory exists before the knowledge base is loaded.
    os.makedirs(output_path, exist_ok=True)
    create_tfidf_ann_index(output_path, KnowledgeBase(kb_path))