예제 #1
0
def train_ner(output_dir: str,
              data_path: str,
              run_test: bool = None,
              model: str = None,
              n_iter: int = 100,
              label_granularity: int = None):
    """Train an NER model, or evaluate an existing one, on the MedMentions data.

    Args:
        output_dir: Directory that receives the outputs; created if missing.
            In test mode the dev/test metrics JSON files are written here,
            otherwise it is handed to ``train``.
        data_path: Location of the data, forwarded to
            ``read_full_med_mentions``.
        run_test: When truthy, load ``model`` with spaCy and evaluate it on
            the dev and test splits instead of training.
        model: Model identifier; loaded with ``spacy.load`` in test mode,
            forwarded to ``train`` otherwise.
        n_iter: Forwarded to ``train``.
        label_granularity: If given, the level of the UMLS semantic type tree
            at which type ids are collapsed into a label mapping. Level ``0``
            additionally requests span-only data. ``None`` disables the
            mapping.
    """
    if label_granularity is not None:
        umls_tree = construct_umls_tree_from_tsv(
            "data/umls_semantic_type_tree.tsv")
        label_mapping = umls_tree.get_collapsed_type_id_map_at_level(
            label_granularity)
    else:
        label_mapping = None

    # Bind span_only on every path. Previously it was only assigned for
    # granularity None or 0, so any other level raised UnboundLocalError
    # at the read_full_med_mentions call below.
    span_only = label_granularity == 0

    train_data, dev_data, test_data = read_full_med_mentions(
        data_path, label_mapping, span_only)
    os.makedirs(output_dir, exist_ok=True)
    if run_test:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        evaluate_ner(nlp,
                     dev_data,
                     dump_path=os.path.join(output_dir, "dev_metrics.json"))
        evaluate_ner(nlp,
                     test_data,
                     dump_path=os.path.join(output_dir, "test_metrics.json"))
    else:
        train(model, train_data, dev_data, output_dir, n_iter)
예제 #2
0
    def __init__(
        self,
        file_path: str = DEFAULT_UMLS_PATH,
        types_file_path: str = DEFAULT_UMLS_TYPES_PATH,
    ):
        """Initialise the parent with ``file_path`` and attach the type tree.

        Args:
            file_path: Forwarded unchanged to the parent class initialiser.
            types_file_path: TSV file from which the UMLS semantic type tree
                is built.
        """
        super().__init__(file_path)

        # Build the tree first, then publish it on the instance.
        type_tree: UmlsSemanticTypeTree = construct_umls_tree_from_tsv(
            types_file_path)
        self.semantic_type_tree = type_tree
예제 #3
0
    def __init__(self,
                 file_path: str = DEFAULT_UMLS_PATH,
                 types_file_path: str = DEFAULT_UMLS_TYPES_PATH):
        """Load the concept list and build the alias and entity lookups.

        Args:
            file_path: Location of the JSON concept list; resolved through
                ``cached_path`` before being opened.
            types_file_path: TSV file from which the UMLS semantic type tree
                is built.

        Each concept record is expected to provide at least the keys
        ``aliases``, ``canonical_name`` and ``concept_id``; the whole record
        is passed as keyword arguments to ``UmlsEntity``.
        """
        # Use a context manager so the file handle is closed as soon as the
        # JSON is parsed, instead of being left for the garbage collector.
        with open(cached_path(file_path)) as concepts_file:
            raw = json.load(concepts_file)

        alias_to_cuis: Dict[str, Set[str]] = defaultdict(set)
        self.cui_to_entity: Dict[str, UmlsEntity] = {}

        for concept in raw:
            # The canonical name counts as an alias too; the set removes
            # duplicates within a single concept.
            unique_aliases = set(concept["aliases"])
            unique_aliases.add(concept["canonical_name"])
            for alias in unique_aliases:
                alias_to_cuis[alias].add(concept["concept_id"])
            self.cui_to_entity[concept["concept_id"]] = UmlsEntity(**concept)

        # Copy into a plain dict so later lookups of unknown aliases raise
        # KeyError rather than silently inserting empty sets.
        self.alias_to_cuis: Dict[str, Set[str]] = {**alias_to_cuis}
        self.semantic_type_tree: UmlsSemanticTypeTree = construct_umls_tree_from_tsv(
            types_file_path)
예제 #4
0
"""
utils for reading MedMentions original format
adapted from scispacy: https://github.com/allenai/scispacy
"""

from typing import NamedTuple, List, Iterator, Dict, Tuple
import tarfile
import atexit
import os
import shutil
import tempfile

from scispacy.file_cache import cached_path

from scispacy.umls_semantic_type_tree import construct_umls_tree_from_tsv
# Built once, as a side effect of importing this module. The path is relative,
# so it is presumably resolved against the process's working directory —
# confirm how construct_umls_tree_from_tsv opens it.
umls_tree = construct_umls_tree_from_tsv("data/umls_semantic_type_tree.tsv")


class MedMentionEntity(NamedTuple):
    """A single annotated entity mention, stored as an immutable record."""

    # Span boundaries of the mention (presumably offsets into the example
    # text; inclusive/exclusive convention not visible here — confirm).
    start: int
    end: int
    # Surface string of the mention.
    mention_text: str
    # Type label attached to the mention.
    mention_type: str
    # UMLS identifier the mention is linked to.
    umls_id: str

class MedMentionExample(NamedTuple):
    """One MedMentions document together with its entity annotations."""

    # Title and abstract of the document, kept separately.
    title: str
    abstract: str
    # Document text (presumably title and abstract combined — confirm
    # against the reader that builds these records).
    text: str
    # PubMed identifier of the document.
    pubmed_id: str
    # Entity mentions annotated in this document.
    entities: List[MedMentionEntity]
예제 #5
0
"""
This script expects you've followed the instructions in https://github.com/chb/py-umls to install UMLS.
"""

import json
from collections import Counter
import sqlite3

from scispacy.umls_semantic_type_tree import construct_umls_tree_from_tsv
# Semantic type tree, built from a TSV file at script start.
umls_tree = construct_umls_tree_from_tsv('data/umls_semantic_type_tree.tsv')  # change to your location

# Connection and cursor for the local UMLS SQLite database (see the module
# docstring for how to create it). The path is machine-specific.
umls_db_path = '/home/dan/projects/py-umls/databases/umls.db'  # change to your location
conn = sqlite3.connect(umls_db_path)
c = conn.cursor()

# Module-level accumulators; how they are filled is not visible in this part
# of the script.
cui_data = {}
source_counter = Counter()
def_mismatches = set()


# Source vocabulary abbreviations of the ST21pv subset (as named by the
# variable); written as a set literal for fast membership tests.
st21pv_sources = {
    'CPT', 'FMA', 'GO', 'HGNC', 'HPO', 'ICD10',
    'ICD10CM', 'ICD9CM', 'MDR', 'MSH', 'MTH',
    'NCBI', 'NCI', 'NDDF', 'NDFRT', 'OMIM',
    'RXNORM', 'SNOMEDCT_US',
}


# Semantic type ids (T-codes) of the ST21pv subset.
st21pv_types = {
    'T005', 'T007', 'T017', 'T022', 'T031', 'T033', 'T037',
    'T038', 'T058', 'T062', 'T074', 'T082', 'T091', 'T092',
    'T097', 'T098', 'T103', 'T168', 'T170', 'T201', 'T204',
}

 def setUp(self):
     """Run the parent set-up, then build the tree from the test fixture."""
     super().setUp()
     fixture_path = "tests/fixtures/test_umls_tree.tsv"
     self.tree = construct_umls_tree_from_tsv(fixture_path)