def test_load_disable(self):
    nlp = medspacy.load(disable=["context"])
    expected_pipe_names = {
        "medspacy_pyrush",
        "medspacy_target_matcher",
    }
    assert set(nlp.pipe_names) == expected_pipe_names
def load():
    import medspacy
    from os import path

    # RESOURCES_DIR, CONTEXT_ATTRS, and DocumentClassifier are expected to be
    # defined at module level.
    nlp = medspacy.load(enable=["sentencizer", "tokenizer"])

    # Add components
    from medspacy.target_matcher import TargetMatcher, TargetRule

    target_matcher = TargetMatcher(nlp)
    target_filepath = path.join(RESOURCES_DIR, "target_rules.json")
    target_rules = TargetRule.from_json(target_filepath)
    target_matcher.add(target_rules)
    nlp.add_pipe(target_matcher)

    from medspacy.context import ConTextComponent, ConTextRule

    context_filepath = path.join(RESOURCES_DIR, "context_rules.json")
    context = ConTextComponent(nlp, rules=None, add_attrs=CONTEXT_ATTRS)
    context_rules = ConTextRule.from_json(context_filepath)
    context.add(context_rules)
    nlp.add_pipe(context)

    from medspacy.section_detection import Sectionizer

    # TODO: Add radiology section rules
    sectionizer = Sectionizer(nlp)
    nlp.add_pipe(sectionizer)

    clf = DocumentClassifier(nlp)
    nlp.add_pipe(clf)
    return nlp
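# Hypothetical usage sketch, not part of the original module: applying the
# pipeline returned by load() to a note and inspecting the ConText output.
# The "is_negated" attribute is an assumption here -- the actual attribute
# names depend on what CONTEXT_ATTRS maps to.
if __name__ == "__main__":
    nlp = load()
    doc = nlp("There is no evidence of pneumothorax.")
    for ent in doc.ents:
        print(ent.text, ent.label_, ent._.is_negated)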
def test_default_load(self):
    nlp = medspacy.load()
    expected_pipe_names = {
        "medspacy_pyrush",
        "medspacy_context",
        "medspacy_target_matcher",
    }
    assert set(nlp.pipe_names) == expected_pipe_names
def test_default_load(self):
    nlp = medspacy.load()
    expected_pipe_names = {
        "sentencizer",
        "context",
        "target_matcher",
    }
    assert set(nlp.pipe_names) == expected_pipe_names
def test_load_all_components(self):
    full_pipe_names = [
        "sentencizer",
        "target_matcher",
        "context",
        "sectionizer",
        "postprocessor",
    ]

    nlp = medspacy.load(enable="all")
    assert nlp.pipe_names == full_pipe_names
    assert isinstance(nlp.tokenizer, medspacy.preprocess.Preprocessor)
def test_disable_medspacy_tokenizer(self):
    default_tokenizer = spacy.blank("en").tokenizer
    custom_tokenizer = medspacy.load(disable=["tokenizer"]).tokenizer

    text = r"Pt c\o n;v;d h\o chf+cp n/v/d"

    default_doc = default_tokenizer(text)
    medspacy_doc = custom_tokenizer(text)

    assert [token.text for token in default_doc] == [token.text for token in medspacy_doc]
def test_load_disable(self):
    nlp = medspacy.load(disable=["tagger", "parser"])
    expected_pipe_names = {
        "sentencizer",
        "target_matcher",
        "context",
        "sectionizer",
        "postprocessor",
    }
    assert set(nlp.pipe_names) == expected_pipe_names
    assert isinstance(nlp.tokenizer, nlp_preprocessor.Preprocessor)
def test_load_all_components(self):
    full_pipe_names = [
        "medspacy_pyrush",
        "medspacy_target_matcher",
        "medspacy_context",
        "medspacy_sectionizer",
        "medspacy_postprocessor",
        "medspacy_doc_consumer",
    ]

    nlp = medspacy.load(enable="all")
    assert nlp.pipe_names == full_pipe_names
    assert isinstance(nlp.tokenizer, medspacy.preprocess.Preprocessor)
def test_medspacy_tokenizer_numerics(self):
    custom_tokenizer = medspacy.load(enable=["medspacy_tokenizer"]).tokenizer

    text = r"1.5 mg"

    medspacy_doc = custom_tokenizer(text)
    tokens = [token.text for token in medspacy_doc]

    assert len(tokens) == 2

    # Check that some expected token boundaries are generated
    joined_tokens = " ".join(tokens)
    assert "1.5" in joined_tokens
    assert "1 . 5" not in joined_tokens
def test_medspacy_tokenizer(self):
    default_tokenizer = spacy.blank("en").tokenizer
    custom_tokenizer = medspacy.load(enable=["tokenizer"]).tokenizer

    text = r"Pt c\o n;v;d h\o chf+cp n/v/d"

    default_doc = default_tokenizer(text)
    medspacy_doc = custom_tokenizer(text)

    assert [token.text for token in default_doc] != [token.text for token in medspacy_doc]

    # Check that some expected token boundaries are generated
    joined_tokens = " ".join([token.text for token in medspacy_doc])
    assert "c \\ o" in joined_tokens
    assert "n / v / d" in joined_tokens
    assert "chf + cp" in joined_tokens
def test_medspacy_tokenizer_uppercase(self):
    custom_tokenizer = medspacy.load(enable=["medspacy_tokenizer"]).tokenizer

    # Issue 13: Ensure that uppercase tokens are not tokenized as individual characters
    # https://github.com/medspacy/medspacy/issues/13
    text = r"DO NOT BREAK ME UP"

    medspacy_doc = custom_tokenizer(text)
    tokens = [token.text for token in medspacy_doc]

    assert len(tokens) == 5

    # Check that some expected token boundaries are generated
    joined_tokens = " ".join(tokens)
    assert "DO NOT BREAK ME UP" in joined_tokens
    assert "B R E A K" not in joined_tokens
def test_quickumls_extractions(self):
    """Test that extractions can be performed using the very small
    (<100 concept) UMLS sample resources."""
    # let's make sure that this pipe has been initialized
    # At least for MacOS and Linux which are currently supported...
    if not TestQuickUMLS.can_test_quickumls():
        return

    # allow default QuickUMLS (very small sample data) to be loaded
    nlp = medspacy.load(enable=["quickumls"])
    quickumls = nlp.get_pipe("QuickUMLS matcher")

    # TODO -- Consider moving this and other extraction tests to separate tests from loading
    doc = nlp('Decreased dipalmitoyllecithin content found in lung specimens')

    assert len(doc.ents) == 1

    entity_spans = [ent.text for ent in doc.ents]
    assert 'dipalmitoyllecithin' in entity_spans
def test_initialize_pipeline(self):
    """Test that a pipeline with a QuickUMLS component can be loaded in medspacy.

    NOTE: Currently this is only available by default on Linux and MacOS;
    Windows requires additional manual steps, but this will test the
    capability on Windows if those steps are followed.
    """
    # let's make sure that this pipe has been initialized
    # At least for MacOS and Linux which are currently supported...
    if not TestQuickUMLS.can_test_quickumls():
        return

    # allow default QuickUMLS (very small sample data) to be loaded
    nlp = medspacy.load(enable=["quickumls"])
    assert nlp

    quickumls = nlp.get_pipe("QuickUMLS matcher")
    assert quickumls
    # this is a member of the QuickUMLS algorithm inside the component
    assert quickumls.quickumls
    # Check that the simstring database exists
    assert quickumls.quickumls.ss_db
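# The QuickUMLS tests above call TestQuickUMLS.can_test_quickumls(), which is
# not shown in this section. A minimal sketch of such a guard, assuming it
# only needs a platform check (the QuickUMLS sample resources load by default
# only on Linux and MacOS):
import sys

class TestQuickUMLS:
    @staticmethod
    def can_test_quickumls():
        # Hypothetical platform guard; the real helper may check more.
        return sys.platform.startswith("linux") or sys.platform == "darwin"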
def test_load_lang_model(self):
    nlp = spacy.load("en_core_web_sm", disable={"ner"})
    nlp = medspacy.load(nlp)
    assert {"tagger", "parser"}.intersection(set(nlp.pipe_names))
def test_not_load_rules(self):
    nlp = medspacy.load(load_rules=False)
    context = nlp.get_pipe("medspacy_context")
    assert not context.rules
def test_nlp(self):
    nlp = medspacy.load()
    assert nlp("This is a sentence. So is this.")
def test_load_enable(self):
    nlp = medspacy.load(enable={"medspacy_target_matcher", "medspacy_sectionizer"})
    assert len(nlp.pipeline) == 2
    assert set(nlp.pipe_names) == {"medspacy_target_matcher", "medspacy_sectionizer"}
def test_not_load_rules(self):
    nlp = medspacy.load(load_rules=False)
    context = nlp.get_pipe("context")
    assert not context.item_data
    sectionizer = nlp.get_pipe("sectionizer")
    assert not sectionizer.patterns
def test_load_de(self):
    assert medspacy.load("de_core_news_sm")
def test_load_enable(self):
    nlp = medspacy.load(enable=["target_matcher"])
    assert len(nlp.pipeline) == 1
    assert "target_matcher" in nlp.pipe_names
    assert isinstance(nlp.tokenizer, spacy.tokenizer.Tokenizer)
def get_mimic_connection():
    conn = pymysql.connect(host=MIMICHOST,
                           port=3306,
                           user=sys.argv[1],
                           passwd=sys.argv[2],
                           db="mimic2")
    return conn

print("creating db connection")
conn = get_mimic_connection()
cursor = conn.cursor()

print("loading i2b2 language model")
nlp = medspacy.load(
    "en_info_3700_i2b2_2012",
    disable=[
        "tagger",
        "parser",
        "ner",
        "target_matcher",
        "sectionizer",
        "context",
        "postprocessor",
    ],
)
print(nlp.pipeline)

cursor.execute(
    """SELECT text FROM noteevents
    WHERE category='RADIOLOGY_REPORT' LIMIT %d""" % NUM_REPORTS
)
r = [r[0] for r in cursor.fetchall()]
print(len(r))
r = [rr for rr in r if rr]
print(len(r))

docs = nlp.pipe(r, n_process=6, batch_size=64)
print("processed tokenization")

sents = [
    [utils.simple_preprocess(line.string) for line in doc.sents]
    for doc in docs
]
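# Hypothetical continuation, not in the original fragment: sents is nested as
# one list of sentences per document, while gensim models expect a flat
# iterable of token lists, so a flattening step would likely come next.
flat_sents = [sent for doc_sents in sents for sent in doc_sents]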
def test_load_rules(self):
    nlp = medspacy.load(load_rules=True)
    context = nlp.get_pipe("context")
    assert context.item_data
import os
import sqlite3
import tempfile

import pytest

import medspacy
from medspacy.io import DocConsumer
from medspacy.io.db_connect import DbConnect
from medspacy.target_matcher import TargetRule

tmpdirname = tempfile.TemporaryDirectory()
db = os.path.join(tmpdirname.name, "test")

nlp = medspacy.load(enable=["sentencizer", "target_matcher", "context", "sectionizer"])
nlp.get_pipe("target_matcher").add(TargetRule("pneumonia", "CONDITION"))
doc = nlp("There is no evidence of pneumonia.")

doc_consumer = DocConsumer(nlp)
doc_consumer(doc)


class TestDbWriter:
    def test_init_from_sqlite3_conn_defaults(self):
        """Test writing with default values for ent attributes."""
        sq_conn = sqlite3.connect(db)
        cursor = sq_conn.cursor()
        db_conn = DbConnect(conn=sq_conn)
        from medspacy.io.db_writer import DbWriter
def get_mimic_connection():
    conn = pymysql.connect(host=MIMICHOST,
                           port=3306,
                           user=sys.argv[1],
                           passwd=sys.argv[2],
                           db="mimic2")
    return conn

print("creating db connection")
conn = get_mimic_connection()
cursor = conn.cursor()

print("loading i2b2 language model")
nlp = medspacy.load("en_info_3700_i2b2_2012", disable=["tagger", "parser", "ner"])


class MyCorpus(object):
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        cursor.execute("""SELECT text FROM noteevents""")
        while True:
            r = cursor.fetchone()
            if not r:
                return
            r = r[0]
            for line in nlp(r).sents:
                # Yield one preprocessed token list per spaCy sentence
                yield utils.simple_preprocess(line.string)
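# Hypothetical usage sketch, not in the original script: MyCorpus streams
# token lists in the shape gensim's Word2Vec expects, so training might look
# like this. The hyperparameters and output path are illustrative assumptions.
from gensim.models import Word2Vec

model = Word2Vec(sentences=MyCorpus(), min_count=5, workers=4)
model.save("mimic_w2v.model")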