def test_parse_doc():
    """Run ``parser`` over a one-sentence document and over an empty one."""
    sentence_text = 'No pneumothorax.'
    expected_tree = '(S1 (S (S (NP (DT No) (NN pneumothorax))) (. .)))'

    parsed = parser(text_to_bioc([sentence_text], type='d/p/s'))
    first_sentence = parsed.passages[0].sentences[0]
    assert first_sentence.infons['parse tree'] == expected_tree

    # An empty sentence is expected to end up with a ``None`` parse tree.
    parsed_empty = parser(text_to_bioc([''], type='d/p/s'))
    assert parsed_empty.passages[0].sentences[0].infons['parse tree'] is None
def test_parse():
    """Check the parse tree of a sentence and the dependencies converted from it."""
    converter = NegBioPtb2DepConverter(representation='CCprocessed', universal=True)

    # neg(evidence-2, no-1)
    # !root(ROOT-0, evidence-2)
    # !case(infiltrate-5, of-3)
    # amod(infiltrate-5, focal-4)
    # nmod:of(evidence-2, infiltrate-5)
    # nmod:of(evidence-2, effusion-7)
    # conj:or(infiltrate-5, effusion-7)
    # cc(infiltrate-5, or-8)
    # nmod:of(evidence-2, pneumothorax-9)
    # conj:or(infiltrate-5, pneumothorax-9)
    sentence_text = 'no evidence of focal infiltrate, effusion or pneumothorax.'
    expected_tree = (
        '(S1 (S (S (NP (NP (DT no) (NN evidence)) (PP (IN of) (NP (NP (JJ focal)'
        ' (NN infiltrate)) (, ,) (NP (NN effusion)) (CC or) (NP (NN pneumothorax)))))) (. .)))'
    )

    # The parser's string form must match the expected bracketed tree.
    assert str(parser.parse(sentence_text)) == expected_tree

    document = text_to_bioc([sentence_text], type='d/p/s')
    sentence = document.passages[0].sentences[0]
    sentence.infons['parse tree'] = expected_tree
    converter(document)

    # Token annotations are compared position by position.
    expected_tokens = 'no evidence of focal infiltrate , effusion or pneumothorax .'.split()
    for index, token in enumerate(expected_tokens):
        assert sentence.annotations[index].text == token

    # Dependency labels of the relations are compared position by position.
    expected_deps = 'neg case amod nmod:of punct nmod:of conj:or cc nmod:of conj:or punct'.split()
    for index, dependency in enumerate(expected_deps):
        assert sentence.relations[index].infons['dependency'] == dependency
def test_extend():
    """Exercise ``_extend`` with a detected finding plus two hand-made annotations."""
    document = text_to_bioc(['findings: no pneumothorax.'], type='d/p/s')

    def attach(text, offset, length):
        # Add a passage-level annotation covering ``length`` chars at ``offset``.
        annotation = bioc.BioCAnnotation()
        annotation.text = text
        annotation.add_location(bioc.BioCLocation(offset, length))
        document.passages[0].add_annotation(annotation)

    attach('pneumothorax', 13, 12)
    detector(document)

    # Fake annotations: one lying inside the 'pneumothorax' span, and one
    # whose offset is past the end of the 26-character text.
    attach('eumothor', 15, 8)
    attach('foo', 27, 3)

    _extend(document, 'negation')
    assert document.passages[0].annotations[1].infons['negation'] == 'True'
    assert 'negation' not in document.passages[0].annotations[2].infons

    # Giving the first and third annotations the same CUI must still leave
    # the third one without a 'negation' infon.
    document.passages[0].annotations[0].infons['CUI'] = 'xxx'
    document.passages[0].annotations[2].infons['CUI'] = 'xxx'
    _extend(document, 'negation')
    assert 'negation' not in document.passages[0].annotations[2].infons
def _get_document(text, tree, sen_ann_index):
    """Build a converted document with one sentence annotation added to its passage.

    Args:
        text: Sentence text handed to ``text_to_bioc``.
        tree: Parse tree stored under the sentence's 'parse tree' infon.
        sen_ann_index: Index of the sentence annotation to add to the passage.

    Returns:
        The document after running a default ``NegBioPtb2DepConverter`` on it.
    """
    document = text_to_bioc([text], type='d/p/s')
    document.passages[0].sentences[0].infons['parse tree'] = tree

    converter = NegBioPtb2DepConverter()
    converter(document)

    passage = document.passages[0]
    chosen = passage.sentences[0].annotations[sen_ann_index]
    passage.add_annotation(chosen)
    return document
def test_normalize():
    """Check ``normalize`` on one passage, a ``None`` text, many passages and none."""
    raw = '[**Hospital 9**] MEDICAL CONDITION'
    expected = ' MEDICAL CONDITION'

    single = normalize(text_to_bioc([raw], 'd/p'))
    assert single.passages[0].text == expected

    # A passage whose text is None must not make normalize fail.
    single.passages[0].text = None
    normalize(single)

    # With more than one passage the text is expected to come back unchanged.
    multiple = normalize(text_to_bioc([raw, raw], 'd/p'))
    assert multiple.passages[0].text == raw

    # A document without any passages must be accepted too.
    del multiple.passages[:]
    normalize(multiple)
def test_convert_doc2(self):
    """Run the converter on a long sentence; only the absence of an error is checked."""
    sentence_text = "Can't exclude 1 cm lesion in or near lower esophagus (for example series 2 image 91) BOOKMARK (1.1 cm) appearing or better demonstrated."
    # NOTE(review): the tag "test_convert_doc2CC" in the tree below looks like
    # an accidental edit of a "CC" tag -- confirm whether it is intentional
    # before changing it; the literal is kept exactly as found.
    parse_tree = "(S1 (S (S (VP (MD Can) (RB n't) (VP (VB exclude) (NP (NP (ADJP (CD 1) (NN cm)) (NN lesion)) (PP (IN in) (NP (NP (NP (test_convert_doc2CC or) (JJ near) (NP (NP (JJR lower) (NN esophagus)) (PRN (-LRB- -LRB-) (PP (IN for) (NP (NN example))) (NP (NN series) (CD 2) (NN image) (CD 91)) (-RRB- -RRB-))) (NN BOOKMARK)) (PRN (-LRB- -LRB-) (NP (CD 1.1) (NN cm)) (-RRB- -RRB-))) (VP (VBG appearing) (ADVP (CC or) (ADVP (RBR better))) (VP (VBN demonstrated))))))))) (. .)))"

    document = text_to_bioc([sentence_text], type='d/p/s')
    document.passages[0].sentences[0].infons['parse tree'] = parse_tree

    converter = NegBioPtb2DepConverter()
    converter(document)
def create_collections():
    """Dump ten BioC collections into a new temporary directory.

    Each collection is built from the text 'No pneumothorax.' and written to
    ``<tmpdir>/<i>.xml`` for ``i`` in ``0..9``.

    Returns:
        The list of written file paths, in creation order.
    """
    output_dir = tempfile.mkdtemp()
    written_paths = []
    for index in range(10):
        collection = text_to_bioc(['No pneumothorax.'], 'c/d/p')
        path = os.path.join(output_dir, '{}.xml'.format(index))
        with open(path, 'w') as handle:
            bioc.dump(collection, handle)
        written_paths.append(path)
    return written_paths
def test_neg_regex():
    """Check the regex negation test and the detector on the same sentence."""
    sentence_text = 'findings: no pneumothorax.'
    assert is_neg_regex(sentence_text)

    document = text_to_bioc([sentence_text], type='d/p/s')

    # Annotate the word 'pneumothorax' (offset 13, length 12) on the passage.
    finding = bioc.BioCAnnotation()
    finding.text = 'pneumothorax'
    finding.add_location(bioc.BioCLocation(13, 12))
    document.passages[0].add_annotation(finding)

    detector(document)
    assert document.passages[0].annotations[0].infons['negation'] == 'True'
def test_convert_doc_no_jpype(self):
    """Convert with the 'subprocess' backend and expect no lemma on the token."""
    converter = NegBioPtb2DepConverter()
    # Swap the converter's backend and its StanfordDependencies instance.
    converter._backend = 'subprocess'
    converter._sd = StanfordDependencies.get_instance(backend=converter._backend)

    sentence_text = 'No pneumothorax.'
    parse_tree = '(S1 (S (S (NP (DT No) (NN pneumothorax))) (. .)))'
    document = text_to_bioc([sentence_text], type='d/p/s')
    document.passages[0].sentences[0].infons['parse tree'] = parse_tree

    document = converter(document)
    converted_sentence = document.passages[0].sentences[0]
    assert 'lemma' not in converted_sentence.annotations[1].infons
def test_split_doc(self, splitter):
    """Split a two-line passage and check both resulting sentences."""
    passage_text = 'No pneumothorax.\nNo pneumothorax.'
    document = text_to_bioc([passage_text], 'd/p')

    # Before splitting: the passage holds the raw text and no sentences.
    passage = document.passages[0]
    assert passage.text == passage_text
    assert len(passage.sentences) == 0

    document = splitter(document)
    passage = document.passages[0]
    assert len(passage.sentences) == 2

    # Both sentences share the same text; only their offsets differ.
    for position, expected_offset in enumerate((0, 17)):
        sentence = passage.sentences[position]
        assert sentence.text == 'No pneumothorax.'
        assert sentence.offset == expected_offset
def test_split_document(): text = """findings: pa and lat cxr at 7:34 p.m.. heart and mediastinum are stable. lungs are unchanged. air- filled cystic changes. no pneumothorax. osseous structures unchanged scoliosis impression: stable chest. dictating """ d = text_to_bioc([text], type='d/p') d = SectionSplitter().__call__(d) assert len(d.passages) == 4 assert d.passages[0].text == 'findings:' assert d.passages[ 1].text == """pa and lat cxr at 7:34 p.m.. heart and mediastinum are stable. lungs are unchanged. air- filled cystic changes. no pneumothorax. osseous structures unchanged scoliosis""" assert d.passages[2].text == 'impression:' assert d.passages[3].text == """stable chest.
def test_convert_doc(self):
    """Convert a one-sentence parse tree and verify tokens and relations."""
    sentence_text = 'No pneumothorax.'
    parse_tree = '(S1 (S (S (NP (DT No) (NN pneumothorax))) (. .)))'

    document = text_to_bioc([sentence_text], type='d/p/s')
    document.passages[0].sentences[0].infons['parse tree'] = parse_tree

    converter = NegBioPtb2DepConverter()
    document = converter(document)
    sentence = document.passages[0].sentences[0]

    assert len(sentence.annotations) == 3, len(sentence.annotations)
    assert len(sentence.relations) == 2

    # (text, tag, lemma, offset) expected for each token annotation.
    expected_tokens = [
        ('No', 'DT', 'no', 0),
        ('pneumothorax', 'NN', 'pneumothorax', 3),
        ('.', '.', '.', 15),
    ]
    for position, (word, tag, lemma, offset) in enumerate(expected_tokens):
        token = sentence.annotations[position]
        assert token.text == word
        assert token.infons['tag'] == tag
        assert token.infons['lemma'] == lemma
        assert token.total_span.offset == offset

    # (dependency label, first node refid, second node refid) per relation.
    expected_relations = [
        ('neg', 'T0', 'T1'),
        ('punct', 'T2', 'T1'),
    ]
    for position, (dependency, first_ref, second_ref) in enumerate(expected_relations):
        relation = sentence.relations[position]
        assert relation.infons['dependency'] == dependency
        assert relation.nodes[0].refid == first_ref
        assert relation.nodes[1].refid == second_ref

    # The converter must also accept a sentence whose parse tree is missing,
    # and one whose parse tree is None.
    del sentence.annotations[:]
    del sentence.infons['parse tree']
    converter(document)
    sentence.infons['parse tree'] = None
    converter(document)
def test_clean_sentences():
    """Check that ``CleanUp`` drops sentences and can sort passage annotations."""
    cleanup = CleanUp()
    doc = text_to_bioc(['No pneumothorax.', 'No pneumothorax.'], type='d/p/s')
    first_passage = doc.passages[0]

    # Add ten one-character annotations with offsets 10 down to 1.
    for offset in range(10, 0, -1):
        annotation = bioc.BioCAnnotation()
        annotation.add_location(bioc.BioCLocation(offset, 1))
        first_passage.add_annotation(annotation)

    assert len(doc.passages[0].sentences) == 2

    # Default clean-up: sentences are gone, annotations keep insertion order.
    doc = cleanup(doc)
    assert len(doc.passages[0].sentences) == 0
    assert len(doc.passages[0].annotations) == 10
    for position, expected_offset in enumerate(range(10, 0, -1)):
        assert doc.passages[0].annotations[position].total_span.offset == expected_offset

    # With sort_anns=True the annotations come back in ascending offset order.
    doc = cleanup(doc, sort_anns=True)
    for position, expected_offset in enumerate(range(1, 11)):
        assert doc.passages[0].annotations[position].total_span.offset == expected_offset
def test_lemmatize_doc():
    """Check that ``Lemmatizer`` restores the lemmas written by the converter."""
    converter = NegBioPtb2DepConverter(representation='CCprocessed', universal=True)
    lemmatizer = Lemmatizer()

    sentence_text = 'no evidence of focal infiltrate, effusion or pneumothorax.'
    parse_tree = (
        '(S1 (S (S (NP (NP (DT no) (NN evidence)) (PP (IN of) (NP (NP (JJ focal)'
        ' (NN infiltrate)) (, ,) (NP (NN effusion)) (CC or) (NP (NN pneumothorax)))))) (. .)))'
    )

    document = text_to_bioc([sentence_text], type='d/p/s')
    sentence = document.passages[0].sentences[0]
    sentence.infons['parse tree'] = parse_tree
    converter(document)

    # Record the converter's lemmas, then remove them so that the lemmatizer
    # has to provide the 'lemma' infon itself.
    expected_lemmas = [annotation.infons['lemma'] for annotation in sentence.annotations]
    for annotation in sentence.annotations:
        del annotation.infons['lemma']

    lemmatizer(document)
    for position, annotation in enumerate(sentence.annotations):
        assert expected_lemmas[position] == annotation.infons['lemma']