def test_chexpert_extractor():
    """Re-run the CheXpert extractor and compare against the stored annotations.

    The example file ``1.chexpert.xml`` is loaded, every document is deep-copied
    with its passage annotations removed, and the extractor is run on each copy.
    The annotations produced on the copy must match the ones stored in the file
    in count, ``total_span`` and the ``observation`` / ``annotator`` infons.
    """
    extractor = RegExExtractor(
        __tests_dir / 'data/patterns/chexpert_phrases.yml',
        'CheXpert labeler')
    example_dir = get_example_dir()
    with open(example_dir / '1.chexpert.xml') as fp:
        collection = bioc.load(fp)

    def span_offset(ann):
        # Sort key: pair annotations up by where their span starts.
        return ann.total_span.offset

    for stored_doc in collection.documents:
        # Work on a copy so the annotations stored in the file stay untouched
        # and can serve as the reference.
        extracted_doc = copy.deepcopy(stored_doc)
        for passage in extracted_doc.passages:
            del passage.annotations[:]
        extractor(extracted_doc)

        # NOTE(review): bioc.PASSAGE is passed positionally here - confirm that
        # the second positional parameter of bioc.annotations is the level.
        extracted_anns = sorted(
            bioc.annotations(extracted_doc, bioc.PASSAGE), key=span_offset)
        stored_anns = sorted(
            bioc.annotations(stored_doc, bioc.PASSAGE), key=span_offset)

        assert len(extracted_anns) == len(stored_anns), \
            '{} vs {}'.format(len(extracted_anns), len(stored_anns))
        for extracted_ann, stored_ann in zip(extracted_anns, stored_anns):
            assert extracted_ann.total_span == stored_ann.total_span
            for key in ['observation', 'annotator']:
                assert extracted_ann.infons[key] == stored_ann.infons[key]
def test_annotations():
    """Exercise ``bioc.annotations`` on the shared ``collection`` at each level.

    The test reads the ``annotation``, ``document``, ``passage`` and
    ``sentence`` attributes off the yielded results.
    """
    # No level given: the ids found at all three levels are expected.
    hits = list(bioc.annotations(collection))
    assert {hit.annotation.id for hit in hits} == {'1', '2', '3', '4', '5'}

    # Document level: a single annotation that has no passage.
    hits = list(bioc.annotations(collection, level=DOCUMENT))
    assert {hit.annotation.id for hit in hits} == {'5'}
    assert {hit.document.id for hit in hits} == {'1'}
    assert {hit.passage for hit in hits} == {None}

    # Passage level: two annotations in one passage, none with a sentence.
    hits = list(bioc.annotations(collection, level=PASSAGE))
    assert {hit.annotation.id for hit in hits} == {'1', '2'}
    assert {hit.document.id for hit in hits} == {'1'}
    assert {hit.passage.offset for hit in hits} == {0}
    assert {hit.sentence for hit in hits} == {None}

    # Sentence level: two annotations spread over two sentences.
    hits = list(bioc.annotations(collection, level=SENTENCE))
    assert {hit.annotation.id for hit in hits} == {'3', '4'}
    assert {hit.document.id for hit in hits} == {'2'}
    assert {hit.passage.offset for hit in hits} == {27}
    assert {hit.sentence.offset for hit in hits} == {27, 34}

    # The first document yields nothing at sentence level.
    hits = list(bioc.annotations(collection.documents[0], level=SENTENCE))
    assert len(hits) == 0

    # An unsupported object type is rejected once the generator is advanced.
    with pytest.raises(TypeError):
        next(bioc.annotations('Foo'))
def test_sibling_intext_citations(table_article):
    """Check passage text and citation text produced from ``table_article``.

    :param table_article: article source text; it is wrapped in a ``StringIO``
        and handed to ``docs2bioc`` with the ``'pmcxml'`` format.
    """
    passages = []
    annotations = []
    source = StringIO(table_article)
    for document in docs2bioc(source, 'pmcxml', trim_sentences=False):
        passages += document.passages
        annotations += bioc.annotations(document)

    # At least one passage must contain the PyMOL sentence.
    phrase_hits = [
        'inspected using the graphics program PyMOL.' in passage.text
        for passage in passages
    ]
    assert any(phrase_hits)

    # The grouped sibling citation must appear verbatim on some annotation.
    citation_texts = [
        annotation.infons['citation_text'] for annotation in annotations
    ]
    assert '[14],[16],[23]\u2013[25]' in citation_texts
def test_annotations():
    """Exercise ``bioc.annotations`` on the shared ``collection``.

    Covers the default level, the sentence and document levels, a single
    document as input, and the errors expected for an unsupported object or
    an invalid ``level``.
    """
    # No level given: two annotations are expected.
    annotations = list(bioc.annotations(collection))
    assert 2 == len(annotations)
    assert '1' == annotations[0].id
    assert '2' == annotations[1].id

    annotations = list(bioc.annotations(collection, level=bioc.SENTENCE))
    assert 2 == len(annotations)
    assert '3' == annotations[0].id
    assert '4' == annotations[1].id

    annotations = list(bioc.annotations(collection, level=bioc.DOCUMENT))
    assert 1 == len(annotations)
    assert '5' == annotations[0].id

    # A single document can be passed instead of the whole collection.
    annotations = list(
        bioc.annotations(collection.documents[1], level=bioc.SENTENCE))
    assert 2 == len(annotations)
    assert '4' == annotations[1].id

    # Errors only surface when the generator is advanced, hence next().
    with pytest.raises(TypeError):
        next(bioc.annotations('Foo'))

    with pytest.raises(ValueError):
        next(bioc.annotations(collection, level=-1))

    first_passage = collection.documents[0].passages[0]
    with pytest.raises(ValueError):
        next(bioc.annotations(first_passage, level=bioc.DOCUMENT))

    # Each invalid call gets its own pytest.raises block: a statement placed
    # after a raising call inside a shared block would never be executed, so
    # the PASSAGE case would go unchecked.
    sentence = collection.documents[1].passages[0].sentences[0]
    with pytest.raises(ValueError):
        next(bioc.annotations(sentence, level=bioc.DOCUMENT))
    with pytest.raises(ValueError):
        next(bioc.annotations(sentence, level=bioc.PASSAGE))
def __extract_tags(self, tagged_article, document, wanted_tags):
    """
    Copy the lemmatized words of the wanted annotation types into
    ``tagged_article``.

    Every annotation of ``document`` is inspected; its ``infons['type']`` is
    compared against each entry of ``wanted_tags`` and, for every entry that
    is equal, the lemmatized annotation text is added to ``tagged_article``.

    :param tagged_article: Object receiving the words through
        ``add_annotation(tag, word)``.
    :param document: The document whose annotations are read with
        ``bioc.annotations``.
    :param wanted_tags: Iterable of annotation types to keep.
    :return: The same ``tagged_article`` object that was passed in.
    """
    found_annotations = list(bioc.annotations(document))
    for annotation in found_annotations:
        annotation_type = annotation.infons['type']
        for candidate in wanted_tags:
            if candidate == annotation_type:
                # Lemmatize (e.g. rats --> rat) to reduce redundancy.
                lemma = LEMMATIZER.lemmatize(annotation.text)
                tagged_article.add_annotation(annotation_type, lemma)
    return tagged_article