def test_offset_mapping_matches_pos_mapped_manually():
    """Spot-check the mapped positions of two DATE annotations in 0005_gs.xml.

    The offset mapping produced by extract_chars() is passed on to
    extract_annotations_xml(); the raw and mapped begin/end positions of
    the first annotation stored under keys '87' and '2404' are then
    compared with hand-picked entries of that mapping.
    """
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    _raw_content, offset_mapping = text_extraction.extract_chars(
        ingest_file,
        namespaces={},
        document_data={'cdata_xpath': './TEXT'},
        skip_chars=r'[\s]'
    )
    annotations_by_start = text_extraction.extract_annotations_xml(
        ingest_file,
        offset_mapping=offset_mapping,
        annotation_path='./TAGS/DATE',
        tag_name='DateTime',
        begin_attribute='start',
        end_attribute='end'
    )
    ## Each row: ( begin_pos , end_pos , offset_mapping key for mapped end ).
    ## The second annotation's mapped end is looked up under '2409' rather
    ## than '2410' -- presumably '2410' itself has no mapping; confirm
    ## against the fixture.
    expectations = (
        ('87', '97', '97'),
        ('2404', '2410', '2409'),
    )
    for begin, end, mapped_end_key in expectations:
        first_hit = annotations_by_start[begin][0]
        assert first_hit['begin_pos'] == begin
        assert first_hit['begin_pos_mapped'] == offset_mapping[begin]
        assert first_hit['end_pos'] == end
        assert first_hit['end_pos_mapped'] == offset_mapping[mapped_end_key]
def test_offset_mapping_matches_pos_mapped_automatically():
    """Check mapped positions of every DATE start key against offset_mapping.

    For the first annotation stored under each start key:

    * the dictionary key must equal the annotation's raw begin position;
    * the mapped begin must equal the offset_mapping entry for the raw
      begin, stepping forward past positions whose entry is None;
    * the mapped end must equal the offset_mapping entry for the raw end,
      stepping backward past positions whose entry is None.
    """
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    document_data = dict(cdata_xpath='./TEXT')
    ## Only the mapping is needed; the extracted text itself is unused
    _raw_content, offset_mapping = text_extraction.extract_chars(
        ingest_file,
        namespaces={},
        document_data=document_data,
        skip_chars=r'[\s]'
    )
    strict_starts = text_extraction.extract_annotations_xml(
        ingest_file,
        offset_mapping=offset_mapping,
        annotation_path='./TAGS/DATE',
        tag_name='DateTime',
        begin_attribute='start',
        end_attribute='end'
    )
    for start_key in strict_starts:
        begin_pos = strict_starts[start_key][0]['begin_pos']
        begin_pos_mapped = strict_starts[start_key][0]['begin_pos_mapped']
        end_pos = strict_starts[start_key][0]['end_pos']
        end_pos_mapped = strict_starts[start_key][0]['end_pos_mapped']
        ## dictionary key is set to begin_pos
        assert start_key == begin_pos
        ## mapping works for begin position
        assert begin_pos != begin_pos_mapped
        ## positions are string keys, so step via int and convert back;
        ## None is a singleton and is compared by identity
        while offset_mapping[begin_pos] is None:
            begin_pos = str(int(begin_pos) + 1)
        assert begin_pos_mapped == offset_mapping[begin_pos]
        ## mapping works for end position
        assert end_pos != end_pos_mapped
        while offset_mapping[end_pos] is None:
            end_pos = str(int(end_pos) - 1)
        assert end_pos_mapped == offset_mapping[end_pos]
def test_extracting_no_optional_attributes():
    """Extract Problems with an empty optional_attributes list.

    The result must contain exactly two entries, keyed '181' and '218',
    each holding one annotation with only the type, begin_pos, end_pos
    and raw_text fields.
    """
    ingest_file = 'tests/data/013_Conditional_Problem.xmi'
    ## Only the namespaces returned by the config are used in this test
    namespaces, _document_data, _patterns = args_and_configs.process_config(
        config_file='config/webanno_problems_allergies_xmi.conf',
        score_key='Short Name',
        score_values=['.*']
    )
    extracted = text_extraction.extract_annotations_xml(
        ingest_file,
        offset_mapping={},
        annotation_path='./custom:Problems',
        tag_name='Problem',
        namespaces=namespaces,
        begin_attribute='begin',
        end_attribute='end',
        optional_attributes=[]
    )
    spans = (('181', '188'), ('218', '224'))
    expected = {}
    for begin, end in spans:
        expected[begin] = [{'type': 'Problem',
                            'begin_pos': begin,
                            'end_pos': end,
                            'raw_text': None}]
    assert extracted == expected
def test_default_namespace_same_as_empty():
    """Omitting namespaces must give the same result as passing an empty dict."""
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'

    def extract_dates(**extra_kwargs):
        ## A new, empty offset_mapping dict is created for every call so
        ## the two extractions share no mutable argument
        return text_extraction.extract_annotations_xml(
            ingest_file,
            offset_mapping={},
            annotation_path='./TAGS/DATE',
            tag_name='DateTime',
            begin_attribute='start',
            end_attribute='end',
            **extra_kwargs
        )

    without_namespaces = extract_dates()
    with_empty_namespaces = extract_dates(namespaces={})
    assert without_namespaces == with_empty_namespaces
def test_extracting_sentences_from_CTAKES4_OpenNLP1_8():
    """Sentence extraction from 992321-OUT.xmi must return 82 entries."""
    ingest_file = 'tests/data/sentences/992321-OUT.xmi'
    ## Only the namespaces returned by the config are used in this test
    namespaces, _document_data, _patterns = args_and_configs.process_config(
        config_file='config/uima_sentences.conf',
        score_key='Short Name',
        score_values=['.*']
    )
    sentences_by_start = text_extraction.extract_annotations_xml(
        ingest_file,
        offset_mapping={},
        namespaces=namespaces,
        annotation_path='.//type:Sentence',
        tag_name='Sentence',
        begin_attribute='begin',
        end_attribute='end'
    )
    sentence_key_count = len(sentences_by_start)
    assert sentence_key_count == 82
def test_extracting_sentences_from_0005_gs():
    """Sentence extraction from 0005_gs.xml must return an empty dict."""
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    ## Only the namespaces returned by the config are used in this test
    namespaces, _document_data, _patterns = args_and_configs.process_config(
        config_file='config/uima_sentences.conf',
        score_key='Short Name',
        score_values=['.*']
    )
    sentences_by_start = text_extraction.extract_annotations_xml(
        ingest_file,
        offset_mapping={},
        namespaces=namespaces,
        annotation_path='.//type:Sentence',
        tag_name='Sentence',
        begin_attribute='begin',
        end_attribute='end'
    )
    nothing_extracted = {}
    assert sentences_by_start == nothing_extracted
def test_extracting_datetime_from_0005_gs():
    """DATE extraction from 0005_gs.xml with an empty offset mapping.

    The result must contain exactly two entries, keyed '2404' and '87',
    each holding one DateTime annotation with only the type, begin_pos,
    end_pos and raw_text fields.
    """
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    dates_by_start = text_extraction.extract_annotations_xml(
        ingest_file,
        offset_mapping={},
        annotation_path='./TAGS/DATE',
        tag_name='DateTime',
        begin_attribute='start',
        end_attribute='end'
    )
    spans = (('2404', '2410'), ('87', '97'))
    expected = {}
    for begin, end in spans:
        expected[begin] = [{'type': 'DateTime',
                            'begin_pos': begin,
                            'end_pos': end,
                            'raw_text': None}]
    assert dates_by_start == expected
def test_extracting_with_and_without_optional_attributes():
    """Compare Problem extraction with and without optional attributes.

    The same file is extracted twice: once with an empty
    optional_attributes list and once with the optional attributes of the
    first pattern from the config.  Each result must match its own
    expected output and differ from the other one's.
    """
    ingest_file = 'tests/data/013_Conditional_Problem.xmi'
    namespaces, _document_data, patterns = args_and_configs.process_config(
        config_file='config/webanno_problems_allergies_xmi.conf',
        score_key='Short Name',
        score_values=['.*']
    )

    def extract_problems(optional_attributes):
        ## Identical call for both variants apart from optional_attributes
        return text_extraction.extract_annotations_xml(
            ingest_file,
            offset_mapping={},
            annotation_path='./custom:Problems',
            tag_name='Problem',
            namespaces=namespaces,
            begin_attribute='begin',
            end_attribute='end',
            optional_attributes=optional_attributes
        )

    bare_result = extract_problems([])
    rich_result = extract_problems(patterns[0]['optional_attributes'])

    ## Core fields shared by both expected outputs
    core_181 = {'type': 'Problem',
                'begin_pos': '181',
                'end_pos': '188',
                'raw_text': None}
    core_218 = {'type': 'Problem',
                'begin_pos': '218',
                'end_pos': '224',
                'raw_text': None}
    ## Extra fields expected only when optional attributes are requested
    flags_181 = {'conditional': 'true',
                 'generic': 'false',
                 'historical': 'false',
                 'negated': 'false',
                 'not_patient': 'true',
                 'uncertain': 'false'}
    flags_218 = {'conditional': 'false',
                 'generic': 'false',
                 'historical': 'true',
                 'negated': 'false',
                 'not_patient': 'false',
                 'uncertain': 'true'}
    expected_bare = {'181': [core_181],
                     '218': [core_218]}
    expected_rich = {'181': [dict(core_181, **flags_181)],
                     '218': [dict(core_218, **flags_218)]}

    assert bare_result == expected_bare
    assert rich_result == expected_rich
    assert bare_result != expected_rich
    assert rich_result != expected_bare