Python extract_annotations_xml示例，text_extraction.extract_annotations_xml Python示例

示例#1

0

显示文件

def test_offset_mapping_matches_pos_mapped_manually():
    """Spot-check mapped offsets for two DATE annotations in 0005_gs.xml.

    An offset mapping is built by ``extract_chars`` (with ``skip_chars``
    set to the whitespace pattern) and passed to
    ``extract_annotations_xml``.  For the annotations keyed '87' and
    '2404', the raw positions and the ``*_pos_mapped`` values are compared
    against entries looked up by hand in that mapping.
    """
    source_path = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    _, offset_mapping = text_extraction.extract_chars(
        source_path,
        namespaces={},
        document_data={'cdata_xpath': './TEXT'},
        skip_chars=r'[\s]')
    annotations = text_extraction.extract_annotations_xml(
        source_path,
        offset_mapping=offset_mapping,
        annotation_path='./TAGS/DATE',
        tag_name='DateTime',
        begin_attribute='start',
        end_attribute='end')
    ## Each row: ( begin_pos , end_pos , mapping key for begin , mapping key for end )
    ## NOTE(review): the second annotation ends at '2410' but its mapped end
    ## is looked up under '2409' -- presumably '2410' has no mapped value in
    ## this fixture; confirm against the data file.
    hand_checked = (
        ('87', '97', '87', '97'),
        ('2404', '2410', '2404', '2409'),
    )
    for begin, end, begin_key, end_key in hand_checked:
        first_match = annotations[begin][0]
        assert first_match['begin_pos'] == begin
        assert first_match['begin_pos_mapped'] == offset_mapping[begin_key]
        assert first_match['end_pos'] == end
        assert first_match['end_pos_mapped'] == offset_mapping[end_key]

示例#2

0

显示文件

def test_offset_mapping_matches_pos_mapped_automatically():
    """Check every DATE annotation's mapped offsets against the offset mapping.

    The offset mapping is built by ``extract_chars`` and passed to
    ``extract_annotations_xml``.  For the first annotation stored under
    each key of the result, the test verifies that:

    * the dictionary key equals the annotation's ``begin_pos``;
    * the mapped positions differ from the raw positions;
    * ``begin_pos_mapped`` equals the mapping entry found by walking
      forward from ``begin_pos`` past entries whose value is ``None``;
    * ``end_pos_mapped`` equals the mapping entry found by walking
      backward from ``end_pos`` past entries whose value is ``None``.
    """
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    document_data = dict(cdata_xpath='./TEXT')
    raw_content, offset_mapping = text_extraction.extract_chars(
        ingest_file,
        namespaces={},
        document_data=document_data,
        skip_chars=r'[\s]')
    strict_starts = text_extraction.extract_annotations_xml(
        ingest_file,
        offset_mapping=offset_mapping,
        annotation_path='./TAGS/DATE',
        tag_name='DateTime',
        begin_attribute='start',
        end_attribute='end')
    for start_key, annotations in strict_starts.items():
        first_annotation = annotations[0]
        begin_pos = first_annotation['begin_pos']
        begin_pos_mapped = first_annotation['begin_pos_mapped']
        end_pos = first_annotation['end_pos']
        end_pos_mapped = first_annotation['end_pos_mapped']
        ## dictionary key is set to begin_pos
        assert start_key == begin_pos
        ## mapping works for begin position
        assert begin_pos != begin_pos_mapped
        ## Positions are string keys; step forward until one has a mapped
        ## value.  Identity comparison is the correct test for None.
        while offset_mapping[begin_pos] is None:
            begin_pos = str(int(begin_pos) + 1)
        assert begin_pos_mapped == offset_mapping[begin_pos]
        ## mapping works for end position
        assert end_pos != end_pos_mapped
        ## Step backward from the end until a position has a mapped value.
        while offset_mapping[end_pos] is None:
            end_pos = str(int(end_pos) - 1)
        assert end_pos_mapped == offset_mapping[end_pos]

示例#3

0

显示文件

def test_extracting_no_optional_attributes():
    """Extract Problem annotations with an empty optional-attribute list.

    With ``optional_attributes = []`` each extracted annotation is expected
    to carry only the ``type``, ``begin_pos``, ``end_pos`` and ``raw_text``
    keys.
    """
    namespaces, _, _ = args_and_configs.process_config(
        config_file='config/webanno_problems_allergies_xmi.conf',
        score_key='Short Name',
        score_values=['.*'])
    extracted = text_extraction.extract_annotations_xml(
        'tests/data/013_Conditional_Problem.xmi',
        offset_mapping={},
        annotation_path='./custom:Problems',
        tag_name='Problem',
        namespaces=namespaces,
        begin_attribute='begin',
        end_attribute='end',
        optional_attributes=[])
    ## ( begin , end ) offsets of the two expected Problem annotations
    expected_spans = (('181', '188'), ('218', '224'))
    expected = {
        begin: [{'type': 'Problem',
                 'begin_pos': begin,
                 'end_pos': end,
                 'raw_text': None}]
        for begin, end in expected_spans
    }
    assert extracted == expected

示例#4

0

显示文件

def test_default_namespace_same_as_empty():
    """Omitting ``namespaces`` must give the same result as passing ``{}``."""
    source_path = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    ## Arguments common to both calls; offset_mapping is passed separately
    ## so that each call receives its own fresh dictionary.
    common = {
        'annotation_path': './TAGS/DATE',
        'tag_name': 'DateTime',
        'begin_attribute': 'start',
        'end_attribute': 'end',
    }
    with_default_namespaces = text_extraction.extract_annotations_xml(
        source_path, offset_mapping={}, **common)
    with_empty_namespaces = text_extraction.extract_annotations_xml(
        source_path, offset_mapping={}, namespaces={}, **common)
    assert with_default_namespaces == with_empty_namespaces

示例#5

0

显示文件

def test_extracting_sentences_from_CTAKES4_OpenNLP1_8():
    """Sentence extraction from 992321-OUT.xmi yields 82 distinct start keys."""
    namespaces, _, _ = args_and_configs.process_config(
        config_file='config/uima_sentences.conf',
        score_key='Short Name',
        score_values=['.*'])
    sentences_by_start = text_extraction.extract_annotations_xml(
        'tests/data/sentences/992321-OUT.xmi',
        offset_mapping={},
        namespaces=namespaces,
        annotation_path='.//type:Sentence',
        tag_name='Sentence',
        begin_attribute='begin',
        end_attribute='end')
    expected_count = 82
    assert len(sentences_by_start) == expected_count

示例#6

0

显示文件

def test_extracting_sentences_from_0005_gs():
    """Searching 0005_gs.xml for Sentence annotations returns an empty dict."""
    namespaces, _, _ = args_and_configs.process_config(
        config_file='config/uima_sentences.conf',
        score_key='Short Name',
        score_values=['.*'])
    sentences_by_start = text_extraction.extract_annotations_xml(
        'tests/data/i2b2_2016_track-1_reference/0005_gs.xml',
        offset_mapping={},
        namespaces=namespaces,
        annotation_path='.//type:Sentence',
        tag_name='Sentence',
        begin_attribute='begin',
        end_attribute='end')
    nothing_found = {}
    assert sentences_by_start == nothing_found

示例#7

0

显示文件

def test_extracting_datetime_from_0005_gs():
    """Extract DATE annotations from 0005_gs.xml without an offset mapping.

    With an empty ``offset_mapping`` the expected entries contain only
    ``type``, ``begin_pos``, ``end_pos`` and ``raw_text``.
    """
    extracted = text_extraction.extract_annotations_xml(
        'tests/data/i2b2_2016_track-1_reference/0005_gs.xml',
        offset_mapping={},
        annotation_path='./TAGS/DATE',
        tag_name='DateTime',
        begin_attribute='start',
        end_attribute='end')
    ## ( begin , end ) offsets of the two expected DateTime annotations
    expected_spans = (('2404', '2410'), ('87', '97'))
    expected = {
        begin: [{'type': 'DateTime',
                 'begin_pos': begin,
                 'end_pos': end,
                 'raw_text': None}]
        for begin, end in expected_spans
    }
    assert extracted == expected

示例#8

0

显示文件

def test_extracting_with_and_without_optional_attributes():
    """Compare Problem extraction with and without optional attributes.

    The same file is extracted twice: once with ``optional_attributes = []``
    and once with the optional attributes listed by the first pattern of
    the loaded config.  Each result must equal its own expected output and
    differ from the other's.
    """
    namespaces, _, patterns = args_and_configs.process_config(
        config_file='config/webanno_problems_allergies_xmi.conf',
        score_key='Short Name',
        score_values=['.*'])

    def extract(optional_attributes):
        ## Only the optional-attribute list differs between the two runs.
        return text_extraction.extract_annotations_xml(
            'tests/data/013_Conditional_Problem.xmi',
            offset_mapping={},
            annotation_path='./custom:Problems',
            tag_name='Problem',
            namespaces=namespaces,
            begin_attribute='begin',
            end_attribute='end',
            optional_attributes=optional_attributes)

    plain_result = extract([])
    flagged_result = extract(patterns[0]['optional_attributes'])

    ## begin offset -> end offset of the two expected Problem annotations
    spans = {'181': '188', '218': '224'}
    ## begin offset -> expected optional attribute values
    flags = {
        '181': {'conditional': 'true',
                'generic': 'false',
                'historical': 'false',
                'negated': 'false',
                'not_patient': 'true',
                'uncertain': 'false'},
        '218': {'conditional': 'false',
                'generic': 'false',
                'historical': 'true',
                'negated': 'false',
                'not_patient': 'false',
                'uncertain': 'true'},
    }
    expected_plain = {
        begin: [{'type': 'Problem',
                 'begin_pos': begin,
                 'end_pos': end,
                 'raw_text': None}]
        for begin, end in spans.items()
    }
    ## The flagged expectation is the plain one plus the optional attributes.
    expected_flagged = {
        begin: [dict(expected_plain[begin][0], **flags[begin])]
        for begin in spans
    }

    assert plain_result == expected_plain
    assert flagged_result == expected_flagged
    assert plain_result != expected_flagged
    assert flagged_result != expected_plain