def test_union_patterns_empty_test():
    """After aligning a real reference pattern set against a non-matching
    test pattern set, every unmatched pattern's 'type' should have been
    collapsed to False by align_patterns (on both sides).

    Fix: removed two dead statements of the form
    ``test_pattern['type'] == ref_pattern['type']`` — bare comparisons
    whose result was discarded (likely leftover debugging/copy-paste).
    """
    filename = 'config/i2b2_2016_track-1.conf'
    ## Reference side: patterns that really exist in the config
    score_values = ['(Patient|Provider)']
    namespaces, document_data, ref_patterns = \
        args_and_configs.process_config(config_file=filename,
                                        score_key='Short Name',
                                        score_values=score_values)
    ## Test side: a score value that matches nothing
    score_values = ['I.Do.No.Exist']
    namespaces, document_data, test_patterns = \
        args_and_configs.process_config(config_file=filename,
                                        score_key='Short Name',
                                        score_values=score_values)
    ref_patterns, test_patterns = \
        args_and_configs.align_patterns(ref_patterns, test_patterns)
    ## Any reference pattern without a matching test pattern must have
    ## had its type replaced with False
    for ref_pattern in ref_patterns:
        match_flag = False
        for test_pattern in test_patterns:
            if test_pattern['type'] == ref_pattern['type']:
                match_flag = True
                break
        if not match_flag:
            assert ref_pattern['type'] == False
    ## ...and symmetrically for the test patterns
    for test_pattern in test_patterns:
        match_flag = False
        for ref_pattern in ref_patterns:
            if test_pattern['type'] == ref_pattern['type']:
                match_flag = True
                break
        if not match_flag:
            assert test_pattern['type'] == False
def test_writing_dictionary_for_datetime_from_0005_gs():
    """Extracting annotations from 0005_gs.xml to a temp file should
    reproduce the presaved reference output (annotations, offset
    mapping, and raw content sections).
    """
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    reference_file = 'tests/data/i2b2_2016_track-1_reference_out/0005_gs.xml'
    config_file = 'config/i2b2_2016_track-1.conf'
    try:
        ## mkstemp() hands back an open descriptor; close it so the
        ## extraction code can reopen the path for writing
        tmp_descriptor, tmp_file = tempfile.mkstemp()
        os.close(tmp_descriptor)
        namespaces, document_data, patterns = \
            args_and_configs.process_config(config_file=config_file,
                                            score_key='Short Name',
                                            score_values=['.*'])
        text_extraction.extract_annotations(ingest_file,
                                            namespaces=namespaces,
                                            document_data=document_data,
                                            patterns=patterns,
                                            skip_chars=r'[\s]',
                                            out_file=tmp_file)
        with open(reference_file, 'r') as rf:
            reloaded_reference = json.load(rf)
        with open(tmp_file, 'r') as tf:
            reloaded_test = json.load(tf)
        ## Compare section by section for clearer failure messages
        assert reloaded_reference['annotations'] == reloaded_test[
            'annotations']
        assert reloaded_reference['offset_mapping'] == reloaded_test[
            'offset_mapping']
        assert reloaded_reference['raw_content'] == reloaded_test[
            'raw_content']
    finally:
        os.remove(tmp_file)
def test_extracting_no_optional_attributes():
    """With optional_attributes=[], extracted Problem annotations carry
    only the core fields (type, begin_pos, end_pos, raw_text).
    """
    ingest_file = 'tests/data/013_Conditional_Problem.xmi'
    config_file = 'config/webanno_problems_allergies_xmi.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    strict_starts = \
        text_extraction.extract_annotations_xml(
            ingest_file,
            offset_mapping={},
            annotation_path='./custom:Problems',
            tag_name='Problem',
            namespaces=namespaces,
            begin_attribute='begin',
            end_attribute='end',
            ## Explicitly suppress the optional attribute extraction
            optional_attributes=[])
    ## Keys are begin offsets (as strings); raw_text is None because no
    ## document content is supplied to extract_annotations_xml here
    expected_output = \
        {'181': [{'type': 'Problem',
                  'begin_pos': '181',
                  'end_pos': '188',
                  'raw_text': None}],
         '218': [{'type': 'Problem',
                  'begin_pos': '218',
                  'end_pos': '224',
                  'raw_text': None}]}
    assert strict_starts == expected_output
def test_contents_of_write_of_dictionary_for_brat_patterns():
    """Brat standoff extraction should write a JSON out_file whose
    'annotations' match the in-memory result, including attribute
    annotations (Negated) attached to the text-bound annotation.
    """
    ingest_file = 'tests/data/brat_reference/problems_and_allergens.ann'
    config_file = 'config/brat_problems_allergies_standoff.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    with tempfile.NamedTemporaryFile() as tmpfile_handle:
        assert os.path.exists(tmpfile_handle.name)
        offset_mapping, strict_starts = \
            text_extraction.extract_annotations(
                ingest_file,
                namespaces=namespaces,
                document_data=document_data,
                patterns=patterns,
                skip_chars=r'[\s]',
                out_file=tmpfile_handle.name)
        ## The handle was never written through directly, so its file
        ## position is still 0 and we can read what was written by name
        reloaded_json = json.load(tmpfile_handle)
        assert reloaded_json['annotations'] == strict_starts
        ## Source standoff lines being checked:
        ## T34	Problem 474 493	shortness of breath
        ## A1	Negated T34
        assert strict_starts['474'][0]['begin_pos'] == '474'
        assert strict_starts['474'][0]['end_pos'] == '493'
        assert strict_starts['474'][0]['raw_text'] == 'shortness of breath'
        assert strict_starts['474'][0]['Historical'] == 'false'
        assert strict_starts['474'][0]['Negated'] == 'true'
        assert os.path.exists(tmpfile_handle.name)
    ## NamedTemporaryFile deletes the file on close
    assert os.path.exists(tmpfile_handle.name) == False
def test_count_ref_set_default(capsys):
    """Default count_ref_set output should be a header row plus the
    total annotation count for the reference corpus.

    Fixes: the expected-output loop reused its own iterable's name
    (``for expected_values in expected_values``), shadowing the list
    mid-iteration; a presaved JSON file was also loaded into an unused
    variable (dead code) — both cleaned up.
    """
    command_line_args = ['etude.py',
                         '--reference-input',
                         'tests/data/i2b2_2016_track-1_reference',
                         '--reference-config',
                         'config/i2b2_2016_track-1.conf',
                         '--print-counts',
                         '--no-metrics']
    with patch.object(sys, 'argv', command_line_args):
        args = etude.init_args()
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=args.reference_config,
                                        score_key=args.score_key,
                                        score_values=args.score_values)
    etude.count_ref_set(this_ns=namespaces,
                        this_dd=document_data,
                        this_patterns=patterns,
                        this_folder=args.reference_input,
                        args=args,
                        file_prefix=args.file_prefix,
                        file_suffix=args.file_suffix[0])
    default_out, err = capsys.readouterr()
    ## Rebuild the expected table through print() so delimiter handling
    ## matches the production code path exactly
    expected_table = [['counts', 'n'], ['Total', '482']]
    for expected_row in expected_table:
        print(args.delim.join('{}'.format(m) for m in expected_row))
    expected_out, err = capsys.readouterr()
    assert default_out.strip() == expected_out.strip()
def test_count_ref_set_csv_out():
    """count_ref_set with --csv-out and every by-* aggregation flag
    should reproduce the presaved CSV byte-for-byte.

    Fix: removed an unused function-local ``import csv`` — the file is
    compared as raw text, never parsed.
    """
    presaved_file = 'tests/data/i2b2_2016_track-1_csv_out.csv'
    try:
        ## mkstemp() hands back an open descriptor; close it so
        ## count_ref_set can reopen the path for writing
        tmp_descriptor, tmp_file = tempfile.mkstemp()
        os.close(tmp_descriptor)
        command_line_args = ['etude.py',
                             '--reference-input',
                             'tests/data/i2b2_2016_track-1_reference',
                             '--reference-config',
                             'config/i2b2_2016_track-1.conf',
                             '--csv-out',
                             tmp_file,
                             '--by-file',
                             '--by-type',
                             '--by-file-and-type',
                             '--by-type-and-file',
                             '--print-counts',
                             '--no-metrics']
        with patch.object(sys, 'argv', command_line_args):
            args = etude.init_args()
        namespaces, document_data, patterns = \
            args_and_configs.process_config(config_file=args.reference_config,
                                            score_key=args.score_key,
                                            score_values=args.score_values)
        with open(presaved_file, 'r') as fp:
            reloaded_csv = fp.read()
        etude.count_ref_set(this_ns=namespaces,
                            this_dd=document_data,
                            this_patterns=patterns,
                            this_folder=args.reference_input,
                            args=args,
                            file_prefix=args.file_prefix,
                            file_suffix=args.file_suffix[0])
        with open(tmp_file, 'r') as fp:
            new_csv = fp.read()
        assert new_csv == reloaded_csv
    finally:
        os.remove(tmp_file)
def test_default_document_format():
    """A config that declares no document format reports 'Unknown'."""
    conf_path = 'config/i2b2_2016_track-1.conf'
    _, doc_data, _ = args_and_configs.process_config(
        config_file=conf_path,
        score_key='Short Name',
        score_values=['.*'])
    assert doc_data['format'] == 'Unknown'
def test_set_score_key_Sentences():
    """Every pattern in the UIMA sentences config is typed 'Sentence'."""
    _, _, extracted_patterns = args_and_configs.process_config(
        config_file='config/uima_sentences.conf',
        score_key='Short Name',
        score_values=['.*'])
    assert all(entry['type'] == "Sentence" for entry in extracted_patterns)
def test_plaintext_document_format():
    """The plaintext sentences config reports the 'txt' document format."""
    conf_path = 'config/plaintext_sentences.conf'
    _, doc_data, _ = args_and_configs.process_config(
        config_file=conf_path,
        score_key='Short Name',
        score_values=['.*'])
    assert doc_data['format'] == 'txt'
def test_i2b2_2016_track_1_has_empty_namespace():
    """The i2b2 2016 track 1 config declares no XML namespaces."""
    ns, _, _ = args_and_configs.process_config(
        config_file='config/i2b2_2016_track-1.conf',
        score_key='Short Name',
        score_values=['.*'])
    ## An empty dictionary is falsy, so this asserts ns == {}
    assert not ns
def test_skip_missing_XPath():
    """Entries lacking an XPath ('Other Person Name') are skipped."""
    conf_path = 'config/i2b2_2016_track-1.conf'
    _, _, extracted_patterns = args_and_configs.process_config(
        config_file=conf_path,
        score_key='Short Name',
        score_values=['.*'])
    assert all(entry['long_name'] != "Other Person Name"
               for entry in extracted_patterns)
def test_raw_content_extraction_from_plaintext():
    """Plaintext configs carry none of the XML content-location keys."""
    _, doc_data, _ = args_and_configs.process_config(
        config_file='config/plaintext_sentences.conf',
        score_key='Short Name',
        score_values=['.*'])
    for absent_key in ('cdata_xpath', 'tag_xpath', 'content_attribute'):
        assert absent_key not in doc_data
def test_raw_content_extraction_from_attribute():
    """WebAnno PHI XMI locates content via a tag attribute, not CDATA."""
    _, doc_data, _ = args_and_configs.process_config(
        config_file='config/webanno_phi_xmi.conf',
        score_key='Short Name',
        score_values=['.*'])
    assert 'cdata_xpath' not in doc_data
    assert doc_data['tag_xpath'] == './cas:Sofa'
    assert doc_data['content_attribute'] == 'sofaString'
def test_raw_content_extraction_from_cdata():
    """The i2b2 config locates content via CDATA, not a tag attribute."""
    _, doc_data, _ = args_and_configs.process_config(
        config_file='config/i2b2_2016_track-1.conf',
        score_key='Short Name',
        score_values=['.*'])
    assert doc_data['cdata_xpath'] == './TEXT'
    for absent_key in ('tag_xpath', 'content_attribute'):
        assert absent_key not in doc_data
def test_webanno_custom_namespaces():
    """The WebAnno UIMA XMI config declares exactly the 'custom'
    namespace.

    Fix: removed a leftover debug dump of the namespaces dict to a
    hard-coded '/tmp/stdout.log' — non-portable and irrelevant to the
    assertion.
    """
    config_file = 'config/webanno_uima_xmi.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    ## Non-empty dictionary resolves as True
    expected_namespaces = {'custom': 'http:///webanno/custom.ecore'}
    assert namespaces == expected_namespaces
def test_set_score_key_match_over_multiple_values_Tutorial():
    """The same list of regexes should select the DateTime entry when
    matched against each of the three score keys in turn (each regex
    matching a different key's value).
    """
    filename = 'config/CAS_XMI.conf'
    score_values = ['^D.*e$', '^D.*n$', '^T.*e$']
    ## 'Short Name' column: '^D.*e$' matches 'DateTime'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=filename,
                                        score_key='Short Name',
                                        score_values=score_values)
    for pattern in patterns:
        assert pattern['type'] == "DateTime"
    ## 'Parent' column: '^T.*e$' matches 'Time'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=filename,
                                        score_key='Parent',
                                        score_values=score_values)
    for pattern in patterns:
        assert pattern['type'] == "Time"
    ## 'Long Name' column: '^D.*n$' matches 'Date and Time Information'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=filename,
                                        score_key='Long Name',
                                        score_values=score_values)
    for pattern in patterns:
        assert pattern['type'] == "Date and Time Information"
def test_set_score_key_match_strict_start_and_end_char_Tutorial():
    """A single anchored character-class regex should select the same
    entry whichever score key it is matched against.
    """
    filename = 'config/CAS_XMI.conf'
    ## Matches values starting with D or T and ending in e or n
    score_values = ['^[DT].*[en]$']
    ## 'Short Name' column: matches 'DateTime'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=filename,
                                        score_key='Short Name',
                                        score_values=score_values)
    for pattern in patterns:
        assert pattern['type'] == "DateTime"
    ## 'Parent' column: matches 'Time'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=filename,
                                        score_key='Parent',
                                        score_values=score_values)
    for pattern in patterns:
        assert pattern['type'] == "Time"
    ## 'Long Name' column: matches 'Date and Time Information'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=filename,
                                        score_key='Long Name',
                                        score_values=score_values)
    for pattern in patterns:
        assert pattern['type'] == "Date and Time Information"
def convert_configs_to_json():
    """Regenerate the presaved JSON pattern fixtures from the configs.

    Not collected by pytest (no 'test_' prefix) — run manually when a
    config changes to refresh tests/data/<fileroot>.json.
    """
    fileroots = ['CAS_XMI',
                 'i2b2_2016_track-1',
                 'uima_sentences',
                 'webanno_uima_xmi']
    for fileroot in fileroots:
        filename = 'config/' + fileroot + '.conf'
        namespaces, document_data, patterns = \
            args_and_configs.process_config(config_file=filename,
                                            score_key='Short Name',
                                            score_values=['.*'])
        with open('tests/data/' + fileroot + '.json', 'w') as fp:
            json.dump(patterns, fp, indent=4)
def test_sentences_has_defined_namespaces():
    """The UIMA sentences config declares exactly the cas/type/type4
    namespaces.
    """
    config_file = 'config/uima_sentences.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    ## Non-empty dictionary resolves as True
    expected_namespaces = \
        {'cas': 'http:///uima/cas.ecore',
         'type': 'http:///com/clinacuity/deid/nlp/uima/type.ecore',
         'type4':
         'http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore'}
    assert namespaces == expected_namespaces
def test_optional_attributes():
    """The problems/allergies XMI config exposes all six optional
    attributes on its first pattern."""
    _, _, extracted_patterns = args_and_configs.process_config(
        config_file='config/webanno_problems_allergies_xmi.conf',
        score_key='Short Name',
        score_values=['.*'])
    declared = extracted_patterns[0]['optional_attributes']
    for attribute in ('conditional', 'generic', 'historical',
                      'negated', 'not_patient', 'uncertain'):
        assert attribute in declared
def test_brat_standoff_format():
    """Brat standoff patterns are Problem/Allergen text-bound ('T')
    annotations with the full set of optional attributes."""
    _, _, extracted_patterns = args_and_configs.process_config(
        config_file='config/brat_problems_allergies_standoff.conf',
        score_key='Short Name',
        score_values=['.*'])
    expected_attributes = ['Conditional', 'Generic', 'Historical',
                           'Negated', 'NotPatient', 'Uncertain']
    for entry in extracted_patterns:
        assert entry['short_name'] in ('Problem', 'Allergen')
        assert entry['type_prefix'] == 'T'
        assert entry['optional_attributes'] == expected_attributes
def test_extracting_sentences_from_CTAKES4_OpenNLP1_8():
    """A cTAKES 4 / OpenNLP 1.8 XMI output should yield 82 sentence
    annotations.
    """
    ingest_file = 'tests/data/sentences/992321-OUT.xmi'
    config_file = 'config/uima_sentences.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    strict_starts = \
        text_extraction.extract_annotations_xml(
            ingest_file,
            offset_mapping={},
            namespaces=namespaces,
            ## Descendant search: Sentence nodes anywhere in the tree
            annotation_path='.//type:Sentence',
            tag_name='Sentence',
            begin_attribute='begin',
            end_attribute='end')
    ## One dictionary key per distinct sentence begin offset
    assert len(strict_starts) == 82
def test_extracting_sentences_from_0005_gs():
    """Applying the sentence config to an i2b2 PHI file (which contains
    no type:Sentence nodes) should yield no annotations.
    """
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    config_file = 'config/uima_sentences.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    strict_starts = \
        text_extraction.extract_annotations_xml(
            ingest_file,
            offset_mapping={},
            namespaces=namespaces,
            annotation_path='.//type:Sentence',
            tag_name='Sentence',
            begin_attribute='begin',
            end_attribute='end')
    assert strict_starts == {}
def test_of_presaved_dictionary_for_complex_patterns():
    """In-memory extraction (no out_file) should match the 'annotations'
    section of the presaved reference JSON.
    """
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    presaved_file = 'tests/data/i2b2_2016_track-1_reference_out/0005_gs.xml'
    config_file = 'config/i2b2_2016_track-1.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    with open(presaved_file, 'r') as fp:
        reloaded_json = json.load(fp)
    offset_mapping, strict_starts = \
        text_extraction.extract_annotations(
            ingest_file,
            namespaces=namespaces,
            document_data=document_data,
            patterns=patterns,
            skip_chars=r'[\s]',
            ## No file output — compare the in-memory dictionary only
            out_file=None)
    assert reloaded_json['annotations'] == strict_starts
def test_count_ref_set_by_type_and_file(capsys):
    """With --by-type and --by-file, count_ref_set output should list the
    total, per-file counts, and per-type counts for the reference corpus.

    Fixes: the expected-output loop reused its own iterable's name
    (``for expected_values in expected_values``), shadowing the list
    mid-iteration; a presaved JSON file was also loaded into an unused
    variable (dead code) — both cleaned up.
    """
    command_line_args = ['etude.py',
                         '--reference-input',
                         'tests/data/i2b2_2016_track-1_reference',
                         '--reference-config',
                         'config/i2b2_2016_track-1.conf',
                         '--by-type',
                         '--by-file',
                         '--print-counts',
                         '--no-metrics']
    with patch.object(sys, 'argv', command_line_args):
        args = etude.init_args()
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=args.reference_config,
                                        score_key=args.score_key,
                                        score_values=args.score_values)
    etude.count_ref_set(this_ns=namespaces,
                        this_dd=document_data,
                        this_patterns=patterns,
                        this_folder=args.reference_input,
                        args=args,
                        file_prefix=args.file_prefix,
                        file_suffix=args.file_suffix[0])
    default_out, err = capsys.readouterr()
    ## Header, corpus total, per-file counts, then per-type counts
    expected_table = [['counts', 'n'], ['Total', '482'],
                      ['0005_gs.xml', '36'], ['0016_gs.xml', '54'],
                      ['0267_gs.xml', '63'], ['0273_gs.xml', '35'],
                      ['0389_gs.xml', '40'], ['0475_gs.xml', '46'],
                      ['0617_gs.xml', '38'], ['0709_gs.xml', '45'],
                      ['0982_gs.xml', '100'], ['0992_gs.xml', '25'],
                      ['Age', '92'], ['DateTime', '124'],
                      ['HCUnit', '76'], ['OtherGeo', '5'],
                      ['OtherID', '7'], ['OtherOrg', '21'],
                      ['Patient', '19'], ['PhoneFax', '6'],
                      ['Provider', '64'], ['SSN', '0'],
                      ['StateCountry', '33'], ['StreetCity', '29'],
                      ['Zip', '4'], ['eAddress', '2']]
    ## Rebuild the expected table through print() so delimiter handling
    ## matches the production code path exactly
    for expected_row in expected_table:
        print(args.delim.join('{}'.format(m) for m in expected_row))
    expected_out, err = capsys.readouterr()
    assert default_out.strip() == expected_out.strip()
def test_of_identity_read_write_of_dictionary_for_complex_patterns():
    """Writing the extraction result to a file and reading it back
    should reproduce the in-memory annotations exactly.
    """
    ingest_file = 'tests/data/i2b2_2016_track-1_reference/0005_gs.xml'
    config_file = 'config/i2b2_2016_track-1.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    with tempfile.NamedTemporaryFile() as tmpfile_handle:
        assert os.path.exists(tmpfile_handle.name)
        offset_mapping, strict_starts = \
            text_extraction.extract_annotations(
                ingest_file,
                namespaces=namespaces,
                document_data=document_data,
                patterns=patterns,
                skip_chars=r'[\s]',
                out_file=tmpfile_handle.name)
        ## The handle was never written through directly, so its file
        ## position is still 0 and we can read what was written by name
        reloaded_json = json.load(tmpfile_handle)
        assert reloaded_json['annotations'] == strict_starts
        assert os.path.exists(tmpfile_handle.name)
    ## NamedTemporaryFile deletes the file on close
    assert os.path.exists(tmpfile_handle.name) == False
def test_empty_contents_of_write_of_dictionary_for_brat_patterns():
    """A brat file with no matching annotations should still write a
    well-formed out_file: empty 'annotations' plus the raw document
    content.
    """
    ingest_file = 'tests/data/brat_reference/ibm.ann'
    config_file = 'config/brat_problems_allergies_standoff.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    with tempfile.NamedTemporaryFile() as tmpfile_handle:
        assert os.path.exists(tmpfile_handle.name)
        offset_mapping, strict_starts = \
            text_extraction.extract_annotations(
                ingest_file,
                namespaces=namespaces,
                document_data=document_data,
                patterns=patterns,
                skip_chars=r'[\s]',
                out_file=tmpfile_handle.name)
        assert strict_starts == {}
        assert os.path.exists(tmpfile_handle.name)
        with open(tmpfile_handle.name, 'r') as rf:
            reloaded_out_file = json.load(rf)
        assert reloaded_out_file["annotations"] == {}
        assert reloaded_out_file[
            "raw_content"] == "International Business Machines Corporation: IBM is Big Blue\n"
    ## NamedTemporaryFile deletes the file on close
    assert os.path.exists(tmpfile_handle.name) == False
def test_extracting_with_and_without_optional_attributes_called_by_parent():
    """Going through the top-level extract_annotations() entry point,
    optional attributes should appear in the output only when the
    pattern declares them.
    """
    ingest_file = 'tests/data/013_Conditional_Problem.xmi'
    config_file = 'config/webanno_problems_allergies_xmi.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    ## Drop the last pattern so only the Problem pattern is extracted
    patterns.pop()
    offset_mapping, annots_with_opt_attributes = \
        text_extraction.extract_annotations(
            ingest_file,
            namespaces=namespaces,
            document_data=document_data,
            patterns=patterns,
            skip_chars=None,
            out_file=None)
    ## Re-run with the optional attributes stripped from the pattern
    patterns[0]['optional_attributes'] = []
    offset_mapping, annots_without_opt_attributes = \
        text_extraction.extract_annotations(
            ingest_file,
            namespaces=namespaces,
            document_data=document_data,
            patterns=patterns,
            skip_chars=None,
            out_file=None)
    expected_output_without_opt_attributes = \
        {'181': [{'type': 'Problem',
                  'begin_pos': '181',
                  'end_pos': '188',
                  'raw_text': None}],
         '218': [{'type': 'Problem',
                  'begin_pos': '218',
                  'end_pos': '224',
                  'raw_text': None}]}
    expected_output_with_opt_attributes = \
        {'181': [{'type': 'Problem',
                  'begin_pos': '181',
                  'end_pos': '188',
                  'raw_text': None,
                  'conditional': 'true',
                  'generic': 'false',
                  'historical': 'false',
                  'negated': 'false',
                  'not_patient': 'true',
                  'uncertain': 'false'}],
         '218': [{'type': 'Problem',
                  'begin_pos': '218',
                  'end_pos': '224',
                  'raw_text': None,
                  'conditional': 'false',
                  'generic': 'false',
                  'historical': 'true',
                  'negated': 'false',
                  'not_patient': 'false',
                  'uncertain': 'true'}]}
    assert annots_with_opt_attributes == \
        expected_output_with_opt_attributes
    assert annots_without_opt_attributes == \
        expected_output_without_opt_attributes
    ## Cross-checks: the two result sets really differ
    assert annots_with_opt_attributes != \
        expected_output_without_opt_attributes
    assert annots_without_opt_attributes != \
        expected_output_with_opt_attributes
args.test_config, args.test_input, args.test_out, args.score_key, args.fuzzy_flags ]) return args if __name__ == "__main__": ## args = init_args() ## Extract and process the two input file configs if (args.reference_input): try: reference_ns , reference_dd , reference_patterns = \ args_and_configs.process_config( config_file = args.reference_config , score_key = args.score_key , score_values = args.score_values , collapse_all_patterns = args.collapse_all_patterns , verbose = args.verbose ) except: e = sys.exc_info()[0] log.error( 'Uncaught exception in process_config for reference config: {}' .format(e)) if (reference_patterns == []): log.error( 'No reference patterns extracted from config. Bailing out now.' ) exit(1) if (args.test_input): try: test_ns , test_dd , test_patterns = \
def test_extracting_with_and_without_optional_attributes():
    """Calling extract_annotations_xml() directly, optional attributes
    should appear in the output only when passed via the
    optional_attributes argument.
    """
    ingest_file = 'tests/data/013_Conditional_Problem.xmi'
    config_file = 'config/webanno_problems_allergies_xmi.conf'
    namespaces, document_data, patterns = \
        args_and_configs.process_config(config_file=config_file,
                                        score_key='Short Name',
                                        score_values=['.*'])
    strict_starts_no_opt_attributes = \
        text_extraction.extract_annotations_xml(
            ingest_file,
            offset_mapping={},
            annotation_path='./custom:Problems',
            tag_name='Problem',
            namespaces=namespaces,
            begin_attribute='begin',
            end_attribute='end',
            ## Explicitly suppress the optional attribute extraction
            optional_attributes=[])
    strict_starts_with_opt_attributes = \
        text_extraction.extract_annotations_xml(
            ingest_file,
            offset_mapping={},
            annotation_path='./custom:Problems',
            tag_name='Problem',
            namespaces=namespaces,
            begin_attribute='begin',
            end_attribute='end',
            ## Use the attribute list declared by the config's pattern
            optional_attributes=patterns[0]['optional_attributes'])
    expected_output_no_opt_attributes = \
        {'181': [{'type': 'Problem',
                  'begin_pos': '181',
                  'end_pos': '188',
                  'raw_text': None}],
         '218': [{'type': 'Problem',
                  'begin_pos': '218',
                  'end_pos': '224',
                  'raw_text': None}]}
    expected_output_with_opt_attributes = \
        {'181': [{'type': 'Problem',
                  'begin_pos': '181',
                  'end_pos': '188',
                  'raw_text': None,
                  'conditional': 'true',
                  'generic': 'false',
                  'historical': 'false',
                  'negated': 'false',
                  'not_patient': 'true',
                  'uncertain': 'false'}],
         '218': [{'type': 'Problem',
                  'begin_pos': '218',
                  'end_pos': '224',
                  'raw_text': None,
                  'conditional': 'false',
                  'generic': 'false',
                  'historical': 'true',
                  'negated': 'false',
                  'not_patient': 'false',
                  'uncertain': 'true'}]}
    assert strict_starts_no_opt_attributes == \
        expected_output_no_opt_attributes
    assert strict_starts_with_opt_attributes == \
        expected_output_with_opt_attributes
    ## Cross-checks: the two result sets really differ
    assert strict_starts_no_opt_attributes != \
        expected_output_with_opt_attributes
    assert strict_starts_with_opt_attributes != \
        expected_output_no_opt_attributes