Exemplo n.º 1
0
def test_surrogate_annotations_errors_ignore():
    """Check `surrogate_annotations` with `errors='ignore'` on an unparsable date.

    The test expects the returned document to carry the same text and
    annotations as the input, and the annotation to be listed in
    `annotations_without_surrogates`.
    """
    invalid_date = Annotation(text='INVALID_DATE',
                              start=29,
                              end=41,
                              tag='Date',
                              doc_id='',
                              ann_id='T0')
    source_doc = Document(name='test_doc',
                          text='This document was written on INVALID_DATE.',
                          annotations=[invalid_date])

    rewritten_docs = list(surrogate_annotations([source_doc], errors='ignore'))
    rewritten = rewritten_docs[0]

    # Text and annotations must come back untouched ...
    assert rewritten.text == source_doc.text
    assert rewritten.annotations == source_doc.annotations
    # ... and the annotation must be reported as having no surrogate.
    assert rewritten.annotations_without_surrogates == source_doc.annotations
Exemplo n.º 2
0
def apply_surrogates(text, annotations, surrogates, errors='raise'):
    """Replace the annotated spans of `text` with the given surrogates.

    `annotations` and `surrogates` are consumed pairwise. For every pair, the
    span `text[annotation.start:annotation.end]` is replaced by the surrogate
    and a new annotation with shifted offsets is created for it.

    The read position in `text` only moves forward, so annotations are
    expected in ascending, non-overlapping order. This is not checked here.

    Args:
        text: The original document text.
        annotations: Annotations with `text`, `start`, `end`, `tag`, `doc_id`
            and `ann_id` attributes.
        surrogates: One replacement string per annotation. A falsy entry
            (e.g. `None` or `''`) means that no surrogate is available.
        errors: What to do when a surrogate is missing:
            `'raise'` raises a `ValueError`,
            `'ignore'` keeps the original annotation text,
            `'coerce'` inserts the annotation tag in brackets, e.g. `[Date]`.

    Returns:
        A `Document` with an empty name, the rewritten text and the adjusted
        annotations. Its `annotations_without_surrogates` attribute lists the
        original annotations for which no surrogate was available.

    Raises:
        ValueError: If a surrogate is missing and `errors` is `'raise'`, or if
            a surrogate is missing and `errors` is not one of the modes above.
    """
    adjusted_annotations = []
    failed_replacements = []
    # Collect the output pieces and join once at the end instead of growing a
    # string with `+=` inside the loop.
    rewritten_parts = []

    # Amount of characters by which start point of annotation is adjusted
    # Positive shift if surrogates are longer than original annotations
    # Negative shift if surrogates are shorter
    shift = 0
    original_text_pointer = 0

    for annotation, surrogate in zip(annotations, surrogates):
        if not surrogate:
            if errors == 'raise':
                raise ValueError(f'No valid surrogate for {annotation}')
            if errors == 'ignore':
                surrogate = annotation.text
            elif errors == 'coerce':
                surrogate = f'[{annotation.tag}]'
            else:
                # Fail with a clear message instead of falling through with a
                # missing surrogate and crashing further down.
                raise ValueError(
                    f"Invalid value for errors: {errors!r}. "
                    "Expected 'raise', 'ignore' or 'coerce'."
                )
            failed_replacements.append(annotation)

        # Unannotated text between the previous annotation and this one.
        rewritten_parts.append(text[original_text_pointer:annotation.start])
        rewritten_parts.append(surrogate)

        start = annotation.start + shift
        end = start + len(surrogate)
        shift += len(surrogate) - len(annotation.text)

        adjusted_annotations.append(Annotation(
            text=surrogate,
            start=start,
            end=end,
            tag=annotation.tag,
            doc_id=annotation.doc_id,
            ann_id=annotation.ann_id
        ))

        original_text_pointer = annotation.end

    # Remainder of the text after the last annotation.
    rewritten_parts.append(text[original_text_pointer:])

    doc_rewritten = Document(name='',
                             text=''.join(rewritten_parts),
                             annotations=adjusted_annotations)
    doc_rewritten.annotations_without_surrogates = failed_replacements
    return doc_rewritten
def test_apply_surrogates_errors_coerce():
    """With `errors='coerce'`, a missing surrogate becomes the bracketed tag."""
    text = 'ccc cc ccc'
    spans = [
        ('ccc', 0, 3, 'A'),
        ('cc', 4, 6, 'A'),
        ('ccc', 7, 10, 'B'),
    ]
    annotations = [
        Annotation(span_text, start=begin, end=stop, tag=label)
        for span_text, begin, stop, label in spans
    ]
    # The second annotation has no surrogate and has to be coerced to '[A]'.
    surrogates = ['a', None, 'b']

    result = rewrite_dataset.apply_surrogates(
        text, annotations, surrogates, errors='coerce'
    )

    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('[A]', start=2, end=5, tag='A'),
        Annotation('b', start=6, end=7, tag='B')
    ]
    expected_failures = [Annotation('cc', start=4, end=6, tag='A')]

    assert result.text == 'a [A] b'
    assert result.annotations == expected_annotations
    assert result.annotations_without_surrogates == expected_failures
Exemplo n.º 4
0
def main(args):
    """Rewrite a brat dataset using the surrogates from a CSV table.

    Reads the table at `args.surrogate_table`, and for every `doc_id` in it
    loads `<args.data_path>/<doc_id>.txt`, applies the surrogates and writes
    the result to `args.output_path`. The `.txt`/`.ann` pairs of documents in
    `args.data_path` that do not occur in the table are copied over unchanged.
    """
    def to_annotation(row):
        # Build an Annotation from one row of the surrogate table.
        return Annotation(
            text=row['text'],
            start=row['start'],
            end=row['end'],
            tag=row['tag'],
            doc_id=row['doc_id'],
            ann_id=row['ann_id']
        )

    surrogate_table = pd.read_csv(args.surrogate_table)
    n_documents = len(surrogate_table.doc_id.unique())
    logger.info(f'Rewrite {n_documents} files.')

    # Use manual surrogate if it exists. If not, use the automatically generated one
    automatic_surrogates = surrogate_table['surrogate']
    surrogate_table['surrogate'] = surrogate_table.manual_surrogate.fillna(automatic_surrogates)

    for doc_id, doc_rows in surrogate_table.groupby('doc_id'):
        text_path = join(args.data_path, f'{doc_id}.txt')
        text = load_brat_text(text_path)

        # Process the annotations in the order of their start offset.
        doc_rows = doc_rows.sort_values(by='start')
        annotations = doc_rows.apply(to_annotation, axis=1)

        surrogate_doc = apply_surrogates(text, annotations, doc_rows.surrogate.values)
        write_brat_document(
            args.output_path,
            doc_id,
            text=surrogate_doc.text,
            annotations=surrogate_doc.annotations
        )

    annotated_ids = set(surrogate_table.doc_id.values)
    text_paths = glob.glob(join(args.data_path, '*.txt'))
    all_ids = (splitext(basename(path))[0] for path in text_paths)
    unannotated_ids = [file_id for file_id in all_ids if file_id not in annotated_ids]
    logger.info(f'Found {len(unannotated_ids)} files without any annotations. '
                'Copy them to output_path...')

    for file_id in unannotated_ids:
        for extension in ('txt', 'ann'):
            shutil.copy2(join(args.data_path, f'{file_id}.{extension}'), args.output_path)

    logger.info('Done.')
Exemplo n.º 5
0
def test_mask_annotations():
    """Check that `mask_annotations` replaces every span by its upper-cased tag.

    Each annotation text in the fixture equals `text[start:end]`, so the
    expected offsets after masking can be derived from the length differences
    between the original spans and their `[TAG]` placeholders.
    """
    text = "De patient J. Jansen (e: j.jnsen@email.com, t: 06-12345678)"
    annotations = [
        Annotation(text='J. Jansen',
                   start=11,
                   end=20,
                   tag='Name',
                   doc_id='',
                   ann_id='T0'),
        # The e-mail address covers characters 25-42 (17 characters); the
        # annotation text has to match that span exactly.
        Annotation(text='j.jnsen@email.com',
                   start=25,
                   end=42,
                   tag='Email',
                   doc_id='',
                   ann_id='T1'),
        Annotation(text='06-12345678',
                   start=47,
                   end=58,
                   tag='Phone_fax',
                   doc_id='',
                   ann_id='T2')
    ]

    doc = Document(name='test_doc', text=text, annotations=annotations)

    doc = mask_annotations(doc)
    assert doc.text == "De patient [NAME] (e: [EMAIL], t: [PHONE_FAX])"
    assert doc.annotations == [
        Annotation(text='[NAME]',
                   start=11,
                   end=17,
                   tag='Name',
                   doc_id='',
                   ann_id='T0'),
        Annotation(text='[EMAIL]',
                   start=22,
                   end=29,
                   tag='Email',
                   doc_id='',
                   ann_id='T1'),
        Annotation(text='[PHONE_FAX]',
                   start=34,
                   end=45,
                   tag='Phone_fax',
                   doc_id='',
                   ann_id='T2')
    ]
def test_apply_surrogates():
    """Surrogates of different lengths shift the offsets of later annotations."""
    text = 'ccc cc ccc c c ccc cccccc cccc'
    spans = [
        ('ccc', 0, 3, 'A'),
        ('cc', 4, 6, 'A'),
        ('ccc', 15, 18, 'B'),
    ]
    annotations = [
        Annotation(span_text, start=begin, end=stop, tag=label)
        for span_text, begin, stop, label in spans
    ]
    surrogates = ['a', 'dd', 'bbbbb']

    result = rewrite_dataset.apply_surrogates(text, annotations, surrogates)

    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('dd', start=2, end=4, tag='A'),
        Annotation('bbbbb', start=13, end=18, tag='B')
    ]

    assert result.text == 'a dd ccc c c bbbbb cccccc cccc'
    assert result.annotations == expected_annotations
    # Every annotation received a surrogate, so nothing is reported as failed.
    assert result.annotations_without_surrogates == []
Exemplo n.º 7
0
def xml_to_document(xml_file):
    """Converts an i2b2/UTHealth XML document to a `deidentify.base.Document`.

    XML Structure:
    ```
    <?xml version="1.0" encoding="UTF-8" ?>
    <deIdi2b2>
    <TEXT><![CDATA[
        this is the record content
    ]]></TEXT>
    <TAGS>
    <DATE id="P0" start="16" end="26" text="2067-05-03" TYPE="DATE" comment="" />
    <AGE id="P1" start="50" end="52" text="55" TYPE="AGE" comment="" />
    </TAGS>
    </deIdi2b2>
    ```

    Args:
        xml_file: Path of the XML file. The document name is derived from its
            base name without extension, prefixed with `doc-`.

    Returns:
        A `Document` holding the content of the `TEXT` element and one
        `Annotation` per child of the `TAGS` element. Annotation offsets are
        integers.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()
    text = root.find('TEXT').text
    doc_name = 'doc-' + splitext(basename(xml_file))[0]

    annotations = []
    for tag_element in root.find('TAGS'):
        attributes = tag_element.attrib
        # Example: NAME:DOCTOR
        tag_name = tag_element.tag + ':' + attributes['TYPE']
        annotations.append(Annotation(
            text=attributes['text'],
            # XML attribute values are strings; offsets are used for
            # arithmetic and slicing elsewhere, so convert them to int.
            start=int(attributes['start']),
            end=int(attributes['end']),
            # Fall back to the raw `ELEMENT:TYPE` name if there is no mapping.
            tag=TAG_MAPPING.get(tag_name, tag_name),
            doc_id=doc_name,
            # i2b2 annotations have id prefixed with P. Example: P12
            ann_id='T{}'.format(attributes['id'][1:])
        ))

    return Document(name=doc_name, text=text, annotations=annotations)
Exemplo n.º 8
0
def test_rewrite_text():
    """Check the rewritten text and the adjusted annotation offsets.

    `apply_surrogates` returns a document object, so the rewritten text and
    the adjusted annotations are read from its `text` and `annotations`
    attributes.
    """
    text = 'ccc cc ccc c c ccc cccccc cccc'
    annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=15, end=18, tag='B')
    ]

    surrogates = ['a', 'dd', 'bbbbb']

    result = rewrite_dataset.apply_surrogates(text, annotations, surrogates)
    assert result.text == 'a dd ccc c c bbbbb cccccc cccc'
    assert result.annotations == [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('dd', start=2, end=4, tag='A'),
        Annotation('bbbbb', start=13, end=18, tag='B')
    ]