def test_entity_level():
    """Check entity-level F-scores for one exact and one inexact prediction.

    The predicted MISC span in ``doc_a`` starts one character before the gold
    span and is expected to score 0; the PER span in ``doc_b`` matches exactly
    and is expected to score 1.
    """
    def single_annotation_doc(name, start, end, tag):
        # All documents in this test carry empty text and exactly one annotation.
        return Document(name=name,
                        text='',
                        annotations=[Annotation('', start, end, tag)])

    gold = [
        single_annotation_doc('doc_a', 3, 6, 'MISC'),
        single_annotation_doc('doc_b', 0, 2, 'PER'),
    ]
    predicted = [
        single_annotation_doc('doc_a', 2, 6, 'MISC'),
        single_annotation_doc('doc_b', 0, 2, 'PER'),
    ]

    scores = Evaluator(gold, predicted).entity_level()

    assert scores.micro_avg_f_score() == 0.5
    assert scores.macro_avg_f_score() == 0.5
    assert scores.f_score('PER') == 1
    assert scores.f_score('MISC') == 0
示例#2
0
def test_annotation():
    """Annotations compare by value, reject attribute assignment and are hashable."""
    # Every field except ``text`` is identical across the three annotations.
    shared_fields = dict(start=12, end=15, tag='ABC', doc_id='123', ann_id='456')

    ann_a = Annotation(text='test', **shared_fields)
    ann_b = Annotation(text='test', **shared_fields)
    ann_c = Annotation(text='test2', **shared_fields)

    assert ann_a == ann_b
    assert ann_a != ann_c

    with pytest.raises(AttributeError):
        ann_a.text = "Annotation should be immutable"

    # Hashability: the two equal annotations must collapse into one set member.
    assert len({ann_a, ann_b, ann_c}) == 2
示例#3
0
def test_surrogate_annotations():
    """Surrogate generation replaces every annotation and keeps offsets consistent.

    Checks that the surrogate document has as many annotations as the input,
    that the non-annotated context of the text is still present after the
    rewrite, that no annotation was left without a surrogate, and that each
    surrogate annotation's offsets point at its own text.
    """
    text = "De patient J. Jansen (e: [email protected], t: 06-12345678)"
    annotations = [
        Annotation(text='J. Jansen',
                   start=11,
                   end=20,
                   tag='Name',
                   doc_id='',
                   ann_id='T0'),
        Annotation(text='*****@*****.**',
                   start=25,
                   end=42,
                   tag='Email',
                   doc_id='',
                   ann_id='T1'),
        Annotation(text='06-12345678',
                   start=47,
                   end=58,
                   tag='Phone_fax',
                   doc_id='',
                   ann_id='T2')
    ]
    doc = Document(name='test_doc', text=text, annotations=annotations)

    surrogate_doc = list(surrogate_annotations([doc]))[0]

    assert len(surrogate_doc.annotations) == len(doc.annotations)
    # Match against the rewritten text: matching the untouched input document
    # would pass regardless of what the surrogate generation produced.
    assert re.match(r'De patient .* \(e: .*, t: .*\)', surrogate_doc.text)
    assert not surrogate_doc.annotations_without_surrogates

    for ann in surrogate_doc.annotations:
        assert surrogate_doc.text[ann.start:ann.end] == ann.text
def test_generate_surrogates_shuffle_choices():
    """With two documents, each hospital name is expected to be swapped for the other."""
    def hospital_doc(text, hospital):
        # Build a document whose only annotation covers ``hospital`` in ``text``.
        begin = text.index(hospital)
        return Document(
            [Annotation(hospital, begin, begin + len(hospital), 'Hospital')],
            text)

    doc_1 = hospital_doc('Patient is being treated at UMCU.', 'UMCU')
    doc_2 = hospital_doc('Patient is being treated at MST.', 'MST')

    surrogate_docs = DatasetDeidentifier().generate_surrogates([doc_1, doc_2])

    # (original text, expected surrogate) per document, in input order.
    expected_swaps = [('UMCU', 'MST'), ('MST', 'UMCU')]
    for position, (original, replacement) in enumerate(expected_swaps):
        pairs = surrogate_docs[position].annotation_surrogate_pairs()
        original_annotations, surrogates = pairs
        assert len(original_annotations) == 1 and len(surrogates) == 1
        assert original_annotations[0].text == original
        assert surrogates[0] == replacement
def test_token_annotations():
    """Token tags follow the annotation spans; tag-blind mode reports ``ENT`` instead."""
    evaluator = Evaluator(gold=(), predicted=())
    entity_spans = [
        Annotation('B C', 2, 5, 'PER'),
        Annotation('D.', 6, 8, 'ORG'),
    ]
    doc = Document(name='doc_a', text='A B C D.', annotations=entity_spans)

    tagged = evaluator.token_annotations(doc)
    assert tagged == ['O', 'PER', 'PER', 'ORG']

    tag_blind = evaluator.token_annotations(doc, tag_blind=True)
    assert tag_blind == ['O', 'ENT', 'ENT', 'ENT']
def test_apply_surrogates_errors_raise():
    """A missing surrogate raises ValueError, both by default and with errors='raise'."""
    text = 'ccc cc ccc'
    spans = [(0, 3, 'A'), (4, 6, 'A'), (7, 10, 'B')]
    annotations = [
        Annotation(text[begin:stop], start=begin, end=stop, tag=label)
        for begin, stop, label in spans
    ]
    # The second annotation has no surrogate.
    surrogates = ['a', None, 'b']

    for extra_kwargs in ({}, {'errors': 'raise'}):
        with pytest.raises(ValueError):
            rewrite_dataset.apply_surrogates(text, annotations, surrogates,
                                             **extra_kwargs)
def test_flair_sentence_with_whitespace_tokens():
    """Whitespace tokens are kept when converting standoff documents to Flair sentences."""
    text = 'Mw geniet zichtbaar.  Maarten is de afgelopen periode veelal afwezig.'
    name = 'Maarten'
    name_start = text.index(name)
    person = Annotation(text=name,
                        start=name_start,
                        end=name_start + len(name),
                        tag='PERSON')
    doc = Document(name='', text=text, annotations=[person])

    tokenizer = TokenizerFactory().tokenizer('ons')
    flair_sents, docs = flair_utils.standoff_to_flair_sents([doc], tokenizer)

    # The double space after the first sentence shows up as a '<SPACE>' token.
    # It has to be retained in the Flair sentence; otherwise the original
    # document cannot be reconstructed from the tokenized representation.
    first_sentence_tokens = [token.text for token in flair_sents[0]]
    assert first_sentence_tokens == ['Mw', 'geniet', 'zichtbaar', '.', '<SPACE>']

    spacy_sents = list(docs[0].spacy_doc.sents)
    assert len(flair_sents) == 2
    assert len(spacy_sents) == 2

    # Flair and spaCy must agree on the number of tokens per sentence.
    for sentence_idx, expected_length in enumerate((5, 8)):
        assert len(flair_sents[sentence_idx]) == expected_length
        assert len(spacy_sents[sentence_idx]) == expected_length
示例#8
0
def annotations_iter(annotations):
    """Yield the annotations of each note as a separate list.

    The lines obtained from ``readlines(annotations)`` are split on whitespace
    into six fields: ``pid``, ``rid``, ``start``, ``end``, ``tag`` and ``text``
    (the text may itself contain whitespace). Consecutive lines sharing the
    same ``(pid, rid)`` pair are grouped into one list.

    Parameters
    ----------
    annotations
        Passed unchanged to ``readlines``.

    Yields
    ------
    list of Annotation
        One list per run of lines with the same ``(pid, rid)``. Annotation ids
        restart at ``T1`` for every list, and the ``doc_id`` has the form
        ``note-<pid>-<rid>``. Nothing is yielded when there are no lines.
    """
    lines = readlines(annotations)
    if len(lines) == 0:
        # An empty input contains no notes, so there is nothing to group.
        return

    current_pid, current_rid = lines[0].split(maxsplit=5)[0:2]

    note_annotations = []
    i = 1
    for line in lines:
        pid, rid, start, end, tag, text = line.strip().split(maxsplit=5)
        if pid != current_pid or rid != current_rid:
            # A new note starts: hand out the finished group and reset the counter.
            yield note_annotations
            note_annotations = []
            i = 1
            current_pid = pid
            current_rid = rid

        note_annotations.append(Annotation(
            text=text,
            start=int(start),
            end=int(end),
            tag=tag,
            ann_id='T{}'.format(i),
            doc_id='note-{}-{}'.format(current_pid, current_rid)
        ))
        i += 1

    # The last group is not followed by a change of (pid, rid); emit it here.
    yield note_annotations
示例#9
0
def apply_surrogates(text, annotations, surrogates):
    """Replace each annotated span of ``text`` with its surrogate.

    Annotations and surrogates are paired positionally. The text is walked
    from left to right, copying the stretch between the previous annotation's
    end and the current annotation's start.
    NOTE(review): this presumably requires annotations sorted by ``start`` and
    non-overlapping -- confirm against callers.

    Returns
    -------
    tuple
        ``(text_rewritten, adjusted_annotations)`` where the adjusted
        annotations carry the surrogate text and offsets into the rewritten
        text.
    """
    pieces = []
    rewritten_annotations = []
    # Running difference between offsets in the rewritten and the original
    # text: grows when surrogates are longer, shrinks when they are shorter.
    offset_delta = 0
    cursor = 0

    for original, replacement in zip(annotations, surrogates):
        untouched = text[cursor:original.start]

        new_start = original.start + offset_delta
        new_end = new_start + len(replacement)
        offset_delta += len(replacement) - len(original.text)

        rewritten_annotations.append(
            Annotation(text=replacement,
                       start=new_start,
                       end=new_end,
                       tag=original.tag,
                       doc_id=original.doc_id,
                       ann_id=original.ann_id))

        pieces.append(untouched)
        pieces.append(replacement)
        cursor = original.end

    # Copy whatever follows the last annotation.
    pieces.append(text[cursor:])
    return ''.join(pieces), rewritten_annotations
示例#10
0
def _bio_to_standoff(bio_tags: List[str],
                     spacy_doc: spacy.tokens.Doc) -> List[Annotation]:
    """Turn the BIO tags of a document into standoff annotations.

    Character offsets are recovered from the spaCy document the tags belong to.

    Parameters
    ----------
    bio_tags : List[str]
        BIO tags, one per token. `len(bio_tags) == len(spacy_doc)` has to hold.
    spacy_doc : spacy.tokens.Doc
        The spaCy doc corresponding to the BIO tags.

    Returns
    -------
    List[Annotation]
        One annotation per entity, with ids `T0`, `T1`, ... in the order the
        offsets are returned.

    """
    repaired_tags = fix_dangling_entities(bio_tags)
    entity_offsets = offsets_from_biluo_tags(spacy_doc,
                                             _bio_to_biluo(repaired_tags))

    return [
        Annotation(
            text=spacy_doc.char_span(begin, stop).text,
            start=begin,
            end=stop,
            tag=label,
            ann_id='T{}'.format(index),
        )
        for index, (begin, stop, label) in enumerate(entity_offsets)
    ]
def test_token_level():
    """Check token-level precision, recall and F-score per tag on two small documents."""
    text = 'A B C D.'

    def as_documents(named_annotations):
        # Every document shares the same text; only the annotations differ.
        return [
            Document(name=name, text=text, annotations=doc_annotations)
            for name, doc_annotations in named_annotations
        ]

    gold = as_documents([
        ('doc_a', [Annotation('B C', 2, 5, 'PER')]),
        ('doc_b', [Annotation('A', 0, 1, 'ORG'), Annotation('B', 2, 3, 'PER')]),
    ])
    predicted = as_documents([
        ('doc_a', [Annotation('B', 2, 3, 'PER'), Annotation('C', 4, 5, 'PER')]),
        ('doc_b', [Annotation('A', 0, 1, 'ORG'), Annotation('B', 2, 3, 'ORG')]),
    ])

    scores = Evaluator(gold, predicted).token_level()

    assert scores.precision('PER') == 1
    assert scores.recall('PER') == 0.6667
    assert scores.f_score('PER') == 0.8

    assert scores.precision('ORG') == 0.5
    assert scores.recall('ORG') == 1
    assert scores.f_score('ORG') == 0.6667
示例#12
0
def test_annotations():
    """The annotator reports the expected entities with offsets into the original text."""
    expected = [
        Annotation('Jan Jansen', 39, 49, 'PERSOON'),
        Annotation('patient J. Jansen', 54, 71, 'PERSOON'),
        Annotation('*****@*****.**', 76, 93, 'URL'),
        Annotation('06-12345678', 98, 109, 'TELEFOONNUMMER'),
        Annotation('64', 114, 116, 'LEEFTIJD'),
        Annotation('Utrecht', 143, 150, 'LOCATIE'),
        Annotation('10 oktober', 164, 174, 'DATUM'),
        Annotation('Peter de Visser', 185, 200, 'PERSOON'),
        # The last annotation is checked explicitly in its correct form:
        # Deduce annotates UMCU as umcu, and the annotator attempts to recover
        # the original text.
        Annotation('UMCU', 234, 238, 'INSTELLING'),
    ]

    assert ANNOTATOR.annotations() == expected
示例#13
0
    def annotations(self):
        """
        Return the annotated PHI entities with their offsets in the original
        (unannotated) text.

        Annotations whose text cannot be located in the original text are left
        out. Ids are assigned as `T0`, `T1`, ... in order of the annotations
        that are kept.
        """
        found = []

        # Deduce marks entities inline as <TYPE text>. The cursor tracks the
        # position in the original text while walking the annotated parts.
        cursor = 0

        for part in deduce.utility.split_tags(self.annotated_text):
            if not self.is_annotation(part):
                # Plain text: just advance past it.
                cursor += len(part)
                continue

            tag = self.annotation_tag(part)
            # Nested annotations are disregarded; their content counts as part
            # of the parent annotation.
            entity = self.flatten_annotation_content(part)

            # Deduce randomly removes spaces preceding an annotation, so the
            # entity is searched for from the cursor onwards rather than
            # assumed to sit exactly at the cursor. Searching only the
            # remaining text avoids capturing earlier, unrelated text.
            #
            # Casing is ignored because deduce sometimes changes it.
            # Example: deduce.annotate_text('UMCU') -> "<INSTELLING umcu>"
            relative_idx = self.text[cursor:].lower().find(entity.lower())

            if relative_idx < 0:
                # Sometimes Deduce changes the annotation text itself. Example:
                # gemeld door <PERSOON Jan van Jansen>
                # gemeld door <PERSOON Jan Jan van Jansen>
                #
                # In those cases the annotation cannot be recovered; skip it.
                cursor += len(entity)
                continue

            start_idx = cursor + relative_idx
            end_idx = start_idx + len(entity)

            found.append(
                Annotation(ann_id='T{}'.format(len(found)),
                           tag=tag,
                           text=self.text[start_idx:end_idx],
                           start=start_idx,
                           end=end_idx))
            cursor = end_idx

        return found
def test_annotate():
    """DeduceTagger is expected to find a name and a named location in the sample sentence."""
    tagger = DeduceTagger()
    document = Document(name='',
                        text='Jan Jannsen vanuit het UMCU.',
                        annotations=[])

    expected = [
        Annotation(text='Jan Jannsen', start=0, end=11, tag='Name',
                   doc_id='', ann_id='T0'),
        Annotation(text='UMCU', start=23, end=27, tag='Named_Location',
                   doc_id='', ann_id='T1'),
    ]

    tagged_docs = tagger.annotate([document])
    assert tagged_docs[0].annotations == expected
def test_annotate():
    """The tagger is expected to find a date and a name in the sample sentence."""
    sentence = 'Hij werd op 10 oktober door arts Peter de Visser ontslagen van de kliniek.'
    document = Document(name='', text=sentence, annotations=[])

    expected = [
        Annotation(text='10 oktober', start=12, end=22, tag='Date',
                   doc_id='', ann_id='T0'),
        Annotation(text='Peter de Visser', start=33, end=48, tag='Name',
                   doc_id='', ann_id='T1'),
    ]

    # ``tagger`` is not defined in this function; it is looked up in the
    # enclosing (module) scope.
    tagged_docs = tagger.annotate([document])
    assert tagged_docs[0].annotations == expected
示例#16
0
def main(args):
    """Rewrite a brat dataset by replacing annotations with surrogates.

    Reads the surrogate table from ``args.surrogate_table``, rewrites every
    document listed in it (text from ``args.data_path``) and writes the result
    to ``args.output_path``. Documents in ``args.data_path`` that do not occur
    in the table are copied over unchanged, both the .txt and the .ann file.
    """
    df_surrogates = pd.read_csv(args.surrogate_table)
    logger.info('Rewrite {} files.'.format(len(df_surrogates.doc_id.unique())))

    # A manual surrogate takes precedence; the automatically generated one is
    # only the fallback.
    df_surrogates['surrogate'] = df_surrogates.manual_surrogate.fillna(
        df_surrogates['surrogate'])

    def row_to_annotation(row):
        # One table row describes one annotation.
        return Annotation(text=row['text'],
                          start=row['start'],
                          end=row['end'],
                          tag=row['tag'],
                          doc_id=row['doc_id'],
                          ann_id=row['ann_id'])

    for doc_id, doc_rows in df_surrogates.groupby('doc_id'):
        text_path = join(args.data_path, '{}.txt'.format(doc_id))
        text = load_brat_text(text_path)

        # Order the rows by start offset before applying the surrogates.
        doc_rows = doc_rows.sort_values(by='start')
        annotations = doc_rows.apply(row_to_annotation, axis=1)
        surrogates = doc_rows.surrogate.values

        text_rewritten, adjusted_annotations = apply_surrogates(
            text, annotations, surrogates)

        write_brat_document(args.output_path,
                            doc_id,
                            text=text_rewritten,
                            annotations=adjusted_annotations)

    annotated_doc_ids = set(df_surrogates.doc_id.values)
    all_doc_ids = [
        splitext(basename(path))[0]
        for path in glob.glob(join(args.data_path, '*.txt'))
    ]
    unannotated_doc_ids = [
        stem for stem in all_doc_ids if stem not in annotated_doc_ids
    ]
    logger.info('Found {} files without any annotations. '
                'Copy them to output_path...'.format(
                    len(unannotated_doc_ids)))

    for stem in unannotated_doc_ids:
        # Copy the text file first, then its annotation file.
        for name_template in ('{}.txt', '{}.ann'):
            shutil.copy2(join(args.data_path, name_template.format(stem)),
                         args.output_path)

    logger.info('Done.')
示例#17
0
def test_surrogate_annotations_errors_coerce():
    """With errors='coerce', an annotation without surrogate becomes a ``[Tag]`` placeholder."""
    invalid_date = Annotation(text='INVALID_DATE',
                              start=29,
                              end=41,
                              tag='Date',
                              doc_id='',
                              ann_id='T0')
    original_doc = Document(name='test_doc',
                            text='This document was written on INVALID_DATE.',
                            annotations=[invalid_date])

    surrogate_docs = list(surrogate_annotations([original_doc], errors='coerce'))
    surrogate_doc = surrogate_docs[0]

    placeholder = Annotation(text='[Date]',
                             start=29,
                             end=35,
                             tag='Date',
                             doc_id='',
                             ann_id='T0')
    assert surrogate_doc.text == 'This document was written on [Date].'
    assert surrogate_doc.annotations == [placeholder]
    # The annotation that could not be replaced is reported back unchanged.
    assert surrogate_doc.annotations_without_surrogates == original_doc.annotations
示例#18
0
def load_brat_annotations(ann_file):
    """Load the annotations of a brat standoff (.ann) file.

    Only lines starting with `T` are read; all other lines are skipped.

    Brat fragment annotations are not supported. These annotations are
    inserted when annotating text spanning multiple lines. Such lines are
    skipped with a warning.

    Example of a fragment annotation that is not supported:
    `T30	Address 3232 3245;3246 3263	Calslaan 11 1234AB Wildervank`

    Parameters
    ----------
    ann_file : str
        Full path to .ann file.

    Returns
    -------
    list of deidentify.base.Annotation
        The annotations. The file name without extension is used as their
        `doc_id`.

    """
    doc_id = splitext(basename(ann_file))[0]

    with open(ann_file) as handle:
        ann_lines = handle.readlines()

    loaded = []
    for ann_line in ann_lines:
        if ann_line.startswith('T'):
            # Fields: id, tag, start, end, text. The text keeps its inner
            # whitespace because at most four splits are made.
            ann_id, tag, start, end, text = ann_line.split(None, 4)
            try:
                loaded.append(
                    Annotation(text=text.rstrip('\n'),
                               start=int(start),
                               end=int(end),
                               tag=tag,
                               doc_id=doc_id,
                               ann_id=ann_id))
            except ValueError:
                # Offsets such as "3245;3246" are not valid integers.
                logger.warning(
                    'Brat fragment annotations are not supported, skipping line\n{}'
                    .format(ann_line))

    return loaded
def test_generate_surrogates_without_choices():
    """With a single document, the hospital name is expected to be kept as its own surrogate."""
    hospital = 'UMCU'
    text = 'Patient is being treated at UMCU.'
    begin = text.index(hospital)
    doc = Document([Annotation(hospital, begin, begin + 4, 'Hospital')], text)

    surrogate_docs = DatasetDeidentifier().generate_surrogates([doc])
    pairs = surrogate_docs[0].annotation_surrogate_pairs()
    original_annotations, surrogates = pairs

    assert len(original_annotations) == 1
    assert len(surrogates) == 1
    assert original_annotations[0].text == 'UMCU'
    assert surrogates[0] == 'UMCU'
示例#20
0
def test_surrogate_annotations_errors_raise():
    """By default, an annotation without a valid surrogate raises a ValueError naming it."""
    invalid_date = Annotation(text='INVALID_DATE',
                              start=29,
                              end=41,
                              tag='Date',
                              doc_id='',
                              ann_id='T0')
    doc = Document(name='test_doc',
                   text='This document was written on INVALID_DATE.',
                   annotations=[invalid_date])

    expected_message = r'No valid surrogate for Annotation\(.*INVALID_DATE.*\)'
    with pytest.raises(ValueError, match=expected_message):
        # The result has to be consumed for the error to surface.
        _ = list(surrogate_annotations([doc]))[0]
示例#21
0
def apply_surrogates(text, annotations, surrogates, errors='raise'):
    """Replace each annotated span of ``text`` with its surrogate.

    Annotations and surrogates are paired positionally. The text is walked
    from left to right, copying the stretch between the previous annotation's
    end and the current annotation's start.
    NOTE(review): this presumably requires annotations sorted by ``start`` and
    non-overlapping -- confirm against callers.

    Parameters
    ----------
    text : str
        The original document text.
    annotations : iterable of Annotation
        The annotations to replace.
    surrogates : iterable
        One surrogate per annotation. A falsy entry (e.g. ``None`` or ``''``)
        means that no surrogate is available.
    errors : {'raise', 'ignore', 'coerce'}
        What to do when a surrogate is missing: ``'raise'`` raises a
        ValueError, ``'ignore'`` keeps the original annotation text and
        ``'coerce'`` substitutes ``[<tag>]``.

    Returns
    -------
    Document
        A document (with empty name) holding the rewritten text and the
        annotations adjusted to it. Its ``annotations_without_surrogates``
        attribute lists the original annotations that had no surrogate.

    Raises
    ------
    ValueError
        If a surrogate is missing and ``errors`` is ``'raise'``, or if a
        surrogate is missing and ``errors`` is not one of the supported values.
    """
    adjusted_annotations = []
    failed_replacements = []
    # Amount of characters by which start point of annotation is adjusted
    # Positive shift if surrogates are longer than original annotations
    # Negative shift if surrogates are shorter
    shift = 0
    original_text_pointer = 0
    # Collected pieces of the rewritten text; joined once at the end.
    rewritten_parts = []

    for annotation, surrogate in zip(annotations, surrogates):
        if not surrogate:
            if errors == 'raise':
                raise ValueError(f'No valid surrogate for {annotation}')
            if errors == 'ignore':
                surrogate = annotation.text
            elif errors == 'coerce':
                surrogate = f'[{annotation.tag}]'
            else:
                # Without this branch the surrogate stays unset and the length
                # computation below fails with an unrelated TypeError.
                raise ValueError(
                    f"Invalid value for errors: {errors!r}. "
                    "Expected 'raise', 'ignore' or 'coerce'.")
            failed_replacements.append(annotation)

        part = text[original_text_pointer:annotation.start]

        start = annotation.start + shift
        end = start + len(surrogate)
        shift += len(surrogate) - len(annotation.text)

        adjusted_annotations.append(Annotation(
            text=surrogate,
            start=start,
            end=end,
            tag=annotation.tag,
            doc_id=annotation.doc_id,
            ann_id=annotation.ann_id
        ))

        rewritten_parts.append(part)
        rewritten_parts.append(surrogate)
        original_text_pointer = annotation.end

    # Copy whatever follows the last annotation.
    rewritten_parts.append(text[original_text_pointer:])
    text_rewritten = ''.join(rewritten_parts)

    doc_rewritten = Document(name='', text=text_rewritten, annotations=adjusted_annotations)
    doc_rewritten.annotations_without_surrogates = failed_replacements
    return doc_rewritten
def test_apply_surrogates_errors_coerce():
    """With errors='coerce', a missing surrogate is replaced by a ``[tag]`` placeholder."""
    text = 'ccc cc ccc'
    first = Annotation('ccc', start=0, end=3, tag='A')
    without_surrogate = Annotation('cc', start=4, end=6, tag='A')
    last = Annotation('ccc', start=7, end=10, tag='B')

    surrogate_doc = rewrite_dataset.apply_surrogates(
        text,
        [first, without_surrogate, last],
        ['a', None, 'b'],
        errors='coerce')

    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('[A]', start=2, end=5, tag='A'),
        Annotation('b', start=6, end=7, tag='B'),
    ]
    assert surrogate_doc.text == 'a [A] b'
    assert surrogate_doc.annotations == expected_annotations
    # Only the annotation lacking a surrogate is reported, in its original form.
    assert surrogate_doc.annotations_without_surrogates == [
        Annotation('cc', start=4, end=6, tag='A'),
    ]
示例#23
0
def test_mask_annotations():
    """Masking replaces every annotation with ``[TAG]`` (upper-cased) and shifts the offsets."""
    def build_annotations(specs):
        # Ids are numbered T0, T1, ... in list order; doc_id is always empty.
        return [
            Annotation(text=ann_text,
                       start=begin,
                       end=stop,
                       tag=label,
                       doc_id='',
                       ann_id='T{}'.format(index))
            for index, (ann_text, begin, stop, label) in enumerate(specs)
        ]

    text = "De patient J. Jansen (e: [email protected], t: 06-12345678)"
    annotations = build_annotations([
        ('J. Jansen', 11, 20, 'Name'),
        ('*****@*****.**', 25, 42, 'Email'),
        ('06-12345678', 47, 58, 'Phone_fax'),
    ])
    doc = Document(name='test_doc', text=text, annotations=annotations)

    masked = mask_annotations(doc)

    expected_annotations = build_annotations([
        ('[NAME]', 11, 17, 'Name'),
        ('[EMAIL]', 22, 29, 'Email'),
        ('[PHONE_FAX]', 34, 45, 'Phone_fax'),
    ])
    assert masked.text == "De patient [NAME] (e: [EMAIL], t: [PHONE_FAX])"
    assert masked.annotations == expected_annotations
def test_apply_surrogates():
    """Surrogates of different length are applied and later offsets shift accordingly."""
    text = 'ccc cc ccc c c ccc cccccc cccc'
    spans = [(0, 3, 'A'), (4, 6, 'A'), (15, 18, 'B')]
    annotations = [
        Annotation(text[begin:stop], start=begin, end=stop, tag=label)
        for begin, stop, label in spans
    ]
    surrogates = ['a', 'dd', 'bbbbb']

    surrogate_doc = rewrite_dataset.apply_surrogates(text, annotations, surrogates)

    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('dd', start=2, end=4, tag='A'),
        Annotation('bbbbb', start=13, end=18, tag='B'),
    ]
    assert surrogate_doc.text == 'a dd ccc c c bbbbb cccccc cccc'
    assert surrogate_doc.annotations == expected_annotations
    # Every annotation had a surrogate, so nothing is reported as failed.
    assert surrogate_doc.annotations_without_surrogates == []
示例#25
0
def xml_to_document(xml_file):
    """Converts an i2b2/UTHealth XML document to a `deidentify.base.Document`.

    XML Structure:
    ```
    <?xml version="1.0" encoding="UTF-8" ?>
    <deIdi2b2>
    <TEXT><![CDATA[
        this is the record content
    ]]></TEXT>
    <TAGS>
    <DATE id="P0" start="16" end="26" text="2067-05-03" TYPE="DATE" comment="" />
    <AGE id="P1" start="50" end="52" text="55" TYPE="AGE" comment="" />
    </TAGS>
    </deIdi2b2>
    ```

    The document name is the file name without extension, prefixed with
    `doc-`. Each child of `<TAGS>` becomes one annotation whose tag is
    `<element name>:<TYPE>`, translated through `TAG_MAPPING` when a mapping
    exists. Annotation offsets are converted to integers.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()
    text = root.find('TEXT').text
    doc_name = 'doc-' + splitext(basename(xml_file))[0]

    annotations = []
    for tag_element in root.find('TAGS'):
        attributes = tag_element.attrib
        # Example: NAME:DOCTOR
        tag_name = tag_element.tag + ':' + attributes['TYPE']
        annotations.append(Annotation(
            text=attributes['text'],
            # XML attribute values are strings; offsets must be integers so
            # they can be used for slicing and offset arithmetic.
            start=int(attributes['start']),
            end=int(attributes['end']),
            tag=TAG_MAPPING.get(tag_name, tag_name),
            doc_id=doc_name,
            # i2b2 annotations have id prefixed with P. Example: P12
            ann_id='T{}'.format(attributes['id'][1:])
        ))

    return Document(name=doc_name, text=text, annotations=annotations)
示例#26
0
def test_rewrite_text():
    """apply_surrogates returns the rewritten text together with the shifted annotations."""
    text = 'ccc cc ccc c c ccc cccccc cccc'
    spans = [(0, 3, 'A'), (4, 6, 'A'), (15, 18, 'B')]
    annotations = [
        Annotation(text[begin:stop], start=begin, end=stop, tag=label)
        for begin, stop, label in spans
    ]
    surrogates = ['a', 'dd', 'bbbbb']

    text_rewritten, adjusted_annotations = rewrite_dataset.apply_surrogates(
        text, annotations, surrogates)

    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('dd', start=2, end=4, tag='A'),
        Annotation('bbbbb', start=13, end=18, tag='B'),
    ]
    assert text_rewritten == 'a dd ccc c c bbbbb cccccc cccc'
    assert adjusted_annotations == expected_annotations