Example #1
def test_entity_level():
    gold = [
        Document(name='doc_a',
                 text='',
                 annotations=[Annotation('', 3, 6, 'MISC')]),
        Document(name='doc_b',
                 text='',
                 annotations=[Annotation('', 0, 2, 'PER')])
    ]

    predicted = [
        Document(name='doc_a',
                 text='',
                 annotations=[Annotation('', 2, 6, 'MISC')]),
        Document(name='doc_b',
                 text='',
                 annotations=[Annotation('', 0, 2, 'PER')])
    ]

    evaluator = Evaluator(gold, predicted)
    scores = evaluator.entity_level()
    assert scores.micro_avg_f_score() == 0.5
    assert scores.macro_avg_f_score() == 0.5
    assert scores.f_score('PER') == 1
    assert scores.f_score('MISC') == 0
Example #2
def sents_to_standoff(sentence_tags: List[List[str]],
                      docs: List[ParsedDoc]) -> List[Document]:
    """Convert a BIO tagged documents to standoff annotated documents.

    Parameters
    ----------
    sentence_tags : List[List[str]]
        List of sentences, each given as a list of BIO tags (one tag per token).
    docs : List[ParsedDoc]
        The documents corresponding to each sentence.

    Returns
    -------
    annotated_docs : List[Document]
        The documents with annotated entities in standoff format.

    """
    tags_by_doc = _group_sentences(sentence_tags, docs)

    annotated_docs = []
    for doc, tags in tags_by_doc:
        try:
            annotated_docs.append(
                Document(name=doc.name,
                         text=doc.text,
                         annotations=_bio_to_standoff(tags, doc.spacy_doc)))
        except Exception as e:
            logger.warning(
                'Could not convert document {} to standoff.\ntags = {}\n{}'.format(
                    doc.name, tags, e))

            annotated_docs.append(
                Document(name=doc.name, text=doc.text, annotations=[]))

    return annotated_docs
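# A minimal sketch of the BIO-to-standoff conversion that `_bio_to_standoff`
# presumably performs: open a span on 'B-*', extend it on a matching 'I-*',
# close it otherwise. The `Token` tuple and the `bio_to_spans` helper below
# are illustrative assumptions, not the library's actual types.
from typing import List, NamedTuple, Tuple


class Token(NamedTuple):
    text: str
    start: int  # character offset in the document
    end: int


def bio_to_spans(tokens: List[Token], tags: List[str]) -> List[Tuple[int, int, str]]:
    spans, current = [], None
    for token, tag in zip(tokens, tags):
        if tag.startswith('B-'):
            if current:
                spans.append(tuple(current))
            current = [token.start, token.end, tag[2:]]
        elif tag.startswith('I-') and current and tag[2:] == current[2]:
            current[1] = token.end  # extend the open span
        else:
            if current:
                spans.append(tuple(current))
            current = None
    if current:
        spans.append(tuple(current))
    return spans


assert bio_to_spans([Token('Jan', 0, 3), Token('Jansen', 4, 10)],
                    ['B-Name', 'I-Name']) == [(0, 10, 'Name')]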
Example #3
def test_token_level():
    text = 'A B C D.'

    gold_a = [Annotation('B C', 2, 5, 'PER')]
    gold_b = [Annotation('A', 0, 1, 'ORG'), Annotation('B', 2, 3, 'PER')]

    pred_a = [Annotation('B', 2, 3, 'PER'), Annotation('C', 4, 5, 'PER')]
    pred_b = [Annotation('A', 0, 1, 'ORG'), Annotation('B', 2, 3, 'ORG')]

    gold = [
        Document(name='doc_a', text=text, annotations=gold_a),
        Document(name='doc_b', text=text, annotations=gold_b)
    ]

    predicted = [
        Document(name='doc_a', text=text, annotations=pred_a),
        Document(name='doc_b', text=text, annotations=pred_b)
    ]

    evaluator = Evaluator(gold, predicted)
    scores = evaluator.token_level()
    assert scores.precision('PER') == 1
    assert scores.recall('PER') == 0.6667
    assert scores.f_score('PER') == 0.8

    assert scores.precision('ORG') == 0.5
    assert scores.recall('ORG') == 1
    assert scores.f_score('ORG') == 0.6667
Example #4
def test_flair_sentence_with_whitespace_tokens():
    text = 'Mw geniet zichtbaar.  Maarten is de afgelopen periode veelal afwezig.'
    annotation = Annotation(text='Maarten',
                            start=text.index('Maarten'),
                            end=text.index('Maarten') + len('Maarten'),
                            tag='PERSON')
    doc = Document(name='', text=text, annotations=[annotation])

    tokenizer = TokenizerFactory().tokenizer('ons')
    flair_sents, docs = flair_utils.standoff_to_flair_sents([doc], tokenizer)

    # spaCy collapses a run of consecutive whitespace into a single whitespace
    # token. That token has to be retained in the Flair sentence, otherwise it
    # is not possible to reconstruct the original document from the tokenized
    # representation.
    assert [token.text for token in flair_sents[0]] == [
        'Mw', 'geniet', 'zichtbaar', '.', '<SPACE>'
    ]

    spacy_doc = docs[0].spacy_doc
    spacy_sents = list(spacy_doc.sents)
    assert len(flair_sents) == 2
    assert len(spacy_sents) == 2

    assert len(flair_sents[0]) == 5
    assert len(spacy_sents[0]) == 5
    assert len(flair_sents[1]) == 8
    assert len(spacy_sents[1]) == 8
Example #5
def test_surrogate_annotations():
    text = "De patient J. Jansen (e: [email protected], t: 06-12345678)"
    annotations = [
        Annotation(text='J. Jansen',
                   start=11,
                   end=20,
                   tag='Name',
                   doc_id='',
                   ann_id='T0'),
        Annotation(text='j.jnsen@email.com',
                   start=25,
                   end=42,
                   tag='Email',
                   doc_id='',
                   ann_id='T1'),
        Annotation(text='06-12345678',
                   start=47,
                   end=58,
                   tag='Phone_fax',
                   doc_id='',
                   ann_id='T2')
    ]
    doc = Document(name='test_doc', text=text, annotations=annotations)

    surrogate_doc = list(surrogate_annotations([doc]))[0]

    assert len(surrogate_doc.annotations) == len(doc.annotations)
    assert re.match(r'De patient .* \(e: .*, t: .*\)', surrogate_doc.text)
    assert not surrogate_doc.annotations_without_surrogates

    for ann in surrogate_doc.annotations:
        assert surrogate_doc.text[ann.start:ann.end] == ann.text
Example #6
def load_data():
    corpus = CONLL_03_DUTCH()
    sentences = corpus.train[:N_SENTS]
    tokens = sum(len(sent) for sent in sentences)
    docs = [
        Document(name='', text=sent.to_plain_string(), annotations=[])
        for sent in sentences
    ]
    return docs, tokens
Example #7
def apply_surrogates(text, annotations, surrogates, errors='raise'):
    adjusted_annotations = []
    # Number of characters by which the start of each annotation is shifted:
    # positive if the surrogates are longer than the original annotations,
    # negative if they are shorter.
    shift = 0
    original_text_pointer = 0
    text_rewritten = ''

    failed_replacements = []

    for annotation, surrogate in zip(annotations, surrogates):
        if not surrogate:
            if errors == 'raise':
                raise ValueError(f'No valid surrogate for {annotation}')
            if errors == 'ignore':
                surrogate = annotation.text
            elif errors == 'coerce':
                surrogate = f'[{annotation.tag}]'
            failed_replacements.append(annotation)

        part = text[original_text_pointer:annotation.start]

        start = annotation.start + shift
        end = start + len(surrogate)
        shift += len(surrogate) - len(annotation.text)

        adjusted_annotations.append(Annotation(
            text=surrogate,
            start=start,
            end=end,
            tag=annotation.tag,
            doc_id=annotation.doc_id,
            ann_id=annotation.ann_id
        ))

        text_rewritten += part + surrogate
        original_text_pointer = annotation.end

    text_rewritten += text[original_text_pointer:]
    doc_rewritten = Document(name='', text=text_rewritten, annotations=adjusted_annotations)
    doc_rewritten.annotations_without_surrogates = failed_replacements
    return doc_rewritten
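# A hedged usage sketch of `apply_surrogates`, reusing the offset scheme of the
# fixtures above; the surrogate string itself is made up.
text = 'De patient J. Jansen'
annotations = [
    Annotation(text='J. Jansen', start=11, end=20, tag='Name',
               doc_id='', ann_id='T0')
]
doc = apply_surrogates(text, annotations, surrogates=['P. Pietersen'])

assert doc.text == 'De patient P. Pietersen'
# The annotation span is shifted/stretched to cover the surrogate.
assert doc.annotations[0].text == 'P. Pietersen'
assert (doc.annotations[0].start, doc.annotations[0].end) == (11, 23)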
Example #8
def test_token_annotations():
    evaluator = Evaluator(gold=(), predicted=())
    doc = Document(name='doc_a',
                   text='A B C D.',
                   annotations=[
                       Annotation('B C', 2, 5, 'PER'),
                       Annotation('D.', 6, 8, 'ORG')
                   ])

    assert evaluator.token_annotations(doc) == ['O', 'PER', 'PER', 'ORG']
    assert evaluator.token_annotations(
        doc, tag_blind=True) == ['O', 'ENT', 'ENT', 'ENT']
Example #9
    def _load_folder(path):
        files = glob.glob(join(path, '*.ann'))
        files = sorted(files)

        documents = []
        for file in files:
            doc_name = get_basename(file)
            annotations, text = brat.load_brat_document(path, doc_name)
            doc = Document(name=doc_name, text=text, annotations=annotations)
            documents.append(doc)

        return documents
Example #10
def _unflatten_tags(tag_mapping, documents):
    replaced_docs = []

    for doc in documents:
        new_annotations = []
        for ann in doc.annotations:
            ann_key = (doc.name, ann.start, ann.end, ann.tag)
            new_tag = tag_mapping.get(ann_key, ann.tag)
            new_ann = ann._replace(tag=new_tag)
            new_annotations.append(new_ann)
        replaced_docs.append(
            Document(name=doc.name, text=doc.text,
                     annotations=new_annotations))

    return replaced_docs
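# A small illustration of the mapping format `_unflatten_tags` expects: keys
# are (doc_name, start, end, old_tag) tuples. The fixture values below are
# hypothetical.
tag_mapping = {('doc_a', 0, 3, 'ENT'): 'PER'}
docs = [
    Document(name='doc_a', text='A B',
             annotations=[Annotation('A B', 0, 3, 'ENT')])
]

remapped = _unflatten_tags(tag_mapping, docs)
assert remapped[0].annotations[0].tag == 'PER'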
Example #11
def test_surrogate_annotations_errors_raise():
    doc = Document(name='test_doc',
                   text='This document was written on INVALID_DATE.',
                   annotations=[
                       Annotation(text='INVALID_DATE',
                                  start=29,
                                  end=41,
                                  tag='Date',
                                  doc_id='',
                                  ann_id='T0')
                   ])

    with pytest.raises(
            ValueError,
            match=r'No valid surrogate for Annotation\(.*INVALID_DATE.*\)'):
        _ = list(surrogate_annotations([doc]))[0]
Example #12
def documents_iter(notes):
    lines = readlines(notes)

    record_lines = []
    for line in lines:
        if line.startswith('START_OF_RECORD'):
            record_lines = []
            patient_id, record_id = re.findall(r'\d+', line)
        elif line.startswith('||||END_OF_RECORD'):
            yield Document(
                name='note-{}-{}'.format(patient_id, record_id),
                text=''.join(record_lines).rstrip(),
                annotations=[]
            )
        else:
            record_lines.append(line)
Example #13
def test_mask_annotations():
    text = "De patient J. Jansen (e: [email protected], t: 06-12345678)"
    annotations = [
        Annotation(text='J. Jansen',
                   start=11,
                   end=20,
                   tag='Name',
                   doc_id='',
                   ann_id='T0'),
        Annotation(text='j.jnsen@email.com',
                   start=25,
                   end=42,
                   tag='Email',
                   doc_id='',
                   ann_id='T1'),
        Annotation(text='06-12345678',
                   start=47,
                   end=58,
                   tag='Phone_fax',
                   doc_id='',
                   ann_id='T2')
    ]

    doc = Document(name='test_doc', text=text, annotations=annotations)

    doc = mask_annotations(doc)
    assert doc.text == "De patient [NAME] (e: [EMAIL], t: [PHONE_FAX])"
    assert doc.annotations == [
        Annotation(text='[NAME]',
                   start=11,
                   end=17,
                   tag='Name',
                   doc_id='',
                   ann_id='T0'),
        Annotation(text='[EMAIL]',
                   start=22,
                   end=29,
                   tag='Email',
                   doc_id='',
                   ann_id='T1'),
        Annotation(text='[PHONE_FAX]',
                   start=34,
                   end=45,
                   tag='Phone_fax',
                   doc_id='',
                   ann_id='T2')
    ]
Example #14
def test_surrogate_annotations_errors_ignore():
    original_doc = Document(name='test_doc',
                            text='This document was written on INVALID_DATE.',
                            annotations=[
                                Annotation(text='INVALID_DATE',
                                           start=29,
                                           end=41,
                                           tag='Date',
                                           doc_id='',
                                           ann_id='T0')
                            ])

    gen = surrogate_annotations([original_doc], errors='ignore')
    surrogate_doc = list(gen)[0]
    assert surrogate_doc.text == original_doc.text
    assert surrogate_doc.annotations == original_doc.annotations
    assert surrogate_doc.annotations_without_surrogates == original_doc.annotations
Example #15
def predict(documents: List[Document],
            corpus_name='ons',
            verbose=False) -> List[Document]:
    predictions = []

    for doc in tqdm(documents, disable=not verbose, desc='Tag documents'):
        annotator = DeduceAnnotator(doc.text)
        annotations = annotator.annotations()
        if corpus_name == 'ons':
            annotations = rewrite_annotations(doc.text, annotations)

        new_doc = Document(name=doc.name,
                           text=doc.text,
                           annotations=annotations)
        predictions.append(new_doc)

    return predictions
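# Minimal invocation sketch for `predict`. The text is borrowed from the DEDUCE
# test above; this assumes the DEDUCE dependency is installed.
docs = [Document(name='doc_01', text='Jan Jannsen vanuit het UMCU.',
                 annotations=[])]
tagged = predict(docs, corpus_name='ons', verbose=False)
assert len(tagged) == len(docs)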
Example #16
def mask_annotations(document: Document,
                     replacement_formatter: Callable[[Annotation], str] = _uppercase_formatter
                     ) -> Document:
    """Utility function to replace sensitive PHI spans with a placeholder.

    Parameters
    ----------
    document : Document
        The document whose PHI annotations should be replaced.
    replacement_formatter : Callable[[Annotation], str]
        A callable that can be used to configure the formatting of the replacement.
        The default formatter replaces an annotation with `[annotation.tag.upper()]`.

    Returns
    -------
    Document
        The document with masked annotations.
    """
    # Number of characters by which the start of each annotation is shifted:
    # positive if the replacement is longer than the original annotation,
    # negative if it is shorter.
    shift = 0

    original_text_pointer = 0
    text_rewritten = ''
    annotations_rewritten = []

    for annotation in document.annotations:
        replacement = replacement_formatter(annotation)
        part = document.text[original_text_pointer:annotation.start]

        start = annotation.start + shift
        end = start + len(replacement)
        shift += len(replacement) - len(annotation.text)

        text_rewritten += part + replacement
        original_text_pointer = annotation.end
        annotations_rewritten.append(annotation._replace(
            start=start,
            end=end,
            text=replacement
        ))

    text_rewritten += document.text[original_text_pointer:]
    return Document(name=document.name, text=text_rewritten, annotations=annotations_rewritten)
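# The `replacement_formatter` hook from the docstring in action, as a sketch:
# a custom formatter that wraps tags in angle brackets instead of the default
# uppercase bracket style. The fixture is made up.
doc = Document(name='', text='De patient J. Jansen.',
               annotations=[Annotation('J. Jansen', 11, 20, 'Name')])

masked = mask_annotations(doc, replacement_formatter=lambda ann: f'<{ann.tag}>')
assert masked.text == 'De patient <Name>.'
assert masked.annotations[0].text == '<Name>'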
Example #17
def test_annotate():
    tagger = DeduceTagger()
    doc = Document(name='',
                   text='Jan Jannsen vanuit het UMCU.',
                   annotations=[])
    anns = tagger.annotate([doc])[0].annotations

    assert anns == [
        Annotation(text='Jan Jannsen',
                   start=0,
                   end=11,
                   tag='Name',
                   doc_id='',
                   ann_id='T0'),
        Annotation(text='UMCU',
                   start=23,
                   end=27,
                   tag='Named_Location',
                   doc_id='',
                   ann_id='T1')
    ]
Example #18
def test_annotate():
    doc = Document(
        name='',
        text='Hij werd op 10 oktober door arts Peter de Visser ontslagen van de kliniek.',
        annotations=[])

    anns = tagger.annotate([doc])[0].annotations
    assert anns == [
        Annotation(text='10 oktober',
                   start=12,
                   end=22,
                   tag='Date',
                   doc_id='',
                   ann_id='T0'),
        Annotation(text='Peter de Visser',
                   start=33,
                   end=48,
                   tag='Name',
                   doc_id='',
                   ann_id='T1')
    ]
Example #19
def get_documents(docs_path, anns_path) -> List[Document]:
    if not isdir(docs_path):
        raise ValueError('docs_path = {} does not exist.'.format(docs_path))
    if not isdir(anns_path):
        raise ValueError('anns_path = {} does not exist.'.format(anns_path))

    txt_files = sorted(glob.glob(join(docs_path, '*.txt')))
    ann_files = sorted(glob.glob(join(anns_path, '*.ann')))

    assert ann_files and txt_files and _basenames(txt_files) == _basenames(ann_files)

    docs = []
    for txt_file, ann_file in zip(txt_files, ann_files):
        doc_name = splitext(basename(txt_file))[0]
        doc_txt = brat.load_brat_text(txt_file)
        doc_annos = brat.load_brat_annotations(ann_file)

        docs.append(
            Document(name=doc_name, text=doc_txt, annotations=doc_annos))

    return docs
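# Invocation sketch: parallel folders of *.txt and *.ann files, matched by
# basename. The paths are hypothetical.
docs = get_documents(docs_path='data/corpus/txt', anns_path='data/corpus/ann')
print('loaded {} documents'.format(len(docs)))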
Example #20
def xml_to_document(xml_file):
    """Converts an i2b2/UTHealth XML document to a `deidentify.base.Document`.

    XML Structure:
    ```
    <?xml version="1.0" encoding="UTF-8" ?>
    <deIdi2b2>
    <TEXT><![CDATA[
        this is the record content
    ]]></TEXT>
    <TAGS>
    <DATE id="P0" start="16" end="26" text="2067-05-03" TYPE="DATE" comment="" />
    <AGE id="P1" start="50" end="52" text="55" TYPE="AGE" comment="" />
    </TAGS>
    </deIdi2b2>
    ```
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()
    text = root.find('TEXT').text
    doc_name = 'doc-' + splitext(basename(xml_file))[0]

    annotations = []
    for tag_element in root.find('TAGS'):
        tag_name = tag_element.tag + ':' + tag_element.attrib['TYPE']
        annotations.append(Annotation(
            text=tag_element.attrib['text'],
            # Offsets are stored as strings in the XML attributes.
            start=int(tag_element.attrib['start']),
            end=int(tag_element.attrib['end']),
            # Example: NAME:DOCTOR
            tag=TAG_MAPPING.get(tag_name, tag_name),
            # i2b2 annotation ids are prefixed with P. Example: P12
            doc_id=doc_name,
            ann_id='T{}'.format(tag_element.attrib['id'][1:])
        ))

    return Document(name=doc_name, text=text, annotations=annotations)
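# A quick round-trip sketch using the XML structure from the docstring. The
# temporary file and CDATA content are made up; the offsets match that content.
import tempfile

xml = '''<?xml version="1.0" encoding="UTF-8" ?>
<deIdi2b2>
<TEXT><![CDATA[record content with a date]]></TEXT>
<TAGS>
<DATE id="P0" start="22" end="26" text="date" TYPE="DATE" comment="" />
</TAGS>
</deIdi2b2>'''

with tempfile.NamedTemporaryFile('w', suffix='.xml', delete=False) as f:
    f.write(xml)

doc = xml_to_document(f.name)
assert doc.annotations[0].text == 'date'
assert doc.annotations[0].ann_id == 'T0'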
Example #21
def anonymize(input_file):
    with open(input_file, "r", encoding="utf-8") as f:
        text = f.read()

    # Wrap text in document
    documents = [Document(name='doc_01', text=text)]

    # Select downloaded model
    model = 'models/model_bilstmcrf_ons_fast-v0.1.0/final-model.pt'

    nlp = spacy.load('de_core_news_sm')

    # Instantiate tokenizer
    tokenizer = TokenizerFactory().tokenizer(corpus='germeval',
                                             disable=("tagger", "ner"),
                                             model=nlp)

    # Load tagger with a downloaded model file and tokenizer
    tagger = FlairTagger(model=model, tokenizer=tokenizer, verbose=False)

    # Annotate your documents
    annotated_doc = tagger.annotate(documents)[0]

    # spaCy NER extraction
    ners = nlp(text)

    filtered_annotations = []

    # Mapping from spaCy entity labels to deidentify tags
        "PER": "Name",
        "LOC": "Address",
        "ORG": "Organization_Company",
        "MISC": "Other"
    }

    # Add all spaCy-detected named entities to the list
    for ent in ners.ents:
        filtered_annotations.append({
            "text": ent.text,
            "start": ent.start_char,
            "end": ent.end_char,
            "tag": tag_dict[ent.label_]
        })

    for ann in annotated_doc.annotations:
        # Discard names: they have a high likelihood of being false positives,
        # since nouns are capitalized in German, unlike in Dutch.
        if ann.tag == "Name":
            continue
        # Skip the entity if it overlaps with a spaCy entity; spaCy makes
        # fewer mistakes here.
        if any(ent.start_char <= ann.end <= ent.end_char for ent in ners.ents) or \
                any(ann.start <= ent.end_char <= ann.end for ent in ners.ents):
            continue
        filtered_annotations.append({
            "text": ann.text,
            "start": ann.start,
            "end": ann.end,
            "tag": ann.tag
        })

    filtered_annotations.sort(key=lambda x: x["start"])

    masked_output = mask_annotations(annotated_doc.text, filtered_annotations)
    print(masked_output)
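# The overlap test above in isolation: two spans are treated as overlapping
# when an end boundary of one falls inside the other. Values are illustrative.
def overlaps(a_start, a_end, b_start, b_end):
    return b_start <= a_end <= b_end or a_start <= b_end <= a_end

assert overlaps(5, 10, 8, 12)
assert not overlaps(0, 4, 8, 12)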
Example #22
from deidentify.base import Document
from deidentify.taggers import FlairTagger
from deidentify.tokenizer import TokenizerFactory

# Create some text
text = (
    "Dit is stukje tekst met daarin de naam Jan Jansen. De patient J. Jansen (e: "
    "[email protected], t: 06-12345678) is 64 jaar oud en woonachtig in Utrecht. Hij werd op 10 "
    "oktober door arts Peter de Visser ontslagen van de kliniek van het UMCU."
)

# Wrap text in document
documents = [
    Document(name='doc_01', text=text)
]

# Select downloaded model
model = 'model_bilstmcrf_ons_fast-v0.2.0'

# Instantiate tokenizer
tokenizer = TokenizerFactory().tokenizer(corpus='ons', disable=("tagger", "ner"))

# Load tagger with a downloaded model file and tokenizer
tagger = FlairTagger(model=model, tokenizer=tokenizer, verbose=False)

# Annotate your documents
annotated_docs = tagger.annotate(documents)


from pprint import pprint
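# Print the tagged annotations; a plausible continuation, as the original
# example ends at the import.
pprint(annotated_docs[0].annotations)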