def test_apply_surrogates_errors_raise():
    """A missing (``None``) surrogate must make ``apply_surrogates`` raise ``ValueError``.

    The expectation is checked twice: once with the ``errors`` argument left at its
    default and once with ``errors='raise'`` passed explicitly.
    """
    source_text = 'ccc cc ccc'
    phi_annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=7, end=10, tag='B'),
    ]
    # The second annotation has no surrogate; this is the failure being provoked.
    replacements = ['a', None, 'b']

    # An empty kwargs dict exercises the default, the second entry the explicit setting.
    for extra_kwargs in ({}, {'errors': 'raise'}):
        with pytest.raises(ValueError):
            rewrite_dataset.apply_surrogates(
                source_text, phi_annotations, replacements, **extra_kwargs
            )
Exemplo n.º 2
0
def surrogate_annotations(docs: List[Document], seed=42, errors='raise') -> List[Document]:
    """Lazily replace PHI annotations in documents with random surrogates.

    This is a generator function: no work is done until the result is iterated, and
    the result can only be iterated once. Wrap the call in ``list(...)`` if a
    materialized list is needed.

    NOTE(review): the return annotation still reads ``List[Document]`` although the
    function yields its results. An iterator type would be accurate, but correcting
    it requires an import outside of this function, so the signature is left as is.

    Parameters
    ----------
    docs : List[Document]
        Input documents. The ``annotations`` and ``text`` of each document are read
        to build the surrogate documents.
    seed : int
        Set this seed to make the random generation deterministic.
    errors : str {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise',  errors during surrogate generation will raise an exception.
        - If 'ignore', failing annotations are skipped (they and PHI remain in text)
        - If 'coerce', failing annotations are replaced with pattern `[annotation.tag]`

    Yields
    ------
    Document
        One rewritten document per input document, as produced by `apply_surrogates`,
        with text and annotations rewritten to their surrogates.

        If errors is 'ignore' or 'coerce', an extra property of type List is added to the
        yielded documents (`Document.annotations_without_surrogates`), which includes
        annotations of the *input document* that could not be replaced with a surrogate.

    """
    deidentifier = DatasetDeidentifier(random_data=RandomData(seed=seed))
    surrogate_docs = deidentifier.generate_surrogates(
        documents=[SurrogateDocument(doc.annotations, doc.text) for doc in docs]
    )

    for surrogate_doc in surrogate_docs:
        annotations, surrogates = surrogate_doc.annotation_surrogate_pairs()
        yield apply_surrogates(surrogate_doc.text, annotations, surrogates, errors=errors)
Exemplo n.º 3
0
def test_rewrite_text_no_annotations():
    """With no annotations and no surrogates the text must come back unchanged.

    NOTE(review): this test unpacks the result as a ``(text, annotations)`` pair,
    whereas other tests of ``apply_surrogates`` read ``.text`` / ``.annotations``
    attributes -- confirm which return shape is current.
    """
    original_text = 'ccc cc ccc'
    text_rewritten, adjusted_annotations = rewrite_dataset.apply_surrogates(
        original_text, annotations=[], surrogates=[]
    )

    assert text_rewritten == original_text
    assert adjusted_annotations == []
def test_apply_surrogates():
    """Surrogates replace the annotated spans and annotation offsets follow the new text."""
    source_text = 'ccc cc ccc c c ccc cccccc cccc'
    phi_annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=15, end=18, tag='B'),
    ]
    replacements = ['a', 'dd', 'bbbbb']
    # Surrogate lengths differ from the originals, so start/end offsets shift.
    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('dd', start=2, end=4, tag='A'),
        Annotation('bbbbb', start=13, end=18, tag='B'),
    ]

    rewritten = rewrite_dataset.apply_surrogates(source_text, phi_annotations, replacements)

    assert rewritten.text == 'a dd ccc c c bbbbb cccccc cccc'
    assert rewritten.annotations == expected_annotations
    # Every annotation had a surrogate, so nothing is reported as unreplaced.
    assert rewritten.annotations_without_surrogates == []
Exemplo n.º 5
0
def test_rewrite_text():
    """Surrogates replace the annotated spans and annotation offsets follow the new text.

    NOTE(review): this test unpacks the result as a ``(text, annotations)`` pair,
    whereas other tests of ``apply_surrogates`` read ``.text`` / ``.annotations``
    attributes -- confirm which return shape is current.
    """
    source_text = 'ccc cc ccc c c ccc cccccc cccc'
    phi_annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=15, end=18, tag='B'),
    ]
    replacements = ['a', 'dd', 'bbbbb']
    # Surrogate lengths differ from the originals, so start/end offsets shift.
    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('dd', start=2, end=4, tag='A'),
        Annotation('bbbbb', start=13, end=18, tag='B'),
    ]

    text_rewritten, adjusted_annotations = rewrite_dataset.apply_surrogates(
        source_text, phi_annotations, replacements
    )

    assert text_rewritten == 'a dd ccc c c bbbbb cccccc cccc'
    assert adjusted_annotations == expected_annotations
def test_apply_surrogates_errors_coerce():
    """With ``errors='coerce'`` a missing surrogate is replaced by a ``[tag]`` placeholder.

    The annotation that had no surrogate must additionally be reported, with its
    original offsets, in ``annotations_without_surrogates``.
    """
    source_text = 'ccc cc ccc'
    first, second, third = (
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=7, end=10, tag='B'),
    )
    # No surrogate is available for the middle annotation.
    replacements = ['a', None, 'b']

    rewritten = rewrite_dataset.apply_surrogates(
        source_text, [first, second, third], replacements, errors='coerce'
    )

    assert rewritten.text == 'a [A] b'
    assert rewritten.annotations == [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('[A]', start=2, end=5, tag='A'),
        Annotation('b', start=6, end=7, tag='B'),
    ]
    assert rewritten.annotations_without_surrogates == [
        Annotation('cc', start=4, end=6, tag='A'),
    ]
def test_apply_surrogates_no_annotations():
    """With no annotations the text is unchanged and both annotation lists are empty."""
    original_text = 'ccc cc ccc'

    rewritten = rewrite_dataset.apply_surrogates(
        original_text,
        annotations=[],
        surrogates=[],
    )

    assert rewritten.text == original_text
    assert rewritten.annotations == []
    assert rewritten.annotations_without_surrogates == []