def surrogate_annotations(docs: List[Document], seed=42, errors='raise') -> List[Document]:
    """Rewrite documents so that their PHI annotations become random surrogates.

    Parameters
    ----------
    docs : List[Document]
        Input documents; the `annotations` and `text` of each one are used.
    seed : int
        Set this seed to make the random generation deterministic.
    errors : str {'ignore', 'raise', 'coerce'}, default 'raise'
        Forwarded unchanged to `apply_surrogates`.
        - If 'raise', errors during surrogate generation will raise an exception.
        - If 'ignore', failing annotations are skipped (they and their PHI
          remain in the text).
        - If 'coerce', failing annotations are replaced with the pattern
          `[annotation.tag]`.

    Yields
    ------
    Document
        One rewritten document per input document: a copy with text and
        annotations rewritten to their surrogates. If errors is 'ignore' or
        'coerce', an extra property of type List is added to the yielded
        documents (`Document.annotations_without_surrogates`), which includes
        annotations of the *input document* that could not be replaced with a
        surrogate.

    Notes
    -----
    Despite the `List[Document]` return annotation, the body uses `yield`, so
    calling this function returns a generator. No surrogate is generated and
    no error is raised until the result is iterated; wrap the call in
    `list(...)` to obtain an actual list.
    """
    deidentifier = DatasetDeidentifier(random_data=RandomData(seed=seed))

    # Surrogates are generated for the whole dataset in a single call before
    # any individual document is rewritten.
    wrapped_docs = [
        SurrogateDocument(source.annotations, source.text) for source in docs
    ]
    wrapped_docs = deidentifier.generate_surrogates(documents=wrapped_docs)

    for wrapped in wrapped_docs:
        annotations, surrogates = wrapped.annotation_surrogate_pairs()
        yield apply_surrogates(wrapped.text, annotations, surrogates, errors=errors)
def test_replace_pattern():
    """Check `PhoneFaxSurrogates.replace_pattern` on a set of phone patterns.

    Each pattern is filled 100 times using a `RandomData` seeded with 42.
    For every result the test checks the characters at selected positions
    and that the result has exactly the same length as the pattern.
    """
    surrogates = PhoneFaxSurrogates(annotations=[], random_data=RandomData(seed=42))
    nonzero_digits = '123456789'

    # Each case is (pattern, [(index or slice, 'in' | '==', expected), ...]).
    # Single positions are given as integers so they are indexed rather than
    # sliced: a too-short replacement then raises instead of silently
    # comparing an empty string.
    cases = [
        ('(0DD) ### ### #', [
            (slice(2, 4), 'in', DIAL_CODES_BY_LENGTH[2]),
            (6, 'in', nonzero_digits),
        ]),
        ('0DD - ### ### #', [
            (slice(1, 3), 'in', DIAL_CODES_BY_LENGTH[2]),
            (6, 'in', nonzero_digits),
        ]),
        ('0DD- ### ### #', [
            (slice(1, 3), 'in', DIAL_CODES_BY_LENGTH[2]),
            (5, 'in', nonzero_digits),
        ]),
        ('0DD#######', [
            (slice(1, 3), 'in', DIAL_CODES_BY_LENGTH[2]),
            (3, 'in', nonzero_digits),
        ]),
        ('### ### #', [
            (0, 'in', nonzero_digits),
        ]),
        ('+CC DDD ### ###', [
            (0, '==', '+'),
            (slice(1, 3), '==', '31'),
            (slice(4, 7), 'in', DIAL_CODES_BY_LENGTH[3]),
            (8, 'in', nonzero_digits),
        ]),
        ('+CC D ## ## ## ##', [
            (0, '==', '+'),
            (slice(1, 3), '==', '31'),
            (4, '==', '6'),  # mobile phone
            (6, 'in', nonzero_digits),
        ]),
        ('00CC D ## ## ## ##', [
            (slice(0, 2), '==', '00'),
            (slice(2, 4), '==', '31'),
            (5, '==', '6'),  # mobile phone
            (7, 'in', nonzero_digits),
        ]),
    ]

    for _ in range(100):
        # Patterns are visited in a fixed order on every round so the seeded
        # random sequence is consumed identically from run to run.
        for pattern, expectations in cases:
            replacement = surrogates.replace_pattern(pattern)
            for position, comparison, expected in expectations:
                fragment = replacement[position]
                if comparison == 'in':
                    assert fragment in expected
                else:
                    assert fragment == expected
            assert len(replacement) == len(pattern)
def test_date_surrogate_generator():
    """Check the surrogates `DateSurrogates.replace_all` produces with seed 42."""
    # (input annotation, expected surrogate). The expected value for
    # 'marc 2001' is None -- presumably because it is not recognised as a
    # date; confirm against DateSurrogates.
    cases = [
        ('01 januari 1915', '09 januari 2006'),
        ('01-02', '09-02'),
        ('marc 2001', None),
        ('February 2001', 'February 2086'),
        ('01-02-2010', '09-02-2095'),
    ]
    inputs = [annotation for annotation, _ in cases]
    expected = [surrogate for _, surrogate in cases]

    generator = DateSurrogates(inputs, random_data=RandomData(42))

    assert generator.replace_all() == expected
def test_replace_phonenumber():
    """Check `PhoneFaxSurrogates.replace_one` on two concrete phone numbers.

    Each number is replaced 100 times using a `RandomData` seeded with 42.
    The test checks selected character positions of every surrogate and that
    it has the same length as the input number.
    """
    surrogates = PhoneFaxSurrogates(annotations=[], random_data=RandomData(seed=42))
    nonzero_digits = '123456789'

    for _ in range(100):
        # Number with a bracketed two-digit dial code after the leading zero.
        phone_number = '(026) 123 456 7'
        surrogate = surrogates.replace_one(phone_number)
        assert surrogate[2:4] in DIAL_CODES_BY_LENGTH[2]
        assert surrogate[6] in nonzero_digits
        assert len(surrogate) == len(phone_number)

        # International notation: the '+31' prefix and the ' (0) ' separator
        # must come through unchanged.
        phone_number = '+31 (0) 6 11 22 11 11'
        surrogate = surrogates.replace_one(phone_number)
        assert surrogate[0] == '+'
        assert surrogate[1:3] == '31'
        assert surrogate[3:8] == ' (0) '
        assert surrogate[8] == '6'  # mobile phone
        assert surrogate[10] in nonzero_digits
        assert len(surrogate) == len(phone_number)
def __init__(self, random_data=None):
    """Store the random data source used by this instance.

    Parameters
    ----------
    random_data : RandomData, optional
        Source of randomness. When omitted (``None``), a ``RandomData``
        seeded with 45 is created so the default behaviour is reproducible.
    """
    # Compare against None explicitly: a truthiness test would silently
    # discard a caller-supplied object that happens to evaluate as falsy
    # (e.g. one defining __len__ or __bool__) and replace it with the default.
    if random_data is None:
        random_data = RandomData(seed=45)
    self.random_data = random_data
def test_random_char_mapping():
    """Check that `random_char_mapping` maps the lowercase alphabet onto itself.

    Both the keys and the values of the mapping must be exactly the set of
    lowercase ASCII letters, i.e. the mapping is a permutation of them.
    """
    alphabet = set(string.ascii_lowercase)

    mapping = random_char_mapping(random_data=RandomData())

    assert set(mapping.keys()) == alphabet
    assert set(mapping.values()) == alphabet