def test_filter_by_score(record_and_meta_2):
    """Check redaction output when transforms carry a ``minimum_score``.

    Every field of the fixture record is routed through the same three
    entity transforms, and the transformed record is compared with the
    expected output.
    """
    # Person names are replaced by their label (HIGH score threshold).
    redact_names = RedactWithLabelConfig(labels=['person_name'], minimum_score=Score.HIGH)
    # Company names are masked character by character (HIGH score threshold);
    # the expected output masks "Qualcomm" but leaves "Gretel" untouched.
    mask_companies = RedactWithCharConfig(labels=['company_name'], minimum_score=Score.HIGH)
    # Cities are replaced by their label; no score threshold is given.
    redact_cities = RedactWithLabelConfig(labels=['location_city'])
    shared_xforms = [redact_names, mask_companies, redact_cities]

    # Per the expected output below, only 'summary' and 'city' actually change.
    field_names = ('summary', 'dni', 'city', 'state', 'stuff', 'latitude')
    pipeline = DataTransformPipeline(
        [DataPath(input=name, xforms=shared_xforms) for name in field_names]
    )

    expected = {
        'summary': 'PERSON_NAME <*****@*****.**> works at Gretel. PERSON_NAME used to work at '
                   'XXXXXXXX.',
        'dni': 'He loves 8.8.8.8 for DNS',
        'city': 'LOCATION_CITY',
        'state': 'California',
        'stuff': 'nothing labeled here',
        'latitude': 112.221,
    }
    outcome = pipeline.transform_record(record_and_meta_2)
    assert outcome.get('record') == expected
def test_pipe_record_filter(record_meta_data_check):
    """Transform a record through pattern-selected paths, then restore it.

    Asserts the encrypted credit card value, the rewritten NER metadata, the
    number of surviving fields, and that restoring yields the original card.
    """
    fpe_secret = "2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94"
    shared_xforms = [
        RedactWithLabelConfig(labels=['date']),
        SecureHashConfig(secret='rockybalboa', labels=['location']),
        FpeStringConfig(labels=['credit_card_number'], secret=fpe_secret, radix=10),
    ]
    # An exact field name plus two wildcard selectors
    # (presumably glob-style patterns -- confirm against DataPath).
    selectors = ('Country', '?ddress', 'Cr*')
    paths = [DataPath(input=selector, xforms=shared_xforms) for selector in selectors]
    forward = DataTransformPipeline(paths)
    backward = DataRestorePipeline(paths)

    transformed = forward.transform_record(record_meta_data_check)
    out_record = transformed['record']
    out_fields = transformed['metadata']['fields']

    assert out_record['Credit Card'] == '4471585942734458'
    assert out_fields['Credit Card']['ner']['labels'][0]['text'] == '4471585942734458'
    country_label = out_fields['Country']['ner']['labels'][0]
    assert country_label['start'] == 0
    assert country_label['end'] == 64
    # Metadata holds one entry fewer than the record because Address carries
    # no metadata in this fixture.
    assert len(out_fields) == 2
    assert len(out_record) == 3

    restored = backward.transform_record(transformed)
    original_card = record_meta_data_check['record']['Credit Card']
    assert restored['record']['Credit Card'] == original_card
Example #3
0
def test_config_serialize():
    """Round-trip a list of transformer configs through jsonpickle.

    The decoded list must compare equal, element by element, to the list that
    was encoded.
    """
    original_configs = [
        # Label redaction for person names.
        RedactWithLabelConfig(labels=['person_name']),
        # Seeded fake values for email addresses.
        FakeConstantConfig(labels=['email_address'], seed=SEED),
        # Character redaction for IP addresses.
        RedactWithCharConfig(labels=['ip_address']),
        # Character redaction with 'Y' and no label filter.
        RedactWithCharConfig(char='Y'),
        # Character redaction with 'N', limited to city entities.
        RedactWithCharConfig(char='N', labels=['location_city']),
        # Secure hash for state entities.
        SecureHashConfig(secret='rockybalboa', labels=['location_state']),
        # Seeded fake values for latitude entities.
        FakeConstantConfig(labels=['latitude'], seed=SEED),
    ]

    encoded = jsonpickle.encode(original_configs)
    decoded_configs = jsonpickle.decode(encoded)
    assert list(original_configs) == list(decoded_configs)
Example #4
0
    def build_anonymizing_transforms(self):
        """Append one randomly chosen anonymizing ``DataPath`` per tagged field.

        For each entity in ``self.id_entities`` the project is asked for the
        fields tagged with that entity. A die roll (1-6) per field picks the
        transform: drop, fake, format-preserving encryption, character
        redaction, label redaction, or fixed-string redaction. The resulting
        path is appended to ``self.data_paths``.
        """
        for entity in self.id_entities:
            # All project fields tagged as this entity type.
            details = self.project.get_field_details(entity=entity)
            tagged_fields = [detail["field"] for detail in details]

            for field in tagged_fields:
                roll = random.randint(1, 6)
                if roll == 1:
                    print(f"Dropping field {field}")
                    chosen = [DropConfig()]
                elif roll == 2:
                    print(f"Faking field {field}")
                    faker_method = FAKER_MAP.get(entity, "name")
                    chosen = [FakeConstantConfig(seed=SEED, fake_method=faker_method)]
                elif roll == 3:
                    print(f"Encrypting field {field}")
                    # radix 62 will encrypt alphanumeric but no special characters
                    chosen = [FpeStringConfig(secret=SECRET, radix=62)]
                elif roll == 4:
                    print(f"Character redacting field {field}")
                    if entity == "email_address":
                        # Emails get a fancier two-part mask.
                        email_masks = [
                            StringMask(start_pos=3, mask_until="@"),
                            StringMask(mask_after="@", mask_until=".", greedy=True),
                        ]
                        redactor = RedactWithCharConfig(char="X", mask=email_masks)
                    else:
                        redactor = RedactWithCharConfig("#", mask=[StringMask(start_pos=3)])
                    chosen = [redactor]
                elif roll == 5:
                    print(f"Label redacting field {field}")
                    chosen = [RedactWithLabelConfig(labels=[entity])]
                elif roll == 6:
                    print(f"String redacting field {field}")
                    chosen = [RedactWithStringConfig(string="CLASSIFIED")]
                else:
                    # Not reachable with randint(1, 6); kept as the empty default.
                    chosen = []

                self.data_paths.append(DataPath(input=field, xforms=chosen))
def test_metadata_in_xf(record_meta_data_check):
    """Verify the record's ``gretel_id`` is passed to ``_transform_field``.

    The first transformer of a wildcard path is patched with a wrapping mock,
    and the third positional argument of its first call is checked for the
    fixture's ``gretel_id``.
    """
    wildcard_path = DataPath(input="*", xforms=[RedactWithLabelConfig()])
    transformer = wildcard_path.transformations[0]

    # wraps= keeps the real method running while the mock records the calls.
    with patch.object(
        transformer, "_transform_field", wraps=transformer._transform_field
    ) as spy:
        pipeline = DataTransformPipeline([wildcard_path])
        pipeline.transform_record(record_meta_data_check)

        first_call_positional = spy.call_args_list[0][0]
        expected_id = record_meta_data_check["metadata"]["gretel_id"]
        assert first_call_positional[2]["gretel_id"] == expected_id
def test_meta_data_transform(record_meta_data_check):
    """Transform every field via a single wildcard path, then restore.

    Asserts the encrypted credit card value, the rewritten NER metadata for
    the card and country fields, and that restoring yields the original card.
    """
    fpe_secret = "2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94"
    shared_xforms = [
        RedactWithLabelConfig(labels=['date']),
        SecureHashConfig(secret='rockybalboa', labels=['location']),
        FpeStringConfig(labels=['credit_card_number'], secret=fpe_secret, radix=10),
    ]
    paths = [DataPath(input='*', xforms=shared_xforms)]
    forward = DataTransformPipeline(paths)
    backward = DataRestorePipeline(paths)

    transformed = forward.transform_record(record_meta_data_check)
    assert transformed['record']['Credit Card'] == '4471585942734458'

    out_fields = transformed['metadata']['fields']
    assert out_fields['Credit Card']['ner']['labels'][0]['text'] == '4471585942734458'
    country_label = out_fields['Country']['ner']['labels'][0]
    assert country_label['start'] == 0
    assert country_label['end'] == 64

    restored = backward.transform_record(transformed)
    original_card = record_meta_data_check['record']['Credit Card']
    assert restored['record']['Credit Card'] == original_card
def test_conditional_transformer(records_conditional):
    """Exercise ``ConditionalConfig`` in a restore pipeline across three setups.

    Each setup encrypts (or reuses) the ``lat``/``lon`` values of two fixture
    records and then restores them through a conditional transform keyed on
    the ``user_consent`` field:

    1. both ``true_xform`` and ``false_xform`` are given,
    2. only ``true_xform`` is given,
    3. only ``false_xform`` is given (applied to the outputs of setup 2).

    The statements are order-dependent: ``check_aw`` and ``check_ae`` are
    rebound repeatedly and setup 3 consumes the values left by setup 2.
    """
    # --- Setup 1: FPE when the condition matches, label redaction otherwise.
    xf_fpe = FpeFloatConfig(secret="2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94", radix=10)
    # NOTE(review): r"['1']" is a character class matching either a single
    # quote or the digit 1 -- confirm this is intended rather than r"1".
    xf_consent = ConditionalConfig(conditional_value=FieldRef('user_consent'), regex=r"['1']",
                                   true_xform=xf_fpe,
                                   false_xform=RedactWithLabelConfig())

    # The encrypt side applies FPE unconditionally to lon/lat; the '*' path has
    # no transforms (presumably passes the remaining fields through -- confirm).
    data_paths_encrypt = [DataPath(input='lon', xforms=xf_fpe),
                          DataPath(input='lat', xforms=xf_fpe),
                          DataPath(input='*')
                          ]

    # The restore side routes lon/lat through the conditional transform.
    data_paths_decrypt = [DataPath(input='lon', xforms=xf_consent),
                          DataPath(input='lat', xforms=xf_consent),
                          DataPath(input='*')
                          ]

    xf_encrypt = DataTransformPipeline(data_paths_encrypt)
    xf_decrypt = DataRestorePipeline(data_paths_decrypt)
    check_aw = xf_encrypt.transform_record(records_conditional[0])
    check_ae = xf_encrypt.transform_record(records_conditional[1])
    # Both records come out encrypted.
    assert check_ae['record']['lat'] == 50.65564864394322
    assert check_ae['record']['lon'] == 191.8142181740291
    assert check_aw['record']['lat'] == 77.00217823076872
    assert check_aw['record']['lon'] == 254.0404040486477
    check_aw = xf_decrypt.transform_record(check_aw)
    check_ae = xf_decrypt.transform_record(check_ae)
    # Record 1 ends up label-redacted (false branch); record 0 is restored to
    # plain coordinates (true branch).
    assert check_ae['record']['lat'] == 'LATITUDE'
    assert check_ae['record']['lon'] == 'LONGITUDE'
    assert check_aw['record']['lat'] == 112.22134
    assert check_aw['record']['lon'] == 135.76433

    # --- Setup 2: only true_xform is given.
    xf_fpe = FpeFloatConfig(secret="2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94", radix=10)
    xf_consent = ConditionalConfig(conditional_value=FieldRef('user_consent'), regex=r"['1']",
                                   true_xform=xf_fpe)

    data_paths_encrypt = [DataPath(input='lon', xforms=xf_fpe),
                          DataPath(input='lat', xforms=xf_fpe),
                          DataPath(input='*')
                          ]

    data_paths_decrypt = [DataPath(input='lon', xforms=xf_consent),
                          DataPath(input='lat', xforms=xf_consent),
                          DataPath(input='*')
                          ]

    xf_encrypt = DataTransformPipeline(data_paths_encrypt)
    xf_decrypt = DataRestorePipeline(data_paths_decrypt)
    check_aw = xf_encrypt.transform_record(records_conditional[0])
    check_ae = xf_encrypt.transform_record(records_conditional[1])
    assert check_ae['record']['lat'] == 50.65564864394322
    assert check_ae['record']['lon'] == 191.8142181740291
    assert check_aw['record']['lat'] == 77.00217823076872
    assert check_aw['record']['lon'] == 254.0404040486477
    check_aw = xf_decrypt.transform_record(check_aw)
    check_ae = xf_decrypt.transform_record(check_ae)
    # With no false_xform, record 1 keeps its encrypted values; record 0 is
    # restored as before.
    assert check_ae['record']['lat'] == 50.65564864394322
    assert check_ae['record']['lon'] == 191.8142181740291
    assert check_aw['record']['lat'] == 112.22134
    assert check_aw['record']['lon'] == 135.76433

    # --- Setup 3: only false_xform is given. No new encryption happens here;
    # the inputs are the records left over from setup 2's restore step.
    xf_fpe = FpeFloatConfig(secret="2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94", radix=10)
    xf_consent = ConditionalConfig(conditional_value=FieldRef('user_consent'), regex=r"['1']",
                                   false_xform=xf_fpe)

    data_paths_decrypt = [DataPath(input='lon', xforms=xf_consent),
                          DataPath(input='lat', xforms=xf_consent),
                          DataPath(input='*')
                          ]

    xf_decrypt = DataRestorePipeline(data_paths_decrypt)
    check_aw = xf_decrypt.transform_record(check_aw)
    check_ae = xf_decrypt.transform_record(check_ae)
    # Record 1 (still encrypted) is now decrypted via the false branch; record
    # 0 (already plain) is left unchanged.
    assert check_ae['record']['lat'] == 35.659491
    assert check_ae['record']['lon'] == 139.72785
    assert check_aw['record']['lat'] == 112.22134
    assert check_aw['record']['lon'] == 135.76433
def test_record_xf(record_and_meta_2):
    """Run a mix of entity and field transforms over a record.

    Three pipelines are checked: one with an explicit path per field, one
    using a wildcard path for everything except 'city', and finally the
    explicit layout again after a drop transform for IP addresses has been
    prepended to the shared transform list.
    """
    label_transforms = [
        # Person names become the PERSON_NAME label.
        RedactWithLabelConfig(labels=['person_name']),
        # Emails are swapped for seeded (consistent) fakes.
        FakeConstantConfig(labels=['email_address'], seed=SEED),
        # IP addresses are character-redacted.
        RedactWithCharConfig(labels=['ip_address']),
        # City entities redacted with 'N'; the expected 'city' value below is
        # all 'Y', so this one is not expected to show in the output.
        RedactWithCharConfig(char='N', labels=['location_city']),
        # State entities are secure-hashed.
        SecureHashConfig(secret='rockybalboa', labels=['location_state']),
        # Latitude entities are swapped for a seeded fake.
        FakeConstantConfig(labels=['latitude'], seed=SEED),
    ]
    # Field-level redaction applied to the whole city value.
    whole_city_mask = RedactWithCharConfig(char='Y')

    def explicit_paths():
        # One path per field; 'city' additionally receives the field-level
        # mask, passed as a nested list exactly as the shared list plus mask.
        return [
            DataPath(
                input=name,
                xforms=[label_transforms, whole_city_mask] if name == 'city' else label_transforms,
            )
            for name in ('summary', 'dni', 'city', 'state', 'stuff', 'latitude')
        ]

    explicit_pipeline = DataTransformPipeline(explicit_paths())
    explicit_result = explicit_pipeline.transform_record(record_and_meta_2).get('record')

    wildcard_pipeline = DataTransformPipeline([
        DataPath(input='city', xforms=[label_transforms, whole_city_mask]),
        DataPath(input='*', xforms=label_transforms),
    ])
    wildcard_result = wildcard_pipeline.transform_record(record_and_meta_2).get('record')

    expected = {
        'summary': 'PERSON_NAME <*****@*****.**> works at Gretel. PERSON_NAME used to work at '
                   'Qualcomm.',
        'dni': 'He loves X.X.X.X for DNS',
        'city': 'YYY YYYYY',
        'state': '8896cd9f38ceac0e98f47c41a2028219f17d8ef41277e4e2138d52a08c24e0aa',
        'stuff': 'nothing labeled here',
        'latitude': -89.3146475,
    }
    # Both layouts must produce the same record.
    assert explicit_result == expected
    assert wildcard_result == expected

    # Prepend a drop transform for IP addresses; 'dni' holds one, so the
    # field is expected to vanish from the output.
    label_transforms.insert(0, DropConfig(labels=['ip_address']))
    dropping_pipeline = DataTransformPipeline(explicit_paths())
    dropped_result = dropping_pipeline.transform_record(record_and_meta_2).get('record')

    expected_without_dni = {key: value for key, value in expected.items() if key != 'dni'}
    assert dropped_result == expected_without_dni
Example #9
0
                            'start': 0,
                            'end': 7,
                            'score': 1,
                            'text': '112.221',
                            'label': 'latitude'
                        }
                    ]
                }
            }
        }
    }
}

# Entity-level transforms shared by every data path below.
entity_xf_list = [
    # Swap person names for the PERSON_NAME label; no score threshold is set,
    # so this is meant to apply to every person-name match.
    RedactWithLabelConfig(labels=['person_name']),

    # Swap company names for the COMPANY_NAME label, but only for matches
    # scoring at least HIGH. Per the original note this should hit
    # Example.com but not Spacely Sprockets.
    RedactWithLabelConfig(labels=['company_name'], minimum_score=Score.HIGH),
]

# One path per record field, all sharing the transform list above. Apart from
# 'summary', the transforms should be no-ops: no matching entities.
data_paths = [
    DataPath(input=field_name, xforms=entity_xf_list)
    for field_name in ('summary', 'dni', 'city', 'state', 'stuff', 'latitude')
]