def test_set_attribute_type_with_sequence_of_attributes(self):
    dataset = Dataset(self.test_data)
    dataset.set_attribute_type(AttributeType.IDENTIFYING, "id", "name")
    self.assertEqual(AttributeType.IDENTIFYING.value, dataset._attributes[0].type.value)
    self.assertEqual(AttributeType.IDENTIFYING.value, dataset._attributes[1].type.value)
def test_set_attribute_type__single_attribute(self):
    dataset = Dataset(self.test_data)
    dataset._set_attribute_type("id", AttributeType.QUASIIDENTIFYING)
    self.assertEqual(AttributeType.QUASIIDENTIFYING.value, dataset._attributes[0].type.value)
    self.assertEqual(Dataset._DEFAULT_ATTRIBUTE_TYPE.value, dataset._attributes[1].type.value)
def id_name_dataset() -> Dataset:
    test_data = [['id', 'name'], ['0', 'Viktor'], ['1', 'Jerry']]
    test_attribute_type_mapping = {
        'id': AttributeType.IDENTIFYING,
        'name': AttributeType.QUASIIDENTIFYING
    }
    return Dataset(test_data, test_attribute_type_mapping)
def test_from_dict(self):
    data = {"id": [1, 2], "name": ["Monsen", "Mikkel"]}
    expected_df = pandas.DataFrame.from_dict(data)
    dataset = Dataset.from_dict(data)
    self.assertIsNotNone(dataset)
    self.assertIsInstance(dataset, Dataset)
    self.assertEqual(expected_df.to_dict(), dataset.to_dataframe().to_dict())
def test_equality(self):
    ar1 = AnonymizeResult(self.test_dataset, self.test_risk_profile, self.test_anon_metrics, 'ANONYMOUS')
    ar2 = AnonymizeResult(self.test_dataset, self.test_risk_profile, self.test_anon_metrics, 'ANONYMOUS')
    self.assertEqual(ar1, ar2)
    ar2._dataset = Dataset([["data", "data2"]])
    self.assertNotEqual(ar1, ar2)
def test__payload__with_hierarchies(self):
    test_hierarchy_id = [["0", "*"], ["1", "*"]]
    test_hierarchy_name = [["Viktor", "NAME"], ["Jerry", "NAME"]]
    dataset = Dataset(self.test_data)
    dataset._set_attribute_type("id", AttributeType.QUASIIDENTIFYING)
    dataset._set_attribute_type("name", AttributeType.QUASIIDENTIFYING)
    dataset.set_hierarchies({
        "id": test_hierarchy_id,
        "name": test_hierarchy_name
    })
    payload = dataset._payload()
    self.assertEqual(test_hierarchy_id, payload["attributes"][0]["hierarchy"])
    self.assertEqual(test_hierarchy_name, payload["attributes"][1]["hierarchy"])
def setUp(self):
    self.test_data = [['id', 'name'], ['0', 'Viktor'], ['1', 'Jerry']]
    self.test_attribute_type_mapping = {
        'id': AttributeType.IDENTIFYING,
        'name': AttributeType.QUASIIDENTIFYING
    }
    self.test_dataset = Dataset(self.test_data, self.test_attribute_type_mapping)
    self.test_raw_analyze_response = data_generator.analyze_response()
    self.test_raw_anon_response = data_generator.anonymize_response()
def test_create_from_pandas_dataframe(self):
    dataframe = pandas.DataFrame(self.test_data[1:], columns=self.test_data[0])
    dataset = Dataset.from_pandas(dataframe)
    pandas_df = dataset.to_dataframe()
    # assert column names are in top row
    self.assertEqual(dataframe.to_dict(), pandas_df.to_dict())
    # assert default AttributeType is set
    self.assertEqual(Dataset._DEFAULT_ATTRIBUTE_TYPE.value, dataset._attributes[0].type.value)
def test_set_hierarchies(self):
    test_hierarchy_id = [["0", "*"], ["1", "*"]]
    test_hierarchy_name = [["Viktor", "*"], ["Jerry", "*"]]
    dataset = Dataset(self.test_data)
    dataset._set_attribute_type("id", AttributeType.QUASIIDENTIFYING)
    dataset._set_attribute_type("name", AttributeType.QUASIIDENTIFYING)
    dataset.set_hierarchies({
        "id": test_hierarchy_id,
        "name": test_hierarchy_name
    })
    self.assertEqual(dataset._attributes[0].hierarchy, test_hierarchy_id)
    self.assertEqual(dataset._attributes[1].hierarchy, test_hierarchy_name)
def test_set_hierarchy(self):
    test_hierarchy = [["0", "*"], ["1", "*"]]
    dataset = Dataset(self.test_data)
    dataset._set_attribute_type("id", AttributeType.QUASIIDENTIFYING)
    dataset.set_hierarchy("id", test_hierarchy)
    self.assertEqual(dataset._attributes[0].hierarchy, test_hierarchy)
def test_set_hierarchy__not_valid_attribute_name(self):
    test_hierarchy = [["0", "*"], ["1", "*"]]
    dataset = Dataset(self.test_data)
    dataset._set_attribute_type("id", AttributeType.QUASIIDENTIFYING)
    with self.assertRaises(KeyError):
        dataset.set_hierarchy("fail", test_hierarchy)
    self.assertIsNone(dataset._attributes[0].hierarchy)
def test_set_hierarchy_with_pandas(self):
    test_hierarchy = [["0", "*"], ["1", "*"]]
    hierarchy_df = pandas.DataFrame(test_hierarchy)
    dataset = Dataset(self.test_data)
    dataset._set_attribute_type("id", AttributeType.QUASIIDENTIFYING)
    dataset.set_hierarchy("id", hierarchy_df)
    self.assertEqual(dataset._attributes[0].hierarchy, test_hierarchy)
def test_set_hierarchy__not_valid_attribute_type(self):
    test_hierarchy = [["0", "*"], ["1", "*"]]
    dataset = Dataset(self.test_data)
    dataset._set_attribute_type("id", AttributeType.INSENSITIVE)
    with self.assertRaises(ValueError):
        dataset.set_hierarchy("id", test_hierarchy)
    self.assertIsNone(dataset._attributes[0].hierarchy)
    self.assertIsNot(test_hierarchy, dataset._attributes[0].hierarchy)
def _anonymize_result(self, response):
    """
    Creates the result to be delivered back to the caller

    :param response: HTTP response from the anonymization request
    :return: AnonymizeResult built from the response payload
    """
    json_string = response.text
    response_dict = json.loads(json_string)
    attributes = self._attributes(response_dict)
    dataset = Dataset(response_dict["anonymizeResult"]["data"], attributes)
    risk_profile = RiskProfile(response_dict["riskProfile"])
    anon_status = response_dict["anonymizeResult"]["anonymizationStatus"]
    anonymization_metrics = response_dict["anonymizeResult"]["metrics"]
    return AnonymizeResult._from_response(dataset, risk_profile, anonymization_metrics, anon_status)
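# Illustrative sketch only (not part of the client code): a minimal response body of the
# shape _anonymize_result above reads, based solely on the keys accessed in that method.
# The concrete values are hypothetical placeholders, and the attribute section consumed by
# self._attributes is omitted because its layout is not shown here.
_example_anonymize_response_body = {
    "anonymizeResult": {
        "data": [["id", "name"], ["*", "Viktor"], ["*", "Jerry"]],  # anonymized records
        "anonymizationStatus": "ANONYMOUS",                          # read into anon_status
        "metrics": {},                                               # read into anonymization_metrics
    },
    "riskProfile": {},                                               # passed to RiskProfile(...)
}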
def test_set_attribute_type_with_single_attribute(self):
    dataset = Dataset(self.test_data)
    dataset.set_attribute_type(AttributeType.IDENTIFYING, "id")
    self.assertEqual(AttributeType.IDENTIFYING.value, dataset._attributes[0].type.value)
def test_set_attribute_types_default_value(self):
    dataset = Dataset(self.test_data)
    self.assertEqual(AttributeType.QUASIIDENTIFYING.value, dataset._attributes[0].type.value)
    self.assertEqual(AttributeType.QUASIIDENTIFYING.value, dataset._attributes[1].type.value)
def test_init__without_attribute_types_param(self):
    dataset = Dataset(self.test_data)
    self.assertEqual(dataset._DEFAULT_ATTRIBUTE_TYPE.value, dataset._attributes[0].type.value)
    self.assertEqual(self.test_data[0][0], dataset._attributes[0].name)
    self.assertEqual(self.test_data[0][1], dataset._attributes[1].name)
def test_to_dataframe(self):
    dataset = Dataset(self.test_data, self.test_attribute_type_mapping)
    df = dataset.to_dataframe()
    self.assertIsInstance(df, pandas.DataFrame)
def test__payload(self):
    dataset = Dataset(self.test_data)
    payload = dataset._payload()
    self.assertEqual(AttributeType.QUASIIDENTIFYING.value, payload["attributes"][0]["attributeTypeModel"])
    self.assertEqual(None, payload["attributes"][0]["hierarchy"])
def setUp(self):
    self.test_data = [['id', 'name'], ['0', 'Viktor'], ['1', 'Jerry']]
    self.test_attribute_type_mapping = {
        'id': AttributeType.IDENTIFYING,
        'name': AttributeType.QUASIIDENTIFYING
    }
    self.test_dataset = Dataset(self.test_data, self.test_attribute_type_mapping)
    self.risk_profile_response = {
        "reIdentificationRisk": {
            "measures": {
                "measure_value": "[%]",
                "Prosecutor_attacker_success_rate": "98.72",
                "records_affected_by_highest_prosecutor_risk": "97.46000000000001",
                "sample_uniques": "97.46000000000001",
                "estimated_prosecutor_risk": "100.0",
                "population_model": "PITMAN",
                "highest_journalist_risk": "100.0",
                "records_affected_by_lowest_risk": "0.06",
                "estimated_marketer_risk": "98.72000000000001",
                "Journalist_attacker_success_rate": "98.72",
                "highest_prosecutor_risk": "100.0",
                "estimated_journalist_risk": "100.0",
                "lowest_risk": "33.33333333333333",
                "Marketer_attacker_success_rate": "98.72",
                "average_prosecutor_risk": "98.72000000000001",
                "records_affected_by_highest_journalist_risk": "97.46000000000001",
                "population_uniques": "39.64593493418713",
                "quasi_identifiers": ["Innvandrerbakgrunn", "Ytelse", "Innsatsgruppe", "Ledighetsstatus"]
            }
        },
        "distributionOfRisk": {
            "riskIntervalList": [
                {"interval": "]50,100]", "recordsWithRiskWithinInteval": 0.9746, "recordsWithMaxmalRiskWithinInterval": 1.0},
                {"interval": "]33.4,50]", "recordsWithRiskWithinInteval": 0.0248, "recordsWithMaxmalRiskWithinInterval": 0.0254},
                {"interval": "]25,33.4]", "recordsWithRiskWithinInteval": 0.0006, "recordsWithMaxmalRiskWithinInterval": 0.0006},
                {"interval": "]20,25]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]16.7,20]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]14.3,16.7]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]12.5,14.3]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]10,12.5]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]9,10]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]8,9]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]7,8]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]6,7]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]5,6]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]4,5]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]3,4]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]2,3]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]1,2]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]0.1,1]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]0.01,0.1]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]0.001,0.01]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]0.0001,0.001]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]1e-5,0.0001]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]1e-6,1e-5]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0},
                {"interval": "]0,1e-6]", "recordsWithRiskWithinInteval": 0.0, "recordsWithMaxmalRiskWithinInterval": 0.0}
            ]
        }
    }
    self.test_riskprofile = RiskProfile(self.risk_profile_response)
    self.test_anonymize_result = AnonymizeResult(self.test_data, self.test_riskprofile, self.test_metrics)
def test_init(self):
    Dataset(self.test_data, self.test_attribute_type_mapping)