    def test_alias_linked_column_values(self):
        '''
        User-edited values in the spec's original_values (repl_A instead of A)
        should be used to alias the corresponding values in the linked data
        drawn from the SQL db.
        '''

        with patch(
            "exhibit.core.linkage.hierarchical._LinkedDataGenerator.__init__"
            ) as mock_init:
            mock_init.return_value = None
            test_LDG = tm._LinkedDataGenerator(Mock, Mock, Mock)

        test_dict = {
            "columns": {
                "C1": {
                    "anonymising_set": "random",
                    "original_values": pd.DataFrame(
                        data={"C1": ["repl_A", "B", MISSING_DATA_STR]}),
                    "paired_columns": []
                },
                "C2": {
                    "anonymising_set": "random",
                    "original_values": pd.DataFrame(
                        data={"C2": ["eggs", "spam", MISSING_DATA_STR]}),
                    "paired_columns": []
                },
            }
        }

        create_temp_table(
            table_name="temp_1234_0",
            col_names=["C1", "C2"],
            data=[("A", "spam"), ("B", "eggs")]
        )

        # A - spam, B - eggs is the initial linkage that was put into the SQL db
        test_linked_df = pd.DataFrame(data={
            "C1": ["A", "A", "B", "B"],
            "C2": ["spam", "spam", "eggs", "eggs"]
        })

        # repl_A - spam, B - eggs is the user-edited linkage that exists only in the spec
        expected_df = pd.DataFrame(data={
            "C1": ["repl_A", "repl_A", "B", "B"],
            "C2": ["spam", "spam", "eggs", "eggs"]
        })

        setattr(test_LDG, "spec_dict", test_dict)
        setattr(test_LDG, "table_name", "temp_1234_0")
        setattr(test_LDG, "id", "1234")
        setattr(test_LDG, "linked_group", (0, ["C1", "C2"]))
        setattr(test_LDG, "linked_cols", ["C1", "C2"])

        assert_frame_equal(
            left=test_LDG.alias_linked_column_values(test_linked_df),
            right=expected_df)

        db_util.drop_tables(["temp_1234_0"])
    def test_weights_for_linked_columns_with_mixed_inline_limits(self):
        '''
        Weights for linked columns where one column's original values are kept
        inline in the spec (uniques within the inline limit) and the other
        column's values are stored in the SQL db (uniques exceeding the limit).
        '''

        data = [
            ("A", "A1"),
            ("A", "A2"),
            ("B", "B1"),
            ("B", "B2"),
            ("B", "B3"),
            (MISSING_DATA_STR, MISSING_DATA_STR)
        ]

        create_temp_table(
            table_name="temp_test_id_weights_1",
            col_names=["LinkCat1", "LinkCat2"],
            data=data
        )

        self._temp_tables.append("temp_test_id_weights_1")

        values = pd.DataFrame(data={
            "LinkCat1": ["A", "B", MISSING_DATA_STR],
            "NumC": [0.1, 0.9, 0.0]
        })

        test_dict = {
            "metadata": {
                "numerical_columns": ["NumC"],
                "inline_limit": 3,
                "id": "test_id_weights"
            },
            "columns": {
                "LinkCat1": {
                    "type": "categorical",
                    "original_values": values,
                    "uniques": 2,
                    "anonymising_set": "random"
                },
                "LinkCat2": {
                    "type": "categorical",
                    "original_values": ORIGINAL_VALUES_DB,
                    "uniques": 5,
                    "anonymising_set": "random"
                },
                "NumC": {
                    "type": "continuous",
                }
            },
            "linked_columns": [(1, ["LinkCat1", "LinkCat2"])]
        }

        test_cols = ["LinkCat1", "LinkCat2"]
        test_wt = tm.generate_weights_table(test_dict, test_cols)

        # LinkCat1 weights come from the inline original_values
        self.assertEqual(
            test_wt[("NumC", "LinkCat1", MISSING_DATA_STR)]["weights"].weight, 0.0)
        self.assertEqual(
            test_wt[("NumC", "LinkCat1", "B")]["weights"].weight, 0.9)

        # LinkCat2 weights are equalised (1 / 5 uniques) because its values are in the db
        self.assertEqual(
            test_wt[("NumC", "LinkCat2", MISSING_DATA_STR)]["weights"].weight, 0.2)
        self.assertEqual(
            test_wt[("NumC", "LinkCat2", "B1")]["weights"].weight, 0.2)
    def test_equal_weight_for_single_column_exceeding_inline_limit(self):
        '''
        Missing data is a special value that might or might not appear in the
        actually generated data, hence when calculating equal weights we ignore
        it and only divide 1 by the total number of valid unique values in the
        column.
        '''

        data = [("A", ), ("B", ), ("C", ), ("D", ), ("E", ), (MISSING_DATA_STR, )]

        create_temp_table(
            table_name="temp_test_id_weights_CatC",
            col_names=["CatC"],
            data=data
        )

        self._temp_tables.append("temp_test_id_weights_CatC")

        test_dict = {
            "metadata": {
                "numerical_columns": ["NumC"],
                "inline_limit": 1,
                "id": "test_id_weights"
            },
            "columns": {
                "CatC": {
                    "type": "categorical",
                    "original_values": ORIGINAL_VALUES_DB,
                    "uniques": 5,
                    "anonymising_set": "random"
                },
                "NumC": {
                    "type": "continuous",
                }
            }
        }

        test_cols = ["CatC"]
        test_wt = tm.generate_weights_table(test_dict, test_cols)

        result_md = test_wt[("NumC", "CatC", MISSING_DATA_STR)]["weights"].weight
        result_col = test_wt[("NumC", "CatC", "A")]["weights"].weight

        # 5 valid uniques => every value, including the missing data placeholder,
        # gets a weight of 1 / 5
        self.assertEqual(result_md, 0.2)
        self.assertEqual(result_col, 0.2)
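    # A minimal sketch (not part of exhibit's API) of the equal-weight rule the
    # docstring above describes: every valid unique value, and the missing data
    # placeholder, receives the same 1 / uniques share. The helper name below is
    # hypothetical and only documents the arithmetic behind the 0.2 assertions.
    @staticmethod
    def _equal_weight_sketch(uniques):
        '''
        Hypothetical helper: weight assigned to each value of a column whose
        original values live in the SQL db rather than inline in the spec.
        '''

        return 1 / uniques # 5 valid uniques => 0.2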
    def test_temp_table_insertion(self):
        '''
        Temporary lookup table in anon.db - also testing extra whitespace in
        source data. When values are formatted for the spec, extra whitespace
        is stripped so we have to make sure the same happens when values are
        put in the SQL db.
        '''

        expected = [("A", "B"), ("A", "B")]
        output = tm.create_temp_table(
            table_name="test_table",
            col_names=list("AB"),
            data=[("A ", "B"), ("A", "B")],
            db_uri="file:test_db?mode=memory",
            return_table=True
        )

        self.assertListEqual(expected, output)
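    # A minimal sketch (assumed behaviour, not exhibit's actual implementation) of
    # the whitespace handling tested above: stripping each value before insertion
    # means rows that differ only by trailing whitespace end up as identical tuples.
    # The helper name below is hypothetical.
    @staticmethod
    def _strip_values_sketch(rows):
        '''
        Hypothetical helper mirroring the value normalisation checked in
        test_temp_table_insertion.
        '''

        return [tuple(value.strip() for value in row) for row in rows]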
    def test_random_column_with_missing_pairs_sql(self):
        '''
        An edge case where a paired column isn't in the SQL db alongside the
        base column; the anonymising set is random shuffle.
        '''

        test_dict = {
            "metadata": {
                "inline_limit": 5,
                "id": 1234
            },
            "columns": {
                "test_Root": {
                    "type": "categorical",
                    "paired_columns": ["test_C1", "test_C2"],
                    "uniques": 10,
                    "original_values": pd.DataFrame(),
                    "anonymising_set": "random",
                    "cross_join_all_unique_values": False,
                }
            }
        }

        test_num_rows = 100
        test_col_name = "test_Root"
        test_col_attrs = test_dict["columns"][test_col_name]

        path = "exhibit.core.generate.categorical.CategoricalDataGenerator.__init__"
        with patch(path) as mock_init:
            mock_init.return_value = None
            generatorMock = tm.CategoricalDataGenerator(Mock(), Mock())

        setattr(generatorMock, "spec_dict", test_dict)
        setattr(generatorMock, "num_rows", test_num_rows)
        setattr(generatorMock, "rng", np.random.default_rng(seed=0))

        with tempfile.TemporaryDirectory() as td:
            db_name = "test.db"
            db_path = abspath(join(td, db_name))

            create_temp_table(
                table_name="temp_1234_test_Root",
                col_names=["test_Root", "test_C1"],
                data=[("A ", "B"), ("A", "B")],
                db_uri=db_path,
                return_table=False
            )

            result = generatorMock._generate_from_sql(
                test_col_name, test_col_attrs, db_uri=db_path)

            # the paired column test_C2 missing from SQL falls back to the
            # base column's values
            expected = pd.DataFrame(data={
                "test_Root": ["A"] * test_num_rows,
                "test_C1": ["B"] * test_num_rows,
                "test_C2": ["A"] * test_num_rows
            })

            assert_frame_equal(
                left=expected,
                right=result,
            )