def test_seed_per_observation_plus_product_of_seeding_variables(df_na): # test case 3: observation seed, 2 variables as seed, product of seed variables imputer = RandomSampleImputer( variables=["City", "Studies"], random_state=["Marks", "Age"], seed="observation", seeding_method="multiply", ) df_na[["Marks", "Age"]] = df_na[["Marks", "Age"]].fillna(1) X_transformed = imputer.fit_transform(df_na) # expected output ref = { "Name": ["tom", "nick", "krish", np.nan, "peter", np.nan, "fred", "sam"], "City": [ "London", "Manchester", "London", "Manchester", "London", "London", "Bristol", "Manchester", ], "Studies": [ "Bachelor", "Bachelor", "Bachelor", "Masters", "Bachelor", "PhD", "None", "Masters", ], "Age": [20, 21, 19, np.nan, 23, 40, 41, 37], "Marks": [0.9, 0.8, 0.7, np.nan, 0.3, np.nan, 0.8, 0.6], "dob": pd.date_range("2020-02-24", periods=8, freq="T"), } ref = pd.DataFrame(ref) assert imputer.variables == ["City", "Studies"] assert imputer.random_state == ["Marks", "Age"] assert imputer.seed == "observation" pd.testing.assert_frame_equal(imputer.X_[["City", "Studies"]], df_na[["City", "Studies"]]) pd.testing.assert_frame_equal( X_transformed[["City", "Studies"]], ref[["City", "Studies"]], check_dtype=False, )
def test_general_seed_plus_automatically_select_variables(df_na): # set up transformer imputer = RandomSampleImputer(variables=None, random_state=5, seed="general") X_transformed = imputer.fit_transform(df_na) # expected output: # fillna based on seed used (found experimenting on Jupyter notebook) ref = { "Name": ["tom", "nick", "krish", "peter", "peter", "sam", "fred", "sam"], "City": [ "London", "Manchester", "London", "Manchester", "London", "London", "Bristol", "Manchester", ], "Studies": [ "Bachelor", "Bachelor", "PhD", "Masters", "Bachelor", "PhD", "None", "Masters", ], "Age": [20, 21, 19, 23, 23, 40, 41, 37], "Marks": [0.9, 0.8, 0.7, 0.3, 0.3, 0.6, 0.8, 0.6], "dob": pd.date_range("2020-02-24", periods=8, freq="T"), } ref = pd.DataFrame(ref) # test init params assert imputer.variables == [ "Name", "City", "Studies", "Age", "Marks", "dob" ] assert imputer.random_state == 5 assert imputer.seed == "general" # test fit attr assert imputer.input_shape_ == (8, 6) pd.testing.assert_frame_equal(imputer.X_, df_na) # test transform output pd.testing.assert_frame_equal(X_transformed, ref, check_dtype=False)
def test_seed_per_observation_with_only_1_variable_as_seed(df_na): # test case 4: observation seed, only variable indicated as seed, method: addition # Note the variable used as seed should not have missing data imputer = RandomSampleImputer(variables=["City", "Studies"], random_state="Age", seed="observation") df_na["Age"] = df_na["Age"].fillna(1) X_transformed = imputer.fit_transform(df_na) # expected output ref = { "Name": ["tom", "nick", "krish", np.nan, "peter", np.nan, "fred", "sam"], "City": [ "London", "Manchester", "Manchester", "Manchester", "London", "London", "Bristol", "Manchester", ], "Studies": [ "Bachelor", "Bachelor", "Masters", "Masters", "Bachelor", "PhD", "None", "Masters", ], "Age": [20, 21, 19, np.nan, 23, 40, 41, 37], "Marks": [0.9, 0.8, 0.7, np.nan, 0.3, np.nan, 0.8, 0.6], "dob": pd.date_range("2020-02-24", periods=8, freq="T"), } ref = pd.DataFrame(ref) assert imputer.random_state == ["Age"] pd.testing.assert_frame_equal(imputer.X_[["City", "Studies"]], df_na[["City", "Studies"]]) pd.testing.assert_frame_equal( X_transformed[["City", "Studies"]], ref[["City", "Studies"]], check_dtype=False, )
def test_seed_per_observation_and_multiple_variables_in_random_state(df_na): # test case 2: imputer seed per observation using multiple variables to determine # the random_state # Note the variables used as seed should not have missing data imputer = RandomSampleImputer(variables=["City", "Studies"], random_state=["Marks", "Age"], seed="observation") df_na[["Marks", "Age"]] = df_na[["Marks", "Age"]].fillna(1) X_transformed = imputer.fit_transform(df_na) # expected output ref = { "Name": ["tom", "nick", "krish", np.nan, "peter", np.nan, "fred", "sam"], "City": [ "London", "Manchester", "London", "London", "London", "London", "Bristol", "Manchester", ], "Studies": [ "Bachelor", "Bachelor", "PhD", "Bachelor", "Bachelor", "PhD", "None", "Masters", ], "Age": [20, 21, 19, np.nan, 23, 40, 41, 37], "Marks": [0.9, 0.8, 0.7, np.nan, 0.3, np.nan, 0.8, 0.6], "dob": pd.date_range("2020-02-24", periods=8, freq="T"), } ref = pd.DataFrame(ref) assert imputer.variables == ["City", "Studies"] assert imputer.random_state == ["Marks", "Age"] assert imputer.seed == "observation" pd.testing.assert_frame_equal(imputer.X_[["City", "Studies"]], df_na[["City", "Studies"]]) pd.testing.assert_frame_equal(X_transformed[["City", "Studies"]], ref[["City", "Studies"]])