def test_MS_df_mean_inplace(self):
    """
    Positive test

    data: Correct dataframe (divcols)

    Checks that an in-place mean_substitution over the whole dataframe
    removes 8 NA values, leaving 10 NA values in total.
    """
    # 1. Arrange
    frame = generate_example_df_divcols()
    # 2. Act
    mean_substitution(frame, inplace=True)
    # 3. Assert
    remaining_na = frame.isna().sum().sum()
    self.assertEqual(remaining_na, 10)
def test_MS_wrong_type(self):
    """
    Negative test

    data: list (unsupported type)

    Checks that mean_substitution raises a TypeError when the data is
    passed as a plain Python list.
    """
    # 1. Arrange
    unsupported = [2, 4, np.nan, 1]
    # 2. Act & 3. Assert
    self.assertRaises(TypeError, mean_substitution, unsupported)
def test_MS_series_mean_inplace(self):
    """
    Positive test

    data: Correct Series (example series)

    Checks that an in-place mean_substitution removes the 3 NA values
    from the series, so that none remain.
    """
    # 1. Arrange
    series = generate_example_series()
    # 2. Act
    mean_substitution(series, inplace=True)
    # 3. Assert
    remaining_na = series.isna().sum()
    self.assertEqual(remaining_na, 0)
def test_MS_wrong_method(self):
    """
    Negative test

    data: Correct series (example series)
    method: 'z' (not a valid method)

    Checks that mean_substitution raises a ValueError when the value
    passed for the parameter method is not valid.
    """
    # 1. Arrange
    series = generate_example_series()
    # 2. Act & 3. Assert
    self.assertRaises(ValueError, mean_substitution, series, method='z')
def test_MS_df_mean_inplace_wrong_column(self):
    """
    Negative test

    data: Correct dataframe (divcols)
    columns: ['f', 'g', 'z'] ('z' doesn't exist in the data)

    Checks that mean_substitution raises a ValueError when one of the
    requested columns is not present in the dataframe.
    """
    # 1. Arrange
    frame = generate_example_df_divcols()
    requested = ['f', 'g', 'z']
    # 2. Act & 3. Assert
    self.assertRaises(
        ValueError, mean_substitution, frame,
        columns=requested, inplace=True)
def test_MS_col_for_series(self):
    """
    Negative test

    data: Correct series (example_series)
    columns: ['a'] (series can't have columns)

    Checks that mean_substitution raises a ValueError when the columns
    parameter is supplied together with a series.
    """
    # 1. Arrange
    series = generate_example_series()
    # 2. Act & 3. Assert
    self.assertRaises(ValueError, mean_substitution, series, columns=['a'])
def test_MS_df_mean_inplace_columns(self):
    """
    Positive test

    data: Correct dataframe (divcols)
    columns: ['f', 'g']

    Checks that an in-place mean_substitution restricted to the
    specified columns removes 4 NA values, leaving 14 in the dataframe.
    """
    # 1. Arrange
    frame = generate_example_df_divcols()
    # 2. Act
    mean_substitution(frame, columns=['f', 'g'], inplace=True)
    # 3. Assert
    remaining_na = frame.isna().sum().sum()
    self.assertEqual(remaining_na, 14)
def mice_one_imputation(data):
    """Run a single MICE imputation pass over a dataframe.

    Columns containing missing values are visited in a shuffled order.
    They are first filled with a simple imputation (mean substitution for
    numeric columns, random sample imputation otherwise). Then each column
    has its originally missing cells blanked out again and re-imputed with
    a regression (linear for numeric columns, logistic otherwise).

    :param data: The data on which to perform the imputation.
    :type data: pandas.DataFrame
    :return: A copy of the dataframe with one MICE imputation performed;
        the input dataframe itself is not modified by this function.
    :rtype: pandas.DataFrame
    """
    # Work on a copy so the caller's dataframe is left untouched:
    imputed = data.copy()
    # Remember where the values were originally missing:
    missing = pd.isna(data)
    # Columns that need imputing, visited in a shuffled order:
    pending = [name for name in data.columns if data[name].isna().any()]
    shuffle(pending)
    # First pass: fill every gap with a simple imputation so that the
    # regressions below have complete data to work with.
    for name in pending:
        simple_fill = (
            mean_substitution
            if is_numeric_dtype(data[name])
            else random_sample_imputation
        )
        simple_fill(imputed, columns=[name], inplace=True)
    # Numeric columns serve as the predictors of the linear regressions.
    # NOTE(review): this list also contains the column being imputed;
    # confirm that linear_regression copes with that.
    numeric_predictors = [
        name for name in data.columns if is_numeric_dtype(data[name])
    ]
    # Second pass: reset the originally missing cells of each column and
    # re-impute them with a regression model.
    for name in pending:
        originally_missing = missing[name]
        if not is_numeric_dtype(data[name]):
            imputed.loc[originally_missing, name] = None
            logistic_regression(imputed, name, inplace=True)
            continue
        imputed.loc[originally_missing, name] = np.nan
        linear_regression(
            imputed, name, predictors=numeric_predictors, inplace=True)
    return imputed
def test_MS_series_mean_returning(self):
    """
    Positive test

    data: Correct Series (example series)

    Checks that the original series is left unmodified (still 3 NA
    values) and that the returned series contains no NA values.
    """
    # 1. Arrange
    original = generate_example_series()
    # 2. Act
    imputed = mean_substitution(original)
    # 3. Assert
    na_before = original.isna().sum()
    na_after = imputed.isna().sum()
    self.assertEqual(na_before, 3)
    self.assertEqual(na_after, 0)
def test_MS_df_mean_returning(self):
    """
    Positive test

    data: Correct dataframe (divcols)

    Checks that the original dataframe is left unmodified (still 18 NA
    values) and that the returned dataframe contains 10 NA values, 8
    less than the original.
    """
    # 1. Arrange
    original = generate_example_df_divcols()
    # 2. Act
    imputed = mean_substitution(original)
    # 3. Assert
    na_before = original.isna().sum().sum()
    na_after = imputed.isna().sum().sum()
    self.assertEqual(na_before, 18)
    self.assertEqual(na_after, 10)
def test_MS_df_median_returning_columns(self):
    """
    Positive test

    data: Correct dataframe (divcols)
    columns: ['f', 'g']
    method: 'median'

    Checks that the original dataframe is left unmodified (still 18 NA
    values) and that the returned dataframe contains 14 NA values, 4
    less than the original.
    """
    # 1. Arrange
    original = generate_example_df_divcols()
    # 2. Act
    imputed = mean_substitution(
        original, method='median', columns=['f', 'g'])
    # 3. Assert
    na_before = original.isna().sum().sum()
    na_after = imputed.isna().sum().sum()
    self.assertEqual(na_before, 18)
    self.assertEqual(na_after, 14)