Exemplo n.º 1
0
    def test_MS_df_mean_inplace(self):
        """
        Positive test

        data: Correct dataframe (divcols)

        Runs mean_substitution in place on the whole dataframe and checks
        that exactly 10 NA values are left in it afterwards.
        """
        # 1. Arrange
        frame = generate_example_df_divcols()
        # 2. Act
        mean_substitution(frame, inplace=True)
        # 3. Assert
        # First sum counts NAs per column, second sum totals them.
        remaining_na = frame.isna().sum().sum()
        self.assertEqual(remaining_na, 10)
Exemplo n.º 2
0
    def test_MS_wrong_type(self):
        """
        Negative test

        data: plain Python list (unsupported type)

        Checks that mean_substitution raises a TypeError when the data is
        passed as a list rather than a supported container.
        """
        # 1. Arrange
        unsupported = [2, 4, np.nan, 1]
        # 2. Act & 3. Assert
        self.assertRaises(TypeError, mean_substitution, unsupported)
Exemplo n.º 3
0
    def test_MS_series_mean_inplace(self):
        """
        Positive test

        data: Correct Series (example series)

        Runs mean_substitution in place on the series and checks that no NA
        values are left in it afterwards.
        """
        # 1. Arrange
        series = generate_example_series()
        # 2. Act
        mean_substitution(series, inplace=True)
        # 3. Assert
        remaining_na = series.isna().sum()
        self.assertEqual(remaining_na, 0)
Exemplo n.º 4
0
    def test_MS_wrong_method(self):
        """
        Negative test

        data: Correct series (example series)
        method: 'z' (not a valid method)

        Checks that mean_substitution raises a ValueError when the value
        given for the ``method`` parameter is not a valid one.
        """
        # 1. Arrange
        series = generate_example_series()
        # 2. Act & 3. Assert
        self.assertRaises(ValueError, mean_substitution, series, method='z')
Exemplo n.º 5
0
    def test_MS_df_mean_inplace_wrong_column(self):
        """
        Negative test

        data: Correct dataframe (divcols)
        columns: ['f', 'g', 'z'] ('z' doesn't exist in the data)

        Checks that mean_substitution raises a ValueError if one of the
        requested columns does not exist in the dataframe.
        """
        # 1. Arrange
        df = generate_example_df_divcols()
        # 2. Act & 3. Assert
        with self.assertRaises(ValueError):
            mean_substitution(df, columns=['f', 'g', 'z'], inplace=True)
Exemplo n.º 6
0
    def test_MS_col_for_series(self):
        """
        Negative test

        data: Correct series (example_series)
        columns: ['a'] (series can't have columns)

        Checks that mean_substitution raises a ValueError when a ``columns``
        argument is supplied together with a series.
        """
        # 1. Arrange
        series = generate_example_series()
        # 2. Act & 3. Assert
        self.assertRaises(
            ValueError, mean_substitution, series, columns=['a']
        )
Exemplo n.º 7
0
    def test_MS_df_mean_inplace_columns(self):
        """
        Positive test

        data: Correct dataframe (divcols)
        columns: ['f', 'g']

        Runs mean_substitution in place restricted to columns 'f' and 'g'
        and checks that exactly 14 NA values are left in the dataframe.
        """
        # 1. Arrange
        frame = generate_example_df_divcols()
        target_columns = ['f', 'g']
        # 2. Act
        mean_substitution(frame, columns=target_columns, inplace=True)
        # 3. Assert
        # First sum counts NAs per column, second sum totals them.
        remaining_na = frame.isna().sum().sum()
        self.assertEqual(remaining_na, 14)
Exemplo n.º 8
0
def mice_one_imputation(data):
    """Run a single MICE pass over ``data`` and return the imputed copy.

    Columns containing missing values are visited in a random order. They
    are first filled provisionally (mean substitution for numeric columns,
    random sample imputation for the rest). Then, one column at a time, the
    originally missing cells are blanked again and re-imputed with a
    regression model: linear regression on the numeric columns for numeric
    targets, logistic regression otherwise.

    :param data: The data on which to perform the imputation.
    :type data: pandas.DataFrame
    :return: A copy of the dataframe with one MICE imputation performed.
    :rtype: pandas.DataFrame
    """
    # All work happens on a copy; ``data`` is only read.
    imputed = data.copy()
    # Remember where the values were originally missing, because the
    # provisional fill below erases that information from ``imputed``.
    missing = pd.isna(data)
    # Columns to impute, in dataframe order, then randomly reordered.
    pending = [name for name in data.columns if data[name].isna().any()]
    shuffle(pending)
    # Provisional fill so every column can act as a complete predictor.
    for name in pending:
        if is_numeric_dtype(data[name]):
            provisional_fill = mean_substitution
        else:
            provisional_fill = random_sample_imputation
        provisional_fill(imputed, columns=[name], inplace=True)
    # Numeric columns are the predictors handed to the linear regression.
    numerics = [name for name in data.columns
                if is_numeric_dtype(data[name])]
    # Re-impute each column from the others, one at a time.
    for name in pending:
        originally_missing = missing[name]
        if not is_numeric_dtype(data[name]):
            imputed.loc[originally_missing, name] = None
            logistic_regression(imputed, name, inplace=True)
            continue
        imputed.loc[originally_missing, name] = np.nan
        linear_regression(imputed, name, predictors=numerics, inplace=True)
    return imputed
Exemplo n.º 9
0
    def test_MS_series_mean_returning(self):
        """
        Positive test

        data: Correct Series (example series)

        Calls mean_substitution without ``inplace`` and checks that the
        original series still has its 3 NA values while the returned series
        has none.
        """
        # 1. Arrange
        original = generate_example_series()
        # 2. Act
        imputed = mean_substitution(original)
        # 3. Assert
        na_before = original.isna().sum()
        na_after = imputed.isna().sum()
        self.assertEqual(na_before, 3)
        self.assertEqual(na_after, 0)
Exemplo n.º 10
0
    def test_MS_df_mean_returning(self):
        """
        Positive test

        data: Correct dataframe (divcols)

        Calls mean_substitution without ``inplace`` and checks that the
        original dataframe still has its 18 NA values while the returned
        dataframe has 10, i.e. 8 fewer.
        """
        # 1. Arrange
        original = generate_example_df_divcols()
        # 2. Act
        imputed = mean_substitution(original)
        # 3. Assert
        na_before = original.isna().sum().sum()
        na_after = imputed.isna().sum().sum()
        self.assertEqual(na_before, 18)
        self.assertEqual(na_after, 10)
Exemplo n.º 11
0
    def test_MS_df_median_returning_columns(self):
        """
        Positive test

        data: Correct dataframe (divcols)
        columns: ['f', 'g']
        method: 'median'

        Calls mean_substitution with the median method on columns 'f' and
        'g' without ``inplace`` and checks that the original dataframe still
        has its 18 NA values while the returned one has 14, i.e. 4 fewer.
        """
        # 1. Arrange
        original = generate_example_df_divcols()
        target_columns = ['f', 'g']
        # 2. Act
        imputed = mean_substitution(
            original, method='median', columns=target_columns
        )
        # 3. Assert
        na_before = original.isna().sum().sum()
        na_after = imputed.isna().sum().sum()
        self.assertEqual(na_before, 18)
        self.assertEqual(na_after, 14)