Exemplo n.º 1
0
    def test_split_can_use_non_integer_indices(self):
        """Checks 3-fold splitting of a frame indexed by string labels.

        The input holds the values 0..9 under the string labels '0'..'9'.
        Every (train, test) pair returned by ``split(cross_validation=3)`` is
        compared against the expected frames, ignoring dtypes, and the number
        of folds is checked as well.
        """
        expected_trains = [
            pd.DataFrame(data={'variable': [4, 5, 6, 7, 8, 9]},
                         index=['4', '5', '6', '7', '8', '9']),
            pd.DataFrame(data={'variable': [0, 1, 2, 3, 7, 8, 9]},
                         index=['0', '1', '2', '3', '7', '8', '9']),
            pd.DataFrame(data={'variable': [0, 1, 2, 3, 4, 5, 6]},
                         index=['0', '1', '2', '3', '4', '5', '6'])
        ]
        expected_tests = [
            pd.DataFrame({'variable': [0, 1, 2, 3]},
                         index=['0', '1', '2', '3']),
            pd.DataFrame({'variable': [4, 5, 6]}, index=['4', '5', '6']),
            pd.DataFrame({'variable': [7, 8, 9]}, index=['7', '8', '9'])
        ]
        data = data_preparation.InferenceData(
            pd.DataFrame(
                data={
                    'variable': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                },
                index=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']))

        # zip() stops at its shortest input, so a split that produced too few
        # (or no) folds would otherwise pass silently. Materialise the folds
        # and compare their count with the expectations first.
        folds = list(data.split(cross_validation=3))
        self.assertEqual(len(folds), len(expected_trains))
        self.assertEqual(len(folds), len(expected_tests))

        iterator = zip(folds, expected_trains, expected_tests)
        for (train_data, test_data), expected_train, expected_test in iterator:
            pd.testing.assert_frame_equal(train_data.data,
                                          expected_train,
                                          check_dtype=False)
            pd.testing.assert_frame_equal(test_data.data,
                                          expected_test,
                                          check_dtype=False)
    def test_zscored_input_raises_warning(self):
        """Expects a warning when low variance is addressed on z-scored data."""
        column_names = ['variable_0', 'variable_1', 'variable_2', 'variable_3']
        rows = [
            [0.0, 1.0, 0.0, 10.0],
            [-0.5, 1.0, 0.0, 10.0],
            [0.1, 1.0, 0.0, 5.00],
            [0.2, 0.0, 0.0, 0.00],
        ]
        raw_frame = pd.DataFrame(data=rows, columns=column_names)

        # Z-score every column; any NaN left by the scoring becomes zero.
        zscored_frame = raw_frame.apply(stats.zscore).fillna(0)
        inference_data = data_preparation.InferenceData(zscored_frame)

        with self.assertWarns(Warning):
            inference_data.address_low_variance()
    def test_minmaxscaling_with_invalid_threshold_raises_warning(self):
        """Expects a warning for min-max scaling with a threshold of 0.5."""
        column_names = ['variable_0', 'variable_1', 'variable_2', 'variable_3']
        rows = [
            [0.0, 1.0, 0.0, 10.0],
            [-0.5, 1.0, 0.0, 10.0],
            [0.1, 1.0, 0.0, 5.00],
            [0.2, 0.0, 0.0, 0.00],
        ]
        frame = pd.DataFrame(data=rows, columns=column_names)
        inference_data = data_preparation.InferenceData(frame)

        low_variance_options = {'minmax_scaling': True, 'threshold': .5}
        with self.assertWarns(Warning):
            inference_data.address_low_variance(**low_variance_options)
    def test_impute_missing_values_replaced_with_mean(self):
        """Compares mean imputation of ``self._missing_data`` to fixed values."""
        expected_rows = [
            [0.4000, 0.0000],
            [0.6000, 0.0000],
            [0.4000, 3.0000],
            [0.2000, 1.0000],
        ]
        expected_result = pd.DataFrame(data=expected_rows,
                                       columns=['first', 'second'])

        inference_data = data_preparation.InferenceData(self._missing_data)
        result = inference_data.impute_missing_values(strategy='mean')

        pd.testing.assert_frame_equal(result, expected_result)
Exemplo n.º 5
0
    def test_encode_categorical_covariate_dummy_variable_2(self):
        """Checks dummy encoding of 'variable_2' with and without drop_first.

        The first scenario expects one indicator column per category
        ('a', 'b', 'c'); the second passes ``drop_first=True`` and expects
        only the 'b' and 'c' indicator columns.
        """
        input_columns = ['control', 'variable_1', 'variable_2', 'outcome']
        kept_columns = ['control', 'variable_1', 'outcome']

        def build_inference_data():
            # Each scenario gets its own freshly built input frame.
            frame = pd.DataFrame(
                data=[
                    [0.0, 1.0, 'a', 10.0],
                    [0.0, 1.0, 'b', 10.0],
                    [1.0, 1.0, 'c', 5.00],
                    [1.0, 0.0, 'a', 0.00],
                ],
                columns=input_columns)
            return data_preparation.InferenceData(frame,
                                                  target_column='outcome')

        # Scenario 1: every category receives its own indicator column.
        expected_all_levels = pd.DataFrame(
            data=[
                [0.0, 1.0, 10.0, 1, 0, 0],
                [0.0, 1.0, 10.0, 0, 1, 0],
                [1.0, 1.0, 5.00, 0, 0, 1],
                [1.0, 0.0, 0.00, 1, 0, 0],
            ],
            columns=kept_columns +
            ['variable_2_a', 'variable_2_b', 'variable_2_c'])
        result = build_inference_data().encode_categorical_covariates(
            columns=['variable_2'])
        pd.testing.assert_frame_equal(result, expected_all_levels)

        # Scenario 2: drop_first=True leaves out the 'a' indicator column.
        expected_first_dropped = pd.DataFrame(
            data=[
                [0.0, 1.0, 10.0, 0, 0],
                [0.0, 1.0, 10.0, 1, 0],
                [1.0, 1.0, 5.00, 0, 1],
                [1.0, 0.0, 0.00, 0, 0],
            ],
            columns=kept_columns + ['variable_2_b', 'variable_2_c'])
        result = build_inference_data().encode_categorical_covariates(
            columns=['variable_2'], drop_first=True)
        pd.testing.assert_frame_equal(result, expected_first_dropped)
    def test_address_collinearity_with_vif_removes_column(self):
        """Expects sequential VIF on iris to drop 'petal length (cm)'."""
        iris = datasets.load_iris()
        column_names = iris['feature_names'] + ['target']
        values = np.c_[iris['data'], iris['target']]
        iris_data = pd.DataFrame(data=values, columns=column_names)
        expected_result = iris_data.drop(columns='petal length (cm)')

        inference_data = data_preparation.InferenceData(
            iris_data, target_column='target')
        vif_options = {'sequential': True, 'interactive': False, 'drop': True}
        result = inference_data.address_collinearity_with_vif(**vif_options)

        pd.testing.assert_frame_equal(result, expected_result)
Exemplo n.º 7
0
    def test_vif_raises_error_on_ill_conditioned_correlation_matrix(self):
        """Expects SingularDataError when automatic handling is switched off."""
        column_names = [
            'control', 'variable_1', 'variable_2', 'variable_3', 'outcome'
        ]
        rows = [
            [1.0, 2.0, 3.0, 4.0, 1.0],
            [0.0, 2.0, 0.0, 1.0, 1.0],
            [1.0, 1.0, 2.0, 5.0, 1.0],
            [0.0, 2.0, 3.0, 0.0, 1.0],
        ]
        frame = pd.DataFrame(data=rows, columns=column_names)
        inference_data = data_preparation.InferenceData(
            frame, target_column='outcome')

        with self.assertRaises(data_preparation.SingularDataError):
            inference_data.address_collinearity_with_vif(
                handle_singular_data_errors_automatically=False)
    def test_fixed_effect_raise_exception_on_categorical_covariate(self):
        """Expects CategoricalCovariateError when a covariate holds strings."""
        rows = [
            ['0', 0.0, '1', 3.0],
            ['1', 0.0, '2', 2.0],
            ['1', 1.0, '3', 2.0],
            ['1', 1.0, '4', 1.0],
        ]
        frame = pd.DataFrame(
            data=rows,
            columns=['control_1', 'control_2', 'variable_1', 'variable_2'],
            index=['group1', 'group2', 'group3', 'group3'])
        inference_data = data_preparation.InferenceData(frame)

        fixed_effect_options = {
            'strategy': 'quick',
            'control_columns': ['control_1', 'control_2'],
            'min_frequency': 1,
        }
        with self.assertRaises(data_preparation.CategoricalCovariateError):
            inference_data.control_with_fixed_effect(**fixed_effect_options)
    def test_descretize(self, equal_sized_bins, numeric, expected_result):
        """Discretizes 'variable' into five bins and compares the outcome.

        Args:
            equal_sized_bins: Forwarded to ``discretize_numeric_covariate``.
            numeric: Forwarded to ``discretize_numeric_covariate``.
            expected_result: Frame the discretized result must equal
                (dtypes are not compared).
        """
        values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20]
        frame = pd.DataFrame(data=values, columns=['variable'])
        inference_data = data_preparation.InferenceData(frame)

        discretize_options = {
            'equal_sized_bins': equal_sized_bins,
            'bins': 5,
            'numeric': numeric,
        }
        result = inference_data.discretize_numeric_covariate(
            'variable', **discretize_options)

        pd.testing.assert_frame_equal(result,
                                      expected_result,
                                      check_dtype=False)
    def test_address_low_variance_removes_column(self):
        """Expects the all-zero 'variable_1' column to be dropped."""
        input_rows = [
            [0.0, 1.0, 0.0, 10.0],
            [0.0, 1.0, 0.0, 10.0],
            [1.0, 1.0, 0.0, 5.00],
            [1.0, 0.0, 0.0, 0.00],
        ]
        frame = pd.DataFrame(
            data=input_rows,
            columns=['control', 'variable', 'variable_1', 'outcome'])

        expected_rows = [
            [0.0, 1.0, 10.0],
            [0.0, 1.0, 10.0],
            [1.0, 1.0, 5.00],
            [1.0, 0.0, 0.00],
        ]
        expected_result = pd.DataFrame(
            data=expected_rows, columns=['control', 'variable', 'outcome'])

        inference_data = data_preparation.InferenceData(
            frame, target_column='outcome')
        result = inference_data.address_low_variance(drop=True)

        pd.testing.assert_frame_equal(result, expected_result)
    def test_minmaxscaling_drops_appropriate_variables(self, scaling):
        """Expects only 'variable_1' and 'outcome' to survive the drop.

        Args:
            scaling: Factor the whole input frame is multiplied by before the
                low-variance check runs.
        """
        rows = [
            [0.0, 1.0, 0.0, 10.0],
            [-0.5, 1.0, 0.0, 10.0],
            [0.1, 1.0, 0.0, 5.00],
            [0.2, 0.0, 0.0, 0.00],
        ]
        unscaled = pd.DataFrame(
            data=rows,
            columns=['variable_0', 'variable_1', 'variable_2', 'outcome'])
        scaled = unscaled * scaling
        expected_result = scaled[['variable_1', 'outcome']]

        inference_data = data_preparation.InferenceData(scaled)
        low_variance_options = {
            'threshold': .15,
            'drop': True,
            'minmax_scaling': True,
        }
        result = inference_data.address_low_variance(**low_variance_options)

        pd.testing.assert_frame_equal(result, expected_result)
Exemplo n.º 12
0
    def test_split_without_groups_yields_expected_folds(
            self, cross_validation, expected_trains, expected_tests):
        """Checks every fold produced by ``split`` when no groups are given.

        Args:
            cross_validation: Value forwarded to
                ``split(cross_validation=...)``.
            expected_trains: Expected training frames, one per fold, in the
                order the folds are produced.
            expected_tests: Expected test frames, one per fold, in the order
                the folds are produced.
        """
        data = data_preparation.InferenceData(
            pd.DataFrame({
                'variable': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            }))

        # zip() stops at its shortest input, so a split that produced too few
        # (or no) folds would otherwise pass silently. Materialise the folds
        # and compare their count with the expectations first.
        folds = list(data.split(cross_validation=cross_validation))
        self.assertEqual(len(folds), len(expected_trains))
        self.assertEqual(len(folds), len(expected_tests))

        iterator = zip(folds, expected_trains, expected_tests)
        for (train_data, test_data), expected_train, expected_test in iterator:
            pd.testing.assert_frame_equal(train_data.data,
                                          expected_train,
                                          check_dtype=False)
            pd.testing.assert_frame_equal(test_data.data,
                                          expected_test,
                                          check_dtype=False)
def _prepare_data_and_target(ready_for_modelling=True):
  """Builds a fixed 20x5 regression fixture wrapped in InferenceData.

  The target is the dot product of the five feature columns with the
  coefficients (-1)**i * exp(-i / 10), plus 0.01 times a fixed noise vector.

  Args:
    ready_for_modelling: When true, the `_has_control_factors`,
      `_checked_low_variance` and `_checked_collinearity` attributes of the
      returned object are set to True.

  Returns:
    The `InferenceData` built from the feature frame and its 'target' column.
  """
  features = np.array(
      [[0.496714150, -0.13826430, 0.647688540, 1.523029860, -0.23415337],
       [-0.23413696, 1.579212820, 0.767434730, -0.46947439, 0.542560040],
       [-0.46341769, -0.46572975, 0.241962270, -1.91328024, -1.72491783],
       [-0.56228753, -1.01283112, 0.314247330, -0.90802408, -1.41230370],
       [1.465648770, -0.22577630, 0.067528200, -1.42474819, -0.54438272],
       [0.110922590, -1.15099358, 0.375698020, -0.60063869, -0.29169375],
       [-0.60170661, 1.852278180, -0.01349722, -1.05771093, 0.822544910],
       [-1.22084365, 0.208863600, -1.95967012, -1.32818605, 0.196861240],
       [0.738466580, 0.171368280, -0.11564828, -0.30110370, -1.47852199],
       [-0.71984421, -0.46063877, 1.057122230, 0.343618290, -1.76304016],
       [0.324083970, -0.38508228, -0.67692200, 0.611676290, 1.030999520],
       [0.931280120, -0.83921752, -0.30921238, 0.331263430, 0.975545130],
       [-0.47917424, -0.18565898, -1.10633497, -1.19620662, 0.812525820],
       [1.356240030, -0.07201012, 1.003532900, 0.361636030, -0.64511975],
       [0.361395610, 1.538036570, -0.03582604, 1.564643660, -2.61974510],
       [0.821902500, 0.087047070, -0.29900735, 0.091760780, -1.98756891],
       [-0.21967189, 0.357112570, 1.477894040, -0.51827022, -0.80849360],
       [-0.50175704, 0.915402120, 0.328751110, -0.52976020, 0.513267430],
       [0.097077550, 0.968644990, -0.70205309, -0.32766215, -0.39210815],
       [-1.46351495, 0.296120280, 0.261055270, 0.005113460, -0.23458713]])
  noise = np.array(
      [0.496714150, -0.13826430, 0.64768854, 1.523029860, -0.23415337,
       -0.23413696, 1.579212820, 0.76743473, -0.46947439, 0.542560040,
       -0.46341769, -0.46572975, 0.24196227, -1.91328024, -1.72491783,
       -0.56228753, -1.01283112, 0.31424733, -0.90802408, -1.41230370])

  # Coefficients shrink in magnitude and alternate in sign per column.
  positions = np.arange(features.shape[1])
  weights = (-1) ** positions * np.exp(-positions / 10)
  # Sparsify: with only five columns this slice is empty, so nothing is
  # actually zeroed here.
  weights[10:] = 0

  # Noise-free linear response, then the small fixed perturbation.
  target = np.dot(features, weights)
  target += 0.01 * noise

  frame = pd.DataFrame(features)
  frame['target'] = target
  inference_data = data_preparation.InferenceData(frame, 'target')

  if not ready_for_modelling:
    return inference_data

  # Mark every preparation step as already done.
  for flag_name in ('_has_control_factors', '_checked_low_variance',
                    '_checked_collinearity'):
    setattr(inference_data, flag_name, True)
  return inference_data
Exemplo n.º 14
0
    def test_vif_noise_injection_catches_perfect_correlation(self):
        """Expects both copies of a duplicated iris column to be dropped.

        'petal length (cm)' is duplicated under a second name, and VIF is run
        with automatic handling of singular-data errors enabled.
        """
        iris = datasets.load_iris()
        column_names = iris['feature_names'] + ['target']
        values = np.c_[iris['data'], iris['target']]
        iris_data = pd.DataFrame(data=values, columns=column_names)

        # Add an exact copy of an existing column.
        duplicated_source = 'petal length (cm)'
        iris_data['perfectly_correlated_column'] = iris_data[duplicated_source]
        expected_result = iris_data.drop(
            columns=[duplicated_source, 'perfectly_correlated_column'])

        inference_data = data_preparation.InferenceData(
            iris_data, target_column='target')
        vif_options = {
            'vif_method': 'quick',
            'drop': True,
            'handle_singular_data_errors_automatically': True,
            'vif_threshold': 50.0,
        }
        result = inference_data.address_collinearity_with_vif(**vif_options)

        pd.testing.assert_frame_equal(result, expected_result)
Exemplo n.º 15
0
    def test_vif_noise_injection_fails_correctly_when_too_few_samples(self):
        """Expects SingularDataError with the noise-injection failure text.

        The frame has three rows and five columns, and VIF is run with
        automatic handling of singular-data errors enabled.
        """
        rows = [
            [1.0, 2.0, 3.0, 4.0, 1.0],
            [0.0, 2.0, 0.0, 1.0, 1.0],
            [1.0, 1.0, 2.0, 5.0, 1.0],
        ]
        column_names = [
            'control', 'variable_1', 'variable_2', 'variable_3', 'outcome'
        ]
        too_few_samples_df = pd.DataFrame(data=rows, columns=column_names)
        inference_data = data_preparation.InferenceData(
            too_few_samples_df, target_column='outcome')

        # Used as a regular expression by assertRaisesRegex.
        expected_regex = ''.join([
            'Automatic attempt to resolve SingularDataError by ',
            'injecting artifical noise to the data has failed. This ',
            'probably means the dataset has too many features relative ',
            'to the number of samples.',
        ])
        with self.assertRaisesRegex(data_preparation.SingularDataError,
                                    expected_regex):
            inference_data.address_collinearity_with_vif(
                handle_singular_data_errors_automatically=True)
Exemplo n.º 16
0
    def test_split_with_groups_yields_expected_folds_with_non_overlaping_groups(
            self, cross_validation, groups, expected_trains, expected_tests):
        """Checks grouped folds and that train/test groups never overlap.

        Args:
            cross_validation: Value forwarded to
                ``split(cross_validation=...)``.
            groups: Group labels forwarded to ``split(groups=...)``; it is
                also indexed here with a list of row index values.
            expected_trains: Expected training frames, one per fold, in the
                order the folds are produced.
            expected_tests: Expected test frames, one per fold, in the order
                the folds are produced.
        """
        data = data_preparation.InferenceData(
            pd.DataFrame({
                'variable': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            }))

        # zip() stops at its shortest input, so a split that produced too few
        # (or no) folds would otherwise pass silently. Materialise the folds
        # and compare their count with the expectations first.
        folds = list(
            data.split(cross_validation=cross_validation, groups=groups))
        self.assertEqual(len(folds), len(expected_trains))
        self.assertEqual(len(folds), len(expected_tests))

        iterator = zip(folds, expected_trains, expected_tests)
        for (train_data, test_data), expected_train, expected_test in iterator:
            # Group labels seen on each side of the fold.
            train_groups = set(groups[train_data.data.index.tolist()])
            test_groups = set(groups[test_data.data.index.tolist()])

            pd.testing.assert_frame_equal(train_data.data,
                                          expected_train,
                                          check_dtype=False)
            pd.testing.assert_frame_equal(test_data.data,
                                          expected_test,
                                          check_dtype=False)
            self.assertEmpty(train_groups.intersection(test_groups))
    def test_fixed_effect_demeaning_subtract_mean_in_groups(self):
        """Compares the 'quick' fixed-effect result with hand-written values.

        The expected frame reuses the input's columns and index and then
        appends the two control columns to the index.
        """
        control_columns = ['control_1', 'control_2']
        frame = pd.DataFrame(
            data=[
                ['0', 0.0, 1, 3.0],
                ['1', 0.0, 2, 2.0],
                ['1', 1.0, 3, 2.0],
                ['1', 1.0, 4, 1.0],
            ],
            columns=['control_1', 'control_2', 'variable_1', 'variable_2'],
            index=['group1', 'group2', 'group3', 'group3'])

        expected_flat = pd.DataFrame(
            data=[
                ['0', 0.0, 2.5, 2.0],
                ['1', 0.0, 2.5, 2.0],
                ['1', 1.0, 2.0, 2.5],
                ['1', 1.0, 3.0, 1.5],
            ],
            columns=frame.columns,
            index=frame.index)
        expected_result = expected_flat.set_index(['control_1', 'control_2'],
                                                  append=True)

        inference_data = data_preparation.InferenceData(frame)
        result = inference_data.control_with_fixed_effect(
            strategy='quick', control_columns=control_columns, min_frequency=1)

        pd.testing.assert_frame_equal(result, expected_result)
    def test_address_collinearity_with_vif_interactive(self, user_inputs,
                                                       expected_dropped,
                                                       sequential):
        """Runs interactive VIF with scripted answers and checks the drops.

        Args:
            user_inputs: Answers returned, in order, by the patched
                ``_input_mock``.
            expected_dropped: Column labels expected to be missing from the
                result.
            sequential: Forwarded to ``address_collinearity_with_vif``.
        """
        frame = pd.DataFrame(
            data=[
                [1.1, 2.1, 3.1, 4.1, 0],
                [1.0, 2.0, 3.0, 4.0, 0],
                [1.0, 2.0, 3.0, 4.0, 0],
                [1.0, 2.0, 3.0, 4.0, 1],
            ],
            columns=['1', '2', '3', '4', 'target'])
        inference_data = data_preparation.InferenceData(
            frame, target_column='target')

        # Answers are popped from the end, so store them in reverse order.
        pending_answers = list(reversed(user_inputs))

        def scripted_answer(x):
            # The single argument passed by the caller is ignored.
            return pending_answers.pop()

        # Patching _print_mock too avoids Colab\Notebook prints in the tests
        # output.
        with mock.patch.object(data_preparation, '_input_mock') as input_mock, \
                mock.patch.object(data_preparation, '_print_mock'):
            input_mock.side_effect = scripted_answer
            result = inference_data.address_collinearity_with_vif(
                sequential=sequential, interactive=True, drop=True)

        expected_result = frame.drop(columns=expected_dropped)
        pd.testing.assert_frame_equal(result, expected_result)
Exemplo n.º 19
0
    def test_vif_error_has_correct_message(self):
        """Checks the text carried by SingularDataError raised from VIF.

        VIF is run with automatic handling of singular-data errors disabled;
        the raised exception's string form must contain the expected message.
        """
        ill_conditioned_correlation_matrix_df = pd.DataFrame(
            data=[[1.0, 2.0, 3.0, 4.0, 1.0], [0.0, 2.0, 0.0, 1.0, 1.0],
                  [1.0, 1.0, 2.0, 5.0, 1.0], [0.0, 2.0, 3.0, 0.0, 1.0]],
            columns=[
                'control', 'variable_1', 'variable_2', 'variable_3', 'outcome'
            ])
        inference_data = data_preparation.InferenceData(
            ill_conditioned_correlation_matrix_df, target_column='outcome')

        expected_message = (
            'Inference Data has a singular or nearly singular correlation matrix. '
            'This could be caused by extremely correlated or collinear columns. '
            'The three pairs of columns with the highest absolute correlation '
            'coefficients are: (control,variable_3): 0.970, (variable_1,variable_3)'
            ': -0.700, (control,variable_1): -0.577. This could also be caused by '
            'columns with extremiely low variance. Recommend running the '
            'address_low_variance() method before VIF. Alternatively, consider '
            'running address_collinearity_with_vif() with '
            'use_correlation_matrix_inversion=False to avoid this error.')

        # assertRaises(msg=...) only customises the text shown when the
        # assertion fails; it never inspects the raised exception. Capture
        # the exception and compare its text explicitly instead.
        with self.assertRaises(data_preparation.SingularDataError) as raised:
            inference_data.address_collinearity_with_vif(
                handle_singular_data_errors_automatically=False)
        self.assertIn(expected_message, str(raised.exception))
 def test_invalid_target_column_raise_exception(self):
     """Expects KeyError when the target column name is not in the data."""
     constructor_arguments = {
         'initial_data': self._missing_data,
         'target_column': 'non_ci_sono',
     }
     with self.assertRaises(KeyError):
         data_preparation.InferenceData(**constructor_arguments)
    def test_check_data_raises_exception_on_missing_data(self):
        """Expects MissingValueError from ``data_check(raise_on_error=True)``."""
        inference_data = data_preparation.InferenceData(self._missing_data)
        expected_error = data_preparation.MissingValueError

        with self.assertRaises(expected_error):
            inference_data.data_check(raise_on_error=True)
 def test_missing_value_emits_warning_twice(self):
     """Expects MissingValueWarning on each of two identical constructions."""
     # The second pass checks that the warning is raised again, not only
     # the first time.
     for _ in range(2):
         with self.assertWarns(data_preparation.MissingValueWarning):
             data_preparation.InferenceData(self._missing_data)
Exemplo n.º 23
0
 def test_vif_method_fails_correctly_with_unknown_value(self):
     """Expects ValueError for an unrecognised ``vif_method`` value."""
     inference_data = data_preparation.InferenceData(self._missing_data)
     unknown_method = 'incorrect_value'

     with self.assertRaises(ValueError):
         inference_data.address_collinearity_with_vif(
             vif_method=unknown_method)