Exemplo n.º 1
0
  def test_regression_method_doesnt_throw_errors_on_ill_conditioned_data(self):
    """Regression path yields a non-empty result for ill-conditioned input.

    Calls calculate_vif with use_correlation_matrix_inversion=False while
    raise_on_ill_conditioned=True and checks that a non-empty result comes
    back for the ill-conditioned fixture.
    """
    regression_options = {
        'use_correlation_matrix_inversion': False,
        'raise_on_ill_conditioned': True,
    }

    result = vif.calculate_vif(
        self.ill_conditioned_correlation_matrix_df, **regression_options)

    self.assertNotEmpty(result)
Exemplo n.º 2
0
  def test_calculate_vif_correct_results(self):
    """Default calculate_vif output equals the reference VIFs, descending."""
    # The reference values are stored unsorted; the default output is expected
    # to be sorted, so compare against them in descending order.
    descending_reference = sorted(self.unsorted_vifs, reverse=True)

    # The response variable must be dropped before computing VIFs.
    covariates = self.data.drop(columns='target')
    rounded_vifs = vif.calculate_vif(covariates)['VIF'].round(2).to_list()

    self.assertListEqual(rounded_vifs, descending_reference)
Exemplo n.º 3
0
  def test_calculate_vif_correct_results_inversion_method(self):
    """Inversion-based calculate_vif equals the reference VIFs, descending."""
    # The response variable must be dropped before computing VIFs.
    covariates = self.data.drop(columns='target')

    # The default output is expected to be sorted, hence the descending order.
    descending_reference = sorted(self.unsorted_vifs, reverse=True)

    vif_table = vif.calculate_vif(
        covariates, use_correlation_matrix_inversion=True)
    rounded_vifs = vif_table['VIF'].round(2).to_list()

    self.assertListEqual(rounded_vifs, descending_reference)
Exemplo n.º 4
0
    def address_collinearity_with_vif(
        self,
        vif_method: Union[str, VifMethod] = 'sequential',
        vif_threshold: int = 10,
        drop: bool = True,
        use_correlation_matrix_inversion: bool = True,
        raise_on_ill_conditioned: bool = True,
        min_absolute_corr: float = 0.4,
        handle_singular_data_errors_automatically: bool = True
    ) -> pd.DataFrame:
        """Uses VIF to identify columns that are collinear and optionally drop them.

        The 'vif_method' argument specifies the control flow for the variance
        inflation factor analysis. It can be either 'sequential',
        'interactive', or 'quick'.

        * sequential: Sequentially remove the column with the highest VIF
          value until all columns meet the vif_threshold.
        * interactive: Same as `sequential` but the user is prompted which
          column(s) to remove at each iteration.
        * quick: Remove all columns with VIF value greater than or equal to
          vif_threshold in a single pass.

        To remove problematic collinear features, the 'sequential' method
        performs the VIF analysis iteratively, removing the column with the
        highest VIF each time until all the columns meet the `vif_threshold`
        without user input.

        If you want to manually decide which column or columns to drop in each
        iteration, choose the 'interactive' method to be prompted to screen
        the columns to remove. To assist in choosing which column(s) to drop,
        a list of correlated features will be shown having at least a minimum
        correlation of `min_absolute_corr`.

        As an alternative to the iterative methods, a quicker solution is to
        remove all columns whose VIF value reaches `vif_threshold`. In this
        case the VIF analysis is performed only once. Note that removing
        multiple variables at once like this leads to removing variables that
        otherwise wouldn't be removed using the sequential approach.

        Args:
          vif_method: Specify the control flow for the analysis. It can be
            either 'sequential', 'quick', or 'interactive'.
          vif_threshold: Threshold to identify which columns have high
            collinearity. The analysis stops once the largest VIF is below
            this value.
          drop: If True, the selected columns are removed from self.data. If
            False, self.data is left untouched and a CollinearityWarning
            listing the selected columns is emitted instead.
          use_correlation_matrix_inversion: If True, uses correlation matrix
            inversion algorithm to optimize VIF calculations. If False, uses
            regression algorithm.
          raise_on_ill_conditioned: Whether to raise an exception if the
            correlation matrix is ill-conditioned. Only applies when
            use_correlation_matrix_inversion=True.
          min_absolute_corr: Minimum absolute correlation required to display
            a feature as "correlated" to another feature in interactive mode.
            Only applies when vif_method='interactive'. Should be between 0
            and 1, though this is not currently enforced.
          handle_singular_data_errors_automatically: If True, then
            SingularDataErrors and IllConditionedDataErrors from
            vif.calculate_vif() will be handled automatically by injecting
            artificial noise into the data and re-running. Note that the data
            with artificial noise is an intermediate product of this method
            (tmp_data) and the output of the method does not contain that
            artificial noise. The noise is random noise following a normal
            distribution, with standard deviation for each column defined by
            the standard deviation of the data in that column multiplied by
            the fraction fractional_noise_to_add_per_iteration (set to 1e-4).
            To avoid getting stuck in an infinite loop, only
            max_number_of_iterations (set to 1000) of the noise injection
            procedure are allowed; after this number of iterations if the
            correlation matrix is still singular, the method fails with a
            SingularDataError. This argument is only relevant when
            use_correlation_matrix_inversion=True.

        Returns:
          Data after collinearity check with vif has been applied. When
          drop=True, columns with high collinearity will not be present in the
          returned data.

        Raises:
          SingularDataError: Raised when use_correlation_matrix_inversion=True
            and the correlation matrix of self.data is singular or
            ill-conditioned. Also raised when
            use_correlation_matrix_inversion=True and
            handle_singular_data_errors_automatically=True, if the random
            noise injected into the data was not sufficient to resolve the
            problem.
          ValueError: Raised when vif_method is not one of the three expected
            values ('sequential', 'quick', or 'interactive').
        """
        vif_method = VifMethod(vif_method)

        covariates = self.data.drop(columns=self.target_column)
        columns_to_drop = []
        corr_matrix = covariates.corr()

        fractional_noise_to_add_per_iteration = 1.0e-4
        max_number_of_iterations = 1000

        # A single generator serves every retry; it is only needed when noise
        # injection is enabled.
        rng = (
            np.random.default_rng()
            if handle_singular_data_errors_automatically else None)

        while True:
            tmp_data = covariates.drop(columns=columns_to_drop)

            # Reuse the correlations computed up front and just strip the rows
            # and columns of the features already selected for removal.
            trimmed_corr_matrix = corr_matrix.drop(
                columns_to_drop, axis=0).drop(columns_to_drop, axis=1)
            corr_matrix_for_vif = trimmed_corr_matrix.copy()

            if handle_singular_data_errors_automatically:
                # Per-column noise scale, taken from the data before any noise
                # is added. A 1-D vector is enough: it broadcasts across the
                # rows when the noise is drawn.
                noise_scale = (
                    np.sqrt(tmp_data.var(ddof=0).to_numpy()) *
                    fractional_noise_to_add_per_iteration)

            for iteration_count in range(max_number_of_iterations):
                if iteration_count > 0:
                    # Noise was injected on the previous pass, so the cached
                    # correlations no longer describe tmp_data.
                    corr_matrix_for_vif = tmp_data.corr()

                try:
                    vif_data = vif.calculate_vif(
                        tmp_data,
                        sort=True,
                        use_correlation_matrix_inversion=(
                            use_correlation_matrix_inversion),
                        raise_on_ill_conditioned=raise_on_ill_conditioned,
                        corr_matrix=corr_matrix_for_vif)
                except (vif.SingularDataError,
                        vif.IllConditionedDataError) as error:
                    retries_left = (
                        iteration_count < max_number_of_iterations - 1)
                    if handle_singular_data_errors_automatically and retries_left:
                        tmp_data += rng.normal(
                            size=tmp_data.shape, scale=noise_scale)
                        continue

                    message = self._generate_vif_error_message(
                        trimmed_corr_matrix)
                    if handle_singular_data_errors_automatically:
                        # Reaching this point with auto-handling enabled means
                        # every retry has been used up.
                        message += (
                            ' Automatic attempt to resolve SingularDataError by '
                            'injecting artifical noise to the data has failed. This '
                            'probably means the dataset has too many features relative '
                            'to the number of samples.')
                    raise SingularDataError(message) from error
                else:
                    break

            if max(vif_data['VIF']) < vif_threshold:
                break

            if vif_method == VifMethod.INTERACTIVE:
                correlated_features = self._get_list_of_correlated_features(
                    vif_data, trimmed_corr_matrix, min_absolute_corr)
                vif_data['correlated_features'] = correlated_features
                selected_columns = _vif_interactive_input_and_validation(
                    vif_data)
            elif vif_method == VifMethod.SEQUENTIAL:
                # vif_data was requested sorted, so the first row is the one
                # picked for removal.
                selected_columns = [vif_data.iloc[0].features]
            else:
                vif_filter = vif_data['VIF'] >= vif_threshold
                selected_columns = vif_data['features'][vif_filter].tolist()

            columns_to_drop.extend(selected_columns)

            # 'quick' is a single pass; an empty selection (possible in
            # interactive mode) also ends the analysis.
            if (vif_method == VifMethod.QUICK) or not selected_columns:
                break

        if drop:
            self.data = self.data.drop(columns=columns_to_drop)
        else:
            message = (
                f'Consider removing the following columns due to collinearity: '
                f'{columns_to_drop}')
            warnings.warn(CollinearityWarning(message))

        self._checked_collinearity = True

        return self.data
  def address_collinearity_with_vif(self,
                                    vif_threshold: int = 10,
                                    sequential: bool = True,
                                    interactive: bool = False,
                                    drop: bool = True) -> pd.DataFrame:
    """Flags collinear covariates via VIF and optionally removes them.

    The `sequential` and `interactive` flags control how collinearity is
    resolved. By default the column with the highest VIF is selected, the VIF
    is re-calculated on the remaining columns, and this repeats until the
    threshold is met. With `sequential=False` the VIF is computed once and all
    columns at or above the threshold are selected together.
    With `interactive=True` you are prompted to choose the column(s) whenever
    the top VIF reaches the threshold, instead of the highest-VIF column being
    picked automatically. The prompt is used whether `sequential` is True or
    False.

    Args:
      vif_threshold: Threshold used to identify highly collinear columns. The
        analysis stops once the top VIF is below this value.
      sequential: Whether to re-calculate VIF after each selection (True) or
        to run a single pass only (False).
      interactive: Whether to manually specify which column(s) to remove.
      drop: If True, the selected columns are dropped from self.data. If
        False, a CollinearityWarning listing them is emitted instead.

    Returns:
      Data after collinearity check with vif has been applied. When drop=True
        columns with high collinearity will not be present in the returned data.
    """
    predictors = self.data.drop(columns=self.target_column)
    columns_to_drop = []

    while True:
      remaining = predictors.drop(columns=columns_to_drop)
      ranked = vif.calculate_vif(remaining, sort=True)
      ranked = ranked.reset_index(drop=True)

      # After the index reset, label 0 is the first row of the sorted result.
      if ranked.loc[0, 'VIF'] < vif_threshold:
        break

      if interactive:
        picked = _vif_interactive_input_and_validation(ranked)
      elif sequential:
        picked = [ranked.loc[0, 'features']]
      else:
        over_threshold = ranked['VIF'] >= vif_threshold
        picked = ranked.loc[over_threshold, 'features'].tolist()

      columns_to_drop.extend(picked)

      # Continue only in sequential mode and only if something was selected.
      if not (sequential and picked):
        break

    if not drop:
      message = (
          f'Consider removing the following columns due to collinearity: '
          f'{columns_to_drop}')
      warnings.warn(CollinearityWarning(message))
    else:
      self.data = self.data.drop(columns=columns_to_drop)

    self._checked_collinearity = True

    return self.data
Exemplo n.º 6
0
  def test_regression_method_doesnt_throw_singular_error_on_singular_data(self):
    """Regression path yields a non-empty result for the singular fixture."""
    singular_input = self.singular_correlation_matrix_df

    result = vif.calculate_vif(
        singular_input, use_correlation_matrix_inversion=False)

    self.assertNotEmpty(result)
Exemplo n.º 7
0
 def test_inversion_method_throws_singular_error_on_singular_data(self):
   """Inversion path raises vif.SingularDataError for the singular fixture."""
   self.assertRaises(
       vif.SingularDataError,
       vif.calculate_vif,
       self.singular_correlation_matrix_df,
       use_correlation_matrix_inversion=True)
Exemplo n.º 8
0
  def test_calculate_vif_sorted_flag_unsorted_results(self):
    """With sort=False the rounded VIFs equal the unsorted reference list."""
    covariates = self.data.drop(columns='target')

    vif_table = vif.calculate_vif(covariates, sort=False)
    rounded_vifs = vif_table['VIF'].round(2).to_list()

    self.assertListEqual(rounded_vifs, self.unsorted_vifs)
Exemplo n.º 9
0
 def test_inversion_method_throws_error_on_ill_conditioned_data(self):
   """Inversion path raises IllConditionedDataError when asked to raise."""
   self.assertRaises(
       vif.IllConditionedDataError,
       vif.calculate_vif,
       self.ill_conditioned_correlation_matrix_df,
       use_correlation_matrix_inversion=True,
       raise_on_ill_conditioned=True)
Exemplo n.º 10
0
 def test_inversion_method_throws_warning_on_ill_conditioned_data(self):
   """Inversion path warns instead of raising when raising is disabled."""
   self.assertWarns(
       vif.IllConditionedDataWarning,
       vif.calculate_vif,
       self.ill_conditioned_correlation_matrix_df,
       use_correlation_matrix_inversion=True,
       raise_on_ill_conditioned=False)