def test_regression_method_doesnt_throw_errors_on_ill_conditioned_data(self):
    """Regression-based VIF returns a result for the ill-conditioned fixture.

    `raise_on_ill_conditioned=True` is passed on purpose: with
    `use_correlation_matrix_inversion=False` the call is still expected to
    complete and yield a non-empty result.
    """
    ill_conditioned_input = self.ill_conditioned_correlation_matrix_df
    result = vif.calculate_vif(
        ill_conditioned_input,
        use_correlation_matrix_inversion=False,
        raise_on_ill_conditioned=True,
    )
    self.assertNotEmpty(result)
def test_calculate_vif_correct_results(self):
    """Default `calculate_vif` output matches the expected VIFs, descending."""
    # The call below relies on the default ordering, so the reference values
    # are arranged from largest to smallest before comparing.
    expected = list(self.unsorted_vifs)
    expected.sort(reverse=True)

    # The response variable must not take part in the VIF computation.
    covariates = self.data.drop(columns='target')
    vif_table = vif.calculate_vif(covariates)
    calculated_results = vif_table['VIF'].round(2).to_list()

    self.assertListEqual(calculated_results, expected)
def test_calculate_vif_correct_results_inversion_method(self):
    """The correlation-matrix-inversion path yields the expected VIFs."""
    # The response variable must not take part in the VIF computation.
    covariates = self.data.drop(columns='target')
    vif_table = vif.calculate_vif(
        covariates, use_correlation_matrix_inversion=True)
    calculated_results = vif_table['VIF'].round(2).to_list()

    # Results are compared against the reference values in descending order.
    expected = list(self.unsorted_vifs)
    expected.sort(reverse=True)

    self.assertListEqual(calculated_results, expected)
def address_collinearity_with_vif(
        self,
        vif_method: Union[str, VifMethod] = 'sequential',
        vif_threshold: int = 10,
        drop: bool = True,
        use_correlation_matrix_inversion: bool = True,
        raise_on_ill_conditioned: bool = True,
        min_absolute_corr: float = 0.4,
        handle_singular_data_errors_automatically: bool = True
) -> pd.DataFrame:
    """Uses VIF to identify columns that are collinear and optionally drop them.

    The `vif_method` argument specifies the control flow for the variance
    inflation factor analysis. It can be either 'sequential', 'interactive',
    or 'quick'.

    * sequential: Repeatedly remove the column with the highest VIF value
      until the highest remaining VIF is below `vif_threshold`. No user input
      is needed.
    * interactive: Same as `sequential`, but the user is prompted for which
      column(s) to remove at each iteration. To assist the choice, the
      features correlated with each column (absolute correlation of at least
      `min_absolute_corr`) are attached to the VIF table that is shown.
    * quick: Run the VIF analysis once and remove every column whose VIF is
      greater than or equal to `vif_threshold`. Removing several variables at
      once like this can remove variables that the sequential approach would
      have kept.

    Args:
      vif_method: Control flow for the analysis: 'sequential', 'quick', or
        'interactive'.
      vif_threshold: Columns whose VIF is greater than or equal to this value
        are treated as collinear; the analysis stops once the highest VIF is
        strictly below it.
      drop: If True, the selected columns are dropped from `self.data`. If
        False, the data is left untouched and a CollinearityWarning listing
        the selected columns is issued instead.
      use_correlation_matrix_inversion: If True, uses correlation matrix
        inversion algorithm to optimize VIF calculations. If False, uses
        regression algorithm.
      raise_on_ill_conditioned: Whether to raise an exception if the
        correlation matrix is ill-conditioned. Only applies when
        use_correlation_matrix_inversion=True.
      min_absolute_corr: Minimum absolute correlation required to display a
        feature as "correlated" to another feature in interactive mode. Only
        applies when vif_method='interactive'. Should be between 0 and 1,
        though this is not currently enforced.
      handle_singular_data_errors_automatically: If True, SingularDataError
        and IllConditionedDataError from vif.calculate_vif() are handled by
        injecting artificial noise into a working copy of the data and
        re-running. The noise only affects that intermediate copy; the data
        returned by this method does not contain it. The noise is drawn from
        a normal distribution whose standard deviation, per column, is the
        standard deviation of that column multiplied by 1e-4. At most 1000
        VIF attempts are made per analysis pass; if the last one still fails,
        a SingularDataError is raised. This argument is only relevant when
        use_correlation_matrix_inversion=True.

    Returns:
      `self.data` after the collinearity check. When drop=True, the columns
      selected for removal are no longer present in it.

    Raises:
      SingularDataError: Raised when vif.calculate_vif() reports singular or
        ill-conditioned data and either automatic handling is disabled or the
        injected noise did not resolve the problem.
      ValueError: Raised when vif_method is not one of the three expected
        values ('sequential', 'quick', or 'interactive').
    """
    vif_method = VifMethod(vif_method)
    covariates = self.data.drop(columns=self.target_column)
    columns_to_drop = []
    corr_matrix = covariates.corr()
    fractional_noise_to_add_per_iteration = 1.0e-4
    max_number_of_iterations = 1000

    while True:
        tmp_data = covariates.drop(columns=columns_to_drop)
        trimmed_corr_matrix = corr_matrix.drop(
            columns_to_drop, axis=0).drop(columns_to_drop, axis=1)
        # calculate_vif gets its own copy so that trimmed_corr_matrix stays
        # pristine for the error message and the interactive display.
        corr_matrix_for_vif = trimmed_corr_matrix.copy()

        if handle_singular_data_errors_automatically:
            rng = np.random.default_rng()
            # One noise scale per column; NumPy broadcasts it across rows, so
            # there is no need to tile it into a full-size frame.
            noise_scale = (
                np.sqrt(tmp_data.var(ddof=0).to_numpy())
                * fractional_noise_to_add_per_iteration)

        for iteration_count in range(max_number_of_iterations):
            if iteration_count > 0:
                # Noise was injected on the previous attempt, so the
                # correlation matrix has to be rebuilt from the noisy data.
                corr_matrix_for_vif = tmp_data.corr()
            try:
                vif_data = vif.calculate_vif(
                    tmp_data,
                    sort=True,
                    use_correlation_matrix_inversion=use_correlation_matrix_inversion,
                    raise_on_ill_conditioned=raise_on_ill_conditioned,
                    corr_matrix=corr_matrix_for_vif)
            except (vif.SingularDataError,
                    vif.IllConditionedDataError) as error:
                is_last_attempt = (
                    iteration_count == max_number_of_iterations - 1)
                if (handle_singular_data_errors_automatically
                        and not is_last_attempt):
                    noise = rng.normal(size=tmp_data.shape, scale=noise_scale)
                    tmp_data += noise
                    continue

                message = self._generate_vif_error_message(trimmed_corr_matrix)
                if handle_singular_data_errors_automatically:
                    # Only reached once every allowed attempt has failed.
                    message += (
                        ' Automatic attempt to resolve SingularDataError by '
                        'injecting artifical noise to the data has failed. This '
                        'probably means the dataset has too many features relative '
                        'to the number of samples.')
                raise SingularDataError(message) from error
            else:
                break

        if max(vif_data['VIF']) < vif_threshold:
            break

        if vif_method == VifMethod.INTERACTIVE:
            correlated_features = self._get_list_of_correlated_features(
                vif_data, trimmed_corr_matrix, min_absolute_corr)
            vif_data['correlated_features'] = correlated_features
            selected_columns = _vif_interactive_input_and_validation(vif_data)
        elif vif_method == VifMethod.SEQUENTIAL:
            # vif_data is sorted, so the first row holds the highest VIF.
            selected_columns = [vif_data.iloc[0].features]
        else:
            vif_filter = vif_data['VIF'] >= vif_threshold
            selected_columns = vif_data['features'][vif_filter].tolist()

        columns_to_drop.extend(selected_columns)
        # Quick mode is single-pass; an empty selection means there is
        # nothing further to remove.
        if (vif_method == VifMethod.QUICK) or not selected_columns:
            break

    if drop:
        self.data = self.data.drop(columns=columns_to_drop)
    else:
        message = (
            f'Consider removing the following columns due to collinearity: '
            f'{columns_to_drop}')
        warnings.warn(CollinearityWarning(message))

    self._checked_collinearity = True
    return self.data
def address_collinearity_with_vif(self,
                                  vif_threshold: int = 10,
                                  sequential: bool = True,
                                  interactive: bool = False,
                                  drop: bool = True) -> pd.DataFrame:
    """Uses VIF to identify collinear columns and optionally drops them.

    By default the VIF scores are re-calculated every time the column with
    the highest score is removed, until the highest remaining score is below
    `vif_threshold`. Set `sequential=False` to instead select, in a single
    pass, every column whose score is at or above the threshold. Set
    `interactive=True` to choose the column(s) to remove yourself rather than
    having the highest-scoring column picked automatically; the prompt is
    shown on each pass that finds a score at or above the threshold, whether
    `sequential` is True or False.

    Args:
      vif_threshold: Threshold used to identify columns with high
        collinearity. Processing stops once the highest VIF is below it.
      sequential: Whether to re-calculate VIF after each removal (True) or to
        run the analysis only once (False).
      interactive: Whether to manually specify which column(s) to remove.
      drop: If True, the selected columns are dropped from `self.data`. If
        False, the data is left untouched and a CollinearityWarning listing
        the selected columns is issued instead.

    Returns:
      `self.data` after the collinearity check. When drop=True, the columns
      selected for removal are no longer present in it.
    """
    features = self.data.drop(columns=self.target_column)
    flagged_columns = []

    keep_going = True
    while keep_going:
        remaining = features.drop(columns=flagged_columns)
        vif_table = vif.calculate_vif(remaining, sort=True)
        vif_table = vif_table.reset_index(drop=True)

        # The table is sorted, so row 0 carries the highest VIF.
        if vif_table['VIF'][0] < vif_threshold:
            break

        if interactive:
            picked = _vif_interactive_input_and_validation(vif_table)
        elif sequential:
            picked = [vif_table['features'][0]]
        else:
            over_threshold = vif_table['VIF'] >= vif_threshold
            picked = vif_table['features'][over_threshold].tolist()

        flagged_columns.extend(picked)
        # Another pass only makes sense in sequential mode and only if
        # something was actually selected this time.
        keep_going = sequential and bool(picked)

    if not drop:
        warnings.warn(CollinearityWarning(
            'Consider removing the following columns due to collinearity: '
            f'{flagged_columns}'))
    else:
        self.data = self.data.drop(columns=flagged_columns)

    self._checked_collinearity = True
    return self.data
def test_regression_method_doesnt_throw_singular_error_on_singular_data(self):
    """Regression-based VIF returns a result for the singular fixture."""
    singular_input = self.singular_correlation_matrix_df
    result = vif.calculate_vif(
        singular_input, use_correlation_matrix_inversion=False)
    self.assertNotEmpty(result)
def test_inversion_method_throws_singular_error_on_singular_data(self):
    """The inversion path raises SingularDataError for the singular fixture."""
    singular_input = self.singular_correlation_matrix_df
    expected_error = vif.SingularDataError
    with self.assertRaises(expected_error):
        vif.calculate_vif(
            singular_input, use_correlation_matrix_inversion=True)
def test_calculate_vif_sorted_flag_unsorted_results(self):
    """With `sort=False` the VIFs match `self.unsorted_vifs` element-wise."""
    # The response variable is excluded before computing VIFs.
    covariates = self.data.drop(columns='target')
    vif_table = vif.calculate_vif(covariates, sort=False)
    rounded_vifs = vif_table['VIF'].round(2)
    self.assertListEqual(rounded_vifs.to_list(), self.unsorted_vifs)
def test_inversion_method_throws_error_on_ill_conditioned_data(self):
    """The inversion path raises on ill-conditioned data when asked to.

    With `raise_on_ill_conditioned=True`, an IllConditionedDataError is
    expected for the ill-conditioned fixture.
    """
    ill_conditioned_input = self.ill_conditioned_correlation_matrix_df
    expected_error = vif.IllConditionedDataError
    with self.assertRaises(expected_error):
        vif.calculate_vif(
            ill_conditioned_input,
            use_correlation_matrix_inversion=True,
            raise_on_ill_conditioned=True,
        )
def test_inversion_method_throws_warning_on_ill_conditioned_data(self):
    """The inversion path warns instead of raising when raising is disabled.

    With `raise_on_ill_conditioned=False`, an IllConditionedDataWarning is
    expected for the ill-conditioned fixture.
    """
    ill_conditioned_input = self.ill_conditioned_correlation_matrix_df
    expected_warning = vif.IllConditionedDataWarning
    with self.assertWarns(expected_warning):
        vif.calculate_vif(
            ill_conditioned_input,
            use_correlation_matrix_inversion=True,
            raise_on_ill_conditioned=False,
        )