def get_absolute_scores(self) -> np.ndarray:
    """Return the absolute scores stored on the underlying implementor.

    Returns
    -------
    The ``abs_scores`` attribute of ``self._imp``.

    Notes
    -----
    ``check_true`` is defined outside this block; it is presumably what
    raises the supplied exception when ``self._is_initial_fit`` is falsy.
    """
    # Scores are only available after fit has been called
    not_fitted_error = Exception("Call fit before getting importances")
    check_true(self._is_initial_fit, not_fitted_error)

    implementor = self._imp
    return implementor.abs_scores
def transform(self, data: pd.DataFrame) -> pd.DataFrame:
    """Transform ``data`` by delegating to the underlying implementor.

    Parameters
    ----------
    data: pd.DataFrame
        Data handed unchanged to ``self._imp.transform``.

    Returns
    -------
    Whatever ``self._imp.transform(data)`` returns.

    Notes
    -----
    ``check_true`` is defined outside this block; it is presumably what
    raises the supplied exception when ``self._is_initial_fit`` is falsy.
    """
    # Transforming is only allowed after fit has been called
    not_fitted_error = Exception("Call fit before transform")
    check_true(self._is_initial_fit, not_fitted_error)

    # Hand the data over to the implementor
    transformed = self._imp.transform(data)
    return transformed
def _validate(self):
    """Validate ``self.threshold`` and ``self.method``.

    The threshold must be an int or float within [0, 1] and the method must
    be one of "pearson", "kendall" or "spearman". Checks run in that order,
    so the type check always precedes the range comparisons.

    ``check_true`` is defined outside this block; it is presumably what
    raises the supplied exception when a condition does not hold.
    """
    threshold = self.threshold

    # Type first, so the comparisons below are well defined
    check_true(isinstance(threshold, (int, float)),
               TypeError("Threshold must a non-negative number."))

    # Range: 0 <= threshold <= 1
    check_true(threshold >= 0,
               ValueError("Threshold must be greater or equal to zero."))
    check_true(threshold <= 1,
               ValueError("Threshold must be less or equal to one."))

    # Correlation method
    known_methods = ("pearson", "kendall", "spearman")
    check_true(self.method in known_methods,
               ValueError("Method of correlation can be pearson, kendall, or spearman."))
def _validate_fit(self, data, labels):
    """Validate the inputs of a fit call against the configured method.

    Parameters
    ----------
    data:
        Object exposing ``columns``; only ``len(data.columns)`` is used here.
    labels:
        Labels to check. They are only validated for supervised implementors
        other than the variance-inflation statistical method.

    Notes
    -----
    ``check_true`` is defined outside this block; it is presumably what
    raises the supplied exception when a condition does not hold.
    """
    selection_method = self.selection_method

    # VIF is a statistical method (hence a supervised implementor),
    # but it does not need labels
    is_vif = (isinstance(self._imp, _Statistical)
              and selection_method.method == "variance_inflation")

    # Supervised implementors, except VIF, require labels
    if not is_vif and isinstance(self._imp, _BaseSupervisedSelector):
        check_true(labels is not None,
                   ValueError("Labels column cannot be none"))
        check_true(isinstance(labels, pd.Series),
                   ValueError("Labels should be a pandas series/column."))

    # Only integer num_features is checked here; per the original comment,
    # the float (ratio) case is validated when the selection method is created
    if (hasattr(selection_method, 'num_features')
            and isinstance(selection_method.num_features, int)):
        requested = selection_method.num_features
        available = len(data.columns)

        # An integer num_features must not exceed the number of feature columns
        check_true(requested <= available,
                   ValueError("num_features cannot exceed size of feature columns "
                              + str(requested) + " vs. " + str(available)))
def _validate(self):
    """Validate ``self.num_features`` and ``self.method``.

    ``num_features`` must be an int or float greater than zero; a float is
    treated as a ratio and must additionally be at most 1. ``method`` must be
    one of "anova", "chi_square", "mutual_info", "maximal_info" or
    "variance_inflation".

    ``check_true`` is defined outside this block; it is presumably what
    raises the supplied exception when a condition does not hold.
    """
    # Type first, so the comparisons below are well defined
    check_true(isinstance(self.num_features, (int, float)),
               TypeError("Num features must a number."))
    check_true(self.num_features > 0,
               ValueError("Num features must be greater than zero."))

    # A float is a ratio of features and cannot exceed 1
    if isinstance(self.num_features, float):
        check_true(self.num_features <= 1,
                   ValueError("Num features ratio must be between [0..1]."))

    # The message lists every accepted method; it previously omitted
    # variance_inflation even though that value passes the check
    check_true(self.method in ["anova", "chi_square", "mutual_info",
                               "maximal_info", "variance_inflation"],
               ValueError("Statistical method can only be anova, chi_square, "
                          "mutual_info, maximal_info, or variance_inflation."))
def _validate(self):
    """Validate ``self.num_features`` and, when given, ``self.estimator``.

    ``num_features`` must be an int or float greater than zero; a float is
    treated as a ratio and must additionally be at most 1. An estimator of
    ``None`` is accepted as-is; otherwise it must be an instance of one of
    the tree-based estimator classes listed below.

    ``check_true`` is defined outside this block; it is presumably what
    raises the supplied exception when a condition does not hold.
    """
    num_features = self.num_features

    # Type first, so the comparisons below are well defined
    check_true(isinstance(num_features, (int, float)),
               TypeError("Num features must a number."))
    check_true(num_features > 0,
               ValueError("Num features must be greater than zero."))

    # A float is a ratio of features and cannot exceed 1
    if isinstance(num_features, float):
        check_true(num_features <= 1,
                   ValueError("Num features ratio must be between [0..1]."))

    # Nothing more to check when no estimator was supplied
    estimator = self.estimator
    if estimator is None:
        return

    supported_estimators = (
        RandomForestRegressor, RandomForestClassifier,
        XGBClassifier, XGBRegressor,
        ExtraTreesClassifier, ExtraTreesRegressor,
        LGBMClassifier, LGBMRegressor,
        GradientBoostingClassifier, GradientBoostingRegressor,
        AdaBoostClassifier, AdaBoostRegressor,
        CatBoostClassifier, CatBoostRegressor,
    )
    check_true(isinstance(estimator, supported_estimators),
               ValueError("Unknown tree-based estimator" + str(estimator)))
def _validate_args(seed, selection_method) -> None:
    """Validate the arguments for the constructor.

    Parameters
    ----------
    seed:
        Must be an int that is greater than or equal to zero.
    selection_method:
        Must be an instance of one of the ``SelectionMethod`` types listed
        below; its own ``_validate`` is invoked once the type check passes.

    Notes
    -----
    The return annotation is ``None``: the function returns normally when
    every check passes. It was previously ``NoReturn``, which tells type
    checkers that the call never returns.

    ``check_true`` is defined outside this block; it is presumably what
    raises the supplied exception when a condition does not hold.
    """
    # Seed
    check_true(isinstance(seed, int), TypeError("The seed must be an integer."))
    # NOTE: TypeError is kept for a negative seed (rather than ValueError)
    # so that existing callers catching it keep working
    check_true(seed >= 0, TypeError("The seed must be a non-negative integer."))

    # Selection method type
    check_true(isinstance(selection_method, (SelectionMethod.Correlation,
                                             SelectionMethod.Linear,
                                             SelectionMethod.TreeBased,
                                             SelectionMethod.Statistical,
                                             SelectionMethod.Variance)),
               TypeError("Unknown selection type: " + str(selection_method)))

    # Selection method value
    selection_method._validate()
def plot_importance(scores: pd.DataFrame,
                    columns: Optional[list] = None,
                    max_num_features: Optional[int] = None,
                    normalize: Optional[bool] = None,
                    ignore_constant: Optional[bool] = True,
                    **kwargs):
    """Plot feature selector scores.

    Parameters
    ----------
    scores: pd.DataFrame
        Data frame with scores for each feature (index) and method (columns).
        Each feature could have multiple rows from different
        cross-validation folds; rows sharing an index value are averaged.
    columns: list (default=None)
        List of methods (columns) to include in the plot.
        If None, all methods (columns) will be used.
    max_num_features: int or None, optional (default=None)
        Max number of top features displayed on plot.
        If None all features will be displayed.
    normalize: bool, optional (default=None)
        If truthy, scores are passed through ``normalize_columns`` (defined
        outside this block) so that, per the original documentation, scores
        sum to 1 for each column and are comparable between methods.
        None is treated as False.
    ignore_constant: bool, optional (default=True)
        Whether to ignore columns with the same score for all features.
    **kwargs
        Other parameters passed to ``sns.catplot``.

    Returns
    -------
    The object returned by ``sns.catplot`` (a ``FacetGrid`` according to the
    seaborn documentation), with the axis labels set to "feature"/"score".
    """
    check_true(isinstance(scores, pd.DataFrame),
               ValueError("Selector scores must be a data frame."))

    # Get columns to use
    if columns is None:
        columns = scores.columns

    # Work on a copy so the caller's frame is untouched; missing scores count as zero
    df = scores[columns].copy()
    df.fillna(0, inplace=True)

    # Group by feature to average the cross-validation results
    df = df.groupby(df.index).mean()

    # Get normalized scores such that scores for each method sum to 1
    if normalize:
        df = normalize_columns(df)

    # Drop methods with constant scores
    if ignore_constant:
        mask = ~np.isclose(np.var(df, axis=0), 0)
        df = df.loc[:, mask]

    # Set max_num_features to total number of features if None
    if max_num_features is None:
        max_num_features = len(df)

    # Calculate the mean score and sort in descending order
    mean_score = np.mean(df, axis=1)
    index = (-mean_score).argsort().values
    df = df.iloc[index, :]

    # Convert data to long format and plot.
    # The index is explicitly named "index" before reset_index: a scores frame
    # whose index already has a name would otherwise produce a column with
    # that name, and melt(id_vars="index") would fail.
    df = (df.head(max_num_features)
            .rename_axis("index")
            .reset_index()
            .melt(id_vars="index"))
    ax = sns.catplot(x="index", y="value", data=df, kind="bar",
                     color="darkgreen", **kwargs)
    ax.set_xlabels("feature")
    ax.set_ylabels("score")

    return ax
def calculate_statistics(scores: pd.DataFrame,
                         selected: pd.DataFrame,
                         columns: Optional[list] = None,
                         ignore_constant: Optional[bool] = True) -> pd.DataFrame:
    """
    Calculate statistics for each feature using scores/selections from list of methods.
    Returns data frame with calculated statistics for each feature.

    Parameters
    ----------
    scores: pd.DataFrame
        Data frame with scores for each feature (index) and selector (columns).
        Each feature could have multiple rows from different cross-validation folds.
    selected: pd.DataFrame
        Data frame with selection flag for each feature (index) and selector (columns).
        Each feature could have multiple rows from different cross-validation folds.
        Must have the same shape, index and columns as ``scores``.
    columns: list (default=None)
        List of methods (columns) to include in statistics.
        If None, all methods (columns) will be used.
    ignore_constant: bool, optional (default=True)
        Whether to ignore methods with the same score for all features.

    Returns
    -------
    Data frame indexed by feature with the columns "score_mean",
    "score_mean_norm", "selection_freq" and "selection_freq_norm",
    sorted by "score_mean_norm" in descending order.
    """
    # Both inputs must be data frames ...
    check_true(isinstance(scores, pd.DataFrame),
               ValueError("scores must be a data frame."))
    check_true(isinstance(selected, pd.DataFrame),
               ValueError("selection must be a data frame."))

    # ... that line up in shape, index and columns
    check_true(scores.shape == selected.shape,
               ValueError("Shapes of scores and selected data frames must match."))
    check_true(np.all(scores.index == selected.index),
               ValueError("Index of score and selection data frames must match."))
    check_true(np.all(scores.columns == selected.columns),
               ValueError("Columns of score and selection data frames must match."))

    # Fall back to all methods when no subset is requested
    if columns is None:
        columns = scores.columns

    # Copy the requested methods, then average rows per feature (CV results)
    score_means = scores[columns].copy()
    score_means = score_means.groupby(score_means.index).mean()
    selection_means = selected[columns].copy()
    selection_means = selection_means.groupby(selection_means.index).mean()

    # Drop methods whose averaged scores do not vary across features
    if ignore_constant:
        varying = ~np.isclose(np.var(score_means, axis=0), 0)
        score_means = score_means.loc[:, varying]
        selection_means = selection_means.loc[:, varying]

    # Per-feature statistics; normalize_columns is defined outside this block
    stats_df = pd.DataFrame(index=score_means.index)
    stats_df["score_mean"] = score_means.mean(axis=1)
    stats_df["score_mean_norm"] = normalize_columns(score_means).mean(axis=1)
    stats_df["selection_freq"] = selection_means.sum(axis=1)
    stats_df["selection_freq_norm"] = normalize_columns(selection_means).sum(axis=1)

    # Highest normalized mean score first
    return stats_df.sort_values(by="score_mean_norm", ascending=False)
def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation,
                                      SelectionMethod.Linear,
                                      SelectionMethod.TreeBased,
                                      SelectionMethod.Statistical,
                                      SelectionMethod.Variance]],
           data: pd.DataFrame,
           labels: Optional[pd.Series] = None,
           output_filename: Optional[str] = None,
           drop_zero_variance_features: Optional[bool] = True,
           verbose: bool = False) \
        -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Benchmark with a given set of feature selectors.
    Return a tuple of data frames with scores, selected features and runtime for each method.

    Parameters
    ----------
    selectors: Dict[str, SelectionMethod]
        Mapping from method name to the selection method to benchmark.
    data: pd.DataFrame
        Data handed to each selector's ``fit_transform``.
    labels: pd.Series, optional (default=None)
        Labels handed to each selector's ``fit_transform``.
    output_filename: str, optional (default=None)
        If not None, the file is opened in append mode and, per method, the
        runtime, the selection flags and the scores are written to it.
    drop_zero_variance_features: bool, optional (default=True)
        Whether to run ``Selective(SelectionMethod.Variance())`` on the data
        before benchmarking the selectors.
    verbose: bool, optional (default=False)
        Whether to print progress messages or not.

    Returns
    -------
    Tuple of data frames with scores, selected features and runtime for each method.
    A method that raises an ``Exception`` gets all-zero scores and selections
    and a runtime string suffixed with " (exception)".
    """
    check_true(selectors is not None, ValueError("Benchmark selectors cannot be none."))
    check_true(data is not None, ValueError("Benchmark data cannot be none."))

    # Output file
    output_file = None
    if output_filename is not None:
        output_file = open(output_filename, "a")

    # Everything that runs while the file is open sits inside try/finally
    # so the handle is closed even if a step raises
    try:
        # Drop features without any variance
        if drop_zero_variance_features:
            selector = Selective(SelectionMethod.Variance())
            data = selector.fit_transform(data, labels)

        method_to_runtime = {}
        score_df = pd.DataFrame(index=data.columns)
        selected_df = pd.DataFrame(index=data.columns)

        for method_name, method in selectors.items():
            selector = Selective(method)
            t0 = time()

            if verbose:
                print("\n>>> Running", method_name)

            try:
                subset = selector.fit_transform(data, labels)
                scores = selector.get_absolute_scores()
                selected = [1 if c in subset.columns else 0 for c in data.columns]
                method_to_runtime[method_name] = round((time() - t0) / 60, 2)
            except Exception as exp:
                # Deliberate best effort: a failing method must not abort the
                # benchmark; it is recorded with all-zero results instead
                print("Exception", exp)
                scores = np.repeat(0, len(data.columns))
                selected = np.repeat(0, len(data.columns))
                method_to_runtime[method_name] = str(round((time() - t0) / 60, 2)) + " (exception)"

            # Recorded after (not in a ``finally`` of) the try/except so an
            # interrupt such as KeyboardInterrupt propagates untouched instead
            # of being masked by a failure while recording partial results
            score_df[method_name] = scores
            selected_df[method_name] = selected

            if output_file is not None:
                output_file.write(method_name + " " + str(method_to_runtime[method_name]) + "\n")
                output_file.write(str(selected) + "\n")
                output_file.write(str(scores) + "\n")

            if verbose:
                print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")
    finally:
        if output_file is not None:
            output_file.close()

    # Format
    runtime_df = pd.Series(method_to_runtime).to_frame("runtime").rename_axis("method").reset_index()

    return score_df, selected_df, runtime_df
def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
                                         SelectionMethod.Linear,
                                         SelectionMethod.TreeBased,
                                         SelectionMethod.Statistical,
                                         SelectionMethod.Variance]],
              data: pd.DataFrame,
              labels: Optional[pd.Series] = None,
              cv: Optional[int] = None,
              output_filename: Optional[str] = None,
              drop_zero_variance_features: Optional[bool] = True,
              verbose: bool = False,
              seed: int = Constants.default_seed) \
        -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Benchmark with a given set of feature selectors.
    Return a tuple of data frames with scores, selected features and runtime for each method.

    Parameters
    ----------
    selectors: Dict[str, Union[SelectionMethod.Correlation,
                               SelectionMethod.Linear,
                               SelectionMethod.TreeBased,
                               SelectionMethod.Statistical,
                               SelectionMethod.Variance]]
        Dictionary of feature selection methods to benchmark.
    data: pd.DataFrame
        Data of shape (n_samples, n_features) used for feature selection.
    labels: pd.Series, optional (default=None)
        The target values (class labels in classification, real numbers in regression).
    cv: int, optional (default=None)
        Number of folds to use for cross-validation.
        If None, the benchmark runs once on the full data.
    output_filename: str, optional (default=None)
        If not None, benchmarking output is saved.
        If file exists, results are appended, otherwise file is created.
    drop_zero_variance_features: bool, optional (default=True)
        Whether to drop features with zero variance before running feature selector methods or not.
    verbose: bool, optional (default=False)
        Whether to print progress messages or not.
    seed: int, optional (default=Constants.default_seed)
        The random seed passed to ``KFold`` as ``random_state``.

    Returns
    -------
    Tuple of data frames with scores, selected features and runtime for each method.
    If cv is not None, the data frames will contain the concatenated results from each fold.
    """
    check_true(selectors is not None, ValueError("Benchmark selectors cannot be none."))
    check_true(data is not None, ValueError("Benchmark data cannot be none."))

    # Without cross-validation a single run on the full data is all that is needed
    if cv is None:
        return _bench(selectors=selectors,
                      data=data,
                      labels=labels,
                      output_filename=output_filename,
                      drop_zero_variance_features=drop_zero_variance_features,
                      verbose=verbose)

    # Create K-Fold object
    kf = KFold(n_splits=cv, shuffle=True, random_state=seed)

    # Initialize variables
    t0 = time()
    train_labels = None
    # Per-fold frames are collected and concatenated once at the end rather
    # than repeatedly concatenated onto an initially empty frame
    score_frames, selected_frames, runtime_frames = [], [], []

    # Split data into cv-folds and run _bench for each fold
    if verbose:
        print("\n>>> Running")
    for fold, (train_index, _) in enumerate(kf.split(data)):

        if verbose:
            print("\tFold", fold, "...")

        # Split data, labels into folds (only the training part is benchmarked)
        train_data = data.iloc[train_index]
        if labels is not None:
            train_labels = labels.iloc[train_index]

        # Run benchmark; per-method messages are suppressed inside folds
        score_cv_df, selected_cv_df, runtime_cv_df = _bench(
            selectors=selectors,
            data=train_data,
            labels=train_labels,
            output_filename=output_filename,
            drop_zero_variance_features=drop_zero_variance_features,
            verbose=False)

        score_frames.append(score_cv_df)
        selected_frames.append(selected_cv_df)
        runtime_frames.append(runtime_cv_df)

    # Concatenate the results of all folds
    score_df = pd.concat(score_frames)
    selected_df = pd.concat(selected_frames)
    runtime_df = pd.concat(runtime_frames)

    if verbose:
        print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")

    return score_df, selected_df, runtime_df
def _validate(self):
    """Validate ``self.threshold``: an int or float that is not negative.

    ``check_true`` is defined outside this block; it is presumably what
    raises the supplied exception when a condition does not hold.
    """
    threshold = self.threshold

    # Type first, so the comparison below is well defined
    is_number = isinstance(threshold, (int, float))
    check_true(is_number, TypeError("Threshold must a non-negative number."))

    # Lower bound
    check_true(threshold >= 0, ValueError("Threshold must be non-negative."))
def _validate(self):
    """Validate ``self.num_features``, ``self.regularization`` and ``self.alpha``.

    ``num_features`` must be an int or float greater than zero; a float is
    treated as a ratio and must additionally be at most 1. ``regularization``
    must be "none", "lasso" or "ridge". ``alpha`` must be an int or float
    that is not negative.

    ``check_true`` is defined outside this block; it is presumably what
    raises the supplied exception when a condition does not hold.
    """
    num_features = self.num_features

    # Number of features: type, then range
    check_true(isinstance(num_features, (int, float)),
               TypeError("Num features must a number."))
    check_true(num_features > 0,
               ValueError("Num features must be greater than zero."))

    # A float is a ratio of features and cannot exceed 1
    if isinstance(num_features, float):
        check_true(num_features <= 1,
                   ValueError("Num features ratio must be between [0..1]."))

    # Regularization type
    known_regularizations = ("none", "lasso", "ridge")
    check_true(self.regularization in known_regularizations,
               ValueError("Regularization can only be none, lasso, or ridge."))

    # Alpha: type, then range
    alpha = self.alpha
    check_true(isinstance(alpha, (int, float)),
               TypeError("Alpha must a number."))
    check_true(alpha >= 0, ValueError("Alpha cannot be negative"))
def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
                                         SelectionMethod.Linear,
                                         SelectionMethod.TreeBased,
                                         SelectionMethod.Statistical,
                                         SelectionMethod.Variance]],
              data: pd.DataFrame,
              labels: Optional[pd.Series] = None,
              output_filename: Optional[str] = None,
              drop_zero_variance_features: Optional[bool] = True,
              verbose: bool = False) \
        -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Benchmark with a given set of feature selectors.
    Return a tuple of data frames with scores, selected features and runtime for each method.

    Parameters
    ----------
    selectors: Dict[str, Union[SelectionMethod.Correlation,
                               SelectionMethod.Linear,
                               SelectionMethod.TreeBased,
                               SelectionMethod.Statistical,
                               SelectionMethod.Variance]]
        Dictionary of feature selection methods to benchmark.
    data: pd.DataFrame
        Data of shape (n_samples, n_features) used for feature selection.
    labels: pd.Series, optional (default=None)
        The target values (class labels in classification, real numbers in regression).
    output_filename: str, optional (default=None)
        If not None, benchmarking output is saved.
        If file exists, results are appended, otherwise file is created.
    drop_zero_variance_features: bool, optional (default=True)
        Whether to drop features with zero variance before running feature selector methods or not.
    verbose: bool, optional (default=False)
        Whether to print progress messages or not.

    Returns
    -------
    Tuple of data frames with scores, selected features and runtime for each method.
    A method that raises an ``Exception`` gets all-zero scores and selections
    and a runtime string suffixed with " (exception)".
    """
    check_true(selectors is not None, ValueError("Benchmark selectors cannot be none."))
    check_true(data is not None, ValueError("Benchmark data cannot be none."))

    # Output file
    output_file = None
    if output_filename is not None:
        output_file = open(output_filename, "a")

    # Everything that runs while the file is open sits inside try/finally
    # so the handle is closed even if a step raises
    try:
        # Drop features without any variance
        if drop_zero_variance_features:
            selector = Selective(SelectionMethod.Variance())
            data = selector.fit_transform(data, labels)

        method_to_runtime = {}
        score_df = pd.DataFrame(index=data.columns)
        selected_df = pd.DataFrame(index=data.columns)

        for method_name, method in selectors.items():
            selector = Selective(method)
            t0 = time()

            if verbose:
                print("\n>>> Running", method_name)

            try:
                subset = selector.fit_transform(data, labels)
                scores = selector.get_absolute_scores()
                selected = [1 if c in subset.columns else 0 for c in data.columns]
                method_to_runtime[method_name] = round((time() - t0) / 60, 2)
            except Exception as exp:
                # Deliberate best effort: a failing method must not abort the
                # benchmark; it is recorded with all-zero results instead
                print("Exception", exp)
                scores = np.repeat(0, len(data.columns))
                selected = np.repeat(0, len(data.columns))
                method_to_runtime[method_name] = str(round((time() - t0) / 60, 2)) + " (exception)"

            # Recorded after (not in a ``finally`` of) the try/except so an
            # interrupt such as KeyboardInterrupt propagates untouched instead
            # of being masked by a failure while recording partial results
            score_df[method_name] = scores
            selected_df[method_name] = selected

            if output_file is not None:
                output_file.write(method_name + " " + str(method_to_runtime[method_name]) + "\n")
                output_file.write(str(selected) + "\n")
                output_file.write(str(scores) + "\n")

            if verbose:
                print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")
    finally:
        if output_file is not None:
            output_file.close()

    # Convert to series
    runtime_df = pd.Series(method_to_runtime).to_frame("runtime").rename_axis("method").reset_index()

    return score_df, selected_df, runtime_df