def test_backward_subset_feature_selection(train_df, train_fn, eval_fn, split_fn, base_extractor, metric_name):
    features_sets = {"first": ["x1", "x2"],
                     "second": ["x4", "x5"],
                     "third": ["x3", "x6"]}

    logs = backward_subset_feature_selection(train_df, train_fn, features_sets, split_fn, eval_fn, base_extractor,
                                             metric_name, num_removed_by_step=1, threshold=-1, early_stop=10,
                                             iter_limit=50, min_remaining_features=5)

    assert len(get_used_features(first(logs)[0])) <= 5  # Assert stop by remaining features

    logs = backward_subset_feature_selection(train_df, train_fn, features_sets, split_fn, eval_fn, base_extractor,
                                             metric_name, num_removed_by_step=1, threshold=0, early_stop=10,
                                             iter_limit=1, min_remaining_features=3)

    assert len(logs) == 1  # Assert stop by iter limit

    logs = backward_subset_feature_selection(train_df, train_fn, features_sets, split_fn, eval_fn, base_extractor,
                                             metric_name, num_removed_by_step=1, threshold=1, early_stop=2,
                                             iter_limit=50, min_remaining_features=1)

    assert len(logs) == 2  # Assert stop by early_stop
def test_poor_man_boruta_selection(train_df, holdout_df, train_fn, eval_fn, base_extractor, metric_name):
    features = ["x1", "x2", "x3", "x4", "x5", "x6"]

    logs = poor_man_boruta_selection(train_df, holdout_df, train_fn, features, eval_fn, base_extractor, metric_name,
                                     max_removed_by_step=1, threshold=0, early_stop=10, iter_limit=50,
                                     min_remaining_features=5)

    assert len(get_used_features(first(logs))) <= 6  # Assert stop by remaining features

    logs = poor_man_boruta_selection(train_df, holdout_df, train_fn, features, eval_fn, base_extractor, metric_name,
                                     max_removed_by_step=1, threshold=0, early_stop=10, iter_limit=1,
                                     min_remaining_features=3)

    assert len(logs) == 1  # Assert stop by iter limit

    logs = poor_man_boruta_selection(train_df, holdout_df, train_fn, features, eval_fn, base_extractor, metric_name,
                                     max_removed_by_step=1, threshold=1, early_stop=2, iter_limit=50,
                                     min_remaining_features=1)

    assert len(logs) == 2  # Assert stop by early_stop
def test_feature_importance_backward_selection(train_df, train_fn, eval_fn, split_fn, base_extractor, metric_name):
    features = ["x1", "x2", "x3", "x4", "x5", "x6"]

    logs = feature_importance_backward_selection(train_df, train_fn, features, split_fn, eval_fn, base_extractor,
                                                 metric_name, num_removed_by_step=1, threshold=0, early_stop=10,
                                                 iter_limit=50, min_remaining_features=5)

    assert len(get_used_features(first(logs))) <= 5  # Assert stop by remaining features

    logs = feature_importance_backward_selection(train_df, train_fn, features, split_fn, eval_fn, base_extractor,
                                                 metric_name, num_removed_by_step=1, threshold=0, early_stop=10,
                                                 iter_limit=1, min_remaining_features=3)

    assert len(logs) == 1  # Assert stop by iter limit

    logs = feature_importance_backward_selection(train_df, train_fn, features, split_fn, eval_fn, base_extractor,
                                                 metric_name, num_removed_by_step=1, threshold=1, early_stop=2,
                                                 iter_limit=50, min_remaining_features=1)

    assert len(logs) == 2  # Assert stop by early_stop
def stop_by_num_features(logs: ListLogListType, min_num_features: int = 50) -> bool:
    """
    Checks the logs to see if feature selection should stop.

    Parameters
    ----------
    logs : list of list of dict
        A list of log-like lists of evaluation dictionaries.

    min_num_features: int (default 50)
        The minimum number of features the model can have before stopping

    Returns
    -------
    stop: bool
        A boolean indicating whether to stop the recursion or not
    """
    return len(get_used_features(first(logs))) <= min_num_features
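# A minimal sketch (not part of the library's API) of how per-criterion stoppers
# such as stop_by_num_features can be OR-combined into the single stop predicate
# that the selection recursion checks after each iteration. `any_stop` and
# `some_other_stopper` are illustrative names, and the composition shown below is
# only an assumption about usage, not the library's own combinator.
from functools import partial


def any_stop(*stop_fns):
    """Stop as soon as any individual criterion asks to stop."""
    def stop(logs) -> bool:
        return any(stop_fn(logs) for stop_fn in stop_fns)
    return stop


# Hypothetical usage: stop once 10 or fewer features remain,
# or when some other criterion (e.g. an iteration limit) triggers.
# stop_fn = any_stop(partial(stop_by_num_features, min_num_features=10),
#                    some_other_stopper)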
def test_get_used_features(logs):
    result = get_used_features(logs[0])

    assert result == ['x1', 'x2', 'x4', 'x5', 'x3', 'x6']
def remove_by_feature_shuffling(log: LogType,
                                predict_fn: PredictFnType,
                                eval_fn: EvalFnType,
                                eval_data: pd.DataFrame,
                                extractor: ExtractorFnType,
                                metric_name: str,
                                max_removed_by_step: int = 50,
                                threshold: float = 0.005,
                                speed_up_by_importance: bool = False,
                                parallel: bool = False,
                                nthread: int = 1,
                                seed: int = 7) -> List[str]:
    """
    Performs feature selection based on the evaluation of the test data vs the
    evaluation of the test data with randomly shuffled features.

    Parameters
    ----------
    log : LogType
        A log-like dictionary of evaluations.

    predict_fn: function pandas.DataFrame -> pandas.DataFrame
        A partially defined predictor that takes a DataFrame and returns the
        predicted score for this dataframe

    eval_fn : function DataFrame -> log dict
        A partially defined evaluation function that takes a dataset with predictions
        and returns the evaluation logs.

    eval_data: pandas.DataFrame
        Data used to evaluate the model after shuffling

    extractor: function str -> float
        An extractor that takes a string and returns the value of that string on a dict

    metric_name: str
        String with the name of the column that refers to the metric column to be extracted

    max_removed_by_step: int (default 50)
        The maximum number of features to remove. It will only consider the least
        max_removed_by_step features in terms of feature importance. If
        speed_up_by_importance=True it will first filter the least relevant features
        and shuffle only those. If speed_up_by_importance=False it will shuffle all
        features and drop the last max_removed_by_step in terms of PIMP. In both
        cases, a feature will only be removed if the drop in performance stays within
        the defined threshold.

    threshold: float (default 0.005)
        Threshold for model performance comparison

    speed_up_by_importance: bool (default False)
        If it should narrow the search by looking at feature importance first, before
        computing the PIMP importance. If True, it will only shuffle the
        max_removed_by_step least important features according to feature importance.

    parallel: bool (default False)
        Whether to shuffle and evaluate the features in parallel (joblib threading backend)

    nthread: int (default 1)
        Number of threads used when parallel=True

    seed: int (default 7)
        Random seed

    Returns
    -------
    features: list of str
        The features that can be removed, i.e. those whose shuffling changed the
        metric by less than the defined threshold
    """
    random.seed(seed)

    curr_metric = get_avg_metric_from_extractor(log, extractor, metric_name)
    eval_size = eval_data.shape[0]

    features_to_shuffle = order_feature_importance_avg_from_logs(log)[-max_removed_by_step:] \
        if speed_up_by_importance else get_used_features(log)

    def shuffle(feature: str) -> pd.DataFrame:
        # .values breaks pandas index alignment, which would otherwise put the sampled
        # values back in their original positions and silently undo the shuffle
        return eval_data.assign(**{feature: eval_data[feature].sample(frac=1.0).values})

    # For one feature: shuffle it, predict, evaluate, and compute the drop in the metric
    feature_to_delta_metric = compose(lambda m: curr_metric - m,
                                      get_avg_metric_from_extractor(extractor=extractor, metric_name=metric_name),
                                      gen_validator_log(fold_num=0, test_size=eval_size),
                                      eval_fn, predict_fn, shuffle)

    if parallel:
        metrics = Parallel(n_jobs=nthread, backend="threading")(
            delayed(feature_to_delta_metric)(feature) for feature in features_to_shuffle)
        feature_to_delta_metric = dict(zip(features_to_shuffle, metrics))
        gc.collect()

    else:
        feature_to_delta_metric = {feature: feature_to_delta_metric(feature)
                                   for feature in features_to_shuffle}

    # Keep only the features whose shuffling cost less than the threshold,
    # least harmful first, capped at max_removed_by_step
    return pipe(feature_to_delta_metric,
                valfilter(lambda delta_metric: delta_metric < threshold),
                sorted(key=lambda f: feature_to_delta_metric.get(f)),
                take(max_removed_by_step),
                list)
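# A minimal, self-contained sketch of the permutation idea implemented above,
# decoupled from the library's log/extractor machinery: shuffle one feature at a
# time in a held-out frame, re-score the already-trained model, and flag as removal
# candidates the features whose score drop stays below the threshold. The model and
# metric here (scikit-learn linear regression, R^2) are toy stand-ins for
# predict_fn and eval_fn, not the library's API.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(7)
train = pd.DataFrame({"x1": rng.normal(size=1000), "x2": rng.normal(size=1000)})
train["y"] = 2.0 * train["x1"] + rng.normal(scale=0.1, size=1000)  # x2 carries no signal
holdout = train.sample(frac=0.3, random_state=7)

model = LinearRegression().fit(train[["x1", "x2"]], train["y"])
baseline = model.score(holdout[["x1", "x2"]], holdout["y"])  # R^2 on the holdout

threshold = 0.005
removal_candidates = []
for feature in ["x1", "x2"]:
    # again, .values avoids index alignment undoing the shuffle
    shuffled = holdout.assign(**{feature: holdout[feature].sample(frac=1.0).values})
    delta = baseline - model.score(shuffled[["x1", "x2"]], shuffled["y"])
    if delta < threshold:  # shuffling barely hurt the model -> removal candidate
        removal_candidates.append(feature)

print(removal_candidates)  # expected: ['x2']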