Exemplo n.º 1
0
def stop_by_num_features_parallel(logs: ListLogListType,
                                  extractor: ExtractorFnType,
                                  metric_name: str,
                                  min_num_features: int = 50) -> bool:
    """
    Decides whether parallel feature selection should halt, based on the
    feature count of the best-performing log in the most recent iteration.

    Parameters
    ----------
    logs : list of list of list of dict
        A list of log-like lists of dictionaries evaluations.

    extractor : function str -> float
        An extractor that takes a string and returns the value of that
        string on a dict.

    metric_name : str
        Name of the column that refers to the metric to be extracted.

    min_num_features : int (default 50)
        The minimum number of features the model can have before stopping.

    Returns
    -------
    stop : bool
        Whether to stop the recursion.
    """

    # Pick the strongest log from the latest batch, then delegate the
    # feature-count check to the single-log stopping criterion.
    top_log = get_best_performing_log(first(logs), extractor, metric_name)
    return stop_by_num_features([top_log], min_num_features)
Exemplo n.º 2
0
def stop_by_no_improvement_parallel(logs: ListLogListType,
                                    extractor: ExtractorFnType,
                                    metric_name: str,
                                    early_stop: int = 3,
                                    threshold: float = 0.001) -> bool:
    """
    Checks the logs to decide whether feature selection should stop
    because no meaningful improvement was observed.

    Parameters
    ----------
    logs : list of list of dict
        A list of log-like lists of dictionaries evaluations.

    extractor : function str -> float
        An extractor that takes a string and returns the value of that
        string on a dict.

    metric_name : str
        Name of the column that refers to the metric to be extracted.

    early_stop : int (default 3)
        Number of iterations without improvements before stopping.

    threshold : float (default 0.001)
        Threshold for model performance comparison.

    Returns
    -------
    stop : bool
        Whether to stop the recursion.
    """

    # Not enough history yet to judge stagnation.
    if len(logs) < early_stop:
        return False

    # Reduce each iteration's batch of logs to its best performer.
    best_logs = [
        get_best_performing_log(log_group, extractor, metric_name)
        for log_group in logs
    ]

    # Only the first `early_stop` iterations form the comparison window.
    window = list(take(early_stop, best_logs))
    latest_metric = get_avg_metric_from_extractor(window[-1], extractor,
                                                  metric_name)

    def within_threshold(log) -> bool:
        # True when the newest metric did not beat this older one by more
        # than `threshold`.
        previous_metric = get_avg_metric_from_extractor(log, extractor,
                                                        metric_name)
        return (latest_metric - previous_metric) <= threshold

    return all(within_threshold(log) for log in window[:-1])
Exemplo n.º 3
0
def remove_features_subsets(
        log_list: LogListType,
        extractor: ExtractorFnType,
        metric_name: str,
        num_removed_by_step: int = 1) -> List[Tuple[str, ...]]:
    """
        Performs feature selection based on the best performing model out
        of several trained models.

        Parameters
        ----------
        log_list : list of dict
            A list of log-like lists of dictionaries evaluations.

        extractor : function string -> float
            An extractor that takes a string and returns the value of that
            string on a dict.

        metric_name : str
            Name of the column that refers to the metric to be extracted.

        num_removed_by_step : int (default 1)
            The number of features to remove.

        Returns
        ----------
        keys : list of tuples of str
            The remaining feature subsets after shrinking the current
            best subset.

    """

    # Locate the best log and pull out the feature subset it was trained on.
    winner = get_best_performing_log(log_list, extractor, metric_name)
    winning_subset: List[str] = first(gen_dict_extract('used_subsets', winner))

    # Enumerate every subset that drops `num_removed_by_step` features
    # from the winning subset.
    target_size = len(winning_subset) - num_removed_by_step
    return list(combinations(winning_subset, target_size))
Exemplo n.º 4
0
def test_get_best_performing_log(logs, base_extractor, metric_name):
    # The fixture is ordered so the best performing log comes first.
    best = get_best_performing_log(logs, base_extractor, metric_name)
    assert best == logs[0]