Example No. 1
def binary_pfd(data, *args, **kwargs):
    """
    Uses feature permutation and a binary search guided by feature importances
    to automatically find minimal PFDs.

    In AutoGluon, higher performance metrics are always better. This means
    that the MSE is reported as a negative value, so don't despair!
    """
    logger = logging.getLogger('pfd')
    logger.debug(f"Start linear search of PFDs for dataset {data.title}")
    df_train, df_validate, df_test = helps.load_splits(data)

    df_imp, measured_performance, metric, rhs = global_predictor_explained(
        data, df_train, df_validate, df_test)

    include_cols = list(df_train.columns)
    lhs = [c for c in include_cols if c != rhs]

    logger.info(f"Trained a predictor with {metric} "
                f"{measured_performance}.")
    print("These are the feature importances found via "
          "feature permutation:")
    print(df_imp.loc[lhs, ['description', 'importance']].sort_values(
        'importance', ascending=False))
    print(f"What's your threshold for {metric}?")
    threshold = float(input(''))

    # avoid shadowing the `data` argument with the importance series
    importances = df_imp.loc[:, 'importance']
    iterate_pfd = opt.get_pfd_iterator(df_train, df_validate, df_test, rhs)
    opt.run_binary_search(importances, threshold, iterate_pfd)
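
The opt helpers used above are not shown in the listing. Below is a minimal,
hypothetical sketch of what get_pfd_iterator and run_binary_search could look
like, assuming iterate_pfd is the trainer used in the other examples and that
dropping low-importance columns degrades performance roughly monotonically
(which is what makes a binary search over the number of dropped columns
reasonable):

def get_pfd_iterator(df_train, df_validate, df_test, rhs):
    """Hypothetical sketch: return a closure that retrains a predictor on a
    candidate LHS and reports its performance (higher is better)."""
    def evaluate(lhs_cols):
        include_cols = list(lhs_cols) + [rhs]   # the label column must stay
        return iterate_pfd(include_cols, df_train, df_validate, df_test, rhs)
    return evaluate


def run_binary_search(importances, threshold, evaluate):
    """Hypothetical sketch: binary-search the largest number of least
    important features that can be dropped while the retrained model still
    meets the threshold; returns the resulting LHS."""
    order = list(importances.sort_values(ascending=True).index)  # least important first
    lo, hi = 0, len(order) - 1              # how many features to drop
    best_lhs = list(importances.index)
    while lo < hi:
        mid = (lo + hi + 1) // 2
        candidate_lhs = [c for c in importances.index if c not in order[:mid]]
        if evaluate(candidate_lhs) >= threshold:
            lo, best_lhs = mid, candidate_lhs   # still good enough: drop more
        else:
            hi = mid - 1                        # dropped too much: back off
    return best_lhs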
Example No. 2
def linear_pfd(data, *args, **kwargs):
    """
    Uses feature permutation to linearly search for minimal PFDs, dropping
    the least important column one at a time.

    In AutoGluon, higher performance metrics are always better. This means
    that the MSE is reported as a negative value, so don't despair!
    """
    logger = logging.getLogger('pfd')
    logger.debug(f"Start linear search of PFDs for dataset {data.title}")
    df_train, df_validate, df_test = helps.load_splits(data)

    df_imp, measured_performance, metric, rhs = global_predictor_explained(
        data, df_train, df_validate, df_test)

    include_cols = list(df_train.columns)
    lhs = [c for c in include_cols if c != rhs]

    logger.info(f"Trained a predictor with {metric} "
                f"{measured_performance}.")
    print("These are the feature importances found via "
          "feature permutation:")
    print(df_imp.loc[lhs, ['description', 'importance']].sort_values(
        'importance', ascending=False))
    print(f"What's your threshold for {metric}?")
    threshold = float(input(''))

    for i in range(len(df_train.columns)):
        if measured_performance < threshold:
            logger.info("The newly trained model's performance of "
                        f"{measured_performance} is below the threshold of "
                        f"{threshold}. Stopping the search.")
            break

        # the (i+1)-th least important column
        # (df_imp is assumed sorted by descending importance)
        exclude_col = int(df_imp.iloc[-(i+1), :].name)
        include_cols = [c for c in include_cols if c != exclude_col]
        lhs = [c for c in include_cols if c != rhs]

        logger.info(f"Begin predictor training with LHS {lhs}")
        measured_performance = opt.iterate_pfd(include_cols,
                                               df_train,
                                               df_validate,
                                               df_test,
                                               rhs)
        logger.info(
            f"Trained a predictor with {metric} {measured_performance}")
Example No. 3
    def load_data(self):
        """ Loads train/validate/test splits. Sets class-attributes
        df_train, df_validate, df_test and df_columns. """
        df = load_original_data(self.data)
        df_train, df_validate, df_test = load_splits(self.data)

        no_dupl = check_split_for_duplicates([df])
        no_dupl_splits = check_split_for_duplicates(
            [df_train, df_validate, df_test])

        if no_dupl == no_dupl_splits:
            self.df_train = df_train
            self.df_validate = df_validate
            self.df_test = df_test
            self.columns = list(df_test.columns)
        else:
            e = ('Found that the number of duplicates in the train/validate/'
                 'test splits deviates from the number of duplicates in the '
                 'original dataset. Please fix this and try again.')
            raise ValueError(e)
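
check_split_for_duplicates is not shown; the comparison above only makes sense
if it counts duplicate rows across the frames it is given, so that a split
which drops or duplicates rows is detected. A hypothetical sketch under that
assumption:

import pandas as pd


def check_split_for_duplicates(dfs):
    """Hypothetical sketch: count duplicate rows across the given DataFrames."""
    combined = pd.concat(dfs, ignore_index=True)
    return int(combined.duplicated().sum())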
Example No. 4
def manual_pfd(data, *args, **kwargs):
    """
    Uses feature permutation to compute the PFD of a dataset. The user is
    prompted to specify how much mean absolute deviation from the mean
    function value they are willing to sacrifice for a smaller LHS.
    """
    logger = logging.getLogger('pfd')
    logger.debug(f"Start manual search of PFDs for dataset {data.title}")
    df_train, df_validate, df_test = helps.load_splits(data)
    df_imp, measured_performance, metric, rhs = global_predictor_explained(
        data, df_train, df_validate, df_test)

    exclude_cols = []
    include_cols = list(df_train.columns)
    lhs = [c for c in include_cols if c != rhs]

    while True:
        logger.info(f"Trained a predictor with {metric} "
                    f"{measured_performance}.")
        print("These are the feature importances found via "
              "feature permutation:")
        print(df_imp.loc[lhs, ['description', 'importance']].sort_values(
            'importance', ascending=False))
        print(f'Excluded: {exclude_cols}')

        print("Which columns do you want to exclude? (q to quit)")
        i = input('')
        if i == 'q':
            break

        # accumulate exclusions so the 'Excluded:' prompt lists every column
        # dropped so far
        exclude_cols += [int(c) for c in i.split(',')]
        include_cols = [c for c in include_cols if c not in exclude_cols]
        lhs = [c for c in include_cols if c != rhs]

        logger.info(f"Begin predictor training with LHS {lhs}")
        measured_performance = opt.iterate_pfd(include_cols,
                                               df_train,
                                               df_validate,
                                               df_test,
                                               str(rhs))
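
The prompt above assumes integer column labels and cleanly comma-separated
input; a trailing comma or a typo makes int(c) raise. A small, hypothetical
parsing helper that tolerates whitespace and silently drops labels that are
not actual columns:

def parse_exclusions(raw, valid_cols):
    """Hypothetical sketch: parse e.g. '3, 7,12' into [3, 7, 12], keeping only
    labels that are actual columns."""
    tokens = [tok.strip() for tok in raw.split(',')]
    requested = [int(tok) for tok in tokens if tok.isdigit()]
    return [c for c in requested if c in valid_cols]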
Example No. 5
def jump_pfd(data, *args, **kwargs):
    """
    Uses a refined search strategy to greedily jump to a node in the
    search lattice.

    AG's feature_permutation returns feature importances that aren't
    additive and thus don't sum up to equal the model's loss function.
    This is why I normalize the permutation feature importances to the
    model's measured_performance. This is not theoretically sound, though
    -- there is no reason to think that the FIs calculated for features
    [x_1, x_2, x_3, x_4] are the same as the FIs calculated for features
    [x_2, x_3, x_4] when using feature_permutation.

    Note that in AutoGluon, higher performance metrics are always better.
    This leads to the circumstance that the MSE is negative!

    Also, shap.TreeExplainer is capable of explaining a model's log_loss,
    which might be worthwhile investigating in future versions.
    """
    logger = logging.getLogger('pfd')
    logger.debug(f"Start automatical search of PFDs for dataset {data.title}")

    df_train, df_validate, df_test = helps.load_splits(data)

    df_imp, measured_performance, metric, rhs = global_predictor_explained(
        data, df_train, df_validate, df_test)
    exclude_cols = []
    include_cols = list(df_train.columns)
    lhs = [c for c in include_cols if c != rhs]

    logger.info(f"Trained a predictor with {metric} "
                f"{measured_performance}.")
    print("These are the feature importances found via "
          "feature permutation:")
    print(df_imp.loc[lhs, ['description', 'importance']].sort_values(
        'importance', ascending=False))
    print(f"What's your threshold for {metric}?")
    threshold = float(input(''))

    logger.debug("User set threshold of {threshold}")

    # the margin is how much performance we can shave off
    margin = measured_performance - threshold
    if margin < 0:
        logger.info(f"The set threshold of {threshold} exceeds the measured "
                    f"performance of {measured_performance}. "
                    "Stopping the search.")
        return

    # normalize importances so they sum to the measured performance, then see
    # how many of the least important columns fit within the margin
    df_imp['normalized_importance'] = (
        df_imp.loc[:, 'importance'] / df_imp.loc[:, 'importance'].sum()
    ) * measured_performance
    df_importance_cumsum = df_imp.sort_values(
        'normalized_importance', ascending=True).cumsum()
    importance_distance = (
        df_importance_cumsum.loc[:, 'normalized_importance'] - margin)

    logger.debug("Calculated the following importance distance "
                 f"{importance_distance}")
    exclude_cols = [int(idx) for idx, dist in importance_distance.items()
                    if dist < 0]
    include_cols = [c for c in include_cols if c not in exclude_cols]
    lhs = [c for c in include_cols if c != rhs]

    logger.info("Training a predictor next to check threshold.")
    logger.info("Begin predictor training")
    measured_performance = opt.iterate_pfd(include_cols,
                                           df_train,
                                           df_validate,
                                           df_test,
                                           rhs)
    logger.info("Using Feature Permutation Importances, jumped "
                f"to LHS {lhs}, resulting in a Model with {metric} "
                f"{round(measured_performance, 3)}. The threshold aimed for "
                f"was a {metric} of {threshold}.")