Example #1
def run_error_analyzer(validation_data,
                       model,
                       X_test,
                       y_test,
                       feature_names,
                       categorical_features,
                       model_task,
                       filters=None,
                       composite_filters=None,
                       is_empty_validation_data=False):
    error_analyzer = ModelAnalyzer(model,
                                   X_test,
                                   y_test,
                                   feature_names,
                                   categorical_features,
                                   model_task=model_task)
    filtered_data = filter_from_cohort(error_analyzer, filters,
                                       composite_filters)

    # validate there is some data selected for each of the filters
    if is_empty_validation_data:
        assert validation_data.shape[0] == 0
    else:
        assert validation_data.shape[0] > 0
    assert validation_data.equals(filtered_data)
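For reference, the filters and composite_filters arguments follow the cohort
filter schema that also appears in the compute_error_tree and compute_matrix
docstrings later in this file; a minimal illustration (the column names and
values below are copied from those docstrings and are not tied to this test):

# Example cohort filters (values taken from the docstrings further below).
filters = [{'arg': [23.85], 'column': 'mean radius',
            'method': 'less and equal'}]
composite_filters = [{'compositeFilters':
                      [{'compositeFilters':
                        [{'arg': [13.45, 22.27],
                          'column': 'mean radius',
                          'method': 'in the range of'},
                         {'arg': [10.88, 24.46],
                          'column': 'mean texture',
                          'method': 'in the range of'}],
                        'operation': 'and'}],
                      'operation': 'or'}]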
Example #2
def run_error_analyzer(model,
                       X_test,
                       y_test,
                       feature_names,
                       analyzer_type,
                       categorical_features=None,
                       tree_features=None,
                       max_depth=3,
                       num_leaves=31,
                       min_child_samples=20,
                       filters=None,
                       composite_filters=None,
                       metric=None,
                       model_task=None):
    if analyzer_type == AnalyzerType.MODEL:
        error_analyzer = ModelAnalyzer(model,
                                       X_test,
                                       y_test,
                                       feature_names,
                                       categorical_features,
                                       metric=metric,
                                       model_task=model_task)
    else:
        pred_y = model.predict(X_test)
        error_analyzer = PredictionsAnalyzer(pred_y,
                                             X_test,
                                             y_test,
                                             feature_names,
                                             categorical_features,
                                             metric=metric,
                                             model_task=model_task)
    if tree_features is None:
        tree_features = feature_names
    tree = error_analyzer.compute_error_tree(
        tree_features,
        filters,
        composite_filters,
        max_depth=max_depth,
        num_leaves=num_leaves,
        min_child_samples=min_child_samples)
    validation_data = X_test
    if filters is not None or composite_filters is not None:
        validation_data = filter_from_cohort(error_analyzer, filters,
                                             composite_filters)
        y_test = validation_data[TRUE_Y]
        validation_data = validation_data.drop(columns=[TRUE_Y, ROW_INDEX])
        if not isinstance(X_test, pd.DataFrame):
            validation_data = validation_data.values
    validation_data_len = len(validation_data)
    assert tree is not None
    assert len(tree) > 0
    assert ERROR in tree[0]
    assert ID in tree[0]
    assert PARENTID in tree[0]
    assert tree[0][PARENTID] is None
    assert SIZE in tree[0]
    assert tree[0][SIZE] == validation_data_len
    for node in tree:
        assert node[SIZE] >= min(min_child_samples, validation_data_len)
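A minimal usage sketch for the helper above, assuming AnalyzerType and
ModelTask are imported from erroranalysis in the surrounding test module; the
dataset and model are illustrative and mirror the docstring examples later in
this file, not the original tests:

# Hedged usage sketch (assumed setup): train a simple classifier and run the
# helper on the model-based analyzer path with no cohort filters.
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=0)
model = svm.SVC(gamma=0.001, C=100., probability=True).fit(X_train, y_train)
run_error_analyzer(model, X_test, y_test, list(data.feature_names),
                   AnalyzerType.MODEL, categorical_features=[],
                   model_task=ModelTask.CLASSIFICATION)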
Example #3
def run_error_analyzer(model,
                       X_test,
                       y_test,
                       feature_names,
                       categorical_features,
                       model_task,
                       filters=None,
                       composite_filters=None,
                       matrix_features=None):
    error_analyzer = ModelAnalyzer(model,
                                   X_test,
                                   y_test,
                                   feature_names,
                                   categorical_features,
                                   model_task=model_task)
    # Determine the matrix features; default to the first two feature names
    if matrix_features is None:
        features = [feature_names[0], feature_names[1]]
    else:
        features = matrix_features
    json_matrix = error_analyzer.compute_matrix(features, filters,
                                                composite_filters)
    validation_data = X_test
    if filters is not None or composite_filters is not None:
        validation_data = filter_from_cohort(X_test, filters,
                                             composite_filters, feature_names,
                                             y_test, categorical_features,
                                             error_analyzer.categories)
        y_test = validation_data[TRUE_Y]
        validation_data = validation_data.drop(columns=[TRUE_Y, ROW_INDEX])
        if not isinstance(X_test, pd.DataFrame):
            validation_data = validation_data.values
    expected_count = len(validation_data)
    metric = error_analyzer.metric
    if metric == Metrics.ERROR_RATE:
        expected_error = sum(model.predict(validation_data) != y_test)
    elif metric == Metrics.MEAN_SQUARED_ERROR:
        func = metric_to_func[metric]
        pred_y = model.predict(validation_data)
        expected_error = func(y_test, pred_y)
    else:
        raise NotImplementedError(
            "Metric {} validation not supported yet".format(metric))
    validate_matrix(json_matrix,
                    expected_count,
                    expected_error,
                    features,
                    metric=metric)
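A hedged regression sketch for the helper above (assumed setup, not from the
original tests): load_diabetes and DecisionTreeRegressor stand in for the
real fixtures, and the analyzer's default regression metric is assumed to
resolve to Metrics.MEAN_SQUARED_ERROR, which the elif branch above validates.

# Hedged usage sketch with assumed data and model.
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

data = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=0)
model = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
run_error_analyzer(model, X_test, y_test, list(data.feature_names),
                   categorical_features=[],
                   model_task=ModelTask.REGRESSION)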
Example #4
def run_error_analyzer(model,
                       X_test,
                       y_test,
                       feature_names,
                       categorical_features,
                       model_task,
                       filters=None,
                       composite_filters=None,
                       matrix_features=None,
                       quantile_binning=False,
                       num_bins=BIN_THRESHOLD,
                       metric=None):
    error_analyzer = ModelAnalyzer(model,
                                   X_test,
                                   y_test,
                                   feature_names,
                                   categorical_features,
                                   model_task=model_task,
                                   metric=metric)
    # Determine the matrix features; default to the first two feature names
    if matrix_features is None:
        features = [feature_names[0], feature_names[1]]
    else:
        features = matrix_features
    matrix = error_analyzer.compute_matrix(features,
                                           filters,
                                           composite_filters,
                                           quantile_binning=quantile_binning,
                                           num_bins=num_bins)
    validation_data = X_test
    if filters is not None or composite_filters is not None:
        validation_data = filter_from_cohort(error_analyzer, filters,
                                             composite_filters)
        y_test = validation_data[TRUE_Y]
        validation_data = validation_data.drop(columns=[TRUE_Y, ROW_INDEX])
        if not isinstance(X_test, pd.DataFrame):
            validation_data = validation_data.values
    expected_count = len(validation_data)
    metric = error_analyzer.metric
    expected_error = get_expected_metric_error(error_analyzer, metric, model,
                                               validation_data, y_test)
    validate_matrix(matrix,
                    expected_count,
                    expected_error,
                    features,
                    metric=metric)
Example #5
def compute_json_error_tree(analyzer, features, filters, composite_filters):
    # Fit a surrogate model on errors
    surrogate = LGBMClassifier(n_estimators=1, max_depth=3)
    is_model_analyzer = hasattr(analyzer, MODEL)
    if is_model_analyzer:
        filtered_df = filter_from_cohort(analyzer.dataset, filters,
                                         composite_filters,
                                         analyzer.feature_names,
                                         analyzer.true_y,
                                         analyzer.categorical_features,
                                         analyzer.categories)
    else:
        filtered_df = filter_from_cohort(analyzer.dataset, filters,
                                         composite_filters,
                                         analyzer.feature_names,
                                         analyzer.true_y,
                                         analyzer.categorical_features,
                                         analyzer.categories, analyzer.pred_y)
    row_index = filtered_df[ROW_INDEX]
    true_y = filtered_df[TRUE_Y]
    dropped_cols = [TRUE_Y, ROW_INDEX]
    if not is_model_analyzer:
        pred_y = filtered_df[PRED_Y]
        dropped_cols.append(PRED_Y)
    input_data = filtered_df.drop(columns=dropped_cols)
    is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
    if is_pandas:
        true_y = true_y.to_numpy()
    else:
        input_data = input_data.to_numpy()
    if is_model_analyzer:
        diff = analyzer.model.predict(input_data) != true_y
    else:
        diff = pred_y != true_y
    if not isinstance(diff, np.ndarray):
        diff = np.array(diff)
    indexes = []
    for feature in features:
        indexes.append(analyzer.feature_names.index(feature))
    if is_pandas:
        input_data = input_data.to_numpy()
    cat_ind_reindexed = []
    categories_reindexed = []
    if analyzer.categorical_features:
        # Inplace replacement of columns
        for idx, c_i in enumerate(analyzer.categorical_indexes):
            input_data[:, c_i] = analyzer.string_indexed_data[row_index, idx]
    dataset_sub_features = input_data[:, indexes]
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    dataset_sub_names = list(dataset_sub_names)
    if analyzer.categorical_features:
        for c_index, feature in enumerate(analyzer.categorical_features):
            try:
                index_sub = dataset_sub_names.index(feature)
            except ValueError:
                continue
            cat_ind_reindexed.append(index_sub)
            categories_reindexed.append(analyzer.categories[c_index])
        surrogate.fit(dataset_sub_features,
                      diff,
                      categorical_feature=cat_ind_reindexed)
    else:
        surrogate.fit(dataset_sub_features, diff)
    filtered_indexed_df = pd.DataFrame(dataset_sub_features,
                                       columns=dataset_sub_names)
    filtered_indexed_df[DIFF] = diff
    model_json = surrogate._Booster.dump_model()
    tree_structure = model_json["tree_info"][0]['tree_structure']
    max_split_index = get_max_split_index(tree_structure) + 1
    json_tree = traverse(filtered_indexed_df, tree_structure, max_split_index,
                         (categories_reindexed, cat_ind_reindexed), [],
                         dataset_sub_names)
    return json_tree
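The core of the function above is the surrogate model fit on the per-row
error indicator; a self-contained sketch of that idea on toy data (none of
the names below come from the library):

# Hedged sketch: fit a one-tree LightGBM classifier on an error indicator and
# inspect the dumped tree structure, as the surrogate code above does.
import numpy as np
from lightgbm import LGBMClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
true_y = (X[:, 0] + 0.5 * rng.normal(size=200) > 0).astype(int)
pred_y = (X[:, 0] > 0).astype(int)      # stand-in for model predictions
diff = pred_y != true_y                 # per-row error indicator

surrogate = LGBMClassifier(n_estimators=1, max_depth=3)
surrogate.fit(X, diff)
model_json = surrogate.booster_.dump_model()
tree_structure = model_json["tree_info"][0]['tree_structure']
print(tree_structure.get('split_feature'), tree_structure.get('threshold'))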
Example #6
def compute_json_matrix(analyzer, features, filters, composite_filters):
    if features[0] is None and features[1] is None:
        raise ValueError(
            "One or two features must be specified to compute the heat map")
    is_model_analyzer = hasattr(analyzer, 'model')
    if is_model_analyzer:
        filtered_df = filter_from_cohort(analyzer.dataset, filters,
                                         composite_filters,
                                         analyzer.feature_names,
                                         analyzer.true_y,
                                         analyzer.categorical_features,
                                         analyzer.categories)
    else:
        filtered_df = filter_from_cohort(analyzer.dataset, filters,
                                         composite_filters,
                                         analyzer.feature_names,
                                         analyzer.true_y,
                                         analyzer.categorical_features,
                                         analyzer.categories, analyzer.pred_y)
    true_y = filtered_df[TRUE_Y]
    dropped_cols = [TRUE_Y, ROW_INDEX]
    if not is_model_analyzer:
        pred_y = filtered_df[PRED_Y]
        dropped_cols.append(PRED_Y)
    input_data = filtered_df.drop(columns=dropped_cols)
    is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
    metric = analyzer.metric
    if is_pandas:
        true_y = true_y.to_numpy()
    else:
        input_data = input_data.to_numpy()
    if is_model_analyzer:
        pred_y = analyzer.model.predict(input_data)
    if is_model_analyzer:
        if analyzer.model_task == ModelTask.CLASSIFICATION:
            diff = analyzer.model.predict(input_data) != true_y
        else:
            diff = analyzer.model.predict(input_data) - true_y
    else:
        if analyzer.model_task == ModelTask.CLASSIFICATION:
            diff = pred_y != true_y
        else:
            diff = pred_y - true_y
    if not isinstance(diff, np.ndarray):
        diff = np.array(diff)
    if not isinstance(pred_y, np.ndarray):
        pred_y = np.array(pred_y)
    if not isinstance(true_y, np.ndarray):
        true_y = np.array(true_y)
    indexes = []
    for feature in features:
        if feature is None:
            continue
        indexes.append(analyzer.feature_names.index(feature))
    if is_pandas:
        input_data = input_data.to_numpy()
    dataset_sub_features = input_data[:, indexes]
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    df = pd.DataFrame(dataset_sub_features, columns=dataset_sub_names)
    df_err = df.copy()
    df_err[DIFF] = diff
    if analyzer.model_task == ModelTask.CLASSIFICATION:
        df_err = df_err[df_err[DIFF]]
    else:
        df_err[TRUE_Y] = true_y
        df_err[PRED_Y] = pred_y
    # construct json matrix
    json_matrix = []
    if len(dataset_sub_names) == 2:
        feat1 = dataset_sub_names[0]
        feat2 = dataset_sub_names[1]
        unique_count1 = len(df[feat1].unique())
        unique_count2 = len(df[feat2].unique())
        f1_is_cat = False
        f2_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
            f2_is_cat = feat2 in analyzer.categorical_features
        if unique_count1 > BIN_THRESHOLD and not f1_is_cat:
            tabdf1, bins = pd.cut(df[feat1], BIN_THRESHOLD, retbins=True)
            tabdf1_err = pd.cut(df_err[feat1], bins)
            categories1 = tabdf1.cat.categories
        else:
            tabdf1 = df[feat1]
            tabdf1_err = df_err[feat1]
            categories1 = np.unique(tabdf1.to_numpy(), return_counts=True)[0]
        if unique_count2 > BIN_THRESHOLD and not f2_is_cat:
            tabdf2, bins = pd.cut(df[feat2], BIN_THRESHOLD, retbins=True)
            tabdf2_err = pd.cut(df_err[feat2], bins)
            categories2 = tabdf2.cat.categories
        else:
            tabdf2 = df[feat2]
            tabdf2_err = df_err[feat2]
            categories2 = np.unique(tabdf2.to_numpy(), return_counts=True)[0]
        if metric == Metrics.ERROR_RATE:
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2])
        else:
            aggfunc = _AggFunc(metric_to_func[metric])
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2],
                                       values=list(
                                           zip(df_err[TRUE_Y],
                                               df_err[PRED_Y])),
                                       aggfunc=aggfunc._agg_func_pair)
            matrix_total = matrix_total.fillna(0)
            matrix_error = matrix_error.fillna(0)
        json_matrix = json_matrix_2d(categories1, categories2, matrix_total,
                                     matrix_error, metric)
    else:
        feat1 = dataset_sub_names[0]
        unique_count1 = len(df[feat1].unique())
        f1_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
        if unique_count1 > BIN_THRESHOLD and not f1_is_cat:
            cutdf, bins = pd.cut(df[feat1], BIN_THRESHOLD, retbins=True)
            bin_range = range(BIN_THRESHOLD)
            catr = cutdf.cat.rename_categories(bin_range)
            catn, counts = np.unique(catr.to_numpy(), return_counts=True)
            # fix counts to include skipped categories
            fix_counts = []
            counts_idx = 0
            for idx, catdf in enumerate(cutdf.cat.categories):
                if idx not in catn:
                    fix_counts.append(0)
                else:
                    fix_counts.append(counts[counts_idx])
                    counts_idx += 1
            counts = fix_counts
            cut_err = pd.cut(df_err[feat1], bins)
            catr_err = cut_err.cat.rename_categories(bin_range)
            val_err, counts_err = np.unique(catr_err.to_numpy(),
                                            return_counts=True)
            val_err = cut_err.cat.categories[val_err]
            categories = cutdf.cat.categories
        else:
            categories, counts = np.unique(df[feat1].to_numpy(),
                                           return_counts=True)
            val_err, counts_err = np.unique(df_err[feat1].to_numpy(),
                                            return_counts=True)
            cut_err = df_err
        # Compute the given metric for each group, if not using error rate
        if metric != Metrics.ERROR_RATE:
            aggfunc = _AggFunc(metric_to_func[metric])
            cutdf_err = pd.DataFrame(cut_err)
            cutdf_err['metric_values'] = list(
                zip(df_err[TRUE_Y], df_err[PRED_Y]))
            grouped = cutdf_err.groupby([feat1])
            counts_err = grouped.agg(aggfunc._agg_func_triplet)
            counts_err = counts_err.values.ravel()
        json_matrix = json_matrix_1d(categories, val_err, counts, counts_err,
                                     metric)
    return json_matrix
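The 2D branch above is essentially a pair of cross-tabulations over binned
features; a hedged, self-contained sketch on toy data (json_matrix_2d itself
is not shown in this file):

# Hedged sketch: bin two numeric columns, reuse the same bin edges for the
# error rows, and cross-tabulate totals and errors as the code above does.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'f1': rng.normal(size=100), 'f2': rng.normal(size=100)})
df_err = df[rng.random(100) < 0.2]      # pretend these rows are the errors

tabdf1, bins1 = pd.cut(df['f1'], 4, retbins=True)
tabdf2, bins2 = pd.cut(df['f2'], 4, retbins=True)
matrix_total = pd.crosstab(tabdf1, tabdf2, rownames=['f1'], colnames=['f2'])
matrix_error = pd.crosstab(pd.cut(df_err['f1'], bins1),
                           pd.cut(df_err['f2'], bins2),
                           rownames=['f1'], colnames=['f2'])
print(matrix_total)
print(matrix_error)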
Example #7
def compute_error_tree(analyzer,
                       features,
                       filters,
                       composite_filters,
                       max_depth=DEFAULT_MAX_DEPTH,
                       num_leaves=DEFAULT_NUM_LEAVES,
                       min_child_samples=DEFAULT_MIN_CHILD_SAMPLES):
    """Computes the error tree for the given dataset.

    :param analyzer: The error analyzer containing the categorical
        features and categories for the full dataset.
    :type analyzer: BaseAnalyzer
    :param features: The features to train the surrogate model on.
    :type features: numpy.ndarray or pandas.DataFrame
    :param filters: The filters to apply to the dataset.
    :type filters: numpy.ndarray or pandas.DataFrame
    :param composite_filters: The composite filters to apply to the dataset.
    :type composite_filters: numpy.ndarray or pandas.DataFrame
    :param max_depth: The maximum depth of the surrogate tree trained
        on errors.
    :type max_depth: int
    :param num_leaves: The number of leaves of the surrogate tree
        trained on errors.
    :type num_leaves: int
    :param min_child_samples: The minimum number of samples required to
        create one leaf.
    :type min_child_samples: int
    :return: The tree representation as a list of nodes.
    :rtype: list[dict[str, str]]

    :Example:

    An example of running compute_error_tree with a
    filter and a composite filter:

    >>> from erroranalysis._internal.error_analyzer import ModelAnalyzer
    >>> from erroranalysis._internal.surrogate_error_tree import (
    ...     compute_error_tree)
    >>> from erroranalysis._internal.constants import ModelTask
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn import svm
    >>> breast_cancer_data = load_breast_cancer()
    >>> feature_names = breast_cancer_data.feature_names
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     breast_cancer_data.data, breast_cancer_data.target,
    ...     test_size=0.2, random_state=0)
    >>> categorical_features = []
    >>> clf = svm.SVC(gamma=0.001, C=100., probability=True,
    ...               random_state=777)
    >>> model = clf.fit(X_train, y_train)
    >>> model_task = ModelTask.CLASSIFICATION
    >>> analyzer = ModelAnalyzer(model, X_test, y_test, feature_names,
    ...                          categorical_features, model_task=model_task)
    >>> filters = [{'arg': [23.85], 'column': 'mean radius',
    ...             'method': 'less and equal'}]
    >>> composite_filters = [{'compositeFilters':
    ...                      [{'compositeFilters':
    ...                       [{'arg': [13.45, 22.27],
    ...                         'column': 'mean radius',
    ...                         'method': 'in the range of'},
    ...                        {'arg': [10.88, 24.46],
    ...                         'column': 'mean texture',
    ...                         'method': 'in the range of'}],
    ...                        'operation': 'and'}],
    ...                      'operation': 'or'}]
    >>> tree = compute_error_tree(analyzer, ['mean radius', 'mean texture'],
    ...                           filters, composite_filters)
    """
    # Fit a surrogate model on errors
    if max_depth is None:
        max_depth = DEFAULT_MAX_DEPTH
    if num_leaves is None:
        num_leaves = DEFAULT_NUM_LEAVES
    if min_child_samples is None:
        min_child_samples = DEFAULT_MIN_CHILD_SAMPLES
    filtered_df = filter_from_cohort(analyzer,
                                     filters,
                                     composite_filters)
    if filtered_df.shape[0] == 0:
        return create_empty_node(analyzer.metric)
    is_model_analyzer = hasattr(analyzer, MODEL)
    indexes = []
    for feature in features:
        indexes.append(analyzer.feature_names.index(feature))
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    dataset_sub_names = list(dataset_sub_names)
    if not is_spark(filtered_df):
        booster, filtered_indexed_df, cat_info = get_surrogate_booster_local(
            filtered_df, analyzer, is_model_analyzer, indexes,
            dataset_sub_names, max_depth, num_leaves, min_child_samples)
        cat_ind_reindexed, categories_reindexed = cat_info
    else:
        booster, filtered_indexed_df = get_surrogate_booster_pyspark(
            filtered_df, analyzer, max_depth, num_leaves, min_child_samples)
        cat_ind_reindexed = []
        categories_reindexed = []
    dumped_model = booster.dump_model()
    tree_structure = dumped_model["tree_info"][0]['tree_structure']
    max_split_index = get_max_split_index(tree_structure) + 1
    cache_subtree_features(tree_structure, dataset_sub_names)
    tree = traverse(filtered_indexed_df,
                    tree_structure,
                    max_split_index,
                    (categories_reindexed,
                     cat_ind_reindexed),
                    [],
                    dataset_sub_names,
                    metric=analyzer.metric,
                    classes=analyzer.classes)
    return tree
Example #8
def compute_matrix(analyzer,
                   features,
                   filters,
                   composite_filters,
                   quantile_binning=False,
                   num_bins=BIN_THRESHOLD):
    if num_bins <= 0:
        raise ValueError(
            'Number of bins parameter must be greater than 0 for the heatmap')
    if features[0] is None and features[1] is None:
        raise ValueError(
            'One or two features must be specified to compute the heat map')
    filtered_df = filter_from_cohort(analyzer, filters, composite_filters)
    true_y = filtered_df[TRUE_Y]
    dropped_cols = [TRUE_Y, ROW_INDEX]
    is_model_analyzer = hasattr(analyzer, 'model')
    if not is_model_analyzer:
        pred_y = filtered_df[PRED_Y]
        dropped_cols.append(PRED_Y)
    input_data = filtered_df.drop(columns=dropped_cols)
    is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
    metric = analyzer.metric
    if is_pandas:
        true_y = true_y.to_numpy()
    else:
        input_data = input_data.to_numpy()
    if is_model_analyzer:
        pred_y = analyzer.model.predict(input_data)
    if is_model_analyzer:
        if analyzer.model_task == ModelTask.CLASSIFICATION:
            diff = analyzer.model.predict(input_data) != true_y
        else:
            diff = analyzer.model.predict(input_data) - true_y
    else:
        if analyzer.model_task == ModelTask.CLASSIFICATION:
            diff = pred_y != true_y
        else:
            diff = pred_y - true_y
    if not isinstance(diff, np.ndarray):
        diff = np.array(diff)
    if not isinstance(pred_y, np.ndarray):
        pred_y = np.array(pred_y)
    if not isinstance(true_y, np.ndarray):
        true_y = np.array(true_y)
    indexes = []
    for feature in features:
        if feature is None:
            continue
        indexes.append(analyzer.feature_names.index(feature))
    if is_pandas:
        input_data = input_data.to_numpy()
    dataset_sub_features = input_data[:, indexes]
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    df = pd.DataFrame(dataset_sub_features, columns=dataset_sub_names)
    df_err = df.copy()
    df_err[DIFF] = diff
    if metric == Metrics.ERROR_RATE:
        df_err = df_err[df_err[DIFF]]
    else:
        df_err[TRUE_Y] = true_y
        df_err[PRED_Y] = pred_y
    # construct matrix
    matrix = []
    if len(dataset_sub_names) == 2:
        feat1 = dataset_sub_names[0]
        feat2 = dataset_sub_names[1]
        unique_count1 = len(df[feat1].unique())
        unique_count2 = len(df[feat2].unique())
        f1_is_cat = False
        f2_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
            f2_is_cat = feat2 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            tabdf1 = bin_data(df,
                              feat1,
                              num_bins,
                              quantile_binning=quantile_binning)
            categories1 = tabdf1.cat.categories
            if len(categories1) < num_bins:
                warn_duplicate_edges(feat1)
            tabdf1_err = bin_data(df_err,
                                  feat1,
                                  categories1,
                                  quantile_binning=quantile_binning)
        else:
            tabdf1 = df[feat1]
            tabdf1_err = df_err[feat1]
            categories1 = np.unique(tabdf1.to_numpy(), return_counts=True)[0]
        if unique_count2 > num_bins and not f2_is_cat:
            tabdf2 = bin_data(df,
                              feat2,
                              num_bins,
                              quantile_binning=quantile_binning)
            categories2 = tabdf2.cat.categories
            if len(categories2) < num_bins:
                warn_duplicate_edges(feat2)
            tabdf2_err = bin_data(df_err,
                                  feat2,
                                  categories2,
                                  quantile_binning=quantile_binning)
        else:
            tabdf2 = df[feat2]
            tabdf2_err = df_err[feat2]
            categories2 = np.unique(tabdf2.to_numpy(), return_counts=True)[0]
        if metric == Metrics.ERROR_RATE:
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2])
        else:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes, true_y,
                                                     pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2],
                                       values=list(
                                           zip(df_err[TRUE_Y],
                                               df_err[PRED_Y])),
                                       aggfunc=aggfunc._agg_func_pair)
            fill_matrix_nulls(matrix_total, aggfunc._fill_na_value())
            fill_matrix_nulls(matrix_error, aggfunc._fill_na_value())
        matrix = matrix_2d(categories1, categories2, matrix_total,
                           matrix_error, metric)
    else:
        feat1 = dataset_sub_names[0]
        unique_count1 = len(df[feat1].unique())
        f1_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            cutdf = bin_data(df,
                             feat1,
                             num_bins,
                             quantile_binning=quantile_binning)
            num_categories = len(cutdf.cat.categories)
            bin_range = range(num_categories)
            if len(cutdf.cat.categories) < num_bins:
                warn_duplicate_edges(feat1)
            catr = cutdf.cat.rename_categories(bin_range)
            catn, counts = np.unique(catr.to_numpy(), return_counts=True)
            # fix counts to include skipped categories
            fix_counts = []
            counts_idx = 0
            for idx, catdf in enumerate(cutdf.cat.categories):
                if idx not in catn:
                    fix_counts.append(0)
                else:
                    fix_counts.append(counts[counts_idx])
                    counts_idx += 1
            counts = fix_counts
            cut_err = bin_data(df_err,
                               feat1,
                               cutdf.cat.categories,
                               quantile_binning=quantile_binning)
            catr_err = cut_err.cat.rename_categories(bin_range)
            val_err, counts_err = np.unique(catr_err.to_numpy(),
                                            return_counts=True)
            val_err = cut_err.cat.categories[val_err]
            categories = cutdf.cat.categories
        else:
            categories, counts = np.unique(df[feat1].to_numpy(),
                                           return_counts=True)
            val_err, counts_err = np.unique(df_err[feat1].to_numpy(),
                                            return_counts=True)
            cut_err = df_err
        # Compute the given metric for each group, if not using error rate
        if metric != Metrics.ERROR_RATE:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes, true_y,
                                                     pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            cutdf_err = pd.DataFrame(cut_err)
            cutdf_err['metric_values'] = list(
                zip(df_err[TRUE_Y], df_err[PRED_Y]))
            grouped = cutdf_err.groupby([feat1])
            agg_func = {'metric_values': aggfunc._agg_func_triplet}
            counts_err = grouped.agg(agg_func)
            counts_err = counts_err.values.ravel()
        matrix = matrix_1d(categories, val_err, counts, counts_err, metric)
    return matrix
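The bin_data helper is not shown in this file; as a hedged illustration of
why the code above warns when fewer than num_bins categories come back,
quantile binning in pandas can merge duplicate bin edges:

# Hedged illustration with toy data (bin_data itself is not shown here).
import pandas as pd

values = pd.Series([0, 0, 0, 0, 0, 1, 2, 3, 4, 100])
binned = pd.qcut(values, 4, duplicates='drop')
print(len(binned.cat.categories))   # fewer than the 4 bins requested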
Example #9
def compute_json_matrix(analyzer, features, filters, composite_filters):
    if features[0] is None and features[1] is None:
        raise ValueError(
            "One or two features must be specified to compute the heat map")
    is_model_analyzer = hasattr(analyzer, 'model')
    if is_model_analyzer:
        filtered_df = filter_from_cohort(analyzer.dataset, filters,
                                         composite_filters,
                                         analyzer.feature_names,
                                         analyzer.true_y,
                                         analyzer.categorical_features,
                                         analyzer.categories)
    else:
        filtered_df = filter_from_cohort(analyzer.dataset, filters,
                                         composite_filters,
                                         analyzer.feature_names,
                                         analyzer.true_y,
                                         analyzer.categorical_features,
                                         analyzer.categories, analyzer.pred_y)
    true_y = filtered_df[TRUE_Y]
    dropped_cols = [TRUE_Y, ROW_INDEX]
    if not is_model_analyzer:
        pred_y = filtered_df[PRED_Y]
        dropped_cols.append(PRED_Y)
    input_data = filtered_df.drop(columns=dropped_cols)
    is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
    if is_pandas:
        true_y = true_y.to_numpy()
    else:
        input_data = input_data.to_numpy()
    if is_model_analyzer:
        diff = analyzer.model.predict(input_data) != true_y
    else:
        diff = pred_y != true_y
    if not isinstance(diff, np.ndarray):
        diff = np.array(diff)
    indexes = []
    for feature in features:
        if feature is None:
            continue
        indexes.append(analyzer.feature_names.index(feature))
    if is_pandas:
        input_data = input_data.to_numpy()
    dataset_sub_features = input_data[:, indexes]
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    df = pd.DataFrame(dataset_sub_features, columns=dataset_sub_names)
    df_err = df.copy()
    df_err[DIFF] = diff
    df_err = df_err[df_err[DIFF]]
    # construct json matrix
    json_matrix = []
    if len(dataset_sub_names) == 2:
        feat1 = dataset_sub_names[0]
        feat2 = dataset_sub_names[1]
        unique_count1 = len(df[feat1].unique())
        unique_count2 = len(df[feat2].unique())
        f1_is_cat = False
        f2_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
            f2_is_cat = feat2 in analyzer.categorical_features
        if unique_count1 > BIN_THRESHOLD and not f1_is_cat:
            tabdf1, bins = pd.cut(df[feat1], BIN_THRESHOLD, retbins=True)
            tabdf1_err = pd.cut(df_err[feat1], bins)
            categories1 = tabdf1.cat.categories
        else:
            tabdf1 = df[feat1]
            tabdf1_err = df_err[feat1]
            categories1 = np.unique(tabdf1.to_numpy(), return_counts=True)[0]
        if unique_count2 > BIN_THRESHOLD and not f2_is_cat:
            tabdf2, bins = pd.cut(df[feat2], BIN_THRESHOLD, retbins=True)
            tabdf2_err = pd.cut(df_err[feat2], bins)
            categories2 = tabdf2.cat.categories
        else:
            tabdf2 = df[feat2]
            tabdf2_err = df_err[feat2]
            categories2 = np.unique(tabdf2.to_numpy(), return_counts=True)[0]
        matrix_total = pd.crosstab(tabdf1,
                                   tabdf2,
                                   rownames=[feat1],
                                   colnames=[feat2])
        matrix_error = pd.crosstab(tabdf1_err,
                                   tabdf2_err,
                                   rownames=[feat1],
                                   colnames=[feat2])
        json_matrix = json_matrix_2d(categories1, categories2, matrix_total,
                                     matrix_error)
    else:
        feat1 = dataset_sub_names[0]
        unique_count1 = len(df[feat1].unique())
        f1_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
        if unique_count1 > BIN_THRESHOLD and not f1_is_cat:
            cutdf, bins = pd.cut(df[feat1], BIN_THRESHOLD, retbins=True)
            bin_range = range(BIN_THRESHOLD)
            catr = cutdf.cat.rename_categories(bin_range)
            catn, counts = np.unique(catr.to_numpy(), return_counts=True)
            # fix counts to include skipped categories
            fix_counts = []
            counts_idx = 0
            for idx, catdf in enumerate(cutdf.cat.categories):
                if idx not in catn:
                    fix_counts.append(0)
                else:
                    fix_counts.append(counts[counts_idx])
                    counts_idx += 1
            counts = fix_counts
            cutdf_err = pd.cut(df_err[feat1], bins)
            catr_err = cutdf_err.cat.rename_categories(bin_range)
            val_err, counts_err = np.unique(catr_err.to_numpy(),
                                            return_counts=True)
            val_err = cutdf_err.cat.categories[val_err]
            json_matrix = json_matrix_1d(cutdf.cat.categories, val_err, counts,
                                         counts_err)
        else:
            values, counts = np.unique(df[feat1].to_numpy(),
                                       return_counts=True)
            val_err, counts_err = np.unique(df_err[feat1].to_numpy(),
                                            return_counts=True)
            json_matrix = json_matrix_1d(values, val_err, counts, counts_err)
    return json_matrix
Example #10
def compute_matrix(analyzer,
                   features,
                   filters,
                   composite_filters,
                   quantile_binning=False,
                   num_bins=BIN_THRESHOLD):
    """Compute a matrix of metrics for a given set of feature names.

    The filters and composite filters are used to filter the data
    prior to computing the matrix.

    :param analyzer: The error analyzer.
    :type analyzer: BaseAnalyzer
    :param features: A list of one or two feature names to compute metrics for.
    :type features: list
    :param filters: A list of filters to apply to the data.
    :type filters: list
    :param composite_filters: A list of composite filters to apply to the data.
    :type composite_filters: list
    :param quantile_binning: Whether to use quantile binning.
    :type quantile_binning: bool
    :param num_bins: The number of bins to use when binning
        continuous features.
    :type num_bins: int
    :return: A dictionary representation of the computed matrix which can be
        saved to JSON.
    :rtype: dict

    :Example:

    An example of running compute_matrix with a filter and a composite
    filter:

    >>> from erroranalysis._internal.error_analyzer import ModelAnalyzer
    >>> from erroranalysis._internal.matrix_filter import (
    ...     compute_matrix)
    >>> from erroranalysis._internal.constants import ModelTask
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn import svm
    >>> breast_cancer_data = load_breast_cancer()
    >>> feature_names = breast_cancer_data.feature_names
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     breast_cancer_data.data, breast_cancer_data.target,
    ...     test_size=0.5, random_state=0)
    >>> categorical_features = []
    >>> clf = svm.SVC(gamma=0.001, C=100., probability=True,
    ...               random_state=777)
    >>> model = clf.fit(X_train, y_train)
    >>> model_task = ModelTask.CLASSIFICATION
    >>> analyzer = ModelAnalyzer(model, X_test, y_test, feature_names,
    ...                          categorical_features, model_task=model_task)
    >>> filters = [{'arg': [23.85], 'column': 'mean radius',
    ...             'method': 'less and equal'}]
    >>> composite_filters = [{'compositeFilters':
    ...                      [{'compositeFilters':
    ...                       [{'arg': [13.45, 22.27],
    ...                         'column': 'mean radius',
    ...                         'method': 'in the range of'},
    ...                        {'arg': [10.88, 24.46],
    ...                         'column': 'mean texture',
    ...                         'method': 'in the range of'}],
    ...                        'operation': 'and'}],
    ...                      'operation': 'or'}]
    >>> matrix = compute_matrix(analyzer, ['mean radius', 'mean texture'],
    ...                         filters, composite_filters)
    """
    if num_bins <= 0:
        raise ValueError(
            'Number of bins parameter must be greater than 0 for the heatmap')
    if features[0] is None and features[1] is None:
        raise ValueError(
            'One or two features must be specified to compute the heat map')
    filtered_df = filter_from_cohort(analyzer, filters, composite_filters)
    true_y = filtered_df[TRUE_Y]
    dropped_cols = [TRUE_Y, ROW_INDEX]
    is_model_analyzer = hasattr(analyzer, 'model')
    if not is_model_analyzer:
        pred_y = filtered_df[PRED_Y]
        dropped_cols.append(PRED_Y)
    input_data = filtered_df.drop(columns=dropped_cols)
    is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
    metric = analyzer.metric
    if is_pandas:
        true_y = true_y.to_numpy()
    else:
        input_data = input_data.to_numpy()
    if is_model_analyzer:
        pred_y = analyzer.model.predict(input_data)
    if is_model_analyzer:
        if analyzer.model_task == ModelTask.CLASSIFICATION:
            diff = analyzer.model.predict(input_data) != true_y
        else:
            diff = analyzer.model.predict(input_data) - true_y
    else:
        if analyzer.model_task == ModelTask.CLASSIFICATION:
            diff = pred_y != true_y
        else:
            diff = pred_y - true_y
    if not isinstance(diff, np.ndarray):
        diff = np.array(diff)
    if not isinstance(pred_y, np.ndarray):
        pred_y = np.array(pred_y)
    if not isinstance(true_y, np.ndarray):
        true_y = np.array(true_y)
    indexes = []
    for feature in features:
        if feature is None:
            continue
        indexes.append(analyzer.feature_names.index(feature))
    if is_pandas:
        input_data = input_data.to_numpy()
    dataset_sub_features = input_data[:, indexes]
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    df = pd.DataFrame(dataset_sub_features, columns=dataset_sub_names)
    df_err = df.copy()
    df_err[DIFF] = diff
    if metric == Metrics.ERROR_RATE:
        df_err = df_err[df_err[DIFF]]
    else:
        df_err[TRUE_Y] = true_y
        df_err[PRED_Y] = pred_y
    # construct matrix
    matrix = []
    if len(dataset_sub_names) == 2:
        feat1 = dataset_sub_names[0]
        feat2 = dataset_sub_names[1]
        unique_count1 = len(df[feat1].unique())
        unique_count2 = len(df[feat2].unique())
        f1_is_cat = False
        f2_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
            f2_is_cat = feat2 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            tabdf1 = bin_data(df,
                              feat1,
                              num_bins,
                              quantile_binning=quantile_binning)
            categories1 = tabdf1.cat.categories
            if len(categories1) < num_bins:
                warn_duplicate_edges(feat1)
            tabdf1_err = bin_data(df_err,
                                  feat1,
                                  categories1,
                                  quantile_binning=quantile_binning)
        else:
            tabdf1 = df[feat1]
            tabdf1_err = df_err[feat1]
            categories1 = np.unique(tabdf1.to_numpy(), return_counts=True)[0]
        if unique_count2 > num_bins and not f2_is_cat:
            tabdf2 = bin_data(df,
                              feat2,
                              num_bins,
                              quantile_binning=quantile_binning)
            categories2 = tabdf2.cat.categories
            if len(categories2) < num_bins:
                warn_duplicate_edges(feat2)
            tabdf2_err = bin_data(df_err,
                                  feat2,
                                  categories2,
                                  quantile_binning=quantile_binning)
        else:
            tabdf2 = df[feat2]
            tabdf2_err = df_err[feat2]
            categories2 = np.unique(tabdf2.to_numpy(), return_counts=True)[0]
        if metric == Metrics.ERROR_RATE:
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2])
        else:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes, true_y,
                                                     pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2],
                                       values=list(
                                           zip(df_err[TRUE_Y],
                                               df_err[PRED_Y])),
                                       aggfunc=aggfunc._agg_func_pair)
            fill_matrix_nulls(matrix_total, aggfunc._fill_na_value())
            fill_matrix_nulls(matrix_error, aggfunc._fill_na_value())
        matrix = matrix_2d(categories1, categories2, matrix_total,
                           matrix_error, metric)
    else:
        feat1 = dataset_sub_names[0]
        unique_count1 = len(df[feat1].unique())
        f1_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            cutdf = bin_data(df,
                             feat1,
                             num_bins,
                             quantile_binning=quantile_binning)
            num_categories = len(cutdf.cat.categories)
            bin_range = range(num_categories)
            if len(cutdf.cat.categories) < num_bins:
                warn_duplicate_edges(feat1)
            catr = cutdf.cat.rename_categories(bin_range)
            catn, counts = np.unique(catr.to_numpy(), return_counts=True)
            # fix counts to include skipped categories
            fix_counts = []
            counts_idx = 0
            for idx, catdf in enumerate(cutdf.cat.categories):
                if idx not in catn:
                    fix_counts.append(0)
                else:
                    fix_counts.append(counts[counts_idx])
                    counts_idx += 1
            counts = fix_counts
            cut_err = bin_data(df_err,
                               feat1,
                               cutdf.cat.categories,
                               quantile_binning=quantile_binning)
            catr_err = cut_err.cat.rename_categories(bin_range)
            val_err, counts_err = np.unique(catr_err.to_numpy(),
                                            return_counts=True)
            val_err = cut_err.cat.categories[val_err]
            categories = cutdf.cat.categories
        else:
            categories, counts = np.unique(df[feat1].to_numpy(),
                                           return_counts=True)
            val_err, counts_err = np.unique(df_err[feat1].to_numpy(),
                                            return_counts=True)
            cut_err = df_err
        # Compute the given metric for each group, if not using error rate
        if metric != Metrics.ERROR_RATE:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes, true_y,
                                                     pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            cutdf_err = pd.DataFrame(cut_err)
            cutdf_err['metric_values'] = list(
                zip(df_err[TRUE_Y], df_err[PRED_Y]))
            grouped = cutdf_err.groupby([feat1])
            agg_func = {'metric_values': aggfunc._agg_func_grouped}
            counts_err = grouped.agg(agg_func)
            counts_err = counts_err.values.ravel()
        matrix = matrix_1d(categories, val_err, counts, counts_err, metric)
    return matrix
def compute_json_error_tree(analyzer,
                            features,
                            filters,
                            composite_filters,
                            max_depth=DEFAULT_MAX_DEPTH,
                            num_leaves=DEFAULT_NUM_LEAVES):
    # Fit a surrogate model on errors
    if max_depth is None:
        max_depth = DEFAULT_MAX_DEPTH
    if num_leaves is None:
        num_leaves = DEFAULT_NUM_LEAVES
    is_model_analyzer = hasattr(analyzer, MODEL)
    if is_model_analyzer:
        filtered_df = filter_from_cohort(analyzer.dataset,
                                         filters,
                                         composite_filters,
                                         analyzer.feature_names,
                                         analyzer.true_y,
                                         analyzer.categorical_features,
                                         analyzer.categories)
    else:
        filtered_df = filter_from_cohort(analyzer.dataset,
                                         filters,
                                         composite_filters,
                                         analyzer.feature_names,
                                         analyzer.true_y,
                                         analyzer.categorical_features,
                                         analyzer.categories,
                                         analyzer.pred_y)
    row_index = filtered_df[ROW_INDEX]
    true_y = filtered_df[TRUE_Y]
    dropped_cols = [TRUE_Y, ROW_INDEX]
    if not is_model_analyzer:
        pred_y = filtered_df[PRED_Y]
        dropped_cols.append(PRED_Y)
    input_data = filtered_df.drop(columns=dropped_cols)
    is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
    if is_pandas:
        true_y = true_y.to_numpy()
    else:
        input_data = input_data.to_numpy()
    if is_model_analyzer:
        pred_y = analyzer.model.predict(input_data)
    if analyzer.model_task == ModelTask.CLASSIFICATION:
        diff = pred_y != true_y
    else:
        diff = pred_y - true_y
    if not isinstance(diff, np.ndarray):
        diff = np.array(diff)
    if not isinstance(pred_y, np.ndarray):
        pred_y = np.array(pred_y)
    if not isinstance(true_y, np.ndarray):
        true_y = np.array(true_y)
    indexes = []
    for feature in features:
        indexes.append(analyzer.feature_names.index(feature))
    if is_pandas:
        input_data = input_data.to_numpy()

    if analyzer.categorical_features:
        # Inplace replacement of columns
        for idx, c_i in enumerate(analyzer.categorical_indexes):
            input_data[:, c_i] = analyzer.string_indexed_data[row_index, idx]
    dataset_sub_features = input_data[:, indexes]
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    dataset_sub_names = list(dataset_sub_names)

    categorical_info = get_categorical_info(analyzer,
                                            dataset_sub_names)
    cat_ind_reindexed, categories_reindexed = categorical_info

    surrogate = create_surrogate_model(analyzer,
                                       dataset_sub_features,
                                       diff,
                                       max_depth,
                                       num_leaves,
                                       cat_ind_reindexed)

    filtered_indexed_df = pd.DataFrame(dataset_sub_features,
                                       columns=dataset_sub_names)
    filtered_indexed_df[DIFF] = diff
    filtered_indexed_df[TRUE_Y] = true_y
    filtered_indexed_df[PRED_Y] = pred_y
    model_json = surrogate._Booster.dump_model()
    tree_structure = model_json["tree_info"][0]['tree_structure']
    max_split_index = get_max_split_index(tree_structure) + 1
    json_tree = traverse(filtered_indexed_df,
                         tree_structure,
                         max_split_index,
                         (categories_reindexed,
                          cat_ind_reindexed),
                         [],
                         dataset_sub_names,
                         metric=analyzer.metric)
    return json_tree
def compute_error_tree(analyzer,
                       features,
                       filters,
                       composite_filters,
                       max_depth=DEFAULT_MAX_DEPTH,
                       num_leaves=DEFAULT_NUM_LEAVES,
                       min_child_samples=DEFAULT_MIN_CHILD_SAMPLES):
    """Computes the error tree for the given dataset.

    :param analyzer: The error analyzer containing the categorical
        features and categories for the full dataset.
    :type analyzer: BaseAnalyzer
    :param features: The features to train the surrogate model on.
    :type features: numpy.ndarray or pandas.DataFrame
    :param filters: The filters to apply to the dataset.
    :type filters: numpy.ndarray or pandas.DataFrame
    :param composite_filters: The composite filters to apply to the dataset.
    :type composite_filters: numpy.ndarray or pandas.DataFrame
    :param max_depth: The maximum depth of the surrogate tree trained
        on errors.
    :type max_depth: int
    :param num_leaves: The number of leaves of the surrogate tree
        trained on errors.
    :type num_leaves: int
    :param min_child_samples: The minimum number of samples required to
        create one leaf.
    :type min_child_samples: int
    :return: The tree representation as a list of nodes.
    :rtype: list[dict[str, str]]
    """
    # Fit a surrogate model on errors
    if max_depth is None:
        max_depth = DEFAULT_MAX_DEPTH
    if num_leaves is None:
        num_leaves = DEFAULT_NUM_LEAVES
    if min_child_samples is None:
        min_child_samples = DEFAULT_MIN_CHILD_SAMPLES
    filtered_df = filter_from_cohort(analyzer, filters, composite_filters)
    if filtered_df.shape[0] == 0:
        return create_empty_node(analyzer.metric)
    is_model_analyzer = hasattr(analyzer, MODEL)
    indexes = []
    for feature in features:
        indexes.append(analyzer.feature_names.index(feature))
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    dataset_sub_names = list(dataset_sub_names)
    if not is_spark(filtered_df):
        booster, filtered_indexed_df, cat_info = get_surrogate_booster_local(
            filtered_df, analyzer, is_model_analyzer, indexes,
            dataset_sub_names, max_depth, num_leaves, min_child_samples)
        cat_ind_reindexed, categories_reindexed = cat_info
    else:
        booster, filtered_indexed_df = get_surrogate_booster_pyspark(
            filtered_df, analyzer, max_depth, num_leaves, min_child_samples)
        cat_ind_reindexed = []
        categories_reindexed = []
    dumped_model = booster.dump_model()
    tree_structure = dumped_model["tree_info"][0]['tree_structure']
    max_split_index = get_max_split_index(tree_structure) + 1
    cache_subtree_features(tree_structure, dataset_sub_names)
    tree = traverse(filtered_indexed_df,
                    tree_structure,
                    max_split_index, (categories_reindexed, cat_ind_reindexed),
                    [],
                    dataset_sub_names,
                    metric=analyzer.metric,
                    classes=analyzer.classes)
    return tree
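This copy of compute_error_tree omits the :Example: block that the earlier
copy (Example #7) carries; for completeness, a usage sketch along the same
lines (data and model mirror that docstring and are illustrative only):

# Hedged usage sketch, mirroring the doctest shown with Example #7.
from erroranalysis._internal.error_analyzer import ModelAnalyzer
from erroranalysis._internal.surrogate_error_tree import compute_error_tree
from erroranalysis._internal.constants import ModelTask
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=0)
model = svm.SVC(gamma=0.001, C=100., probability=True).fit(X_train, y_train)
analyzer = ModelAnalyzer(model, X_test, y_test, list(data.feature_names),
                         [], model_task=ModelTask.CLASSIFICATION)
tree = compute_error_tree(analyzer, ['mean radius', 'mean texture'],
                          None, None)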