Example #1
    def filter_data_from_cohort(self, filters, composite_filters,
                                include_original_columns_only=False):
        """Filters the dataset on the model based on the specified filters.

        :param filters: The filters.
        :type filters: list[dict]
        :param composite_filters: The composite filters.
        :type composite_filters: list[dict]
        :param include_original_columns_only: Whether to only include
                                              the original data columns.
        :type include_original_columns_only: bool
        :return: The filtered dataset converted to a pandas DataFrame.
        :rtype: pandas.DataFrame
        """
        df = self.dataset
        if not is_spark(df):
            if not isinstance(df, pd.DataFrame):
                df = pd.DataFrame(df, columns=self.features)
            else:
                # Note: we make a copy of the input DataFrame since
                # we will add columns below
                df = df.copy()
        self._add_filter_cols(df, filters)
        df = self._apply_recursive_filter(
            df, filters, self.categorical_features, self.categories)
        df = self._apply_recursive_filter(
            df, composite_filters, self.categorical_features, self.categories)
        df = self._post_process_df(
            df, include_original_columns_only=include_original_columns_only)
        return df
def filter_from_cohort(analyzer, filters, composite_filters):
    """Filters the dataset on the analyzer based on the specified filters.

    :param analyzer: The error analyzer.
    :type analyzer: BaseAnalyzer
    :param filters: The filters.
    :type filters: list[dict]
    :param composite_filters: The composite filters.
    :type composite_filters: list[dict]
    :return: The filtered dataset converted to a pandas DataFrame.
    :rtype: pandas.DataFrame
    """
    df = analyzer.dataset
    feature_names = analyzer.feature_names
    true_y = analyzer.true_y
    categorical_features = analyzer.categorical_features
    categories = analyzer.categories
    if not is_spark(df):
        if not isinstance(df, pd.DataFrame):
            df = pd.DataFrame(df, columns=feature_names)
        else:
            # Note: we make a copy of the input DataFrame since
            # we will add columns below
            df = df.copy()
    add_filter_cols(analyzer, df, filters, true_y)
    df = apply_recursive_filter(df, filters, categorical_features, categories)
    df = apply_recursive_filter(df, composite_filters, categorical_features,
                                categories)
    df = post_process_df(df)
    return df
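For reference, the filter dictionaries passed to filter_from_cohort use the same cohort-filter schema shown in the compute_error_tree doctest further below. A minimal sketch of calling the function directly, assuming it is importable from erroranalysis._internal.cohort_filter and reusing that doctest's analyzer setup:

from erroranalysis._internal.cohort_filter import filter_from_cohort
from erroranalysis._internal.constants import ModelTask
from erroranalysis._internal.error_analyzer import ModelAnalyzer
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=0)
model = svm.SVC(gamma=0.001, C=100., probability=True).fit(X_train, y_train)
analyzer = ModelAnalyzer(model, X_test, y_test, data.feature_names,
                         [], model_task=ModelTask.CLASSIFICATION)

# Each simple filter is a dict with 'column', 'method' and 'arg' keys.
filters = [{'arg': [20.0], 'column': 'mean radius',
            'method': 'less and equal'}]
# Composite filters nest simple filters under 'compositeFilters'
# combined with an 'operation' ('and' / 'or').
composite_filters = [{'compositeFilters':
                      [{'compositeFilters':
                        [{'arg': [13.45, 22.27],
                          'column': 'mean radius',
                          'method': 'in the range of'}],
                        'operation': 'and'}],
                      'operation': 'or'}]

filtered_df = filter_from_cohort(analyzer, filters, composite_filters)
print(filtered_df.shape)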
Example #3
    def _add_filter_cols(self, df, filters):
        """Adds special columns to the dataset for filtering and postprocessing.

        :param df: The dataset as a pandas dataframe.
        :type df: pandas.DataFrame
        :param filters: The filters.
        :type filters: list[dict]
        """
        has_classification_outcome = self._filters_has_classification_outcome(
            filters)
        has_regression_error = self._filters_has_regression_error(filters)

        if isinstance(self.true_y, str):
            df.rename(columns={self.true_y: TRUE_Y}, inplace=True)
        else:
            df[TRUE_Y] = self.true_y

        if self.model is None:
            df[PRED_Y] = self.pred_y

        if not is_spark(df):
            df[ROW_INDEX] = np.arange(0, len(self.true_y))
        if has_classification_outcome:
            if PRED_Y in df:
                pred_y = df[PRED_Y]
            else:
                # calculate directly via prediction on model
                pred_y = self.model.predict(
                    df.drop(columns=[TRUE_Y, ROW_INDEX]))

            classes = get_ordered_classes(
                self.classes, self.true_y, pred_y)

            # calculate classification outcome and add to df
            if len(classes) == 2:
                df[CLASSIFICATION_OUTCOME] = \
                    self._compute_binary_classification_outcome_data(
                        self.true_y, pred_y, classes)
            else:
                df[CLASSIFICATION_OUTCOME] = \
                    self._compute_multiclass_classification_outcome_data(
                        self.true_y, pred_y)
        elif has_regression_error:
            if PRED_Y in df:
                pred_y = df[PRED_Y]
            else:
                # calculate directly via prediction on model
                pred_y = self.model.predict(
                    df.drop(columns=[TRUE_Y, ROW_INDEX]))
            # calculate regression error and add to df
            df[REGRESSION_ERROR] = self._compute_regression_error_data(
                self.true_y, pred_y)
def add_filter_cols(analyzer, df, filters, true_y):
    """Adds special columns to the dataset for filtering and postprocessing.

    :param analyzer: The error analyzer.
    :type analyzer: BaseAnalyzer
    :param filters: The filters.
    :type filters: list[dict]
    :param true_y: The true labels.
    :type true_y: list
    """
    has_classification_outcome = filters_has_classification_outcome(
        analyzer, filters)
    if isinstance(true_y, str):
        df.rename(columns={true_y: TRUE_Y}, inplace=True)
    else:
        df[TRUE_Y] = true_y
    is_model_analyzer = hasattr(analyzer, MODEL)
    if not is_model_analyzer:
        df[PRED_Y] = analyzer.pred_y
    if not is_spark(df):
        df[ROW_INDEX] = np.arange(0, len(true_y))
    if has_classification_outcome:
        if PRED_Y in df:
            pred_y = df[PRED_Y]
        else:
            # calculate directly via prediction on model
            pred_y = analyzer.model.predict(
                df.drop(columns=[TRUE_Y, ROW_INDEX]))
        classes = get_ordered_classes(analyzer.classes, true_y, pred_y)
        # calculate classification outcome and add to df
        classification_outcome = []
        if not isinstance(pred_y, np.ndarray):
            pred_y = np.array(pred_y)
        if not isinstance(true_y, np.ndarray):
            true_y = np.array(true_y)
        for i in range(len(true_y)):
            if true_y[i] == pred_y[i]:
                if true_y[i] == classes[0]:
                    # True negative == 0
                    classification_outcome.append(0)
                else:
                    # True positive == 3
                    classification_outcome.append(3)
            else:
                if true_y[i] == classes[0]:
                    # False positive == 1
                    classification_outcome.append(1)
                else:
                    # False negative == 2
                    classification_outcome.append(2)
        df[CLASSIFICATION_OUTCOME] = classification_outcome
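The binary classification outcome encoding used above (true negative = 0, false positive = 1, false negative = 2, true positive = 3, with classes[0] treated as the negative class) can be checked in isolation; outcome_value and the sample labels below are illustrative only:

import numpy as np

def outcome_value(true_label, pred_label, classes):
    # Mirrors the encoding above: 0 = TN, 1 = FP, 2 = FN, 3 = TP,
    # where classes[0] is the negative class.
    if true_label == pred_label:
        return 0 if true_label == classes[0] else 3
    return 1 if true_label == classes[0] else 2

classes = ['benign', 'malignant']  # classes[0] is the negative class
true_y = np.array(['benign', 'malignant', 'benign', 'malignant'])
pred_y = np.array(['benign', 'benign', 'malignant', 'malignant'])
print([outcome_value(t, p, classes) for t, p in zip(true_y, pred_y)])
# [0, 2, 1, 3]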
Example #5
def compute_error_tree(analyzer,
                       features,
                       filters,
                       composite_filters,
                       max_depth=DEFAULT_MAX_DEPTH,
                       num_leaves=DEFAULT_NUM_LEAVES,
                       min_child_samples=DEFAULT_MIN_CHILD_SAMPLES):
    """Computes the error tree for the given dataset.

    :param analyzer: The error analyzer containing the categorical
        features and categories for the full dataset.
    :type analyzer: BaseAnalyzer
    :param features: The features to train the surrogate model on.
    :type features: numpy.ndarray or pandas.DataFrame
    :param filters: The filters to apply to the dataset.
    :type filters: numpy.ndarray or pandas.DataFrame
    :param composite_filters: The composite filters to apply to the dataset.
    :type composite_filters: numpy.ndarray or pandas.DataFrame
    :param max_depth: The maximum depth of the surrogate tree trained
        on errors.
    :type max_depth: int
    :param num_leaves: The number of leaves of the surrogate tree
        trained on errors.
    :type num_leaves: int
    :param min_child_samples: The minimum number of samples required to
        create one leaf.
    :type min_child_samples: int
    :return: The tree representation as a list of nodes.
    :rtype: list[dict[str, str]]

    :Example:

    An example of running compute_error_tree with a
    filter and a composite filter:

    >>> from erroranalysis._internal.error_analyzer import ModelAnalyzer
    >>> from erroranalysis._internal.surrogate_error_tree import (
    ...     compute_error_tree)
    >>> from erroranalysis._internal.constants import ModelTask
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn import svm
    >>> breast_cancer_data = load_breast_cancer()
    >>> feature_names = breast_cancer_data.feature_names
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     breast_cancer_data.data, breast_cancer_data.target,
    ...     test_size=0.2, random_state=0)
    >>> categorical_features = []
    >>> clf = svm.SVC(gamma=0.001, C=100., probability=True,
    ...               random_state=777)
    >>> model = clf.fit(X_train, y_train)
    >>> model_task = ModelTask.CLASSIFICATION
    >>> analyzer = ModelAnalyzer(model, X_test, y_test, feature_names,
    ...                          categorical_features, model_task=model_task)
    >>> filters = [{'arg': [23.85], 'column': 'mean radius',
    ...             'method': 'less and equal'}]
    >>> composite_filters = [{'compositeFilters':
    ...                      [{'compositeFilters':
    ...                       [{'arg': [13.45, 22.27],
    ...                         'column': 'mean radius',
    ...                         'method': 'in the range of'},
    ...                        {'arg': [10.88, 24.46],
    ...                         'column': 'mean texture',
    ...                         'method': 'in the range of'}],
    ...                        'operation': 'and'}],
    ...                      'operation': 'or'}]
    >>> tree = compute_error_tree(analyzer, ['mean radius', 'mean texture'],
    ...                           filters, composite_filters)
    """
    # Fit a surrogate model on errors
    if max_depth is None:
        max_depth = DEFAULT_MAX_DEPTH
    if num_leaves is None:
        num_leaves = DEFAULT_NUM_LEAVES
    if min_child_samples is None:
        min_child_samples = DEFAULT_MIN_CHILD_SAMPLES
    filtered_df = filter_from_cohort(analyzer,
                                     filters,
                                     composite_filters)
    if filtered_df.shape[0] == 0:
        return create_empty_node(analyzer.metric)
    is_model_analyzer = hasattr(analyzer, MODEL)
    indexes = []
    for feature in features:
        indexes.append(analyzer.feature_names.index(feature))
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    dataset_sub_names = list(dataset_sub_names)
    if not is_spark(filtered_df):
        booster, filtered_indexed_df, cat_info = get_surrogate_booster_local(
            filtered_df, analyzer, is_model_analyzer, indexes,
            dataset_sub_names, max_depth, num_leaves, min_child_samples)
        cat_ind_reindexed, categories_reindexed = cat_info
    else:
        booster, filtered_indexed_df = get_surrogate_booster_pyspark(
            filtered_df, analyzer, max_depth, num_leaves, min_child_samples)
        cat_ind_reindexed = []
        categories_reindexed = []
    dumped_model = booster.dump_model()
    tree_structure = dumped_model["tree_info"][0]['tree_structure']
    max_split_index = get_max_split_index(tree_structure) + 1
    cache_subtree_features(tree_structure, dataset_sub_names)
    tree = traverse(filtered_indexed_df,
                    tree_structure,
                    max_split_index,
                    (categories_reindexed,
                     cat_ind_reindexed),
                    [],
                    dataset_sub_names,
                    metric=analyzer.metric,
                    classes=analyzer.classes)
    return tree
Example #6
def node_to_dict(df, tree, nodeid, categories, json,
                 feature_names, metric, parent=None,
                 side=TreeSide.UNKNOWN, classes=None):
    """Converts a node and children to a dictionary that can be saved as JSON.

    This is a method that is called on the current node and then its children
    recursively to construct the dictionary representation.

    :param df: The DataFrame to use for the current node.
    :type df: pandas.DataFrame
    :param tree: The tree to use for the current node.
    :type tree: dict
    :param nodeid: The id of the current node.
    :type nodeid: int
    :param categories: The list of categories for the current node.
    :type categories: list[tuple]
    :param json: The JSON to write the node to.
    :type json: dict
    :param feature_names: The list of feature names.
    :type feature_names: list[str]
    :param metric: The metric to use for the current node.
    :type metric: str
    :param parent: The parent node.
    :type parent: dict
    :param side: The side of the current node from the parent, if known.
    :type side: TreeSide
    :param classes: The list of classes.
    :type classes: list[str]
    :return: The JSON with the node and all children added.
    :rtype: dict
    """
    p_node_name = None
    condition = None
    arg = None
    method = None
    parentid = None
    if parent is not None:
        parentid = int(parent[SPLIT_INDEX])
        p_node_name_val = feature_names[parent[SPLIT_FEATURE]]
        # use numbers.Integral to check for any numpy or python integer type
        if isinstance(p_node_name_val, numbers.Integral):
            # for numeric column names, we can use @df[numeric_colname] syntax
            p_node_query = "@df[" + str(p_node_name_val) + "]"
        else:
            # for string column names, we can just use column name directly
            # with backticks
            p_node_query = "`" + str(p_node_name_val) + "`"
        p_node_name = str(p_node_name_val)
        parent_threshold = parent['threshold']
        parent_decision_type = parent['decision_type']
        if side == TreeSide.LEFT_CHILD:
            if parent_decision_type == '<=':
                method = "less and equal"
                arg = float(parent_threshold)
                condition = "{} <= {:.2f}".format(p_node_name,
                                                  parent_threshold)
                df = filter_by_threshold(df, p_node_name_val,
                                         parent_threshold, side)
            elif parent_decision_type == '==':
                method = CohortFilterMethods.METHOD_INCLUDES
                arg = create_categorical_arg(parent_threshold)
                query, condition = create_categorical_query(method,
                                                            arg,
                                                            p_node_name,
                                                            p_node_query,
                                                            parent,
                                                            categories)
                df = df.query(query)
        elif side == TreeSide.RIGHT_CHILD:
            if parent_decision_type == '<=':
                method = "greater"
                arg = float(parent_threshold)
                condition = "{} > {:.2f}".format(p_node_name,
                                                 parent_threshold)
                df = filter_by_threshold(df, p_node_name_val,
                                         parent_threshold, side)
            elif parent_decision_type == '==':
                method = CohortFilterMethods.METHOD_EXCLUDES
                arg = create_categorical_arg(parent_threshold)
                query, condition = create_categorical_query(method,
                                                            arg,
                                                            p_node_name,
                                                            p_node_query,
                                                            parent,
                                                            categories)
                df = df.query(query)
    total = df.shape[0]
    if is_spark(df):
        metric_value, success, error = compute_metrics_pyspark(
            df, metric, total)
    else:
        metric_value, success, error = compute_metrics_local(
            df, metric, total, classes)
    metric_name = metric_to_display_name[metric]
    is_error_metric = metric in error_metrics
    if SPLIT_FEATURE in tree:
        node_name = str(feature_names[tree[SPLIT_FEATURE]])
    else:
        node_name = None
    json.append(get_json_node(arg, condition, error, nodeid, method,
                              node_name, parentid, p_node_name,
                              total, success, metric_name,
                              metric_value, is_error_metric))
    return json, df
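The two query syntaxes mentioned in the comments above (backticks for string column names, the @-prefixed local-variable syntax for integer column names) can be exercised on their own; the DataFrames below are illustrative:

import pandas as pd

# String column names: wrap in backticks inside the query expression.
df = pd.DataFrame({'mean radius': [12.0, 25.0], 'mean texture': [10.0, 30.0]})
print(df.query("`mean radius` <= 13.0"))

# Integer column names: reference the DataFrame via the @ local-variable
# syntax, since a bare integer is not a valid identifier in a query.
df_numeric = pd.DataFrame([[12.0, 10.0], [25.0, 30.0]])  # columns 0 and 1
print(df_numeric.query("@df_numeric[0] <= 13.0"))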
Example #7
def traverse(df,
             tree,
             max_split_index,
             categories,
             dict,
             feature_names,
             parent=None,
             side=TreeSide.UNKNOWN,
             metric=None,
             classes=None):
    """Traverses the current node in the tree to create a list of nodes.

    :param df: The DataFrame containing the features and labels.
    :type df: pandas.DataFrame
    :param tree: The current node in the tree to traverse.
    :type tree: dict
    :param max_split_index: The max split index for the tree.
    :type max_split_index: int
    :param categories: The list of categorical features and categories.
    :type categories: list[tuple]
    :param dict: The list to accumulate the node dictionaries in.
    :type dict: list
    :param feature_names: The list of feature names.
    :type feature_names: list[str]
    :param parent: The parent node of the current node.
    :type parent: dict or None
    :param side: The side of the parent node the current node is on.
    :type side: TreeSide
    :param metric: The metric to use for the current node.
    :type metric: str
    :param classes: The list of classes for the current node.
    :type classes: list[str]
    :return: The tree representation as a list of nodes.
    :rtype: list[dict[str, str]]
    """
    if SPLIT_INDEX in tree:
        nodeid = tree[SPLIT_INDEX]
    elif LEAF_INDEX in tree:
        nodeid = max_split_index + tree[LEAF_INDEX]
    else:
        nodeid = 0

    # reduce DataFrame to just features split on at each step for perf
    if not is_spark(df):
        df = filter_to_used_features(df, tree)

    # write current node to a dictionary that can be saved as JSON
    dict, df = node_to_dict(df, tree, nodeid, categories, dict,
                            feature_names, metric, parent, side,
                            classes)

    # write children to a dictionary that can be saved as JSON
    if LEAF_VALUE not in tree:
        left_child = tree[TreeSide.LEFT_CHILD]
        right_child = tree[TreeSide.RIGHT_CHILD]
        dict = traverse(df, left_child, max_split_index,
                        categories, dict, feature_names,
                        tree, TreeSide.LEFT_CHILD, metric,
                        classes)
        dict = traverse(df, right_child, max_split_index,
                        categories, dict, feature_names,
                        tree, TreeSide.RIGHT_CHILD, metric,
                        classes)
    return dict
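traverse walks the nested dict produced by LightGBM's Booster.dump_model(): split nodes carry split_index, split_feature, threshold, decision_type, left_child and right_child, while leaves carry leaf_index and leaf_value. A small sketch of the node-id assignment performed at the top of traverse (leaf ids offset by max_split_index), assuming the SPLIT_INDEX/LEAF_INDEX constants map to those dump_model keys:

# Hand-written structure in the shape of LightGBM's dump_model() output.
tree_structure = {
    'split_index': 0, 'split_feature': 0, 'threshold': 13.0,
    'decision_type': '<=',
    'left_child': {'leaf_index': 0, 'leaf_value': 0.1},
    'right_child': {'leaf_index': 1, 'leaf_value': -0.2},
}

def assign_node_ids(node, max_split_index, ids=None):
    # Split nodes keep their split_index; leaves get
    # max_split_index + leaf_index, matching traverse above.
    if ids is None:
        ids = []
    if 'split_index' in node:
        ids.append(node['split_index'])
        assign_node_ids(node['left_child'], max_split_index, ids)
        assign_node_ids(node['right_child'], max_split_index, ids)
    else:
        ids.append(max_split_index + node['leaf_index'])
    return ids

max_split_index = 0 + 1  # get_max_split_index(tree_structure) + 1
print(assign_node_ids(tree_structure, max_split_index))  # [0, 1, 2]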
Example #8
    def __init__(self, explanation, model, dataset, true_y, classes, features,
                 categorical_features, true_y_dataset, pred_y, pred_y_dataset,
                 model_task, metric, max_depth, num_leaves, min_child_samples,
                 sample_dataset):
        """Initialize the ErrorAnalysis Dashboard Input.

        :param explanation: An object that represents an explanation.
        :type explanation: ExplanationMixin
        :param model: An object that represents a model.
            It is assumed that for the classification case
            it has a method of predict_proba() returning
            the prediction probabilities for each
            class and for the regression case a method of predict()
            returning the prediction value.
        :type model: object
        :param dataset: A matrix of feature vector examples
            (# examples x # features), the same samples
            used to build the explanation.
            Will overwrite any set on explanation object already.
        :type dataset: numpy.ndarray or list[][] or pandas.DataFrame
        :param true_y: The true labels for the provided explanation.
            Will overwrite any set on explanation object already.
        :type true_y: numpy.ndarray or list[] or pandas.Series
        :param classes: The class names.
        :type classes: numpy.ndarray or list[]
        :param features: Feature names.
        :type features: numpy.ndarray or list[]
        :param categorical_features: The categorical feature names.
        :type categorical_features: list[str]
        :param true_y_dataset: The true labels for the provided dataset.
            Only needed if the explanation has a sample of instances from
            the original dataset. Otherwise specify the true_y parameter
            only.
        :type true_y_dataset: numpy.ndarray or list[] or pandas.Series
        :param pred_y: The predicted y values, can be passed in as an
            alternative to the model and explanation for a more limited
            view.
        :type pred_y: numpy.ndarray or list[] or pandas.Series
        :param pred_y_dataset: The predicted labels for the provided dataset.
            Only needed if providing a sample dataset for the UI while using
            the full dataset for the tree view and heatmap. Otherwise specify
            pred_y parameter only.
        :type pred_y_dataset: numpy.ndarray or list[] or pandas.Series
        :param model_task: Optional parameter to specify whether the model
            is a classification or regression model. In most cases, the
            type of the model can be inferred based on the shape of the
            output, where a classifier has a predict_proba method and
            outputs a 2 dimensional array, while a regressor has a
            predict method and outputs a 1 dimensional array.
        :type model_task: str
        :param metric: The metric name to evaluate at each tree node or
            heatmap grid.  Currently supported classification metrics
            include 'error_rate', 'recall_score' for binary
            classification and 'micro_recall_score' or
            'macro_recall_score' for multiclass classification,
            'precision_score' for binary classification and
            'micro_precision_score' or 'macro_precision_score'
            for multiclass classification, 'f1_score' for binary
            classification and 'micro_f1_score' or 'macro_f1_score'
            for multiclass classification, and 'accuracy_score'.
            Supported regression metrics include 'mean_absolute_error',
            'mean_squared_error', 'r2_score', and 'median_absolute_error'.
        :type metric: str
        :param max_depth: The maximum depth of the surrogate tree trained
            on errors.
        :type max_depth: int
        :param num_leaves: The number of leaves of the surrogate tree
            trained on errors.
        :type num_leaves: int
        :param min_child_samples: The minimum number of samples required
            to create one leaf.
        :type min_child_samples: int
        :param sample_dataset: Dataset with fewer samples than the main
            dataset. Used to improve performance in the dataset explorer
            when an Explanation object is not provided. Specify fewer
            than 10k points for optimal performance.
        :type sample_dataset: pd.DataFrame or numpy.ndarray or list[][]
        """
        self._model = model
        self._categorical_features = categorical_features
        self._string_ind_data = None
        self._categories = []
        self._categorical_indexes = []
        self._is_classifier = is_classifier(model)
        self._dataframeColumns = None
        self.dashboard_input = {}

        model_available = model is not None

        if model_available and pred_y is not None:
            raise ValueError('Only model or pred_y can be specified, not both')

        self.dashboard_input[ENABLE_PREDICT] = model_available

        self.dashboard_input[
            ExplanationDashboardInterface.IS_CLASSIFIER] = self._is_classifier

        is_pyspark = is_spark(dataset)
        if is_pyspark:
            self.setup_pyspark(model, dataset, true_y, classes, features,
                               categorical_features, true_y_dataset, pred_y,
                               pred_y_dataset, model_task, metric, max_depth,
                               num_leaves, min_child_samples, sample_dataset,
                               model_available)
        else:
            self.setup_local(explanation, model, dataset, true_y, classes,
                             features, categorical_features, true_y_dataset,
                             pred_y, pred_y_dataset, model_task, metric,
                             max_depth, num_leaves, min_child_samples,
                             sample_dataset, model_available)
        data = self.get_error_analysis_data(max_depth, num_leaves,
                                            min_child_samples,
                                            self._error_analyzer.metric,
                                            is_pyspark)
        self.dashboard_input[
            ExplanationDashboardInterface.ERROR_ANALYSIS_DATA] = data
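For context, this input class is normally constructed indirectly through the public dashboard wrapper. A minimal, hedged usage sketch, assuming the raiwidgets.ErrorAnalysisDashboard entry point accepts the same keyword parameters documented in the __init__ above:

from raiwidgets import ErrorAnalysisDashboard
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=0)
model = svm.SVC(gamma=0.001, C=100., probability=True).fit(X_train, y_train)

# Surrogate-tree settings correspond to the max_depth, num_leaves and
# min_child_samples parameters documented above.
ErrorAnalysisDashboard(model=model,
                       dataset=X_test,
                       true_y=y_test,
                       features=data.feature_names,
                       categorical_features=[],
                       max_depth=3,
                       num_leaves=31,
                       min_child_samples=20)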
def compute_error_tree(analyzer,
                       features,
                       filters,
                       composite_filters,
                       max_depth=DEFAULT_MAX_DEPTH,
                       num_leaves=DEFAULT_NUM_LEAVES,
                       min_child_samples=DEFAULT_MIN_CHILD_SAMPLES):
    """Computes the error tree for the given dataset.

    :param analyzer: The error analyzer containing the categorical
        features and categories for the full dataset.
    :type analyzer: BaseAnalyzer
    :param features: The features to train the surrogate model on.
    :type features: numpy.ndarray or pandas.DataFrame
    :param filters: The filters to apply to the dataset.
    :type filters: numpy.ndarray or pandas.DataFrame
    :param composite_filters: The composite filters to apply to the dataset.
    :type composite_filters: numpy.ndarray or pandas.DataFrame
    :param max_depth: The maximum depth of the surrogate tree trained
        on errors.
    :type max_depth: int
    :param num_leaves: The number of leaves of the surrogate tree
        trained on errors.
    :type num_leaves: int
    :param min_child_samples: The minimum number of samples required to
        create one leaf.
    :type min_child_samples: int
    :return: The tree representation as a list of nodes.
    :rtype: list[dict[str, str]]
    """
    # Fit a surrogate model on errors
    if max_depth is None:
        max_depth = DEFAULT_MAX_DEPTH
    if num_leaves is None:
        num_leaves = DEFAULT_NUM_LEAVES
    if min_child_samples is None:
        min_child_samples = DEFAULT_MIN_CHILD_SAMPLES
    filtered_df = filter_from_cohort(analyzer, filters, composite_filters)
    if filtered_df.shape[0] == 0:
        return create_empty_node(analyzer.metric)
    is_model_analyzer = hasattr(analyzer, MODEL)
    indexes = []
    for feature in features:
        indexes.append(analyzer.feature_names.index(feature))
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    dataset_sub_names = list(dataset_sub_names)
    if not is_spark(filtered_df):
        booster, filtered_indexed_df, cat_info = get_surrogate_booster_local(
            filtered_df, analyzer, is_model_analyzer, indexes,
            dataset_sub_names, max_depth, num_leaves, min_child_samples)
        cat_ind_reindexed, categories_reindexed = cat_info
    else:
        booster, filtered_indexed_df = get_surrogate_booster_pyspark(
            filtered_df, analyzer, max_depth, num_leaves, min_child_samples)
        cat_ind_reindexed = []
        categories_reindexed = []
    dumped_model = booster.dump_model()
    tree_structure = dumped_model["tree_info"][0]['tree_structure']
    max_split_index = get_max_split_index(tree_structure) + 1
    cache_subtree_features(tree_structure, dataset_sub_names)
    tree = traverse(filtered_indexed_df,
                    tree_structure,
                    max_split_index, (categories_reindexed, cat_ind_reindexed),
                    [],
                    dataset_sub_names,
                    metric=analyzer.metric,
                    classes=analyzer.classes)
    return tree
def node_to_dict(df,
                 tree,
                 nodeid,
                 categories,
                 json,
                 feature_names,
                 metric,
                 parent=None,
                 side=TreeSide.UNKNOWN,
                 classes=None):
    p_node_name = None
    condition = None
    arg = None
    method = None
    parentid = None
    if parent is not None:
        parentid = int(parent[SPLIT_INDEX])
        p_node_name_val = feature_names[parent[SPLIT_FEATURE]]
        # use numbers.Integral to check for any numpy or python integer type
        if isinstance(p_node_name_val, numbers.Integral):
            # for numeric column names, we can use @df[numeric_colname] syntax
            p_node_query = "@df[" + str(p_node_name_val) + "]"
        else:
            # for string column names, we can just use column name directly
            # with backticks
            p_node_query = "`" + str(p_node_name_val) + "`"
        p_node_name = str(p_node_name_val)
        parent_threshold = parent['threshold']
        parent_decision_type = parent['decision_type']
        if side == TreeSide.LEFT_CHILD:
            if parent_decision_type == '<=':
                method = "less and equal"
                arg = float(parent_threshold)
                condition = "{} <= {:.2f}".format(p_node_name,
                                                  parent_threshold)
                df = df[df[p_node_name_val] <= parent_threshold]
            elif parent_decision_type == '==':
                method = CohortFilterMethods.METHOD_INCLUDES
                arg = create_categorical_arg(parent_threshold)
                query, condition = create_categorical_query(
                    method, arg, p_node_name, p_node_query, parent, categories)
                df = df.query(query)
        elif side == TreeSide.RIGHT_CHILD:
            if parent_decision_type == '<=':
                method = "greater"
                arg = float(parent_threshold)
                condition = "{} > {:.2f}".format(p_node_name, parent_threshold)
                df = df[df[p_node_name_val] > parent_threshold]
            elif parent_decision_type == '==':
                method = CohortFilterMethods.METHOD_EXCLUDES
                arg = create_categorical_arg(parent_threshold)
                query, condition = create_categorical_query(
                    method, arg, p_node_name, p_node_query, parent, categories)
                df = df.query(query)
    total = df.shape[0]
    if is_spark(df):
        metric_value, success, error = compute_metrics_pyspark(
            df, metric, total)
    else:
        metric_value, success, error = compute_metrics_local(
            df, metric, total, classes)
    metric_name = metric_to_display_name[metric]
    is_error_metric = metric in error_metrics
    if SPLIT_FEATURE in tree:
        node_name = feature_names[tree[SPLIT_FEATURE]]
    else:
        node_name = None
    json.append(
        get_json_node(arg, condition, error, nodeid, method, node_name,
                      parentid, p_node_name, total, success, metric_name,
                      metric_value, is_error_metric))
    return json, df