Пример #1
0
    def get_node_index(cls, graph, X, **kwargs):
        """
        Route the rows of ``X`` through ``graph`` and record which rows reach each node.

        :param graph: graph object; this method reads ``graph.get_data``,
            ``graph.n_child()`` and ``graph.df`` (``child_left`` / ``child_right``)
        :param X: sample data, converted with ``utils.make_ndarray``
        :param kwargs:
            index: one entry per row of ``X`` (default ``0 .. n_rows - 1``); the
                entries are also used to select rows of ``X`` while descending
            verify: when true (default), check that every index entry ends up
                in exactly one leaf
        :return: dict mapping node id to the index entries that reached that node
        :raises ValueError: if ``X`` and ``index`` differ in length, or if
            verification finds an entry in several leaves or missing from all
        """
        # Convert before touching ``X.shape``: the default index is built eagerly,
        # so a plain sequence without ``.shape`` would otherwise fail right here.
        X = utils.make_ndarray(X)
        index = dict_utils.get(kwargs, 'index', list(range(X.shape[0])))
        index = utils.make_ndarray(index, shape=-1)
        if X.shape[0] != index.shape[0]:
            raise ValueError('Mismatch data and index shape')

        result = {}
        root = 0
        verify = dict_utils.get(kwargs, 'verify', True)
        feature_data = graph.get_data(FEATURE_DATA_KEY)
        n_childes = graph.n_child()

        def recurse(node, node_index):
            # Every visited node keeps the entries that reached it.
            result[node] = node_index
            if n_childes[node] == 2:
                left_index, right_index = cls.get_child_index(
                    X[node_index, ], node_index, feature_data[node])
                recurse(graph.df.at[node, 'child_left'], left_index)
                recurse(graph.df.at[node, 'child_right'], right_index)

        recurse(root, index)

        if verify:
            # Nodes without children are the leaves; together they must hold
            # every index entry exactly once.
            leaf_nodes = np.where(n_childes == 0)[0]
            node_index = dict_utils.subset_dict(result, leaf_nodes)
            result_array = np.concatenate(list(node_index.values()))
            unique, counts = np.unique(result_array, return_counts=True)
            if np.any(counts != 1):
                raise ValueError('Index in multiple leaf')
            if unique.shape[0] != index.shape[0]:
                raise ValueError('Missing index in result')
        return result
Пример #2
0
 def _set_color_scalar(self, score, **kwargs):
     """
     Colour the nodes from a scalar score, unless every score is NaN.

     :param score: scalar scores; nothing is changed when all of them are NaN
     :param kwargs:
         normalize_bound: bounds handed to ``default_color_map_scalar``, or a
             callable computing them from ``score`` (default: NaN-aware min/max)
         reverse_cmap: forwarded to ``default_color_map_scalar`` (default False)
     """
     if np.all(np.isnan(score)):
         return

     def _nan_aware_range(values):
         return (np.nanmin(values), np.nanmax(values))

     bound = dict_utils.get(kwargs, 'normalize_bound', _nan_aware_range)
     flip_cmap = dict_utils.get(kwargs, 'reverse_cmap', False)
     # A callable bound is resolved against the scores being coloured.
     bound = bound(score) if callable(bound) else bound
     self.color = self.default_color_map_scalar(score, bound, flip_cmap)
     self.filled = True
Пример #3
0
 def get_graph_info(tree, tree_dump, tree_iter, kwargs):
     """
     Assemble the graph metadata dictionary from the extractor ``kwargs``.

     :param tree: unused here
     :param tree_dump: unused here
     :param tree_iter: unused here
     :param kwargs: must contain ``objective``, ``n_class`` and ``n_features``;
         ``features_name`` and ``class_name`` are read through ``dict_utils.get``
     :return: graph info dict flagged as fitted, with both score keys unset
     """
     features_name = dict_utils.get(kwargs, 'features_name')
     class_name = dict_utils.get(kwargs, 'class_name')
     return dict(
         _fitted=True,
         objective=kwargs['objective'],
         n_class=kwargs['n_class'],
         class_name=class_name,
         n_features=kwargs['n_features'],
         features_name=features_name,
         score_data=dict(pred_score_key=None, color_score_key=None),
     )
Пример #4
0
 def has_split(df, node=None):
     """
     Tell whether a node (or every node) carries split information.

     :param df: graph data frame with a ``data`` column
     :param node: node label to inspect; ``None`` inspects every row
     :return: for a single ``node``, a bool that is False when its feature data
         is missing or reports ``is_none()``; otherwise a Series with one
         boolean per row, true where the feature data reports ``is_split()``
     """
     if node is not None:
         split = dict_utils.get(df.at[node, 'data'], key=FEATURE_DATA_KEY)
         # Answer for this node only; previously a node that did have a split
         # fell through and got the whole-frame Series back instead of a bool.
         return not (split is None or split.is_none())
     splits = BiGraphDF.get_data(df, keys=FEATURE_DATA_KEY)
     return splits.apply(lambda x: x is not None and x.is_split())
Пример #5
0
 def compare_fit(cls, graph, X_test, y_test, **kwargs):
     """
     Score an already fitted graph against new data.

     :param graph: fitted graph; its current scores and stored population
         field are passed to ``cls._fit`` as the reference to compare against
     :param X_test: data forwarded to ``cls._fit``
     :param y_test: targets forwarded to ``cls._fit``
     :param kwargs: forwarded to ``cls._fit``; ``score_handler`` defaults to
         ``_default_compare_score_handler``
     :return: result of ``graph.update_graph(..., inplace=False)`` carrying the
         new scores and score metadata
     :raises NotFittedError: if ``graph.is_fitted()`` is false
     """
     score_handler = dict_utils.get(kwargs, 'score_handler',
                                    _default_compare_score_handler)
     if not graph.is_fitted():
         raise NotFittedError
     try:
         population_cal_field = graph.score_data[
             graph.score_data['population_cal_field_key']]
     except (AttributeError, LookupError, TypeError):
         # Best effort: a graph without stored population data is compared
         # against an empty mapping. Only lookup-style failures are absorbed,
         # so KeyboardInterrupt / SystemExit are no longer swallowed.
         population_cal_field = {}
     kwargs['fit_population_cal_field'] = population_cal_field
     kwargs['fit_score_dict'] = graph.get_score().to_dict()
     kwargs['score_handler'] = score_handler
     actual_population_cal_field, scores = cls._fit(graph, X_test, y_test,
                                                    **kwargs)
     score_data = {
         'pred_score_key': score_handler.pred_score_key,
         'color_score_key': score_handler.color_score_key,
         'population_cal_field_key': score_handler.population_cal_field_key,
         score_handler.population_cal_field_key: population_cal_field,
         'actual_population_cal_field_key': actual_population_cal_field
     }
     return graph.update_graph(score=scores,
                               new_graph_info={'score_data': score_data},
                               inplace=False)
Пример #6
0
    def to_networkx(self, **kwargs):
        """
        Build a ``networkx.DiGraph`` of this graph, down to ``max_depth``.

        :param kwargs: also forwarded to the helper class constructor
            max_depth: deepest level to include (default: unlimited)
            nx_helper: helper class providing ``filled``, ``color``, ``labels``
                and ``graph_info`` (default ``NetworkxHelper``)
        :return: the populated ``nx.DiGraph``
        :raises ValueError: if ``max_depth`` is negative
        """
        # ``np.Inf`` was removed in NumPy 2.0; ``np.inf`` exists in every version.
        max_depth = dict_utils.get(kwargs, 'max_depth', np.inf)
        NetworkxHelperClass = dict_utils.get(kwargs, 'nx_helper',
                                             NetworkxHelper)

        if max_depth < 0:
            raise ValueError("max depth should be non negative")

        nx_helper = NetworkxHelperClass(self, **kwargs)

        graph = nx.DiGraph(node={
            'color': 'black',
            'fontname': 'helvetica',
            'shape': 'box',
            # 'filled, ' is only prepended when the helper asks for filled nodes.
            'style': 'filled, ' * nx_helper.filled + 'rounded'
        },
                           edge={'fontname': 'helvetica'},
                           graph_info=nx_helper.graph_info)

        def add_node(node, parent, depth):
            if depth > max_depth:
                return
            if nx_helper.filled and (not pd.isnull(nx_helper.color[node])):
                graph.add_node(node,
                               type=self.df.at[node, 'type'],
                               data=self.df.at[node, 'data'],
                               label=nx_helper.labels[node],
                               fillcolor=nx_helper.color[node])
            else:
                # No usable colour for this node: leave the fill colour unset.
                graph.add_node(node,
                               type=self.df.at[node, 'type'],
                               data=self.df.at[node, 'data'],
                               label=nx_helper.labels[node])
            if depth > 1:
                graph.add_edge(parent, node)
            elif depth == 1:
                # Only the edges leaving the root are labelled, with whether
                # the child's type equals -1.
                graph.add_edge(parent,
                               node,
                               label=self.df.at[node, 'type'] == -1)
            if self.n_child(node) == 2:
                add_node(self.df.at[node, 'child_left'], node, depth + 1)
                add_node(self.df.at[node, 'child_right'], node, depth + 1)

        add_node(0, ROOT_PARENT, 0)
        return graph
Пример #7
0
def default_compare_score_fn(graph, node, y, population_cal_field, score_dict,
                             fit_population_cal_field, fit_score_dict,
                             **kwargs):
    """
    Pair the score stored at fit time for ``node`` with the mean of ``y``.

    :param graph: graph whose ``score_data['pred_score_key']`` names the score
    :param node: key into ``fit_score_dict``
    :param y: target values; their mean is reported
    :param fit_score_dict: per-node score mappings recorded at fit time
    :return: dict with ``train_y`` (stored score, NaN as the lookup default)
        and ``actual_y`` (``np.mean(y)``)

    The remaining parameters are accepted but not used here.
    """
    score_key = graph.score_data['pred_score_key']
    fitted_value = dict_utils.get(fit_score_dict[node], score_key, np.nan)
    observed_mean = np.mean(y)
    return {'train_y': fitted_value, 'actual_y': observed_mean}
Пример #8
0
    def get_child(tree_iter, node, kwargs):
        """
        Return the two children of ``node``, left child first.

        :param tree_iter: graph exposing ``edges(node)`` and ``nodes[...]``
        :param node: node whose outgoing edges are inspected
        :param kwargs: unused here
        :return: ``None`` when ``node`` has no edges, otherwise
            ``(tree_iter, left, tree_iter, right)``
        :raises ValueError: if ``node`` does not have exactly two edges or an
            edge does not start at ``node``
        """
        out_edges = tree_iter.edges(node)
        edge_count = len(out_edges)
        if edge_count == 0:
            return None
        if edge_count != 2:
            raise ValueError('Invalid BiGraph')

        first_edge, second_edge = out_edges
        source_a, child_a = first_edge
        source_b, child_b = second_edge
        if any(source != node for source in (source_a, source_b)):
            raise ValueError('Invalid BiGraph')

        kind_a = dict_utils.get(tree_iter.nodes[child_a], 'type')
        kind_b = dict_utils.get(tree_iter.nodes[child_b], 'type')
        if kind_a not in [-1, 1]:
            kind_a = None

        # The first child is not the left one when it is typed 1, or when it is
        # untyped while the second child is typed -1.
        if kind_a == 1:
            swap = True
        else:
            swap = kind_a != -1 and kind_b == -1
        left, right = (child_b, child_a) if swap else (child_a, child_b)
        return tree_iter, left, tree_iter, right
Пример #9
0
    def __init__(self, graph, **kwargs):
        """
        Collect display settings from ``kwargs`` and the graph, then run
        ``execute_function``.

        :param graph: graph providing ``get_graph_info(exclude_df=True)``
        :param kwargs: optional overrides, also forwarded to ``execute_function``
            show_id, decimals (default 4): stored as given
            features_name, pred_score_key, color_score_key, objective, n_class,
                _fitted: resolved from ``kwargs`` and the graph info via
                ``dict_utils.get_first``
            keys: default for both ``data_keys`` and ``score_keys``
            data_keys, score_keys: keys to keep; ``FEATURE_DATA_KEY`` is always
                added to the data keys and a non-None ``pred_score_key`` to the
                score keys
        """
        self.show_id = dict_utils.get(kwargs, 'show_id', None)
        self.decimals = dict_utils.get(kwargs, 'decimals', 4)

        info = graph.get_graph_info(exclude_df=True)
        score_meta = dict_utils.get(info, 'score_data')
        self.graph_info = info

        def resolve(fallback, key):
            # kwargs are listed ahead of the graph-provided fallback.
            return dict_utils.get_first([kwargs, fallback], key)

        self.features_name = resolve(info, 'features_name')
        self.pred_score_key = resolve(score_meta, 'pred_score_key')
        self.color_score_key = resolve(score_meta, 'color_score_key')
        self.objective = resolve(info, 'objective')
        self.n_class = resolve(info, 'n_class')
        self.fitted = resolve(info, '_fitted')

        shared_keys = dict_utils.get(kwargs, 'keys', None)
        data_keys = OrderedSet(dict_utils.get(kwargs, 'data_keys', shared_keys))
        score_keys = OrderedSet(dict_utils.get(kwargs, 'score_keys', shared_keys))
        if FEATURE_DATA_KEY not in data_keys:
            # The feature data key goes in front of the requested keys.
            data_keys = OrderedSet((FEATURE_DATA_KEY,)) | data_keys

        pred_key = self.pred_score_key
        if pred_key is not None and pred_key not in score_keys:
            score_keys = score_keys | OrderedSet((pred_key,))

        self.data_keys = data_keys
        self.score_keys = score_keys

        self.filled = False
        self.color = None
        self.labels = None
        self.execute_function(graph, **kwargs)
Пример #10
0
 def from_custom_extractor(extractor, tree, **kwargs):
     """
     Build a BiGraph from the output of a custom extractor.

     :param extractor: object whose ``extract_graph(tree, kwargs=...)`` returns
         ``(df, graph_info)``
     :param tree: first argument handed to the extractor
     :param kwargs: extra options for the extractor; these keys are also passed
         to the BiGraph constructor: verify (default False), reindex (default
         False), copy_df (default False), copy_graph (default False), copy
         (default None)
     :return: BiGraph
     """
     option_defaults = (
         ('verify', False),
         ('reindex', False),
         ('copy_df', False),
         ('copy_graph', False),
         ('copy', None),
     )
     # Read the constructor options before the extractor runs, since the
     # extractor receives the very same kwargs dict.
     init_options = {
         name: dict_utils.get(kwargs, name, fallback)
         for name, fallback in option_defaults
     }
     df, graph_info = extractor.extract_graph(tree, kwargs=kwargs)
     return BiGraph(df=df, graph_info=graph_info, **init_options)
Пример #11
0
    def extract_graph(cls, tree, kwargs=None):
        """
        Walk ``tree`` and flatten it into a graph data frame plus graph info.

        Rows are numbered in pre-order: a node's two children get consecutive
        ids as soon as the node is expanded, then the left subtree is expanded
        before the right one. The walk uses an explicit stack, so the depth of
        the tree is not limited by the interpreter's recursion limit.

        :param tree: model handed to the ``cls`` hooks (``verify``,
            ``set_defaults``, ``get_dump``, ``get_graph_info``)
        :param kwargs: options dict shared with every hook (default: new empty
            dict); ``cls.set_defaults`` receives it and may fill it in
        :return: ``(df, graph_info)`` where ``df`` has the ``GRAPH_COL`` columns
        """
        kwargs = kwargs if kwargs is not None else {}
        cls.verify(tree, kwargs)
        cls.set_defaults(tree, kwargs)

        tree_dump = cls.get_dump(tree, kwargs)
        tree_iter, root_node = cls.get_iterator(tree_dump, kwargs)
        root_is_leaf = not cls.has_child(tree_iter, root_node, kwargs)
        root_data = cls.get_data(tree_iter, root_node, is_leaf=root_is_leaf, kwargs=kwargs)
        root_score = cls.get_score(tree_iter, root_node, is_leaf=root_is_leaf, kwargs=kwargs)
        root = dict(zip(GRAPH_COL, [0, ROOT_PARENT, TREE_LEAF, TREE_LEAF, TYPE_ROOT, root_data, root_score]))
        row_list = [root]

        # Each entry: (iterator, row id of the node, tree node id, depth of its children).
        pending = [(tree_iter, 0, root_node, 1)]
        while pending:
            node_iter, parent_id, parent_node, depth = pending.pop()
            if not cls.has_child(node_iter, parent_node, kwargs):
                continue

            left_tree_iter, left_node, right_tree_iter, right_node = cls.get_child(node_iter, parent_node, kwargs)
            left_is_leaf = not cls.has_child(left_tree_iter, left_node, kwargs)
            right_is_leaf = not cls.has_child(right_tree_iter, right_node, kwargs)

            left_data = cls.get_data(left_tree_iter, left_node, is_leaf=left_is_leaf, kwargs=kwargs)
            left_score = cls.get_score(left_tree_iter, left_node, is_leaf=left_is_leaf, kwargs=kwargs)
            right_data = cls.get_data(right_tree_iter, right_node, is_leaf=right_is_leaf, kwargs=kwargs)
            right_score = cls.get_score(right_tree_iter, right_node, is_leaf=right_is_leaf, kwargs=kwargs)

            left = dict(zip(GRAPH_COL, [depth, parent_id, TREE_LEAF, TREE_LEAF, TYPE_LEFT, left_data, left_score]))
            right = dict(zip(GRAPH_COL, [depth, parent_id, TREE_LEAF, TREE_LEAF, TYPE_RIGHT, right_data, right_score]))

            left_id = len(row_list)
            right_id = left_id + 1

            row_list.append(left)
            row_list.append(right)
            # The parent row now points at its freshly numbered children.
            row_list[parent_id].update({'child_left': left_id,
                                        'child_right': right_id})

            # Push right first so the left subtree is expanded first.
            pending.append((right_tree_iter, right_id, right_node, depth + 1))
            pending.append((left_tree_iter, left_id, left_node, depth + 1))

        graph_info = cls.get_graph_info(tree, tree_dump, tree_iter, kwargs)
        graph_info = graph_info if graph_info is not None else {}
        df = pd.DataFrame(row_list, columns=GRAPH_COL)
        return df, graph_info
Пример #12
0
 def get_graph_info(tree, tree_dump, tree_iter, kwargs):
     """
     Assemble the graph metadata dictionary from a fitted estimator.

     :param tree: fitted estimator; ``n_classes_`` and ``classes_`` are read for
         classification, and the feature count is read from it in every case
     :param tree_dump: unused here
     :param tree_iter: unused here
     :param kwargs: must contain ``objective``, ``pred_score_key`` and
         ``color_score_key``; ``features_name`` is optional
     :return: graph info dict flagged as fitted
     """
     if kwargs['objective'] == 'classification':
         n_class = tree.n_classes_
         class_name = tree.classes_
     else:
         n_class = None
         class_name = None
     # Assuming ``tree`` is a scikit-learn estimator: ``n_features_`` was
     # deprecated in 1.0 and removed in 1.2 in favour of ``n_features_in_``.
     # Prefer the new attribute and fall back to the old one.
     n_features = getattr(tree, 'n_features_in_', None)
     if n_features is None:
         n_features = tree.n_features_
     features_name = dict_utils.get(kwargs, 'features_name')
     graph_info = {'_fitted': True,
                   'objective': kwargs['objective'], 'n_class': n_class, 'class_name': class_name,
                   'n_features': n_features, 'features_name': features_name,
                   'score_data': {'pred_score_key': kwargs['pred_score_key'],
                                  'color_score_key': kwargs['color_score_key']}}
     return graph_info
Пример #13
0
 def get_score(df,
               keys=None,
               default=None,
               order_dict=False,
               missing='ignore'):
     """
     Read values out of the ``score`` column of ``df``.

     :param df: data frame with a ``score`` column of per-row mappings
     :param keys: ``None`` returns the column untouched; a single key returns
         that entry of every row; several keys return a sub-mapping per row
     :param default: value used by the lookups for a missing key
     :param order_dict: forwarded to ``dict_utils.subset_dict``
     :param missing: forwarded to ``dict_utils.subset_dict``
     :return: Series with one element per row
     """
     if keys is None:
         return df['score']
     keys = utils.flatten_list(keys)
     if len(keys) == 1:
         series = df['score'].apply(
             lambda x: dict_utils.get(x, keys[0], default))
     else:
         series = df['score'].apply(lambda x: dict_utils.subset_dict(
             x, keys, default, order_dict, missing))
         # A Series name must be hashable, so name the result after the keys
         # as a tuple; an unhashable list makes ``rename`` raise TypeError.
         series.rename(tuple(keys), inplace=True)
     return series
Пример #14
0
 def fit(cls, graph, X, y, **kwargs):
     """
     Fit scores for ``graph`` on ``(X, y)`` and return the updated graph.

     :param graph: graph to score
     :param X: data forwarded to ``cls._fit``
     :param y: targets forwarded to ``cls._fit``
     :param kwargs: forwarded to ``cls._fit``; ``score_handler`` defaults to
         ``_default_fit_score_handler``
     :return: result of ``graph.update_graph(..., inplace=False)`` with the new
         scores, the score metadata and both fitted flags set
     """
     handler = dict_utils.get(kwargs, 'score_handler',
                              _default_fit_score_handler)
     kwargs['score_handler'] = handler
     population_cal_field, scores = cls._fit(graph, X, y, **kwargs)

     score_data = {}
     score_data['pred_score_key'] = handler.pred_score_key
     score_data['color_score_key'] = handler.color_score_key
     score_data['population_cal_field_key'] = handler.population_cal_field_key
     # The population field is stored under the key the handler names.
     score_data[handler.population_cal_field_key] = population_cal_field

     graph_info_patch = {
         'score_data': score_data,
         '_fitted': True,
         '_internally_fitted': True,
     }
     return graph.update_graph(score=scores,
                               new_graph_info=graph_info_patch,
                               inplace=False)
Пример #15
0
 def compare_fit(self, X, y, **kwargs):
     """
     Delegate to the data helper's ``compare_fit`` for this graph.

     :param X: data forwarded to the helper
     :param y: targets forwarded to the helper
     :param kwargs: forwarded unchanged; ``data_helper`` selects the helper
         (default ``DataHelper``)
     :return: whatever the helper's ``compare_fit`` returns
     """
     helper = dict_utils.get(kwargs, 'data_helper', DataHelper)
     outcome = helper.compare_fit(self, X, y, **kwargs)
     return outcome
Пример #16
0
 def predict(self, X, **kwargs):
     """
     Delegate to the data helper's ``predict`` for this graph.

     :param X: data forwarded to the helper
     :param kwargs: forwarded unchanged; ``data_helper`` selects the helper
         (default ``DataHelper``)
     :return: whatever the helper's ``predict`` returns
     """
     helper = dict_utils.get(kwargs, 'data_helper', DataHelper)
     prediction = helper.predict(self, X, **kwargs)
     return prediction
Пример #17
0
 def set_defaults(tree, kwargs):
     """
     Store ``kwargs['tree_index']`` as an int, in place.

     :param tree: unused here
     :param kwargs: options dict, modified in place; 0 is the lookup default
     """
     raw_index = dict_utils.get(kwargs, 'tree_index', 0)
     kwargs['tree_index'] = int(raw_index)
Пример #18
0
 def get_score(tree_iter, node, is_leaf, kwargs):
     """
     Look up the ``'score'`` entry of ``node``'s attributes.

     :param tree_iter: graph exposing ``nodes[node]``
     :param node: node to read
     :param is_leaf: unused here
     :param kwargs: unused here
     :return: result of ``dict_utils.get`` for the ``'score'`` key
     """
     node_attrs = tree_iter.nodes[node]
     return dict_utils.get(node_attrs, 'score')
Пример #19
0
 def get_data(tree_iter, node, is_leaf, kwargs):
     """
     Look up the ``'data'`` entry of ``node``'s attributes.

     :param tree_iter: graph exposing ``nodes[node]``
     :param node: node to read
     :param is_leaf: unused here
     :param kwargs: unused here
     :return: result of ``dict_utils.get`` for the ``'data'`` key
     """
     node_attrs = tree_iter.nodes[node]
     return dict_utils.get(node_attrs, 'data')
Пример #20
0
 def set_defaults(tree, kwargs):
     """
     Write a ``'root_id'`` entry into ``kwargs``, in place.

     :param tree: unused here
     :param kwargs: options dict, modified in place; ``None`` is the lookup
         default
     """
     root_id = dict_utils.get(kwargs, 'root_id', None)
     kwargs['root_id'] = root_id