示例#1
0
    def _build_metafeatures(self, X, y):
        """
        Build the meta-features associated with a particular node.

        These are various features that can be used in training and prediction time,
        e.g the number of training samples available for the classifier trained at that node,
        the number of targets (classes) to be predicted at that node, etc.

        Parameters
        ----------
        X : (sparse) array-like, shape = [n_samples, n_features]
            The training data matrix at current node.

        Returns
        -------
        metafeatures : dict
            Python dictionary of meta-features. The following meta-features are computed by default:
            * 'n_samples' - Number of samples used to train classifier at given node.
            * 'n_targets' - Number of targets (classes) to classify into at given node.

        """
        # Indices of non-zero rows in X, i.e rows corresponding to relevant samples for this node.
        ix = nnz_rows_ix(X)

        return dict(
            n_samples=len(ix),
            n_targets=len(np.unique(y[ix])),
        )
示例#2
0
    def _train_local_classifier(self, X, y, node_id):
        if self.graph_.out_degree(node_id) == 0:
            # Leaf node
            if self.algorithm == "lcpn":
                # Leaf nodes do not get a classifier assigned in LCPN algorithm mode.
                self.logger.debug(
                    "_train_local_classifier() - skipping leaf node %s when algorithm is 'lcpn'",
                    node_id,
                )
                return

        X = self.graph_.node[node_id]["X"]
        nnz_rows = nnz_rows_ix(X)
        X_ = X[nnz_rows, :]

        y_rolled_up = rollup_nodes(
            graph=self.graph_,
            source=node_id,
            targets=[y[idx] for idx in nnz_rows],
        )

        if self.is_tree_:
            y_ = flatten_list(y_rolled_up)
        else:
            # Class hierarchy graph is a DAG
            X_, y_ = apply_rollup_Xy(X_, y_rolled_up)

        num_targets = len(np.unique(y_))

        self.logger.debug(
            "_train_local_classifier() - Training local classifier for node: %s, X_.shape: %s, len(y): %s, n_targets: %s",  # noqa:E501
            node_id,
            X_.shape,
            len(y_),
            num_targets,
        )

        if X_.shape[0] == 0:
            # No training data could be materialized for current node
            # TODO: support a 'strict' mode flag to explicitly enable/disable fallback logic here?
            self.logger.warning(
                "_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node %s",  # noqa:E501
                node_id,
            )
            return
        elif num_targets == 1:
            # Training data could be materialized for only a single target at current node
            # TODO: support a 'strict' mode flag to explicitly enable/disable fallback logic here?
            constant = y_[0]
            self.logger.debug(
                "_train_local_classifier() - only a single target (child node) available to train classifier for node %s, Will trivially predict %s",  # noqa:E501
                node_id,
                constant,
            )

            clf = DummyClassifier(strategy="constant", constant=constant)
        else:
            clf = self._base_estimator_for(node_id)

        clf.fit(X=X_, y=y_)
        self.graph_.node[node_id][CLASSIFIER] = clf