Example #1
def test_init_raw_predictions_shapes():
    # Make sure get_init_raw_predictions returns float64 arrays with shape
    # (n_samples, K) where K is 1 for binary classification and regression, and
    # K = n_classes for multiclass classification
    rng = np.random.RandomState(0)

    n_samples = 100
    X = rng.normal(size=(n_samples, 5))
    y = rng.normal(size=n_samples)
    for loss in (LeastSquaresError(n_classes=1),
                 LeastAbsoluteError(n_classes=1),
                 QuantileLossFunction(n_classes=1),
                 HuberLossFunction(n_classes=1)):
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        assert raw_predictions.shape == (n_samples, 1)
        assert raw_predictions.dtype == np.float64

    y = rng.randint(0, 2, size=n_samples)
    for loss in (BinomialDeviance(n_classes=2), ExponentialLoss(n_classes=2)):
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        assert raw_predictions.shape == (n_samples, 1)
        assert raw_predictions.dtype == np.float64

    for n_classes in range(3, 5):
        y = rng.randint(0, n_classes, size=n_samples)
        loss = MultinomialDeviance(n_classes=n_classes)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(y, init_estimator)
        assert raw_predictions.shape == (n_samples, n_classes)
        assert raw_predictions.dtype == np.float64
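

# An added illustrative check (not part of the original test above), assuming the
# same private sklearn `_gb_losses` API: for LeastSquaresError the init estimator
# is a mean predictor, so the initial raw predictions are just the mean of y
# broadcast to shape (n_samples, 1).
def check_least_squares_init_raw_predictions_values():
    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 5))
    y = rng.normal(size=100)
    loss = LeastSquaresError(n_classes=1)
    init_estimator = loss.init_estimator().fit(X, y)
    raw_predictions = loss.get_init_raw_predictions(X, init_estimator)
    # Every row holds the same constant: the mean of the training targets.
    assert np.allclose(raw_predictions, y.mean())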
Example #3
class GradientBoostingEnsemble:
    """Implement gradient boosting ensemble of trees.

  Attributes:
    nb_trees (int): The total number of trees in the model.
    nb_trees_per_ensemble (int): The number of trees in each ensemble.
    max_depth (int): The depth for the trees.
    privacy_budget (float): The privacy budget available for the model.
    learning_rate (float): The learning rate.
    l2_threshold (float): Threshold for the loss function. For the square loss
        function (default), this is 1.0.
    l2_lambda (float): Regularization parameter for l2 loss function.
        For the square loss function (default), this is 0.1.
    trees (List[List[DifferentiallyPrivateTree]]): A list of k per-class DP
        trees for each boosting round.
  """

    # pylint: disable=invalid-name, too-many-arguments, unused-variable

    def __init__(self,
                 nb_trees: int,
                 nb_trees_per_ensemble: int,
                 n_classes: Optional[int] = None,
                 max_depth: int = 6,
                 privacy_budget: float = 1.0,
                 learning_rate: float = 0.1,
                 max_leaves: Optional[int] = None,
                 min_samples_split: int = 2,
                 balance_partition: bool = True,
                 use_bfs: bool = False,
                 use_3_trees: bool = False,
                 cat_idx: Optional[List[int]] = None,
                 num_idx: Optional[List[int]] = None) -> None:
        """Initialize the GradientBoostingEnsemble class.

    Args:
      nb_trees (int): The total number of trees in the model.
      nb_trees_per_ensemble (int): The number of trees in each ensemble.
      n_classes (int): Optional. Number of classes for classification. If None
          (default), the model performs regression.
      max_depth (int): Optional. The depth for the trees. Default is 6.
      privacy_budget (float): Optional. The privacy budget available for the
          model. Default is 1.0.
      learning_rate (float): Optional. The learning rate. Default is 0.1.
      max_leaves (int): Optional. The max number of leaf nodes for the trees.
          Tree will grow in a best-leaf first fashion until it contains
          max_leaves or until it reaches maximum depth, whichever comes first.
      min_samples_split (int): Optional. The minimum number of samples required
          to split an internal node. Default is 2.
      balance_partition (bool): Optional. Balance the data partition used for
          training the trees. The default is True, meaning all trees within an
          ensemble will receive an equal amount of training samples. If set to
          False, each tree will receive <x> samples where <x> is given by
          line 8 of Algorithm 2 in the authors' paper.
      use_bfs (bool): Optional. If max_leaves is specified, then this is
          automatically True. This will build the tree in a BFS fashion instead
          of DFS. Default is False.
      use_3_trees (bool): Optional. If True, only build trees that have 3
          nodes, and then assemble nb_trees based on these sub-trees, at random.
          Default is False.
      cat_idx (List): Optional. List of indices for categorical features.
      num_idx (List): Optional. List of indices for numerical features.
      """
        self.nb_trees = nb_trees
        self.nb_trees_per_ensemble = nb_trees_per_ensemble
        self.max_depth = max_depth
        self.privacy_budget = privacy_budget
        self.learning_rate = learning_rate
        self.max_leaves = max_leaves
        self.min_samples_split = min_samples_split
        self.balance_partition = balance_partition
        self.use_bfs = use_bfs
        self.use_3_trees = use_3_trees
        self.cat_idx = cat_idx
        self.num_idx = num_idx
        self.trees = []  # type: List[List[DifferentiallyPrivateTree]]
        # classification vs regression
        self.loss_ = MultinomialDeviance(
            n_classes) if n_classes else LeastSquaresError(
                1)  # type: LossFunction
        self.init_ = self.loss_.init_estimator()

        # Loss parameters
        self.l2_threshold = 1.0
        self.l2_lambda = 0.1

        # Initial score
        self.init_score = None

        if self.use_3_trees and self.use_bfs:
            # Since we're building 3-node trees, BFS and DFS are equivalent anyway.
            self.use_bfs = False

    def Train(self, X: np.array, y: np.array) -> 'GradientBoostingEnsemble':
        """Train the ensembles of gradient boosted trees.

    Args:
      X (np.array): The features.
      y (np.array): The labels.

    Returns:
      GradientBoostingEnsemble: A GradientBoostingEnsemble object.
    """

        # Init gradients
        self.init_.fit(X, y)
        self.init_score = self.loss_.get_init_raw_predictions(
            X, self.init_)  # (n_samples, K)
        update_gradients = True

        X_train, X_test, y_train, y_test = train_test_split(X, y)
        X, y = X_train, y_train

        # Number of ensembles in the model
        nb_ensembles = int(np.ceil(self.nb_trees / self.nb_trees_per_ensemble))

        # Privacy budget allocated to all trees in each ensemble
        tree_privacy_budget = np.divide(self.privacy_budget, nb_ensembles)
        # In multi-class classification, each of the K per-class trees gets the
        # same budget as a single tree would, but halved: since every datapoint
        # belongs to exactly one class, only membership in the considered class
        # matters (not which other class it belongs to). The divisor therefore
        # stays at 2, independently of how many classes exist.
        privacy_budget_per_tree = 2 if self.loss_.is_multi_class else 1
        tree_privacy_budget = np.divide(tree_privacy_budget,
                                        privacy_budget_per_tree)
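        # Worked example of the budget split above (values are illustrative):
        # with privacy_budget=1.0, nb_trees=50 and nb_trees_per_ensemble=25,
        # nb_ensembles = 2, so each tree gets 1.0 / 2 = 0.5; in the multi-class
        # case this is further halved to 0.25 per class-tree.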

        prev_score = np.inf

        # Train all trees
        for tree_index in range(self.nb_trees):
            # Compute sensitivity
            delta_g = 3 * np.square(self.l2_threshold)
            delta_v = min(
                self.l2_threshold / (1 + self.l2_lambda),
                2 * self.l2_threshold * math.pow(
                    (1 - self.learning_rate), tree_index))
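            # With the defaults (l2_threshold=1.0, l2_lambda=0.1, learning_rate=0.1)
            # this gives delta_g = 3 and delta_v = min(1 / 1.1, 2 * 0.9**tree_index),
            # so the bound used later to scale the leaf noise decays geometrically
            # with the tree index.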

            current_tree_for_ensemble = tree_index % self.nb_trees_per_ensemble
            if current_tree_for_ensemble == 0:
                # Initialize the dataset and the gradients
                X_ensemble = np.copy(X)
                y_ensemble = np.copy(y)
                prev_score = np.inf
                update_gradients = True
                # Gradient initialization will happen later in the per-class loop.

            # Compute the number of rows that the current tree will use for training
            if self.balance_partition:
                # All trees will receive same amount of samples
                if self.nb_trees % self.nb_trees_per_ensemble == 0:
                    # Perfect split
                    number_of_rows = int(len(X) / self.nb_trees_per_ensemble)
                else:
                    # Partitioning data across ensembles
                    if np.ceil(tree_index /
                               self.nb_trees_per_ensemble) == np.ceil(
                                   self.nb_trees / self.nb_trees_per_ensemble):
                        number_of_rows = int(
                            len(X) /
                            (self.nb_trees % self.nb_trees_per_ensemble))
                    else:
                        number_of_rows = int(
                            len(X) / self.nb_trees_per_ensemble) + int(
                                len(X) /
                                (self.nb_trees % self.nb_trees_per_ensemble))
            else:
                # Line 8 of Algorithm 2 from the paper
                number_of_rows = int(
                    (len(X) * self.learning_rate * math.pow(
                        (1 - self.learning_rate), current_tree_for_ensemble)) /
                    (1 - math.pow(
                        (1 - self.learning_rate), self.nb_trees_per_ensemble)))
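                # In closed form: |X| * lr * (1 - lr)^t / (1 - (1 - lr)^T), where t
                # is the tree's position within its ensemble and T is
                # nb_trees_per_ensemble.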

            # If using the formula from the algorithm, some trees may not get
            # samples. In that case we skip the tree and issue a warning. This
            # should hint the user to change their parameters (likely the
            # ensembles are too unbalanced).
            if number_of_rows == 0:
                logger.warning(
                    'The choice of trees per ensemble vs. the total number '
                    'of trees is not balanced properly; some trees will '
                    'not get any training samples. Try using '
                    'balance_partition=True or change your parameters.')
                continue

            # Select <number_of_rows> rows at random from the ensemble dataset
            rows = np.random.randint(len(X_ensemble), size=number_of_rows)
            X_tree = X_ensemble[rows, :]
            y_tree = y_ensemble[rows]

            # Train a separate tree for each class on the same rows.
            # In regression or binary classification, K has been set to one.
            k_trees = []  # type: List[DifferentiallyPrivateTree]
            for kth_tree in range(self.loss_.K):
                if tree_index == 0:
                    # First tree: start from the initial raw scores of the init estimator
                    gradients = self.ComputeGradientForLossFunction(
                        y, self.init_score[:len(y)], kth_tree)
                else:
                    # Update gradients of all training instances on loss l
                    if update_gradients:
                        gradients = self.ComputeGradientForLossFunction(
                            y_ensemble, self.Predict(X_ensemble),
                            kth_tree)  # type: ignore

                assert gradients is not None
                gradients_tree = gradients[rows]

                # Gradient based data filtering
                norm_1_gradient = np.abs(gradients_tree)
                rows_gbf = norm_1_gradient <= self.l2_threshold
                X_tree = X_tree[rows_gbf, :]
                y_tree = y_tree[rows_gbf]
                gradients_tree = gradients_tree[rows_gbf]
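                # Instances whose |gradient| exceeds l2_threshold are dropped from
                # this tree's training set only; they stay in the ensemble pool and
                # can be used by later trees (see the np.delete call further down).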

                # Get back the original row index from the first filtering
                selected_rows = rows[rows_gbf] if tree_index > 0 else rows

                # Fit a differentially private decision tree
                tree = DifferentiallyPrivateTree(
                    tree_index,
                    self.learning_rate,
                    self.l2_threshold,
                    self.l2_lambda,
                    tree_privacy_budget,
                    delta_g,
                    delta_v,
                    self.loss_,
                    max_depth=self.max_depth,
                    max_leaves=self.max_leaves,
                    min_samples_split=self.min_samples_split,
                    use_bfs=self.use_bfs,
                    use_3_trees=self.use_3_trees,
                    cat_idx=self.cat_idx,
                    num_idx=self.num_idx)
                # in multi-class classification, the target has to be binary
                # as each tree is a per-class regressor
                y_target = ((y_tree == kth_tree).astype(np.float64)
                            if self.loss_.is_multi_class else y_tree)
                tree.Fit(X_tree, y_target, gradients_tree)

                # Add the tree to its corresponding ensemble
                k_trees.append(tree)
            self.trees.append(k_trees)

            score = self.loss_(y_test,
                               self.Predict(X_test))  # i.e. mse or deviance
            if score >= prev_score:
                # This tree doesn't improve the overall prediction quality;
                # remove it from the model.
                # Gradients are not reused in multi-class as they are class-dependent.
                update_gradients = self.loss_.is_multi_class
                self.trees.pop()
            else:
                logger.debug('Tree %d improved the validation score to %f',
                             tree_index, score)
                update_gradients = True
                prev_score = score
                # Remove the selected rows from the ensemble's dataset
                # The instances that were filtered out by GBF can still be used for the
                # training of the next trees
                X_ensemble = np.delete(X_ensemble, selected_rows, axis=0)
                y_ensemble = np.delete(y_ensemble, selected_rows)
        if self.use_3_trees:
            self.Combine_3_trees(self.trees)
        return self

    def Combine_3_trees(
            self, k_trees: List[List['DifferentiallyPrivateTree']]) -> None:
        """Combine 3-trees together to construct bigger decision trees.

    Args:
      k_trees (List[List[DifferentiallyPrivateTree]]): A k-list of 3-trees.
    """

        self.trees = []  # Re-init final predictions trees
        for index, k_three_tree in enumerate(
                k_trees):  # iterate through the ensemble
            k_trees_ = []  # type: List[DifferentiallyPrivateTree]
            for k, three_tree in enumerate(
                    k_three_tree):  # iterate through the classes
                # select a whole ensemble per class; continue as if there were only one class
                copy = list(np.copy([i[k] for i in k_trees]))
                copy.pop(index)
                if len(copy) == 0:
                    continue
                queue_children = Queue()  # type: Queue['DecisionNode']
                queue_children.put(
                    three_tree.root_node.left_child)  # type: ignore
                queue_children.put(
                    three_tree.root_node.right_child)  # type: ignore
                depth = 1
                privacy_budget_for_node = np.around(np.divide(
                    three_tree.privacy_budget / 2, three_tree.max_depth + 1),
                                                    decimals=7)
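                # Half of the tree's budget is spread uniformly over the
                # max_depth + 1 node levels (rounded to avoid float drift).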
                while not queue_children.empty():
                    if depth == self.max_depth or len(copy) == 0:
                        break
                    left_child = queue_children.get()
                    right_child = queue_children.get()
                    for child in [left_child, right_child]:
                        if len(copy) == 0 or not child or not child.X.any(
                        ):  # type: ignore
                            continue
                        # Apply exponential mechanism to find sub 3-node tree
                        probabilities = []
                        max_gain = -np.inf
                        for candidate_index, candidate in enumerate(copy):
                            if not candidate.root_node.X.any():
                                continue
                            # Compute distance between the two nodes. Lower is better.
                            gain = np.linalg.norm(
                                np.matmul(np.transpose(child.X), child.X) -
                                np.matmul(np.transpose(candidate.root_node.X),
                                          candidate.root_node.X))
                            exp_gain = (privacy_budget_for_node *
                                        gain) / (2. * three_tree.delta_g)
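                            # This has the form of the usual exponential-mechanism
                            # exponent, eps * u / (2 * delta_u); u is a distance here,
                            # so smaller scores are preferred (hence reverse=True below).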
                            if exp_gain > max_gain:
                                max_gain = exp_gain
                            prob = {
                                'candidate_index': candidate_index,
                                'index': candidate.root_node.index,
                                'value': candidate.root_node.value,
                                'gain': exp_gain
                            }
                            probabilities.append(prob)
                        candidate = ExponentialMechanism(probabilities,
                                                         max_gain,
                                                         reverse=True)
                        if not candidate or not candidate[
                                'index'] or not candidate['value']:
                            continue
                        copy.pop(candidate['candidate_index'])
                        split_index = candidate['index']
                        split_value = candidate['value']
                        left_, right_ = self.SplitNode(
                            child, split_index, split_value,
                            three_tree.privacy_budget, index,
                            three_tree.delta_v)
                        queue_children.put(left_)
                        queue_children.put(right_)
                    depth += 1
                k_trees_.append(three_tree)
            self.trees.append(k_trees_)
        if not self.trees or self.trees == [[]]:
            self.trees = k_trees

    def SplitNode(self, node: 'DecisionNode', index: int, value: float,
                  tree_privacy_budget: float, tree_index: int,
                  delta_v: float) -> Tuple['DecisionNode', 'DecisionNode']:
        """Split children of a 3-nodes tree based on the (index, value) pair.

    Args:
      node (DecisionNode): The node to split.
      index (int): The feature's index on which to split the node.
      value (float): The feature's value on which to split the node.
      tree_privacy_budget (float): The privacy budget for the current tree.
      tree_index (int): The index of the tree.
      delta_v (float): The loss function's sensitivity for the tree.

    Returns:
      Tuple: Children created after the split.
    """

        assert node.X is not None
        assert node.y is not None
        assert node.gradients is not None

        # Split indices of instances from the node's dataset
        lhs_op, rhs_op = self.GetOperators(index)
        lhs = np.where(lhs_op(node.X[:, index], value))[0]
        rhs = np.where(rhs_op(node.X[:, index], value))[0]

        # Compute the associated predictions
        lhs_prediction = ComputePredictions(node.gradients[lhs], node.y[lhs],
                                            self.loss_, self.l2_lambda)
        rhs_prediction = ComputePredictions(node.gradients[rhs], node.y[rhs],
                                            self.loss_, self.l2_lambda)

        # Mark current node as split node and not leaf node
        node.prediction = None
        node.index = index
        node.value = value

        # Add children to node
        node.left_child = DecisionNode(X=node.X[lhs],
                                       y=node.y[lhs],
                                       prediction=lhs_prediction,
                                       gradients=node.gradients[lhs])
        node.right_child = DecisionNode(X=node.X[rhs],
                                        y=node.y[rhs],
                                        prediction=rhs_prediction,
                                        gradients=node.gradients[rhs])

        # Apply geometric leaf clipping
        ClipLeaves([node.left_child, node.right_child], self.l2_threshold,
                   self.learning_rate, tree_index)

        # Add noise to the leaf predictions
        laplace_scale = delta_v / tree_privacy_budget / 2
        AddLaplacianNoise([node.left_child, node.right_child], laplace_scale)

        # Shrink by learning rate
        Shrink([node.left_child, node.right_child], self.learning_rate)

        return node.left_child, node.right_child

    def Predict(self, X: np.array) -> np.array:
        """Predict values from the ensemble of gradient boosted trees.

    See https://github.com/microsoft/LightGBM/issues/1778.

    Args:
      X (np.array): The dataset for which to predict values.

    Returns:
      np.array of shape (n_samples, K): The predictions.
    """
        # sum across the ensemble per class
        predictions = np.sum([[tree.Predict(X) for tree in k_trees]
                              for k_trees in self.trees],
                             axis=0).T
        assert self.init_score is not None
        init_score = self.init_score[:len(predictions)]
        return np.add(init_score, predictions)

    def PredictLabels(self, X: np.ndarray) -> np.ndarray:
        """Predict labels out of the raw prediction values of `Predict`.
    Only defined for classification tasks.

    Args:
      X (np.ndarray): The dataset for which to predict labels.

    Returns:
      np.ndarray: The label predictions.
    """
        if type(self.loss_) is not MultinomialDeviance:
            raise ValueError("Labels are not defined for regression tasks.")

        raw_predictions = self.Predict(X)
        encoded_labels = self.loss_._raw_prediction_to_decision(
            raw_predictions)
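        # For MultinomialDeviance this is effectively an argmax over the per-class
        # probabilities, i.e. integer class indices in [0, n_classes).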
        return encoded_labels

    def ComputeGradientForLossFunction(self, y: np.array, y_pred: np.array,
                                       k: int) -> np.array:
        """Compute the gradient of the loss function.

    Args:
      y (np.array): The true values.
      y_pred (np.array): The predictions.
      k (int): The class index.

    Returns:
      (np.array): The gradient of the loss function.
    """
        if self.loss_.is_multi_class:
            y = (y == k).astype(np.float64)
        # sklearn's implementation uses the negative gradient (i.e. y - F).
        # Here the positive gradient is used instead (e.g. y_pred - y for
        # least squares).
        return -self.loss_.negative_gradient(y, y_pred, k=k)

    def GetOperators(self, index: int) -> Tuple[Any, Any]:
        """Return operators to use to split a node's dataset.

    Args:
      index (int): The index for the feature to split the data on.

    Returns:
      Tuple[Any, Any]: The operators to use.
    """
        if self.cat_idx and index in self.cat_idx:
            # Categorical feature
            return operator.eq, operator.ne
        # Numerical feature
        return operator.lt, operator.ge
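

# A minimal usage sketch on synthetic regression data, assuming it lives in (or
# imports from) the module that defines GradientBoostingEnsemble above. The
# helper name and all hyperparameters are illustrative only.
def example_regression_run() -> np.array:
    rng = np.random.RandomState(0)
    X = rng.normal(size=(500, 4))
    y = X[:, 0] + 0.1 * rng.normal(size=500)
    # n_classes=None selects the regression loss (LeastSquaresError).
    ensemble = GradientBoostingEnsemble(nb_trees=20,
                                        nb_trees_per_ensemble=10,
                                        n_classes=None,
                                        max_depth=4,
                                        privacy_budget=1.0,
                                        learning_rate=0.1,
                                        cat_idx=[],
                                        num_idx=[0, 1, 2, 3])
    ensemble.Train(X, y)
    # Raw predictions have shape (n_samples, 1) for regression.
    return ensemble.Predict(X)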