import numpy as np


class GBDTClassificationScratch(GBDTScratch):
    def __init__(self, n_estimator=10, learning_rate=0.01, min_sample=2,
                 min_gain=0.1, max_depth=10):
        """
        min_sample: stop splitting a node once it holds fewer than min_sample samples
        min_gain: stop splitting if the gain of the split does not exceed this value
                  (for a classification tree the Gini index must drop enough,
                   for a regression tree the squared error must drop enough)
        max_depth: maximum depth of each tree
        """
        super(GBDTClassificationScratch, self).__init__(n_estimator, learning_rate)
        self._min_sample = min_sample
        self._min_gain = min_gain
        self._max_depth = max_depth
        # the loss for the classification trees is the cross-entropy loss
        self._loss = CrossEntropyLoss()

    def fit(self, X, y):
        """Train the model."""
        # one-hot encode the input labels first
        y = self._to_one_hot(y)
        n_sample, self._n_class = y.shape
        # initialize the residual prediction with the per-class mean
        residual_pred = np.full_like(y, np.mean(y, axis=0))
        for _ in range(self._n_estimator):
            label_trees = []
            residual_update = np.zeros_like(residual_pred)
            # fit one regression tree per class
            for j in range(self._n_class):
                residual_gradient = self._loss.calc_gradient(
                    y[:, j], residual_pred[:, j])
                tree = CARTRegressionScratch(self._min_sample,
                                             self._min_gain, self._max_depth)
                # each tree is trained with the residual (gradient) as its target
                tree.fit(X, residual_gradient)
                label_trees.append(tree)
                for i in range(n_sample):
                    residual_update[i, j] = tree.predict(X[i])
            self._trees.append(label_trees)
            residual_pred -= self._lr * residual_update

    def predict(self, x):
        """Predict the output for a single input sample."""
        y_pred = np.zeros(self._n_class)
        for label_trees in self._trees:
            for i in range(len(label_trees)):
                residual_update = label_trees[i].predict(x)
                y_pred[i] -= self._lr * residual_update
        # return the class with the largest score; the exponentiation (softmax)
        # is omitted because it does not change the argmax
        return np.argmax(y_pred)

    def _to_one_hot(self, y):
        """One-hot encode discrete labels."""
        n_col = np.amax(y) + 1
        one_hot = np.zeros((y.shape[0], n_col))
        # set the column of each sample's class to 1
        one_hot[np.arange(y.shape[0]), y] = 1
        return one_hot
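
# ----------------------------------------------------------------------
# Minimal usage sketch (not part of the original implementation). It
# assumes GBDTScratch, CARTRegressionScratch and CrossEntropyLoss from
# this repo are available in scope, and uses scikit-learn only to load a
# toy dataset; the hyperparameter values below are illustrative.
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    model = GBDTClassificationScratch(n_estimator=10, learning_rate=0.5,
                                      min_sample=2, min_gain=0.0, max_depth=3)
    model.fit(X_train, y_train)

    # predict() handles one sample at a time, so iterate over the test set
    y_pred = np.array([model.predict(x) for x in X_test])
    print("accuracy:", np.mean(y_pred == y_test))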