import random
import logging
from time import time

import numpy as np
import pandas as pd

# Tree, the loss classes (LogisticLoss, SquareLoss, CustomLoss/CustomizeLoss),
# the samplers, and get_metric are defined elsewhere in this repo.


def __init__(self, loss='square', learning_rate=0.3, n_estimators=20,
             max_depth=6, subsample=0.8, colsample_bytree=0.8,
             colsample_bylevel=0.8, min_child_weight=1, reg_lambda=1.0,
             gamma=0, num_thread=-1):
    self.learning_rate = learning_rate
    self.n_estimators = n_estimators
    self.max_depth = max_depth
    self.subsample = subsample
    self.colsample_bytree = colsample_bytree
    self.colsample_bylevel = colsample_bylevel
    self.reg_lambda = reg_lambda
    self.gamma = gamma
    self.num_thread = num_thread
    self.min_child_weight = min_child_weight
    self.first_round_pred = 0.0
    self.trees = []
    self.eval_metric = None
    self._is_classifier = False

    # dispatch on the loss name; a callable is wrapped as a custom loss
    if loss == 'logistic':
        self.loss = LogisticLoss()
    elif loss == 'square':
        self.loss = SquareLoss()
    elif callable(loss):
        self.loss = CustomLoss(loss)
    else:
        raise ValueError('unsupported loss function: {0}'.format(loss))
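# The constructor above only dispatches on a loss name; the loss objects it
# references (LogisticLoss, SquareLoss, CustomLoss) live elsewhere in the
# repo. Below is a hedged sketch of the interface the boosting loop relies
# on (grad/hess for the second-order approximation, transform for turning
# raw scores into predictions). Names suffixed with "Sketch" are
# illustrative assumptions, not the repo's actual classes:

class SquareLossSketch(object):
    """Square loss L = 0.5 * (pred - label)^2, for regression."""

    def grad(self, pred, label):
        return pred - label            # dL/dpred

    def hess(self, pred, label):
        return np.ones_like(pred)      # d2L/dpred2 is the constant 1

    def transform(self, pred):
        return pred                    # raw scores are the predictions


class LogisticLossSketch(object):
    """Logistic loss on raw scores, labels in {0, 1}."""

    def transform(self, pred):
        return 1.0 / (1.0 + np.exp(-pred))   # sigmoid -> probability

    def grad(self, pred, label):
        return self.transform(pred) - label

    def hess(self, pred, label):
        p = self.transform(pred)
        return p * (1.0 - p)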
def fit(self, train_data, validation_data, early_stopping_rounds=np.inf,
        eval_metric=None, loss="logisticloss", eta=0.3, num_round=1000,
        max_depth=6, pool_size=1, min_instances_byleaf=1, scale_pos_weight=1,
        subsample=0.8, colsample_bytree=0.8, min_child_weight=1,
        reg_lambda=1.0, gamma=0):
    """
    :param train_data: Data object, train data
    :param validation_data: Data object, validation data
    :param eta: learning rate
    :param num_round: number of boosting rounds
    :param max_depth: max depth of each tree
    :param pool_size: number of worker processes
    :param subsample: row sample rate when building a tree
    :param colsample_bytree: column sample rate when building a tree
    :param min_instances_byleaf: min number of samples in a leaf node
    :param loss: loss type, "logisticloss" or "squareloss"
    :param reg_lambda: lambda, L2 term on leaf weights
    :param gamma: complexity cost per leaf
    :param eval_metric: evaluation metric, provided: "accuracy"
    """
    self.eta = eta
    self.num_round = num_round
    self.max_depth = max_depth
    self.pool_size = pool_size
    self.subsample = subsample
    self.colsample_bytree = colsample_bytree
    self.reg_lambda = reg_lambda
    self.gamma = gamma
    self.min_instances_byleaf = min_instances_byleaf
    self.eval_metric = eval_metric
    self.min_child_weight = min_child_weight
    self.scale_pos_weight = scale_pos_weight
    self.first_round_pred = 0.0

    # initialize the loss function
    if loss == "logisticloss":
        self.loss = LogisticLoss(self.reg_lambda)
    elif loss == "squareloss":
        self.loss = SquareLoss(self.reg_lambda)
        self.first_round_pred = train_data.getLabelMean()
    else:
        raise NotImplementedError(
            "loss should be 'logisticloss' or 'squareloss'")

    # evaluate on the validation set and conduct early stopping
    do_validation = True
    valData = validation_data.getData()
    if not valData:
        raise ValueError("validation_data is empty!")
    valIdxList = []  # save a fixed iteration order
    valLabels = []
    for idx in valData:
        valData[idx]['yPred'] = self.first_round_pred  # same init as train data
        valIdxList.append(idx)
        valLabels.append(valData[idx]['label'])

    best_val_metric = np.inf
    best_round = 0
    become_worse_round = 0

    data = train_data.getData()
    if not data:
        raise ValueError("train_data is empty!")
    idxList = []  # save a fixed iteration order
    labels = []
    for idx in data:
        data[idx]['yPred'] = self.first_round_pred
        data[idx]['grad'] = self.loss.grad(data[idx]['yPred'], data[idx]['label'])
        data[idx]['hess'] = self.loss.hess(data[idx]['yPred'], data[idx]['label'])
        # positives are up-weighted by scale_pos_weight, negatives keep weight 1
        data[idx]['weight'] = self.scale_pos_weight if data[idx]['label'] == 1.0 else 1.0
        idxList.append(idx)
        labels.append(data[idx]['label'])
    labels = np.array(labels)

    for i in range(self.num_round):
        # weighted grad and hess
        for idx in data:
            data[idx]['grad'] = data[idx]['grad'] * data[idx]['weight']
            data[idx]['hess'] = data[idx]['hess'] * data[idx]['weight']

        # row and column sample before training the current tree
        factors = train_data.getFactors()
        factorTypes = train_data.getFeatureTypes()
        sampledFactors = random.sample(
            factors, int(len(factors) * self.colsample_bytree))
        sampledData = {}
        for idx in random.sample(idxList, int(len(idxList) * self.subsample)):
            sampledData[idx] = data[idx]

        # train the current tree
        tree = Tree()
        tree.fit(sampledData, sampledFactors, factorTypes,
                 max_depth=self.max_depth,
                 pool_size=self.pool_size,
                 min_child_weight=self.min_child_weight,
                 min_instances_byleaf=self.min_instances_byleaf,
                 reg_lambda=self.reg_lambda,
                 gamma=self.gamma)

        # predict the sampled rows and update their yPred, grad, hess
        preds = tree.predict(sampledData)
        for idx in sampledData:
            data[idx]['yPred'] += self.eta * preds[idx]
            data[idx]['grad'] = self.loss.grad(data[idx]['yPred'], data[idx]['label'])
            data[idx]['hess'] = self.loss.hess(data[idx]['yPred'], data[idx]['label'])

        # update feature importance
        for k in tree.feature_importance.iterkeys():
            self.feature_importance[k] += tree.feature_importance[k]
        self.trees.append(tree)

        # print training information
        if self.eval_metric is None:
            print "Apollo round {iteration}".format(iteration=i)
        else:
            try:
                metric_func = get_metric(self.eval_metric)
            except Exception:
                raise NotImplementedError("The given eval_metric is not provided")

            curPreds = np.array([data[idx]['yPred'] for idx in idxList])
            train_metric = metric_func(self.loss.transform(curPreds), labels)

            if not do_validation:
                print "Apollo round {iteration}, train-{eval_metric} is {train_metric}".format(
                    iteration=i, eval_metric=self.eval_metric,
                    train_metric=train_metric)
            else:
                valPreds = tree.predict(valData)
                for idx in valData:
                    valData[idx]['yPred'] += self.eta * valPreds[idx]
                curValPreds = [valData[idx]['yPred'] for idx in valIdxList]
                assert len(curValPreds) == len(valLabels)
                val_metric = metric_func(
                    self.loss.transform(np.array(curValPreds)),
                    np.array(valLabels))
                print "Apollo round {iteration}, train-{eval_metric} is {train_metric}, val-{eval_metric} is {val_metric}".format(
                    iteration=i, eval_metric=self.eval_metric,
                    train_metric=train_metric, val_metric=val_metric)

                # check whether to early stop (this fit treats lower as better)
                if val_metric < best_val_metric:
                    best_val_metric = val_metric
                    best_round = i
                    become_worse_round = 0
                else:
                    become_worse_round += 1
                if become_worse_round > early_stopping_rounds:
                    print "Apollo training stops, best round is {best_round}, best val-{eval_metric} is {best_val_metric}".format(
                        best_round=best_round, eval_metric=eval_metric,
                        best_val_metric=best_val_metric)
                    break
def fit(self, X, y, eta=0.01, num_boost_round=1000, max_depth=5,
        rowsample=0.8, colsample_bytree=0.8, colsample_bylevel=0.8,
        min_sample_split=10, loss="logisticloss", l2_regularization=1.0,
        gamma=0.1, num_thread=-1, eval_metric=None):
    """
    :param X: pandas.core.frame.DataFrame
    :param y: pandas.core.series.Series
    :param eta: learning rate
    :param num_boost_round: number of boosting rounds
    :param max_depth: max depth of each tree
    :param rowsample: row sample rate when building a tree
    :param colsample_bytree: column sample rate when building a tree
    :param colsample_bylevel: column sample rate when splitting each tree node,
           so each node sees total_features*colsample_bytree*colsample_bylevel features
    :param min_sample_split: min number of samples in a leaf node
    :param loss: "logisticloss", "squareloss", or a custom loss object
    :param l2_regularization: lambda, L2 term on leaf weights
    :param gamma: complexity cost per leaf
    :param num_thread: number of threads for parallel training
    :param eval_metric: evaluation metric, provided: "accuracy"
    """
    self.eta = eta
    self.num_boost_round = num_boost_round
    self.max_depth = max_depth
    self.rowsample = rowsample
    self.colsample_bytree = colsample_bytree
    self.colsample_bylevel = colsample_bylevel
    self.l2_regularization = l2_regularization
    self.gamma = gamma
    self.min_sample_split = min_sample_split
    self.num_thread = num_thread
    self.eval_metric = eval_metric

    if loss == "logisticloss":
        self.loss = LogisticLoss(l2_regularization)
    elif loss == "squareloss":
        self.loss = SquareLoss(l2_regularization)
    else:
        try:
            self.loss = CustomizeLoss(loss, l2_regularization)
        except Exception:
            raise NotImplementedError(
                "loss should be 'logisticloss', 'squareloss', or a custom loss function")

    self.first_round_pred = y.mean()

    # Y stores label, y_pred, grad, hess
    Y = pd.DataFrame(y.values, columns=['label'])
    Y['y_pred'] = self.first_round_pred
    Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
    Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

    for i in range(self.num_boost_round):
        # sample rows and features to train the current tree
        data = X.sample(frac=self.colsample_bytree, axis=1)
        data = pd.concat([data, Y], axis=1)
        data = data.sample(frac=self.rowsample, axis=0)
        Y_selected = data[['label', 'y_pred', 'grad', 'hess']]
        X_selected = data.drop(['label', 'y_pred', 'grad', 'hess'], axis=1)

        # train the current tree
        tree = Tree()
        tree.fit(X_selected, Y_selected,
                 max_depth=self.max_depth,
                 colsample_bylevel=self.colsample_bylevel,
                 min_sample_split=self.min_sample_split,
                 l2_regularization=self.l2_regularization,
                 gamma=self.gamma,
                 num_thread=self.num_thread)

        # predict the whole dataset and update y_pred, grad, hess
        preds = tree.predict(X)
        Y['y_pred'] += self.eta * preds
        Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
        Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

        if self.eval_metric is not None:
            try:
                metric_func = get_metric(self.eval_metric)
            except Exception:
                raise NotImplementedError("The given eval_metric is not provided")
            metric_value = metric_func(self.loss.transform(Y.y_pred.values),
                                       Y.label.values)
            print "TGBoost round {iteration}, {eval_metric} is {metric_value}".format(
                iteration=i, eval_metric=self.eval_metric,
                metric_value=metric_value)
        else:
            print "TGBoost round {iteration}".format(iteration=i)

        # update feature importance
        for k in tree.feature_importance.iterkeys():
            self.feature_importance[k] += tree.feature_importance[k]
        self.trees.append(tree)
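# A hedged usage sketch for the DataFrame-based fit() above. The class name
# (TGBoost) and the toy data are assumptions for illustration only:

def _demo_fit():
    rng = np.random.RandomState(0)
    X_demo = pd.DataFrame(rng.randn(200, 5),
                          columns=['f%d' % j for j in range(5)])
    y_demo = (X_demo['f0'] + X_demo['f1'] > 0).astype(np.float64)

    model = TGBoost()  # assumed wrapper class exposing this fit()
    model.fit(X_demo, y_demo, loss="logisticloss", eta=0.1,
              num_boost_round=50, max_depth=3, eval_metric="accuracy")
    return model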
def fit(self, features, label, validation_data=(None, None),
        early_stopping_rounds=np.inf, maximize=True, eval_metric=None,
        loss="logisticloss", eta=0.3, num_boost_round=1000, max_depth=6,
        scale_pos_weight=1, subsample=0.8, colsample=0.8, min_child_weight=1,
        min_sample_split=10, reg_lambda=1.0, gamma=0, num_thread=-1):
    """
    :param features: np.array
    :param label: np.array
    :param eta: learning rate
    :param num_boost_round: number of boosting rounds
    :param max_depth: max depth of each tree
    :param subsample: row sample rate when building a tree
    :param colsample: column sample rate when building a tree
    :param min_sample_split: min number of samples in a leaf node
    :param loss: "logisticloss", "squareloss", or a custom loss object
    :param reg_lambda: lambda, L2 term on leaf weights
    :param gamma: complexity cost per leaf
    :param num_thread: number of threads for parallel training
    :param eval_metric: evaluation metric, provided: "accuracy"
    """
    self.eta = eta
    self.num_boost_round = num_boost_round
    self.max_depth = max_depth
    self.subsample = subsample
    self.colsample = colsample
    self.reg_lambda = reg_lambda
    self.gamma = gamma
    self.min_sample_split = min_sample_split
    self.num_thread = num_thread
    self.eval_metric = eval_metric
    self.min_child_weight = min_child_weight
    self.scale_pos_weight = scale_pos_weight
    self.first_round_pred = 0

    # initialize the loss function
    if loss == "logisticloss":
        self.loss = LogisticLoss()
    elif loss == "squareloss":
        self.loss = SquareLoss()
        self.first_round_pred = label.mean()
    else:
        try:
            self.loss = CustomizeLoss(loss)
        except Exception:
            raise NotImplementedError(
                "loss should be 'logisticloss', 'squareloss', or a custom loss function")

    # initialize row_sampler, col_sampler, bin_structure, attribute_list, class_list
    row_sampler = RowSampler(features.shape[0], self.subsample)
    col_sampler = ColumnSampler(features.shape[1], self.colsample)
    bin_structure = BinStructure(features)
    attribute_list = AttributeList(features, bin_structure)
    class_list = ClassList(label)
    class_list.initialize_pred(self.first_round_pred)
    class_list.update_grad_hess(self.loss)

    # to evaluate on the validation set and conduct early stopping,
    # unpack (val_features, val_label) and track the best round so far
    do_validation = True
    if not isinstance(validation_data, tuple):
        raise TypeError("validation_data should be (val_features, val_label)")
    val_features, val_label = validation_data
    val_pred = None
    if val_features is None or val_label is None:
        do_validation = False
    else:
        val_pred = np.ones(val_label.shape) * self.first_round_pred

    best_val_metric = -np.inf if maximize else np.inf
    best_round = 0
    become_worse_round = 0

    # start learning
    logging.info("TGBoost start training")
    for i in range(self.num_boost_round):
        t0 = time()

        # train the current tree
        tree = Tree(self.min_sample_split, self.min_child_weight,
                    self.max_depth, self.colsample, self.subsample,
                    self.reg_lambda, self.gamma, self.num_thread)
        tree.fit(attribute_list, class_list, row_sampler, col_sampler,
                 bin_structure)

        # after building this tree, update class_list.pred, grad, hess
        class_list.update_pred(self.eta)
        class_list.update_grad_hess(self.loss)

        # save this tree
        self.trees.append(tree)
        t1 = time()

        # log training information
        if self.eval_metric is None:
            logging.info("TGBoost round {iteration}".format(iteration=i))
        else:
            try:
                metric_func = get_metric(self.eval_metric)
            except Exception:
                raise NotImplementedError("The given eval_metric is not provided")

            train_metric = metric_func(self.loss.transform(class_list.pred), label)

            if not do_validation:
                logging.info(
                    "TGBoost round {iteration}, train-{eval_metric}: {train_metric:.4f}, exec time {tc:.3f}s".format(
                        iteration=i, eval_metric=self.eval_metric,
                        train_metric=train_metric, tc=t1 - t0))
            else:
                val_pred += self.eta * tree.predict(val_features)
                val_metric = metric_func(self.loss.transform(val_pred), val_label)
                logging.info(
                    "TGBoost round {iteration}, train-{eval_metric}: {train_metric:.4f}, val-{eval_metric}: {val_metric:.4f}, exec time {tc:.3f}s".format(
                        iteration=i, eval_metric=self.eval_metric,
                        train_metric=train_metric, val_metric=val_metric,
                        tc=t1 - t0))

                # check whether to early stop; "improved" respects maximize
                improved = (val_metric > best_val_metric) if maximize \
                    else (val_metric < best_val_metric)
                if improved:
                    best_val_metric = val_metric
                    best_round = i
                    become_worse_round = 0
                else:
                    become_worse_round += 1
                if become_worse_round > early_stopping_rounds:
                    logging.info(
                        "TGBoost training stops, best round is {best_round}, best val-{eval_metric} is {best_val_metric:.4f}".format(
                            best_round=best_round, eval_metric=eval_metric,
                            best_val_metric=best_val_metric))
                    break
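# The fit() above delegates row/column sampling to RowSampler/ColumnSampler
# objects built once and re-drawn each boosting round. A rough sketch under
# assumed semantics (draw a fraction of indices without replacement); the
# "Sketch" classes are illustrative, not the repo's implementations:

class RowSamplerSketch(object):
    def __init__(self, n, frac):
        self.n = n          # number of rows in the training set
        self.frac = frac    # subsample rate per boosting round

    def sample(self):
        k = int(self.n * self.frac)
        return np.random.choice(self.n, size=k, replace=False)


class ColumnSamplerSketch(RowSamplerSketch):
    """Same draw, over column indices instead of row indices."""
    pass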
def fit(self, X, y, validation_data=(None, None),
        early_stopping_rounds=np.inf, maximize=True, eval_metric=None,
        loss="logisticloss", eta=0.3, num_boost_round=1000, max_depth=6,
        scale_pos_weight=1, subsample=0.8, colsample_bytree=0.8,
        colsample_bylevel=0.8, min_child_weight=1, min_sample_split=10,
        reg_lambda=1.0, gamma=0, num_thread=-1):
    """
    :param X: pandas.core.frame.DataFrame
    :param y: pandas.core.series.Series
    :param eta: learning rate
    :param num_boost_round: number of boosting rounds
    :param max_depth: max depth of each tree
    :param subsample: row sample rate when building a tree
    :param colsample_bytree: column sample rate when building a tree
    :param colsample_bylevel: column sample rate when splitting each tree node,
           so each node sees total_features*colsample_bytree*colsample_bylevel features
    :param min_sample_split: min number of samples in a leaf node
    :param loss: "logisticloss", "squareloss", or a custom loss object
    :param reg_lambda: lambda, L2 term on leaf weights
    :param gamma: complexity cost per leaf
    :param num_thread: number of threads for parallel training
    :param eval_metric: evaluation metric, provided: "accuracy"
    """
    self.eta = eta
    self.num_boost_round = num_boost_round
    self.max_depth = max_depth
    self.subsample = subsample
    self.colsample_bytree = colsample_bytree
    self.colsample_bylevel = colsample_bylevel
    self.reg_lambda = reg_lambda
    self.gamma = gamma
    self.min_sample_split = min_sample_split
    self.num_thread = num_thread
    self.eval_metric = eval_metric
    self.min_child_weight = min_child_weight
    self.scale_pos_weight = scale_pos_weight
    self.first_round_pred = 0.0

    X.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)

    # initialize the loss function
    if loss == "logisticloss":
        self.loss = LogisticLoss(reg_lambda)
    elif loss == "squareloss":
        self.loss = SquareLoss(reg_lambda)
        self.first_round_pred = y.mean()
    else:
        try:
            self.loss = CustomizeLoss(loss, reg_lambda)
        except Exception:
            raise NotImplementedError(
                "loss should be 'logisticloss', 'squareloss', or a custom loss function")

    # to evaluate on the validation set and conduct early stopping,
    # unpack (val_X, val_y) and track the best round so far
    do_validation = True
    if not isinstance(validation_data, tuple):
        raise TypeError("validation_data should be (val_X, val_y)")
    val_X, val_y = validation_data
    if val_X is None or val_y is None:
        do_validation = False
    else:
        # type check
        if not isinstance(val_X, pd.core.frame.DataFrame):
            raise TypeError("val_X should be 'pd.core.frame.DataFrame'")
        if not isinstance(val_y, pd.core.series.Series):
            raise TypeError("val_y should be 'pd.core.series.Series'")
        val_X.reset_index(drop=True, inplace=True)
        val_y.reset_index(drop=True, inplace=True)
        val_Y = pd.DataFrame(val_y.values, columns=['label'])
        val_Y['y_pred'] = self.first_round_pred

    best_val_metric = -np.inf if maximize else np.inf
    best_round = 0
    become_worse_round = 0

    # Y stores: label, y_pred, grad, hess, sample_weight
    Y = pd.DataFrame(y.values, columns=['label'])
    Y['y_pred'] = self.first_round_pred
    Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
    Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)
    Y['sample_weight'] = 1.0
    Y.loc[Y.label == 1, 'sample_weight'] = self.scale_pos_weight

    for i in range(self.num_boost_round):
        # weighted grad and hess
        Y.grad = Y.grad * Y.sample_weight
        Y.hess = Y.hess * Y.sample_weight

        # row and column sample before training the current tree
        data = X.sample(frac=self.colsample_bytree, axis=1)
        data = pd.concat([data, Y], axis=1)
        data = data.sample(frac=self.subsample, axis=0)
        Y_selected = data[['label', 'y_pred', 'grad', 'hess']]
        X_selected = data.drop(
            ['label', 'y_pred', 'grad', 'hess', 'sample_weight'], axis=1)

        # train the current tree
        tree = Tree()
        tree.fit(X_selected, Y_selected,
                 max_depth=self.max_depth,
                 min_child_weight=self.min_child_weight,
                 colsample_bylevel=self.colsample_bylevel,
                 min_sample_split=self.min_sample_split,
                 reg_lambda=self.reg_lambda,
                 gamma=self.gamma,
                 num_thread=self.num_thread)

        # predict the whole train set and update y_pred, grad, hess
        preds = tree.predict(X)
        Y['y_pred'] += self.eta * preds
        Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
        Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

        # update feature importance
        for k in tree.feature_importance.iterkeys():
            self.feature_importance[k] += tree.feature_importance[k]
        self.trees.append(tree)

        # print training information
        if self.eval_metric is None:
            print "TGBoost round {iteration}".format(iteration=i)
        else:
            try:
                metric_func = get_metric(self.eval_metric)
            except Exception:
                raise NotImplementedError("The given eval_metric is not provided")

            train_metric = metric_func(self.loss.transform(Y.y_pred.values),
                                       Y.label.values)

            if not do_validation:
                print "TGBoost round {iteration}, train-{eval_metric} is {train_metric}".format(
                    iteration=i, eval_metric=self.eval_metric,
                    train_metric=train_metric)
            else:
                val_Y['y_pred'] += self.eta * tree.predict(val_X)
                val_metric = metric_func(self.loss.transform(val_Y.y_pred.values),
                                         val_Y.label.values)
                print "TGBoost round {iteration}, train-{eval_metric} is {train_metric}, val-{eval_metric} is {val_metric}".format(
                    iteration=i, eval_metric=self.eval_metric,
                    train_metric=train_metric, val_metric=val_metric)

                # check whether to early stop; "improved" respects maximize
                improved = (val_metric > best_val_metric) if maximize \
                    else (val_metric < best_val_metric)
                if improved:
                    best_val_metric = val_metric
                    best_round = i
                    become_worse_round = 0
                else:
                    become_worse_round += 1
                if become_worse_round > early_stopping_rounds:
                    print "TGBoost training stops, best round is {best_round}, best val-{eval_metric} is {best_val_metric}".format(
                        best_round=best_round, eval_metric=eval_metric,
                        best_val_metric=best_val_metric)
                    break
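# A hedged usage sketch of the validation/early-stopping path above. The
# class name (TGBoost), the split, and the toy data are assumptions; the
# copies avoid mutating slices, since fit() resets indices in place:

def _demo_early_stopping():
    rng = np.random.RandomState(7)
    X_all = pd.DataFrame(rng.randn(500, 4), columns=list('abcd'))
    y_all = (X_all['a'] - X_all['b'] > 0).astype(np.float64)
    X_tr, y_tr = X_all.iloc[:400].copy(), y_all.iloc[:400].copy()
    X_va, y_va = X_all.iloc[400:].copy(), y_all.iloc[400:].copy()

    model = TGBoost()  # assumed wrapper class exposing this fit()
    model.fit(X_tr, y_tr, validation_data=(X_va, y_va),
              early_stopping_rounds=10, maximize=True,
              eval_metric="accuracy", eta=0.3, num_boost_round=200)
    return model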