Example #1
    def __init__(self,
                 loss='square',
                 learning_rate=0.3,
                 n_estimators=20,
                 max_depth=6,
                 subsample=0.8,
                 colsample_bytree=0.8,
                 colsample_bylevel=0.8,
                 min_child_weight=1,
                 reg_lambda=1.0,
                 gamma=0,
                 num_thread=-1):

        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.colsample_bylevel = colsample_bylevel
        self.reg_lambda = reg_lambda
        self.gamma = gamma
        self.num_thread = num_thread
        self.min_child_weight = min_child_weight
        self.first_round_pred = 0.0
        self.trees = []
        self.eval_metric = None

        self._is_classifier = False

        if loss == 'logistic':
            self.loss = LogisticLoss()
        elif loss == 'square':
            self.loss = SquareLoss()
        elif callable(loss):
            self.loss = CustomLoss(loss)
        else:
            raise ValueError('unsupported loss function: {0}'.format(loss))
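The constructor only dispatches among LogisticLoss, SquareLoss, and CustomLoss; their definitions are not included in the snippet. Below is a minimal sketch of what the square and logistic losses might look like, assuming the XGBoost-style grad/hess/transform interface that the fit methods in the later examples call; the class and method names come from the snippets, everything else is an assumption (some examples also pass reg_lambda to the loss constructor, which is omitted here):

import numpy as np

class SquareLoss:
    # hypothetical sketch: L(y, pred) = 0.5 * (pred - y)^2
    def transform(self, preds):
        return preds                        # raw scores are already predictions

    def grad(self, preds, labels):
        return preds - labels               # dL/dpred

    def hess(self, preds, labels):
        return np.ones_like(preds)          # d2L/dpred2 is constant 1

class LogisticLoss:
    # hypothetical sketch: log loss on raw scores, XGBoost-style
    def transform(self, preds):
        return 1.0 / (1.0 + np.exp(-preds))     # sigmoid: score -> probability

    def grad(self, preds, labels):
        return self.transform(preds) - labels   # p - y

    def hess(self, preds, labels):
        p = self.transform(preds)
        return p * (1.0 - p)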
Example #2
File: apollo.py Project: wai7niu8/Apollo
    def fit(self,
            train_data,
            validation_data,
            early_stopping_rounds=np.inf,
            eval_metric=None,
            loss="logisticloss",
            eta=0.3,
            num_round=1000,
            max_depth=6,
            pool_size=1,
            min_instances_byleaf=1,
            scale_pos_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            min_child_weight=1,
            reg_lambda=1.0,
            gamma=0):
        """
        :param train_data: Data object, train data
        :param validation_data: Data object, validation data
        :param eta: learning rate
        :param num_round: number of boosting round
        :param max_depth: max depth of each tree
        :param pool_size: the num of processes
        :param subsample: row sample rate when building a tree
        :param colsample_bytree: column sample rate when building a tree
        :param min_instances_byleaf: min number of samples in a leaf node
        :param loss: loss object
                     logisticloss,squareloss
        :param reg_lambda: lambda
        :param gamma: gamma
        :param eval_metric: evaluation metric, provided: "accuracy"
        """
        self.eta = eta
        self.num_round = num_round
        self.max_depth = max_depth
        self.pool_size = pool_size
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.reg_lambda = reg_lambda
        self.gamma = gamma
        self.min_instances_byleaf = min_instances_byleaf
        self.eval_metric = eval_metric
        self.min_child_weight = min_child_weight
        self.scale_pos_weight = scale_pos_weight
        self.first_round_pred = 0.0

        # initialize the loss function
        if loss == "logisticloss":
            self.loss = LogisticLoss(self.reg_lambda)
        elif loss == "squareloss":
            self.loss = SquareLoss(self.reg_lambda)
            self.first_round_pred = train_data.getLabelMean()
        else:
            raise NotImplementedError(
                "loss should be 'logisticloss' or 'squareloss'")

        # to evaluate on validation set and conduct early stopping
        do_validation = True
        valData = validation_data.getData()
        if not valData:
            raise ValueError("validation_data is empty!")

        valIdxList = []  # save a fixed order
        valLabels = []
        for idx in valData:
            valData[idx]['yPred'] = self.first_round_pred  # initialize with the first-round prediction
            valIdxList.append(idx)
            valLabels.append(valData[idx]['label'])

        best_val_metric = np.inf
        best_round = 0
        become_worse_round = 0

        data = train_data.getData()
        if not data:
            raise ValueError("train_data is empty!")
        idxList = []  # save a fixed order
        labels = []
        for idx in data:
            data[idx]['yPred'] = self.first_round_pred
            data[idx]['grad'] = self.loss.grad(data[idx]['yPred'],
                                               data[idx]['label'])
            data[idx]['hess'] = self.loss.hess(data[idx]['yPred'],
                                               data[idx]['label'])
            # positive samples are up-weighted, the rest keep weight 1.0
            data[idx]['weight'] = (self.scale_pos_weight
                                   if data[idx]['label'] == 1.0 else 1.0)
            idxList.append(idx)
            labels.append(data[idx]['label'])
        labels = np.array(labels)
        for i in range(self.num_round):
            # weighted grad and hess
            for idx in data:
                data[idx]['grad'] = data[idx]['grad'] * data[idx]['weight']
                data[idx]['hess'] = data[idx]['hess'] * data[idx]['weight']

            # row and column sample before training the current tree
            factors = train_data.getFactors()
            factorTypes = train_data.getFeatureTypes()
            sampledFactors = random.sample(
                factors, int(len(factors) * self.colsample_bytree))
            sampledData = {}
            for idx in random.sample(idxList,
                                     int(len(idxList) * self.subsample)):
                sampledData.update({idx: data[idx]})

            # train current tree
            tree = Tree()
            tree.fit(sampledData,
                     sampledFactors,
                     factorTypes,
                     max_depth=self.max_depth,
                     pool_size=self.pool_size,
                     min_child_weight=self.min_child_weight,
                     min_instances_byleaf=self.min_instances_byleaf,
                     reg_lambda=self.reg_lambda,
                     gamma=self.gamma)

            # predict the whole train set and update yPred, grad, hess
            # (predicting all rows, not just the sampled ones, keeps
            # grad/hess fresh for the next round)
            preds = tree.predict(data)
            for idx in data:
                data[idx]['yPred'] += self.eta * preds[idx]
                data[idx]['grad'] = self.loss.grad(data[idx]["yPred"],
                                                   data[idx]["label"])
                data[idx]['hess'] = self.loss.hess(data[idx]["yPred"],
                                                   data[idx]["label"])

            # update feature importance
            for k, v in tree.feature_importance.items():
                self.feature_importance[k] += v

            self.trees.append(tree)

            # print training information
            if self.eval_metric is None:
                print("Apollo round {iteration}".format(iteration=i))
            else:
                try:
                    metric_func = get_metric(self.eval_metric)
                except Exception:
                    raise NotImplementedError(
                        "The given eval_metric is not provided")

                curPreds = np.array([data[idx]["yPred"] for idx in idxList])
                train_metric = metric_func(self.loss.transform(curPreds),
                                           labels)

                if not do_validation:
                    print("Apollo round {iteration}, train-{eval_metric} is {train_metric}".format(
                        iteration=i,
                        eval_metric=self.eval_metric,
                        train_metric=train_metric))
                else:
                    valPreds = tree.predict(valData)
                    for idx in valData:
                        valData[idx]['yPred'] += self.eta * valPreds[idx]
                    curValPreds = [valData[idx]['yPred'] for idx in valIdxList]
                    assert len(curValPreds) == len(valLabels)
                    val_metric = metric_func(
                        self.loss.transform(np.array(curValPreds)),
                        np.array(valLabels))
                    print("Apollo round {iteration}, train-{eval_metric} is {train_metric}, val-{eval_metric} is {val_metric}".format(
                        iteration=i,
                        eval_metric=self.eval_metric,
                        train_metric=train_metric,
                        val_metric=val_metric))

                    # check whether to stop early
                    if val_metric < best_val_metric:
                        best_val_metric = val_metric
                        best_round = i
                        become_worse_round = 0
                    else:
                        become_worse_round += 1
                    if become_worse_round > early_stopping_rounds:
                        print("Apollo training stopped, best round is {best_round}, best val-{eval_metric} is {best_val_metric}".format(
                            best_round=best_round,
                            eval_metric=eval_metric,
                            best_val_metric=best_val_metric))
                        break
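The Data interface (getData, getLabelMean, getFactors, getFeatureTypes) and the class that owns this fit method belong to the wai7niu8/Apollo project and are not shown. A hedged usage sketch, assuming the class is instantiated as Apollo() and that Data can be constructed from a labeled file (both names are assumptions, not verified against the project):

# Hypothetical usage of the fit method above; Apollo and Data are
# assumed names, everything else mirrors the signature in the snippet.
train = Data("train.csv")   # assumption: Data loads a labeled dataset
valid = Data("valid.csv")

model = Apollo()
model.fit(train,
          valid,
          loss="logisticloss",
          eta=0.3,
          num_round=200,
          max_depth=6,
          subsample=0.8,
          colsample_bytree=0.8,
          eval_metric="accuracy",
          early_stopping_rounds=20)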
Example #3
    def fit(self,
            X,
            y,
            eta=0.01,
            num_boost_round=1000,
            max_depth=5,
            rowsample=0.8,
            colsample_bytree=0.8,
            colsample_bylevel=0.8,
            min_sample_split=10,
            loss="logisticloss",
            l2_regularization=1.0,
            gamma=0.1,
            num_thread=-1,
            eval_metric=None):
        """
        :param X: pandas.core.frame.DataFrame
        :param y: pandas.core.series.Series
        :param eta: learning rate
        :param num_boost_round: number of boosting rounds
        :param max_depth: max depth of each tree
        :param rowsample: row sample rate when building a tree
        :param colsample_bytree: column sample rate when building a tree
        :param colsample_bylevel: column sample rate when splitting each tree node;
                                  the number of features used = total_features * colsample_bytree * colsample_bylevel
        :param min_sample_split: min number of samples in a leaf node
        :param loss: loss type, "logisticloss", "squareloss", or a custom loss function
        :param l2_regularization: L2 regularization term lambda
        :param gamma: minimum loss reduction required to make a split
        :param num_thread: number of threads for parallelism
        :param eval_metric: evaluation metric, provided: "accuracy"
        """
        self.eta = eta
        self.num_boost_round = num_boost_round
        self.max_depth = max_depth
        self.rowsample = rowsample
        self.colsample_bytree = colsample_bytree
        self.colsample_bylevel = colsample_bylevel
        self.l2_regularization = l2_regularization
        self.gamma = gamma
        self.min_sample_split = min_sample_split
        self.num_thread = num_thread
        self.eval_metric = eval_metric

        if loss == "logisticloss":
            self.loss = LogisticLoss(l2_regularization)
        elif loss == "squareloss":
            self.loss = SquareLoss(l2_regularization)
        else:
            try:
                self.loss = CustomizeLoss(loss, l2_regularization)
            except Exception:
                raise NotImplementedError(
                    "loss should be 'logisticloss', 'squareloss', or a custom loss function"
                )

        self.first_round_pred = y.mean()

        # Y stores label, y_pred, grad, hess
        Y = pd.DataFrame(y.values,
                         columns=['label'])  # only one column "label"
        Y['y_pred'] = self.first_round_pred
        Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
        Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

        for i in range(self.num_boost_round):
            # sample rows and columns to train the current tree
            data = X.sample(frac=self.colsample_bytree, axis=1)
            data = pd.concat([data, Y], axis=1)
            data = data.sample(frac=self.rowsample, axis=0)
            Y_selected = data[['label', 'y_pred', 'grad', 'hess']]
            X_selected = data.drop(['label', 'y_pred', 'grad', 'hess'], axis=1)

            # train current tree
            tree = Tree()
            tree.fit(X_selected,
                     Y_selected,
                     max_depth=self.max_depth,
                     colsample_bylevel=self.colsample_bylevel,
                     min_sample_split=self.min_sample_split,
                     l2_regularization=self.l2_regularization,
                     gamma=self.gamma,
                     num_thread=self.num_thread)

            # predict the whole dataset and update y_pred,grad,hess
            preds = tree.predict(X)
            Y['y_pred'] += self.eta * preds
            Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
            Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

            if self.eval_metric is not None:
                try:
                    metric_func = get_metric(self.eval_metric)
                except Exception:
                    raise NotImplementedError(
                        "The given eval_metric is not provided")
                metric_value = metric_func(
                    self.loss.transform(Y.y_pred.values), Y.label.values)
                print("TGBoost round {iteration}, {eval_metric} is {metric_value}".format(
                    iteration=i,
                    eval_metric=self.eval_metric,
                    metric_value=metric_value))
            else:
                print("TGBoost round {iteration}".format(iteration=i))

            # update feature importance
            for k, v in tree.feature_importance.items():
                self.feature_importance[k] += v

            self.trees.append(tree)
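This variant takes a plain pandas DataFrame/Series pair, so it is easy to exercise end to end. A minimal usage sketch on synthetic data, assuming the enclosing class is exported as TGBoost (the snippet does not show the class name, so treat it as a placeholder):

import numpy as np
import pandas as pd

# build a small synthetic binary-classification frame
rng = np.random.RandomState(0)
X = pd.DataFrame(rng.randn(500, 5), columns=[f"f{j}" for j in range(5)])
y = pd.Series((X["f0"] + X["f1"] > 0).astype(float))

model = TGBoost()           # assumption: the class containing fit above
model.fit(X, y,
          loss="logisticloss",
          eta=0.1,
          num_boost_round=50,
          max_depth=4,
          eval_metric="accuracy")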
Example #4
    def fit(self,
            features,
            label,
            validation_data=(None, None),
            early_stopping_rounds=np.inf,
            maximize=True,
            eval_metric=None,
            loss="logisticloss",
            eta=0.3,
            num_boost_round=1000,
            max_depth=6,
            scale_pos_weight=1,
            subsample=0.8,
            colsample=0.8,
            min_child_weight=1,
            min_sample_split=10,
            reg_lambda=1.0,
            gamma=0,
            num_thread=-1):
        """
        :param features: np.array
        :param label: np.array
        :param eta: learning rate
        :param num_boost_round: number of boosting rounds
        :param max_depth: max depth of each tree
        :param subsample: row sample rate when building a tree
        :param colsample: column sample rate when building a tree
        :param min_sample_split: min number of samples in a leaf node
        :param loss: loss type, "logisticloss", "squareloss", or a custom loss function
        :param reg_lambda: L2 regularization term lambda
        :param gamma: minimum loss reduction required to make a split
        :param num_thread: number of threads for parallelism
        :param eval_metric: evaluation metric, provided: "accuracy"
        """
        self.eta = eta
        self.num_boost_round = num_boost_round
        self.max_depth = max_depth
        self.subsample = subsample
        self.colsample = colsample
        self.reg_lambda = reg_lambda
        self.gamma = gamma
        self.min_sample_split = min_sample_split
        self.num_thread = num_thread
        self.eval_metric = eval_metric
        self.min_child_weight = min_child_weight
        self.scale_pos_weight = scale_pos_weight
        self.first_round_pred = 0

        # initialize the loss function
        if loss == "logisticloss":
            self.loss = LogisticLoss()
        elif loss == "squareloss":
            self.loss = SquareLoss()
            self.first_round_pred = label.mean()
        else:
            try:
                self.loss = CustomizeLoss(loss)
            except Exception:
                raise NotImplementedError(
                    "loss should be 'logisticloss', 'squareloss', or a custom loss function"
                )

        # initialize row_sampler, col_sampler, bin_structure, attribute_list, class_list
        row_sampler = RowSampler(features.shape[0], self.subsample)
        col_sampler = ColumnSampler(features.shape[1], self.colsample)
        bin_structure = BinStructure(features)
        attribute_list = AttributeList(features, bin_structure)
        class_list = ClassList(label)
        class_list.initialize_pred(self.first_round_pred)
        class_list.update_grad_hess(self.loss)

        # to evaluate on validation set and conduct early stopping
        # we should get (val_features,val_label)
        # and set some variable to check when to stop
        do_validation = True
        if not isinstance(validation_data, tuple):
            raise TypeError(
                "validation_data should be (val_features, val_label)")

        val_features, val_label = validation_data
        val_pred = None
        if val_features is None or val_label is None:
            do_validation = False
        else:
            val_pred = np.ones(val_label.shape) * self.first_round_pred

        best_val_metric = -np.inf if maximize else np.inf
        best_round = 0
        become_worse_round = 0

        # start learning
        logging.info("TGBoost start training")
        for i in range(self.num_boost_round):
            t0 = time()
            # train current tree
            tree = Tree(self.min_sample_split, self.min_child_weight,
                        self.max_depth, self.colsample, self.subsample,
                        self.reg_lambda, self.gamma, self.num_thread)
            tree.fit(attribute_list, class_list, row_sampler, col_sampler,
                     bin_structure)

            # when finish building this tree, update the class_list.pred, grad, hess
            class_list.update_pred(self.eta)
            class_list.update_grad_hess(self.loss)
            # save this tree
            self.trees.append(tree)

            t1 = time()

            # print training information
            if self.eval_metric is None:
                logging.info("TGBoost round {iteration}".format(iteration=i))
            else:
                try:
                    metric_func = get_metric(self.eval_metric)
                except Exception:
                    raise NotImplementedError(
                        "The given eval_metric is not provided")

                train_metric = metric_func(
                    self.loss.transform(class_list.pred), label)

                if not do_validation:
                    logging.info(
                        "TGBoost round {iteration}, train-{eval_metric}: {train_metric:.4f}, exec time {tc:.3f}s"
                        .format(iteration=i,
                                eval_metric=self.eval_metric,
                                train_metric=train_metric,
                                tc=t1 - t0))
                else:
                    val_pred += self.eta * tree.predict(val_features)
                    val_metric = metric_func(self.loss.transform(val_pred),
                                             val_label)
                    logging.info(
                        "TGBoost round {iteration}, train-{eval_metric}: {train_metric:.4f}, val-{eval_metric}: {val_metric:.4f}, exec time {tc:.3f}s"
                        .format(iteration=i,
                                eval_metric=self.eval_metric,
                                train_metric=train_metric,
                                val_metric=val_metric,
                                tc=t1 - t0))

                    # check whether to stop early
                    improved = (val_metric > best_val_metric
                                if maximize else val_metric < best_val_metric)
                    if improved:
                        best_val_metric = val_metric
                        best_round = i
                        become_worse_round = 0
                    else:
                        become_worse_round += 1
                    if become_worse_round > early_stopping_rounds:
                        logging.info(
                            "TGBoost training stopped, best round is {best_round}, best val-{eval_metric} is {best_val_metric:.4f}"
                            .format(best_round=best_round,
                                    eval_metric=eval_metric,
                                    best_val_metric=best_val_metric))
                        break
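Example #4 swaps pandas sampling for dedicated RowSampler / ColumnSampler objects plus a binned AttributeList, none of which are shown. A hypothetical sketch of what the two samplers might look like, assuming each exposes a shuffle() that redraws the row mask or column subset the Tree consumes each boosting round (the constructor signatures match the call sites above; the mask/column attributes are assumptions):

import numpy as np

class RowSampler:
    # hypothetical sketch: Bernoulli row subsampling per boosting round
    def __init__(self, n, sampling_rate, seed=0):
        self.n = n
        self.rate = sampling_rate
        self.rng = np.random.RandomState(seed)
        self.shuffle()

    def shuffle(self):
        # mask[i] == True keeps row i for the current tree
        self.mask = self.rng.binomial(1, self.rate, self.n).astype(bool)

class ColumnSampler:
    # hypothetical sketch: draw a random subset of feature indices
    def __init__(self, m, sampling_rate, seed=0):
        self.m = m
        self.k = max(1, int(m * sampling_rate))
        self.rng = np.random.RandomState(seed)
        self.shuffle()

    def shuffle(self):
        self.cols = self.rng.choice(self.m, self.k, replace=False)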
Example #5
    def fit(self,
            X,
            y,
            validation_data=(None, None),
            early_stopping_rounds=np.inf,
            maximize=True,
            eval_metric=None,
            loss="logisticloss",
            eta=0.3,
            num_boost_round=1000,
            max_depth=6,
            scale_pos_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            colsample_bylevel=0.8,
            min_child_weight=1,
            min_sample_split=10,
            reg_lambda=1.0,
            gamma=0,
            num_thread=-1):
        """
        :param X: pandas.core.frame.DataFrame
        :param y: pandas.core.series.Series
        :param eta: learning rate
        :param num_boost_round: number of boosting rounds
        :param max_depth: max depth of each tree
        :param subsample: row sample rate when building a tree
        :param colsample_bytree: column sample rate when building a tree
        :param colsample_bylevel: column sample rate when splitting each tree node;
                                  the number of features used = total_features * colsample_bytree * colsample_bylevel
        :param min_sample_split: min number of samples in a leaf node
        :param loss: loss type, "logisticloss", "squareloss", or a custom loss function
        :param reg_lambda: L2 regularization term lambda
        :param gamma: minimum loss reduction required to make a split
        :param num_thread: number of threads for parallelism
        :param eval_metric: evaluation metric, provided: "accuracy"
        """
        self.eta = eta
        self.num_boost_round = num_boost_round
        self.max_depth = max_depth
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.colsample_bylevel = colsample_bylevel
        self.reg_lambda = reg_lambda
        self.gamma = gamma
        self.min_sample_split = min_sample_split
        self.num_thread = num_thread
        self.eval_metric = eval_metric
        self.min_child_weight = min_child_weight
        self.scale_pos_weight = scale_pos_weight
        self.first_round_pred = 0.0

        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)

        # initialize the loss function
        if loss == "logisticloss":
            self.loss = LogisticLoss(reg_lambda)
        elif loss == "squareloss":
            self.loss = SquareLoss(reg_lambda)
            self.first_round_pred = y.mean()
        else:
            try:
                self.loss = CustomizeLoss(loss, reg_lambda)
            except Exception:
                raise NotImplementedError(
                    "loss should be 'logisticloss', 'squareloss', or a custom loss function"
                )

        # to evaluate on validation set and conduct early stopping
        # we should get (val_X,val_y)
        # and set some variable to check when to stop
        do_validation = True
        if not isinstance(validation_data, tuple):
            raise TypeError("validation_data should be (val_X, val_y)")

        val_X, val_y = validation_data
        if val_X is None or val_y is None:
            do_validation = False
        else:
            # type check
            if not isinstance(val_X, pd.core.frame.DataFrame):
                raise TypeError("val_X should be 'pd.core.frame.DataFrame'")
            if not isinstance(val_y, pd.core.series.Series):
                raise TypeError("val_X should be 'pd.core.series.Series'")
            val_X.reset_index(drop=True, inplace=True)
            val_y.reset_index(drop=True, inplace=True)
            val_Y = pd.DataFrame(val_y.values, columns=['label'])
            val_Y['y_pred'] = self.first_round_pred

        best_val_metric = -np.inf if maximize else np.inf
        best_round = 0
        become_worse_round = 0

        # Y stores: label, y_pred, grad, hess, sample_weight
        Y = pd.DataFrame(y.values, columns=['label'])
        Y['y_pred'] = self.first_round_pred
        Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
        Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)
        Y['sample_weight'] = 1.0
        Y.loc[Y.label == 1, 'sample_weight'] = self.scale_pos_weight

        for i in range(self.num_boost_round):
            # weighted grad and hess
            Y.grad = Y.grad * Y.sample_weight
            Y.hess = Y.hess * Y.sample_weight
            # row and column sample before training the current tree
            data = X.sample(frac=self.colsample_bytree, axis=1)
            data = pd.concat([data, Y], axis=1)
            data = data.sample(frac=self.subsample, axis=0)
            Y_selected = data[['label', 'y_pred', 'grad', 'hess']]
            X_selected = data.drop(
                ['label', 'y_pred', 'grad', 'hess', 'sample_weight'], axis=1)

            # train current tree
            tree = Tree()
            tree.fit(X_selected,
                     Y_selected,
                     max_depth=self.max_depth,
                     min_child_weight=self.min_child_weight,
                     colsample_bylevel=self.colsample_bylevel,
                     min_sample_split=self.min_sample_split,
                     reg_lambda=self.reg_lambda,
                     gamma=self.gamma,
                     num_thread=self.num_thread)

            # predict the whole trainset and update y_pred,grad,hess
            preds = tree.predict(X)
            Y['y_pred'] += self.eta * preds
            Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
            Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

            # update feature importance
            for k, v in tree.feature_importance.items():
                self.feature_importance[k] += v

            self.trees.append(tree)

            # print training information
            if self.eval_metric is None:
                print("TGBoost round {iteration}".format(iteration=i))
            else:
                try:
                    metric_func = get_metric(self.eval_metric)
                except Exception:
                    raise NotImplementedError(
                        "The given eval_metric is not provided")

                train_metric = metric_func(
                    self.loss.transform(Y.y_pred.values), Y.label.values)

                if not do_validation:
                    print("TGBoost round {iteration}, train-{eval_metric} is {train_metric}".format(
                        iteration=i,
                        eval_metric=self.eval_metric,
                        train_metric=train_metric))
                else:
                    val_Y['y_pred'] += self.eta * tree.predict(val_X)
                    val_metric = metric_func(
                        self.loss.transform(val_Y.y_pred.values),
                        val_Y.label.values)
                    print("TGBoost round {iteration}, train-{eval_metric} is {train_metric}, val-{eval_metric} is {val_metric}".format(
                        iteration=i,
                        eval_metric=self.eval_metric,
                        train_metric=train_metric,
                        val_metric=val_metric))

                    # check whether to stop early
                    improved = (val_metric > best_val_metric
                                if maximize else val_metric < best_val_metric)
                    if improved:
                        best_val_metric = val_metric
                        best_round = i
                        become_worse_round = 0
                    else:
                        become_worse_round += 1
                    if become_worse_round > early_stopping_rounds:
                        print("TGBoost training stopped, best round is {best_round}, best val-{eval_metric} is {best_val_metric}".format(
                            best_round=best_round,
                            eval_metric=eval_metric,
                            best_val_metric=best_val_metric))
                        break
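All five examples resolve eval_metric through get_metric, which is not part of the snippets. A plausible sketch, assuming metrics are called as metric_func(transformed_predictions, labels) as the call sites above do, and that "accuracy" thresholds probabilities at 0.5 (both assumptions, not the project's verified implementation):

import numpy as np

def accuracy(preds, labels):
    # preds are post-transform (probabilities under the logistic loss)
    return np.mean((preds > 0.5).astype(float) == labels)

_METRICS = {"accuracy": accuracy}

def get_metric(name):
    # raises KeyError for unknown names, which the callers above
    # convert into NotImplementedError
    return _METRICS[name]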