def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_impurity, max_depth, regression, debug):
        """Record the hyper-parameters and create the unfitted trees.

        n_estimators sets how many RegressionTree objects are created;
        min_samples_split, min_impurity and max_depth are forwarded to
        each of them. When regression is falsy the loss is LogisticLoss,
        otherwise SquareLoss. learning_rate and debug are stored as given.
        """
        # Hyper-parameters are kept on the instance unchanged
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.max_depth = max_depth
        self.regression = regression
        self.debug = debug

        # State that starts out empty
        self.init_estimate = None
        self.multipliers = []

        # Square loss by default, swapped for log loss when classifying
        chosen_loss = SquareLoss()
        if not regression:
            chosen_loss = LogisticLoss()
        self.loss = chosen_loss

        # One identically configured regression tree per boosting round
        tree_options = dict(min_samples_split=min_samples_split,
                            min_impurity=min_impurity,
                            max_depth=max_depth)
        self.trees = [RegressionTree(**tree_options)
                      for _ in range(n_estimators)]
    def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_impurity, max_depth, regression, debug):
        """Store the boosting settings and build one tree per estimator.

        The loss is SquareLoss when regression is truthy and LogisticLoss
        otherwise. Every tree receives min_samples_split, min_impurity
        and max_depth.
        """
        self.regression = regression
        self.debug = debug
        self.learning_rate = learning_rate
        # Number of trees
        self.n_estimators = n_estimators
        # Minimum number of samples to justify a split
        self.min_samples_split = min_samples_split
        # Minimum variance reduction to continue
        self.min_impurity = min_impurity
        # Maximum depth for a tree
        self.max_depth = max_depth
        # The initial prediction of y
        self.init_estimate = None

        # Regression uses the square loss, classification the log loss
        loss_fn = SquareLoss()
        if not regression:
            loss_fn = LogisticLoss()
        self.loss = loss_fn

        # Build the ensemble of (still unfitted) regression trees
        self.trees = [
            RegressionTree(min_samples_split=min_samples_split,
                           min_impurity=min_impurity,
                           max_depth=max_depth)
            for _ in range(n_estimators)
        ]
    def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_impurity, max_depth, regression, debug):
        """Keep the configuration and instantiate the regression trees.

        A SquareLoss is used unless regression is falsy, in which case a
        LogisticLoss replaces it. n_estimators RegressionTree objects are
        created, each given min_samples_split, min_impurity and max_depth.
        """
        # Tree-related settings (also forwarded to every tree below)
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.max_depth = max_depth

        # Boosting-related settings
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.regression = regression
        self.debug = debug
        self.init_estimate = None
        self.multipliers = []

        # Square loss for regression, log loss for classification
        objective = SquareLoss()
        if not regression:
            objective = LogisticLoss()
        self.loss = objective

        # Create the regression trees up front; they are fitted later
        def new_tree():
            return RegressionTree(min_samples_split=min_samples_split,
                                  min_impurity=min_impurity,
                                  max_depth=max_depth)

        self.trees = [new_tree() for _ in range(n_estimators)]
 def __init__(self, learning_rate=.1, momentum=0.3, gradient_descent=True):
     """Store the optimisation settings and create the helper objects.

     learning_rate, momentum and gradient_descent are kept as given;
     param starts as None. A Sigmoid and a LogisticLoss are instantiated.
     """
     # Optimisation settings supplied by the caller
     self.learning_rate = learning_rate
     self.momentum = momentum
     self.gradient_descent = gradient_descent

     # No parameters until they are assigned elsewhere
     self.param = None

     # Helper objects
     self.sigmoid = Sigmoid()
     self.log_loss = LogisticLoss()
# Example #5
0
    def __init__(self,
                 n_estimators=200,
                 learning_rate=0.001,
                 min_samples_split=2,
                 min_impurity=1e-7,
                 max_depth=2,
                 debug=False):
        """Store the settings and build the XGBoost regression trees.

        A single LogisticLoss(grad_wrt_theta=False) is created and shared
        by all n_estimators XGBoostRegressionTree objects, each of which
        also receives min_samples_split, min_impurity and max_depth.
        """
        self.debug = debug
        self.learning_rate = learning_rate
        # Number of trees
        self.n_estimators = n_estimators
        # Minimum number of samples to justify a split
        self.min_samples_split = min_samples_split
        # Minimum variance reduction to continue
        self.min_impurity = min_impurity
        # Maximum depth for a tree
        self.max_depth = max_depth

        # Log loss for classification
        shared_loss = LogisticLoss(grad_wrt_theta=False)
        self.loss = shared_loss

        # Every tree is configured identically and shares the loss object
        self.trees = [
            XGBoostRegressionTree(min_samples_split=min_samples_split,
                                  min_impurity=min_impurity,
                                  max_depth=max_depth,
                                  loss=shared_loss)
            for _ in range(n_estimators)
        ]
class GradientBoosting(object):
    """Gradient boosting over a collection of regression trees.

    Every tree is fitted to the gradient of the loss with respect to the
    current prediction; the prediction is then moved against the tree's
    output, scaled by ``learning_rate``.

    Parameters:
    -----------
    n_estimators: int
        The number of regression trees that are created.
    learning_rate: float
        Factor applied to every tree's output before it is subtracted
        from the running prediction.
    min_samples_split: int
        Forwarded to every RegressionTree.
    min_impurity: float
        Forwarded to every RegressionTree.
    max_depth: int
        Forwarded to every RegressionTree.
    regression: boolean
        Truthy selects SquareLoss and raw outputs from predict(); falsy
        selects LogisticLoss and class indices from predict().
    debug: boolean
        When truthy, fit() prints its progress after every tree.
    """
    def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_impurity, max_depth, regression, debug):
        self.n_estimators = n_estimators            # Number of trees
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split  # The minimum n of samples to justify split
        self.min_impurity = min_impurity            # Minimum variance reduction to continue
        self.max_depth = max_depth                  # Maximum depth for tree
        self.init_estimate = None                   # Starting prediction; set by fit()
        self.regression = regression
        self.debug = debug
        self.multipliers = []

        # Square loss for regression
        # Log loss for classification
        self.loss = SquareLoss()
        if not self.regression:
            self.loss = LogisticLoss()

        # Initialize regression trees
        self.trees = [
            RegressionTree(min_samples_split=self.min_samples_split,
                           min_impurity=min_impurity,
                           max_depth=self.max_depth)
            for _ in range(n_estimators)
        ]

    def fit(self, X, y):
        """Fit the trees one after another on the loss gradient.

        The running prediction starts at the mean of ``y`` along axis 0.
        That starting value is stored in ``self.init_estimate`` so that
        predict() can begin from the same baseline.
        """
        self.init_estimate = np.mean(y, axis=0)
        y_pred = np.full(np.shape(y), self.init_estimate)
        for i, tree in enumerate(self.trees):
            gradient = self.loss.gradient(y, y_pred)
            tree.fit(X, gradient)
            update = tree.predict(X)
            # Move the running prediction against the estimated gradient
            y_pred -= np.multiply(self.learning_rate, update)

            if self.debug:
                progress = 100 * (i / self.n_estimators)
                print("Progress: %.2f%%" % progress)

    def predict(self, X):
        """Return the boosted prediction for ``X``.

        The scaled output of every tree is subtracted from the starting
        estimate recorded by fit(). If no starting estimate is recorded
        (``init_estimate`` is None) only the tree outputs are combined.
        For classification the scores are passed through a softmax and
        the index of the largest probability in each row is returned.
        """
        y_pred = None
        for tree in self.trees:
            update = np.multiply(self.learning_rate, tree.predict(X))
            # `is None` (rather than `.any()`) marks the first tree, so an
            # all-zero running prediction is never mistaken for "empty"
            y_pred = -update if y_pred is None else y_pred - update

        if self.init_estimate is not None:
            if y_pred is None:
                # No trees at all: the prediction is just the baseline
                out_shape = (np.shape(X)[0],) + np.shape(self.init_estimate)
                y_pred = np.full(out_shape, self.init_estimate)
            else:
                # Add back the baseline that fit() started from
                y_pred = y_pred + self.init_estimate
        elif y_pred is None:
            y_pred = np.array([])

        if not self.regression:
            # Turn into probability distribution; subtracting the row
            # maximum first keeps np.exp from overflowing
            shifted = y_pred - np.max(y_pred, axis=1, keepdims=True)
            exp_scores = np.exp(shifted)
            y_pred = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
            # Set label to the value that maximizes probability
            y_pred = np.argmax(y_pred, axis=1)
        return y_pred
class LogisticRegression():
    """Logistic regression classifier.

    Parameters:
    -----------
    learning_rate: float
        Step length used when moving against the loss gradient during
        training.
    gradient_descent: boolean
        If true the parameters are trained by gradient descent; if false
        a batch update based on a pseudo-inverse is used instead.
    """
    def __init__(self, learning_rate=.1, gradient_descent=True):
        self.learning_rate = learning_rate
        self.gradient_descent = gradient_descent
        # Parameters are created by fit()
        self.param = None
        self.sigmoid = Sigmoid()
        self.log_loss = LogisticLoss()

    def fit(self, X, y, n_iterations=4000):
        """Train the parameters on X and y for n_iterations rounds."""
        # Prepend a column of ones so the first parameter acts as the bias
        X = np.insert(X, 0, 1, axis=1)
        _, n_features = np.shape(X)

        # Draw the initial parameters from [-1/sqrt(N), 1/sqrt(N)]
        limit = 1 / math.sqrt(n_features)
        low, high = -limit, limit
        self.param = (high - low) * np.random.random((n_features, )) + low

        for _ in range(n_iterations):
            # Current prediction with the parameters as they stand
            y_pred = self.sigmoid.function(X.dot(self.param))

            if not self.gradient_descent:
                # Diagonal matrix built from the sigmoid gradient vector
                slope = self.sigmoid.gradient(X.dot(self.param))
                diag_gradient = make_diagonal(slope)
                # Batch update
                lhs = X.T.dot(diag_gradient).dot(X)
                rhs = diag_gradient.dot(X).dot(self.param) + y - y_pred
                self.param = np.linalg.pinv(lhs).dot(X.T).dot(rhs)
                continue

            # Step against the gradient of the loss with respect to the
            # parameters
            step = self.log_loss.gradient(y, X, self.param)
            self.param -= self.learning_rate * step

    def predict(self, X):
        """Return the rounded sigmoid output for every row of X as ints."""
        # Same bias column as in fit()
        with_bias = np.insert(X, 0, 1, axis=1)
        probabilities = self.sigmoid.function(with_bias.dot(self.param))
        return np.round(probabilities).astype(int)
class GradientBoosting(object):
    """Super class of GradientBoostingClassifier and GradientBoostingRegressor.
    Uses a collection of regression trees that trains on predicting the gradient
    of the loss function.

    Parameters:
    -----------
    n_estimators: int
        The number of regression trees that are used.
    learning_rate: float
        The step length that will be taken when following the negative gradient during
        training.
    min_samples_split: int
        The minimum number of samples needed to make a split when building a tree.
    min_impurity: float
        The minimum impurity required to split the tree further.
    max_depth: int
        The maximum depth of a tree.
    regression: boolean
        True or false depending on if we're doing regression or classification.
    debug: boolean
        True or false depending on if we wish to display the training progress.
    """
    def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_impurity, max_depth, regression, debug):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.max_depth = max_depth
        # Starting prediction; recorded by fit() and reused by predict()
        self.init_estimate = None
        self.regression = regression
        self.debug = debug
        self.multipliers = []

        # Square loss for regression
        # Log loss for classification
        self.loss = SquareLoss()
        if not self.regression:
            self.loss = LogisticLoss()

        # Initialize regression trees
        self.trees = []
        for _ in range(n_estimators):
            tree = RegressionTree(min_samples_split=self.min_samples_split,
                                  min_impurity=min_impurity,
                                  max_depth=self.max_depth)
            self.trees.append(tree)

    def fit(self, X, y):
        """Train every tree on the gradient of the loss.

        The running prediction is initialised to the mean of ``y`` over
        axis 0. That value is kept in ``self.init_estimate`` because
        predict() has to start from the same point.
        """
        self.init_estimate = np.mean(y, axis=0)
        y_pred = np.full(np.shape(y), self.init_estimate)
        for i, tree in enumerate(self.trees):

            gradient = self.loss.gradient(y, y_pred)
            tree.fit(X, gradient)
            update = tree.predict(X)
            # Update y prediction
            y_pred -= np.multiply(self.learning_rate, update)

            if self.debug:
                progress = 100 * (i / self.n_estimators)
                print("Progress: %.2f%%" % progress)

    def predict(self, X):
        """Predict targets (regression) or class indices (classification).

        Starts from the estimate stored by fit() and subtracts every
        tree's output scaled by the learning rate. When no estimate is
        stored (``init_estimate`` is None) only the tree outputs are used.
        """
        boosted = None
        # Accumulate the scaled tree outputs
        for tree in self.trees:
            update = np.multiply(self.learning_rate, tree.predict(X))
            # An explicit None check identifies the first tree; testing
            # `.any()` would also be false for an all-zero accumulator
            boosted = -update if boosted is None else boosted - update

        if self.init_estimate is None:
            y_pred = np.array([]) if boosted is None else boosted
        elif boosted is None:
            # Without trees the prediction is the stored baseline
            y_pred = np.full(
                (np.shape(X)[0],) + np.shape(self.init_estimate),
                self.init_estimate)
        else:
            # Re-apply the baseline the trees were trained relative to
            y_pred = boosted + self.init_estimate

        if not self.regression:
            # Turn into probability distribution. The row maximum is
            # subtracted first so that np.exp cannot overflow.
            exp_scores = np.exp(
                y_pred - np.max(y_pred, axis=1, keepdims=True))
            y_pred = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
            # Set label to the value that maximizes probability
            y_pred = np.argmax(y_pred, axis=1)
        return y_pred
class GradientBoosting(object):
    """Super class of GradientBoostingClassifier and GradientBoostingRegressor.
    Uses a collection of regression trees that trains on predicting the gradient
    of the loss function.

    Parameters:
    -----------
    n_estimators: int
        The number of regression trees that are used.
    learning_rate: float
        The step length that will be taken when following the negative gradient during
        training.
    min_samples_split: int
        The minimum number of samples needed to make a split when building a tree.
    min_impurity: float
        The minimum impurity required to split the tree further.
    max_depth: int
        The maximum depth of a tree.
    regression: boolean
        True or false depending on if we're doing regression or classification.
    debug: boolean
        True or false depending on if we wish to display the training progress.
    """
    def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_impurity, max_depth, regression, debug):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.max_depth = max_depth
        # Baseline prediction; filled in by fit(), read by predict()
        self.init_estimate = None
        self.regression = regression
        self.debug = debug
        self.multipliers = []

        # Square loss for regression
        # Log loss for classification
        self.loss = SquareLoss()
        if not self.regression:
            self.loss = LogisticLoss()

        # Initialize regression trees
        self.trees = []
        for _ in range(n_estimators):
            tree = RegressionTree(
                    min_samples_split=self.min_samples_split,
                    min_impurity=min_impurity,
                    max_depth=self.max_depth)
            self.trees.append(tree)

    def fit(self, X, y):
        """Fit the trees sequentially to the gradient of the loss.

        The prediction being refined starts at the mean of ``y`` (axis 0).
        The mean is saved as ``self.init_estimate`` so predict() can use
        the same starting point.
        """
        baseline = np.mean(y, axis=0)
        self.init_estimate = baseline
        y_pred = np.full(np.shape(y), baseline)
        for i, tree in enumerate(self.trees):
            gradient = self.loss.gradient(y, y_pred)
            tree.fit(X, gradient)
            update = tree.predict(X)
            # Update y prediction
            y_pred -= np.multiply(self.learning_rate, update)

            if self.debug:
                progress = 100 * (i / self.n_estimators)
                print("Progress: %.2f%%" % progress)

    def predict(self, X):
        """Return predictions for ``X``.

        The learning-rate-scaled output of each tree is subtracted from
        the baseline saved by fit(). If no baseline has been saved
        (``init_estimate`` is None) the tree outputs alone are combined.
        In classification mode a softmax is applied and the index of the
        most probable class in each row is returned.
        """
        total = None
        # Make predictions
        for tree in self.trees:
            update = np.multiply(self.learning_rate, tree.predict(X))
            if total is None:
                # First tree: its output defines the shape of the result
                total = -update
            else:
                total = total - update

        baseline = self.init_estimate
        if total is None:
            # No trees: fall back to the baseline, or to an empty array
            # when fit() has not recorded one
            if baseline is None:
                y_pred = np.array([])
            else:
                n_samples = np.shape(X)[0]
                y_pred = np.full((n_samples,) + np.shape(baseline), baseline)
        elif baseline is None:
            y_pred = total
        else:
            # Start from the same baseline the trees were trained against
            y_pred = total + baseline

        if not self.regression:
            # Turn into probability distribution; shifting by the row
            # maximum prevents overflow inside np.exp
            row_max = np.max(y_pred, axis=1, keepdims=True)
            exp_scores = np.exp(y_pred - row_max)
            y_pred = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
            # Set label to the value that maximizes probability
            y_pred = np.argmax(y_pred, axis=1)
        return y_pred
class GradientBoosting(object):
    """Gradient boosting built from regression trees.

    Each tree is fitted to the gradient of the loss at the current
    prediction, and its (learning-rate scaled) output is subtracted from
    that prediction. The starting prediction is the median of the
    training targets.
    """
    def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_impurity, max_depth, regression):
        self.regression = regression
        self.learning_rate = learning_rate
        # Number of trees
        self.n_estimators = n_estimators
        # Minimum number of samples to justify a split
        self.min_samples_split = min_samples_split
        # Minimum variance reduction to continue
        self.min_impurity = min_impurity
        # Maximum depth for a tree
        self.max_depth = max_depth
        # The initial prediction of y (set by fit)
        self.init_estimate = None

        # Square loss for regression, log loss for classification
        loss_fn = SquareLoss()
        if not regression:
            loss_fn = LogisticLoss()
        self.loss = loss_fn

        # Create the regression trees; they are fitted in fit()
        self.trees = [
            RegressionTree(min_samples_split=min_samples_split,
                           min_impurity=min_impurity,
                           max_depth=max_depth)
            for _ in range(n_estimators)
        ]

    def fit(self, X, y):
        """Fit all trees, starting from the median of y along axis 0."""
        baseline = np.median(y, axis=0)
        self.init_estimate = baseline
        running = np.full(np.shape(y), baseline)
        target_shape = np.shape(running)

        for estimator in self.trees:
            # Train the tree on the loss gradient at the current prediction
            estimator.fit(X, self.loss.gradient(y, running))

            # Bring the tree output into the shape of the prediction
            step = np.array(estimator.predict(X)).reshape(target_shape)

            # Move the prediction against the estimated gradient
            running -= np.multiply(self.learning_rate, step)

    def predict(self, X):
        """Return predictions for X, as class indices when classifying."""
        # The result has shape (n_samples,) for a scalar initial estimate
        # and (n_samples, n_outputs) otherwise
        n_rows = np.shape(X)[0]
        baseline_shape = np.shape(self.init_estimate)
        out_shape = (n_rows, baseline_shape[0]) if baseline_shape else n_rows
        scores = np.full(out_shape, self.init_estimate)

        # Subtract the scaled contribution of every tree
        for estimator in self.trees:
            step = np.array(estimator.predict(X)).reshape(np.shape(scores))
            scores -= np.multiply(self.learning_rate, step)

        if self.regression:
            return scores

        # Turn the scores into a probability distribution per row ...
        exp_scores = np.exp(scores)
        probabilities = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        # ... and pick the label with the highest probability
        return np.argmax(probabilities, axis=1)