Example #1
# Imports assumed from the auto-sklearn / ConfigSpace code base this
# component comes from (not shown in the original snippet).
from ConfigSpace.conditions import EqualsCondition, InCondition
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformFloatHyperparameter,
)

from autosklearn.pipeline.components.base import (
    AutoSklearnClassificationAlgorithm,
    IterativeComponentWithSampleWeight,
)
from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA
from autosklearn.pipeline.implementations.util import softmax
from autosklearn.util.common import check_for_bool


class SGD(
        IterativeComponentWithSampleWeight,
        AutoSklearnClassificationAlgorithm,
):
    def __init__(self,
                 loss,
                 penalty,
                 alpha,
                 fit_intercept,
                 tol,
                 learning_rate,
                 l1_ratio=0.15,
                 epsilon=0.1,
                 eta0=0.01,
                 power_t=0.5,
                 average=False,
                 random_state=None):
        self.max_iter = self.get_max_iter()
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average

        self.estimator = None
        self.n_iter_ = None

    @staticmethod
    def get_max_iter():
        return 1024

    def get_current_iter(self):
        return self.n_iter_

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model import SGDClassifier

        # Fit at least two iterations, otherwise early stopping cannot work:
        # the only way to detect convergence is to observe SGD spending fewer
        # iterations than max_iter. With max_iter == 1 the classifier always
        # spends exactly one iteration, so convergence can never be detected.

        if refit:
            self.estimator = None
            self.n_iter_ = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \
                else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None \
                else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) if self.power_t is not None \
                else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
            self.n_iter_ = self.estimator.n_iter_
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter,
                                          self.max_iter)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X,
                y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None)
            self.n_iter_ += self.estimator.n_iter_

        # Done when the iteration budget is exhausted, or when SGD converged
        # early (it was allowed more iterations than it actually consumed).
        if (self.estimator.max_iter >= self.max_iter
                or self.estimator.max_iter > self.n_iter_):
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'handles_multioutput': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default_value="log",
        )
        penalty = CategoricalHyperparameter("penalty",
                                            ["l1", "l2", "elasticnet"],
                                            default_value="l2")
        alpha = UniformFloatHyperparameter("alpha",
                                           1e-7,
                                           1e-1,
                                           log=True,
                                           default_value=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio",
                                              1e-9,
                                              1,
                                              log=True,
                                              default_value=0.15)
        fit_intercept = CategoricalHyperparameter("fit_intercept",
                                                  ["True", "False"],
                                                  default_value="True")
        tol = UniformFloatHyperparameter("tol",
                                         1e-5,
                                         1e-1,
                                         log=True,
                                         default_value=1e-4)
        epsilon = UniformFloatHyperparameter("epsilon",
                                             1e-5,
                                             1e-1,
                                             default_value=1e-4,
                                             log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default_value="invscaling")
        eta0 = UniformFloatHyperparameter("eta0",
                                          1e-7,
                                          1e-1,
                                          default_value=0.01,
                                          log=True)
        power_t = UniformFloatHyperparameter("power_t",
                                             1e-5,
                                             1,
                                             default_value=0.5)
        average = CategoricalHyperparameter("average", ["False", "True"],
                                            default_value="False")
        cs.add_hyperparameters([
            loss, penalty, alpha, l1_ratio, fit_intercept, tol, epsilon,
            learning_rate, eta0, power_t, average
        ])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")

        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        # eta0 is only relevant if learning_rate!='optimal' according to code
        # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/
        # linear_model/sgd_fast.pyx#L603
        eta0_in_inv_con = InCondition(eta0, learning_rate,
                                      ["invscaling", "constant"])
        cs.add_conditions([
            elasticnet, epsilon_condition, power_t_condition, eta0_in_inv_con
        ])

        return cs
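A minimal driver sketch (not part of the original snippet) showing how the
component above is typically exercised: grow the iteration budget in small
steps until configuration_fully_fitted() reports convergence or the
1024-iteration cap. The toy data and hyperparameter values are illustrative
assumptions, and it presumes the older scikit-learn pinned by auto-sklearn,
where the logistic loss is still spelled "log".

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
clf = SGD(loss="log", penalty="l2", alpha=1e-4, fit_intercept="True",
          tol=1e-4, learning_rate="invscaling", random_state=0)
while not clf.configuration_fully_fitted():
    clf.iterative_fit(X, y, n_iter=4)  # a few more iterations per call
print(clf.get_current_iter())
print(clf.predict_proba(X[:3]))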
Example #2
# Imports assumed (not shown in the original snippet); test_all and
# batch_generator are helpers local to the same code base.
import numpy as np
from sklearn.linear_model import SGDClassifier


def _train(train_data,
           valid_data,
           costs,
           importance,
           max_iter=10,
           alpha=0.1,
           minibatch=1000,
           epochs=10,
           l1_ratio=1.0,
           penalty='none',
           eta0=0.01):
    """Train one cost-aware linear model using SGD.

    Args:
        max_iter: number of passes over the mini-batch (mini-epoch?)
        alpha: regularizer weight
        minibatch: size of a mini-batch
        epochs: number of passes over the training data
    """
    x_train, y_train, qid_train = train_data
    x_valid, y_valid, qid_valid = valid_data

    # n_iter targets an older scikit-learn (it was deprecated in 0.19 and
    # later replaced by max_iter); the private _partial_fit call below
    # likewise uses that older signature.
    model = SGDClassifier(alpha=alpha,
                          verbose=False,
                          shuffle=False,
                          n_iter=max_iter,
                          learning_rate='constant',
                          penalty=penalty,
                          l1_ratio=l1_ratio,
                          eta0=eta0)

    model.classes_ = np.array([-1, 1])

    # fit SGD over the full data to initialize the model weights
    model.fit(x_train, y_train)

    valid_scores = (np.nan, np.nan, np.nan)
    if x_valid is not None:
        m = test_all(model.decision_function(x_valid), y_valid, qid_valid, 1)
        valid_scores = (m['ndcg@10'], m['p@10'], m['err@10'])
        print('[%3i]: weighted L1 %8.2f, cost %8d, features %4d, '
              'valid ndcg@10/p@10/err@10 %0.4f/%0.4f/%0.4f'
              % (0,
                 np.sum(np.abs(model.coef_[0] * costs)),
                 np.sum(costs[np.nonzero(model.coef_[0])]),
                 np.count_nonzero(model.coef_[0]),
                 valid_scores[0], valid_scores[1], valid_scores[2]))

    # Cumulative-penalty L1-style SGD (Tsuruoka et al., 2009): u tracks the
    # total cost-weighted penalty each weight could have received so far; q
    # records how far the penalty has actually moved each weight (signed).
    u = np.zeros(x_train.shape[1])
    q = np.zeros(x_train.shape[1])

    for epoch in range(1, epochs + 1):
        for iterno, batch in enumerate(
                batch_generator(x_train, y_train, minibatch, x_train.shape[0]),
                1):
            x, y = batch

            # call the internal method to specify custom classes, coef_init, and intercept_init
            model._partial_fit(x,
                               y,
                               alpha=model.alpha,
                               C=1.0,
                               loss=model.loss,
                               learning_rate=model.learning_rate,
                               n_iter=1,
                               classes=model.classes_,
                               sample_weight=None,
                               coef_init=model.coef_,
                               intercept_init=model.intercept_)

            new_w = np.zeros(model.coef_.shape[1])
            # Grow the maximum cumulative penalty each weight could have
            # received, scaled by the per-feature cost.
            u += model.eta0 * model.alpha * costs / float(x_train.shape[0])

            # Shrink every weight toward zero by the penalty it has not yet
            # received, clipping at zero instead of letting the sign flip.
            for i in range(len(model.coef_[0])):
                if model.coef_[0][i] > 0:
                    new_w[i] = max(0, model.coef_[0][i] - (u[i] + q[i]))
                elif model.coef_[0][i] < 0:
                    new_w[i] = min(0, model.coef_[0][i] + (u[i] - q[i]))
            q += new_w - model.coef_[0]
            model.coef_[0] = new_w

        valid_scores = (np.nan, np.nan, np.nan)
        if x_valid is not None:
            m = test_all(model.decision_function(x_valid), y_valid, qid_valid,
                         1)
            valid_scores = (m['ndcg@10'], m['p@10'], m['err@10'])
        print('[%3i]: weighted L1 %8.2f, cost %8d, features %4d, '
              'valid ndcg@10/p@10/err@10 %0.4f/%0.4f/%0.4f'
              % (epoch,
                 np.sum(np.abs(model.coef_[0] * costs)),
                 np.sum(costs[np.nonzero(model.coef_[0])]),
                 np.count_nonzero(model.coef_[0]),
                 valid_scores[0], valid_scores[1], valid_scores[2]))

    return model
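For reference, a self-contained sketch (hypothetical names, independent of
the model object above) of the cumulative-penalty rule the inner loop
implements: shrink each weight toward zero by the L1 penalty it has not yet
received, pinning it at exactly zero rather than letting the sign flip.

import numpy as np

def cumulative_l1_clip(w, u, q):
    # u: total penalty available to each weight so far; q: signed movement
    # already caused by the penalty (both per Tsuruoka et al., 2009).
    new_w = np.where(w > 0, np.maximum(0.0, w - (u + q)),
                     np.where(w < 0, np.minimum(0.0, w + (u - q)), 0.0))
    q += new_w - w  # record the movement actually applied this step
    return new_w

w = np.array([0.9, -0.4, 0.05])
u = np.full(3, 0.1)  # cumulative penalty budget so far
q = np.zeros(3)
print(cumulative_l1_clip(w, u, q))  # -> [ 0.8 -0.3  0. ]; 0.05 is pinned to 0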