def SGD(self, train_features, test_features):
    """Train one SGD classifier per target class and write the predictions.

    For every name in the module-level ``class_names`` an ``SGDClassifier``
    is cross-validated (3 folds, ROC-AUC), refitted on all training
    features, appended to the pickle file ``SGD.pckl`` and used to predict
    probabilities for the test features.  Column 1 of ``predict_proba`` is
    stored per class and the resulting table is written to ``SGD.csv``.

    Relies on the module-level names ``train``, ``test`` and
    ``class_names`` (not visible in this block).

    Args:
        train_features: Feature matrix used for cross-validation and fitting.
        test_features: Feature matrix for which probabilities are predicted.
    """
    print("in SGD")
    self.train_features = train_features
    self.test_features = test_features
    scores = []
    submission = pd.DataFrame.from_dict({'id': test['Id']})
    SGD_file = 'SGD.pckl'
    # The context manager closes the pickle file even when cross-validation
    # or fitting raises; the previous manual open()/close() pair leaked the
    # handle in that case.
    with open(SGD_file, 'wb') as SGD_model_pkl:
        for class_name in class_names:
            train_target = train[class_name]
            classifier = SGDClassifier(loss='modified_huber', penalty='l2',
                                       alpha=0.001, random_state=42,
                                       max_iter=200, tol=0.20,
                                       learning_rate='optimal')
            cv_score = np.mean(
                cross_val_score(classifier, train_features, train_target,
                                cv=3, scoring='roc_auc'))
            scores.append(cv_score)
            print('CV score for class {} is {}'.format(class_name, cv_score))
            classifier.fit(train_features, train_target)
            # One pickle record per class, in ``class_names`` order.
            pickle.dump(classifier, SGD_model_pkl)
            submission[class_name] = classifier.predict_proba(
                test_features)[:, 1]
    print('Total CV score is {}'.format(np.mean(scores)))
    submission.to_csv('SGD.csv', index=False)
class SGDClassifierImpl():
    """Wrapper that stores hyperparameters and delegates to an ``SKLModel``.

    Every public method forwards to the wrapped model; ``fit`` and
    ``partial_fit`` return the wrapper itself so calls can be chained.
    """

    def __init__(self, loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight='balanced', warm_start=False, average=False):
        """Record the hyperparameters and build the wrapped model from them."""
        self._hyperparams = dict(
            loss=loss,
            penalty=penalty,
            alpha=alpha,
            l1_ratio=l1_ratio,
            fit_intercept=fit_intercept,
            max_iter=max_iter,
            tol=tol,
            shuffle=shuffle,
            verbose=verbose,
            epsilon=epsilon,
            n_jobs=n_jobs,
            random_state=random_state,
            learning_rate=learning_rate,
            eta0=eta0,
            power_t=power_t,
            early_stopping=early_stopping,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            class_weight=class_weight,
            warm_start=warm_start,
            average=average,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is only forwarded when it is given."""
        model = self._wrapped_model
        if y is None:
            model.fit(X)
        else:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        model = self._wrapped_model
        return model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba`` output for ``X``."""
        model = self._wrapped_model
        return model.predict_proba(X)

    def decision_function(self, X):
        """Return the wrapped model's decision-function values for ``X``."""
        model = self._wrapped_model
        return model.decision_function(X)

    def partial_fit(self, X, y=None, classes = None):
        """Run one ``partial_fit`` step, building the model first if missing."""
        model_missing = not hasattr(self, "_wrapped_model")
        if model_missing:
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.partial_fit(X, y, classes=classes)
        return self
class SGD(
    IterativeComponentWithSampleWeight,
    BaseClassificationModel,
):
    """Incrementally fitted SGD classifier component.

    Hyperparameters may arrive either as plain values or, for ``loss``,
    ``penalty`` and ``learning_rate``, as ``(name, {sub-params})`` tuples as
    produced by the ``'tpe'`` search space below; the tuples are unpacked
    when the estimator is first built.
    """

    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        """Store the hyperparameters; no estimator is created yet."""
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None
        self.time_limit = None
        self.start_time = time.time()

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        """Fit for ``n_iter`` more iterations, building the model if needed.

        Args:
            X: Training features.
            y: Training targets.
            n_iter: Number of additional iterations to run.
            refit: When true, discard the current estimator and start over.
            sample_weight: Optional per-sample weights forwarded to sklearn.

        Returns:
            ``self``.  ``fully_fit_`` is set once the iteration budget of 512
            is reached or the solver used fewer iterations than allowed.
        """
        # Need to fit at least two iterations, otherwise early stopping will
        # not work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends
        # less iterations than max_iter. If max_iter == 1, it has to spend at
        # least one iteration and will always spend at least one iteration,
        # so we cannot know about convergence.
        if refit:
            self.estimator = None

        if self.estimator is None:
            # Public import path: the private module
            # ``sklearn.linear_model.stochastic_gradient`` is not available
            # in newer scikit-learn releases.  Imported here because only
            # this branch constructs an estimator.
            from sklearn.linear_model import SGDClassifier

            # Unpack ``(name, {sub-params})`` tuples into flat attributes.
            if isinstance(self.loss, tuple):
                nested_loss = self.loss
                self.loss = nested_loss[0]
                if self.loss == 'modified_huber':
                    self.epsilon = nested_loss[1]['epsilon']

            if isinstance(self.penalty, tuple):
                nested_penalty = self.penalty
                self.penalty = nested_penalty[0]
                if self.penalty == "elasticnet":
                    self.l1_ratio = nested_penalty[1]['l1_ratio']

            if isinstance(self.learning_rate, tuple):
                nested_learning_rate = self.learning_rate
                self.learning_rate = nested_learning_rate[0]
                if self.learning_rate == 'invscaling':
                    self.eta0 = nested_learning_rate[1]['eta0']
                    self.power_t = nested_learning_rate[1]['power_t']
                elif self.learning_rate == 'constant':
                    self.eta0 = nested_learning_rate[1]['eta0']

            self.fully_fit_ = False

            # Inactive conditional hyperparameters arrive as None; fall back
            # to fixed defaults for them.
            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) \
                if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) \
                if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0) if self.eta0 is not None else 0.01
            self.power_t = float(self.power_t) \
                if self.power_t is not None else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            # Continue training the existing model; the total budget is
            # capped at 512 iterations.
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None
            )

        if self.estimator.max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        """Return whether fitting has finished (False before any fit)."""
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        """Predict labels for ``X``.

        Raises:
            NotImplementedError: If the model has not been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return class probabilities for ``X``.

        Losses ``log`` and ``modified_huber`` use sklearn's
        ``predict_proba``; for all other losses the decision-function values
        are passed through ``softmax``.

        Raises:
            NotImplementedError: If the model has not been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static capability description of this component."""
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        """Build the hyperparameter search space for the given optimizer.

        Args:
            dataset_properties: Unused here.
            optimizer: ``'smac'`` returns a ``ConfigurationSpace``; ``'tpe'``
                returns a dict of ``hp`` expressions.

        Returns:
            The search space, or ``None`` for any other optimizer name.
        """
        if optimizer == 'smac':
            cs = ConfigurationSpace()

            loss = CategoricalHyperparameter(
                "loss",
                ["hinge", "log", "modified_huber", "squared_hinge",
                 "perceptron"],
                default_value="log")
            penalty = CategoricalHyperparameter(
                "penalty", ["l1", "l2", "elasticnet"], default_value="l2")
            alpha = UniformFloatHyperparameter(
                "alpha", 1e-7, 1e-1, log=True, default_value=0.0001)
            l1_ratio = UniformFloatHyperparameter(
                "l1_ratio", 1e-9, 1, log=True, default_value=0.15)
            fit_intercept = UnParametrizedHyperparameter(
                "fit_intercept", "True")
            tol = UniformFloatHyperparameter(
                "tol", 1e-5, 1e-1, log=True, default_value=1e-4)
            epsilon = UniformFloatHyperparameter(
                "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True)
            learning_rate = CategoricalHyperparameter(
                "learning_rate", ["optimal", "invscaling", "constant"],
                default_value="invscaling")
            eta0 = UniformFloatHyperparameter(
                "eta0", 1e-7, 1e-1, default_value=0.01, log=True)
            power_t = UniformFloatHyperparameter(
                "power_t", 1e-5, 1, log=True, default_value=0.5)
            average = CategoricalHyperparameter(
                "average", ["False", "True"], default_value="False")
            cs.add_hyperparameters([loss, penalty, alpha, l1_ratio,
                                    fit_intercept, tol, epsilon,
                                    learning_rate, eta0, power_t, average])

            # TODO add passive/aggressive here, although not properly
            # documented?
            elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
            epsilon_condition = EqualsCondition(epsilon, loss,
                                                "modified_huber")

            power_t_condition = EqualsCondition(power_t, learning_rate,
                                                "invscaling")

            # eta0 is only relevant if learning_rate!='optimal' according to
            # code
            # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/
            # linear_model/sgd_fast.pyx#L603
            eta0_in_inv_con = InCondition(eta0, learning_rate,
                                          ["invscaling", "constant"])
            cs.add_conditions([elasticnet, epsilon_condition,
                               power_t_condition, eta0_in_inv_con])

            return cs
        elif optimizer == 'tpe':
            # eta0 is shared by the 'invscaling' and 'constant' schedules.
            eta0 = hp.loguniform('sgd_eta0', np.log(1e-7), np.log(1e-1))
            # The sub-parameter dicts mirror the conditions of the 'smac'
            # space; iterative_fit() unpacks these tuples.  No default
            # configuration is encoded in this space.
            space = {
                'loss': hp.choice('sgd_loss', [
                    ("modified_huber",
                     {'epsilon': hp.loguniform('sgd_epsilon',
                                               np.log(1e-5), np.log(1e-1))}),
                    ("hinge", {}),
                    ("log", {}),
                    ("squared_hinge", {}),
                    ("perceptron", {})]),
                'penalty': hp.choice('sgd_penalty', [
                    ("elasticnet",
                     {'l1_ratio': hp.loguniform('sgd_l1_ratio',
                                                np.log(1e-9), np.log(1))}),
                    ("l1", None),
                    ("l2", None)]),
                'alpha': hp.loguniform('sgd_alpha',
                                       np.log(1e-7), np.log(1e-1)),
                'fit_intercept': hp.choice('sgd_fit_intercept', ["True"]),
                'tol': hp.loguniform('sgd_tol', np.log(1e-5), np.log(1e-1)),
                'learning_rate': hp.choice('sgd_learning_rate', [
                    ("optimal", {}),
                    ("invscaling",
                     {'power_t': hp.loguniform('sgd_power_t',
                                               np.log(1e-5), np.log(1)),
                      'eta0': eta0}),
                    ("constant", {'eta0': eta0})]),
                'average': hp.choice('sgd_average', ["True", "False"])}

            return space
class SGD(AutoSklearnClassificationAlgorithm):
    """SGD classifier component fitted with a doubling iteration budget."""

    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        """Store the hyperparameters; no estimator is created yet."""
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    @staticmethod
    def _as_bool(value):
        """Interpret a bool or the strings 'True'/'False' as a boolean."""
        if isinstance(value, str):
            return value == 'True'
        return bool(value)

    def fit(self, X, y, sample_weight=None):
        """Fit from scratch, doubling ``n_iter`` until fully fitted."""
        n_iter = 2
        self.iterative_fit(X, y, n_iter=n_iter, sample_weight=sample_weight,
                           refit=True)
        while not self.configuration_fully_fitted():
            n_iter *= 2
            self.iterative_fit(X, y, n_iter=n_iter,
                               sample_weight=sample_weight)

        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        """Fit for ``n_iter`` more iterations, building the model if needed.

        Args:
            X: Training features.
            y: Training targets.
            n_iter: Number of additional iterations to run.
            refit: When true, discard the current estimator and start over.
            sample_weight: Optional per-sample weights forwarded to sklearn.

        Returns:
            ``self``.  ``fully_fit_`` is set once ``max_iter`` reaches 1000
            or the solver used fewer iterations than allowed.
        """
        # Need to fit at least two iterations, otherwise early stopping will
        # not work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends
        # less iterations than max_iter. If max_iter == 1, it has to spend at
        # least one iteration and will always spend at least one iteration,
        # so we cannot know about convergence.
        if refit:
            self.estimator = None

        if self.estimator is None:
            # Public import path; the private ``stochastic_gradient`` module
            # is not available in newer scikit-learn releases.
            from sklearn.linear_model import SGDClassifier

            # Reset the flag so that a refit of an already fully fitted
            # model runs the whole schedule again instead of stopping after
            # the first call.
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            # ``x == 'True'`` would turn an already converted ``True`` into
            # ``False`` on refit, so booleans are passed through unchanged.
            self.fit_intercept = self._as_bool(self.fit_intercept)
            self.l1_ratio = float(
                self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(
                self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(
                self.power_t) if self.power_t is not None else 0.25
            self.average = self._as_bool(self.average)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None)

        # Read the public ``max_iter`` (the value updated above) rather than
        # the private ``_max_iter`` mirror.
        if self.estimator.max_iter >= 1000 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        """Return whether fitting has finished (False before any fit)."""
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        """Predict labels for ``X``.

        Raises:
            NotImplementedError: If the model has not been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return class probabilities for ``X``.

        Losses ``log`` and ``modified_huber`` use sklearn's
        ``predict_proba``; for all other losses the decision-function values
        are passed through ``softmax``.

        Raises:
            NotImplementedError: If the model has not been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static capability description of this component."""
        return {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Build and return the ``ConfigurationSpace`` of this component."""
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default_value="log")
        penalty = CategoricalHyperparameter("penalty",
                                            ["l1", "l2", "elasticnet"],
                                            default_value="l2")
        alpha = UniformFloatHyperparameter("alpha", 1e-7, 1e-1, log=True,
                                           default_value=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio", 1e-9, 1, log=True,
                                              default_value=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, log=True,
                                         default_value=1e-4)
        epsilon = UniformFloatHyperparameter("epsilon", 1e-5, 1e-1,
                                             default_value=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default_value="invscaling")
        eta0 = UniformFloatHyperparameter("eta0", 1e-7, 1e-1,
                                          default_value=0.01)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1,
                                             default_value=0.25)
        average = CategoricalHyperparameter("average", ["False", "True"],
                                            default_value="False")
        cs.add_hyperparameters([
            loss, penalty, alpha, l1_ratio, fit_intercept, tol, epsilon,
            learning_rate, eta0, power_t, average
        ])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        # Therefore no condition is attached to eta0.
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition])

        return cs
class SGD(
    IterativeComponentWithSampleWeight,
    AutoSklearnClassificationAlgorithm,
):
    """Incrementally fitted SGD classifier component (512-iteration cap)."""

    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        """Store the hyperparameters; no estimator is created yet."""
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        """Fit for ``n_iter`` more iterations, building the model if needed.

        Args:
            X: Training features.
            y: Training targets.
            n_iter: Number of additional iterations to run.
            refit: When true, discard the current estimator and start over.
            sample_weight: Optional per-sample weights forwarded to sklearn.

        Returns:
            ``self``.  ``fully_fit_`` is set once the iteration budget of 512
            is reached or the solver used fewer iterations than allowed.
        """
        # Need to fit at least two iterations, otherwise early stopping will
        # not work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends
        # less iterations than max_iter. If max_iter == 1, it has to spend at
        # least one iteration and will always spend at least one iteration,
        # so we cannot know about convergence.
        if refit:
            self.estimator = None

        if self.estimator is None:
            # Public import path; the private ``stochastic_gradient`` module
            # is not available in newer scikit-learn releases.
            from sklearn.linear_model import SGDClassifier

            self.fully_fit_ = False

            # Inactive conditional hyperparameters arrive as None; fall back
            # to fixed defaults for them.
            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) \
                if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) \
                if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) \
                if self.power_t is not None else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            # Continue training the existing model; the total budget is
            # capped at 512 iterations.
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None
            )

        # Read the public ``max_iter`` (the value capped above) rather than
        # the private ``_max_iter`` mirror, which not every scikit-learn
        # release provides.
        if self.estimator.max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        """Return whether fitting has finished (False before any fit)."""
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        """Predict labels for ``X``.

        Raises:
            NotImplementedError: If the model has not been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return class probabilities for ``X``.

        Losses ``log`` and ``modified_huber`` use sklearn's
        ``predict_proba``; for all other losses the decision-function values
        are passed through ``softmax``.

        Raises:
            NotImplementedError: If the model has not been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static capability description of this component."""
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Build and return the ``ConfigurationSpace`` of this component."""
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default_value="log")
        penalty = CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default_value="l2")
        alpha = UniformFloatHyperparameter(
            "alpha", 1e-7, 1e-1, log=True, default_value=0.0001)
        l1_ratio = UniformFloatHyperparameter(
            "l1_ratio", 1e-9, 1, log=True, default_value=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, log=True,
                                         default_value=1e-4)
        epsilon = UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default_value="invscaling")
        eta0 = UniformFloatHyperparameter(
            "eta0", 1e-7, 1e-1, default_value=0.01, log=True)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1,
                                             default_value=0.5)
        average = CategoricalHyperparameter(
            "average", ["False", "True"], default_value="False")
        cs.add_hyperparameters([loss, penalty, alpha, l1_ratio, fit_intercept,
                                tol, epsilon, learning_rate, eta0, power_t,
                                average])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")

        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        # eta0 is only relevant if learning_rate!='optimal' according to code
        # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/
        # linear_model/sgd_fast.pyx#L603
        eta0_in_inv_con = InCondition(eta0, learning_rate,
                                      ["invscaling", "constant"])
        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition,
                           eta0_in_inv_con])

        return cs
class SGD(AutoSklearnClassificationAlgorithm):
    """SGD classifier component trained one ``partial_fit`` pass at a time."""

    def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        """Store the hyperparameters; no estimator is created yet."""
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    @staticmethod
    def _as_bool(value):
        """Interpret a bool or the strings 'True'/'False' as a boolean."""
        if isinstance(value, str):
            return value == 'True'
        return bool(value)

    def fit(self, X, y, sample_weight=None):
        """Fit from scratch, one iteration at a time, until fully fitted."""
        self.iterative_fit(X, y, n_iter=1, sample_weight=sample_weight,
                           refit=True)
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1, sample_weight=sample_weight)

        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False, sample_weight=None):
        """Run one ``partial_fit`` call, building the model if needed.

        Args:
            X: Training features.
            y: Training targets; its unique values are passed as ``classes``.
            n_iter: Amount added to the estimator's ``n_iter`` counter.
            refit: When true, discard the current estimator and start over.
            sample_weight: Optional per-sample weights forwarded to sklearn.

        Returns:
            ``self``.  ``fully_fit_`` is set once the estimator's ``n_iter``
            reaches the configured ``self.n_iter``.
        """
        if refit:
            self.estimator = None

        if self.estimator is None:
            # Public import path instead of the private
            # ``sklearn.linear_model.stochastic_gradient`` module.
            from sklearn.linear_model import SGDClassifier

            # Reset the flag so that a refit of an already fully fitted
            # model trains for the whole budget again instead of stopping
            # after the first pass.
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            # ``x == 'True'`` would turn an already converted ``True`` into
            # ``False`` on refit, so booleans are passed through unchanged.
            self.fit_intercept = self._as_bool(self.fit_intercept)
            self.n_iter = int(self.n_iter)
            self.l1_ratio = float(
                self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(
                self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(
                self.power_t) if self.power_t is not None else 0.25
            self.average = self._as_bool(self.average)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           n_iter=n_iter,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state)
        else:
            # The estimator's ``n_iter`` attribute doubles as the counter of
            # iterations spent so far.
            self.estimator.n_iter += n_iter

        self.estimator.partial_fit(X, y, classes=np.unique(y),
                                   sample_weight=sample_weight)

        if self.estimator.n_iter >= self.n_iter:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        """Return whether fitting has finished (False before any fit)."""
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        """Predict labels for ``X``.

        Raises:
            NotImplementedError: If the model has not been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return class probabilities for ``X``.

        Losses ``log`` and ``modified_huber`` use sklearn's
        ``predict_proba``; for all other losses the decision-function values
        are passed through ``softmax``.

        Raises:
            NotImplementedError: If the model has not been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static capability description of this component."""
        return {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Build and return the ``ConfigurationSpace`` of this component."""
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="log")
        penalty = CategoricalHyperparameter("penalty",
                                            ["l1", "l2", "elasticnet"],
                                            default="l2")
        # NOTE(review): 10e-7 equals 1e-6; confirm the lower bound is
        # intended.
        alpha = UniformFloatHyperparameter("alpha", 10e-7, 1e-1, log=True,
                                           default=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio", 1e-9, 1, log=True,
                                              default=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter", 5, 1000, log=True,
                                              default=20)
        epsilon = UniformFloatHyperparameter("epsilon", 1e-5, 1e-1,
                                             default=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal")
        eta0 = UniformFloatHyperparameter("eta0", 10**-7, 0.1, default=0.01)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default=0.25)
        average = CategoricalHyperparameter("average", ["False", "True"],
                                            default="False")
        cs.add_hyperparameters([
            loss, penalty, alpha, l1_ratio, fit_intercept, n_iter, epsilon,
            learning_rate, eta0, power_t, average
        ])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        # Therefore no condition is attached to eta0.
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition])

        return cs
class SGD(ParamSklearnClassificationAlgorithm):
    """SGD classifier component of ParamSklearn."""

    def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
                 learning_rate, class_weight=None, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        """Store the hyperparameters; no estimator is created yet."""
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.class_weight = class_weight
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    @staticmethod
    def _as_bool(value):
        """Interpret a bool or the strings 'True'/'False' as a boolean."""
        if isinstance(value, str):
            return value == 'True'
        return bool(value)

    def fit(self, X, y):
        """Fit a fresh model on ``X``/``y`` and return ``self``.

        The first call always rebuilds the estimator, so calling ``fit`` on
        an already fitted instance retrains it instead of silently keeping
        the old model.
        """
        self.iterative_fit(X, y, n_iter=1, refit=True)
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)

        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        """Build the estimator if needed, raise its ``n_iter`` and fit it.

        Args:
            X: Training features.
            y: Training targets.
            n_iter: Amount added to the estimator's ``n_iter`` before fitting.
            refit: When true, discard the current estimator and start over.

        Returns:
            ``self``.
        """
        if refit:
            self.estimator = None

        if self.estimator is None:
            self.alpha = float(self.alpha)
            # ``x == 'True'`` would turn an already converted ``True`` into
            # ``False`` on refit, so booleans are passed through unchanged.
            self.fit_intercept = self._as_bool(self.fit_intercept)
            self.n_iter = int(self.n_iter)
            if self.class_weight == "None":
                self.class_weight = None
            self.l1_ratio = float(self.l1_ratio) \
                if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) \
                if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) \
                if self.power_t is not None else 0.25
            self.average = self._as_bool(self.average)
            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           n_iter=self.n_iter,
                                           learning_rate=self.learning_rate,
                                           class_weight=self.class_weight,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state)

        # NOTE(review): the estimator is created with the full ``n_iter``
        # and then incremented again, so the first call already fits with
        # ``self.n_iter + n_iter`` iterations -- confirm this is intended.
        self.estimator.n_iter += n_iter
        self.estimator.fit(X, y)
        return self

    def configuration_fully_fitted(self):
        """Return True once the estimator's ``n_iter`` reached ``n_iter``."""
        if self.estimator is None:
            return False
        return not self.estimator.n_iter < self.n_iter

    def predict(self, X):
        """Predict labels for ``X``.

        Raises:
            NotImplementedError: If the model has not been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return class probabilities for ``X``.

        Losses ``log`` and ``modified_huber`` use sklearn's
        ``predict_proba``; for all other losses the decision-function values
        are passed through ``softmax``.

        Raises:
            NotImplementedError: If the model has not been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static capability description of this component."""
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': True,
                'prefers_data_normalized': True,
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here!
                'preferred_dtype': None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Build and return the ``ConfigurationSpace`` of this component."""
        cs = ConfigurationSpace()

        loss = cs.add_hyperparameter(CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="hinge"))
        penalty = cs.add_hyperparameter(CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default="l2"))
        # NOTE(review): 10e-7 equals 1e-6; confirm the lower bound is
        # intended.
        alpha = cs.add_hyperparameter(UniformFloatHyperparameter(
            "alpha", 10e-7, 1e-1, log=True, default=0.0001))
        l1_ratio = cs.add_hyperparameter(UniformFloatHyperparameter(
            "l1_ratio", 0, 1, default=0.15))
        fit_intercept = cs.add_hyperparameter(UnParametrizedHyperparameter(
            "fit_intercept", "True"))
        n_iter = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "n_iter", 5, 1000, default=20))
        epsilon = cs.add_hyperparameter(UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default=1e-4, log=True))
        learning_rate = cs.add_hyperparameter(CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal"))
        eta0 = cs.add_hyperparameter(UniformFloatHyperparameter(
            "eta0", 10**-7, 0.1, default=0.01))
        power_t = cs.add_hyperparameter(UniformFloatHyperparameter(
            "power_t", 1e-5, 1, default=0.25))
        average = cs.add_hyperparameter(CategoricalHyperparameter(
            "average", ["False", "True"], default="False"))

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        # Therefore no condition is attached to eta0.
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_condition(elasticnet)
        cs.add_condition(epsilon_condition)
        cs.add_condition(power_t_condition)

        return cs

    def __str__(self):
        return "ParamSklearn StochasticGradientClassifier"
class SGD(AutoSklearnClassificationAlgorithm):
    """SGD classifier component fitted in a single ``fit`` call."""

    def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
                 learning_rate, class_weight, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, random_state=None):
        """Store the hyperparameters; no estimator is created yet."""
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.class_weight = class_weight
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, Y):
        """Convert the stored hyperparameters, build the model and fit it.

        Args:
            X: Training features.
            Y: Training targets.

        Returns:
            ``self``.
        """
        # TODO: maybe scale training data that its norm becomes 1?
        # http://scikit-learn.org/stable/modules/sgd.html#id1
        self.alpha = float(self.alpha)
        # bool('False') is True, so the string form has to be handled
        # explicitly; every other value keeps its plain truthiness.
        if self.fit_intercept == 'False':
            self.fit_intercept = False
        else:
            self.fit_intercept = bool(self.fit_intercept)
        self.n_iter = int(self.n_iter)
        if self.class_weight == "None":
            self.class_weight = None
        self.l1_ratio = float(self.l1_ratio)
        self.epsilon = float(self.epsilon)
        self.eta0 = float(self.eta0)
        self.power_t = float(self.power_t)

        self.estimator = SGDClassifier(loss=self.loss,
                                       penalty=self.penalty,
                                       alpha=self.alpha,
                                       fit_intercept=self.fit_intercept,
                                       n_iter=self.n_iter,
                                       learning_rate=self.learning_rate,
                                       class_weight=self.class_weight,
                                       l1_ratio=self.l1_ratio,
                                       epsilon=self.epsilon,
                                       eta0=self.eta0,
                                       power_t=self.power_t,
                                       shuffle=True,
                                       random_state=self.random_state)
        self.estimator.fit(X, Y)
        return self

    def predict(self, X):
        """Predict labels for ``X``.

        Raises:
            NotImplementedError: If the model has not been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return class probabilities for ``X``.

        Losses ``log`` and ``modified_huber`` use sklearn's
        ``predict_proba``; for all other losses the decision-function values
        are passed through ``softmax``.

        Raises:
            NotImplementedError: If the model has not been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static capability description of this component.

        ``dataset_properties`` is accepted (and ignored) so the signature
        matches the other component classes in this file.
        """
        return {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_missing_values': False,
            'handles_nominal_values': False,
            'handles_numerical_features': True,
            'prefers_data_scaled': True,
            'prefers_data_normalized': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'handles_sparse': True,
            # TODO find out what is best used here!
            'preferred_dtype': None
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Build and return the ``ConfigurationSpace`` of this component."""
        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="hinge")
        penalty = CategoricalHyperparameter("penalty",
                                            ["l1", "l2", "elasticnet"],
                                            default="l2")
        alpha = UniformFloatHyperparameter("alpha", 10**-7, 10**-1, log=True,
                                           default=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio", 0, 1, default=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter", 5, 1000, default=20)
        epsilon = UniformFloatHyperparameter("epsilon", 1e-5, 1e-1,
                                             default=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal")
        eta0 = UniformFloatHyperparameter("eta0", 10**-7, 0.1, default=0.01)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default=0.5)
        # This does not allow for other resampling methods!
        class_weight = CategoricalHyperparameter("class_weight",
                                                 ["None", "auto"],
                                                 default="None")
        cs = ConfigurationSpace()
        cs.add_hyperparameter(loss)
        cs.add_hyperparameter(penalty)
        cs.add_hyperparameter(alpha)
        cs.add_hyperparameter(l1_ratio)
        cs.add_hyperparameter(fit_intercept)
        cs.add_hyperparameter(n_iter)
        cs.add_hyperparameter(epsilon)
        cs.add_hyperparameter(learning_rate)
        cs.add_hyperparameter(eta0)
        cs.add_hyperparameter(power_t)
        cs.add_hyperparameter(class_weight)

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        # Therefore no condition is attached to eta0.
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_condition(elasticnet)
        cs.add_condition(epsilon_condition)
        cs.add_condition(power_t_condition)

        return cs

    def __str__(self):
        return "AutoSklearn StochasticGradientClassifier"
class SGD:
    """Incrementally trained wrapper around scikit-learn's ``SGDClassifier``.

    The hyperparameters are stored as given at construction time and are only
    coerced to their numeric/boolean types when the underlying estimator is
    first built in :meth:`iterative_fit`.  Training proceeds in rounds: each
    round adds more epochs to the same warm-started estimator until it either
    converges early or reaches the epoch budget.
    """

    # Upper bound on the estimator's ``max_iter``; reaching it ends training.
    _MAX_ITER_CAP = 512

    # Losses for which the estimator's own ``predict_proba`` is used.
    # "log_loss" is the newer scikit-learn spelling of "log".
    _PROBABILISTIC_LOSSES = ("log", "log_loss", "modified_huber")

    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1, eta0=0.01,
                 power_t=0.5, average=False, random_state=None):
        """Store the hyperparameters; no estimator is created yet."""
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        """Train from scratch until the configuration is fully fitted.

        The first round discards any previous estimator (``refit=True``) and
        runs 2 iterations; subsequent rounds request ``int(2**k / 2)``
        iterations for k = 2, 3, ... (i.e. 2, 4, 8, ...).

        Returns:
            self
        """
        self.iterative_fit(X, y, n_iter=2, refit=True,
                           sample_weight=sample_weight)
        iteration = 2
        while not self.configuration_fully_fitted():
            n_iter = int(2 ** iteration / 2)
            self.iterative_fit(X, y, n_iter=n_iter,
                               sample_weight=sample_weight)
            iteration += 1
        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        """Run one training round of (at least two) iterations.

        Args:
            X: Training features, forwarded to the estimator.
            y: Training targets, forwarded to the estimator.
            n_iter: Number of iterations requested for this round; values
                below 2 are raised to 2 (see the comment in the body).
            refit: When true, drop the current estimator and start over.
            sample_weight: Optional per-sample weights, forwarded as-is.

        Returns:
            self
        """
        # Need to fit at least two iterations, otherwise early stopping will
        # not work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends
        # fewer iterations than max_iter. If max_iter == 1, it has to spend at
        # least one iteration and will always spend at least one iteration,
        # so we cannot know about convergence.
        n_iter = max(n_iter, 2)

        if refit:
            self.estimator = None

        if self.estimator is None:
            # Public import path; the ``stochastic_gradient`` submodule is
            # private and no longer importable in current scikit-learn.
            # Imported lazily because only this branch needs the class.
            from sklearn.linear_model import SGDClassifier

            self.fully_fit_ = False

            # Coerce the stored hyperparameters to the types the estimator
            # expects; a value of None falls back to the signature default.
            self.alpha = float(self.alpha)
            self.l1_ratio = (float(self.l1_ratio)
                             if self.l1_ratio is not None else 0.15)
            self.epsilon = (float(self.epsilon)
                            if self.epsilon is not None else 0.1)
            self.eta0 = float(self.eta0)
            self.power_t = (float(self.power_t)
                            if self.power_t is not None else 0.5)
            # check_for_bool is defined elsewhere in the project; presumably
            # it normalises string/boolean flags to bool -- TODO confirm.
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            # Extend the epoch budget of the existing estimator, bounded by
            # the cap, then continue training from the current coefficients.
            self.estimator.max_iter = min(self.estimator.max_iter + n_iter,
                                          self._MAX_ITER_CAP)
            # NOTE: _validate_params/_partial_fit are private scikit-learn
            # APIs; they are used so that more than one epoch can be run per
            # call while keeping the fitted state.
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None)

        # Done when the budget is exhausted, or when the estimator used fewer
        # iterations than it was allowed (it stopped early, i.e. converged).
        if (self.estimator.max_iter >= self._MAX_ITER_CAP
                or n_iter > self.estimator.n_iter_):
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        """Return whether training has finished for this configuration.

        False until an estimator exists and ``fully_fit_`` has been set.
        """
        if self.estimator is None:
            return False
        return getattr(self, 'fully_fit_', False)

    def predict(self, X):
        """Predict with the fitted estimator.

        Raises:
            NotImplementedError: If no estimator has been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return class-probability estimates for ``X``.

        For the losses listed in ``_PROBABILISTIC_LOSSES`` the estimator's own
        ``predict_proba`` is used; for any other loss the decision-function
        output is passed through ``softmax`` (defined elsewhere in the
        project).

        Raises:
            NotImplementedError: If no estimator has been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in self._PROBABILISTIC_LOSSES:
            return self.estimator.predict_proba(X)

        df = self.estimator.decision_function(X)
        return softmax(df)