def _default_params(cls):
    return {
        'regressors': [
            pp.StandardScaler() | lm.LinearRegression(intercept_lr=.1),
            pp.StandardScaler() | lm.PARegressor(),
        ]
    }
def test_online_batch_consistent():

    # Batch

    batch = (
        preprocessing.StandardScaler() |
        multiclass.OneVsRestClassifier(
            linear_model.LogisticRegression()
        )
    )

    dataset = datasets.ImageSegments()

    batch_metric = metrics.MacroF1()

    for i, x in enumerate(pd.read_csv(dataset.path, chunksize=1)):
        y = x.pop('category')
        y_pred = batch.predict_many(x)
        batch.fit_many(x, y)

        for yt, yp in zip(y, y_pred):
            if yp is not None:
                batch_metric.update(yt, yp)

        if i == 30:
            break

    # Online

    online = (
        preprocessing.StandardScaler() |
        multiclass.OneVsRestClassifier(
            linear_model.LogisticRegression()
        )
    )

    online_metric = metrics.MacroF1()

    X = pd.read_csv(dataset.path)
    Y = X.pop('category')

    for i, (x, y) in enumerate(stream.iter_pandas(X, Y)):
        y_pred = online.predict_one(x)
        online.fit_one(x, y)

        if y_pred is not None:
            online_metric.update(y, y_pred)

        if i == 30:
            break

    assert online_metric.get() == batch_metric.get()
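# A minimal sketch of the equivalence the test above exercises: a mini-batch of
# size 1 passed through fit_many/predict_many should behave like the same
# observation passed as a dict through fit_one/predict_one. The feature names
# and values here are made up for illustration.
import pandas as pd
from creme import linear_model, multiclass, preprocessing

model = (
    preprocessing.StandardScaler() |
    multiclass.OneVsRestClassifier(linear_model.LogisticRegression())
)

x = {'x0': 1.0, 'x1': -2.0}
xb = pd.DataFrame([x])  # the same observation, as a mini-batch of size 1

model.predict_many(xb)                 # mirrors model.predict_one(x)
model.fit_many(xb, pd.Series(['a']))   # mirrors model.fit_one(x, 'a')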
def test_phishing(client, app):

    r = client.post('/api/init', json={'flavor': 'binary'})

    model = preprocessing.StandardScaler() | linear_model.LogisticRegression()
    client.post('/api/model', data=pickle.dumps(model))

    for i, (x, y) in enumerate(datasets.Phishing().take(30)):

        # Predict/learn via chantilly
        r = client.post('/api/predict', data=json.dumps({
            'id': i,
            'features': x
        }), content_type='application/json')
        client.post('/api/learn', data=json.dumps({
            'id': i,
            'ground_truth': y
        }), content_type='application/json')

        # Predict/learn directly via creme
        y_pred = model.predict_proba_one(x)
        model.fit_one(x, y)

        # Compare the predictions from both sides
        assert math.isclose(y_pred[True], r.json['prediction']['true'])
def test_phishing_without_id(client, app):

    r = client.post('/api/init', json={'flavor': 'binary'})

    model = preprocessing.StandardScaler() | linear_model.LogisticRegression()
    client.post('/api/model', data=pickle.dumps(model))

    for x, y in datasets.Phishing().take(30):

        # Predict/learn via chantilly
        r = client.post('/api/predict', data=json.dumps({'features': x}),
                        content_type='application/json')
        client.post('/api/learn', data=json.dumps({
            'features': x,
            'ground_truth': y
        }), content_type='application/json')

        # Predict/learn directly via creme
        y_pred = model.predict_proba_one(x)
        # Because no ID is provided, chantilly will ask the model to make a prediction a second
        # time in order to update the metric
        model.predict_proba_one(x)
        model.fit_one(x, y)

        # Compare the predictions from both sides
        assert math.isclose(y_pred[True], r.json['prediction']['true'])
def test_pipeline_add_at_start():

    def a(x):
        pass

    pipeline = preprocessing.StandardScaler() | linear_model.LinearRegression()
    pipeline = a | pipeline
    assert str(pipeline) == 'a | StandardScaler | LinearRegression'
def build_oracle(self) -> compose.Pipeline:
    model = compose.Pipeline(
        ('scale', preprocessing.StandardScaler()),
        ('learn', multiclass.OneVsRestClassifier(
            binary_classifier=linear_model.LogisticRegression()))
    )
    return model
def test_standard_scaler_one_many_consistent():
    """Checks that using fit_one or fit_many produces the same result."""

    X = pd.read_csv(datasets.TrumpApproval().path)

    one = preprocessing.StandardScaler()
    for x, _ in stream.iter_pandas(X):
        one.fit_one(x)

    many = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        many.fit_many(xb)

    for i in X:
        assert math.isclose(one.counts[i], many.counts[i])
        assert math.isclose(one.means[i], many.means[i])
        assert math.isclose(one.vars[i], many.vars[i])
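# The consistency above hinges on the scaler keeping exact running counts, means,
# and variances. A self-contained sketch of the underlying arithmetic (not creme's
# actual implementation): a Welford-style update for single values, plus the
# Chan et al. formula for merging two partial summaries, which is what makes
# processing ten chunks equivalent to processing rows one by one.

def update(count, mean, m2, x):
    # One-at-a-time update of the count, mean and sum of squared differences.
    count += 1
    delta = x - mean
    mean += delta / count
    m2 += delta * (x - mean)
    return count, mean, m2

def merge(count_a, mean_a, m2_a, count_b, mean_b, m2_b):
    # Merge two partial summaries (e.g. two mini-batches) into one.
    count = count_a + count_b
    delta = mean_b - mean_a
    mean = mean_a + delta * count_b / count
    m2 = m2_a + m2_b + delta ** 2 * count_a * count_b / count
    return count, mean, m2

# Sanity check: merging two halves matches a single pass over all the values.
xs = [3.0, 1.0, 4.0, 1.0, 5.0, 9.0]
one = (0, 0.0, 0.0)
for x in xs:
    one = update(*one, x)
a = b = (0, 0.0, 0.0)
for x in xs[:3]:
    a = update(*a, x)
for x in xs[3:]:
    b = update(*b, x)
merged = merge(*a, *b)
assert all(abs(u - v) < 1e-9 for u, v in zip(one, merged))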
def test_standard_scaler_shuffle_columns():
    """Checks that fit_many works identically whether columns are shuffled or not."""

    X = pd.read_csv(datasets.TrumpApproval().path)

    normal = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        normal.fit_many(xb)

    shuffled = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        cols = np.random.permutation(X.columns)
        shuffled.fit_many(xb[cols])

    for i in X:
        assert math.isclose(normal.counts[i], shuffled.counts[i])
        assert math.isclose(normal.means[i], shuffled.means[i])
        assert math.isclose(normal.vars[i], shuffled.vars[i])
def __init__(self):
    self.model = anomaly.HalfSpaceTrees(
        n_trees=10,
        height=3,
        window_size=25,
        seed=42,
        limits={'ptr': [-15, 32]}
    )
    self.scaler = preprocessing.StandardScaler()
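# A hedged usage sketch for the detector configured above, assuming creme's
# anomaly API (fit_one to update the trees, score_one to obtain an anomaly
# score, higher meaning more anomalous). The value is made up; 'ptr' matches
# the feature whose limits are declared in __init__.
detector = anomaly.HalfSpaceTrees(n_trees=10, height=3, window_size=25,
                                  seed=42, limits={'ptr': [-15, 32]})
x = {'ptr': 21.5}
score = detector.score_one(x)
detector = detector.fit_one(x)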
def test_standard_scaler_add_remove_columns():
    """Checks that no exceptions are raised whenever columns are dropped and/or added."""

    X = pd.read_csv(datasets.TrumpApproval().path)

    ss = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        # Pick half of the columns at random
        cols = np.random.choice(X.columns, len(X.columns) // 2, replace=False)
        ss.fit_many(xb[cols])
def test_set_params_pipeline():

    obj = preprocessing.StandardScaler() | linear_model.LinearRegression(l2=42)
    obj.fit_one({'x': 3}, 6)

    params = {'LinearRegression': {'l2': 21}}
    new = obj._set_params(params)
    assert new['LinearRegression'].l2 == 21
    assert obj['LinearRegression'].l2 == 42
    assert new['LinearRegression'].weights == {}
    assert new['LinearRegression'].weights != obj['LinearRegression'].weights
def test_finite_differences(lm, X_y):
    """Checks the gradient of a linear model via finite differences.

    References:
        1. [How to test gradient implementations](https://timvieira.github.io/blog/post/2017/04/21/how-to-test-gradient-implementations/)
        2. [Stochastic Gradient Descent Tricks](https://cilvr.cs.nyu.edu/diglib/lsml/bottou-sgd-tricks-2012.pdf)

    """

    scaler = preprocessing.StandardScaler()
    eps = 1e-6

    for x, y in X_y:

        x = scaler.fit_one(x).transform_one(x)

        # Store the current gradient and weights
        gradient, _ = lm._eval_gradient_one(x, y, 1)
        weights = lm.weights.copy()

        # d is a set of weight perturbations
        for d in iter_perturbations(weights.keys()):

            # Perturb the weights and obtain the loss with the new weights
            lm.weights = {i: weights[i] + eps * di for i, di in d.items()}
            forward = lm.loss(y_true=y, y_pred=lm._raw_dot_one(x))
            lm.weights = {i: weights[i] - eps * di for i, di in d.items()}
            backward = lm.loss(y_true=y, y_pred=lm._raw_dot_one(x))

            # We expect g and h to be equal
            g = utils.math.dot(d, gradient)
            h = (forward - backward) / (2 * eps)

            # Compare signs
            # TODO: reactivate this check
            # assert np.sign(g) == np.sign(h)

            # Check absolute difference
            # TODO: decrease the tolerance
            assert abs(g - h) < 1e-5

        # Reset the weights to their original values in order not to influence
        # the training loop, even though it doesn't really matter.
        lm.weights = weights
        lm.fit_one(x, y)
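# The central-difference identity the test relies on, in a self-contained toy
# sketch: for small eps, (f(w + eps*d) - f(w - eps*d)) / (2*eps) approximates
# the directional derivative <d, grad f(w)>. Here f is a hand-rolled squared
# loss on one example, so both sides can be compared directly.
def loss(w, x, y):
    y_pred = sum(w[i] * x[i] for i in x)
    return (y - y_pred) ** 2

def gradient(w, x, y):
    y_pred = sum(w[i] * x[i] for i in x)
    return {i: -2 * (y - y_pred) * x[i] for i in x}

w = {'a': .5, 'b': -.3}
x = {'a': 1., 'b': 2.}
y = 1.
eps = 1e-6
g = gradient(w, x, y)
for i in w:
    d = {j: float(i == j) for j in w}  # perturb one weight at a time
    w_plus = {j: w[j] + eps * d[j] for j in w}
    w_minus = {j: w[j] - eps * d[j] for j in w}
    h = (loss(w_plus, x, y) - loss(w_minus, x, y)) / (2 * eps)
    assert abs(g[i] - h) < 1e-6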
def test_log_reg_sklearn_coherence():
    """Checks that the sklearn and creme implementations produce the same results."""

    ss = preprocessing.StandardScaler()
    cr = lm.LogisticRegression(optimizer=optim.SGD(.01))
    sk = sklm.SGDClassifier(learning_rate='constant', eta0=.01, alpha=.0, loss='log')

    for x, y in datasets.Bananas():
        x = ss.fit_one(x).transform_one(x)
        cr.fit_one(x, y)
        sk.partial_fit([list(x.values())], [y], classes=[False, True])

    for i, w in enumerate(cr.weights.values()):
        assert math.isclose(w, sk.coef_[0][i])

    assert math.isclose(cr.intercept, sk.intercept_[0])
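# Why the weights coincide: with a constant learning rate and no regularization,
# both libraries apply the same plain SGD step for the logistic loss,
# w <- w + lr * (y - sigmoid(w.x)) * x. A toy sketch of one such step with
# made-up values (intercept handling glossed over):
import math

def sigmoid(z):
    return 1 / (1 + math.exp(-z))

lr = .01
w = {'x0': 0., 'x1': 0.}
x = {'x0': 1., 'x1': -2.}
y = 1  # True

p = sigmoid(sum(w[i] * x[i] for i in x))
w = {i: w[i] + lr * (y - p) * x[i] for i in w}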
def test_perceptron_sklearn_coherence():
    """Checks that the sklearn and creme implementations produce the same results."""

    ss = preprocessing.StandardScaler()
    cr = lm.Perceptron()
    sk = sklm.Perceptron()

    for x, y in datasets.Bananas():
        x = ss.fit_one(x).transform_one(x)
        cr.fit_one(x, y)
        sk.partial_fit([list(x.values())], [y], classes=[False, True])

    for i, w in enumerate(cr.weights.values()):
        assert math.isclose(w, sk.coef_[0][i])

    assert math.isclose(cr.intercept, sk.intercept_[0])
def __init__(self, p: int, d: int, q: int, m: int = 1, sp: int = 0, sd: int = 0,
             sq: int = 0, regressor: creme.base.Regressor = None):
    self.p = p
    self.d = d
    self.q = q
    self.m = m
    self.sp = sp
    self.sd = sd
    self.sq = sq
    self.regressor = (
        regressor if regressor is not None else
        preprocessing.StandardScaler() | linear_model.LinearRegression()
    )
    self.differencer = Differencer(d=d, m=1) + Differencer(d=sd, m=1)
    self.y_trues = collections.deque(maxlen=max(p, m * sp))
    self.errors = collections.deque(maxlen=max(p, m * sq))
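# A hedged construction example for the class above, whose signature matches
# creme's time_series.SNARIMAX: a seasonal model with a monthly period, falling
# back to the default scaled linear regression. The orders here are purely
# illustrative, not a recommendation.
model = time_series.SNARIMAX(p=1, d=1, q=1, m=12, sp=1, sd=1, sq=1)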
def main():

    import datetime as dt
    from creme import compose
    from creme import datasets
    from creme import feature_extraction
    from creme import linear_model
    from creme import metrics
    from creme import preprocessing
    from creme import stats
    from creme import stream

    X_y = datasets.Bikes()
    X_y = stream.simulate_qa(X_y, moment='moment', delay=dt.timedelta(minutes=30))

    def add_time_features(x):
        return {**x, 'hour': x['moment'].hour, 'day': x['moment'].weekday()}

    model = add_time_features
    model |= (
        compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind') +
        feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean()) +
        feature_extraction.TargetAgg(by='station', how=stats.EWMean())
    )
    model |= preprocessing.StandardScaler()
    model |= linear_model.LinearRegression()

    metric = metrics.MAE()

    questions = {}

    for i, x, y in X_y:

        # Question
        is_question = y is None
        if is_question:
            y_pred = model.predict_one(x)
            questions[i] = y_pred

        # Answer
        else:
            metric.update(y, questions[i])
            model = model.fit_one(x, y)

        if i >= 30000 and i % 30000 == 0:
            print(i, metric)
def main():

    def add_hour(x):
        x['hour'] = x['moment'].hour
        return x

    benchmark.benchmark(
        get_X_y=datasets.fetch_bikes,
        n=182470,
        get_pp=lambda: (
            compose.Whitelister('clouds', 'humidity', 'pressure', 'temperature', 'wind') +
            (add_hour | feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean())) |
            preprocessing.StandardScaler()
        ),
        models=[
            # ('creme', 'LinReg', linear_model.LinearRegression(
            #     optimizer=optim.VanillaSGD(0.01),
            #     l2=0.
            # )),
            ('creme', 'GLM', linear_model.GLMRegressor(
                optimizer=optim.VanillaSGD(0.01),
                l2=0.
            )),
            ('creme', 'GLM', meta.Detrender(linear_model.GLMRegressor(
                optimizer=optim.VanillaSGD(0.01),
                l2=0.
            ))),
            # ('sklearn', 'SGD', compat.CremeRegressorWrapper(
            #     sklearn_estimator=sk_linear_model.SGDRegressor(
            #         learning_rate='constant',
            #         eta0=0.01,
            #         fit_intercept=True,
            #         penalty='none'
            #     ),
            # )),
            # ('sklearn', 'SGD no intercept', compat.CremeRegressorWrapper(
            #     sklearn_estimator=sk_linear_model.SGDRegressor(
            #         learning_rate='constant',
            #         eta0=0.01,
            #         fit_intercept=False,
            #         penalty='none'
            #     ),
            # )),
        ],
        get_metric=metrics.MSE
    )
def test_lin_reg_sklearn_coherence():
    """Checks that the sklearn and creme implementations produce the same results."""

    class SquaredLoss:
        """sklearn removes the leading 2 from the gradient of the squared loss."""

        def gradient(self, y_true, y_pred):
            return y_pred - y_true

    ss = preprocessing.StandardScaler()
    cr = lm.LinearRegression(optimizer=optim.SGD(.01), loss=SquaredLoss())
    sk = sklm.SGDRegressor(learning_rate='constant', eta0=.01, alpha=.0)

    for x, y in datasets.TrumpApproval():
        x = ss.fit_one(x).transform_one(x)
        cr.fit_one(x, y)
        sk.partial_fit([list(x.values())], [y])

    for i, w in enumerate(cr.weights.values()):
        assert math.isclose(w, sk.coef_[i])

    assert math.isclose(cr.intercept, sk.intercept_[0])
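# The two conventions the SquaredLoss shim above reconciles: differentiating
# (y - p)^2 gives a gradient of 2 * (p - y), whereas sklearn effectively works
# with (y - p)^2 / 2 so that the gradient is just p - y. A toy check of the
# relationship, with made-up values:
y_true, y_pred = 3., 2.5
gradient_with_leading_2 = 2 * (y_pred - y_true)   # d/dp of (y - p)^2
gradient_without = y_pred - y_true                # the shim's convention
assert gradient_with_leading_2 == 2 * gradient_without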
def __init__(self, name, identifier, availability_topic, number_of_sensors,
             number_of_metrics, logger, model, learning_time=dt.timedelta(days=7)):
    self._name = name
    self._identifier = identifier
    self._availability_topic = availability_topic
    self._date_of_birth = dt.datetime.now()
    self._learning_time = learning_time
    self._sensors = {}
    self._metrics = {}
    self._switch = None
    self._blinds = None
    self._number_of_sensors = number_of_sensors
    self._number_of_metrics = number_of_metrics
    self._logger = logger
    # self._last_true = 100
    self._last_example = np.array([])
    self._last_pred = None
    self._able_to_predict = True
    if model == 1:
        self.log_message('info', 'Using AdaptiveRandomForestRegressor model')
        self._model = AdaptiveRandomForestRegressor(
            random_state=43,
            n_estimators=100,
            grace_period=50,
            max_features=11,
            leaf_prediction='mean',
            split_confidence=0.09,
            lambda_value=10
        )
    elif model == 2:
        self.log_message('info', 'Using PARegressor model')
        self._model = (
            preprocessing.StandardScaler() |
            compose.Discard('lights') |
            linear_model.PARegressor(C=0.05, mode=1, eps=0.1)
        )
def get_all_estimators():

    ignored = (
        Creme2SKLBase,
        SKL2CremeBase,
        compat.PyTorch2CremeRegressor,
        compose.FuncTransformer,
        compose.Pipeline,
        ensemble.StackingBinaryClassifier,
        feature_extraction.Agg,
        feature_extraction.TargetAgg,
        feature_extraction.Differ,
        feature_selection.PoissonInclusion,
        imblearn.RandomOverSampler,
        imblearn.RandomUnderSampler,
        imblearn.RandomSampler,
        impute.PreviousImputer,
        impute.StatImputer,
        linear_model.FFMClassifier,
        linear_model.FFMRegressor,
        linear_model.FMClassifier,
        linear_model.FMRegressor,
        linear_model.HOFMClassifier,
        linear_model.HOFMRegressor,
        linear_model.SoftmaxRegression,
        meta.PredClipper,
        meta.TransformedTargetRegressor,
        multioutput.ClassifierChain,
        multioutput.RegressorChain,
        preprocessing.OneHotEncoder,
        reco.Baseline,
        reco.BiasedMF,
        reco.FunkMF,
        reco.RandomNormal,
        time_series.Detrender,
        time_series.GroupDetrender,
        time_series.SNARIMAX
    )

    def is_estimator(obj):
        return inspect.isclass(obj) and issubclass(obj, base.Estimator)

    for submodule in importlib.import_module('creme').__all__:

        if submodule == 'base':
            continue

        for _, obj in inspect.getmembers(importlib.import_module(f'creme.{submodule}'), is_estimator):

            if issubclass(obj, ignored):
                continue
            elif issubclass(obj, dummy.StatisticRegressor):
                inst = obj(statistic=stats.Mean())
            elif issubclass(obj, meta.BoxCoxRegressor):
                inst = obj(regressor=linear_model.LinearRegression())
            elif issubclass(obj, tree.RandomForestClassifier):
                inst = obj()
            elif issubclass(obj, ensemble.BaggingClassifier):
                inst = obj(linear_model.LogisticRegression())
            elif issubclass(obj, ensemble.BaggingRegressor):
                inst = obj(linear_model.LinearRegression())
            elif issubclass(obj, ensemble.AdaBoostClassifier):
                inst = obj(linear_model.LogisticRegression())
            elif issubclass(obj, ensemble.HedgeRegressor):
                inst = obj([
                    preprocessing.StandardScaler() | linear_model.LinearRegression(intercept_lr=.1),
                    preprocessing.StandardScaler() | linear_model.PARegressor(),
                ])
            elif issubclass(obj, feature_selection.SelectKBest):
                inst = obj(similarity=stats.PearsonCorrelation())
            elif issubclass(obj, linear_model.LinearRegression):
                inst = preprocessing.StandardScaler() | obj(intercept_lr=.1)
            elif issubclass(obj, linear_model.PARegressor):
                inst = preprocessing.StandardScaler() | obj()
            elif issubclass(obj, multiclass.OneVsRestClassifier):
                inst = obj(binary_classifier=linear_model.LogisticRegression())
            else:
                inst = obj()

            yield inst
            elif issubclass(obj, multiclass.OneVsRestClassifier):
                inst = obj(binary_classifier=linear_model.LogisticRegression())
            else:
                inst = obj()

            yield inst


@pytest.mark.parametrize('estimator, check', [
    pytest.param(
        copy.deepcopy(estimator),
        check,
        id=f'{estimator}:{check.__name__}'
    )
    for estimator in list(get_all_estimators()) + [
        feature_extraction.TFIDF(),
        linear_model.LogisticRegression(),
        preprocessing.StandardScaler() | linear_model.LinearRegression(),
        preprocessing.StandardScaler() | linear_model.PAClassifier(),
        preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(linear_model.LogisticRegression()),
        preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(linear_model.PAClassifier()),
        naive_bayes.GaussianNB(),
        preprocessing.StandardScaler(),
        cluster.KMeans(n_clusters=5, seed=42),
        preprocessing.MinMaxScaler(),
        preprocessing.MinMaxScaler() + preprocessing.StandardScaler(),
        preprocessing.PolynomialExtender(),
        feature_selection.VarianceThreshold(),
        feature_selection.SelectKBest(similarity=stats.PearsonCorrelation())
    ]
    for check in utils.estimator_checks.yield_checks(estimator)
])
def lin_reg(client):
    model = preprocessing.StandardScaler() | linear_model.LinearRegression()
    client.post('/api/model/lin-reg', data=pickle.dumps(model))
def test_pipeline_duplicate_step():
    with pytest.raises(KeyError):
        preprocessing.StandardScaler() | preprocessing.StandardScaler()
def test_union_duplicate_step():
    with pytest.raises(KeyError):
        preprocessing.StandardScaler() + preprocessing.StandardScaler()
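# Duplicate names are what trigger the KeyError in the two tests above: unnamed
# steps default to their class name. A sketch of the workaround, assuming the
# (name, estimator) tuple syntax accepted by compose.Pipeline (as used in
# build_oracle earlier in this section); the step names are made up:
pipeline = compose.Pipeline(
    ('scale_1', preprocessing.StandardScaler()),
    ('scale_2', preprocessing.StandardScaler()),
)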
from creme import preprocessing
from creme import tree
from creme import utils


@pytest.mark.parametrize('model, param_grid, count', [
    (
        linear_model.LinearRegression(),
        {
            'optimizer': [
                (optim.SGD, {'lr': [1, 2]}),
                (optim.Adam, {'beta_1': [.1, .01, .001], 'lr': [.1, .01, .001, .0001]})
            ]
        },
        2 + 3 * 4
    ),
    (
        preprocessing.StandardScaler() | linear_model.LinearRegression(),
        {
            'LinearRegression': {
                'optimizer': [
                    (optim.SGD, {'lr': [1, 2]}),
                    (optim.Adam, {'beta_1': [.1, .01, .001], 'lr': [.1, .01, .001, .0001]})
                ]
            }
        },
        2 + 3 * 4
    ),
    (
        compose.Pipeline(('Scaler', None), linear_model.LinearRegression()),
        {
            'Scaler': [
                preprocessing.MinMaxScaler(),
                preprocessing.MaxAbsScaler(),
                preprocessing.StandardScaler()
def get_all_estimators():

    ignored = (
        CremeBaseWrapper,
        SKLBaseWrapper,
        base.Wrapper,
        compose.FuncTransformer,
        ensemble.StackingBinaryClassifier,
        feature_extraction.Agg,
        feature_extraction.TargetAgg,
        feature_extraction.Differ,
        linear_model.FMRegressor,
        linear_model.SoftmaxRegression,
        multioutput.ClassifierChain,
        multioutput.RegressorChain,
        naive_bayes.BernoulliNB,
        naive_bayes.ComplementNB,
        preprocessing.OneHotEncoder,
        tree.DecisionTreeClassifier
    )

    def is_estimator(obj):
        return inspect.isclass(obj) and issubclass(obj, base.Estimator)

    for submodule in importlib.import_module('creme').__all__:

        if submodule == 'base':
            continue

        for name, obj in inspect.getmembers(importlib.import_module(f'creme.{submodule}'), is_estimator):

            if issubclass(obj, ignored):
                continue
            if issubclass(obj, dummy.StatisticRegressor):
                inst = obj(statistic=stats.Mean())
            elif issubclass(obj, ensemble.BaggingClassifier):
                inst = obj(linear_model.LogisticRegression())
            elif issubclass(obj, ensemble.BaggingRegressor):
                inst = obj(linear_model.LinearRegression())
            elif issubclass(obj, ensemble.HedgeRegressor):
                inst = obj([
                    preprocessing.StandardScaler() | linear_model.LinearRegression(intercept_lr=0.1),
                    preprocessing.StandardScaler() | linear_model.PARegressor(),
                ])
            elif issubclass(obj, feature_selection.RandomDiscarder):
                inst = obj(n_to_keep=5)
            elif issubclass(obj, feature_selection.SelectKBest):
                inst = obj(similarity=stats.PearsonCorrelation())
            elif issubclass(obj, linear_model.LinearRegression):
                inst = preprocessing.StandardScaler() | obj(intercept_lr=0.1)
            elif issubclass(obj, linear_model.PARegressor):
                inst = preprocessing.StandardScaler() | obj()
            elif issubclass(obj, multiclass.OneVsRestClassifier):
                inst = obj(binary_classifier=linear_model.LogisticRegression())
            else:
                inst = obj()

            yield inst
            elif issubclass(obj, multiclass.OneVsRestClassifier):
                inst = obj(binary_classifier=linear_model.LogisticRegression())
            else:
                inst = obj()

            yield inst


@pytest.mark.parametrize('estimator', [
    pytest.param(copy.deepcopy(estimator), id=str(estimator))
    for estimator in list(get_all_estimators()) + [
        feature_extraction.TFIDF(),
        linear_model.LogisticRegression(),
        preprocessing.StandardScaler() | linear_model.LinearRegression(),
        preprocessing.StandardScaler() | linear_model.PAClassifier(),
        preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(linear_model.LogisticRegression()),
        preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(linear_model.PAClassifier()),
        naive_bayes.GaussianNB(),
        preprocessing.StandardScaler(),
        cluster.KMeans(n_clusters=5, seed=42),
        preprocessing.MinMaxScaler(),
        preprocessing.MinMaxScaler() + preprocessing.StandardScaler(),
        preprocessing.PolynomialExtender(),
        feature_selection.VarianceThreshold(),
        feature_selection.SelectKBest(similarity=stats.PearsonCorrelation())
    ]
])
def test_check_estimator(estimator):
    utils.estimator_checks.check_estimator(estimator)
    optimizer=torch_optim(torch_model.parameters()))

inputs = layers.Input(shape=(n_features,))
predictions = layers.Dense(1, kernel_initializer='zeros', bias_initializer='zeros')(inputs)
keras_model = models.Model(inputs=inputs, outputs=predictions)
keras_model.compile(optimizer=keras_optim, loss='mean_squared_error')
keras_lin_reg = KerasRegressor(keras_model)

creme_metric = metrics.MAE()
torch_metric = metrics.MAE()
keras_metric = metrics.MAE()

scaler = preprocessing.StandardScaler()

for x, y in X_y:
    x = scaler.fit_one(x).transform_one(x)

    creme_metric.update(y, creme_lin_reg.predict_one(x))
    creme_lin_reg.fit_one(x, y)

    torch_metric.update(y, torch_lin_reg.predict_one(x))
    torch_lin_reg.fit_one(x, y)

    keras_metric.update(y, keras_lin_reg.predict_one(x))
    keras_lin_reg.fit_one(x, y)

print(name, creme_metric.get(), torch_metric.get(), keras_metric.get())