def _yield_datasets(model: Estimator):
    """Generates datasets for a given model.

    Inspects the model's task (recommender / multi-output regressor /
    regressor / multi-output classifier / classifier) and yields one or more
    datasets whose structure is compatible with that task.

    NOTE(review): `Estimator`, `Recommender`, `_DummyDataset` and `np` are
    defined elsewhere in this file/package — confirm against the full module.
    """
    from sklearn import datasets as sk_datasets

    from river import base, compose, datasets, preprocessing, stream, utils

    # Recommendation models can be regressors or classifiers, but they have requirements as to the
    # structure of the data
    if isinstance(utils.inspect.extract_relevant(model), Recommender):
        if utils.inspect.isregressor(model):
            # Small in-memory user/item ratings sample (user names redacted
            # in SOURCE; literals preserved verbatim).
            yield _DummyDataset(
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
                ({"user": "******", "item": "Harry Potter"}, 5),
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
            )
        # Recommenders are handled entirely above; no generic dataset applies.
        return

    # Multi-output regression
    elif utils.inspect.ismoregressor(model):
        # 1
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # 2
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                # One-hot encode the string features, pass the ints through.
                oh = (
                    compose.SelectType(str) | preprocessing.OneHotEncoder()
                ) + compose.SelectType(int)
                for x, y in datasets.SolarFlare().take(200):
                    yield oh.transform_one(x), y

        yield SolarFlare()

    # Regression
    elif utils.inspect.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # Multi-output classification
    if utils.inspect.ismoclassifier(model):
        yield datasets.Music().take(200)

    # Classification
    elif utils.inspect.isclassifier(model):
        yield datasets.Phishing().take(200)
        # Same dataset with numpy bools as targets, to check the model copes
        # with non-builtin boolean types.
        yield ((x, np.bool_(y)) for x, y in datasets.Phishing().take(200))

        # Multi-class classification
        if model._multiclass and base.tags.POSITIVE_INPUT not in model._tags:
            yield datasets.ImageSegments().take(200)
def test_clone_idempotent():
    """A clone of a trained model must retrace the original's prediction history.

    `clone` returns an unfitted copy with the same hyperparameters, so training
    it on the same stream must reproduce the same predictions step by step.
    """
    original = preprocessing.StandardScaler() | linear_model.LogisticRegression(
        optimizer=optim.Adam(), l2=0.1
    )

    # Record the prediction made before each learning step.
    history = []
    for x, y in datasets.Phishing():
        history.append(original.predict_proba_one(x))
        original.learn_one(x, y)

    # The clone, trained on the same stream, must reproduce the history.
    replica = original.clone()
    for expected, (x, y) in zip(history, datasets.Phishing()):
        assert replica.predict_proba_one(x) == expected
        replica.learn_one(x, y)
def yield_datasets(model):
    """Yield datasets whose structure matches the given model's task.

    Covers multi-output regression, regression, multi-output classification,
    classification, and multi-class classification.
    """
    from sklearn import datasets as sk_datasets

    from river import base, compose, datasets, preprocessing, stream, utils

    # Multi-output regression
    if utils.inspect.ismoregressor(model):
        # 1
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # 2
        class OneHotSolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                # Encode string features one-hot, keep integer features as-is.
                encode = (
                    compose.SelectType(str) | preprocessing.OneHotEncoder()
                ) + compose.SelectType(int)
                for x, y in datasets.SolarFlare().take(200):
                    yield encode.transform_one(x), y

        yield OneHotSolarFlare()

    # Regression
    elif utils.inspect.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # Multi-output classification
    if utils.inspect.ismoclassifier(model):
        yield datasets.Music().take(200)

    # Classification
    elif utils.inspect.isclassifier(model):
        yield datasets.Phishing().take(200)
        # Same dataset, but with numpy booleans as targets.
        yield ((x, np.bool_(y)) for x, y in datasets.Phishing().take(200))

        # Multi-class classification
        if model._multiclass and base.tags.POSITIVE_INPUT not in model._tags:
            yield datasets.ImageSegments().take(200)
def test_sklearn_coherence(river_params, sklearn_params):
    """Checks that the sklearn and river implementations produce the same results."""
    river_model = anomaly.OneClassSVM(**river_params)
    sklearn_model = sklm.SGDOneClassSVM(**sklearn_params)

    # Feed both models the same 100 samples, one at a time.
    for x, _ in datasets.Phishing().take(100):
        river_model.learn_one(x)
        sklearn_model.partial_fit([list(x.values())])

    # Learned weights must agree feature by feature (up to float tolerance).
    for idx, river_weight in enumerate(river_model.weights.values()):
        assert math.isclose(river_weight, sklearn_model.coef_[idx])
def load_binary_clf_tracks() -> typing.List[Track]:
    """Return binary classification tracks.

    Each track pairs a binary classification dataset with an
    Accuracy + F1 composite metric.
    """
    tracks = []
    for track_name, dataset in (
        ("Phishing", datasets.Phishing()),
        ("Bananas", datasets.Bananas()),
    ):
        tracks.append(
            Track(
                name=track_name,
                dataset=dataset,
                metric=metrics.Accuracy() + metrics.F1(),
            )
        )
    return tracks
from river import compose, datasets, linear_model, metrics, optim, preprocessing

# Standard-scale the features before feeding them to a logistic regression
# trained with plain SGD.
sgd = optim.SGD(0.1)
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression(sgd),
)

roc_auc = metrics.ROCAUC()
precision = metrics.Precision()

# Progressive validation: predict first, then learn from the true label.
for x, y in datasets.Phishing():
    y_pred = model.predict_proba_one(x)
    model.learn_one(x, y)
    roc_auc.update(y, y_pred)
    precision.update(y, y_pred)

print(roc_auc)
print(precision)
from river import datasets

dataset = datasets.Phishing()

from river import compose
from river import linear_model
from river import metrics
from river import preprocessing

model = compose.Pipeline(
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression(),
)

metric = metrics.Accuracy()

# Progressive validation: predict before learning from each sample.
for x, y in dataset:
    y_pred = model.predict_one(x)
    # FIX: `update` and `learn_one` mutate in place — the other snippets in
    # this file never reassign their return value. Reassigning would bind
    # `metric`/`model` to None on river versions where these return None.
    metric.update(y, y_pred)
    model.learn_one(x, y)

print(metric)
import pytest

from river import datasets, synth, tree


def get_regression_data():
    """Return an iterator over 200 samples of the synthetic Friedman stream (fixed seed)."""
    return iter(synth.Friedman(seed=42).take(200))


# Run the same tree-growth check once per classification splitter.
@pytest.mark.parametrize(
    "dataset, splitter",
    [
        (datasets.Phishing(), tree.splitter.ExhaustiveSplitter()),
        (datasets.Phishing(), tree.splitter.HistogramSplitter()),
        (datasets.Phishing(), tree.splitter.GaussianSplitter()),
    ],
)
def test_class_splitter(dataset, splitter):
    """A Hoeffding tree trained with the given splitter must actually grow (height > 0)."""
    model = tree.HoeffdingTreeClassifier(
        # Small grace period + loose split confidence so splits happen on a small dataset.
        splitter=splitter,
        grace_period=10,
        leaf_prediction="mc",
        split_confidence=0.1,
    )
    for x, y in dataset:
        model.learn_one(x, y)
    assert model.height > 0


# NOTE(review): the following decorator is cut off at the edge of this chunk;
# its parameter list and the decorated test continue past the visible source.
@pytest.mark.parametrize(
    "dataset, splitter",