示例#1
0
def _yield_datasets(model: Estimator):
    """Yield the datasets that suit the kind of model passed in.

    The choice is driven by the `utils.inspect` predicates. A recommender only ever
    receives a small hand-written ratings dataset (and only if it is a regressor);
    every other model receives regression and/or classification datasets depending on
    which predicates it satisfies.

    Parameters
    ----------
    model
        The estimator for which datasets are requested.

    Yields
    ------
    Iterables of `(x, y)` pairs.

    """

    from sklearn import datasets as sk_datasets

    from river import base, compose, datasets, preprocessing, stream, utils

    checks = utils.inspect

    # Recommenders may be regressors or classifiers, but they expect a particular data
    # layout, so they are served separately and never reach the generic datasets below.
    if isinstance(checks.extract_relevant(model), Recommender):
        if checks.isregressor(model):
            ratings = (
                ("Superman", 8),
                ("Terminator", 9),
                ("Star Wars", 8),
                ("Notting Hill", 2),
                ("Harry Potter", 5),
                ("Superman", 8),
                ("Terminator", 9),
                ("Star Wars", 8),
                ("Notting Hill", 2),
            )
            samples = (({"user": "******", "item": title}, score) for title, score in ratings)
            yield _DummyDataset(*samples)
        return

    if checks.ismoregressor(model):
        # Multi-output regression, first dataset: scikit-learn's Linnerud data
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # Multi-output regression, second dataset: a re-iterable wrapper around SolarFlare
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                # String features go through the one-hot encoder, integer features are
                # kept as they are, and both outputs are merged by the union
                encode_strings = compose.SelectType(str) | preprocessing.OneHotEncoder()
                keep_ints = compose.SelectType(int)
                oh = encode_strings + keep_ints
                for x, y in datasets.SolarFlare().take(200):
                    yield oh.transform_one(x), y

        yield SolarFlare()

    elif checks.isregressor(model):
        # Single-output regression
        yield datasets.TrumpApproval().take(200)

    # The classification checks are independent from the regression ones above
    if checks.ismoclassifier(model):
        # Multi-output classification
        yield datasets.Music().take(200)
        return

    if not checks.isclassifier(model):
        return

    # Binary classification: once with the dataset's own targets, and once with every
    # target wrapped in `np.bool_`
    yield datasets.Phishing().take(200)
    phishing = datasets.Phishing().take(200)
    yield ((x, np.bool_(y)) for x, y in phishing)

    # Multi-class classification, skipped for models tagged as requiring positive inputs
    if model._multiclass and base.tags.POSITIVE_INPUT not in model._tags:
        yield datasets.ImageSegments().take(200)
示例#2
0
def test_memory_usage_multilabel():
    """Check that a size-capped label-combination tree stays under its memory budget.

    The tree is trained on the first 500 samples of the Music dataset, after which its
    raw memory usage divided by 2 ** 20 must be below 1.
    """
    # NOTE(review): presumably `_raw_memory_usage` is in bytes and `max_size` in MiB, so
    # this divisor converts to the same unit as `max_size=1` -- confirm against the tree docs
    scale = 2 ** 20

    samples = datasets.Music().take(500)

    clf = tree.LabelCombinationHoeffdingTreeClassifier(
        leaf_prediction="mc",
        splitter=tree.splitter.ExhaustiveSplitter(),
        max_size=1,
        memory_estimate_period=100,
    )

    for features, labels in samples:
        clf.learn_one(features, labels)

    scaled_usage = clf._raw_memory_usage / scale
    assert scaled_usage < 1
示例#3
0
def yield_datasets(model):
    """Yield the datasets that suit the kind of model passed in.

    Regression datasets are selected first (multi-output takes precedence over
    single-output), then classification datasets (multi-output takes precedence over
    single-output). Both selections are made independently, based on the
    `utils.inspect` predicates.

    Parameters
    ----------
    model
        The estimator for which datasets are requested.

    Yields
    ------
    Iterables of `(x, y)` pairs.

    """

    from river import base, compose, datasets, preprocessing, stream, utils
    from sklearn import datasets as sk_datasets

    kind = utils.inspect

    # --- Regression ---
    if kind.ismoregressor(model):

        # First multi-output dataset: scikit-learn's Linnerud data
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # Second multi-output dataset: a re-iterable wrapper around SolarFlare
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                # One-hot encode the string features and pass the integer ones through
                str_branch = compose.SelectType(str) | preprocessing.OneHotEncoder()
                int_branch = compose.SelectType(int)
                oh = str_branch + int_branch
                for x, y in datasets.SolarFlare().take(200):
                    yield oh.transform_one(x), y

        yield SolarFlare()

    elif kind.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # --- Classification ---
    if kind.ismoclassifier(model):
        yield datasets.Music().take(200)
        return

    if not kind.isclassifier(model):
        return

    # Binary targets as provided by the dataset, then the same stream with each target
    # wrapped in `np.bool_`
    yield datasets.Phishing().take(200)
    phishing = datasets.Phishing().take(200)
    yield ((x, np.bool_(y)) for x, y in phishing)

    # Multi-class data is only offered to multi-class models that are not tagged as
    # requiring positive inputs
    if not model._multiclass or base.tags.POSITIVE_INPUT in model._tags:
        return
    yield datasets.ImageSegments().take(200)