예제 #1
0
    def predict(self, X):
        """Predicts the target of every row of a dataset held in memory.

        The rows of `X` are handed one at a time to the `predict_one` method of the
        fitted underlying instance.

        Parameters:
            X: array-like of shape (n_samples, n_features).

        Returns:
            array of shape (n_samples,): Predicted target values for each row of `X`.

        Raises:
            ValueError: If the number of columns of `X` differs from `n_features_in_`.

        """

        # Predicting only makes sense once `fit` has set `instance_`
        utils.validation.check_is_fitted(self, attributes='instance_')

        # Validate the input
        X = utils.check_array(X, **SKLEARN_INPUT_X_PARAMS)

        n_received = X.shape[1]
        if n_received != self.n_features_in_:
            raise ValueError(f'Expected {self.n_features_in_} features, got {n_received}')

        # Predict the rows one by one; the second element of each streamed pair is unused
        predictions = np.asarray([
            self.instance_.predict_one(features)
            for features, _ in stream.iter_array(X)
        ])

        # Without a label encoder the predictions are already expressed as the expected labels
        if not hasattr(self, 'label_encoder_'):
            return predictions

        # Otherwise the predictions are encoded labels which have to be mapped back
        return self.label_encoder_.inverse_transform(predictions.astype(int))
예제 #2
0
    def predict_proba(self, X):
        """Predicts the class probabilities of every row of a dataset held in memory.

        Parameters:
            X: array-like of shape (n_samples, n_features).

        Returns:
            array of shape (n_samples, n_classes): One row per row of `X` and one column
                per entry of `classes_`, in the same order as `classes_`. A class which
                is absent from the underlying prediction is given a value of 0.

        Raises:
            ValueError: If the number of columns of `X` differs from `n_features_in_`.

        """

        # Predicting only makes sense once `fit` has set `instance_`
        utils.validation.check_is_fitted(self, attributes='instance_')

        # Validate the input
        X = utils.check_array(X, **SKLEARN_INPUT_X_PARAMS)

        n_received = X.shape[1]
        if n_received != self.n_features_in_:
            raise ValueError(f'Expected {self.n_features_in_} features, got {n_received}')

        # One row per observation, one column per known class
        probas = np.empty(shape=(len(X), len(self.classes_)))

        for row, (features, _) in enumerate(stream.iter_array(X)):
            class_probas = self.instance_.predict_proba_one(features)
            # creme's predictions have to be converted to follow the scikit-learn
            # conventions: the mapping is flattened into a row ordered like `classes_`
            probas[row] = [class_probas.get(label, 0) for label in self.classes_]

        return probas
예제 #3
0
    def predict(self, X):
        """Predicts the target of every row of a dataset held in memory.

        The rows of `X` are handed one at a time to the `predict_one` method of the
        fitted underlying instance, and the results are stored in a NumPy array.

        Parameters:
            X: array-like of shape (n_samples, n_features).

        Returns:
            array of shape (n_samples,): Predicted target values for each row of `X`.

        Raises:
            ValueError: If the number of columns of `X` differs from `n_features_in_`.

        """

        # Predicting only makes sense once `fit` has set `instance_`
        utils.validation.check_is_fitted(self, attributes='instance_')

        # Validate the input
        X = utils.check_array(X, **SKLEARN_INPUT_X_PARAMS)

        n_received = X.shape[1]
        if n_received != self.n_features_in_:
            raise ValueError(f'Expected {self.n_features_in_} features, got {n_received}')

        # Allocate the output up front, then fill it in one observation at a time
        predictions = np.empty(shape=len(X))
        for row, (features, _) in enumerate(stream.iter_array(X)):
            predictions[row] = self.instance_.predict_one(features)

        return predictions
예제 #4
0
def iter_pandas(X: pd.DataFrame, y: typing.Union[pd.Series, pd.DataFrame] = None,
                **kwargs) -> base.typing.Stream:
    """Streams the rows of a `pandas.DataFrame`.

    The data is converted to NumPy arrays and the iteration itself is delegated to
    `stream.iter_array`. The columns of `X` are always used as feature names. When `y`
    is a dataframe, its columns are used as target names.

    Parameters:
        X: A dataframe of features.
        y: A series or a dataframe with one column per target.
        kwargs: Extra keyword arguments are passed on to `stream.iter_array`.

    Example:

        >>> import pandas as pd
        >>> from creme import stream

        >>> X = pd.DataFrame({
        ...     'x1': [1, 2, 3, 4],
        ...     'x2': ['blue', 'yellow', 'yellow', 'blue'],
        ...     'y': [True, False, False, True]
        ... })
        >>> y = X.pop('y')

        >>> for xi, yi in stream.iter_pandas(X, y):
        ...     print(xi, yi)
        {'x1': 1, 'x2': 'blue'} True
        {'x1': 2, 'x2': 'yellow'} False
        {'x1': 3, 'x2': 'yellow'} False
        {'x1': 4, 'x2': 'blue'} True

    """

    # The column names take precedence over any `feature_names` given by the caller
    stream_kwargs = dict(kwargs, feature_names=X.columns)
    if isinstance(y, pd.DataFrame):
        stream_kwargs['target_names'] = y.columns

    features = X.to_numpy()
    targets = None if y is None else y.to_numpy()

    yield from stream.iter_array(X=features, y=targets, **stream_kwargs)
예제 #5
0
def iter_sklearn_dataset(dataset: 'sklearn.utils.Bunch',
                         **kwargs) -> base.typing.Stream:
    """Streams the rows of one of the datasets provided by scikit-learn.

    This allows you to use any dataset from [scikit-learn's `datasets` module](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets). For instance, you can use the `fetch_openml` function to get access to all of the
    datasets from the OpenML website.

    The `data` and `target` attributes of the dataset are handed to `stream.iter_array`,
    along with the `feature_names` attribute if the dataset has one.

    Parameters:
        dataset: A scikit-learn dataset.
        kwargs: Extra keyword arguments are passed on to `stream.iter_array`.

    Example:

        >>> import pprint
        >>> from sklearn import datasets
        >>> from creme import stream

        >>> dataset = datasets.load_boston()

        >>> for xi, yi in stream.iter_sklearn_dataset(dataset):
        ...     pprint.pprint(xi)
        ...     print(yi)
        ...     break
        {'AGE': 65.2,
         'B': 396.9,
         'CHAS': 0.0,
         'CRIM': 0.00632,
         'DIS': 4.09,
         'INDUS': 2.31,
         'LSTAT': 4.98,
         'NOX': 0.538,
         'PTRATIO': 15.3,
         'RAD': 1.0,
         'RM': 6.575,
         'TAX': 296.0,
         'ZN': 18.0}
        24.0

    """
    # The dataset's own arrays replace any `X` or `y` given by the caller
    kwargs.update(X=dataset.data, y=dataset.target)

    # Not every dataset comes with feature names
    if hasattr(dataset, 'feature_names'):
        kwargs['feature_names'] = dataset.feature_names

    yield from stream.iter_array(**kwargs)
예제 #6
0
def iter_sklearn_dataset(dataset: 'sklearn.utils.Bunch',
                         **kwargs) -> base.typing.Stream:
    """Streams the rows of one of the datasets provided by scikit-learn.

    The `data` and `target` attributes of the dataset are handed to `stream.iter_array`,
    along with the `feature_names` attribute if the dataset has one.

    Parameters:
        dataset: A scikit-learn dataset.
        kwargs: Extra keyword arguments are passed on to `stream.iter_array`.

    Example:

        >>> import pprint
        >>> from sklearn import datasets
        >>> from creme import stream

        >>> dataset = datasets.load_boston()

        >>> for xi, yi in stream.iter_sklearn_dataset(dataset):
        ...     pprint.pprint(xi)
        ...     print(yi)
        ...     break
        {'AGE': 65.2,
         'B': 396.9,
         'CHAS': 0.0,
         'CRIM': 0.00632,
         'DIS': 4.09,
         'INDUS': 2.31,
         'LSTAT': 4.98,
         'NOX': 0.538,
         'PTRATIO': 15.3,
         'RAD': 1.0,
         'RM': 6.575,
         'TAX': 296.0,
         'ZN': 18.0}
        24.0

    """
    # The dataset's own arrays replace any `X` or `y` given by the caller
    array_args = dict(kwargs, X=dataset.data, y=dataset.target)

    try:
        names = dataset.feature_names
    except AttributeError:
        # Not every dataset comes with feature names; leave the argument out in that case
        pass
    else:
        array_args['feature_names'] = names

    yield from stream.iter_array(**array_args)