class TimeSeriesRegressor(BaseSKMObject, RegressorMixin):
    def __init__(self, estimator: RegressorMixin, max_window_size=100):
        super().__init__()

        if not isinstance(estimator, RegressorMixin):
            raise ValueError(
                "estimator must be a Regressor, "
                "Call TimeSeriesRegressor with an instance of RegressorMixin")

        self.max_window_size = max_window_size
        self.estimator = estimator
        self.window = InstanceWindow(max_size=max_window_size, dtype=float)
        self.first_fit = True

    def partial_fit(self, X, y=None, sample_weight=None):
        """ Partially fits the model on the samples X and corresponding targets y.
        
        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.
            If y is not provided, the value X[t+1] will be used as the target for X[t].
            
        y: numpy.ndarray, optional
            An array-like containing the targets for all samples in X.
            y must have the same shape as X.
            
        sample_weight: Not used.
        
        Returns
        -------
        TimeSeriesRegressor
            self

        Notes
        -----
        For the TimeSeriesRegressor, fitting the model is the equivalent of
        inserting the newer samples in the observed window and, once
        max_window_size is reached, removing the older ones; the last
        max_window_size values of X are then fed as features to the provided
        estimator to predict future values.
        To store the viewed samples we use an InstanceWindow object. For this
        class' documentation please visit skmultiflow.core.utils.data_structures
        """
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        if y is not None and len(y.shape) == 1:
            y = y.reshape(-1, 1)
        r = X.shape[0]

        if y is not None:
            r_t = y.shape[0]

            if r != r_t:
                raise ValueError(
                    "Batch size of X is different from the number of attributes in y "
                    "Batch size of must be the same for X and y")

        if self.first_fit:
            if r <= self.max_window_size:
                raise ValueError(
                    "Number of elments of First call to partial_fit less than max_window_size "
                    "Call partial_fit with more than {} elements".format(
                        self.max_window_size))

        for i in range(r):
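            # When y is absent, each value's successor serves as its target;
            # once the window is full, the estimator is refit on the flattened
            # window against the most recent target.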
            if y is not None:
                self.window.add_element(np.asarray([X[i]]), np.asarray([y[i]]))
            elif i > 0:
                self.window.add_element(np.asarray([X[i - 1]]),
                                        np.asarray([X[i]]))
            if self.max_window_size == self.window.n_samples:
                self.estimator.partial_fit(
                    self.window.get_attributes_matrix().reshape((1, -1)),
                    self.window.get_targets_matrix()[-1].reshape((1, -1)),
                    sample_weight=sample_weight)
        self.first_fit = False
        return self

    def reset(self):
        self.window.reset()
        self.estimator.reset()

        return self

    def clone_window(self):
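        """ Returns a copy of the internal window, so predictions can append
        provisional values without mutating the training state. """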
        window = InstanceWindow(
            n_features=self.window.n_attributes,
            n_targets=self.window.n_targets,
            categorical_list=self.window.categorical_attributes,
            max_size=self.window.max_size)
        window._buffer = np.array(self.window._buffer)
        window._n_samples = self.window._n_samples

        return window

    def predict(self, X):
        """ Predicts the next value For all values in X.
        The estimator consider X[0] as the value conming exactly after the last partially fit value.
        
        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the next value for.
            
        Returns
        -------
        list
            A list containing the predicted values for all instances in X.
        
        """

        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        r = X.shape[0]

        window = self.clone_window()

        predictions = []
        for i in range(r):
            window.add_element(np.asarray([X[i]]), np.asarray([X[i]]))
            if self.max_window_size == window.n_samples:
                pred = self.estimator.predict(
                    window.get_attributes_matrix().reshape((1, -1)))
                if len(pred.flatten()) == 1:
                    pred = pred[0]
                predictions.append(pred[0])

        return np.array(predictions)

    def predict_proba(self, X):
        """
        Not implemented for this estimator.
        """
        raise NotImplementedError

    def forecast(self, X, n_steps):
        """ Predicts the next n_steps values coming after all values in X.
        The estimator considers X[0] as the value coming immediately after the last partially fitted value.
        
        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the next value for.
        n_steps: int
            The number of values to forecast.

        Returns
        -------
        list
            A list containing the n_steps forecast values that follow the values in X.
         
        """
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        r = X.shape[0]

        window = self.clone_window()

        for i in range(r):
            window.add_element(np.asarray([X[i]]), np.asarray([X[i]]))

        forecasts = []
        for i in range(n_steps):
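            # Iterated forecasting: each prediction is appended to the window
            # and fed back as input for the next step.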
            next_element = self.estimator.predict(
                window.get_attributes_matrix().reshape((1, -1)))
            window.add_element(next_element.reshape((1, -1)),
                               next_element.reshape((1, -1)))
            if len(next_element.flatten()) == 1:
                next_element = next_element[0]
            forecasts.append(next_element[0])

        return np.asarray(forecasts)
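
# A minimal usage sketch, not part of the original example. It assumes that
# scikit-learn's SGDRegressor passes the RegressorMixin isinstance check and
# that partial_fit accepts a bare series (y=None), as the code above suggests.
import numpy as np
from sklearn.linear_model import SGDRegressor

series = np.sin(np.linspace(0, 20, 300))                # toy univariate series
tsr = TimeSeriesRegressor(SGDRegressor(), max_window_size=10)
tsr.partial_fit(series[:200])                           # needs > max_window_size samples
one_step = tsr.predict(series[200:250])                 # one-step-ahead predictions
multi_step = tsr.forecast(series[250:], n_steps=5)      # iterated multi-step forecast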

# Example 2

class KNNClassifier(BaseSKMObject, ClassifierMixin):
    """ K-Nearest Neighbors classifier.
    
    This non-parametric classification method predicts the label of a query
    sample X from the labels of its n_neighbors closest training examples.
    
    It works by keeping track of a fixed number of training samples, in 
    our case it keeps track of the last max_window_size training samples.
    Then, whenever a query request is executed, the algorithm will search 
    its stored samples and find the closest ones using a selected distance 
    metric.
    
    To store the samples while reducing search times, we use a structure
    called a KD Tree (a K-Dimensional Tree). Although we do have our own
    KDTree implementation, which accepts custom metrics, we recommend the
    standard scikit-learn KDTree, which, even though it doesn't accept
    custom metrics, is optimized and runs faster.
    
    Parameters
    ----------
    n_neighbors: int (default=5)
        The number of nearest neighbors to search for.
        
    max_window_size: int (default=1000)
        The maximum size of the window storing the last viewed samples.
        
    leaf_size: int (default=30)
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm will switch to a
        brute-force approach. The larger this number, the faster the tree
        construction, but the slower the queries will be.
        
    nominal_attributes: numpy.ndarray (optional, default=None)
        List of Nominal attributes. If empty, then assume that all attributes are numerical.
    
    Raises
    ------
    NotImplementedError: A few of the functions described here are not 
    implemented since they have no application in this context.
    
    ValueError: A ValueError is raised if the predict function is called 
    before at least n_neighbors samples have been analyzed by the algorithm.
    
    Notes
    -----
    For a KDTree functionality explanation, please see our KDTree 
    documentation, under skmultiflow.lazy.neighbors.kdtree.
    
    This classifier is not optimal for a mixture of categorical and 
    numerical features.
    
    If you wish to use our KDTree implementation please refer to this class' 
    function __predict_proba
    
    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.lazy import KNNClassifier
    >>> from skmultiflow.data import SEAGenerator
    >>> # Setting up the stream
    >>> stream = SEAGenerator(random_state=1, noise_percentage=.1)
    >>> stream.prepare_for_use()
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_sample(200)
    >>> knn = KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40)
    >>> knn.partial_fit(X, y)
    >>> # Preparing the processing of 5000 samples and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_sample()
    ...     my_pred = knn.predict(X)
    ...     if y[0] == my_pred[0]:
    ...         corrects += 1
    ...     knn = knn.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying results
    >>> print('KNNClassifier usage example')
    >>> print('{} samples analyzed.'.format(n_samples))
    5000 samples analyzed.
    >>> print("KNNClassifier's performance: {}".format(corrects/n_samples))
    KNNClassifier's performance: 0.8788
    
    """
    def __init__(self,
                 n_neighbors=5,
                 max_window_size=1000,
                 leaf_size=30,
                 nominal_attributes=None):
        super().__init__()
        self.n_neighbors = n_neighbors
        self.max_window_size = max_window_size
        self.c = 0
        self.window = InstanceWindow(max_size=max_window_size, dtype=float)
        self.first_fit = True
        self.classes = []
        self.leaf_size = leaf_size
        self.nominal_attributes = nominal_attributes
        if self.nominal_attributes is None:
            self._nominal_attributes = []
        else:
            self._nominal_attributes = self.nominal_attributes

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Partially fits the model on the samples X and corresponding targets y.
        
        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.
            
        y: Array-like
            An array-like containing the classification targets for all 
            samples in X.

        classes: numpy.ndarray, optional (default=None)
            Array with all possible/known classes.

        sample_weight: Not used.
        
        Returns
        -------
        KNNClassifier
            self

        Notes
        -----
        For the K-Nearest Neighbors Classifier, fitting the model is the
        equivalent of inserting the newer samples in the observed window,
        and if max_window_size is reached, removing older results. To store
        the viewed samples we use an InstanceWindow object. For this class'
        documentation please visit skmultiflow.core.utils.data_structures

        """
        r, c = get_dimensions(X)

        if classes is not None:
            self.classes = list(set().union(self.classes, classes))

        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
        return self

    def reset(self):
        self.window.reset()
        return self

    def predict(self, X):
        """ predict
        
        Predicts the labels of the samples in X by searching the KDTree for
        the n_neighbors nearest neighbors.
        
        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the label for.
            
        Returns
        -------
        list
            A list containing the predicted labels for all instances in X.
        
        """
        r, c = get_dimensions(X)
        proba = self.predict_proba(X)
        predictions = []
        for i in range(r):
            predictions.append(np.argmax(proba[i]))
        return np.array(predictions)

    def predict_proba(self, X):
        """ predict_proba
         
        Calculates the probability of each sample in X belonging to each 
        of the labels, based on the knn algorithm.
        
        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
        
        Raises
        ------
        ValueError: Raised if this method is called before at least
        n_neighbors samples have been analyzed by the learner.
        
        Returns
        -------
        numpy.ndarray
            An array of shape (n_samples, n_labels), in which each outer entry is
            associated with the X entry of the same index, and where the list at
            index [i] contains len(self.classes) elements, each representing
            the probability that the i-th sample of X belongs to a certain label.
         
        """
        if self.window is None or self.window.n_samples < self.n_neighbors:
            raise ValueError(
                "KNNClassifier must be (partially) fitted on n_neighbors samples before doing any prediction."
            )
        proba = []
        r, c = get_dimensions(X)

        self.classes = list(set().union(
            self.classes, np.unique(self.window.get_targets_matrix())))

        new_dist, new_ind = self.__predict_proba(X)
        targets = self.window.get_targets_matrix()
        for i in range(r):
            votes = [0.0] * int(max(self.classes) + 1)
            for index in new_ind[i]:
                votes[int(targets[index])] += 1. / len(new_ind[i])
            proba.append(votes)

        return np.array(proba)

    def __predict_proba(self, X):
        """ __predict_proba
        
        Private implementation of the predict_proba method.
        
        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
        
        Returns
        -------
        tuple
            One list with the k nearest neighbors' distances and another
            one with their indices.
        
        """
        # To use our own KDTree implementation please replace it as follows
        # tree = KDTree(self.window.get_attributes_matrix(), metric='euclidean',
        #              nominal_attributes=self._nominal_attributes, return_distance=True)

        tree = sk.KDTree(self.window.get_attributes_matrix(),
                         self.leaf_size,
                         metric='euclidean')
        dist, ind = tree.query(np.asarray(X), k=self.n_neighbors)
        return dist, ind
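
# A self-contained sketch, not from the original example, of the vote
# aggregation predict_proba performs: query a scikit-learn KDTree for the k
# nearest stored samples and give each neighbor an equal 1/k vote for its label.
import numpy as np
from sklearn.neighbors import KDTree

def knn_vote_proba(stored_X, stored_y, query_X, k=5):
    tree = KDTree(stored_X, leaf_size=30, metric='euclidean')
    _, ind = tree.query(query_X, k=k)              # indices of the k nearest samples
    proba = np.zeros((len(query_X), int(stored_y.max()) + 1))
    for i, neighbors in enumerate(ind):
        for j in neighbors:
            proba[i, int(stored_y[j])] += 1.0 / k  # uniform 1/k vote per neighbor
    return proba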

# Example 3

class KNNClassifier(BaseSKMObject, ClassifierMixin):
    """ k-Nearest Neighbors classifier.
    
    This non-parametric classification method keeps a data window with the last max_window_size
    training samples. The predicted class-label for a given query sample is obtained in two steps:
    first, find the closest n_neighbors to the query sample in the data window. Second, aggregate
    the class-labels of the n_neighbors to define the predicted class for the query sample.

    
    Parameters
    ----------
    n_neighbors: int (default=5)
        The number of nearest neighbors to search for.
        
    max_window_size: int (default=1000)
        The maximum size of the window storing the last viewed samples.
        
    leaf_size: int (default=30)
        The maximum number of samples that can be stored in one leaf node, 
        which determines from which point the algorithm will switch to a
        brute-force approach. The larger this number, the faster the tree
        construction, but the slower the queries will be.
        
    nominal_attributes: numpy.ndarray (optional, default=None)
        List of Nominal attributes. If empty, then assume that all attributes are numerical.
    
    Notes
    -----
    This classifier is not optimal for a mixture of categorical and numerical features.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.lazy import KNNClassifier
    >>> from skmultiflow.data import SEAGenerator
    >>> # Setting up the stream
    >>> stream = SEAGenerator(random_state=1, noise_percentage=.1)
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_sample(200)
    >>> knn = KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40)
    >>> knn.partial_fit(X, y)
    >>> # Preparing the processing of 5000 samples and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_sample()
    ...     my_pred = knn.predict(X)
    ...     if y[0] == my_pred[0]:
    ...         corrects += 1
    ...     knn = knn.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying results
    >>> print('KNNClassifier usage example')
    >>> print('{} samples analyzed.'.format(n_samples))
    5000 samples analyzed.
    >>> print("KNNClassifier's performance: {}".format(corrects/n_samples))
    KNNClassifier's performance: 0.8788
    
    """
    def __init__(self,
                 n_neighbors=5,
                 max_window_size=1000,
                 leaf_size=30,
                 nominal_attributes=None):
        super().__init__()
        self.n_neighbors = n_neighbors
        self.max_window_size = max_window_size
        self.c = 0
        self.window = InstanceWindow(max_size=max_window_size, dtype=float)
        self.first_fit = True
        self.classes = []
        self.leaf_size = leaf_size
        self.nominal_attributes = nominal_attributes
        if self.nominal_attributes is None:
            self._nominal_attributes = []
        else:
            self._nominal_attributes = self.nominal_attributes

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Partially fits the model on the samples X and corresponding targets y.
        
        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.
            
        y: Array-like
            An array-like containing the classification targets for all 
            samples in X.

        classes: numpy.ndarray, optional (default=None)
            Array with all possible/known classes.

        sample_weight: Not used.
        
        Returns
        -------
        KNNClassifier
            self

        Notes
        -----
        For the K-Nearest Neighbors Classifier, fitting the model is the
        equivalent of inserting the newer samples in the observed window,
        and if max_window_size is reached, removing older results. To store
        the viewed samples we use an InstanceWindow object. For this class'
        documentation please visit skmultiflow.core.utils.data_structures

        """
        r, c = get_dimensions(X)

        if classes is not None:
            self.classes = list(set().union(self.classes, classes))

        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
        return self

    def reset(self):
        self.window.reset()
        return self

    def predict(self, X):
        """ Predicts the class label of the X sample.
        
        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the label for.
            
        Returns
        -------
        list
            A list containing the predicted labels for all instances in X.
        
        """
        r, c = get_dimensions(X)
        proba = self.predict_proba(X)
        predictions = []
        for i in range(r):
            predictions.append(np.argmax(proba[i]))
        return np.array(predictions)

    def predict_proba(self, X):
        """ Estimates the probability of each sample in X belonging to each of the class-labels.
        
        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
        
        Returns
        -------
        numpy.ndarray
            An array of shape (n_samples, n_labels), in which each outer entry is
            associated with the X entry of the same index, and where the list at
            index [i] contains len(self.classes) elements, each representing
            the probability that the i-th sample of X belongs to a certain label.
         
        """
        r, c = get_dimensions(X)
        if self.window is None or self.window.n_samples < self.n_neighbors:
            # The model is empty, defaulting to zero
            return np.zeros(shape=(r, 1))
        proba = []

        self.classes = list(set().union(
            self.classes, np.unique(self.window.get_targets_matrix())))

        new_dist, new_ind = self.__predict_proba(X)
        targets = self.window.get_targets_matrix()
        for i in range(r):
            votes = [0.0] * int(max(self.classes) + 1)
            for index in new_ind[i]:
                votes[int(targets[index])] += 1. / len(new_ind[i])
            proba.append(votes)

        return np.asarray(proba)

    def __predict_proba(self, X):
        """ __predict_proba
        
        Private implementation of the predict_proba method.
        
        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
        
        Returns
        -------
        tuple
            One list with the k nearest neighbors' distances and another
            one with their indices.
        
        """
        # To use our own KDTree implementation please replace it as follows
        # tree = KDTree(self.window.get_attributes_matrix(), metric='euclidean',
        #              nominal_attributes=self._nominal_attributes, return_distance=True)

        tree = sk.KDTree(self.window.get_attributes_matrix(),
                         self.leaf_size,
                         metric='euclidean')
        dist, ind = tree.query(np.asarray(X), k=self.n_neighbors)
        return dist, ind
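
# A brief usage sketch, not from the original example, of this version's
# cold-start behavior: unlike the previous version, which raises a ValueError,
# predict_proba here returns all-zero probabilities until at least n_neighbors
# samples have been seen, so predict falls back to class 0.
import numpy as np

knn = KNNClassifier(n_neighbors=5)
X = np.random.rand(3, 2)
print(knn.predict_proba(X))   # a (3, 1) array of zeros: the window is still empty
print(knn.predict(X))         # [0 0 0], the argmax over each zero vector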