# Imports inferred from usage below (scikit-multiflow and scikit-learn)
import numpy as np
import sklearn.neighbors as sk

from skmultiflow.core import BaseSKMObject, ClassifierMixin, RegressorMixin
from skmultiflow.utils import get_dimensions
from skmultiflow.utils.data_structures import InstanceWindow


class TimeSeriesRegressor(BaseSKMObject, RegressorMixin):

    def __init__(self, estimator: RegressorMixin, max_window_size=100):
        super().__init__()
        if not isinstance(estimator, RegressorMixin):
            raise ValueError("estimator must be a Regressor. "
                             "Call TimeSeriesRegressor with an instance of RegressorMixin")
        self.max_window_size = max_window_size
        self.estimator = estimator
        self.window = InstanceWindow(max_size=max_window_size, dtype=float)
        self.first_fit = True

    def partial_fit(self, X, y=None, sample_weight=None):
        """ Partially fits the model on the samples X and corresponding targets y.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model. If y is
            not provided, the value X[t+1] will be used as the target for X[t].

        y: numpy.ndarray, optional
            An array-like containing the targets for all samples in X.
            y must have the same number of samples as X.

        sample_weight: Not used.

        Returns
        -------
        TimeSeriesRegressor
            self

        Notes
        -----
        For the TimeSeriesRegressor, fitting the model is equivalent to
        inserting the newer samples in the observed window and, once
        max_window_size is reached, removing the older ones. The
        max_window_size past values of X are then used to predict future
        values of X by feeding them as features to the provided estimator.
        To store the observed samples we use an InstanceWindow object. For
        that class' documentation please visit
        skmultiflow.utils.data_structures
        """
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        if y is not None and len(y.shape) == 1:
            y = y.reshape(-1, 1)
        r = X.shape[0]
        if y is not None:
            r_t = y.shape[0]
            if r != r_t:
                raise ValueError("Batch size of X is different from the batch size of y. "
                                 "Batch sizes must be the same for X and y")
        if self.first_fit:
            if r <= self.max_window_size:
                raise ValueError("Number of elements in the first call to partial_fit "
                                 "is not greater than max_window_size. "
                                 "Call partial_fit with more than {} elements".format(
                                     self.max_window_size))
        for i in range(r):
            if y is not None:
                self.window.add_element(np.asarray([X[i]]), np.asarray([y[i]]))
            elif i > 0:
                # No explicit targets: the next value in the series is the target
                self.window.add_element(np.asarray([X[i - 1]]), np.asarray([X[i]]))
            if self.max_window_size == self.window.n_samples:
                self.estimator.partial_fit(
                    self.window.get_attributes_matrix().reshape((1, -1)),
                    self.window.get_targets_matrix()[-1].reshape((1, -1)),
                    sample_weight=sample_weight)
        self.first_fit = False
        return self

    def reset(self):
        self.window.reset()
        self.estimator.reset()
        return self

    def clone_window(self):
        window = InstanceWindow(n_features=self.window.n_attributes,
                                n_targets=self.window.n_targets,
                                categorical_list=self.window.categorical_attributes,
                                max_size=self.window.max_size)
        window._buffer = np.array(self.window._buffer)
        window._n_samples = self.window._n_samples
        return window

    def predict(self, X):
        """ Predicts the next value for all values in X. The estimator
        considers X[0] as the value coming exactly after the last partially
        fit value.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the next value for.

        Returns
        -------
        list
            A list containing the predicted values for all instances in X.
        """
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        r = X.shape[0]
        window = self.clone_window()
        predictions = []
        for i in range(r):
            # Targets are placeholders here; only the attribute window is used
            window.add_element(np.asarray([X[i]]), np.asarray([X[i]]))
            if self.max_window_size == window.n_samples:  # check the rolling copy
                pred = np.asarray(self.estimator.predict(
                    window.get_attributes_matrix().reshape((1, -1)))).reshape(1, -1)
                if pred.shape[1] == 1:
                    predictions.append(pred[0, 0])  # single-target: append the scalar
                else:
                    predictions.append(pred[0])     # multi-target: append the row
        return np.array(predictions)

    def predict_proba(self, X):
        """ Method not implemented for this estimator. """
        raise NotImplementedError

    def forecast(self, X, n_steps):
        """ Predicts the next n_steps values coming after all values in X.
        The estimator considers X[0] as the value coming exactly after the
        last partially fit value.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the next value for.

        n_steps: int
            The number of values to forecast.

        Returns
        -------
        list
            A list containing the predicted n_steps values to come after the
            values in X.
        """
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        r = X.shape[0]
        window = self.clone_window()
        for i in range(r):
            window.add_element(np.asarray([X[i]]), np.asarray([X[i]]))
        forecasts = []
        for i in range(n_steps):
            next_element = np.asarray(self.estimator.predict(
                window.get_attributes_matrix().reshape((1, -1)))).reshape(1, -1)
            # Feed the prediction back into the window to forecast the next step
            window.add_element(next_element, next_element)
            if next_element.shape[1] == 1:
                forecasts.append(next_element[0, 0])
            else:
                forecasts.append(next_element[0])
        return np.asarray(forecasts)
class KNNClassifier(BaseSKMObject, ClassifierMixin):
    """ K-Nearest Neighbors classifier.

    This is a non-parametric classification method. The output of this
    algorithm are the n_neighbors closest training examples to the query
    sample X.

    It works by keeping track of a fixed number of training samples, in our
    case the last max_window_size training samples. Then, whenever a query
    request is executed, the algorithm searches its stored samples and finds
    the closest ones using a selected distance metric.

    To store the samples, while reducing search times, we use a structure
    called KD Tree (a K Dimensional Tree, for k-dimensional problems).
    Although we do have our own KDTree implementation, which accepts custom
    metrics, we recommend using the standard scikit-learn KDTree which, even
    though it doesn't accept custom metrics, is optimized and runs faster.

    Parameters
    ----------
    n_neighbors: int (default=5)
        The number of nearest neighbors to search for.

    max_window_size: int (default=1000)
        The maximum size of the window storing the last observed samples.

    leaf_size: int (default=30)
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm will switch to a
        brute-force approach. The bigger this number the faster the tree
        construction time, but the slower the query time will be.

    nominal_attributes: numpy.ndarray (optional, default=None)
        List of nominal attributes. If empty, all attributes are assumed to
        be numerical.

    Raises
    ------
    NotImplementedError: A few of the functions described here are not
    implemented since they have no application in this context.

    ValueError: A ValueError is raised if the predict function is called
    before at least n_neighbors samples have been analyzed by the algorithm.

    Notes
    -----
    For a KDTree functionality explanation, please see our KDTree
    documentation, under skmultiflow.lazy.neighbors.kdtree.

    This classifier is not optimal for a mixture of categorical and
    numerical features. If you wish to use our KDTree implementation please
    refer to this class' function __predict_proba.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.lazy import KNNClassifier
    >>> from skmultiflow.data import SEAGenerator
    >>> # Setting up the stream
    >>> stream = SEAGenerator(random_state=1, noise_percentage=.1)
    >>> stream.prepare_for_use()
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_sample(200)
    >>> knn = KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40)
    >>> knn.partial_fit(X, y)
    >>> # Preparing the processing of 5000 samples and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_sample()
    ...     my_pred = knn.predict(X)
    ...     if y[0] == my_pred[0]:
    ...         corrects += 1
    ...     knn = knn.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying results
    >>> print('KNNClassifier usage example')
    KNNClassifier usage example
    >>> print('{} samples analyzed.'.format(n_samples))
    5000 samples analyzed.
    >>> print("KNNClassifier's performance: {}".format(corrects/n_samples))
    KNNClassifier's performance: 0.8788
    """

    def __init__(self, n_neighbors=5, max_window_size=1000, leaf_size=30,
                 nominal_attributes=None):
        super().__init__()
        self.n_neighbors = n_neighbors
        self.max_window_size = max_window_size
        self.c = 0
        self.window = InstanceWindow(max_size=max_window_size, dtype=float)
        self.first_fit = True
        self.classes = []
        self.leaf_size = leaf_size
        self.nominal_attributes = nominal_attributes
        # Keep a private copy so downstream code always has a list to work with
        self._nominal_attributes = nominal_attributes if nominal_attributes is not None else []

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Partially fits the model on the samples X and corresponding targets y.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.

        y: Array-like
            An array-like containing the classification targets for all
            samples in X.

        classes: numpy.ndarray, optional (default=None)
            Array with all possible/known classes.

        sample_weight: Not used.

        Returns
        -------
        KNNClassifier
            self

        Notes
        -----
        For the K-Nearest Neighbors Classifier, fitting the model is the
        equivalent of inserting the newer samples in the observed window,
        and if the size limit is reached, removing the older ones. To store
        the observed samples we use an InstanceWindow object. For that
        class' documentation please visit skmultiflow.utils.data_structures
        """
        r, c = get_dimensions(X)
        if classes is not None:
            self.classes = list(set().union(self.classes, classes))
        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
        return self

    def reset(self):
        self.window.reset()
        return self

    def predict(self, X):
        """ Predicts the label of the X samples, by searching the KDTree for
        the n_neighbors nearest neighbors.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the label for.

        Returns
        -------
        list
            A list containing the predicted labels for all instances in X.
        """
        r, c = get_dimensions(X)
        proba = self.predict_proba(X)
        predictions = []
        for i in range(r):
            predictions.append(np.argmax(proba[i]))
        return np.array(predictions)

    def predict_proba(self, X):
        """ Calculates the probability of each sample in X belonging to each
        of the labels, based on the kNN algorithm.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)

        Raises
        ------
        ValueError: If this function is called before at least n_neighbors
        samples have been analyzed by the learner, a ValueError is raised.

        Returns
        -------
        numpy.ndarray
            An array of shape (n_samples, n_classes), in which each outer
            entry is associated with the X entry of the same index, and
            where the list at index [i] contains len(self.classes) elements,
            each of which represents the probability that the i-th sample of
            X belongs to a certain label.
        """
        if self.window is None or self.window.n_samples < self.n_neighbors:
            raise ValueError("KNNClassifier must be (partially) fitted on "
                             "n_neighbors samples before doing any prediction.")
        proba = []
        r, c = get_dimensions(X)
        self.classes = list(set().union(
            self.classes, np.unique(self.window.get_targets_matrix())))
        new_dist, new_ind = self.__predict_proba(X)
        for i in range(r):
            votes = [0.0 for _ in range(int(max(self.classes) + 1))]
            for index in new_ind[i]:
                # Each of the n_neighbors contributes an equal share of the vote
                votes[int(self.window.get_targets_matrix()[index])] += \
                    1. / len(new_ind[i])
            proba.append(votes)
        return np.array(proba)

    def __predict_proba(self, X):
        """ __predict_proba

        Private implementation of the predict_proba method.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)

        Returns
        -------
        tuple
            One list with the k-nearest neighbors' distances and another one
            with their indexes.
        """
        # To use our own KDTree implementation please replace it as follows
        # tree = KDTree(self.window.get_attributes_matrix(), metric='euclidean',
        #               nominal_attributes=self._nominal_attributes,
        #               return_distance=True)
        tree = sk.KDTree(self.window.get_attributes_matrix(), self.leaf_size,
                         metric='euclidean')
        dist, ind = tree.query(np.asarray(X), k=self.n_neighbors)
        return dist, ind
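# The helper below is a standalone sketch, not part of the original source:
# it shows what __predict_proba does with scikit-learn's KDTree, namely build
# the tree over the stored window and query the k nearest neighbours of each
# sample. The random data merely stands in for window.get_attributes_matrix().
def _demo_kdtree_query():
    rng = np.random.RandomState(1)
    window_X = rng.rand(100, 3)  # stands in for the stored window
    queries = rng.rand(2, 3)     # two query samples

    tree = sk.KDTree(window_X, 30, metric='euclidean')
    dist, ind = tree.query(queries, k=5)
    # dist and ind both have shape (2, 5): for each query sample, the
    # distances to, and window-indices of, its 5 nearest neighbours
    return dist, ind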
class KNNClassifier(BaseSKMObject, ClassifierMixin):
    """ k-Nearest Neighbors classifier.

    This non-parametric classification method keeps a data window with the
    last max_window_size training samples. The predicted class-label for a
    given query sample is obtained in two steps: first, find the n_neighbors
    samples closest to the query sample in the data window; second,
    aggregate the class-labels of those n_neighbors to define the predicted
    class for the query sample.

    Parameters
    ----------
    n_neighbors: int (default=5)
        The number of nearest neighbors to search for.

    max_window_size: int (default=1000)
        The maximum size of the window storing the last observed samples.

    leaf_size: int (default=30)
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm will switch to a
        brute-force approach. The bigger this number the faster the tree
        construction time, but the slower the query time will be.

    nominal_attributes: numpy.ndarray (optional, default=None)
        List of nominal attributes. If empty, all attributes are assumed to
        be numerical.

    Notes
    -----
    This classifier is not optimal for a mixture of categorical and
    numerical features.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.lazy import KNNClassifier
    >>> from skmultiflow.data import SEAGenerator
    >>> # Setting up the stream
    >>> stream = SEAGenerator(random_state=1, noise_percentage=.1)
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_sample(200)
    >>> knn = KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40)
    >>> knn.partial_fit(X, y)
    >>> # Preparing the processing of 5000 samples and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_sample()
    ...     my_pred = knn.predict(X)
    ...     if y[0] == my_pred[0]:
    ...         corrects += 1
    ...     knn = knn.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying results
    >>> print('KNNClassifier usage example')
    KNNClassifier usage example
    >>> print('{} samples analyzed.'.format(n_samples))
    5000 samples analyzed.
    >>> print("KNNClassifier's performance: {}".format(corrects/n_samples))
    KNNClassifier's performance: 0.8788
    """

    def __init__(self, n_neighbors=5, max_window_size=1000, leaf_size=30,
                 nominal_attributes=None):
        super().__init__()
        self.n_neighbors = n_neighbors
        self.max_window_size = max_window_size
        self.c = 0
        self.window = InstanceWindow(max_size=max_window_size, dtype=float)
        self.first_fit = True
        self.classes = []
        self.leaf_size = leaf_size
        self.nominal_attributes = nominal_attributes
        # Keep a private copy so downstream code always has a list to work with
        self._nominal_attributes = nominal_attributes if nominal_attributes is not None else []

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Partially fits the model on the samples X and corresponding targets y.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.

        y: Array-like
            An array-like containing the classification targets for all
            samples in X.

        classes: numpy.ndarray, optional (default=None)
            Array with all possible/known classes.

        sample_weight: Not used.

        Returns
        -------
        KNNClassifier
            self

        Notes
        -----
        For the K-Nearest Neighbors Classifier, fitting the model is the
        equivalent of inserting the newer samples in the observed window,
        and if the size limit is reached, removing the older ones. To store
        the observed samples we use an InstanceWindow object. For that
        class' documentation please visit skmultiflow.utils.data_structures
        """
        r, c = get_dimensions(X)
        if classes is not None:
            self.classes = list(set().union(self.classes, classes))
        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
        return self

    def reset(self):
        self.window.reset()
        return self

    def predict(self, X):
        """ Predicts the class label of the X samples.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the label for.

        Returns
        -------
        list
            A list containing the predicted labels for all instances in X.
        """
        r, c = get_dimensions(X)
        proba = self.predict_proba(X)
        predictions = []
        for i in range(r):
            predictions.append(np.argmax(proba[i]))
        return np.array(predictions)

    def predict_proba(self, X):
        """ Estimates the probability of each sample in X belonging to each
        of the class-labels.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray
            An array of shape (n_samples, n_classes), in which each outer
            entry is associated with the X entry of the same index, and
            where the list at index [i] contains len(self.classes) elements,
            each of which represents the probability that the i-th sample of
            X belongs to a certain label.
        """
        r, c = get_dimensions(X)
        if self.window is None or self.window.n_samples < self.n_neighbors:
            # The model is empty, defaulting to zero
            return np.zeros(shape=(r, 1))
        proba = []
        self.classes = list(set().union(
            self.classes, np.unique(self.window.get_targets_matrix())))
        new_dist, new_ind = self.__predict_proba(X)
        for i in range(r):
            votes = [0.0 for _ in range(int(max(self.classes) + 1))]
            for index in new_ind[i]:
                # Each of the n_neighbors contributes an equal share of the vote
                votes[int(self.window.get_targets_matrix()[index])] += \
                    1. / len(new_ind[i])
            proba.append(votes)
        return np.asarray(proba)

    def __predict_proba(self, X):
        """ __predict_proba

        Private implementation of the predict_proba method.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)

        Returns
        -------
        tuple
            One list with the k-nearest neighbors' distances and another one
            with their indexes.
        """
        # To use our own KDTree implementation please replace it as follows
        # tree = KDTree(self.window.get_attributes_matrix(), metric='euclidean',
        #               nominal_attributes=self._nominal_attributes,
        #               return_distance=True)
        tree = sk.KDTree(self.window.get_attributes_matrix(), self.leaf_size,
                         metric='euclidean')
        dist, ind = tree.query(np.asarray(X), k=self.n_neighbors)
        return dist, ind
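# The helper below is a minimal sketch, not part of the original source: it
# isolates the vote aggregation performed in predict_proba, assuming the
# neighbour indices have already been retrieved from the KDTree. Each of the
# k neighbours contributes an equal 1/k share to the probability of its own
# class label.
def _demo_vote_aggregation():
    targets = np.array([0, 1, 1, 2, 1, 0])  # stands in for window.get_targets_matrix()
    neighbour_ind = np.array([[1, 2, 4]])   # k=3 neighbour indices for one query

    votes = np.zeros(int(targets.max()) + 1)
    for index in neighbour_ind[0]:
        votes[int(targets[index])] += 1.0 / len(neighbour_ind[0])
    # votes is now [0., 1., 0.]: all three neighbours belong to class 1
    return votes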