def clone_window(self):
    """Build a new window that mirrors the current one.

    A fresh ``InstanceWindow`` is created with the same feature count,
    target count, categorical attribute list and maximum size as
    ``self.window``. Its private buffer is then replaced by a NumPy copy of
    the source buffer and its sample counter is set to the source's counter.

    Returns
    -------
    InstanceWindow
        The newly created window.
    """
    source = self.window
    duplicate = InstanceWindow(
        n_features=source.n_attributes,
        n_targets=source.n_targets,
        categorical_list=source.categorical_attributes,
        max_size=source.max_size,
    )
    # np.array() copies its input, so the clone does not share storage
    # with the window it was created from.
    duplicate._buffer = np.array(source._buffer)
    duplicate._n_samples = source._n_samples
    return duplicate
def __init__(self, n_neighbors=5, max_window_size=1000, leaf_size=30, categorical_list=None):
    """Set up the learner's hyper-parameters and its empty sample window.

    Parameters
    ----------
    n_neighbors: int (default=5)
        Stored as ``self.n_neighbors``.
    max_window_size: int (default=1000)
        Stored as ``self.max_window_size`` and used as ``max_size`` of the
        ``InstanceWindow`` that holds the observed samples.
    leaf_size: int (default=30)
        Stored as ``self.leaf_size``.
    categorical_list: list, optional (default=None)
        Stored as ``self.categorical_list``; ``None`` is replaced by an
        empty list.
    """
    super().__init__()
    self.n_neighbors = n_neighbors
    self.max_window_size = max_window_size
    self.c = 0  # NOTE(review): purpose not visible from this block
    self.window = InstanceWindow(max_size=max_window_size, dtype=float)
    self.first_fit = True
    self.classes = []
    self.leaf_size = leaf_size
    # Always define the attribute. It used to be assigned only when the
    # argument was None, which silently dropped a caller-supplied list and
    # left ``self.categorical_list`` undefined.
    self.categorical_list = [] if categorical_list is None else categorical_list
def __init__(self, estimator: RegressorMixin, max_window_size=100):
    """Wrap a regressor together with a bounded window of past samples.

    Parameters
    ----------
    estimator: RegressorMixin
        The regressor to wrap. Stored as ``self.estimator``.
    max_window_size: int (default=100)
        Stored as ``self.max_window_size`` and used as ``max_size`` of the
        ``InstanceWindow`` created here.

    Raises
    ------
    ValueError
        If ``estimator`` is not an instance of ``RegressorMixin``.
    """
    super().__init__()

    is_regressor = isinstance(estimator, RegressorMixin)
    if not is_regressor:
        message = ("estimator must be a Regressor, "
                   "Call TimeSeriesRegressor with an instance of RegressorMixin")
        raise ValueError(message)

    self.max_window_size = max_window_size
    self.estimator = estimator
    self.window = InstanceWindow(max_size=max_window_size, dtype=float)
    # Flag consulted by the fitting code to recognise the first call.
    self.first_fit = True
def partial_fit(self, X, y, classes=None, weight=None):
    """Partially fit the model and feed the ADWIN change detector.

    Every sample is appended to the window. Once the window holds at least
    ``n_neighbors`` samples, the sample is also predicted and ADWIN receives
    1 for a correct prediction and 0 otherwise; before that ADWIN receives
    0. After all samples are processed, ADWIN is asked whether a change was
    detected, and if so the window is trimmed down to ADWIN's width.

    Parameters
    ----------
    X: Numpy.ndarray of shape (n_samples, n_features)
        The data upon which the algorithm will create its model.

    y: Array-like
        An array-like containing the classification targets for all
        samples in X.

    classes: Not used.

    weight: Not used.

    Returns
    -------
    KNNAdwin
        self
    """
    r, _ = get_dimensions(X)
    if self.window is None:
        self.window = InstanceWindow(max_size=self.max_window_size)

    for i in range(r):
        # The former ``if r > 1 ... else ...`` had two identical branches,
        # so a single call covers both cases.
        self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
        if self.window._num_samples >= self.n_neighbors:
            add = 1 if self.predict(np.asarray([X[i]])) == y[i] else 0
            self.adwin.add_element(add)
        else:
            # Not enough samples to predict yet: report a miss.
            self.adwin.add_element(0)

    if self.window._num_samples >= self.n_neighbors:
        changed = self.adwin.detected_change()
        if changed and self.adwin.width < self.window._num_samples:
            # Delete one element per step until the window is as wide as
            # ADWIN's own window.
            for _ in range(self.window._num_samples, self.adwin.width, -1):
                self.window.delete_element()
    return self
def __init__(self, n_neighbors=5, max_window_size=1000, leaf_size=30, nominal_attributes=None):
    """Set up the classifier's hyper-parameters and its empty sample window.

    Parameters
    ----------
    n_neighbors: int (default=5)
        Stored as ``self.n_neighbors``.
    max_window_size: int (default=1000)
        Stored as ``self.max_window_size`` and used as ``max_size`` of the
        ``InstanceWindow`` that holds the observed samples.
    leaf_size: int (default=30)
        Stored as ``self.leaf_size``.
    nominal_attributes: array-like, optional (default=None)
        Stored unchanged as ``self.nominal_attributes``. A normalised copy
        of the reference, with ``None`` replaced by an empty list, is kept
        in ``self._nominal_attributes``.
    """
    super().__init__()
    self.n_neighbors = n_neighbors
    self.max_window_size = max_window_size
    self.c = 0  # NOTE(review): purpose not visible from this block
    self.window = InstanceWindow(max_size=max_window_size, dtype=float)
    self.first_fit = True
    self.classes = []
    self.leaf_size = leaf_size
    # The public attribute keeps the value exactly as it was passed in.
    self.nominal_attributes = nominal_attributes
    # ``_nominal_attributes`` used to be assigned only when the argument was
    # None, leaving it undefined whenever a list was supplied.
    self._nominal_attributes = [] if nominal_attributes is None else nominal_attributes
def fit(self, X, y, classes=None, weight=None):
    """Fit the model by storing the samples of X and y in the window.

    Any labels given through ``classes`` are merged into ``self.classes``.
    The window is created on demand with ``max_size=self.max_window_size``
    and then every (sample, target) pair is appended to it, one at a time.

    Parameters
    ----------
    X: Numpy.ndarray of shape (n_samples, n_features)
        The data upon which the algorithm will create its model.

    y: Array-like
        An array-like containing the classification targets for all
        samples in X.

    classes: array-like, optional (default=None)
        Labels to merge into the set of known classes.

    weight: Not used.

    Returns
    -------
    KNN
        self
    """
    n_rows, _ = get_dimensions(X)

    if classes is not None:
        known_labels = set().union(self.classes, classes)
        self.classes = list(known_labels)

    if self.window is None:
        self.window = InstanceWindow(max_size=self.max_window_size)

    for row in range(n_rows):
        # The window takes a one-row feature matrix and a 1x1 target matrix.
        features = np.asarray([X[row]])
        target = np.asarray([[y[row]]])
        self.window.add_element(features, target)

    return self
def prepare_post_analysis_req(self, num_features, num_targets, num_classes, target_values, record=False):
    """Receive the dataset description and allocate the chunk storage.

    The dataset information is requested explicitly from the user so that
    it does not have to be a constructor argument nor be asked for at each
    data instance.

    Parameters
    ----------
    num_features: int
        Passed as ``n_features`` to the chunk's ``InstanceWindow``.
    num_targets: int
        How many columns are to be predicted; passed as ``n_targets``.
    num_classes: int
        Number of possible values the target column can take.
    target_values: array-like
        Stored as ``self._target_values``.
    record: bool (default=False)
        When True, the files ``component_predictions.csv``,
        ``ground_truths.csv`` and ``weights.csv`` are created (truncated if
        they exist) and the chunk size is written as the first line of
        ``component_predictions.csv``.
    """
    self._chunk_data = InstanceWindow(n_features=num_features,
                                      n_targets=num_targets,
                                      max_size=self._chunk_size)
    self._num_classes = num_classes
    self._target_values = target_values
    self._record = record

    if self._record:
        # Files that keep record of:
        #  - individual component results for every instance
        #  - ground truths for every instance
        #  - weights at each chunk
        # The context manager closes every handle even if a later open()
        # or the write fails.
        with open("component_predictions.csv", "w+") as f_comp_preds, \
                open("ground_truths.csv", "w+") as f_truths, \
                open("weights.csv", "w+") as f_weights:
            f_comp_preds.write(str(self._chunk_size) + '\n')

        # The (now closed) handles stay reachable under their historical
        # attribute names for backward compatibility.
        self._f_comp_preds = f_comp_preds
        self._f_truths = f_truths
        self._f_weights = f_weights
    return
class Goowe(StreamModel):
    """ GOOWE (Geometrically Optimum Online Weighted Ensemble), as it is
    described in Bonab and Can (2017).

    Common notation in the code is as follows:
        K for maximum number of classifiers in the ensemble.
        N for data instances.
        A, d as they are, in the aforementioned paper.

    Parameters
    ----------
    n_max_components: int
        Ensemble size limit. Maximum number of component classifiers.
    chunk_size: int
        The amount of instances necessary for ensemble to learn concepts from.
        At each chunk_size many instances, some training is done.
    window_size: int
        Size of sliding window, which keeps record of the last k instances
        that are encountered in the data stream. Accepted but not used yet
        (the sliding-window evaluator is still a TODO).
    logging: bool
        When True, component predictions, weights and training progress are
        printed to standard output.

    Notes
    -----
    ``prepare_post_analysis_req`` must be called before ``partial_fit`` or
    ``predict``: it creates the chunk storage and sets the number of
    classes, both of which are ``None`` right after construction.
    """

    def __init__(self, n_max_components: int = 10,
                 chunk_size: int = 500, window_size: int = 100, logging=True):
        super().__init__()
        self._num_of_max_classifiers = n_max_components
        self._chunk_size = chunk_size
        self._Logging = logging
        self._num_of_current_classifiers = 0
        self._num_of_processed_instances = 0
        self._classifiers = np.empty((self._num_of_max_classifiers),
                                     dtype=object)
        self._weights = np.zeros((self._num_of_max_classifiers, ))

        # What to save from current Data Chunk --> will be used for
        # adjusting weights, pruning purposes and so on.
        # Individual predictions of components, overall prediction of
        # ensemble, and ground truth info.
        self._chunk_comp_preds = FastBuffer(max_size=chunk_size)
        self._chunk_ensm_preds = FastBuffer(max_size=chunk_size)

        # chunk_data has instances in the chunk and their ground truth.
        # To be initialized after receiving n_features, n_targets
        self._chunk_data = None

        # Some external information about the data we are dealing with,
        # useful for recording predictions.
        self._num_classes = None
        self._target_values = None      # Required to correctly train HTs
        self._record = False            # Boolean for keeping records to files

        # TODO: Implement Sliding Window Continuous Evaluator, i.e. keep the
        # ensemble predictions and truths of the last window_size instances.

    def prepare_post_analysis_req(self, num_features, num_targets,
                                  num_classes, target_values, record=False):
        """ Receives the dataset description and allocates chunk storage.

        The dataset information is taken explicitly from the user so that
        it is neither a constructor argument nor asked at each instance.

        Parameters
        ----------
        num_features: int
            Passed as ``n_features`` to the chunk's ``InstanceWindow``.
        num_targets: int
            How many columns are to be predicted in the data.
        num_classes: int
            Number of possible values that the target column can have.
        target_values: array-like
            Passed as ``classes`` when the components are trained.
        record: bool
            When True, ``component_predictions.csv``, ``ground_truths.csv``
            and ``weights.csv`` are created (truncated if they exist) and
            the chunk size is written as the first line of
            ``component_predictions.csv``.
        """
        self._chunk_data = InstanceWindow(n_features=num_features,
                                          n_targets=num_targets,
                                          max_size=self._chunk_size)
        self._num_classes = num_classes
        self._target_values = target_values
        self._record = record

        if self._record:
            # The context manager closes every handle even when a later
            # open() or the write fails.
            with open("component_predictions.csv", "w+") as f_comp_preds, \
                    open("ground_truths.csv", "w+") as f_truths, \
                    open("weights.csv", "w+") as f_weights:
                f_comp_preds.write(str(self._chunk_size) + '\n')

            # The (now closed) handles stay reachable under their
            # historical attribute names for backward compatibility.
            self._f_comp_preds = f_comp_preds
            self._f_truths = f_truths
            self._f_weights = f_weights
        return

    def _get_components_predictions_for_instance(self, inst):
        """ For a given data instance, takes predictions of
        individual components from the ensemble as a matrix.

        Parameters
        ----------
        inst: data instance for which votes of components are delivered.
            It is handed unchanged to each component's ``predict_proba``.

        Returns
        ----------
        numpy.array
            A 2-d numpy array where each row corresponds to predictions of
            each classifier (shape: current classifiers x num_classes).
        """
        preds = np.zeros((self._num_of_current_classifiers,
                          self._num_classes))
        for k in range(len(preds)):
            kth_comp_pred = self._classifiers[k].predict_proba(inst)
            # Only the first row of the component's answer is used.
            preds[k, :] = kth_comp_pred[0]
        if self._Logging:
            print('Component Predictions:')
            print(preds)
        return preds

    def _adjust_weights(self):
        """ Weight adustment by solving linear least squares, as it is
        described in Bonab and Can (2017).

        For every instance of the chunk, A accumulates the products of the
        component score vectors (S_i x S_j) and d accumulates, per
        component, the score that component gave to the true class
        (S_i x O). The system Aw = d is then solved in the least-squares
        sense and w is written into the ensemble weights.
        """
        n_current = self._num_of_current_classifiers
        A = np.zeros(shape=(n_current, n_current))
        d = np.zeros(shape=(n_current, ))

        # Flattened exactly like the truths used for training in
        # _process_chunk, so each class index is a plain integer.
        y_all = self._chunk_data.get_targets_matrix().astype(int).flatten()
        for class_index in y_all:
            comp_preds = self._chunk_comp_preds.get_next_element()
            A = A + comp_preds.dot(comp_preds.T)
            # Column ``class_index`` holds every component's score for the
            # true class. The previous ``comp_preds[0][class_index]`` took
            # only the first component's score and broadcast it to all
            # entries of d.
            d = d + comp_preds[:, class_index]

        # A and d are filled. Now, the linear system Aw=d to be solved
        # to get our desired weights. w is of size K.
        w = np.linalg.lstsq(A, d, rcond=None)[0]

        # _weights has maximum size but what we found can be
        # smaller. Therefore, need to put the values of w to global weights
        if n_current < self._num_of_max_classifiers:
            for i in range(len(w)):
                self._weights[i] = w[i]
        else:                               # If full size, there is no problem.
            self._weights = w
        return

    def _normalize_weights(self):
        """ Normalizes the weights of the ensemble to the [0, 1] range.
        Performs (x_i - min(x)) / (max(x) - min(x)) on the weights of the
        current classifiers. When all of them are equal, every weight is
        set to 1 / (number of current classifiers) instead.
        """
        n_current = self._num_of_current_classifiers
        active = self._weights[:n_current]
        # Local names avoid shadowing the builtins min() and max().
        lowest = np.amin(active)
        highest = np.amax(active)

        if lowest == highest:               # all weights are the same
            self._weights[:n_current] = 1. / n_current
        else:
            self._weights[:n_current] = (active - lowest) / (highest - lowest)
        return

    def _normalize_weights_softmax(self):
        """ Normalizes the weights of the current classifiers with softmax:
        w_i <- exp(w_i) / sum_j exp(w_j), so they are positive and sum to 1.
        Does nothing when the ensemble has no classifier yet.
        """
        n_current = self._num_of_current_classifiers
        if n_current == 0:
            return
        cur_weights = self._weights[:n_current]
        # Subtracting the maximum leaves the softmax value unchanged but
        # keeps exp() from overflowing for large weights.
        exp_weights = np.exp(cur_weights - np.max(cur_weights))
        self._weights[:n_current] = exp_weights / np.sum(exp_weights)
        return

    def _process_chunk(self):
        """ A subroutine that runs at the end of each chunk, allowing
        the components to be trained and ensemble weights to be adjusted.
        Until the first _process_chunk call, the ensemble is not yet ready.
        At first call, the first component is learned.
        At the rest of the calls, new components are formed, and the older
        ones are trained by the given chunk.
        If the ensemble size is reached, then the lowest weighted component
        is removed from the ensemble.
        """
        new_clf = HoeffdingTree()  # with default parameters for now
        new_clf.reset()

        # Save records of previous chunk
        if self._record and self._num_of_current_classifiers > 0:
            self._record_truths_this_chunk()
            self._record_comp_preds_this_chunk()
            self._record_weights_this_chunk()

        # Case 1: No classifier in the ensemble yet, first chunk:
        if self._num_of_current_classifiers == 0:
            self._classifiers[0] = new_clf
            self._weights[0] = 1.0  # weight is 1 for the first clf
            self._num_of_current_classifiers += 1
        else:
            # First, adjust the weights of the old component classifiers
            # according to what happened in this chunk.
            self._adjust_weights()

            # Case 2: There are classifiers in the ensemble but
            # the ensemble size is still not capped.
            if self._num_of_current_classifiers < self._num_of_max_classifiers:
                # Put the new classifier to ensemble with the weight of 1
                self._classifiers[self._num_of_current_classifiers] = new_clf
                self._weights[self._num_of_current_classifiers] = float(1.0)
                self._num_of_current_classifiers += 1

            # Case 3: Ensemble size is capped. Need to replace the component
            # with lowest weight.
            else:
                assert (self._num_of_current_classifiers
                        == self._num_of_max_classifiers), "Ensemble not full."
                index_of_lowest_weight = np.argmin(self._weights)
                self._classifiers[index_of_lowest_weight] = new_clf
                self._weights[index_of_lowest_weight] = 1.0

            # Normalizing weigths to simplify numbers
            self._normalize_weights_softmax()
            if self._Logging:
                print("After normalization weights: ")
                print(self._weights)

        # Ensemble maintenance is done. Now train all classifiers
        # in the ensemble from the current chunk.
        # Can be parallelized.
        data_features = self._chunk_data.get_attributes_matrix()
        data_truths = self._chunk_data.get_targets_matrix()
        data_truths = data_truths.astype(int).flatten()

        if self._Logging:
            print("Starting training the components with the current chunk...")
        for k in range(self._num_of_current_classifiers):
            if self._Logging:
                print("Training classifier {}".format(k))
            self._classifiers[k].partial_fit(data_features, data_truths,
                                             classes=self._target_values)
        if self._Logging:
            print("Training the components with the current chunk completed...")
        return

    def _record_truths_this_chunk(self):
        """ Appends the ground truths of the current chunk, as one
        comma-separated line, to ``ground_truths.csv``.
        """
        data_truths = self._chunk_data.get_targets_matrix()
        data_truths = data_truths.astype(int).flatten()

        with open("ground_truths.csv", "ab") as f:
            # Default behaviour is to store list of lists for savetxt.
            # Hence, to prevent newline after each element of list, we
            # surround the truth array with one more set of brackets.
            np.savetxt(f, [data_truths], delimiter=",", fmt='%d')
        return

    def _record_comp_preds_this_chunk(self):
        """ Appends the number of current classifiers, followed by the
        buffered component predictions, to ``component_predictions.csv``.
        """
        comp_preds = np.array(self._chunk_comp_preds.get_queue())

        with open("component_predictions.csv", "a+") as f:
            np.savetxt(f, [self._num_of_current_classifiers], fmt='%d')
            for instance_preds in comp_preds:
                np.savetxt(f, instance_preds, delimiter=',', fmt='%1.5f')
        return

    def _record_weights_this_chunk(self):
        """ Appends the number of current classifiers, followed by the whole
        weight vector as one line, to ``weights.csv``.
        """
        with open("weights.csv", "a+") as f:
            np.savetxt(f, [self._num_of_current_classifiers], fmt='%d')
            np.savetxt(f, [self._weights], delimiter=',', fmt='%1.5f')
        return

    # --------------------------------------------------
    # Overridden methods from the parent (StreamModel)
    # --------------------------------------------------
    def fit(self, X, y, classes=None, weight=None):
        """ Not available: only the stream version is implemented.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError("For now, only the stream version "
                                  "is implemented. Use partial_fit()")

    def _store_instance(self, X_i, y_i):
        """ Adds one instance to the current chunk and, when the number of
        processed instances reaches a multiple of the chunk size, trains
        the components and adjusts the weights.
        """
        self._chunk_data.add_element(X_i, y_i)
        self._num_of_processed_instances += 1

        # If at the end of a chunk, start training components
        # and adjusting weights using information in this chunk.
        if self._num_of_processed_instances % self._chunk_size == 0:
            print("Instance {}".format(self._num_of_processed_instances))
            self._process_chunk()

    def partial_fit(self, X, y, classes=None, weight=None):
        """ Stores the given instance(s) in the current chunk; training
        happens whenever a chunk is completed.

        Works with individual instances as well as with a bunch of
        instances, since there can be pre-training for warm start.

        Parameters
        ----------
        X: array-like
            One instance (``len(X) == 1``) or several instances.
        y: array-like
            Ground truths of the instances in X.
        classes: Not used.
        weight: Not used.

        Returns
        -------
        None
        """
        n_instances = len(X)
        if n_instances == 1:
            # y is required to be 1x1, and hence the square brackets.
            self._store_instance(X, np.array([y]))
        elif n_instances > 1:
            # Input is a chunk. Add them individually.
            for i in range(n_instances):
                self._store_instance(np.array([X[i]]), np.array([[y[i]]]))
        else:
            print("Something wrong with the data...")
            print("len(X) is: {}".format(n_instances))
        return

    def predict(self, X):
        """ For a given data instance, yields the prediction values.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            Samples for which we want to predict the labels.

        Returns
        -------
        numpy.array
            Predicted labels for all instances in X.
        """
        predictions = []
        for i in range(len(X)):
            # Every row is handed over as a one-row batch, which is what
            # the single-instance path always did. The multi-instance path
            # used to pass the bare row X[i] instead.
            relevance_scores = self.predict_proba(X[i:i + 1])
            predictions.append(np.argmax(relevance_scores))
        if self._Logging:
            print('Ensemble Prediction:')
            print(np.array(predictions))
        return np.array(predictions)

    def predict_proba(self, X):
        """ For a given data instance, takes WEIGHTED combination
        of components to get relevance scores for each class.

        The component predictions and the ensemble vote are also pushed
        into the chunk buffers for later weight adjustment and analysis.

        Parameters
        ----------
        X: data instance for which weighted combination is delivered.

        Returns
        ----------
        numpy.array
            A vector with number_of_classes elements where each element
            represents class score of corresponding class for this instance.
        """
        # get only the useful weights
        weights = np.array(self._weights)[:self._num_of_current_classifiers]
        components_preds = self._get_components_predictions_for_instance(X)

        # Save individual component predictions and ensemble prediction
        # for later analysis.
        self._chunk_comp_preds.add_element([components_preds])

        weighted_ensemble_vote = np.dot(weights, components_preds)
        self._chunk_ensm_preds.add_element(weighted_ensemble_vote)
        return weighted_ensemble_vote

    def reset(self):
        """ Not implemented; does nothing. """
        pass

    def score(self, X, y):
        """ Not implemented; does nothing. """
        pass

    def get_info(self):
        """ Returns a one-line description of the ensemble configuration. """
        parts = [
            'The Ensemble GOOWE (Bonab and Can, 2017) with',
            ' - n_max_components: ', str(self._num_of_max_classifiers),
            ' - num_of_current_components: ',
            str(self._num_of_current_classifiers),
            ' - chunk_size: ', str(self._chunk_size),
            ' - num_dimensions_in_label_space(num_classes): ',
            str(self._num_classes),
            ' - recording: ', str(self._record),
        ]
        return ''.join(parts)

    def get_class_type(self):
        """ Not implemented; does nothing. """
        pass

    # Some getters and setters..
    def get_number_of_current_classifiers(self):
        """ Returns the number of classifiers currently in the ensemble. """
        return self._num_of_current_classifiers

    def get_number_of_max_classifiers(self):
        """ Returns the ensemble size limit. """
        return self._num_of_max_classifiers

    # Helper methods for GooweMS
    def get_classifiers(self):
        """ Returns the array holding the component classifiers. """
        return self._classifiers

    def set_classifiers(self, classifiers):
        """ Replaces the array holding the component classifiers. """
        self._classifiers = classifiers

    def get_weights(self):
        """ Returns the weight vector of the ensemble. """
        return self._weights
class KNNAdwin(KNN):
    """ K-Nearest Neighbors Classifier with ADWIN Change detector

    A KNN classifier whose sample window is regulated by an ADWIN change
    detector. The detector is updated on every partial_fit call with the
    correctness of the classifier's own predictions; when it reports a
    change, the oldest samples are dropped so that the window becomes as
    wide as ADWIN's window. Compared with the base KNN class, the window is
    therefore of variable size instead of fixed size.

    Parameters
    ----------
    n_neighbors: int
        The number of nearest neighbors to search for.

    max_window_size: int
        The maximum size of the window storing the last viewed samples.

    leaf_size: int
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm will switch for a
        brute-force approach. The bigger this number the faster the tree
        construction time, but the slower the query time will be.

    categorical_list: An array-like
        Each entry is the index of a categorical feature. May be requested
        further filtering.

    Raises
    ------
    NotImplementedError: A few of the functions described here are not
    implemented since they have no application in this context.

    ValueError: A ValueError is raised if the predict function is called
    before at least k samples have been analyzed by the algorithm.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.lazy.knn_adwin import KNNAdwin
    >>> from skmultiflow.data.file_stream import FileStream
    >>> # Setting up the stream
    >>> stream = FileStream('skmultiflow/data/datasets/covtype.csv')
    >>> stream.prepare_for_use()
    >>> # Setting up the KNNAdwin classifier
    >>> knn_adwin = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=2000)
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_sample(200)
    >>> knn_adwin = knn_adwin.partial_fit(X, y)
    >>> # Keeping track of sample count and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_sample()
    ...     pred = knn_adwin.predict(X)
    ...     if y[0] == pred[0]:
    ...         corrects += 1
    ...     knn_adwin = knn_adwin.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying the results
    >>> print('KNN usage example')
    >>> print(str(n_samples) + ' samples analyzed.')
    5000 samples analyzed.
    >>> print("KNNAdwin's performance: " + str(corrects/n_samples))
    KNNAdwin's performance: 0.7798

    """

    def __init__(self, n_neighbors=5, max_window_size=sys.maxsize, leaf_size=30, categorical_list=None):
        super().__init__(
            n_neighbors=n_neighbors,
            max_window_size=max_window_size,
            leaf_size=leaf_size,
            categorical_list=categorical_list,
        )
        self.adwin = ADWIN()
        # The window is created lazily by the first partial_fit call.
        self.window = None

    def reset(self):
        """ Replace the ADWIN detector by a new one, then reset the base
        model through the KNN base class.

        Returns
        -------
        KNNAdwin
            Whatever the base class' reset returns.

        """
        self.adwin = ADWIN()
        return super().reset()

    def fit(self, X, y, classes=None, weights=None):
        """ Delegate to partial_fit with the same arguments.

        Returns
        -------
        KNNAdwin
            self

        """
        self.partial_fit(X, y, classes, weights)
        return self

    def partial_fit(self, X, y, classes=None, weight=None):
        """ Partially fit the model and update the ADWIN detector.

        Each sample is appended to the window. While the window holds fewer
        than n_neighbors samples ADWIN receives 0; afterwards it receives 1
        when the sample is predicted correctly and 0 otherwise. Once all
        samples are in, a detected change causes the window to be shortened
        to ADWIN's width.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.

        y: Array-like
            An array-like containing the classification targets for all
            samples in X.

        classes: Not used.

        weight: Not used.

        Returns
        -------
        KNNAdwin
            self

        """
        n_rows, _ = get_dimensions(X)
        if self.window is None:
            self.window = InstanceWindow(max_size=self.max_window_size)

        for row in range(n_rows):
            self.window.add_element(np.asarray([X[row]]), np.asarray([[y[row]]]))
            if self.window.n_samples < self.n_neighbors:
                # Too few samples to query the neighbours: count as a miss.
                self.adwin.add_element(0)
                continue
            hit = 1 if self.predict(np.asarray([X[row]])) == y[row] else 0
            self.adwin.add_element(hit)

        if self.window.n_samples < self.n_neighbors:
            return self
        if not self.adwin.detected_change():
            return self

        if self.adwin.width < self.window.n_samples:
            # One deletion per step from the current size down to the width.
            for _ in range(self.window.n_samples, self.adwin.width, -1):
                self.window.delete_element()
        return self

    def get_info(self):
        """ Return a one-line description of the classifier's settings. """
        pieces = [
            '{}:'.format(type(self).__name__),
            ' - n_neighbors: {}'.format(self.n_neighbors),
            ' - max_window_size: {}'.format(self.max_window_size),
            ' - leaf_size: {}'.format(self.leaf_size),
        ]
        return ''.join(pieces)
class KNNClassifier(BaseSKMObject, ClassifierMixin):
    """ K-Nearest Neighbors classifier.

    This is a non-parametric classification method. It keeps track of the
    last max_window_size training samples. Whenever a query is executed, the
    stored samples are searched for the n_neighbors closest ones and every
    neighbor casts an equal vote for its own label.

    To store the samples, while reducing search times, a KD Tree (a K
    Dimensional Tree) is built from the window at query time. The standard
    scikit-learn KDTree with the euclidean metric is used.

    Parameters
    ----------
    n_neighbors: int (default=5)
        The number of nearest neighbors to search for.

    max_window_size: int (default=1000)
        The maximum size of the window storing the last viewed samples.

    leaf_size: int (default=30)
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm will switch for a
        brute-force approach. The bigger this number the faster the tree
        construction time, but the slower the query time will be.

    nominal_attributes: numpy.ndarray (optional, default=None)
        List of Nominal attributes. If empty, then assume that all
        attributes are numerical. Only stored; the scikit-learn KDTree used
        for the search does not consume it.

    Raises
    ------
    NotImplementedError: A few of the functions described here are not
    implemented since they have no application in this context.

    ValueError: A ValueError is raised if the predict function is called
    before at least n_neighbors samples have been analyzed by the algorithm.

    Notes
    -----
    This classifier is not optimal for a mixture of categorical and
    numerical features. To use the project's own KDTree implementation,
    which accepts the nominal attributes, see the comment in this class'
    private function __predict_proba.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.lazy import KNNClassifier
    >>> from skmultiflow.data import SEAGenerator
    >>> # Setting up the stream
    >>> stream = SEAGenerator(random_state=1, noise_percentage=.1)
    >>> stream.prepare_for_use()
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_sample(200)
    >>> knn = KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40)
    >>> knn = knn.partial_fit(X, y)
    >>> # Preparing the processing of 5000 samples and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_sample()
    ...     my_pred = knn.predict(X)
    ...     if y[0] == my_pred[0]:
    ...         corrects += 1
    ...     knn = knn.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying results
    >>> print('KNNClassifier usage example')
    >>> print('{} samples analyzed.'.format(n_samples))
    5000 samples analyzed.
    >>> print("KNNClassifier's performance: {}".format(corrects/n_samples))
    KNNClassifier's performance: 0.8788

    """

    def __init__(self, n_neighbors=5, max_window_size=1000, leaf_size=30, nominal_attributes=None):
        super().__init__()
        self.n_neighbors = n_neighbors
        self.max_window_size = max_window_size
        self.c = 0  # NOTE(review): not read anywhere in this class
        self.window = InstanceWindow(max_size=max_window_size, dtype=float)
        self.first_fit = True
        self.classes = []
        self.leaf_size = leaf_size
        # The public attribute keeps the value exactly as it was passed in.
        self.nominal_attributes = nominal_attributes
        # ``_nominal_attributes`` used to be assigned only when the argument
        # was None, leaving it undefined whenever a list was supplied.
        self._nominal_attributes = [] if nominal_attributes is None else nominal_attributes

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Partially fits the model on the samples X and corresponding
        targets y.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.

        y: Array-like
            An array-like containing the classification targets for all
            samples in X.

        classes: numpy.ndarray, optional (default=None)
            Array with all possible/known classes. Merged into the set of
            classes already known to the model.

        sample_weight: Not used.

        Returns
        -------
        KNNClassifier
            self

        Notes
        -----
        For the K-Nearest Neighbors Classifier, fitting the model is the
        equivalent of inserting the newer samples in the observed window,
        and if the size_limit is reached, removing older results. To store
        the viewed samples we use a InstanceWindow object. For this class'
        documentation please visit skmultiflow.core.utils.data_structures

        """
        r, _ = get_dimensions(X)

        if classes is not None:
            self.classes = list(set().union(self.classes, classes))

        for i in range(r):
            # One-row feature matrix and 1x1 target matrix per sample.
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
        return self

    def reset(self):
        """ Resets the sample window.

        Returns
        -------
        KNNClassifier
            self

        """
        self.window.reset()
        return self

    def predict(self, X):
        """ Predicts the label of the X sample, by searching the KDTree for
        the n_neighbors-Nearest Neighbors.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the label for.

        Returns
        -------
        numpy.ndarray
            An array containing the predicted labels for all instances in X.

        Raises
        ------
        ValueError: Propagated from predict_proba when fewer than
        n_neighbors samples have been observed.

        """
        r, _ = get_dimensions(X)
        proba = self.predict_proba(X)
        # The label is the index of the highest vote share.
        return np.array([np.argmax(proba[i]) for i in range(r)])

    def predict_proba(self, X):
        """ Calculates the probability of each sample in X belonging to each
        of the labels, based on the knn algorithm.

        Labels found in the window are merged into the known classes as a
        side effect.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)

        Raises
        ------
        ValueError: If there is an attempt to call this function before, at
        least, n_neighbors samples have been analyzed by the learner, a
        ValueError is raised.

        Returns
        -------
        numpy.ndarray
            An array with one row per sample of X. Row [i] contains
            max(known classes) + 1 elements; element [j] is the fraction of
            the i-th sample's nearest neighbors whose label is j.

        """
        if self.window is None or self.window.n_samples < self.n_neighbors:
            raise ValueError(
                "KNNClassifier must be (partially) fitted on n_neighbors "
                "samples before doing any prediction."
            )

        r, _ = get_dimensions(X)

        # Fetched once; it used to be requested again for every neighbor of
        # every sample although the window does not change during a query.
        targets = self.window.get_targets_matrix()
        self.classes = list(set().union(self.classes, np.unique(targets)))

        _, new_ind = self.__predict_proba(X)

        # Labels are used directly as indexes into the vote vector.
        n_labels = int(max(self.classes) + 1)

        proba = []
        for i in range(r):
            votes = [0.0] * n_labels
            neighbors = new_ind[i]
            for index in neighbors:
                votes[int(targets[index])] += 1. / len(neighbors)
            proba.append(votes)
        return np.array(proba)

    def __predict_proba(self, X):
        """ Private implementation of the predict_proba method.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)

        Returns
        -------
        tuple list
            One list with the k-nearest neighbor's distances and another
            one with their indexes.

        """
        # To use our own KDTree implementation please replace it as follows
        # tree = KDTree(self.window.get_attributes_matrix(), metric='euclidean',
        #              nominal_attributes=self._nominal_attributes, return_distance=True)

        tree = sk.KDTree(self.window.get_attributes_matrix(), self.leaf_size, metric='euclidean')
        dist, ind = tree.query(np.asarray(X), k=self.n_neighbors)
        return dist, ind
class TimeSeriesRegressor(BaseSKMObject, RegressorMixin):
    """ Sliding-window wrapper that feeds past samples to a regressor.

    The last ``max_window_size`` samples are stored in an InstanceWindow.
    Whenever the window is full, its attribute matrix is flattened into a
    single row and handed to the wrapped estimator as the feature vector.

    Parameters
    ----------
    estimator: RegressorMixin
        The regressor that is trained on (and queried with) the flattened
        window.

    max_window_size: int (default=100)
        The maximum size of the window storing the last viewed samples.

    Raises
    ------
    ValueError: If ``estimator`` is not an instance of RegressorMixin.
    """

    def __init__(self, estimator: RegressorMixin, max_window_size=100):
        super().__init__()
        if not isinstance(estimator, RegressorMixin):
            raise ValueError(
                "estimator must be a Regressor, "
                "Call TimeSeriesRegressor with an instance of RegressorMixin")
        self.max_window_size = max_window_size
        self.estimator = estimator
        self.window = InstanceWindow(max_size=max_window_size, dtype=float)
        self.first_fit = True

    def partial_fit(self, X, y=None, sample_weight=None):
        """ Partially fits the model on the samples X and corresponding targets y.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.
            If y is not provided, the value X[t+1] is used as the target
            for X[t].

        y: numpy.ndarray, optional
            An array-like containing the targets for all samples in X.
            It must have the same number of rows as X.

        sample_weight: optional
            Forwarded unchanged to the wrapped estimator's ``partial_fit``.

        Raises
        ------
        ValueError: If X and y have a different number of rows, or if the
            first call provides ``max_window_size`` samples or fewer.

        Returns
        -------
        TimeSeriesRegressor
            self

        Notes
        -----
        For the TimeSeries Regressor, fitting the model is the equivalent of
        inserting the newer samples in the observed window, and if the
        max_window_size is reached, removing older results and then using the
        max_window_size past X values to predict future X values by feeding
        them as features to the provided model.

        When y is omitted, targets are built only from consecutive rows of
        the same batch; no pair is formed between the last row of one call
        and the first row of the next.

        To store the viewed samples we use an InstanceWindow object. For this
        class' documentation please visit
        skmultiflow.core.utils.data_structures

        """
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        r = X.shape[0]

        # y is optional: only inspect its shape once we know it was given.
        if y is not None:
            y = np.asarray(y)
            if y.ndim == 1:
                y = y.reshape(-1, 1)
            if r != y.shape[0]:
                raise ValueError(
                    "Batch size of X is different from the number of attributes in y "
                    "Batch size of must be the same for X and y")

        if self.first_fit and r <= self.max_window_size:
            raise ValueError(
                "Number of elments of First call to partial_fit less than max_window_size "
                "Call partial_fit with more than {} elements".format(
                    self.max_window_size))

        for i in range(r):
            if y is not None:
                self.window.add_element(np.asarray([X[i]]), np.asarray([y[i]]))
            elif i > 0:
                self.window.add_element(np.asarray([X[i - 1]]),
                                        np.asarray([X[i]]))
            else:
                # Without y, the first row has no predecessor in this batch,
                # so nothing was added and there is nothing new to learn.
                continue

            if self.max_window_size == self.window.n_samples:
                # Features: the whole window flattened into one row.
                # Target: the most recently stored target.
                self.estimator.partial_fit(
                    self.window.get_attributes_matrix().reshape((1, -1)),
                    self.window.get_targets_matrix()[-1].reshape((1, -1)),
                    sample_weight=sample_weight)

        self.first_fit = False
        return self

    def reset(self):
        """ Resets the sample window and the wrapped estimator.

        Returns
        -------
        TimeSeriesRegressor
            self
        """
        # NOTE(review): first_fit is not restored here, so the minimum batch
        # size check of partial_fit is not re-armed after a reset - confirm
        # whether that is intended.
        self.window.reset()
        self.estimator.reset()
        return self

    def clone_window(self):
        """ Returns a copy of the current window.

        The buffer is copied so that samples appended to the clone (as done
        by predict and forcast) do not alter the fitted window.

        Returns
        -------
        InstanceWindow
            A new window with the same configuration, buffer content and
            sample count as ``self.window``.
        """
        window = InstanceWindow(
            n_features=self.window.n_attributes,
            n_targets=self.window.n_targets,
            categorical_list=self.window.categorical_attributes,
            max_size=self.window.max_size)
        window._buffer = np.array(self.window._buffer)
        window._n_samples = self.window._n_samples
        return window

    @staticmethod
    def _first_prediction(pred):
        """ Reduces a one-row estimator output to a scalar or a 1-D row.

        A single-valued output (shape (1,) or (1, 1)) becomes a scalar; a
        multi-target output of shape (1, n_targets) becomes its first row.
        """
        pred = np.asarray(pred)
        if pred.size == 1:
            return pred.reshape(-1)[0]
        return pred[0]

    def predict(self, X):
        """ Predicts the next value for all values in X.

        The estimator considers X[0] as the value coming exactly after the
        last partially fit value.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the next value for.

        Returns
        -------
        numpy.ndarray
            The predicted values for all instances in X. The array is empty
            when the fitted window does not yet hold max_window_size samples.

        """
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        predictions = []
        # The estimator is only trained once the fitted window is full, so
        # no prediction is produced before that point.
        if self.max_window_size != self.window.n_samples:
            return np.array(predictions)

        # Work on a copy so that predicting does not alter the fitted window.
        window = self.clone_window()
        for sample in X:
            window.add_element(np.asarray([sample]), np.asarray([sample]))
            pred = self.estimator.predict(
                window.get_attributes_matrix().reshape((1, -1)))
            predictions.append(self._first_prediction(pred))
        return np.array(predictions)

    def predict_proba(self, X):
        """ Method not implemented for this Estimator """
        raise NotImplementedError

    def forcast(self, X, n_steps):
        """ Predicts the next n_steps values coming after all values in X.

        The estimator considers X[0] as the value coming exactly after the
        last partially fit value.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The samples that precede the values to forecast.

        n_steps: int
            The number of values to forecast.

        Returns
        -------
        numpy.ndarray
            The n_steps predicted values coming after the values in X.

        """
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        # Work on a copy so that forecasting does not alter the fitted window.
        window = self.clone_window()
        for sample in X:
            window.add_element(np.asarray([sample]), np.asarray([sample]))

        forecasts = []
        for _ in range(n_steps):
            next_element = np.asarray(self.estimator.predict(
                window.get_attributes_matrix().reshape((1, -1))))
            # Each forecast is fed back into the window so that the next
            # step is predicted from it.
            row = next_element.reshape((1, -1))
            window.add_element(row, row)
            forecasts.append(self._first_prediction(next_element))
        return np.asarray(forecasts)
class KNNClassifier(BaseSKMObject, ClassifierMixin):
    """ k-Nearest Neighbors classifier.

    This non-parametric classification method keeps a data window with the
    last max_window_size training samples. The predicted class-label for a
    given query sample is obtained in two steps: first, find the closest
    n_neighbors to the query sample in the data window. Second, aggregate
    the class-labels of the n_neighbors to define the predicted class for
    the query sample.

    Parameters
    ----------
    n_neighbors: int (default=5)
        The number of nearest neighbors to search for.

    max_window_size: int (default=1000)
        The maximum size of the window storing the last viewed samples.

    leaf_size: int (default=30)
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm will switch for a
        brute-force approach. The bigger this number the faster the tree
        construction time, but the slower the query time will be.

    nominal_attributes: numpy.ndarray (optional, default=None)
        List of Nominal attributes. If empty, then assume that all
        attributes are numerical.

    Notes
    -----
    This classifier is not optimal for a mixture of categorical and
    numerical features.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.lazy import KNNClassifier
    >>> from skmultiflow.data import SEAGenerator
    >>> # Setting up the stream
    >>> stream = SEAGenerator(random_state=1, noise_percentage=.1)
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_sample(200)
    >>> knn = KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40)
    >>> knn.partial_fit(X, y)
    >>> # Preparing the processing of 5000 samples and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_sample()
    ...     my_pred = knn.predict(X)
    ...     if y[0] == my_pred[0]:
    ...         corrects += 1
    ...     knn = knn.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying results
    >>> print('KNNClassifier usage example')
    KNNClassifier usage example
    >>> print('{} samples analyzed.'.format(n_samples))
    5000 samples analyzed.
    >>> print("KNNClassifier's performance: {}".format(corrects/n_samples))
    KNNClassifier's performance: 0.8788

    """

    def __init__(self, n_neighbors=5, max_window_size=1000, leaf_size=30,
                 nominal_attributes=None):
        super().__init__()
        self.n_neighbors = n_neighbors
        self.max_window_size = max_window_size
        self.c = 0
        self.window = InstanceWindow(max_size=max_window_size, dtype=float)
        self.first_fit = True
        self.classes = []
        self.leaf_size = leaf_size
        self.nominal_attributes = nominal_attributes
        # Always define the private attribute, so it exists whether or not
        # nominal attributes were supplied.
        self._nominal_attributes = (
            [] if nominal_attributes is None else nominal_attributes)

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Partially fits the model on the samples X and corresponding targets y.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.

        y: Array-like
            An array-like containing the classification targets for all
            samples in X.

        classes: numpy.ndarray, optional (default=None)
            Array with all possible/known classes.

        sample_weight: Not used.

        Returns
        -------
        KNNClassifier
            self

        Notes
        -----
        For the K-Nearest Neighbors Classifier, fitting the model is the
        equivalent of inserting the newer samples in the observed window,
        and if the size_limit is reached, removing older results. To store
        the viewed samples we use an InstanceWindow object. For this class'
        documentation please visit skmultiflow.core.utils.data_structures

        """
        r, _ = get_dimensions(X)

        if classes is not None:
            self.classes = list(set().union(self.classes, classes))

        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
        return self

    def reset(self):
        """ Empties the sample window.

        Returns
        -------
        KNNClassifier
            self
        """
        self.window.reset()
        return self

    def predict(self, X):
        """ Predicts the class label of the X sample.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the label for.

        Returns
        -------
        numpy.ndarray
            The predicted labels for all instances in X. Each label is the
            column index with the highest value in predict_proba.

        """
        proba = self.predict_proba(X)
        return np.array([np.argmax(row) for row in proba])

    def predict_proba(self, X):
        """ Estimates the probability of each sample in X belonging to each of the class-labels.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray
            An array of shape (n_samples, n_classes), where n_classes is
            int(max(self.classes)) + 1 and column j holds the fraction of
            the n_neighbors nearest stored samples labelled j.
            While fewer than n_neighbors samples are stored, an all-zero
            array of shape (n_samples, 1) is returned instead.

        """
        r, _ = get_dimensions(X)
        if self.window is None or self.window.n_samples < self.n_neighbors:
            # The model is empty, defaulting to zero
            return np.zeros(shape=(r, 1))

        # Fetch the stored targets once and flatten them, so every label
        # below is a scalar rather than a one-element array.
        labels = np.ravel(self.window.get_targets_matrix())
        self.classes = list(set().union(self.classes, np.unique(labels)))
        # Labels are used directly as column indexes of the vote vector.
        n_votes = int(max(self.classes) + 1)

        _, new_ind = self.__predict_proba(X)
        proba = []
        for i in range(r):
            neighbors = new_ind[i]
            weight = 1. / len(neighbors)
            votes = [0.0] * n_votes
            for index in neighbors:
                votes[int(labels[index])] += weight
            proba.append(votes)
        return np.asarray(proba)

    def __predict_proba(self, X):
        """ Private implementation of the predict_proba method.

        Builds a KD-tree over the samples currently stored in the window
        and queries it for the n_neighbors nearest stored samples.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)

        Returns
        -------
        tuple
            The k-nearest neighbors' distances and their indexes in the
            window, as returned by the tree query.

        """
        # To use our own KDTree implementation please replace it as follows
        # tree = KDTree(self.window.get_attributes_matrix(), metric='euclidean',
        #              nominal_attributes=self._nominal_attributes, return_distance=True)

        tree = sk.KDTree(self.window.get_attributes_matrix(), self.leaf_size,
                         metric='euclidean')
        dist, ind = tree.query(np.asarray(X), k=self.n_neighbors)
        return dist, ind