# Shared imports for the classifier variants in this file. The skmultiflow
# paths follow the module references in the docstrings further down and may
# need adjusting for the installed skmultiflow version.
import numpy as np
from numpy import zeros
from collections import Counter
from sklearn.tree import DecisionTreeClassifier
from skmultiflow.core.utils.data_structures import InstanceWindow
from skmultiflow.core.utils.utils import get_dimensions


class BatchClassifier:

    def __init__(self, window_size=100, max_models=10):
        self.H = []                 # the ensemble of trained trees
        self.h = None               # the most recently trained tree
        self.window_size = window_size
        self.window = InstanceWindow(max_size=window_size, dtype=float)
        self.max_models = max_models

    def partial_fit(self, X, y=None, classes=None):
        r, c = get_dimensions(X)
        if self.window is None:
            self.window = InstanceWindow(max_size=self.window_size, dtype=float)
        if not self.H:
            # First call: slice the pretraining set into window-sized batches
            # (debut/fin mark the start/end of the current slice) and train
            # one tree per batch, up to max_models trees
            debut = 0
            fin = self.window_size
            modeles = 0
            while modeles < self.max_models and debut < r:
                X_batch = X[debut:fin, :]
                y_batch = y[debut:fin]
                debut += self.window_size
                fin += self.window_size
                self.h = DecisionTreeClassifier()
                self.h.fit(X_batch, y_batch)
                self.H.append(self.h)   # append the new tree to the ensemble
                modeles += 1
        else:
            # Later calls: push the new samples through the sliding window
            # and train one new tree on its current contents
            for i in range(r):
                self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
            self.h = DecisionTreeClassifier()
            self.h.fit(self.window.get_attributes_matrix(),
                       self.window.get_targets_matrix())
            if len(self.H) == self.max_models:
                self.H.pop(0)           # drop the oldest tree to respect the cap
            self.H.append(self.h)       # append the new tree to the ensemble
        return self

    def predict(self, X):
        N, _ = X.shape
        predictions = []
        y = []
        for h in self.H:
            y.append(h.predict(X))
        for i in range(N):
            votes = Counter([j[i] for j in y])
            if not votes:
                # Ensemble is empty: default to class zero
                predictions.append(0)
            else:
                # Majority vote over the ensemble
                predictions.append(max(votes, key=votes.get))
        return predictions
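# ---------------------------------------------------------------------------
# Usage sketch for the variant above (not part of the original code). The
# synthetic data and decision rule are illustrative assumptions; cls is bound
# as a default argument so it keeps pointing at this variant even though
# later classes in this file reuse the name BatchClassifier.
def _demo_pretrain_variant(cls=BatchClassifier):
    rng = np.random.RandomState(42)
    X = rng.rand(1000, 3)
    y = (X[:, 0] + X[:, 1] > 1.0).astype(int)     # toy concept
    clf = cls(window_size=100, max_models=10)
    clf.partial_fit(X, y)            # first call: ten pretraining batches
    X_new = rng.rand(100, 3)
    y_new = (X_new[:, 0] + X_new[:, 1] > 1.0).astype(int)
    clf.partial_fit(X_new, y_new)    # later calls: window-based update
    preds = clf.predict(X_new)
    print('accuracy:', np.mean(np.asarray(preds) == y_new))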
class BatchClassifier:

    def __init__(self, window_size=100, max_models=10):
        self.H = []
        self.h = None
        self.window_size = window_size
        self.window = InstanceWindow(max_size=window_size, dtype=float)
        self.num_models = max_models

    def partial_fit(self, X, y=None, classes=None):
        # Update the window with the new data
        r, c = get_dimensions(X)
        if self.window is None:
            self.window = InstanceWindow(max_size=self.window_size)
        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
            # If the window is full, create and train a new decision tree
            if self.window._num_samples == self.window_size:
                self.h = DecisionTreeClassifier()
                self.h.fit(self.window.get_attributes_matrix(),
                           self.window.get_targets_matrix())
                # Add the new decision tree to the model set
                self._add_to_buffer(self.h)
                # Clear the window and keep consuming the remaining rows
                self.window = InstanceWindow(max_size=self.window_size, dtype=float)
        return self

    def predict(self, X):
        N, D = X.shape
        # Check that there is at least one decision tree fitted
        if len(self.H) == 0:
            return zeros(N)
        # Majority class by averaged probabilities over the ensemble
        maj = np.argmax(self._predict_proba(X), axis=1)
        return maj

    def _predict_proba(self, X):
        # Average class probabilities over all trees; assumes every tree has
        # seen every class, so the probability arrays have matching widths
        avg = np.average(np.asarray([clf.predict_proba(X) for clf in self.H]), axis=0)
        return avg

    def _add_to_buffer(self, item):
        # FIFO buffer: keep at most num_models trees, dropping the oldest
        if len(self.H) == self.num_models:
            self.H.pop(0)
        self.H.append(item)
        return self
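# ---------------------------------------------------------------------------
# Sketch of the FIFO buffer behaviour above (not part of the original code).
# Six full windows train six trees, but the ensemble stays capped at three;
# the synthetic data is an illustrative assumption.
def _demo_fifo_buffer(cls=BatchClassifier):
    rng = np.random.RandomState(0)
    clf = cls(window_size=50, max_models=3)
    for _ in range(6):
        X = rng.rand(50, 2)
        y = (X[:, 0] > 0.5).astype(int)
        clf.partial_fit(X, y)
    print(len(clf.H))                   # 3: oldest trees were dropped
    print(clf.predict(rng.rand(4, 2)))  # majority label per query sample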
class BatchClassifier:

    def __init__(self, window_size=100, max_models=10):
        self.window = InstanceWindow(max_size=window_size)
        self.H = []
        # Index of the ensemble slot the next trained tree will occupy
        self.h = 0
        self.max_models = max_models

    def partial_fit(self, X, y=None, classes=None):
        # N.B.: the 'classes' option is not important for this classifier
        if self.H is None:
            self.H = []
        r, c = get_dimensions(X)
        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
        # Build a decision tree model on the current window
        clf = DecisionTreeClassifier()
        clf.fit(self.window.get_attributes_matrix(),
                self.window.get_targets_matrix())
        # Store it in the ensemble, overwriting the oldest slot once
        # max_models trees have been collected (circular index)
        self.h %= self.max_models
        if len(self.H) < self.max_models:
            self.H.append(clf)
        else:
            self.H[self.h] = clf
        self.h += 1
        return self

    def predict(self, X):
        N, D = X.shape
        if not self.H:
            # No model trained yet: default to class zero
            return zeros(N)
        # Predict with every stored tree, then return the per-sample
        # majority class (assumes integer class labels)
        votes = np.asarray([clf.predict(X) for clf in self.H], dtype=np.int64)
        return np.asarray([np.bincount(votes[:, i]).argmax() for i in range(N)])
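# ---------------------------------------------------------------------------
# Sketch of the circular slot index above (not part of the original code).
# Ten partial_fit calls train ten trees, but only max_models=4 slots are
# kept, with self.h wrapping around via the modulo; the data is illustrative.
def _demo_circular_slots(cls=BatchClassifier):
    rng = np.random.RandomState(7)
    clf = cls(window_size=20, max_models=4)
    for _ in range(10):
        X = rng.rand(20, 2)
        y = (X[:, 1] > 0.5).astype(int)
        clf.partial_fit(X, y)
    print(len(clf.H), clf.h)            # 4 slots kept; h wrapped past 4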
import sklearn.neighbors as sk
# The BaseClassifier import path below is an assumption for this skmultiflow
# version; adjust it if your installation lays the package out differently.
from skmultiflow.classification.base import BaseClassifier


class KNN(BaseClassifier):
    """ K-Nearest Neighbors Classifier

    This is a non-parametric classification method. Its output is based on
    the k closest training examples to the query sample X.

    It works by keeping track of a fixed number of training samples; in our
    case it keeps track of the last max_window_size training samples. Then,
    whenever a query request is executed, the algorithm searches its stored
    samples and finds the closest ones using a selected distance metric.

    To store the samples, while reducing search times, we use a structure
    called KD Tree (a K Dimensional Tree, for k dimensional problems).
    Although we do have our own KDTree implementation, which accepts custom
    metrics, we recommend using the standard scikit-learn KDTree, which,
    even though it doesn't accept custom metrics, is optimized and runs
    faster.

    Parameters
    ----------
    k: int
        The number of nearest neighbors to search for.

    max_window_size: int
        The maximum size of the window storing the last viewed samples.

    leaf_size: int
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm will switch to a
        brute-force approach. The bigger this number the faster the tree
        construction time, but the slower the query time will be.

    categorical_list: array-like
        Each entry is the index of a categorical feature. May be used for
        further filtering.

    Raises
    ------
    NotImplementedError: A few of the functions described here are not
    implemented since they have no application in this context.

    ValueError: A ValueError is raised if the predict function is called
    before at least k samples have been analyzed by the algorithm.

    Notes
    -----
    For an explanation of the KDTree functionality, please see our KDTree
    documentation, under skmultiflow.lazy.neighbors.kdtree.

    This classifier is not optimal for a mixture of categorical and
    numerical features. If you wish to use our KDTree implementation please
    refer to this class' function __predict_proba.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.classification.lazy.knn import KNN
    >>> from skmultiflow.data.file_stream import FileStream
    >>> from skmultiflow.options.file_option import FileOption
    >>> # Setting up the stream
    >>> opt = FileOption('FILE', 'OPT_NAME', 'skmultiflow/datasets/sea_big.csv', 'csv', False)
    >>> stream = FileStream(opt, -1, 1)
    >>> stream.prepare_for_use()
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_instance(200)
    >>> knn = KNN(k=8, max_window_size=2000, leaf_size=40)
    >>> knn.partial_fit(X, y)
    >>> # Preparing the processing of 5000 samples and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_instance()
    ...     my_pred = knn.predict(X)
    ...     if y[0] == my_pred[0]:
    ...         corrects += 1
    ...     knn = knn.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying results
    >>> print('KNN usage example')
    >>> print(str(n_samples) + ' samples analyzed.')
    5000 samples analyzed.
    >>> print("KNN's performance: " + str(corrects/n_samples))
    KNN's performance: 0.868

    """

    def __init__(self, k=5, max_window_size=1000, leaf_size=30, categorical_list=None):
        super().__init__()
        self.k = k
        self.max_window_size = max_window_size
        self.c = 0
        self.window = InstanceWindow(max_size=max_window_size, dtype=float)
        self.first_fit = True
        self.classes = []
        self.leaf_size = leaf_size
        # Avoid a mutable default argument by falling back to a fresh list
        self.categorical_list = categorical_list if categorical_list is not None else []

    def fit(self, X, y, classes=None, weight=None):
        """ fit

        Fits the model on the samples X and targets y. For this classifier
        this is the same operation as partial_fit, to which it delegates.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.

        y: array-like
            An array-like containing the classification targets for all
            samples in X.

        classes: Not used.

        weight: Not used.

        Returns
        -------
        KNN
            self

        """
        return self.partial_fit(X, y, classes, weight)

    def partial_fit(self, X, y, classes=None, weight=None):
        """ partial_fit

        Fits the model on the samples X and targets y.

        For the K-Nearest Neighbors Classifier, fitting the model is the
        equivalent of inserting the newer samples in the observed window,
        and if the size limit is reached, removing older results. To store
        the viewed samples we use an InstanceWindow object. For this class'
        documentation please visit skmultiflow.core.utils.data_structures.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.

        y: array-like
            An array-like containing the classification targets for all
            samples in X.

        classes: Not used.

        weight: Not used.

        Returns
        -------
        KNN
            self

        """
        r, c = get_dimensions(X)
        if self.window is None:
            self.window = InstanceWindow(max_size=self.max_window_size)
        # Insert each sample; the InstanceWindow drops its oldest entries
        # once max_window_size is reached
        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
        return self

    def reset(self):
        self.window = None
        return self

    def predict(self, X):
        """ predict

        Predicts the label of the X sample(s), by searching the KDTree for
        the k nearest neighbors.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the label for.

        Returns
        -------
        list
            A list containing the predicted labels for all instances in X.

        """
        r, c = get_dimensions(X)
        probs = self.predict_proba(X)
        preds = []
        for i in range(r):
            # The predicted label is the class with the largest vote fraction
            preds.append(self.classes[probs[i].index(np.max(probs[i]))])
        return preds

    def _predict(self, X):
        raise NotImplementedError

    def predict_proba(self, X):
        """ predict_proba

        Calculates the probability of each sample in X belonging to each of
        the labels, based on the knn algorithm.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)

        Raises
        ------
        ValueError: If this function is called before at least k samples
        have been analyzed by the learner, a ValueError is raised.

        Returns
        -------
        list
            A matrix of shape (n_samples, n_classes), in which each outer
            entry is associated with the X entry of the same index, and
            where the list at index [i] contains len(self.classes)
            elements, each of which represents the probability that the
            i-th sample of X belongs to a certain label.

        """
        if self.window is None or self.window._num_samples < self.k:
            raise ValueError("KNN should be partially fitted on at least k samples "
                             "before doing any prediction.")
        probs = []
        r, c = get_dimensions(X)
        self.classes = list(set().union(self.classes,
                                        np.unique(self.window.get_targets_matrix())))
        targets = self.window.get_targets_matrix().ravel()
        new_dist, new_ind = self.__predict_proba(X)
        for i in range(r):
            # Count the votes of the k nearest neighbors per class
            classes = [0 for _ in range(len(self.classes))]
            for index in new_ind[i]:
                classes[self.classes.index(targets[index])] += 1
            # Normalize by k, the number of neighbors queried per sample,
            # so each row of probabilities sums to one
            probs.append([x / self.k for x in classes])
        return probs

    def __predict_proba(self, X):
        """ __predict_proba

        Private implementation of the predict_proba method.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)

        Returns
        -------
        tuple
            One list with the k nearest neighbors' distances and another
            one with their indexes.

        Notes
        -----
        If you wish to use our own KDTree implementation please comment the
        third line of this function and uncomment the first and second
        lines.

        """
        # tree = KDTree(self.window.get_attributes_matrix(), metric='euclidean',
        #               categorical_list=self.categorical_list, return_distance=True)
        tree = sk.KDTree(self.window.get_attributes_matrix(), self.leaf_size,
                         metric='euclidean')
        dist, ind = tree.query(np.asarray(X), k=self.k)
        return dist, ind

    def score(self, X, y):
        raise NotImplementedError

    def get_info(self):
        return 'KNN Classifier: max_window_size: ' + str(self.max_window_size) + \
               ' - leaf_size: ' + str(self.leaf_size)
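# ---------------------------------------------------------------------------
# Mechanism sketch (not part of the original code): what predict_proba does
# under the hood, shown standalone with the same scikit-learn KDTree the
# class uses. The synthetic window contents are illustrative assumptions.
def _demo_kdtree_votes():
    rng = np.random.RandomState(0)
    X_window = rng.rand(200, 2)                      # stored samples
    y_window = (X_window[:, 0] > 0.5).astype(int)    # their labels
    tree = sk.KDTree(X_window, 30, metric='euclidean')
    dist, ind = tree.query(rng.rand(3, 2), k=5)      # 5 neighbors per query
    for neighbors in ind:
        votes = np.bincount(y_window[neighbors], minlength=2)
        print(votes / votes.sum())                   # per-class vote fractions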
import sys
from skmultiflow.classification.core.drift_detection.adwin import ADWIN


class KNNAdwin(KNN):
    """ K-Nearest Neighbors Classifier with ADWIN change detector

    This classifier is an improvement over the regular KNN classifier, as
    it is resistant to concept drift. It utilises the ADWIN change detector
    to decide which samples to keep and which ones to forget, and by doing
    so it regulates the sample window size.

    To know more about the ADWIN change detector, please visit
    skmultiflow.classification.core.drift_detection.adwin

    It uses the regular KNN classifier as a base class, with the major
    difference that this class keeps a variable-size window, instead of a
    fixed-size one, and that it updates the ADWIN algorithm at each
    partial_fit call.

    Parameters
    ----------
    k: int
        The number of nearest neighbors to search for.

    max_window_size: int
        The maximum size of the window storing the last viewed samples.

    leaf_size: int
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm will switch to a
        brute-force approach. The bigger this number the faster the tree
        construction time, but the slower the query time will be.

    categorical_list: array-like
        Each entry is the index of a categorical feature. May be used for
        further filtering.

    Raises
    ------
    NotImplementedError: A few of the functions described here are not
    implemented since they have no application in this context.

    ValueError: A ValueError is raised if the predict function is called
    before at least k samples have been analyzed by the algorithm.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.classification.lazy.knn_adwin import KNNAdwin
    >>> from skmultiflow.classification.lazy.knn import KNN
    >>> from skmultiflow.data.file_stream import FileStream
    >>> # Setting up the stream
    >>> stream = FileStream('skmultiflow/datasets/covtype.csv', -1, 1)
    >>> stream.prepare_for_use()
    >>> # Setting up the KNNAdwin classifier
    >>> knn_adwin = KNNAdwin(k=8, leaf_size=40, max_window_size=2000)
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_sample(200)
    >>> knn_adwin = knn_adwin.partial_fit(X, y)
    >>> # Keeping track of sample count and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_sample()
    ...     pred = knn_adwin.predict(X)
    ...     if y[0] == pred[0]:
    ...         corrects += 1
    ...     knn_adwin = knn_adwin.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying the results
    >>> print('KNNAdwin usage example')
    >>> print(str(n_samples) + ' samples analyzed.')
    5000 samples analyzed.
    >>> print("KNNAdwin's performance: " + str(corrects/n_samples))
    KNNAdwin's performance: 0.7798

    """

    def __init__(self, k=5, max_window_size=sys.maxsize, leaf_size=30, categorical_list=None):
        super().__init__(k=k, max_window_size=max_window_size, leaf_size=leaf_size,
                         categorical_list=categorical_list)
        self.adwin = ADWIN()
        self.window = None

    def reset(self):
        """ reset

        Resets the ADWIN detector as well as the base model kept by the KNN
        base class.

        Returns
        -------
        KNNAdwin
            self

        """
        self.adwin = ADWIN()
        return super().reset()

    def partial_fit(self, X, y, classes=None, weight=None):
        """ partial_fit

        Partially fits the model. This is done by updating the window with
        the new samples while also updating the ADWIN algorithm. Then we
        verify if a change was detected, and if so, the window is correctly
        split at the drift moment.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.

        y: array-like
            An array-like containing the classification targets for all
            samples in X.

        classes: Not used.

        weight: Not used.

        Returns
        -------
        KNNAdwin
            self

        """
        r, c = get_dimensions(X)
        if self.window is None:
            self.window = InstanceWindow(max_size=self.max_window_size)
        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
            if self.window._num_samples >= self.k:
                # Feed ADWIN a 1 when the newest sample is predicted
                # correctly and a 0 otherwise (predict returns a list, so
                # compare its first element)
                add = 1 if self.predict(np.asarray([X[i]]))[0] == y[i] else 0
                self.adwin.add_element(add)
            else:
                self.adwin.add_element(0)
        if self.window._num_samples >= self.k:
            changed = self.adwin.detected_change()
            if changed:
                if self.adwin._width < self.window._num_samples:
                    # Shrink the sample window to ADWIN's width by
                    # discarding the oldest samples
                    for i in range(self.window._num_samples, self.adwin._width, -1):
                        self.window.delete_element()
        return self
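# ---------------------------------------------------------------------------
# Mechanism sketch (not part of the original code): how the ADWIN detector
# drives window shrinking, using the same add_element / detected_change /
# _width API the class above relies on. A plain list stands in for the
# InstanceWindow and the 0/1 stream of "prediction correct" bits is synthetic.
def _demo_adwin_shrink():
    rng = np.random.RandomState(1)
    adwin = ADWIN()
    window = []
    # 500 mostly-correct bits, then a drift to mostly-wrong bits
    bits = np.concatenate([rng.binomial(1, 0.9, 500), rng.binomial(1, 0.2, 500)])
    for bit in bits:
        window.append(bit)
        adwin.add_element(bit)
        if adwin.detected_change() and adwin._width < len(window):
            # Drop the oldest entries until the window matches ADWIN's width
            del window[:len(window) - adwin._width]
    print('final window size:', len(window))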
class BatchClassifier:

    def __init__(self, window_size=100, max_models=100):
        self.H = []                 # the ensemble of trained trees
        self.h = None               # the tree currently being built
        self.window_size = window_size
        self.max_models = max_models
        self.window = InstanceWindow(max_size=window_size)
        self.j = 0                  # number of samples in the current window

    def partial_fit(self, X, y=None, classes=None):
        # Get information on the input chunk
        r, c = get_dimensions(X)
        for i in range(r):
            # Check that the window is instantiated
            if self.window is None:
                self.window = InstanceWindow(max_size=self.window_size)
            # Add one row to the window and bump the sample counter
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
            self.j += 1
            # When the window is full, a new model has to be generated
            if self.j == self.window_size:
                # Train a fresh tree on the window contents; reusing one
                # tree object would silently refit every copy stored in H
                self.h = DecisionTreeClassifier()
                X_batch = self.window.get_attributes_matrix()
                y_batch = self.window.get_targets_matrix()
                self.h.fit(X_batch, y_batch)
                # Keep only max_models trees: pop the oldest to push the new one
                if len(self.H) == self.max_models:
                    self.H.pop(0)
                self.H.append(self.h)
                self.j = 0
        return self

    def predict(self, X):
        N, D = X.shape
        if not self.H:
            # No model trained yet: default to class zero
            return zeros(N)
        # One prediction per model and per sample, then return the
        # per-sample majority class (assumes integer class labels)
        votes = np.asarray([h.predict(X) for h in self.H], dtype=np.int64)
        return np.asarray([np.bincount(votes[:, i]).argmax() for i in range(N)])
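# ---------------------------------------------------------------------------
# Usage sketch (not part of the original code): drive the counter-based
# variant above one sample at a time, the way a prequential (test-then-train)
# stream evaluator would. cls binds this variant; the data is illustrative.
def _demo_counter_window(cls=BatchClassifier):
    rng = np.random.RandomState(3)
    clf = cls(window_size=25, max_models=5)
    correct = total = 0
    for _ in range(500):
        X = rng.rand(1, 2)
        y = np.asarray([int(X[0, 0] > 0.5)])
        if clf.H:                                  # test first...
            correct += int(clf.predict(X)[0] == y[0])
            total += 1
        clf.partial_fit(X, y)                      # ...then train
    print('prequential accuracy:', correct / max(total, 1))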