from collections import Counter

import numpy as np
from sklearn.tree import DecisionTreeClassifier

from skmultiflow.core.utils.data_structures import InstanceWindow
from skmultiflow.core.utils.utils import get_dimensions


class BatchClassifier:
    """Batch-incremental ensemble: fits one DecisionTreeClassifier per full
    window of instances and predicts by majority vote over the ensemble."""

    def __init__(self, window_size=100, max_models=10):
        self.H = []      # ensemble of fitted trees
        self.h = None    # most recently trained tree
        self.window_size = window_size
        self.window = InstanceWindow(max_size=window_size, dtype=float)
        self.max_models = max_models

    def partial_fit(self, X, y=None, classes=None):
        r, c = get_dimensions(X)
        if self.window is None:
            self.window = InstanceWindow(max_size=self.window_size, dtype=float)

        if not self.H:
            # Pretraining: slice the first call's data into window-sized
            # batches and fit one tree per batch, stopping when the data or
            # the model budget runs out.
            start, end = 0, self.window_size
            while len(self.H) < self.max_models and start < r:
                X_batch = X[start:end, :]
                y_batch = y[start:end]
                start += self.window_size
                end += self.window_size
                self.h = DecisionTreeClassifier()
                self.h.fit(X_batch, y_batch)
                self.H.append(self.h)  # append the new tree to the ensemble
        else:
            # Streaming phase: buffer incoming rows; once the window fills,
            # fit a fresh tree on it and recycle the oldest model if needed.
            for i in range(r):
                self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
                if self.window._num_samples == self.window_size:
                    self.h = DecisionTreeClassifier()
                    self.h.fit(self.window.get_attributes_matrix(),
                               self.window.get_targets_matrix().ravel())
                    if len(self.H) == self.max_models:
                        self.H.pop(0)  # respect the model budget
                    self.H.append(self.h)  # append the new tree to the ensemble
                    self.window = InstanceWindow(max_size=self.window_size, dtype=float)
        return self

    def predict(self, X):
        N, _ = X.shape
        predictions = []
        per_model = [h.predict(X) for h in self.H]
        for i in range(N):
            votes = Counter(p[i] for p in per_model)
            if not votes:
                # No model fitted yet: default to class zero
                predictions.append(0)
            else:
                predictions.append(max(votes, key=votes.get))
        return predictions
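# A minimal usage sketch for the class above (not part of the original code):
# the synthetic two-class dataset is an assumption made for illustration.
rng = np.random.RandomState(1)
X_demo = rng.rand(1000, 3)
y_demo = (X_demo[:, 0] + X_demo[:, 1] > 1.0).astype(int)
clf = BatchClassifier(window_size=100, max_models=10)
clf.partial_fit(X_demo, y_demo)              # first call: pretrains up to 10 trees
clf.partial_fit(X_demo[:100], y_demo[:100])  # later calls go through the window
print(clf.predict(X_demo[:5]))               # majority vote across the ensemble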
import numpy as np
from sklearn.tree import DecisionTreeClassifier

from skmultiflow.core.utils.data_structures import InstanceWindow
from skmultiflow.core.utils.utils import get_dimensions


class BatchClassifier:
    """Batch-incremental ensemble that keeps a FIFO buffer of trees and
    predicts the class with the highest average probability."""

    def __init__(self, window_size=100, max_models=10):
        self.H = []
        self.h = None
        self.window_size = window_size
        self.window = InstanceWindow(max_size=window_size, dtype=float)
        self.num_models = max_models

    def partial_fit(self, X, y=None, classes=None):
        # Update the window with the new data
        r, c = get_dimensions(X)
        if self.window is None:
            self.window = InstanceWindow(max_size=self.window_size)
        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
            # If the window is full, create and train a new decision tree
            if self.window._num_samples == self.window_size:
                self.h = DecisionTreeClassifier()
                self.h.fit(self.window.get_attributes_matrix(),
                           self.window.get_targets_matrix().ravel())
                # Add the new tree to the model buffer
                self._add_to_buffer(self.h)
                # Clear the window
                self.window = InstanceWindow(max_size=self.window_size, dtype=float)
        return self

    def predict(self, X):
        N, _ = X.shape
        # With no tree fitted yet, fall back to class zero
        if len(self.H) == 0:
            return np.zeros(N, dtype=int)
        # Assumes every tree has seen the same set of classes, so their
        # predict_proba columns line up; map column indices back to labels
        maj = np.argmax(self._predict_proba(X), axis=1)
        return self.H[0].classes_[maj]

    def _predict_proba(self, X):
        # Average the class probabilities over all trees in the buffer
        return np.average(np.asarray([clf.predict_proba(X) for clf in self.H]), axis=0)

    def _add_to_buffer(self, item):
        # FIFO buffer: drop the oldest tree once the budget is reached
        if len(self.H) == self.num_models:
            self.H.pop(0)
        self.H.append(item)
        return self
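# A short demonstration of the variant above (illustrative only; the random
# mini-batched stream below is an assumption, any (X, y) arrays work):
rng = np.random.RandomState(2)
X_s = rng.rand(500, 4)
y_s = (X_s[:, 0] > 0.5).astype(int)
model = BatchClassifier(window_size=100, max_models=5)
for start in range(0, 500, 50):  # feed the stream 50 rows at a time
    model.partial_fit(X_s[start:start + 50], y_s[start:start + 50])
print(len(model.H))              # 5 trees: 500 samples / windows of 100
print(model.predict(X_s[:3]))    # argmax of the averaged probabilities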
import numpy as np
import sklearn.neighbors as sk

from skmultiflow.classification.base import BaseClassifier
from skmultiflow.core.utils.data_structures import InstanceWindow
from skmultiflow.core.utils.utils import get_dimensions


class KNN(BaseClassifier):
    """ K-Nearest Neighbors Classifier

    A non-parametric classification method that labels a query sample X from
    its k closest training examples. It works by keeping track of a fixed
    number of training samples, in our case the last max_window_size of them.
    Whenever a query is executed, the algorithm searches the stored samples
    and finds the closest ones according to a selected distance metric.

    To store the samples while reducing search times, we use a structure
    called a KD Tree (a K-Dimensional Tree, for k-dimensional problems).
    Although we do have our own KDTree implementation, which accepts custom
    metrics, we recommend the standard scikit-learn KDTree: even though it
    doesn't accept custom metrics, it is optimized and runs faster.

    Parameters
    ----------
    k: int
        The number of nearest neighbors to search for.

    max_window_size: int
        The maximum size of the window storing the last viewed samples.

    leaf_size: int
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm switches to a
        brute-force approach. The bigger this number the faster the tree
        construction time, but the slower the query time will be.

    categorical_list: array-like
        Each entry is the index of a categorical feature. May be used for
        further filtering.

    Raises
    ------
    NotImplementedError: A few of the functions described here are not
    implemented since they have no application in this context.

    ValueError: A ValueError is raised if the predict function is called
    before at least k samples have been analyzed by the algorithm.

    Notes
    -----
    For an explanation of the KDTree functionality, please see our KDTree
    documentation, under skmultiflow.lazy.neighbors.kdtree.

    This classifier is not optimal for a mixture of categorical and
    numerical features.

    If you wish to use our KDTree implementation please refer to this
    class' function __predict_proba.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.classification.lazy.knn import KNN
    >>> from skmultiflow.data.file_stream import FileStream
    >>> from skmultiflow.options.file_option import FileOption
    >>> # Setting up the stream
    >>> opt = FileOption('FILE', 'OPT_NAME', 'skmultiflow/datasets/sea_big.csv', 'csv', False)
    >>> stream = FileStream(opt, -1, 1)
    >>> stream.prepare_for_use()
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_instance(200)
    >>> knn = KNN(k=8, max_window_size=2000, leaf_size=40)
    >>> knn.partial_fit(X, y)
    >>> # Processing 5000 samples while keeping a correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_instance()
    ...     my_pred = knn.predict(X)
    ...     if y[0] == my_pred[0]:
    ...         corrects += 1
    ...     knn = knn.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying results
    >>> print('KNN usage example')
    >>> print(str(n_samples) + ' samples analyzed.')
    5000 samples analyzed.
    >>> print("KNN's performance: " + str(corrects/n_samples))
    KNN's performance: 0.868

    """

    def __init__(self, k=5, max_window_size=1000, leaf_size=30, categorical_list=None):
        super().__init__()
        self.k = k
        self.max_window_size = max_window_size
        self.c = 0
        self.window = InstanceWindow(max_size=max_window_size, dtype=float)
        self.first_fit = True
        self.classes = []
        self.leaf_size = leaf_size
        # Avoid the mutable-default-argument pitfall
        self.categorical_list = categorical_list if categorical_list is not None else []

    def fit(self, X, y, classes=None, weight=None):
        """ fit

        Fits the model on the samples X and targets y. This is in practice
        the same operation as partial_fit.
        For the K-Nearest Neighbors Classifier, fitting the model is
        equivalent to inserting the newer samples in the observed window,
        removing older results if the size limit is reached. To store the
        viewed samples we use an InstanceWindow object. For that class'
        documentation please visit skmultiflow.core.utils.data_structures.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.

        y: Array-like
            An array-like containing the classification targets for all
            samples in X.

        classes: Not used.

        weight: Not used.

        Returns
        -------
        KNN
            self

        """
        # Identical behavior to partial_fit, so simply delegate
        return self.partial_fit(X, y, classes, weight)

    def partial_fit(self, X, y, classes=None, weight=None):
        """ partial_fit

        Partially fits the model on the samples X and targets y.

        For the K-Nearest Neighbors Classifier, fitting the model is
        equivalent to inserting the newer samples in the observed window,
        removing older results if the size limit is reached. To store the
        viewed samples we use an InstanceWindow object. For that class'
        documentation please visit skmultiflow.core.utils.data_structures.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.

        y: Array-like
            An array-like containing the classification targets for all
            samples in X.

        classes: Not used.

        weight: Not used.

        Returns
        -------
        KNN
            self

        """
        r, c = get_dimensions(X)
        if self.window is None:
            self.window = InstanceWindow(max_size=self.max_window_size)
        # Insert every new sample into the window, one row at a time
        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
        return self

    def reset(self):
        self.window = None
        return self

    def predict(self, X):
        """ predict

        Predicts the label of the X sample(s) by searching the KDTree for
        the k nearest neighbors.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            All the samples we want to predict the label for.

        Returns
        -------
        list
            A list containing the predicted labels for all instances in X.

        """
        r, c = get_dimensions(X)
        probs = self.predict_proba(X)
        preds = []
        for i in range(r):
            # Pick the class with the highest estimated probability
            preds.append(self.classes[probs[i].index(np.max(probs[i]))])
        return preds

    def _predict(self, X):
        raise NotImplementedError

    def predict_proba(self, X):
        """ predict_proba

        Estimates the probability of each sample in X belonging to each of
        the known classes, based on the kNN algorithm.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)

        Raises
        ------
        ValueError: If this function is called before at least k samples
        have been analyzed by the learner, a ValueError is raised.

        Returns
        -------
        numpy.ndarray
            An array of shape (n_samples, n_classes), in which each outer
            entry is associated with the X entry of the same index. The list
            at index [i] contains len(self.classes) elements, each of which
            represents the probability that the i-th sample of X belongs to
            the corresponding class.

        """
        if self.window is None or self.window._num_samples < self.k:
            raise ValueError("KNN should be partially fitted on at least k "
                             "samples before doing any prediction.")
        probs = []
        r, c = get_dimensions(X)
        # Keep the class list in sync with every label seen in the window
        self.classes = list(set().union(
            self.classes, np.unique(self.window.get_targets_matrix())))
        new_dist, new_ind = self.__predict_proba(X)
        targets = self.window.get_targets_matrix().ravel()
        for i in range(r):
            votes = [0 for _ in range(len(self.classes))]
            for index in new_ind[i]:
                votes[self.classes.index(targets[index])] += 1
            # Normalize the vote counts by the number of neighbors queried
            probs.append([x / self.k for x in votes])
        return probs

    def __predict_proba(self, X):
        """ __predict_proba

        Private implementation of the predict_proba method.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)

        Returns
        -------
        tuple
            One list with the k nearest neighbors' distances and another
            one with their indexes.

        Notes
        -----
        If you wish to use our own KDTree implementation, comment out the
        sk.KDTree line and uncomment the custom KDTree construction above it.

        """
        # tree = KDTree(self.window.get_attributes_matrix(), metric='euclidean',
        #               categorical_list=self.categorical_list, return_distance=True)
        tree = sk.KDTree(self.window.get_attributes_matrix(), self.leaf_size,
                         metric='euclidean')
        dist, ind = tree.query(np.asarray(X), k=self.k)
        return dist, ind

    def score(self, X, y):
        raise NotImplementedError

    def get_info(self):
        return 'KNN Classifier: k: ' + str(self.k) + \
               ' - max_window_size: ' + str(self.max_window_size) + \
               ' - leaf_size: ' + str(self.leaf_size)
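# A quick self-contained check of the KNN class above; the random dataset is
# an assumption for illustration (the docstring example uses a FileStream).
rng = np.random.RandomState(3)
X_train = rng.rand(300, 2)
y_train = (X_train[:, 0] + X_train[:, 1] > 1.0).astype(int)
knn_demo = KNN(k=5, max_window_size=200, leaf_size=30)
knn_demo.partial_fit(X_train, y_train)   # only the last 200 samples are kept
print(knn_demo.predict(rng.rand(4, 2)))  # majority label among the 5 neighbors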
import numpy as np
from sklearn.tree import DecisionTreeClassifier

from skmultiflow.core.utils.data_structures import InstanceWindow
from skmultiflow.core.utils.utils import get_dimensions


class BatchClassifier:
    """Batch-incremental ensemble: one decision tree per full window of
    samples, majority vote at prediction time."""

    def __init__(self, window_size=100, max_models=100):
        self.H = []
        self.h = None
        self.window_size = window_size
        self.max_models = max_models
        self.window = InstanceWindow(max_size=window_size)
        self.j = 0  # number of samples currently buffered in the window

    def partial_fit(self, X, y=None, classes=None):
        # Get information on the input stream
        r, c = get_dimensions(X)
        for i in range(r):
            # Check that the window is instantiated
            if self.window is None:
                self.window = InstanceWindow(max_size=self.window_size)
            # Add one row to the window and count it
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
            self.j += 1
            # Create the next model (a decision tree) if needed
            if self.h is None:
                self.h = DecisionTreeClassifier()
            # When the window is full, train the tree on the buffered batch
            if self.j == self.window_size:
                X_batch = self.window.get_attributes_matrix()
                y_batch = self.window.get_targets_matrix().ravel()
                self.h.fit(X_batch, y_batch)
                # Keep only max_models trees: pop the oldest to push a new one
                if len(self.H) == self.max_models:
                    self.H.pop(0)
                self.H.append(self.h)
                # Reset so the next full window gets a fresh tree, rather
                # than refitting (and re-appending) the same object
                self.h = None
                self.j = 0
        return self

    def predict(self, X):
        N, D = X.shape
        # With no model trained yet, default to class zero
        if not self.H:
            return np.zeros(N, dtype=int)
        # One row of predictions per model, then the majority class per sample
        votes = np.asarray([h.predict(X) for h in self.H], dtype=np.int64)
        return np.asarray([np.bincount(votes[:, i]).argmax() for i in range(N)])
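# Illustrative run of this variant (the synthetic data is assumed for the
# example): every full window of 100 samples yields one tree, and predict()
# returns the per-sample majority class across the ensemble.
rng = np.random.RandomState(4)
X_ex = rng.rand(400, 3)
y_ex = (X_ex[:, 2] > 0.5).astype(int)
bc = BatchClassifier(window_size=100, max_models=3)
bc.partial_fit(X_ex, y_ex)   # 400 samples -> 4 trees fitted, newest 3 kept
print(len(bc.H))             # -> 3
print(bc.predict(X_ex[:5]))  # majority vote per sample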